
|Data set |      Tissue    |cell type num| cell_num | gene_num | Protocol | Accession ID|
|---------|----------------|-------------|----------|----------|----------|-------------|
| Segerstolpe | Human pancreas | 12 | 2133 | 22757 | Smart-Seq2 | E-MTAB-5061 |

In [1]:
import numpy as np
import pandas as pd
# Load the file "Filtered_Segerstolpe_HumanPancreas_data.csv", using the first column as the index, which serves as unique cell identifiers.
data_df = pd.read_csv('../../data/scRNAseq_Benchmark_datasets/Segerstolpe/Filtered_Segerstolpe_HumanPancreas_data.csv', index_col=0)
print(data_df.iloc[:5, :5])
print(data_df.shape)
print(data_df.columns[0])

        SGIP1  AZIN2  CLIC4  AGBL4  NECAP2
AZ_A10      0      0      3      0       0
AZ_A11      0      0      0      0       0
AZ_A12      0      0      0      0       0
AZ_A2      32      0      1      0       0
AZ_A6       0      0      0      0       0
(2133, 22757)
SGIP1


In [2]:
# Load cell type annotation data

label_df = pd.read_csv('../../data/scRNAseq_Benchmark_datasets/Segerstolpe/Labels.csv', header=0)
print(label_df.iloc[:5,:])
print(label_df.shape)
label_df = label_df.set_index(data_df.index)
print(label_df.iloc[:5,:])

       x
0  delta
1  alpha
2  delta
3  gamma
4  alpha
(2133, 1)
            x
AZ_A10  delta
AZ_A11  alpha
AZ_A12  delta
AZ_A2   gamma
AZ_A6   alpha


In [3]:
genes = data_df.columns.values
print(genes)
print(len(genes))

['SGIP1' 'AZIN2' 'CLIC4' ... 'ERCC_0.11444092:mix1_0.22888184:mix2'
 'ERCC_0.05722046:mix1_0.11444092:mix2'
 'ERCC_0.01430512:mix1_0.02861023:mix2']
22757


In [5]:
# Calculate the number of cells for each cell type.
cell_type_counts = label_df['x'].value_counts()
for cell_type, count in cell_type_counts.items():
    print(f"{cell_type}: {count}")

alpha: 872
ductal: 381
beta: 263
gamma: 195
acinar: 182
delta: 110
PSC: 54
co-expression: 39
endothelial: 15
epsilon: 7
MHC class II: 5
mast: 5
unclassified endocrine: 5


In [6]:
# Filter cell types with a count of 10 or more cells, excluding 'co-expression'.
selected_cell_types = cell_type_counts[(cell_type_counts >= 10) & (cell_type_counts.index != 'co-expression')].index

# Retain the corresponding cell data in the gene expression dataset and update the label data accordingly.
data_df_selected = data_df.loc[label_df[label_df['x'].isin(selected_cell_types)].index]
label_df = label_df.loc[label_df['x'].isin(selected_cell_types)]
print(data_df_selected.shape)
print(data_df_selected.iloc[:5, :])
print(label_df.shape)
print(label_df.iloc[:5, :])


(2072, 22757)
        SGIP1  AZIN2  CLIC4  AGBL4  NECAP2  SLC45A1  TGFBR3  DBT  RFWD2  \
AZ_A10      0      0      3      0       0        0       0   55      4   
AZ_A11      0      0      0      0       0        0       0    0      9   
AZ_A12      0      0      0      0       0        0       0    0      0   
AZ_A2      32      0      1      0       0        0       1    0      0   
AZ_A6       0      0      0      0       0        0       0    0      0   

        C1orf21  ...  ERCC_14.6484375:mix1_29.296875:mix2  \
AZ_A10        0  ...                                    0   
AZ_A11      148  ...                                    0   
AZ_A12        0  ...                                    0   
AZ_A2        99  ...                                    0   
AZ_A6         2  ...                                    0   

        ERCC_7.32421875:mix1_14.6484375:mix2  \
AZ_A10                                     1   
AZ_A11                                     0   
AZ_A12                  

In [7]:
# Calculate the number of cells for each cell type.
cell_type_counts = label_df['x'].value_counts()
for cell_type, count in cell_type_counts.items():
    print(f"{cell_type}: {count}")


alpha: 872
ductal: 381
beta: 263
gamma: 195
acinar: 182
delta: 110
PSC: 54
endothelial: 15


In [8]:
# Calculate how many cells express each gene.
# Here, an expression value greater than 0 indicates the gene is expressed in a cell.
genes_expressed_in_cells = (data_df_selected > 0).sum(axis=0)
# Print the number of genes expressed in fewer than 10 cells.
print(len(genes_expressed_in_cells[genes_expressed_in_cells < 10]))
# Select genes expressed in at least 10 cells.
genes_to_keep = genes_expressed_in_cells[genes_expressed_in_cells >= 10].index

# Retain only the genes meeting the criteria in the dataset.
data_df_filtered = data_df_selected[genes_to_keep]

# Display the shape of the filtered data and the first few rows.
print(data_df_filtered.shape)

# Convert all data in data_df_filtered to the float32 data type.
data_df_filtered = data_df_filtered.astype('float32')
print(data_df_filtered.iloc[:5, :])
print(data_df_filtered.dtypes.head())


4025
(2072, 18732)
        SGIP1  AZIN2  CLIC4  AGBL4  NECAP2  SLC45A1  TGFBR3   DBT  RFWD2  \
AZ_A10    0.0    0.0    3.0    0.0     0.0      0.0     0.0  55.0    4.0   
AZ_A11    0.0    0.0    0.0    0.0     0.0      0.0     0.0   0.0    9.0   
AZ_A12    0.0    0.0    0.0    0.0     0.0      0.0     0.0   0.0    0.0   
AZ_A2    32.0    0.0    1.0    0.0     0.0      0.0     1.0   0.0    0.0   
AZ_A6     0.0    0.0    0.0    0.0     0.0      0.0     0.0   0.0    0.0   

        C1orf21  ...  ERCC_58.59375:mix1_117.1875:mix2  \
AZ_A10      0.0  ...                               0.0   
AZ_A11    148.0  ...                               0.0   
AZ_A12      0.0  ...                               0.0   
AZ_A2      99.0  ...                               0.0   
AZ_A6       2.0  ...                               0.0   

        ERCC_29.296875:mix1_58.59375:mix2  \
AZ_A10                                0.0   
AZ_A11                                0.0   
AZ_A12                                0.

In [9]:
str_labels = np.unique(label_df.values).tolist()
label = [str_labels.index(x) for x in label_df.values]
# Convert numeric labels back to string labels
original_labels = [str_labels[x] for x in label]
barcode = data_df_filtered.index.values
gene_symbol = data_df_filtered.columns.values

print(type(str_labels))
print(str_labels)
print(type(label))
print(label)
print(len(label))
print(type(barcode))
print(barcode)
print(len(barcode))
print(type(gene_symbol))
print(gene_symbol)
print(len(gene_symbol))

<class 'list'>
['PSC', 'acinar', 'alpha', 'beta', 'delta', 'ductal', 'endothelial', 'gamma']
<class 'list'>
[4, 2, 4, 7, 2, 1, 3, 4, 4, 2, 2, 2, 2, 1, 2, 2, 3, 4, 2, 5, 3, 2, 2, 5, 7, 3, 2, 2, 4, 3, 2, 7, 2, 2, 2, 2, 2, 3, 2, 2, 3, 3, 2, 0, 1, 2, 5, 2, 2, 3, 2, 2, 7, 4, 1, 3, 7, 3, 7, 3, 2, 3, 2, 1, 2, 2, 4, 4, 1, 4, 2, 3, 3, 3, 2, 7, 3, 7, 2, 1, 7, 2, 3, 3, 3, 2, 3, 4, 1, 3, 5, 2, 2, 3, 4, 3, 2, 4, 3, 3, 4, 1, 4, 3, 2, 4, 5, 1, 2, 3, 2, 3, 5, 2, 3, 3, 2, 3, 2, 2, 2, 3, 1, 1, 2, 1, 1, 4, 7, 2, 7, 4, 2, 4, 2, 1, 7, 3, 7, 7, 2, 2, 3, 3, 2, 3, 2, 3, 2, 7, 7, 3, 1, 4, 3, 5, 3, 4, 3, 3, 3, 2, 2, 2, 3, 3, 2, 2, 2, 0, 3, 2, 2, 5, 3, 5, 2, 2, 2, 2, 4, 2, 2, 2, 6, 1, 2, 2, 2, 5, 5, 2, 2, 3, 2, 3, 4, 7, 4, 2, 3, 2, 3, 2, 5, 5, 2, 5, 1, 2, 2, 3, 2, 5, 2, 2, 2, 2, 1, 2, 1, 7, 2, 2, 2, 2, 2, 2, 7, 7, 2, 2, 2, 2, 2, 3, 2, 2, 2, 7, 2, 2, 2, 5, 2, 2, 2, 2, 2, 2, 2, 5, 2, 2, 2, 2, 2, 3, 2, 2, 5, 2, 3, 2, 1, 2, 2, 2, 2, 2, 4, 2, 1, 3, 5, 7, 2, 2, 5, 2, 5, 1, 2, 3, 7, 2, 2, 4, 3, 7, 2, 5, 1, 3, 2, 4, 2, 

In [10]:
# Create a mapping dictionary from cell type labels to numeric indices
label_to_index = {label1: i for i, label1 in enumerate(str_labels)}
# Print the mapping of cell type labels to their numeric indices
for label1, index in label_to_index.items():
    print(f"{label1}: {index}")

PSC: 0
acinar: 1
alpha: 2
beta: 3
delta: 4
ductal: 5
endothelial: 6
gamma: 7


In [11]:
print(len(gene_symbol))
print(len(barcode))
print(original_labels)

18732
2072
['delta', 'alpha', 'delta', 'gamma', 'alpha', 'acinar', 'beta', 'delta', 'delta', 'alpha', 'alpha', 'alpha', 'alpha', 'acinar', 'alpha', 'alpha', 'beta', 'delta', 'alpha', 'ductal', 'beta', 'alpha', 'alpha', 'ductal', 'gamma', 'beta', 'alpha', 'alpha', 'delta', 'beta', 'alpha', 'gamma', 'alpha', 'alpha', 'alpha', 'alpha', 'alpha', 'beta', 'alpha', 'alpha', 'beta', 'beta', 'alpha', 'PSC', 'acinar', 'alpha', 'ductal', 'alpha', 'alpha', 'beta', 'alpha', 'alpha', 'gamma', 'delta', 'acinar', 'beta', 'gamma', 'beta', 'gamma', 'beta', 'alpha', 'beta', 'alpha', 'acinar', 'alpha', 'alpha', 'delta', 'delta', 'acinar', 'delta', 'alpha', 'beta', 'beta', 'beta', 'alpha', 'gamma', 'beta', 'gamma', 'alpha', 'acinar', 'gamma', 'alpha', 'beta', 'beta', 'beta', 'alpha', 'beta', 'delta', 'acinar', 'beta', 'ductal', 'alpha', 'alpha', 'beta', 'delta', 'beta', 'alpha', 'delta', 'beta', 'beta', 'delta', 'acinar', 'delta', 'beta', 'alpha', 'delta', 'ductal', 'acinar', 'alpha', 'beta', 'alpha', 'bet

In [12]:
print('shape of expression matrix [#cells,#genes]:', data_df_filtered.shape)
print('shape of cell labels:', len(label))
print('number of cell types:', len(str_labels))

shape of expression matrix [#cells,#genes]: (2072, 18732)
shape of cell labels: 2072
number of cell types: 8


In [13]:
# Save the data
data_dict = {}
data_dict['gene_symbol'] = gene_symbol
data_dict['count'] = data_df_filtered.values
data_dict['str_labels'] = str_labels
data_dict['label'] = label
data_dict['barcode'] = barcode
save_file = '../../dataset/pre_data/scRNAseq_datasets/Segerstolpe.npz'
np.savez(save_file, **data_dict)

print('Finished.')

Finished.
