
| Dataset | Tissue | # Cell types | # Cells | # Genes | Protocol | Accession ID |
|---------|----------------|-------------|----------|----------|----------|-------------|
|Baron (Human) | Human pancreas | 14 | 8569 | 17499 | inDrop | GSE84133 |

In [1]:
import os

import numpy as np
import pandas as pd

# Load the filtered Baron human-pancreas expression table. The first CSV column
# holds the unique cell identifiers and is used as the row index.
data_df = pd.read_csv(
    '../../data/scRNAseq_Benchmark_datasets/Baron Human/Filtered_Baron_HumanPancreas_data.csv',
    index_col=0,
)
# Quick look: top-left 5x5 corner, overall shape, and the first column name.
print(data_df.iloc[:5, :5], data_df.shape, data_df.columns[0], sep='\n')

                             A1BG  A1CF  A2M  A4GALT  AAAS
human1_lib1.final_cell_0001     0     4    0       0     0
human1_lib1.final_cell_0002     0     0    0       0     0
human1_lib1.final_cell_0003     0     0    0       0     0
human1_lib1.final_cell_0004     0     0    0       0     1
human1_lib1.final_cell_0005     0     0    0       0     0
(8569, 17499)
A1BG


In [2]:
# Column labels of the expression table (the gene symbols) and how many there are.
gene_symbol = data_df.columns.values
print(gene_symbol, len(gene_symbol), sep='\n')

['A1BG' 'A1CF' 'A2M' ... 'ZZEF1' 'ZZZ3' 'pk']
17499


In [3]:
# Read the cell type annotations; the first row of Labels.csv is the header.
label_df = pd.read_csv(
    '../../data/scRNAseq_Benchmark_datasets/Baron Human/Labels.csv',
    header=0,
)
print(label_df.head(5), label_df.shape, sep='\n')

# The label table is read with a default positional index, so its rows are
# matched to the expression table purely by position: reuse the expression
# table's row index (the cell identifiers) for the labels.
label_df.index = data_df.index
print(label_df.head(5))

        x
0  acinar
1  acinar
2  acinar
3  acinar
4  acinar
(8569, 1)
                                  x
human1_lib1.final_cell_0001  acinar
human1_lib1.final_cell_0002  acinar
human1_lib1.final_cell_0003  acinar
human1_lib1.final_cell_0004  acinar
human1_lib1.final_cell_0005  acinar


In [4]:
# Tally how many cells carry each annotated cell type.
cell_type_counts = label_df['x'].value_counts()

# Only cell types represented by 10 or more cells are retained.
selected_cell_types = cell_type_counts[cell_type_counts.ge(10)].index

# Flag the cells belonging to a retained type once, then apply that selection
# to the expression table (looked up by cell identifier) and to the labels.
is_selected_cell = label_df['x'].isin(selected_cell_types)
data_df_selected = data_df.loc[label_df[is_selected_cell].index]
label_df = label_df.loc[is_selected_cell]

print(
    data_df_selected.shape,
    data_df_selected.iloc[:5, :],
    label_df.shape,
    label_df.iloc[:5, :],
    sep='\n',
)

(8562, 17499)
                             A1BG  A1CF  A2M  A4GALT  AAAS  AACS  AACSP1  \
human1_lib1.final_cell_0001     0     4    0       0     0     0       0   
human1_lib1.final_cell_0002     0     0    0       0     0     2       0   
human1_lib1.final_cell_0003     0     0    0       0     0     0       0   
human1_lib1.final_cell_0004     0     0    0       0     1     0       0   
human1_lib1.final_cell_0005     0     0    0       0     0     0       0   

                             AADAC  AADACL2  AADACP1  ...  ZWILCH  ZWINT  \
human1_lib1.final_cell_0001      6        0        0  ...       0      0   
human1_lib1.final_cell_0002      8        0        0  ...       0      0   
human1_lib1.final_cell_0003      0        0        0  ...       0      0   
human1_lib1.final_cell_0004      0        0        0  ...       1      0   
human1_lib1.final_cell_0005      0        0        0  ...       0      0   

                             ZXDA  ZXDB  ZXDC  ZYG11B  ZYX  ZZEF1  ZZZ3 

In [5]:
# For every gene, count the cells in which its value is strictly positive;
# a value above 0 is what this notebook treats as "expressed".
genes_expressed_in_cells = data_df_selected.gt(0).sum(axis=0)

print(genes_expressed_in_cells)
# How many genes fall below the 10-cell threshold (these get dropped below).
print(int((genes_expressed_in_cells < 10).sum()))

# Genes expressed in 10 or more cells are the ones to keep.
genes_to_keep = genes_expressed_in_cells.loc[lambda n_cells: n_cells >= 10].index

# Restrict the expression table to the retained gene columns.
data_df_filtered = data_df_selected.loc[:, genes_to_keep]

# Report the filtered shape, the first rows, and the leading column dtypes.
print(
    data_df_filtered.shape,
    data_df_filtered.iloc[:5, :],
    data_df_filtered.dtypes.head(),
    sep='\n',
)


A1BG        42
A1CF      1408
A2M        273
A4GALT     444
AAAS      1215
          ... 
ZYG11B    1394
ZYX       2689
ZZEF1     1225
ZZZ3       872
pk        1189
Length: 17499, dtype: int64
2390
(8562, 15109)
                             A1BG  A1CF  A2M  A4GALT  AAAS  AACS  AADAC  \
human1_lib1.final_cell_0001     0     4    0       0     0     0      6   
human1_lib1.final_cell_0002     0     0    0       0     0     2      8   
human1_lib1.final_cell_0003     0     0    0       0     0     0      0   
human1_lib1.final_cell_0004     0     0    0       0     1     0      0   
human1_lib1.final_cell_0005     0     0    0       0     0     0      0   

                             AADACP1  AADAT  AAED1  ...  ZWILCH  ZWINT  ZXDA  \
human1_lib1.final_cell_0001        0      0      0  ...       0      0     0   
human1_lib1.final_cell_0002        0      0      0  ...       0      0     0   
human1_lib1.final_cell_0003        0      0      0  ...       0      0     0   
human1_lib1.final

In [6]:
# Refresh gene_symbol so it lists only the gene columns that survived filtering.
gene_symbol = data_df_filtered.columns.values
print(gene_symbol, len(gene_symbol), sep='\n')

['A1BG' 'A1CF' 'A2M' ... 'ZZEF1' 'ZZZ3' 'pk']
15109


In [7]:
# Print one "type: count" line per cell type. In a top-to-bottom run,
# cell_type_counts was tallied before the rare-type filter was applied, so
# types with fewer than 10 cells are still listed here.
for cell_type, count in zip(cell_type_counts.index, cell_type_counts.tolist()):
    print(f"{cell_type}: {count}")

beta: 2525
alpha: 2326
ductal: 1077
acinar: 958
delta: 601
activated_stellate: 284
gamma: 255
endothelial: 252
quiescent_stellate: 173
macrophage: 55
mast: 25
epsilon: 18
schwann: 13
t_cell: 7


In [8]:
# Encode the string cell-type labels as integer class ids.
# np.unique returns the sorted unique names and, with return_inverse=True, the
# position of every cell's name within that sorted array. Working on the 1-D
# 'x' column avoids scanning str_labels once per cell and avoids comparing
# strings against 1-element arrays (which is what iterating the 2-D
# label_df.values produced).
unique_names, inverse_ids = np.unique(label_df['x'].to_numpy(), return_inverse=True)
str_labels = unique_names.tolist()   # sorted list of cell-type names
label = inverse_ids.tolist()         # one int per cell, indexing into str_labels

# Convert numeric labels back to string labels
original_labels = [str_labels[x] for x in label]

# Cell identifiers and gene symbols are taken from the filtered expression
# table so that they describe exactly the rows/columns that will be saved.
barcode = data_df_filtered.index.values
gene_symbol = data_df_filtered.columns.values

# Labels and expression rows are stored as separate arrays later on, so they
# must refer to the same cells in the same order.
assert label_df.index.equals(data_df_filtered.index), \
    "label_df and data_df_filtered rows are not aligned"

print(type(str_labels))
print(str_labels)
print(type(label))
print(label)
print(type(barcode))
print(barcode)
print(type(gene_symbol))
print(gene_symbol)

<class 'list'>
['acinar', 'activated_stellate', 'alpha', 'beta', 'delta', 'ductal', 'endothelial', 'epsilon', 'gamma', 'macrophage', 'mast', 'quiescent_stellate', 'schwann']
<class 'list'>
[0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 4, 4, 3, 1, 3, 0, 5, 3, 3, 2, 5, 3, 4, 2, 3, 0, 0, 3, 4, 4, 4, 4, 3, 0, 3, 3, 3, 4, 1, 3, 4, 1, 1, 1, 0, 1, 4, 3, 4, 3, 0, 3, 3, 1, 3, 3, 3, 3, 5, 2, 3, 5, 5, 0, 2, 3, 4, 2, 3, 3, 3, 4, 3, 3, 4, 3, 3, 3, 3, 3, 3, 0, 1, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 3, 4, 2, 2, 4, 3, 3, 3, 3, 0, 4, 3, 3, 4, 3, 3, 3, 4, 3, 4, 3, 3, 3, 7, 3, 3, 3, 4, 3, 4, 3, 2, 2, 3, 0, 2, 3, 3, 2, 3, 4, 3, 3, 3, 3, 3, 4, 3, 5, 5, 3, 3, 3, 3, 4, 3, 1, 3, 3, 2, 3, 2, 3, 8, 3, 3, 3, 3, 3, 3, 3, 0, 3, 0, 3, 3, 3, 0, 4, 3, 4, 3, 0, 2, 3, 2, 3, 2, 6, 3, 3, 2, 4, 3, 3, 4, 3, 8, 2, 3, 3, 4, 2, 5, 3, 2, 3, 5, 2, 5, 3, 3, 2, 3, 3, 11, 5, 5, 6, 1, 3, 4, 8, 3, 3, 3, 2, 3, 3, 4, 5, 3, 4, 4, 3, 4, 3, 1, 3, 3, 4, 3, 11, 3, 0, 2, 4, 3, 2, 3, 6, 4, 4, 3, 3, 4, 8, 2, 3, 3, 3, 5, 4, 2, 3, 8, 3, 3, 3, 11, 

In [9]:
# Sanity check: number of genes, number of cells, the decoded string labels,
# and how many decoded labels there are.
print(len(gene_symbol), len(barcode), original_labels, len(original_labels), sep='\n')

15109
8562
['acinar', 'acinar', 'acinar', 'acinar', 'acinar', 'acinar', 'beta', 'acinar', 'acinar', 'acinar', 'acinar', 'acinar', 'delta', 'delta', 'beta', 'activated_stellate', 'beta', 'acinar', 'ductal', 'beta', 'beta', 'alpha', 'ductal', 'beta', 'delta', 'alpha', 'beta', 'acinar', 'acinar', 'beta', 'delta', 'delta', 'delta', 'delta', 'beta', 'acinar', 'beta', 'beta', 'beta', 'delta', 'activated_stellate', 'beta', 'delta', 'activated_stellate', 'activated_stellate', 'activated_stellate', 'acinar', 'activated_stellate', 'delta', 'beta', 'delta', 'beta', 'acinar', 'beta', 'beta', 'activated_stellate', 'beta', 'beta', 'beta', 'beta', 'ductal', 'alpha', 'beta', 'ductal', 'ductal', 'acinar', 'alpha', 'beta', 'delta', 'alpha', 'beta', 'beta', 'beta', 'delta', 'beta', 'beta', 'delta', 'beta', 'beta', 'beta', 'beta', 'beta', 'beta', 'acinar', 'activated_stellate', 'delta', 'delta', 'beta', 'beta', 'beta', 'beta', 'beta', 'beta', 'beta', 'beta', 'beta', 'beta', 'delta', 'delta', 'beta', 'delt

In [10]:
# Final summary of what is about to be saved: matrix shape, label count,
# and number of distinct cell types.
for message, value in (
    ('shape of expression matrix [#cells,#genes]:', data_df_filtered.shape),
    ('shape of cell labels:', len(label)),
    ('number of cell types:', len(str_labels)),
):
    print(message, value)

shape of expression matrix [#cells,#genes]: (8562, 15109)
shape of cell labels: 8562
number of cell types: 13


In [11]:
# Cast every column of the filtered expression table to 32-bit floats, then
# show the first rows and the leading column dtypes to confirm the cast.
data_df_filtered = data_df_filtered.astype(np.float32)
print(data_df_filtered.iloc[:5, :], data_df_filtered.dtypes.head(), sep='\n')

                             A1BG  A1CF  A2M  A4GALT  AAAS  AACS  AADAC  \
human1_lib1.final_cell_0001   0.0   4.0  0.0     0.0   0.0   0.0    6.0   
human1_lib1.final_cell_0002   0.0   0.0  0.0     0.0   0.0   2.0    8.0   
human1_lib1.final_cell_0003   0.0   0.0  0.0     0.0   0.0   0.0    0.0   
human1_lib1.final_cell_0004   0.0   0.0  0.0     0.0   1.0   0.0    0.0   
human1_lib1.final_cell_0005   0.0   0.0  0.0     0.0   0.0   0.0    0.0   

                             AADACP1  AADAT  AAED1  ...  ZWILCH  ZWINT  ZXDA  \
human1_lib1.final_cell_0001      0.0    0.0    0.0  ...     0.0    0.0   0.0   
human1_lib1.final_cell_0002      0.0    0.0    0.0  ...     0.0    0.0   0.0   
human1_lib1.final_cell_0003      0.0    0.0    0.0  ...     0.0    0.0   0.0   
human1_lib1.final_cell_0004      0.0    0.0    1.0  ...     1.0    0.0   0.0   
human1_lib1.final_cell_0005      0.0    0.0    0.0  ...     0.0    0.0   0.0   

                             ZXDB  ZXDC  ZYG11B  ZYX  ZZEF1  ZZZ3   

In [12]:
# Save the data: collect the arrays to persist, keyed by the names they will
# have inside the .npz archive.
data_dict = {
    'gene_symbol': gene_symbol,            # gene symbols of the kept columns
    'count': data_df_filtered.values,      # expression matrix [#cells, #genes]
    'str_labels': str_labels,              # cell-type names
    'label': label,                        # per-cell integer index into str_labels
    'barcode': barcode,                    # cell identifiers (row index)
}
# NOTE(review): gene_symbol and barcode come from `.values` of pandas indexes;
# if they are object-dtype arrays, np.savez pickles them and np.load will need
# allow_pickle=True to read them back - confirm against the loading code.
save_file = '../../dataset/pre_data/scRNAseq_datasets/Baron_Human.npz'

# np.savez does not create missing directories; make sure the target folder
# exists so the save also works on a fresh checkout.
os.makedirs(os.path.dirname(save_file), exist_ok=True)
np.savez(save_file, **data_dict)

print('Finished.')

Finished.
