|Data set |      Tissue    |cell type num| cell_num | gene_num | Protocol | Accession ID|
|---------|----------------|-------------|----------|----------|----------|-------------|
|TM | Mouse | 55 | 54865 | 19791 | 10X Genomics | GSE109774 |

In [1]:
import numpy as np
import pandas as pd

# Load the file "Filtered_TM_data.csv",using the first column as the index, which serves as unique cell identifiers.
data_df = pd.read_csv('../../data/scRNAseq_Benchmark_datasets/TM/Filtered_TM_data.csv', index_col=0)
print(data_df.iloc[:5, :5])
print(data_df.shape)
print(data_df.columns[0])

                           Xkr4  Rp1  Sox17  Mrpl15  Lypla1
10X_P4_3_AAAGTAGAGATGCCAG     0    0      0       1       0
10X_P4_3_AACCGCGTCCAACCAA     0    0      0       0       2
10X_P4_3_AACTCCCGTCGGGTCT     0    0      0       0       1
10X_P4_3_AACTCTTAGTTGCAGG     0    0      0       0       2
10X_P4_3_AACTCTTTCATAACCG     0    0      0       0       0
(54865, 19791)
Xkr4


In [2]:
# Load the cell type annotation data.
# The cell type annotations include both "class" and "subclass".
label_df = pd.read_csv('../../data/scRNAseq_Benchmark_datasets/TM/Labels.csv', header=0)
# Print the first few rows and the shape of the dataset.
print(label_df.iloc[:5, :])
print(label_df.shape)
# Align the index of label_df with that of data_df.
label_df = label_df.set_index(data_df.index)
# Print the first few rows of the updated label_df.
print(label_df.iloc[:5, :])


                         x
0             bladder cell
1             bladder cell
2             bladder cell
3  bladder urothelial cell
4             bladder cell
(54865, 1)
                                                 x
10X_P4_3_AAAGTAGAGATGCCAG             bladder cell
10X_P4_3_AACCGCGTCCAACCAA             bladder cell
10X_P4_3_AACTCCCGTCGGGTCT             bladder cell
10X_P4_3_AACTCTTAGTTGCAGG  bladder urothelial cell
10X_P4_3_AACTCTTTCATAACCG             bladder cell


In [3]:
genes = data_df.columns.values
print(genes)
print(len(genes))

['Xkr4' 'Rp1' 'Sox17' ... 'LOC100041346' 'Sly' 'Erdr1']
19791


In [4]:
# Calculate the number of cells for each cell type.
cell_type_counts = label_df['x'].value_counts()
for cell_type, count in cell_type_counts.items():
    print(f"{cell_type}: {count}")

B cell: 8294
mesenchymal cell: 7848
basal cell of epidermis: 4424
T cell: 4409
stromal cell: 3240
keratinocyte: 3076
endothelial cell: 2843
hepatocyte: 1764
immature T cell: 1354
macrophage: 1320
bladder cell: 1203
kidney proximal straight tubule epithelial cell: 1198
bladder urothelial cell: 1167
blood cell: 1139
mesenchymal stem cell: 1136
natural killer cell: 1054
epithelial cell: 871
granulocyte: 725
monocyte: 525
kidney loop of Henle ascending limb epithelial cell: 471
lung endothelial cell: 462
luminal epithelial cell of mammary gland: 459
kidney collecting duct epithelial cell: 443
kidney capillary endothelial cell: 392
hematopoietic precursor cell: 392
basal cell: 392
granulocytopoietic cell: 378
neuroendocrine cell: 362
skeletal muscle satellite cell: 354
alveolar macrophage: 345
leukocyte: 324
proerythroblast: 265
late pro-B cell: 265
promonocyte: 257
fibroblast: 226
non-classical monocyte: 220
classical monocyte: 161
erythroblast: 155
immature B cell: 113
type II pneumocyte:

In [5]:
# Calculate the number of cells for each cell type.
cell_type_counts = label_df['x'].value_counts()
# Filter cell types with a count of 10 or more cells.
selected_cell_types = cell_type_counts[cell_type_counts >= 10].index
# Retain the corresponding cell data in the gene expression dataset and update the label data accordingly.
data_df_selected = data_df.loc[label_df[label_df['x'].isin(selected_cell_types)].index]
label_df = label_df.loc[label_df['x'].isin(selected_cell_types)]
# Print the shape of the filtered datasets.
print(data_df_selected.shape)
print(data_df_selected.iloc[:5, :])
print(label_df.shape)
print(label_df.iloc[:5, :])


(54865, 19791)
                           Xkr4  Rp1  Sox17  Mrpl15  Lypla1  Tcea1  Rgs20  \
10X_P4_3_AAAGTAGAGATGCCAG     0    0      0       1       0      0      0   
10X_P4_3_AACCGCGTCCAACCAA     0    0      0       0       2      3      0   
10X_P4_3_AACTCCCGTCGGGTCT     0    0      0       0       1      1      0   
10X_P4_3_AACTCTTAGTTGCAGG     0    0      0       0       2      0      0   
10X_P4_3_AACTCTTTCATAACCG     0    0      0       0       0      1      0   

                           Atp6v1h  Oprk1  Npbwr1  ...  Uty  Ddx3y  Usp9y  \
10X_P4_3_AAAGTAGAGATGCCAG        0      0       0  ...    0      2      0   
10X_P4_3_AACCGCGTCCAACCAA        0      0       0  ...    0      1      0   
10X_P4_3_AACTCCCGTCGGGTCT        2      0       0  ...    0      4      0   
10X_P4_3_AACTCTTAGTTGCAGG        1      0       0  ...    0      1      0   
10X_P4_3_AACTCTTTCATAACCG        1      0       0  ...    0      0      0   

                           Rbmy1a1  LOC100040223  LOC434960

In [6]:
# Calculate how many cells express each gene.
# Here, an expression value greater than 0 indicates the gene is expressed in a cell.
genes_expressed_in_cells = (data_df_selected > 0).sum(axis=0)
# Print the number of genes expressed in fewer than 10 cells.
print(len(genes_expressed_in_cells[genes_expressed_in_cells < 10]))
# Select genes expressed in at least 10 cells.
genes_to_keep = genes_expressed_in_cells[genes_expressed_in_cells >= 10].index

# Retain only the genes meeting the criteria in the dataset.
data_df_filtered = data_df_selected[genes_to_keep]

# Display the shape of the filtered data and the first few rows.
print(data_df_filtered.shape)

# Convert all data in data_df_filtered to the float32 data type.
data_df_filtered = data_df_filtered.astype('float32')
print(data_df_filtered.iloc[:5, :])
print(data_df_filtered.dtypes.head())

1724
(54865, 18067)
                           Xkr4  Rp1  Sox17  Mrpl15  Lypla1  Tcea1  Rgs20  \
10X_P4_3_AAAGTAGAGATGCCAG   0.0  0.0    0.0     1.0     0.0    0.0    0.0   
10X_P4_3_AACCGCGTCCAACCAA   0.0  0.0    0.0     0.0     2.0    3.0    0.0   
10X_P4_3_AACTCCCGTCGGGTCT   0.0  0.0    0.0     0.0     1.0    1.0    0.0   
10X_P4_3_AACTCTTAGTTGCAGG   0.0  0.0    0.0     0.0     2.0    0.0    0.0   
10X_P4_3_AACTCTTTCATAACCG   0.0  0.0    0.0     0.0     0.0    1.0    0.0   

                           Atp6v1h  Oprk1  Npbwr1  ...  G530011O06Rik  Vamp7  \
10X_P4_3_AAAGTAGAGATGCCAG      0.0    0.0     0.0  ...            0.0    0.0   
10X_P4_3_AACCGCGTCCAACCAA      0.0    0.0     0.0  ...            0.0    3.0   
10X_P4_3_AACTCCCGTCGGGTCT      2.0    0.0     0.0  ...            0.0    0.0   
10X_P4_3_AACTCTTAGTTGCAGG      1.0    0.0     0.0  ...            0.0    0.0   
10X_P4_3_AACTCTTTCATAACCG      1.0    0.0     0.0  ...            0.0    0.0   

                           Spry3  Tm

In [8]:
# Calculate the number of cells for each cell type.
cell_type_counts = label_df['x'].value_counts()
for cell_type, count in cell_type_counts.items():
    print(f"{cell_type}: {count}")

B cell: 8294
mesenchymal cell: 7848
basal cell of epidermis: 4424
T cell: 4409
stromal cell: 3240
keratinocyte: 3076
endothelial cell: 2843
hepatocyte: 1764
immature T cell: 1354
macrophage: 1320
bladder cell: 1203
kidney proximal straight tubule epithelial cell: 1198
bladder urothelial cell: 1167
blood cell: 1139
mesenchymal stem cell: 1136
natural killer cell: 1054
epithelial cell: 871
granulocyte: 725
monocyte: 525
kidney loop of Henle ascending limb epithelial cell: 471
lung endothelial cell: 462
luminal epithelial cell of mammary gland: 459
kidney collecting duct epithelial cell: 443
kidney capillary endothelial cell: 392
hematopoietic precursor cell: 392
basal cell: 392
granulocytopoietic cell: 378
neuroendocrine cell: 362
skeletal muscle satellite cell: 354
alveolar macrophage: 345
leukocyte: 324
proerythroblast: 265
late pro-B cell: 265
promonocyte: 257
fibroblast: 226
non-classical monocyte: 220
classical monocyte: 161
erythroblast: 155
immature B cell: 113
type II pneumocyte:

In [9]:
str_labels = np.unique(label_df.values).tolist()
label = [str_labels.index(x) for x in label_df.values]
# Convert numeric labels back to string labels
original_labels = [str_labels[x] for x in label]
barcode = data_df_filtered.index.values
gene_symbol = data_df_filtered.columns.values
print(type(str_labels))
print(str_labels)
print(type(label))
print(label)
print(len(label))
print(type(barcode))
print(barcode)
print(len(barcode))
print(type(gene_symbol))
print(gene_symbol)
print(len(gene_symbol))

<class 'list'>
['B cell', 'DN1 thymic pro-T cell', 'Fraction A pre-pro B cell', 'Langerhans cell', 'T cell', 'alveolar macrophage', 'basal cell', 'basal cell of epidermis', 'basophil', 'bladder cell', 'bladder urothelial cell', 'blood cell', 'cardiac muscle cell', 'ciliated columnar cell of tracheobronchial tree', 'classical monocyte', 'dendritic cell', 'duct epithelial cell', 'early pro-B cell', 'endocardial cell', 'endothelial cell', 'endothelial cell of hepatic sinusoid', 'epithelial cell', 'erythroblast', 'fibroblast', 'granulocyte', 'granulocytopoietic cell', 'hematopoietic precursor cell', 'hepatocyte', 'immature B cell', 'immature T cell', 'keratinocyte', 'kidney capillary endothelial cell', 'kidney cell', 'kidney collecting duct epithelial cell', 'kidney loop of Henle ascending limb epithelial cell', 'kidney proximal straight tubule epithelial cell', 'late pro-B cell', 'leukocyte', 'luminal epithelial cell of mammary gland', 'lung endothelial cell', 'macrophage', 'mast cell', '

In [10]:
# Create a mapping dictionary from cell type labels to numeric indices
label_to_index = {label1: i for i, label1 in enumerate(str_labels)}
# Print the mapping of cell type labels to their numeric indices
for label1, index in label_to_index.items():
    print(f"{label1}: {index}")

B cell: 0
DN1 thymic pro-T cell: 1
Fraction A pre-pro B cell: 2
Langerhans cell: 3
T cell: 4
alveolar macrophage: 5
basal cell: 6
basal cell of epidermis: 7
basophil: 8
bladder cell: 9
bladder urothelial cell: 10
blood cell: 11
cardiac muscle cell: 12
ciliated columnar cell of tracheobronchial tree: 13
classical monocyte: 14
dendritic cell: 15
duct epithelial cell: 16
early pro-B cell: 17
endocardial cell: 18
endothelial cell: 19
endothelial cell of hepatic sinusoid: 20
epithelial cell: 21
erythroblast: 22
fibroblast: 23
granulocyte: 24
granulocytopoietic cell: 25
hematopoietic precursor cell: 26
hepatocyte: 27
immature B cell: 28
immature T cell: 29
keratinocyte: 30
kidney capillary endothelial cell: 31
kidney cell: 32
kidney collecting duct epithelial cell: 33
kidney loop of Henle ascending limb epithelial cell: 34
kidney proximal straight tubule epithelial cell: 35
late pro-B cell: 36
leukocyte: 37
luminal epithelial cell of mammary gland: 38
lung endothelial cell: 39
macrophage: 40

In [11]:
print(len(gene_symbol))
print(len(barcode))
print(len(label))
print(label_df.shape)

18067
54865
54865
(54865, 1)


In [12]:
print(original_labels)

['bladder cell', 'bladder cell', 'bladder cell', 'bladder urothelial cell', 'bladder cell', 'leukocyte', 'bladder urothelial cell', 'bladder cell', 'bladder urothelial cell', 'bladder urothelial cell', 'bladder urothelial cell', 'bladder cell', 'bladder cell', 'bladder cell', 'bladder cell', 'bladder cell', 'bladder urothelial cell', 'bladder cell', 'bladder cell', 'bladder urothelial cell', 'bladder cell', 'bladder cell', 'bladder cell', 'bladder cell', 'bladder urothelial cell', 'bladder cell', 'bladder cell', 'bladder urothelial cell', 'bladder cell', 'bladder cell', 'bladder urothelial cell', 'bladder cell', 'bladder cell', 'bladder cell', 'bladder cell', 'bladder urothelial cell', 'bladder urothelial cell', 'bladder urothelial cell', 'bladder urothelial cell', 'bladder urothelial cell', 'bladder cell', 'bladder cell', 'bladder cell', 'leukocyte', 'bladder cell', 'bladder cell', 'bladder cell', 'bladder cell', 'bladder urothelial cell', 'bladder cell', 'bladder urothelial cell', 'e

In [13]:
print('shape of expression matrix [#cells,#genes]:', data_df_filtered.shape)
print('shape of cell labels:', len(label))
print('number of cell types:', len(str_labels))

shape of expression matrix [#cells,#genes]: (54865, 18067)
shape of cell labels: 54865
number of cell types: 55


In [14]:
# Save the data
data_dict = {}
data_dict['gene_symbol'] = gene_symbol
data_dict['count'] = data_df_filtered.values
data_dict['str_labels'] = str_labels
data_dict['label'] = label
data_dict['barcode'] = barcode
save_file = '../../dataset/pre_data/scRNAseq_datasets/TM.npz'
np.savez(save_file, **data_dict)

print('Finished.')

Finished.
