|Data set |      Tissue    |cell type num| cell_num | gene_num | Protocol | Accession ID|
|---------|----------------|-------------|----------|----------|----------|-------------|
|AMB | Mouse Brain | 4 | 12832 | 42625 | Smart-Seq2 | GSE115746 |

#### 1. Load AMB gene expression data file

In [1]:
import os

import numpy as np
import pandas as pd

# Read the filtered Allen Mouse Brain count table. The first CSV column is used
# as the row index, so each row is one cell (unique cell identifier) and each
# column is one gene.
expression_csv = '../../wdata/data/scRNAseq_Benchmark_datasets/Intra-dataset/AMB/Filtered_mouse_allen_brain_data.csv'
data_df = pd.read_csv(expression_csv, index_col=0)

# Sanity check: top-left 5x5 corner, overall shape, and the first column label.
print(data_df.head().iloc[:, :5])
print(data_df.shape)
print(data_df.columns[0])

                     0610005C13Rik  0610006L08Rik  0610007P14Rik  \
F1S4_160108_001_A01              0              0             79   
F1S4_160108_001_B01              0              0            123   
F1S4_160108_001_C01              0              0             89   
F1S4_160108_001_D01              0              0            115   
F1S4_160108_001_E01              0              0            402   

                     0610009B22Rik  0610009E02Rik  
F1S4_160108_001_A01            145              1  
F1S4_160108_001_B01            178             26  
F1S4_160108_001_C01             69              0  
F1S4_160108_001_D01              0              0  
F1S4_160108_001_E01            208              0  
(12832, 42625)
0610005C13Rik


In [2]:
# Read the per-cell annotations; the first CSV row supplies the column names.
# Cell types are annotated at two levels: 'Class' and 'Subclass'.
label_csv = '../../data/scRNAseq_Benchmark_datasets/AMB/Labels.csv'
label_df = pd.read_csv(label_csv, header=0)
print(label_df.head())
print(label_df.shape)

# Replace the default integer index with the cell identifiers of the
# expression matrix so both tables can be addressed by the same keys.
# NOTE(review): this assumes both files list the cells in the same order --
# nothing here verifies it; confirm against the dataset description.
label_df = label_df.set_index(data_df.index)
print(label_df.head())

       Class Subclass             cluster
0  GABAergic      Vip  Vip Arhgap36 Hmcn1
1  GABAergic    Lamp5          Lamp5 Lsp1
2  GABAergic    Lamp5          Lamp5 Lsp1
3  GABAergic      Vip  Vip Crispld2 Htr2c
4  GABAergic    Lamp5   Lamp5 Plch2 Dock5
(12832, 3)
                         Class Subclass             cluster
F1S4_160108_001_A01  GABAergic      Vip  Vip Arhgap36 Hmcn1
F1S4_160108_001_B01  GABAergic    Lamp5          Lamp5 Lsp1
F1S4_160108_001_C01  GABAergic    Lamp5          Lamp5 Lsp1
F1S4_160108_001_D01  GABAergic      Vip  Vip Crispld2 Htr2c
F1S4_160108_001_E01  GABAergic    Lamp5   Lamp5 Plch2 Dock5


In [4]:
# Gene names are the column labels of the expression matrix; show them and their count.
genes = data_df.columns.values
print(genes, len(genes), sep="\n")

['0610005C13Rik' '0610006L08Rik' '0610007P14Rik' ... 'a' 'l7Rn6'
 'n-R5s136']
42625


In [5]:
def _print_counts(counts):
    """Print one '<name>: <count>' line for every entry of a value_counts Series."""
    for name, n_cells in counts.items():
        print(f"{name}: {n_cells}")


# Number of cells per major cell type (the 'Class' column).
cell_type_counts = label_df['Class'].value_counts()
_print_counts(cell_type_counts)
print("\n")

# Number of cells per cell subtype (the 'Subclass' column).
cell_subtype_counts = label_df['Subclass'].value_counts()
_print_counts(cell_subtype_counts)

Glutamatergic: 7155
GABAergic: 5643
Non-Neuronal: 28
Endothelial: 6


L6 IT: 1848
Sst: 1600
Vip: 1554
L4: 1348
Pvalb: 1266
Lamp5: 1067
L2/3 IT: 973
L6 CT: 953
L5 IT: 836
L5 PT: 539
L6b: 343
NP: 314
Sncg: 123
Serpinf1: 25
Astro: 11
VLMC: 11
Meis2: 8
Oligo: 5
Endo: 5
Macrophage: 1
SMC: 1
CR: 1


In [6]:
# Keep only the major cell types ('Class') that are represented by at least
# 10 cells; rarer classes are dropped.
selected_cell_types = cell_type_counts[cell_type_counts >= 10].index
keep_mask = label_df['Class'].isin(selected_cell_types)

# Apply the same row selection to the expression matrix (via the cell IDs of
# the retained annotation rows) and to the annotation table itself.
data_df_selected = data_df.loc[label_df.loc[keep_mask].index]
label_df = label_df.loc[keep_mask]

# Show the shape and the first five rows of each filtered table:
# expression data first, then labels.
for filtered_df in (data_df_selected, label_df):
    print(filtered_df.shape)
    print(filtered_df.iloc[:5, :])

(12826, 42625)
                     0610005C13Rik  0610006L08Rik  0610007P14Rik  \
F1S4_160108_001_A01              0              0             79   
F1S4_160108_001_B01              0              0            123   
F1S4_160108_001_C01              0              0             89   
F1S4_160108_001_D01              0              0            115   
F1S4_160108_001_E01              0              0            402   

                     0610009B22Rik  0610009E02Rik  0610009L18Rik  \
F1S4_160108_001_A01            145              1             46   
F1S4_160108_001_B01            178             26             23   
F1S4_160108_001_C01             69              0             13   
F1S4_160108_001_D01              0              0              0   
F1S4_160108_001_E01            208              0             42   

                     0610009O20Rik  0610010B08Rik  0610010F05Rik  \
F1S4_160108_001_A01            123              0              0   
F1S4_160108_001_B01            

In [7]:
# For every gene, count the cells in which it is expressed.
# A gene counts as expressed in a cell when its value there is greater than 0.
genes_expressed_in_cells = data_df_selected.gt(0).sum(axis=0)

# Report how many genes are expressed in fewer than 10 cells.
is_rare_gene = genes_expressed_in_cells < 10
print(int(is_rare_gene.sum()))

# The remaining genes (expressed in at least 10 cells) are the ones to keep.
genes_to_keep = genes_expressed_in_cells.loc[~is_rare_gene].index

# Restrict the matrix to the kept genes and store every value as float32.
data_df_filtered = data_df_selected.loc[:, genes_to_keep].astype('float32')

# Shape, first five rows, and the dtypes of the leading columns.
print(data_df_filtered.shape)
print(data_df_filtered.iloc[:5, :])
print(data_df_filtered.dtypes.head())

5226
(12826, 37399)
                     0610005C13Rik  0610006L08Rik  0610007P14Rik  \
F1S4_160108_001_A01            0.0            0.0           79.0   
F1S4_160108_001_B01            0.0            0.0          123.0   
F1S4_160108_001_C01            0.0            0.0           89.0   
F1S4_160108_001_D01            0.0            0.0          115.0   
F1S4_160108_001_E01            0.0            0.0          402.0   

                     0610009B22Rik  0610009E02Rik  0610009L18Rik  \
F1S4_160108_001_A01          145.0            1.0           46.0   
F1S4_160108_001_B01          178.0           26.0           23.0   
F1S4_160108_001_C01           69.0            0.0           13.0   
F1S4_160108_001_D01            0.0            0.0            0.0   
F1S4_160108_001_E01          208.0            0.0           42.0   

                     0610009O20Rik  0610010B08Rik  0610010F05Rik  \
F1S4_160108_001_A01          123.0            0.0            0.0   
F1S4_160108_001_B01       

In [8]:
# Sorted list of the distinct class names; a cell's numeric label is the
# position of its class name in this list.
class_values = label_df['Class'].values
str_labels = np.unique(class_values).tolist()
class_position = {name: pos for pos, name in enumerate(str_labels)}
label = [class_position[name] for name in class_values]

# Map the numeric labels back to their class names.
original_labels = [str_labels[code] for code in label]

# Cell identifiers and gene names of the filtered expression matrix.
barcode = data_df_filtered.index.values
gene_symbol = data_df_filtered.columns.values

# Inspect type and content of the label vocabulary, then type, content and
# length of each per-cell / per-gene collection.
print(type(str_labels))
print(str_labels)
for inspected in (label, barcode, gene_symbol):
    print(type(inspected))
    print(inspected)
    print(len(inspected))

<class 'list'>
['GABAergic', 'Glutamatergic', 'Non-Neuronal']
<class 'list'>
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0

In [9]:
# Lookup table from cell type name to its numeric index, following the order
# of str_labels.
label_to_index = dict(zip(str_labels, range(len(str_labels))))

# Show every "name: index" pair of the lookup table.
for class_name, class_idx in label_to_index.items():
    print(f"{class_name}: {class_idx}")

GABAergic: 0
Glutamatergic: 1
Non-Neuronal: 2


In [10]:
# Consistency check: number of genes, number of cells, number of labels, and
# the shape of the filtered annotation table, one value per line.
print(len(gene_symbol), len(barcode), len(label), label_df.shape, sep="\n")

37399
12826
12826
(12826, 3)


In [11]:
# Show the class names recovered from the numeric labels, followed by how many there are.
print(original_labels, len(original_labels), sep="\n")

['GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic', 'GABAergic'

In [12]:
# Final summary of the preprocessed dataset: caption followed by its value.
summary_lines = (
    ('shape of expression matrix [#cells,#genes]:', data_df_filtered.shape),
    ('shape of cell labels:', len(label)),
    ('number of cell types:', len(str_labels)),
)
for caption, value in summary_lines:
    print(caption, value)

shape of expression matrix [#cells,#genes]: (12826, 37399)
shape of cell labels: 12826
number of cell types: 3


In [13]:
# Save the preprocessed dataset as a single .npz archive with these entries:
#   'gene_symbol' - gene names (columns of the filtered expression matrix)
#   'count'       - filtered expression matrix as a plain array (rows = cells)
#   'str_labels'  - list of distinct class names
#   'label'       - per-cell numeric class index into 'str_labels'
#   'barcode'     - cell identifiers (rows of the filtered expression matrix)
data_dict = {
    'gene_symbol': gene_symbol,
    'count': data_df_filtered.values,
    'str_labels': str_labels,
    'label': label,
    'barcode': barcode,
}
save_file = '../../dataset/pre_data/scRNAseq_datasets/AMB.npz'

# np.savez does not create missing parent directories and would raise
# FileNotFoundError on a fresh checkout, so make sure the folder exists first.
os.makedirs(os.path.dirname(save_file), exist_ok=True)
np.savez(save_file, **data_dict)

print('Finished.')

Finished.
