|Data set |      Tissue    |cell type num| cell_num | gene_num | Protocol | Accession ID|
|---------|----------------|-------------|----------|----------|----------|-------------|
|Baron (Mouse) | Mouse pancreas | 13 | 1886 | 14861 | inDrop | GSE84133 |

In [1]:
import numpy as np
import pandas as pd

# Load the Filtered_MousePancreas_data.csv file, with the first column as the index representing unique cell identifiers.
data_df = pd.read_csv('../../data/scRNAseq_Benchmark_datasets/Baron Mouse/Filtered_MousePancreas_data.csv', index_col=0)

# Display the first 5 rows and 5 columns of the dataset
print(data_df.iloc[:5, :5])

# Print the shape of the dataset
print(data_df.shape)

# Print the name of the first column in the dataset
print(data_df.columns[0])

                             X0610007P14Rik  X0610009B22Rik  X0610009E02Rik  \
mouse1_lib1.final_cell_0001               0               0               0   
mouse1_lib1.final_cell_0002               2               0               0   
mouse1_lib1.final_cell_0003               0               0               1   
mouse1_lib1.final_cell_0004               0               0               0   
mouse1_lib1.final_cell_0005               1               1               0   

                             X0610009L18Rik  X0610009O20Rik  
mouse1_lib1.final_cell_0001               0               0  
mouse1_lib1.final_cell_0002               0               0  
mouse1_lib1.final_cell_0003               0               0  
mouse1_lib1.final_cell_0004               0               1  
mouse1_lib1.final_cell_0005               0               0  
(1886, 14861)
X0610007P14Rik


In [4]:
# Load cell type annotation data
label_df = pd.read_csv('../../data/scRNAseq_Benchmark_datasets/Baron Mouse/Labels.csv', header=0)
print(label_df.iloc[:5,:])
print(label_df.shape)
label_df = label_df.set_index(data_df.index)
print(label_df.iloc[:5,:])

         x
0     beta
1   ductal
2    delta
3  schwann
4    delta
(1886, 1)
                                   x
mouse1_lib1.final_cell_0001     beta
mouse1_lib1.final_cell_0002   ductal
mouse1_lib1.final_cell_0003    delta
mouse1_lib1.final_cell_0004  schwann
mouse1_lib1.final_cell_0005    delta


In [5]:
# Calculate the number of cells for each cell type
cell_type_counts = label_df['x'].value_counts()

# Print the cell type and its corresponding cell count
for cell_type, count in cell_type_counts.items():
    print(f"{cell_type}: {count}")

beta: 894
ductal: 275
delta: 218
alpha: 191
endothelial: 139
quiescent_stellate: 47
gamma: 41
macrophage: 36
activated_stellate: 14
B_cell: 10
immune_other: 8
T_cell: 7
schwann: 6


In [6]:
# Calculate the number of cells for each cell type
cell_type_counts = label_df['x'].value_counts()

# Filter out cell types with at least 10 cells
selected_cell_types = cell_type_counts[cell_type_counts >= 10].index

# Retain corresponding cell data in the gene expression dataset and apply the same filter to the label dataset
data_df_selected = data_df.loc[label_df[label_df['x'].isin(selected_cell_types)].index]
label_df = label_df.loc[label_df['x'].isin(selected_cell_types)]

# Print the shape and a preview of the filtered gene expression data
print(data_df_selected.shape)
print(data_df_selected.iloc[:5, :])

# Print the shape and a preview of the filtered label data
print(label_df.shape)
print(label_df.iloc[:5, :])


(1865, 14861)
                             X0610007P14Rik  X0610009B22Rik  X0610009E02Rik  \
mouse1_lib1.final_cell_0001               0               0               0   
mouse1_lib1.final_cell_0002               2               0               0   
mouse1_lib1.final_cell_0003               0               0               1   
mouse1_lib1.final_cell_0005               1               1               0   
mouse1_lib1.final_cell_0006               0               0               0   

                             X0610009L18Rik  X0610009O20Rik  X0610010F05Rik  \
mouse1_lib1.final_cell_0001               0               0               2   
mouse1_lib1.final_cell_0002               0               0               1   
mouse1_lib1.final_cell_0003               0               0               0   
mouse1_lib1.final_cell_0005               0               0               0   
mouse1_lib1.final_cell_0006               0               1               0   

                             X061001

In [7]:
for cell_type, count in cell_type_counts.items():
    print(f"{cell_type}: {count}")

beta: 894
ductal: 275
delta: 218
alpha: 191
endothelial: 139
quiescent_stellate: 47
gamma: 41
macrophage: 36
activated_stellate: 14
B_cell: 10
immune_other: 8
T_cell: 7
schwann: 6


In [8]:
# Calculate the number of cells in which each gene is expressed
# Assume that a gene is expressed in a cell if its expression value is greater than 0
genes_expressed_in_cells = (data_df_selected > 0).sum(axis=0)

# Print the number of cells expressing each gene
print(genes_expressed_in_cells)

# Print the count of genes expressed in fewer than 10 cells
print(len(genes_expressed_in_cells[genes_expressed_in_cells < 10]))

# Select genes expressed in at least 10 cells
genes_to_keep = genes_expressed_in_cells[genes_expressed_in_cells >= 10].index

# Retain only the selected genes in the dataset
data_df_filtered = data_df_selected[genes_to_keep]

# Display the shape and a preview of the filtered dataset
print(data_df_filtered.shape)
print(data_df_filtered.iloc[:5, :])

# Display the data types of the first few columns in the filtered dataset
print(data_df_filtered.dtypes.head())

X0610007P14Rik    193
X0610009B22Rik    255
X0610009E02Rik     74
X0610009L18Rik     48
X0610009O20Rik    116
                 ... 
Zyg11b            385
Zyx               265
Zzef1             353
Zzz3              341
l7Rn6             219
Length: 14861, dtype: int64
2353
(1865, 12508)
                             X0610007P14Rik  X0610009B22Rik  X0610009E02Rik  \
mouse1_lib1.final_cell_0001               0               0               0   
mouse1_lib1.final_cell_0002               2               0               0   
mouse1_lib1.final_cell_0003               0               0               1   
mouse1_lib1.final_cell_0005               1               1               0   
mouse1_lib1.final_cell_0006               0               0               0   

                             X0610009L18Rik  X0610009O20Rik  X0610010F05Rik  \
mouse1_lib1.final_cell_0001               0               0               2   
mouse1_lib1.final_cell_0002               0               0               1   

In [9]:
gene_symbol = data_df_filtered.columns.values
print(gene_symbol)
print(len(gene_symbol))

['X0610007P14Rik' 'X0610009B22Rik' 'X0610009E02Rik' ... 'Zzef1' 'Zzz3'
 'l7Rn6']
12508


In [10]:
# Calculate the number of cells for each cell type
cell_type_counts = label_df['x'].value_counts()

# Print the cell type and its corresponding cell count
for cell_type, count in cell_type_counts.items():
    print(f"{cell_type}: {count}")


beta: 894
ductal: 275
delta: 218
alpha: 191
endothelial: 139
quiescent_stellate: 47
gamma: 41
macrophage: 36
activated_stellate: 14
B_cell: 10


In [11]:
# Convert all data types in data_df_filtered to float32
data_df_filtered = data_df_filtered.astype('float32')
print(data_df_filtered.iloc[:5, :])
print(data_df_filtered.dtypes.head())

                             X0610007P14Rik  X0610009B22Rik  X0610009E02Rik  \
mouse1_lib1.final_cell_0001             0.0             0.0             0.0   
mouse1_lib1.final_cell_0002             2.0             0.0             0.0   
mouse1_lib1.final_cell_0003             0.0             0.0             1.0   
mouse1_lib1.final_cell_0005             1.0             1.0             0.0   
mouse1_lib1.final_cell_0006             0.0             0.0             0.0   

                             X0610009L18Rik  X0610009O20Rik  X0610010F05Rik  \
mouse1_lib1.final_cell_0001             0.0             0.0             2.0   
mouse1_lib1.final_cell_0002             0.0             0.0             1.0   
mouse1_lib1.final_cell_0003             0.0             0.0             0.0   
mouse1_lib1.final_cell_0005             0.0             0.0             0.0   
mouse1_lib1.final_cell_0006             0.0             1.0             0.0   

                             X0610010K14Rik  X0610

In [12]:
str_labels = np.unique(label_df.values).tolist()
label = [str_labels.index(x) for x in label_df.values]

# Convert numeric labels back to string labels
original_labels = [str_labels[x] for x in label]
barcode = data_df_filtered.index.values
gene_symbol = data_df_filtered.columns.values
print(type(str_labels))
print(str_labels)
print(type(label))
print(label)
print(type(barcode))
print(barcode)
print(type(gene_symbol))
print(gene_symbol)
print(len(gene_symbol))

<class 'list'>
['B_cell', 'activated_stellate', 'alpha', 'beta', 'delta', 'ductal', 'endothelial', 'gamma', 'macrophage', 'quiescent_stellate']
<class 'list'>
[3, 5, 4, 4, 3, 3, 3, 3, 5, 5, 3, 3, 4, 3, 5, 3, 9, 5, 5, 3, 3, 3, 4, 9, 6, 5, 5, 3, 5, 5, 5, 7, 3, 3, 5, 5, 5, 4, 3, 3, 5, 4, 6, 3, 3, 2, 5, 3, 5, 7, 3, 5, 5, 5, 3, 5, 6, 5, 3, 3, 5, 5, 6, 3, 3, 5, 3, 6, 5, 3, 3, 6, 3, 3, 5, 3, 5, 9, 3, 5, 5, 4, 2, 5, 3, 5, 5, 3, 5, 4, 6, 3, 3, 5, 4, 3, 5, 5, 6, 3, 6, 3, 5, 9, 3, 9, 3, 3, 9, 4, 8, 3, 3, 6, 8, 3, 3, 3, 6, 3, 4, 3, 8, 3, 9, 5, 3, 3, 3, 3, 3, 5, 3, 5, 4, 3, 3, 2, 5, 3, 3, 3, 3, 6, 4, 3, 3, 3, 3, 5, 3, 5, 3, 5, 4, 6, 3, 5, 8, 4, 3, 3, 5, 4, 7, 3, 3, 3, 3, 3, 3, 3, 5, 5, 3, 5, 3, 3, 3, 6, 5, 5, 5, 3, 6, 6, 7, 3, 3, 5, 5, 5, 9, 3, 3, 3, 3, 5, 5, 5, 6, 6, 4, 3, 9, 6, 3, 5, 3, 4, 3, 3, 5, 3, 4, 3, 3, 4, 5, 1, 3, 3, 4, 4, 5, 7, 3, 3, 7, 5, 6, 9, 6, 5, 9, 5, 5, 3, 3, 4, 3, 5, 4, 4, 3, 3, 5, 5, 6, 3, 9, 4, 5, 5, 3, 5, 3, 5, 5, 5, 0, 5, 5, 5, 3, 5, 3, 5, 5, 5, 5, 6, 6, 5, 3, 4, 2, 3, 3, 5, 

In [13]:
# Create a mapping dictionary from cell type labels to numeric indices
label_to_index = {label1: i for i, label1 in enumerate(str_labels)}

# Print the mapping of cell type labels to their numeric indices
for label1, index in label_to_index.items():
    print(f"{label1}: {index}")


B_cell: 0
activated_stellate: 1
alpha: 2
beta: 3
delta: 4
ductal: 5
endothelial: 6
gamma: 7
macrophage: 8
quiescent_stellate: 9


In [14]:
print(len(gene_symbol))
print(len(barcode))
print(len(label))
print(label_df.shape)

12508
1865
1865
(1865, 1)


In [15]:
print(original_labels)

['beta', 'ductal', 'delta', 'delta', 'beta', 'beta', 'beta', 'beta', 'ductal', 'ductal', 'beta', 'beta', 'delta', 'beta', 'ductal', 'beta', 'quiescent_stellate', 'ductal', 'ductal', 'beta', 'beta', 'beta', 'delta', 'quiescent_stellate', 'endothelial', 'ductal', 'ductal', 'beta', 'ductal', 'ductal', 'ductal', 'gamma', 'beta', 'beta', 'ductal', 'ductal', 'ductal', 'delta', 'beta', 'beta', 'ductal', 'delta', 'endothelial', 'beta', 'beta', 'alpha', 'ductal', 'beta', 'ductal', 'gamma', 'beta', 'ductal', 'ductal', 'ductal', 'beta', 'ductal', 'endothelial', 'ductal', 'beta', 'beta', 'ductal', 'ductal', 'endothelial', 'beta', 'beta', 'ductal', 'beta', 'endothelial', 'ductal', 'beta', 'beta', 'endothelial', 'beta', 'beta', 'ductal', 'beta', 'ductal', 'quiescent_stellate', 'beta', 'ductal', 'ductal', 'delta', 'alpha', 'ductal', 'beta', 'ductal', 'ductal', 'beta', 'ductal', 'delta', 'endothelial', 'beta', 'beta', 'ductal', 'delta', 'beta', 'ductal', 'ductal', 'endothelial', 'beta', 'endothelial',

In [16]:
print('shape of expression matrix [#cells,#genes]:', data_df_filtered.shape)
print('shape of cell labels:', len(label))
print('number of cell types:', len(str_labels))

shape of expression matrix [#cells,#genes]: (1865, 12508)
shape of cell labels: 1865
number of cell types: 10


In [17]:
# Save the data
data_dict = {}
data_dict['gene_symbol'] = gene_symbol
data_dict['count'] = data_df_filtered.values
data_dict['str_labels'] = str_labels
data_dict['label'] = label
data_dict['barcode'] = barcode
save_file = '../../dataset/pre_data/scRNAseq_datasets/Baron_Mouse.npz'
np.savez(save_file, **data_dict)

print('Finished.')

Finished.
