
| Dataset | Tissue | Cell types | Cells | Genes | Protocol | Accession ID |
|---------|----------------|-------------|----------|----------|----------|-------------|
| Muraro | Human pancreas | 9 | 2122 | 18915 | CEL-Seq2 | GSE85241 |

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

# Read the filtered Muraro human-pancreas expression table. The first CSV
# column is used as the row index, so each row is addressed by its cell ID.
data_df = pd.read_csv(
    '../../data/scRNAseq_Benchmark_datasets/Muraro/Filtered_Muraro_HumanPancreas_data.csv',
    index_col=0,
)
# Quick look at what was loaded: the top-left 5x5 corner, the overall
# (rows, columns) shape, and the name of the first column.
for preview in (data_df.iloc[:5, :5], data_df.shape, data_df.columns[0]):
    print(preview)

          A1BG__chr19  A1CF__chr10  A2M-AS1__chr12  A2ML1__chr12  A2M__chr12
D28.1_1      0.000000     6.071431             0.0           0.0    0.000000
D28.1_2      0.000000     0.000000             0.0           0.0    5.049473
D28.1_3      1.001958     2.007853             0.0           0.0    0.000000
D28.1_4      1.001958     6.071431             0.0           0.0    0.000000
D28.1_13     0.000000     0.000000             0.0           0.0    0.000000
(2122, 18915)
A1BG__chr19


In [3]:
# Load the cell type annotations (first CSV row is the header).
raw_labels = pd.read_csv(
    '../../data/scRNAseq_Benchmark_datasets/Muraro/Labels.csv',
    header=0,
)
print(raw_labels.head(5))
print(raw_labels.shape)

# Attach the expression table's row index to the annotations. The assignment
# is purely positional: row i of Labels.csv is given the ID of row i of
# data_df, so both files are assumed to list cells in the same order.
label_df = raw_labels.set_index(data_df.index)
print(label_df.head(5))

             x
0        alpha
1  endothelial
2        delta
3         beta
4         duct
(2122, 1)
                    x
D28.1_1         alpha
D28.1_2   endothelial
D28.1_3         delta
D28.1_4          beta
D28.1_13         duct


In [5]:
# Tally how many cells carry each annotated cell type; value_counts() orders
# the result from the most to the least frequent type.
cell_type_counts = label_df['x'].value_counts()
# Walk type names and their counts side by side, reporting one per line.
for cell_type, count in zip(cell_type_counts.index, cell_type_counts.tolist()):
    print(f"{cell_type}: {count}")


alpha: 812
beta: 448
duct: 245
acinar: 219
delta: 193
pp: 101
mesenchymal: 80
endothelial: 21
epsilon: 3


In [6]:
# Count the cells belonging to each cell type.
cell_type_counts = label_df['x'].value_counts()
# Cell types represented by at least 10 cells are kept; rarer ones are dropped.
selected_cell_types = cell_type_counts.loc[cell_type_counts.ge(10)].index

# Row mask over the annotations: True where the cell's type survived the cut.
keep_rows = label_df['x'].isin(selected_cell_types)
# NOTE: label_df is rebound to its filtered version here, so cells that read
# label_df see different contents before and after this cell has run.
label_df = label_df.loc[keep_rows]
# Pull the matching rows out of the expression table by cell ID.
data_df_selected = data_df.loc[label_df.index]

print(data_df_selected.shape)
print(data_df_selected.head(5))
print(label_df.shape)
print(label_df.head(5))


(2119, 18915)
          A1BG__chr19  A1CF__chr10  A2M-AS1__chr12  A2ML1__chr12  A2M__chr12  \
D28.1_1      0.000000     6.071431             0.0           0.0    0.000000   
D28.1_2      0.000000     0.000000             0.0           0.0    5.049473   
D28.1_3      1.001958     2.007853             0.0           0.0    0.000000   
D28.1_4      1.001958     6.071431             0.0           0.0    0.000000   
D28.1_13     0.000000     0.000000             0.0           0.0    0.000000   

          A4GALT__chr22  A4GNT__chr3  AAAS__chr12  AACSP1__chr5  AACS__chr12  \
D28.1_1        0.000000          0.0     1.001958           0.0     1.001958   
D28.1_2        1.001958          0.0     0.000000           0.0     1.001958   
D28.1_3        0.000000          0.0     0.000000           0.0     1.001958   
D28.1_4        0.000000          0.0     1.001958           0.0     4.031579   
D28.1_13       0.000000          0.0     0.000000           0.0     0.000000   

          ...  ZWILCH__c

In [7]:
# Recompute the per-type cell tally from the current contents of label_df
# (most frequent type first).
cell_type_counts = label_df['x'].value_counts()
# Report each cell type together with its number of cells.
for cell_type, count in zip(cell_type_counts.index, cell_type_counts.tolist()):
    print(f"{cell_type}: {count}")

alpha: 812
beta: 448
duct: 245
acinar: 219
delta: 193
pp: 101
mesenchymal: 80
endothelial: 21


In [8]:
# For every gene (column), count the cells in which it is expressed.
# A value strictly greater than 0 counts as "expressed".
genes_expressed_in_cells = data_df_selected.gt(0).sum(axis=0)

# Flag genes detected in fewer than 10 cells and report how many there are.
rarely_expressed = genes_expressed_in_cells < 10
print(int(rarely_expressed.sum()))

# Everything not flagged (i.e. expressed in at least 10 cells) is retained.
genes_to_keep = genes_expressed_in_cells.loc[~rarely_expressed].index

# Restrict the expression table to the retained gene columns.
data_df_filtered = data_df_selected.loc[:, genes_to_keep]

# Shape of the table after gene filtering.
print(data_df_filtered.shape)

# Store every value as 32-bit float, then preview the first rows and dtypes.
data_df_filtered = data_df_filtered.astype('float32')
print(data_df_filtered.head(5))
print(data_df_filtered.dtypes.head())


2798
(2119, 16117)
          A1BG__chr19  A1CF__chr10  A2M-AS1__chr12  A2ML1__chr12  A2M__chr12  \
D28.1_1      0.000000     6.071431             0.0           0.0    0.000000   
D28.1_2      0.000000     0.000000             0.0           0.0    5.049473   
D28.1_3      1.001958     2.007854             0.0           0.0    0.000000   
D28.1_4      1.001958     6.071431             0.0           0.0    0.000000   
D28.1_13     0.000000     0.000000             0.0           0.0    0.000000   

          A4GALT__chr22  AAAS__chr12  AACS__chr12  AADAC__chr3  AADAT__chr4  \
D28.1_1        0.000000     1.001958     1.001958          0.0          0.0   
D28.1_2        1.001958     0.000000     1.001958          0.0          0.0   
D28.1_3        0.000000     0.000000     1.001958          0.0          0.0   
D28.1_4        0.000000     1.001958     4.031579          0.0          0.0   
D28.1_13       0.000000     0.000000     0.000000          0.0          0.0   

          ...  ZWILCH__ch

In [10]:
# Encode the string cell type annotations as integer class labels.
#
# Work on the 1-D 'x' column rather than `label_df.values`: the latter is a
# 2-D (n_cells, 1) array, so iterating it yields one-element arrays and the
# lookup would only succeed through the truthiness of size-1 array comparisons.
cell_types = label_df['x']

# Sorted list of the distinct cell type names; a type's position in this list
# is its numeric label.
str_labels = np.unique(cell_types.values).tolist()

# Name -> code lookup table, so each cell is encoded with a hash lookup
# instead of a linear `list.index` scan.
type_to_code = {name: code for code, name in enumerate(str_labels)}
label = [type_to_code[name] for name in cell_types]

# Convert numeric labels back to string labels
original_labels = [str_labels[code] for code in label]

# Cell IDs (rows) and gene names (columns) of the filtered expression table.
barcode = data_df_filtered.index.values
gene_symbol = data_df_filtered.columns.values

# label[i] is later paired with row i of the expression matrix, so the
# annotations and the matrix must list the same cells in the same order.
assert label_df.index.equals(data_df_filtered.index), \
    "label_df and data_df_filtered rows are not aligned"

print(type(str_labels))
print(str_labels)
print(type(label))
print(label)
print(len(label))
print(type(barcode))
print(barcode)
print(len(barcode))
print(type(gene_symbol))
print(gene_symbol)
print(len(gene_symbol))

<class 'list'>
['acinar', 'alpha', 'beta', 'delta', 'duct', 'endothelial', 'mesenchymal', 'pp']
<class 'list'>
[1, 5, 3, 2, 4, 1, 1, 5, 1, 1, 0, 0, 1, 1, 1, 2, 2, 0, 1, 4, 0, 3, 0, 1, 2, 4, 0, 0, 0, 1, 1, 7, 4, 4, 7, 0, 0, 1, 3, 4, 4, 0, 1, 0, 1, 4, 0, 4, 4, 0, 4, 1, 4, 1, 2, 0, 4, 6, 0, 1, 0, 1, 1, 0, 0, 7, 4, 1, 4, 1, 2, 0, 0, 5, 2, 4, 1, 0, 0, 0, 1, 7, 1, 0, 1, 2, 1, 1, 1, 7, 1, 3, 4, 7, 0, 0, 2, 0, 4, 2, 0, 2, 1, 0, 2, 1, 5, 0, 4, 0, 0, 0, 4, 4, 1, 3, 0, 4, 1, 0, 1, 1, 4, 0, 1, 0, 3, 1, 0, 4, 4, 1, 0, 0, 1, 4, 3, 0, 1, 4, 1, 0, 4, 7, 1, 4, 1, 1, 2, 1, 7, 1, 4, 1, 4, 0, 4, 0, 4, 0, 0, 4, 1, 0, 1, 0, 1, 0, 4, 4, 4, 1, 2, 4, 0, 0, 1, 1, 0, 7, 4, 1, 5, 3, 1, 1, 1, 3, 1, 7, 1, 7, 3, 1, 1, 1, 1, 2, 2, 1, 1, 1, 7, 1, 3, 1, 7, 2, 1, 1, 2, 2, 1, 3, 1, 3, 1, 1, 2, 3, 1, 1, 2, 2, 4, 1, 3, 3, 7, 1, 1, 6, 1, 4, 7, 7, 0, 1, 2, 7, 7, 7, 3, 2, 1, 2, 2, 2, 1, 2, 2, 3, 2, 7, 1, 2, 1, 7, 2, 3, 2, 6, 1, 3, 2, 1, 4, 1, 3, 3, 2, 4, 1, 1, 3, 2, 7, 2, 1, 2, 1, 2, 2, 1, 2, 1, 1, 3, 1, 1, 1, 1, 1, 7, 7, 1, 

In [11]:
# Pair every cell type name with its position in str_labels, giving a
# name -> numeric index dictionary.
label_to_index = dict(zip(str_labels, range(len(str_labels))))
# List the mapping, one "name: index" entry per line.
for type_name in label_to_index:
    print(f"{type_name}: {label_to_index[type_name]}")

acinar: 0
alpha: 1
beta: 2
delta: 3
duct: 4
endothelial: 5
mesenchymal: 6
pp: 7


In [12]:
# Number of genes, then number of cells, each on its own line.
print(len(gene_symbol), len(barcode), sep="\n")
# String cell type of every cell, recovered from the numeric labels.
print(original_labels)

16117
2119
['alpha', 'endothelial', 'delta', 'beta', 'duct', 'alpha', 'alpha', 'endothelial', 'alpha', 'alpha', 'acinar', 'acinar', 'alpha', 'alpha', 'alpha', 'beta', 'beta', 'acinar', 'alpha', 'duct', 'acinar', 'delta', 'acinar', 'alpha', 'beta', 'duct', 'acinar', 'acinar', 'acinar', 'alpha', 'alpha', 'pp', 'duct', 'duct', 'pp', 'acinar', 'acinar', 'alpha', 'delta', 'duct', 'duct', 'acinar', 'alpha', 'acinar', 'alpha', 'duct', 'acinar', 'duct', 'duct', 'acinar', 'duct', 'alpha', 'duct', 'alpha', 'beta', 'acinar', 'duct', 'mesenchymal', 'acinar', 'alpha', 'acinar', 'alpha', 'alpha', 'acinar', 'acinar', 'pp', 'duct', 'alpha', 'duct', 'alpha', 'beta', 'acinar', 'acinar', 'endothelial', 'beta', 'duct', 'alpha', 'acinar', 'acinar', 'acinar', 'alpha', 'pp', 'alpha', 'acinar', 'alpha', 'beta', 'alpha', 'alpha', 'alpha', 'pp', 'alpha', 'delta', 'duct', 'pp', 'acinar', 'acinar', 'beta', 'acinar', 'duct', 'beta', 'acinar', 'beta', 'alpha', 'acinar', 'beta', 'alpha', 'endothelial', 'acinar', 'du

In [13]:
# Final summary of the processed dataset: matrix dimensions, number of
# per-cell labels, and number of distinct cell types.
for caption, value in (
    ('shape of expression matrix [#cells,#genes]:', data_df_filtered.shape),
    ('shape of cell labels:', len(label)),
    ('number of cell types:', len(str_labels)),
):
    print(caption, value)

shape of expression matrix [#cells,#genes]: (2119, 16117)
shape of cell labels: 2119
number of cell types: 8


In [14]:
# Save the data
# Bundle the processed expression matrix and its metadata into one .npz file.
data_dict = {}

# gene_symbol and barcode come from pandas `Index.values`; for string labels
# that is presumably an object-dtype array, which np.savez can only store by
# pickling (np.load then refuses it unless allow_pickle=True). Storing them as
# fixed-width unicode arrays keeps the archive loadable without pickle.
data_dict['gene_symbol'] = np.asarray(gene_symbol, dtype=str)
data_dict['count'] = data_df_filtered.values
data_dict['str_labels'] = str_labels
data_dict['label'] = label
data_dict['barcode'] = np.asarray(barcode, dtype=str)
save_file = '../../dataset/pre_data/scRNAseq_datasets/Muraro.npz'
# np.savez does not create missing directories; make sure the target exists.
Path(save_file).parent.mkdir(parents=True, exist_ok=True)
np.savez(save_file, **data_dict)

print('Finished.')

Finished.
