
|Dataset | Tissue | Number of cell types | Number of cells | Number of genes | Protocol | Accession ID|
|---------|----------------|-------------|----------|----------|----------|-------------|
|Zheng 68K | Human PBMC | 11 | 65943 | 20387 | 10X Genomics | SRP073767 |

In [1]:
import os

import numpy as np
import pandas as pd

# Read "Filtered_68K_PBMC_data.csv"; its first column becomes the row index
# (the per-cell identifiers).
expression_csv = '../../data/scRNAseq_Benchmark_datasets/Zheng 68K/Filtered_68K_PBMC_data.csv'
data_df = pd.read_csv(expression_csv, index_col=0)

# Sanity check: top-left 5x5 corner, overall shape, and the first column label.
top_left_corner = data_df.iloc[0:5, 0:5]
print(top_left_corner)
print(data_df.shape)
first_column_name = data_df.columns[0]
print(first_column_name)

                  AL627309.1  AP006222.2  RP11-206L10.3  RP11-206L10.2  \
AAACATACACCCAA-1           0           0              0              0   
AAACATACCCCTCA-1           0           0              0              0   
AAACATACCGGAGA-1           0           0              0              0   
AAACATACTAACCG-1           0           0              0              0   
AAACATACTCTTCA-1           0           0              0              0   

                  RP11-206L10.9  
AAACATACACCCAA-1              0  
AAACATACCCCTCA-1              0  
AAACATACCGGAGA-1              0  
AAACATACTAACCG-1              0  
AAACATACTCTTCA-1              0  
(65943, 20387)
AL627309.1


In [2]:
# Read the cell type annotation table (the first CSV row is the header).
labels_csv = '../../data/scRNAseq_Benchmark_datasets/Zheng 68K/Labels.csv'
label_df = pd.read_csv(labels_csv, header=0)
print(label_df.head(5))
print(label_df.shape)

# Give the annotations the same row index as the expression matrix.
# Rows are paired purely by position, so both files must list cells in the same order.
label_df = label_df.set_index(data_df.index)
print(label_df.head(5))

                              x
0              CD8+ Cytotoxic T
1  CD8+/CD45RA+ Naive Cytotoxic
2           CD4+/CD45RO+ Memory
3                       CD19+ B
4               CD4+/CD25 T Reg
(65943, 1)
                                             x
AAACATACACCCAA-1              CD8+ Cytotoxic T
AAACATACCCCTCA-1  CD8+/CD45RA+ Naive Cytotoxic
AAACATACCGGAGA-1           CD4+/CD45RO+ Memory
AAACATACTAACCG-1                       CD19+ B
AAACATACTCTTCA-1               CD4+/CD25 T Reg


In [5]:
# Tally the cells carrying each annotation (value_counts sorts by descending count).
cell_type_counts = label_df['x'].value_counts()
for cell_type, count in zip(cell_type_counts.index, cell_type_counts.tolist()):
    print(f"{cell_type}: {count}")

CD8+ Cytotoxic T: 20307
CD8+/CD45RA+ Naive Cytotoxic: 16361
CD56+ NK: 8522
CD4+/CD25 T Reg: 6116
CD19+ B: 5579
CD4+/CD45RO+ Memory: 3031
Dendritic: 1946
CD14+ Monocyte: 1944
CD4+/CD45RA+/CD25- Naive T: 1857
CD34+: 188
CD4+ T Helper2: 92


In [6]:
# Per-type cell tallies over the current annotations.
cell_type_counts = label_df['x'].value_counts()

# Cell types represented by 10 or more cells.
has_enough_cells = (cell_type_counts >= 10).to_numpy()
selected_cell_types = cell_type_counts.index[has_enough_cells]

# Restrict both the expression matrix and the annotations to cells whose type
# was retained. The mask is computed once and reused for both tables.
keep_mask = label_df['x'].isin(selected_cell_types)
data_df_selected = data_df.loc[label_df.index[keep_mask.to_numpy()]]
label_df = label_df.loc[keep_mask]

# Report the shape and the first rows of each filtered table.
print(data_df_selected.shape)
print(data_df_selected.head(5))
print(label_df.shape)
print(label_df.head(5))


(65943, 20387)
                  AL627309.1  AP006222.2  RP11-206L10.3  RP11-206L10.2  \
AAACATACACCCAA-1           0           0              0              0   
AAACATACCCCTCA-1           0           0              0              0   
AAACATACCGGAGA-1           0           0              0              0   
AAACATACTAACCG-1           0           0              0              0   
AAACATACTCTTCA-1           0           0              0              0   

                  RP11-206L10.9  FAM87B  LINC00115  FAM41C  RP11-54O7.1  \
AAACATACACCCAA-1              0       0          0       0            0   
AAACATACCCCTCA-1              0       0          0       0            0   
AAACATACCGGAGA-1              0       0          0       0            0   
AAACATACTAACCG-1              0       0          0       0            0   
AAACATACTCTTCA-1              0       0          0       0            0   

                  SAMD11  ...  AC145212.1  MGC39584  AC011043.1  ZNF84.1  \
AAACATACACCCA

In [7]:
# For every gene, count the cells in which its value is above zero
# (a value greater than 0 is treated as "expressed").
genes_expressed_in_cells = data_df_selected.gt(0).sum(axis=0)

# Number of genes detected in fewer than 10 cells.
print(int((genes_expressed_in_cells < 10).sum()))

# Genes detected in 10 or more cells are kept.
detected_widely = (genes_expressed_in_cells >= 10).to_numpy()
genes_to_keep = genes_expressed_in_cells.index[detected_widely]

# Drop the remaining gene columns and report the new shape.
data_df_filtered = data_df_selected.loc[:, genes_to_keep]
print(data_df_filtered.shape)

# Cast every value to float32, then show the first rows and a few column dtypes.
data_df_filtered = data_df_filtered.astype('float32')
print(data_df_filtered.head(5))
print(data_df_filtered.dtypes.head())


5112
(65943, 15275)
                  AL627309.1  RP11-206L10.2  RP11-206L10.9  LINC00115  FAM41C  \
AAACATACACCCAA-1         0.0            0.0            0.0        0.0     0.0   
AAACATACCCCTCA-1         0.0            0.0            0.0        0.0     0.0   
AAACATACCGGAGA-1         0.0            0.0            0.0        0.0     0.0   
AAACATACTAACCG-1         0.0            0.0            0.0        0.0     0.0   
AAACATACTCTTCA-1         0.0            0.0            0.0        0.0     0.0   

                  NOC2L  KLHL17  PLEKHN1  RP11-54O7.17  HES4  ...  MT-ND6  \
AAACATACACCCAA-1    0.0     0.0      0.0           0.0   0.0  ...     0.0   
AAACATACCCCTCA-1    0.0     0.0      0.0           0.0   0.0  ...     0.0   
AAACATACCGGAGA-1    0.0     0.0      0.0           0.0   0.0  ...     0.0   
AAACATACTAACCG-1    0.0     0.0      0.0           0.0   0.0  ...     0.0   
AAACATACTCTTCA-1    0.0     0.0      0.0           0.0   0.0  ...     0.0   

                  MT-CYB  AC14

In [8]:
# Recompute the per-type cell tallies from the current label_df and list them.
cell_type_counts = label_df['x'].value_counts()
for cell_type, count in cell_type_counts.to_dict().items():
    print(f"{cell_type}: {count}")

CD8+ Cytotoxic T: 20307
CD8+/CD45RA+ Naive Cytotoxic: 16361
CD56+ NK: 8522
CD4+/CD25 T Reg: 6116
CD19+ B: 5579
CD4+/CD45RO+ Memory: 3031
Dendritic: 1946
CD14+ Monocyte: 1944
CD4+/CD45RA+/CD25- Naive T: 1857
CD34+: 188
CD4+ T Helper2: 92


In [9]:
# Sorted list of the distinct cell-type names. A name's position in this list
# is its integer class id.
str_labels = np.unique(label_df['x'].to_numpy()).tolist()

# Build the name -> id table once and look every cell up in it.
# Iterating the 'x' column yields plain strings. Iterating label_df.values
# yields one-element arrays, which list.index() can only match through the
# truthiness of a one-element comparison array, and it rescans str_labels for
# every cell.
str_label_ids = {name: idx for idx, name in enumerate(str_labels)}
label = [str_label_ids[name] for name in label_df['x']]

# Convert numeric labels back to string labels
original_labels = [str_labels[idx] for idx in label]

# Row identifiers and column names of the filtered expression matrix.
barcode = data_df_filtered.index.values
gene_symbol = data_df_filtered.columns.values

# Report the type, content and length of each derived object.
print(type(str_labels))
print(str_labels)
print(type(label))
print(label)
print(len(label))
print(type(barcode))
print(barcode)
print(len(barcode))
print(type(gene_symbol))
print(gene_symbol)
print(len(gene_symbol))

<class 'list'>
['CD14+ Monocyte', 'CD19+ B', 'CD34+', 'CD4+ T Helper2', 'CD4+/CD25 T Reg', 'CD4+/CD45RA+/CD25- Naive T', 'CD4+/CD45RO+ Memory', 'CD56+ NK', 'CD8+ Cytotoxic T', 'CD8+/CD45RA+ Naive Cytotoxic', 'Dendritic']
<class 'list'>
[8, 9, 6, 1, 4, 4, 6, 8, 8, 7, 6, 9, 4, 8, 8, 8, 6, 8, 1, 9, 3, 7, 9, 1, 8, 7, 4, 4, 9, 9, 8, 9, 5, 9, 8, 9, 8, 4, 7, 7, 8, 7, 4, 4, 2, 8, 6, 7, 6, 7, 8, 10, 7, 1, 4, 5, 1, 7, 9, 8, 9, 9, 4, 1, 4, 8, 8, 5, 1, 7, 9, 1, 4, 4, 4, 8, 8, 8, 8, 1, 7, 8, 1, 0, 6, 7, 9, 9, 9, 9, 8, 9, 4, 5, 9, 9, 9, 5, 5, 4, 8, 1, 9, 5, 8, 8, 9, 8, 9, 8, 8, 9, 9, 8, 9, 1, 9, 1, 7, 1, 6, 5, 8, 9, 1, 8, 9, 9, 4, 8, 1, 7, 7, 0, 0, 4, 1, 4, 1, 10, 8, 9, 9, 8, 1, 1, 8, 0, 9, 8, 4, 5, 9, 6, 8, 9, 9, 9, 10, 4, 9, 8, 8, 9, 8, 8, 9, 7, 1, 9, 7, 9, 8, 0, 4, 1, 8, 5, 7, 9, 9, 7, 8, 3, 8, 8, 9, 7, 9, 8, 9, 9, 9, 9, 4, 7, 9, 8, 0, 9, 7, 7, 4, 9, 8, 9, 6, 1, 7, 1, 9, 9, 9, 6, 7, 7, 9, 8, 9, 1, 8, 8, 1, 1, 8, 10, 4, 9, 7, 9, 4, 9, 9, 7, 1, 7, 10, 7, 9, 7, 9, 9, 8, 6, 5, 7, 5, 7, 8, 0, 9, 9, 6,

In [10]:
# Pair every cell-type name with its position in str_labels.
label_to_index = dict(zip(str_labels, range(len(str_labels))))
# List the resulting name -> index pairs, one per line.
for label1, index in label_to_index.items():
    print(f"{label1}: {index}")

CD14+ Monocyte: 0
CD19+ B: 1
CD34+: 2
CD4+ T Helper2: 3
CD4+/CD25 T Reg: 4
CD4+/CD45RA+/CD25- Naive T: 5
CD4+/CD45RO+ Memory: 6
CD56+ NK: 7
CD8+ Cytotoxic T: 8
CD8+/CD45RA+ Naive Cytotoxic: 9
Dendritic: 10


In [11]:
# Number of retained genes, number of retained cells, then the decoded
# per-cell string labels.
for size in (len(gene_symbol), len(barcode)):
    print(size)
print(original_labels)

15275
65943
['CD8+ Cytotoxic T', 'CD8+/CD45RA+ Naive Cytotoxic', 'CD4+/CD45RO+ Memory', 'CD19+ B', 'CD4+/CD25 T Reg', 'CD4+/CD25 T Reg', 'CD4+/CD45RO+ Memory', 'CD8+ Cytotoxic T', 'CD8+ Cytotoxic T', 'CD56+ NK', 'CD4+/CD45RO+ Memory', 'CD8+/CD45RA+ Naive Cytotoxic', 'CD4+/CD25 T Reg', 'CD8+ Cytotoxic T', 'CD8+ Cytotoxic T', 'CD8+ Cytotoxic T', 'CD4+/CD45RO+ Memory', 'CD8+ Cytotoxic T', 'CD19+ B', 'CD8+/CD45RA+ Naive Cytotoxic', 'CD4+ T Helper2', 'CD56+ NK', 'CD8+/CD45RA+ Naive Cytotoxic', 'CD19+ B', 'CD8+ Cytotoxic T', 'CD56+ NK', 'CD4+/CD25 T Reg', 'CD4+/CD25 T Reg', 'CD8+/CD45RA+ Naive Cytotoxic', 'CD8+/CD45RA+ Naive Cytotoxic', 'CD8+ Cytotoxic T', 'CD8+/CD45RA+ Naive Cytotoxic', 'CD4+/CD45RA+/CD25- Naive T', 'CD8+/CD45RA+ Naive Cytotoxic', 'CD8+ Cytotoxic T', 'CD8+/CD45RA+ Naive Cytotoxic', 'CD8+ Cytotoxic T', 'CD4+/CD25 T Reg', 'CD56+ NK', 'CD56+ NK', 'CD8+ Cytotoxic T', 'CD56+ NK', 'CD4+/CD25 T Reg', 'CD4+/CD25 T Reg', 'CD34+', 'CD8+ Cytotoxic T', 'CD4+/CD45RO+ Memory', 'CD56+ NK'

In [12]:
# Final summary of what will be saved: matrix dimensions, label count, class count.
for caption, value in (
    ('shape of expression matrix [#cells,#genes]:', data_df_filtered.shape),
    ('shape of cell labels:', len(label)),
    ('number of cell types:', len(str_labels)),
):
    print(caption, value)

shape of expression matrix [#cells,#genes]: (65943, 15275)
shape of cell labels: 65943
number of cell types: 11


In [13]:
# Save the data: bundle the expression matrix, labels and identifiers into a
# single .npz archive.
save_file = '../../dataset/pre_data/scRNAseq_datasets/Zheng68K.npz'

# np.savez does not create missing directories. Create the target folder first
# so the save cannot fail after all the preprocessing above has run.
os.makedirs(os.path.dirname(save_file), exist_ok=True)

data_dict = {}
# The identifier arrays taken from pandas are typically object-dtype. np.savez
# would pickle them, and np.load rejects pickled arrays unless
# allow_pickle=True. Store them as fixed-width unicode arrays instead.
data_dict['gene_symbol'] = np.asarray(gene_symbol, dtype=str)
data_dict['count'] = data_df_filtered.values
data_dict['str_labels'] = str_labels
data_dict['label'] = label
data_dict['barcode'] = np.asarray(barcode, dtype=str)
np.savez(save_file, **data_dict)

print('Finished.')

Finished.
