In [1]:
import scanpy as sc
import pandas as pd
import numpy as np
import anndata as ad
import os, sys
sys.path.append('./src')
from scipy import sparse
import random
import utils
random.seed(0)
np.random.seed(0)

# 4. Preprocessing COVID Dataset


In [2]:
# read_data = sc.read_mtx(r'./data/covid/GSE158055_covid19_counts.mtx')
read_data = sc.read_h5ad(r'./data/covid/covid_counts.h5ad')

In [4]:
counts = read_data.copy()

In [4]:
sc.pp.filter_genes(counts, min_cells = 2000)
print(counts.shape)
sc.pp.normalize_per_cell(counts)
sc.pp.log1p(counts)

(1462702, 19717)


In [2]:
meta = pd.read_csv(r'./data/covid/GSE158055_cell_annotation.csv')
barcode = pd.read_csv(r'./data/covid/GSE158055_covid19_barcodes.tsv')
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False) 
sample_meta = pd.read_csv(r'./data/covid/batch_meta.csv')

In [58]:
sample_meta[sample_meta['severity'] == 'severe/critical'].shape
# sample_meta[sample_meta['severity'] == 'mild/moderate'].shape

(134, 5)

### Printing sample id based on different severity number matches the number at the end of barcode

In [61]:
a = ['convalescence', 'progression']
b = ['severe/critical', 'mild/moderate']
info_dict = {}
for i in a:
    this_dict = {}
    for j in b:
        sample_ids = sample_meta[(sample_meta['Sample time'] == i) & (sample_meta['severity'] == j)]['Sample name']
        samples = merge_meta[merge_meta['sampleID'].isin(sample_ids)]['cellName']
        sample_n = set([i.split('-')[-1] for i in samples])
        print(i, j, np.array(sample_n))
        print('!!!!')

convalescence severe/critical {'130', '167', '262', '50', '166', '169', '184', '164', '263', '104', '162', '192', '200', '186', '168', '261', '265', '185', '188', '37', '40', '165', '85', '126', '86', '120', '47', '183', '194', '88', '264', '260', '38', '125', '127', '189', '128', '44', '43', '138', '266', '163', '119', '191', '187', '259', '51', '161', '89', '160', '182'}
!!!!
convalescence mild/moderate {'215', '135', '179', '172', '216', '29', '28', '157', '159', '210', '124', '41', '15', '176', '156', '66', '171', '180', '198', '173', '27', '75', '14', '154', '133', '202', '218', '193', '175', '123', '178', '63', '76', '195', '155', '174', '122', '134', '212', '177', '91', '209', '181', '153', '64', '208', '58', '196', '197', '203', '206', '217', '255', '170', '158', '36', '199', '214', '42', '141', '48', '257', '207', '35', '147', '65', '205', '59', '39', '254', '190', '129', '152', '211', '256', '258', '145', '204', '73', '146', '201', '90', '57', '60', '151', '121', '74', '213',

In [27]:
merge_meta = pd.read_csv(r'./data/covid/merge_meta.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [22]:
merge_meta['majorType'].unique()

array(['Mono', 'B', 'Macro', 'CD4', 'CD8', 'NK', 'DC', 'Mega', 'Neu',
       'Plasma', 'Epi', 'Mast'], dtype=object)

In [7]:
sc.pp.highly_variable_genes(counts, n_top_genes = 2000)
counts = counts[:, counts.var["highly_variable"]]
print(counts.shape)

(1462702, 2000)


In [8]:
counts.write(r'./data/covid/covid_counts.h5ad')

### Calculate the sample ids contained in each batch

In [111]:
batch_dict = {}

for i in range(1, 18):
    if i < 10:
        this_idx = sample_meta[sample_meta['batch_id'] == 'Batch' + '0' + str(i)].index
    else :
        this_idx = sample_meta[sample_meta['batch_id'] == 'Batch' + str(i)].index
    this_batch = sample_meta.iloc[this_idx, :]['Sample name'].values
    batch_dict['Batch'+str(i)] = this_batch
batch_dict

{'Batch1': array(['S-S070-1', 'S-S070-2', 'S-S070-3', 'S-S069-1', 'S-S069-2',
        'S-S069-3', 'S-S071-1', 'S-S071-2', 'S-S072-1', 'S-S072-2',
        'S-S072-3', 'S-M056', 'S-M057'], dtype=object),
 'Batch2': array(['S-M044-1', 'S-M043-1', 'S-M048', 'S-M044-2', 'S-M043-2', 'S-S054',
        'S-S056', 'S-M042-1', 'S-M041-1', 'S-M049', 'S-M046', 'S-M047',
        'S-S055', 'S-S057', 'S-M045', 'S-M041-2', 'S-M042-2', 'S-HC008',
        'S-HC009', 'S-HC010', 'S-HC011', 'S-HC012'], dtype=object),
 'Batch3': array(['S-M055', 'S-M053', 'S-S067', 'S-S065', 'S-M051', 'S-HC013',
        'S-HC014', 'S-HC015', 'S-HC016', 'S-HC017', 'S-S064', 'S-M054',
        'S-M052', 'S-S068', 'S-S066', 'S-S058', 'S-S059', 'S-S060',
        'S-M050', 'S-S061', 'S-S062', 'S-S063'], dtype=object),
 'Batch4': array(['S-M061-1', 'S-M061-2', 'S-HC018-1', 'S-HC018-2', 'S-S073-1',
        'S-S073-2', 'S-S074-1', 'S-S074-2', 'S-M062-1', 'S-M062-2',
        'S-M058-1', 'S-M058-2', 'S-M063-1', 'S-M063-2', 'S-HC019-1',

In [15]:
# this is the merged meta file
merge_meta

Unnamed: 0,cellName,sampleID,celltype,majorType,batch_id,severity,Sample time,Sampling day
0,AACAGGGGTCGGATTT-0,S-S070-1,Mono_c1-CD14-CCL3,Mono,Batch01,severe/critical,progression,7
1,AACCAACGTCCGAAAG-0,S-S070-1,B_c02-MS4A1-CD27,B,Batch01,severe/critical,progression,7
2,AACCTTTGTAGCACGA-0,S-S070-1,B_c01-TCL1A,B,Batch01,severe/critical,progression,7
3,AAGCATCTCTATCGCC-0,S-S070-1,Mono_c2-CD14-HLA-DPB1,Mono,Batch01,severe/critical,progression,7
4,AATCACGGTCATAAAG-0,S-S070-1,B_c01-TCL1A,B,Batch01,severe/critical,progression,7
...,...,...,...,...,...,...,...,...
1462697,TTTGTCATCCACGCAG-283,S-S053,Mega,Mega,Batch17,severe/critical,progression,9
1462698,TTTGTCATCCGCTGTT-283,S-S053,B_c06-MKI67,Plasma,Batch17,severe/critical,progression,9
1462699,TTTGTCATCGTCGTTC-283,S-S053,T_CD8_c01-LEF1,CD8,Batch17,severe/critical,progression,9
1462700,TTTGTCATCTGTACGA-283,S-S053,NK_c01-FCGR3A,NK,Batch17,severe/critical,progression,9


### use pd.merge to combine the original meta file with some information from the batch_meta file

In [17]:
# meta['batch_id'] = sample_meta['batch_id']
# sample_meta = sample_meta.rename(columns = {'Sample name':'sampleID'})
# sample_meta
# merge_meta = pd.merge(meta,sample_meta, on='sampleID')
# merge_meta.to_csv('./data/covid/merge_meta.csv', index = None)
# merge_meta = pd.read_csv('./data/covid/merge_meta.csv')
merge_meta['Sample time'].value_counts()

convalescence    787987
progression      509715
control          165000
Name: Sample time, dtype: int64

## Seperating file based on sample ID

In [20]:
counts.shape

(1462702, 2000)

In [32]:
info_dict = {}

sev_lst = ['severe/critical', 'mild/moderate', 'control']
critical_dict = {}
moderate_dict = {}
control_dict = {}
# for i in sev_lst:
#     info_dict[i.split('/')[-1]] = []
for i in range(284):
    this_meta = merge_meta[merge_meta['cellName'].str[-len(str(i))-1:] == '-'+str(i)]
    this_idx = merge_meta[merge_meta['cellName'].str[-len(str(i))-1:] == '-'+str(i)].index
    this_counts = counts.X[this_idx]
    severity = this_meta['severity'].unique()[0].split('/')[-1]
    print(i, severity)
    if severity == 'critical':
        critical_dict['sample'+str(i)] = this_counts.shape
    elif severity == 'moderate':
        moderate_dict['sample'+str(i)] = this_counts.shape
    elif severity == 'control':
        control_dict['sample'+str(i)] = this_counts.shape
#     if not os.path.exists(r'./data/covid/processed_covid_sample/{}/'.format(severity)):
#         os.makedirs(r'./data/covid/processed_covid_sample/{}/'.format(severity))
    sparse.save_npz(r'./data/covid/processed_covid_sample/{}/mtx_batch_{}_{}.npz'.format(severity, i, severity), sparse.coo_matrix(this_counts))
    this_meta.to_csv(r'./data/covid/processed_covid_sample/{}/meta_batch_{}_{}.csv'.format(severity, i, severity),index=False)
#     print(this_counts.shape, i, 'finished')
info_dict['critical'] = critical_dict
info_dict['moderate'] = moderate_dict
info_dict['control'] = control_dict

0 critical
1 critical
2 critical
3 critical
4 critical
5 critical
6 critical
7 critical
8 critical
9 critical
10 critical
11 moderate
12 moderate
13 moderate
14 moderate
15 moderate
16 moderate
17 moderate
18 critical
19 critical
20 moderate
21 moderate
22 moderate
23 moderate
24 moderate
25 critical
26 critical
27 moderate
28 moderate
29 moderate
30 control
31 control
32 control
33 control
34 control
35 moderate
36 moderate
37 critical
38 critical
39 moderate
40 critical
41 moderate
42 moderate
43 critical
44 critical
45 critical
46 critical
47 critical
48 moderate
49 critical
50 critical
51 critical
52 control
53 control
54 control
55 control
56 control
57 moderate
58 moderate
59 moderate
60 moderate
61 control
62 control
63 moderate
64 moderate
65 moderate
66 moderate
67 control
68 control
69 critical
70 critical
71 critical
72 critical
73 moderate
74 moderate
75 moderate
76 moderate
77 critical
78 critical
79 critical
80 critical
81 control
82 control
83 critical
84 critical
85 cri

In [48]:
a = pd.DataFrame(info_dict)
a.to_csv(r'./data/covid/processed_covid_sample/batch_info.csv')
b = pd.read_csv(r'./data/covid/processed_covid_sample/batch_info.csv', index_col = 0)
b

Unnamed: 0,critical,moderate,control
sample0,"(239, 2000)",,
sample1,"(2944, 2000)",,
sample2,"(2101, 2000)",,
sample3,"(637, 2000)",,
sample4,"(155, 2000)",,
...,...,...,...
sample249,,,"(5966, 2000)"
sample250,,,"(4860, 2000)"
sample251,,,"(4689, 2000)"
sample252,,,"(5228, 2000)"
