In [1]:
import os
import sys 
import json
import pickle

import torch
import anndata
import numpy as np
import pandas as pd
import scanpy as sc

from scipy import io

# Data Manipulation: convert h5ad to mtx

In [2]:
COVID_H5AD_PATH = "/home/wsg/BM/data/COVID19/E-MTAB-10026/covid_portal_210320_with_raw.h5ad"
covid = sc.read_h5ad(COVID_H5AD_PATH)

In [3]:
print(type(covid.X))
covid

<class 'scipy.sparse.csr.csr_matrix'>


AnnData object with n_obs × n_vars = 647366 × 24929
    obs: 'sample_id', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'full_clustering', 'initial_clustering', 'Resample', 'Collection_Day', 'Sex', 'Age_interval', 'Swab_result', 'Status', 'Smoker', 'Status_on_day_collection', 'Status_on_day_collection_summary', 'Days_from_onset', 'Site', 'time_after_LPS', 'Worst_Clinical_Status', 'Outcome', 'patient_id'
    var: 'feature_types'
    uns: 'hvg', 'leiden', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap'
    layers: 'raw'

In [4]:
covid_rna = covid[:, covid.var["feature_types"] == "Gene Expression"]
covid_adt = covid[:, covid.var["feature_types"] == "Antibody Capture"]

In [6]:
print(covid_rna)
print(covid_adt)

View of AnnData object with n_obs × n_vars = 647366 × 24737
    obs: 'sample_id', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'full_clustering', 'initial_clustering', 'Resample', 'Collection_Day', 'Sex', 'Age_interval', 'Swab_result', 'Status', 'Smoker', 'Status_on_day_collection', 'Status_on_day_collection_summary', 'Days_from_onset', 'Site', 'time_after_LPS', 'Worst_Clinical_Status', 'Outcome', 'patient_id'
    var: 'feature_types'
    uns: 'hvg', 'leiden', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap'
    layers: 'raw'
View of AnnData object with n_obs × n_vars = 647366 × 192
    obs: 'sample_id', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'full_clustering', 'initial_clustering', 'Resample', 'Collection_Day', 'Sex', 'Age_interval', 'Swab_result', 'Status', 'Smoker', 'Status_on_day_collection', 'Status_on_day_collection_summary', 'Days_from_onset', 'Site', 'time_after_LPS', 'Wor

In [7]:
print(covid_rna.X[:10,:10].todense())
print(type(covid_rna.X))

[[0.        0.        0.        0.        0.        0.        0.
  0.        0.        0.       ]
 [0.        0.        0.        0.        0.        0.        0.
  0.        0.        0.       ]
 [0.        0.        0.        0.        0.        0.        0.
  0.        0.        0.       ]
 [0.        0.        0.        0.        0.        0.        0.
  0.        0.        0.       ]
 [0.        0.        0.        0.        0.        0.        0.
  0.        0.        0.       ]
 [0.        0.        0.        0.        0.        0.        0.
  0.        0.        0.       ]
 [0.        0.        0.        0.        0.        0.        0.
  0.        0.        0.       ]
 [0.        0.        0.        0.        0.        0.        0.
  0.        0.        0.       ]
 [0.        0.        0.        0.        0.        0.        1.3266901
  0.        0.        0.       ]
 [0.        0.        0.        0.        0.        0.        0.
  0.        0.        0.       ]]
<class 'annd

In [8]:
print(covid_rna.layers['raw'][:10,:10].todense())
print(type(covid_rna.layers['raw'].copy()))

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
<class 'scipy.sparse.csr.csr_matrix'>


In [9]:
covid_rna.layers['processed'] = covid_rna.X.copy()
covid_rna.X = covid_rna.layers['raw'].copy()
covid_rna.X.todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [10]:
del covid_rna.layers['processed']
del covid_rna.layers["raw"]

In [11]:
covid_rna.X

<647366x24737 sparse matrix of type '<class 'numpy.float32'>'
	with 856243643 stored elements in Compressed Sparse Row format>

In [12]:
# covid_rna.X = covid_rna.X.astype(np.int32)
print(covid_rna.X[:10,:10].todense())
print(type(covid_rna.X.copy()))

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
<class 'scipy.sparse.csr.csr_matrix'>


In [13]:
covid_rna.var_names

Index(['MIR1302-2HG', 'AL627309.1', 'AL627309.3', 'AL627309.2', 'AL669831.2',
       'FAM87B', 'LINC00115', 'FAM41C', 'AL645608.2', 'SAMD11',
       ...
       'MAFIP', 'AC011043.1', 'AL592183.1', 'AC007325.1', 'AC007325.4',
       'AC007325.2', 'AL354822.1', 'AC233755.2', 'AC233755.1', 'AC240274.1'],
      dtype='object', length=24737)

In [20]:
output_path = "/home/wsg/BM/data/COVID19/RNA+ADT/RawData"
covid_rna.write_h5ad("{}/COVID19-CITE_seq-raw-RNA-counts.h5ad".format(output_path))

In [30]:
output_path = "/home/wsg/BM/data/COVID19/RNA+ADT/RawData"
covid_rna = sc.read_h5ad("{}/COVID19-CITE_seq-raw-RNA-counts.h5ad".format(output_path))

In [38]:
covid_rna.obs['barcode'] = covid_rna.obs_names

In [41]:
covid_rna.obs.to_csv("{}/metadata.csv".format(output_path))

In [47]:
adt_names = [name.split('AB_')[1] for name in covid_adt.var_names if len(name.split('AB_')) > 1]
covid_adt.var_names = adt_names
covid_adt.var_names

Index(['CD80', 'CD86', 'CD274', 'PDCD1LG2', 'ICOSLG', 'ITGAM', 'OX40L',
       'TNFSF9', 'PVR', 'NECTIN2',
       ...
       'CD101', 'IL21R', 'C5AR1', 'HLA-F', 'NLRP2', 'Podocalyxin', 'GGT1',
       'c-Met', 'LIGHT', 'DR3'],
      dtype='object', length=192)

In [48]:
covid_adt.layers['processed'] = covid_adt.X.copy()
covid_adt.X = covid_adt.layers['raw'].copy()
covid_adt.X.todense()

matrix([[ 0.,  0.,  2., ...,  7.,  3.,  3.],
        [ 1.,  0.,  2., ...,  5.,  4.,  5.],
        [ 3.,  0.,  3., ...,  5.,  7.,  5.],
        ...,
        [13.,  3.,  8., ..., 18., 10., 23.],
        [12.,  0., 10., ..., 17., 19., 25.],
        [50.,  9., 39., ..., 48., 56., 48.]], dtype=float32)

In [49]:
del covid_adt.layers['processed']
del covid_adt.layers["raw"]

In [50]:
# covid_adt.X = covid_adt.X.astype(np.int32)
print(covid_adt.X[:10,:10].todense())
print(type(covid_adt.X.copy()))

[[  0.   0.   2.   6.   2.   8.   1.   4.   1.   1.]
 [  1.   0.   2.   2.   6.   7.   2.   3.   1.   0.]
 [  3.   0.   3.   4.   6.   7.   3.   3.   3.   1.]
 [  7.   0.   4.   3.   3.  13.   2.   5.   0.   3.]
 [  6.   0.   2.   1.   3.   6.   2.   5.   2.   0.]
 [  5.   5.   9.   2.  10. 108.   4.  16.  13.   7.]
 [  4.   0.   0.   1.   7.   5.   2.   2.   3.   2.]
 [  6.   3.   3.   3.   5.  95.   2.   5.  14.   5.]
 [  5.   0.   0.   3.   6.  14.   1.   3.   1.   5.]
 [  3.   0.   2.   1.   1.   7.   2.   1.   0.   3.]]
<class 'scipy.sparse.csr.csr_matrix'>


In [51]:
output_path = "/home/wsg/BM/data/COVID19/RNA+ADT/RawData"
# save hd5
covid_adt.write_h5ad("{}/COVID19-CITE_seq-raw-ADT-counts.h5ad".format(output_path))

In [54]:
# Save RNA
## makr dir
!mkdir /home/wsg/BM/data/COVID19/RNA+ADT/RawData/COVID19-CITE_seq-raw-RNA-counts.mtx
## save X to mtx
io.mmwrite('/home/wsg/BM/data/COVID19/RNA+ADT/RawData/COVID19-CITE_seq-raw-RNA-counts.mtx/matrix', covid_rna.X.T)
## save barcodes
with open('/home/wsg/BM/data/COVID19/RNA+ADT/RawData/COVID19-CITE_seq-raw-RNA-counts.mtx/barcodes.tsv', 'w') as f:
    for item in covid_rna.obs_names:
        f.write(item + '\n')      
## save features
with open('/home/wsg/BM/data/COVID19/RNA+ADT/RawData/COVID19-CITE_seq-raw-RNA-counts.mtx/features.tsv', 'w') as f:
    for item in covid_rna.var_names:
        f.write(item + '\n')
## gzip file
!gzip /home/wsg/BM/data/COVID19/RNA+ADT/RawData/COVID19-CITE_seq-raw-RNA-counts.mtx/*
## save metadata
covid_rna.obs.to_csv('/home/wsg/BM/data/COVID19/RNA+ADT/RawData/metadata.csv')

In [57]:
# Save ADT
## makr dir
!mkdir /home/wsg/BM/data/COVID19/RNA+ADT/RawData/COVID19-CITE_seq-raw-ADT-counts.mtx
## save X to mtx
io.mmwrite('/home/wsg/BM/data/COVID19/RNA+ADT/RawData/COVID19-CITE_seq-raw-ADT-counts.mtx/matrix', covid_adt.X.T)
## save barcodes
with open('/home/wsg/BM/data/COVID19/RNA+ADT/RawData/COVID19-CITE_seq-raw-ADT-counts.mtx/barcodes.tsv', 'w') as f:
    for item in covid_adt.obs_names:
        f.write(item + '\n')      
## save features
with open('/home/wsg/BM/data/COVID19/RNA+ADT/RawData/COVID19-CITE_seq-raw-ADT-counts.mtx/features.tsv', 'w') as f:
    for item in covid_adt.var_names:
        f.write(item + '\n')
## gzip file
!gzip /home/wsg/BM/data/COVID19/RNA+ADT/RawData/COVID19-CITE_seq-raw-ADT-counts.mtx/*
## save metadata
covid_adt.obs.to_csv('/home/wsg/BM/data/COVID19/RNA+ADT/RawData/metadata.csv')