### **Curating lung_atlas_integrated_RNA.cellxgene.h5ad**

Article:  Integrated histopathology, spatial and single cell transcriptomics resolve cellular drivers of early and late alveolar damage in COVID-19

DOI: https://doi.org/10.1101/2023.12.20.572494

Data Source : https://covid19-multiomicatlas.cellgeni.sanger.ac.uk

##### **Mount farm**

mount-farm

##### **Packages required for curation**

In [1]:
#Import all packages required for curation

In [2]:
import numpy as np
import pandas as pd
import scanpy as sc
import scipy
from tqdm import tqdm
from scipy import sparse
from scipy.sparse import csr_matrix
import anndata as ad
import os
import subprocess
import math
import re
from collections import Counter

### **Curation Schema**

##### **X (Matrix Layers)**

##### **AnnData object**

In [3]:
# Load the AnnData object

In [4]:
adata = sc.read_h5ad('/lustre/scratch126/cellgen/team283/tl7/share/sc_ref/Noseda_Bayraktar_2021_Integrated_COVID19_LungCellAtlas_global_annotated_object_NormCountIncluded.h5ad')

In [5]:
# View the AnnData object

In [6]:
adata

AnnData object with n_obs × n_vars = 536120 × 34091
    obs: 'nCount_RNA', 'nFeature_RNA', 'cell_id', 'study.batch', 'donor', 'sex', 'age', 'ethnicity', 'smoking_status', 'health_status', 'region', 'version_10X', 'scn_source', 'percent_mito', 'percent_ribo', 'scrublet.scores_samples', 'scrublet.predicted_db_samples', 'scDblFinder.score_samples', 'scDblFinder.predicted_db_samples', 'nCount_SCT', 'nFeature_SCT', 'cell_type_mid', 'cell_state_v2'
    var: 'vst.mean', 'vst.variance', 'vst.variance.expected', 'vst.variance.standardized', 'vst.variable'
    uns: 'cell_state_v2_colors'
    obsm: 'X_harmony', 'X_pca', 'X_threeD_umap', 'X_umap'
    layers: 'norm_count'

##### **X - expression matrix**

In [7]:
# View the expression matrix of the anndata object.

In [8]:
adata.X

<536120x34091 sparse matrix of type '<class 'numpy.float32'>'
	with 901308752 stored elements in Compressed Sparse Row format>

In [9]:
# Print the matrix to check whether they are normalized counts or raw counts. if the matrix has floating numbers,they are normalized counts.if they are integers, they are raw counts.

In [10]:
print(adata.X)

  (0, 10)	1.0
  (0, 32)	1.0
  (0, 39)	3.0
  (0, 44)	1.0
  (0, 61)	1.0
  (0, 66)	1.0
  (0, 68)	3.0
  (0, 70)	1.0
  (0, 72)	1.0
  (0, 87)	1.0
  (0, 94)	1.0
  (0, 103)	10.0
  (0, 128)	1.0
  (0, 129)	5.0
  (0, 132)	2.0
  (0, 134)	3.0
  (0, 164)	1.0
  (0, 188)	1.0
  (0, 193)	1.0
  (0, 289)	1.0
  (0, 295)	1.0
  (0, 324)	13.0
  (0, 326)	21.0
  (0, 327)	1.0
  (0, 328)	1.0
  :	:
  (536119, 22252)	1.0
  (536119, 22270)	1.0
  (536119, 22271)	1.0
  (536119, 22467)	1.0
  (536119, 22474)	1.0
  (536119, 22481)	1.0
  (536119, 22748)	1.0
  (536119, 24530)	2.0
  (536119, 25499)	1.0
  (536119, 25592)	1.0
  (536119, 29084)	1.0
  (536119, 29284)	2.0
  (536119, 29567)	1.0
  (536119, 29629)	1.0
  (536119, 29830)	1.0
  (536119, 30608)	1.0
  (536119, 30609)	1.0
  (536119, 32113)	1.0
  (536119, 32302)	1.0
  (536119, 32466)	1.0
  (536119, 32605)	1.0
  (536119, 32748)	1.0
  (536119, 33121)	1.0
  (536119, 33208)	10.0
  (536119, 33211)	17.0


##### **Raw counts matrix**

In [11]:
adata.layers.keys()

KeysView(Layers with keys: norm_count)

In [12]:
adata.layers['counts'] = adata.X.copy()

In [13]:
adata.X = adata.layers['norm_count'].copy()

In [14]:
print(adata.X)

  (0, 10)	0.8854585473085288
  (0, 32)	0.8854585473085288
  (0, 39)	1.662464252791253
  (0, 44)	0.8854585473085288
  (0, 61)	0.8854585473085288
  (0, 66)	0.8854585473085288
  (0, 68)	1.662464252791253
  (0, 70)	0.8854585473085288
  (0, 72)	0.8854585473085288
  (0, 87)	0.8854585473085288
  (0, 94)	0.8854585473085288
  (0, 103)	2.7239863430579807
  (0, 128)	0.8854585473085288
  (0, 129)	2.0943890805352936
  (0, 132)	1.347603271291527
  (0, 134)	1.662464252791253
  (0, 164)	0.8854585473085288
  (0, 188)	0.8854585473085288
  (0, 193)	0.8854585473085288
  (0, 289)	0.8854585473085288
  (0, 295)	0.8854585473085288
  (0, 324)	2.9710934191312695
  (0, 326)	3.430950653143841
  (0, 327)	0.8854585473085288
  (0, 328)	0.8854585473085288
  :	:
  (536119, 22252)	2.9419791860624107
  (536119, 22270)	2.9419791860624107
  (536119, 22271)	2.9419791860624107
  (536119, 22467)	2.9419791860624107
  (536119, 22474)	2.9419791860624107
  (536119, 22481)	2.9419791860624107
  (536119, 22748)	2.9419791860624107
 

In [15]:
print(adata.layers['counts'])

  (0, 10)	1.0
  (0, 32)	1.0
  (0, 39)	3.0
  (0, 44)	1.0
  (0, 61)	1.0
  (0, 66)	1.0
  (0, 68)	3.0
  (0, 70)	1.0
  (0, 72)	1.0
  (0, 87)	1.0
  (0, 94)	1.0
  (0, 103)	10.0
  (0, 128)	1.0
  (0, 129)	5.0
  (0, 132)	2.0
  (0, 134)	3.0
  (0, 164)	1.0
  (0, 188)	1.0
  (0, 193)	1.0
  (0, 289)	1.0
  (0, 295)	1.0
  (0, 324)	13.0
  (0, 326)	21.0
  (0, 327)	1.0
  (0, 328)	1.0
  :	:
  (536119, 22252)	1.0
  (536119, 22270)	1.0
  (536119, 22271)	1.0
  (536119, 22467)	1.0
  (536119, 22474)	1.0
  (536119, 22481)	1.0
  (536119, 22748)	1.0
  (536119, 24530)	2.0
  (536119, 25499)	1.0
  (536119, 25592)	1.0
  (536119, 29084)	1.0
  (536119, 29284)	2.0
  (536119, 29567)	1.0
  (536119, 29629)	1.0
  (536119, 29830)	1.0
  (536119, 30608)	1.0
  (536119, 30609)	1.0
  (536119, 32113)	1.0
  (536119, 32302)	1.0
  (536119, 32466)	1.0
  (536119, 32605)	1.0
  (536119, 32748)	1.0
  (536119, 33121)	1.0
  (536119, 33208)	10.0
  (536119, 33211)	17.0


In [16]:
counts_layer = adata.layers['counts']
shape = counts_layer.shape
num_elements = shape[0] * shape[1]

print(f'The number of elements in adata.layers["counts"] is: {num_elements}')

The number of elements in adata.layers["counts"] is: 18276866920


In [17]:

counts = adata.layers['counts']


non_zero_mask = counts != 0


counts[non_zero_mask] = counts[non_zero_mask].round().astype(int)


adata.layers['counts'] = counts


In [18]:
counts_layer = adata.layers['counts']
shape = counts_layer.shape

num_elements = shape[0] * shape[1]

print(f'The number of elements in adata.layers["counts"] is: {num_elements}')

The number of elements in adata.layers["counts"] is: 18276866920


In [19]:
print(adata.layers['counts'])

  (0, 10)	1.0
  (0, 32)	1.0
  (0, 39)	3.0
  (0, 44)	1.0
  (0, 61)	1.0
  (0, 66)	1.0
  (0, 68)	3.0
  (0, 70)	1.0
  (0, 72)	1.0
  (0, 87)	1.0
  (0, 94)	1.0
  (0, 103)	10.0
  (0, 128)	1.0
  (0, 129)	5.0
  (0, 132)	2.0
  (0, 134)	3.0
  (0, 164)	1.0
  (0, 188)	1.0
  (0, 193)	1.0
  (0, 289)	1.0
  (0, 295)	1.0
  (0, 324)	13.0
  (0, 326)	21.0
  (0, 327)	1.0
  (0, 328)	1.0
  :	:
  (536119, 22252)	1.0
  (536119, 22270)	1.0
  (536119, 22271)	1.0
  (536119, 22467)	1.0
  (536119, 22474)	1.0
  (536119, 22481)	1.0
  (536119, 22748)	1.0
  (536119, 24530)	2.0
  (536119, 25499)	1.0
  (536119, 25592)	1.0
  (536119, 29084)	1.0
  (536119, 29284)	2.0
  (536119, 29567)	1.0
  (536119, 29629)	1.0
  (536119, 29830)	1.0
  (536119, 30608)	1.0
  (536119, 30609)	1.0
  (536119, 32113)	1.0
  (536119, 32302)	1.0
  (536119, 32466)	1.0
  (536119, 32605)	1.0
  (536119, 32748)	1.0
  (536119, 33121)	1.0
  (536119, 33208)	10.0
  (536119, 33211)	17.0


In [20]:
import anndata
import numpy as np

print("Decimal numbers present:", np.any(adata.layers['counts'].data % 1 != 0) if 'counts' in adata.layers else "Layer 'counts' not found")


Decimal numbers present: False


In [21]:
del adata.layers['norm_count']

In [22]:
araw = ad.AnnData(X=adata.layers['counts'], obs=adata.obs, var=adata.var)

In [23]:
del adata.layers['counts']

##### **Variables(var)**

In [24]:
adata.var

Unnamed: 0,vst.mean,vst.variance,vst.variance.expected,vst.variance.standardized,vst.variable
RP11-34P13.7,0.000679,0.000693,0.000752,0.922408,0
FO538757.2,0.029907,0.044360,0.051360,0.863724,0
AP006222.2,0.015043,0.021931,0.024225,0.905319,0
RP4-669L17.10,0.008075,0.009453,0.011783,0.802251,0
RP5-857K21.4,0.007793,0.009004,0.011293,0.797376,0
...,...,...,...,...,...
AL008633.1,0.047178,0.172950,0.081874,2.112395,1
AC244213.1,0.000976,0.001004,0.001112,0.903572,0
AC074387.1,0.001127,0.001286,0.001297,0.991599,0
IGF2.1,0.000578,0.001074,0.000633,1.696698,1


In [25]:
ensembl_data = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/Gene_lists_czi/GRCh38-3.0.0.gene_names.csv')

In [26]:
ensembl_dict = dict(zip(ensembl_data['gene_symbol'], ensembl_data['gene_id']))

In [27]:
ensembl_dict

{'MIR1302-2HG': 'ENSG00000243485',
 'FAM138A': 'ENSG00000237613',
 'OR4F5': 'ENSG00000186092',
 'AL627309.1': 'ENSG00000238009',
 'AL627309.3': 'ENSG00000239945',
 'AL627309.2': 'ENSG00000239906',
 'AL627309.4': 'ENSG00000241599',
 'AL732372.1': 'ENSG00000236601',
 'OR4F29': 'ENSG00000284733',
 'AC114498.1': 'ENSG00000235146',
 'OR4F16': 'ENSG00000284662',
 'AL669831.2': 'ENSG00000229905',
 'AL669831.5': 'ENSG00000237491',
 'FAM87B': 'ENSG00000177757',
 'LINC00115': 'ENSG00000225880',
 'FAM41C': 'ENSG00000230368',
 'AL645608.7': 'ENSG00000272438',
 'AL645608.3': 'ENSG00000230699',
 'AL645608.5': 'ENSG00000241180',
 'AL645608.1': 'ENSG00000223764',
 'SAMD11': 'ENSG00000187634',
 'NOC2L': 'ENSG00000188976',
 'KLHL17': 'ENSG00000187961',
 'PLEKHN1': 'ENSG00000187583',
 'PERM1': 'ENSG00000187642',
 'AL645608.8': 'ENSG00000272512',
 'HES4': 'ENSG00000188290',
 'ISG15': 'ENSG00000187608',
 'AL645608.2': 'ENSG00000224969',
 'AGRN': 'ENSG00000188157',
 'AL645608.9': 'ENSG00000273443',
 'RNF223

In [28]:
adata.var['gene_id'] = adata.var_names.map(ensembl_dict)
araw.var['gene_id'] = araw.var_names.map(ensembl_dict)


In [29]:
adata.var

Unnamed: 0,vst.mean,vst.variance,vst.variance.expected,vst.variance.standardized,vst.variable,gene_id
RP11-34P13.7,0.000679,0.000693,0.000752,0.922408,0,
FO538757.2,0.029907,0.044360,0.051360,0.863724,0,
AP006222.2,0.015043,0.021931,0.024225,0.905319,0,
RP4-669L17.10,0.008075,0.009453,0.011783,0.802251,0,
RP5-857K21.4,0.007793,0.009004,0.011293,0.797376,0,
...,...,...,...,...,...,...
AL008633.1,0.047178,0.172950,0.081874,2.112395,1,
AC244213.1,0.000976,0.001004,0.001112,0.903572,0,
AC074387.1,0.001127,0.001286,0.001297,0.991599,0,ENSG00000282033
IGF2.1,0.000578,0.001074,0.000633,1.696698,1,


In [30]:
araw.var

Unnamed: 0,vst.mean,vst.variance,vst.variance.expected,vst.variance.standardized,vst.variable,gene_id
RP11-34P13.7,0.000679,0.000693,0.000752,0.922408,0,
FO538757.2,0.029907,0.044360,0.051360,0.863724,0,
AP006222.2,0.015043,0.021931,0.024225,0.905319,0,
RP4-669L17.10,0.008075,0.009453,0.011783,0.802251,0,
RP5-857K21.4,0.007793,0.009004,0.011293,0.797376,0,
...,...,...,...,...,...,...
AL008633.1,0.047178,0.172950,0.081874,2.112395,1,
AC244213.1,0.000976,0.001004,0.001112,0.903572,0,
AC074387.1,0.001127,0.001286,0.001297,0.991599,0,ENSG00000282033
IGF2.1,0.000578,0.001074,0.000633,1.696698,1,


In [31]:
nan_count = adata.var['gene_id'].isna().sum()
print("Number of NaN values in adata.obs['gene_id']: ", nan_count)

Number of NaN values in adata.obs['gene_id']:  10565


In [32]:
adata.var['gene_name'] = adata.var.index
araw.var['gene_name'] = araw.var.index

In [33]:
adata.var.index = adata.var['gene_id'] 
araw.var.index = araw.var['gene_id']

In [34]:
# Load the approved genes file.

In [35]:
approved_genes = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/Endometrium_reference_integrated_atlas/genes_approved.csv')

In [36]:
#Create a dictionary from the approved genes file 

In [37]:
genedict = {key: 1 for key in list(approved_genes.feature_id)}

In [38]:
genedict

{'ERCC-00002': 1,
 'ERCC-00003': 1,
 'ERCC-00004': 1,
 'ERCC-00009': 1,
 'ERCC-00012': 1,
 'ERCC-00013': 1,
 'ERCC-00014': 1,
 'ERCC-00016': 1,
 'ERCC-00017': 1,
 'ERCC-00019': 1,
 'ERCC-00022': 1,
 'ERCC-00024': 1,
 'ERCC-00025': 1,
 'ERCC-00028': 1,
 'ERCC-00031': 1,
 'ERCC-00033': 1,
 'ERCC-00034': 1,
 'ERCC-00035': 1,
 'ERCC-00039': 1,
 'ERCC-00040': 1,
 'ERCC-00041': 1,
 'ERCC-00042': 1,
 'ERCC-00043': 1,
 'ERCC-00044': 1,
 'ERCC-00046': 1,
 'ERCC-00048': 1,
 'ERCC-00051': 1,
 'ERCC-00053': 1,
 'ERCC-00054': 1,
 'ERCC-00057': 1,
 'ERCC-00058': 1,
 'ERCC-00059': 1,
 'ERCC-00060': 1,
 'ERCC-00061': 1,
 'ERCC-00062': 1,
 'ERCC-00067': 1,
 'ERCC-00069': 1,
 'ERCC-00071': 1,
 'ERCC-00073': 1,
 'ERCC-00074': 1,
 'ERCC-00075': 1,
 'ERCC-00076': 1,
 'ERCC-00077': 1,
 'ERCC-00078': 1,
 'ERCC-00079': 1,
 'ERCC-00081': 1,
 'ERCC-00083': 1,
 'ERCC-00084': 1,
 'ERCC-00085': 1,
 'ERCC-00086': 1,
 'ERCC-00092': 1,
 'ERCC-00095': 1,
 'ERCC-00096': 1,
 'ERCC-00097': 1,
 'ERCC-00098': 1,
 'ERCC-000

In [39]:
len(genedict)

119799

In [40]:
#Filter out the genes which are not in the approved genes file.

In [41]:
var_to_keep_adata = [x for x in adata.var_names if (x in genedict)]
var_to_keep_araw = [x for x in araw.var_names if (x in genedict)]

In [42]:
len(var_to_keep_adata)

23330

In [43]:
len(var_to_keep_araw)

23330

In [44]:
adata.var

Unnamed: 0_level_0,vst.mean,vst.variance,vst.variance.expected,vst.variance.standardized,vst.variable,gene_id,gene_name
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
,0.000679,0.000693,0.000752,0.922408,0,,RP11-34P13.7
,0.029907,0.044360,0.051360,0.863724,0,,FO538757.2
,0.015043,0.021931,0.024225,0.905319,0,,AP006222.2
,0.008075,0.009453,0.011783,0.802251,0,,RP4-669L17.10
,0.007793,0.009004,0.011293,0.797376,0,,RP5-857K21.4
...,...,...,...,...,...,...,...
,0.047178,0.172950,0.081874,2.112395,1,,AL008633.1
,0.000976,0.001004,0.001112,0.903572,0,,AC244213.1
ENSG00000282033,0.001127,0.001286,0.001297,0.991599,0,ENSG00000282033,AC074387.1
,0.000578,0.001074,0.000633,1.696698,1,,IGF2.1


In [45]:
araw.var

Unnamed: 0_level_0,vst.mean,vst.variance,vst.variance.expected,vst.variance.standardized,vst.variable,gene_id,gene_name
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
,0.000679,0.000693,0.000752,0.922408,0,,RP11-34P13.7
,0.029907,0.044360,0.051360,0.863724,0,,FO538757.2
,0.015043,0.021931,0.024225,0.905319,0,,AP006222.2
,0.008075,0.009453,0.011783,0.802251,0,,RP4-669L17.10
,0.007793,0.009004,0.011293,0.797376,0,,RP5-857K21.4
...,...,...,...,...,...,...,...
,0.047178,0.172950,0.081874,2.112395,1,,AL008633.1
,0.000976,0.001004,0.001112,0.903572,0,,AC244213.1
ENSG00000282033,0.001127,0.001286,0.001297,0.991599,0,ENSG00000282033,AC074387.1
,0.000578,0.001074,0.000633,1.696698,1,,IGF2.1


In [46]:
# Modify the anndata object by filtering out the filtered genes.

In [47]:
adata = adata[:, adata.var.index.isin(var_to_keep_adata)]
araw = araw[:, araw.var.index.isin(var_to_keep_araw)]

In [48]:
adata.var

Unnamed: 0_level_0,vst.mean,vst.variance,vst.variance.expected,vst.variance.standardized,vst.variable,gene_id,gene_name
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ENSG00000177757,0.000824,0.000842,0.000927,0.908446,0,ENSG00000177757,FAM87B
ENSG00000225880,0.008196,0.008476,0.011995,0.706609,0,ENSG00000225880,LINC00115
ENSG00000230368,0.009373,0.009871,0.014031,0.703497,0,ENSG00000230368,FAM41C
ENSG00000187634,0.015952,0.027739,0.025884,1.071678,1,ENSG00000187634,SAMD11
ENSG00000188976,0.096896,0.114255,0.171881,0.664734,0,ENSG00000188976,NOC2L
...,...,...,...,...,...,...,...
ENSG00000203618,0.000802,0.000854,0.000900,0.948477,0,ENSG00000203618,GP1BB
ENSG00000268292,0.001442,0.001477,0.001695,0.871179,0,ENSG00000268292,AC006547.3
ENSG00000211675,0.036180,4.161019,0.062768,46.427851,1,ENSG00000211675,IGLC1
ENSG00000282033,0.001127,0.001286,0.001297,0.991599,0,ENSG00000282033,AC074387.1


In [49]:
araw.var

Unnamed: 0_level_0,vst.mean,vst.variance,vst.variance.expected,vst.variance.standardized,vst.variable,gene_id,gene_name
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ENSG00000177757,0.000824,0.000842,0.000927,0.908446,0,ENSG00000177757,FAM87B
ENSG00000225880,0.008196,0.008476,0.011995,0.706609,0,ENSG00000225880,LINC00115
ENSG00000230368,0.009373,0.009871,0.014031,0.703497,0,ENSG00000230368,FAM41C
ENSG00000187634,0.015952,0.027739,0.025884,1.071678,1,ENSG00000187634,SAMD11
ENSG00000188976,0.096896,0.114255,0.171881,0.664734,0,ENSG00000188976,NOC2L
...,...,...,...,...,...,...,...
ENSG00000203618,0.000802,0.000854,0.000900,0.948477,0,ENSG00000203618,GP1BB
ENSG00000268292,0.001442,0.001477,0.001695,0.871179,0,ENSG00000268292,AC006547.3
ENSG00000211675,0.036180,4.161019,0.062768,46.427851,1,ENSG00000211675,IGLC1
ENSG00000282033,0.001127,0.001286,0.001297,0.991599,0,ENSG00000282033,AC074387.1


feature is filtered

In [50]:
adata.var['feature_is_filtered'] = [False] * len(adata.var)

  adata.var['feature_is_filtered'] = [False] * len(adata.var)


In [51]:
adata.var

Unnamed: 0_level_0,vst.mean,vst.variance,vst.variance.expected,vst.variance.standardized,vst.variable,gene_id,gene_name,feature_is_filtered
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ENSG00000177757,0.000824,0.000842,0.000927,0.908446,0,ENSG00000177757,FAM87B,False
ENSG00000225880,0.008196,0.008476,0.011995,0.706609,0,ENSG00000225880,LINC00115,False
ENSG00000230368,0.009373,0.009871,0.014031,0.703497,0,ENSG00000230368,FAM41C,False
ENSG00000187634,0.015952,0.027739,0.025884,1.071678,1,ENSG00000187634,SAMD11,False
ENSG00000188976,0.096896,0.114255,0.171881,0.664734,0,ENSG00000188976,NOC2L,False
...,...,...,...,...,...,...,...,...
ENSG00000203618,0.000802,0.000854,0.000900,0.948477,0,ENSG00000203618,GP1BB,False
ENSG00000268292,0.001442,0.001477,0.001695,0.871179,0,ENSG00000268292,AC006547.3,False
ENSG00000211675,0.036180,4.161019,0.062768,46.427851,1,ENSG00000211675,IGLC1,False
ENSG00000282033,0.001127,0.001286,0.001297,0.991599,0,ENSG00000282033,AC074387.1,False


In [52]:
araw.var

Unnamed: 0_level_0,vst.mean,vst.variance,vst.variance.expected,vst.variance.standardized,vst.variable,gene_id,gene_name
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ENSG00000177757,0.000824,0.000842,0.000927,0.908446,0,ENSG00000177757,FAM87B
ENSG00000225880,0.008196,0.008476,0.011995,0.706609,0,ENSG00000225880,LINC00115
ENSG00000230368,0.009373,0.009871,0.014031,0.703497,0,ENSG00000230368,FAM41C
ENSG00000187634,0.015952,0.027739,0.025884,1.071678,1,ENSG00000187634,SAMD11
ENSG00000188976,0.096896,0.114255,0.171881,0.664734,0,ENSG00000188976,NOC2L
...,...,...,...,...,...,...,...
ENSG00000203618,0.000802,0.000854,0.000900,0.948477,0,ENSG00000203618,GP1BB
ENSG00000268292,0.001442,0.001477,0.001695,0.871179,0,ENSG00000268292,AC006547.3
ENSG00000211675,0.036180,4.161019,0.062768,46.427851,1,ENSG00000211675,IGLC1
ENSG00000282033,0.001127,0.001286,0.001297,0.991599,0,ENSG00000282033,AC074387.1


In [53]:
del adata.var['gene_id']
del adata.var['gene_name']
del araw.var['gene_name']
del araw.var['gene_id']

#### **obs (Cell metadata)**

In [54]:
#view obs

In [55]:
adata.obs

Unnamed: 0,nCount_RNA,nFeature_RNA,cell_id,study.batch,donor,sex,age,ethnicity,smoking_status,health_status,...,percent_mito,percent_ribo,scrublet.scores_samples,scrublet.predicted_db_samples,scDblFinder.score_samples,scDblFinder.predicted_db_samples,nCount_SCT,nFeature_SCT,cell_type_mid,cell_state_v2
HD66_AAACCTGAGACCTTTG_1_1,7022.0,1570,HD66_AAACCTGAGACCTTTG_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,10.452862,19.268015,0.020584,0,0.002438,singlet,3076,1313,AT2,EP.AT2
HD66_AAACCTGAGAGATGAG_1_1,12220.0,2982,HD66_AAACCTGAGAGATGAG_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,7.321662,16.132199,0.019374,0,0.002932,singlet,2578,1170,AT2,EP.AT2
HD66_AAACCTGAGCGTCTAT_1_1,2604.0,1371,HD66_AAACCTGAGCGTCTAT_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,18.080614,6.833013,0.038855,0,0.000613,singlet,2802,1371,B & Plasma cells,B.Doublets
HD66_AAACCTGAGGCACATG_1_1,2801.0,1226,HD66_AAACCTGAGGCACATG_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,13.388076,12.674045,0.041154,0,0.000432,singlet,2905,1226,Monocyte/Macrophage,MYE.Monocyte
HD66_AAACCTGCAACTTGAC_1_1,3347.0,1566,HD66_AAACCTGCAACTTGAC_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,3.820896,13.970149,0.038855,0,0.001419,singlet,3158,1566,AT1,EP.AT1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGACTTCAAGTCGT-1_3_5,1355.0,813,TTTGACTTCAAGTCGT-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,0.221402,0.590406,0.012835,0,0.064667,singlet,2210,852,T & NK cells,T.CD4+
TTTGGTTAGGATGCGT-1_3_5,1336.0,927,TTTGGTTAGGATGCGT-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,0.373692,0.298954,0.037929,0,0.037017,singlet,2174,968,B & Plasma cells,B.Plasma.IgA
TTTGGTTCATACTGAC-1_3_5,2056.0,1326,TTTGGTTCATACTGAC-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,0.340301,0.388916,0.057826,0,0.142415,singlet,2343,1326,T & NK cells,T.CD4+
TTTGTTGAGCAGGCTA-1_3_5,3719.0,2139,TTTGTTGAGCAGGCTA-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,0.725611,0.376243,0.213389,0,0.816544,singlet,3532,2139,Monocyte/Macrophage,MYE.Macrophage.Alveolar


In [56]:
# view the column names in obs

In [57]:
adata.obs.columns

Index(['nCount_RNA', 'nFeature_RNA', 'cell_id', 'study.batch', 'donor', 'sex',
       'age', 'ethnicity', 'smoking_status', 'health_status', 'region',
       'version_10X', 'scn_source', 'percent_mito', 'percent_ribo',
       'scrublet.scores_samples', 'scrublet.predicted_db_samples',
       'scDblFinder.score_samples', 'scDblFinder.predicted_db_samples',
       'nCount_SCT', 'nFeature_SCT', 'cell_type_mid', 'cell_state_v2'],
      dtype='object')

#### **assay_ontology_term_id**

In [58]:
list(adata.obs['version_10X'].unique())

["10X V1(5')", '10X V2', '10X V3', "10X V1.1(5')", '10X V3.1']

In [59]:
adata.obs['barcodes'] = adata.obs_names

In [60]:
adata.obs['barcodes'] = adata.obs['barcodes'].str.extract(r'([ATGC]{10,16})', expand=False)

In [61]:
assay_info = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/covid_cell_atlas_ALI_organoids/10X_barcode_table_assay.csv')

In [62]:
mapping = dict(zip(assay_info['barcode'], assay_info['assay']))

In [63]:
adata.obs['assays'] = adata.obs['barcodes'].map(mapping)

In [64]:
list(adata.obs['assays'].unique())

['3pv2_5pv1_5pv2',
 '3pv2_5pv1_5pv2+multiome',
 '3pv2_5pv1_5pv2+3pv3',
 nan,
 '3pv3',
 '3pv3+multiome']

In [65]:
import pandas as pd
import scanpy as sc

unique_assays = adata.obs['version_10X'].unique()
print("Unique values in 'assay':", unique_assays)

for assay in unique_assays:
    unique_assays_for_group = adata.obs.loc[adata.obs['version_10X'] == assay, 'assays'].unique()
    print(f"Unique 'assays' for assay {assay}:", unique_assays_for_group)


Unique values in 'assay': ['10X V1(5')', '10X V2', '10X V3', '10X V1.1(5')', '10X V3.1']
Categories (5, object): ['10X V1(5')', '10X V1.1(5')', '10X V2', '10X V3', '10X V3.1']
Unique 'assays' for assay 10X V1(5'): ['3pv2_5pv1_5pv2' '3pv2_5pv1_5pv2+multiome' '3pv2_5pv1_5pv2+3pv3' nan]
Unique 'assays' for assay 10X V2: ['3pv2_5pv1_5pv2' '3pv2_5pv1_5pv2+3pv3' nan '3pv2_5pv1_5pv2+multiome']
Unique 'assays' for assay 10X V3: ['3pv3' '3pv2_5pv1_5pv2+3pv3' '3pv3+multiome']
Unique 'assays' for assay 10X V1.1(5'): ['3pv3' '3pv2_5pv1_5pv2+3pv3' '3pv3+multiome' '3pv2_5pv1_5pv2'
 '3pv2_5pv1_5pv2+multiome']
Unique 'assays' for assay 10X V3.1: ['3pv3' '3pv2_5pv1_5pv2+3pv3' '3pv3+multiome' nan]


In [66]:
mapping ={"10X V1(5')":"EFO:0011025", '10X V2':'EFO:0009899', '10X V3':'EFO:0009922', "10X V1.1(5')":'EFO:0011025', '10X V3.1':'EFO:0009922'}

In [67]:
adata.obs['assay_ontology_term_id'] = adata.obs['version_10X'].map(mapping)

In [68]:
import pandas as pd
import scanpy as sc

unique_assays = adata.obs['version_10X'].unique()
print("Unique values in 'assay':", unique_assays)

for assay in unique_assays:
    unique_assays_for_group = adata.obs.loc[adata.obs['version_10X'] == assay, 'assay_ontology_term_id'].unique()
    print(f"Unique 'assays' for assay {assay}:", unique_assays_for_group)


Unique values in 'assay': ['10X V1(5')', '10X V2', '10X V3', '10X V1.1(5')', '10X V3.1']
Categories (5, object): ['10X V1(5')', '10X V1.1(5')', '10X V2', '10X V3', '10X V3.1']
Unique 'assays' for assay 10X V1(5'): ['EFO:0011025']
Unique 'assays' for assay 10X V2: ['EFO:0009899']
Unique 'assays' for assay 10X V3: ['EFO:0009922']
Unique 'assays' for assay 10X V1.1(5'): ['EFO:0011025']
Unique 'assays' for assay 10X V3.1: ['EFO:0009922']


In [69]:
condition1 = (adata.obs['assays'] == '3pv3') & (adata.obs['version_10X'] == "10X V1.1(5')")
condition2 = (adata.obs['assays'] == '3pv3+multiome') & (adata.obs['version_10X'] == "10X V1.1(5')")

adata.obs.loc[condition1 | condition2, 'assay_ontology_term_id'] = 'EFO:0009922'

In [70]:
import pandas as pd
import scanpy as sc

unique_assays = adata.obs['version_10X'].unique()
print("Unique values in 'assay':", unique_assays)

for assay in unique_assays:
    unique_assays_for_group = adata.obs.loc[adata.obs['version_10X'] == assay, 'assay_ontology_term_id'].unique()
    print(f"Unique 'assays' for assay {assay}:", unique_assays_for_group)


Unique values in 'assay': ['10X V1(5')', '10X V2', '10X V3', '10X V1.1(5')', '10X V3.1']
Categories (5, object): ['10X V1(5')', '10X V1.1(5')', '10X V2', '10X V3', '10X V3.1']
Unique 'assays' for assay 10X V1(5'): ['EFO:0011025']
Unique 'assays' for assay 10X V2: ['EFO:0009899']
Unique 'assays' for assay 10X V3: ['EFO:0009922']
Unique 'assays' for assay 10X V1.1(5'): ['EFO:0009922' 'EFO:0011025']
Unique 'assays' for assay 10X V3.1: ['EFO:0009922']


In [71]:
adata.obs

Unnamed: 0,nCount_RNA,nFeature_RNA,cell_id,study.batch,donor,sex,age,ethnicity,smoking_status,health_status,...,scrublet.predicted_db_samples,scDblFinder.score_samples,scDblFinder.predicted_db_samples,nCount_SCT,nFeature_SCT,cell_type_mid,cell_state_v2,barcodes,assays,assay_ontology_term_id
HD66_AAACCTGAGACCTTTG_1_1,7022.0,1570,HD66_AAACCTGAGACCTTTG_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,0,0.002438,singlet,3076,1313,AT2,EP.AT2,AAACCTGAGACCTTTG,3pv2_5pv1_5pv2,EFO:0011025
HD66_AAACCTGAGAGATGAG_1_1,12220.0,2982,HD66_AAACCTGAGAGATGAG_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,0,0.002932,singlet,2578,1170,AT2,EP.AT2,AAACCTGAGAGATGAG,3pv2_5pv1_5pv2,EFO:0011025
HD66_AAACCTGAGCGTCTAT_1_1,2604.0,1371,HD66_AAACCTGAGCGTCTAT_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,0,0.000613,singlet,2802,1371,B & Plasma cells,B.Doublets,AAACCTGAGCGTCTAT,3pv2_5pv1_5pv2,EFO:0011025
HD66_AAACCTGAGGCACATG_1_1,2801.0,1226,HD66_AAACCTGAGGCACATG_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,0,0.000432,singlet,2905,1226,Monocyte/Macrophage,MYE.Monocyte,AAACCTGAGGCACATG,3pv2_5pv1_5pv2,EFO:0011025
HD66_AAACCTGCAACTTGAC_1_1,3347.0,1566,HD66_AAACCTGCAACTTGAC_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,0,0.001419,singlet,3158,1566,AT1,EP.AT1,AAACCTGCAACTTGAC,3pv2_5pv1_5pv2,EFO:0011025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGACTTCAAGTCGT-1_3_5,1355.0,813,TTTGACTTCAAGTCGT-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,0,0.064667,singlet,2210,852,T & NK cells,T.CD4+,TTTGACTTCAAGTCGT,3pv3,EFO:0009922
TTTGGTTAGGATGCGT-1_3_5,1336.0,927,TTTGGTTAGGATGCGT-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,0,0.037017,singlet,2174,968,B & Plasma cells,B.Plasma.IgA,TTTGGTTAGGATGCGT,3pv2_5pv1_5pv2+3pv3,EFO:0009922
TTTGGTTCATACTGAC-1_3_5,2056.0,1326,TTTGGTTCATACTGAC-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,0,0.142415,singlet,2343,1326,T & NK cells,T.CD4+,TTTGGTTCATACTGAC,3pv3,EFO:0009922
TTTGTTGAGCAGGCTA-1_3_5,3719.0,2139,TTTGTTGAGCAGGCTA-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,0,0.816544,singlet,3532,2139,Monocyte/Macrophage,MYE.Macrophage.Alveolar,TTTGTTGAGCAGGCTA,3pv3,EFO:0009922


In [72]:
adata.obs['assay_ontology_term_id'] = adata.obs['assay_ontology_term_id'].astype('category')

In [73]:
# view adata.obs

In [74]:
adata.obs

Unnamed: 0,nCount_RNA,nFeature_RNA,cell_id,study.batch,donor,sex,age,ethnicity,smoking_status,health_status,...,scrublet.predicted_db_samples,scDblFinder.score_samples,scDblFinder.predicted_db_samples,nCount_SCT,nFeature_SCT,cell_type_mid,cell_state_v2,barcodes,assays,assay_ontology_term_id
HD66_AAACCTGAGACCTTTG_1_1,7022.0,1570,HD66_AAACCTGAGACCTTTG_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,0,0.002438,singlet,3076,1313,AT2,EP.AT2,AAACCTGAGACCTTTG,3pv2_5pv1_5pv2,EFO:0011025
HD66_AAACCTGAGAGATGAG_1_1,12220.0,2982,HD66_AAACCTGAGAGATGAG_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,0,0.002932,singlet,2578,1170,AT2,EP.AT2,AAACCTGAGAGATGAG,3pv2_5pv1_5pv2,EFO:0011025
HD66_AAACCTGAGCGTCTAT_1_1,2604.0,1371,HD66_AAACCTGAGCGTCTAT_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,0,0.000613,singlet,2802,1371,B & Plasma cells,B.Doublets,AAACCTGAGCGTCTAT,3pv2_5pv1_5pv2,EFO:0011025
HD66_AAACCTGAGGCACATG_1_1,2801.0,1226,HD66_AAACCTGAGGCACATG_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,0,0.000432,singlet,2905,1226,Monocyte/Macrophage,MYE.Monocyte,AAACCTGAGGCACATG,3pv2_5pv1_5pv2,EFO:0011025
HD66_AAACCTGCAACTTGAC_1_1,3347.0,1566,HD66_AAACCTGCAACTTGAC_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,0,0.001419,singlet,3158,1566,AT1,EP.AT1,AAACCTGCAACTTGAC,3pv2_5pv1_5pv2,EFO:0011025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGACTTCAAGTCGT-1_3_5,1355.0,813,TTTGACTTCAAGTCGT-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,0,0.064667,singlet,2210,852,T & NK cells,T.CD4+,TTTGACTTCAAGTCGT,3pv3,EFO:0009922
TTTGGTTAGGATGCGT-1_3_5,1336.0,927,TTTGGTTAGGATGCGT-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,0,0.037017,singlet,2174,968,B & Plasma cells,B.Plasma.IgA,TTTGGTTAGGATGCGT,3pv2_5pv1_5pv2+3pv3,EFO:0009922
TTTGGTTCATACTGAC-1_3_5,2056.0,1326,TTTGGTTCATACTGAC-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,0,0.142415,singlet,2343,1326,T & NK cells,T.CD4+,TTTGGTTCATACTGAC,3pv3,EFO:0009922
TTTGTTGAGCAGGCTA-1_3_5,3719.0,2139,TTTGTTGAGCAGGCTA-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,0,0.816544,singlet,3532,2139,Monocyte/Macrophage,MYE.Macrophage.Alveolar,TTTGTTGAGCAGGCTA,3pv3,EFO:0009922


In [75]:
del adata.obs['barcodes']
del adata.obs['assays']

#### **cell_type_ontology_term_id**

In [76]:
#identify the column in adata.obs related. to cell type annotation

In [77]:
adata.obs.columns

Index(['nCount_RNA', 'nFeature_RNA', 'cell_id', 'study.batch', 'donor', 'sex',
       'age', 'ethnicity', 'smoking_status', 'health_status', 'region',
       'version_10X', 'scn_source', 'percent_mito', 'percent_ribo',
       'scrublet.scores_samples', 'scrublet.predicted_db_samples',
       'scDblFinder.score_samples', 'scDblFinder.predicted_db_samples',
       'nCount_SCT', 'nFeature_SCT', 'cell_type_mid', 'cell_state_v2',
       'assay_ontology_term_id'],
      dtype='object')

In [78]:
list(adata.obs['cell_state_v2'].unique())

['EP.AT2',
 'B.Doublets',
 'MYE.Monocyte',
 'EP.AT1',
 'EP.Ciliated',
 'MYE.Macrophage.Alveolar',
 'T.CD4+',
 'EC.Lymphatic',
 'NK',
 'MYE.Macrophage.Intravascular',
 'EP.Secretory',
 'EC.Venous.Pulmonary',
 'EC.Doublets',
 'FB.Adventitial',
 'EC.Capillary',
 'FB.Myofibroblast',
 'EC.Aerocyte',
 'Vascular.Smooth.Muscle',
 'FB.Alveolar',
 'EC.Arterial',
 'Proliferating.MYE',
 'Proliferating.T.NK',
 'EP.Doublets',
 'B.Plasma.IgA',
 'T.CD8+',
 'EC.Venous.Systemic',
 'MYE.Macrophage.HSP',
 'Mast',
 'Pericytes',
 'B.Plasma.IgG',
 'FB.Basal',
 'MYE.Macrophage.Alveolar.Metallothionein.CCL',
 'MYE.Macrophage.CHIT1-like',
 'FB.Doublets',
 'B.Cell',
 'Erythrocytes',
 'Mesothelial']

In [79]:

df = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/covid_nanostring/Suppl_info/covid_nano_celltype.csv')


In [80]:
mapping = df.set_index('Original Cell Type')['CL ID'].to_dict()

In [81]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_state_v2'].map(mapping)

In [82]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_type_ontology_term_id'].astype('category')

In [83]:
# view adata.obs

In [84]:
adata.obs

Unnamed: 0,nCount_RNA,nFeature_RNA,cell_id,study.batch,donor,sex,age,ethnicity,smoking_status,health_status,...,scrublet.scores_samples,scrublet.predicted_db_samples,scDblFinder.score_samples,scDblFinder.predicted_db_samples,nCount_SCT,nFeature_SCT,cell_type_mid,cell_state_v2,assay_ontology_term_id,cell_type_ontology_term_id
HD66_AAACCTGAGACCTTTG_1_1,7022.0,1570,HD66_AAACCTGAGACCTTTG_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,0.020584,0,0.002438,singlet,3076,1313,AT2,EP.AT2,EFO:0011025,CL:0002063
HD66_AAACCTGAGAGATGAG_1_1,12220.0,2982,HD66_AAACCTGAGAGATGAG_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,0.019374,0,0.002932,singlet,2578,1170,AT2,EP.AT2,EFO:0011025,CL:0002063
HD66_AAACCTGAGCGTCTAT_1_1,2604.0,1371,HD66_AAACCTGAGCGTCTAT_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,0.038855,0,0.000613,singlet,2802,1371,B & Plasma cells,B.Doublets,EFO:0011025,CL:0000236
HD66_AAACCTGAGGCACATG_1_1,2801.0,1226,HD66_AAACCTGAGGCACATG_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,0.041154,0,0.000432,singlet,2905,1226,Monocyte/Macrophage,MYE.Monocyte,EFO:0011025,CL:0000576
HD66_AAACCTGCAACTTGAC_1_1,3347.0,1566,HD66_AAACCTGCAACTTGAC_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,0.038855,0,0.001419,singlet,3158,1566,AT1,EP.AT1,EFO:0011025,CL:0002062
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGACTTCAAGTCGT-1_3_5,1355.0,813,TTTGACTTCAAGTCGT-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,0.012835,0,0.064667,singlet,2210,852,T & NK cells,T.CD4+,EFO:0009922,CL:0000624
TTTGGTTAGGATGCGT-1_3_5,1336.0,927,TTTGGTTAGGATGCGT-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,0.037929,0,0.037017,singlet,2174,968,B & Plasma cells,B.Plasma.IgA,EFO:0009922,CL:0000987
TTTGGTTCATACTGAC-1_3_5,2056.0,1326,TTTGGTTCATACTGAC-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,0.057826,0,0.142415,singlet,2343,1326,T & NK cells,T.CD4+,EFO:0009922,CL:0000624
TTTGTTGAGCAGGCTA-1_3_5,3719.0,2139,TTTGTTGAGCAGGCTA-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,0.213389,0,0.816544,singlet,3532,2139,Monocyte/Macrophage,MYE.Macrophage.Alveolar,EFO:0009922,CL:0000583


#### **development_stage_ontology_term_id**

In [85]:
# identify the column in adata which corresponds to age

In [86]:
adata.obs.columns

Index(['nCount_RNA', 'nFeature_RNA', 'cell_id', 'study.batch', 'donor', 'sex',
       'age', 'ethnicity', 'smoking_status', 'health_status', 'region',
       'version_10X', 'scn_source', 'percent_mito', 'percent_ribo',
       'scrublet.scores_samples', 'scrublet.predicted_db_samples',
       'scDblFinder.score_samples', 'scDblFinder.predicted_db_samples',
       'nCount_SCT', 'nFeature_SCT', 'cell_type_mid', 'cell_state_v2',
       'assay_ontology_term_id', 'cell_type_ontology_term_id'],
      dtype='object')

In [87]:
list(adata.obs['age'].unique())

['30',
 '14',
 '31',
 '54',
 'unknown',
 '36',
 '40',
 '17',
 '38',
 '29',
 '41',
 '32',
 '64',
 '80',
 '56',
 '25',
 '49',
 '20',
 '46',
 '50',
 '66',
 '22',
 '65',
 '37',
 '21',
 '35',
 '23',
 '67',
 '61',
 '62',
 '48',
 '57',
 '63',
 '55',
 '47',
 '65:70',
 '40:45',
 '70:75',
 '55:60',
 '75',
 '51',
 '52',
 '28',
 '40:49',
 '30:49',
 '60:69',
 '45',
 '44',
 '79',
 '76',
 '50:59',
 '70:79',
 '>=80',
 '70',
 '69',
 '72',
 '68',
 '73',
 'NA',
 '78',
 '83',
 '84',
 '71',
 '58',
 '20:25',
 '60:65',
 '75:80']

In [88]:
filtered_df = adata.obs[adata.obs['age'] == '30:49']

unique_donors = filtered_df['donor'].unique()

print("Unique donors with age '30:49':")
print(unique_donors)

Unique donors with age '30:49':
['Donor_65', 'Donor_66', 'Donor_92']
Categories (128, object): ['Donor_55', 'Donor_77', 'Donor_26', 'Donor_54', ..., 'Donor_4', 'Donor_5', 'Donor_10', 'Donor_74']


In [89]:
age = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/covid_nanostring/Suppl_info/age.csv')

In [90]:
age_dict = pd.Series(age['development_stage_ontology_term_id'].values, index=age['Age']).to_dict()

In [91]:
age_dict

{'>=80': 'HsapDv:0000095',
 '14': 'HsapDv:0000108',
 '17': 'HsapDv:0000111',
 '20': 'HsapDv:0000114',
 '20:25': 'HsapDv:0000237',
 '21': 'HsapDv:0000115',
 '22': 'HsapDv:0000116',
 '23': 'HsapDv:0000117',
 '25': 'HsapDv:0000119',
 '28': 'HsapDv:0000122',
 '29': 'HsapDv:0000123',
 '30': 'HsapDv:0000124',
 '30:49': 'HsapDv:0000088',
 '31': 'HsapDv:0000125',
 '32': 'HsapDv:0000126',
 '35': 'HsapDv:0000129',
 '36': 'HsapDv:0000130',
 '37': 'HsapDv:0000131',
 '38': 'HsapDv:0000132',
 '40': 'HsapDv:0000134',
 '40:45': 'HsapDv:0000239',
 '40:49': 'HsapDv:0000239',
 '41': 'HsapDv:0000135',
 '44': 'HsapDv:0000138',
 '45': 'HsapDv:0000139',
 '46': 'HsapDv:0000140',
 '47': 'HsapDv:0000141',
 '48': 'HsapDv:0000142',
 '49': 'HsapDv:0000143',
 '50': 'HsapDv:0000144',
 '50:59': 'HsapDv:0000240',
 '51': 'HsapDv:0000145',
 '52': 'HsapDv:0000146',
 '54': 'HsapDv:0000148',
 '55': 'HsapDv:0000149',
 '55:60': 'HsapDv:0000240',
 '56': 'HsapDv:0000150',
 '57': 'HsapDv:0000151',
 '58': 'HsapDv:0000152',
 '60:

In [92]:
mapping={'>=80': 'HsapDv:0000095',
 '14': 'HsapDv:0000108',
 '17': 'HsapDv:0000111',
 '20': 'HsapDv:0000114',
 '20:25': 'HsapDv:0000237',
 '21': 'HsapDv:0000115',
 '22': 'HsapDv:0000116',
 '23': 'HsapDv:0000117',
 '25': 'HsapDv:0000119',
 '28': 'HsapDv:0000122',
 '29': 'HsapDv:0000123',
 '30': 'HsapDv:0000124',
 '30:49': 'HsapDv:0000088',
 '31': 'HsapDv:0000125',
 '32': 'HsapDv:0000126',
 '35': 'HsapDv:0000129',
 '36': 'HsapDv:0000130',
 '37': 'HsapDv:0000131',
 '38': 'HsapDv:0000132',
 '40': 'HsapDv:0000134',
 '40:45': 'HsapDv:0000239',
 '40:49': 'HsapDv:0000239',
 '41': 'HsapDv:0000135',
 '44': 'HsapDv:0000138',
 '45': 'HsapDv:0000139',
 '46': 'HsapDv:0000140',
 '47': 'HsapDv:0000141',
 '48': 'HsapDv:0000142',
 '49': 'HsapDv:0000143',
 '50': 'HsapDv:0000144',
 '50:59': 'HsapDv:0000240',
 '51': 'HsapDv:0000145',
 '52': 'HsapDv:0000146',
 '54': 'HsapDv:0000148',
 '55': 'HsapDv:0000149',
 '55:60': 'HsapDv:0000240',
 '56': 'HsapDv:0000150',
 '57': 'HsapDv:0000151',
 '58': 'HsapDv:0000152',
 '60:65': 'HsapDv:0000241',
 '60:69': 'HsapDv:0000241',
 '61': 'HsapDv:0000155',
 '62': 'HsapDv:0000156',
 '63': 'HsapDv:0000157',
 '64': 'HsapDv:0000158',
 '65': 'HsapDv:0000159',
 '65:70': 'HsapDv:0000241',
 '66': 'HsapDv:0000160',
 '67': 'HsapDv:0000161',
 '68': 'HsapDv:0000162',
 '69': 'HsapDv:0000163',
 '70': 'HsapDv:0000164',
 '70:75': 'HsapDv:0000242',
 '70:79': 'HsapDv:0000242',
 '71': 'HsapDv:0000165',
 '72': 'HsapDv:0000166',
 '73': 'HsapDv:0000167',
 '75': 'HsapDv:0000169',
 '75:80': 'HsapDv:0000242',
 '76': 'HsapDv:0000170',
 '78': 'HsapDv:0000172',
 '79': 'HsapDv:0000173',
 '80': 'HsapDv:0000206',
 '83': 'HsapDv:0000209',
 '84': 'HsapDv:0000210',
 'NA': 'unknown',
 'unknown': 'unknown'}

In [93]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['age'].map(mapping)

In [94]:
# change datatype of the column

In [95]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['development_stage_ontology_term_id'].astype('category')

In [96]:
# view unique values of development_stage_ontology_term_id column

In [97]:
list(adata.obs['development_stage_ontology_term_id'].unique())

['HsapDv:0000124',
 'HsapDv:0000108',
 'HsapDv:0000125',
 'HsapDv:0000148',
 'unknown',
 'HsapDv:0000130',
 'HsapDv:0000134',
 'HsapDv:0000111',
 'HsapDv:0000132',
 'HsapDv:0000123',
 'HsapDv:0000135',
 'HsapDv:0000126',
 'HsapDv:0000158',
 'HsapDv:0000206',
 'HsapDv:0000150',
 'HsapDv:0000119',
 'HsapDv:0000143',
 'HsapDv:0000114',
 'HsapDv:0000140',
 'HsapDv:0000144',
 'HsapDv:0000160',
 'HsapDv:0000116',
 'HsapDv:0000159',
 'HsapDv:0000131',
 'HsapDv:0000115',
 'HsapDv:0000129',
 'HsapDv:0000117',
 'HsapDv:0000161',
 'HsapDv:0000155',
 'HsapDv:0000156',
 'HsapDv:0000142',
 'HsapDv:0000151',
 'HsapDv:0000157',
 'HsapDv:0000149',
 'HsapDv:0000141',
 'HsapDv:0000241',
 'HsapDv:0000239',
 'HsapDv:0000242',
 'HsapDv:0000240',
 'HsapDv:0000169',
 'HsapDv:0000145',
 'HsapDv:0000146',
 'HsapDv:0000122',
 'HsapDv:0000088',
 'HsapDv:0000139',
 'HsapDv:0000138',
 'HsapDv:0000173',
 'HsapDv:0000170',
 'HsapDv:0000095',
 'HsapDv:0000164',
 'HsapDv:0000163',
 'HsapDv:0000166',
 'HsapDv:0000162',


In [98]:
filtered_rows = adata.obs[adata.obs['development_stage_ontology_term_id'] == "HsapDv:0000088"]


unique_donor_values = filtered_rows['donor'].unique()


ages_for_unique_donors = adata.obs[adata.obs['donor'].isin(unique_donor_values)]['age']

unique_age_values = ages_for_unique_donors.unique()

print(unique_age_values)

['30:49']
Categories (67, object): ['14', '17', '20', '20:25', ..., '84', '>=80', 'NA', 'unknown']


In [99]:
# view adata.obs

In [100]:
adata.obs

Unnamed: 0,nCount_RNA,nFeature_RNA,cell_id,study.batch,donor,sex,age,ethnicity,smoking_status,health_status,...,scrublet.predicted_db_samples,scDblFinder.score_samples,scDblFinder.predicted_db_samples,nCount_SCT,nFeature_SCT,cell_type_mid,cell_state_v2,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id
HD66_AAACCTGAGACCTTTG_1_1,7022.0,1570,HD66_AAACCTGAGACCTTTG_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,0,0.002438,singlet,3076,1313,AT2,EP.AT2,EFO:0011025,CL:0002063,HsapDv:0000124
HD66_AAACCTGAGAGATGAG_1_1,12220.0,2982,HD66_AAACCTGAGAGATGAG_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,0,0.002932,singlet,2578,1170,AT2,EP.AT2,EFO:0011025,CL:0002063,HsapDv:0000124
HD66_AAACCTGAGCGTCTAT_1_1,2604.0,1371,HD66_AAACCTGAGCGTCTAT_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,0,0.000613,singlet,2802,1371,B & Plasma cells,B.Doublets,EFO:0011025,CL:0000236,HsapDv:0000124
HD66_AAACCTGAGGCACATG_1_1,2801.0,1226,HD66_AAACCTGAGGCACATG_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,0,0.000432,singlet,2905,1226,Monocyte/Macrophage,MYE.Monocyte,EFO:0011025,CL:0000576,HsapDv:0000124
HD66_AAACCTGCAACTTGAC_1_1,3347.0,1566,HD66_AAACCTGCAACTTGAC_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,0,0.001419,singlet,3158,1566,AT1,EP.AT1,EFO:0011025,CL:0002062,HsapDv:0000124
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGACTTCAAGTCGT-1_3_5,1355.0,813,TTTGACTTCAAGTCGT-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,0,0.064667,singlet,2210,852,T & NK cells,T.CD4+,EFO:0009922,CL:0000624,HsapDv:0000242
TTTGGTTAGGATGCGT-1_3_5,1336.0,927,TTTGGTTAGGATGCGT-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,0,0.037017,singlet,2174,968,B & Plasma cells,B.Plasma.IgA,EFO:0009922,CL:0000987,HsapDv:0000242
TTTGGTTCATACTGAC-1_3_5,2056.0,1326,TTTGGTTCATACTGAC-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,0,0.142415,singlet,2343,1326,T & NK cells,T.CD4+,EFO:0009922,CL:0000624,HsapDv:0000242
TTTGTTGAGCAGGCTA-1_3_5,3719.0,2139,TTTGTTGAGCAGGCTA-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,0,0.816544,singlet,3532,2139,Monocyte/Macrophage,MYE.Macrophage.Alveolar,EFO:0009922,CL:0000583,HsapDv:0000242


#### **donor_id**

In [101]:
#identify the column in adata.obs which provides donor information

In [102]:
adata.obs.columns

Index(['nCount_RNA', 'nFeature_RNA', 'cell_id', 'study.batch', 'donor', 'sex',
       'age', 'ethnicity', 'smoking_status', 'health_status', 'region',
       'version_10X', 'scn_source', 'percent_mito', 'percent_ribo',
       'scrublet.scores_samples', 'scrublet.predicted_db_samples',
       'scDblFinder.score_samples', 'scDblFinder.predicted_db_samples',
       'nCount_SCT', 'nFeature_SCT', 'cell_type_mid', 'cell_state_v2',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id'],
      dtype='object')

In [103]:
# add the donor_id column

In [104]:
adata.obs['donor_id'] = adata.obs['donor']

In [105]:
#adata.obs['donor_id'] = ['unknown'] * len(adata.obs['names'])

In [106]:
# change datatype of the column

In [107]:
adata.obs['donor_id'] = adata.obs['donor_id'].astype('category')

In [108]:
# view unique values of donor_id column

In [109]:
list(adata.obs['donor_id'].unique())

['Donor_1',
 'Donor_2',
 'Donor_3',
 'Donor_4',
 'Donor_5',
 'Donor_6',
 'Donor_7',
 'Donor_8',
 'Donor_9',
 'Donor_10',
 'Donor_11',
 'Donor_12',
 'Donor_13',
 'Donor_14',
 'Donor_15',
 'Donor_16',
 'Donor_17',
 'Donor_18',
 'Donor_19',
 'Donor_20',
 'Donor_21',
 'Donor_22',
 'Donor_23',
 'Donor_24',
 'Donor_25',
 'Donor_26',
 'Donor_27',
 'Donor_28',
 'Donor_29',
 'Donor_30',
 'Donor_31',
 'Donor_32',
 'Donor_33',
 'Donor_34',
 'Donor_35',
 'Donor_36',
 'Donor_37',
 'Donor_38',
 'Donor_39',
 'Donor_40',
 'Donor_41',
 'Donor_42',
 'Donor_43',
 'Donor_44',
 'Donor_45',
 'Donor_46',
 'Donor_47',
 'Donor_48',
 'Donor_49',
 'Donor_50',
 'Donor_51',
 'Donor_52',
 'Donor_53',
 'Donor_54',
 'Donor_55',
 'Donor_56',
 'Donor_57',
 'Donor_58',
 'Donor_59',
 'Donor_60',
 'Donor_61',
 'Donor_62',
 'Donor_63',
 'Donor_64',
 'Donor_65',
 'Donor_66',
 'Donor_67',
 'Donor_68',
 'Donor_69',
 'Donor_70',
 'Donor_71',
 'Donor_72',
 'Donor_73',
 'Donor_74',
 'Donor_75',
 'Donor_76',
 'Donor_77',
 'Donor_

In [110]:
#view obs

In [111]:
adata.obs

Unnamed: 0,nCount_RNA,nFeature_RNA,cell_id,study.batch,donor,sex,age,ethnicity,smoking_status,health_status,...,scDblFinder.score_samples,scDblFinder.predicted_db_samples,nCount_SCT,nFeature_SCT,cell_type_mid,cell_state_v2,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id
HD66_AAACCTGAGACCTTTG_1_1,7022.0,1570,HD66_AAACCTGAGACCTTTG_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,0.002438,singlet,3076,1313,AT2,EP.AT2,EFO:0011025,CL:0002063,HsapDv:0000124,Donor_1
HD66_AAACCTGAGAGATGAG_1_1,12220.0,2982,HD66_AAACCTGAGAGATGAG_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,0.002932,singlet,2578,1170,AT2,EP.AT2,EFO:0011025,CL:0002063,HsapDv:0000124,Donor_1
HD66_AAACCTGAGCGTCTAT_1_1,2604.0,1371,HD66_AAACCTGAGCGTCTAT_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,0.000613,singlet,2802,1371,B & Plasma cells,B.Doublets,EFO:0011025,CL:0000236,HsapDv:0000124,Donor_1
HD66_AAACCTGAGGCACATG_1_1,2801.0,1226,HD66_AAACCTGAGGCACATG_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,0.000432,singlet,2905,1226,Monocyte/Macrophage,MYE.Monocyte,EFO:0011025,CL:0000576,HsapDv:0000124,Donor_1
HD66_AAACCTGCAACTTGAC_1_1,3347.0,1566,HD66_AAACCTGCAACTTGAC_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,0.001419,singlet,3158,1566,AT1,EP.AT1,EFO:0011025,CL:0002062,HsapDv:0000124,Donor_1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGACTTCAAGTCGT-1_3_5,1355.0,813,TTTGACTTCAAGTCGT-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,0.064667,singlet,2210,852,T & NK cells,T.CD4+,EFO:0009922,CL:0000624,HsapDv:0000242,Donor_128
TTTGGTTAGGATGCGT-1_3_5,1336.0,927,TTTGGTTAGGATGCGT-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,0.037017,singlet,2174,968,B & Plasma cells,B.Plasma.IgA,EFO:0009922,CL:0000987,HsapDv:0000242,Donor_128
TTTGGTTCATACTGAC-1_3_5,2056.0,1326,TTTGGTTCATACTGAC-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,0.142415,singlet,2343,1326,T & NK cells,T.CD4+,EFO:0009922,CL:0000624,HsapDv:0000242,Donor_128
TTTGTTGAGCAGGCTA-1_3_5,3719.0,2139,TTTGTTGAGCAGGCTA-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,0.816544,singlet,3532,2139,Monocyte/Macrophage,MYE.Macrophage.Alveolar,EFO:0009922,CL:0000583,HsapDv:0000242,Donor_128


In [112]:
adata.obs.columns

Index(['nCount_RNA', 'nFeature_RNA', 'cell_id', 'study.batch', 'donor', 'sex',
       'age', 'ethnicity', 'smoking_status', 'health_status', 'region',
       'version_10X', 'scn_source', 'percent_mito', 'percent_ribo',
       'scrublet.scores_samples', 'scrublet.predicted_db_samples',
       'scDblFinder.score_samples', 'scDblFinder.predicted_db_samples',
       'nCount_SCT', 'nFeature_SCT', 'cell_type_mid', 'cell_state_v2',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'donor_id'],
      dtype='object')

#### **disease_ontology_term_id**

In [113]:
list(adata.obs['health_status'].unique())

['Healthy', 'COVID-19']

In [114]:
mapping={'Healthy' :'PATO:0000461', 'COVID-19' :'MONDO:0100096'}

In [115]:
adata.obs['disease_ontology_term_id']= adata.obs['health_status'].map(mapping)

In [116]:
adata.obs

Unnamed: 0,nCount_RNA,nFeature_RNA,cell_id,study.batch,donor,sex,age,ethnicity,smoking_status,health_status,...,scDblFinder.predicted_db_samples,nCount_SCT,nFeature_SCT,cell_type_mid,cell_state_v2,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id
HD66_AAACCTGAGACCTTTG_1_1,7022.0,1570,HD66_AAACCTGAGACCTTTG_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,singlet,3076,1313,AT2,EP.AT2,EFO:0011025,CL:0002063,HsapDv:0000124,Donor_1,PATO:0000461
HD66_AAACCTGAGAGATGAG_1_1,12220.0,2982,HD66_AAACCTGAGAGATGAG_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,singlet,2578,1170,AT2,EP.AT2,EFO:0011025,CL:0002063,HsapDv:0000124,Donor_1,PATO:0000461
HD66_AAACCTGAGCGTCTAT_1_1,2604.0,1371,HD66_AAACCTGAGCGTCTAT_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,singlet,2802,1371,B & Plasma cells,B.Doublets,EFO:0011025,CL:0000236,HsapDv:0000124,Donor_1,PATO:0000461
HD66_AAACCTGAGGCACATG_1_1,2801.0,1226,HD66_AAACCTGAGGCACATG_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,singlet,2905,1226,Monocyte/Macrophage,MYE.Monocyte,EFO:0011025,CL:0000576,HsapDv:0000124,Donor_1,PATO:0000461
HD66_AAACCTGCAACTTGAC_1_1,3347.0,1566,HD66_AAACCTGCAACTTGAC_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,singlet,3158,1566,AT1,EP.AT1,EFO:0011025,CL:0002062,HsapDv:0000124,Donor_1,PATO:0000461
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGACTTCAAGTCGT-1_3_5,1355.0,813,TTTGACTTCAAGTCGT-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,singlet,2210,852,T & NK cells,T.CD4+,EFO:0009922,CL:0000624,HsapDv:0000242,Donor_128,MONDO:0100096
TTTGGTTAGGATGCGT-1_3_5,1336.0,927,TTTGGTTAGGATGCGT-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,singlet,2174,968,B & Plasma cells,B.Plasma.IgA,EFO:0009922,CL:0000987,HsapDv:0000242,Donor_128,MONDO:0100096
TTTGGTTCATACTGAC-1_3_5,2056.0,1326,TTTGGTTCATACTGAC-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,singlet,2343,1326,T & NK cells,T.CD4+,EFO:0009922,CL:0000624,HsapDv:0000242,Donor_128,MONDO:0100096
TTTGTTGAGCAGGCTA-1_3_5,3719.0,2139,TTTGTTGAGCAGGCTA-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,singlet,3532,2139,Monocyte/Macrophage,MYE.Macrophage.Alveolar,EFO:0009922,CL:0000583,HsapDv:0000242,Donor_128,MONDO:0100096


In [117]:
# change datatype of the column

In [118]:
adata.obs['disease_ontology_term_id'] = adata.obs['disease_ontology_term_id'].astype('category')

In [119]:
# view obs

In [120]:
adata.obs

Unnamed: 0,nCount_RNA,nFeature_RNA,cell_id,study.batch,donor,sex,age,ethnicity,smoking_status,health_status,...,scDblFinder.predicted_db_samples,nCount_SCT,nFeature_SCT,cell_type_mid,cell_state_v2,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id
HD66_AAACCTGAGACCTTTG_1_1,7022.0,1570,HD66_AAACCTGAGACCTTTG_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,singlet,3076,1313,AT2,EP.AT2,EFO:0011025,CL:0002063,HsapDv:0000124,Donor_1,PATO:0000461
HD66_AAACCTGAGAGATGAG_1_1,12220.0,2982,HD66_AAACCTGAGAGATGAG_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,singlet,2578,1170,AT2,EP.AT2,EFO:0011025,CL:0002063,HsapDv:0000124,Donor_1,PATO:0000461
HD66_AAACCTGAGCGTCTAT_1_1,2604.0,1371,HD66_AAACCTGAGCGTCTAT_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,singlet,2802,1371,B & Plasma cells,B.Doublets,EFO:0011025,CL:0000236,HsapDv:0000124,Donor_1,PATO:0000461
HD66_AAACCTGAGGCACATG_1_1,2801.0,1226,HD66_AAACCTGAGGCACATG_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,singlet,2905,1226,Monocyte/Macrophage,MYE.Monocyte,EFO:0011025,CL:0000576,HsapDv:0000124,Donor_1,PATO:0000461
HD66_AAACCTGCAACTTGAC_1_1,3347.0,1566,HD66_AAACCTGCAACTTGAC_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,singlet,3158,1566,AT1,EP.AT1,EFO:0011025,CL:0002062,HsapDv:0000124,Donor_1,PATO:0000461
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGACTTCAAGTCGT-1_3_5,1355.0,813,TTTGACTTCAAGTCGT-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,singlet,2210,852,T & NK cells,T.CD4+,EFO:0009922,CL:0000624,HsapDv:0000242,Donor_128,MONDO:0100096
TTTGGTTAGGATGCGT-1_3_5,1336.0,927,TTTGGTTAGGATGCGT-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,singlet,2174,968,B & Plasma cells,B.Plasma.IgA,EFO:0009922,CL:0000987,HsapDv:0000242,Donor_128,MONDO:0100096
TTTGGTTCATACTGAC-1_3_5,2056.0,1326,TTTGGTTCATACTGAC-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,singlet,2343,1326,T & NK cells,T.CD4+,EFO:0009922,CL:0000624,HsapDv:0000242,Donor_128,MONDO:0100096
TTTGTTGAGCAGGCTA-1_3_5,3719.0,2139,TTTGTTGAGCAGGCTA-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,singlet,3532,2139,Monocyte/Macrophage,MYE.Macrophage.Alveolar,EFO:0009922,CL:0000583,HsapDv:0000242,Donor_128,MONDO:0100096


In [121]:
list(adata.obs['disease_ontology_term_id'].unique())

['PATO:0000461', 'MONDO:0100096']

#### **is_primary_data**

In [122]:
#change data type of column

In [123]:
list(adata.obs['study.batch'].unique())

['Habermann_2020',
 'Adams_2020',
 'Reyfman_2019',
 'Madissoon_2020',
 'Morse_2019',
 'Travaglini_2020',
 'Bharat_2020',
 'Delorey_2021',
 'Lukassen_2020',
 'Melms_2021',
 'Noseda_Bayraktar_2021']

In [124]:
mapping = {'Habermann_2020':False,
 'Adams_2020':False,
 'Reyfman_2019':False,
 'Madissoon_2020':False,
 'Morse_2019':False,
 'Travaglini_2020':False,
 'Bharat_2020':False,
 'Delorey_2021':False,
 'Lukassen_2020':False,
 'Melms_2021':False,
 'Noseda_Bayraktar_2021':True}

In [125]:
adata.obs['is_primary_data']= adata.obs['study.batch'].map(mapping)

In [126]:
adata.obs['is_primary_data'] = adata.obs['is_primary_data'].astype('bool')

In [127]:
# view obs

In [128]:
adata.obs

Unnamed: 0,nCount_RNA,nFeature_RNA,cell_id,study.batch,donor,sex,age,ethnicity,smoking_status,health_status,...,nCount_SCT,nFeature_SCT,cell_type_mid,cell_state_v2,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data
HD66_AAACCTGAGACCTTTG_1_1,7022.0,1570,HD66_AAACCTGAGACCTTTG_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,3076,1313,AT2,EP.AT2,EFO:0011025,CL:0002063,HsapDv:0000124,Donor_1,PATO:0000461,False
HD66_AAACCTGAGAGATGAG_1_1,12220.0,2982,HD66_AAACCTGAGAGATGAG_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,2578,1170,AT2,EP.AT2,EFO:0011025,CL:0002063,HsapDv:0000124,Donor_1,PATO:0000461,False
HD66_AAACCTGAGCGTCTAT_1_1,2604.0,1371,HD66_AAACCTGAGCGTCTAT_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,2802,1371,B & Plasma cells,B.Doublets,EFO:0011025,CL:0000236,HsapDv:0000124,Donor_1,PATO:0000461,False
HD66_AAACCTGAGGCACATG_1_1,2801.0,1226,HD66_AAACCTGAGGCACATG_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,2905,1226,Monocyte/Macrophage,MYE.Monocyte,EFO:0011025,CL:0000576,HsapDv:0000124,Donor_1,PATO:0000461,False
HD66_AAACCTGCAACTTGAC_1_1,3347.0,1566,HD66_AAACCTGCAACTTGAC_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,3158,1566,AT1,EP.AT1,EFO:0011025,CL:0002062,HsapDv:0000124,Donor_1,PATO:0000461,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGACTTCAAGTCGT-1_3_5,1355.0,813,TTTGACTTCAAGTCGT-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,2210,852,T & NK cells,T.CD4+,EFO:0009922,CL:0000624,HsapDv:0000242,Donor_128,MONDO:0100096,True
TTTGGTTAGGATGCGT-1_3_5,1336.0,927,TTTGGTTAGGATGCGT-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,2174,968,B & Plasma cells,B.Plasma.IgA,EFO:0009922,CL:0000987,HsapDv:0000242,Donor_128,MONDO:0100096,True
TTTGGTTCATACTGAC-1_3_5,2056.0,1326,TTTGGTTCATACTGAC-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,2343,1326,T & NK cells,T.CD4+,EFO:0009922,CL:0000624,HsapDv:0000242,Donor_128,MONDO:0100096,True
TTTGTTGAGCAGGCTA-1_3_5,3719.0,2139,TTTGTTGAGCAGGCTA-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,3532,2139,Monocyte/Macrophage,MYE.Macrophage.Alveolar,EFO:0009922,CL:0000583,HsapDv:0000242,Donor_128,MONDO:0100096,True


In [129]:
list(adata.obs['is_primary_data'].unique())

[False, True]

#### **organism_ontology_term_id**

In [130]:
# assign organism id 

In [131]:
adata.obs['organism_ontology_term_id'] = ['NCBITaxon:9606'] * len(adata.obs)

In [132]:
#change data type of column

In [133]:
adata.obs['organism_ontology_term_id'] = adata.obs['organism_ontology_term_id'].astype('category')

In [134]:
# view obs

In [135]:
adata.obs

Unnamed: 0,nCount_RNA,nFeature_RNA,cell_id,study.batch,donor,sex,age,ethnicity,smoking_status,health_status,...,nFeature_SCT,cell_type_mid,cell_state_v2,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id
HD66_AAACCTGAGACCTTTG_1_1,7022.0,1570,HD66_AAACCTGAGACCTTTG_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,1313,AT2,EP.AT2,EFO:0011025,CL:0002063,HsapDv:0000124,Donor_1,PATO:0000461,False,NCBITaxon:9606
HD66_AAACCTGAGAGATGAG_1_1,12220.0,2982,HD66_AAACCTGAGAGATGAG_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,1170,AT2,EP.AT2,EFO:0011025,CL:0002063,HsapDv:0000124,Donor_1,PATO:0000461,False,NCBITaxon:9606
HD66_AAACCTGAGCGTCTAT_1_1,2604.0,1371,HD66_AAACCTGAGCGTCTAT_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,1371,B & Plasma cells,B.Doublets,EFO:0011025,CL:0000236,HsapDv:0000124,Donor_1,PATO:0000461,False,NCBITaxon:9606
HD66_AAACCTGAGGCACATG_1_1,2801.0,1226,HD66_AAACCTGAGGCACATG_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,1226,Monocyte/Macrophage,MYE.Monocyte,EFO:0011025,CL:0000576,HsapDv:0000124,Donor_1,PATO:0000461,False,NCBITaxon:9606
HD66_AAACCTGCAACTTGAC_1_1,3347.0,1566,HD66_AAACCTGCAACTTGAC_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,1566,AT1,EP.AT1,EFO:0011025,CL:0002062,HsapDv:0000124,Donor_1,PATO:0000461,False,NCBITaxon:9606
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGACTTCAAGTCGT-1_3_5,1355.0,813,TTTGACTTCAAGTCGT-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,852,T & NK cells,T.CD4+,EFO:0009922,CL:0000624,HsapDv:0000242,Donor_128,MONDO:0100096,True,NCBITaxon:9606
TTTGGTTAGGATGCGT-1_3_5,1336.0,927,TTTGGTTAGGATGCGT-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,968,B & Plasma cells,B.Plasma.IgA,EFO:0009922,CL:0000987,HsapDv:0000242,Donor_128,MONDO:0100096,True,NCBITaxon:9606
TTTGGTTCATACTGAC-1_3_5,2056.0,1326,TTTGGTTCATACTGAC-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,1326,T & NK cells,T.CD4+,EFO:0009922,CL:0000624,HsapDv:0000242,Donor_128,MONDO:0100096,True,NCBITaxon:9606
TTTGTTGAGCAGGCTA-1_3_5,3719.0,2139,TTTGTTGAGCAGGCTA-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,2139,Monocyte/Macrophage,MYE.Macrophage.Alveolar,EFO:0009922,CL:0000583,HsapDv:0000242,Donor_128,MONDO:0100096,True,NCBITaxon:9606


#### **self_reported_ethnicity_ontology_term_id**

In [136]:
list(adata.obs['ethnicity'].unique())

['White',
 'Unknown',
 'Black',
 'Hispanic',
 'Asian',
 'White_not_hispanic',
 'Black_hispanic',
 'Asian_not_hispanic',
 'Not_hispanic']

In [137]:
filtered_df = adata.obs[(adata.obs['ethnicity'] == 'Black') & (adata.obs['study.batch'] == 'Delorey_2021')]

unique_donors_count = filtered_df['donor'].nunique()

print(f"Number of unique donors with ethnicity 'black' and study 'Delorey_2021': {unique_donors_count}")


Number of unique donors with ethnicity 'black' and study 'Delorey_2021': 8


In [138]:
mapping={'White':'HANCESTRO:0005',
 'Unknown':'unknown',
 'Black':'HANCESTRO:0010',
 'Hispanic':'HANCESTRO:0014',
 'Asian':'HANCESTRO:0008',
 'White_not_hispanic':'HANCESTRO:0005',
 'Black_hispanic':'HANCESTRO:0014',
 'Asian_not_hispanic':'HANCESTRO:0008',
 'Not_hispanic':'unknown'}

In [139]:
adata.obs['self_reported_ethnicity_ontology_term_id'] =  adata.obs['ethnicity'].map(mapping)

In [140]:
# change data type

In [141]:
adata.obs['self_reported_ethnicity_ontology_term_id'] = adata.obs['self_reported_ethnicity_ontology_term_id'].astype('category')

In [142]:
# view obs

In [143]:
adata.obs

Unnamed: 0,nCount_RNA,nFeature_RNA,cell_id,study.batch,donor,sex,age,ethnicity,smoking_status,health_status,...,cell_type_mid,cell_state_v2,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id
HD66_AAACCTGAGACCTTTG_1_1,7022.0,1570,HD66_AAACCTGAGACCTTTG_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,AT2,EP.AT2,EFO:0011025,CL:0002063,HsapDv:0000124,Donor_1,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005
HD66_AAACCTGAGAGATGAG_1_1,12220.0,2982,HD66_AAACCTGAGAGATGAG_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,AT2,EP.AT2,EFO:0011025,CL:0002063,HsapDv:0000124,Donor_1,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005
HD66_AAACCTGAGCGTCTAT_1_1,2604.0,1371,HD66_AAACCTGAGCGTCTAT_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,B & Plasma cells,B.Doublets,EFO:0011025,CL:0000236,HsapDv:0000124,Donor_1,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005
HD66_AAACCTGAGGCACATG_1_1,2801.0,1226,HD66_AAACCTGAGGCACATG_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,Monocyte/Macrophage,MYE.Monocyte,EFO:0011025,CL:0000576,HsapDv:0000124,Donor_1,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005
HD66_AAACCTGCAACTTGAC_1_1,3347.0,1566,HD66_AAACCTGCAACTTGAC_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,AT1,EP.AT1,EFO:0011025,CL:0002062,HsapDv:0000124,Donor_1,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGACTTCAAGTCGT-1_3_5,1355.0,813,TTTGACTTCAAGTCGT-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,T & NK cells,T.CD4+,EFO:0009922,CL:0000624,HsapDv:0000242,Donor_128,MONDO:0100096,True,NCBITaxon:9606,unknown
TTTGGTTAGGATGCGT-1_3_5,1336.0,927,TTTGGTTAGGATGCGT-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,B & Plasma cells,B.Plasma.IgA,EFO:0009922,CL:0000987,HsapDv:0000242,Donor_128,MONDO:0100096,True,NCBITaxon:9606,unknown
TTTGGTTCATACTGAC-1_3_5,2056.0,1326,TTTGGTTCATACTGAC-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,T & NK cells,T.CD4+,EFO:0009922,CL:0000624,HsapDv:0000242,Donor_128,MONDO:0100096,True,NCBITaxon:9606,unknown
TTTGTTGAGCAGGCTA-1_3_5,3719.0,2139,TTTGTTGAGCAGGCTA-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,Monocyte/Macrophage,MYE.Macrophage.Alveolar,EFO:0009922,CL:0000583,HsapDv:0000242,Donor_128,MONDO:0100096,True,NCBITaxon:9606,unknown


In [144]:
adata.obs.columns

Index(['nCount_RNA', 'nFeature_RNA', 'cell_id', 'study.batch', 'donor', 'sex',
       'age', 'ethnicity', 'smoking_status', 'health_status', 'region',
       'version_10X', 'scn_source', 'percent_mito', 'percent_ribo',
       'scrublet.scores_samples', 'scrublet.predicted_db_samples',
       'scDblFinder.score_samples', 'scDblFinder.predicted_db_samples',
       'nCount_SCT', 'nFeature_SCT', 'cell_type_mid', 'cell_state_v2',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'donor_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id'],
      dtype='object')

In [145]:
adata.obs['donor_ethnicity'] = adata.obs['ethnicity']

#### **sex_ontology_term_id**

In [146]:
# identify the column in adata.obs which corresponds to sex

In [147]:
adata.obs.columns

Index(['nCount_RNA', 'nFeature_RNA', 'cell_id', 'study.batch', 'donor', 'sex',
       'age', 'ethnicity', 'smoking_status', 'health_status', 'region',
       'version_10X', 'scn_source', 'percent_mito', 'percent_ribo',
       'scrublet.scores_samples', 'scrublet.predicted_db_samples',
       'scDblFinder.score_samples', 'scDblFinder.predicted_db_samples',
       'nCount_SCT', 'nFeature_SCT', 'cell_type_mid', 'cell_state_v2',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'donor_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'donor_ethnicity'],
      dtype='object')

In [148]:
list(adata.obs['sex'].unique())

['Male', 'Female']

In [149]:
# list the unique values 

In [150]:
mapping= {'Female': 'PATO:0000383', 'Male': 'PATO:0000384', 'unknown':'unknown'}

In [151]:
# add sex_ontology_term_id column

In [152]:
adata.obs['sex_ontology_term_id'] = adata.obs['sex'].map(mapping)

In [153]:
# change data type

In [154]:
adata.obs['sex_ontology_term_id'] = adata.obs['sex_ontology_term_id'].astype('category')

In [155]:
adata.obs

Unnamed: 0,nCount_RNA,nFeature_RNA,cell_id,study.batch,donor,sex,age,ethnicity,smoking_status,health_status,...,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,donor_ethnicity,sex_ontology_term_id
HD66_AAACCTGAGACCTTTG_1_1,7022.0,1570,HD66_AAACCTGAGACCTTTG_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,EFO:0011025,CL:0002063,HsapDv:0000124,Donor_1,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,White,PATO:0000384
HD66_AAACCTGAGAGATGAG_1_1,12220.0,2982,HD66_AAACCTGAGAGATGAG_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,EFO:0011025,CL:0002063,HsapDv:0000124,Donor_1,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,White,PATO:0000384
HD66_AAACCTGAGCGTCTAT_1_1,2604.0,1371,HD66_AAACCTGAGCGTCTAT_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,EFO:0011025,CL:0000236,HsapDv:0000124,Donor_1,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,White,PATO:0000384
HD66_AAACCTGAGGCACATG_1_1,2801.0,1226,HD66_AAACCTGAGGCACATG_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,EFO:0011025,CL:0000576,HsapDv:0000124,Donor_1,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,White,PATO:0000384
HD66_AAACCTGCAACTTGAC_1_1,3347.0,1566,HD66_AAACCTGCAACTTGAC_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,EFO:0011025,CL:0002062,HsapDv:0000124,Donor_1,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,White,PATO:0000384
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGACTTCAAGTCGT-1_3_5,1355.0,813,TTTGACTTCAAGTCGT-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,EFO:0009922,CL:0000624,HsapDv:0000242,Donor_128,MONDO:0100096,True,NCBITaxon:9606,unknown,Unknown,PATO:0000384
TTTGGTTAGGATGCGT-1_3_5,1336.0,927,TTTGGTTAGGATGCGT-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,EFO:0009922,CL:0000987,HsapDv:0000242,Donor_128,MONDO:0100096,True,NCBITaxon:9606,unknown,Unknown,PATO:0000384
TTTGGTTCATACTGAC-1_3_5,2056.0,1326,TTTGGTTCATACTGAC-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,EFO:0009922,CL:0000624,HsapDv:0000242,Donor_128,MONDO:0100096,True,NCBITaxon:9606,unknown,Unknown,PATO:0000384
TTTGTTGAGCAGGCTA-1_3_5,3719.0,2139,TTTGTTGAGCAGGCTA-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,EFO:0009922,CL:0000583,HsapDv:0000242,Donor_128,MONDO:0100096,True,NCBITaxon:9606,unknown,Unknown,PATO:0000384


#### **suspension_type**

In [156]:
adata.obs.columns

Index(['nCount_RNA', 'nFeature_RNA', 'cell_id', 'study.batch', 'donor', 'sex',
       'age', 'ethnicity', 'smoking_status', 'health_status', 'region',
       'version_10X', 'scn_source', 'percent_mito', 'percent_ribo',
       'scrublet.scores_samples', 'scrublet.predicted_db_samples',
       'scDblFinder.score_samples', 'scDblFinder.predicted_db_samples',
       'nCount_SCT', 'nFeature_SCT', 'cell_type_mid', 'cell_state_v2',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'donor_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'donor_ethnicity', 'sex_ontology_term_id'],
      dtype='object')

In [157]:
list(adata.obs['scn_source'].unique())

['Cells', 'Nuclei']

In [158]:
mapping={'Cells':'cell', 'Nuclei':'nucleus'}

In [159]:
adata.obs['suspension_type'] = adata.obs['scn_source'].map(mapping)

In [160]:
# change data type of column

In [161]:
adata.obs['suspension_type'] = adata.obs['suspension_type'].astype('category')

In [162]:
# view obs

In [163]:
adata.obs

Unnamed: 0,nCount_RNA,nFeature_RNA,cell_id,study.batch,donor,sex,age,ethnicity,smoking_status,health_status,...,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,donor_ethnicity,sex_ontology_term_id,suspension_type
HD66_AAACCTGAGACCTTTG_1_1,7022.0,1570,HD66_AAACCTGAGACCTTTG_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,CL:0002063,HsapDv:0000124,Donor_1,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,White,PATO:0000384,cell
HD66_AAACCTGAGAGATGAG_1_1,12220.0,2982,HD66_AAACCTGAGAGATGAG_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,CL:0002063,HsapDv:0000124,Donor_1,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,White,PATO:0000384,cell
HD66_AAACCTGAGCGTCTAT_1_1,2604.0,1371,HD66_AAACCTGAGCGTCTAT_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,CL:0000236,HsapDv:0000124,Donor_1,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,White,PATO:0000384,cell
HD66_AAACCTGAGGCACATG_1_1,2801.0,1226,HD66_AAACCTGAGGCACATG_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,CL:0000576,HsapDv:0000124,Donor_1,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,White,PATO:0000384,cell
HD66_AAACCTGCAACTTGAC_1_1,3347.0,1566,HD66_AAACCTGCAACTTGAC_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,CL:0002062,HsapDv:0000124,Donor_1,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,White,PATO:0000384,cell
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGACTTCAAGTCGT-1_3_5,1355.0,813,TTTGACTTCAAGTCGT-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,CL:0000624,HsapDv:0000242,Donor_128,MONDO:0100096,True,NCBITaxon:9606,unknown,Unknown,PATO:0000384,nucleus
TTTGGTTAGGATGCGT-1_3_5,1336.0,927,TTTGGTTAGGATGCGT-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,CL:0000987,HsapDv:0000242,Donor_128,MONDO:0100096,True,NCBITaxon:9606,unknown,Unknown,PATO:0000384,nucleus
TTTGGTTCATACTGAC-1_3_5,2056.0,1326,TTTGGTTCATACTGAC-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,CL:0000624,HsapDv:0000242,Donor_128,MONDO:0100096,True,NCBITaxon:9606,unknown,Unknown,PATO:0000384,nucleus
TTTGTTGAGCAGGCTA-1_3_5,3719.0,2139,TTTGTTGAGCAGGCTA-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,CL:0000583,HsapDv:0000242,Donor_128,MONDO:0100096,True,NCBITaxon:9606,unknown,Unknown,PATO:0000384,nucleus


#### **tissue_type**

In [164]:
adata.obs['tissue_type'] = ['tissue'] * len(adata.obs)

In [165]:
adata.obs['tissue_type'] = adata.obs['tissue_type'].astype('category')

#### **tissue_ontology_term_id**

In [166]:
# identify the column in adata.obs which corresponds to tissue

In [167]:
adata.obs.columns

Index(['nCount_RNA', 'nFeature_RNA', 'cell_id', 'study.batch', 'donor', 'sex',
       'age', 'ethnicity', 'smoking_status', 'health_status', 'region',
       'version_10X', 'scn_source', 'percent_mito', 'percent_ribo',
       'scrublet.scores_samples', 'scrublet.predicted_db_samples',
       'scDblFinder.score_samples', 'scDblFinder.predicted_db_samples',
       'nCount_SCT', 'nFeature_SCT', 'cell_type_mid', 'cell_state_v2',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'donor_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'donor_ethnicity', 'sex_ontology_term_id', 'suspension_type',
       'tissue_type'],
      dtype='object')

In [168]:
list(adata.obs['region'].unique())

['L lower lobe',
 'lung parenchyma',
 'distal lung parenchyma & bronchi (?)',
 'upper lobe',
 'lower lobe',
 'R middle lobe',
 'L upper lobe',
 'upper L lobe',
 'upper-lower L-R lobes',
 'upper L-R lobes',
 'Parenchyma']

In [169]:
mapping= {'L lower lobe':'UBERON:0008953',
 'lung parenchyma':'UBERON:0008946',
 'distal lung parenchyma & bronchi (?)':'UBERON:0008946',
 'upper lobe':'UBERON:0008948',
 'lower lobe':'UBERON:0008949',
 'R middle lobe':'UBERON:0002174',
 'L upper lobe':'UBERON:0008952',
 'upper L lobe':'UBERON:0008952',
 'upper-lower L-R lobes':'UBERON:0000101',
 'upper L-R lobes':'UBERON:0008948',
 'Parenchyma':'UBERON:0000353'}

In [170]:
# add 'tissue_ontology_term_id' column

In [171]:
adata.obs['tissue_ontology_term_id'] =adata.obs['region'].map(mapping)

In [172]:
# change data type of column

In [173]:
adata.obs['tissue_ontology_term_id'] = adata.obs['tissue_ontology_term_id'].astype('category')

In [174]:
#list the unique values in 'tissue_ontology_term_id' column

In [175]:
list(adata.obs['tissue_ontology_term_id'].unique())

['UBERON:0008953',
 'UBERON:0008946',
 'UBERON:0008948',
 'UBERON:0008949',
 'UBERON:0002174',
 'UBERON:0008952',
 'UBERON:0000101',
 'UBERON:0000353']

In [176]:
# view obs

In [177]:
adata.obs

Unnamed: 0,nCount_RNA,nFeature_RNA,cell_id,study.batch,donor,sex,age,ethnicity,smoking_status,health_status,...,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,donor_ethnicity,sex_ontology_term_id,suspension_type,tissue_type,tissue_ontology_term_id
HD66_AAACCTGAGACCTTTG_1_1,7022.0,1570,HD66_AAACCTGAGACCTTTG_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,Donor_1,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,White,PATO:0000384,cell,tissue,UBERON:0008953
HD66_AAACCTGAGAGATGAG_1_1,12220.0,2982,HD66_AAACCTGAGAGATGAG_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,Donor_1,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,White,PATO:0000384,cell,tissue,UBERON:0008953
HD66_AAACCTGAGCGTCTAT_1_1,2604.0,1371,HD66_AAACCTGAGCGTCTAT_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,Donor_1,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,White,PATO:0000384,cell,tissue,UBERON:0008953
HD66_AAACCTGAGGCACATG_1_1,2801.0,1226,HD66_AAACCTGAGGCACATG_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,Donor_1,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,White,PATO:0000384,cell,tissue,UBERON:0008953
HD66_AAACCTGCAACTTGAC_1_1,3347.0,1566,HD66_AAACCTGCAACTTGAC_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,Donor_1,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,White,PATO:0000384,cell,tissue,UBERON:0008953
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGACTTCAAGTCGT-1_3_5,1355.0,813,TTTGACTTCAAGTCGT-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,Donor_128,MONDO:0100096,True,NCBITaxon:9606,unknown,Unknown,PATO:0000384,nucleus,tissue,UBERON:0000353
TTTGGTTAGGATGCGT-1_3_5,1336.0,927,TTTGGTTAGGATGCGT-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,Donor_128,MONDO:0100096,True,NCBITaxon:9606,unknown,Unknown,PATO:0000384,nucleus,tissue,UBERON:0000353
TTTGGTTCATACTGAC-1_3_5,2056.0,1326,TTTGGTTCATACTGAC-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,Donor_128,MONDO:0100096,True,NCBITaxon:9606,unknown,Unknown,PATO:0000384,nucleus,tissue,UBERON:0000353
TTTGTTGAGCAGGCTA-1_3_5,3719.0,2139,TTTGTTGAGCAGGCTA-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,Donor_128,MONDO:0100096,True,NCBITaxon:9606,unknown,Unknown,PATO:0000384,nucleus,tissue,UBERON:0000353


In [178]:
adata.obs.columns

Index(['nCount_RNA', 'nFeature_RNA', 'cell_id', 'study.batch', 'donor', 'sex',
       'age', 'ethnicity', 'smoking_status', 'health_status', 'region',
       'version_10X', 'scn_source', 'percent_mito', 'percent_ribo',
       'scrublet.scores_samples', 'scrublet.predicted_db_samples',
       'scDblFinder.score_samples', 'scDblFinder.predicted_db_samples',
       'nCount_SCT', 'nFeature_SCT', 'cell_type_mid', 'cell_state_v2',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'donor_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'donor_ethnicity', 'sex_ontology_term_id', 'suspension_type',
       'tissue_type', 'tissue_ontology_term_id'],
      dtype='object')

#### **obsm (Embeddings)**

In [179]:
# view obsm

In [180]:
# check whether all columns are prefixed with X

In [181]:
adata.obsm

AxisArrays with keys: X_harmony, X_pca, X_threeD_umap, X_umap

#### **uns (Dataset Metadata)**

In [182]:
# View

In [183]:
adata.uns

{'cell_state_v2_colors': array(['#ffff00', '#1ce6ff', '#ff34ff', '#ff4a46', '#008941', '#006fa6',
        '#a30059', '#ffdbe5', '#7a4900', '#0000a6', '#63ffac', '#b79762',
        '#004d43', '#8fb0ff', '#997d87', '#5a0007', '#809693', '#6a3a4c',
        '#1b4400', '#4fc601', '#3b5dff', '#4a3b53', '#ff2f80', '#61615a',
        '#ba0900', '#6b7900', '#00c2a0', '#ffaa92', '#ff90c9', '#b903aa',
        '#d16100', '#ddefff', '#000035', '#7b4f4b', '#a1c299', '#300018',
        '#0aa6d8'], dtype=object)}

In [184]:
adata.uns.keys

<function dict.keys>

In [185]:
# Give a title for the dataset

In [186]:
adata.uns['title'] = 'sc/snRNA-seq integrated COVID19 lung cell atlas'

In [187]:
# Set the default embedding

In [188]:
adata.uns['default_embedding'] = 'X_umap'

### **Final check**

In [189]:
# view anndata object

In [190]:
adata

AnnData object with n_obs × n_vars = 536120 × 23330
    obs: 'nCount_RNA', 'nFeature_RNA', 'cell_id', 'study.batch', 'donor', 'sex', 'age', 'ethnicity', 'smoking_status', 'health_status', 'region', 'version_10X', 'scn_source', 'percent_mito', 'percent_ribo', 'scrublet.scores_samples', 'scrublet.predicted_db_samples', 'scDblFinder.score_samples', 'scDblFinder.predicted_db_samples', 'nCount_SCT', 'nFeature_SCT', 'cell_type_mid', 'cell_state_v2', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'donor_id', 'disease_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'donor_ethnicity', 'sex_ontology_term_id', 'suspension_type', 'tissue_type', 'tissue_ontology_term_id'
    var: 'vst.mean', 'vst.variance', 'vst.variance.expected', 'vst.variance.standardized', 'vst.variable', 'feature_is_filtered'
    uns: 'cell_state_v2_colors', 'title', 'default_embedding'
    obsm: 'X_harmony', 'X_pca', 

In [191]:
# view obs and var data types

In [192]:
adata.obs.dtypes

nCount_RNA                                   float64
nFeature_RNA                                   int32
cell_id                                       object
study.batch                                 category
donor                                       category
sex                                         category
age                                         category
ethnicity                                   category
smoking_status                              category
health_status                               category
region                                      category
version_10X                                 category
scn_source                                  category
percent_mito                                 float64
percent_ribo                                 float64
scrublet.scores_samples                      float64
scrublet.predicted_db_samples                  int32
scDblFinder.score_samples                    float64
scDblFinder.predicted_db_samples            ca

In [193]:
dty = pd.DataFrame(adata.var.dtypes, columns = ['dtype'])
for c in dty[dty['dtype'] == 'float64'].index.values:
    adata.var[c] = adata.var[c].astype('float32')
    print(f"changed {c} from float64 to float32")
for c in dty[dty['dtype'] == 'int64'].index.values:
    adata.var[c] = adata.var[c].astype('int32') 
    print(f"changed {c} from int64 to int32")

changed vst.mean from float64 to float32
changed vst.variance from float64 to float32
changed vst.variance.expected from float64 to float32
changed vst.variance.standardized from float64 to float32


In [194]:
dty = pd.DataFrame(adata.obs.dtypes, columns = ['dtype'])
for c in dty[dty['dtype'] == 'float64'].index.values:
    adata.obs[c] = adata.obs[c].astype('float32')
    print(f"changed {c} from float64 to float32")
for c in dty[dty['dtype'] == 'int64'].index.values:
    adata.obs[c] = adata.obs[c].astype('int32') 
    print(f"changed {c} from int64 to int32")
for c in dty[dty['dtype'] == 'object'].index.values:
    adata.obs[c] = adata.obs[c].astype('category') 
    print(f"changed {c} from object to category")

changed nCount_RNA from float64 to float32
changed percent_mito from float64 to float32
changed percent_ribo from float64 to float32
changed scrublet.scores_samples from float64 to float32
changed scDblFinder.score_samples from float64 to float32
changed cell_id from object to category


In [195]:
# view obs

In [196]:
adata.obs

Unnamed: 0,nCount_RNA,nFeature_RNA,cell_id,study.batch,donor,sex,age,ethnicity,smoking_status,health_status,...,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,donor_ethnicity,sex_ontology_term_id,suspension_type,tissue_type,tissue_ontology_term_id
HD66_AAACCTGAGACCTTTG_1_1,7022.0,1570,HD66_AAACCTGAGACCTTTG_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,Donor_1,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,White,PATO:0000384,cell,tissue,UBERON:0008953
HD66_AAACCTGAGAGATGAG_1_1,12220.0,2982,HD66_AAACCTGAGAGATGAG_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,Donor_1,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,White,PATO:0000384,cell,tissue,UBERON:0008953
HD66_AAACCTGAGCGTCTAT_1_1,2604.0,1371,HD66_AAACCTGAGCGTCTAT_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,Donor_1,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,White,PATO:0000384,cell,tissue,UBERON:0008953
HD66_AAACCTGAGGCACATG_1_1,2801.0,1226,HD66_AAACCTGAGGCACATG_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,Donor_1,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,White,PATO:0000384,cell,tissue,UBERON:0008953
HD66_AAACCTGCAACTTGAC_1_1,3347.0,1566,HD66_AAACCTGCAACTTGAC_1_1,Habermann_2020,Donor_1,Male,30,White,Prior,Healthy,...,Donor_1,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,White,PATO:0000384,cell,tissue,UBERON:0008953
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGACTTCAAGTCGT-1_3_5,1355.0,813,TTTGACTTCAAGTCGT-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,Donor_128,MONDO:0100096,True,NCBITaxon:9606,unknown,Unknown,PATO:0000384,nucleus,tissue,UBERON:0000353
TTTGGTTAGGATGCGT-1_3_5,1336.0,927,TTTGGTTAGGATGCGT-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,Donor_128,MONDO:0100096,True,NCBITaxon:9606,unknown,Unknown,PATO:0000384,nucleus,tissue,UBERON:0000353
TTTGGTTCATACTGAC-1_3_5,2056.0,1326,TTTGGTTCATACTGAC-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,Donor_128,MONDO:0100096,True,NCBITaxon:9606,unknown,Unknown,PATO:0000384,nucleus,tissue,UBERON:0000353
TTTGTTGAGCAGGCTA-1_3_5,3719.0,2139,TTTGTTGAGCAGGCTA-1_3_5,Noseda_Bayraktar_2021,Donor_128,Male,75:80,Unknown,Unknown,COVID-19,...,Donor_128,MONDO:0100096,True,NCBITaxon:9606,unknown,Unknown,PATO:0000384,nucleus,tissue,UBERON:0000353


In [197]:
adata.obs.columns

Index(['nCount_RNA', 'nFeature_RNA', 'cell_id', 'study.batch', 'donor', 'sex',
       'age', 'ethnicity', 'smoking_status', 'health_status', 'region',
       'version_10X', 'scn_source', 'percent_mito', 'percent_ribo',
       'scrublet.scores_samples', 'scrublet.predicted_db_samples',
       'scDblFinder.score_samples', 'scDblFinder.predicted_db_samples',
       'nCount_SCT', 'nFeature_SCT', 'cell_type_mid', 'cell_state_v2',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'donor_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'donor_ethnicity', 'sex_ontology_term_id', 'suspension_type',
       'tissue_type', 'tissue_ontology_term_id'],
      dtype='object')

In [198]:
# delete unwanted columns in obs

In [199]:
del adata.obs['donor']
del adata.obs['sex']
del adata.obs['health_status']
del adata.obs['region']
del adata.obs['age']
del adata.obs['version_10X']
del adata.obs['cell_id']
del adata.obs['scn_source']
del adata.obs['ethnicity']

In [200]:
# view obs

In [201]:
adata.obs

Unnamed: 0,nCount_RNA,nFeature_RNA,study.batch,smoking_status,percent_mito,percent_ribo,scrublet.scores_samples,scrublet.predicted_db_samples,scDblFinder.score_samples,scDblFinder.predicted_db_samples,...,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,donor_ethnicity,sex_ontology_term_id,suspension_type,tissue_type,tissue_ontology_term_id
HD66_AAACCTGAGACCTTTG_1_1,7022.0,1570,Habermann_2020,Prior,10.452863,19.268015,0.020584,0,0.002438,singlet,...,Donor_1,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,White,PATO:0000384,cell,tissue,UBERON:0008953
HD66_AAACCTGAGAGATGAG_1_1,12220.0,2982,Habermann_2020,Prior,7.321662,16.132198,0.019374,0,0.002932,singlet,...,Donor_1,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,White,PATO:0000384,cell,tissue,UBERON:0008953
HD66_AAACCTGAGCGTCTAT_1_1,2604.0,1371,Habermann_2020,Prior,18.080614,6.833014,0.038855,0,0.000613,singlet,...,Donor_1,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,White,PATO:0000384,cell,tissue,UBERON:0008953
HD66_AAACCTGAGGCACATG_1_1,2801.0,1226,Habermann_2020,Prior,13.388076,12.674045,0.041154,0,0.000432,singlet,...,Donor_1,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,White,PATO:0000384,cell,tissue,UBERON:0008953
HD66_AAACCTGCAACTTGAC_1_1,3347.0,1566,Habermann_2020,Prior,3.820895,13.970149,0.038855,0,0.001419,singlet,...,Donor_1,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,White,PATO:0000384,cell,tissue,UBERON:0008953
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGACTTCAAGTCGT-1_3_5,1355.0,813,Noseda_Bayraktar_2021,Unknown,0.221402,0.590406,0.012835,0,0.064667,singlet,...,Donor_128,MONDO:0100096,True,NCBITaxon:9606,unknown,Unknown,PATO:0000384,nucleus,tissue,UBERON:0000353
TTTGGTTAGGATGCGT-1_3_5,1336.0,927,Noseda_Bayraktar_2021,Unknown,0.373692,0.298954,0.037929,0,0.037017,singlet,...,Donor_128,MONDO:0100096,True,NCBITaxon:9606,unknown,Unknown,PATO:0000384,nucleus,tissue,UBERON:0000353
TTTGGTTCATACTGAC-1_3_5,2056.0,1326,Noseda_Bayraktar_2021,Unknown,0.340301,0.388916,0.057826,0,0.142415,singlet,...,Donor_128,MONDO:0100096,True,NCBITaxon:9606,unknown,Unknown,PATO:0000384,nucleus,tissue,UBERON:0000353
TTTGTTGAGCAGGCTA-1_3_5,3719.0,2139,Noseda_Bayraktar_2021,Unknown,0.725611,0.376243,0.213389,0,0.816544,singlet,...,Donor_128,MONDO:0100096,True,NCBITaxon:9606,unknown,Unknown,PATO:0000384,nucleus,tissue,UBERON:0000353


In [202]:
# view var

In [203]:
adata.var

Unnamed: 0_level_0,vst.mean,vst.variance,vst.variance.expected,vst.variance.standardized,vst.variable,feature_is_filtered
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000177757,0.000824,0.000842,0.000927,0.908446,0,False
ENSG00000225880,0.008196,0.008476,0.011995,0.706609,0,False
ENSG00000230368,0.009373,0.009871,0.014031,0.703497,0,False
ENSG00000187634,0.015952,0.027739,0.025884,1.071679,1,False
ENSG00000188976,0.096896,0.114255,0.171881,0.664734,0,False
...,...,...,...,...,...,...
ENSG00000203618,0.000802,0.000854,0.000900,0.948477,0,False
ENSG00000268292,0.001442,0.001477,0.001695,0.871179,0,False
ENSG00000211675,0.036180,4.161019,0.062768,46.427853,1,False
ENSG00000282033,0.001127,0.001286,0.001297,0.991599,0,False


In [204]:
araw.var

Unnamed: 0_level_0,vst.mean,vst.variance,vst.variance.expected,vst.variance.standardized,vst.variable
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSG00000177757,0.000824,0.000842,0.000927,0.908446,0
ENSG00000225880,0.008196,0.008476,0.011995,0.706609,0
ENSG00000230368,0.009373,0.009871,0.014031,0.703497,0
ENSG00000187634,0.015952,0.027739,0.025884,1.071678,1
ENSG00000188976,0.096896,0.114255,0.171881,0.664734,0
...,...,...,...,...,...
ENSG00000203618,0.000802,0.000854,0.000900,0.948477,0
ENSG00000268292,0.001442,0.001477,0.001695,0.871179,0
ENSG00000211675,0.036180,4.161019,0.062768,46.427851,1
ENSG00000282033,0.001127,0.001286,0.001297,0.991599,0


In [205]:
#view uns

In [206]:
adata.uns

{'cell_state_v2_colors': array(['#ffff00', '#1ce6ff', '#ff34ff', '#ff4a46', '#008941', '#006fa6',
        '#a30059', '#ffdbe5', '#7a4900', '#0000a6', '#63ffac', '#b79762',
        '#004d43', '#8fb0ff', '#997d87', '#5a0007', '#809693', '#6a3a4c',
        '#1b4400', '#4fc601', '#3b5dff', '#4a3b53', '#ff2f80', '#61615a',
        '#ba0900', '#6b7900', '#00c2a0', '#ffaa92', '#ff90c9', '#b903aa',
        '#d16100', '#ddefff', '#000035', '#7b4f4b', '#a1c299', '#300018',
        '#0aa6d8'], dtype=object),
 'title': 'sc/snRNA-seq integrated COVID19 lung cell atlas',
 'default_embedding': 'X_umap'}

In [207]:
list(adata.uns.keys())

['cell_state_v2_colors', 'title', 'default_embedding']

In [208]:
adata.obs.columns

Index(['nCount_RNA', 'nFeature_RNA', 'study.batch', 'smoking_status',
       'percent_mito', 'percent_ribo', 'scrublet.scores_samples',
       'scrublet.predicted_db_samples', 'scDblFinder.score_samples',
       'scDblFinder.predicted_db_samples', 'nCount_SCT', 'nFeature_SCT',
       'cell_type_mid', 'cell_state_v2', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'donor_id', 'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'donor_ethnicity', 'sex_ontology_term_id', 'suspension_type',
       'tissue_type', 'tissue_ontology_term_id'],
      dtype='object')

In [209]:
# Remove unwanted columns in uns

In [210]:
#check the format of expression matrix

In [211]:
adata.X

<536120x23330 sparse matrix of type '<class 'numpy.float64'>'
	with 863211443 stored elements in Compressed Sparse Row format>

In [212]:
araw.X

<536120x23330 sparse matrix of type '<class 'numpy.float32'>'
	with 863211443 stored elements in Compressed Sparse Row format>

In [213]:
#Copy raw counts to adata.raw

In [214]:
del adata.raw

In [215]:
adata.raw = araw

In [216]:
obs_dtype = adata.obs.dtypes

In [217]:
obs_dtype

nCount_RNA                                   float32
nFeature_RNA                                   int32
study.batch                                 category
smoking_status                              category
percent_mito                                 float32
percent_ribo                                 float32
scrublet.scores_samples                      float32
scrublet.predicted_db_samples                  int32
scDblFinder.score_samples                    float32
scDblFinder.predicted_db_samples            category
nCount_SCT                                     int32
nFeature_SCT                                   int32
cell_type_mid                               category
cell_state_v2                               category
assay_ontology_term_id                      category
cell_type_ontology_term_id                  category
development_stage_ontology_term_id          category
donor_id                                    category
disease_ontology_term_id                    ca

In [218]:
adata.write('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/covid_nanostring/Final_objects/lung_atlas.h5ad', compression = 'gzip')

In [219]:
adata.obs

Unnamed: 0,nCount_RNA,nFeature_RNA,study.batch,smoking_status,percent_mito,percent_ribo,scrublet.scores_samples,scrublet.predicted_db_samples,scDblFinder.score_samples,scDblFinder.predicted_db_samples,...,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,donor_ethnicity,sex_ontology_term_id,suspension_type,tissue_type,tissue_ontology_term_id
HD66_AAACCTGAGACCTTTG_1_1,7022.0,1570,Habermann_2020,Prior,10.452863,19.268015,0.020584,0,0.002438,singlet,...,Donor_1,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,White,PATO:0000384,cell,tissue,UBERON:0008953
HD66_AAACCTGAGAGATGAG_1_1,12220.0,2982,Habermann_2020,Prior,7.321662,16.132198,0.019374,0,0.002932,singlet,...,Donor_1,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,White,PATO:0000384,cell,tissue,UBERON:0008953
HD66_AAACCTGAGCGTCTAT_1_1,2604.0,1371,Habermann_2020,Prior,18.080614,6.833014,0.038855,0,0.000613,singlet,...,Donor_1,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,White,PATO:0000384,cell,tissue,UBERON:0008953
HD66_AAACCTGAGGCACATG_1_1,2801.0,1226,Habermann_2020,Prior,13.388076,12.674045,0.041154,0,0.000432,singlet,...,Donor_1,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,White,PATO:0000384,cell,tissue,UBERON:0008953
HD66_AAACCTGCAACTTGAC_1_1,3347.0,1566,Habermann_2020,Prior,3.820895,13.970149,0.038855,0,0.001419,singlet,...,Donor_1,PATO:0000461,False,NCBITaxon:9606,HANCESTRO:0005,White,PATO:0000384,cell,tissue,UBERON:0008953
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGACTTCAAGTCGT-1_3_5,1355.0,813,Noseda_Bayraktar_2021,Unknown,0.221402,0.590406,0.012835,0,0.064667,singlet,...,Donor_128,MONDO:0100096,True,NCBITaxon:9606,unknown,Unknown,PATO:0000384,nucleus,tissue,UBERON:0000353
TTTGGTTAGGATGCGT-1_3_5,1336.0,927,Noseda_Bayraktar_2021,Unknown,0.373692,0.298954,0.037929,0,0.037017,singlet,...,Donor_128,MONDO:0100096,True,NCBITaxon:9606,unknown,Unknown,PATO:0000384,nucleus,tissue,UBERON:0000353
TTTGGTTCATACTGAC-1_3_5,2056.0,1326,Noseda_Bayraktar_2021,Unknown,0.340301,0.388916,0.057826,0,0.142415,singlet,...,Donor_128,MONDO:0100096,True,NCBITaxon:9606,unknown,Unknown,PATO:0000384,nucleus,tissue,UBERON:0000353
TTTGTTGAGCAGGCTA-1_3_5,3719.0,2139,Noseda_Bayraktar_2021,Unknown,0.725611,0.376243,0.213389,0,0.816544,singlet,...,Donor_128,MONDO:0100096,True,NCBITaxon:9606,unknown,Unknown,PATO:0000384,nucleus,tissue,UBERON:0000353


In [220]:
adata.obs.columns

Index(['nCount_RNA', 'nFeature_RNA', 'study.batch', 'smoking_status',
       'percent_mito', 'percent_ribo', 'scrublet.scores_samples',
       'scrublet.predicted_db_samples', 'scDblFinder.score_samples',
       'scDblFinder.predicted_db_samples', 'nCount_SCT', 'nFeature_SCT',
       'cell_type_mid', 'cell_state_v2', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'donor_id', 'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'donor_ethnicity', 'sex_ontology_term_id', 'suspension_type',
       'tissue_type', 'tissue_ontology_term_id'],
      dtype='object')

In [221]:
adata.raw.var

Unnamed: 0_level_0,vst.mean,vst.variance,vst.variance.expected,vst.variance.standardized,vst.variable
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSG00000177757,0.000824,0.000842,0.000927,0.908446,0
ENSG00000225880,0.008196,0.008476,0.011995,0.706609,0
ENSG00000230368,0.009373,0.009871,0.014031,0.703497,0
ENSG00000187634,0.015952,0.027739,0.025884,1.071678,1
ENSG00000188976,0.096896,0.114255,0.171881,0.664734,0
...,...,...,...,...,...
ENSG00000203618,0.000802,0.000854,0.000900,0.948477,0
ENSG00000268292,0.001442,0.001477,0.001695,0.871179,0
ENSG00000211675,0.036180,4.161019,0.062768,46.427851,1
ENSG00000282033,0.001127,0.001286,0.001297,0.991599,0


In [222]:
adata.var

Unnamed: 0_level_0,vst.mean,vst.variance,vst.variance.expected,vst.variance.standardized,vst.variable,feature_is_filtered
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000177757,0.000824,0.000842,0.000927,0.908446,0,False
ENSG00000225880,0.008196,0.008476,0.011995,0.706609,0,False
ENSG00000230368,0.009373,0.009871,0.014031,0.703497,0,False
ENSG00000187634,0.015952,0.027739,0.025884,1.071679,1,False
ENSG00000188976,0.096896,0.114255,0.171881,0.664734,0,False
...,...,...,...,...,...,...
ENSG00000203618,0.000802,0.000854,0.000900,0.948477,0,False
ENSG00000268292,0.001442,0.001477,0.001695,0.871179,0,False
ENSG00000211675,0.036180,4.161019,0.062768,46.427853,1,False
ENSG00000282033,0.001127,0.001286,0.001297,0.991599,0,False


In [223]:
adata.raw.X

<536120x23330 sparse matrix of type '<class 'numpy.float32'>'
	with 863211443 stored elements in Compressed Sparse Row format>

In [224]:
adata.layers.keys()


KeysView(Layers with keys: )