### **Curating covid19_across_age_in_ALI_organoids_invivo.h5ad**

Article:  The emergence of goblet inflammatory or ITGB6hi nasal progenitor cells determines age-associated SARS-CoV-2 pathogenesis

DOI: https://doi.org/10.1101/2023.01.16.524211

Data Source : https://www.covid19cellatlas.org/index.patient.html

##### **Mount farm**

mount-farm

##### **Packages required for curation**

In [1]:
#Import all packages required for curation

In [2]:
import numpy as np
import pandas as pd
import scanpy as sc
import scipy
from tqdm import tqdm
from scipy import sparse
from scipy.sparse import csr_matrix
import anndata as ad
import os
import subprocess
import math
import re
from collections import Counter

### **Curation Schema**

##### **X (Matrix Layers)**

##### **AnnData object**

In [3]:
# Load the AnnData object

In [4]:
adata = sc.read_h5ad('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/covid_cell_atlas_ALI_organoids/Data/in_vivo_object_cxg_with_raw_counts.h5ad')

In [5]:
# View the AnnData object

In [6]:
adata

AnnData object with n_obs × n_vars = 577234 × 17591
    obs: 'age_group', 'age_harmonised', 'age_status', 'file_of_origin', 'author_ann_harmonised', 'author_ann_harmonised_broad', 'donor_harmonised', 'age_status_granular', 'score_IFN_alpha', 'score_IFN_gamma', 'score_EMT', 'score_BASALOID', 'annotation_v1', 'status'
    var: 'featureid-0', 'gene_ids-1', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'age_status_colors', 'annotation_v1_colors'
    obsm: 'X_umap'
    layers: 'counts'
    obsp: 'connectivities', 'distances'

##### **X - expression matrix**

In [7]:
# View the expression matrix of the anndata object.

In [8]:
adata.X

<577234x17591 sparse matrix of type '<class 'numpy.float32'>'
	with 1359172539 stored elements in Compressed Sparse Row format>

In [9]:
# Print the matrix to check whether they are normalized counts or raw counts. if the matrix has floating numbers,they are normalized counts.if they are integers, they are raw counts.

In [10]:
print(adata.X)

  (0, 10)	0.42256668
  (0, 15)	1.2890576
  (0, 17)	0.42256668
  (0, 19)	0.42256668
  (0, 25)	0.7186911
  (0, 28)	0.42256668
  (0, 35)	0.94686604
  (0, 49)	0.7186911
  (0, 51)	0.42256668
  (0, 52)	0.42256668
  (0, 53)	0.42256668
  (0, 56)	2.123724
  (0, 57)	1.4243696
  (0, 60)	0.42256668
  (0, 64)	1.1325278
  (0, 66)	0.7186911
  (0, 68)	0.42256668
  (0, 75)	0.7186911
  (0, 83)	1.5435354
  (0, 86)	1.7462139
  (0, 87)	0.7186911
  (0, 91)	0.7186911
  (0, 93)	1.1325278
  (0, 96)	1.4243696
  (0, 97)	0.94686604
  :	:
  (577233, 15852)	3.3489397
  (577233, 15933)	3.3489397
  (577233, 15966)	3.3489397
  (577233, 16105)	4.02437
  (577233, 16151)	3.3489397
  (577233, 16175)	3.3489397
  (577233, 16296)	3.3489397
  (577233, 16350)	3.3489397
  (577233, 16359)	3.3489397
  (577233, 16361)	3.3489397
  (577233, 16362)	3.3489397
  (577233, 16401)	3.3489397
  (577233, 16458)	3.3489397
  (577233, 16493)	3.3489397
  (577233, 16659)	3.3489397
  (577233, 16706)	3.3489397
  (577233, 16712)	3.3489397
  (577233,

##### **Raw counts matrix**

In [11]:
# If X has normalized counts, check for the raw counts matrix.

In [12]:
# check whether raw counts are present in adata.raw

In [13]:
adata.raw

In [14]:
adata.layers.keys

<bound method Mapping.keys of Layers with keys: counts>

In [15]:
#print(adata.layers['counts'])

In [16]:
# Access the 'counts' layer
counts = adata.layers['counts']

# Create a boolean mask for non-zero elements
non_zero_mask = counts != 0

# Convert non-zero values to integers
counts[non_zero_mask] = counts[non_zero_mask].round().astype(int)

# Update the 'counts' layer
adata.layers['counts'] = counts


In [17]:
print(adata.layers['counts'])

  (0, 10)	1.0
  (0, 15)	5.0
  (0, 17)	1.0
  (0, 19)	1.0
  (0, 25)	2.0
  (0, 28)	1.0
  (0, 35)	3.0
  (0, 49)	2.0
  (0, 51)	1.0
  (0, 52)	1.0
  (0, 53)	1.0
  (0, 56)	14.0
  (0, 57)	6.0
  (0, 60)	1.0
  (0, 64)	4.0
  (0, 66)	2.0
  (0, 68)	1.0
  (0, 75)	2.0
  (0, 83)	7.0
  (0, 86)	9.0
  (0, 87)	2.0
  (0, 91)	2.0
  (0, 93)	4.0
  (0, 96)	6.0
  (0, 97)	3.0
  :	:
  (577233, 15852)	1.0
  (577233, 15933)	1.0
  (577233, 15966)	1.0
  (577233, 16105)	2.0
  (577233, 16151)	1.0
  (577233, 16175)	1.0
  (577233, 16296)	1.0
  (577233, 16350)	1.0
  (577233, 16359)	1.0
  (577233, 16361)	1.0
  (577233, 16362)	1.0
  (577233, 16401)	1.0
  (577233, 16458)	1.0
  (577233, 16493)	1.0
  (577233, 16659)	1.0
  (577233, 16706)	1.0
  (577233, 16712)	1.0
  (577233, 16838)	1.0
  (577233, 16884)	2.0
  (577233, 16926)	1.0
  (577233, 16935)	1.0
  (577233, 16940)	1.0
  (577233, 17377)	1.0
  (577233, 17452)	1.0
  (577233, 17503)	1.0


In [18]:
import anndata
import numpy as np

# Assuming 'adata' is already loaded and contains the 'counts' layer
print("Decimal numbers present:", np.any(adata.layers['counts'].data % 1 != 0) if 'counts' in adata.layers else "Layer 'counts' not found")


Decimal numbers present: False


In [19]:
# Check whether adata and araw has same dimensions.

In [20]:
araw = ad.AnnData(X=adata.layers['counts'].copy(), obs=adata.obs.copy(), var=adata.var.copy())

In [21]:
araw.X

<577234x17591 sparse matrix of type '<class 'numpy.float32'>'
	with 1359172539 stored elements in Compressed Sparse Row format>

In [22]:
del adata.layers['counts']

##### **Variables(var)**

In [23]:
# View the var of anndata and raw object

In [24]:
adata.var

Unnamed: 0,featureid-0,gene_ids-1,highly_variable,means,dispersions,dispersions_norm
A1BG,A1BG,ENSG00000121410,False,0.041524,1.744950,-0.641797
A1BG-AS1,A1BG-AS1,ENSG00000268895,False,0.016673,1.953380,-0.213505
A1CF,A1CF,ENSG00000148584,False,0.002404,2.625631,1.167868
A2M,A2M,ENSG00000175899,True,0.047390,3.782054,3.544142
A2M-AS1,A2M-AS1,ENSG00000245105,False,0.008230,1.953967,-0.212300
...,...,...,...,...,...,...
ZXDC,ZXDC,ENSG00000070476,False,0.245947,1.905029,-0.312859
ZYG11A,ZYG11A,ENSG00000203995,False,0.007422,2.166888,0.225222
ZYG11B,ZYG11B,ENSG00000162378,False,0.277820,1.784258,-0.561026
ZYX,ZYX,ENSG00000159840,False,0.295521,1.707418,-0.718921


In [25]:
adata.var['name'] = adata.var.index

In [26]:
adata.var

Unnamed: 0,featureid-0,gene_ids-1,highly_variable,means,dispersions,dispersions_norm,name
A1BG,A1BG,ENSG00000121410,False,0.041524,1.744950,-0.641797,A1BG
A1BG-AS1,A1BG-AS1,ENSG00000268895,False,0.016673,1.953380,-0.213505,A1BG-AS1
A1CF,A1CF,ENSG00000148584,False,0.002404,2.625631,1.167868,A1CF
A2M,A2M,ENSG00000175899,True,0.047390,3.782054,3.544142,A2M
A2M-AS1,A2M-AS1,ENSG00000245105,False,0.008230,1.953967,-0.212300,A2M-AS1
...,...,...,...,...,...,...,...
ZXDC,ZXDC,ENSG00000070476,False,0.245947,1.905029,-0.312859,ZXDC
ZYG11A,ZYG11A,ENSG00000203995,False,0.007422,2.166888,0.225222,ZYG11A
ZYG11B,ZYG11B,ENSG00000162378,False,0.277820,1.784258,-0.561026,ZYG11B
ZYX,ZYX,ENSG00000159840,False,0.295521,1.707418,-0.718921,ZYX


In [27]:
araw.var

Unnamed: 0,featureid-0,gene_ids-1,highly_variable,means,dispersions,dispersions_norm
A1BG,A1BG,ENSG00000121410,False,0.041524,1.744950,-0.641797
A1BG-AS1,A1BG-AS1,ENSG00000268895,False,0.016673,1.953380,-0.213505
A1CF,A1CF,ENSG00000148584,False,0.002404,2.625631,1.167868
A2M,A2M,ENSG00000175899,True,0.047390,3.782054,3.544142
A2M-AS1,A2M-AS1,ENSG00000245105,False,0.008230,1.953967,-0.212300
...,...,...,...,...,...,...
ZXDC,ZXDC,ENSG00000070476,False,0.245947,1.905029,-0.312859
ZYG11A,ZYG11A,ENSG00000203995,False,0.007422,2.166888,0.225222
ZYG11B,ZYG11B,ENSG00000162378,False,0.277820,1.784258,-0.561026
ZYX,ZYX,ENSG00000159840,False,0.295521,1.707418,-0.718921


In [28]:
araw.var['name'] = araw.var.index

In [29]:
adata.var.index = adata.var['gene_ids-1']

In [30]:
araw.var.index = araw.var['gene_ids-1']

In [31]:
adata.var

Unnamed: 0_level_0,featureid-0,gene_ids-1,highly_variable,means,dispersions,dispersions_norm,name
gene_ids-1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ENSG00000121410,A1BG,ENSG00000121410,False,0.041524,1.744950,-0.641797,A1BG
ENSG00000268895,A1BG-AS1,ENSG00000268895,False,0.016673,1.953380,-0.213505,A1BG-AS1
ENSG00000148584,A1CF,ENSG00000148584,False,0.002404,2.625631,1.167868,A1CF
ENSG00000175899,A2M,ENSG00000175899,True,0.047390,3.782054,3.544142,A2M
ENSG00000245105,A2M-AS1,ENSG00000245105,False,0.008230,1.953967,-0.212300,A2M-AS1
...,...,...,...,...,...,...,...
ENSG00000070476,ZXDC,ENSG00000070476,False,0.245947,1.905029,-0.312859,ZXDC
ENSG00000203995,ZYG11A,ENSG00000203995,False,0.007422,2.166888,0.225222,ZYG11A
ENSG00000162378,ZYG11B,ENSG00000162378,False,0.277820,1.784258,-0.561026,ZYG11B
ENSG00000159840,ZYX,ENSG00000159840,False,0.295521,1.707418,-0.718921,ZYX


In [32]:
araw.var

Unnamed: 0_level_0,featureid-0,gene_ids-1,highly_variable,means,dispersions,dispersions_norm,name
gene_ids-1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ENSG00000121410,A1BG,ENSG00000121410,False,0.041524,1.744950,-0.641797,A1BG
ENSG00000268895,A1BG-AS1,ENSG00000268895,False,0.016673,1.953380,-0.213505,A1BG-AS1
ENSG00000148584,A1CF,ENSG00000148584,False,0.002404,2.625631,1.167868,A1CF
ENSG00000175899,A2M,ENSG00000175899,True,0.047390,3.782054,3.544142,A2M
ENSG00000245105,A2M-AS1,ENSG00000245105,False,0.008230,1.953967,-0.212300,A2M-AS1
...,...,...,...,...,...,...,...
ENSG00000070476,ZXDC,ENSG00000070476,False,0.245947,1.905029,-0.312859,ZXDC
ENSG00000203995,ZYG11A,ENSG00000203995,False,0.007422,2.166888,0.225222,ZYG11A
ENSG00000162378,ZYG11B,ENSG00000162378,False,0.277820,1.784258,-0.561026,ZYG11B
ENSG00000159840,ZYX,ENSG00000159840,False,0.295521,1.707418,-0.718921,ZYX


In [33]:
# Load the approved genes file.

In [34]:
approved_genes = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/Endometrium_reference_integrated_atlas/genes_approved.csv')

In [35]:
#Create a dictionary from the approved genes file 

In [36]:
genedict = {key: 1 for key in list(approved_genes.feature_id)}

In [37]:
genedict

{'ERCC-00002': 1,
 'ERCC-00003': 1,
 'ERCC-00004': 1,
 'ERCC-00009': 1,
 'ERCC-00012': 1,
 'ERCC-00013': 1,
 'ERCC-00014': 1,
 'ERCC-00016': 1,
 'ERCC-00017': 1,
 'ERCC-00019': 1,
 'ERCC-00022': 1,
 'ERCC-00024': 1,
 'ERCC-00025': 1,
 'ERCC-00028': 1,
 'ERCC-00031': 1,
 'ERCC-00033': 1,
 'ERCC-00034': 1,
 'ERCC-00035': 1,
 'ERCC-00039': 1,
 'ERCC-00040': 1,
 'ERCC-00041': 1,
 'ERCC-00042': 1,
 'ERCC-00043': 1,
 'ERCC-00044': 1,
 'ERCC-00046': 1,
 'ERCC-00048': 1,
 'ERCC-00051': 1,
 'ERCC-00053': 1,
 'ERCC-00054': 1,
 'ERCC-00057': 1,
 'ERCC-00058': 1,
 'ERCC-00059': 1,
 'ERCC-00060': 1,
 'ERCC-00061': 1,
 'ERCC-00062': 1,
 'ERCC-00067': 1,
 'ERCC-00069': 1,
 'ERCC-00071': 1,
 'ERCC-00073': 1,
 'ERCC-00074': 1,
 'ERCC-00075': 1,
 'ERCC-00076': 1,
 'ERCC-00077': 1,
 'ERCC-00078': 1,
 'ERCC-00079': 1,
 'ERCC-00081': 1,
 'ERCC-00083': 1,
 'ERCC-00084': 1,
 'ERCC-00085': 1,
 'ERCC-00086': 1,
 'ERCC-00092': 1,
 'ERCC-00095': 1,
 'ERCC-00096': 1,
 'ERCC-00097': 1,
 'ERCC-00098': 1,
 'ERCC-000

In [38]:
len(genedict)

119799

In [39]:
#Filter out the genes which are not in the approved genes file.

In [40]:
var_to_keep_adata = [x for x in adata.var_names if (x in genedict)]
var_to_keep_araw = [x for x in araw.var_names if (x in genedict)]

In [41]:
len(var_to_keep_adata)

17558

In [42]:
len(var_to_keep_araw)

17558

In [43]:
adata.var

Unnamed: 0_level_0,featureid-0,gene_ids-1,highly_variable,means,dispersions,dispersions_norm,name
gene_ids-1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ENSG00000121410,A1BG,ENSG00000121410,False,0.041524,1.744950,-0.641797,A1BG
ENSG00000268895,A1BG-AS1,ENSG00000268895,False,0.016673,1.953380,-0.213505,A1BG-AS1
ENSG00000148584,A1CF,ENSG00000148584,False,0.002404,2.625631,1.167868,A1CF
ENSG00000175899,A2M,ENSG00000175899,True,0.047390,3.782054,3.544142,A2M
ENSG00000245105,A2M-AS1,ENSG00000245105,False,0.008230,1.953967,-0.212300,A2M-AS1
...,...,...,...,...,...,...,...
ENSG00000070476,ZXDC,ENSG00000070476,False,0.245947,1.905029,-0.312859,ZXDC
ENSG00000203995,ZYG11A,ENSG00000203995,False,0.007422,2.166888,0.225222,ZYG11A
ENSG00000162378,ZYG11B,ENSG00000162378,False,0.277820,1.784258,-0.561026,ZYG11B
ENSG00000159840,ZYX,ENSG00000159840,False,0.295521,1.707418,-0.718921,ZYX


In [44]:
araw.var

Unnamed: 0_level_0,featureid-0,gene_ids-1,highly_variable,means,dispersions,dispersions_norm,name
gene_ids-1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ENSG00000121410,A1BG,ENSG00000121410,False,0.041524,1.744950,-0.641797,A1BG
ENSG00000268895,A1BG-AS1,ENSG00000268895,False,0.016673,1.953380,-0.213505,A1BG-AS1
ENSG00000148584,A1CF,ENSG00000148584,False,0.002404,2.625631,1.167868,A1CF
ENSG00000175899,A2M,ENSG00000175899,True,0.047390,3.782054,3.544142,A2M
ENSG00000245105,A2M-AS1,ENSG00000245105,False,0.008230,1.953967,-0.212300,A2M-AS1
...,...,...,...,...,...,...,...
ENSG00000070476,ZXDC,ENSG00000070476,False,0.245947,1.905029,-0.312859,ZXDC
ENSG00000203995,ZYG11A,ENSG00000203995,False,0.007422,2.166888,0.225222,ZYG11A
ENSG00000162378,ZYG11B,ENSG00000162378,False,0.277820,1.784258,-0.561026,ZYG11B
ENSG00000159840,ZYX,ENSG00000159840,False,0.295521,1.707418,-0.718921,ZYX


In [45]:
# Modify the anndata object by filtering out the filtered genes.

In [46]:
adata = adata[:, adata.var.index.isin(var_to_keep_adata)]
araw = araw[:, araw.var.index.isin(var_to_keep_araw)]

In [47]:
adata.var

Unnamed: 0_level_0,featureid-0,gene_ids-1,highly_variable,means,dispersions,dispersions_norm,name
gene_ids-1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ENSG00000121410,A1BG,ENSG00000121410,False,0.041524,1.744950,-0.641797,A1BG
ENSG00000268895,A1BG-AS1,ENSG00000268895,False,0.016673,1.953380,-0.213505,A1BG-AS1
ENSG00000148584,A1CF,ENSG00000148584,False,0.002404,2.625631,1.167868,A1CF
ENSG00000175899,A2M,ENSG00000175899,True,0.047390,3.782054,3.544142,A2M
ENSG00000245105,A2M-AS1,ENSG00000245105,False,0.008230,1.953967,-0.212300,A2M-AS1
...,...,...,...,...,...,...,...
ENSG00000070476,ZXDC,ENSG00000070476,False,0.245947,1.905029,-0.312859,ZXDC
ENSG00000203995,ZYG11A,ENSG00000203995,False,0.007422,2.166888,0.225222,ZYG11A
ENSG00000162378,ZYG11B,ENSG00000162378,False,0.277820,1.784258,-0.561026,ZYG11B
ENSG00000159840,ZYX,ENSG00000159840,False,0.295521,1.707418,-0.718921,ZYX


In [48]:
# View var

In [49]:
araw.var

Unnamed: 0_level_0,featureid-0,gene_ids-1,highly_variable,means,dispersions,dispersions_norm,name
gene_ids-1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ENSG00000121410,A1BG,ENSG00000121410,False,0.041524,1.744950,-0.641797,A1BG
ENSG00000268895,A1BG-AS1,ENSG00000268895,False,0.016673,1.953380,-0.213505,A1BG-AS1
ENSG00000148584,A1CF,ENSG00000148584,False,0.002404,2.625631,1.167868,A1CF
ENSG00000175899,A2M,ENSG00000175899,True,0.047390,3.782054,3.544142,A2M
ENSG00000245105,A2M-AS1,ENSG00000245105,False,0.008230,1.953967,-0.212300,A2M-AS1
...,...,...,...,...,...,...,...
ENSG00000070476,ZXDC,ENSG00000070476,False,0.245947,1.905029,-0.312859,ZXDC
ENSG00000203995,ZYG11A,ENSG00000203995,False,0.007422,2.166888,0.225222,ZYG11A
ENSG00000162378,ZYG11B,ENSG00000162378,False,0.277820,1.784258,-0.561026,ZYG11B
ENSG00000159840,ZYX,ENSG00000159840,False,0.295521,1.707418,-0.718921,ZYX


feature is filtered

In [50]:
adata.var['feature_is_filtered'] = [False] * len(adata.var)

  adata.var['feature_is_filtered'] = [False] * len(adata.var)


In [51]:
adata.var

Unnamed: 0_level_0,featureid-0,gene_ids-1,highly_variable,means,dispersions,dispersions_norm,name,feature_is_filtered
gene_ids-1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ENSG00000121410,A1BG,ENSG00000121410,False,0.041524,1.744950,-0.641797,A1BG,False
ENSG00000268895,A1BG-AS1,ENSG00000268895,False,0.016673,1.953380,-0.213505,A1BG-AS1,False
ENSG00000148584,A1CF,ENSG00000148584,False,0.002404,2.625631,1.167868,A1CF,False
ENSG00000175899,A2M,ENSG00000175899,True,0.047390,3.782054,3.544142,A2M,False
ENSG00000245105,A2M-AS1,ENSG00000245105,False,0.008230,1.953967,-0.212300,A2M-AS1,False
...,...,...,...,...,...,...,...,...
ENSG00000070476,ZXDC,ENSG00000070476,False,0.245947,1.905029,-0.312859,ZXDC,False
ENSG00000203995,ZYG11A,ENSG00000203995,False,0.007422,2.166888,0.225222,ZYG11A,False
ENSG00000162378,ZYG11B,ENSG00000162378,False,0.277820,1.784258,-0.561026,ZYG11B,False
ENSG00000159840,ZYX,ENSG00000159840,False,0.295521,1.707418,-0.718921,ZYX,False


In [52]:
araw.var

Unnamed: 0_level_0,featureid-0,gene_ids-1,highly_variable,means,dispersions,dispersions_norm,name
gene_ids-1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ENSG00000121410,A1BG,ENSG00000121410,False,0.041524,1.744950,-0.641797,A1BG
ENSG00000268895,A1BG-AS1,ENSG00000268895,False,0.016673,1.953380,-0.213505,A1BG-AS1
ENSG00000148584,A1CF,ENSG00000148584,False,0.002404,2.625631,1.167868,A1CF
ENSG00000175899,A2M,ENSG00000175899,True,0.047390,3.782054,3.544142,A2M
ENSG00000245105,A2M-AS1,ENSG00000245105,False,0.008230,1.953967,-0.212300,A2M-AS1
...,...,...,...,...,...,...,...
ENSG00000070476,ZXDC,ENSG00000070476,False,0.245947,1.905029,-0.312859,ZXDC
ENSG00000203995,ZYG11A,ENSG00000203995,False,0.007422,2.166888,0.225222,ZYG11A
ENSG00000162378,ZYG11B,ENSG00000162378,False,0.277820,1.784258,-0.561026,ZYG11B
ENSG00000159840,ZYX,ENSG00000159840,False,0.295521,1.707418,-0.718921,ZYX


In [53]:
del adata.var['name']
del adata.var['gene_ids-1']
del araw.var['name']
del araw.var['gene_ids-1']
del adata.var['featureid-0']
del araw.var['featureid-0']

In [54]:
adata.var

Unnamed: 0_level_0,highly_variable,means,dispersions,dispersions_norm,feature_is_filtered
gene_ids-1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSG00000121410,False,0.041524,1.744950,-0.641797,False
ENSG00000268895,False,0.016673,1.953380,-0.213505,False
ENSG00000148584,False,0.002404,2.625631,1.167868,False
ENSG00000175899,True,0.047390,3.782054,3.544142,False
ENSG00000245105,False,0.008230,1.953967,-0.212300,False
...,...,...,...,...,...
ENSG00000070476,False,0.245947,1.905029,-0.312859,False
ENSG00000203995,False,0.007422,2.166888,0.225222,False
ENSG00000162378,False,0.277820,1.784258,-0.561026,False
ENSG00000159840,False,0.295521,1.707418,-0.718921,False


In [55]:
araw.var

Unnamed: 0_level_0,highly_variable,means,dispersions,dispersions_norm
gene_ids-1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000121410,False,0.041524,1.744950,-0.641797
ENSG00000268895,False,0.016673,1.953380,-0.213505
ENSG00000148584,False,0.002404,2.625631,1.167868
ENSG00000175899,True,0.047390,3.782054,3.544142
ENSG00000245105,False,0.008230,1.953967,-0.212300
...,...,...,...,...
ENSG00000070476,False,0.245947,1.905029,-0.312859
ENSG00000203995,False,0.007422,2.166888,0.225222
ENSG00000162378,False,0.277820,1.784258,-0.561026
ENSG00000159840,False,0.295521,1.707418,-0.718921


#### **obs (Cell metadata)**

In [56]:
#view obs

In [57]:
adata.obs

Unnamed: 0,age_group,age_harmonised,age_status,file_of_origin,author_ann_harmonised,author_ann_harmonised_broad,donor_harmonised,age_status_granular,score_IFN_alpha,score_IFN_gamma,score_EMT,score_BASALOID,annotation_v1,status
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-AATCACGAGGGTCACA,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,Secretory,Secretory,D1,old_adult-COVID19,-0.077939,0.068120,0.157006,0.418843,AT2,COVID19
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-GGTAACTAGTCCTACA,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,AT1,AT1,D1,old_adult-COVID19,0.024214,0.111761,0.179795,0.310751,AT1,COVID19
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-CATCCGTTCGCAAGAG,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,KRT8+ PATS/ADI/DATPs,KRT8+ PATS/ADI/DATPs,D1,old_adult-COVID19,-0.038762,0.155381,0.258357,0.716596,AT2,COVID19
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-TTCGGTCAGCTTTCCC,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,KRT8+ PATS/ADI/DATPs,KRT8+ PATS/ADI/DATPs,D1,old_adult-COVID19,0.296413,0.208366,0.236939,0.714845,AT2,COVID19
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-TTAGGCATCCTCACTG,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,Secretory,Secretory,D1,old_adult-COVID19,0.139501,0.097040,0.138514,0.361797,AT2,COVID19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Melms_2021.h5ad:AGCTTCCAGGCATGCA-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,ECM-high epithelial,ECM-high epithelial,L22cov,old_adult-COVID19,-0.078560,0.026703,0.227514,-0.100314,AT2,COVID19
Melms_2021.h5ad:GCACGGTCAGAAGTTA-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,AT2,AT2,L22cov,old_adult-COVID19,0.279772,0.033921,-0.060741,-0.086155,Goblet/Secretory,COVID19
Melms_2021.h5ad:TTGTGGATCATAGGCT-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,AT2,AT2,L22cov,old_adult-COVID19,-0.066474,0.053662,-0.058798,-0.056902,AT2,COVID19
Melms_2021.h5ad:GCCGATGAGTAAACGT-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,AT2,AT2,L22cov,old_adult-COVID19,-0.090820,0.002190,0.127114,0.035694,Goblet/Secretory,COVID19


In [58]:
# view the column names in obs

In [59]:
adata.obs.columns

Index(['age_group', 'age_harmonised', 'age_status', 'file_of_origin',
       'author_ann_harmonised', 'author_ann_harmonised_broad',
       'donor_harmonised', 'age_status_granular', 'score_IFN_alpha',
       'score_IFN_gamma', 'score_EMT', 'score_BASALOID', 'annotation_v1',
       'status'],
      dtype='object')

#### **assay_ontology_term_id**

In [60]:
adata.obs['barcodes'] = adata.obs_names

In [61]:
adata.obs['barcodes'] = adata.obs['barcodes'].str.extract(r'([ATGC]{10,16})', expand=False)

In [62]:
assay_info = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/covid_cell_atlas_ALI_organoids/10X_barcode_table_assay.csv')

In [63]:
assay_info

Unnamed: 0,barcode,3pv2_5pv1_5pv2,3pv3,multiome,summary,assay
0,AAACCTGAGAAACCAT,1.0,0.0,0.0,3pv2_5pv1_5pv2,3pv2_5pv1_5pv2
1,AAACCTGAGAAACCGC,1.0,0.0,0.0,3pv2_5pv1_5pv2,3pv2_5pv1_5pv2
2,AAACCTGAGAAACCTA,1.0,0.0,0.0,3pv2_5pv1_5pv2,3pv2_5pv1_5pv2
3,AAACCTGAGAAACGAG,1.0,0.0,0.0,3pv2_5pv1_5pv2,3pv2_5pv1_5pv2
4,AAACCTGAGAAACGCC,1.0,0.0,0.0,3pv2_5pv1_5pv2,3pv2_5pv1_5pv2
...,...,...,...,...,...,...
8179621,TTTGTTGGTTTGGGTA,0.0,0.0,1.0,multiome,multiome
8179622,TTTGTTGGTTTGGTTC,0.0,0.0,1.0,multiome,multiome
8179623,TTTGTTGGTTTGTCTA,0.0,0.0,1.0,multiome,multiome
8179624,TTTGTTGGTTTGTGGA,0.0,0.0,1.0,multiome,multiome


In [64]:
mapping = dict(zip(assay_info['barcode'], assay_info['assay']))

In [65]:
adata.obs['assay'] = adata.obs['barcodes'].map(mapping)

In [66]:
list(adata.obs['assay'].unique())

['3pv3',
 '3pv2_5pv1_5pv2+3pv3',
 '3pv3+multiome',
 '3pv2_5pv1_5pv2',
 nan,
 '3pv2_5pv1_5pv2+multiome']

In [67]:
# Convert 'assay' column values to strings
#adata.obs['assay'] = adata.obs['assay'].astype(str)

In [68]:
mapping={'Delorey_2021_COVID_only.h5ad':'EFO:0009922',#few5',
 'Yoshida_2022.h5ad':'EFO:0030004',
 'Ziegler_2021.h5ad':'EFO:0030019',
 'Bharat_2020.h5ad':'EFO:0009922', #few v2
 'Trump_2020.h5ad':'EFO:0009922',
 'Chua_2020.h5ad':'EFO:0009922',
 'Loske_2021.h5ad':'EFO:0009922',
 'Melms_2021.h5ad':'EFO:0009922'}

In [69]:
adata.obs['assay_ontology_term_id'] = adata.obs['file_of_origin'].map(mapping)

In [70]:
adata.obs

Unnamed: 0,age_group,age_harmonised,age_status,file_of_origin,author_ann_harmonised,author_ann_harmonised_broad,donor_harmonised,age_status_granular,score_IFN_alpha,score_IFN_gamma,score_EMT,score_BASALOID,annotation_v1,status,barcodes,assay,assay_ontology_term_id
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-AATCACGAGGGTCACA,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,Secretory,Secretory,D1,old_adult-COVID19,-0.077939,0.068120,0.157006,0.418843,AT2,COVID19,AATCACGAGGGTCACA,3pv3,EFO:0009922
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-GGTAACTAGTCCTACA,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,AT1,AT1,D1,old_adult-COVID19,0.024214,0.111761,0.179795,0.310751,AT1,COVID19,GGTAACTAGTCCTACA,3pv3,EFO:0009922
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-CATCCGTTCGCAAGAG,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,KRT8+ PATS/ADI/DATPs,KRT8+ PATS/ADI/DATPs,D1,old_adult-COVID19,-0.038762,0.155381,0.258357,0.716596,AT2,COVID19,CATCCGTTCGCAAGAG,3pv3,EFO:0009922
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-TTCGGTCAGCTTTCCC,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,KRT8+ PATS/ADI/DATPs,KRT8+ PATS/ADI/DATPs,D1,old_adult-COVID19,0.296413,0.208366,0.236939,0.714845,AT2,COVID19,TTCGGTCAGCTTTCCC,3pv3,EFO:0009922
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-TTAGGCATCCTCACTG,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,Secretory,Secretory,D1,old_adult-COVID19,0.139501,0.097040,0.138514,0.361797,AT2,COVID19,TTAGGCATCCTCACTG,3pv3,EFO:0009922
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Melms_2021.h5ad:AGCTTCCAGGCATGCA-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,ECM-high epithelial,ECM-high epithelial,L22cov,old_adult-COVID19,-0.078560,0.026703,0.227514,-0.100314,AT2,COVID19,AGCTTCCAGGCATGCA,3pv3,EFO:0009922
Melms_2021.h5ad:GCACGGTCAGAAGTTA-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,AT2,AT2,L22cov,old_adult-COVID19,0.279772,0.033921,-0.060741,-0.086155,Goblet/Secretory,COVID19,GCACGGTCAGAAGTTA,3pv3,EFO:0009922
Melms_2021.h5ad:TTGTGGATCATAGGCT-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,AT2,AT2,L22cov,old_adult-COVID19,-0.066474,0.053662,-0.058798,-0.056902,AT2,COVID19,TTGTGGATCATAGGCT,3pv3,EFO:0009922
Melms_2021.h5ad:GCCGATGAGTAAACGT-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,AT2,AT2,L22cov,old_adult-COVID19,-0.090820,0.002190,0.127114,0.035694,Goblet/Secretory,COVID19,GCCGATGAGTAAACGT,3pv3,EFO:0009922


In [71]:
import pandas as pd

# Group the data by 'file_of_origin' and 'assay' and count the occurrences
counts = adata.obs.groupby(['assay', 'assay_ontology_term_id']).size()

# Group the data by 'file_of_origin' and collect unique values of 'assay'
unique_values = adata.obs.groupby('assay')['assay_ontology_term_id'].unique()

# Display the unique values and counts for each sample
for sample, assays in unique_values.items():
    print(f"file_of_origin: {sample}")
    print(f"Unique Assays: {', '.join(assays)}")
    print("Counts:")
    for assay in assays:
        count = counts.get((sample, assay), 0)
        print(f"{assay}: {count}")
    print()


file_of_origin: 3pv2_5pv1_5pv2
Unique Assays: EFO:0009922
Counts:
EFO:0009922: 7917

file_of_origin: 3pv2_5pv1_5pv2+3pv3
Unique Assays: EFO:0009922
Counts:
EFO:0009922: 6218

file_of_origin: 3pv2_5pv1_5pv2+multiome
Unique Assays: EFO:0009922
Counts:
EFO:0009922: 7

file_of_origin: 3pv3
Unique Assays: EFO:0009922
Counts:
EFO:0009922: 341846

file_of_origin: 3pv3+multiome
Unique Assays: EFO:0009922
Counts:
EFO:0009922: 437



In [72]:
adata.obs.loc[adata.obs['assay'] == '3pv3', 'assay_ontology_term_id'] = 'EFO:0009922'

In [73]:
adata.obs.loc[adata.obs['assay'] == '3pv2_5pv1_5pv2', 'assay_ontology_term_id'] = 'EFO:0030004'
adata.obs.loc[adata.obs['assay'] == '3pv2_5pv1_5pv2+multiome', 'assay_ontology_term_id'] = 'EFO:0030004'

In [74]:
obs_df = pd.DataFrame(adata.obs)

# Group by the 'assay' column and iterate over each group
for assay, group in obs_df.groupby('assay'):
    # Print the assay value
    print("Assay:", assay)
    
    # Get the unique values of assay_ontology_term_id for this assay
    unique_ontology_term_ids = group['assay_ontology_term_id'].unique()
    
    # Print the corresponding assay_ontology_term_id values
    print("Assay Ontology Term IDs:", unique_ontology_term_ids)

Assay: 3pv2_5pv1_5pv2
Assay Ontology Term IDs: ['EFO:0030004']
Assay: 3pv2_5pv1_5pv2+3pv3
Assay Ontology Term IDs: ['EFO:0009922']
Assay: 3pv2_5pv1_5pv2+multiome
Assay Ontology Term IDs: ['EFO:0030004']
Assay: 3pv3
Assay Ontology Term IDs: ['EFO:0009922']
Assay: 3pv3+multiome
Assay Ontology Term IDs: ['EFO:0009922']


In [75]:
import pandas as pd

# Group the data by 'file_of_origin' and 'assay' and count the occurrences
counts = adata.obs.groupby(['file_of_origin', 'assay_ontology_term_id']).size()

# Group the data by 'file_of_origin' and collect unique values of 'assay'
unique_values = adata.obs.groupby('file_of_origin')['assay_ontology_term_id'].unique()

# Display the unique values and counts for each sample
for sample, assays in unique_values.items():
    print(f"file_of_origin: {sample}")
    print(f"Unique Assays: {', '.join(assays)}")
    print("Counts:")
    for assay in assays:
        count = counts.get((sample, assay), 0)
        print(f"{assay}: {count}")
    print()


file_of_origin: Bharat_2020.h5ad
Unique Assays: EFO:0030004, EFO:0009922
Counts:
EFO:0030004: 7332
EFO:0009922: 33256

file_of_origin: Chua_2020.h5ad
Unique Assays: EFO:0009922
Counts:
EFO:0009922: 55282

file_of_origin: Delorey_2021_COVID_only.h5ad
Unique Assays: EFO:0009922, EFO:0030004
Counts:
EFO:0009922: 22360
EFO:0030004: 592

file_of_origin: Loske_2021.h5ad
Unique Assays: EFO:0009922
Counts:
EFO:0009922: 154230

file_of_origin: Melms_2021.h5ad
Unique Assays: EFO:0009922
Counts:
EFO:0009922: 29702

file_of_origin: Trump_2020.h5ad
Unique Assays: EFO:0009922
Counts:
EFO:0009922: 53678

file_of_origin: Yoshida_2022.h5ad
Unique Assays: EFO:0030004
Counts:
EFO:0030004: 191813

file_of_origin: Ziegler_2021.h5ad
Unique Assays: EFO:0030019
Counts:
EFO:0030019: 28989



  counts = adata.obs.groupby(['file_of_origin', 'assay_ontology_term_id']).size()
  unique_values = adata.obs.groupby('file_of_origin')['assay_ontology_term_id'].unique()


In [76]:
adata.obs['assay_ontology_term_id'] = adata.obs['assay_ontology_term_id'].astype('category')

#### **cell_type_ontology_term_id**

In [77]:
#identify the column in adata.obs related. to cell type annotation

In [78]:
adata.obs.columns

Index(['age_group', 'age_harmonised', 'age_status', 'file_of_origin',
       'author_ann_harmonised', 'author_ann_harmonised_broad',
       'donor_harmonised', 'age_status_granular', 'score_IFN_alpha',
       'score_IFN_gamma', 'score_EMT', 'score_BASALOID', 'annotation_v1',
       'status', 'barcodes', 'assay', 'assay_ontology_term_id'],
      dtype='object')

In [79]:
list(adata.obs['annotation_v1'].unique())

['AT2',
 'AT1',
 'KRT8+ DATPs',
 'Squamous',
 'Goblet/Secretory',
 'Cycling basal',
 'Basaloid-like 2',
 'Club',
 'Goblet inflammatory',
 'Basal',
 'Ciliated',
 'Deutorosomal',
 'Basaloid-like 1',
 'Airway mucous',
 'Ionocyte',
 'Ciliated inflammatory',
 'Transit epi']

In [80]:
# create a dictionary of cell type and ontology term

In [81]:
mapping= {'AT2':'CL:0002063',
 'AT1':'CL:0002062',
 'KRT8+ DATPs':'unknown',
 'Squamous':'CL:0000076',
 'Goblet/Secretory':'CL:0000160',
 'Cycling basal':'CL:0000646',
 'Basaloid-like 2':'unknown',
 'Club':'CL:0000158',
 'Goblet inflammatory':'CL:0000160',
 'Basal':'CL:0000646',
 'Ciliated':'CL:0000064',
 'Deutorosomal':'unknown',
 'Basaloid-like 1':'unknown',
 'Airway mucous':'CL:0002633',
 'Ionocyte':'CL:0005006',
 'Ciliated inflammatory':'CL:0000064',
 'Transit epi':'CL:0000244'}

In [82]:
# add the cell_type_ontology_term_id column

In [83]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['annotation_v1'].map(mapping)

In [84]:
# change datatype of the column

In [85]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_type_ontology_term_id'].astype('category')

In [86]:
# view adata.obs

In [87]:
adata.obs

Unnamed: 0,age_group,age_harmonised,age_status,file_of_origin,author_ann_harmonised,author_ann_harmonised_broad,donor_harmonised,age_status_granular,score_IFN_alpha,score_IFN_gamma,score_EMT,score_BASALOID,annotation_v1,status,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-AATCACGAGGGTCACA,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,Secretory,Secretory,D1,old_adult-COVID19,-0.077939,0.068120,0.157006,0.418843,AT2,COVID19,AATCACGAGGGTCACA,3pv3,EFO:0009922,CL:0002063
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-GGTAACTAGTCCTACA,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,AT1,AT1,D1,old_adult-COVID19,0.024214,0.111761,0.179795,0.310751,AT1,COVID19,GGTAACTAGTCCTACA,3pv3,EFO:0009922,CL:0002062
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-CATCCGTTCGCAAGAG,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,KRT8+ PATS/ADI/DATPs,KRT8+ PATS/ADI/DATPs,D1,old_adult-COVID19,-0.038762,0.155381,0.258357,0.716596,AT2,COVID19,CATCCGTTCGCAAGAG,3pv3,EFO:0009922,CL:0002063
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-TTCGGTCAGCTTTCCC,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,KRT8+ PATS/ADI/DATPs,KRT8+ PATS/ADI/DATPs,D1,old_adult-COVID19,0.296413,0.208366,0.236939,0.714845,AT2,COVID19,TTCGGTCAGCTTTCCC,3pv3,EFO:0009922,CL:0002063
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-TTAGGCATCCTCACTG,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,Secretory,Secretory,D1,old_adult-COVID19,0.139501,0.097040,0.138514,0.361797,AT2,COVID19,TTAGGCATCCTCACTG,3pv3,EFO:0009922,CL:0002063
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Melms_2021.h5ad:AGCTTCCAGGCATGCA-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,ECM-high epithelial,ECM-high epithelial,L22cov,old_adult-COVID19,-0.078560,0.026703,0.227514,-0.100314,AT2,COVID19,AGCTTCCAGGCATGCA,3pv3,EFO:0009922,CL:0002063
Melms_2021.h5ad:GCACGGTCAGAAGTTA-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,AT2,AT2,L22cov,old_adult-COVID19,0.279772,0.033921,-0.060741,-0.086155,Goblet/Secretory,COVID19,GCACGGTCAGAAGTTA,3pv3,EFO:0009922,CL:0000160
Melms_2021.h5ad:TTGTGGATCATAGGCT-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,AT2,AT2,L22cov,old_adult-COVID19,-0.066474,0.053662,-0.058798,-0.056902,AT2,COVID19,TTGTGGATCATAGGCT,3pv3,EFO:0009922,CL:0002063
Melms_2021.h5ad:GCCGATGAGTAAACGT-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,AT2,AT2,L22cov,old_adult-COVID19,-0.090820,0.002190,0.127114,0.035694,Goblet/Secretory,COVID19,GCCGATGAGTAAACGT,3pv3,EFO:0009922,CL:0000160


#### **development_stage_ontology_term_id**

In [88]:
# identify the column in adata which corresponds to age

In [89]:
adata.obs.columns

Index(['age_group', 'age_harmonised', 'age_status', 'file_of_origin',
       'author_ann_harmonised', 'author_ann_harmonised_broad',
       'donor_harmonised', 'age_status_granular', 'score_IFN_alpha',
       'score_IFN_gamma', 'score_EMT', 'score_BASALOID', 'annotation_v1',
       'status', 'barcodes', 'assay', 'assay_ontology_term_id',
       'cell_type_ontology_term_id'],
      dtype='object')

In [90]:
mapping = {'60-65':'HsapDv:0000241',
'50-55':'HsapDv:0000240',
'75-80':'HsapDv:0000242',
'40-45':'HsapDv:0000239',
'>89':'HsapDv:0000095',
'80-85':'HsapDv:0000243',
'55-60':'HsapDv:0000240',
'65-70':'HsapDv:0000241',
'30-35':'HsapDv:0000238',
'5 years 11 months ':'HsapDv:0000099',
'3 weeks':'HsapDv:0000082',
'6 months':'HsapDv:0000179',
'1 year 10 months':'HsapDv:0000195',
'6 weeks':'HsapDv:0000174',
'3 days':'HsapDv:0000082',
'3 years 11 months':'HsapDv:0000097',
'4 years 9 months':'HsapDv:0000098',
'12 months':'HsapDv:0000185',
'2 years 10 months':'HsapDv:0000096',
'15 years 5 months':'HsapDv:0000109',
'13 days':'HsapDv:0000082',
'3 months':'HsapDv:0000176',
'2 years 2 weeks':'HsapDv:0000096',
'20 months':'HsapDv:0000193',
'56y':'HsapDv:0000150',
'14 years 11 months':'HsapDv:0000108',
'2 years 5 months':'HsapDv:0000096',
'13  years':'HsapDv:0000107',
'11 years':'HsapDv:0000105',
'25y':'HsapDv:0000119',
'10 years':'HsapDv:0000104',
'6 years 9 months':'HsapDv:0000100',
'52y':'HsapDv:0000146',
'2 years 11 months':'HsapDv:0000096',
'12 years 5 months':'HsapDv:0000106',
'67y':'HsapDv:0000161',
'39y':'HsapDv:0000133',
'76y':'HsapDv:0000170',
'16 years':'HsapDv:0000110',
'14 years ':'HsapDv:0000108',
'7 years':'HsapDv:0000101',
'7 yrs 3 months':'HsapDv:0000101',
'70y':'HsapDv:0000164',
'9 years':'HsapDv:0000103',
'73y':'HsapDv:0000167',
'9 days':'HsapDv:0000082',
'65y':'HsapDv:0000159',
'20 days':'HsapDv:0000082',
'4y':'HsapDv:0000098',
'16y':'HsapDv:0000110',
'6 days':'HsapDv:0000082',
'13y':'HsapDv:0000107',
'6m':'HsapDv:0000179',
'1month':'HsapDv:0000174',
'14 years':'HsapDv:0000108',
'14y':'HsapDv:0000108',
'36y':'HsapDv:0000130',
'38y':'HsapDv:0000132',
'55y':'HsapDv:0000149',
'26y':'HsapDv:0000120',
'15y':'HsapDv:0000109',
'46y':'HsapDv:0000140',
'44y':'HsapDv:0000138',
'61y':'HsapDv:0000155',
'66y':'HsapDv:0000160',
'50-59':'HsapDv:0000240',
'30-39':'HsapDv:0000238',
'60-69':'HsapDv:0000241',
'70-79':'HsapDv:0000242',
'40-49':'HsapDv:0000239',
'19-29':'HsapDv:0000088',
'80-89':'HsapDv:0000243',
'36':'HsapDv:0000130',
'40':'HsapDv:0000134',
'17':'HsapDv:0000111',
'30':'HsapDv:0000124',
'41':'HsapDv:0000134',
'31':'HsapDv:0000125',
'54':'HsapDv:0000148',
'38':'HsapDv:0000132',
'52':'HsapDv:0000146',
'25':'HsapDv:0000119',
'28':'HsapDv:0000122',
'57':'HsapDv:0000151',
'59.0':'HsapDv:0000153',
'36.0':'HsapDv:0000130',
'68.0':'HsapDv:0000162',
'34.0':'HsapDv:0000128',
'41.0':'HsapDv:0000135',
'58.0':'HsapDv:0000152',
'24.0':'HsapDv:0000118',
'33.0':'HsapDv:0000127',
'67.0':'HsapDv:0000161',
'76.0':'HsapDv:0000170',
'53.0':'HsapDv:0000147',
'71.0':'HsapDv:0000165',
'51.0':'HsapDv:0000145',
'56.0':'HsapDv:0000150',
'52.0':'HsapDv:0000146',
'70.0':'HsapDv:0000164',
'78.0':'HsapDv:0000172',
'75.0':'HsapDv:0000169',
'84.0':'HsapDv:0000210',
'91.0':'HsapDv:0000217',
'55.0':'HsapDv:0000149',
'66.0':'HsapDv:0000160',
'82.0':'HsapDv:0000208',
'61.0':'HsapDv:0000155',
'32.0':'HsapDv:0000126',
'63.0':'HsapDv:0000157',
'62.0':'HsapDv:0000156',
'50.0':'HsapDv:0000144',
'73.0':'HsapDv:0000167',
'79.0':'HsapDv:0000173',
'29.0':'HsapDv:0000123',
'64.0':'HsapDv:0000158',
'45.0':'HsapDv:0000139',
'21.0':'HsapDv:0000115',
'54.0':'HsapDv:0000148',
'7':'HsapDv:0000101',
'27':'HsapDv:0000121',
'35':'HsapDv:0000129',
'39':'HsapDv:0000133',
'42':'HsapDv:0000136',
'47':'HsapDv:0000141',
'4':'HsapDv:0000098',
'77':'HsapDv:0000171',
'60':'HsapDv:0000154',
'2':'HsapDv:0000096',
'< 1 year':'HsapDv:0000235',
'62':'HsapDv:0000156',
'65':'HsapDv:0000159',
'66':'HsapDv:0000160',
'61':'HsapDv:0000155',
'68':'HsapDv:0000162',
'46':'HsapDv:0000140',
'50':'HsapDv:0000144',
'9':'HsapDv:0000103',
'12':'HsapDv:0000106',
'18':'HsapDv:0000112',
'15':'HsapDv:0000109',
'16':'HsapDv:0000110',
'10':'HsapDv:0000104',
'6':'HsapDv:0000100',
'5':'HsapDv:0000099',
'14':'HsapDv:0000108',
'8':'HsapDv:0000102',
'33':'HsapDv:0000127',
'34':'HsapDv:0000128',
'24':'HsapDv:0000118',
'29':'HsapDv:0000123',
'76':'HsapDv:0000170',
'45':'HsapDv:0000139',
'1':'HsapDv:0000246',
'13':'HsapDv:0000107',
'32':'HsapDv:0000126',
'69.0':'HsapDv:0000163',
'72.0':'HsapDv:0000166',
'80.0':'HsapDv:0000206',
'83.0':'HsapDv:0000209',
'65.0':'HsapDv:0000159'}

In [91]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['age_harmonised'].map(mapping)

In [92]:
# change datatype of the column

In [93]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['development_stage_ontology_term_id'].astype('category')

In [94]:
# view unique values of development_stage_ontology_term_id column

In [95]:
# view adata.obs

In [96]:
adata.obs

Unnamed: 0,age_group,age_harmonised,age_status,file_of_origin,author_ann_harmonised,author_ann_harmonised_broad,donor_harmonised,age_status_granular,score_IFN_alpha,score_IFN_gamma,score_EMT,score_BASALOID,annotation_v1,status,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-AATCACGAGGGTCACA,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,Secretory,Secretory,D1,old_adult-COVID19,-0.077939,0.068120,0.157006,0.418843,AT2,COVID19,AATCACGAGGGTCACA,3pv3,EFO:0009922,CL:0002063,HsapDv:0000241
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-GGTAACTAGTCCTACA,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,AT1,AT1,D1,old_adult-COVID19,0.024214,0.111761,0.179795,0.310751,AT1,COVID19,GGTAACTAGTCCTACA,3pv3,EFO:0009922,CL:0002062,HsapDv:0000241
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-CATCCGTTCGCAAGAG,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,KRT8+ PATS/ADI/DATPs,KRT8+ PATS/ADI/DATPs,D1,old_adult-COVID19,-0.038762,0.155381,0.258357,0.716596,AT2,COVID19,CATCCGTTCGCAAGAG,3pv3,EFO:0009922,CL:0002063,HsapDv:0000241
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-TTCGGTCAGCTTTCCC,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,KRT8+ PATS/ADI/DATPs,KRT8+ PATS/ADI/DATPs,D1,old_adult-COVID19,0.296413,0.208366,0.236939,0.714845,AT2,COVID19,TTCGGTCAGCTTTCCC,3pv3,EFO:0009922,CL:0002063,HsapDv:0000241
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-TTAGGCATCCTCACTG,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,Secretory,Secretory,D1,old_adult-COVID19,0.139501,0.097040,0.138514,0.361797,AT2,COVID19,TTAGGCATCCTCACTG,3pv3,EFO:0009922,CL:0002063,HsapDv:0000241
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Melms_2021.h5ad:AGCTTCCAGGCATGCA-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,ECM-high epithelial,ECM-high epithelial,L22cov,old_adult-COVID19,-0.078560,0.026703,0.227514,-0.100314,AT2,COVID19,AGCTTCCAGGCATGCA,3pv3,EFO:0009922,CL:0002063,HsapDv:0000152
Melms_2021.h5ad:GCACGGTCAGAAGTTA-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,AT2,AT2,L22cov,old_adult-COVID19,0.279772,0.033921,-0.060741,-0.086155,Goblet/Secretory,COVID19,GCACGGTCAGAAGTTA,3pv3,EFO:0009922,CL:0000160,HsapDv:0000152
Melms_2021.h5ad:TTGTGGATCATAGGCT-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,AT2,AT2,L22cov,old_adult-COVID19,-0.066474,0.053662,-0.058798,-0.056902,AT2,COVID19,TTGTGGATCATAGGCT,3pv3,EFO:0009922,CL:0002063,HsapDv:0000152
Melms_2021.h5ad:GCCGATGAGTAAACGT-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,AT2,AT2,L22cov,old_adult-COVID19,-0.090820,0.002190,0.127114,0.035694,Goblet/Secretory,COVID19,GCCGATGAGTAAACGT,3pv3,EFO:0009922,CL:0000160,HsapDv:0000152


#### **donor_id**

In [97]:
#identify the column in adata.obs which provides donor information

In [98]:
adata.obs.columns

Index(['age_group', 'age_harmonised', 'age_status', 'file_of_origin',
       'author_ann_harmonised', 'author_ann_harmonised_broad',
       'donor_harmonised', 'age_status_granular', 'score_IFN_alpha',
       'score_IFN_gamma', 'score_EMT', 'score_BASALOID', 'annotation_v1',
       'status', 'barcodes', 'assay', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id'],
      dtype='object')

In [99]:
# add the donor_id column

In [100]:
adata.obs['donor_id'] = adata.obs['donor_harmonised']

In [101]:
#adata.obs['donor_id'] = ['unknown'] * len(adata.obs['names'])

In [102]:
# change datatype of the column

In [103]:
adata.obs['donor_id'] = adata.obs['donor_id'].astype('category')

In [104]:
# view unique values of donor_id column

In [105]:
#view obs

In [106]:
adata.obs

Unnamed: 0,age_group,age_harmonised,age_status,file_of_origin,author_ann_harmonised,author_ann_harmonised_broad,donor_harmonised,age_status_granular,score_IFN_alpha,score_IFN_gamma,score_EMT,score_BASALOID,annotation_v1,status,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-AATCACGAGGGTCACA,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,Secretory,Secretory,D1,old_adult-COVID19,-0.077939,0.068120,0.157006,0.418843,AT2,COVID19,AATCACGAGGGTCACA,3pv3,EFO:0009922,CL:0002063,HsapDv:0000241,D1
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-GGTAACTAGTCCTACA,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,AT1,AT1,D1,old_adult-COVID19,0.024214,0.111761,0.179795,0.310751,AT1,COVID19,GGTAACTAGTCCTACA,3pv3,EFO:0009922,CL:0002062,HsapDv:0000241,D1
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-CATCCGTTCGCAAGAG,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,KRT8+ PATS/ADI/DATPs,KRT8+ PATS/ADI/DATPs,D1,old_adult-COVID19,-0.038762,0.155381,0.258357,0.716596,AT2,COVID19,CATCCGTTCGCAAGAG,3pv3,EFO:0009922,CL:0002063,HsapDv:0000241,D1
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-TTCGGTCAGCTTTCCC,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,KRT8+ PATS/ADI/DATPs,KRT8+ PATS/ADI/DATPs,D1,old_adult-COVID19,0.296413,0.208366,0.236939,0.714845,AT2,COVID19,TTCGGTCAGCTTTCCC,3pv3,EFO:0009922,CL:0002063,HsapDv:0000241,D1
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-TTAGGCATCCTCACTG,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,Secretory,Secretory,D1,old_adult-COVID19,0.139501,0.097040,0.138514,0.361797,AT2,COVID19,TTAGGCATCCTCACTG,3pv3,EFO:0009922,CL:0002063,HsapDv:0000241,D1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Melms_2021.h5ad:AGCTTCCAGGCATGCA-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,ECM-high epithelial,ECM-high epithelial,L22cov,old_adult-COVID19,-0.078560,0.026703,0.227514,-0.100314,AT2,COVID19,AGCTTCCAGGCATGCA,3pv3,EFO:0009922,CL:0002063,HsapDv:0000152,L22cov
Melms_2021.h5ad:GCACGGTCAGAAGTTA-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,AT2,AT2,L22cov,old_adult-COVID19,0.279772,0.033921,-0.060741,-0.086155,Goblet/Secretory,COVID19,GCACGGTCAGAAGTTA,3pv3,EFO:0009922,CL:0000160,HsapDv:0000152,L22cov
Melms_2021.h5ad:TTGTGGATCATAGGCT-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,AT2,AT2,L22cov,old_adult-COVID19,-0.066474,0.053662,-0.058798,-0.056902,AT2,COVID19,TTGTGGATCATAGGCT,3pv3,EFO:0009922,CL:0002063,HsapDv:0000152,L22cov
Melms_2021.h5ad:GCCGATGAGTAAACGT-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,AT2,AT2,L22cov,old_adult-COVID19,-0.090820,0.002190,0.127114,0.035694,Goblet/Secretory,COVID19,GCCGATGAGTAAACGT,3pv3,EFO:0009922,CL:0000160,HsapDv:0000152,L22cov


#### **disease_ontology_term_id**

In [107]:
# add the disease_ontology_term_id column

In [108]:
list(adata.obs['status'].unique())

['COVID19', 'Healthy']

In [109]:
adata.obs['status'] = adata.obs['status'].astype(str)

In [110]:
mapping= {'COVID19':'MONDO:0100096', 'Healthy':'PATO:0000461'}

In [111]:
adata.obs['disease_ontology_term_id']= adata.obs['status'].map(mapping)

In [112]:
adata.obs

Unnamed: 0,age_group,age_harmonised,age_status,file_of_origin,author_ann_harmonised,author_ann_harmonised_broad,donor_harmonised,age_status_granular,score_IFN_alpha,score_IFN_gamma,...,score_BASALOID,annotation_v1,status,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-AATCACGAGGGTCACA,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,Secretory,Secretory,D1,old_adult-COVID19,-0.077939,0.068120,...,0.418843,AT2,COVID19,AATCACGAGGGTCACA,3pv3,EFO:0009922,CL:0002063,HsapDv:0000241,D1,MONDO:0100096
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-GGTAACTAGTCCTACA,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,AT1,AT1,D1,old_adult-COVID19,0.024214,0.111761,...,0.310751,AT1,COVID19,GGTAACTAGTCCTACA,3pv3,EFO:0009922,CL:0002062,HsapDv:0000241,D1,MONDO:0100096
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-CATCCGTTCGCAAGAG,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,KRT8+ PATS/ADI/DATPs,KRT8+ PATS/ADI/DATPs,D1,old_adult-COVID19,-0.038762,0.155381,...,0.716596,AT2,COVID19,CATCCGTTCGCAAGAG,3pv3,EFO:0009922,CL:0002063,HsapDv:0000241,D1,MONDO:0100096
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-TTCGGTCAGCTTTCCC,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,KRT8+ PATS/ADI/DATPs,KRT8+ PATS/ADI/DATPs,D1,old_adult-COVID19,0.296413,0.208366,...,0.714845,AT2,COVID19,TTCGGTCAGCTTTCCC,3pv3,EFO:0009922,CL:0002063,HsapDv:0000241,D1,MONDO:0100096
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-TTAGGCATCCTCACTG,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,Secretory,Secretory,D1,old_adult-COVID19,0.139501,0.097040,...,0.361797,AT2,COVID19,TTAGGCATCCTCACTG,3pv3,EFO:0009922,CL:0002063,HsapDv:0000241,D1,MONDO:0100096
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Melms_2021.h5ad:AGCTTCCAGGCATGCA-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,ECM-high epithelial,ECM-high epithelial,L22cov,old_adult-COVID19,-0.078560,0.026703,...,-0.100314,AT2,COVID19,AGCTTCCAGGCATGCA,3pv3,EFO:0009922,CL:0002063,HsapDv:0000152,L22cov,MONDO:0100096
Melms_2021.h5ad:GCACGGTCAGAAGTTA-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,AT2,AT2,L22cov,old_adult-COVID19,0.279772,0.033921,...,-0.086155,Goblet/Secretory,COVID19,GCACGGTCAGAAGTTA,3pv3,EFO:0009922,CL:0000160,HsapDv:0000152,L22cov,MONDO:0100096
Melms_2021.h5ad:TTGTGGATCATAGGCT-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,AT2,AT2,L22cov,old_adult-COVID19,-0.066474,0.053662,...,-0.056902,AT2,COVID19,TTGTGGATCATAGGCT,3pv3,EFO:0009922,CL:0002063,HsapDv:0000152,L22cov,MONDO:0100096
Melms_2021.h5ad:GCCGATGAGTAAACGT-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,AT2,AT2,L22cov,old_adult-COVID19,-0.090820,0.002190,...,0.035694,Goblet/Secretory,COVID19,GCCGATGAGTAAACGT,3pv3,EFO:0009922,CL:0000160,HsapDv:0000152,L22cov,MONDO:0100096


In [113]:
# change datatype of the column

In [114]:
adata.obs['disease_ontology_term_id'] = adata.obs['disease_ontology_term_id'].astype('category')

In [115]:
# view obs

In [116]:
adata.obs

Unnamed: 0,age_group,age_harmonised,age_status,file_of_origin,author_ann_harmonised,author_ann_harmonised_broad,donor_harmonised,age_status_granular,score_IFN_alpha,score_IFN_gamma,...,score_BASALOID,annotation_v1,status,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-AATCACGAGGGTCACA,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,Secretory,Secretory,D1,old_adult-COVID19,-0.077939,0.068120,...,0.418843,AT2,COVID19,AATCACGAGGGTCACA,3pv3,EFO:0009922,CL:0002063,HsapDv:0000241,D1,MONDO:0100096
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-GGTAACTAGTCCTACA,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,AT1,AT1,D1,old_adult-COVID19,0.024214,0.111761,...,0.310751,AT1,COVID19,GGTAACTAGTCCTACA,3pv3,EFO:0009922,CL:0002062,HsapDv:0000241,D1,MONDO:0100096
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-CATCCGTTCGCAAGAG,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,KRT8+ PATS/ADI/DATPs,KRT8+ PATS/ADI/DATPs,D1,old_adult-COVID19,-0.038762,0.155381,...,0.716596,AT2,COVID19,CATCCGTTCGCAAGAG,3pv3,EFO:0009922,CL:0002063,HsapDv:0000241,D1,MONDO:0100096
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-TTCGGTCAGCTTTCCC,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,KRT8+ PATS/ADI/DATPs,KRT8+ PATS/ADI/DATPs,D1,old_adult-COVID19,0.296413,0.208366,...,0.714845,AT2,COVID19,TTCGGTCAGCTTTCCC,3pv3,EFO:0009922,CL:0002063,HsapDv:0000241,D1,MONDO:0100096
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-TTAGGCATCCTCACTG,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,Secretory,Secretory,D1,old_adult-COVID19,0.139501,0.097040,...,0.361797,AT2,COVID19,TTAGGCATCCTCACTG,3pv3,EFO:0009922,CL:0002063,HsapDv:0000241,D1,MONDO:0100096
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Melms_2021.h5ad:AGCTTCCAGGCATGCA-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,ECM-high epithelial,ECM-high epithelial,L22cov,old_adult-COVID19,-0.078560,0.026703,...,-0.100314,AT2,COVID19,AGCTTCCAGGCATGCA,3pv3,EFO:0009922,CL:0002063,HsapDv:0000152,L22cov,MONDO:0100096
Melms_2021.h5ad:GCACGGTCAGAAGTTA-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,AT2,AT2,L22cov,old_adult-COVID19,0.279772,0.033921,...,-0.086155,Goblet/Secretory,COVID19,GCACGGTCAGAAGTTA,3pv3,EFO:0009922,CL:0000160,HsapDv:0000152,L22cov,MONDO:0100096
Melms_2021.h5ad:TTGTGGATCATAGGCT-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,AT2,AT2,L22cov,old_adult-COVID19,-0.066474,0.053662,...,-0.056902,AT2,COVID19,TTGTGGATCATAGGCT,3pv3,EFO:0009922,CL:0002063,HsapDv:0000152,L22cov,MONDO:0100096
Melms_2021.h5ad:GCCGATGAGTAAACGT-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,AT2,AT2,L22cov,old_adult-COVID19,-0.090820,0.002190,...,0.035694,Goblet/Secretory,COVID19,GCCGATGAGTAAACGT,3pv3,EFO:0009922,CL:0000160,HsapDv:0000152,L22cov,MONDO:0100096


#### **is_primary_data**

In [117]:
list(adata.obs['file_of_origin'].unique())

['Delorey_2021_COVID_only.h5ad',
 'Yoshida_2022.h5ad',
 'Ziegler_2021.h5ad',
 'Bharat_2020.h5ad',
 'Trump_2020.h5ad',
 'Chua_2020.h5ad',
 'Loske_2021.h5ad',
 'Melms_2021.h5ad']

In [118]:
mapping = {'Delorey_2021_COVID_only.h5ad':False,
 'Yoshida_2022.h5ad':False,
 'Ziegler_2021.h5ad':False,
 'Bharat_2020.h5ad':False,
 'Trump_2020.h5ad':True,
 'Chua_2020.h5ad':True,
 'Loske_2021.h5ad':True,
 'Melms_2021.h5ad':False}

In [119]:
adata.obs['is_primary_data']= adata.obs['file_of_origin'].map(mapping)

In [120]:
list(adata.obs['is_primary_data'].unique())

[False, True]

In [121]:
adata.obs

Unnamed: 0,age_group,age_harmonised,age_status,file_of_origin,author_ann_harmonised,author_ann_harmonised_broad,donor_harmonised,age_status_granular,score_IFN_alpha,score_IFN_gamma,...,annotation_v1,status,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-AATCACGAGGGTCACA,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,Secretory,Secretory,D1,old_adult-COVID19,-0.077939,0.068120,...,AT2,COVID19,AATCACGAGGGTCACA,3pv3,EFO:0009922,CL:0002063,HsapDv:0000241,D1,MONDO:0100096,False
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-GGTAACTAGTCCTACA,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,AT1,AT1,D1,old_adult-COVID19,0.024214,0.111761,...,AT1,COVID19,GGTAACTAGTCCTACA,3pv3,EFO:0009922,CL:0002062,HsapDv:0000241,D1,MONDO:0100096,False
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-CATCCGTTCGCAAGAG,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,KRT8+ PATS/ADI/DATPs,KRT8+ PATS/ADI/DATPs,D1,old_adult-COVID19,-0.038762,0.155381,...,AT2,COVID19,CATCCGTTCGCAAGAG,3pv3,EFO:0009922,CL:0002063,HsapDv:0000241,D1,MONDO:0100096,False
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-TTCGGTCAGCTTTCCC,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,KRT8+ PATS/ADI/DATPs,KRT8+ PATS/ADI/DATPs,D1,old_adult-COVID19,0.296413,0.208366,...,AT2,COVID19,TTCGGTCAGCTTTCCC,3pv3,EFO:0009922,CL:0002063,HsapDv:0000241,D1,MONDO:0100096,False
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-TTAGGCATCCTCACTG,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,Secretory,Secretory,D1,old_adult-COVID19,0.139501,0.097040,...,AT2,COVID19,TTAGGCATCCTCACTG,3pv3,EFO:0009922,CL:0002063,HsapDv:0000241,D1,MONDO:0100096,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Melms_2021.h5ad:AGCTTCCAGGCATGCA-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,ECM-high epithelial,ECM-high epithelial,L22cov,old_adult-COVID19,-0.078560,0.026703,...,AT2,COVID19,AGCTTCCAGGCATGCA,3pv3,EFO:0009922,CL:0002063,HsapDv:0000152,L22cov,MONDO:0100096,False
Melms_2021.h5ad:GCACGGTCAGAAGTTA-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,AT2,AT2,L22cov,old_adult-COVID19,0.279772,0.033921,...,Goblet/Secretory,COVID19,GCACGGTCAGAAGTTA,3pv3,EFO:0009922,CL:0000160,HsapDv:0000152,L22cov,MONDO:0100096,False
Melms_2021.h5ad:TTGTGGATCATAGGCT-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,AT2,AT2,L22cov,old_adult-COVID19,-0.066474,0.053662,...,AT2,COVID19,TTGTGGATCATAGGCT,3pv3,EFO:0009922,CL:0002063,HsapDv:0000152,L22cov,MONDO:0100096,False
Melms_2021.h5ad:GCCGATGAGTAAACGT-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,AT2,AT2,L22cov,old_adult-COVID19,-0.090820,0.002190,...,Goblet/Secretory,COVID19,GCCGATGAGTAAACGT,3pv3,EFO:0009922,CL:0000160,HsapDv:0000152,L22cov,MONDO:0100096,False


In [122]:
#change data type of column

In [123]:
adata.obs['is_primary_data'] = adata.obs['is_primary_data'].astype('bool')

In [124]:
# view obs

In [125]:
adata.obs

Unnamed: 0,age_group,age_harmonised,age_status,file_of_origin,author_ann_harmonised,author_ann_harmonised_broad,donor_harmonised,age_status_granular,score_IFN_alpha,score_IFN_gamma,...,annotation_v1,status,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-AATCACGAGGGTCACA,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,Secretory,Secretory,D1,old_adult-COVID19,-0.077939,0.068120,...,AT2,COVID19,AATCACGAGGGTCACA,3pv3,EFO:0009922,CL:0002063,HsapDv:0000241,D1,MONDO:0100096,False
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-GGTAACTAGTCCTACA,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,AT1,AT1,D1,old_adult-COVID19,0.024214,0.111761,...,AT1,COVID19,GGTAACTAGTCCTACA,3pv3,EFO:0009922,CL:0002062,HsapDv:0000241,D1,MONDO:0100096,False
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-CATCCGTTCGCAAGAG,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,KRT8+ PATS/ADI/DATPs,KRT8+ PATS/ADI/DATPs,D1,old_adult-COVID19,-0.038762,0.155381,...,AT2,COVID19,CATCCGTTCGCAAGAG,3pv3,EFO:0009922,CL:0002063,HsapDv:0000241,D1,MONDO:0100096,False
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-TTCGGTCAGCTTTCCC,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,KRT8+ PATS/ADI/DATPs,KRT8+ PATS/ADI/DATPs,D1,old_adult-COVID19,0.296413,0.208366,...,AT2,COVID19,TTCGGTCAGCTTTCCC,3pv3,EFO:0009922,CL:0002063,HsapDv:0000241,D1,MONDO:0100096,False
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-TTAGGCATCCTCACTG,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,Secretory,Secretory,D1,old_adult-COVID19,0.139501,0.097040,...,AT2,COVID19,TTAGGCATCCTCACTG,3pv3,EFO:0009922,CL:0002063,HsapDv:0000241,D1,MONDO:0100096,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Melms_2021.h5ad:AGCTTCCAGGCATGCA-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,ECM-high epithelial,ECM-high epithelial,L22cov,old_adult-COVID19,-0.078560,0.026703,...,AT2,COVID19,AGCTTCCAGGCATGCA,3pv3,EFO:0009922,CL:0002063,HsapDv:0000152,L22cov,MONDO:0100096,False
Melms_2021.h5ad:GCACGGTCAGAAGTTA-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,AT2,AT2,L22cov,old_adult-COVID19,0.279772,0.033921,...,Goblet/Secretory,COVID19,GCACGGTCAGAAGTTA,3pv3,EFO:0009922,CL:0000160,HsapDv:0000152,L22cov,MONDO:0100096,False
Melms_2021.h5ad:TTGTGGATCATAGGCT-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,AT2,AT2,L22cov,old_adult-COVID19,-0.066474,0.053662,...,AT2,COVID19,TTGTGGATCATAGGCT,3pv3,EFO:0009922,CL:0002063,HsapDv:0000152,L22cov,MONDO:0100096,False
Melms_2021.h5ad:GCCGATGAGTAAACGT-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,AT2,AT2,L22cov,old_adult-COVID19,-0.090820,0.002190,...,Goblet/Secretory,COVID19,GCCGATGAGTAAACGT,3pv3,EFO:0009922,CL:0000160,HsapDv:0000152,L22cov,MONDO:0100096,False


#### **organism_ontology_term_id**

In [126]:
# assign organism id 

In [127]:
adata.obs['organism_ontology_term_id'] = ['NCBITaxon:9606'] * len(adata.obs)

In [128]:
#change data type of column

In [129]:
adata.obs['organism_ontology_term_id'] = adata.obs['organism_ontology_term_id'].astype('category')

In [130]:
# view obs

In [131]:
adata.obs

Unnamed: 0,age_group,age_harmonised,age_status,file_of_origin,author_ann_harmonised,author_ann_harmonised_broad,donor_harmonised,age_status_granular,score_IFN_alpha,score_IFN_gamma,...,status,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-AATCACGAGGGTCACA,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,Secretory,Secretory,D1,old_adult-COVID19,-0.077939,0.068120,...,COVID19,AATCACGAGGGTCACA,3pv3,EFO:0009922,CL:0002063,HsapDv:0000241,D1,MONDO:0100096,False,NCBITaxon:9606
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-GGTAACTAGTCCTACA,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,AT1,AT1,D1,old_adult-COVID19,0.024214,0.111761,...,COVID19,GGTAACTAGTCCTACA,3pv3,EFO:0009922,CL:0002062,HsapDv:0000241,D1,MONDO:0100096,False,NCBITaxon:9606
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-CATCCGTTCGCAAGAG,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,KRT8+ PATS/ADI/DATPs,KRT8+ PATS/ADI/DATPs,D1,old_adult-COVID19,-0.038762,0.155381,...,COVID19,CATCCGTTCGCAAGAG,3pv3,EFO:0009922,CL:0002063,HsapDv:0000241,D1,MONDO:0100096,False,NCBITaxon:9606
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-TTCGGTCAGCTTTCCC,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,KRT8+ PATS/ADI/DATPs,KRT8+ PATS/ADI/DATPs,D1,old_adult-COVID19,0.296413,0.208366,...,COVID19,TTCGGTCAGCTTTCCC,3pv3,EFO:0009922,CL:0002063,HsapDv:0000241,D1,MONDO:0100096,False,NCBITaxon:9606
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-TTAGGCATCCTCACTG,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,Secretory,Secretory,D1,old_adult-COVID19,0.139501,0.097040,...,COVID19,TTAGGCATCCTCACTG,3pv3,EFO:0009922,CL:0002063,HsapDv:0000241,D1,MONDO:0100096,False,NCBITaxon:9606
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Melms_2021.h5ad:AGCTTCCAGGCATGCA-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,ECM-high epithelial,ECM-high epithelial,L22cov,old_adult-COVID19,-0.078560,0.026703,...,COVID19,AGCTTCCAGGCATGCA,3pv3,EFO:0009922,CL:0002063,HsapDv:0000152,L22cov,MONDO:0100096,False,NCBITaxon:9606
Melms_2021.h5ad:GCACGGTCAGAAGTTA-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,AT2,AT2,L22cov,old_adult-COVID19,0.279772,0.033921,...,COVID19,GCACGGTCAGAAGTTA,3pv3,EFO:0009922,CL:0000160,HsapDv:0000152,L22cov,MONDO:0100096,False,NCBITaxon:9606
Melms_2021.h5ad:TTGTGGATCATAGGCT-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,AT2,AT2,L22cov,old_adult-COVID19,-0.066474,0.053662,...,COVID19,TTGTGGATCATAGGCT,3pv3,EFO:0009922,CL:0002063,HsapDv:0000152,L22cov,MONDO:0100096,False,NCBITaxon:9606
Melms_2021.h5ad:GCCGATGAGTAAACGT-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,AT2,AT2,L22cov,old_adult-COVID19,-0.090820,0.002190,...,COVID19,GCCGATGAGTAAACGT,3pv3,EFO:0009922,CL:0000160,HsapDv:0000152,L22cov,MONDO:0100096,False,NCBITaxon:9606


#### **self_reported_ethnicity_ontology_term_id**

In [132]:
adata.obs['self_reported_ethnicity_ontology_term_id'] = ['unknown'] * len(adata.obs)

In [133]:
# change data type

In [134]:
adata.obs['self_reported_ethnicity_ontology_term_id'] = adata.obs['self_reported_ethnicity_ontology_term_id'].astype('category')

In [135]:
# view obs

In [136]:
adata.obs

Unnamed: 0,age_group,age_harmonised,age_status,file_of_origin,author_ann_harmonised,author_ann_harmonised_broad,donor_harmonised,age_status_granular,score_IFN_alpha,score_IFN_gamma,...,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-AATCACGAGGGTCACA,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,Secretory,Secretory,D1,old_adult-COVID19,-0.077939,0.068120,...,AATCACGAGGGTCACA,3pv3,EFO:0009922,CL:0002063,HsapDv:0000241,D1,MONDO:0100096,False,NCBITaxon:9606,unknown
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-GGTAACTAGTCCTACA,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,AT1,AT1,D1,old_adult-COVID19,0.024214,0.111761,...,GGTAACTAGTCCTACA,3pv3,EFO:0009922,CL:0002062,HsapDv:0000241,D1,MONDO:0100096,False,NCBITaxon:9606,unknown
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-CATCCGTTCGCAAGAG,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,KRT8+ PATS/ADI/DATPs,KRT8+ PATS/ADI/DATPs,D1,old_adult-COVID19,-0.038762,0.155381,...,CATCCGTTCGCAAGAG,3pv3,EFO:0009922,CL:0002063,HsapDv:0000241,D1,MONDO:0100096,False,NCBITaxon:9606,unknown
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-TTCGGTCAGCTTTCCC,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,KRT8+ PATS/ADI/DATPs,KRT8+ PATS/ADI/DATPs,D1,old_adult-COVID19,0.296413,0.208366,...,TTCGGTCAGCTTTCCC,3pv3,EFO:0009922,CL:0002063,HsapDv:0000241,D1,MONDO:0100096,False,NCBITaxon:9606,unknown
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-TTAGGCATCCTCACTG,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,Secretory,Secretory,D1,old_adult-COVID19,0.139501,0.097040,...,TTAGGCATCCTCACTG,3pv3,EFO:0009922,CL:0002063,HsapDv:0000241,D1,MONDO:0100096,False,NCBITaxon:9606,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Melms_2021.h5ad:AGCTTCCAGGCATGCA-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,ECM-high epithelial,ECM-high epithelial,L22cov,old_adult-COVID19,-0.078560,0.026703,...,AGCTTCCAGGCATGCA,3pv3,EFO:0009922,CL:0002063,HsapDv:0000152,L22cov,MONDO:0100096,False,NCBITaxon:9606,unknown
Melms_2021.h5ad:GCACGGTCAGAAGTTA-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,AT2,AT2,L22cov,old_adult-COVID19,0.279772,0.033921,...,GCACGGTCAGAAGTTA,3pv3,EFO:0009922,CL:0000160,HsapDv:0000152,L22cov,MONDO:0100096,False,NCBITaxon:9606,unknown
Melms_2021.h5ad:TTGTGGATCATAGGCT-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,AT2,AT2,L22cov,old_adult-COVID19,-0.066474,0.053662,...,TTGTGGATCATAGGCT,3pv3,EFO:0009922,CL:0002063,HsapDv:0000152,L22cov,MONDO:0100096,False,NCBITaxon:9606,unknown
Melms_2021.h5ad:GCCGATGAGTAAACGT-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,AT2,AT2,L22cov,old_adult-COVID19,-0.090820,0.002190,...,GCCGATGAGTAAACGT,3pv3,EFO:0009922,CL:0000160,HsapDv:0000152,L22cov,MONDO:0100096,False,NCBITaxon:9606,unknown


#### **sex_ontology_term_id**

In [137]:
# identify the column in adata.obs which corresponds to sex

In [138]:
adata.obs.columns

Index(['age_group', 'age_harmonised', 'age_status', 'file_of_origin',
       'author_ann_harmonised', 'author_ann_harmonised_broad',
       'donor_harmonised', 'age_status_granular', 'score_IFN_alpha',
       'score_IFN_gamma', 'score_EMT', 'score_BASALOID', 'annotation_v1',
       'status', 'barcodes', 'assay', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'donor_id', 'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id'],
      dtype='object')

In [139]:
adata.obs

Unnamed: 0,age_group,age_harmonised,age_status,file_of_origin,author_ann_harmonised,author_ann_harmonised_broad,donor_harmonised,age_status_granular,score_IFN_alpha,score_IFN_gamma,...,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-AATCACGAGGGTCACA,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,Secretory,Secretory,D1,old_adult-COVID19,-0.077939,0.068120,...,AATCACGAGGGTCACA,3pv3,EFO:0009922,CL:0002063,HsapDv:0000241,D1,MONDO:0100096,False,NCBITaxon:9606,unknown
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-GGTAACTAGTCCTACA,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,AT1,AT1,D1,old_adult-COVID19,0.024214,0.111761,...,GGTAACTAGTCCTACA,3pv3,EFO:0009922,CL:0002062,HsapDv:0000241,D1,MONDO:0100096,False,NCBITaxon:9606,unknown
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-CATCCGTTCGCAAGAG,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,KRT8+ PATS/ADI/DATPs,KRT8+ PATS/ADI/DATPs,D1,old_adult-COVID19,-0.038762,0.155381,...,CATCCGTTCGCAAGAG,3pv3,EFO:0009922,CL:0002063,HsapDv:0000241,D1,MONDO:0100096,False,NCBITaxon:9606,unknown
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-TTCGGTCAGCTTTCCC,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,KRT8+ PATS/ADI/DATPs,KRT8+ PATS/ADI/DATPs,D1,old_adult-COVID19,0.296413,0.208366,...,TTCGGTCAGCTTTCCC,3pv3,EFO:0009922,CL:0002063,HsapDv:0000241,D1,MONDO:0100096,False,NCBITaxon:9606,unknown
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-TTAGGCATCCTCACTG,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,Secretory,Secretory,D1,old_adult-COVID19,0.139501,0.097040,...,TTAGGCATCCTCACTG,3pv3,EFO:0009922,CL:0002063,HsapDv:0000241,D1,MONDO:0100096,False,NCBITaxon:9606,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Melms_2021.h5ad:AGCTTCCAGGCATGCA-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,ECM-high epithelial,ECM-high epithelial,L22cov,old_adult-COVID19,-0.078560,0.026703,...,AGCTTCCAGGCATGCA,3pv3,EFO:0009922,CL:0002063,HsapDv:0000152,L22cov,MONDO:0100096,False,NCBITaxon:9606,unknown
Melms_2021.h5ad:GCACGGTCAGAAGTTA-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,AT2,AT2,L22cov,old_adult-COVID19,0.279772,0.033921,...,GCACGGTCAGAAGTTA,3pv3,EFO:0009922,CL:0000160,HsapDv:0000152,L22cov,MONDO:0100096,False,NCBITaxon:9606,unknown
Melms_2021.h5ad:TTGTGGATCATAGGCT-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,AT2,AT2,L22cov,old_adult-COVID19,-0.066474,0.053662,...,TTGTGGATCATAGGCT,3pv3,EFO:0009922,CL:0002063,HsapDv:0000152,L22cov,MONDO:0100096,False,NCBITaxon:9606,unknown
Melms_2021.h5ad:GCCGATGAGTAAACGT-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,AT2,AT2,L22cov,old_adult-COVID19,-0.090820,0.002190,...,GCCGATGAGTAAACGT,3pv3,EFO:0009922,CL:0000160,HsapDv:0000152,L22cov,MONDO:0100096,False,NCBITaxon:9606,unknown


In [140]:
# add sex_ontology_term_id column

In [141]:
adata.obs['sex_ontology_term_id'] = ['unknown'] * len(adata.obs)

In [142]:
# change data type

In [143]:
adata.obs['sex_ontology_term_id'] = adata.obs['sex_ontology_term_id'].astype('category')

In [144]:
adata.obs

Unnamed: 0,age_group,age_harmonised,age_status,file_of_origin,author_ann_harmonised,author_ann_harmonised_broad,donor_harmonised,age_status_granular,score_IFN_alpha,score_IFN_gamma,...,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-AATCACGAGGGTCACA,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,Secretory,Secretory,D1,old_adult-COVID19,-0.077939,0.068120,...,3pv3,EFO:0009922,CL:0002063,HsapDv:0000241,D1,MONDO:0100096,False,NCBITaxon:9606,unknown,unknown
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-GGTAACTAGTCCTACA,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,AT1,AT1,D1,old_adult-COVID19,0.024214,0.111761,...,3pv3,EFO:0009922,CL:0002062,HsapDv:0000241,D1,MONDO:0100096,False,NCBITaxon:9606,unknown,unknown
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-CATCCGTTCGCAAGAG,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,KRT8+ PATS/ADI/DATPs,KRT8+ PATS/ADI/DATPs,D1,old_adult-COVID19,-0.038762,0.155381,...,3pv3,EFO:0009922,CL:0002063,HsapDv:0000241,D1,MONDO:0100096,False,NCBITaxon:9606,unknown,unknown
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-TTCGGTCAGCTTTCCC,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,KRT8+ PATS/ADI/DATPs,KRT8+ PATS/ADI/DATPs,D1,old_adult-COVID19,0.296413,0.208366,...,3pv3,EFO:0009922,CL:0002063,HsapDv:0000241,D1,MONDO:0100096,False,NCBITaxon:9606,unknown,unknown
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-TTAGGCATCCTCACTG,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,Secretory,Secretory,D1,old_adult-COVID19,0.139501,0.097040,...,3pv3,EFO:0009922,CL:0002063,HsapDv:0000241,D1,MONDO:0100096,False,NCBITaxon:9606,unknown,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Melms_2021.h5ad:AGCTTCCAGGCATGCA-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,ECM-high epithelial,ECM-high epithelial,L22cov,old_adult-COVID19,-0.078560,0.026703,...,3pv3,EFO:0009922,CL:0002063,HsapDv:0000152,L22cov,MONDO:0100096,False,NCBITaxon:9606,unknown,unknown
Melms_2021.h5ad:GCACGGTCAGAAGTTA-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,AT2,AT2,L22cov,old_adult-COVID19,0.279772,0.033921,...,3pv3,EFO:0009922,CL:0000160,HsapDv:0000152,L22cov,MONDO:0100096,False,NCBITaxon:9606,unknown,unknown
Melms_2021.h5ad:TTGTGGATCATAGGCT-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,AT2,AT2,L22cov,old_adult-COVID19,-0.066474,0.053662,...,3pv3,EFO:0009922,CL:0002063,HsapDv:0000152,L22cov,MONDO:0100096,False,NCBITaxon:9606,unknown,unknown
Melms_2021.h5ad:GCCGATGAGTAAACGT-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,AT2,AT2,L22cov,old_adult-COVID19,-0.090820,0.002190,...,3pv3,EFO:0009922,CL:0000160,HsapDv:0000152,L22cov,MONDO:0100096,False,NCBITaxon:9606,unknown,unknown


#### **suspension_type**

In [145]:
adata.obs

Unnamed: 0,age_group,age_harmonised,age_status,file_of_origin,author_ann_harmonised,author_ann_harmonised_broad,donor_harmonised,age_status_granular,score_IFN_alpha,score_IFN_gamma,...,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-AATCACGAGGGTCACA,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,Secretory,Secretory,D1,old_adult-COVID19,-0.077939,0.068120,...,3pv3,EFO:0009922,CL:0002063,HsapDv:0000241,D1,MONDO:0100096,False,NCBITaxon:9606,unknown,unknown
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-GGTAACTAGTCCTACA,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,AT1,AT1,D1,old_adult-COVID19,0.024214,0.111761,...,3pv3,EFO:0009922,CL:0002062,HsapDv:0000241,D1,MONDO:0100096,False,NCBITaxon:9606,unknown,unknown
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-CATCCGTTCGCAAGAG,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,KRT8+ PATS/ADI/DATPs,KRT8+ PATS/ADI/DATPs,D1,old_adult-COVID19,-0.038762,0.155381,...,3pv3,EFO:0009922,CL:0002063,HsapDv:0000241,D1,MONDO:0100096,False,NCBITaxon:9606,unknown,unknown
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-TTCGGTCAGCTTTCCC,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,KRT8+ PATS/ADI/DATPs,KRT8+ PATS/ADI/DATPs,D1,old_adult-COVID19,0.296413,0.208366,...,3pv3,EFO:0009922,CL:0002063,HsapDv:0000241,D1,MONDO:0100096,False,NCBITaxon:9606,unknown,unknown
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-TTAGGCATCCTCACTG,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,Secretory,Secretory,D1,old_adult-COVID19,0.139501,0.097040,...,3pv3,EFO:0009922,CL:0002063,HsapDv:0000241,D1,MONDO:0100096,False,NCBITaxon:9606,unknown,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Melms_2021.h5ad:AGCTTCCAGGCATGCA-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,ECM-high epithelial,ECM-high epithelial,L22cov,old_adult-COVID19,-0.078560,0.026703,...,3pv3,EFO:0009922,CL:0002063,HsapDv:0000152,L22cov,MONDO:0100096,False,NCBITaxon:9606,unknown,unknown
Melms_2021.h5ad:GCACGGTCAGAAGTTA-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,AT2,AT2,L22cov,old_adult-COVID19,0.279772,0.033921,...,3pv3,EFO:0009922,CL:0000160,HsapDv:0000152,L22cov,MONDO:0100096,False,NCBITaxon:9606,unknown,unknown
Melms_2021.h5ad:TTGTGGATCATAGGCT-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,AT2,AT2,L22cov,old_adult-COVID19,-0.066474,0.053662,...,3pv3,EFO:0009922,CL:0002063,HsapDv:0000152,L22cov,MONDO:0100096,False,NCBITaxon:9606,unknown,unknown
Melms_2021.h5ad:GCCGATGAGTAAACGT-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,AT2,AT2,L22cov,old_adult-COVID19,-0.090820,0.002190,...,3pv3,EFO:0009922,CL:0000160,HsapDv:0000152,L22cov,MONDO:0100096,False,NCBITaxon:9606,unknown,unknown


In [146]:
mapping = {'Delorey_2021_COVID_only.h5ad':'cell',
 'Yoshida_2022.h5ad':'cell',
 'Ziegler_2021.h5ad':'cell',
 'Bharat_2020.h5ad':'cell',
 'Trump_2020.h5ad':'cell',
 'Chua_2020.h5ad':'cell',
 'Loske_2021.h5ad':'cell',
 'Melms_2021.h5ad':'nucleus'}

In [147]:
adata.obs['suspension_type'] = adata.obs['file_of_origin'].map(mapping)

In [148]:
# change data type of column

In [149]:
adata.obs['suspension_type'] = adata.obs['suspension_type'].astype('category')

In [150]:
# view obs

In [151]:
adata.obs

Unnamed: 0,age_group,age_harmonised,age_status,file_of_origin,author_ann_harmonised,author_ann_harmonised_broad,donor_harmonised,age_status_granular,score_IFN_alpha,score_IFN_gamma,...,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-AATCACGAGGGTCACA,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,Secretory,Secretory,D1,old_adult-COVID19,-0.077939,0.068120,...,EFO:0009922,CL:0002063,HsapDv:0000241,D1,MONDO:0100096,False,NCBITaxon:9606,unknown,unknown,cell
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-GGTAACTAGTCCTACA,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,AT1,AT1,D1,old_adult-COVID19,0.024214,0.111761,...,EFO:0009922,CL:0002062,HsapDv:0000241,D1,MONDO:0100096,False,NCBITaxon:9606,unknown,unknown,cell
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-CATCCGTTCGCAAGAG,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,KRT8+ PATS/ADI/DATPs,KRT8+ PATS/ADI/DATPs,D1,old_adult-COVID19,-0.038762,0.155381,...,EFO:0009922,CL:0002063,HsapDv:0000241,D1,MONDO:0100096,False,NCBITaxon:9606,unknown,unknown,cell
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-TTCGGTCAGCTTTCCC,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,KRT8+ PATS/ADI/DATPs,KRT8+ PATS/ADI/DATPs,D1,old_adult-COVID19,0.296413,0.208366,...,EFO:0009922,CL:0002063,HsapDv:0000241,D1,MONDO:0100096,False,NCBITaxon:9606,unknown,unknown,cell
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-TTAGGCATCCTCACTG,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,Secretory,Secretory,D1,old_adult-COVID19,0.139501,0.097040,...,EFO:0009922,CL:0002063,HsapDv:0000241,D1,MONDO:0100096,False,NCBITaxon:9606,unknown,unknown,cell
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Melms_2021.h5ad:AGCTTCCAGGCATGCA-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,ECM-high epithelial,ECM-high epithelial,L22cov,old_adult-COVID19,-0.078560,0.026703,...,EFO:0009922,CL:0002063,HsapDv:0000152,L22cov,MONDO:0100096,False,NCBITaxon:9606,unknown,unknown,nucleus
Melms_2021.h5ad:GCACGGTCAGAAGTTA-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,AT2,AT2,L22cov,old_adult-COVID19,0.279772,0.033921,...,EFO:0009922,CL:0000160,HsapDv:0000152,L22cov,MONDO:0100096,False,NCBITaxon:9606,unknown,unknown,nucleus
Melms_2021.h5ad:TTGTGGATCATAGGCT-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,AT2,AT2,L22cov,old_adult-COVID19,-0.066474,0.053662,...,EFO:0009922,CL:0002063,HsapDv:0000152,L22cov,MONDO:0100096,False,NCBITaxon:9606,unknown,unknown,nucleus
Melms_2021.h5ad:GCCGATGAGTAAACGT-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,AT2,AT2,L22cov,old_adult-COVID19,-0.090820,0.002190,...,EFO:0009922,CL:0000160,HsapDv:0000152,L22cov,MONDO:0100096,False,NCBITaxon:9606,unknown,unknown,nucleus


#### **tissue_type**

In [152]:
adata.obs['tissue_type'] = ['tissue'] * len(adata.obs)

In [153]:
adata.obs['tissue_type'] = adata.obs['tissue_type'].astype('category')

#### **tissue_ontology_term_id**

In [154]:
# identify the column in adata.obs which corresponds to tissue

In [155]:
adata.obs.columns

Index(['age_group', 'age_harmonised', 'age_status', 'file_of_origin',
       'author_ann_harmonised', 'author_ann_harmonised_broad',
       'donor_harmonised', 'age_status_granular', 'score_IFN_alpha',
       'score_IFN_gamma', 'score_EMT', 'score_BASALOID', 'annotation_v1',
       'status', 'barcodes', 'assay', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'donor_id', 'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'suspension_type', 'tissue_type'],
      dtype='object')

In [156]:
mapping = {'Delorey_2021_COVID_only.h5ad':'UBERON:0002048',
 'Yoshida_2022.h5ad':'UBERON:0001005',
 'Ziegler_2021.h5ad':'UBERON:0001728',
 'Bharat_2020.h5ad':'UBERON:0002048',
 'Trump_2020.h5ad':'UBERON:0001728',
 'Chua_2020.h5ad':'UBERON:0001005',
 'Loske_2021.h5ad':'UBERON:0001728',
 'Melms_2021.h5ad':'UBERON:0002048'}

In [157]:
# add 'tissue_ontology_term_id' column

In [158]:
adata.obs['tissue_ontology_term_id'] = adata.obs['file_of_origin'].map(mapping)

In [159]:
# change data type of column

In [160]:
adata.obs['tissue_ontology_term_id'] = adata.obs['tissue_ontology_term_id'].astype('category')

In [161]:
#list the unique values in 'tissue_ontology_term_id' column

In [162]:
list(adata.obs['tissue_ontology_term_id'].unique())

['UBERON:0002048', 'UBERON:0001005', 'UBERON:0001728']

In [163]:
# view obs

In [164]:
adata.obs

Unnamed: 0,age_group,age_harmonised,age_status,file_of_origin,author_ann_harmonised,author_ann_harmonised_broad,donor_harmonised,age_status_granular,score_IFN_alpha,score_IFN_gamma,...,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_type,tissue_ontology_term_id
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-AATCACGAGGGTCACA,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,Secretory,Secretory,D1,old_adult-COVID19,-0.077939,0.068120,...,HsapDv:0000241,D1,MONDO:0100096,False,NCBITaxon:9606,unknown,unknown,cell,tissue,UBERON:0002048
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-GGTAACTAGTCCTACA,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,AT1,AT1,D1,old_adult-COVID19,0.024214,0.111761,...,HsapDv:0000241,D1,MONDO:0100096,False,NCBITaxon:9606,unknown,unknown,cell,tissue,UBERON:0002048
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-CATCCGTTCGCAAGAG,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,KRT8+ PATS/ADI/DATPs,KRT8+ PATS/ADI/DATPs,D1,old_adult-COVID19,-0.038762,0.155381,...,HsapDv:0000241,D1,MONDO:0100096,False,NCBITaxon:9606,unknown,unknown,cell,tissue,UBERON:0002048
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-TTCGGTCAGCTTTCCC,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,KRT8+ PATS/ADI/DATPs,KRT8+ PATS/ADI/DATPs,D1,old_adult-COVID19,0.296413,0.208366,...,HsapDv:0000241,D1,MONDO:0100096,False,NCBITaxon:9606,unknown,unknown,cell,tissue,UBERON:0002048
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-TTAGGCATCCTCACTG,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,Secretory,Secretory,D1,old_adult-COVID19,0.139501,0.097040,...,HsapDv:0000241,D1,MONDO:0100096,False,NCBITaxon:9606,unknown,unknown,cell,tissue,UBERON:0002048
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Melms_2021.h5ad:AGCTTCCAGGCATGCA-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,ECM-high epithelial,ECM-high epithelial,L22cov,old_adult-COVID19,-0.078560,0.026703,...,HsapDv:0000152,L22cov,MONDO:0100096,False,NCBITaxon:9606,unknown,unknown,nucleus,tissue,UBERON:0002048
Melms_2021.h5ad:GCACGGTCAGAAGTTA-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,AT2,AT2,L22cov,old_adult-COVID19,0.279772,0.033921,...,HsapDv:0000152,L22cov,MONDO:0100096,False,NCBITaxon:9606,unknown,unknown,nucleus,tissue,UBERON:0002048
Melms_2021.h5ad:TTGTGGATCATAGGCT-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,AT2,AT2,L22cov,old_adult-COVID19,-0.066474,0.053662,...,HsapDv:0000152,L22cov,MONDO:0100096,False,NCBITaxon:9606,unknown,unknown,nucleus,tissue,UBERON:0002048
Melms_2021.h5ad:GCCGATGAGTAAACGT-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,AT2,AT2,L22cov,old_adult-COVID19,-0.090820,0.002190,...,HsapDv:0000152,L22cov,MONDO:0100096,False,NCBITaxon:9606,unknown,unknown,nucleus,tissue,UBERON:0002048


#### **obsm (Embeddings)**

In [165]:
# view obsm

In [166]:
# check whether all columns are prefixed with X

In [167]:
adata.obsm

AxisArrays with keys: X_umap

#### **uns (Dataset Metadata)**

In [168]:
# View

In [169]:
adata.uns

{'age_status_colors': array(['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b'],
       dtype=object),
 'annotation_v1_colors': array(['#d8bfd8ff', '#800000ff', '#dc143cff', '#4b0082ff', '#bb7693ff',
        '#4169e1ff', '#20b2aaff', '#849db1ff', '#008080ff', '#3ca8bcff',
        '#db7202ff', '#c3bc3fff', '#4e9f50ff', '#ff8c00ff', '#f7d42aff',
        '#af894bff', '#808080ff'], dtype=object)}

In [170]:
adata.uns.keys

<function dict.keys>

In [171]:
# Convert the data type of 'annotation_v1_colors' to <U7
adata.uns['annotation_v1_colors'] = adata.uns['annotation_v1_colors'].astype('<U7')


In [172]:
# Give a title for the dataset

In [173]:
adata.uns['title'] = 'covid19_across_age_in_ALI_organoids_invivo'

In [174]:
# Set the default embedding

In [175]:
adata.uns['default_embedding'] = 'X_umap'

### **Final check**

In [176]:
# view anndata object

In [177]:
adata

AnnData object with n_obs × n_vars = 577234 × 17558
    obs: 'age_group', 'age_harmonised', 'age_status', 'file_of_origin', 'author_ann_harmonised', 'author_ann_harmonised_broad', 'donor_harmonised', 'age_status_granular', 'score_IFN_alpha', 'score_IFN_gamma', 'score_EMT', 'score_BASALOID', 'annotation_v1', 'status', 'barcodes', 'assay', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'donor_id', 'disease_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id', 'suspension_type', 'tissue_type', 'tissue_ontology_term_id'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'feature_is_filtered'
    uns: 'age_status_colors', 'annotation_v1_colors', 'title', 'default_embedding'
    obsm: 'X_umap'
    obsp: 'connectivities', 'distances'

In [178]:
# view obs and var data types

In [179]:
adata.obs.dtypes

age_group                                   category
age_harmonised                              category
age_status                                  category
file_of_origin                              category
author_ann_harmonised                       category
author_ann_harmonised_broad                 category
donor_harmonised                            category
age_status_granular                         category
score_IFN_alpha                              float64
score_IFN_gamma                              float64
score_EMT                                    float64
score_BASALOID                               float64
annotation_v1                               category
status                                        object
barcodes                                      object
assay                                         object
assay_ontology_term_id                      category
cell_type_ontology_term_id                  category
development_stage_ontology_term_id          ca

In [180]:
dty = pd.DataFrame(adata.var.dtypes, columns = ['dtype'])
for c in dty[dty['dtype'] == 'float64'].index.values:
    adata.var[c] = adata.var[c].astype('float32')
    print(f"changed {c} from float64 to float32")
for c in dty[dty['dtype'] == 'int64'].index.values:
    adata.var[c] = adata.var[c].astype('int32') 
    print(f"changed {c} from int64 to int32")

changed means from float64 to float32
changed dispersions from float64 to float32


In [181]:
dty = pd.DataFrame(adata.obs.dtypes, columns = ['dtype'])
for c in dty[dty['dtype'] == 'float64'].index.values:
    adata.obs[c] = adata.obs[c].astype('float32')
    print(f"changed {c} from float64 to float32")
for c in dty[dty['dtype'] == 'int64'].index.values:
    adata.obs[c] = adata.obs[c].astype('int32') 
    print(f"changed {c} from int64 to int32")
for c in dty[dty['dtype'] == 'object'].index.values:
    adata.obs[c] = adata.obs[c].astype('category') 
    print(f"changed {c} from object to category")

changed score_IFN_alpha from float64 to float32
changed score_IFN_gamma from float64 to float32
changed score_EMT from float64 to float32
changed score_BASALOID from float64 to float32
changed status from object to category
changed barcodes from object to category
changed assay from object to category


In [182]:
# view obs

In [183]:
adata.obs

Unnamed: 0,age_group,age_harmonised,age_status,file_of_origin,author_ann_harmonised,author_ann_harmonised_broad,donor_harmonised,age_status_granular,score_IFN_alpha,score_IFN_gamma,...,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_type,tissue_ontology_term_id
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-AATCACGAGGGTCACA,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,Secretory,Secretory,D1,old_adult-COVID19,-0.077939,0.068120,...,HsapDv:0000241,D1,MONDO:0100096,False,NCBITaxon:9606,unknown,unknown,cell,tissue,UBERON:0002048
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-GGTAACTAGTCCTACA,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,AT1,AT1,D1,old_adult-COVID19,0.024214,0.111761,...,HsapDv:0000241,D1,MONDO:0100096,False,NCBITaxon:9606,unknown,unknown,cell,tissue,UBERON:0002048
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-CATCCGTTCGCAAGAG,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,KRT8+ PATS/ADI/DATPs,KRT8+ PATS/ADI/DATPs,D1,old_adult-COVID19,-0.038762,0.155381,...,HsapDv:0000241,D1,MONDO:0100096,False,NCBITaxon:9606,unknown,unknown,cell,tissue,UBERON:0002048
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-TTCGGTCAGCTTTCCC,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,KRT8+ PATS/ADI/DATPs,KRT8+ PATS/ADI/DATPs,D1,old_adult-COVID19,0.296413,0.208366,...,HsapDv:0000241,D1,MONDO:0100096,False,NCBITaxon:9606,unknown,unknown,cell,tissue,UBERON:0002048
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-TTAGGCATCCTCACTG,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,Secretory,Secretory,D1,old_adult-COVID19,0.139501,0.097040,...,HsapDv:0000241,D1,MONDO:0100096,False,NCBITaxon:9606,unknown,unknown,cell,tissue,UBERON:0002048
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Melms_2021.h5ad:AGCTTCCAGGCATGCA-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,ECM-high epithelial,ECM-high epithelial,L22cov,old_adult-COVID19,-0.078560,0.026703,...,HsapDv:0000152,L22cov,MONDO:0100096,False,NCBITaxon:9606,unknown,unknown,nucleus,tissue,UBERON:0002048
Melms_2021.h5ad:GCACGGTCAGAAGTTA-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,AT2,AT2,L22cov,old_adult-COVID19,0.279772,0.033921,...,HsapDv:0000152,L22cov,MONDO:0100096,False,NCBITaxon:9606,unknown,unknown,nucleus,tissue,UBERON:0002048
Melms_2021.h5ad:TTGTGGATCATAGGCT-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,AT2,AT2,L22cov,old_adult-COVID19,-0.066474,0.053662,...,HsapDv:0000152,L22cov,MONDO:0100096,False,NCBITaxon:9606,unknown,unknown,nucleus,tissue,UBERON:0002048
Melms_2021.h5ad:GCCGATGAGTAAACGT-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,AT2,AT2,L22cov,old_adult-COVID19,-0.090820,0.002190,...,HsapDv:0000152,L22cov,MONDO:0100096,False,NCBITaxon:9606,unknown,unknown,nucleus,tissue,UBERON:0002048


In [184]:
adata.obs.columns

Index(['age_group', 'age_harmonised', 'age_status', 'file_of_origin',
       'author_ann_harmonised', 'author_ann_harmonised_broad',
       'donor_harmonised', 'age_status_granular', 'score_IFN_alpha',
       'score_IFN_gamma', 'score_EMT', 'score_BASALOID', 'annotation_v1',
       'status', 'barcodes', 'assay', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'donor_id', 'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'suspension_type', 'tissue_type',
       'tissue_ontology_term_id'],
      dtype='object')

In [185]:
# delete unwanted columns in obs

In [186]:
del adata.obs['donor_harmonised']
del adata.obs['barcodes']
del adata.obs['assay']
del adata.obs['status']

In [187]:
# view obs

In [188]:
adata.obs

Unnamed: 0,age_group,age_harmonised,age_status,file_of_origin,author_ann_harmonised,author_ann_harmonised_broad,age_status_granular,score_IFN_alpha,score_IFN_gamma,score_EMT,...,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_type,tissue_ontology_term_id
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-AATCACGAGGGTCACA,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,Secretory,Secretory,old_adult-COVID19,-0.077939,0.068120,0.157006,...,HsapDv:0000241,D1,MONDO:0100096,False,NCBITaxon:9606,unknown,unknown,cell,tissue,UBERON:0002048
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-GGTAACTAGTCCTACA,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,AT1,AT1,old_adult-COVID19,0.024214,0.111761,0.179795,...,HsapDv:0000241,D1,MONDO:0100096,False,NCBITaxon:9606,unknown,unknown,cell,tissue,UBERON:0002048
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-CATCCGTTCGCAAGAG,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,KRT8+ PATS/ADI/DATPs,KRT8+ PATS/ADI/DATPs,old_adult-COVID19,-0.038762,0.155381,0.258357,...,HsapDv:0000241,D1,MONDO:0100096,False,NCBITaxon:9606,unknown,unknown,cell,tissue,UBERON:0002048
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-TTCGGTCAGCTTTCCC,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,KRT8+ PATS/ADI/DATPs,KRT8+ PATS/ADI/DATPs,old_adult-COVID19,0.296413,0.208366,0.236939,...,HsapDv:0000241,D1,MONDO:0100096,False,NCBITaxon:9606,unknown,unknown,cell,tissue,UBERON:0002048
Delorey_2021_COVID_only.h5ad:02-P005175-S053-R01-TTAGGCATCCTCACTG,old_adult,60-65,elderly-COVID19,Delorey_2021_COVID_only.h5ad,Secretory,Secretory,old_adult-COVID19,0.139501,0.097040,0.138514,...,HsapDv:0000241,D1,MONDO:0100096,False,NCBITaxon:9606,unknown,unknown,cell,tissue,UBERON:0002048
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Melms_2021.h5ad:AGCTTCCAGGCATGCA-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,ECM-high epithelial,ECM-high epithelial,old_adult-COVID19,-0.078560,0.026703,0.227514,...,HsapDv:0000152,L22cov,MONDO:0100096,False,NCBITaxon:9606,unknown,unknown,nucleus,tissue,UBERON:0002048
Melms_2021.h5ad:GCACGGTCAGAAGTTA-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,AT2,AT2,old_adult-COVID19,0.279772,0.033921,-0.060741,...,HsapDv:0000152,L22cov,MONDO:0100096,False,NCBITaxon:9606,unknown,unknown,nucleus,tissue,UBERON:0002048
Melms_2021.h5ad:TTGTGGATCATAGGCT-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,AT2,AT2,old_adult-COVID19,-0.066474,0.053662,-0.058798,...,HsapDv:0000152,L22cov,MONDO:0100096,False,NCBITaxon:9606,unknown,unknown,nucleus,tissue,UBERON:0002048
Melms_2021.h5ad:GCCGATGAGTAAACGT-1_27,old_adult,58.0,elderly-COVID19,Melms_2021.h5ad,AT2,AT2,old_adult-COVID19,-0.090820,0.002190,0.127114,...,HsapDv:0000152,L22cov,MONDO:0100096,False,NCBITaxon:9606,unknown,unknown,nucleus,tissue,UBERON:0002048


In [189]:
adata.obs.columns

Index(['age_group', 'age_harmonised', 'age_status', 'file_of_origin',
       'author_ann_harmonised', 'author_ann_harmonised_broad',
       'age_status_granular', 'score_IFN_alpha', 'score_IFN_gamma',
       'score_EMT', 'score_BASALOID', 'annotation_v1',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'donor_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'suspension_type', 'tissue_type',
       'tissue_ontology_term_id'],
      dtype='object')

In [190]:
# view var

In [191]:
adata.var

Unnamed: 0_level_0,highly_variable,means,dispersions,dispersions_norm,feature_is_filtered
gene_ids-1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSG00000121410,False,0.041524,1.744950,-0.641797,False
ENSG00000268895,False,0.016673,1.953380,-0.213505,False
ENSG00000148584,False,0.002404,2.625631,1.167868,False
ENSG00000175899,True,0.047390,3.782054,3.544142,False
ENSG00000245105,False,0.008230,1.953967,-0.212300,False
...,...,...,...,...,...
ENSG00000070476,False,0.245947,1.905029,-0.312859,False
ENSG00000203995,False,0.007422,2.166888,0.225222,False
ENSG00000162378,False,0.277820,1.784258,-0.561026,False
ENSG00000159840,False,0.295521,1.707417,-0.718921,False


In [192]:
#araw.var

In [193]:
#view uns

In [194]:
adata.uns

{'age_status_colors': array(['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b'],
       dtype=object),
 'annotation_v1_colors': array(['#d8bfd8', '#800000', '#dc143c', '#4b0082', '#bb7693', '#4169e1',
        '#20b2aa', '#849db1', '#008080', '#3ca8bc', '#db7202', '#c3bc3f',
        '#4e9f50', '#ff8c00', '#f7d42a', '#af894b', '#808080'], dtype='<U7'),
 'title': 'covid19_across_age_in_ALI_organoids_invivo',
 'default_embedding': 'X_umap'}

In [195]:
list(adata.uns.keys())

['age_status_colors', 'annotation_v1_colors', 'title', 'default_embedding']

In [196]:
adata.obs.columns

Index(['age_group', 'age_harmonised', 'age_status', 'file_of_origin',
       'author_ann_harmonised', 'author_ann_harmonised_broad',
       'age_status_granular', 'score_IFN_alpha', 'score_IFN_gamma',
       'score_EMT', 'score_BASALOID', 'annotation_v1',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'donor_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'suspension_type', 'tissue_type',
       'tissue_ontology_term_id'],
      dtype='object')

In [197]:
# Remove unwanted columns in uns

In [198]:
#check the format of expression matrix

In [199]:
adata.X

<577234x17558 sparse matrix of type '<class 'numpy.float32'>'
	with 1357456337 stored elements in Compressed Sparse Row format>

In [200]:
araw.X

<577234x17558 sparse matrix of type '<class 'numpy.float32'>'
	with 1357456337 stored elements in Compressed Sparse Row format>

In [201]:
adata.raw = araw

In [202]:
adata.raw.X

<577234x17558 sparse matrix of type '<class 'numpy.float32'>'
	with 1357456337 stored elements in Compressed Sparse Row format>

In [203]:
data_type = adata.raw.X.dtype

In [204]:
adata

AnnData object with n_obs × n_vars = 577234 × 17558
    obs: 'age_group', 'age_harmonised', 'age_status', 'file_of_origin', 'author_ann_harmonised', 'author_ann_harmonised_broad', 'age_status_granular', 'score_IFN_alpha', 'score_IFN_gamma', 'score_EMT', 'score_BASALOID', 'annotation_v1', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'donor_id', 'disease_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id', 'suspension_type', 'tissue_type', 'tissue_ontology_term_id'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'feature_is_filtered'
    uns: 'age_status_colors', 'annotation_v1_colors', 'title', 'default_embedding'
    obsm: 'X_umap'
    obsp: 'connectivities', 'distances'

In [None]:
import scanpy as sc
import pandas as pd

# Assuming 'adata' is your AnnData object

# Convert the raw count matrix to a DataFrame for easier manipulation
raw_counts_df = pd.DataFrame(adata.raw.X.toarray(), index=adata.obs_names)

# Find duplicated rows
duplicates = raw_counts_df.duplicated(keep=False)

# Get the indices of the duplicated rows
duplicated_indices = raw_counts_df.index[duplicates]

# Review the duplicated entries
duplicated_entries = raw_counts_df.loc[duplicated_indices]

print("Duplicated Entries:\n", duplicated_entries)

# At this point, you can manually review the duplicated entries to decide which to keep

# For this example, let's assume we decide to keep only the first occurrence of each duplicate
unique_duplicated_indices = raw_counts_df.index[raw_counts_df.duplicated(keep='first')]

# Remove the duplicates from the AnnData object
adata_filtered = adata[~adata.obs_names.isin(unique_duplicated_indices)].copy()

# Verify that the duplicates are removed
raw_counts_df_filtered = pd.DataFrame(adata_filtered.raw.X.toarray(), index=adata_filtered.obs_names)
print("Duplicates after filtering:\n", raw_counts_df_filtered.duplicated().sum())  # Should be 0



# Continue with your analysis using the filtered AnnData object


In [204]:
adata.write('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/covid_cell_atlas_ALI_organoids/Final_objects/covid_ali_organoid_invivo.h5ad', compression = 'gzip')