### **Curating covid19_across_age_in_ALI_organoids_invitro.h5ad**

Article:  The emergence of goblet inflammatory or ITGB6hi nasal progenitor cells determines age-associated SARS-CoV-2 pathogenesis

DOI: https://doi.org/10.1101/2023.01.16.524211

Data Source : https://www.covid19cellatlas.org/index.patient.html

##### **Mount farm**

mount-farm

##### **Packages required for curation**

In [1]:
#Import all packages required for curation

In [2]:
import numpy as np
import pandas as pd
import scanpy as sc
import scipy
from tqdm import tqdm
from scipy import sparse
from scipy.sparse import csr_matrix
import anndata as ad
import os
import subprocess
import math
import re
from collections import Counter

### **Curation Schema**

##### **X (Matrix Layers)**

##### **AnnData object**

In [3]:
# Load the AnnData object

In [4]:
adata = sc.read_h5ad('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/covid_cell_atlas_ALI_organoids/Data/covid19_across_age_in_ALI_organoids.h5ad')

In [5]:
# View the AnnData object

In [6]:
adata

AnnData object with n_obs × n_vars = 139598 × 26447
    obs: 'sample', 'treatment', 'time', 'pool', 'sample_name', 'kit_version', 'spike-in_primer', 'scrublet_score', 'doublet_bh_pval', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'souporcell_cluster', 'donor_id', 'age_group', 'gender', 'S_score', 'G2M_score', 'phase', 'bbknn_batch', 'cell_annotation', 'age_treatment', 'score_IFN_alpha', 'score_IFN_gamma', 'score_BASALOID'
    var: 'gene_ids', 'n_counts', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'ribo'
    uns: 'age_group_colors', 'age_treatment_colors', 'cell_annotation_colors', 'donor_id_colors', 'kit_version_colors', 'pool_colors', 'sample_colors', 'spike-in_primer_colors', 'treatment_colors'
    obsm: 'X_pca', 'X_umap'
    obsp: 'connectivities', 'distances'

##### **X - expression matrix**

In [7]:
# View the expression matrix of the anndata object.

In [8]:
adata.X

<139598x26447 sparse matrix of type '<class 'numpy.float32'>'
	with 273690640 stored elements in Compressed Sparse Row format>

In [9]:
# Print the matrix to check whether they are normalized counts or raw counts. if the matrix has floating numbers,they are normalized counts.if they are integers, they are raw counts.

In [10]:
print(adata.X)

  (0, 9)	0.95525336
  (0, 23)	1.7907126
  (0, 50)	1.4630437
  (0, 52)	1.7899494
  (0, 54)	0.97413427
  (0, 62)	1.7894595
  (0, 84)	1.4638903
  (0, 97)	0.97154975
  (0, 100)	0.97731566
  (0, 129)	1.4539667
  (0, 132)	1.4569253
  (0, 142)	0.9631139
  (0, 148)	0.97040415
  (0, 184)	0.9578377
  (0, 194)	0.96772635
  (0, 202)	0.9662395
  (0, 209)	1.4616368
  (0, 213)	1.4515843
  (0, 239)	0.96957666
  (0, 257)	0.96703225
  (0, 264)	0.9620198
  (0, 287)	0.9643724
  (0, 319)	0.9597278
  (0, 327)	1.4563276
  (0, 366)	1.4498371
  :	:
  (139597, 26268)	0.8938967
  (139597, 26279)	0.8741195
  (139597, 26291)	1.7080972
  (139597, 26312)	2.8740408
  (139597, 26318)	0.9099994
  (139597, 26327)	0.90487266
  (139597, 26342)	0.9335315
  (139597, 26345)	0.89501256
  (139597, 26346)	2.9464462
  (139597, 26347)	1.3851392
  (139597, 26370)	0.9019762
  (139597, 26371)	1.3660564
  (139597, 26396)	1.3877138
  (139597, 26402)	0.8985073
  (139597, 26405)	2.3137953
  (139597, 26406)	1.0204332
  (139597, 26408)	4.

##### **Raw counts matrix**

In [11]:
araw = sc.read_h5ad('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/covid_cell_atlas_ALI_organoids/Data/cujba22_covid19_across_age_in_ALI_organoids_raw.h5ad')

In [12]:
araw

AnnData object with n_obs × n_vars = 139598 × 33559
    obs: 'sample', 'treatment', 'time', 'donor', 'sample_name', 'version', 'spike-in_primer', 'batch'
    var: 'gene_ids'

In [13]:
#adata.raw = araw

##### **Variables(var)**

In [14]:
# View the var of anndata and raw object

In [15]:
adata.var

Unnamed: 0,gene_ids,n_counts,mt,n_cells_by_counts,mean_counts,pct_dropout_by_counts,total_counts,ribo
MIR1302-2HG,ENSG00000243485,36.362267,False,38,0.000147,99.984650,36.362267,False
FAM138A,ENSG00000237613,4.000000,False,4,0.000016,99.998383,4.000000,False
AL627309.1,ENSG00000238009,175.012375,False,180,0.000707,99.927277,175.012375,False
AL627309.3,ENSG00000239945,15.000000,False,15,0.000061,99.993942,15.000000,False
AL627309.2,ENSG00000239906,30.000000,False,30,0.000121,99.987877,30.000000,False
...,...,...,...,...,...,...,...,...
VIRAL_HERV-W.2,AF127228,85.729340,False,88,0.000346,99.964447,85.729340,False
VIRAL_HERV-W.4,AF331500.1,54.581730,False,56,0.000221,99.977371,54.581730,False
VIRAL_HHV-6A_(Human_Herpes_Virus_6),NC_001664.4,3.997368,False,4,0.000016,99.998383,3.997368,False
VIRAL_HHV-6B_(Human_Herpes_Virus_6),NC_000898.1,6.000000,False,6,0.000024,99.997574,6.000000,False


In [16]:
adata.var['name'] = adata.var.index

In [17]:
adata.var

Unnamed: 0,gene_ids,n_counts,mt,n_cells_by_counts,mean_counts,pct_dropout_by_counts,total_counts,ribo,name
MIR1302-2HG,ENSG00000243485,36.362267,False,38,0.000147,99.984650,36.362267,False,MIR1302-2HG
FAM138A,ENSG00000237613,4.000000,False,4,0.000016,99.998383,4.000000,False,FAM138A
AL627309.1,ENSG00000238009,175.012375,False,180,0.000707,99.927277,175.012375,False,AL627309.1
AL627309.3,ENSG00000239945,15.000000,False,15,0.000061,99.993942,15.000000,False,AL627309.3
AL627309.2,ENSG00000239906,30.000000,False,30,0.000121,99.987877,30.000000,False,AL627309.2
...,...,...,...,...,...,...,...,...,...
VIRAL_HERV-W.2,AF127228,85.729340,False,88,0.000346,99.964447,85.729340,False,VIRAL_HERV-W.2
VIRAL_HERV-W.4,AF331500.1,54.581730,False,56,0.000221,99.977371,54.581730,False,VIRAL_HERV-W.4
VIRAL_HHV-6A_(Human_Herpes_Virus_6),NC_001664.4,3.997368,False,4,0.000016,99.998383,3.997368,False,VIRAL_HHV-6A_(Human_Herpes_Virus_6)
VIRAL_HHV-6B_(Human_Herpes_Virus_6),NC_000898.1,6.000000,False,6,0.000024,99.997574,6.000000,False,VIRAL_HHV-6B_(Human_Herpes_Virus_6)


In [18]:
araw.var

Unnamed: 0,gene_ids
MIR1302-2HG,ENSG00000243485
FAM138A,ENSG00000237613
OR4F5,ENSG00000186092
AL627309.1,ENSG00000238009
AL627309.3,ENSG00000239945
...,...
VIRAL_Mumps_rubulavirus,NC_002200.1
VIRAL_Rubella,NC_001545.2
VIRAL_Varicella_Zoster_Virus_(VZV_Human_alphaherpesvirus_3),NC_001348.1
VIRAL_Cytomegalovieus_(CMV),NC_006273.2


In [19]:
araw.var['name'] = araw.var.index

In [20]:
adata.var.index = adata.var['gene_ids']

In [21]:
araw.var.index = araw.var['gene_ids']

In [22]:
adata.var

Unnamed: 0_level_0,gene_ids,n_counts,mt,n_cells_by_counts,mean_counts,pct_dropout_by_counts,total_counts,ribo,name
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ENSG00000243485,ENSG00000243485,36.362267,False,38,0.000147,99.984650,36.362267,False,MIR1302-2HG
ENSG00000237613,ENSG00000237613,4.000000,False,4,0.000016,99.998383,4.000000,False,FAM138A
ENSG00000238009,ENSG00000238009,175.012375,False,180,0.000707,99.927277,175.012375,False,AL627309.1
ENSG00000239945,ENSG00000239945,15.000000,False,15,0.000061,99.993942,15.000000,False,AL627309.3
ENSG00000239906,ENSG00000239906,30.000000,False,30,0.000121,99.987877,30.000000,False,AL627309.2
...,...,...,...,...,...,...,...,...,...
AF127228,AF127228,85.729340,False,88,0.000346,99.964447,85.729340,False,VIRAL_HERV-W.2
AF331500.1,AF331500.1,54.581730,False,56,0.000221,99.977371,54.581730,False,VIRAL_HERV-W.4
NC_001664.4,NC_001664.4,3.997368,False,4,0.000016,99.998383,3.997368,False,VIRAL_HHV-6A_(Human_Herpes_Virus_6)
NC_000898.1,NC_000898.1,6.000000,False,6,0.000024,99.997574,6.000000,False,VIRAL_HHV-6B_(Human_Herpes_Virus_6)


In [23]:
araw.var

Unnamed: 0_level_0,gene_ids,name
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000243485,ENSG00000243485,MIR1302-2HG
ENSG00000237613,ENSG00000237613,FAM138A
ENSG00000186092,ENSG00000186092,OR4F5
ENSG00000238009,ENSG00000238009,AL627309.1
ENSG00000239945,ENSG00000239945,AL627309.3
...,...,...
NC_002200.1,NC_002200.1,VIRAL_Mumps_rubulavirus
NC_001545.2,NC_001545.2,VIRAL_Rubella
NC_001348.1,NC_001348.1,VIRAL_Varicella_Zoster_Virus_(VZV_Human_alphah...
NC_006273.2,NC_006273.2,VIRAL_Cytomegalovieus_(CMV)


In [24]:
# Load the approved genes file.

In [25]:
approved_genes = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/Endometrium_reference_integrated_atlas/genes_approved.csv')

In [26]:
#Create a dictionary from the approved genes file 

In [27]:
genedict = {key: 1 for key in list(approved_genes.feature_id)}

In [28]:
genedict

{'ERCC-00002': 1,
 'ERCC-00003': 1,
 'ERCC-00004': 1,
 'ERCC-00009': 1,
 'ERCC-00012': 1,
 'ERCC-00013': 1,
 'ERCC-00014': 1,
 'ERCC-00016': 1,
 'ERCC-00017': 1,
 'ERCC-00019': 1,
 'ERCC-00022': 1,
 'ERCC-00024': 1,
 'ERCC-00025': 1,
 'ERCC-00028': 1,
 'ERCC-00031': 1,
 'ERCC-00033': 1,
 'ERCC-00034': 1,
 'ERCC-00035': 1,
 'ERCC-00039': 1,
 'ERCC-00040': 1,
 'ERCC-00041': 1,
 'ERCC-00042': 1,
 'ERCC-00043': 1,
 'ERCC-00044': 1,
 'ERCC-00046': 1,
 'ERCC-00048': 1,
 'ERCC-00051': 1,
 'ERCC-00053': 1,
 'ERCC-00054': 1,
 'ERCC-00057': 1,
 'ERCC-00058': 1,
 'ERCC-00059': 1,
 'ERCC-00060': 1,
 'ERCC-00061': 1,
 'ERCC-00062': 1,
 'ERCC-00067': 1,
 'ERCC-00069': 1,
 'ERCC-00071': 1,
 'ERCC-00073': 1,
 'ERCC-00074': 1,
 'ERCC-00075': 1,
 'ERCC-00076': 1,
 'ERCC-00077': 1,
 'ERCC-00078': 1,
 'ERCC-00079': 1,
 'ERCC-00081': 1,
 'ERCC-00083': 1,
 'ERCC-00084': 1,
 'ERCC-00085': 1,
 'ERCC-00086': 1,
 'ERCC-00092': 1,
 'ERCC-00095': 1,
 'ERCC-00096': 1,
 'ERCC-00097': 1,
 'ERCC-00098': 1,
 'ERCC-000

In [29]:
len(genedict)

119799

In [30]:
#Filter out the genes which are not in the approved genes file.

In [31]:
var_to_keep_adata = [x for x in adata.var_names if (x in genedict)]
var_to_keep_araw = [x for x in araw.var_names if (x in genedict)]

In [32]:
len(var_to_keep_adata)

26156

In [33]:
len(var_to_keep_araw)

33137

In [34]:
adata.var

Unnamed: 0_level_0,gene_ids,n_counts,mt,n_cells_by_counts,mean_counts,pct_dropout_by_counts,total_counts,ribo,name
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ENSG00000243485,ENSG00000243485,36.362267,False,38,0.000147,99.984650,36.362267,False,MIR1302-2HG
ENSG00000237613,ENSG00000237613,4.000000,False,4,0.000016,99.998383,4.000000,False,FAM138A
ENSG00000238009,ENSG00000238009,175.012375,False,180,0.000707,99.927277,175.012375,False,AL627309.1
ENSG00000239945,ENSG00000239945,15.000000,False,15,0.000061,99.993942,15.000000,False,AL627309.3
ENSG00000239906,ENSG00000239906,30.000000,False,30,0.000121,99.987877,30.000000,False,AL627309.2
...,...,...,...,...,...,...,...,...,...
AF127228,AF127228,85.729340,False,88,0.000346,99.964447,85.729340,False,VIRAL_HERV-W.2
AF331500.1,AF331500.1,54.581730,False,56,0.000221,99.977371,54.581730,False,VIRAL_HERV-W.4
NC_001664.4,NC_001664.4,3.997368,False,4,0.000016,99.998383,3.997368,False,VIRAL_HHV-6A_(Human_Herpes_Virus_6)
NC_000898.1,NC_000898.1,6.000000,False,6,0.000024,99.997574,6.000000,False,VIRAL_HHV-6B_(Human_Herpes_Virus_6)


In [35]:
araw.var

Unnamed: 0_level_0,gene_ids,name
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000243485,ENSG00000243485,MIR1302-2HG
ENSG00000237613,ENSG00000237613,FAM138A
ENSG00000186092,ENSG00000186092,OR4F5
ENSG00000238009,ENSG00000238009,AL627309.1
ENSG00000239945,ENSG00000239945,AL627309.3
...,...,...
NC_002200.1,NC_002200.1,VIRAL_Mumps_rubulavirus
NC_001545.2,NC_001545.2,VIRAL_Rubella
NC_001348.1,NC_001348.1,VIRAL_Varicella_Zoster_Virus_(VZV_Human_alphah...
NC_006273.2,NC_006273.2,VIRAL_Cytomegalovieus_(CMV)


In [36]:
# Modify the anndata object by filtering out the filtered genes.

In [37]:
adata = adata[:, adata.var.index.isin(var_to_keep_adata)]
araw = araw[:, araw.var.index.isin(var_to_keep_araw)]

In [38]:
adata.var

Unnamed: 0_level_0,gene_ids,n_counts,mt,n_cells_by_counts,mean_counts,pct_dropout_by_counts,total_counts,ribo,name
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ENSG00000243485,ENSG00000243485,36.362267,False,38,0.000147,99.984650,36.362267,False,MIR1302-2HG
ENSG00000237613,ENSG00000237613,4.000000,False,4,0.000016,99.998383,4.000000,False,FAM138A
ENSG00000238009,ENSG00000238009,175.012375,False,180,0.000707,99.927277,175.012375,False,AL627309.1
ENSG00000239945,ENSG00000239945,15.000000,False,15,0.000061,99.993942,15.000000,False,AL627309.3
ENSG00000239906,ENSG00000239906,30.000000,False,30,0.000121,99.987877,30.000000,False,AL627309.2
...,...,...,...,...,...,...,...,...,...
ENSG00000278384,ENSG00000278384,2965.226074,False,3135,0.011980,98.733383,2965.226074,False,AL354822.1
ENSG00000278633,ENSG00000278633,41.104771,False,42,0.000166,99.983032,41.104771,False,AC023491.2
ENSG00000276345,ENSG00000276345,69447.312500,False,49677,0.280586,79.929214,69447.312500,False,AC004556.1
ENSG00000271254,ENSG00000271254,15893.731445,False,15684,0.064215,93.663261,15893.731445,False,AC240274.1


In [39]:
# View var

In [40]:
adata.var

Unnamed: 0_level_0,gene_ids,n_counts,mt,n_cells_by_counts,mean_counts,pct_dropout_by_counts,total_counts,ribo,name
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ENSG00000243485,ENSG00000243485,36.362267,False,38,0.000147,99.984650,36.362267,False,MIR1302-2HG
ENSG00000237613,ENSG00000237613,4.000000,False,4,0.000016,99.998383,4.000000,False,FAM138A
ENSG00000238009,ENSG00000238009,175.012375,False,180,0.000707,99.927277,175.012375,False,AL627309.1
ENSG00000239945,ENSG00000239945,15.000000,False,15,0.000061,99.993942,15.000000,False,AL627309.3
ENSG00000239906,ENSG00000239906,30.000000,False,30,0.000121,99.987877,30.000000,False,AL627309.2
...,...,...,...,...,...,...,...,...,...
ENSG00000278384,ENSG00000278384,2965.226074,False,3135,0.011980,98.733383,2965.226074,False,AL354822.1
ENSG00000278633,ENSG00000278633,41.104771,False,42,0.000166,99.983032,41.104771,False,AC023491.2
ENSG00000276345,ENSG00000276345,69447.312500,False,49677,0.280586,79.929214,69447.312500,False,AC004556.1
ENSG00000271254,ENSG00000271254,15893.731445,False,15684,0.064215,93.663261,15893.731445,False,AC240274.1


In [41]:
araw.var

Unnamed: 0_level_0,gene_ids,name
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000243485,ENSG00000243485,MIR1302-2HG
ENSG00000237613,ENSG00000237613,FAM138A
ENSG00000186092,ENSG00000186092,OR4F5
ENSG00000238009,ENSG00000238009,AL627309.1
ENSG00000239945,ENSG00000239945,AL627309.3
...,...,...
ENSG00000277856,ENSG00000277856,AC233755.2
ENSG00000275063,ENSG00000275063,AC233755.1
ENSG00000271254,ENSG00000271254,AC240274.1
ENSG00000277475,ENSG00000277475,AC213203.1


feature is filtered

In [42]:
del adata.var['name']
del adata.var['gene_ids']
del araw.var['name']
del araw.var['gene_ids']

In [43]:
def add_zero():
	global adata
	global araw
	if araw.shape[1] > adata.shape[1]:
		genes_add = [x for x in araw.var.index.to_list() if x not in adata.var.index.to_list()]
		new_matrix = sparse.csr_matrix((adata.X.data, adata.X.indices, adata.X.indptr), shape = araw.shape)
		all_genes = adata.var.index.to_list()
		all_genes.extend(genes_add)
		new_var = pd.DataFrame(index=all_genes)
		new_var = pd.merge(new_var, araw.var, left_index=True, right_index=True, how='left')
		new_var['feature_is_filtered'] = False
		new_var.loc[genes_add, 'feature_is_filtered'] = True
		new_adata = ad.AnnData(X=new_matrix, obs=adata.obs, var=new_var, uns=adata.uns, obsm=adata.obsm)
		if adata.layers:
			for layer in adata.layers:
				new_layer = sparse.csr_matrix((adata.layers[layer].data, adata.layers[layer].indices, adata.layers[layer].indptr), shape = araw.shape)
				new_adata.layers[layer] = new_layer
		new_adata = new_adata[:,araw.var.index.to_list()]
		new_adata.var = new_adata.var.merge(adata.var, left_index=True, right_index=True, how='left')
		adata = new_adata
	else:
		adata.var['feature_is_filtered'] = False


In [44]:
add_zero()

In [45]:
adata.var

Unnamed: 0,feature_is_filtered,n_counts,mt,n_cells_by_counts,mean_counts,pct_dropout_by_counts,total_counts,ribo
ENSG00000243485,False,36.362267,False,38.0,0.000147,99.984650,36.362267,False
ENSG00000237613,False,4.000000,False,4.0,0.000016,99.998383,4.000000,False
ENSG00000186092,True,,,,,,,
ENSG00000238009,False,175.012375,False,180.0,0.000707,99.927277,175.012375,False
ENSG00000239945,False,15.000000,False,15.0,0.000061,99.993942,15.000000,False
...,...,...,...,...,...,...,...,...
ENSG00000277856,True,,,,,,,
ENSG00000275063,True,,,,,,,
ENSG00000271254,False,15893.731445,False,15684.0,0.064215,93.663261,15893.731445,False
ENSG00000277475,False,5.953731,False,6.0,0.000024,99.997574,5.953731,False


In [46]:
list(adata.var['feature_is_filtered'].unique())

[False, True]

In [47]:
false_count = (adata.var['feature_is_filtered']== False).sum()

In [48]:
false_count

26156

In [49]:
adata.var

Unnamed: 0,feature_is_filtered,n_counts,mt,n_cells_by_counts,mean_counts,pct_dropout_by_counts,total_counts,ribo
ENSG00000243485,False,36.362267,False,38.0,0.000147,99.984650,36.362267,False
ENSG00000237613,False,4.000000,False,4.0,0.000016,99.998383,4.000000,False
ENSG00000186092,True,,,,,,,
ENSG00000238009,False,175.012375,False,180.0,0.000707,99.927277,175.012375,False
ENSG00000239945,False,15.000000,False,15.0,0.000061,99.993942,15.000000,False
...,...,...,...,...,...,...,...,...
ENSG00000277856,True,,,,,,,
ENSG00000275063,True,,,,,,,
ENSG00000271254,False,15893.731445,False,15684.0,0.064215,93.663261,15893.731445,False
ENSG00000277475,False,5.953731,False,6.0,0.000024,99.997574,5.953731,False


In [50]:
araw.var

ENSG00000243485
ENSG00000237613
ENSG00000186092
ENSG00000238009
ENSG00000239945
...
ENSG00000277856
ENSG00000275063
ENSG00000271254
ENSG00000277475
ENSG00000268674


#### **obs (Cell metadata)**

In [51]:
#view obs

In [52]:
adata.obs

Unnamed: 0,sample,treatment,time,pool,sample_name,kit_version,spike-in_primer,scrublet_score,doublet_bh_pval,n_genes_by_counts,...,gender,S_score,G2M_score,phase,bbknn_batch,cell_annotation,age_treatment,score_IFN_alpha,score_IFN_gamma,score_BASALOID
CV001_KM10202575_AAACCTGAGCAGGTCA,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.074489,0.899867,2007,...,Female,0.012715,-0.122299,S,ALI2-v1.1,Goblet 2,Adult-mock,0.164289,0.242595,0.184872
CV001_KM10202575_AAACCTGAGCCAGTTT,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.182796,0.899867,1936,...,Female,0.016216,-0.056671,S,ALI2-v1.1,Goblet 2 BPIFA1+,Adult-mock,0.437474,0.173749,0.107360
CV001_KM10202575_AAACCTGAGCGTAGTG,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.216327,0.861171,1072,...,Female,-0.039390,-0.012464,G1,ALI2-v1.1,Goblet 2 BPIFA1+,Adult-mock,0.132267,0.212443,0.219769
CV001_KM10202575_AAACCTGAGCTCCTTC,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.066090,0.915221,1637,...,Female,0.077160,-0.175455,S,ALI2-v1.1,Basal 2,Elderly-mock,-0.093077,0.058059,0.175730
CV001_KM10202575_AAACCTGAGCTGGAAC,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.187805,0.899867,1742,...,Female,0.012258,-0.066333,S,ALI2-v1.1,Basal 1,Elderly-mock,0.123714,0.150537,0.051336
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CV001_KM9505916_TTTGTCAGTTCCATGA,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.152639,0.937426,2169,...,Male,-0.028217,-0.130824,G1,ALI3-v2,Goblet 2 BPIFA1+,Elderly-SARS,0.176980,0.129034,0.282540
CV001_KM9505916_TTTGTCATCACGATGT,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.220339,0.922660,1480,...,Female,-0.042824,-0.144969,G1,ALI3-v2,Secretory,Adult-SARS,-0.045456,0.129736,0.034195
CV001_KM9505916_TTTGTCATCCCATTAT,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.129771,0.937426,1571,...,Female,-0.027907,-0.138610,G1,ALI3-v2,Basal 2,Adult-SARS,-0.051619,0.112761,0.082636
CV001_KM9505916_TTTGTCATCCGGGTGT,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.065327,0.985310,2861,...,Male,-0.057059,-0.149032,G1,ALI3-v2,Goblet 2 BPIFA1+,Paediatric-SARS,0.194762,0.217395,0.012546


In [53]:
# view the column names in obs

In [54]:
adata.obs.columns

Index(['sample', 'treatment', 'time', 'pool', 'sample_name', 'kit_version',
       'spike-in_primer', 'scrublet_score', 'doublet_bh_pval',
       'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt',
       'total_counts_ribo', 'pct_counts_ribo', 'souporcell_cluster',
       'donor_id', 'age_group', 'gender', 'S_score', 'G2M_score', 'phase',
       'bbknn_batch', 'cell_annotation', 'age_treatment', 'score_IFN_alpha',
       'score_IFN_gamma', 'score_BASALOID'],
      dtype='object')

In [55]:
list(adata.obs['kit_version'].unique())

['v1.1', 'v2']

#### **assay_ontology_term_id**

In [56]:
adata.obs['barcodes'] = adata.obs_names

In [57]:
adata.obs['barcodes'] = adata.obs['barcodes'].str.extract(r'([ATGC]{10,16})', expand=False)

In [58]:
assay_info = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/covid_cell_atlas_ALI_organoids/10X_barcode_table_assay.csv')

In [59]:
assay_info

Unnamed: 0,barcode,3pv2_5pv1_5pv2,3pv3,multiome,summary,assay
0,AAACCTGAGAAACCAT,1.0,0.0,0.0,3pv2_5pv1_5pv2,3pv2_5pv1_5pv2
1,AAACCTGAGAAACCGC,1.0,0.0,0.0,3pv2_5pv1_5pv2,3pv2_5pv1_5pv2
2,AAACCTGAGAAACCTA,1.0,0.0,0.0,3pv2_5pv1_5pv2,3pv2_5pv1_5pv2
3,AAACCTGAGAAACGAG,1.0,0.0,0.0,3pv2_5pv1_5pv2,3pv2_5pv1_5pv2
4,AAACCTGAGAAACGCC,1.0,0.0,0.0,3pv2_5pv1_5pv2,3pv2_5pv1_5pv2
...,...,...,...,...,...,...
8179621,TTTGTTGGTTTGGGTA,0.0,0.0,1.0,multiome,multiome
8179622,TTTGTTGGTTTGGTTC,0.0,0.0,1.0,multiome,multiome
8179623,TTTGTTGGTTTGTCTA,0.0,0.0,1.0,multiome,multiome
8179624,TTTGTTGGTTTGTGGA,0.0,0.0,1.0,multiome,multiome


In [60]:
mapping = dict(zip(assay_info['barcode'], assay_info['assay']))

In [61]:
adata.obs['assay'] = adata.obs['barcodes'].map(mapping)

In [62]:
list(adata.obs['assay'].unique())

['3pv2_5pv1_5pv2', '3pv2_5pv1_5pv2+multiome', '3pv2_5pv1_5pv2+3pv3', nan]

In [63]:
# Convert 'assay' column values to strings
adata.obs['assay'] = adata.obs['assay'].astype(str)

In [64]:
import pandas as pd

# Group the data by 'sample' and collect unique values of 'assay'
unique_values = adata.obs.groupby('sample')['assay'].unique()

# Display the unique values for each sample
for sample, assays in unique_values.items():
    print(f"sample: {sample}")
    print(f"Unique Assays: {', '.join(assays)}\n")


sample: CV001_KM9465185
Unique Assays: 3pv2_5pv1_5pv2, 3pv2_5pv1_5pv2+3pv3, 3pv2_5pv1_5pv2+multiome

sample: CV001_KM9465186
Unique Assays: 3pv2_5pv1_5pv2, 3pv2_5pv1_5pv2+multiome, 3pv2_5pv1_5pv2+3pv3

sample: CV001_KM9465187
Unique Assays: 3pv2_5pv1_5pv2, 3pv2_5pv1_5pv2+3pv3, 3pv2_5pv1_5pv2+multiome, nan

sample: CV001_KM9465188
Unique Assays: 3pv2_5pv1_5pv2, 3pv2_5pv1_5pv2+3pv3, 3pv2_5pv1_5pv2+multiome

sample: CV001_KM9465189
Unique Assays: 3pv2_5pv1_5pv2, 3pv2_5pv1_5pv2+3pv3, 3pv2_5pv1_5pv2+multiome

sample: CV001_KM9465190
Unique Assays: 3pv2_5pv1_5pv2, 3pv2_5pv1_5pv2+3pv3, 3pv2_5pv1_5pv2+multiome

sample: CV001_KM9505819
Unique Assays: 3pv2_5pv1_5pv2, 3pv2_5pv1_5pv2+3pv3, 3pv2_5pv1_5pv2+multiome

sample: CV001_KM9505820
Unique Assays: 3pv2_5pv1_5pv2, 3pv2_5pv1_5pv2+3pv3, 3pv2_5pv1_5pv2+multiome

sample: CV001_KM9505821
Unique Assays: 3pv2_5pv1_5pv2, 3pv2_5pv1_5pv2+3pv3, 3pv2_5pv1_5pv2+multiome

sample: CV001_KM9505822
Unique Assays: 3pv2_5pv1_5pv2, 3pv2_5pv1_5pv2+3pv3, 3pv2_5pv1_

In [65]:
import pandas as pd

# Group the data by 'file_of_origin' and 'assay' and count the occurrences
counts = adata.obs.groupby(['sample', 'assay']).size()

# Group the data by 'file_of_origin' and collect unique values of 'assay'
unique_values = adata.obs.groupby('sample')['assay'].unique()

# Display the unique values and counts for each sample
for sample, assays in unique_values.items():
    print(f"sample: {sample}")
    print(f"Unique Assays: {', '.join(assays)}")
    print("Counts:")
    for assay in assays:
        count = counts.get((sample, assay), 0)
        print(f"{assay}: {count}")
    print()


sample: CV001_KM9465185
Unique Assays: 3pv2_5pv1_5pv2, 3pv2_5pv1_5pv2+3pv3, 3pv2_5pv1_5pv2+multiome
Counts:
3pv2_5pv1_5pv2: 2563
3pv2_5pv1_5pv2+3pv3: 254
3pv2_5pv1_5pv2+multiome: 4

sample: CV001_KM9465186
Unique Assays: 3pv2_5pv1_5pv2, 3pv2_5pv1_5pv2+multiome, 3pv2_5pv1_5pv2+3pv3
Counts:
3pv2_5pv1_5pv2: 9163
3pv2_5pv1_5pv2+multiome: 7
3pv2_5pv1_5pv2+3pv3: 1095

sample: CV001_KM9465187
Unique Assays: 3pv2_5pv1_5pv2, 3pv2_5pv1_5pv2+3pv3, 3pv2_5pv1_5pv2+multiome, nan
Counts:
3pv2_5pv1_5pv2: 7469
3pv2_5pv1_5pv2+3pv3: 876
3pv2_5pv1_5pv2+multiome: 3
nan: 1

sample: CV001_KM9465188
Unique Assays: 3pv2_5pv1_5pv2, 3pv2_5pv1_5pv2+3pv3, 3pv2_5pv1_5pv2+multiome
Counts:
3pv2_5pv1_5pv2: 3724
3pv2_5pv1_5pv2+3pv3: 445
3pv2_5pv1_5pv2+multiome: 4

sample: CV001_KM9465189
Unique Assays: 3pv2_5pv1_5pv2, 3pv2_5pv1_5pv2+3pv3, 3pv2_5pv1_5pv2+multiome
Counts:
3pv2_5pv1_5pv2: 7969
3pv2_5pv1_5pv2+3pv3: 929
3pv2_5pv1_5pv2+multiome: 16

sample: CV001_KM9465190
Unique Assays: 3pv2_5pv1_5pv2, 3pv2_5pv1_5pv2+3pv3, 

In [66]:
mapping= {'v1.1':'EFO:0011025', 'v2':'EFO:0009900'}

In [67]:
adata.obs['assay_ontology_term_id'] = ['EFO:0030004']* len(adata.obs)

In [68]:
adata.obs['assay_ontology_term_id'] = adata.obs['kit_version'].map(mapping)

In [69]:
adata.obs['assay_ontology_term_id'] = adata.obs['assay_ontology_term_id'].astype('category')

In [70]:
# view adata.obs

In [71]:
adata.obs

Unnamed: 0,sample,treatment,time,pool,sample_name,kit_version,spike-in_primer,scrublet_score,doublet_bh_pval,n_genes_by_counts,...,phase,bbknn_batch,cell_annotation,age_treatment,score_IFN_alpha,score_IFN_gamma,score_BASALOID,barcodes,assay,assay_ontology_term_id
CV001_KM10202575_AAACCTGAGCAGGTCA,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.074489,0.899867,2007,...,S,ALI2-v1.1,Goblet 2,Adult-mock,0.164289,0.242595,0.184872,AAACCTGAGCAGGTCA,3pv2_5pv1_5pv2,EFO:0011025
CV001_KM10202575_AAACCTGAGCCAGTTT,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.182796,0.899867,1936,...,S,ALI2-v1.1,Goblet 2 BPIFA1+,Adult-mock,0.437474,0.173749,0.107360,AAACCTGAGCCAGTTT,3pv2_5pv1_5pv2,EFO:0011025
CV001_KM10202575_AAACCTGAGCGTAGTG,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.216327,0.861171,1072,...,G1,ALI2-v1.1,Goblet 2 BPIFA1+,Adult-mock,0.132267,0.212443,0.219769,AAACCTGAGCGTAGTG,3pv2_5pv1_5pv2,EFO:0011025
CV001_KM10202575_AAACCTGAGCTCCTTC,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.066090,0.915221,1637,...,S,ALI2-v1.1,Basal 2,Elderly-mock,-0.093077,0.058059,0.175730,AAACCTGAGCTCCTTC,3pv2_5pv1_5pv2,EFO:0011025
CV001_KM10202575_AAACCTGAGCTGGAAC,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.187805,0.899867,1742,...,S,ALI2-v1.1,Basal 1,Elderly-mock,0.123714,0.150537,0.051336,AAACCTGAGCTGGAAC,3pv2_5pv1_5pv2,EFO:0011025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CV001_KM9505916_TTTGTCAGTTCCATGA,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.152639,0.937426,2169,...,G1,ALI3-v2,Goblet 2 BPIFA1+,Elderly-SARS,0.176980,0.129034,0.282540,TTTGTCAGTTCCATGA,3pv2_5pv1_5pv2,EFO:0009900
CV001_KM9505916_TTTGTCATCACGATGT,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.220339,0.922660,1480,...,G1,ALI3-v2,Secretory,Adult-SARS,-0.045456,0.129736,0.034195,TTTGTCATCACGATGT,3pv2_5pv1_5pv2,EFO:0009900
CV001_KM9505916_TTTGTCATCCCATTAT,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.129771,0.937426,1571,...,G1,ALI3-v2,Basal 2,Adult-SARS,-0.051619,0.112761,0.082636,TTTGTCATCCCATTAT,3pv2_5pv1_5pv2,EFO:0009900
CV001_KM9505916_TTTGTCATCCGGGTGT,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.065327,0.985310,2861,...,G1,ALI3-v2,Goblet 2 BPIFA1+,Paediatric-SARS,0.194762,0.217395,0.012546,TTTGTCATCCGGGTGT,3pv2_5pv1_5pv2,EFO:0009900


#### **cell_type_ontology_term_id**

In [72]:
#identify the column in adata.obs related. to cell type annotation

In [73]:
adata.obs.columns

Index(['sample', 'treatment', 'time', 'pool', 'sample_name', 'kit_version',
       'spike-in_primer', 'scrublet_score', 'doublet_bh_pval',
       'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt',
       'total_counts_ribo', 'pct_counts_ribo', 'souporcell_cluster',
       'donor_id', 'age_group', 'gender', 'S_score', 'G2M_score', 'phase',
       'bbknn_batch', 'cell_annotation', 'age_treatment', 'score_IFN_alpha',
       'score_IFN_gamma', 'score_BASALOID', 'barcodes', 'assay',
       'assay_ontology_term_id'],
      dtype='object')

In [74]:
list(adata.obs['cell_annotation'].unique())

['Goblet 2',
 'Goblet 2 BPIFA1+',
 'Basal 2',
 'Basal 1',
 'Basaloid-like 1',
 'Basaloid-like 2',
 'Basal|EMT1',
 'Goblet 1',
 'Secretory',
 'Transit epi 2',
 'Secretory 3',
 'Hillock',
 'Transit epi',
 'Cycling basal',
 'Deutorosomal',
 'Secretory 4',
 'Ciliated 1',
 'Ciliated 2',
 'Ionocyte',
 'Goblet 2 PLAU+',
 'Basal|EMT2',
 'Secretory 2',
 'Squamous',
 'Goblet 2 inflammatory']

In [75]:
# create a dictionary of cell type and ontology term

In [76]:
mapping= {
'AT2':'CL:0002063',
'AT1':'CL:0002062',
'KRT8+ DATPs':'CL:0000244',
'Squamous':'CL:0000076',
'Goblet/Secretory':'CL:0000160',
'Cycling basal':'CL:0000646',
'Basaloid-like 2':'CL:0000646',
'Club':'CL:0000158',
'Goblet inflammatory':'CL:0000160',
'Basal':'CL:0000646',
'Ciliated':'CL:0000064',
'Deutorosomal':'CL:0005012',
'Basaloid-like 1':'CL:0000646',
'Airway mucous':'CL:0002633',
'Ionocyte':'CL:0005006',
'Ciliated inflammatory':'CL:0000064',
'Transit epi':'CL:0000244',
'Goblet 2':'CL:0000160',
'Goblet 2 BPIFA1+':'CL:0000160',
'Basal 2':'CL:0000646',
'Basal 1':'CL:0000646',
'Basal|EMT1':'CL:0000646',
'Goblet 1':'CL:0000160',
'Secretory':'CL:0000151',
'Transit epi 2':'CL:0000244',
'Secretory 3':'CL:0000151',
'Hillock':'CL:4030024',
'Secretory 4':'CL:0000151',
'Ciliated 1':'CL:0000064',
'Ciliated 2':'CL:0000064',
'Goblet 2 PLAU+':'CL:0000160',
'Basal|EMT2':'CL:0000646',
'Secretory 2':'CL:0000151',
'Goblet 2 inflammatory':'CL:0000160'}

In [77]:
# add the cell_type_ontology_term_id column

In [78]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_annotation'].map(mapping)

In [79]:
# change datatype of the column

In [80]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_type_ontology_term_id'].astype('category')

In [81]:
# view adata.obs

In [82]:
adata.obs

Unnamed: 0,sample,treatment,time,pool,sample_name,kit_version,spike-in_primer,scrublet_score,doublet_bh_pval,n_genes_by_counts,...,bbknn_batch,cell_annotation,age_treatment,score_IFN_alpha,score_IFN_gamma,score_BASALOID,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id
CV001_KM10202575_AAACCTGAGCAGGTCA,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.074489,0.899867,2007,...,ALI2-v1.1,Goblet 2,Adult-mock,0.164289,0.242595,0.184872,AAACCTGAGCAGGTCA,3pv2_5pv1_5pv2,EFO:0011025,CL:0000160
CV001_KM10202575_AAACCTGAGCCAGTTT,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.182796,0.899867,1936,...,ALI2-v1.1,Goblet 2 BPIFA1+,Adult-mock,0.437474,0.173749,0.107360,AAACCTGAGCCAGTTT,3pv2_5pv1_5pv2,EFO:0011025,CL:0000160
CV001_KM10202575_AAACCTGAGCGTAGTG,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.216327,0.861171,1072,...,ALI2-v1.1,Goblet 2 BPIFA1+,Adult-mock,0.132267,0.212443,0.219769,AAACCTGAGCGTAGTG,3pv2_5pv1_5pv2,EFO:0011025,CL:0000160
CV001_KM10202575_AAACCTGAGCTCCTTC,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.066090,0.915221,1637,...,ALI2-v1.1,Basal 2,Elderly-mock,-0.093077,0.058059,0.175730,AAACCTGAGCTCCTTC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000646
CV001_KM10202575_AAACCTGAGCTGGAAC,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.187805,0.899867,1742,...,ALI2-v1.1,Basal 1,Elderly-mock,0.123714,0.150537,0.051336,AAACCTGAGCTGGAAC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000646
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CV001_KM9505916_TTTGTCAGTTCCATGA,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.152639,0.937426,2169,...,ALI3-v2,Goblet 2 BPIFA1+,Elderly-SARS,0.176980,0.129034,0.282540,TTTGTCAGTTCCATGA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000160
CV001_KM9505916_TTTGTCATCACGATGT,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.220339,0.922660,1480,...,ALI3-v2,Secretory,Adult-SARS,-0.045456,0.129736,0.034195,TTTGTCATCACGATGT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000151
CV001_KM9505916_TTTGTCATCCCATTAT,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.129771,0.937426,1571,...,ALI3-v2,Basal 2,Adult-SARS,-0.051619,0.112761,0.082636,TTTGTCATCCCATTAT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000646
CV001_KM9505916_TTTGTCATCCGGGTGT,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.065327,0.985310,2861,...,ALI3-v2,Goblet 2 BPIFA1+,Paediatric-SARS,0.194762,0.217395,0.012546,TTTGTCATCCGGGTGT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000160


#### **development_stage_ontology_term_id**

In [83]:
# identify the column in adata which corresponds to age

In [84]:
adata.obs.columns

Index(['sample', 'treatment', 'time', 'pool', 'sample_name', 'kit_version',
       'spike-in_primer', 'scrublet_score', 'doublet_bh_pval',
       'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt',
       'total_counts_ribo', 'pct_counts_ribo', 'souporcell_cluster',
       'donor_id', 'age_group', 'gender', 'S_score', 'G2M_score', 'phase',
       'bbknn_batch', 'cell_annotation', 'age_treatment', 'score_IFN_alpha',
       'score_IFN_gamma', 'score_BASALOID', 'barcodes', 'assay',
       'assay_ontology_term_id', 'cell_type_ontology_term_id'],
      dtype='object')

In [85]:
list(adata.obs['age_group'].unique())

['Adult', 'Elderly', 'Paediatric']

In [86]:
mapping = {'60-65':'HsapDv:0000241',
'50-55':'HsapDv:0000240',
'75-80':'HsapDv:0000242',
'40-45':'HsapDv:0000239',
'>89':'HsapDv:0000095',
'80-85':'HsapDv:0000243',
'55-60':'HsapDv:0000240',
'65-70':'HsapDv:0000241',
'30-35':'HsapDv:0000238',
'5 years 11 months ':'HsapDv:0000099',
'3 weeks':'HsapDv:0000262',
'6 months':'HsapDv:0000179',
'1 year 10 months':'HsapDv:0000195',
'6 weeks':'HsapDv:0000273',
'3 days':'HsapDv:0000262',
'3 years 11 months':'HsapDv:0000097',
'4 years 9 months':'HsapDv:0000098',
'12 months':'HsapDv:0000185',
'2 years 10 months':'HsapDv:0000096',
'15 years 5 months':'HsapDv:0000109',
'13 days':'HsapDv:0000262',
'3 months':'HsapDv:0000176',
'2 years 2 weeks':'HsapDv:0000096',
'20 months':'HsapDv:0000193',
'56y':'HsapDv:0000150',
'14 years 11 months':'HsapDv:0000108',
'2 years 5 months':'HsapDv:0000096',
'13  years':'HsapDv:0000107',
'11 years':'HsapDv:0000105',
'25y':'HsapDv:0000119',
'10 years':'HsapDv:0000104',
'6 years 9 months':'HsapDv:0000100',
'52y':'HsapDv:0000146',
'2 years 11 months':'HsapDv:0000096',
'12 years 5 months':'HsapDv:0000106',
'67y':'HsapDv:0000161',
'39y':'HsapDv:0000133',
'76y':'HsapDv:0000170',
'16 years':'HsapDv:0000110',
'14 years ':'HsapDv:0000108',
'7 years':'HsapDv:0000101',
'7 yrs 3 months':'HsapDv:0000101',
'70y':'HsapDv:0000164',
'9 years':'HsapDv:0000103',
'73y':'HsapDv:0000167',
'9 days':'HsapDv:0000262',
'65y':'HsapDv:0000159',
'20 days':'HsapDv:0000262',
'4y':'HsapDv:0000098',
'16y':'HsapDv:0000110',
'6 days':'HsapDv:0000262',
'13y':'HsapDv:0000107',
'6m':'HsapDv:0000179',
'1month':'HsapDv:0000273',
'14 years':'HsapDv:0000108',
'14y':'HsapDv:0000108',
'36y':'HsapDv:0000130',
'38y':'HsapDv:0000132',
'55y':'HsapDv:0000149',
'26y':'HsapDv:0000120',
'15y':'HsapDv:0000109',
'46y':'HsapDv:0000140',
'44y':'HsapDv:0000138',
'61y':'HsapDv:0000155',
'66y':'HsapDv:0000160',
'50-59':'HsapDv:0000240',
'30-39':'HsapDv:0000238',
'60-69':'HsapDv:0000241',
'70-79':'HsapDv:0000242',
'40-49':'HsapDv:0000239',
'19-29':'HsapDv:0000266',
'80-89':'HsapDv:0000243',
'36':'HsapDv:0000130',
'40':'HsapDv:0000134',
'17':'HsapDv:0000111',
'30':'HsapDv:0000124',
'41':'HsapDv:0000134',
'31':'HsapDv:0000125',
'54':'HsapDv:0000148',
'38':'HsapDv:0000132',
'52':'HsapDv:0000146',
'25':'HsapDv:0000119',
'28':'HsapDv:0000122',
'57':'HsapDv:0000151',
'59.0':'HsapDv:0000153',
'36.0':'HsapDv:0000130',
'68.0':'HsapDv:0000162',
'34.0':'HsapDv:0000128',
'41.0':'HsapDv:0000135',
'58.0':'HsapDv:0000152',
'24.0':'HsapDv:0000118',
'33.0':'HsapDv:0000127',
'67.0':'HsapDv:0000161',
'76.0':'HsapDv:0000170',
'53.0':'HsapDv:0000147',
'71.0':'HsapDv:0000165',
'51.0':'HsapDv:0000145',
'56.0':'HsapDv:0000150',
'52.0':'HsapDv:0000146',
'70.0':'HsapDv:0000164',
'78.0':'HsapDv:0000172',
'75.0':'HsapDv:0000169',
'84.0':'HsapDv:0000210',
'91.0':'HsapDv:0000217',
'55.0':'HsapDv:0000149',
'66.0':'HsapDv:0000160',
'82.0':'HsapDv:0000208',
'61.0':'HsapDv:0000155',
'32.0':'HsapDv:0000126',
'63.0':'HsapDv:0000157',
'62.0':'HsapDv:0000156',
'50.0':'HsapDv:0000144',
'73.0':'HsapDv:0000167',
'79.0':'HsapDv:0000173',
'29.0':'HsapDv:0000123',
'64.0':'HsapDv:0000158',
'45.0':'HsapDv:0000139',
'21.0':'HsapDv:0000115',
'54.0':'HsapDv:0000148',
'7':'HsapDv:0000101',
'27':'HsapDv:0000121',
'35':'HsapDv:0000129',
'39':'HsapDv:0000133',
'42':'HsapDv:0000136',
'47':'HsapDv:0000141',
'4':'HsapDv:0000098',
'77':'HsapDv:0000171',
'60':'HsapDv:0000154',
'2':'HsapDv:0000096',
'< 1 year':'HsapDv:0000260',
'62':'HsapDv:0000156',
'65':'HsapDv:0000159',
'66':'HsapDv:0000160',
'61':'HsapDv:0000155',
'68':'HsapDv:0000162',
'46':'HsapDv:0000140',
'50':'HsapDv:0000144',
'9':'HsapDv:0000103',
'12':'HsapDv:0000106',
'18':'HsapDv:0000112',
'15':'HsapDv:0000109',
'16':'HsapDv:0000110',
'10':'HsapDv:0000104',
'6':'HsapDv:0000100',
'5':'HsapDv:0000099',
'14':'HsapDv:0000108',
'8':'HsapDv:0000102',
'33':'HsapDv:0000127',
'34':'HsapDv:0000128',
'24':'HsapDv:0000118',
'29':'HsapDv:0000123',
'76':'HsapDv:0000170',
'45':'HsapDv:0000139',
'1':'HsapDv:0000246',
'13':'HsapDv:0000107',
'32':'HsapDv:0000126',
'69.0':'HsapDv:0000163',
'72.0':'HsapDv:0000166',
'80.0':'HsapDv:0000206',
'83.0':'HsapDv:0000209',
'65.0':'HsapDv:0000159',
'Adult':'HsapDv:0000087',
'Elderly':'HsapDv:0000093', 
'Paediatric':'HsapDv:0000080'}

In [87]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['age_group'].map(mapping)

In [88]:
# change datatype of the column

In [89]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['development_stage_ontology_term_id'].astype('category')

In [90]:
# view unique values of development_stage_ontology_term_id column

In [91]:
list(adata.obs['development_stage_ontology_term_id'].unique())

['HsapDv:0000087', 'HsapDv:0000093', 'HsapDv:0000080']

In [92]:
# view adata.obs

In [93]:
adata.obs

Unnamed: 0,sample,treatment,time,pool,sample_name,kit_version,spike-in_primer,scrublet_score,doublet_bh_pval,n_genes_by_counts,...,cell_annotation,age_treatment,score_IFN_alpha,score_IFN_gamma,score_BASALOID,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id
CV001_KM10202575_AAACCTGAGCAGGTCA,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.074489,0.899867,2007,...,Goblet 2,Adult-mock,0.164289,0.242595,0.184872,AAACCTGAGCAGGTCA,3pv2_5pv1_5pv2,EFO:0011025,CL:0000160,HsapDv:0000087
CV001_KM10202575_AAACCTGAGCCAGTTT,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.182796,0.899867,1936,...,Goblet 2 BPIFA1+,Adult-mock,0.437474,0.173749,0.107360,AAACCTGAGCCAGTTT,3pv2_5pv1_5pv2,EFO:0011025,CL:0000160,HsapDv:0000087
CV001_KM10202575_AAACCTGAGCGTAGTG,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.216327,0.861171,1072,...,Goblet 2 BPIFA1+,Adult-mock,0.132267,0.212443,0.219769,AAACCTGAGCGTAGTG,3pv2_5pv1_5pv2,EFO:0011025,CL:0000160,HsapDv:0000087
CV001_KM10202575_AAACCTGAGCTCCTTC,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.066090,0.915221,1637,...,Basal 2,Elderly-mock,-0.093077,0.058059,0.175730,AAACCTGAGCTCCTTC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000646,HsapDv:0000093
CV001_KM10202575_AAACCTGAGCTGGAAC,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.187805,0.899867,1742,...,Basal 1,Elderly-mock,0.123714,0.150537,0.051336,AAACCTGAGCTGGAAC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000646,HsapDv:0000093
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CV001_KM9505916_TTTGTCAGTTCCATGA,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.152639,0.937426,2169,...,Goblet 2 BPIFA1+,Elderly-SARS,0.176980,0.129034,0.282540,TTTGTCAGTTCCATGA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000160,HsapDv:0000093
CV001_KM9505916_TTTGTCATCACGATGT,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.220339,0.922660,1480,...,Secretory,Adult-SARS,-0.045456,0.129736,0.034195,TTTGTCATCACGATGT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000151,HsapDv:0000087
CV001_KM9505916_TTTGTCATCCCATTAT,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.129771,0.937426,1571,...,Basal 2,Adult-SARS,-0.051619,0.112761,0.082636,TTTGTCATCCCATTAT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000646,HsapDv:0000087
CV001_KM9505916_TTTGTCATCCGGGTGT,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.065327,0.985310,2861,...,Goblet 2 BPIFA1+,Paediatric-SARS,0.194762,0.217395,0.012546,TTTGTCATCCGGGTGT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000160,HsapDv:0000080


#### **donor_id**

In [94]:
#identify the column in adata.obs which provides donor information

In [95]:
adata.obs.columns

Index(['sample', 'treatment', 'time', 'pool', 'sample_name', 'kit_version',
       'spike-in_primer', 'scrublet_score', 'doublet_bh_pval',
       'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt',
       'total_counts_ribo', 'pct_counts_ribo', 'souporcell_cluster',
       'donor_id', 'age_group', 'gender', 'S_score', 'G2M_score', 'phase',
       'bbknn_batch', 'cell_annotation', 'age_treatment', 'score_IFN_alpha',
       'score_IFN_gamma', 'score_BASALOID', 'barcodes', 'assay',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id'],
      dtype='object')

In [96]:
# add the donor_id column

In [97]:
adata.obs['donor_id'] = adata.obs['donor_id']

In [98]:
#adata.obs['donor_id'] = ['unknown'] * len(adata.obs['names'])

In [99]:
# change datatype of the column

In [100]:
adata.obs['donor_id'] = adata.obs['donor_id'].astype('category')

In [101]:
# view unique values of donor_id column

In [102]:
list(adata.obs['donor_id'].unique())

['UCL_10',
 'UCL_6',
 'UCL_5',
 'UCL_12',
 'UCL_9',
 'UCL_11',
 'UCL_7',
 'UCL_3',
 'UCL_2',
 'UCL_1',
 'UCL_4']

In [103]:
#view obs

In [104]:
adata.obs

Unnamed: 0,sample,treatment,time,pool,sample_name,kit_version,spike-in_primer,scrublet_score,doublet_bh_pval,n_genes_by_counts,...,cell_annotation,age_treatment,score_IFN_alpha,score_IFN_gamma,score_BASALOID,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id
CV001_KM10202575_AAACCTGAGCAGGTCA,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.074489,0.899867,2007,...,Goblet 2,Adult-mock,0.164289,0.242595,0.184872,AAACCTGAGCAGGTCA,3pv2_5pv1_5pv2,EFO:0011025,CL:0000160,HsapDv:0000087
CV001_KM10202575_AAACCTGAGCCAGTTT,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.182796,0.899867,1936,...,Goblet 2 BPIFA1+,Adult-mock,0.437474,0.173749,0.107360,AAACCTGAGCCAGTTT,3pv2_5pv1_5pv2,EFO:0011025,CL:0000160,HsapDv:0000087
CV001_KM10202575_AAACCTGAGCGTAGTG,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.216327,0.861171,1072,...,Goblet 2 BPIFA1+,Adult-mock,0.132267,0.212443,0.219769,AAACCTGAGCGTAGTG,3pv2_5pv1_5pv2,EFO:0011025,CL:0000160,HsapDv:0000087
CV001_KM10202575_AAACCTGAGCTCCTTC,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.066090,0.915221,1637,...,Basal 2,Elderly-mock,-0.093077,0.058059,0.175730,AAACCTGAGCTCCTTC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000646,HsapDv:0000093
CV001_KM10202575_AAACCTGAGCTGGAAC,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.187805,0.899867,1742,...,Basal 1,Elderly-mock,0.123714,0.150537,0.051336,AAACCTGAGCTGGAAC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000646,HsapDv:0000093
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CV001_KM9505916_TTTGTCAGTTCCATGA,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.152639,0.937426,2169,...,Goblet 2 BPIFA1+,Elderly-SARS,0.176980,0.129034,0.282540,TTTGTCAGTTCCATGA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000160,HsapDv:0000093
CV001_KM9505916_TTTGTCATCACGATGT,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.220339,0.922660,1480,...,Secretory,Adult-SARS,-0.045456,0.129736,0.034195,TTTGTCATCACGATGT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000151,HsapDv:0000087
CV001_KM9505916_TTTGTCATCCCATTAT,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.129771,0.937426,1571,...,Basal 2,Adult-SARS,-0.051619,0.112761,0.082636,TTTGTCATCCCATTAT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000646,HsapDv:0000087
CV001_KM9505916_TTTGTCATCCGGGTGT,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.065327,0.985310,2861,...,Goblet 2 BPIFA1+,Paediatric-SARS,0.194762,0.217395,0.012546,TTTGTCATCCGGGTGT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000160,HsapDv:0000080


In [105]:
adata.obs.columns

Index(['sample', 'treatment', 'time', 'pool', 'sample_name', 'kit_version',
       'spike-in_primer', 'scrublet_score', 'doublet_bh_pval',
       'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt',
       'total_counts_ribo', 'pct_counts_ribo', 'souporcell_cluster',
       'donor_id', 'age_group', 'gender', 'S_score', 'G2M_score', 'phase',
       'bbknn_batch', 'cell_annotation', 'age_treatment', 'score_IFN_alpha',
       'score_IFN_gamma', 'score_BASALOID', 'barcodes', 'assay',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id'],
      dtype='object')

#### **disease_ontology_term_id**

In [106]:
# add the disease_ontology_term_id column

In [107]:
list(adata.obs['treatment'].unique())

['mock', 'SARS']

In [108]:
mapping= {'COVID19':'MONDO:0100096', 'Healthy':'PATO:0000461'}

In [109]:
mapping= {'SARS':'MONDO:0100096', 'mock':'PATO:0000461'}

In [110]:
adata.obs['disease_ontology_term_id']= ['MONDO:0100096'] * len(adata.obs)

In [111]:
adata.obs['disease_ontology_term_id'] = adata.obs['treatment'].map(mapping)

In [112]:
# change datatype of the column

In [113]:
adata.obs['disease_ontology_term_id'] = adata.obs['disease_ontology_term_id'].astype('category')

In [114]:
# view obs

In [115]:
adata.obs

Unnamed: 0,sample,treatment,time,pool,sample_name,kit_version,spike-in_primer,scrublet_score,doublet_bh_pval,n_genes_by_counts,...,age_treatment,score_IFN_alpha,score_IFN_gamma,score_BASALOID,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id
CV001_KM10202575_AAACCTGAGCAGGTCA,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.074489,0.899867,2007,...,Adult-mock,0.164289,0.242595,0.184872,AAACCTGAGCAGGTCA,3pv2_5pv1_5pv2,EFO:0011025,CL:0000160,HsapDv:0000087,PATO:0000461
CV001_KM10202575_AAACCTGAGCCAGTTT,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.182796,0.899867,1936,...,Adult-mock,0.437474,0.173749,0.107360,AAACCTGAGCCAGTTT,3pv2_5pv1_5pv2,EFO:0011025,CL:0000160,HsapDv:0000087,PATO:0000461
CV001_KM10202575_AAACCTGAGCGTAGTG,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.216327,0.861171,1072,...,Adult-mock,0.132267,0.212443,0.219769,AAACCTGAGCGTAGTG,3pv2_5pv1_5pv2,EFO:0011025,CL:0000160,HsapDv:0000087,PATO:0000461
CV001_KM10202575_AAACCTGAGCTCCTTC,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.066090,0.915221,1637,...,Elderly-mock,-0.093077,0.058059,0.175730,AAACCTGAGCTCCTTC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000646,HsapDv:0000093,PATO:0000461
CV001_KM10202575_AAACCTGAGCTGGAAC,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.187805,0.899867,1742,...,Elderly-mock,0.123714,0.150537,0.051336,AAACCTGAGCTGGAAC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000646,HsapDv:0000093,PATO:0000461
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CV001_KM9505916_TTTGTCAGTTCCATGA,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.152639,0.937426,2169,...,Elderly-SARS,0.176980,0.129034,0.282540,TTTGTCAGTTCCATGA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000160,HsapDv:0000093,MONDO:0100096
CV001_KM9505916_TTTGTCATCACGATGT,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.220339,0.922660,1480,...,Adult-SARS,-0.045456,0.129736,0.034195,TTTGTCATCACGATGT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000151,HsapDv:0000087,MONDO:0100096
CV001_KM9505916_TTTGTCATCCCATTAT,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.129771,0.937426,1571,...,Adult-SARS,-0.051619,0.112761,0.082636,TTTGTCATCCCATTAT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000646,HsapDv:0000087,MONDO:0100096
CV001_KM9505916_TTTGTCATCCGGGTGT,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.065327,0.985310,2861,...,Paediatric-SARS,0.194762,0.217395,0.012546,TTTGTCATCCGGGTGT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000160,HsapDv:0000080,MONDO:0100096


#### **is_primary_data**

In [116]:
adata.obs['is_primary_data'] = [True] * len(adata.obs)

In [117]:
adata.obs

Unnamed: 0,sample,treatment,time,pool,sample_name,kit_version,spike-in_primer,scrublet_score,doublet_bh_pval,n_genes_by_counts,...,score_IFN_alpha,score_IFN_gamma,score_BASALOID,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data
CV001_KM10202575_AAACCTGAGCAGGTCA,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.074489,0.899867,2007,...,0.164289,0.242595,0.184872,AAACCTGAGCAGGTCA,3pv2_5pv1_5pv2,EFO:0011025,CL:0000160,HsapDv:0000087,PATO:0000461,True
CV001_KM10202575_AAACCTGAGCCAGTTT,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.182796,0.899867,1936,...,0.437474,0.173749,0.107360,AAACCTGAGCCAGTTT,3pv2_5pv1_5pv2,EFO:0011025,CL:0000160,HsapDv:0000087,PATO:0000461,True
CV001_KM10202575_AAACCTGAGCGTAGTG,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.216327,0.861171,1072,...,0.132267,0.212443,0.219769,AAACCTGAGCGTAGTG,3pv2_5pv1_5pv2,EFO:0011025,CL:0000160,HsapDv:0000087,PATO:0000461,True
CV001_KM10202575_AAACCTGAGCTCCTTC,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.066090,0.915221,1637,...,-0.093077,0.058059,0.175730,AAACCTGAGCTCCTTC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000646,HsapDv:0000093,PATO:0000461,True
CV001_KM10202575_AAACCTGAGCTGGAAC,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.187805,0.899867,1742,...,0.123714,0.150537,0.051336,AAACCTGAGCTGGAAC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000646,HsapDv:0000093,PATO:0000461,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CV001_KM9505916_TTTGTCAGTTCCATGA,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.152639,0.937426,2169,...,0.176980,0.129034,0.282540,TTTGTCAGTTCCATGA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000160,HsapDv:0000093,MONDO:0100096,True
CV001_KM9505916_TTTGTCATCACGATGT,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.220339,0.922660,1480,...,-0.045456,0.129736,0.034195,TTTGTCATCACGATGT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000151,HsapDv:0000087,MONDO:0100096,True
CV001_KM9505916_TTTGTCATCCCATTAT,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.129771,0.937426,1571,...,-0.051619,0.112761,0.082636,TTTGTCATCCCATTAT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000646,HsapDv:0000087,MONDO:0100096,True
CV001_KM9505916_TTTGTCATCCGGGTGT,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.065327,0.985310,2861,...,0.194762,0.217395,0.012546,TTTGTCATCCGGGTGT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000160,HsapDv:0000080,MONDO:0100096,True


In [118]:
#change data type of column

In [119]:
adata.obs['is_primary_data'] = adata.obs['is_primary_data'].astype('bool')

In [120]:
# view obs

In [121]:
adata.obs

Unnamed: 0,sample,treatment,time,pool,sample_name,kit_version,spike-in_primer,scrublet_score,doublet_bh_pval,n_genes_by_counts,...,score_IFN_alpha,score_IFN_gamma,score_BASALOID,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data
CV001_KM10202575_AAACCTGAGCAGGTCA,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.074489,0.899867,2007,...,0.164289,0.242595,0.184872,AAACCTGAGCAGGTCA,3pv2_5pv1_5pv2,EFO:0011025,CL:0000160,HsapDv:0000087,PATO:0000461,True
CV001_KM10202575_AAACCTGAGCCAGTTT,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.182796,0.899867,1936,...,0.437474,0.173749,0.107360,AAACCTGAGCCAGTTT,3pv2_5pv1_5pv2,EFO:0011025,CL:0000160,HsapDv:0000087,PATO:0000461,True
CV001_KM10202575_AAACCTGAGCGTAGTG,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.216327,0.861171,1072,...,0.132267,0.212443,0.219769,AAACCTGAGCGTAGTG,3pv2_5pv1_5pv2,EFO:0011025,CL:0000160,HsapDv:0000087,PATO:0000461,True
CV001_KM10202575_AAACCTGAGCTCCTTC,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.066090,0.915221,1637,...,-0.093077,0.058059,0.175730,AAACCTGAGCTCCTTC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000646,HsapDv:0000093,PATO:0000461,True
CV001_KM10202575_AAACCTGAGCTGGAAC,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.187805,0.899867,1742,...,0.123714,0.150537,0.051336,AAACCTGAGCTGGAAC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000646,HsapDv:0000093,PATO:0000461,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CV001_KM9505916_TTTGTCAGTTCCATGA,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.152639,0.937426,2169,...,0.176980,0.129034,0.282540,TTTGTCAGTTCCATGA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000160,HsapDv:0000093,MONDO:0100096,True
CV001_KM9505916_TTTGTCATCACGATGT,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.220339,0.922660,1480,...,-0.045456,0.129736,0.034195,TTTGTCATCACGATGT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000151,HsapDv:0000087,MONDO:0100096,True
CV001_KM9505916_TTTGTCATCCCATTAT,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.129771,0.937426,1571,...,-0.051619,0.112761,0.082636,TTTGTCATCCCATTAT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000646,HsapDv:0000087,MONDO:0100096,True
CV001_KM9505916_TTTGTCATCCGGGTGT,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.065327,0.985310,2861,...,0.194762,0.217395,0.012546,TTTGTCATCCGGGTGT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000160,HsapDv:0000080,MONDO:0100096,True


#### **organism_ontology_term_id**

In [122]:
# assign organism id 

In [123]:
adata.obs['organism_ontology_term_id'] = ['NCBITaxon:9606'] * len(adata.obs)

In [124]:
#change data type of column

In [125]:
adata.obs['organism_ontology_term_id'] = adata.obs['organism_ontology_term_id'].astype('category')

In [126]:
# view obs

In [127]:
adata.obs

Unnamed: 0,sample,treatment,time,pool,sample_name,kit_version,spike-in_primer,scrublet_score,doublet_bh_pval,n_genes_by_counts,...,score_IFN_gamma,score_BASALOID,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id
CV001_KM10202575_AAACCTGAGCAGGTCA,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.074489,0.899867,2007,...,0.242595,0.184872,AAACCTGAGCAGGTCA,3pv2_5pv1_5pv2,EFO:0011025,CL:0000160,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606
CV001_KM10202575_AAACCTGAGCCAGTTT,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.182796,0.899867,1936,...,0.173749,0.107360,AAACCTGAGCCAGTTT,3pv2_5pv1_5pv2,EFO:0011025,CL:0000160,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606
CV001_KM10202575_AAACCTGAGCGTAGTG,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.216327,0.861171,1072,...,0.212443,0.219769,AAACCTGAGCGTAGTG,3pv2_5pv1_5pv2,EFO:0011025,CL:0000160,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606
CV001_KM10202575_AAACCTGAGCTCCTTC,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.066090,0.915221,1637,...,0.058059,0.175730,AAACCTGAGCTCCTTC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000646,HsapDv:0000093,PATO:0000461,True,NCBITaxon:9606
CV001_KM10202575_AAACCTGAGCTGGAAC,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.187805,0.899867,1742,...,0.150537,0.051336,AAACCTGAGCTGGAAC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000646,HsapDv:0000093,PATO:0000461,True,NCBITaxon:9606
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CV001_KM9505916_TTTGTCAGTTCCATGA,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.152639,0.937426,2169,...,0.129034,0.282540,TTTGTCAGTTCCATGA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000160,HsapDv:0000093,MONDO:0100096,True,NCBITaxon:9606
CV001_KM9505916_TTTGTCATCACGATGT,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.220339,0.922660,1480,...,0.129736,0.034195,TTTGTCATCACGATGT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000151,HsapDv:0000087,MONDO:0100096,True,NCBITaxon:9606
CV001_KM9505916_TTTGTCATCCCATTAT,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.129771,0.937426,1571,...,0.112761,0.082636,TTTGTCATCCCATTAT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000646,HsapDv:0000087,MONDO:0100096,True,NCBITaxon:9606
CV001_KM9505916_TTTGTCATCCGGGTGT,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.065327,0.985310,2861,...,0.217395,0.012546,TTTGTCATCCGGGTGT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000160,HsapDv:0000080,MONDO:0100096,True,NCBITaxon:9606


#### **self_reported_ethnicity_ontology_term_id**

In [128]:
adata.obs['self_reported_ethnicity_ontology_term_id'] = ['unknown'] * len(adata.obs)

In [129]:
# change data type

In [130]:
adata.obs['self_reported_ethnicity_ontology_term_id'] = adata.obs['self_reported_ethnicity_ontology_term_id'].astype('category')

In [131]:
# view obs

In [132]:
adata.obs

Unnamed: 0,sample,treatment,time,pool,sample_name,kit_version,spike-in_primer,scrublet_score,doublet_bh_pval,n_genes_by_counts,...,score_BASALOID,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id
CV001_KM10202575_AAACCTGAGCAGGTCA,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.074489,0.899867,2007,...,0.184872,AAACCTGAGCAGGTCA,3pv2_5pv1_5pv2,EFO:0011025,CL:0000160,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown
CV001_KM10202575_AAACCTGAGCCAGTTT,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.182796,0.899867,1936,...,0.107360,AAACCTGAGCCAGTTT,3pv2_5pv1_5pv2,EFO:0011025,CL:0000160,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown
CV001_KM10202575_AAACCTGAGCGTAGTG,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.216327,0.861171,1072,...,0.219769,AAACCTGAGCGTAGTG,3pv2_5pv1_5pv2,EFO:0011025,CL:0000160,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown
CV001_KM10202575_AAACCTGAGCTCCTTC,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.066090,0.915221,1637,...,0.175730,AAACCTGAGCTCCTTC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000646,HsapDv:0000093,PATO:0000461,True,NCBITaxon:9606,unknown
CV001_KM10202575_AAACCTGAGCTGGAAC,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.187805,0.899867,1742,...,0.051336,AAACCTGAGCTGGAAC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000646,HsapDv:0000093,PATO:0000461,True,NCBITaxon:9606,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CV001_KM9505916_TTTGTCAGTTCCATGA,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.152639,0.937426,2169,...,0.282540,TTTGTCAGTTCCATGA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000160,HsapDv:0000093,MONDO:0100096,True,NCBITaxon:9606,unknown
CV001_KM9505916_TTTGTCATCACGATGT,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.220339,0.922660,1480,...,0.034195,TTTGTCATCACGATGT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000151,HsapDv:0000087,MONDO:0100096,True,NCBITaxon:9606,unknown
CV001_KM9505916_TTTGTCATCCCATTAT,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.129771,0.937426,1571,...,0.082636,TTTGTCATCCCATTAT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000646,HsapDv:0000087,MONDO:0100096,True,NCBITaxon:9606,unknown
CV001_KM9505916_TTTGTCATCCGGGTGT,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.065327,0.985310,2861,...,0.012546,TTTGTCATCCGGGTGT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000160,HsapDv:0000080,MONDO:0100096,True,NCBITaxon:9606,unknown


#### **sex_ontology_term_id**

In [133]:
# identify the column in adata.obs which corresponds to sex

In [134]:
adata.obs.columns

Index(['sample', 'treatment', 'time', 'pool', 'sample_name', 'kit_version',
       'spike-in_primer', 'scrublet_score', 'doublet_bh_pval',
       'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt',
       'total_counts_ribo', 'pct_counts_ribo', 'souporcell_cluster',
       'donor_id', 'age_group', 'gender', 'S_score', 'G2M_score', 'phase',
       'bbknn_batch', 'cell_annotation', 'age_treatment', 'score_IFN_alpha',
       'score_IFN_gamma', 'score_BASALOID', 'barcodes', 'assay',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'disease_ontology_term_id',
       'is_primary_data', 'organism_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id'],
      dtype='object')

In [135]:
adata.obs

Unnamed: 0,sample,treatment,time,pool,sample_name,kit_version,spike-in_primer,scrublet_score,doublet_bh_pval,n_genes_by_counts,...,score_BASALOID,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id
CV001_KM10202575_AAACCTGAGCAGGTCA,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.074489,0.899867,2007,...,0.184872,AAACCTGAGCAGGTCA,3pv2_5pv1_5pv2,EFO:0011025,CL:0000160,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown
CV001_KM10202575_AAACCTGAGCCAGTTT,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.182796,0.899867,1936,...,0.107360,AAACCTGAGCCAGTTT,3pv2_5pv1_5pv2,EFO:0011025,CL:0000160,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown
CV001_KM10202575_AAACCTGAGCGTAGTG,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.216327,0.861171,1072,...,0.219769,AAACCTGAGCGTAGTG,3pv2_5pv1_5pv2,EFO:0011025,CL:0000160,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown
CV001_KM10202575_AAACCTGAGCTCCTTC,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.066090,0.915221,1637,...,0.175730,AAACCTGAGCTCCTTC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000646,HsapDv:0000093,PATO:0000461,True,NCBITaxon:9606,unknown
CV001_KM10202575_AAACCTGAGCTGGAAC,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.187805,0.899867,1742,...,0.051336,AAACCTGAGCTGGAAC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000646,HsapDv:0000093,PATO:0000461,True,NCBITaxon:9606,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CV001_KM9505916_TTTGTCAGTTCCATGA,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.152639,0.937426,2169,...,0.282540,TTTGTCAGTTCCATGA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000160,HsapDv:0000093,MONDO:0100096,True,NCBITaxon:9606,unknown
CV001_KM9505916_TTTGTCATCACGATGT,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.220339,0.922660,1480,...,0.034195,TTTGTCATCACGATGT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000151,HsapDv:0000087,MONDO:0100096,True,NCBITaxon:9606,unknown
CV001_KM9505916_TTTGTCATCCCATTAT,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.129771,0.937426,1571,...,0.082636,TTTGTCATCCCATTAT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000646,HsapDv:0000087,MONDO:0100096,True,NCBITaxon:9606,unknown
CV001_KM9505916_TTTGTCATCCGGGTGT,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.065327,0.985310,2861,...,0.012546,TTTGTCATCCGGGTGT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000160,HsapDv:0000080,MONDO:0100096,True,NCBITaxon:9606,unknown


In [136]:
# list the unique values 

In [137]:
list(adata.obs['gender'].unique())

['Female', 'Male']

In [138]:
# create a dictionary of sex and sex ontology term id

In [139]:
mapping= {'Female': 'PATO:0000383', 'Male': 'PATO:0000384'}

In [140]:
# add sex_ontology_term_id column

In [141]:
adata.obs['sex_ontology_term_id'] = adata.obs['gender'].map(mapping)

In [142]:
# change data type

In [143]:
adata.obs['sex_ontology_term_id'] = adata.obs['sex_ontology_term_id'].astype('category')

In [144]:
adata.obs

Unnamed: 0,sample,treatment,time,pool,sample_name,kit_version,spike-in_primer,scrublet_score,doublet_bh_pval,n_genes_by_counts,...,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id
CV001_KM10202575_AAACCTGAGCAGGTCA,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.074489,0.899867,2007,...,AAACCTGAGCAGGTCA,3pv2_5pv1_5pv2,EFO:0011025,CL:0000160,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383
CV001_KM10202575_AAACCTGAGCCAGTTT,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.182796,0.899867,1936,...,AAACCTGAGCCAGTTT,3pv2_5pv1_5pv2,EFO:0011025,CL:0000160,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383
CV001_KM10202575_AAACCTGAGCGTAGTG,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.216327,0.861171,1072,...,AAACCTGAGCGTAGTG,3pv2_5pv1_5pv2,EFO:0011025,CL:0000160,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383
CV001_KM10202575_AAACCTGAGCTCCTTC,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.066090,0.915221,1637,...,AAACCTGAGCTCCTTC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000646,HsapDv:0000093,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383
CV001_KM10202575_AAACCTGAGCTGGAAC,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.187805,0.899867,1742,...,AAACCTGAGCTGGAAC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000646,HsapDv:0000093,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CV001_KM9505916_TTTGTCAGTTCCATGA,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.152639,0.937426,2169,...,TTTGTCAGTTCCATGA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000160,HsapDv:0000093,MONDO:0100096,True,NCBITaxon:9606,unknown,PATO:0000384
CV001_KM9505916_TTTGTCATCACGATGT,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.220339,0.922660,1480,...,TTTGTCATCACGATGT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000151,HsapDv:0000087,MONDO:0100096,True,NCBITaxon:9606,unknown,PATO:0000383
CV001_KM9505916_TTTGTCATCCCATTAT,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.129771,0.937426,1571,...,TTTGTCATCCCATTAT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000646,HsapDv:0000087,MONDO:0100096,True,NCBITaxon:9606,unknown,PATO:0000383
CV001_KM9505916_TTTGTCATCCGGGTGT,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.065327,0.985310,2861,...,TTTGTCATCCGGGTGT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000160,HsapDv:0000080,MONDO:0100096,True,NCBITaxon:9606,unknown,PATO:0000384


#### **suspension_type**

In [145]:
adata.obs

Unnamed: 0,sample,treatment,time,pool,sample_name,kit_version,spike-in_primer,scrublet_score,doublet_bh_pval,n_genes_by_counts,...,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id
CV001_KM10202575_AAACCTGAGCAGGTCA,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.074489,0.899867,2007,...,AAACCTGAGCAGGTCA,3pv2_5pv1_5pv2,EFO:0011025,CL:0000160,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383
CV001_KM10202575_AAACCTGAGCCAGTTT,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.182796,0.899867,1936,...,AAACCTGAGCCAGTTT,3pv2_5pv1_5pv2,EFO:0011025,CL:0000160,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383
CV001_KM10202575_AAACCTGAGCGTAGTG,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.216327,0.861171,1072,...,AAACCTGAGCGTAGTG,3pv2_5pv1_5pv2,EFO:0011025,CL:0000160,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383
CV001_KM10202575_AAACCTGAGCTCCTTC,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.066090,0.915221,1637,...,AAACCTGAGCTCCTTC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000646,HsapDv:0000093,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383
CV001_KM10202575_AAACCTGAGCTGGAAC,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.187805,0.899867,1742,...,AAACCTGAGCTGGAAC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000646,HsapDv:0000093,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CV001_KM9505916_TTTGTCAGTTCCATGA,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.152639,0.937426,2169,...,TTTGTCAGTTCCATGA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000160,HsapDv:0000093,MONDO:0100096,True,NCBITaxon:9606,unknown,PATO:0000384
CV001_KM9505916_TTTGTCATCACGATGT,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.220339,0.922660,1480,...,TTTGTCATCACGATGT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000151,HsapDv:0000087,MONDO:0100096,True,NCBITaxon:9606,unknown,PATO:0000383
CV001_KM9505916_TTTGTCATCCCATTAT,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.129771,0.937426,1571,...,TTTGTCATCCCATTAT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000646,HsapDv:0000087,MONDO:0100096,True,NCBITaxon:9606,unknown,PATO:0000383
CV001_KM9505916_TTTGTCATCCGGGTGT,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.065327,0.985310,2861,...,TTTGTCATCCGGGTGT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000160,HsapDv:0000080,MONDO:0100096,True,NCBITaxon:9606,unknown,PATO:0000384


In [146]:
adata.obs['suspension_type'] = ['cell'] * len(adata.obs)

In [147]:
# change data type of column

In [148]:
adata.obs['suspension_type'] = adata.obs['suspension_type'].astype('category')

In [149]:
# view obs

In [150]:
adata.obs

Unnamed: 0,sample,treatment,time,pool,sample_name,kit_version,spike-in_primer,scrublet_score,doublet_bh_pval,n_genes_by_counts,...,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type
CV001_KM10202575_AAACCTGAGCAGGTCA,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.074489,0.899867,2007,...,3pv2_5pv1_5pv2,EFO:0011025,CL:0000160,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell
CV001_KM10202575_AAACCTGAGCCAGTTT,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.182796,0.899867,1936,...,3pv2_5pv1_5pv2,EFO:0011025,CL:0000160,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell
CV001_KM10202575_AAACCTGAGCGTAGTG,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.216327,0.861171,1072,...,3pv2_5pv1_5pv2,EFO:0011025,CL:0000160,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell
CV001_KM10202575_AAACCTGAGCTCCTTC,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.066090,0.915221,1637,...,3pv2_5pv1_5pv2,EFO:0011025,CL:0000646,HsapDv:0000093,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell
CV001_KM10202575_AAACCTGAGCTGGAAC,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.187805,0.899867,1742,...,3pv2_5pv1_5pv2,EFO:0011025,CL:0000646,HsapDv:0000093,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CV001_KM9505916_TTTGTCAGTTCCATGA,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.152639,0.937426,2169,...,3pv2_5pv1_5pv2,EFO:0009900,CL:0000160,HsapDv:0000093,MONDO:0100096,True,NCBITaxon:9606,unknown,PATO:0000384,cell
CV001_KM9505916_TTTGTCATCACGATGT,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.220339,0.922660,1480,...,3pv2_5pv1_5pv2,EFO:0009900,CL:0000151,HsapDv:0000087,MONDO:0100096,True,NCBITaxon:9606,unknown,PATO:0000383,cell
CV001_KM9505916_TTTGTCATCCCATTAT,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.129771,0.937426,1571,...,3pv2_5pv1_5pv2,EFO:0009900,CL:0000646,HsapDv:0000087,MONDO:0100096,True,NCBITaxon:9606,unknown,PATO:0000383,cell
CV001_KM9505916_TTTGTCATCCGGGTGT,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.065327,0.985310,2861,...,3pv2_5pv1_5pv2,EFO:0009900,CL:0000160,HsapDv:0000080,MONDO:0100096,True,NCBITaxon:9606,unknown,PATO:0000384,cell


#### **tissue_type**

In [151]:
adata.obs['tissue_type'] = ['organoid'] * len(adata.obs)

In [152]:
adata.obs['tissue_type'] = adata.obs['tissue_type'].astype('category')

#### **tissue_ontology_term_id**

In [153]:
# identify the column in adata.obs which corresponds to tissue

In [154]:
adata.obs.columns

Index(['sample', 'treatment', 'time', 'pool', 'sample_name', 'kit_version',
       'spike-in_primer', 'scrublet_score', 'doublet_bh_pval',
       'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt',
       'total_counts_ribo', 'pct_counts_ribo', 'souporcell_cluster',
       'donor_id', 'age_group', 'gender', 'S_score', 'G2M_score', 'phase',
       'bbknn_batch', 'cell_annotation', 'age_treatment', 'score_IFN_alpha',
       'score_IFN_gamma', 'score_BASALOID', 'barcodes', 'assay',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'disease_ontology_term_id',
       'is_primary_data', 'organism_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id',
       'suspension_type', 'tissue_type'],
      dtype='object')

In [155]:
# add 'tissue_ontology_term_id' column

In [156]:
adata.obs['tissue_ontology_term_id'] = ['UBERON:0005384'] * len(adata.obs)

In [157]:
# change data type of column

In [158]:
adata.obs['tissue_ontology_term_id'] = adata.obs['tissue_ontology_term_id'].astype('category')

In [159]:
#list the unique values in 'tissue_ontology_term_id' column

In [160]:
list(adata.obs['tissue_ontology_term_id'].unique())

['UBERON:0005384']

In [161]:
# view obs

In [162]:
adata.obs

Unnamed: 0,sample,treatment,time,pool,sample_name,kit_version,spike-in_primer,scrublet_score,doublet_bh_pval,n_genes_by_counts,...,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_type,tissue_ontology_term_id
CV001_KM10202575_AAACCTGAGCAGGTCA,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.074489,0.899867,2007,...,CL:0000160,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,organoid,UBERON:0005384
CV001_KM10202575_AAACCTGAGCCAGTTT,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.182796,0.899867,1936,...,CL:0000160,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,organoid,UBERON:0005384
CV001_KM10202575_AAACCTGAGCGTAGTG,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.216327,0.861171,1072,...,CL:0000160,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,organoid,UBERON:0005384
CV001_KM10202575_AAACCTGAGCTCCTTC,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.066090,0.915221,1637,...,CL:0000646,HsapDv:0000093,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,organoid,UBERON:0005384
CV001_KM10202575_AAACCTGAGCTGGAAC,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.187805,0.899867,1742,...,CL:0000646,HsapDv:0000093,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,organoid,UBERON:0005384
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CV001_KM9505916_TTTGTCAGTTCCATGA,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.152639,0.937426,2169,...,CL:0000160,HsapDv:0000093,MONDO:0100096,True,NCBITaxon:9606,unknown,PATO:0000384,cell,organoid,UBERON:0005384
CV001_KM9505916_TTTGTCATCACGATGT,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.220339,0.922660,1480,...,CL:0000151,HsapDv:0000087,MONDO:0100096,True,NCBITaxon:9606,unknown,PATO:0000383,cell,organoid,UBERON:0005384
CV001_KM9505916_TTTGTCATCCCATTAT,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.129771,0.937426,1571,...,CL:0000646,HsapDv:0000087,MONDO:0100096,True,NCBITaxon:9606,unknown,PATO:0000383,cell,organoid,UBERON:0005384
CV001_KM9505916_TTTGTCATCCGGGTGT,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.065327,0.985310,2861,...,CL:0000160,HsapDv:0000080,MONDO:0100096,True,NCBITaxon:9606,unknown,PATO:0000384,cell,organoid,UBERON:0005384


In [163]:
adata.obs.columns

Index(['sample', 'treatment', 'time', 'pool', 'sample_name', 'kit_version',
       'spike-in_primer', 'scrublet_score', 'doublet_bh_pval',
       'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt',
       'total_counts_ribo', 'pct_counts_ribo', 'souporcell_cluster',
       'donor_id', 'age_group', 'gender', 'S_score', 'G2M_score', 'phase',
       'bbknn_batch', 'cell_annotation', 'age_treatment', 'score_IFN_alpha',
       'score_IFN_gamma', 'score_BASALOID', 'barcodes', 'assay',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'disease_ontology_term_id',
       'is_primary_data', 'organism_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id',
       'suspension_type', 'tissue_type', 'tissue_ontology_term_id'],
      dtype='object')

#### **obsm (Embeddings)**

In [164]:
# view obsm

In [165]:
# check whether all columns are prefixed with X

In [166]:
adata.obsm

AxisArrays with keys: X_pca, X_umap

#### **uns (Dataset Metadata)**

In [167]:
# View

In [168]:
adata.uns

OverloadedDict, wrapping:
	{'age_group_colors': array(['#add8e6', '#d2b48c', '#8fbc8f'], dtype=object), 'age_treatment_colors': array(['#4c72b0', '#dd8452', '#55a868', '#c44e52', '#8172b3', '#937860'],
      dtype=object), 'cell_annotation_colors': array(['#db7093', '#b52b37', '#90728f', '#f47942', '#ffae34', '#e15759',
       '#dab6af', '#f47942', '#20b2aa', '#ff9888', '#66cdaa', '#f7d42a',
       '#849db1', '#af894b', '#bb7693', '#b9a0b4', '#d7ce9f', '#6b6b6b',
       '#b9aa97', '#5f735d', '#3ca8bc', '#db7202', '#c3bc3f', '#4e9f50'],
      dtype=object), 'donor_id_colors': array(['#1f77b4', '#ff7f0e', '#279e68', '#d62728', '#aa40fc', '#8c564b',
       '#e377c2', '#b5bd61', '#17becf', '#aec7e8', '#ffbb78'],
      dtype=object), 'kit_version_colors': array(['#1f77b4', '#ff7f0e'], dtype=object), 'pool_colors': array(['#1f77b4', '#ff7f0e', '#279e68'], dtype=object), 'sample_colors': array(['#1CE6FF', '#FF34FF', '#FF4A46', '#008941', '#006FA6', '#A30059',
       '#7A4900', '#0000A6', '#63

In [169]:
adata.uns.keys

<bound method OverloadedDict.keys of OverloadedDict, wrapping:
	{'age_group_colors': array(['#add8e6', '#d2b48c', '#8fbc8f'], dtype=object), 'age_treatment_colors': array(['#4c72b0', '#dd8452', '#55a868', '#c44e52', '#8172b3', '#937860'],
      dtype=object), 'cell_annotation_colors': array(['#db7093', '#b52b37', '#90728f', '#f47942', '#ffae34', '#e15759',
       '#dab6af', '#f47942', '#20b2aa', '#ff9888', '#66cdaa', '#f7d42a',
       '#849db1', '#af894b', '#bb7693', '#b9a0b4', '#d7ce9f', '#6b6b6b',
       '#b9aa97', '#5f735d', '#3ca8bc', '#db7202', '#c3bc3f', '#4e9f50'],
      dtype=object), 'donor_id_colors': array(['#1f77b4', '#ff7f0e', '#279e68', '#d62728', '#aa40fc', '#8c564b',
       '#e377c2', '#b5bd61', '#17becf', '#aec7e8', '#ffbb78'],
      dtype=object), 'kit_version_colors': array(['#1f77b4', '#ff7f0e'], dtype=object), 'pool_colors': array(['#1f77b4', '#ff7f0e', '#279e68'], dtype=object), 'sample_colors': array(['#1CE6FF', '#FF34FF', '#FF4A46', '#008941', '#006FA6', '#A3005

In [170]:
# Give a title for the dataset

In [171]:
adata.uns['title'] = 'covid19_across_age_in_ALI_organoids_invitro'

In [172]:
# Set the default embedding

In [173]:
adata.uns['default_embedding'] = 'X_umap'

### **Final check**

In [174]:
# view anndata object

In [175]:
adata

AnnData object with n_obs × n_vars = 139598 × 33137
    obs: 'sample', 'treatment', 'time', 'pool', 'sample_name', 'kit_version', 'spike-in_primer', 'scrublet_score', 'doublet_bh_pval', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'souporcell_cluster', 'donor_id', 'age_group', 'gender', 'S_score', 'G2M_score', 'phase', 'bbknn_batch', 'cell_annotation', 'age_treatment', 'score_IFN_alpha', 'score_IFN_gamma', 'score_BASALOID', 'barcodes', 'assay', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id', 'suspension_type', 'tissue_type', 'tissue_ontology_term_id'
    var: 'feature_is_filtered', 'n_counts', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'ribo'
    uns: 'age_group_colors', 'age_treatment_colors'

In [176]:
# view obs and var data types

In [177]:
adata.obs.dtypes

sample                                      category
treatment                                   category
time                                        category
pool                                        category
sample_name                                 category
kit_version                                 category
spike-in_primer                             category
scrublet_score                               float32
doublet_bh_pval                              float32
n_genes_by_counts                              int32
total_counts                                 float32
total_counts_mt                              float32
pct_counts_mt                                float32
total_counts_ribo                            float32
pct_counts_ribo                              float32
souporcell_cluster                          category
donor_id                                    category
age_group                                   category
gender                                      ca

In [178]:
dty = pd.DataFrame(adata.var.dtypes, columns = ['dtype'])
for c in dty[dty['dtype'] == 'float64'].index.values:
    adata.var[c] = adata.var[c].astype('float32')
    print(f"changed {c} from float64 to float32")
for c in dty[dty['dtype'] == 'int64'].index.values:
    adata.var[c] = adata.var[c].astype('int32') 
    print(f"changed {c} from int64 to int32")

changed n_cells_by_counts from float64 to float32


In [179]:
dty = pd.DataFrame(adata.obs.dtypes, columns = ['dtype'])
for c in dty[dty['dtype'] == 'float64'].index.values:
    adata.obs[c] = adata.obs[c].astype('float32')
    print(f"changed {c} from float64 to float32")
for c in dty[dty['dtype'] == 'int64'].index.values:
    adata.obs[c] = adata.obs[c].astype('int32') 
    print(f"changed {c} from int64 to int32")
for c in dty[dty['dtype'] == 'object'].index.values:
    adata.obs[c] = adata.obs[c].astype('category') 
    print(f"changed {c} from object to category")

changed barcodes from object to category
changed assay from object to category


In [180]:
# view obs

In [181]:
adata.obs

Unnamed: 0,sample,treatment,time,pool,sample_name,kit_version,spike-in_primer,scrublet_score,doublet_bh_pval,n_genes_by_counts,...,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_type,tissue_ontology_term_id
CV001_KM10202575_AAACCTGAGCAGGTCA,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.074489,0.899867,2007,...,CL:0000160,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,organoid,UBERON:0005384
CV001_KM10202575_AAACCTGAGCCAGTTT,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.182796,0.899867,1936,...,CL:0000160,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,organoid,UBERON:0005384
CV001_KM10202575_AAACCTGAGCGTAGTG,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.216327,0.861171,1072,...,CL:0000160,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,organoid,UBERON:0005384
CV001_KM10202575_AAACCTGAGCTCCTTC,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.066090,0.915221,1637,...,CL:0000646,HsapDv:0000093,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,organoid,UBERON:0005384
CV001_KM10202575_AAACCTGAGCTGGAAC,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.187805,0.899867,1742,...,CL:0000646,HsapDv:0000093,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,organoid,UBERON:0005384
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CV001_KM9505916_TTTGTCAGTTCCATGA,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.152639,0.937426,2169,...,CL:0000160,HsapDv:0000093,MONDO:0100096,True,NCBITaxon:9606,unknown,PATO:0000384,cell,organoid,UBERON:0005384
CV001_KM9505916_TTTGTCATCACGATGT,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.220339,0.922660,1480,...,CL:0000151,HsapDv:0000087,MONDO:0100096,True,NCBITaxon:9606,unknown,PATO:0000383,cell,organoid,UBERON:0005384
CV001_KM9505916_TTTGTCATCCCATTAT,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.129771,0.937426,1571,...,CL:0000646,HsapDv:0000087,MONDO:0100096,True,NCBITaxon:9606,unknown,PATO:0000383,cell,organoid,UBERON:0005384
CV001_KM9505916_TTTGTCATCCGGGTGT,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.065327,0.985310,2861,...,CL:0000160,HsapDv:0000080,MONDO:0100096,True,NCBITaxon:9606,unknown,PATO:0000384,cell,organoid,UBERON:0005384


In [182]:
adata.obs.columns

Index(['sample', 'treatment', 'time', 'pool', 'sample_name', 'kit_version',
       'spike-in_primer', 'scrublet_score', 'doublet_bh_pval',
       'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt',
       'total_counts_ribo', 'pct_counts_ribo', 'souporcell_cluster',
       'donor_id', 'age_group', 'gender', 'S_score', 'G2M_score', 'phase',
       'bbknn_batch', 'cell_annotation', 'age_treatment', 'score_IFN_alpha',
       'score_IFN_gamma', 'score_BASALOID', 'barcodes', 'assay',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'disease_ontology_term_id',
       'is_primary_data', 'organism_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id',
       'suspension_type', 'tissue_type', 'tissue_ontology_term_id'],
      dtype='object')

In [183]:
# delete unwanted columns in obs

In [184]:
del adata.obs['gender']
del adata.obs['barcodes']
del adata.obs['assay']

In [185]:
# view obs

In [186]:
adata.obs

Unnamed: 0,sample,treatment,time,pool,sample_name,kit_version,spike-in_primer,scrublet_score,doublet_bh_pval,n_genes_by_counts,...,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_type,tissue_ontology_term_id
CV001_KM10202575_AAACCTGAGCAGGTCA,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.074489,0.899867,2007,...,CL:0000160,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,organoid,UBERON:0005384
CV001_KM10202575_AAACCTGAGCCAGTTT,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.182796,0.899867,1936,...,CL:0000160,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,organoid,UBERON:0005384
CV001_KM10202575_AAACCTGAGCGTAGTG,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.216327,0.861171,1072,...,CL:0000160,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,organoid,UBERON:0005384
CV001_KM10202575_AAACCTGAGCTCCTTC,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.066090,0.915221,1637,...,CL:0000646,HsapDv:0000093,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,organoid,UBERON:0005384
CV001_KM10202575_AAACCTGAGCTGGAAC,CV001_KM10202575,mock,4h,ALI2,ALI2_mock_4h_v1.1,v1.1,spike-in_LOW,0.187805,0.899867,1742,...,CL:0000646,HsapDv:0000093,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,organoid,UBERON:0005384
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CV001_KM9505916_TTTGTCAGTTCCATGA,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.152639,0.937426,2169,...,CL:0000160,HsapDv:0000093,MONDO:0100096,True,NCBITaxon:9606,unknown,PATO:0000384,cell,organoid,UBERON:0005384
CV001_KM9505916_TTTGTCATCACGATGT,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.220339,0.922660,1480,...,CL:0000151,HsapDv:0000087,MONDO:0100096,True,NCBITaxon:9606,unknown,PATO:0000383,cell,organoid,UBERON:0005384
CV001_KM9505916_TTTGTCATCCCATTAT,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.129771,0.937426,1571,...,CL:0000646,HsapDv:0000087,MONDO:0100096,True,NCBITaxon:9606,unknown,PATO:0000383,cell,organoid,UBERON:0005384
CV001_KM9505916_TTTGTCATCCGGGTGT,CV001_KM9505916,SARS,72h,ALI3,ALI3_SARS_72h_V2,v2,spike-in_HIGH,0.065327,0.985310,2861,...,CL:0000160,HsapDv:0000080,MONDO:0100096,True,NCBITaxon:9606,unknown,PATO:0000384,cell,organoid,UBERON:0005384


In [187]:
# view var

In [188]:
adata.var

Unnamed: 0,feature_is_filtered,n_counts,mt,n_cells_by_counts,mean_counts,pct_dropout_by_counts,total_counts,ribo
ENSG00000243485,False,36.362267,False,38.0,0.000147,99.984650,36.362267,False
ENSG00000237613,False,4.000000,False,4.0,0.000016,99.998383,4.000000,False
ENSG00000186092,True,,,,,,,
ENSG00000238009,False,175.012375,False,180.0,0.000707,99.927277,175.012375,False
ENSG00000239945,False,15.000000,False,15.0,0.000061,99.993942,15.000000,False
...,...,...,...,...,...,...,...,...
ENSG00000277856,True,,,,,,,
ENSG00000275063,True,,,,,,,
ENSG00000271254,False,15893.731445,False,15684.0,0.064215,93.663261,15893.731445,False
ENSG00000277475,False,5.953731,False,6.0,0.000024,99.997574,5.953731,False


In [189]:
araw.var

ENSG00000243485
ENSG00000237613
ENSG00000186092
ENSG00000238009
ENSG00000239945
...
ENSG00000277856
ENSG00000275063
ENSG00000271254
ENSG00000277475
ENSG00000268674


In [190]:
#view uns

In [191]:
adata.uns

OverloadedDict, wrapping:
	{'age_group_colors': array(['#add8e6', '#d2b48c', '#8fbc8f'], dtype=object), 'age_treatment_colors': array(['#4c72b0', '#dd8452', '#55a868', '#c44e52', '#8172b3', '#937860'],
      dtype=object), 'cell_annotation_colors': array(['#db7093', '#b52b37', '#90728f', '#f47942', '#ffae34', '#e15759',
       '#dab6af', '#f47942', '#20b2aa', '#ff9888', '#66cdaa', '#f7d42a',
       '#849db1', '#af894b', '#bb7693', '#b9a0b4', '#d7ce9f', '#6b6b6b',
       '#b9aa97', '#5f735d', '#3ca8bc', '#db7202', '#c3bc3f', '#4e9f50'],
      dtype=object), 'donor_id_colors': array(['#1f77b4', '#ff7f0e', '#279e68', '#d62728', '#aa40fc', '#8c564b',
       '#e377c2', '#b5bd61', '#17becf', '#aec7e8', '#ffbb78'],
      dtype=object), 'kit_version_colors': array(['#1f77b4', '#ff7f0e'], dtype=object), 'pool_colors': array(['#1f77b4', '#ff7f0e', '#279e68'], dtype=object), 'sample_colors': array(['#1CE6FF', '#FF34FF', '#FF4A46', '#008941', '#006FA6', '#A30059',
       '#7A4900', '#0000A6', '#63

In [192]:
list(adata.uns.keys())

['age_group_colors',
 'age_treatment_colors',
 'cell_annotation_colors',
 'donor_id_colors',
 'kit_version_colors',
 'pool_colors',
 'sample_colors',
 'spike-in_primer_colors',
 'treatment_colors',
 'title',
 'default_embedding']

In [193]:
adata.obs.columns

Index(['sample', 'treatment', 'time', 'pool', 'sample_name', 'kit_version',
       'spike-in_primer', 'scrublet_score', 'doublet_bh_pval',
       'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt',
       'total_counts_ribo', 'pct_counts_ribo', 'souporcell_cluster',
       'donor_id', 'age_group', 'S_score', 'G2M_score', 'phase', 'bbknn_batch',
       'cell_annotation', 'age_treatment', 'score_IFN_alpha',
       'score_IFN_gamma', 'score_BASALOID', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'suspension_type', 'tissue_type',
       'tissue_ontology_term_id'],
      dtype='object')

In [194]:
# Remove unwanted columns in uns

In [195]:
#check the format of expression matrix

In [196]:
adata.X

<139598x33137 sparse matrix of type '<class 'numpy.float32'>'
	with 273113459 stored elements in Compressed Sparse Row format>

In [197]:
araw.X

<139598x33137 sparse matrix of type '<class 'numpy.float32'>'
	with 277464811 stored elements in Compressed Sparse Row format>

In [198]:
#Copy raw counts to adata.raw

In [199]:
adata.obsm

AxisArrays with keys: X_pca, X_umap

In [200]:
adata.raw = araw

In [201]:
adata.var

Unnamed: 0,feature_is_filtered,n_counts,mt,n_cells_by_counts,mean_counts,pct_dropout_by_counts,total_counts,ribo
ENSG00000243485,False,36.362267,False,38.0,0.000147,99.984650,36.362267,False
ENSG00000237613,False,4.000000,False,4.0,0.000016,99.998383,4.000000,False
ENSG00000186092,True,,,,,,,
ENSG00000238009,False,175.012375,False,180.0,0.000707,99.927277,175.012375,False
ENSG00000239945,False,15.000000,False,15.0,0.000061,99.993942,15.000000,False
...,...,...,...,...,...,...,...,...
ENSG00000277856,True,,,,,,,
ENSG00000275063,True,,,,,,,
ENSG00000271254,False,15893.731445,False,15684.0,0.064215,93.663261,15893.731445,False
ENSG00000277475,False,5.953731,False,6.0,0.000024,99.997574,5.953731,False


In [202]:
adata.raw.var

ENSG00000243485
ENSG00000237613
ENSG00000186092
ENSG00000238009
ENSG00000239945
...
ENSG00000277856
ENSG00000275063
ENSG00000271254
ENSG00000277475
ENSG00000268674


In [203]:
del adata.var['mt']
del adata.var['ribo']

In [204]:
adata.write('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/covid_cell_atlas_ALI_organoids//Final_objects/covid_ali_organoid_invitro.h5ad', compression = 'gzip')