### **Curating organoids.h5ad**

Article: Early human lung immune cell development and its role in epithelial cell fate

DOI: 10.1126/sciimmunol.adf99

Data Source : https://fetal-lung-immune.cellgeni.sanger.ac.uk

##### **Mount farm**

mount-farm

##### **Packages required for curation**

In [1]:
#Import all packages required for curation

In [2]:
import numpy as np
import pandas as pd
import scanpy as sc
import scipy
from tqdm import tqdm
from scipy import sparse
from scipy.sparse import csr_matrix
import anndata as ad
import os
import subprocess
import math
import re
from collections import Counter

### **Curation Schema**

##### **X (Matrix Layers)**

##### **AnnData object**

In [3]:
# Load the AnnData object

In [4]:
adata = sc.read_h5ad('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/early_human_lung_immune/Data/organoids.h5ad')

In [5]:
# View the AnnData object

In [6]:
adata

AnnData object with n_obs × n_vars = 8556 × 20591
    obs: 'batch', 'percent_mito', 'n_counts', 'n_genes', 'doublet_scores', 'bh_pval', 'treatment', 'S_score', 'G2M_score', 'phase', 'leiden', 'leiden_R', 'cellstate', 'He2022Predicted_celltype'
    var: 'gene_ids', 'n_cells', 'highly_variableHCA_LNG13237417', 'highly_variableHCA_LNG13237416', 'highly_variableHCA_LNG13237415', 'highly_variable_n', 'highly_variable', 'Deep_HCA_LNG13237417', 'Deep_HCA_LNG13237416', 'Deep_HCA_LNG13237415', 'Deep_n'
    uns: 'batch_colors', 'cellstate_colors', 'dendrogram_leiden_R', 'leiden', 'leiden_R_colors', 'leiden_colors', 'log1p', 'majority_voting_colors', 'neighbors', 'new_celltype_colors', 'pca', 'phase_colors', 'predicted_labels_colors', 'rank_genes_groups', 'treatment_colors', 'umap'
    obsm: 'X_original_umap', 'X_pca', 'X_umap', 'rep'
    obsp: 'connectivities', 'distances'

##### **X - expression matrix**

In [7]:
# View the expression matrix of the anndata object.

In [8]:
adata.X

<8556x20591 sparse matrix of type '<class 'numpy.float32'>'
	with 39059079 stored elements in Compressed Sparse Row format>

In [9]:
# Print the matrix to check whether they are normalized counts or raw counts. if the matrix has floating numbers,they are normalized counts.if they are integers, they are raw counts.

In [10]:
print(adata.X)

  (0, 10)	0.84663177
  (0, 18)	0.2873907
  (0, 20)	0.2873907
  (0, 26)	0.5103594
  (0, 30)	0.5103594
  (0, 33)	0.2873907
  (0, 34)	0.2873907
  (0, 35)	0.98010063
  (0, 38)	0.2873907
  (0, 40)	0.98010063
  (0, 41)	0.5103594
  (0, 42)	0.2873907
  (0, 48)	0.2873907
  (0, 51)	0.2873907
  (0, 53)	0.84663177
  (0, 57)	0.2873907
  (0, 59)	0.2873907
  (0, 61)	0.5103594
  (0, 62)	0.5103594
  (0, 65)	1.2984351
  (0, 72)	0.5103594
  (0, 75)	0.84663177
  (0, 78)	0.69256437
  (0, 79)	0.5103594
  (0, 81)	0.2873907
  :	:
  (8555, 20525)	1.1577468
  (8555, 20526)	0.10874838
  (8555, 20529)	0.4538799
  (8555, 20531)	0.2961303
  (8555, 20535)	0.52430874
  (8555, 20536)	0.3781125
  (8555, 20545)	0.20682193
  (8555, 20547)	0.86650425
  (8555, 20548)	0.5901021
  (8555, 20552)	0.4538799
  (8555, 20556)	0.20682193
  (8555, 20558)	0.76491785
  (8555, 20562)	0.76491785
  (8555, 20563)	0.10874838
  (8555, 20565)	0.20682193
  (8555, 20566)	0.2961303
  (8555, 20569)	0.20682193
  (8555, 20574)	0.20682193
  (8555, 

In [11]:
adata.layers.keys()

KeysView(Layers with keys: )

In [12]:
adata.obs

Unnamed: 0,batch,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,treatment,S_score,G2M_score,phase,leiden,leiden_R,cellstate,He2022Predicted_celltype
AAACCTGAGGTGCTTT-HCA_LNG13237415,HCA_LNG13237415,0.050155,30035.0,6351,0.122970,0.895578,control,0.356778,-0.078414,S,3,3,BPIFB1- SERPINA1+ mid tip,Mid stalk
AAACCTGCATTGGGCC-HCA_LNG13237415,HCA_LNG13237415,0.041156,32778.0,5353,0.223022,0.895578,control,-0.080427,-0.183847,G1,1,1,BPIFB1- CA9+ mid tip,Mid stalk
AAACCTGGTGACTACT-HCA_LNG13237415,HCA_LNG13237415,0.035533,39028.0,5603,0.061350,0.498832,control,-0.046427,-0.163270,G1,3,3,BPIFB1- SERPINA1+ mid tip,Late tip
AAACCTGTCGAGAGCA-HCA_LNG13237415,HCA_LNG13237415,0.037002,23449.0,5356,0.206780,0.131299,control,-0.044377,-0.190089,G1,17,170,Ionocyte,Mid stalk
AAACGGGAGCCCGAAA-HCA_LNG13237415,HCA_LNG13237415,0.021348,23241.0,4877,0.147368,0.874995,control,-0.076444,-0.200009,G1,4,4,BPIFB1- CA9+ mid tip,Mid stalk
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCACACAGGCCT-HCA_LNG13237417,HCA_LNG13237417,0.143310,2971.0,1941,0.042735,0.995378,IL-13,-0.087941,0.216716,G2M,14,140,COL4A1+ mid tip,Mid tip
TTTGTCACACATCCAA-HCA_LNG13237417,HCA_LNG13237417,0.029506,19965.0,4704,0.203931,0.829519,IL-13,0.105654,-0.138075,S,11,11,Cycling BPIFB1- mid tip 1,Mid stalk
TTTGTCACACGGCTAC-HCA_LNG13237417,HCA_LNG13237417,0.027215,20696.0,4642,0.312500,0.212251,IL-13,-0.032795,-0.188867,G1,0,0,SCGB1B2P+ BPIFB1- CA9- mid tip,Mid stalk
TTTGTCAGTCGCGAAA-HCA_LNG13237417,HCA_LNG13237417,0.056163,19074.0,5168,0.132143,0.995378,IL-13,0.337020,-0.080434,S,11,11,Cycling BPIFB1- mid tip 1,Mid stalk


In [13]:
adata.obs.columns

Index(['batch', 'percent_mito', 'n_counts', 'n_genes', 'doublet_scores',
       'bh_pval', 'treatment', 'S_score', 'G2M_score', 'phase', 'leiden',
       'leiden_R', 'cellstate', 'He2022Predicted_celltype'],
      dtype='object')

##### **Raw counts matrix**

In [14]:
araw = sc.read_h5ad('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/early_human_lung_immune/Data/Raw/MasahiroOrganoid_raw.h5ad')

In [15]:
araw.obs

Unnamed: 0,batch,percent_mito,n_counts,treatment
AAACCTGAGGTGCTTT-HCA_LNG13237415,HCA_LNG13237415,0.050155,31622.0,control
AAACCTGCATTGGGCC-HCA_LNG13237415,HCA_LNG13237415,0.041156,34187.0,control
AAACCTGGTGACTACT-HCA_LNG13237415,HCA_LNG13237415,0.035533,40469.0,control
AAACCTGTCGAGAGCA-HCA_LNG13237415,HCA_LNG13237415,0.037002,24350.0,control
AAACGGGAGCCCGAAA-HCA_LNG13237415,HCA_LNG13237415,0.021348,23749.0,control
...,...,...,...,...
TTTGTCACACATCCAA-HCA_LNG13237417,HCA_LNG13237417,0.029506,20572.0,IL-13
TTTGTCACACGGCTAC-HCA_LNG13237417,HCA_LNG13237417,0.027215,21275.0,IL-13
TTTGTCAGTCGCGAAA-HCA_LNG13237417,HCA_LNG13237417,0.056163,20209.0,IL-13
TTTGTCAGTCGGCTCA-HCA_LNG13237417,HCA_LNG13237417,0.023543,89157.0,IL-13


In [16]:
adata.obs

Unnamed: 0,batch,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,treatment,S_score,G2M_score,phase,leiden,leiden_R,cellstate,He2022Predicted_celltype
AAACCTGAGGTGCTTT-HCA_LNG13237415,HCA_LNG13237415,0.050155,30035.0,6351,0.122970,0.895578,control,0.356778,-0.078414,S,3,3,BPIFB1- SERPINA1+ mid tip,Mid stalk
AAACCTGCATTGGGCC-HCA_LNG13237415,HCA_LNG13237415,0.041156,32778.0,5353,0.223022,0.895578,control,-0.080427,-0.183847,G1,1,1,BPIFB1- CA9+ mid tip,Mid stalk
AAACCTGGTGACTACT-HCA_LNG13237415,HCA_LNG13237415,0.035533,39028.0,5603,0.061350,0.498832,control,-0.046427,-0.163270,G1,3,3,BPIFB1- SERPINA1+ mid tip,Late tip
AAACCTGTCGAGAGCA-HCA_LNG13237415,HCA_LNG13237415,0.037002,23449.0,5356,0.206780,0.131299,control,-0.044377,-0.190089,G1,17,170,Ionocyte,Mid stalk
AAACGGGAGCCCGAAA-HCA_LNG13237415,HCA_LNG13237415,0.021348,23241.0,4877,0.147368,0.874995,control,-0.076444,-0.200009,G1,4,4,BPIFB1- CA9+ mid tip,Mid stalk
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCACACAGGCCT-HCA_LNG13237417,HCA_LNG13237417,0.143310,2971.0,1941,0.042735,0.995378,IL-13,-0.087941,0.216716,G2M,14,140,COL4A1+ mid tip,Mid tip
TTTGTCACACATCCAA-HCA_LNG13237417,HCA_LNG13237417,0.029506,19965.0,4704,0.203931,0.829519,IL-13,0.105654,-0.138075,S,11,11,Cycling BPIFB1- mid tip 1,Mid stalk
TTTGTCACACGGCTAC-HCA_LNG13237417,HCA_LNG13237417,0.027215,20696.0,4642,0.312500,0.212251,IL-13,-0.032795,-0.188867,G1,0,0,SCGB1B2P+ BPIFB1- CA9- mid tip,Mid stalk
TTTGTCAGTCGCGAAA-HCA_LNG13237417,HCA_LNG13237417,0.056163,19074.0,5168,0.132143,0.995378,IL-13,0.337020,-0.080434,S,11,11,Cycling BPIFB1- mid tip 1,Mid stalk


In [17]:
common_barcodes =  set(adata.obs_names) & set(araw.obs_names)

In [18]:
araw_filtered = araw[araw.obs_names.isin(common_barcodes)].copy()


In [19]:
araw=araw_filtered

In [20]:
araw

AnnData object with n_obs × n_vars = 8556 × 33538
    obs: 'batch', 'percent_mito', 'n_counts', 'treatment'
    var: 'gene_ids'

In [21]:
print(araw.X)

  (0, 21)	4.0
  (0, 29)	1.0
  (0, 32)	1.0
  (0, 39)	2.0
  (0, 43)	2.0
  (0, 46)	1.0
  (0, 47)	1.0
  (0, 48)	5.0
  (0, 51)	1.0
  (0, 53)	5.0
  (0, 54)	2.0
  (0, 55)	1.0
  (0, 61)	1.0
  (0, 64)	1.0
  (0, 66)	4.0
  (0, 70)	1.0
  (0, 72)	1.0
  (0, 74)	2.0
  (0, 75)	2.0
  (0, 78)	8.0
  (0, 86)	2.0
  (0, 89)	4.0
  (0, 93)	3.0
  (0, 94)	2.0
  (0, 98)	1.0
  :	:
  (8555, 33479)	1.0
  (8555, 33482)	2.0
  (8555, 33484)	3.0
  (8555, 33487)	2.0
  (8555, 33492)	2.0
  (8555, 33493)	2.0
  (8555, 33495)	9.0
  (8555, 33496)	116.0
  (8555, 33497)	64.0
  (8555, 33498)	486.0
  (8555, 33499)	509.0
  (8555, 33500)	29.0
  (8555, 33501)	135.0
  (8555, 33502)	260.0
  (8555, 33503)	122.0
  (8555, 33504)	79.0
  (8555, 33505)	110.0
  (8555, 33506)	44.0
  (8555, 33507)	15.0
  (8555, 33508)	130.0
  (8555, 33523)	2.0
  (8555, 33527)	1.0
  (8555, 33528)	7.0
  (8555, 33532)	10.0
  (8555, 33535)	2.0


##### **Variables(var)**

In [22]:
# View the var of anndata and raw object

In [23]:
adata.var

Unnamed: 0,gene_ids,n_cells,highly_variableHCA_LNG13237417,highly_variableHCA_LNG13237416,highly_variableHCA_LNG13237415,highly_variable_n,highly_variable,Deep_HCA_LNG13237417,Deep_HCA_LNG13237416,Deep_HCA_LNG13237415,Deep_n
MIR1302-2HG,ENSG00000243485,42,False,False,False,0,False,False,False,False,0
AL627309.1,ENSG00000238009,26,False,False,False,0,False,False,False,False,0
AL669831.5,ENSG00000237491,998,True,True,True,3,True,False,False,False,0
FAM87B,ENSG00000177757,20,False,False,False,0,False,False,False,False,0
LINC00115,ENSG00000225880,385,False,True,False,1,False,False,False,False,0
...,...,...,...,...,...,...,...,...,...,...,...
AC007325.4,ENSG00000278817,2140,False,False,False,0,False,False,False,False,0
AC007325.2,ENSG00000277196,2366,True,True,False,2,True,False,True,False,1
AL354822.1,ENSG00000278384,34,False,False,False,0,False,False,False,False,0
AC004556.1,ENSG00000276345,5877,False,False,False,0,False,False,False,False,0


In [24]:
araw.var

Unnamed: 0,gene_ids
MIR1302-2HG,ENSG00000243485
FAM138A,ENSG00000237613
OR4F5,ENSG00000186092
AL627309.1,ENSG00000238009
AL627309.3,ENSG00000239945
...,...
AC233755.2,ENSG00000277856
AC233755.1,ENSG00000275063
AC240274.1,ENSG00000271254
AC213203.1,ENSG00000277475


In [25]:
adata.var_names = adata.var['gene_ids']
araw.var_names = araw.var['gene_ids']

In [26]:
# Load the approved genes file.

In [27]:
approved_genes = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/Endometrium_reference_integrated_atlas/genes_approved.csv')

In [28]:
#Create a dictionary from the approved genes file 

In [29]:
genedict = {key: 1 for key in list(approved_genes.feature_id)}

In [30]:
genedict

{'ERCC-00002': 1,
 'ERCC-00003': 1,
 'ERCC-00004': 1,
 'ERCC-00009': 1,
 'ERCC-00012': 1,
 'ERCC-00013': 1,
 'ERCC-00014': 1,
 'ERCC-00016': 1,
 'ERCC-00017': 1,
 'ERCC-00019': 1,
 'ERCC-00022': 1,
 'ERCC-00024': 1,
 'ERCC-00025': 1,
 'ERCC-00028': 1,
 'ERCC-00031': 1,
 'ERCC-00033': 1,
 'ERCC-00034': 1,
 'ERCC-00035': 1,
 'ERCC-00039': 1,
 'ERCC-00040': 1,
 'ERCC-00041': 1,
 'ERCC-00042': 1,
 'ERCC-00043': 1,
 'ERCC-00044': 1,
 'ERCC-00046': 1,
 'ERCC-00048': 1,
 'ERCC-00051': 1,
 'ERCC-00053': 1,
 'ERCC-00054': 1,
 'ERCC-00057': 1,
 'ERCC-00058': 1,
 'ERCC-00059': 1,
 'ERCC-00060': 1,
 'ERCC-00061': 1,
 'ERCC-00062': 1,
 'ERCC-00067': 1,
 'ERCC-00069': 1,
 'ERCC-00071': 1,
 'ERCC-00073': 1,
 'ERCC-00074': 1,
 'ERCC-00075': 1,
 'ERCC-00076': 1,
 'ERCC-00077': 1,
 'ERCC-00078': 1,
 'ERCC-00079': 1,
 'ERCC-00081': 1,
 'ERCC-00083': 1,
 'ERCC-00084': 1,
 'ERCC-00085': 1,
 'ERCC-00086': 1,
 'ERCC-00092': 1,
 'ERCC-00095': 1,
 'ERCC-00096': 1,
 'ERCC-00097': 1,
 'ERCC-00098': 1,
 'ERCC-000

In [31]:
len(genedict)

119799

In [32]:
#Filter out the genes which are not in the approved genes file.

In [33]:
var_to_keep_adata = [x for x in adata.var_names if (x in genedict)]
var_to_keep_araw = [x for x in araw.var_names if (x in genedict)]

In [34]:
len(var_to_keep_adata)

20402

In [35]:
len(var_to_keep_araw)

33137

In [36]:
adata.var

Unnamed: 0_level_0,gene_ids,n_cells,highly_variableHCA_LNG13237417,highly_variableHCA_LNG13237416,highly_variableHCA_LNG13237415,highly_variable_n,highly_variable,Deep_HCA_LNG13237417,Deep_HCA_LNG13237416,Deep_HCA_LNG13237415,Deep_n
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ENSG00000243485,ENSG00000243485,42,False,False,False,0,False,False,False,False,0
ENSG00000238009,ENSG00000238009,26,False,False,False,0,False,False,False,False,0
ENSG00000237491,ENSG00000237491,998,True,True,True,3,True,False,False,False,0
ENSG00000177757,ENSG00000177757,20,False,False,False,0,False,False,False,False,0
ENSG00000225880,ENSG00000225880,385,False,True,False,1,False,False,False,False,0
...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000278817,ENSG00000278817,2140,False,False,False,0,False,False,False,False,0
ENSG00000277196,ENSG00000277196,2366,True,True,False,2,True,False,True,False,1
ENSG00000278384,ENSG00000278384,34,False,False,False,0,False,False,False,False,0
ENSG00000276345,ENSG00000276345,5877,False,False,False,0,False,False,False,False,0


In [37]:
araw.var

Unnamed: 0_level_0,gene_ids
gene_ids,Unnamed: 1_level_1
ENSG00000243485,ENSG00000243485
ENSG00000237613,ENSG00000237613
ENSG00000186092,ENSG00000186092
ENSG00000238009,ENSG00000238009
ENSG00000239945,ENSG00000239945
...,...
ENSG00000277856,ENSG00000277856
ENSG00000275063,ENSG00000275063
ENSG00000271254,ENSG00000271254
ENSG00000277475,ENSG00000277475


In [38]:
# Modify the anndata object by filtering out the filtered genes.

In [39]:
adata = adata[:, adata.var.index.isin(var_to_keep_adata)]
araw = araw[:, araw.var.index.isin(var_to_keep_araw)]

In [40]:
def add_zero():
	global adata
	global araw
	if araw.shape[1] > adata.shape[1]:
		genes_add = [x for x in araw.var.index.to_list() if x not in adata.var.index.to_list()]
		new_matrix = sparse.csr_matrix((adata.X.data, adata.X.indices, adata.X.indptr), shape = araw.shape)
		all_genes = adata.var.index.to_list()
		all_genes.extend(genes_add)
		new_var = pd.DataFrame(index=all_genes)
		new_var = pd.merge(new_var, araw.var, left_index=True, right_index=True, how='left')
		new_var['feature_is_filtered'] = False
		new_var.loc[genes_add, 'feature_is_filtered'] = True
		new_adata = ad.AnnData(X=new_matrix, obs=adata.obs, var=new_var, uns=adata.uns, obsm=adata.obsm)
		if adata.layers:
			for layer in adata.layers:
				new_layer = sparse.csr_matrix((adata.layers[layer].data, adata.layers[layer].indices, adata.layers[layer].indptr), shape = araw.shape)
				new_adata.layers[layer] = new_layer
		new_adata = new_adata[:,araw.var.index.to_list()]
		new_adata.var = new_adata.var.merge(adata.var, left_index=True, right_index=True, how='left')
		adata = new_adata
	else:
		adata.var['feature_is_filtered'] = False


In [41]:
add_zero()

In [42]:
adata.var

Unnamed: 0,gene_ids_x,feature_is_filtered,gene_ids_y,n_cells,highly_variableHCA_LNG13237417,highly_variableHCA_LNG13237416,highly_variableHCA_LNG13237415,highly_variable_n,highly_variable,Deep_HCA_LNG13237417,Deep_HCA_LNG13237416,Deep_HCA_LNG13237415,Deep_n
ENSG00000243485,ENSG00000243485,False,ENSG00000243485,42.0,False,False,False,0.0,False,False,False,False,0.0
ENSG00000237613,ENSG00000237613,True,,,,,,,,,,,
ENSG00000186092,ENSG00000186092,True,,,,,,,,,,,
ENSG00000238009,ENSG00000238009,False,ENSG00000238009,26.0,False,False,False,0.0,False,False,False,False,0.0
ENSG00000239945,ENSG00000239945,True,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000277856,ENSG00000277856,True,,,,,,,,,,,
ENSG00000275063,ENSG00000275063,True,,,,,,,,,,,
ENSG00000271254,ENSG00000271254,False,ENSG00000271254,1522.0,True,True,True,3.0,True,False,False,False,0.0
ENSG00000277475,ENSG00000277475,True,,,,,,,,,,,


In [43]:
# View var

In [44]:
adata.var

Unnamed: 0,gene_ids_x,feature_is_filtered,gene_ids_y,n_cells,highly_variableHCA_LNG13237417,highly_variableHCA_LNG13237416,highly_variableHCA_LNG13237415,highly_variable_n,highly_variable,Deep_HCA_LNG13237417,Deep_HCA_LNG13237416,Deep_HCA_LNG13237415,Deep_n
ENSG00000243485,ENSG00000243485,False,ENSG00000243485,42.0,False,False,False,0.0,False,False,False,False,0.0
ENSG00000237613,ENSG00000237613,True,,,,,,,,,,,
ENSG00000186092,ENSG00000186092,True,,,,,,,,,,,
ENSG00000238009,ENSG00000238009,False,ENSG00000238009,26.0,False,False,False,0.0,False,False,False,False,0.0
ENSG00000239945,ENSG00000239945,True,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000277856,ENSG00000277856,True,,,,,,,,,,,
ENSG00000275063,ENSG00000275063,True,,,,,,,,,,,
ENSG00000271254,ENSG00000271254,False,ENSG00000271254,1522.0,True,True,True,3.0,True,False,False,False,0.0
ENSG00000277475,ENSG00000277475,True,,,,,,,,,,,


In [45]:
araw.var

Unnamed: 0_level_0,gene_ids
gene_ids,Unnamed: 1_level_1
ENSG00000243485,ENSG00000243485
ENSG00000237613,ENSG00000237613
ENSG00000186092,ENSG00000186092
ENSG00000238009,ENSG00000238009
ENSG00000239945,ENSG00000239945
...,...
ENSG00000277856,ENSG00000277856
ENSG00000275063,ENSG00000275063
ENSG00000271254,ENSG00000271254
ENSG00000277475,ENSG00000277475


feature is filtered

In [46]:
list(adata.var['feature_is_filtered'].unique())

[False, True]

In [47]:
false_count = (adata.var['feature_is_filtered']== False).sum()

In [48]:
false_count

20402

In [49]:
adata.var

Unnamed: 0,gene_ids_x,feature_is_filtered,gene_ids_y,n_cells,highly_variableHCA_LNG13237417,highly_variableHCA_LNG13237416,highly_variableHCA_LNG13237415,highly_variable_n,highly_variable,Deep_HCA_LNG13237417,Deep_HCA_LNG13237416,Deep_HCA_LNG13237415,Deep_n
ENSG00000243485,ENSG00000243485,False,ENSG00000243485,42.0,False,False,False,0.0,False,False,False,False,0.0
ENSG00000237613,ENSG00000237613,True,,,,,,,,,,,
ENSG00000186092,ENSG00000186092,True,,,,,,,,,,,
ENSG00000238009,ENSG00000238009,False,ENSG00000238009,26.0,False,False,False,0.0,False,False,False,False,0.0
ENSG00000239945,ENSG00000239945,True,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000277856,ENSG00000277856,True,,,,,,,,,,,
ENSG00000275063,ENSG00000275063,True,,,,,,,,,,,
ENSG00000271254,ENSG00000271254,False,ENSG00000271254,1522.0,True,True,True,3.0,True,False,False,False,0.0
ENSG00000277475,ENSG00000277475,True,,,,,,,,,,,


In [50]:
araw.var

Unnamed: 0_level_0,gene_ids
gene_ids,Unnamed: 1_level_1
ENSG00000243485,ENSG00000243485
ENSG00000237613,ENSG00000237613
ENSG00000186092,ENSG00000186092
ENSG00000238009,ENSG00000238009
ENSG00000239945,ENSG00000239945
...,...
ENSG00000277856,ENSG00000277856
ENSG00000275063,ENSG00000275063
ENSG00000271254,ENSG00000271254
ENSG00000277475,ENSG00000277475


#### **obs (Cell metadata)**

In [51]:
#view obs

In [52]:
adata.obs

Unnamed: 0,batch,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,treatment,S_score,G2M_score,phase,leiden,leiden_R,cellstate,He2022Predicted_celltype
AAACCTGAGGTGCTTT-HCA_LNG13237415,HCA_LNG13237415,0.050155,30035.0,6351,0.122970,0.895578,control,0.356778,-0.078414,S,3,3,BPIFB1- SERPINA1+ mid tip,Mid stalk
AAACCTGCATTGGGCC-HCA_LNG13237415,HCA_LNG13237415,0.041156,32778.0,5353,0.223022,0.895578,control,-0.080427,-0.183847,G1,1,1,BPIFB1- CA9+ mid tip,Mid stalk
AAACCTGGTGACTACT-HCA_LNG13237415,HCA_LNG13237415,0.035533,39028.0,5603,0.061350,0.498832,control,-0.046427,-0.163270,G1,3,3,BPIFB1- SERPINA1+ mid tip,Late tip
AAACCTGTCGAGAGCA-HCA_LNG13237415,HCA_LNG13237415,0.037002,23449.0,5356,0.206780,0.131299,control,-0.044377,-0.190089,G1,17,170,Ionocyte,Mid stalk
AAACGGGAGCCCGAAA-HCA_LNG13237415,HCA_LNG13237415,0.021348,23241.0,4877,0.147368,0.874995,control,-0.076444,-0.200009,G1,4,4,BPIFB1- CA9+ mid tip,Mid stalk
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCACACAGGCCT-HCA_LNG13237417,HCA_LNG13237417,0.143310,2971.0,1941,0.042735,0.995378,IL-13,-0.087941,0.216716,G2M,14,140,COL4A1+ mid tip,Mid tip
TTTGTCACACATCCAA-HCA_LNG13237417,HCA_LNG13237417,0.029506,19965.0,4704,0.203931,0.829519,IL-13,0.105654,-0.138075,S,11,11,Cycling BPIFB1- mid tip 1,Mid stalk
TTTGTCACACGGCTAC-HCA_LNG13237417,HCA_LNG13237417,0.027215,20696.0,4642,0.312500,0.212251,IL-13,-0.032795,-0.188867,G1,0,0,SCGB1B2P+ BPIFB1- CA9- mid tip,Mid stalk
TTTGTCAGTCGCGAAA-HCA_LNG13237417,HCA_LNG13237417,0.056163,19074.0,5168,0.132143,0.995378,IL-13,0.337020,-0.080434,S,11,11,Cycling BPIFB1- mid tip 1,Mid stalk


In [53]:
# view the column names in obs

In [54]:
adata.obs.columns

Index(['batch', 'percent_mito', 'n_counts', 'n_genes', 'doublet_scores',
       'bh_pval', 'treatment', 'S_score', 'G2M_score', 'phase', 'leiden',
       'leiden_R', 'cellstate', 'He2022Predicted_celltype'],
      dtype='object')

In [55]:
adata.obs

Unnamed: 0,batch,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,treatment,S_score,G2M_score,phase,leiden,leiden_R,cellstate,He2022Predicted_celltype
AAACCTGAGGTGCTTT-HCA_LNG13237415,HCA_LNG13237415,0.050155,30035.0,6351,0.122970,0.895578,control,0.356778,-0.078414,S,3,3,BPIFB1- SERPINA1+ mid tip,Mid stalk
AAACCTGCATTGGGCC-HCA_LNG13237415,HCA_LNG13237415,0.041156,32778.0,5353,0.223022,0.895578,control,-0.080427,-0.183847,G1,1,1,BPIFB1- CA9+ mid tip,Mid stalk
AAACCTGGTGACTACT-HCA_LNG13237415,HCA_LNG13237415,0.035533,39028.0,5603,0.061350,0.498832,control,-0.046427,-0.163270,G1,3,3,BPIFB1- SERPINA1+ mid tip,Late tip
AAACCTGTCGAGAGCA-HCA_LNG13237415,HCA_LNG13237415,0.037002,23449.0,5356,0.206780,0.131299,control,-0.044377,-0.190089,G1,17,170,Ionocyte,Mid stalk
AAACGGGAGCCCGAAA-HCA_LNG13237415,HCA_LNG13237415,0.021348,23241.0,4877,0.147368,0.874995,control,-0.076444,-0.200009,G1,4,4,BPIFB1- CA9+ mid tip,Mid stalk
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCACACAGGCCT-HCA_LNG13237417,HCA_LNG13237417,0.143310,2971.0,1941,0.042735,0.995378,IL-13,-0.087941,0.216716,G2M,14,140,COL4A1+ mid tip,Mid tip
TTTGTCACACATCCAA-HCA_LNG13237417,HCA_LNG13237417,0.029506,19965.0,4704,0.203931,0.829519,IL-13,0.105654,-0.138075,S,11,11,Cycling BPIFB1- mid tip 1,Mid stalk
TTTGTCACACGGCTAC-HCA_LNG13237417,HCA_LNG13237417,0.027215,20696.0,4642,0.312500,0.212251,IL-13,-0.032795,-0.188867,G1,0,0,SCGB1B2P+ BPIFB1- CA9- mid tip,Mid stalk
TTTGTCAGTCGCGAAA-HCA_LNG13237417,HCA_LNG13237417,0.056163,19074.0,5168,0.132143,0.995378,IL-13,0.337020,-0.080434,S,11,11,Cycling BPIFB1- mid tip 1,Mid stalk


#### **assay_ontology_term_id**

In [56]:
adata.obs['barcodes'] = adata.obs_names

In [57]:
adata.obs['barcodes'] = adata.obs['barcodes'].str.extract(r'([ATGC]{10,16})', expand=False)

In [58]:
assay_info = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/covid_cell_atlas_ALI_organoids/10X_barcode_table_assay.csv')

In [59]:
mapping = dict(zip(assay_info['barcode'], assay_info['assay']))

In [60]:
adata.obs['assay'] = adata.obs['barcodes'].map(mapping)

In [61]:
list(adata.obs['assay'].unique())

['3pv2_5pv1_5pv2', '3pv2_5pv1_5pv2+3pv3', '3pv2_5pv1_5pv2+multiome']

In [62]:
mapping= {'3pv2_5pv1_5pv2':'EFO:0030004', '3pv2_5pv1_5pv2,3pv3':'EFO:0009922', '3pv2_5pv1_5pv2,multiome':'EFO:0030004'}

In [63]:
# Convert 'assay' column values to strings
adata.obs['assay'] = adata.obs['assay'].astype(str)

In [64]:
adata.obs['assay_ontology_term_id'] = ['EFO:0030004']* len(adata.obs)

In [65]:
adata.obs['assay_ontology_term_id'] = adata.obs['assay_ontology_term_id'].astype('category')

In [66]:
# view adata.obs

In [67]:
adata.obs

Unnamed: 0,batch,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,treatment,S_score,G2M_score,phase,leiden,leiden_R,cellstate,He2022Predicted_celltype,barcodes,assay,assay_ontology_term_id
AAACCTGAGGTGCTTT-HCA_LNG13237415,HCA_LNG13237415,0.050155,30035.0,6351,0.122970,0.895578,control,0.356778,-0.078414,S,3,3,BPIFB1- SERPINA1+ mid tip,Mid stalk,AAACCTGAGGTGCTTT,3pv2_5pv1_5pv2,EFO:0030004
AAACCTGCATTGGGCC-HCA_LNG13237415,HCA_LNG13237415,0.041156,32778.0,5353,0.223022,0.895578,control,-0.080427,-0.183847,G1,1,1,BPIFB1- CA9+ mid tip,Mid stalk,AAACCTGCATTGGGCC,3pv2_5pv1_5pv2,EFO:0030004
AAACCTGGTGACTACT-HCA_LNG13237415,HCA_LNG13237415,0.035533,39028.0,5603,0.061350,0.498832,control,-0.046427,-0.163270,G1,3,3,BPIFB1- SERPINA1+ mid tip,Late tip,AAACCTGGTGACTACT,3pv2_5pv1_5pv2,EFO:0030004
AAACCTGTCGAGAGCA-HCA_LNG13237415,HCA_LNG13237415,0.037002,23449.0,5356,0.206780,0.131299,control,-0.044377,-0.190089,G1,17,170,Ionocyte,Mid stalk,AAACCTGTCGAGAGCA,3pv2_5pv1_5pv2,EFO:0030004
AAACGGGAGCCCGAAA-HCA_LNG13237415,HCA_LNG13237415,0.021348,23241.0,4877,0.147368,0.874995,control,-0.076444,-0.200009,G1,4,4,BPIFB1- CA9+ mid tip,Mid stalk,AAACGGGAGCCCGAAA,3pv2_5pv1_5pv2,EFO:0030004
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCACACAGGCCT-HCA_LNG13237417,HCA_LNG13237417,0.143310,2971.0,1941,0.042735,0.995378,IL-13,-0.087941,0.216716,G2M,14,140,COL4A1+ mid tip,Mid tip,TTTGTCACACAGGCCT,3pv2_5pv1_5pv2,EFO:0030004
TTTGTCACACATCCAA-HCA_LNG13237417,HCA_LNG13237417,0.029506,19965.0,4704,0.203931,0.829519,IL-13,0.105654,-0.138075,S,11,11,Cycling BPIFB1- mid tip 1,Mid stalk,TTTGTCACACATCCAA,3pv2_5pv1_5pv2,EFO:0030004
TTTGTCACACGGCTAC-HCA_LNG13237417,HCA_LNG13237417,0.027215,20696.0,4642,0.312500,0.212251,IL-13,-0.032795,-0.188867,G1,0,0,SCGB1B2P+ BPIFB1- CA9- mid tip,Mid stalk,TTTGTCACACGGCTAC,3pv2_5pv1_5pv2,EFO:0030004
TTTGTCAGTCGCGAAA-HCA_LNG13237417,HCA_LNG13237417,0.056163,19074.0,5168,0.132143,0.995378,IL-13,0.337020,-0.080434,S,11,11,Cycling BPIFB1- mid tip 1,Mid stalk,TTTGTCAGTCGCGAAA,3pv2_5pv1_5pv2,EFO:0030004


#### **cell_type_ontology_term_id**

In [68]:
#identify the column in adata.obs related. to cell type annotation

In [69]:
adata.obs.columns

Index(['batch', 'percent_mito', 'n_counts', 'n_genes', 'doublet_scores',
       'bh_pval', 'treatment', 'S_score', 'G2M_score', 'phase', 'leiden',
       'leiden_R', 'cellstate', 'He2022Predicted_celltype', 'barcodes',
       'assay', 'assay_ontology_term_id'],
      dtype='object')

In [70]:
list(adata.obs['He2022Predicted_celltype'].unique())

['Mid stalk',
 'Late tip',
 'Mid tip',
 'Late stalk',
 'AT2',
 'Pulmonary NE precursor',
 'Mid airway progenitor',
 'GHRL+ NE precursor',
 'Early stalk',
 'Early airway progenitor',
 'MUC5AC+ ASCL1+ progenitor',
 'Early tip']

In [71]:
# create a dictionary of cell type and ontology term

In [72]:
mapping= {'Mid stalk':'CL:0002368',
'Late tip':'CL:0010003',
'Mid tip':'CL:0010003',
'Late stalk':'CL:0002368',
'AT2':'CL:0002063',
'Pulmonary NE precursor':'CL:1000223',
'Mid airway progenitor':'CL:0002368',
'GHRL+ NE precursor':'CL:1000223',
'Early stalk':'CL:0002368',
'Early airway progenitor':'CL:0002368',
'MUC5AC+ ASCL1+ progenitor':'CL:0002368',
'Early tip':'CL:0010003'
}

In [73]:
# add the cell_type_ontology_term_id column

In [74]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['He2022Predicted_celltype'].map(mapping)

In [75]:
# change datatype of the column

In [76]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_type_ontology_term_id'].astype('category')

In [77]:
# view adata.obs

In [78]:
adata.obs

Unnamed: 0,batch,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,treatment,S_score,G2M_score,phase,leiden,leiden_R,cellstate,He2022Predicted_celltype,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id
AAACCTGAGGTGCTTT-HCA_LNG13237415,HCA_LNG13237415,0.050155,30035.0,6351,0.122970,0.895578,control,0.356778,-0.078414,S,3,3,BPIFB1- SERPINA1+ mid tip,Mid stalk,AAACCTGAGGTGCTTT,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368
AAACCTGCATTGGGCC-HCA_LNG13237415,HCA_LNG13237415,0.041156,32778.0,5353,0.223022,0.895578,control,-0.080427,-0.183847,G1,1,1,BPIFB1- CA9+ mid tip,Mid stalk,AAACCTGCATTGGGCC,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368
AAACCTGGTGACTACT-HCA_LNG13237415,HCA_LNG13237415,0.035533,39028.0,5603,0.061350,0.498832,control,-0.046427,-0.163270,G1,3,3,BPIFB1- SERPINA1+ mid tip,Late tip,AAACCTGGTGACTACT,3pv2_5pv1_5pv2,EFO:0030004,CL:0010003
AAACCTGTCGAGAGCA-HCA_LNG13237415,HCA_LNG13237415,0.037002,23449.0,5356,0.206780,0.131299,control,-0.044377,-0.190089,G1,17,170,Ionocyte,Mid stalk,AAACCTGTCGAGAGCA,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368
AAACGGGAGCCCGAAA-HCA_LNG13237415,HCA_LNG13237415,0.021348,23241.0,4877,0.147368,0.874995,control,-0.076444,-0.200009,G1,4,4,BPIFB1- CA9+ mid tip,Mid stalk,AAACGGGAGCCCGAAA,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCACACAGGCCT-HCA_LNG13237417,HCA_LNG13237417,0.143310,2971.0,1941,0.042735,0.995378,IL-13,-0.087941,0.216716,G2M,14,140,COL4A1+ mid tip,Mid tip,TTTGTCACACAGGCCT,3pv2_5pv1_5pv2,EFO:0030004,CL:0010003
TTTGTCACACATCCAA-HCA_LNG13237417,HCA_LNG13237417,0.029506,19965.0,4704,0.203931,0.829519,IL-13,0.105654,-0.138075,S,11,11,Cycling BPIFB1- mid tip 1,Mid stalk,TTTGTCACACATCCAA,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368
TTTGTCACACGGCTAC-HCA_LNG13237417,HCA_LNG13237417,0.027215,20696.0,4642,0.312500,0.212251,IL-13,-0.032795,-0.188867,G1,0,0,SCGB1B2P+ BPIFB1- CA9- mid tip,Mid stalk,TTTGTCACACGGCTAC,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368
TTTGTCAGTCGCGAAA-HCA_LNG13237417,HCA_LNG13237417,0.056163,19074.0,5168,0.132143,0.995378,IL-13,0.337020,-0.080434,S,11,11,Cycling BPIFB1- mid tip 1,Mid stalk,TTTGTCAGTCGCGAAA,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368


#### **development_stage_ontology_term_id**

In [79]:
# identify the column in adata which corresponds to age

In [80]:
adata.obs.columns

Index(['batch', 'percent_mito', 'n_counts', 'n_genes', 'doublet_scores',
       'bh_pval', 'treatment', 'S_score', 'G2M_score', 'phase', 'leiden',
       'leiden_R', 'cellstate', 'He2022Predicted_celltype', 'barcodes',
       'assay', 'assay_ontology_term_id', 'cell_type_ontology_term_id'],
      dtype='object')

In [81]:
mapping = {'12.0':'HsapDv:0000050', '20.0':'HsapDv:0000058', '9.0':'HsapDv:0000047', '8.0':'HsapDv:0000046', '18.0':'HsapDv:0000056', '22.0':'HsapDv:0000060', '15.0':'HsapDv:0000053', '6.86':'HsapDv:0000029', '5.0':'HsapDv:0000022', '11.0':'HsapDv:0000049', 'nan':'unknown'}

In [82]:
adata.obs['development_stage_ontology_term_id'] = ['unknown'] * len(adata.obs)

In [83]:
# change datatype of the column

In [84]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['development_stage_ontology_term_id'].astype('category')

In [85]:
# view unique values of development_stage_ontology_term_id column

In [86]:
list(adata.obs['development_stage_ontology_term_id'].unique())

['unknown']

In [87]:
# view adata.obs

In [88]:
adata.obs

Unnamed: 0,batch,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,treatment,S_score,G2M_score,phase,leiden,leiden_R,cellstate,He2022Predicted_celltype,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id
AAACCTGAGGTGCTTT-HCA_LNG13237415,HCA_LNG13237415,0.050155,30035.0,6351,0.122970,0.895578,control,0.356778,-0.078414,S,3,3,BPIFB1- SERPINA1+ mid tip,Mid stalk,AAACCTGAGGTGCTTT,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown
AAACCTGCATTGGGCC-HCA_LNG13237415,HCA_LNG13237415,0.041156,32778.0,5353,0.223022,0.895578,control,-0.080427,-0.183847,G1,1,1,BPIFB1- CA9+ mid tip,Mid stalk,AAACCTGCATTGGGCC,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown
AAACCTGGTGACTACT-HCA_LNG13237415,HCA_LNG13237415,0.035533,39028.0,5603,0.061350,0.498832,control,-0.046427,-0.163270,G1,3,3,BPIFB1- SERPINA1+ mid tip,Late tip,AAACCTGGTGACTACT,3pv2_5pv1_5pv2,EFO:0030004,CL:0010003,unknown
AAACCTGTCGAGAGCA-HCA_LNG13237415,HCA_LNG13237415,0.037002,23449.0,5356,0.206780,0.131299,control,-0.044377,-0.190089,G1,17,170,Ionocyte,Mid stalk,AAACCTGTCGAGAGCA,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown
AAACGGGAGCCCGAAA-HCA_LNG13237415,HCA_LNG13237415,0.021348,23241.0,4877,0.147368,0.874995,control,-0.076444,-0.200009,G1,4,4,BPIFB1- CA9+ mid tip,Mid stalk,AAACGGGAGCCCGAAA,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCACACAGGCCT-HCA_LNG13237417,HCA_LNG13237417,0.143310,2971.0,1941,0.042735,0.995378,IL-13,-0.087941,0.216716,G2M,14,140,COL4A1+ mid tip,Mid tip,TTTGTCACACAGGCCT,3pv2_5pv1_5pv2,EFO:0030004,CL:0010003,unknown
TTTGTCACACATCCAA-HCA_LNG13237417,HCA_LNG13237417,0.029506,19965.0,4704,0.203931,0.829519,IL-13,0.105654,-0.138075,S,11,11,Cycling BPIFB1- mid tip 1,Mid stalk,TTTGTCACACATCCAA,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown
TTTGTCACACGGCTAC-HCA_LNG13237417,HCA_LNG13237417,0.027215,20696.0,4642,0.312500,0.212251,IL-13,-0.032795,-0.188867,G1,0,0,SCGB1B2P+ BPIFB1- CA9- mid tip,Mid stalk,TTTGTCACACGGCTAC,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown
TTTGTCAGTCGCGAAA-HCA_LNG13237417,HCA_LNG13237417,0.056163,19074.0,5168,0.132143,0.995378,IL-13,0.337020,-0.080434,S,11,11,Cycling BPIFB1- mid tip 1,Mid stalk,TTTGTCAGTCGCGAAA,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown


#### **donor_id**

In [89]:
#identify the column in adata.obs which provides donor information

In [90]:
adata.obs.columns

Index(['batch', 'percent_mito', 'n_counts', 'n_genes', 'doublet_scores',
       'bh_pval', 'treatment', 'S_score', 'G2M_score', 'phase', 'leiden',
       'leiden_R', 'cellstate', 'He2022Predicted_celltype', 'barcodes',
       'assay', 'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id'],
      dtype='object')

In [91]:
# add the donor_id column

In [92]:
adata.obs['donor_id'] = adata.obs['batch']

In [93]:
#adata.obs['donor_id'] = ['unknown'] * len(adata.obs['names'])

In [94]:
# change datatype of the column

In [95]:
adata.obs['donor_id'] = adata.obs['donor_id'].astype('category')

In [96]:
# view unique values of donor_id column

In [97]:
list(adata.obs['donor_id'].unique())

['HCA_LNG13237415', 'HCA_LNG13237416', 'HCA_LNG13237417']

In [98]:
#view obs

In [99]:
adata.obs

Unnamed: 0,batch,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,treatment,S_score,G2M_score,phase,leiden,leiden_R,cellstate,He2022Predicted_celltype,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id
AAACCTGAGGTGCTTT-HCA_LNG13237415,HCA_LNG13237415,0.050155,30035.0,6351,0.122970,0.895578,control,0.356778,-0.078414,S,3,3,BPIFB1- SERPINA1+ mid tip,Mid stalk,AAACCTGAGGTGCTTT,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237415
AAACCTGCATTGGGCC-HCA_LNG13237415,HCA_LNG13237415,0.041156,32778.0,5353,0.223022,0.895578,control,-0.080427,-0.183847,G1,1,1,BPIFB1- CA9+ mid tip,Mid stalk,AAACCTGCATTGGGCC,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237415
AAACCTGGTGACTACT-HCA_LNG13237415,HCA_LNG13237415,0.035533,39028.0,5603,0.061350,0.498832,control,-0.046427,-0.163270,G1,3,3,BPIFB1- SERPINA1+ mid tip,Late tip,AAACCTGGTGACTACT,3pv2_5pv1_5pv2,EFO:0030004,CL:0010003,unknown,HCA_LNG13237415
AAACCTGTCGAGAGCA-HCA_LNG13237415,HCA_LNG13237415,0.037002,23449.0,5356,0.206780,0.131299,control,-0.044377,-0.190089,G1,17,170,Ionocyte,Mid stalk,AAACCTGTCGAGAGCA,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237415
AAACGGGAGCCCGAAA-HCA_LNG13237415,HCA_LNG13237415,0.021348,23241.0,4877,0.147368,0.874995,control,-0.076444,-0.200009,G1,4,4,BPIFB1- CA9+ mid tip,Mid stalk,AAACGGGAGCCCGAAA,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237415
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCACACAGGCCT-HCA_LNG13237417,HCA_LNG13237417,0.143310,2971.0,1941,0.042735,0.995378,IL-13,-0.087941,0.216716,G2M,14,140,COL4A1+ mid tip,Mid tip,TTTGTCACACAGGCCT,3pv2_5pv1_5pv2,EFO:0030004,CL:0010003,unknown,HCA_LNG13237417
TTTGTCACACATCCAA-HCA_LNG13237417,HCA_LNG13237417,0.029506,19965.0,4704,0.203931,0.829519,IL-13,0.105654,-0.138075,S,11,11,Cycling BPIFB1- mid tip 1,Mid stalk,TTTGTCACACATCCAA,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237417
TTTGTCACACGGCTAC-HCA_LNG13237417,HCA_LNG13237417,0.027215,20696.0,4642,0.312500,0.212251,IL-13,-0.032795,-0.188867,G1,0,0,SCGB1B2P+ BPIFB1- CA9- mid tip,Mid stalk,TTTGTCACACGGCTAC,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237417
TTTGTCAGTCGCGAAA-HCA_LNG13237417,HCA_LNG13237417,0.056163,19074.0,5168,0.132143,0.995378,IL-13,0.337020,-0.080434,S,11,11,Cycling BPIFB1- mid tip 1,Mid stalk,TTTGTCAGTCGCGAAA,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237417


In [100]:
adata.obs.columns

Index(['batch', 'percent_mito', 'n_counts', 'n_genes', 'doublet_scores',
       'bh_pval', 'treatment', 'S_score', 'G2M_score', 'phase', 'leiden',
       'leiden_R', 'cellstate', 'He2022Predicted_celltype', 'barcodes',
       'assay', 'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'donor_id'],
      dtype='object')

#### **disease_ontology_term_id**

In [101]:
adata.obs['disease_ontology_term_id']= ['PATO:0000461'] * len(adata.obs)

In [102]:
adata.obs

Unnamed: 0,batch,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,treatment,S_score,G2M_score,phase,...,leiden_R,cellstate,He2022Predicted_celltype,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id
AAACCTGAGGTGCTTT-HCA_LNG13237415,HCA_LNG13237415,0.050155,30035.0,6351,0.122970,0.895578,control,0.356778,-0.078414,S,...,3,BPIFB1- SERPINA1+ mid tip,Mid stalk,AAACCTGAGGTGCTTT,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237415,PATO:0000461
AAACCTGCATTGGGCC-HCA_LNG13237415,HCA_LNG13237415,0.041156,32778.0,5353,0.223022,0.895578,control,-0.080427,-0.183847,G1,...,1,BPIFB1- CA9+ mid tip,Mid stalk,AAACCTGCATTGGGCC,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237415,PATO:0000461
AAACCTGGTGACTACT-HCA_LNG13237415,HCA_LNG13237415,0.035533,39028.0,5603,0.061350,0.498832,control,-0.046427,-0.163270,G1,...,3,BPIFB1- SERPINA1+ mid tip,Late tip,AAACCTGGTGACTACT,3pv2_5pv1_5pv2,EFO:0030004,CL:0010003,unknown,HCA_LNG13237415,PATO:0000461
AAACCTGTCGAGAGCA-HCA_LNG13237415,HCA_LNG13237415,0.037002,23449.0,5356,0.206780,0.131299,control,-0.044377,-0.190089,G1,...,170,Ionocyte,Mid stalk,AAACCTGTCGAGAGCA,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237415,PATO:0000461
AAACGGGAGCCCGAAA-HCA_LNG13237415,HCA_LNG13237415,0.021348,23241.0,4877,0.147368,0.874995,control,-0.076444,-0.200009,G1,...,4,BPIFB1- CA9+ mid tip,Mid stalk,AAACGGGAGCCCGAAA,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237415,PATO:0000461
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCACACAGGCCT-HCA_LNG13237417,HCA_LNG13237417,0.143310,2971.0,1941,0.042735,0.995378,IL-13,-0.087941,0.216716,G2M,...,140,COL4A1+ mid tip,Mid tip,TTTGTCACACAGGCCT,3pv2_5pv1_5pv2,EFO:0030004,CL:0010003,unknown,HCA_LNG13237417,PATO:0000461
TTTGTCACACATCCAA-HCA_LNG13237417,HCA_LNG13237417,0.029506,19965.0,4704,0.203931,0.829519,IL-13,0.105654,-0.138075,S,...,11,Cycling BPIFB1- mid tip 1,Mid stalk,TTTGTCACACATCCAA,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237417,PATO:0000461
TTTGTCACACGGCTAC-HCA_LNG13237417,HCA_LNG13237417,0.027215,20696.0,4642,0.312500,0.212251,IL-13,-0.032795,-0.188867,G1,...,0,SCGB1B2P+ BPIFB1- CA9- mid tip,Mid stalk,TTTGTCACACGGCTAC,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237417,PATO:0000461
TTTGTCAGTCGCGAAA-HCA_LNG13237417,HCA_LNG13237417,0.056163,19074.0,5168,0.132143,0.995378,IL-13,0.337020,-0.080434,S,...,11,Cycling BPIFB1- mid tip 1,Mid stalk,TTTGTCAGTCGCGAAA,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237417,PATO:0000461


In [103]:
# change datatype of the column

In [104]:
adata.obs['disease_ontology_term_id'] = adata.obs['disease_ontology_term_id'].astype('category')

In [105]:
# view obs

In [106]:
adata.obs

Unnamed: 0,batch,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,treatment,S_score,G2M_score,phase,...,leiden_R,cellstate,He2022Predicted_celltype,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id
AAACCTGAGGTGCTTT-HCA_LNG13237415,HCA_LNG13237415,0.050155,30035.0,6351,0.122970,0.895578,control,0.356778,-0.078414,S,...,3,BPIFB1- SERPINA1+ mid tip,Mid stalk,AAACCTGAGGTGCTTT,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237415,PATO:0000461
AAACCTGCATTGGGCC-HCA_LNG13237415,HCA_LNG13237415,0.041156,32778.0,5353,0.223022,0.895578,control,-0.080427,-0.183847,G1,...,1,BPIFB1- CA9+ mid tip,Mid stalk,AAACCTGCATTGGGCC,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237415,PATO:0000461
AAACCTGGTGACTACT-HCA_LNG13237415,HCA_LNG13237415,0.035533,39028.0,5603,0.061350,0.498832,control,-0.046427,-0.163270,G1,...,3,BPIFB1- SERPINA1+ mid tip,Late tip,AAACCTGGTGACTACT,3pv2_5pv1_5pv2,EFO:0030004,CL:0010003,unknown,HCA_LNG13237415,PATO:0000461
AAACCTGTCGAGAGCA-HCA_LNG13237415,HCA_LNG13237415,0.037002,23449.0,5356,0.206780,0.131299,control,-0.044377,-0.190089,G1,...,170,Ionocyte,Mid stalk,AAACCTGTCGAGAGCA,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237415,PATO:0000461
AAACGGGAGCCCGAAA-HCA_LNG13237415,HCA_LNG13237415,0.021348,23241.0,4877,0.147368,0.874995,control,-0.076444,-0.200009,G1,...,4,BPIFB1- CA9+ mid tip,Mid stalk,AAACGGGAGCCCGAAA,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237415,PATO:0000461
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCACACAGGCCT-HCA_LNG13237417,HCA_LNG13237417,0.143310,2971.0,1941,0.042735,0.995378,IL-13,-0.087941,0.216716,G2M,...,140,COL4A1+ mid tip,Mid tip,TTTGTCACACAGGCCT,3pv2_5pv1_5pv2,EFO:0030004,CL:0010003,unknown,HCA_LNG13237417,PATO:0000461
TTTGTCACACATCCAA-HCA_LNG13237417,HCA_LNG13237417,0.029506,19965.0,4704,0.203931,0.829519,IL-13,0.105654,-0.138075,S,...,11,Cycling BPIFB1- mid tip 1,Mid stalk,TTTGTCACACATCCAA,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237417,PATO:0000461
TTTGTCACACGGCTAC-HCA_LNG13237417,HCA_LNG13237417,0.027215,20696.0,4642,0.312500,0.212251,IL-13,-0.032795,-0.188867,G1,...,0,SCGB1B2P+ BPIFB1- CA9- mid tip,Mid stalk,TTTGTCACACGGCTAC,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237417,PATO:0000461
TTTGTCAGTCGCGAAA-HCA_LNG13237417,HCA_LNG13237417,0.056163,19074.0,5168,0.132143,0.995378,IL-13,0.337020,-0.080434,S,...,11,Cycling BPIFB1- mid tip 1,Mid stalk,TTTGTCAGTCGCGAAA,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237417,PATO:0000461


#### **is_primary_data**

In [107]:
import pandas as pd

# Assuming adata.obs is a pandas DataFrame
# Replace 'adata.obs' with the actual name of your DataFrame if different

# Set 'is_primary_data' to True for all entries initially
adata.obs['is_primary_data'] = True

# Set 'is_primary_data' to False for entries with 'batch' values starting with '5891' or 'WSSS'
adata.obs.loc[adata.obs['batch'].str.startswith(('5891', 'WSSS')), 'is_primary_data'] = False


In [108]:
list(adata.obs['is_primary_data'].unique())

[True]

In [109]:
adata.obs

Unnamed: 0,batch,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,treatment,S_score,G2M_score,phase,...,cellstate,He2022Predicted_celltype,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data
AAACCTGAGGTGCTTT-HCA_LNG13237415,HCA_LNG13237415,0.050155,30035.0,6351,0.122970,0.895578,control,0.356778,-0.078414,S,...,BPIFB1- SERPINA1+ mid tip,Mid stalk,AAACCTGAGGTGCTTT,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237415,PATO:0000461,True
AAACCTGCATTGGGCC-HCA_LNG13237415,HCA_LNG13237415,0.041156,32778.0,5353,0.223022,0.895578,control,-0.080427,-0.183847,G1,...,BPIFB1- CA9+ mid tip,Mid stalk,AAACCTGCATTGGGCC,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237415,PATO:0000461,True
AAACCTGGTGACTACT-HCA_LNG13237415,HCA_LNG13237415,0.035533,39028.0,5603,0.061350,0.498832,control,-0.046427,-0.163270,G1,...,BPIFB1- SERPINA1+ mid tip,Late tip,AAACCTGGTGACTACT,3pv2_5pv1_5pv2,EFO:0030004,CL:0010003,unknown,HCA_LNG13237415,PATO:0000461,True
AAACCTGTCGAGAGCA-HCA_LNG13237415,HCA_LNG13237415,0.037002,23449.0,5356,0.206780,0.131299,control,-0.044377,-0.190089,G1,...,Ionocyte,Mid stalk,AAACCTGTCGAGAGCA,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237415,PATO:0000461,True
AAACGGGAGCCCGAAA-HCA_LNG13237415,HCA_LNG13237415,0.021348,23241.0,4877,0.147368,0.874995,control,-0.076444,-0.200009,G1,...,BPIFB1- CA9+ mid tip,Mid stalk,AAACGGGAGCCCGAAA,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237415,PATO:0000461,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCACACAGGCCT-HCA_LNG13237417,HCA_LNG13237417,0.143310,2971.0,1941,0.042735,0.995378,IL-13,-0.087941,0.216716,G2M,...,COL4A1+ mid tip,Mid tip,TTTGTCACACAGGCCT,3pv2_5pv1_5pv2,EFO:0030004,CL:0010003,unknown,HCA_LNG13237417,PATO:0000461,True
TTTGTCACACATCCAA-HCA_LNG13237417,HCA_LNG13237417,0.029506,19965.0,4704,0.203931,0.829519,IL-13,0.105654,-0.138075,S,...,Cycling BPIFB1- mid tip 1,Mid stalk,TTTGTCACACATCCAA,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237417,PATO:0000461,True
TTTGTCACACGGCTAC-HCA_LNG13237417,HCA_LNG13237417,0.027215,20696.0,4642,0.312500,0.212251,IL-13,-0.032795,-0.188867,G1,...,SCGB1B2P+ BPIFB1- CA9- mid tip,Mid stalk,TTTGTCACACGGCTAC,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237417,PATO:0000461,True
TTTGTCAGTCGCGAAA-HCA_LNG13237417,HCA_LNG13237417,0.056163,19074.0,5168,0.132143,0.995378,IL-13,0.337020,-0.080434,S,...,Cycling BPIFB1- mid tip 1,Mid stalk,TTTGTCAGTCGCGAAA,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237417,PATO:0000461,True


In [110]:
adata.obs['is_primary_data'] = [True] * len(adata.obs)

In [111]:
#change data type of column

In [112]:
adata.obs['is_primary_data'] = adata.obs['is_primary_data'].astype('bool')

In [113]:
# view obs

In [114]:
adata.obs

Unnamed: 0,batch,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,treatment,S_score,G2M_score,phase,...,cellstate,He2022Predicted_celltype,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data
AAACCTGAGGTGCTTT-HCA_LNG13237415,HCA_LNG13237415,0.050155,30035.0,6351,0.122970,0.895578,control,0.356778,-0.078414,S,...,BPIFB1- SERPINA1+ mid tip,Mid stalk,AAACCTGAGGTGCTTT,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237415,PATO:0000461,True
AAACCTGCATTGGGCC-HCA_LNG13237415,HCA_LNG13237415,0.041156,32778.0,5353,0.223022,0.895578,control,-0.080427,-0.183847,G1,...,BPIFB1- CA9+ mid tip,Mid stalk,AAACCTGCATTGGGCC,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237415,PATO:0000461,True
AAACCTGGTGACTACT-HCA_LNG13237415,HCA_LNG13237415,0.035533,39028.0,5603,0.061350,0.498832,control,-0.046427,-0.163270,G1,...,BPIFB1- SERPINA1+ mid tip,Late tip,AAACCTGGTGACTACT,3pv2_5pv1_5pv2,EFO:0030004,CL:0010003,unknown,HCA_LNG13237415,PATO:0000461,True
AAACCTGTCGAGAGCA-HCA_LNG13237415,HCA_LNG13237415,0.037002,23449.0,5356,0.206780,0.131299,control,-0.044377,-0.190089,G1,...,Ionocyte,Mid stalk,AAACCTGTCGAGAGCA,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237415,PATO:0000461,True
AAACGGGAGCCCGAAA-HCA_LNG13237415,HCA_LNG13237415,0.021348,23241.0,4877,0.147368,0.874995,control,-0.076444,-0.200009,G1,...,BPIFB1- CA9+ mid tip,Mid stalk,AAACGGGAGCCCGAAA,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237415,PATO:0000461,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCACACAGGCCT-HCA_LNG13237417,HCA_LNG13237417,0.143310,2971.0,1941,0.042735,0.995378,IL-13,-0.087941,0.216716,G2M,...,COL4A1+ mid tip,Mid tip,TTTGTCACACAGGCCT,3pv2_5pv1_5pv2,EFO:0030004,CL:0010003,unknown,HCA_LNG13237417,PATO:0000461,True
TTTGTCACACATCCAA-HCA_LNG13237417,HCA_LNG13237417,0.029506,19965.0,4704,0.203931,0.829519,IL-13,0.105654,-0.138075,S,...,Cycling BPIFB1- mid tip 1,Mid stalk,TTTGTCACACATCCAA,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237417,PATO:0000461,True
TTTGTCACACGGCTAC-HCA_LNG13237417,HCA_LNG13237417,0.027215,20696.0,4642,0.312500,0.212251,IL-13,-0.032795,-0.188867,G1,...,SCGB1B2P+ BPIFB1- CA9- mid tip,Mid stalk,TTTGTCACACGGCTAC,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237417,PATO:0000461,True
TTTGTCAGTCGCGAAA-HCA_LNG13237417,HCA_LNG13237417,0.056163,19074.0,5168,0.132143,0.995378,IL-13,0.337020,-0.080434,S,...,Cycling BPIFB1- mid tip 1,Mid stalk,TTTGTCAGTCGCGAAA,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237417,PATO:0000461,True


#### **organism_ontology_term_id**

In [115]:
# assign organism id 

In [116]:
adata.obs['organism_ontology_term_id'] = ['NCBITaxon:9606'] * len(adata.obs)

In [117]:
#change data type of column

In [118]:
adata.obs['organism_ontology_term_id'] = adata.obs['organism_ontology_term_id'].astype('category')

In [119]:
# view obs

In [120]:
adata.obs

Unnamed: 0,batch,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,treatment,S_score,G2M_score,phase,...,He2022Predicted_celltype,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id
AAACCTGAGGTGCTTT-HCA_LNG13237415,HCA_LNG13237415,0.050155,30035.0,6351,0.122970,0.895578,control,0.356778,-0.078414,S,...,Mid stalk,AAACCTGAGGTGCTTT,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237415,PATO:0000461,True,NCBITaxon:9606
AAACCTGCATTGGGCC-HCA_LNG13237415,HCA_LNG13237415,0.041156,32778.0,5353,0.223022,0.895578,control,-0.080427,-0.183847,G1,...,Mid stalk,AAACCTGCATTGGGCC,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237415,PATO:0000461,True,NCBITaxon:9606
AAACCTGGTGACTACT-HCA_LNG13237415,HCA_LNG13237415,0.035533,39028.0,5603,0.061350,0.498832,control,-0.046427,-0.163270,G1,...,Late tip,AAACCTGGTGACTACT,3pv2_5pv1_5pv2,EFO:0030004,CL:0010003,unknown,HCA_LNG13237415,PATO:0000461,True,NCBITaxon:9606
AAACCTGTCGAGAGCA-HCA_LNG13237415,HCA_LNG13237415,0.037002,23449.0,5356,0.206780,0.131299,control,-0.044377,-0.190089,G1,...,Mid stalk,AAACCTGTCGAGAGCA,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237415,PATO:0000461,True,NCBITaxon:9606
AAACGGGAGCCCGAAA-HCA_LNG13237415,HCA_LNG13237415,0.021348,23241.0,4877,0.147368,0.874995,control,-0.076444,-0.200009,G1,...,Mid stalk,AAACGGGAGCCCGAAA,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237415,PATO:0000461,True,NCBITaxon:9606
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCACACAGGCCT-HCA_LNG13237417,HCA_LNG13237417,0.143310,2971.0,1941,0.042735,0.995378,IL-13,-0.087941,0.216716,G2M,...,Mid tip,TTTGTCACACAGGCCT,3pv2_5pv1_5pv2,EFO:0030004,CL:0010003,unknown,HCA_LNG13237417,PATO:0000461,True,NCBITaxon:9606
TTTGTCACACATCCAA-HCA_LNG13237417,HCA_LNG13237417,0.029506,19965.0,4704,0.203931,0.829519,IL-13,0.105654,-0.138075,S,...,Mid stalk,TTTGTCACACATCCAA,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237417,PATO:0000461,True,NCBITaxon:9606
TTTGTCACACGGCTAC-HCA_LNG13237417,HCA_LNG13237417,0.027215,20696.0,4642,0.312500,0.212251,IL-13,-0.032795,-0.188867,G1,...,Mid stalk,TTTGTCACACGGCTAC,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237417,PATO:0000461,True,NCBITaxon:9606
TTTGTCAGTCGCGAAA-HCA_LNG13237417,HCA_LNG13237417,0.056163,19074.0,5168,0.132143,0.995378,IL-13,0.337020,-0.080434,S,...,Mid stalk,TTTGTCAGTCGCGAAA,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237417,PATO:0000461,True,NCBITaxon:9606


#### **self_reported_ethnicity_ontology_term_id**

In [121]:
adata.obs['self_reported_ethnicity_ontology_term_id'] = ['unknown'] * len(adata.obs)

In [122]:
# change data type

In [123]:
adata.obs['self_reported_ethnicity_ontology_term_id'] = adata.obs['self_reported_ethnicity_ontology_term_id'].astype('category')

In [124]:
# view obs

In [125]:
adata.obs

Unnamed: 0,batch,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,treatment,S_score,G2M_score,phase,...,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id
AAACCTGAGGTGCTTT-HCA_LNG13237415,HCA_LNG13237415,0.050155,30035.0,6351,0.122970,0.895578,control,0.356778,-0.078414,S,...,AAACCTGAGGTGCTTT,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237415,PATO:0000461,True,NCBITaxon:9606,unknown
AAACCTGCATTGGGCC-HCA_LNG13237415,HCA_LNG13237415,0.041156,32778.0,5353,0.223022,0.895578,control,-0.080427,-0.183847,G1,...,AAACCTGCATTGGGCC,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237415,PATO:0000461,True,NCBITaxon:9606,unknown
AAACCTGGTGACTACT-HCA_LNG13237415,HCA_LNG13237415,0.035533,39028.0,5603,0.061350,0.498832,control,-0.046427,-0.163270,G1,...,AAACCTGGTGACTACT,3pv2_5pv1_5pv2,EFO:0030004,CL:0010003,unknown,HCA_LNG13237415,PATO:0000461,True,NCBITaxon:9606,unknown
AAACCTGTCGAGAGCA-HCA_LNG13237415,HCA_LNG13237415,0.037002,23449.0,5356,0.206780,0.131299,control,-0.044377,-0.190089,G1,...,AAACCTGTCGAGAGCA,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237415,PATO:0000461,True,NCBITaxon:9606,unknown
AAACGGGAGCCCGAAA-HCA_LNG13237415,HCA_LNG13237415,0.021348,23241.0,4877,0.147368,0.874995,control,-0.076444,-0.200009,G1,...,AAACGGGAGCCCGAAA,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237415,PATO:0000461,True,NCBITaxon:9606,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCACACAGGCCT-HCA_LNG13237417,HCA_LNG13237417,0.143310,2971.0,1941,0.042735,0.995378,IL-13,-0.087941,0.216716,G2M,...,TTTGTCACACAGGCCT,3pv2_5pv1_5pv2,EFO:0030004,CL:0010003,unknown,HCA_LNG13237417,PATO:0000461,True,NCBITaxon:9606,unknown
TTTGTCACACATCCAA-HCA_LNG13237417,HCA_LNG13237417,0.029506,19965.0,4704,0.203931,0.829519,IL-13,0.105654,-0.138075,S,...,TTTGTCACACATCCAA,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237417,PATO:0000461,True,NCBITaxon:9606,unknown
TTTGTCACACGGCTAC-HCA_LNG13237417,HCA_LNG13237417,0.027215,20696.0,4642,0.312500,0.212251,IL-13,-0.032795,-0.188867,G1,...,TTTGTCACACGGCTAC,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237417,PATO:0000461,True,NCBITaxon:9606,unknown
TTTGTCAGTCGCGAAA-HCA_LNG13237417,HCA_LNG13237417,0.056163,19074.0,5168,0.132143,0.995378,IL-13,0.337020,-0.080434,S,...,TTTGTCAGTCGCGAAA,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237417,PATO:0000461,True,NCBITaxon:9606,unknown


In [126]:
adata.obs.columns

Index(['batch', 'percent_mito', 'n_counts', 'n_genes', 'doublet_scores',
       'bh_pval', 'treatment', 'S_score', 'G2M_score', 'phase', 'leiden',
       'leiden_R', 'cellstate', 'He2022Predicted_celltype', 'barcodes',
       'assay', 'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'donor_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id'],
      dtype='object')

#### **sex_ontology_term_id**

In [127]:
# identify the column in adata.obs which corresponds to sex

In [128]:
adata.obs.columns

Index(['batch', 'percent_mito', 'n_counts', 'n_genes', 'doublet_scores',
       'bh_pval', 'treatment', 'S_score', 'G2M_score', 'phase', 'leiden',
       'leiden_R', 'cellstate', 'He2022Predicted_celltype', 'barcodes',
       'assay', 'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'donor_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id'],
      dtype='object')

In [129]:
adata.obs

Unnamed: 0,batch,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,treatment,S_score,G2M_score,phase,...,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id
AAACCTGAGGTGCTTT-HCA_LNG13237415,HCA_LNG13237415,0.050155,30035.0,6351,0.122970,0.895578,control,0.356778,-0.078414,S,...,AAACCTGAGGTGCTTT,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237415,PATO:0000461,True,NCBITaxon:9606,unknown
AAACCTGCATTGGGCC-HCA_LNG13237415,HCA_LNG13237415,0.041156,32778.0,5353,0.223022,0.895578,control,-0.080427,-0.183847,G1,...,AAACCTGCATTGGGCC,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237415,PATO:0000461,True,NCBITaxon:9606,unknown
AAACCTGGTGACTACT-HCA_LNG13237415,HCA_LNG13237415,0.035533,39028.0,5603,0.061350,0.498832,control,-0.046427,-0.163270,G1,...,AAACCTGGTGACTACT,3pv2_5pv1_5pv2,EFO:0030004,CL:0010003,unknown,HCA_LNG13237415,PATO:0000461,True,NCBITaxon:9606,unknown
AAACCTGTCGAGAGCA-HCA_LNG13237415,HCA_LNG13237415,0.037002,23449.0,5356,0.206780,0.131299,control,-0.044377,-0.190089,G1,...,AAACCTGTCGAGAGCA,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237415,PATO:0000461,True,NCBITaxon:9606,unknown
AAACGGGAGCCCGAAA-HCA_LNG13237415,HCA_LNG13237415,0.021348,23241.0,4877,0.147368,0.874995,control,-0.076444,-0.200009,G1,...,AAACGGGAGCCCGAAA,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237415,PATO:0000461,True,NCBITaxon:9606,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCACACAGGCCT-HCA_LNG13237417,HCA_LNG13237417,0.143310,2971.0,1941,0.042735,0.995378,IL-13,-0.087941,0.216716,G2M,...,TTTGTCACACAGGCCT,3pv2_5pv1_5pv2,EFO:0030004,CL:0010003,unknown,HCA_LNG13237417,PATO:0000461,True,NCBITaxon:9606,unknown
TTTGTCACACATCCAA-HCA_LNG13237417,HCA_LNG13237417,0.029506,19965.0,4704,0.203931,0.829519,IL-13,0.105654,-0.138075,S,...,TTTGTCACACATCCAA,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237417,PATO:0000461,True,NCBITaxon:9606,unknown
TTTGTCACACGGCTAC-HCA_LNG13237417,HCA_LNG13237417,0.027215,20696.0,4642,0.312500,0.212251,IL-13,-0.032795,-0.188867,G1,...,TTTGTCACACGGCTAC,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237417,PATO:0000461,True,NCBITaxon:9606,unknown
TTTGTCAGTCGCGAAA-HCA_LNG13237417,HCA_LNG13237417,0.056163,19074.0,5168,0.132143,0.995378,IL-13,0.337020,-0.080434,S,...,TTTGTCAGTCGCGAAA,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237417,PATO:0000461,True,NCBITaxon:9606,unknown


In [130]:
# list the unique values 

In [131]:
# create a dictionary of sex and sex ontology term id

In [132]:
mapping= {'F': 'PATO:0000383', 'M': 'PATO:0000384', 'Unknown':'unknown'}

In [133]:
# add sex_ontology_term_id column

In [134]:
adata.obs['sex_ontology_term_id'] = ['unknown'] * len(adata.obs)

In [135]:
# change data type

In [136]:
adata.obs['sex_ontology_term_id'] = adata.obs['sex_ontology_term_id'].astype('category')

In [137]:
adata.obs

Unnamed: 0,batch,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,treatment,S_score,G2M_score,phase,...,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id
AAACCTGAGGTGCTTT-HCA_LNG13237415,HCA_LNG13237415,0.050155,30035.0,6351,0.122970,0.895578,control,0.356778,-0.078414,S,...,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237415,PATO:0000461,True,NCBITaxon:9606,unknown,unknown
AAACCTGCATTGGGCC-HCA_LNG13237415,HCA_LNG13237415,0.041156,32778.0,5353,0.223022,0.895578,control,-0.080427,-0.183847,G1,...,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237415,PATO:0000461,True,NCBITaxon:9606,unknown,unknown
AAACCTGGTGACTACT-HCA_LNG13237415,HCA_LNG13237415,0.035533,39028.0,5603,0.061350,0.498832,control,-0.046427,-0.163270,G1,...,3pv2_5pv1_5pv2,EFO:0030004,CL:0010003,unknown,HCA_LNG13237415,PATO:0000461,True,NCBITaxon:9606,unknown,unknown
AAACCTGTCGAGAGCA-HCA_LNG13237415,HCA_LNG13237415,0.037002,23449.0,5356,0.206780,0.131299,control,-0.044377,-0.190089,G1,...,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237415,PATO:0000461,True,NCBITaxon:9606,unknown,unknown
AAACGGGAGCCCGAAA-HCA_LNG13237415,HCA_LNG13237415,0.021348,23241.0,4877,0.147368,0.874995,control,-0.076444,-0.200009,G1,...,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237415,PATO:0000461,True,NCBITaxon:9606,unknown,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCACACAGGCCT-HCA_LNG13237417,HCA_LNG13237417,0.143310,2971.0,1941,0.042735,0.995378,IL-13,-0.087941,0.216716,G2M,...,3pv2_5pv1_5pv2,EFO:0030004,CL:0010003,unknown,HCA_LNG13237417,PATO:0000461,True,NCBITaxon:9606,unknown,unknown
TTTGTCACACATCCAA-HCA_LNG13237417,HCA_LNG13237417,0.029506,19965.0,4704,0.203931,0.829519,IL-13,0.105654,-0.138075,S,...,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237417,PATO:0000461,True,NCBITaxon:9606,unknown,unknown
TTTGTCACACGGCTAC-HCA_LNG13237417,HCA_LNG13237417,0.027215,20696.0,4642,0.312500,0.212251,IL-13,-0.032795,-0.188867,G1,...,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237417,PATO:0000461,True,NCBITaxon:9606,unknown,unknown
TTTGTCAGTCGCGAAA-HCA_LNG13237417,HCA_LNG13237417,0.056163,19074.0,5168,0.132143,0.995378,IL-13,0.337020,-0.080434,S,...,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237417,PATO:0000461,True,NCBITaxon:9606,unknown,unknown


#### **suspension_type**

In [138]:
adata.obs

Unnamed: 0,batch,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,treatment,S_score,G2M_score,phase,...,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id
AAACCTGAGGTGCTTT-HCA_LNG13237415,HCA_LNG13237415,0.050155,30035.0,6351,0.122970,0.895578,control,0.356778,-0.078414,S,...,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237415,PATO:0000461,True,NCBITaxon:9606,unknown,unknown
AAACCTGCATTGGGCC-HCA_LNG13237415,HCA_LNG13237415,0.041156,32778.0,5353,0.223022,0.895578,control,-0.080427,-0.183847,G1,...,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237415,PATO:0000461,True,NCBITaxon:9606,unknown,unknown
AAACCTGGTGACTACT-HCA_LNG13237415,HCA_LNG13237415,0.035533,39028.0,5603,0.061350,0.498832,control,-0.046427,-0.163270,G1,...,3pv2_5pv1_5pv2,EFO:0030004,CL:0010003,unknown,HCA_LNG13237415,PATO:0000461,True,NCBITaxon:9606,unknown,unknown
AAACCTGTCGAGAGCA-HCA_LNG13237415,HCA_LNG13237415,0.037002,23449.0,5356,0.206780,0.131299,control,-0.044377,-0.190089,G1,...,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237415,PATO:0000461,True,NCBITaxon:9606,unknown,unknown
AAACGGGAGCCCGAAA-HCA_LNG13237415,HCA_LNG13237415,0.021348,23241.0,4877,0.147368,0.874995,control,-0.076444,-0.200009,G1,...,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237415,PATO:0000461,True,NCBITaxon:9606,unknown,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCACACAGGCCT-HCA_LNG13237417,HCA_LNG13237417,0.143310,2971.0,1941,0.042735,0.995378,IL-13,-0.087941,0.216716,G2M,...,3pv2_5pv1_5pv2,EFO:0030004,CL:0010003,unknown,HCA_LNG13237417,PATO:0000461,True,NCBITaxon:9606,unknown,unknown
TTTGTCACACATCCAA-HCA_LNG13237417,HCA_LNG13237417,0.029506,19965.0,4704,0.203931,0.829519,IL-13,0.105654,-0.138075,S,...,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237417,PATO:0000461,True,NCBITaxon:9606,unknown,unknown
TTTGTCACACGGCTAC-HCA_LNG13237417,HCA_LNG13237417,0.027215,20696.0,4642,0.312500,0.212251,IL-13,-0.032795,-0.188867,G1,...,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237417,PATO:0000461,True,NCBITaxon:9606,unknown,unknown
TTTGTCAGTCGCGAAA-HCA_LNG13237417,HCA_LNG13237417,0.056163,19074.0,5168,0.132143,0.995378,IL-13,0.337020,-0.080434,S,...,3pv2_5pv1_5pv2,EFO:0030004,CL:0002368,unknown,HCA_LNG13237417,PATO:0000461,True,NCBITaxon:9606,unknown,unknown


In [139]:
adata.obs['suspension_type'] = ['cell'] * len(adata.obs)

In [140]:
# change data type of column

In [141]:
adata.obs['suspension_type'] = adata.obs['suspension_type'].astype('category')

In [142]:
# view obs

In [143]:
adata.obs

Unnamed: 0,batch,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,treatment,S_score,G2M_score,phase,...,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type
AAACCTGAGGTGCTTT-HCA_LNG13237415,HCA_LNG13237415,0.050155,30035.0,6351,0.122970,0.895578,control,0.356778,-0.078414,S,...,EFO:0030004,CL:0002368,unknown,HCA_LNG13237415,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell
AAACCTGCATTGGGCC-HCA_LNG13237415,HCA_LNG13237415,0.041156,32778.0,5353,0.223022,0.895578,control,-0.080427,-0.183847,G1,...,EFO:0030004,CL:0002368,unknown,HCA_LNG13237415,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell
AAACCTGGTGACTACT-HCA_LNG13237415,HCA_LNG13237415,0.035533,39028.0,5603,0.061350,0.498832,control,-0.046427,-0.163270,G1,...,EFO:0030004,CL:0010003,unknown,HCA_LNG13237415,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell
AAACCTGTCGAGAGCA-HCA_LNG13237415,HCA_LNG13237415,0.037002,23449.0,5356,0.206780,0.131299,control,-0.044377,-0.190089,G1,...,EFO:0030004,CL:0002368,unknown,HCA_LNG13237415,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell
AAACGGGAGCCCGAAA-HCA_LNG13237415,HCA_LNG13237415,0.021348,23241.0,4877,0.147368,0.874995,control,-0.076444,-0.200009,G1,...,EFO:0030004,CL:0002368,unknown,HCA_LNG13237415,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCACACAGGCCT-HCA_LNG13237417,HCA_LNG13237417,0.143310,2971.0,1941,0.042735,0.995378,IL-13,-0.087941,0.216716,G2M,...,EFO:0030004,CL:0010003,unknown,HCA_LNG13237417,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell
TTTGTCACACATCCAA-HCA_LNG13237417,HCA_LNG13237417,0.029506,19965.0,4704,0.203931,0.829519,IL-13,0.105654,-0.138075,S,...,EFO:0030004,CL:0002368,unknown,HCA_LNG13237417,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell
TTTGTCACACGGCTAC-HCA_LNG13237417,HCA_LNG13237417,0.027215,20696.0,4642,0.312500,0.212251,IL-13,-0.032795,-0.188867,G1,...,EFO:0030004,CL:0002368,unknown,HCA_LNG13237417,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell
TTTGTCAGTCGCGAAA-HCA_LNG13237417,HCA_LNG13237417,0.056163,19074.0,5168,0.132143,0.995378,IL-13,0.337020,-0.080434,S,...,EFO:0030004,CL:0002368,unknown,HCA_LNG13237417,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell


#### **tissue_type**

In [144]:
adata.obs['tissue_type'] = ['organoid'] * len(adata.obs)

In [145]:
adata.obs['tissue_type'] = adata.obs['tissue_type'].astype('category')

#### **tissue_ontology_term_id**

In [146]:
# identify the column in adata.obs which corresponds to tissue

In [147]:
adata.obs.columns

Index(['batch', 'percent_mito', 'n_counts', 'n_genes', 'doublet_scores',
       'bh_pval', 'treatment', 'S_score', 'G2M_score', 'phase', 'leiden',
       'leiden_R', 'cellstate', 'He2022Predicted_celltype', 'barcodes',
       'assay', 'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'donor_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'suspension_type', 'tissue_type'],
      dtype='object')

In [148]:
# add 'tissue_ontology_term_id' column

In [149]:
adata.obs['tissue_ontology_term_id'] = ['UBERON:0002048'] * len(adata.obs)

In [150]:
# change data type of column

In [151]:
adata.obs['tissue_ontology_term_id'] = adata.obs['tissue_ontology_term_id'].astype('category')

In [152]:
#list the unique values in 'tissue_ontology_term_id' column

In [153]:
list(adata.obs['tissue_ontology_term_id'].unique())

['UBERON:0002048']

In [154]:
# view obs

In [155]:
adata.obs

Unnamed: 0,batch,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,treatment,S_score,G2M_score,phase,...,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_type,tissue_ontology_term_id
AAACCTGAGGTGCTTT-HCA_LNG13237415,HCA_LNG13237415,0.050155,30035.0,6351,0.122970,0.895578,control,0.356778,-0.078414,S,...,unknown,HCA_LNG13237415,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
AAACCTGCATTGGGCC-HCA_LNG13237415,HCA_LNG13237415,0.041156,32778.0,5353,0.223022,0.895578,control,-0.080427,-0.183847,G1,...,unknown,HCA_LNG13237415,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
AAACCTGGTGACTACT-HCA_LNG13237415,HCA_LNG13237415,0.035533,39028.0,5603,0.061350,0.498832,control,-0.046427,-0.163270,G1,...,unknown,HCA_LNG13237415,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
AAACCTGTCGAGAGCA-HCA_LNG13237415,HCA_LNG13237415,0.037002,23449.0,5356,0.206780,0.131299,control,-0.044377,-0.190089,G1,...,unknown,HCA_LNG13237415,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
AAACGGGAGCCCGAAA-HCA_LNG13237415,HCA_LNG13237415,0.021348,23241.0,4877,0.147368,0.874995,control,-0.076444,-0.200009,G1,...,unknown,HCA_LNG13237415,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCACACAGGCCT-HCA_LNG13237417,HCA_LNG13237417,0.143310,2971.0,1941,0.042735,0.995378,IL-13,-0.087941,0.216716,G2M,...,unknown,HCA_LNG13237417,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
TTTGTCACACATCCAA-HCA_LNG13237417,HCA_LNG13237417,0.029506,19965.0,4704,0.203931,0.829519,IL-13,0.105654,-0.138075,S,...,unknown,HCA_LNG13237417,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
TTTGTCACACGGCTAC-HCA_LNG13237417,HCA_LNG13237417,0.027215,20696.0,4642,0.312500,0.212251,IL-13,-0.032795,-0.188867,G1,...,unknown,HCA_LNG13237417,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
TTTGTCAGTCGCGAAA-HCA_LNG13237417,HCA_LNG13237417,0.056163,19074.0,5168,0.132143,0.995378,IL-13,0.337020,-0.080434,S,...,unknown,HCA_LNG13237417,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048


In [156]:
adata.obs.columns

Index(['batch', 'percent_mito', 'n_counts', 'n_genes', 'doublet_scores',
       'bh_pval', 'treatment', 'S_score', 'G2M_score', 'phase', 'leiden',
       'leiden_R', 'cellstate', 'He2022Predicted_celltype', 'barcodes',
       'assay', 'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'donor_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'suspension_type', 'tissue_type',
       'tissue_ontology_term_id'],
      dtype='object')

#### **obsm (Embeddings)**

In [157]:
# view obsm

In [158]:
# check whether all columns are prefixed with X

In [159]:
adata.obsm

AxisArrays with keys: X_original_umap, X_pca, X_umap, rep

In [160]:
adata.obsm['X_rep'] = adata.obsm['rep']
del adata.obsm['rep']

#### **uns (Dataset Metadata)**

In [161]:
# View

In [162]:
adata.uns

{'batch_colors': array(['#1f77b4', '#ff7f0e', '#279e68'], dtype=object),
 'cellstate_colors': array(['#1f77b4', '#1f77b4', '#aec7e8', '#ff7f0e', '#ffbb78', '#2ca02c',
        '#98df8a', '#d62728', '#ff9896', '#9467bd', '#c5b0d5', '#8c564b',
        '#c49c94', '#e377c2', '#f7b6d2', '#7f7f7f', '#c7c7c7', '#bcbd22',
        '#dbdb8d', '#17becf', '#9edae5', '#9edae5'], dtype=object),
 'dendrogram_leiden_R': {'categories_idx_ordered': array([ 1, 16, 11, 10, 13, 22, 12, 21, 28,  3,  0, 14, 29,  4, 15, 18, 19,
         20,  6,  9, 24, 27, 25, 26,  5,  8,  2,  7, 17, 23]),
  'categories_ordered': array(['1', '12,1', '10,1', '10,0', '10,3', '16', '10,2', '15', '19', '3',
         '0', '11', '20', '4', '12,0', '13', '14,0', '14,1', '6', '9',
         '17,1', '18,2', '18,0', '18,1', '5', '8', '2', '7', '12,2', '17,0'],
        dtype=object),
  'cor_method': 'pearson',
  'correlation_matrix': array([[ 1.00000000e+00, -3.38644673e-01, -4.32086115e-01,
           5.87701594e-01,  5.79391509e-01, -4.

In [163]:
adata.uns.keys

<function dict.keys>

In [164]:
# Give a title for the dataset

In [165]:
adata.uns['title'] = 'organoid'

In [166]:
# Set the default embedding

In [167]:
adata.uns['default_embedding'] = 'X_umap'

In [168]:
del adata.uns['majority_voting_colors']
del adata.uns['new_celltype_colors']
del adata.uns['predicted_labels_colors']

### **Final check**

In [169]:
# view anndata object

In [170]:
adata

AnnData object with n_obs × n_vars = 8556 × 33137
    obs: 'batch', 'percent_mito', 'n_counts', 'n_genes', 'doublet_scores', 'bh_pval', 'treatment', 'S_score', 'G2M_score', 'phase', 'leiden', 'leiden_R', 'cellstate', 'He2022Predicted_celltype', 'barcodes', 'assay', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'donor_id', 'disease_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id', 'suspension_type', 'tissue_type', 'tissue_ontology_term_id'
    var: 'gene_ids_x', 'feature_is_filtered', 'gene_ids_y', 'n_cells', 'highly_variableHCA_LNG13237417', 'highly_variableHCA_LNG13237416', 'highly_variableHCA_LNG13237415', 'highly_variable_n', 'highly_variable', 'Deep_HCA_LNG13237417', 'Deep_HCA_LNG13237416', 'Deep_HCA_LNG13237415', 'Deep_n'
    uns: 'batch_colors', 'cellstate_colors', 'dendrogram_leiden_R', 'leiden', 'leiden_R_colors', 'leiden_colors', 'log1p', 'neighb

In [171]:
# view obs and var data types

In [172]:
adata.obs.dtypes

batch                                       category
percent_mito                                 float32
n_counts                                     float32
n_genes                                        int64
doublet_scores                               float64
bh_pval                                      float64
treatment                                   category
S_score                                      float64
G2M_score                                    float64
phase                                       category
leiden                                      category
leiden_R                                    category
cellstate                                   category
He2022Predicted_celltype                    category
barcodes                                      object
assay                                         object
assay_ontology_term_id                      category
cell_type_ontology_term_id                  category
development_stage_ontology_term_id          ca

In [173]:
dty = pd.DataFrame(adata.var.dtypes, columns = ['dtype'])
for c in dty[dty['dtype'] == 'float64'].index.values:
    adata.var[c] = adata.var[c].astype('float32')
    print(f"changed {c} from float64 to float32")
for c in dty[dty['dtype'] == 'int64'].index.values:
    adata.var[c] = adata.var[c].astype('int32') 
    print(f"changed {c} from int64 to int32")

changed n_cells from float64 to float32
changed highly_variable_n from float64 to float32
changed Deep_n from float64 to float32


In [174]:
dty = pd.DataFrame(adata.obs.dtypes, columns = ['dtype'])
for c in dty[dty['dtype'] == 'float64'].index.values:
    adata.obs[c] = adata.obs[c].astype('float32')
    print(f"changed {c} from float64 to float32")
for c in dty[dty['dtype'] == 'int64'].index.values:
    adata.obs[c] = adata.obs[c].astype('int32') 
    print(f"changed {c} from int64 to int32")
for c in dty[dty['dtype'] == 'object'].index.values:
    adata.obs[c] = adata.obs[c].astype('category') 
    print(f"changed {c} from object to category")

changed doublet_scores from float64 to float32
changed bh_pval from float64 to float32
changed S_score from float64 to float32
changed G2M_score from float64 to float32
changed n_genes from int64 to int32
changed barcodes from object to category
changed assay from object to category


In [175]:
# view obs

In [176]:
adata.obs

Unnamed: 0,batch,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,treatment,S_score,G2M_score,phase,...,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_type,tissue_ontology_term_id
AAACCTGAGGTGCTTT-HCA_LNG13237415,HCA_LNG13237415,0.050155,30035.0,6351,0.122970,0.895578,control,0.356778,-0.078414,S,...,unknown,HCA_LNG13237415,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
AAACCTGCATTGGGCC-HCA_LNG13237415,HCA_LNG13237415,0.041156,32778.0,5353,0.223022,0.895578,control,-0.080427,-0.183847,G1,...,unknown,HCA_LNG13237415,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
AAACCTGGTGACTACT-HCA_LNG13237415,HCA_LNG13237415,0.035533,39028.0,5603,0.061350,0.498832,control,-0.046427,-0.163270,G1,...,unknown,HCA_LNG13237415,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
AAACCTGTCGAGAGCA-HCA_LNG13237415,HCA_LNG13237415,0.037002,23449.0,5356,0.206780,0.131299,control,-0.044377,-0.190089,G1,...,unknown,HCA_LNG13237415,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
AAACGGGAGCCCGAAA-HCA_LNG13237415,HCA_LNG13237415,0.021348,23241.0,4877,0.147368,0.874995,control,-0.076444,-0.200009,G1,...,unknown,HCA_LNG13237415,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCACACAGGCCT-HCA_LNG13237417,HCA_LNG13237417,0.143310,2971.0,1941,0.042735,0.995378,IL-13,-0.087941,0.216716,G2M,...,unknown,HCA_LNG13237417,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
TTTGTCACACATCCAA-HCA_LNG13237417,HCA_LNG13237417,0.029506,19965.0,4704,0.203931,0.829519,IL-13,0.105654,-0.138075,S,...,unknown,HCA_LNG13237417,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
TTTGTCACACGGCTAC-HCA_LNG13237417,HCA_LNG13237417,0.027215,20696.0,4642,0.312500,0.212251,IL-13,-0.032795,-0.188867,G1,...,unknown,HCA_LNG13237417,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
TTTGTCAGTCGCGAAA-HCA_LNG13237417,HCA_LNG13237417,0.056163,19074.0,5168,0.132143,0.995378,IL-13,0.337020,-0.080434,S,...,unknown,HCA_LNG13237417,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048


In [177]:
adata.obs.columns

Index(['batch', 'percent_mito', 'n_counts', 'n_genes', 'doublet_scores',
       'bh_pval', 'treatment', 'S_score', 'G2M_score', 'phase', 'leiden',
       'leiden_R', 'cellstate', 'He2022Predicted_celltype', 'barcodes',
       'assay', 'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'donor_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'suspension_type', 'tissue_type',
       'tissue_ontology_term_id'],
      dtype='object')

In [178]:
# delete unwanted columns in obs

In [179]:
del adata.obs['barcodes']
del adata.obs['assay']

In [180]:
# view obs

In [181]:
adata.obs

Unnamed: 0,batch,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,treatment,S_score,G2M_score,phase,...,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_type,tissue_ontology_term_id
AAACCTGAGGTGCTTT-HCA_LNG13237415,HCA_LNG13237415,0.050155,30035.0,6351,0.122970,0.895578,control,0.356778,-0.078414,S,...,unknown,HCA_LNG13237415,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
AAACCTGCATTGGGCC-HCA_LNG13237415,HCA_LNG13237415,0.041156,32778.0,5353,0.223022,0.895578,control,-0.080427,-0.183847,G1,...,unknown,HCA_LNG13237415,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
AAACCTGGTGACTACT-HCA_LNG13237415,HCA_LNG13237415,0.035533,39028.0,5603,0.061350,0.498832,control,-0.046427,-0.163270,G1,...,unknown,HCA_LNG13237415,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
AAACCTGTCGAGAGCA-HCA_LNG13237415,HCA_LNG13237415,0.037002,23449.0,5356,0.206780,0.131299,control,-0.044377,-0.190089,G1,...,unknown,HCA_LNG13237415,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
AAACGGGAGCCCGAAA-HCA_LNG13237415,HCA_LNG13237415,0.021348,23241.0,4877,0.147368,0.874995,control,-0.076444,-0.200009,G1,...,unknown,HCA_LNG13237415,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCACACAGGCCT-HCA_LNG13237417,HCA_LNG13237417,0.143310,2971.0,1941,0.042735,0.995378,IL-13,-0.087941,0.216716,G2M,...,unknown,HCA_LNG13237417,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
TTTGTCACACATCCAA-HCA_LNG13237417,HCA_LNG13237417,0.029506,19965.0,4704,0.203931,0.829519,IL-13,0.105654,-0.138075,S,...,unknown,HCA_LNG13237417,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
TTTGTCACACGGCTAC-HCA_LNG13237417,HCA_LNG13237417,0.027215,20696.0,4642,0.312500,0.212251,IL-13,-0.032795,-0.188867,G1,...,unknown,HCA_LNG13237417,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
TTTGTCAGTCGCGAAA-HCA_LNG13237417,HCA_LNG13237417,0.056163,19074.0,5168,0.132143,0.995378,IL-13,0.337020,-0.080434,S,...,unknown,HCA_LNG13237417,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048


In [182]:
# view var

In [183]:
adata.var

Unnamed: 0,gene_ids_x,feature_is_filtered,gene_ids_y,n_cells,highly_variableHCA_LNG13237417,highly_variableHCA_LNG13237416,highly_variableHCA_LNG13237415,highly_variable_n,highly_variable,Deep_HCA_LNG13237417,Deep_HCA_LNG13237416,Deep_HCA_LNG13237415,Deep_n
ENSG00000243485,ENSG00000243485,False,ENSG00000243485,42.0,False,False,False,0.0,False,False,False,False,0.0
ENSG00000237613,ENSG00000237613,True,,,,,,,,,,,
ENSG00000186092,ENSG00000186092,True,,,,,,,,,,,
ENSG00000238009,ENSG00000238009,False,ENSG00000238009,26.0,False,False,False,0.0,False,False,False,False,0.0
ENSG00000239945,ENSG00000239945,True,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000277856,ENSG00000277856,True,,,,,,,,,,,
ENSG00000275063,ENSG00000275063,True,,,,,,,,,,,
ENSG00000271254,ENSG00000271254,False,ENSG00000271254,1522.0,True,True,True,3.0,True,False,False,False,0.0
ENSG00000277475,ENSG00000277475,True,,,,,,,,,,,


In [184]:
araw.var

Unnamed: 0_level_0,gene_ids
gene_ids,Unnamed: 1_level_1
ENSG00000243485,ENSG00000243485
ENSG00000237613,ENSG00000237613
ENSG00000186092,ENSG00000186092
ENSG00000238009,ENSG00000238009
ENSG00000239945,ENSG00000239945
...,...
ENSG00000277856,ENSG00000277856
ENSG00000275063,ENSG00000275063
ENSG00000271254,ENSG00000271254
ENSG00000277475,ENSG00000277475


In [185]:
del adata.var['highly_variableHCA_LNG13237417']
del adata.var['highly_variableHCA_LNG13237416']
del adata.var['highly_variableHCA_LNG13237415']
del adata.var['highly_variable_n']
del adata.var['highly_variable']
del adata.var['Deep_HCA_LNG13237417']
del adata.var['Deep_HCA_LNG13237416']
del adata.var['Deep_HCA_LNG13237415']
del adata.var['Deep_n']
del adata.var['gene_ids_y']
del adata.var['gene_ids_x']
del adata.var['n_cells']
del araw.var['gene_ids']

In [186]:
#view uns

In [187]:
adata.uns

{'batch_colors': array(['#1f77b4', '#ff7f0e', '#279e68'], dtype=object),
 'cellstate_colors': array(['#1f77b4', '#1f77b4', '#aec7e8', '#ff7f0e', '#ffbb78', '#2ca02c',
        '#98df8a', '#d62728', '#ff9896', '#9467bd', '#c5b0d5', '#8c564b',
        '#c49c94', '#e377c2', '#f7b6d2', '#7f7f7f', '#c7c7c7', '#bcbd22',
        '#dbdb8d', '#17becf', '#9edae5', '#9edae5'], dtype=object),
 'dendrogram_leiden_R': {'categories_idx_ordered': array([ 1, 16, 11, 10, 13, 22, 12, 21, 28,  3,  0, 14, 29,  4, 15, 18, 19,
         20,  6,  9, 24, 27, 25, 26,  5,  8,  2,  7, 17, 23]),
  'categories_ordered': array(['1', '12,1', '10,1', '10,0', '10,3', '16', '10,2', '15', '19', '3',
         '0', '11', '20', '4', '12,0', '13', '14,0', '14,1', '6', '9',
         '17,1', '18,2', '18,0', '18,1', '5', '8', '2', '7', '12,2', '17,0'],
        dtype=object),
  'cor_method': 'pearson',
  'correlation_matrix': array([[ 1.00000000e+00, -3.38644673e-01, -4.32086115e-01,
           5.87701594e-01,  5.79391509e-01, -4.

In [188]:
list(adata.uns.keys())

['batch_colors',
 'cellstate_colors',
 'dendrogram_leiden_R',
 'leiden',
 'leiden_R_colors',
 'leiden_colors',
 'log1p',
 'neighbors',
 'pca',
 'phase_colors',
 'rank_genes_groups',
 'treatment_colors',
 'umap',
 'title',
 'default_embedding']

In [189]:
adata.obs.columns

Index(['batch', 'percent_mito', 'n_counts', 'n_genes', 'doublet_scores',
       'bh_pval', 'treatment', 'S_score', 'G2M_score', 'phase', 'leiden',
       'leiden_R', 'cellstate', 'He2022Predicted_celltype',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'donor_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'suspension_type', 'tissue_type',
       'tissue_ontology_term_id'],
      dtype='object')

In [190]:
# Remove unwanted columns in uns

In [191]:
#check the format of expression matrix

In [192]:
adata.X

<8556x33137 sparse matrix of type '<class 'numpy.float32'>'
	with 38973678 stored elements in Compressed Sparse Row format>

In [193]:
araw.X

<8556x33137 sparse matrix of type '<class 'numpy.float32'>'
	with 39133350 stored elements in Compressed Sparse Row format>

In [194]:
#Copy raw counts to adata.raw

In [195]:
adata.raw = araw

In [196]:
obs_dtype = adata.obs.dtypes

In [197]:
obs_dtype

batch                                       category
percent_mito                                 float32
n_counts                                     float32
n_genes                                        int32
doublet_scores                               float32
bh_pval                                      float32
treatment                                   category
S_score                                      float32
G2M_score                                    float32
phase                                       category
leiden                                      category
leiden_R                                    category
cellstate                                   category
He2022Predicted_celltype                    category
assay_ontology_term_id                      category
cell_type_ontology_term_id                  category
development_stage_ontology_term_id          category
donor_id                                    category
disease_ontology_term_id                    ca

In [198]:
adata.write('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/early_human_lung_immune/Final_objects/to_upload/scrnaseq/organoid.h5ad', compression = 'gzip')

In [199]:
adata.obs

Unnamed: 0,batch,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,treatment,S_score,G2M_score,phase,...,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_type,tissue_ontology_term_id
AAACCTGAGGTGCTTT-HCA_LNG13237415,HCA_LNG13237415,0.050155,30035.0,6351,0.122970,0.895578,control,0.356778,-0.078414,S,...,unknown,HCA_LNG13237415,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
AAACCTGCATTGGGCC-HCA_LNG13237415,HCA_LNG13237415,0.041156,32778.0,5353,0.223022,0.895578,control,-0.080427,-0.183847,G1,...,unknown,HCA_LNG13237415,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
AAACCTGGTGACTACT-HCA_LNG13237415,HCA_LNG13237415,0.035533,39028.0,5603,0.061350,0.498832,control,-0.046427,-0.163270,G1,...,unknown,HCA_LNG13237415,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
AAACCTGTCGAGAGCA-HCA_LNG13237415,HCA_LNG13237415,0.037002,23449.0,5356,0.206780,0.131299,control,-0.044377,-0.190089,G1,...,unknown,HCA_LNG13237415,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
AAACGGGAGCCCGAAA-HCA_LNG13237415,HCA_LNG13237415,0.021348,23241.0,4877,0.147368,0.874995,control,-0.076444,-0.200009,G1,...,unknown,HCA_LNG13237415,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCACACAGGCCT-HCA_LNG13237417,HCA_LNG13237417,0.143310,2971.0,1941,0.042735,0.995378,IL-13,-0.087941,0.216716,G2M,...,unknown,HCA_LNG13237417,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
TTTGTCACACATCCAA-HCA_LNG13237417,HCA_LNG13237417,0.029506,19965.0,4704,0.203931,0.829519,IL-13,0.105654,-0.138075,S,...,unknown,HCA_LNG13237417,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
TTTGTCACACGGCTAC-HCA_LNG13237417,HCA_LNG13237417,0.027215,20696.0,4642,0.312500,0.212251,IL-13,-0.032795,-0.188867,G1,...,unknown,HCA_LNG13237417,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
TTTGTCAGTCGCGAAA-HCA_LNG13237417,HCA_LNG13237417,0.056163,19074.0,5168,0.132143,0.995378,IL-13,0.337020,-0.080434,S,...,unknown,HCA_LNG13237417,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048


In [200]:
adata.obs.columns

Index(['batch', 'percent_mito', 'n_counts', 'n_genes', 'doublet_scores',
       'bh_pval', 'treatment', 'S_score', 'G2M_score', 'phase', 'leiden',
       'leiden_R', 'cellstate', 'He2022Predicted_celltype',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'donor_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'suspension_type', 'tissue_type',
       'tissue_ontology_term_id'],
      dtype='object')

In [201]:
adata.raw.var

ENSG00000243485
ENSG00000237613
ENSG00000186092
ENSG00000238009
ENSG00000239945
...
ENSG00000277856
ENSG00000275063
ENSG00000271254
ENSG00000277475
ENSG00000268674


In [202]:
adata.var

Unnamed: 0,feature_is_filtered
ENSG00000243485,False
ENSG00000237613,True
ENSG00000186092,True
ENSG00000238009,False
ENSG00000239945,True
...,...
ENSG00000277856,True
ENSG00000275063,True
ENSG00000271254,False
ENSG00000277475,True
