### **Curating 240314KT_AT2_organoid_filtered.h5ad**

Article: Early human lung immune cell development and its role in epithelial cell fate

DOI: 10.1126/sciimmunol.adf99

Data Source : https://fetal-lung-immune.cellgeni.sanger.ac.uk

##### **Mount farm**

mount-farm

##### **Packages required for curation**

In [1]:
#Import all packages required for curation

In [2]:
import numpy as np
import pandas as pd
import scanpy as sc
import scipy
from tqdm import tqdm
from scipy import sparse
from scipy.sparse import csr_matrix
import anndata as ad
import os
import subprocess
import math
import re
from collections import Counter

### **Curation Schema**

##### **X (Matrix Layers)**

##### **AnnData object**

In [3]:
# Load the AnnData object

In [4]:
adata = sc.read_h5ad('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/lung_organoid_2024/Data/240314KT_AT2_organoid_filtered.h5ad')

In [5]:
# View the AnnData object

In [6]:
adata

AnnData object with n_obs × n_vars = 9619 × 21868
    obs: 'Souporcell4_status', 'Souporcell4_assignment', 'percent_mito', 'n_counts', 'n_genes', 'batch', 'doublet_scores', 'bh_pval', 'S_score', 'G2M_score', 'phase', 'leiden', 'predicted_labels', 'over_clustering', 'majority_voting', 'conf_score', 'leiden_R', 'celltype', 'donor'
    var: 'gene_ids', 'feature_types', 'n_cells', 'highly_variablefirst', 'highly_variable_n'
    uns: 'Souporcell4_assignment_colors', 'batch_colors', 'celltype_colors', 'leiden', 'leiden_R_colors', 'log1p', 'majority_voting_colors', 'neighbors', 'pca', 'predicted_labels_colors', 'umap'
    obsm: 'X_pca', 'X_umap'
    obsp: 'connectivities', 'distances'

##### **X - expression matrix**

In [7]:
# View the expression matrix of the anndata object.

In [8]:
adata.X

<9619x21868 sparse matrix of type '<class 'numpy.float32'>'
	with 37008792 stored elements in Compressed Sparse Row format>

In [9]:
# Print the matrix to check whether they are normalized counts or raw counts. if the matrix has floating numbers,they are normalized counts.if they are integers, they are raw counts.

In [10]:
print(adata.X)

  (0, 30)	0.619026
  (0, 39)	0.619026
  (0, 45)	0.9985108
  (0, 48)	0.619026
  (0, 50)	0.619026
  (0, 51)	0.619026
  (0, 59)	1.2729452
  (0, 64)	0.619026
  (0, 78)	0.619026
  (0, 87)	0.619026
  (0, 104)	0.619026
  (0, 105)	0.619026
  (0, 108)	0.619026
  (0, 119)	2.745411
  (0, 122)	0.619026
  (0, 126)	0.619026
  (0, 145)	0.9985108
  (0, 152)	1.2729452
  (0, 155)	0.9985108
  (0, 195)	0.619026
  (0, 204)	0.619026
  (0, 215)	0.619026
  (0, 223)	0.619026
  (0, 224)	1.2729452
  (0, 244)	0.619026
  :	:
  (9618, 21725)	0.5632831
  (9618, 21735)	0.32078904
  (9618, 21744)	0.32078904
  (9618, 21751)	0.32078904
  (9618, 21760)	0.32078904
  (9618, 21761)	0.92142123
  (9618, 21763)	0.5632831
  (9618, 21773)	0.32078904
  (9618, 21788)	1.482498
  (9618, 21793)	0.32078904
  (9618, 21795)	0.32078904
  (9618, 21796)	0.32078904
  (9618, 21801)	4.7566504
  (9618, 21806)	0.32078904
  (9618, 21807)	0.5632831
  (9618, 21808)	0.32078904
  (9618, 21810)	0.32078904
  (9618, 21813)	0.32078904
  (9618, 21815)	0.

In [11]:
adata.layers.keys()

KeysView(Layers with keys: )

In [12]:
adata.obs

Unnamed: 0,Souporcell4_status,Souporcell4_assignment,percent_mito,n_counts,n_genes,batch,doublet_scores,bh_pval,S_score,G2M_score,phase,leiden,predicted_labels,over_clustering,majority_voting,conf_score,leiden_R,celltype,donor
AAACCTGAGCAGCGTA,singlet,3,0.025313,11667.0,3050,first,0.109792,0.792204,-0.154529,-0.085028,G1,4,Pulmonary neuroendocrine,19,Pulmonary neuroendocrine,0.670715,4,Intermediate,HDBR16402
AAACCTGAGTACGTAA,singlet,3,0.027868,22709.0,4423,first,0.166667,0.792204,-0.097155,-0.174267,G1,7,Pulmonary neuroendocrine,46,Pulmonary neuroendocrine,0.755056,7,Differentiating basal-like,HDBR16402
AAACCTGCACCATGTA,singlet,0,0.005957,16519.0,3771,first,0.334873,0.792204,-0.070328,-0.124150,G1,0,Late tip,53,Late tip,0.979895,0,AT2-like,HDBR15934
AAACCTGCACCTATCC,singlet,0,0.017658,11905.0,3015,first,0.121951,0.792204,-0.082849,-0.208696,G1,0,Late tip,83,Late tip,0.642230,0,AT2-like,HDBR15934
AAACCTGCACTTAACG,singlet,2,0.038610,13446.0,3568,first,0.125270,0.792204,-0.119884,-0.196853,G1,7,Late tip,20,Late tip,0.863851,7,Differentiating basal-like,HDBR16392
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCATCCACGAAT,singlet,2,0.103709,2489.0,1326,first,0.083472,0.792204,0.294187,0.429153,G2M,5,Late tip,36,Late tip,0.235389,5,Cycling AT2,HDBR16392
TTTGTCATCGAGAGCA,singlet,2,0.016931,15851.0,3354,first,0.177285,0.792204,-0.127767,-0.152186,G1,6,Late tip,85,Mid fibro,0.735850,6,AT2-like,HDBR16392
TTTGTCATCGCCTGAG,singlet,3,0.026760,15093.0,3791,first,0.195231,0.792204,0.478485,0.114177,S,5,Late tip,31,Late tip,0.998556,5,Cycling AT2,HDBR16402
TTTGTCATCTATCGCC,doublet,doublets,0.015649,49251.0,6942,first,0.232082,0.792204,0.037060,-0.000052,S,3,Late tip,18,Mid fibro,0.996191,30,Differentiating pulmonary NE,doublets


In [13]:
adata.obs.columns

Index(['Souporcell4_status', 'Souporcell4_assignment', 'percent_mito',
       'n_counts', 'n_genes', 'batch', 'doublet_scores', 'bh_pval', 'S_score',
       'G2M_score', 'phase', 'leiden', 'predicted_labels', 'over_clustering',
       'majority_voting', 'conf_score', 'leiden_R', 'celltype', 'donor'],
      dtype='object')

##### **Raw counts matrix**

In [14]:
araw = sc.read_h5ad('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/lung_organoid_2024/Data/Lim_scRNA-seq_raw.h5ad')

In [15]:
araw.obs

Unnamed: 0,Souporcell4_status,Souporcell4_assignment,percent_mito,n_counts
AAACCTGAGCAGCGTA,singlet,3,0.025313,11970.0
AAACCTGAGTACGTAA,singlet,3,0.027868,23360.0
AAACCTGCACCATGTA,singlet,0,0.005957,16619.0
AAACCTGCACCTATCC,singlet,0,0.017658,12119.0
AAACCTGCACTTAACG,singlet,2,0.038610,13986.0
...,...,...,...,...
TTTGTCATCCACGAAT,singlet,2,0.103709,2777.0
TTTGTCATCGAGAGCA,singlet,2,0.016931,16124.0
TTTGTCATCGCCTGAG,singlet,3,0.026760,15508.0
TTTGTCATCTATCGCC,doublet,1/0,0.015649,50036.0


In [16]:
adata.obs

Unnamed: 0,Souporcell4_status,Souporcell4_assignment,percent_mito,n_counts,n_genes,batch,doublet_scores,bh_pval,S_score,G2M_score,phase,leiden,predicted_labels,over_clustering,majority_voting,conf_score,leiden_R,celltype,donor
AAACCTGAGCAGCGTA,singlet,3,0.025313,11667.0,3050,first,0.109792,0.792204,-0.154529,-0.085028,G1,4,Pulmonary neuroendocrine,19,Pulmonary neuroendocrine,0.670715,4,Intermediate,HDBR16402
AAACCTGAGTACGTAA,singlet,3,0.027868,22709.0,4423,first,0.166667,0.792204,-0.097155,-0.174267,G1,7,Pulmonary neuroendocrine,46,Pulmonary neuroendocrine,0.755056,7,Differentiating basal-like,HDBR16402
AAACCTGCACCATGTA,singlet,0,0.005957,16519.0,3771,first,0.334873,0.792204,-0.070328,-0.124150,G1,0,Late tip,53,Late tip,0.979895,0,AT2-like,HDBR15934
AAACCTGCACCTATCC,singlet,0,0.017658,11905.0,3015,first,0.121951,0.792204,-0.082849,-0.208696,G1,0,Late tip,83,Late tip,0.642230,0,AT2-like,HDBR15934
AAACCTGCACTTAACG,singlet,2,0.038610,13446.0,3568,first,0.125270,0.792204,-0.119884,-0.196853,G1,7,Late tip,20,Late tip,0.863851,7,Differentiating basal-like,HDBR16392
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCATCCACGAAT,singlet,2,0.103709,2489.0,1326,first,0.083472,0.792204,0.294187,0.429153,G2M,5,Late tip,36,Late tip,0.235389,5,Cycling AT2,HDBR16392
TTTGTCATCGAGAGCA,singlet,2,0.016931,15851.0,3354,first,0.177285,0.792204,-0.127767,-0.152186,G1,6,Late tip,85,Mid fibro,0.735850,6,AT2-like,HDBR16392
TTTGTCATCGCCTGAG,singlet,3,0.026760,15093.0,3791,first,0.195231,0.792204,0.478485,0.114177,S,5,Late tip,31,Late tip,0.998556,5,Cycling AT2,HDBR16402
TTTGTCATCTATCGCC,doublet,doublets,0.015649,49251.0,6942,first,0.232082,0.792204,0.037060,-0.000052,S,3,Late tip,18,Mid fibro,0.996191,30,Differentiating pulmonary NE,doublets


In [17]:
common_barcodes =  set(adata.obs_names) & set(araw.obs_names)

In [18]:
araw_filtered = araw[araw.obs_names.isin(common_barcodes)].copy()


In [19]:
araw=araw_filtered

In [20]:
araw

AnnData object with n_obs × n_vars = 9619 × 36601
    obs: 'Souporcell4_status', 'Souporcell4_assignment', 'percent_mito', 'n_counts'
    var: 'gene_ids', 'feature_types'

In [21]:
print(araw.X)

  (0, 44)	1.0
  (0, 53)	1.0
  (0, 59)	2.0
  (0, 62)	1.0
  (0, 64)	1.0
  (0, 65)	1.0
  (0, 73)	3.0
  (0, 78)	1.0
  (0, 94)	1.0
  (0, 104)	1.0
  (0, 136)	1.0
  (0, 137)	1.0
  (0, 142)	1.0
  (0, 170)	17.0
  (0, 173)	1.0
  (0, 177)	1.0
  (0, 208)	2.0
  (0, 216)	3.0
  (0, 219)	2.0
  (0, 264)	1.0
  (0, 280)	1.0
  (0, 293)	1.0
  (0, 338)	1.0
  (0, 339)	3.0
  (0, 370)	1.0
  :	:
  (9618, 36386)	1.0
  (9618, 36389)	1.0
  (9618, 36390)	1.0
  (9618, 36401)	305.0
  (9618, 36406)	1.0
  (9618, 36407)	2.0
  (9618, 36408)	1.0
  (9618, 36410)	1.0
  (9618, 36413)	1.0
  (9618, 36415)	3.0
  (9618, 36416)	1.0
  (9618, 36423)	1.0
  (9618, 36429)	1.0
  (9618, 36432)	1.0
  (9618, 36450)	3.0
  (9618, 36559)	2.0
  (9618, 36561)	14.0
  (9618, 36562)	12.0
  (9618, 36563)	1.0
  (9618, 36564)	3.0
  (9618, 36565)	6.0
  (9618, 36566)	2.0
  (9618, 36568)	3.0
  (9618, 36571)	3.0
  (9618, 36600)	2.0


##### **Variables(var)**

In [22]:
# View the var of anndata and raw object

In [23]:
adata.var

Unnamed: 0,gene_ids,feature_types,n_cells,highly_variablefirst,highly_variable_n
MIR1302-2HG,ENSG00000243485,Gene Expression,9,False,0
AL627309.1,ENSG00000238009,Gene Expression,18,False,0
AL627309.5,ENSG00000241860,Gene Expression,141,True,1
AP006222.2,ENSG00000286448,Gene Expression,7,False,0
LINC01409,ENSG00000237491,Gene Expression,1097,False,0
...,...,...,...,...,...
AL354822.1,ENSG00000278384,Gene Expression,318,False,0
AL592183.1,ENSG00000273748,Gene Expression,182,False,0
AC240274.1,ENSG00000271254,Gene Expression,607,False,0
AC007325.4,ENSG00000278817,Gene Expression,224,False,0


In [24]:
araw.var

Unnamed: 0,gene_ids,feature_types
MIR1302-2HG,ENSG00000243485,Gene Expression
FAM138A,ENSG00000237613,Gene Expression
OR4F5,ENSG00000186092,Gene Expression
AL627309.1,ENSG00000238009,Gene Expression
AL627309.3,ENSG00000239945,Gene Expression
...,...,...
AC141272.1,ENSG00000277836,Gene Expression
AC023491.2,ENSG00000278633,Gene Expression
AC007325.1,ENSG00000276017,Gene Expression
AC007325.4,ENSG00000278817,Gene Expression


In [25]:
adata.var_names = adata.var['gene_ids']
araw.var_names = araw.var['gene_ids']

In [26]:
# Load the approved genes file.

In [27]:
approved_genes = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/Endometrium_reference_integrated_atlas/genes_approved.csv')

In [28]:
#Create a dictionary from the approved genes file 

In [29]:
genedict = {key: 1 for key in list(approved_genes.feature_id)}

In [30]:
genedict

{'ERCC-00002': 1,
 'ERCC-00003': 1,
 'ERCC-00004': 1,
 'ERCC-00009': 1,
 'ERCC-00012': 1,
 'ERCC-00013': 1,
 'ERCC-00014': 1,
 'ERCC-00016': 1,
 'ERCC-00017': 1,
 'ERCC-00019': 1,
 'ERCC-00022': 1,
 'ERCC-00024': 1,
 'ERCC-00025': 1,
 'ERCC-00028': 1,
 'ERCC-00031': 1,
 'ERCC-00033': 1,
 'ERCC-00034': 1,
 'ERCC-00035': 1,
 'ERCC-00039': 1,
 'ERCC-00040': 1,
 'ERCC-00041': 1,
 'ERCC-00042': 1,
 'ERCC-00043': 1,
 'ERCC-00044': 1,
 'ERCC-00046': 1,
 'ERCC-00048': 1,
 'ERCC-00051': 1,
 'ERCC-00053': 1,
 'ERCC-00054': 1,
 'ERCC-00057': 1,
 'ERCC-00058': 1,
 'ERCC-00059': 1,
 'ERCC-00060': 1,
 'ERCC-00061': 1,
 'ERCC-00062': 1,
 'ERCC-00067': 1,
 'ERCC-00069': 1,
 'ERCC-00071': 1,
 'ERCC-00073': 1,
 'ERCC-00074': 1,
 'ERCC-00075': 1,
 'ERCC-00076': 1,
 'ERCC-00077': 1,
 'ERCC-00078': 1,
 'ERCC-00079': 1,
 'ERCC-00081': 1,
 'ERCC-00083': 1,
 'ERCC-00084': 1,
 'ERCC-00085': 1,
 'ERCC-00086': 1,
 'ERCC-00092': 1,
 'ERCC-00095': 1,
 'ERCC-00096': 1,
 'ERCC-00097': 1,
 'ERCC-00098': 1,
 'ERCC-000

In [31]:
len(genedict)

119799

In [32]:
#Filter out the genes which are not in the approved genes file.

In [33]:
var_to_keep_adata = [x for x in adata.var_names if (x in genedict)]
var_to_keep_araw = [x for x in araw.var_names if (x in genedict)]

In [34]:
len(var_to_keep_adata)

21760

In [35]:
len(var_to_keep_araw)

36390

In [36]:
adata.var

Unnamed: 0_level_0,gene_ids,feature_types,n_cells,highly_variablefirst,highly_variable_n
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSG00000243485,ENSG00000243485,Gene Expression,9,False,0
ENSG00000238009,ENSG00000238009,Gene Expression,18,False,0
ENSG00000241860,ENSG00000241860,Gene Expression,141,True,1
ENSG00000286448,ENSG00000286448,Gene Expression,7,False,0
ENSG00000237491,ENSG00000237491,Gene Expression,1097,False,0
...,...,...,...,...,...
ENSG00000278384,ENSG00000278384,Gene Expression,318,False,0
ENSG00000273748,ENSG00000273748,Gene Expression,182,False,0
ENSG00000271254,ENSG00000271254,Gene Expression,607,False,0
ENSG00000278817,ENSG00000278817,Gene Expression,224,False,0


In [37]:
araw.var

Unnamed: 0_level_0,gene_ids,feature_types
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000243485,ENSG00000243485,Gene Expression
ENSG00000237613,ENSG00000237613,Gene Expression
ENSG00000186092,ENSG00000186092,Gene Expression
ENSG00000238009,ENSG00000238009,Gene Expression
ENSG00000239945,ENSG00000239945,Gene Expression
...,...,...
ENSG00000277836,ENSG00000277836,Gene Expression
ENSG00000278633,ENSG00000278633,Gene Expression
ENSG00000276017,ENSG00000276017,Gene Expression
ENSG00000278817,ENSG00000278817,Gene Expression


In [38]:
# Modify the anndata object by filtering out the filtered genes.

In [39]:
adata = adata[:, adata.var.index.isin(var_to_keep_adata)]
araw = araw[:, araw.var.index.isin(var_to_keep_araw)]

In [40]:
def add_zero():
	global adata
	global araw
	if araw.shape[1] > adata.shape[1]:
		genes_add = [x for x in araw.var.index.to_list() if x not in adata.var.index.to_list()]
		new_matrix = sparse.csr_matrix((adata.X.data, adata.X.indices, adata.X.indptr), shape = araw.shape)
		all_genes = adata.var.index.to_list()
		all_genes.extend(genes_add)
		new_var = pd.DataFrame(index=all_genes)
		new_var = pd.merge(new_var, araw.var, left_index=True, right_index=True, how='left')
		new_var['feature_is_filtered'] = False
		new_var.loc[genes_add, 'feature_is_filtered'] = True
		new_adata = ad.AnnData(X=new_matrix, obs=adata.obs, var=new_var, uns=adata.uns, obsm=adata.obsm)
		if adata.layers:
			for layer in adata.layers:
				new_layer = sparse.csr_matrix((adata.layers[layer].data, adata.layers[layer].indices, adata.layers[layer].indptr), shape = araw.shape)
				new_adata.layers[layer] = new_layer
		new_adata = new_adata[:,araw.var.index.to_list()]
		new_adata.var = new_adata.var.merge(adata.var, left_index=True, right_index=True, how='left')
		adata = new_adata
	else:
		adata.var['feature_is_filtered'] = False


In [41]:
add_zero()

In [42]:
adata.var

Unnamed: 0,gene_ids_x,feature_types_x,feature_is_filtered,gene_ids_y,feature_types_y,n_cells,highly_variablefirst,highly_variable_n
ENSG00000243485,ENSG00000243485,Gene Expression,False,ENSG00000243485,Gene Expression,9.0,False,0.0
ENSG00000237613,ENSG00000237613,Gene Expression,True,,,,,
ENSG00000186092,ENSG00000186092,Gene Expression,True,,,,,
ENSG00000238009,ENSG00000238009,Gene Expression,False,ENSG00000238009,Gene Expression,18.0,False,0.0
ENSG00000239945,ENSG00000239945,Gene Expression,True,,,,,
...,...,...,...,...,...,...,...,...
ENSG00000277836,ENSG00000277836,Gene Expression,True,,,,,
ENSG00000278633,ENSG00000278633,Gene Expression,True,,,,,
ENSG00000276017,ENSG00000276017,Gene Expression,True,,,,,
ENSG00000278817,ENSG00000278817,Gene Expression,False,ENSG00000278817,Gene Expression,224.0,False,0.0


In [43]:
# View var

In [44]:
adata.var

Unnamed: 0,gene_ids_x,feature_types_x,feature_is_filtered,gene_ids_y,feature_types_y,n_cells,highly_variablefirst,highly_variable_n
ENSG00000243485,ENSG00000243485,Gene Expression,False,ENSG00000243485,Gene Expression,9.0,False,0.0
ENSG00000237613,ENSG00000237613,Gene Expression,True,,,,,
ENSG00000186092,ENSG00000186092,Gene Expression,True,,,,,
ENSG00000238009,ENSG00000238009,Gene Expression,False,ENSG00000238009,Gene Expression,18.0,False,0.0
ENSG00000239945,ENSG00000239945,Gene Expression,True,,,,,
...,...,...,...,...,...,...,...,...
ENSG00000277836,ENSG00000277836,Gene Expression,True,,,,,
ENSG00000278633,ENSG00000278633,Gene Expression,True,,,,,
ENSG00000276017,ENSG00000276017,Gene Expression,True,,,,,
ENSG00000278817,ENSG00000278817,Gene Expression,False,ENSG00000278817,Gene Expression,224.0,False,0.0


In [45]:
araw.var

Unnamed: 0_level_0,gene_ids,feature_types
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000243485,ENSG00000243485,Gene Expression
ENSG00000237613,ENSG00000237613,Gene Expression
ENSG00000186092,ENSG00000186092,Gene Expression
ENSG00000238009,ENSG00000238009,Gene Expression
ENSG00000239945,ENSG00000239945,Gene Expression
...,...,...
ENSG00000277836,ENSG00000277836,Gene Expression
ENSG00000278633,ENSG00000278633,Gene Expression
ENSG00000276017,ENSG00000276017,Gene Expression
ENSG00000278817,ENSG00000278817,Gene Expression


feature is filtered

In [46]:
list(adata.var['feature_is_filtered'].unique())

[False, True]

In [47]:
false_count = (adata.var['feature_is_filtered']== False).sum()

In [48]:
false_count

21760

In [49]:
adata.var

Unnamed: 0,gene_ids_x,feature_types_x,feature_is_filtered,gene_ids_y,feature_types_y,n_cells,highly_variablefirst,highly_variable_n
ENSG00000243485,ENSG00000243485,Gene Expression,False,ENSG00000243485,Gene Expression,9.0,False,0.0
ENSG00000237613,ENSG00000237613,Gene Expression,True,,,,,
ENSG00000186092,ENSG00000186092,Gene Expression,True,,,,,
ENSG00000238009,ENSG00000238009,Gene Expression,False,ENSG00000238009,Gene Expression,18.0,False,0.0
ENSG00000239945,ENSG00000239945,Gene Expression,True,,,,,
...,...,...,...,...,...,...,...,...
ENSG00000277836,ENSG00000277836,Gene Expression,True,,,,,
ENSG00000278633,ENSG00000278633,Gene Expression,True,,,,,
ENSG00000276017,ENSG00000276017,Gene Expression,True,,,,,
ENSG00000278817,ENSG00000278817,Gene Expression,False,ENSG00000278817,Gene Expression,224.0,False,0.0


In [50]:
araw.var

Unnamed: 0_level_0,gene_ids,feature_types
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000243485,ENSG00000243485,Gene Expression
ENSG00000237613,ENSG00000237613,Gene Expression
ENSG00000186092,ENSG00000186092,Gene Expression
ENSG00000238009,ENSG00000238009,Gene Expression
ENSG00000239945,ENSG00000239945,Gene Expression
...,...,...
ENSG00000277836,ENSG00000277836,Gene Expression
ENSG00000278633,ENSG00000278633,Gene Expression
ENSG00000276017,ENSG00000276017,Gene Expression
ENSG00000278817,ENSG00000278817,Gene Expression


In [51]:
del adata.var['gene_ids_x']
del araw.var['gene_ids']

#### **obs (Cell metadata)**

In [52]:
#view obs

In [53]:
adata.obs

Unnamed: 0,Souporcell4_status,Souporcell4_assignment,percent_mito,n_counts,n_genes,batch,doublet_scores,bh_pval,S_score,G2M_score,phase,leiden,predicted_labels,over_clustering,majority_voting,conf_score,leiden_R,celltype,donor
AAACCTGAGCAGCGTA,singlet,3,0.025313,11667.0,3050,first,0.109792,0.792204,-0.154529,-0.085028,G1,4,Pulmonary neuroendocrine,19,Pulmonary neuroendocrine,0.670715,4,Intermediate,HDBR16402
AAACCTGAGTACGTAA,singlet,3,0.027868,22709.0,4423,first,0.166667,0.792204,-0.097155,-0.174267,G1,7,Pulmonary neuroendocrine,46,Pulmonary neuroendocrine,0.755056,7,Differentiating basal-like,HDBR16402
AAACCTGCACCATGTA,singlet,0,0.005957,16519.0,3771,first,0.334873,0.792204,-0.070328,-0.124150,G1,0,Late tip,53,Late tip,0.979895,0,AT2-like,HDBR15934
AAACCTGCACCTATCC,singlet,0,0.017658,11905.0,3015,first,0.121951,0.792204,-0.082849,-0.208696,G1,0,Late tip,83,Late tip,0.642230,0,AT2-like,HDBR15934
AAACCTGCACTTAACG,singlet,2,0.038610,13446.0,3568,first,0.125270,0.792204,-0.119884,-0.196853,G1,7,Late tip,20,Late tip,0.863851,7,Differentiating basal-like,HDBR16392
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCATCCACGAAT,singlet,2,0.103709,2489.0,1326,first,0.083472,0.792204,0.294187,0.429153,G2M,5,Late tip,36,Late tip,0.235389,5,Cycling AT2,HDBR16392
TTTGTCATCGAGAGCA,singlet,2,0.016931,15851.0,3354,first,0.177285,0.792204,-0.127767,-0.152186,G1,6,Late tip,85,Mid fibro,0.735850,6,AT2-like,HDBR16392
TTTGTCATCGCCTGAG,singlet,3,0.026760,15093.0,3791,first,0.195231,0.792204,0.478485,0.114177,S,5,Late tip,31,Late tip,0.998556,5,Cycling AT2,HDBR16402
TTTGTCATCTATCGCC,doublet,doublets,0.015649,49251.0,6942,first,0.232082,0.792204,0.037060,-0.000052,S,3,Late tip,18,Mid fibro,0.996191,30,Differentiating pulmonary NE,doublets


In [54]:
# view the column names in obs

In [55]:
adata.obs.columns

Index(['Souporcell4_status', 'Souporcell4_assignment', 'percent_mito',
       'n_counts', 'n_genes', 'batch', 'doublet_scores', 'bh_pval', 'S_score',
       'G2M_score', 'phase', 'leiden', 'predicted_labels', 'over_clustering',
       'majority_voting', 'conf_score', 'leiden_R', 'celltype', 'donor'],
      dtype='object')

In [56]:
adata.obs

Unnamed: 0,Souporcell4_status,Souporcell4_assignment,percent_mito,n_counts,n_genes,batch,doublet_scores,bh_pval,S_score,G2M_score,phase,leiden,predicted_labels,over_clustering,majority_voting,conf_score,leiden_R,celltype,donor
AAACCTGAGCAGCGTA,singlet,3,0.025313,11667.0,3050,first,0.109792,0.792204,-0.154529,-0.085028,G1,4,Pulmonary neuroendocrine,19,Pulmonary neuroendocrine,0.670715,4,Intermediate,HDBR16402
AAACCTGAGTACGTAA,singlet,3,0.027868,22709.0,4423,first,0.166667,0.792204,-0.097155,-0.174267,G1,7,Pulmonary neuroendocrine,46,Pulmonary neuroendocrine,0.755056,7,Differentiating basal-like,HDBR16402
AAACCTGCACCATGTA,singlet,0,0.005957,16519.0,3771,first,0.334873,0.792204,-0.070328,-0.124150,G1,0,Late tip,53,Late tip,0.979895,0,AT2-like,HDBR15934
AAACCTGCACCTATCC,singlet,0,0.017658,11905.0,3015,first,0.121951,0.792204,-0.082849,-0.208696,G1,0,Late tip,83,Late tip,0.642230,0,AT2-like,HDBR15934
AAACCTGCACTTAACG,singlet,2,0.038610,13446.0,3568,first,0.125270,0.792204,-0.119884,-0.196853,G1,7,Late tip,20,Late tip,0.863851,7,Differentiating basal-like,HDBR16392
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCATCCACGAAT,singlet,2,0.103709,2489.0,1326,first,0.083472,0.792204,0.294187,0.429153,G2M,5,Late tip,36,Late tip,0.235389,5,Cycling AT2,HDBR16392
TTTGTCATCGAGAGCA,singlet,2,0.016931,15851.0,3354,first,0.177285,0.792204,-0.127767,-0.152186,G1,6,Late tip,85,Mid fibro,0.735850,6,AT2-like,HDBR16392
TTTGTCATCGCCTGAG,singlet,3,0.026760,15093.0,3791,first,0.195231,0.792204,0.478485,0.114177,S,5,Late tip,31,Late tip,0.998556,5,Cycling AT2,HDBR16402
TTTGTCATCTATCGCC,doublet,doublets,0.015649,49251.0,6942,first,0.232082,0.792204,0.037060,-0.000052,S,3,Late tip,18,Mid fibro,0.996191,30,Differentiating pulmonary NE,doublets


#### **assay_ontology_term_id**

In [57]:
adata.obs['barcodes'] = adata.obs_names

In [58]:
adata.obs['barcodes'] = adata.obs['barcodes'].str.extract(r'([ATGC]{10,16})', expand=False)

In [59]:
assay_info = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/Megagut/Suppl_info/barcode_assay_summary.csv')

In [60]:
mapping = dict(zip(assay_info['barcode'], assay_info['assay_summary']))

In [61]:
adata.obs['assay'] = adata.obs['barcodes'].map(mapping)

In [62]:
list(adata.obs['assay'].unique())

['3pv2_5pv1_5pv2', '3pv2_5pv1_5pv2,3pv3', '3pv2_5pv1_5pv2,multiome']

In [63]:
mapping= {'3pv2_5pv1_5pv2':'EFO:0030004', '3pv2_5pv1_5pv2,3pv3':'EFO:0009922', '3pv2_5pv1_5pv2,multiome':'EFO:0030004'}

In [64]:
# Convert 'assay' column values to strings
adata.obs['assay'] = adata.obs['assay'].astype(str)

In [65]:
adata.obs['assay_ontology_term_id'] = ['EFO:0011025']* len(adata.obs)

In [66]:
adata.obs['assay_ontology_term_id'] = adata.obs['assay_ontology_term_id'].astype('category')

In [67]:
# view adata.obs

In [68]:
adata.obs

Unnamed: 0,Souporcell4_status,Souporcell4_assignment,percent_mito,n_counts,n_genes,batch,doublet_scores,bh_pval,S_score,G2M_score,...,predicted_labels,over_clustering,majority_voting,conf_score,leiden_R,celltype,donor,barcodes,assay,assay_ontology_term_id
AAACCTGAGCAGCGTA,singlet,3,0.025313,11667.0,3050,first,0.109792,0.792204,-0.154529,-0.085028,...,Pulmonary neuroendocrine,19,Pulmonary neuroendocrine,0.670715,4,Intermediate,HDBR16402,AAACCTGAGCAGCGTA,3pv2_5pv1_5pv2,EFO:0011025
AAACCTGAGTACGTAA,singlet,3,0.027868,22709.0,4423,first,0.166667,0.792204,-0.097155,-0.174267,...,Pulmonary neuroendocrine,46,Pulmonary neuroendocrine,0.755056,7,Differentiating basal-like,HDBR16402,AAACCTGAGTACGTAA,3pv2_5pv1_5pv2,EFO:0011025
AAACCTGCACCATGTA,singlet,0,0.005957,16519.0,3771,first,0.334873,0.792204,-0.070328,-0.124150,...,Late tip,53,Late tip,0.979895,0,AT2-like,HDBR15934,AAACCTGCACCATGTA,3pv2_5pv1_5pv2,EFO:0011025
AAACCTGCACCTATCC,singlet,0,0.017658,11905.0,3015,first,0.121951,0.792204,-0.082849,-0.208696,...,Late tip,83,Late tip,0.642230,0,AT2-like,HDBR15934,AAACCTGCACCTATCC,3pv2_5pv1_5pv2,EFO:0011025
AAACCTGCACTTAACG,singlet,2,0.038610,13446.0,3568,first,0.125270,0.792204,-0.119884,-0.196853,...,Late tip,20,Late tip,0.863851,7,Differentiating basal-like,HDBR16392,AAACCTGCACTTAACG,3pv2_5pv1_5pv2,EFO:0011025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCATCCACGAAT,singlet,2,0.103709,2489.0,1326,first,0.083472,0.792204,0.294187,0.429153,...,Late tip,36,Late tip,0.235389,5,Cycling AT2,HDBR16392,TTTGTCATCCACGAAT,3pv2_5pv1_5pv2,EFO:0011025
TTTGTCATCGAGAGCA,singlet,2,0.016931,15851.0,3354,first,0.177285,0.792204,-0.127767,-0.152186,...,Late tip,85,Mid fibro,0.735850,6,AT2-like,HDBR16392,TTTGTCATCGAGAGCA,3pv2_5pv1_5pv2,EFO:0011025
TTTGTCATCGCCTGAG,singlet,3,0.026760,15093.0,3791,first,0.195231,0.792204,0.478485,0.114177,...,Late tip,31,Late tip,0.998556,5,Cycling AT2,HDBR16402,TTTGTCATCGCCTGAG,3pv2_5pv1_5pv2,EFO:0011025
TTTGTCATCTATCGCC,doublet,doublets,0.015649,49251.0,6942,first,0.232082,0.792204,0.037060,-0.000052,...,Late tip,18,Mid fibro,0.996191,30,Differentiating pulmonary NE,doublets,TTTGTCATCTATCGCC,3pv2_5pv1_5pv2,EFO:0011025


#### **cell_type_ontology_term_id**

In [69]:
#identify the column in adata.obs related. to cell type annotation

In [70]:
adata.obs.columns

Index(['Souporcell4_status', 'Souporcell4_assignment', 'percent_mito',
       'n_counts', 'n_genes', 'batch', 'doublet_scores', 'bh_pval', 'S_score',
       'G2M_score', 'phase', 'leiden', 'predicted_labels', 'over_clustering',
       'majority_voting', 'conf_score', 'leiden_R', 'celltype', 'donor',
       'barcodes', 'assay', 'assay_ontology_term_id'],
      dtype='object')

In [71]:
list(adata.obs['celltype'].unique())

['Intermediate',
 'Differentiating basal-like',
 'AT2-like',
 'Cycling AT2',
 'NE prog',
 'Differentiating pulmonary NE',
 'CXCL+ AT2-like',
 'Ciliated-like']

In [72]:
# create a dictionary of cell type and ontology term

In [73]:
mapping= {'Intermediate':'CL:0002368',
'Differentiating basal-like':'CL:0000646',
'AT2-like':'CL:0002063',
'Cycling AT2':'CL:0002063',
'NE prog':'CL:1000223',
'Differentiating pulmonary NE':'CL:1000223',
'CXCL+ AT2-like':'CL:0002063',
'Ciliated-like':'CL:1000271'
}

In [74]:
# add the cell_type_ontology_term_id column

In [75]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['celltype'].map(mapping)

In [76]:
# change datatype of the column

In [77]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_type_ontology_term_id'].astype('category')

In [78]:
# view adata.obs

In [79]:
adata.obs

Unnamed: 0,Souporcell4_status,Souporcell4_assignment,percent_mito,n_counts,n_genes,batch,doublet_scores,bh_pval,S_score,G2M_score,...,over_clustering,majority_voting,conf_score,leiden_R,celltype,donor,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id
AAACCTGAGCAGCGTA,singlet,3,0.025313,11667.0,3050,first,0.109792,0.792204,-0.154529,-0.085028,...,19,Pulmonary neuroendocrine,0.670715,4,Intermediate,HDBR16402,AAACCTGAGCAGCGTA,3pv2_5pv1_5pv2,EFO:0011025,CL:0002368
AAACCTGAGTACGTAA,singlet,3,0.027868,22709.0,4423,first,0.166667,0.792204,-0.097155,-0.174267,...,46,Pulmonary neuroendocrine,0.755056,7,Differentiating basal-like,HDBR16402,AAACCTGAGTACGTAA,3pv2_5pv1_5pv2,EFO:0011025,CL:0000646
AAACCTGCACCATGTA,singlet,0,0.005957,16519.0,3771,first,0.334873,0.792204,-0.070328,-0.124150,...,53,Late tip,0.979895,0,AT2-like,HDBR15934,AAACCTGCACCATGTA,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063
AAACCTGCACCTATCC,singlet,0,0.017658,11905.0,3015,first,0.121951,0.792204,-0.082849,-0.208696,...,83,Late tip,0.642230,0,AT2-like,HDBR15934,AAACCTGCACCTATCC,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063
AAACCTGCACTTAACG,singlet,2,0.038610,13446.0,3568,first,0.125270,0.792204,-0.119884,-0.196853,...,20,Late tip,0.863851,7,Differentiating basal-like,HDBR16392,AAACCTGCACTTAACG,3pv2_5pv1_5pv2,EFO:0011025,CL:0000646
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCATCCACGAAT,singlet,2,0.103709,2489.0,1326,first,0.083472,0.792204,0.294187,0.429153,...,36,Late tip,0.235389,5,Cycling AT2,HDBR16392,TTTGTCATCCACGAAT,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063
TTTGTCATCGAGAGCA,singlet,2,0.016931,15851.0,3354,first,0.177285,0.792204,-0.127767,-0.152186,...,85,Mid fibro,0.735850,6,AT2-like,HDBR16392,TTTGTCATCGAGAGCA,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063
TTTGTCATCGCCTGAG,singlet,3,0.026760,15093.0,3791,first,0.195231,0.792204,0.478485,0.114177,...,31,Late tip,0.998556,5,Cycling AT2,HDBR16402,TTTGTCATCGCCTGAG,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063
TTTGTCATCTATCGCC,doublet,doublets,0.015649,49251.0,6942,first,0.232082,0.792204,0.037060,-0.000052,...,18,Mid fibro,0.996191,30,Differentiating pulmonary NE,doublets,TTTGTCATCTATCGCC,3pv2_5pv1_5pv2,EFO:0011025,CL:1000223


#### **development_stage_ontology_term_id**

In [80]:
# identify the column in adata which corresponds to age

In [81]:
adata.obs.columns

Index(['Souporcell4_status', 'Souporcell4_assignment', 'percent_mito',
       'n_counts', 'n_genes', 'batch', 'doublet_scores', 'bh_pval', 'S_score',
       'G2M_score', 'phase', 'leiden', 'predicted_labels', 'over_clustering',
       'majority_voting', 'conf_score', 'leiden_R', 'celltype', 'donor',
       'barcodes', 'assay', 'assay_ontology_term_id',
       'cell_type_ontology_term_id'],
      dtype='object')

In [82]:
mapping = {'12.0':'HsapDv:0000050', '20.0':'HsapDv:0000058', '9.0':'HsapDv:0000047', '8.0':'HsapDv:0000046', '18.0':'HsapDv:0000056', '22.0':'HsapDv:0000060', '15.0':'HsapDv:0000053', '6.86':'HsapDv:0000029', '5.0':'HsapDv:0000022', '11.0':'HsapDv:0000049', 'nan':'unknown'}

In [83]:
adata.obs['development_stage_ontology_term_id'] = ['unknown'] * len(adata.obs)

In [84]:
# change datatype of the column

In [85]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['development_stage_ontology_term_id'].astype('category')

In [86]:
# view unique values of development_stage_ontology_term_id column

In [87]:
list(adata.obs['development_stage_ontology_term_id'].unique())

['unknown']

In [88]:
# view adata.obs

In [89]:
adata.obs

Unnamed: 0,Souporcell4_status,Souporcell4_assignment,percent_mito,n_counts,n_genes,batch,doublet_scores,bh_pval,S_score,G2M_score,...,majority_voting,conf_score,leiden_R,celltype,donor,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id
AAACCTGAGCAGCGTA,singlet,3,0.025313,11667.0,3050,first,0.109792,0.792204,-0.154529,-0.085028,...,Pulmonary neuroendocrine,0.670715,4,Intermediate,HDBR16402,AAACCTGAGCAGCGTA,3pv2_5pv1_5pv2,EFO:0011025,CL:0002368,unknown
AAACCTGAGTACGTAA,singlet,3,0.027868,22709.0,4423,first,0.166667,0.792204,-0.097155,-0.174267,...,Pulmonary neuroendocrine,0.755056,7,Differentiating basal-like,HDBR16402,AAACCTGAGTACGTAA,3pv2_5pv1_5pv2,EFO:0011025,CL:0000646,unknown
AAACCTGCACCATGTA,singlet,0,0.005957,16519.0,3771,first,0.334873,0.792204,-0.070328,-0.124150,...,Late tip,0.979895,0,AT2-like,HDBR15934,AAACCTGCACCATGTA,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063,unknown
AAACCTGCACCTATCC,singlet,0,0.017658,11905.0,3015,first,0.121951,0.792204,-0.082849,-0.208696,...,Late tip,0.642230,0,AT2-like,HDBR15934,AAACCTGCACCTATCC,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063,unknown
AAACCTGCACTTAACG,singlet,2,0.038610,13446.0,3568,first,0.125270,0.792204,-0.119884,-0.196853,...,Late tip,0.863851,7,Differentiating basal-like,HDBR16392,AAACCTGCACTTAACG,3pv2_5pv1_5pv2,EFO:0011025,CL:0000646,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCATCCACGAAT,singlet,2,0.103709,2489.0,1326,first,0.083472,0.792204,0.294187,0.429153,...,Late tip,0.235389,5,Cycling AT2,HDBR16392,TTTGTCATCCACGAAT,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063,unknown
TTTGTCATCGAGAGCA,singlet,2,0.016931,15851.0,3354,first,0.177285,0.792204,-0.127767,-0.152186,...,Mid fibro,0.735850,6,AT2-like,HDBR16392,TTTGTCATCGAGAGCA,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063,unknown
TTTGTCATCGCCTGAG,singlet,3,0.026760,15093.0,3791,first,0.195231,0.792204,0.478485,0.114177,...,Late tip,0.998556,5,Cycling AT2,HDBR16402,TTTGTCATCGCCTGAG,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063,unknown
TTTGTCATCTATCGCC,doublet,doublets,0.015649,49251.0,6942,first,0.232082,0.792204,0.037060,-0.000052,...,Mid fibro,0.996191,30,Differentiating pulmonary NE,doublets,TTTGTCATCTATCGCC,3pv2_5pv1_5pv2,EFO:0011025,CL:1000223,unknown


#### **donor_id**

In [90]:
#identify the column in adata.obs which provides donor information

In [91]:
adata.obs.columns

Index(['Souporcell4_status', 'Souporcell4_assignment', 'percent_mito',
       'n_counts', 'n_genes', 'batch', 'doublet_scores', 'bh_pval', 'S_score',
       'G2M_score', 'phase', 'leiden', 'predicted_labels', 'over_clustering',
       'majority_voting', 'conf_score', 'leiden_R', 'celltype', 'donor',
       'barcodes', 'assay', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id'],
      dtype='object')

In [92]:
# add the donor_id column

In [93]:
adata.obs['donor'] = adata.obs['donor'].astype(str)

In [94]:
adata.obs['donor'].fillna('pooled', inplace=True)
adata.obs['donor'].replace('', 'pooled', inplace=True)

# Replace 'doublets' with 'pooled'
adata.obs['donor'].replace('doublets', 'pooled', inplace=True)

In [95]:
adata.obs['donor_id'] = adata.obs['donor']

In [96]:
#adata.obs['donor_id'] = ['unknown'] * len(adata.obs['names'])

In [97]:
# change datatype of the column

In [98]:
adata.obs['donor_id'] = adata.obs['donor_id'].astype('category')

In [99]:
# view unique values of donor_id column

In [100]:
list(adata.obs['donor_id'].unique())

['HDBR16402', 'HDBR15934', 'HDBR16392', 'pooled', 'HDBR16011']

In [101]:
#view obs

In [102]:
adata.obs

Unnamed: 0,Souporcell4_status,Souporcell4_assignment,percent_mito,n_counts,n_genes,batch,doublet_scores,bh_pval,S_score,G2M_score,...,conf_score,leiden_R,celltype,donor,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id
AAACCTGAGCAGCGTA,singlet,3,0.025313,11667.0,3050,first,0.109792,0.792204,-0.154529,-0.085028,...,0.670715,4,Intermediate,HDBR16402,AAACCTGAGCAGCGTA,3pv2_5pv1_5pv2,EFO:0011025,CL:0002368,unknown,HDBR16402
AAACCTGAGTACGTAA,singlet,3,0.027868,22709.0,4423,first,0.166667,0.792204,-0.097155,-0.174267,...,0.755056,7,Differentiating basal-like,HDBR16402,AAACCTGAGTACGTAA,3pv2_5pv1_5pv2,EFO:0011025,CL:0000646,unknown,HDBR16402
AAACCTGCACCATGTA,singlet,0,0.005957,16519.0,3771,first,0.334873,0.792204,-0.070328,-0.124150,...,0.979895,0,AT2-like,HDBR15934,AAACCTGCACCATGTA,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063,unknown,HDBR15934
AAACCTGCACCTATCC,singlet,0,0.017658,11905.0,3015,first,0.121951,0.792204,-0.082849,-0.208696,...,0.642230,0,AT2-like,HDBR15934,AAACCTGCACCTATCC,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063,unknown,HDBR15934
AAACCTGCACTTAACG,singlet,2,0.038610,13446.0,3568,first,0.125270,0.792204,-0.119884,-0.196853,...,0.863851,7,Differentiating basal-like,HDBR16392,AAACCTGCACTTAACG,3pv2_5pv1_5pv2,EFO:0011025,CL:0000646,unknown,HDBR16392
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCATCCACGAAT,singlet,2,0.103709,2489.0,1326,first,0.083472,0.792204,0.294187,0.429153,...,0.235389,5,Cycling AT2,HDBR16392,TTTGTCATCCACGAAT,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063,unknown,HDBR16392
TTTGTCATCGAGAGCA,singlet,2,0.016931,15851.0,3354,first,0.177285,0.792204,-0.127767,-0.152186,...,0.735850,6,AT2-like,HDBR16392,TTTGTCATCGAGAGCA,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063,unknown,HDBR16392
TTTGTCATCGCCTGAG,singlet,3,0.026760,15093.0,3791,first,0.195231,0.792204,0.478485,0.114177,...,0.998556,5,Cycling AT2,HDBR16402,TTTGTCATCGCCTGAG,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063,unknown,HDBR16402
TTTGTCATCTATCGCC,doublet,doublets,0.015649,49251.0,6942,first,0.232082,0.792204,0.037060,-0.000052,...,0.996191,30,Differentiating pulmonary NE,pooled,TTTGTCATCTATCGCC,3pv2_5pv1_5pv2,EFO:0011025,CL:1000223,unknown,pooled


In [103]:
adata.obs.columns

Index(['Souporcell4_status', 'Souporcell4_assignment', 'percent_mito',
       'n_counts', 'n_genes', 'batch', 'doublet_scores', 'bh_pval', 'S_score',
       'G2M_score', 'phase', 'leiden', 'predicted_labels', 'over_clustering',
       'majority_voting', 'conf_score', 'leiden_R', 'celltype', 'donor',
       'barcodes', 'assay', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'donor_id'],
      dtype='object')

#### **disease_ontology_term_id**

In [104]:
adata.obs['disease_ontology_term_id']= ['PATO:0000461'] * len(adata.obs)

In [105]:
adata.obs

Unnamed: 0,Souporcell4_status,Souporcell4_assignment,percent_mito,n_counts,n_genes,batch,doublet_scores,bh_pval,S_score,G2M_score,...,leiden_R,celltype,donor,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id
AAACCTGAGCAGCGTA,singlet,3,0.025313,11667.0,3050,first,0.109792,0.792204,-0.154529,-0.085028,...,4,Intermediate,HDBR16402,AAACCTGAGCAGCGTA,3pv2_5pv1_5pv2,EFO:0011025,CL:0002368,unknown,HDBR16402,PATO:0000461
AAACCTGAGTACGTAA,singlet,3,0.027868,22709.0,4423,first,0.166667,0.792204,-0.097155,-0.174267,...,7,Differentiating basal-like,HDBR16402,AAACCTGAGTACGTAA,3pv2_5pv1_5pv2,EFO:0011025,CL:0000646,unknown,HDBR16402,PATO:0000461
AAACCTGCACCATGTA,singlet,0,0.005957,16519.0,3771,first,0.334873,0.792204,-0.070328,-0.124150,...,0,AT2-like,HDBR15934,AAACCTGCACCATGTA,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063,unknown,HDBR15934,PATO:0000461
AAACCTGCACCTATCC,singlet,0,0.017658,11905.0,3015,first,0.121951,0.792204,-0.082849,-0.208696,...,0,AT2-like,HDBR15934,AAACCTGCACCTATCC,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063,unknown,HDBR15934,PATO:0000461
AAACCTGCACTTAACG,singlet,2,0.038610,13446.0,3568,first,0.125270,0.792204,-0.119884,-0.196853,...,7,Differentiating basal-like,HDBR16392,AAACCTGCACTTAACG,3pv2_5pv1_5pv2,EFO:0011025,CL:0000646,unknown,HDBR16392,PATO:0000461
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCATCCACGAAT,singlet,2,0.103709,2489.0,1326,first,0.083472,0.792204,0.294187,0.429153,...,5,Cycling AT2,HDBR16392,TTTGTCATCCACGAAT,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063,unknown,HDBR16392,PATO:0000461
TTTGTCATCGAGAGCA,singlet,2,0.016931,15851.0,3354,first,0.177285,0.792204,-0.127767,-0.152186,...,6,AT2-like,HDBR16392,TTTGTCATCGAGAGCA,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063,unknown,HDBR16392,PATO:0000461
TTTGTCATCGCCTGAG,singlet,3,0.026760,15093.0,3791,first,0.195231,0.792204,0.478485,0.114177,...,5,Cycling AT2,HDBR16402,TTTGTCATCGCCTGAG,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063,unknown,HDBR16402,PATO:0000461
TTTGTCATCTATCGCC,doublet,doublets,0.015649,49251.0,6942,first,0.232082,0.792204,0.037060,-0.000052,...,30,Differentiating pulmonary NE,pooled,TTTGTCATCTATCGCC,3pv2_5pv1_5pv2,EFO:0011025,CL:1000223,unknown,pooled,PATO:0000461


In [106]:
# change datatype of the column

In [107]:
adata.obs['disease_ontology_term_id'] = adata.obs['disease_ontology_term_id'].astype('category')

In [108]:
# view obs

In [109]:
adata.obs

Unnamed: 0,Souporcell4_status,Souporcell4_assignment,percent_mito,n_counts,n_genes,batch,doublet_scores,bh_pval,S_score,G2M_score,...,leiden_R,celltype,donor,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id
AAACCTGAGCAGCGTA,singlet,3,0.025313,11667.0,3050,first,0.109792,0.792204,-0.154529,-0.085028,...,4,Intermediate,HDBR16402,AAACCTGAGCAGCGTA,3pv2_5pv1_5pv2,EFO:0011025,CL:0002368,unknown,HDBR16402,PATO:0000461
AAACCTGAGTACGTAA,singlet,3,0.027868,22709.0,4423,first,0.166667,0.792204,-0.097155,-0.174267,...,7,Differentiating basal-like,HDBR16402,AAACCTGAGTACGTAA,3pv2_5pv1_5pv2,EFO:0011025,CL:0000646,unknown,HDBR16402,PATO:0000461
AAACCTGCACCATGTA,singlet,0,0.005957,16519.0,3771,first,0.334873,0.792204,-0.070328,-0.124150,...,0,AT2-like,HDBR15934,AAACCTGCACCATGTA,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063,unknown,HDBR15934,PATO:0000461
AAACCTGCACCTATCC,singlet,0,0.017658,11905.0,3015,first,0.121951,0.792204,-0.082849,-0.208696,...,0,AT2-like,HDBR15934,AAACCTGCACCTATCC,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063,unknown,HDBR15934,PATO:0000461
AAACCTGCACTTAACG,singlet,2,0.038610,13446.0,3568,first,0.125270,0.792204,-0.119884,-0.196853,...,7,Differentiating basal-like,HDBR16392,AAACCTGCACTTAACG,3pv2_5pv1_5pv2,EFO:0011025,CL:0000646,unknown,HDBR16392,PATO:0000461
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCATCCACGAAT,singlet,2,0.103709,2489.0,1326,first,0.083472,0.792204,0.294187,0.429153,...,5,Cycling AT2,HDBR16392,TTTGTCATCCACGAAT,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063,unknown,HDBR16392,PATO:0000461
TTTGTCATCGAGAGCA,singlet,2,0.016931,15851.0,3354,first,0.177285,0.792204,-0.127767,-0.152186,...,6,AT2-like,HDBR16392,TTTGTCATCGAGAGCA,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063,unknown,HDBR16392,PATO:0000461
TTTGTCATCGCCTGAG,singlet,3,0.026760,15093.0,3791,first,0.195231,0.792204,0.478485,0.114177,...,5,Cycling AT2,HDBR16402,TTTGTCATCGCCTGAG,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063,unknown,HDBR16402,PATO:0000461
TTTGTCATCTATCGCC,doublet,doublets,0.015649,49251.0,6942,first,0.232082,0.792204,0.037060,-0.000052,...,30,Differentiating pulmonary NE,pooled,TTTGTCATCTATCGCC,3pv2_5pv1_5pv2,EFO:0011025,CL:1000223,unknown,pooled,PATO:0000461


#### **is_primary_data**

In [110]:
import pandas as pd

# Assuming adata.obs is a pandas DataFrame
# Replace 'adata.obs' with the actual name of your DataFrame if different

# Set 'is_primary_data' to True for all entries initially
adata.obs['is_primary_data'] = True

# Set 'is_primary_data' to False for entries with 'batch' values starting with '5891' or 'WSSS'
adata.obs.loc[adata.obs['batch'].str.startswith(('5891', 'WSSS')), 'is_primary_data'] = False


In [111]:
list(adata.obs['is_primary_data'].unique())

[True]

In [112]:
adata.obs

Unnamed: 0,Souporcell4_status,Souporcell4_assignment,percent_mito,n_counts,n_genes,batch,doublet_scores,bh_pval,S_score,G2M_score,...,celltype,donor,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data
AAACCTGAGCAGCGTA,singlet,3,0.025313,11667.0,3050,first,0.109792,0.792204,-0.154529,-0.085028,...,Intermediate,HDBR16402,AAACCTGAGCAGCGTA,3pv2_5pv1_5pv2,EFO:0011025,CL:0002368,unknown,HDBR16402,PATO:0000461,True
AAACCTGAGTACGTAA,singlet,3,0.027868,22709.0,4423,first,0.166667,0.792204,-0.097155,-0.174267,...,Differentiating basal-like,HDBR16402,AAACCTGAGTACGTAA,3pv2_5pv1_5pv2,EFO:0011025,CL:0000646,unknown,HDBR16402,PATO:0000461,True
AAACCTGCACCATGTA,singlet,0,0.005957,16519.0,3771,first,0.334873,0.792204,-0.070328,-0.124150,...,AT2-like,HDBR15934,AAACCTGCACCATGTA,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063,unknown,HDBR15934,PATO:0000461,True
AAACCTGCACCTATCC,singlet,0,0.017658,11905.0,3015,first,0.121951,0.792204,-0.082849,-0.208696,...,AT2-like,HDBR15934,AAACCTGCACCTATCC,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063,unknown,HDBR15934,PATO:0000461,True
AAACCTGCACTTAACG,singlet,2,0.038610,13446.0,3568,first,0.125270,0.792204,-0.119884,-0.196853,...,Differentiating basal-like,HDBR16392,AAACCTGCACTTAACG,3pv2_5pv1_5pv2,EFO:0011025,CL:0000646,unknown,HDBR16392,PATO:0000461,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCATCCACGAAT,singlet,2,0.103709,2489.0,1326,first,0.083472,0.792204,0.294187,0.429153,...,Cycling AT2,HDBR16392,TTTGTCATCCACGAAT,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063,unknown,HDBR16392,PATO:0000461,True
TTTGTCATCGAGAGCA,singlet,2,0.016931,15851.0,3354,first,0.177285,0.792204,-0.127767,-0.152186,...,AT2-like,HDBR16392,TTTGTCATCGAGAGCA,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063,unknown,HDBR16392,PATO:0000461,True
TTTGTCATCGCCTGAG,singlet,3,0.026760,15093.0,3791,first,0.195231,0.792204,0.478485,0.114177,...,Cycling AT2,HDBR16402,TTTGTCATCGCCTGAG,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063,unknown,HDBR16402,PATO:0000461,True
TTTGTCATCTATCGCC,doublet,doublets,0.015649,49251.0,6942,first,0.232082,0.792204,0.037060,-0.000052,...,Differentiating pulmonary NE,pooled,TTTGTCATCTATCGCC,3pv2_5pv1_5pv2,EFO:0011025,CL:1000223,unknown,pooled,PATO:0000461,True


In [113]:
adata.obs['is_primary_data'] = [True] * len(adata.obs)

In [114]:
#change data type of column

In [115]:
adata.obs['is_primary_data'] = adata.obs['is_primary_data'].astype('bool')

In [116]:
# view obs

In [117]:
adata.obs

Unnamed: 0,Souporcell4_status,Souporcell4_assignment,percent_mito,n_counts,n_genes,batch,doublet_scores,bh_pval,S_score,G2M_score,...,celltype,donor,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data
AAACCTGAGCAGCGTA,singlet,3,0.025313,11667.0,3050,first,0.109792,0.792204,-0.154529,-0.085028,...,Intermediate,HDBR16402,AAACCTGAGCAGCGTA,3pv2_5pv1_5pv2,EFO:0011025,CL:0002368,unknown,HDBR16402,PATO:0000461,True
AAACCTGAGTACGTAA,singlet,3,0.027868,22709.0,4423,first,0.166667,0.792204,-0.097155,-0.174267,...,Differentiating basal-like,HDBR16402,AAACCTGAGTACGTAA,3pv2_5pv1_5pv2,EFO:0011025,CL:0000646,unknown,HDBR16402,PATO:0000461,True
AAACCTGCACCATGTA,singlet,0,0.005957,16519.0,3771,first,0.334873,0.792204,-0.070328,-0.124150,...,AT2-like,HDBR15934,AAACCTGCACCATGTA,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063,unknown,HDBR15934,PATO:0000461,True
AAACCTGCACCTATCC,singlet,0,0.017658,11905.0,3015,first,0.121951,0.792204,-0.082849,-0.208696,...,AT2-like,HDBR15934,AAACCTGCACCTATCC,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063,unknown,HDBR15934,PATO:0000461,True
AAACCTGCACTTAACG,singlet,2,0.038610,13446.0,3568,first,0.125270,0.792204,-0.119884,-0.196853,...,Differentiating basal-like,HDBR16392,AAACCTGCACTTAACG,3pv2_5pv1_5pv2,EFO:0011025,CL:0000646,unknown,HDBR16392,PATO:0000461,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCATCCACGAAT,singlet,2,0.103709,2489.0,1326,first,0.083472,0.792204,0.294187,0.429153,...,Cycling AT2,HDBR16392,TTTGTCATCCACGAAT,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063,unknown,HDBR16392,PATO:0000461,True
TTTGTCATCGAGAGCA,singlet,2,0.016931,15851.0,3354,first,0.177285,0.792204,-0.127767,-0.152186,...,AT2-like,HDBR16392,TTTGTCATCGAGAGCA,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063,unknown,HDBR16392,PATO:0000461,True
TTTGTCATCGCCTGAG,singlet,3,0.026760,15093.0,3791,first,0.195231,0.792204,0.478485,0.114177,...,Cycling AT2,HDBR16402,TTTGTCATCGCCTGAG,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063,unknown,HDBR16402,PATO:0000461,True
TTTGTCATCTATCGCC,doublet,doublets,0.015649,49251.0,6942,first,0.232082,0.792204,0.037060,-0.000052,...,Differentiating pulmonary NE,pooled,TTTGTCATCTATCGCC,3pv2_5pv1_5pv2,EFO:0011025,CL:1000223,unknown,pooled,PATO:0000461,True


#### **organism_ontology_term_id**

In [118]:
# assign organism id 

In [119]:
adata.obs['organism_ontology_term_id'] = ['NCBITaxon:9606'] * len(adata.obs)

In [120]:
#change data type of column

In [121]:
adata.obs['organism_ontology_term_id'] = adata.obs['organism_ontology_term_id'].astype('category')

In [122]:
# view obs

In [123]:
adata.obs

Unnamed: 0,Souporcell4_status,Souporcell4_assignment,percent_mito,n_counts,n_genes,batch,doublet_scores,bh_pval,S_score,G2M_score,...,donor,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id
AAACCTGAGCAGCGTA,singlet,3,0.025313,11667.0,3050,first,0.109792,0.792204,-0.154529,-0.085028,...,HDBR16402,AAACCTGAGCAGCGTA,3pv2_5pv1_5pv2,EFO:0011025,CL:0002368,unknown,HDBR16402,PATO:0000461,True,NCBITaxon:9606
AAACCTGAGTACGTAA,singlet,3,0.027868,22709.0,4423,first,0.166667,0.792204,-0.097155,-0.174267,...,HDBR16402,AAACCTGAGTACGTAA,3pv2_5pv1_5pv2,EFO:0011025,CL:0000646,unknown,HDBR16402,PATO:0000461,True,NCBITaxon:9606
AAACCTGCACCATGTA,singlet,0,0.005957,16519.0,3771,first,0.334873,0.792204,-0.070328,-0.124150,...,HDBR15934,AAACCTGCACCATGTA,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063,unknown,HDBR15934,PATO:0000461,True,NCBITaxon:9606
AAACCTGCACCTATCC,singlet,0,0.017658,11905.0,3015,first,0.121951,0.792204,-0.082849,-0.208696,...,HDBR15934,AAACCTGCACCTATCC,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063,unknown,HDBR15934,PATO:0000461,True,NCBITaxon:9606
AAACCTGCACTTAACG,singlet,2,0.038610,13446.0,3568,first,0.125270,0.792204,-0.119884,-0.196853,...,HDBR16392,AAACCTGCACTTAACG,3pv2_5pv1_5pv2,EFO:0011025,CL:0000646,unknown,HDBR16392,PATO:0000461,True,NCBITaxon:9606
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCATCCACGAAT,singlet,2,0.103709,2489.0,1326,first,0.083472,0.792204,0.294187,0.429153,...,HDBR16392,TTTGTCATCCACGAAT,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063,unknown,HDBR16392,PATO:0000461,True,NCBITaxon:9606
TTTGTCATCGAGAGCA,singlet,2,0.016931,15851.0,3354,first,0.177285,0.792204,-0.127767,-0.152186,...,HDBR16392,TTTGTCATCGAGAGCA,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063,unknown,HDBR16392,PATO:0000461,True,NCBITaxon:9606
TTTGTCATCGCCTGAG,singlet,3,0.026760,15093.0,3791,first,0.195231,0.792204,0.478485,0.114177,...,HDBR16402,TTTGTCATCGCCTGAG,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063,unknown,HDBR16402,PATO:0000461,True,NCBITaxon:9606
TTTGTCATCTATCGCC,doublet,doublets,0.015649,49251.0,6942,first,0.232082,0.792204,0.037060,-0.000052,...,pooled,TTTGTCATCTATCGCC,3pv2_5pv1_5pv2,EFO:0011025,CL:1000223,unknown,pooled,PATO:0000461,True,NCBITaxon:9606


#### **self_reported_ethnicity_ontology_term_id**

In [124]:
adata.obs['self_reported_ethnicity_ontology_term_id'] = ['unknown'] * len(adata.obs)

In [125]:
# change data type

In [126]:
adata.obs['self_reported_ethnicity_ontology_term_id'] = adata.obs['self_reported_ethnicity_ontology_term_id'].astype('category')

In [127]:
# view obs

In [128]:
adata.obs

Unnamed: 0,Souporcell4_status,Souporcell4_assignment,percent_mito,n_counts,n_genes,batch,doublet_scores,bh_pval,S_score,G2M_score,...,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id
AAACCTGAGCAGCGTA,singlet,3,0.025313,11667.0,3050,first,0.109792,0.792204,-0.154529,-0.085028,...,AAACCTGAGCAGCGTA,3pv2_5pv1_5pv2,EFO:0011025,CL:0002368,unknown,HDBR16402,PATO:0000461,True,NCBITaxon:9606,unknown
AAACCTGAGTACGTAA,singlet,3,0.027868,22709.0,4423,first,0.166667,0.792204,-0.097155,-0.174267,...,AAACCTGAGTACGTAA,3pv2_5pv1_5pv2,EFO:0011025,CL:0000646,unknown,HDBR16402,PATO:0000461,True,NCBITaxon:9606,unknown
AAACCTGCACCATGTA,singlet,0,0.005957,16519.0,3771,first,0.334873,0.792204,-0.070328,-0.124150,...,AAACCTGCACCATGTA,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063,unknown,HDBR15934,PATO:0000461,True,NCBITaxon:9606,unknown
AAACCTGCACCTATCC,singlet,0,0.017658,11905.0,3015,first,0.121951,0.792204,-0.082849,-0.208696,...,AAACCTGCACCTATCC,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063,unknown,HDBR15934,PATO:0000461,True,NCBITaxon:9606,unknown
AAACCTGCACTTAACG,singlet,2,0.038610,13446.0,3568,first,0.125270,0.792204,-0.119884,-0.196853,...,AAACCTGCACTTAACG,3pv2_5pv1_5pv2,EFO:0011025,CL:0000646,unknown,HDBR16392,PATO:0000461,True,NCBITaxon:9606,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCATCCACGAAT,singlet,2,0.103709,2489.0,1326,first,0.083472,0.792204,0.294187,0.429153,...,TTTGTCATCCACGAAT,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063,unknown,HDBR16392,PATO:0000461,True,NCBITaxon:9606,unknown
TTTGTCATCGAGAGCA,singlet,2,0.016931,15851.0,3354,first,0.177285,0.792204,-0.127767,-0.152186,...,TTTGTCATCGAGAGCA,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063,unknown,HDBR16392,PATO:0000461,True,NCBITaxon:9606,unknown
TTTGTCATCGCCTGAG,singlet,3,0.026760,15093.0,3791,first,0.195231,0.792204,0.478485,0.114177,...,TTTGTCATCGCCTGAG,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063,unknown,HDBR16402,PATO:0000461,True,NCBITaxon:9606,unknown
TTTGTCATCTATCGCC,doublet,doublets,0.015649,49251.0,6942,first,0.232082,0.792204,0.037060,-0.000052,...,TTTGTCATCTATCGCC,3pv2_5pv1_5pv2,EFO:0011025,CL:1000223,unknown,pooled,PATO:0000461,True,NCBITaxon:9606,unknown


In [129]:
adata.obs.columns

Index(['Souporcell4_status', 'Souporcell4_assignment', 'percent_mito',
       'n_counts', 'n_genes', 'batch', 'doublet_scores', 'bh_pval', 'S_score',
       'G2M_score', 'phase', 'leiden', 'predicted_labels', 'over_clustering',
       'majority_voting', 'conf_score', 'leiden_R', 'celltype', 'donor',
       'barcodes', 'assay', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'donor_id', 'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id'],
      dtype='object')

#### **sex_ontology_term_id**

In [130]:
# identify the column in adata.obs which corresponds to sex

In [131]:
adata.obs.columns

Index(['Souporcell4_status', 'Souporcell4_assignment', 'percent_mito',
       'n_counts', 'n_genes', 'batch', 'doublet_scores', 'bh_pval', 'S_score',
       'G2M_score', 'phase', 'leiden', 'predicted_labels', 'over_clustering',
       'majority_voting', 'conf_score', 'leiden_R', 'celltype', 'donor',
       'barcodes', 'assay', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'donor_id', 'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id'],
      dtype='object')

In [132]:
adata.obs

Unnamed: 0,Souporcell4_status,Souporcell4_assignment,percent_mito,n_counts,n_genes,batch,doublet_scores,bh_pval,S_score,G2M_score,...,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id
AAACCTGAGCAGCGTA,singlet,3,0.025313,11667.0,3050,first,0.109792,0.792204,-0.154529,-0.085028,...,AAACCTGAGCAGCGTA,3pv2_5pv1_5pv2,EFO:0011025,CL:0002368,unknown,HDBR16402,PATO:0000461,True,NCBITaxon:9606,unknown
AAACCTGAGTACGTAA,singlet,3,0.027868,22709.0,4423,first,0.166667,0.792204,-0.097155,-0.174267,...,AAACCTGAGTACGTAA,3pv2_5pv1_5pv2,EFO:0011025,CL:0000646,unknown,HDBR16402,PATO:0000461,True,NCBITaxon:9606,unknown
AAACCTGCACCATGTA,singlet,0,0.005957,16519.0,3771,first,0.334873,0.792204,-0.070328,-0.124150,...,AAACCTGCACCATGTA,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063,unknown,HDBR15934,PATO:0000461,True,NCBITaxon:9606,unknown
AAACCTGCACCTATCC,singlet,0,0.017658,11905.0,3015,first,0.121951,0.792204,-0.082849,-0.208696,...,AAACCTGCACCTATCC,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063,unknown,HDBR15934,PATO:0000461,True,NCBITaxon:9606,unknown
AAACCTGCACTTAACG,singlet,2,0.038610,13446.0,3568,first,0.125270,0.792204,-0.119884,-0.196853,...,AAACCTGCACTTAACG,3pv2_5pv1_5pv2,EFO:0011025,CL:0000646,unknown,HDBR16392,PATO:0000461,True,NCBITaxon:9606,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCATCCACGAAT,singlet,2,0.103709,2489.0,1326,first,0.083472,0.792204,0.294187,0.429153,...,TTTGTCATCCACGAAT,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063,unknown,HDBR16392,PATO:0000461,True,NCBITaxon:9606,unknown
TTTGTCATCGAGAGCA,singlet,2,0.016931,15851.0,3354,first,0.177285,0.792204,-0.127767,-0.152186,...,TTTGTCATCGAGAGCA,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063,unknown,HDBR16392,PATO:0000461,True,NCBITaxon:9606,unknown
TTTGTCATCGCCTGAG,singlet,3,0.026760,15093.0,3791,first,0.195231,0.792204,0.478485,0.114177,...,TTTGTCATCGCCTGAG,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063,unknown,HDBR16402,PATO:0000461,True,NCBITaxon:9606,unknown
TTTGTCATCTATCGCC,doublet,doublets,0.015649,49251.0,6942,first,0.232082,0.792204,0.037060,-0.000052,...,TTTGTCATCTATCGCC,3pv2_5pv1_5pv2,EFO:0011025,CL:1000223,unknown,pooled,PATO:0000461,True,NCBITaxon:9606,unknown


In [133]:
# list the unique values 

In [134]:
# create a dictionary of sex and sex ontology term id

In [135]:
mapping= {'F': 'PATO:0000383', 'M': 'PATO:0000384', 'Unknown':'unknown'}

In [136]:
# add sex_ontology_term_id column

In [137]:
adata.obs['sex_ontology_term_id'] = ['unknown'] * len(adata.obs)

In [138]:
# change data type

In [139]:
adata.obs['sex_ontology_term_id'] = adata.obs['sex_ontology_term_id'].astype('category')

In [140]:
adata.obs

Unnamed: 0,Souporcell4_status,Souporcell4_assignment,percent_mito,n_counts,n_genes,batch,doublet_scores,bh_pval,S_score,G2M_score,...,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id
AAACCTGAGCAGCGTA,singlet,3,0.025313,11667.0,3050,first,0.109792,0.792204,-0.154529,-0.085028,...,3pv2_5pv1_5pv2,EFO:0011025,CL:0002368,unknown,HDBR16402,PATO:0000461,True,NCBITaxon:9606,unknown,unknown
AAACCTGAGTACGTAA,singlet,3,0.027868,22709.0,4423,first,0.166667,0.792204,-0.097155,-0.174267,...,3pv2_5pv1_5pv2,EFO:0011025,CL:0000646,unknown,HDBR16402,PATO:0000461,True,NCBITaxon:9606,unknown,unknown
AAACCTGCACCATGTA,singlet,0,0.005957,16519.0,3771,first,0.334873,0.792204,-0.070328,-0.124150,...,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063,unknown,HDBR15934,PATO:0000461,True,NCBITaxon:9606,unknown,unknown
AAACCTGCACCTATCC,singlet,0,0.017658,11905.0,3015,first,0.121951,0.792204,-0.082849,-0.208696,...,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063,unknown,HDBR15934,PATO:0000461,True,NCBITaxon:9606,unknown,unknown
AAACCTGCACTTAACG,singlet,2,0.038610,13446.0,3568,first,0.125270,0.792204,-0.119884,-0.196853,...,3pv2_5pv1_5pv2,EFO:0011025,CL:0000646,unknown,HDBR16392,PATO:0000461,True,NCBITaxon:9606,unknown,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCATCCACGAAT,singlet,2,0.103709,2489.0,1326,first,0.083472,0.792204,0.294187,0.429153,...,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063,unknown,HDBR16392,PATO:0000461,True,NCBITaxon:9606,unknown,unknown
TTTGTCATCGAGAGCA,singlet,2,0.016931,15851.0,3354,first,0.177285,0.792204,-0.127767,-0.152186,...,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063,unknown,HDBR16392,PATO:0000461,True,NCBITaxon:9606,unknown,unknown
TTTGTCATCGCCTGAG,singlet,3,0.026760,15093.0,3791,first,0.195231,0.792204,0.478485,0.114177,...,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063,unknown,HDBR16402,PATO:0000461,True,NCBITaxon:9606,unknown,unknown
TTTGTCATCTATCGCC,doublet,doublets,0.015649,49251.0,6942,first,0.232082,0.792204,0.037060,-0.000052,...,3pv2_5pv1_5pv2,EFO:0011025,CL:1000223,unknown,pooled,PATO:0000461,True,NCBITaxon:9606,unknown,unknown


#### **suspension_type**

In [141]:
adata.obs

Unnamed: 0,Souporcell4_status,Souporcell4_assignment,percent_mito,n_counts,n_genes,batch,doublet_scores,bh_pval,S_score,G2M_score,...,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id
AAACCTGAGCAGCGTA,singlet,3,0.025313,11667.0,3050,first,0.109792,0.792204,-0.154529,-0.085028,...,3pv2_5pv1_5pv2,EFO:0011025,CL:0002368,unknown,HDBR16402,PATO:0000461,True,NCBITaxon:9606,unknown,unknown
AAACCTGAGTACGTAA,singlet,3,0.027868,22709.0,4423,first,0.166667,0.792204,-0.097155,-0.174267,...,3pv2_5pv1_5pv2,EFO:0011025,CL:0000646,unknown,HDBR16402,PATO:0000461,True,NCBITaxon:9606,unknown,unknown
AAACCTGCACCATGTA,singlet,0,0.005957,16519.0,3771,first,0.334873,0.792204,-0.070328,-0.124150,...,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063,unknown,HDBR15934,PATO:0000461,True,NCBITaxon:9606,unknown,unknown
AAACCTGCACCTATCC,singlet,0,0.017658,11905.0,3015,first,0.121951,0.792204,-0.082849,-0.208696,...,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063,unknown,HDBR15934,PATO:0000461,True,NCBITaxon:9606,unknown,unknown
AAACCTGCACTTAACG,singlet,2,0.038610,13446.0,3568,first,0.125270,0.792204,-0.119884,-0.196853,...,3pv2_5pv1_5pv2,EFO:0011025,CL:0000646,unknown,HDBR16392,PATO:0000461,True,NCBITaxon:9606,unknown,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCATCCACGAAT,singlet,2,0.103709,2489.0,1326,first,0.083472,0.792204,0.294187,0.429153,...,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063,unknown,HDBR16392,PATO:0000461,True,NCBITaxon:9606,unknown,unknown
TTTGTCATCGAGAGCA,singlet,2,0.016931,15851.0,3354,first,0.177285,0.792204,-0.127767,-0.152186,...,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063,unknown,HDBR16392,PATO:0000461,True,NCBITaxon:9606,unknown,unknown
TTTGTCATCGCCTGAG,singlet,3,0.026760,15093.0,3791,first,0.195231,0.792204,0.478485,0.114177,...,3pv2_5pv1_5pv2,EFO:0011025,CL:0002063,unknown,HDBR16402,PATO:0000461,True,NCBITaxon:9606,unknown,unknown
TTTGTCATCTATCGCC,doublet,doublets,0.015649,49251.0,6942,first,0.232082,0.792204,0.037060,-0.000052,...,3pv2_5pv1_5pv2,EFO:0011025,CL:1000223,unknown,pooled,PATO:0000461,True,NCBITaxon:9606,unknown,unknown


In [142]:
adata.obs['suspension_type'] = ['cell'] * len(adata.obs)

In [143]:
# change data type of column

In [144]:
adata.obs['suspension_type'] = adata.obs['suspension_type'].astype('category')

In [145]:
# view obs

In [146]:
adata.obs

Unnamed: 0,Souporcell4_status,Souporcell4_assignment,percent_mito,n_counts,n_genes,batch,doublet_scores,bh_pval,S_score,G2M_score,...,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type
AAACCTGAGCAGCGTA,singlet,3,0.025313,11667.0,3050,first,0.109792,0.792204,-0.154529,-0.085028,...,EFO:0011025,CL:0002368,unknown,HDBR16402,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell
AAACCTGAGTACGTAA,singlet,3,0.027868,22709.0,4423,first,0.166667,0.792204,-0.097155,-0.174267,...,EFO:0011025,CL:0000646,unknown,HDBR16402,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell
AAACCTGCACCATGTA,singlet,0,0.005957,16519.0,3771,first,0.334873,0.792204,-0.070328,-0.124150,...,EFO:0011025,CL:0002063,unknown,HDBR15934,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell
AAACCTGCACCTATCC,singlet,0,0.017658,11905.0,3015,first,0.121951,0.792204,-0.082849,-0.208696,...,EFO:0011025,CL:0002063,unknown,HDBR15934,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell
AAACCTGCACTTAACG,singlet,2,0.038610,13446.0,3568,first,0.125270,0.792204,-0.119884,-0.196853,...,EFO:0011025,CL:0000646,unknown,HDBR16392,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCATCCACGAAT,singlet,2,0.103709,2489.0,1326,first,0.083472,0.792204,0.294187,0.429153,...,EFO:0011025,CL:0002063,unknown,HDBR16392,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell
TTTGTCATCGAGAGCA,singlet,2,0.016931,15851.0,3354,first,0.177285,0.792204,-0.127767,-0.152186,...,EFO:0011025,CL:0002063,unknown,HDBR16392,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell
TTTGTCATCGCCTGAG,singlet,3,0.026760,15093.0,3791,first,0.195231,0.792204,0.478485,0.114177,...,EFO:0011025,CL:0002063,unknown,HDBR16402,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell
TTTGTCATCTATCGCC,doublet,doublets,0.015649,49251.0,6942,first,0.232082,0.792204,0.037060,-0.000052,...,EFO:0011025,CL:1000223,unknown,pooled,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell


#### **tissue_type**

In [147]:
adata.obs['tissue_type'] = ['organoid'] * len(adata.obs)

In [148]:
adata.obs['tissue_type'] = adata.obs['tissue_type'].astype('category')

#### **tissue_ontology_term_id**

In [149]:
# identify the column in adata.obs which corresponds to tissue

In [150]:
adata.obs.columns

Index(['Souporcell4_status', 'Souporcell4_assignment', 'percent_mito',
       'n_counts', 'n_genes', 'batch', 'doublet_scores', 'bh_pval', 'S_score',
       'G2M_score', 'phase', 'leiden', 'predicted_labels', 'over_clustering',
       'majority_voting', 'conf_score', 'leiden_R', 'celltype', 'donor',
       'barcodes', 'assay', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'donor_id', 'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'suspension_type', 'tissue_type'],
      dtype='object')

In [151]:
# add 'tissue_ontology_term_id' column

In [152]:
adata.obs['tissue_ontology_term_id'] = ['UBERON:0002048'] * len(adata.obs)

In [153]:
# change data type of column

In [154]:
adata.obs['tissue_ontology_term_id'] = adata.obs['tissue_ontology_term_id'].astype('category')

In [155]:
#list the unique values in 'tissue_ontology_term_id' column

In [156]:
list(adata.obs['tissue_ontology_term_id'].unique())

['UBERON:0002048']

In [157]:
# view obs

In [158]:
adata.obs

Unnamed: 0,Souporcell4_status,Souporcell4_assignment,percent_mito,n_counts,n_genes,batch,doublet_scores,bh_pval,S_score,G2M_score,...,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_type,tissue_ontology_term_id
AAACCTGAGCAGCGTA,singlet,3,0.025313,11667.0,3050,first,0.109792,0.792204,-0.154529,-0.085028,...,unknown,HDBR16402,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
AAACCTGAGTACGTAA,singlet,3,0.027868,22709.0,4423,first,0.166667,0.792204,-0.097155,-0.174267,...,unknown,HDBR16402,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
AAACCTGCACCATGTA,singlet,0,0.005957,16519.0,3771,first,0.334873,0.792204,-0.070328,-0.124150,...,unknown,HDBR15934,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
AAACCTGCACCTATCC,singlet,0,0.017658,11905.0,3015,first,0.121951,0.792204,-0.082849,-0.208696,...,unknown,HDBR15934,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
AAACCTGCACTTAACG,singlet,2,0.038610,13446.0,3568,first,0.125270,0.792204,-0.119884,-0.196853,...,unknown,HDBR16392,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCATCCACGAAT,singlet,2,0.103709,2489.0,1326,first,0.083472,0.792204,0.294187,0.429153,...,unknown,HDBR16392,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
TTTGTCATCGAGAGCA,singlet,2,0.016931,15851.0,3354,first,0.177285,0.792204,-0.127767,-0.152186,...,unknown,HDBR16392,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
TTTGTCATCGCCTGAG,singlet,3,0.026760,15093.0,3791,first,0.195231,0.792204,0.478485,0.114177,...,unknown,HDBR16402,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
TTTGTCATCTATCGCC,doublet,doublets,0.015649,49251.0,6942,first,0.232082,0.792204,0.037060,-0.000052,...,unknown,pooled,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048


In [159]:
adata.obs.columns

Index(['Souporcell4_status', 'Souporcell4_assignment', 'percent_mito',
       'n_counts', 'n_genes', 'batch', 'doublet_scores', 'bh_pval', 'S_score',
       'G2M_score', 'phase', 'leiden', 'predicted_labels', 'over_clustering',
       'majority_voting', 'conf_score', 'leiden_R', 'celltype', 'donor',
       'barcodes', 'assay', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'donor_id', 'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'suspension_type', 'tissue_type',
       'tissue_ontology_term_id'],
      dtype='object')

#### **obsm (Embeddings)**

In [160]:
# view obsm

In [161]:
# check whether all columns are prefixed with X

In [162]:
adata.obsm

AxisArrays with keys: X_pca, X_umap

#### **uns (Dataset Metadata)**

In [163]:
# View

In [164]:
adata.uns

OverloadedDict, wrapping:
	{'Souporcell4_assignment_colors': array(['#1f77b4', '#ff7f0e', '#279e68', '#d62728', '#aa40fc', '#8c564b'],
      dtype=object), 'batch_colors': array(['#1f77b4'], dtype=object), 'celltype_colors': array(['#1f77b4', '#ff7f0e', '#279e68', '#d62728', '#aa40fc', '#8c564b',
       '#e377c2', '#b5bd61'], dtype=object), 'leiden': {'params': {'n_iterations': -1, 'random_state': 0, 'resolution': 0.2}}, 'leiden_R_colors': array(['#023fa5', '#7d87b9', '#d6bcc0', '#bb7784', '#8e063b', '#4a6fe3',
       '#8595e1', '#b5bbe3', '#e6afb9', '#e07b91', '#d33f6a', '#11c638',
       '#8dd593', '#c6dec7', '#ead3c6', '#f0b98d', '#ef9708', '#0fcfc0',
       '#9cded6', '#d5eae7'], dtype=object), 'log1p': {}, 'majority_voting_colors': array(['#1f77b4', '#ff7f0e', '#279e68', '#d62728'], dtype=object), 'neighbors': {'connectivities_key': 'connectivities', 'distances_key': 'distances', 'params': {'method': 'umap', 'metric': 'euclidean', 'n_neighbors': 10, 'n_pcs': 50, 'random_state': 0}

In [165]:
adata.uns.keys

<bound method OverloadedDict.keys of OverloadedDict, wrapping:
	{'Souporcell4_assignment_colors': array(['#1f77b4', '#ff7f0e', '#279e68', '#d62728', '#aa40fc', '#8c564b'],
      dtype=object), 'batch_colors': array(['#1f77b4'], dtype=object), 'celltype_colors': array(['#1f77b4', '#ff7f0e', '#279e68', '#d62728', '#aa40fc', '#8c564b',
       '#e377c2', '#b5bd61'], dtype=object), 'leiden': {'params': {'n_iterations': -1, 'random_state': 0, 'resolution': 0.2}}, 'leiden_R_colors': array(['#023fa5', '#7d87b9', '#d6bcc0', '#bb7784', '#8e063b', '#4a6fe3',
       '#8595e1', '#b5bbe3', '#e6afb9', '#e07b91', '#d33f6a', '#11c638',
       '#8dd593', '#c6dec7', '#ead3c6', '#f0b98d', '#ef9708', '#0fcfc0',
       '#9cded6', '#d5eae7'], dtype=object), 'log1p': {}, 'majority_voting_colors': array(['#1f77b4', '#ff7f0e', '#279e68', '#d62728'], dtype=object), 'neighbors': {'connectivities_key': 'connectivities', 'distances_key': 'distances', 'params': {'method': 'umap', 'metric': 'euclidean', 'n_neighbors'

In [166]:
# Give a title for the dataset

In [167]:
adata.uns['title'] = 'organoid'

In [168]:
# Set the default embedding

In [169]:
adata.uns['default_embedding'] = 'X_umap'

### **Final check**

In [170]:
# view anndata object

In [171]:
adata

AnnData object with n_obs × n_vars = 9619 × 36390
    obs: 'Souporcell4_status', 'Souporcell4_assignment', 'percent_mito', 'n_counts', 'n_genes', 'batch', 'doublet_scores', 'bh_pval', 'S_score', 'G2M_score', 'phase', 'leiden', 'predicted_labels', 'over_clustering', 'majority_voting', 'conf_score', 'leiden_R', 'celltype', 'donor', 'barcodes', 'assay', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'donor_id', 'disease_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id', 'suspension_type', 'tissue_type', 'tissue_ontology_term_id'
    var: 'feature_types_x', 'feature_is_filtered', 'gene_ids_y', 'feature_types_y', 'n_cells', 'highly_variablefirst', 'highly_variable_n'
    uns: 'Souporcell4_assignment_colors', 'batch_colors', 'celltype_colors', 'leiden', 'leiden_R_colors', 'log1p', 'majority_voting_colors', 'neighbors', 'pca', 'predicted_labels_colors', 'umap', 't

In [172]:
# view obs and var data types

In [173]:
adata.obs.dtypes

Souporcell4_status                          category
Souporcell4_assignment                      category
percent_mito                                 float32
n_counts                                     float32
n_genes                                        int64
batch                                       category
doublet_scores                               float64
bh_pval                                      float64
S_score                                      float64
G2M_score                                    float64
phase                                       category
leiden                                      category
predicted_labels                            category
over_clustering                             category
majority_voting                             category
conf_score                                   float64
leiden_R                                    category
celltype                                    category
donor                                         

In [174]:
dty = pd.DataFrame(adata.var.dtypes, columns = ['dtype'])
for c in dty[dty['dtype'] == 'float64'].index.values:
    adata.var[c] = adata.var[c].astype('float32')
    print(f"changed {c} from float64 to float32")
for c in dty[dty['dtype'] == 'int64'].index.values:
    adata.var[c] = adata.var[c].astype('int32') 
    print(f"changed {c} from int64 to int32")

changed n_cells from float64 to float32
changed highly_variable_n from float64 to float32


In [175]:
dty = pd.DataFrame(adata.obs.dtypes, columns = ['dtype'])
for c in dty[dty['dtype'] == 'float64'].index.values:
    adata.obs[c] = adata.obs[c].astype('float32')
    print(f"changed {c} from float64 to float32")
for c in dty[dty['dtype'] == 'int64'].index.values:
    adata.obs[c] = adata.obs[c].astype('int32') 
    print(f"changed {c} from int64 to int32")
for c in dty[dty['dtype'] == 'object'].index.values:
    adata.obs[c] = adata.obs[c].astype('category') 
    print(f"changed {c} from object to category")

changed doublet_scores from float64 to float32
changed bh_pval from float64 to float32
changed S_score from float64 to float32
changed G2M_score from float64 to float32
changed conf_score from float64 to float32
changed n_genes from int64 to int32
changed donor from object to category
changed barcodes from object to category
changed assay from object to category


In [176]:
# view obs

In [177]:
adata.obs

Unnamed: 0,Souporcell4_status,Souporcell4_assignment,percent_mito,n_counts,n_genes,batch,doublet_scores,bh_pval,S_score,G2M_score,...,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_type,tissue_ontology_term_id
AAACCTGAGCAGCGTA,singlet,3,0.025313,11667.0,3050,first,0.109792,0.792204,-0.154529,-0.085028,...,unknown,HDBR16402,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
AAACCTGAGTACGTAA,singlet,3,0.027868,22709.0,4423,first,0.166667,0.792204,-0.097155,-0.174267,...,unknown,HDBR16402,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
AAACCTGCACCATGTA,singlet,0,0.005957,16519.0,3771,first,0.334873,0.792204,-0.070328,-0.124150,...,unknown,HDBR15934,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
AAACCTGCACCTATCC,singlet,0,0.017658,11905.0,3015,first,0.121951,0.792204,-0.082849,-0.208696,...,unknown,HDBR15934,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
AAACCTGCACTTAACG,singlet,2,0.038610,13446.0,3568,first,0.125270,0.792204,-0.119884,-0.196853,...,unknown,HDBR16392,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCATCCACGAAT,singlet,2,0.103709,2489.0,1326,first,0.083472,0.792204,0.294187,0.429153,...,unknown,HDBR16392,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
TTTGTCATCGAGAGCA,singlet,2,0.016931,15851.0,3354,first,0.177285,0.792204,-0.127767,-0.152186,...,unknown,HDBR16392,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
TTTGTCATCGCCTGAG,singlet,3,0.026760,15093.0,3791,first,0.195231,0.792204,0.478485,0.114177,...,unknown,HDBR16402,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
TTTGTCATCTATCGCC,doublet,doublets,0.015649,49251.0,6942,first,0.232082,0.792204,0.037060,-0.000052,...,unknown,pooled,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048


In [178]:
adata.obs.columns

Index(['Souporcell4_status', 'Souporcell4_assignment', 'percent_mito',
       'n_counts', 'n_genes', 'batch', 'doublet_scores', 'bh_pval', 'S_score',
       'G2M_score', 'phase', 'leiden', 'predicted_labels', 'over_clustering',
       'majority_voting', 'conf_score', 'leiden_R', 'celltype', 'donor',
       'barcodes', 'assay', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'donor_id', 'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'suspension_type', 'tissue_type',
       'tissue_ontology_term_id'],
      dtype='object')

In [179]:
# delete unwanted columns in obs

In [180]:
del adata.obs['barcodes']
del adata.obs['assay']
del adata.obs['donor']
del adata.uns['log1p']

In [181]:
# view obs

In [182]:
adata.obs

Unnamed: 0,Souporcell4_status,Souporcell4_assignment,percent_mito,n_counts,n_genes,batch,doublet_scores,bh_pval,S_score,G2M_score,...,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_type,tissue_ontology_term_id
AAACCTGAGCAGCGTA,singlet,3,0.025313,11667.0,3050,first,0.109792,0.792204,-0.154529,-0.085028,...,unknown,HDBR16402,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
AAACCTGAGTACGTAA,singlet,3,0.027868,22709.0,4423,first,0.166667,0.792204,-0.097155,-0.174267,...,unknown,HDBR16402,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
AAACCTGCACCATGTA,singlet,0,0.005957,16519.0,3771,first,0.334873,0.792204,-0.070328,-0.124150,...,unknown,HDBR15934,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
AAACCTGCACCTATCC,singlet,0,0.017658,11905.0,3015,first,0.121951,0.792204,-0.082849,-0.208696,...,unknown,HDBR15934,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
AAACCTGCACTTAACG,singlet,2,0.038610,13446.0,3568,first,0.125270,0.792204,-0.119884,-0.196853,...,unknown,HDBR16392,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCATCCACGAAT,singlet,2,0.103709,2489.0,1326,first,0.083472,0.792204,0.294187,0.429153,...,unknown,HDBR16392,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
TTTGTCATCGAGAGCA,singlet,2,0.016931,15851.0,3354,first,0.177285,0.792204,-0.127767,-0.152186,...,unknown,HDBR16392,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
TTTGTCATCGCCTGAG,singlet,3,0.026760,15093.0,3791,first,0.195231,0.792204,0.478485,0.114177,...,unknown,HDBR16402,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
TTTGTCATCTATCGCC,doublet,doublets,0.015649,49251.0,6942,first,0.232082,0.792204,0.037060,-0.000052,...,unknown,pooled,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048


In [183]:
# view var

In [184]:
adata.var

Unnamed: 0,feature_types_x,feature_is_filtered,gene_ids_y,feature_types_y,n_cells,highly_variablefirst,highly_variable_n
ENSG00000243485,Gene Expression,False,ENSG00000243485,Gene Expression,9.0,False,0.0
ENSG00000237613,Gene Expression,True,,,,,
ENSG00000186092,Gene Expression,True,,,,,
ENSG00000238009,Gene Expression,False,ENSG00000238009,Gene Expression,18.0,False,0.0
ENSG00000239945,Gene Expression,True,,,,,
...,...,...,...,...,...,...,...
ENSG00000277836,Gene Expression,True,,,,,
ENSG00000278633,Gene Expression,True,,,,,
ENSG00000276017,Gene Expression,True,,,,,
ENSG00000278817,Gene Expression,False,ENSG00000278817,Gene Expression,224.0,False,0.0


In [185]:
araw.var

Unnamed: 0_level_0,feature_types
gene_ids,Unnamed: 1_level_1
ENSG00000243485,Gene Expression
ENSG00000237613,Gene Expression
ENSG00000186092,Gene Expression
ENSG00000238009,Gene Expression
ENSG00000239945,Gene Expression
...,...
ENSG00000277836,Gene Expression
ENSG00000278633,Gene Expression
ENSG00000276017,Gene Expression
ENSG00000278817,Gene Expression


In [186]:
del araw.var['feature_types']
del adata.var['gene_ids_y']
del adata.var['feature_types_y']
del adata.var['n_cells']
del adata.var['highly_variablefirst']
del adata.var['highly_variable_n']
del adata.var['feature_types_x']

In [187]:
#view uns

In [188]:
adata.uns

OverloadedDict, wrapping:
	{'Souporcell4_assignment_colors': array(['#1f77b4', '#ff7f0e', '#279e68', '#d62728', '#aa40fc', '#8c564b'],
      dtype=object), 'batch_colors': array(['#1f77b4'], dtype=object), 'celltype_colors': array(['#1f77b4', '#ff7f0e', '#279e68', '#d62728', '#aa40fc', '#8c564b',
       '#e377c2', '#b5bd61'], dtype=object), 'leiden': {'params': {'n_iterations': -1, 'random_state': 0, 'resolution': 0.2}}, 'leiden_R_colors': array(['#023fa5', '#7d87b9', '#d6bcc0', '#bb7784', '#8e063b', '#4a6fe3',
       '#8595e1', '#b5bbe3', '#e6afb9', '#e07b91', '#d33f6a', '#11c638',
       '#8dd593', '#c6dec7', '#ead3c6', '#f0b98d', '#ef9708', '#0fcfc0',
       '#9cded6', '#d5eae7'], dtype=object), 'majority_voting_colors': array(['#1f77b4', '#ff7f0e', '#279e68', '#d62728'], dtype=object), 'neighbors': {'connectivities_key': 'connectivities', 'distances_key': 'distances', 'params': {'method': 'umap', 'metric': 'euclidean', 'n_neighbors': 10, 'n_pcs': 50, 'random_state': 0}}, 'pca': {'p

In [189]:
list(adata.uns.keys())

['Souporcell4_assignment_colors',
 'batch_colors',
 'celltype_colors',
 'leiden',
 'leiden_R_colors',
 'majority_voting_colors',
 'neighbors',
 'pca',
 'predicted_labels_colors',
 'umap',
 'title',
 'default_embedding']

In [190]:
adata.obs.columns

Index(['Souporcell4_status', 'Souporcell4_assignment', 'percent_mito',
       'n_counts', 'n_genes', 'batch', 'doublet_scores', 'bh_pval', 'S_score',
       'G2M_score', 'phase', 'leiden', 'predicted_labels', 'over_clustering',
       'majority_voting', 'conf_score', 'leiden_R', 'celltype',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'donor_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'suspension_type', 'tissue_type',
       'tissue_ontology_term_id'],
      dtype='object')

In [191]:
# Remove unwanted columns in uns

In [192]:
#check the format of expression matrix

In [193]:
adata.X

<9619x36390 sparse matrix of type '<class 'numpy.float32'>'
	with 36956351 stored elements in Compressed Sparse Row format>

In [194]:
araw.X

<9619x36390 sparse matrix of type '<class 'numpy.float32'>'
	with 37083515 stored elements in Compressed Sparse Row format>

In [195]:
#Copy raw counts to adata.raw

In [196]:
adata.raw = araw

In [197]:
obs_dtype = adata.obs.dtypes

In [198]:
obs_dtype

Souporcell4_status                          category
Souporcell4_assignment                      category
percent_mito                                 float32
n_counts                                     float32
n_genes                                        int32
batch                                       category
doublet_scores                               float32
bh_pval                                      float32
S_score                                      float32
G2M_score                                    float32
phase                                       category
leiden                                      category
predicted_labels                            category
over_clustering                             category
majority_voting                             category
conf_score                                   float32
leiden_R                                    category
celltype                                    category
assay_ontology_term_id                      ca

In [199]:
adata.write('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/lung_organoid_2024/Final_objects/lung_organoid.h5ad', compression = 'gzip')

In [200]:
adata.obs

Unnamed: 0,Souporcell4_status,Souporcell4_assignment,percent_mito,n_counts,n_genes,batch,doublet_scores,bh_pval,S_score,G2M_score,...,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_type,tissue_ontology_term_id
AAACCTGAGCAGCGTA,singlet,3,0.025313,11667.0,3050,first,0.109792,0.792204,-0.154529,-0.085028,...,unknown,HDBR16402,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
AAACCTGAGTACGTAA,singlet,3,0.027868,22709.0,4423,first,0.166667,0.792204,-0.097155,-0.174267,...,unknown,HDBR16402,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
AAACCTGCACCATGTA,singlet,0,0.005957,16519.0,3771,first,0.334873,0.792204,-0.070328,-0.124150,...,unknown,HDBR15934,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
AAACCTGCACCTATCC,singlet,0,0.017658,11905.0,3015,first,0.121951,0.792204,-0.082849,-0.208696,...,unknown,HDBR15934,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
AAACCTGCACTTAACG,singlet,2,0.038610,13446.0,3568,first,0.125270,0.792204,-0.119884,-0.196853,...,unknown,HDBR16392,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCATCCACGAAT,singlet,2,0.103709,2489.0,1326,first,0.083472,0.792204,0.294187,0.429153,...,unknown,HDBR16392,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
TTTGTCATCGAGAGCA,singlet,2,0.016931,15851.0,3354,first,0.177285,0.792204,-0.127767,-0.152186,...,unknown,HDBR16392,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
TTTGTCATCGCCTGAG,singlet,3,0.026760,15093.0,3791,first,0.195231,0.792204,0.478485,0.114177,...,unknown,HDBR16402,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048
TTTGTCATCTATCGCC,doublet,doublets,0.015649,49251.0,6942,first,0.232082,0.792204,0.037060,-0.000052,...,unknown,pooled,PATO:0000461,True,NCBITaxon:9606,unknown,unknown,cell,organoid,UBERON:0002048


In [201]:
adata.obs.columns

Index(['Souporcell4_status', 'Souporcell4_assignment', 'percent_mito',
       'n_counts', 'n_genes', 'batch', 'doublet_scores', 'bh_pval', 'S_score',
       'G2M_score', 'phase', 'leiden', 'predicted_labels', 'over_clustering',
       'majority_voting', 'conf_score', 'leiden_R', 'celltype',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'donor_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'suspension_type', 'tissue_type',
       'tissue_ontology_term_id'],
      dtype='object')

In [202]:
adata.raw.var

ENSG00000243485
ENSG00000237613
ENSG00000186092
ENSG00000238009
ENSG00000239945
...
ENSG00000277836
ENSG00000278633
ENSG00000276017
ENSG00000278817
ENSG00000277196


In [203]:
adata.var

Unnamed: 0,feature_is_filtered
ENSG00000243485,False
ENSG00000237613,True
ENSG00000186092,True
ENSG00000238009,False
ENSG00000239945,True
...,...
ENSG00000277836,True
ENSG00000278633,True
ENSG00000276017,True
ENSG00000278817,False
