### **Curating leukocytes_fetal_lung.h5ad**

Article: Early human lung immune cell development and its role in epithelial cell fate

DOI: 10.1126/sciimmunol.adf99

Data Source : https://fetal-lung-immune.cellgeni.sanger.ac.uk

##### **Mount farm**

mount-farm

##### **Packages required for curation**

In [1]:
#Import all packages required for curation

In [2]:
import numpy as np
import pandas as pd
import scanpy as sc
import scipy
from tqdm import tqdm
from scipy import sparse
from scipy.sparse import csr_matrix
import anndata as ad
import os
import subprocess
import math
import re
from collections import Counter

### **Curation Schema**

##### **X (Matrix Layers)**

##### **AnnData object**

In [3]:
# Load the AnnData object

In [4]:
adata = sc.read_h5ad('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/early_human_lung_immune/Data/leukocytes_fetal_lung.h5ad')

In [5]:
# View the AnnData object

In [6]:
adata

AnnData object with n_obs × n_vars = 77559 × 27292
    obs: 'batch', 'status', 'assignment', 'donor', 'stage', 'percent_mito', 'n_counts', 'n_genes', 'doublet_scores', 'bh_pval', 'leiden', 'phase', 'S_score', 'G2M_score', 'project', 'domain', 'gender', 'leiden_R', 'celltype', 'status_summary', 'productive_summary', 'isotype_summary', 'receptor_type', 'receptor_subtype', 'broad_type'
    var: 'n_cells-Gurdon', 'gene_ids-Gurdon'
    uns: 'batch_colors', 'celltype_colors', 'dendrogram_celltype', 'domain_colors', 'leiden', 'neighbors', 'new_celltype_colors', 'pca', 'predicted_Gurdon_colors', 'predicted_hi_colors', 'project_colors', 'rank_genes_groups', 'rank_genes_groups_filtered', 'rank_genes_groups_global', 'umap'
    obsm: 'X_bbknn_umap', 'X_pca', 'X_umap', 'X_umap_original'
    obsp: 'connectivities', 'distances'

##### **X - expression matrix**

In [7]:
# View the expression matrix of the anndata object.

In [8]:
adata.X

<77559x27292 sparse matrix of type '<class 'numpy.float32'>'
	with 181125874 stored elements in Compressed Sparse Row format>

In [9]:
# Print the matrix to check whether they are normalized counts or raw counts. if the matrix has floating numbers,they are normalized counts.if they are integers, they are raw counts.

In [10]:
print(adata.X)

  (0, 3)	0.6173298
  (0, 16)	0.6173298
  (0, 18)	0.9961889
  (0, 55)	0.6173298
  (0, 58)	0.6173298
  (0, 86)	0.6173298
  (0, 87)	0.6173298
  (0, 90)	0.6173298
  (0, 105)	0.6173298
  (0, 106)	0.6173298
  (0, 108)	0.6173298
  (0, 118)	0.6173298
  (0, 121)	0.6173298
  (0, 122)	0.6173298
  (0, 449)	1.2702976
  (0, 1235)	0.6173298
  (0, 1254)	0.6173298
  (0, 1723)	0.6173298
  (0, 1916)	0.6173298
  (0, 2347)	0.6173298
  (0, 2383)	0.6173298
  (0, 2748)	0.6173298
  (0, 3295)	0.6173298
  (0, 3410)	0.6173298
  (0, 3489)	0.6173298
  :	:
  (77558, 27202)	0.39650372
  (77558, 27207)	0.39650372
  (77558, 27213)	0.39650372
  (77558, 27224)	0.6796748
  (77558, 27227)	0.39650372
  (77558, 27228)	0.39650372
  (77558, 27229)	0.39650372
  (77558, 27230)	0.90010196
  (77558, 27231)	0.39650372
  (77558, 27232)	0.39650372
  (77558, 27233)	0.6796748
  (77558, 27243)	0.39650372
  (77558, 27245)	0.6796748
  (77558, 27247)	0.6796748
  (77558, 27248)	0.90010196
  (77558, 27251)	0.39650372
  (77558, 27252)	0.67967

##### **Raw counts matrix**

In [11]:
araw = sc.read_h5ad('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/early_human_lung_immune/Data/Raw/leukocytes_fetal_lung_full_raw.h5ad')

In [12]:
araw

AnnData object with n_obs × n_vars = 77559 × 33538
    obs: 'batch', 'barcodes'

In [13]:
#adata.raw = araw

##### **Variables(var)**

In [14]:
# View the var of anndata and raw object

In [15]:
adata.var

Unnamed: 0,n_cells-Gurdon,gene_ids-Gurdon
A1BG,True,ENSG00000121410
A1BG-AS1,787.0,ENSG00000268895
A1CF,49.0,ENSG00000148584
A2M,True,ENSG00000175899
A2M-AS1,796.0,ENSG00000245105
...,...,...
ZXDC,True,ENSG00000070476
ZYG11A,50.0,ENSG00000203995
ZYG11B,True,ENSG00000162378
ZYX,52233.0,ENSG00000159840


In [16]:
adata.var['name'] = adata.var.index

In [17]:
adata.var

Unnamed: 0,n_cells-Gurdon,gene_ids-Gurdon,name
A1BG,True,ENSG00000121410,A1BG
A1BG-AS1,787.0,ENSG00000268895,A1BG-AS1
A1CF,49.0,ENSG00000148584,A1CF
A2M,True,ENSG00000175899,A2M
A2M-AS1,796.0,ENSG00000245105,A2M-AS1
...,...,...,...
ZXDC,True,ENSG00000070476,ZXDC
ZYG11A,50.0,ENSG00000203995,ZYG11A
ZYG11B,True,ENSG00000162378,ZYG11B
ZYX,52233.0,ENSG00000159840,ZYX


In [18]:
araw.var

ENSG00000121410
ENSG00000268895
ENSG00000148584
ENSG00000175899
ENSG00000245105
...
ENSG00000203995
ENSG00000162378
ENSG00000159840
ENSG00000074755
ENSG00000272920


In [19]:
adata.var.index = adata.var['gene_ids-Gurdon']

In [20]:
adata.var

Unnamed: 0_level_0,n_cells-Gurdon,gene_ids-Gurdon,name
gene_ids-Gurdon,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000121410,True,ENSG00000121410,A1BG
ENSG00000268895,787.0,ENSG00000268895,A1BG-AS1
ENSG00000148584,49.0,ENSG00000148584,A1CF
ENSG00000175899,True,ENSG00000175899,A2M
ENSG00000245105,796.0,ENSG00000245105,A2M-AS1
...,...,...,...
ENSG00000070476,True,ENSG00000070476,ZXDC
ENSG00000203995,50.0,ENSG00000203995,ZYG11A
ENSG00000162378,True,ENSG00000162378,ZYG11B
ENSG00000159840,52233.0,ENSG00000159840,ZYX


In [21]:
araw.var

ENSG00000121410
ENSG00000268895
ENSG00000148584
ENSG00000175899
ENSG00000245105
...
ENSG00000203995
ENSG00000162378
ENSG00000159840
ENSG00000074755
ENSG00000272920


In [22]:
# Load the approved genes file.

In [23]:
approved_genes = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/Endometrium_reference_integrated_atlas/genes_approved.csv')

In [24]:
#Create a dictionary from the approved genes file 

In [25]:
genedict = {key: 1 for key in list(approved_genes.feature_id)}

In [26]:
genedict

{'ERCC-00002': 1,
 'ERCC-00003': 1,
 'ERCC-00004': 1,
 'ERCC-00009': 1,
 'ERCC-00012': 1,
 'ERCC-00013': 1,
 'ERCC-00014': 1,
 'ERCC-00016': 1,
 'ERCC-00017': 1,
 'ERCC-00019': 1,
 'ERCC-00022': 1,
 'ERCC-00024': 1,
 'ERCC-00025': 1,
 'ERCC-00028': 1,
 'ERCC-00031': 1,
 'ERCC-00033': 1,
 'ERCC-00034': 1,
 'ERCC-00035': 1,
 'ERCC-00039': 1,
 'ERCC-00040': 1,
 'ERCC-00041': 1,
 'ERCC-00042': 1,
 'ERCC-00043': 1,
 'ERCC-00044': 1,
 'ERCC-00046': 1,
 'ERCC-00048': 1,
 'ERCC-00051': 1,
 'ERCC-00053': 1,
 'ERCC-00054': 1,
 'ERCC-00057': 1,
 'ERCC-00058': 1,
 'ERCC-00059': 1,
 'ERCC-00060': 1,
 'ERCC-00061': 1,
 'ERCC-00062': 1,
 'ERCC-00067': 1,
 'ERCC-00069': 1,
 'ERCC-00071': 1,
 'ERCC-00073': 1,
 'ERCC-00074': 1,
 'ERCC-00075': 1,
 'ERCC-00076': 1,
 'ERCC-00077': 1,
 'ERCC-00078': 1,
 'ERCC-00079': 1,
 'ERCC-00081': 1,
 'ERCC-00083': 1,
 'ERCC-00084': 1,
 'ERCC-00085': 1,
 'ERCC-00086': 1,
 'ERCC-00092': 1,
 'ERCC-00095': 1,
 'ERCC-00096': 1,
 'ERCC-00097': 1,
 'ERCC-00098': 1,
 'ERCC-000

In [27]:
len(genedict)

119799

In [28]:
#Filter out the genes which are not in the approved genes file.

In [29]:
var_to_keep_adata = [x for x in adata.var_names if (x in genedict)]
var_to_keep_araw = [x for x in araw.var_names if (x in genedict)]

In [30]:
len(var_to_keep_adata)

26278

In [31]:
len(var_to_keep_araw)

33137

In [32]:
adata.var

Unnamed: 0_level_0,n_cells-Gurdon,gene_ids-Gurdon,name
gene_ids-Gurdon,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000121410,True,ENSG00000121410,A1BG
ENSG00000268895,787.0,ENSG00000268895,A1BG-AS1
ENSG00000148584,49.0,ENSG00000148584,A1CF
ENSG00000175899,True,ENSG00000175899,A2M
ENSG00000245105,796.0,ENSG00000245105,A2M-AS1
...,...,...,...
ENSG00000070476,True,ENSG00000070476,ZXDC
ENSG00000203995,50.0,ENSG00000203995,ZYG11A
ENSG00000162378,True,ENSG00000162378,ZYG11B
ENSG00000159840,52233.0,ENSG00000159840,ZYX


In [33]:
araw.var

ENSG00000121410
ENSG00000268895
ENSG00000148584
ENSG00000175899
ENSG00000245105
...
ENSG00000203995
ENSG00000162378
ENSG00000159840
ENSG00000074755
ENSG00000272920


In [34]:
# Modify the anndata object by filtering out the filtered genes.

In [35]:
adata = adata[:, adata.var.index.isin(var_to_keep_adata)]
araw = araw[:, araw.var.index.isin(var_to_keep_araw)]

In [36]:
adata.var

Unnamed: 0_level_0,n_cells-Gurdon,gene_ids-Gurdon,name
gene_ids-Gurdon,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000121410,True,ENSG00000121410,A1BG
ENSG00000268895,787.0,ENSG00000268895,A1BG-AS1
ENSG00000148584,49.0,ENSG00000148584,A1CF
ENSG00000175899,True,ENSG00000175899,A2M
ENSG00000245105,796.0,ENSG00000245105,A2M-AS1
...,...,...,...
ENSG00000070476,True,ENSG00000070476,ZXDC
ENSG00000203995,50.0,ENSG00000203995,ZYG11A
ENSG00000162378,True,ENSG00000162378,ZYG11B
ENSG00000159840,52233.0,ENSG00000159840,ZYX


In [37]:
# View var

In [38]:
adata.var

Unnamed: 0_level_0,n_cells-Gurdon,gene_ids-Gurdon,name
gene_ids-Gurdon,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000121410,True,ENSG00000121410,A1BG
ENSG00000268895,787.0,ENSG00000268895,A1BG-AS1
ENSG00000148584,49.0,ENSG00000148584,A1CF
ENSG00000175899,True,ENSG00000175899,A2M
ENSG00000245105,796.0,ENSG00000245105,A2M-AS1
...,...,...,...
ENSG00000070476,True,ENSG00000070476,ZXDC
ENSG00000203995,50.0,ENSG00000203995,ZYG11A
ENSG00000162378,True,ENSG00000162378,ZYG11B
ENSG00000159840,52233.0,ENSG00000159840,ZYX


In [39]:
araw.var

ENSG00000121410
ENSG00000268895
ENSG00000148584
ENSG00000175899
ENSG00000245105
...
ENSG00000203995
ENSG00000162378
ENSG00000159840
ENSG00000074755
ENSG00000272920


feature is filtered

In [40]:
del adata.var['name']
del adata.var['gene_ids-Gurdon']

In [41]:
def add_zero():
	global adata
	global araw
	if araw.shape[1] > adata.shape[1]:
		genes_add = [x for x in araw.var.index.to_list() if x not in adata.var.index.to_list()]
		new_matrix = sparse.csr_matrix((adata.X.data, adata.X.indices, adata.X.indptr), shape = araw.shape)
		all_genes = adata.var.index.to_list()
		all_genes.extend(genes_add)
		new_var = pd.DataFrame(index=all_genes)
		new_var = pd.merge(new_var, araw.var, left_index=True, right_index=True, how='left')
		new_var['feature_is_filtered'] = False
		new_var.loc[genes_add, 'feature_is_filtered'] = True
		new_adata = ad.AnnData(X=new_matrix, obs=adata.obs, var=new_var, uns=adata.uns, obsm=adata.obsm)
		if adata.layers:
			for layer in adata.layers:
				new_layer = sparse.csr_matrix((adata.layers[layer].data, adata.layers[layer].indices, adata.layers[layer].indptr), shape = araw.shape)
				new_adata.layers[layer] = new_layer
		new_adata = new_adata[:,araw.var.index.to_list()]
		new_adata.var = new_adata.var.merge(adata.var, left_index=True, right_index=True, how='left')
		adata = new_adata
	else:
		adata.var['feature_is_filtered'] = False


In [42]:
add_zero()

In [43]:
adata.var

Unnamed: 0,feature_is_filtered,n_cells-Gurdon
ENSG00000121410,False,True
ENSG00000268895,False,787.0
ENSG00000148584,False,49.0
ENSG00000175899,False,True
ENSG00000245105,False,796.0
...,...,...
ENSG00000203995,False,50.0
ENSG00000162378,False,True
ENSG00000159840,False,52233.0
ENSG00000074755,False,True


In [44]:
list(adata.var['feature_is_filtered'].unique())

[False, True]

In [45]:
false_count = (adata.var['feature_is_filtered']== False).sum()

In [46]:
false_count

26278

In [47]:
adata.var

Unnamed: 0,feature_is_filtered,n_cells-Gurdon
ENSG00000121410,False,True
ENSG00000268895,False,787.0
ENSG00000148584,False,49.0
ENSG00000175899,False,True
ENSG00000245105,False,796.0
...,...,...
ENSG00000203995,False,50.0
ENSG00000162378,False,True
ENSG00000159840,False,52233.0
ENSG00000074755,False,True


In [48]:
araw.var

ENSG00000121410
ENSG00000268895
ENSG00000148584
ENSG00000175899
ENSG00000245105
...
ENSG00000203995
ENSG00000162378
ENSG00000159840
ENSG00000074755
ENSG00000272920


#### **obs (Cell metadata)**

In [49]:
#view obs

In [50]:
adata.obs

Unnamed: 0,batch,status,assignment,donor,stage,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,...,domain,gender,leiden_R,celltype,status_summary,productive_summary,isotype_summary,receptor_type,receptor_subtype,broad_type
AAACGGGAGATCCCAT-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.016132,11710.0,2583,0.054545,0.660933,...,Myeloid,M,110,DC2,,,,,,Myeloid
AAACGGGCAGTGACAG-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.034385,2387.0,1243,0.090426,0.660933,...,NK,M,21,Intermediate NK,,,,,,NK
AAAGATGTCCAAACTG-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.025609,15486.0,3622,0.159533,0.660933,...,T,M,13,CD8 T,TRB_only,T,unassigned,TCR,TRA+TRB,T
AAAGATGTCGCCATAA-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.022353,16489.0,3686,0.083969,0.660933,...,Myeloid,M,112,DC2,,,,,,Myeloid
AAAGTAGAGACGCACA-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.133360,4276.0,2415,0.040853,0.660933,...,B,M,120,Pro-B,,,,,,B
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTCACGGAGCCCTAAT-WSSS_F_LNG8713184,WSSS_F_LNG8713184,,,HDBR15233,15.0,0.039884,9581.0,3133,0.330579,0.826283,...,Megk & Mast,M,242,Mast,,,,,,Myeloid
ACGGGCTGTTCAGCGC-WSSS_F_LNG8713185,WSSS_F_LNG8713185,,,HDBR15233,15.0,0.034237,2706.0,1305,0.394231,0.122739,...,Megk & Mast,M,242,Mast,,,,,,Myeloid
CATCGAACATTAACCG-WSSS_F_LNG8713185,WSSS_F_LNG8713185,,,HDBR15233,15.0,0.040634,7625.0,2735,0.330579,0.122739,...,Megk & Mast,M,242,Mast,,,,,,Myeloid
CTCAGAAGTACTCTCC-WSSS_F_LNG8713185,WSSS_F_LNG8713185,,,HDBR15233,15.0,0.027229,13147.0,3721,0.245161,0.122739,...,Megk & Mast,M,243,Baso/Eosino,,,,,,Myeloid


In [51]:
# view the column names in obs

In [52]:
adata.obs.columns

Index(['batch', 'status', 'assignment', 'donor', 'stage', 'percent_mito',
       'n_counts', 'n_genes', 'doublet_scores', 'bh_pval', 'leiden', 'phase',
       'S_score', 'G2M_score', 'project', 'domain', 'gender', 'leiden_R',
       'celltype', 'status_summary', 'productive_summary', 'isotype_summary',
       'receptor_type', 'receptor_subtype', 'broad_type'],
      dtype='object')

In [53]:
adata.obs

Unnamed: 0,batch,status,assignment,donor,stage,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,...,domain,gender,leiden_R,celltype,status_summary,productive_summary,isotype_summary,receptor_type,receptor_subtype,broad_type
AAACGGGAGATCCCAT-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.016132,11710.0,2583,0.054545,0.660933,...,Myeloid,M,110,DC2,,,,,,Myeloid
AAACGGGCAGTGACAG-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.034385,2387.0,1243,0.090426,0.660933,...,NK,M,21,Intermediate NK,,,,,,NK
AAAGATGTCCAAACTG-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.025609,15486.0,3622,0.159533,0.660933,...,T,M,13,CD8 T,TRB_only,T,unassigned,TCR,TRA+TRB,T
AAAGATGTCGCCATAA-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.022353,16489.0,3686,0.083969,0.660933,...,Myeloid,M,112,DC2,,,,,,Myeloid
AAAGTAGAGACGCACA-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.133360,4276.0,2415,0.040853,0.660933,...,B,M,120,Pro-B,,,,,,B
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTCACGGAGCCCTAAT-WSSS_F_LNG8713184,WSSS_F_LNG8713184,,,HDBR15233,15.0,0.039884,9581.0,3133,0.330579,0.826283,...,Megk & Mast,M,242,Mast,,,,,,Myeloid
ACGGGCTGTTCAGCGC-WSSS_F_LNG8713185,WSSS_F_LNG8713185,,,HDBR15233,15.0,0.034237,2706.0,1305,0.394231,0.122739,...,Megk & Mast,M,242,Mast,,,,,,Myeloid
CATCGAACATTAACCG-WSSS_F_LNG8713185,WSSS_F_LNG8713185,,,HDBR15233,15.0,0.040634,7625.0,2735,0.330579,0.122739,...,Megk & Mast,M,242,Mast,,,,,,Myeloid
CTCAGAAGTACTCTCC-WSSS_F_LNG8713185,WSSS_F_LNG8713185,,,HDBR15233,15.0,0.027229,13147.0,3721,0.245161,0.122739,...,Megk & Mast,M,243,Baso/Eosino,,,,,,Myeloid


In [54]:
list(adata.obs['batch'].unique())

['Imm_FLNG8965963',
 'Imm_FLNG8965964',
 'Imm_FLNG8965965',
 'Imm_FLNG8965966',
 'Imm_FLNG8965968',
 'Imm_FLNG8966013',
 'Imm_FLNG8966014',
 'Imm_FLNG8966015',
 'Imm_FLNG8966016',
 'Imm_FLNG8966017',
 'Imm_FLNG8966018',
 'Imm_FLNG8965969-Imm_FLNG8965970',
 'Imm_FLNG8966063-Imm_FLNG8966066',
 'Imm_FLNG8966064-Imm_FLNG8966067',
 'Imm_FLNG8966065-CV001_KM9294203-Imm_FLNG8966068-CV001_KM9294204',
 'Imm_FLNG9347056-Imm_FLNG9347059',
 '5891STDY8062349',
 '5891STDY8062350',
 '5891STDY8062351',
 '5891STDY8062352',
 '5891STDY8062353',
 '5891STDY8062354',
 '5891STDY8062355',
 '5891STDY8062356',
 'WSSS8012016',
 'WSSS8011222',
 'WSSS_F_LNG8713176',
 'WSSS_F_LNG8713177',
 'WSSS_F_LNG8713178',
 'WSSS_F_LNG8713179',
 'WSSS_F_LNG8713180',
 'WSSS_F_LNG8713181',
 'WSSS_F_LNG8713184',
 'WSSS_F_LNG8713185',
 'WSSS_F_LNG8713186',
 'WSSS_F_LNG8713187',
 'WSSS_F_LNG8713188',
 'WSSS_F_LNG8713189',
 'WSSS_F_LNG8713190',
 'WSSS_F_LNG8713191',
 '5891STDY9030806',
 '5891STDY9030807',
 '5891STDY9030808',
 '5891ST

#### **assay_ontology_term_id**

In [55]:
adata.obs['barcodes'] = adata.obs_names

In [56]:
adata.obs['barcodes'] = adata.obs['barcodes'].str.extract(r'([ATGC]{10,16})', expand=False)

In [57]:
assay_info = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/covid_cell_atlas_ALI_organoids/10X_barcode_table_assay.csv')

In [58]:
mapping = dict(zip(assay_info['barcode'], assay_info['assay']))

In [59]:
adata.obs['assay'] = adata.obs['barcodes'].map(mapping)

In [60]:
list(adata.obs['assay'].unique())

['3pv2_5pv1_5pv2', '3pv2_5pv1_5pv2+3pv3', '3pv2_5pv1_5pv2+multiome', nan]

In [61]:
mapping= {'3pv2_5pv1_5pv2':'EFO:0030004', '3pv2_5pv1_5pv2+3pv3':'EFO:0030004', '3pv2_5pv1_5pv2+multiome':'EFO:0030004','nan':'EFO:0030004'}

In [62]:
mapping= { '3GEX' :'EFO:0009899', '5GEX':'EFO:0030004' , 'nan' :'EFO:0030004'}

In [63]:
mapping={'nan':'EFO:0030004',
'nan':'EFO:0009899',
'Imm_FLNG8965963':'EFO:0009899',
'Imm_FLNG8965964':'EFO:0009899',
'Imm_FLNG8965965':'EFO:0009899',
'Imm_FLNG8965966':'EFO:0009899',
'Imm_FLNG8965968':'EFO:0009899',
'Imm_FLNG8966013':'EFO:0009899',
'Imm_FLNG8966014':'EFO:0009899',
'Imm_FLNG8966015':'EFO:0009899',
'Imm_FLNG8966016':'EFO:0009899',
'Imm_FLNG8966017':'EFO:0009899',
'Imm_FLNG8966018':'EFO:0009899',
'Imm_FLNG8965969-Imm_FLNG8965970':'EFO:0009899',
'Imm_FLNG8966063-Imm_FLNG8966066':'EFO:0009899',
'Imm_FLNG8966064-Imm_FLNG8966067':'EFO:0009899',
'Imm_FLNG8966065-CV001_KM9294203-Imm_FLNG8966068-CV001_KM9294204':'EFO:0009899',
'Imm_FLNG9347056-Imm_FLNG9347059':'EFO:0009899',
'5891STDY8062349':'EFO:0011025',
'5891STDY8062350':'EFO:0011025',
'5891STDY8062351':'EFO:0011025',
'5891STDY8062352':'EFO:0011025',
'5891STDY8062353':'EFO:0011025',
'5891STDY8062354':'EFO:0011025',
'5891STDY8062355':'EFO:0011025',
'5891STDY8062356':'EFO:0011025',
'WSSS8012016':'EFO:0011025',
'WSSS8011222':'EFO:0011025',
'WSSS_F_LNG8713176':'EFO:0011025',
'WSSS_F_LNG8713177':'EFO:0011025',
'WSSS_F_LNG8713178':'EFO:0011025',
'WSSS_F_LNG8713179':'EFO:0011025',
'WSSS_F_LNG8713180':'EFO:0011025',
'WSSS_F_LNG8713181':'EFO:0011025',
'WSSS_F_LNG8713184':'EFO:0011025',
'WSSS_F_LNG8713185':'EFO:0011025',
'WSSS_F_LNG8713186':'EFO:0011025',
'WSSS_F_LNG8713187':'EFO:0011025',
'WSSS_F_LNG8713188':'EFO:0011025',
'WSSS_F_LNG8713189':'EFO:0011025',
'WSSS_F_LNG8713190':'EFO:0011025',
'WSSS_F_LNG8713191':'EFO:0011025',
'5891STDY9030806':'EFO:0011025',
'5891STDY9030807':'EFO:0011025',
'5891STDY9030808':'EFO:0011025',
'5891STDY9030809':'EFO:0011025',
'5891STDY9030810':'EFO:0011025'}

In [64]:
adata.obs['assay_ontology_term_id'] =  adata.obs['batch'].map(mapping)

In [65]:
adata.obs['assay_ontology_term_id'] = adata.obs['assay_ontology_term_id'].astype('category')

In [66]:
# view adata.obs

In [67]:
adata.obs

Unnamed: 0,batch,status,assignment,donor,stage,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,...,celltype,status_summary,productive_summary,isotype_summary,receptor_type,receptor_subtype,broad_type,barcodes,assay,assay_ontology_term_id
AAACGGGAGATCCCAT-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.016132,11710.0,2583,0.054545,0.660933,...,DC2,,,,,,Myeloid,AAACGGGAGATCCCAT,3pv2_5pv1_5pv2,EFO:0009899
AAACGGGCAGTGACAG-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.034385,2387.0,1243,0.090426,0.660933,...,Intermediate NK,,,,,,NK,AAACGGGCAGTGACAG,3pv2_5pv1_5pv2,EFO:0009899
AAAGATGTCCAAACTG-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.025609,15486.0,3622,0.159533,0.660933,...,CD8 T,TRB_only,T,unassigned,TCR,TRA+TRB,T,AAAGATGTCCAAACTG,3pv2_5pv1_5pv2,EFO:0009899
AAAGATGTCGCCATAA-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.022353,16489.0,3686,0.083969,0.660933,...,DC2,,,,,,Myeloid,AAAGATGTCGCCATAA,3pv2_5pv1_5pv2,EFO:0009899
AAAGTAGAGACGCACA-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.133360,4276.0,2415,0.040853,0.660933,...,Pro-B,,,,,,B,AAAGTAGAGACGCACA,3pv2_5pv1_5pv2,EFO:0009899
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTCACGGAGCCCTAAT-WSSS_F_LNG8713184,WSSS_F_LNG8713184,,,HDBR15233,15.0,0.039884,9581.0,3133,0.330579,0.826283,...,Mast,,,,,,Myeloid,GTCACGGAGCCCTAAT,3pv2_5pv1_5pv2,EFO:0011025
ACGGGCTGTTCAGCGC-WSSS_F_LNG8713185,WSSS_F_LNG8713185,,,HDBR15233,15.0,0.034237,2706.0,1305,0.394231,0.122739,...,Mast,,,,,,Myeloid,ACGGGCTGTTCAGCGC,3pv2_5pv1_5pv2,EFO:0011025
CATCGAACATTAACCG-WSSS_F_LNG8713185,WSSS_F_LNG8713185,,,HDBR15233,15.0,0.040634,7625.0,2735,0.330579,0.122739,...,Mast,,,,,,Myeloid,CATCGAACATTAACCG,3pv2_5pv1_5pv2,EFO:0011025
CTCAGAAGTACTCTCC-WSSS_F_LNG8713185,WSSS_F_LNG8713185,,,HDBR15233,15.0,0.027229,13147.0,3721,0.245161,0.122739,...,Baso/Eosino,,,,,,Myeloid,CTCAGAAGTACTCTCC,3pv2_5pv1_5pv2,EFO:0011025


#### **cell_type_ontology_term_id**

In [68]:
#identify the column in adata.obs related. to cell type annotation

In [69]:
adata.obs.columns

Index(['batch', 'status', 'assignment', 'donor', 'stage', 'percent_mito',
       'n_counts', 'n_genes', 'doublet_scores', 'bh_pval', 'leiden', 'phase',
       'S_score', 'G2M_score', 'project', 'domain', 'gender', 'leiden_R',
       'celltype', 'status_summary', 'productive_summary', 'isotype_summary',
       'receptor_type', 'receptor_subtype', 'broad_type', 'barcodes', 'assay',
       'assay_ontology_term_id'],
      dtype='object')

In [70]:
list(adata.obs['celltype'].unique())

['DC2',
 'Intermediate NK',
 'CD8 T',
 'Pro-B',
 'ILC3',
 'MΦ',
 'CD56bright NK',
 'CD16+ NK',
 'Cycling NK',
 'CD4 T',
 'pDC',
 'Pre-pDC/DC5',
 'ILCP',
 'Type 3 innate T',
 'CD5+ Mature B',
 'ILC2',
 'Type 1 innate T',
 'CD5- Mature B',
 'Alveolar fibro',
 'Mast',
 'Neutrophil',
 'TNC+ fibro',
 'S100A12-hi CD14+ mono',
 'Pro-B/Pre-B transition',
 'Late pro-B',
 'aNK',
 'DC1',
 'SMC',
 'Pre-pro-B',
 'CXCL9+ MΦ',
 'Large pre-B',
 'CD16+ mono',
 'S100A12-lo CD14+ mono',
 'Treg',
 'Megk progenitor',
 'κ small pre-B',
 'Immature B',
 'GMP',
 'Adventitial fibro',
 'Distal epithelial',
 'Late pre-B',
 'Secretory epithelial',
 'Pericyte',
 'aDC',
 'Endo',
 'Erythroid',
 'Pro-monocyte',
 'Baso/Eosino',
 'CMP',
 'LMPP/ELP',
 'MEP',
 'HSC/MPP',
 'λ small pre-B',
 'APOE+ MΦ',
 'Myelocyte-like',
 'Pro-myelocyte',
 'T progenitors',
 'Megk',
 'Ciliated']

In [71]:
# create a dictionary of cell type and ontology term

In [72]:
mapping= {
'DC2':'CL:0000990',
'Intermediate NK':'CL:0000623',
'CD8 T':'CL:0000625',
'Pro-B':'CL:0000826',
'ILC3':'CL:0001078',
'MΦ':'CL:0000235',
'CD56bright NK':'CL:0000938',
'CD16+ NK':'CL:0000939',
'Cycling NK':'CL:0000623',
'CD4 T':'CL:0000624',
'pDC':'CL:0000784',
'Pre-pDC/DC5':'CL:0000784',
'ILCP':'CL:0001065',
'Type 3 innate T':'CL:0001078',
'CD5+ Mature B':'CL:0000785',
'ILC2':'CL:0001069',
'Type 1 innate T':'CL:0001067',
'CD5- Mature B':'CL:0000785',
'Alveolar fibro':'CL:4028004',
'Mast':'CL:0000097',
'Neutrophil':'CL:0000775',
'TNC+ fibro':'CL:0000057',
'S100A12-hi CD14+ mono':'CL:0001054',
'Pro-B/Pre-B transition':'CL:0002045',
'Late pro-B':'CL:0002048',
'aNK':'CL:0000623',
'DC1':'CL:0000990',
'SMC':'CL:0000192',
'Pre-pro-B':'CL:0002046',
'CXCL9+ MΦ':'CL:0000235',
'Large pre-B':'CL:0000957',
'CD16+ mono':'CL:0002396',
'S100A12-lo CD14+ mono':'CL:0001054',
'Treg':'CL:0000815',
'Megk progenitor':'CL:0000553',
'κ small pre-B':'CL:0002053',
'Immature B':'CL:0000816',
'GMP':'CL:0000557',
'Adventitial fibro':'CL:4028006',
'Distal epithelial':'CL:0002305',
'Late pre-B':'CL:0000817',
'Secretory epithelial':'CL:1000272',
'Pericyte':'CL:0009089',
'aDC':'CL:0000451',
'Endo':'CL:0000115',
'Erythroid':'CL:0000764',
'Pro-monocyte':'CL:0000576',
'Baso/Eosino':'CL:0000094',
'CMP':'CL:0000049',
'LMPP/ELP':'CL:0000936',
'MEP':'CL:0000050',
'HSC/MPP':'CL:0000837',
'λ small pre-B':'CL:0002053',
'APOE+ MΦ':'CL:0000235',
'Myelocyte-like':'CL:0002193',
'Pro-myelocyte':'CL:0002193',
'T progenitors':'CL:0000827',
'Megk':'CL:0000556',
'Ciliated':'CL:0000064'}

In [73]:
# add the cell_type_ontology_term_id column

In [74]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['celltype'].map(mapping)

In [75]:
# change datatype of the column

In [76]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_type_ontology_term_id'].astype('category')

In [77]:
# view adata.obs

In [78]:
adata.obs

Unnamed: 0,batch,status,assignment,donor,stage,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,...,status_summary,productive_summary,isotype_summary,receptor_type,receptor_subtype,broad_type,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id
AAACGGGAGATCCCAT-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.016132,11710.0,2583,0.054545,0.660933,...,,,,,,Myeloid,AAACGGGAGATCCCAT,3pv2_5pv1_5pv2,EFO:0009899,CL:0000990
AAACGGGCAGTGACAG-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.034385,2387.0,1243,0.090426,0.660933,...,,,,,,NK,AAACGGGCAGTGACAG,3pv2_5pv1_5pv2,EFO:0009899,CL:0000623
AAAGATGTCCAAACTG-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.025609,15486.0,3622,0.159533,0.660933,...,TRB_only,T,unassigned,TCR,TRA+TRB,T,AAAGATGTCCAAACTG,3pv2_5pv1_5pv2,EFO:0009899,CL:0000625
AAAGATGTCGCCATAA-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.022353,16489.0,3686,0.083969,0.660933,...,,,,,,Myeloid,AAAGATGTCGCCATAA,3pv2_5pv1_5pv2,EFO:0009899,CL:0000990
AAAGTAGAGACGCACA-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.133360,4276.0,2415,0.040853,0.660933,...,,,,,,B,AAAGTAGAGACGCACA,3pv2_5pv1_5pv2,EFO:0009899,CL:0000826
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTCACGGAGCCCTAAT-WSSS_F_LNG8713184,WSSS_F_LNG8713184,,,HDBR15233,15.0,0.039884,9581.0,3133,0.330579,0.826283,...,,,,,,Myeloid,GTCACGGAGCCCTAAT,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097
ACGGGCTGTTCAGCGC-WSSS_F_LNG8713185,WSSS_F_LNG8713185,,,HDBR15233,15.0,0.034237,2706.0,1305,0.394231,0.122739,...,,,,,,Myeloid,ACGGGCTGTTCAGCGC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097
CATCGAACATTAACCG-WSSS_F_LNG8713185,WSSS_F_LNG8713185,,,HDBR15233,15.0,0.040634,7625.0,2735,0.330579,0.122739,...,,,,,,Myeloid,CATCGAACATTAACCG,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097
CTCAGAAGTACTCTCC-WSSS_F_LNG8713185,WSSS_F_LNG8713185,,,HDBR15233,15.0,0.027229,13147.0,3721,0.245161,0.122739,...,,,,,,Myeloid,CTCAGAAGTACTCTCC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000094


#### **development_stage_ontology_term_id**

In [79]:
# identify the column in adata which corresponds to age

In [80]:
adata.obs.columns

Index(['batch', 'status', 'assignment', 'donor', 'stage', 'percent_mito',
       'n_counts', 'n_genes', 'doublet_scores', 'bh_pval', 'leiden', 'phase',
       'S_score', 'G2M_score', 'project', 'domain', 'gender', 'leiden_R',
       'celltype', 'status_summary', 'productive_summary', 'isotype_summary',
       'receptor_type', 'receptor_subtype', 'broad_type', 'barcodes', 'assay',
       'assay_ontology_term_id', 'cell_type_ontology_term_id'],
      dtype='object')

In [81]:
list(adata.obs['stage'].unique())

[12.0, 20.0, 9.0, 8.0, 18.0, 22.0, 15.0, 6.86, 5.0, 11.0, nan]

In [82]:
adata.obs['stage'] = adata.obs['stage'].astype('str')

In [83]:
mapping={'HDBR14732':'HsapDv:0000050',
'HDBR14774':'HsapDv:0000058',
'HDBR14776':'HsapDv:0000050',
'HDBR14787':'HsapDv:0000058',
'HDBR14794':'HsapDv:0000050',
'HDBR14806':'HsapDv:0000058',
'HDBR14815':'HsapDv:0000050',
'HDBR14944':'HsapDv:0000047',
'HDBR14969':'HsapDv:0000046',
'HDBR15024':'HsapDv:0000058',
'HDBR15084':'HsapDv:0000046',
'HDBR15111':'HsapDv:0000047',
'HDBR15204':'HsapDv:0000058',
'HDBR15246':'HsapDv:0000058',
'HDBR15279+15280':'unknown',
'HDBR15332':'HsapDv:0000050',
'HDBR15383':'HsapDv:0000058',
'HDBR15404':'HsapDv:0000047',
'HDBR15503':'HsapDv:0000047',
'HDBR14804':'HsapDv:0000059',
'HDBR15168':'HsapDv:0000059',
'HDBR14854':'HsapDv:0000057',
'HDBR15167':'HsapDv:0000057',
'HDBR14808':'HsapDv:0000055',
'HDBR15219':'HsapDv:0000055',
'F29':'HsapDv:0000054',
'F41':'HsapDv:0000054',
'F78':'HsapDv:0000054',
'F21':'HsapDv:0000053',
'F72':'HsapDv:0000053',
'F30':'HsapDv:0000052',
'F50':'HsapDv:0000052',
'F51':'HsapDv:0000052',
'F66':'HsapDv:0000052',
'F73':'HsapDv:0000052',
'HDBR14853':'HsapDv:0000052',
'HDBR15233':'HsapDv:0000052',
'F38':'HsapDv:0000051',
'F71':'HsapDv:0000051',
'F45':'HsapDv:0000050',
'F67':'HsapDv:0000049',
'F23':'HsapDv:0000048',
'F64':'HsapDv:0000048',
'F69':'HsapDv:0000048',
'HDBR15280':'HsapDv:0000048',
'F19':'HsapDv:0000047',
'F22':'HsapDv:0000047',
'F33':'HsapDv:0000046',
'HDBR15279':'HsapDv:0000046',
'F34':'HsapDv:0000030',
'F61':'HsapDv:0000027',
'BRC2192':'HsapDv:0000026',
'F32':'HsapDv:0000026',
'BRC2188':'HsapDv:0000023',
'F37':'HsapDv:0000020',
'F35':'HsapDv:0000002',
'HDBR14706':'HsapDv:0000058'}

In [84]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['donor'].map(mapping)

In [85]:
# change datatype of the column

In [86]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['development_stage_ontology_term_id'].astype('category')

In [87]:
# view unique values of development_stage_ontology_term_id column

In [88]:
list(adata.obs['development_stage_ontology_term_id'].unique())

['HsapDv:0000050',
 'HsapDv:0000058',
 'HsapDv:0000047',
 'HsapDv:0000046',
 'HsapDv:0000055',
 'HsapDv:0000059',
 'HsapDv:0000052',
 'HsapDv:0000057',
 'HsapDv:0000026',
 'HsapDv:0000023',
 'HsapDv:0000048',
 'unknown']

In [89]:
# view adata.obs

In [90]:
adata.obs

Unnamed: 0,batch,status,assignment,donor,stage,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,...,productive_summary,isotype_summary,receptor_type,receptor_subtype,broad_type,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id
AAACGGGAGATCCCAT-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.016132,11710.0,2583,0.054545,0.660933,...,,,,,Myeloid,AAACGGGAGATCCCAT,3pv2_5pv1_5pv2,EFO:0009899,CL:0000990,HsapDv:0000050
AAACGGGCAGTGACAG-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.034385,2387.0,1243,0.090426,0.660933,...,,,,,NK,AAACGGGCAGTGACAG,3pv2_5pv1_5pv2,EFO:0009899,CL:0000623,HsapDv:0000050
AAAGATGTCCAAACTG-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.025609,15486.0,3622,0.159533,0.660933,...,T,unassigned,TCR,TRA+TRB,T,AAAGATGTCCAAACTG,3pv2_5pv1_5pv2,EFO:0009899,CL:0000625,HsapDv:0000050
AAAGATGTCGCCATAA-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.022353,16489.0,3686,0.083969,0.660933,...,,,,,Myeloid,AAAGATGTCGCCATAA,3pv2_5pv1_5pv2,EFO:0009899,CL:0000990,HsapDv:0000050
AAAGTAGAGACGCACA-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.133360,4276.0,2415,0.040853,0.660933,...,,,,,B,AAAGTAGAGACGCACA,3pv2_5pv1_5pv2,EFO:0009899,CL:0000826,HsapDv:0000050
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTCACGGAGCCCTAAT-WSSS_F_LNG8713184,WSSS_F_LNG8713184,,,HDBR15233,15.0,0.039884,9581.0,3133,0.330579,0.826283,...,,,,,Myeloid,GTCACGGAGCCCTAAT,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052
ACGGGCTGTTCAGCGC-WSSS_F_LNG8713185,WSSS_F_LNG8713185,,,HDBR15233,15.0,0.034237,2706.0,1305,0.394231,0.122739,...,,,,,Myeloid,ACGGGCTGTTCAGCGC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052
CATCGAACATTAACCG-WSSS_F_LNG8713185,WSSS_F_LNG8713185,,,HDBR15233,15.0,0.040634,7625.0,2735,0.330579,0.122739,...,,,,,Myeloid,CATCGAACATTAACCG,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052
CTCAGAAGTACTCTCC-WSSS_F_LNG8713185,WSSS_F_LNG8713185,,,HDBR15233,15.0,0.027229,13147.0,3721,0.245161,0.122739,...,,,,,Myeloid,CTCAGAAGTACTCTCC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000094,HsapDv:0000052


#### **donor_id**

In [91]:
#identify the column in adata.obs which provides donor information

In [92]:
adata.obs.columns

Index(['batch', 'status', 'assignment', 'donor', 'stage', 'percent_mito',
       'n_counts', 'n_genes', 'doublet_scores', 'bh_pval', 'leiden', 'phase',
       'S_score', 'G2M_score', 'project', 'domain', 'gender', 'leiden_R',
       'celltype', 'status_summary', 'productive_summary', 'isotype_summary',
       'receptor_type', 'receptor_subtype', 'broad_type', 'barcodes', 'assay',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id'],
      dtype='object')

In [93]:
# add the donor_id column

In [94]:
adata.obs['donor_id'] = adata.obs['donor']

In [95]:
#adata.obs['donor_id'] = ['unknown'] * len(adata.obs['names'])

In [96]:
# change datatype of the column

In [97]:
adata.obs['donor_id'] = adata.obs['donor_id'].astype('category')

In [98]:
# view unique values of donor_id column

In [99]:
list(adata.obs['donor_id'].unique())

['HDBR14732',
 'HDBR14706',
 'HDBR14806',
 'HDBR14794',
 'HDBR15246',
 'HDBR14774',
 'HDBR14787',
 'HDBR15204',
 'HDBR14815',
 'HDBR15111',
 'HDBR14969',
 'HDBR15084',
 'HDBR14944',
 'HDBR15024',
 'HDBR14776',
 'HDBR15332',
 'HDBR15383',
 'HDBR15503',
 'HDBR15404',
 'HDBR14808',
 'HDBR14804',
 'HDBR14853',
 'HDBR14854',
 'BRC2192',
 'BRC2188',
 'HDBR15167',
 'HDBR15168',
 'HDBR15233',
 'HDBR15219',
 'HDBR15279',
 'HDBR15280',
 'HDBR15279+15280']

In [100]:
adata.obs['donor_id'] = adata.obs['donor_id'].replace('HDBR15279+15280', 'pooled')

  adata.obs['donor_id'] = adata.obs['donor_id'].replace('HDBR15279+15280', 'pooled')


In [101]:
#view obs

In [102]:
adata.obs

Unnamed: 0,batch,status,assignment,donor,stage,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,...,isotype_summary,receptor_type,receptor_subtype,broad_type,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id
AAACGGGAGATCCCAT-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.016132,11710.0,2583,0.054545,0.660933,...,,,,Myeloid,AAACGGGAGATCCCAT,3pv2_5pv1_5pv2,EFO:0009899,CL:0000990,HsapDv:0000050,HDBR14732
AAACGGGCAGTGACAG-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.034385,2387.0,1243,0.090426,0.660933,...,,,,NK,AAACGGGCAGTGACAG,3pv2_5pv1_5pv2,EFO:0009899,CL:0000623,HsapDv:0000050,HDBR14732
AAAGATGTCCAAACTG-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.025609,15486.0,3622,0.159533,0.660933,...,unassigned,TCR,TRA+TRB,T,AAAGATGTCCAAACTG,3pv2_5pv1_5pv2,EFO:0009899,CL:0000625,HsapDv:0000050,HDBR14732
AAAGATGTCGCCATAA-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.022353,16489.0,3686,0.083969,0.660933,...,,,,Myeloid,AAAGATGTCGCCATAA,3pv2_5pv1_5pv2,EFO:0009899,CL:0000990,HsapDv:0000050,HDBR14732
AAAGTAGAGACGCACA-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.133360,4276.0,2415,0.040853,0.660933,...,,,,B,AAAGTAGAGACGCACA,3pv2_5pv1_5pv2,EFO:0009899,CL:0000826,HsapDv:0000050,HDBR14732
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTCACGGAGCCCTAAT-WSSS_F_LNG8713184,WSSS_F_LNG8713184,,,HDBR15233,15.0,0.039884,9581.0,3133,0.330579,0.826283,...,,,,Myeloid,GTCACGGAGCCCTAAT,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233
ACGGGCTGTTCAGCGC-WSSS_F_LNG8713185,WSSS_F_LNG8713185,,,HDBR15233,15.0,0.034237,2706.0,1305,0.394231,0.122739,...,,,,Myeloid,ACGGGCTGTTCAGCGC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233
CATCGAACATTAACCG-WSSS_F_LNG8713185,WSSS_F_LNG8713185,,,HDBR15233,15.0,0.040634,7625.0,2735,0.330579,0.122739,...,,,,Myeloid,CATCGAACATTAACCG,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233
CTCAGAAGTACTCTCC-WSSS_F_LNG8713185,WSSS_F_LNG8713185,,,HDBR15233,15.0,0.027229,13147.0,3721,0.245161,0.122739,...,,,,Myeloid,CTCAGAAGTACTCTCC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000094,HsapDv:0000052,HDBR15233


In [103]:
adata.obs.columns

Index(['batch', 'status', 'assignment', 'donor', 'stage', 'percent_mito',
       'n_counts', 'n_genes', 'doublet_scores', 'bh_pval', 'leiden', 'phase',
       'S_score', 'G2M_score', 'project', 'domain', 'gender', 'leiden_R',
       'celltype', 'status_summary', 'productive_summary', 'isotype_summary',
       'receptor_type', 'receptor_subtype', 'broad_type', 'barcodes', 'assay',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'donor_id'],
      dtype='object')

#### **disease_ontology_term_id**

In [104]:
adata.obs['disease_ontology_term_id']= ['PATO:0000461'] * len(adata.obs)

In [105]:
adata.obs

Unnamed: 0,batch,status,assignment,donor,stage,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,...,receptor_type,receptor_subtype,broad_type,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id
AAACGGGAGATCCCAT-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.016132,11710.0,2583,0.054545,0.660933,...,,,Myeloid,AAACGGGAGATCCCAT,3pv2_5pv1_5pv2,EFO:0009899,CL:0000990,HsapDv:0000050,HDBR14732,PATO:0000461
AAACGGGCAGTGACAG-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.034385,2387.0,1243,0.090426,0.660933,...,,,NK,AAACGGGCAGTGACAG,3pv2_5pv1_5pv2,EFO:0009899,CL:0000623,HsapDv:0000050,HDBR14732,PATO:0000461
AAAGATGTCCAAACTG-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.025609,15486.0,3622,0.159533,0.660933,...,TCR,TRA+TRB,T,AAAGATGTCCAAACTG,3pv2_5pv1_5pv2,EFO:0009899,CL:0000625,HsapDv:0000050,HDBR14732,PATO:0000461
AAAGATGTCGCCATAA-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.022353,16489.0,3686,0.083969,0.660933,...,,,Myeloid,AAAGATGTCGCCATAA,3pv2_5pv1_5pv2,EFO:0009899,CL:0000990,HsapDv:0000050,HDBR14732,PATO:0000461
AAAGTAGAGACGCACA-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.133360,4276.0,2415,0.040853,0.660933,...,,,B,AAAGTAGAGACGCACA,3pv2_5pv1_5pv2,EFO:0009899,CL:0000826,HsapDv:0000050,HDBR14732,PATO:0000461
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTCACGGAGCCCTAAT-WSSS_F_LNG8713184,WSSS_F_LNG8713184,,,HDBR15233,15.0,0.039884,9581.0,3133,0.330579,0.826283,...,,,Myeloid,GTCACGGAGCCCTAAT,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461
ACGGGCTGTTCAGCGC-WSSS_F_LNG8713185,WSSS_F_LNG8713185,,,HDBR15233,15.0,0.034237,2706.0,1305,0.394231,0.122739,...,,,Myeloid,ACGGGCTGTTCAGCGC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461
CATCGAACATTAACCG-WSSS_F_LNG8713185,WSSS_F_LNG8713185,,,HDBR15233,15.0,0.040634,7625.0,2735,0.330579,0.122739,...,,,Myeloid,CATCGAACATTAACCG,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461
CTCAGAAGTACTCTCC-WSSS_F_LNG8713185,WSSS_F_LNG8713185,,,HDBR15233,15.0,0.027229,13147.0,3721,0.245161,0.122739,...,,,Myeloid,CTCAGAAGTACTCTCC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000094,HsapDv:0000052,HDBR15233,PATO:0000461


In [106]:
# change datatype of the column

In [107]:
adata.obs['disease_ontology_term_id'] = adata.obs['disease_ontology_term_id'].astype('category')

In [108]:
# view obs

In [109]:
adata.obs

Unnamed: 0,batch,status,assignment,donor,stage,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,...,receptor_type,receptor_subtype,broad_type,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id
AAACGGGAGATCCCAT-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.016132,11710.0,2583,0.054545,0.660933,...,,,Myeloid,AAACGGGAGATCCCAT,3pv2_5pv1_5pv2,EFO:0009899,CL:0000990,HsapDv:0000050,HDBR14732,PATO:0000461
AAACGGGCAGTGACAG-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.034385,2387.0,1243,0.090426,0.660933,...,,,NK,AAACGGGCAGTGACAG,3pv2_5pv1_5pv2,EFO:0009899,CL:0000623,HsapDv:0000050,HDBR14732,PATO:0000461
AAAGATGTCCAAACTG-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.025609,15486.0,3622,0.159533,0.660933,...,TCR,TRA+TRB,T,AAAGATGTCCAAACTG,3pv2_5pv1_5pv2,EFO:0009899,CL:0000625,HsapDv:0000050,HDBR14732,PATO:0000461
AAAGATGTCGCCATAA-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.022353,16489.0,3686,0.083969,0.660933,...,,,Myeloid,AAAGATGTCGCCATAA,3pv2_5pv1_5pv2,EFO:0009899,CL:0000990,HsapDv:0000050,HDBR14732,PATO:0000461
AAAGTAGAGACGCACA-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.133360,4276.0,2415,0.040853,0.660933,...,,,B,AAAGTAGAGACGCACA,3pv2_5pv1_5pv2,EFO:0009899,CL:0000826,HsapDv:0000050,HDBR14732,PATO:0000461
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTCACGGAGCCCTAAT-WSSS_F_LNG8713184,WSSS_F_LNG8713184,,,HDBR15233,15.0,0.039884,9581.0,3133,0.330579,0.826283,...,,,Myeloid,GTCACGGAGCCCTAAT,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461
ACGGGCTGTTCAGCGC-WSSS_F_LNG8713185,WSSS_F_LNG8713185,,,HDBR15233,15.0,0.034237,2706.0,1305,0.394231,0.122739,...,,,Myeloid,ACGGGCTGTTCAGCGC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461
CATCGAACATTAACCG-WSSS_F_LNG8713185,WSSS_F_LNG8713185,,,HDBR15233,15.0,0.040634,7625.0,2735,0.330579,0.122739,...,,,Myeloid,CATCGAACATTAACCG,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461
CTCAGAAGTACTCTCC-WSSS_F_LNG8713185,WSSS_F_LNG8713185,,,HDBR15233,15.0,0.027229,13147.0,3721,0.245161,0.122739,...,,,Myeloid,CTCAGAAGTACTCTCC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000094,HsapDv:0000052,HDBR15233,PATO:0000461


#### **is_primary_data**

In [110]:
adata.obs['is_primary_data'] = [False] * len(adata.obs)

In [111]:
adata.obs

Unnamed: 0,batch,status,assignment,donor,stage,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,...,receptor_subtype,broad_type,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data
AAACGGGAGATCCCAT-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.016132,11710.0,2583,0.054545,0.660933,...,,Myeloid,AAACGGGAGATCCCAT,3pv2_5pv1_5pv2,EFO:0009899,CL:0000990,HsapDv:0000050,HDBR14732,PATO:0000461,False
AAACGGGCAGTGACAG-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.034385,2387.0,1243,0.090426,0.660933,...,,NK,AAACGGGCAGTGACAG,3pv2_5pv1_5pv2,EFO:0009899,CL:0000623,HsapDv:0000050,HDBR14732,PATO:0000461,False
AAAGATGTCCAAACTG-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.025609,15486.0,3622,0.159533,0.660933,...,TRA+TRB,T,AAAGATGTCCAAACTG,3pv2_5pv1_5pv2,EFO:0009899,CL:0000625,HsapDv:0000050,HDBR14732,PATO:0000461,False
AAAGATGTCGCCATAA-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.022353,16489.0,3686,0.083969,0.660933,...,,Myeloid,AAAGATGTCGCCATAA,3pv2_5pv1_5pv2,EFO:0009899,CL:0000990,HsapDv:0000050,HDBR14732,PATO:0000461,False
AAAGTAGAGACGCACA-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.133360,4276.0,2415,0.040853,0.660933,...,,B,AAAGTAGAGACGCACA,3pv2_5pv1_5pv2,EFO:0009899,CL:0000826,HsapDv:0000050,HDBR14732,PATO:0000461,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTCACGGAGCCCTAAT-WSSS_F_LNG8713184,WSSS_F_LNG8713184,,,HDBR15233,15.0,0.039884,9581.0,3133,0.330579,0.826283,...,,Myeloid,GTCACGGAGCCCTAAT,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461,False
ACGGGCTGTTCAGCGC-WSSS_F_LNG8713185,WSSS_F_LNG8713185,,,HDBR15233,15.0,0.034237,2706.0,1305,0.394231,0.122739,...,,Myeloid,ACGGGCTGTTCAGCGC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461,False
CATCGAACATTAACCG-WSSS_F_LNG8713185,WSSS_F_LNG8713185,,,HDBR15233,15.0,0.040634,7625.0,2735,0.330579,0.122739,...,,Myeloid,CATCGAACATTAACCG,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461,False
CTCAGAAGTACTCTCC-WSSS_F_LNG8713185,WSSS_F_LNG8713185,,,HDBR15233,15.0,0.027229,13147.0,3721,0.245161,0.122739,...,,Myeloid,CTCAGAAGTACTCTCC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000094,HsapDv:0000052,HDBR15233,PATO:0000461,False


In [112]:
#change data type of column

In [113]:
adata.obs['is_primary_data'] = adata.obs['is_primary_data'].astype('bool')

In [114]:
# view obs

In [115]:
adata.obs

Unnamed: 0,batch,status,assignment,donor,stage,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,...,receptor_subtype,broad_type,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data
AAACGGGAGATCCCAT-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.016132,11710.0,2583,0.054545,0.660933,...,,Myeloid,AAACGGGAGATCCCAT,3pv2_5pv1_5pv2,EFO:0009899,CL:0000990,HsapDv:0000050,HDBR14732,PATO:0000461,False
AAACGGGCAGTGACAG-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.034385,2387.0,1243,0.090426,0.660933,...,,NK,AAACGGGCAGTGACAG,3pv2_5pv1_5pv2,EFO:0009899,CL:0000623,HsapDv:0000050,HDBR14732,PATO:0000461,False
AAAGATGTCCAAACTG-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.025609,15486.0,3622,0.159533,0.660933,...,TRA+TRB,T,AAAGATGTCCAAACTG,3pv2_5pv1_5pv2,EFO:0009899,CL:0000625,HsapDv:0000050,HDBR14732,PATO:0000461,False
AAAGATGTCGCCATAA-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.022353,16489.0,3686,0.083969,0.660933,...,,Myeloid,AAAGATGTCGCCATAA,3pv2_5pv1_5pv2,EFO:0009899,CL:0000990,HsapDv:0000050,HDBR14732,PATO:0000461,False
AAAGTAGAGACGCACA-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.133360,4276.0,2415,0.040853,0.660933,...,,B,AAAGTAGAGACGCACA,3pv2_5pv1_5pv2,EFO:0009899,CL:0000826,HsapDv:0000050,HDBR14732,PATO:0000461,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTCACGGAGCCCTAAT-WSSS_F_LNG8713184,WSSS_F_LNG8713184,,,HDBR15233,15.0,0.039884,9581.0,3133,0.330579,0.826283,...,,Myeloid,GTCACGGAGCCCTAAT,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461,False
ACGGGCTGTTCAGCGC-WSSS_F_LNG8713185,WSSS_F_LNG8713185,,,HDBR15233,15.0,0.034237,2706.0,1305,0.394231,0.122739,...,,Myeloid,ACGGGCTGTTCAGCGC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461,False
CATCGAACATTAACCG-WSSS_F_LNG8713185,WSSS_F_LNG8713185,,,HDBR15233,15.0,0.040634,7625.0,2735,0.330579,0.122739,...,,Myeloid,CATCGAACATTAACCG,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461,False
CTCAGAAGTACTCTCC-WSSS_F_LNG8713185,WSSS_F_LNG8713185,,,HDBR15233,15.0,0.027229,13147.0,3721,0.245161,0.122739,...,,Myeloid,CTCAGAAGTACTCTCC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000094,HsapDv:0000052,HDBR15233,PATO:0000461,False


#### **organism_ontology_term_id**

In [116]:
# assign organism id 

In [117]:
adata.obs['organism_ontology_term_id'] = ['NCBITaxon:9606'] * len(adata.obs)

In [118]:
#change data type of column

In [119]:
adata.obs['organism_ontology_term_id'] = adata.obs['organism_ontology_term_id'].astype('category')

In [120]:
# view obs

In [121]:
adata.obs

Unnamed: 0,batch,status,assignment,donor,stage,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,...,broad_type,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id
AAACGGGAGATCCCAT-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.016132,11710.0,2583,0.054545,0.660933,...,Myeloid,AAACGGGAGATCCCAT,3pv2_5pv1_5pv2,EFO:0009899,CL:0000990,HsapDv:0000050,HDBR14732,PATO:0000461,False,NCBITaxon:9606
AAACGGGCAGTGACAG-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.034385,2387.0,1243,0.090426,0.660933,...,NK,AAACGGGCAGTGACAG,3pv2_5pv1_5pv2,EFO:0009899,CL:0000623,HsapDv:0000050,HDBR14732,PATO:0000461,False,NCBITaxon:9606
AAAGATGTCCAAACTG-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.025609,15486.0,3622,0.159533,0.660933,...,T,AAAGATGTCCAAACTG,3pv2_5pv1_5pv2,EFO:0009899,CL:0000625,HsapDv:0000050,HDBR14732,PATO:0000461,False,NCBITaxon:9606
AAAGATGTCGCCATAA-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.022353,16489.0,3686,0.083969,0.660933,...,Myeloid,AAAGATGTCGCCATAA,3pv2_5pv1_5pv2,EFO:0009899,CL:0000990,HsapDv:0000050,HDBR14732,PATO:0000461,False,NCBITaxon:9606
AAAGTAGAGACGCACA-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.133360,4276.0,2415,0.040853,0.660933,...,B,AAAGTAGAGACGCACA,3pv2_5pv1_5pv2,EFO:0009899,CL:0000826,HsapDv:0000050,HDBR14732,PATO:0000461,False,NCBITaxon:9606
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTCACGGAGCCCTAAT-WSSS_F_LNG8713184,WSSS_F_LNG8713184,,,HDBR15233,15.0,0.039884,9581.0,3133,0.330579,0.826283,...,Myeloid,GTCACGGAGCCCTAAT,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606
ACGGGCTGTTCAGCGC-WSSS_F_LNG8713185,WSSS_F_LNG8713185,,,HDBR15233,15.0,0.034237,2706.0,1305,0.394231,0.122739,...,Myeloid,ACGGGCTGTTCAGCGC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606
CATCGAACATTAACCG-WSSS_F_LNG8713185,WSSS_F_LNG8713185,,,HDBR15233,15.0,0.040634,7625.0,2735,0.330579,0.122739,...,Myeloid,CATCGAACATTAACCG,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606
CTCAGAAGTACTCTCC-WSSS_F_LNG8713185,WSSS_F_LNG8713185,,,HDBR15233,15.0,0.027229,13147.0,3721,0.245161,0.122739,...,Myeloid,CTCAGAAGTACTCTCC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000094,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606


#### **self_reported_ethnicity_ontology_term_id**

In [122]:
adata.obs['self_reported_ethnicity_ontology_term_id'] = ['unknown'] * len(adata.obs)

In [123]:
# change data type

In [124]:
adata.obs['self_reported_ethnicity_ontology_term_id'] = adata.obs['self_reported_ethnicity_ontology_term_id'].astype('category')

In [125]:
# view obs

In [126]:
adata.obs

Unnamed: 0,batch,status,assignment,donor,stage,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,...,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id
AAACGGGAGATCCCAT-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.016132,11710.0,2583,0.054545,0.660933,...,AAACGGGAGATCCCAT,3pv2_5pv1_5pv2,EFO:0009899,CL:0000990,HsapDv:0000050,HDBR14732,PATO:0000461,False,NCBITaxon:9606,unknown
AAACGGGCAGTGACAG-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.034385,2387.0,1243,0.090426,0.660933,...,AAACGGGCAGTGACAG,3pv2_5pv1_5pv2,EFO:0009899,CL:0000623,HsapDv:0000050,HDBR14732,PATO:0000461,False,NCBITaxon:9606,unknown
AAAGATGTCCAAACTG-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.025609,15486.0,3622,0.159533,0.660933,...,AAAGATGTCCAAACTG,3pv2_5pv1_5pv2,EFO:0009899,CL:0000625,HsapDv:0000050,HDBR14732,PATO:0000461,False,NCBITaxon:9606,unknown
AAAGATGTCGCCATAA-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.022353,16489.0,3686,0.083969,0.660933,...,AAAGATGTCGCCATAA,3pv2_5pv1_5pv2,EFO:0009899,CL:0000990,HsapDv:0000050,HDBR14732,PATO:0000461,False,NCBITaxon:9606,unknown
AAAGTAGAGACGCACA-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.133360,4276.0,2415,0.040853,0.660933,...,AAAGTAGAGACGCACA,3pv2_5pv1_5pv2,EFO:0009899,CL:0000826,HsapDv:0000050,HDBR14732,PATO:0000461,False,NCBITaxon:9606,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTCACGGAGCCCTAAT-WSSS_F_LNG8713184,WSSS_F_LNG8713184,,,HDBR15233,15.0,0.039884,9581.0,3133,0.330579,0.826283,...,GTCACGGAGCCCTAAT,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown
ACGGGCTGTTCAGCGC-WSSS_F_LNG8713185,WSSS_F_LNG8713185,,,HDBR15233,15.0,0.034237,2706.0,1305,0.394231,0.122739,...,ACGGGCTGTTCAGCGC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown
CATCGAACATTAACCG-WSSS_F_LNG8713185,WSSS_F_LNG8713185,,,HDBR15233,15.0,0.040634,7625.0,2735,0.330579,0.122739,...,CATCGAACATTAACCG,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown
CTCAGAAGTACTCTCC-WSSS_F_LNG8713185,WSSS_F_LNG8713185,,,HDBR15233,15.0,0.027229,13147.0,3721,0.245161,0.122739,...,CTCAGAAGTACTCTCC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000094,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown


In [127]:
adata.obs.columns

Index(['batch', 'status', 'assignment', 'donor', 'stage', 'percent_mito',
       'n_counts', 'n_genes', 'doublet_scores', 'bh_pval', 'leiden', 'phase',
       'S_score', 'G2M_score', 'project', 'domain', 'gender', 'leiden_R',
       'celltype', 'status_summary', 'productive_summary', 'isotype_summary',
       'receptor_type', 'receptor_subtype', 'broad_type', 'barcodes', 'assay',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'donor_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id'],
      dtype='object')

#### **sex_ontology_term_id**

In [128]:
# identify the column in adata.obs which corresponds to sex

In [129]:
adata.obs.columns

Index(['batch', 'status', 'assignment', 'donor', 'stage', 'percent_mito',
       'n_counts', 'n_genes', 'doublet_scores', 'bh_pval', 'leiden', 'phase',
       'S_score', 'G2M_score', 'project', 'domain', 'gender', 'leiden_R',
       'celltype', 'status_summary', 'productive_summary', 'isotype_summary',
       'receptor_type', 'receptor_subtype', 'broad_type', 'barcodes', 'assay',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'donor_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id'],
      dtype='object')

In [130]:
adata.obs

Unnamed: 0,batch,status,assignment,donor,stage,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,...,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id
AAACGGGAGATCCCAT-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.016132,11710.0,2583,0.054545,0.660933,...,AAACGGGAGATCCCAT,3pv2_5pv1_5pv2,EFO:0009899,CL:0000990,HsapDv:0000050,HDBR14732,PATO:0000461,False,NCBITaxon:9606,unknown
AAACGGGCAGTGACAG-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.034385,2387.0,1243,0.090426,0.660933,...,AAACGGGCAGTGACAG,3pv2_5pv1_5pv2,EFO:0009899,CL:0000623,HsapDv:0000050,HDBR14732,PATO:0000461,False,NCBITaxon:9606,unknown
AAAGATGTCCAAACTG-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.025609,15486.0,3622,0.159533,0.660933,...,AAAGATGTCCAAACTG,3pv2_5pv1_5pv2,EFO:0009899,CL:0000625,HsapDv:0000050,HDBR14732,PATO:0000461,False,NCBITaxon:9606,unknown
AAAGATGTCGCCATAA-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.022353,16489.0,3686,0.083969,0.660933,...,AAAGATGTCGCCATAA,3pv2_5pv1_5pv2,EFO:0009899,CL:0000990,HsapDv:0000050,HDBR14732,PATO:0000461,False,NCBITaxon:9606,unknown
AAAGTAGAGACGCACA-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.133360,4276.0,2415,0.040853,0.660933,...,AAAGTAGAGACGCACA,3pv2_5pv1_5pv2,EFO:0009899,CL:0000826,HsapDv:0000050,HDBR14732,PATO:0000461,False,NCBITaxon:9606,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTCACGGAGCCCTAAT-WSSS_F_LNG8713184,WSSS_F_LNG8713184,,,HDBR15233,15.0,0.039884,9581.0,3133,0.330579,0.826283,...,GTCACGGAGCCCTAAT,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown
ACGGGCTGTTCAGCGC-WSSS_F_LNG8713185,WSSS_F_LNG8713185,,,HDBR15233,15.0,0.034237,2706.0,1305,0.394231,0.122739,...,ACGGGCTGTTCAGCGC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown
CATCGAACATTAACCG-WSSS_F_LNG8713185,WSSS_F_LNG8713185,,,HDBR15233,15.0,0.040634,7625.0,2735,0.330579,0.122739,...,CATCGAACATTAACCG,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown
CTCAGAAGTACTCTCC-WSSS_F_LNG8713185,WSSS_F_LNG8713185,,,HDBR15233,15.0,0.027229,13147.0,3721,0.245161,0.122739,...,CTCAGAAGTACTCTCC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000094,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown


In [131]:
# list the unique values 

In [132]:
list(adata.obs['gender'].unique())

['M', 'F', 'Unknown']

In [133]:
mapping={
'BRC2188':'PATO:0000383',
'BRC2192':'PATO:0000383',
'F19':'PATO:0000383',
'F22':'PATO:0000383',
'F29':'PATO:0000383',
'F32':'PATO:0000383',
'F33':'PATO:0000383',
'F34':'PATO:0000383',
'F35':'PATO:0000383',
'F37':'PATO:0000383',
'F41':'PATO:0000383',
'F45':'PATO:0000383',
'F50':'PATO:0000383',
'F51':'PATO:0000383',
'F67':'PATO:0000383',
'F69':'PATO:0000383',
'F71':'PATO:0000383',
'F72':'PATO:0000383',
'F73':'PATO:0000383',
'HDBR14853':'PATO:0000383',
'HDBR14854':'PATO:0000383',
'HDBR15279':'PATO:0000383',
'F21':'PATO:0000384',
'F23':'PATO:0000384',
'F30':'PATO:0000384',
'F38':'PATO:0000384',
'F61':'PATO:0000384',
'F64':'PATO:0000384',
'F66':'PATO:0000384',
'F78':'PATO:0000384',
'HDBR14804':'PATO:0000384',
'HDBR14808':'PATO:0000384',
'HDBR15167':'PATO:0000384',
'HDBR15168':'PATO:0000384',
'HDBR15219':'PATO:0000384',
'HDBR15233':'PATO:0000384',
'HDBR15280':'PATO:0000384',
'HDBR14706':'PATO:0000383',
'HDBR14732':'PATO:0000384',
'HDBR14774':'PATO:0000383',
'HDBR14776':'PATO:0000383',
'HDBR14787':'PATO:0000384',
'HDBR14794':'PATO:0000383',
'HDBR14806':'PATO:0000383',
'HDBR14815':'PATO:0000384',
'HDBR14944':'PATO:0000383',
'HDBR14969':'PATO:0000383',
'HDBR15024':'PATO:0000383',
'HDBR15084':'PATO:0000384',
'HDBR15111':'PATO:0000384',
'HDBR15204':'PATO:0000383',
'HDBR15246':'PATO:0000384',
'HDBR15279+15280':'unknown',
'HDBR15332':'PATO:0000384',
'HDBR15383':'PATO:0000384',
'HDBR15404':'PATO:0000383',
'HDBR15503':'PATO:0000384'}


In [134]:
adata.obs['sex_ontology_term_id'] = adata.obs['donor'].map(mapping)

In [135]:
# change data type

In [136]:
adata.obs['sex_ontology_term_id'] = adata.obs['sex_ontology_term_id'].astype('category')

In [137]:
adata.obs

Unnamed: 0,batch,status,assignment,donor,stage,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,...,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id
AAACGGGAGATCCCAT-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.016132,11710.0,2583,0.054545,0.660933,...,3pv2_5pv1_5pv2,EFO:0009899,CL:0000990,HsapDv:0000050,HDBR14732,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384
AAACGGGCAGTGACAG-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.034385,2387.0,1243,0.090426,0.660933,...,3pv2_5pv1_5pv2,EFO:0009899,CL:0000623,HsapDv:0000050,HDBR14732,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384
AAAGATGTCCAAACTG-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.025609,15486.0,3622,0.159533,0.660933,...,3pv2_5pv1_5pv2,EFO:0009899,CL:0000625,HsapDv:0000050,HDBR14732,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384
AAAGATGTCGCCATAA-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.022353,16489.0,3686,0.083969,0.660933,...,3pv2_5pv1_5pv2,EFO:0009899,CL:0000990,HsapDv:0000050,HDBR14732,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384
AAAGTAGAGACGCACA-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.133360,4276.0,2415,0.040853,0.660933,...,3pv2_5pv1_5pv2,EFO:0009899,CL:0000826,HsapDv:0000050,HDBR14732,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTCACGGAGCCCTAAT-WSSS_F_LNG8713184,WSSS_F_LNG8713184,,,HDBR15233,15.0,0.039884,9581.0,3133,0.330579,0.826283,...,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384
ACGGGCTGTTCAGCGC-WSSS_F_LNG8713185,WSSS_F_LNG8713185,,,HDBR15233,15.0,0.034237,2706.0,1305,0.394231,0.122739,...,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384
CATCGAACATTAACCG-WSSS_F_LNG8713185,WSSS_F_LNG8713185,,,HDBR15233,15.0,0.040634,7625.0,2735,0.330579,0.122739,...,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384
CTCAGAAGTACTCTCC-WSSS_F_LNG8713185,WSSS_F_LNG8713185,,,HDBR15233,15.0,0.027229,13147.0,3721,0.245161,0.122739,...,3pv2_5pv1_5pv2,EFO:0011025,CL:0000094,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384


#### **suspension_type**

In [138]:
adata.obs

Unnamed: 0,batch,status,assignment,donor,stage,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,...,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id
AAACGGGAGATCCCAT-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.016132,11710.0,2583,0.054545,0.660933,...,3pv2_5pv1_5pv2,EFO:0009899,CL:0000990,HsapDv:0000050,HDBR14732,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384
AAACGGGCAGTGACAG-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.034385,2387.0,1243,0.090426,0.660933,...,3pv2_5pv1_5pv2,EFO:0009899,CL:0000623,HsapDv:0000050,HDBR14732,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384
AAAGATGTCCAAACTG-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.025609,15486.0,3622,0.159533,0.660933,...,3pv2_5pv1_5pv2,EFO:0009899,CL:0000625,HsapDv:0000050,HDBR14732,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384
AAAGATGTCGCCATAA-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.022353,16489.0,3686,0.083969,0.660933,...,3pv2_5pv1_5pv2,EFO:0009899,CL:0000990,HsapDv:0000050,HDBR14732,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384
AAAGTAGAGACGCACA-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.133360,4276.0,2415,0.040853,0.660933,...,3pv2_5pv1_5pv2,EFO:0009899,CL:0000826,HsapDv:0000050,HDBR14732,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTCACGGAGCCCTAAT-WSSS_F_LNG8713184,WSSS_F_LNG8713184,,,HDBR15233,15.0,0.039884,9581.0,3133,0.330579,0.826283,...,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384
ACGGGCTGTTCAGCGC-WSSS_F_LNG8713185,WSSS_F_LNG8713185,,,HDBR15233,15.0,0.034237,2706.0,1305,0.394231,0.122739,...,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384
CATCGAACATTAACCG-WSSS_F_LNG8713185,WSSS_F_LNG8713185,,,HDBR15233,15.0,0.040634,7625.0,2735,0.330579,0.122739,...,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384
CTCAGAAGTACTCTCC-WSSS_F_LNG8713185,WSSS_F_LNG8713185,,,HDBR15233,15.0,0.027229,13147.0,3721,0.245161,0.122739,...,3pv2_5pv1_5pv2,EFO:0011025,CL:0000094,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384


In [139]:
adata.obs['suspension_type'] = ['cell'] * len(adata.obs)

In [140]:
# change data type of column

In [141]:
adata.obs['suspension_type'] = adata.obs['suspension_type'].astype('category')

In [142]:
# view obs

In [143]:
adata.obs

Unnamed: 0,batch,status,assignment,donor,stage,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,...,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type
AAACGGGAGATCCCAT-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.016132,11710.0,2583,0.054545,0.660933,...,EFO:0009899,CL:0000990,HsapDv:0000050,HDBR14732,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell
AAACGGGCAGTGACAG-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.034385,2387.0,1243,0.090426,0.660933,...,EFO:0009899,CL:0000623,HsapDv:0000050,HDBR14732,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell
AAAGATGTCCAAACTG-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.025609,15486.0,3622,0.159533,0.660933,...,EFO:0009899,CL:0000625,HsapDv:0000050,HDBR14732,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell
AAAGATGTCGCCATAA-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.022353,16489.0,3686,0.083969,0.660933,...,EFO:0009899,CL:0000990,HsapDv:0000050,HDBR14732,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell
AAAGTAGAGACGCACA-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.133360,4276.0,2415,0.040853,0.660933,...,EFO:0009899,CL:0000826,HsapDv:0000050,HDBR14732,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTCACGGAGCCCTAAT-WSSS_F_LNG8713184,WSSS_F_LNG8713184,,,HDBR15233,15.0,0.039884,9581.0,3133,0.330579,0.826283,...,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell
ACGGGCTGTTCAGCGC-WSSS_F_LNG8713185,WSSS_F_LNG8713185,,,HDBR15233,15.0,0.034237,2706.0,1305,0.394231,0.122739,...,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell
CATCGAACATTAACCG-WSSS_F_LNG8713185,WSSS_F_LNG8713185,,,HDBR15233,15.0,0.040634,7625.0,2735,0.330579,0.122739,...,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell
CTCAGAAGTACTCTCC-WSSS_F_LNG8713185,WSSS_F_LNG8713185,,,HDBR15233,15.0,0.027229,13147.0,3721,0.245161,0.122739,...,EFO:0011025,CL:0000094,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell


#### **tissue_type**

In [144]:
adata.obs['tissue_type'] = ['tissue'] * len(adata.obs)

In [145]:
adata.obs['tissue_type'] = adata.obs['tissue_type'].astype('category')

#### **tissue_ontology_term_id**

In [146]:
# identify the column in adata.obs which corresponds to tissue

In [147]:
adata.obs.columns

Index(['batch', 'status', 'assignment', 'donor', 'stage', 'percent_mito',
       'n_counts', 'n_genes', 'doublet_scores', 'bh_pval', 'leiden', 'phase',
       'S_score', 'G2M_score', 'project', 'domain', 'gender', 'leiden_R',
       'celltype', 'status_summary', 'productive_summary', 'isotype_summary',
       'receptor_type', 'receptor_subtype', 'broad_type', 'barcodes', 'assay',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'donor_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'suspension_type', 'tissue_type'],
      dtype='object')

In [148]:
list(adata.obs['receptor_type'].unique())

['nan', 'TCR', 'BCR']

In [149]:
# add 'tissue_ontology_term_id' column

In [150]:
adata.obs['tissue_ontology_term_id'] = ['UBERON:0002048'] * len(adata.obs)

In [151]:
# change data type of column

In [152]:
adata.obs['tissue_ontology_term_id'] = adata.obs['tissue_ontology_term_id'].astype('category')

In [153]:
#list the unique values in 'tissue_ontology_term_id' column

In [154]:
list(adata.obs['tissue_ontology_term_id'].unique())

['UBERON:0002048']

In [155]:
# view obs

In [156]:
adata.obs

Unnamed: 0,batch,status,assignment,donor,stage,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,...,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_type,tissue_ontology_term_id
AAACGGGAGATCCCAT-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.016132,11710.0,2583,0.054545,0.660933,...,HsapDv:0000050,HDBR14732,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0002048
AAACGGGCAGTGACAG-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.034385,2387.0,1243,0.090426,0.660933,...,HsapDv:0000050,HDBR14732,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0002048
AAAGATGTCCAAACTG-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.025609,15486.0,3622,0.159533,0.660933,...,HsapDv:0000050,HDBR14732,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0002048
AAAGATGTCGCCATAA-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.022353,16489.0,3686,0.083969,0.660933,...,HsapDv:0000050,HDBR14732,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0002048
AAAGTAGAGACGCACA-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.133360,4276.0,2415,0.040853,0.660933,...,HsapDv:0000050,HDBR14732,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0002048
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTCACGGAGCCCTAAT-WSSS_F_LNG8713184,WSSS_F_LNG8713184,,,HDBR15233,15.0,0.039884,9581.0,3133,0.330579,0.826283,...,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0002048
ACGGGCTGTTCAGCGC-WSSS_F_LNG8713185,WSSS_F_LNG8713185,,,HDBR15233,15.0,0.034237,2706.0,1305,0.394231,0.122739,...,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0002048
CATCGAACATTAACCG-WSSS_F_LNG8713185,WSSS_F_LNG8713185,,,HDBR15233,15.0,0.040634,7625.0,2735,0.330579,0.122739,...,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0002048
CTCAGAAGTACTCTCC-WSSS_F_LNG8713185,WSSS_F_LNG8713185,,,HDBR15233,15.0,0.027229,13147.0,3721,0.245161,0.122739,...,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0002048


In [157]:
adata.obs.columns

Index(['batch', 'status', 'assignment', 'donor', 'stage', 'percent_mito',
       'n_counts', 'n_genes', 'doublet_scores', 'bh_pval', 'leiden', 'phase',
       'S_score', 'G2M_score', 'project', 'domain', 'gender', 'leiden_R',
       'celltype', 'status_summary', 'productive_summary', 'isotype_summary',
       'receptor_type', 'receptor_subtype', 'broad_type', 'barcodes', 'assay',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'donor_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'suspension_type', 'tissue_type',
       'tissue_ontology_term_id'],
      dtype='object')

#### **obsm (Embeddings)**

In [158]:
# view obsm

In [159]:
# check whether all columns are prefixed with X

In [160]:
adata.obsm

AxisArrays with keys: X_bbknn_umap, X_pca, X_umap, X_umap_original

#### **uns (Dataset Metadata)**

In [161]:
# View

In [162]:
adata.uns

{'batch_colors': array(['#FFFF00', '#1CE6FF', '#FF34FF', '#FF4A46', '#008941', '#006FA6',
        '#A30059', '#FFDBE5', '#7A4900', '#0000A6', '#63FFAC', '#B79762',
        '#004D43', '#8FB0FF', '#997D87', '#5A0007', '#809693', '#6A3A4C',
        '#1B4400', '#4FC601', '#3B5DFF', '#4A3B53', '#FF2F80', '#61615A',
        '#BA0900', '#6B7900', '#00C2A0', '#FFAA92', '#FF90C9', '#B903AA',
        '#D16100', '#DDEFFF', '#000035', '#7B4F4B', '#A1C299', '#300018',
        '#0AA6D8', '#013349', '#00846F', '#372101', '#FFB500', '#C2FFED',
        '#A079BF', '#CC0744', '#C0B9B2'], dtype=object),
 'celltype_colors': array(['#ffff00', '#1ce6ff', '#ff34ff', '#ff4a46', '#008941', '#006fa6',
        '#a30059', '#ffdbe5', '#7a4900', '#0000a6', '#63ffac', '#b79762',
        '#004d43', '#8fb0ff', '#997d87', '#5a0007', '#809693', '#6a3a4c',
        '#1b4400', '#4fc601', '#3b5dff', '#4a3b53', '#ff2f80', '#61615a',
        '#ba0900', '#6b7900', '#00c2a0', '#ffaa92', '#ff90c9', '#b903aa',
        '#d16100', '

In [163]:
adata.uns.keys

<function dict.keys>

In [164]:
# Give a title for the dataset

In [165]:
adata.uns['title'] = 'Leukocytes of the fetal lung'

In [166]:
# Set the default embedding

In [167]:
adata.uns['default_embedding'] = 'X_umap'

### **Final check**

In [168]:
# view anndata object

In [169]:
adata

AnnData object with n_obs × n_vars = 77559 × 33137
    obs: 'batch', 'status', 'assignment', 'donor', 'stage', 'percent_mito', 'n_counts', 'n_genes', 'doublet_scores', 'bh_pval', 'leiden', 'phase', 'S_score', 'G2M_score', 'project', 'domain', 'gender', 'leiden_R', 'celltype', 'status_summary', 'productive_summary', 'isotype_summary', 'receptor_type', 'receptor_subtype', 'broad_type', 'barcodes', 'assay', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'donor_id', 'disease_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id', 'suspension_type', 'tissue_type', 'tissue_ontology_term_id'
    var: 'feature_is_filtered', 'n_cells-Gurdon'
    uns: 'batch_colors', 'celltype_colors', 'dendrogram_celltype', 'domain_colors', 'leiden', 'neighbors', 'new_celltype_colors', 'pca', 'predicted_Gurdon_colors', 'predicted_hi_colors', 'project_colors', 'rank_genes_groups', 'rank_g

In [170]:
# view obs and var data types

In [171]:
adata.obs.dtypes

batch                                       category
status                                      category
assignment                                  category
donor                                       category
stage                                         object
percent_mito                                 float32
n_counts                                     float32
n_genes                                        int64
doublet_scores                               float64
bh_pval                                      float64
leiden                                      category
phase                                       category
S_score                                      float32
G2M_score                                    float32
project                                     category
domain                                      category
gender                                      category
leiden_R                                    category
celltype                                    ca

In [172]:
dty = pd.DataFrame(adata.var.dtypes, columns = ['dtype'])
for c in dty[dty['dtype'] == 'float64'].index.values:
    adata.var[c] = adata.var[c].astype('float32')
    print(f"changed {c} from float64 to float32")
for c in dty[dty['dtype'] == 'int64'].index.values:
    adata.var[c] = adata.var[c].astype('int32') 
    print(f"changed {c} from int64 to int32")

In [173]:
dty = pd.DataFrame(adata.obs.dtypes, columns = ['dtype'])
for c in dty[dty['dtype'] == 'float64'].index.values:
    adata.obs[c] = adata.obs[c].astype('float32')
    print(f"changed {c} from float64 to float32")
for c in dty[dty['dtype'] == 'int64'].index.values:
    adata.obs[c] = adata.obs[c].astype('int32') 
    print(f"changed {c} from int64 to int32")
for c in dty[dty['dtype'] == 'object'].index.values:
    adata.obs[c] = adata.obs[c].astype('category') 
    print(f"changed {c} from object to category")

changed doublet_scores from float64 to float32
changed bh_pval from float64 to float32
changed n_genes from int64 to int32
changed stage from object to category
changed barcodes from object to category
changed assay from object to category


In [174]:
# view obs

In [175]:
adata.obs

Unnamed: 0,batch,status,assignment,donor,stage,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,...,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_type,tissue_ontology_term_id
AAACGGGAGATCCCAT-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.016132,11710.0,2583,0.054545,0.660933,...,HsapDv:0000050,HDBR14732,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0002048
AAACGGGCAGTGACAG-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.034385,2387.0,1243,0.090426,0.660933,...,HsapDv:0000050,HDBR14732,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0002048
AAAGATGTCCAAACTG-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.025609,15486.0,3622,0.159533,0.660933,...,HsapDv:0000050,HDBR14732,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0002048
AAAGATGTCGCCATAA-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.022353,16489.0,3686,0.083969,0.660933,...,HsapDv:0000050,HDBR14732,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0002048
AAAGTAGAGACGCACA-Imm_FLNG8965963,Imm_FLNG8965963,,,HDBR14732,12.0,0.133360,4276.0,2415,0.040853,0.660933,...,HsapDv:0000050,HDBR14732,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0002048
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTCACGGAGCCCTAAT-WSSS_F_LNG8713184,WSSS_F_LNG8713184,,,HDBR15233,15.0,0.039884,9581.0,3133,0.330579,0.826283,...,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0002048
ACGGGCTGTTCAGCGC-WSSS_F_LNG8713185,WSSS_F_LNG8713185,,,HDBR15233,15.0,0.034237,2706.0,1305,0.394231,0.122739,...,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0002048
CATCGAACATTAACCG-WSSS_F_LNG8713185,WSSS_F_LNG8713185,,,HDBR15233,15.0,0.040634,7625.0,2735,0.330579,0.122739,...,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0002048
CTCAGAAGTACTCTCC-WSSS_F_LNG8713185,WSSS_F_LNG8713185,,,HDBR15233,15.0,0.027229,13147.0,3721,0.245161,0.122739,...,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0002048


In [176]:
adata.obs.columns

Index(['batch', 'status', 'assignment', 'donor', 'stage', 'percent_mito',
       'n_counts', 'n_genes', 'doublet_scores', 'bh_pval', 'leiden', 'phase',
       'S_score', 'G2M_score', 'project', 'domain', 'gender', 'leiden_R',
       'celltype', 'status_summary', 'productive_summary', 'isotype_summary',
       'receptor_type', 'receptor_subtype', 'broad_type', 'barcodes', 'assay',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'donor_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'suspension_type', 'tissue_type',
       'tissue_ontology_term_id'],
      dtype='object')

In [177]:
# delete unwanted columns in obs

In [178]:
del adata.obs['gender']
del adata.obs['barcodes']
del adata.obs['assay']
del adata.obs['donor']
del adata.obs['stage']
del adata.obs['batch']
del adata.uns['batch_colors']

In [179]:
# view obs

In [180]:
adata.obs

Unnamed: 0,status,assignment,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,leiden,phase,S_score,...,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_type,tissue_ontology_term_id
AAACGGGAGATCCCAT-Imm_FLNG8965963,,,0.016132,11710.0,2583,0.054545,0.660933,11,G1,-0.151007,...,HsapDv:0000050,HDBR14732,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0002048
AAACGGGCAGTGACAG-Imm_FLNG8965963,,,0.034385,2387.0,1243,0.090426,0.660933,21,G1,-0.158745,...,HsapDv:0000050,HDBR14732,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0002048
AAAGATGTCCAAACTG-Imm_FLNG8965963,,,0.025609,15486.0,3622,0.159533,0.660933,13,G1,-0.073351,...,HsapDv:0000050,HDBR14732,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0002048
AAAGATGTCGCCATAA-Imm_FLNG8965963,,,0.022353,16489.0,3686,0.083969,0.660933,11,G1,-0.142639,...,HsapDv:0000050,HDBR14732,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0002048
AAAGTAGAGACGCACA-Imm_FLNG8965963,,,0.133360,4276.0,2415,0.040853,0.660933,12,G2M,0.380279,...,HsapDv:0000050,HDBR14732,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0002048
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTCACGGAGCCCTAAT-WSSS_F_LNG8713184,,,0.039884,9581.0,3133,0.330579,0.826283,24,G1,-0.176634,...,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0002048
ACGGGCTGTTCAGCGC-WSSS_F_LNG8713185,,,0.034237,2706.0,1305,0.394231,0.122739,24,G1,-0.169248,...,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0002048
CATCGAACATTAACCG-WSSS_F_LNG8713185,,,0.040634,7625.0,2735,0.330579,0.122739,24,G1,-0.128864,...,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0002048
CTCAGAAGTACTCTCC-WSSS_F_LNG8713185,,,0.027229,13147.0,3721,0.245161,0.122739,24,G1,-0.066071,...,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0002048


In [181]:
# view var

In [182]:
adata.var

Unnamed: 0,feature_is_filtered,n_cells-Gurdon
ENSG00000121410,False,True
ENSG00000268895,False,787.0
ENSG00000148584,False,49.0
ENSG00000175899,False,True
ENSG00000245105,False,796.0
...,...,...
ENSG00000203995,False,50.0
ENSG00000162378,False,True
ENSG00000159840,False,52233.0
ENSG00000074755,False,True


In [183]:
araw.var

ENSG00000121410
ENSG00000268895
ENSG00000148584
ENSG00000175899
ENSG00000245105
...
ENSG00000203995
ENSG00000162378
ENSG00000159840
ENSG00000074755
ENSG00000272920


In [184]:
#view uns

In [185]:
adata.uns

{'celltype_colors': array(['#ffff00', '#1ce6ff', '#ff34ff', '#ff4a46', '#008941', '#006fa6',
        '#a30059', '#ffdbe5', '#7a4900', '#0000a6', '#63ffac', '#b79762',
        '#004d43', '#8fb0ff', '#997d87', '#5a0007', '#809693', '#6a3a4c',
        '#1b4400', '#4fc601', '#3b5dff', '#4a3b53', '#ff2f80', '#61615a',
        '#ba0900', '#6b7900', '#00c2a0', '#ffaa92', '#ff90c9', '#b903aa',
        '#d16100', '#ddefff', '#000035', '#7b4f4b', '#a1c299', '#300018',
        '#0aa6d8', '#013349', '#00846f', '#372101', '#ffb500', '#c2ffed',
        '#a079bf', '#cc0744', '#c0b9b2', '#c2ff99', '#001e09', '#00489c',
        '#6f0062', '#0cbd66', '#eec3ff', '#456d75', '#b77b68', '#7a87a1',
        '#788d66', '#885578', '#fad09f', '#ff8a9a', '#d157a0'],
       dtype=object),
 'dendrogram_celltype': {'categories_idx_ordered': array([14,  8, 10, 27, 23, 51, 24, 25,  7, 52,  3,  6, 44, 38, 46, 16,  9,
         47, 12, 37, 31, 42, 29, 43, 30, 55, 56, 26,  4,  5, 21, 45, 40, 54,
         36, 15, 53, 13, 1

In [186]:
list(adata.uns.keys())

['celltype_colors',
 'dendrogram_celltype',
 'domain_colors',
 'leiden',
 'neighbors',
 'new_celltype_colors',
 'pca',
 'predicted_Gurdon_colors',
 'predicted_hi_colors',
 'project_colors',
 'rank_genes_groups',
 'rank_genes_groups_filtered',
 'rank_genes_groups_global',
 'umap',
 'title',
 'default_embedding']

In [187]:
adata.obs.columns

Index(['status', 'assignment', 'percent_mito', 'n_counts', 'n_genes',
       'doublet_scores', 'bh_pval', 'leiden', 'phase', 'S_score', 'G2M_score',
       'project', 'domain', 'leiden_R', 'celltype', 'status_summary',
       'productive_summary', 'isotype_summary', 'receptor_type',
       'receptor_subtype', 'broad_type', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'donor_id', 'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'suspension_type', 'tissue_type',
       'tissue_ontology_term_id'],
      dtype='object')

In [188]:
del adata.uns['new_celltype_colors']
del adata.uns['predicted_Gurdon_colors']
del adata.uns['predicted_hi_colors']
del adata.var['n_cells-Gurdon']

In [189]:
# Remove unwanted columns in uns

In [190]:
#check the format of expression matrix

In [191]:
adata.X

<77559x33137 sparse matrix of type '<class 'numpy.float32'>'
	with 178652671 stored elements in Compressed Sparse Row format>

In [192]:
araw.X

<77559x33137 sparse matrix of type '<class 'numpy.float32'>'
	with 179594591 stored elements in Compressed Sparse Row format>

In [193]:
#Copy raw counts to adata.raw

In [194]:
adata.raw = araw

In [195]:
obs_dtype = adata.obs.dtypes

In [196]:
obs_dtype

status                                      category
assignment                                  category
percent_mito                                 float32
n_counts                                     float32
n_genes                                        int32
doublet_scores                               float32
bh_pval                                      float32
leiden                                      category
phase                                       category
S_score                                      float32
G2M_score                                    float32
project                                     category
domain                                      category
leiden_R                                    category
celltype                                    category
status_summary                              category
productive_summary                          category
isotype_summary                             category
receptor_type                               ca

In [197]:
adata.write('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/early_human_lung_immune/Final_objects/to_upload/scrnaseq/Leukocytes_of_the_fetal_lung.h5ad', compression = 'gzip')

In [198]:
adata.obs

Unnamed: 0,status,assignment,percent_mito,n_counts,n_genes,doublet_scores,bh_pval,leiden,phase,S_score,...,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_type,tissue_ontology_term_id
AAACGGGAGATCCCAT-Imm_FLNG8965963,,,0.016132,11710.0,2583,0.054545,0.660933,11,G1,-0.151007,...,HsapDv:0000050,HDBR14732,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0002048
AAACGGGCAGTGACAG-Imm_FLNG8965963,,,0.034385,2387.0,1243,0.090426,0.660933,21,G1,-0.158745,...,HsapDv:0000050,HDBR14732,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0002048
AAAGATGTCCAAACTG-Imm_FLNG8965963,,,0.025609,15486.0,3622,0.159533,0.660933,13,G1,-0.073351,...,HsapDv:0000050,HDBR14732,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0002048
AAAGATGTCGCCATAA-Imm_FLNG8965963,,,0.022353,16489.0,3686,0.083969,0.660933,11,G1,-0.142639,...,HsapDv:0000050,HDBR14732,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0002048
AAAGTAGAGACGCACA-Imm_FLNG8965963,,,0.133360,4276.0,2415,0.040853,0.660933,12,G2M,0.380279,...,HsapDv:0000050,HDBR14732,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0002048
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTCACGGAGCCCTAAT-WSSS_F_LNG8713184,,,0.039884,9581.0,3133,0.330579,0.826283,24,G1,-0.176634,...,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0002048
ACGGGCTGTTCAGCGC-WSSS_F_LNG8713185,,,0.034237,2706.0,1305,0.394231,0.122739,24,G1,-0.169248,...,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0002048
CATCGAACATTAACCG-WSSS_F_LNG8713185,,,0.040634,7625.0,2735,0.330579,0.122739,24,G1,-0.128864,...,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0002048
CTCAGAAGTACTCTCC-WSSS_F_LNG8713185,,,0.027229,13147.0,3721,0.245161,0.122739,24,G1,-0.066071,...,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0002048


In [199]:
adata.obs.columns

Index(['status', 'assignment', 'percent_mito', 'n_counts', 'n_genes',
       'doublet_scores', 'bh_pval', 'leiden', 'phase', 'S_score', 'G2M_score',
       'project', 'domain', 'leiden_R', 'celltype', 'status_summary',
       'productive_summary', 'isotype_summary', 'receptor_type',
       'receptor_subtype', 'broad_type', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'donor_id', 'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'suspension_type', 'tissue_type',
       'tissue_ontology_term_id'],
      dtype='object')

In [200]:
adata.raw.var

ENSG00000121410
ENSG00000268895
ENSG00000148584
ENSG00000175899
ENSG00000245105
...
ENSG00000203995
ENSG00000162378
ENSG00000159840
ENSG00000074755
ENSG00000272920


In [201]:
adata.var

Unnamed: 0,feature_is_filtered
ENSG00000121410,False
ENSG00000268895,False
ENSG00000148584,False
ENSG00000175899,False
ENSG00000245105,False
...,...
ENSG00000203995,False
ENSG00000162378,False
ENSG00000159840,False
ENSG00000074755,False
