### **Curating fetal_lung_pan_lung.h5ad**

Article: Early human lung immune cell development and its role in epithelial cell fate

DOI: 10.1126/sciimmunol.adf99

Data Source : https://fetal-lung-immune.cellgeni.sanger.ac.uk

##### **Mount farm**

mount-farm

##### **Packages required for curation**

In [1]:
#Import all packages required for curation

In [2]:
import numpy as np
import pandas as pd
import scanpy as sc
import scipy
from tqdm import tqdm
from scipy import sparse
from scipy.sparse import csr_matrix
import anndata as ad
import os
import subprocess
import math
import re
from collections import Counter

### **Curation Schema**

##### **X (Matrix Layers)**

##### **AnnData object**

In [3]:
# Load the AnnData object

In [4]:
adata = sc.read_h5ad('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/early_human_lung_immune/Data/fetal_lung_pan_lung.h5ad')

In [5]:
# View the AnnData object

In [6]:
adata

AnnData object with n_obs × n_vars = 670749 × 25076
    obs: 'n_counts', 'n_genes', 'file', 'mito', 'doublet_scores', 'predicted_doublets', 'old_annotation_uniform', 'organ', 'Sort_id', 'age', 'method', 'donor', 'sex', 'Sample', 'scvi_clusters', 'is_maternal_contaminant', 'anno_lvl_2_final_clean', 'celltype_annotation', 'batch', 'status', 'assignment', 'stage', 'percent_mito', 'bh_pval', 'leiden', 'phase', 'S_score', 'G2M_score', 'project', 'domain', 'gender', 'leiden_R', 'celltype', 'status_summary', 'productive_summary', 'isotype_summary', 'receptor_type', 'receptor_subtype', 'temp', 'dataset', 'predicted_anno', 'predicted_anno_probability', 'predicted_anno_prob'
    var: 'GeneName', 'means', 'dispersions', 'dispersions_norm', 'scvi_model_var'
    uns: 'celltype_annotation_colors', 'celltype_colors', 'dataset_colors', 'hvg', 'leiden', 'neighbors', 'scvi', 'umap'
    obsm: 'X_bbknn_umap', 'X_pca', 'X_scvi', 'X_umap', 'X_umap_original'
    layers: 'counts'
    obsp: 'connectivities', '

##### **X - expression matrix**

In [7]:
# View the expression matrix of the anndata object.

In [8]:
adata.X

<670749x25076 sparse matrix of type '<class 'numpy.float32'>'
	with 1513384167 stored elements in Compressed Sparse Row format>

In [9]:
# Print the matrix to check whether they are normalized counts or raw counts. if the matrix has floating numbers,they are normalized counts.if they are integers, they are raw counts.

In [10]:
print(adata.X)

  (0, 2)	2.0145576
  (0, 4)	0.9648341
  (0, 5)	2.2106628
  (0, 7)	1.7703753
  (0, 8)	0.9648341
  (0, 17)	0.9648341
  (0, 18)	1.7703753
  (0, 20)	0.9648341
  (0, 21)	2.3745441
  (0, 22)	4.0578938
  (0, 27)	2.3745441
  (0, 28)	1.7703753
  (0, 31)	3.3538957
  (0, 38)	2.0145576
  (0, 39)	0.9648341
  (0, 41)	1.7703753
  (0, 44)	3.6037505
  (0, 46)	1.7703753
  (0, 48)	1.4466141
  (0, 49)	0.9648341
  (0, 50)	3.6884909
  (0, 51)	0.9648341
  (0, 52)	0.9648341
  (0, 54)	1.4466141
  (0, 58)	3.3538957
  :	:
  (670748, 24578)	1.7692441
  (670748, 24594)	3.2320719
  (670748, 24597)	2.3733077
  (670748, 24614)	3.6871614
  (670748, 24623)	1.7692441
  (670748, 24625)	1.7692441
  (670748, 24627)	1.7692441
  (670748, 24629)	2.3733077
  (670748, 24638)	2.3733077
  (670748, 24641)	1.7692441
  (670748, 24678)	1.7692441
  (670748, 24705)	1.7692441
  (670748, 24720)	1.7692441
  (670748, 24722)	1.7692441
  (670748, 24740)	2.3733077
  (670748, 24744)	1.7692441
  (670748, 24750)	1.7692441
  (670748, 24770)	1.769

In [11]:
print(adata.layers['counts'])

  (0, 2)	4.0
  (0, 4)	1.0
  (0, 5)	5.0
  (0, 7)	3.0
  (0, 8)	1.0
  (0, 17)	1.0
  (0, 18)	3.0
  (0, 20)	1.0
  (0, 21)	6.0
  (0, 22)	35.0
  (0, 27)	6.0
  (0, 28)	3.0
  (0, 31)	17.0
  (0, 38)	4.0
  (0, 39)	1.0
  (0, 41)	3.0
  (0, 44)	22.0
  (0, 46)	3.0
  (0, 48)	2.0
  (0, 49)	1.0
  (0, 50)	24.0
  (0, 51)	1.0
  (0, 52)	1.0
  (0, 54)	2.0
  (0, 58)	17.0
  :	:
  (670748, 24578)	1.0
  (670748, 24594)	5.0
  (670748, 24597)	2.0
  (670748, 24614)	8.0
  (670748, 24623)	1.0
  (670748, 24625)	1.0
  (670748, 24627)	1.0
  (670748, 24629)	2.0
  (670748, 24638)	2.0
  (670748, 24641)	1.0
  (670748, 24678)	1.0
  (670748, 24705)	1.0
  (670748, 24720)	1.0
  (670748, 24722)	1.0
  (670748, 24740)	2.0
  (670748, 24744)	1.0
  (670748, 24750)	1.0
  (670748, 24770)	1.0
  (670748, 24820)	1.0
  (670748, 24832)	2.0
  (670748, 24914)	1.0
  (670748, 24920)	1.0
  (670748, 24924)	1.0
  (670748, 24931)	1.0
  (670748, 24995)	1.0


##### **Raw counts matrix**

In [12]:
araw = ad.AnnData(X=adata.layers['counts'].copy(), obs=adata.obs.copy(), var=adata.var.copy())

In [13]:
araw

AnnData object with n_obs × n_vars = 670749 × 25076
    obs: 'n_counts', 'n_genes', 'file', 'mito', 'doublet_scores', 'predicted_doublets', 'old_annotation_uniform', 'organ', 'Sort_id', 'age', 'method', 'donor', 'sex', 'Sample', 'scvi_clusters', 'is_maternal_contaminant', 'anno_lvl_2_final_clean', 'celltype_annotation', 'batch', 'status', 'assignment', 'stage', 'percent_mito', 'bh_pval', 'leiden', 'phase', 'S_score', 'G2M_score', 'project', 'domain', 'gender', 'leiden_R', 'celltype', 'status_summary', 'productive_summary', 'isotype_summary', 'receptor_type', 'receptor_subtype', 'temp', 'dataset', 'predicted_anno', 'predicted_anno_probability', 'predicted_anno_prob'
    var: 'GeneName', 'means', 'dispersions', 'dispersions_norm', 'scvi_model_var'

In [14]:
del adata.layers['counts']

In [15]:
#adata.raw = araw

##### **Variables(var)**

In [16]:
# View the var of anndata and raw object

In [17]:
adata.var

Unnamed: 0,GeneName,means,dispersions,dispersions_norm,scvi_model_var
ENSG00000000003,TSPAN6,0.569293,9.358764,9.910923,False
ENSG00000000005,TNMD,0.017808,3.547950,0.680063,True
ENSG00000000419,DPM1,1.664187,2.988196,-0.571544,False
ENSG00000000457,SCYL3,0.539670,3.081378,-0.309733,False
ENSG00000000460,C1orf112,0.627239,2.894128,-0.614609,False
...,...,...,...,...,...
ENSG00000285458,AC093827.5,0.051582,3.025712,-0.335938,False
ENSG00000285476,AC139491.7,0.003337,3.235380,0.071965,False
ENSG00000285486,AC003043.2,0.028079,3.184949,-0.026146,True
ENSG00000285492,AL356417.3,0.055600,3.359548,0.313531,False


In [18]:
araw.var

Unnamed: 0,GeneName,means,dispersions,dispersions_norm,scvi_model_var
ENSG00000000003,TSPAN6,0.569293,9.358764,9.910923,False
ENSG00000000005,TNMD,0.017808,3.547950,0.680063,True
ENSG00000000419,DPM1,1.664187,2.988196,-0.571544,False
ENSG00000000457,SCYL3,0.539670,3.081378,-0.309733,False
ENSG00000000460,C1orf112,0.627239,2.894128,-0.614609,False
...,...,...,...,...,...
ENSG00000285458,AC093827.5,0.051582,3.025712,-0.335938,False
ENSG00000285476,AC139491.7,0.003337,3.235380,0.071965,False
ENSG00000285486,AC003043.2,0.028079,3.184949,-0.026146,True
ENSG00000285492,AL356417.3,0.055600,3.359548,0.313531,False


In [19]:
# Load the approved genes file.

In [20]:
approved_genes = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/Endometrium_reference_integrated_atlas/genes_approved.csv')

In [21]:
#Create a dictionary from the approved genes file 

In [22]:
genedict = {key: 1 for key in list(approved_genes.feature_id)}

In [23]:
genedict

{'ERCC-00002': 1,
 'ERCC-00003': 1,
 'ERCC-00004': 1,
 'ERCC-00009': 1,
 'ERCC-00012': 1,
 'ERCC-00013': 1,
 'ERCC-00014': 1,
 'ERCC-00016': 1,
 'ERCC-00017': 1,
 'ERCC-00019': 1,
 'ERCC-00022': 1,
 'ERCC-00024': 1,
 'ERCC-00025': 1,
 'ERCC-00028': 1,
 'ERCC-00031': 1,
 'ERCC-00033': 1,
 'ERCC-00034': 1,
 'ERCC-00035': 1,
 'ERCC-00039': 1,
 'ERCC-00040': 1,
 'ERCC-00041': 1,
 'ERCC-00042': 1,
 'ERCC-00043': 1,
 'ERCC-00044': 1,
 'ERCC-00046': 1,
 'ERCC-00048': 1,
 'ERCC-00051': 1,
 'ERCC-00053': 1,
 'ERCC-00054': 1,
 'ERCC-00057': 1,
 'ERCC-00058': 1,
 'ERCC-00059': 1,
 'ERCC-00060': 1,
 'ERCC-00061': 1,
 'ERCC-00062': 1,
 'ERCC-00067': 1,
 'ERCC-00069': 1,
 'ERCC-00071': 1,
 'ERCC-00073': 1,
 'ERCC-00074': 1,
 'ERCC-00075': 1,
 'ERCC-00076': 1,
 'ERCC-00077': 1,
 'ERCC-00078': 1,
 'ERCC-00079': 1,
 'ERCC-00081': 1,
 'ERCC-00083': 1,
 'ERCC-00084': 1,
 'ERCC-00085': 1,
 'ERCC-00086': 1,
 'ERCC-00092': 1,
 'ERCC-00095': 1,
 'ERCC-00096': 1,
 'ERCC-00097': 1,
 'ERCC-00098': 1,
 'ERCC-000

In [24]:
len(genedict)

119799

In [25]:
#Filter out the genes which are not in the approved genes file.

In [26]:
var_to_keep_adata = [x for x in adata.var_names if (x in genedict)]
var_to_keep_araw = [x for x in araw.var_names if (x in genedict)]

In [27]:
len(var_to_keep_adata)

24831

In [28]:
len(var_to_keep_araw)

24831

In [29]:
adata.var

Unnamed: 0,GeneName,means,dispersions,dispersions_norm,scvi_model_var
ENSG00000000003,TSPAN6,0.569293,9.358764,9.910923,False
ENSG00000000005,TNMD,0.017808,3.547950,0.680063,True
ENSG00000000419,DPM1,1.664187,2.988196,-0.571544,False
ENSG00000000457,SCYL3,0.539670,3.081378,-0.309733,False
ENSG00000000460,C1orf112,0.627239,2.894128,-0.614609,False
...,...,...,...,...,...
ENSG00000285458,AC093827.5,0.051582,3.025712,-0.335938,False
ENSG00000285476,AC139491.7,0.003337,3.235380,0.071965,False
ENSG00000285486,AC003043.2,0.028079,3.184949,-0.026146,True
ENSG00000285492,AL356417.3,0.055600,3.359548,0.313531,False


In [30]:
araw.var

Unnamed: 0,GeneName,means,dispersions,dispersions_norm,scvi_model_var
ENSG00000000003,TSPAN6,0.569293,9.358764,9.910923,False
ENSG00000000005,TNMD,0.017808,3.547950,0.680063,True
ENSG00000000419,DPM1,1.664187,2.988196,-0.571544,False
ENSG00000000457,SCYL3,0.539670,3.081378,-0.309733,False
ENSG00000000460,C1orf112,0.627239,2.894128,-0.614609,False
...,...,...,...,...,...
ENSG00000285458,AC093827.5,0.051582,3.025712,-0.335938,False
ENSG00000285476,AC139491.7,0.003337,3.235380,0.071965,False
ENSG00000285486,AC003043.2,0.028079,3.184949,-0.026146,True
ENSG00000285492,AL356417.3,0.055600,3.359548,0.313531,False


In [31]:
# Modify the anndata object by filtering out the filtered genes.

In [32]:
adata = adata[:, adata.var.index.isin(var_to_keep_adata)]
araw = araw[:, araw.var.index.isin(var_to_keep_araw)]

In [33]:
adata.var

Unnamed: 0,GeneName,means,dispersions,dispersions_norm,scvi_model_var
ENSG00000000003,TSPAN6,0.569293,9.358764,9.910923,False
ENSG00000000005,TNMD,0.017808,3.547950,0.680063,True
ENSG00000000419,DPM1,1.664187,2.988196,-0.571544,False
ENSG00000000457,SCYL3,0.539670,3.081378,-0.309733,False
ENSG00000000460,C1orf112,0.627239,2.894128,-0.614609,False
...,...,...,...,...,...
ENSG00000285454,AC111006.1,0.022275,3.140733,-0.112168,False
ENSG00000285458,AC093827.5,0.051582,3.025712,-0.335938,False
ENSG00000285486,AC003043.2,0.028079,3.184949,-0.026146,True
ENSG00000285492,AL356417.3,0.055600,3.359548,0.313531,False


In [34]:
# View var

In [35]:
adata.var

Unnamed: 0,GeneName,means,dispersions,dispersions_norm,scvi_model_var
ENSG00000000003,TSPAN6,0.569293,9.358764,9.910923,False
ENSG00000000005,TNMD,0.017808,3.547950,0.680063,True
ENSG00000000419,DPM1,1.664187,2.988196,-0.571544,False
ENSG00000000457,SCYL3,0.539670,3.081378,-0.309733,False
ENSG00000000460,C1orf112,0.627239,2.894128,-0.614609,False
...,...,...,...,...,...
ENSG00000285454,AC111006.1,0.022275,3.140733,-0.112168,False
ENSG00000285458,AC093827.5,0.051582,3.025712,-0.335938,False
ENSG00000285486,AC003043.2,0.028079,3.184949,-0.026146,True
ENSG00000285492,AL356417.3,0.055600,3.359548,0.313531,False


In [36]:
araw.var

Unnamed: 0,GeneName,means,dispersions,dispersions_norm,scvi_model_var
ENSG00000000003,TSPAN6,0.569293,9.358764,9.910923,False
ENSG00000000005,TNMD,0.017808,3.547950,0.680063,True
ENSG00000000419,DPM1,1.664187,2.988196,-0.571544,False
ENSG00000000457,SCYL3,0.539670,3.081378,-0.309733,False
ENSG00000000460,C1orf112,0.627239,2.894128,-0.614609,False
...,...,...,...,...,...
ENSG00000285454,AC111006.1,0.022275,3.140733,-0.112168,False
ENSG00000285458,AC093827.5,0.051582,3.025712,-0.335938,False
ENSG00000285486,AC003043.2,0.028079,3.184949,-0.026146,True
ENSG00000285492,AL356417.3,0.055600,3.359548,0.313531,False


In [37]:
del adata.var['GeneName']
del araw.var['GeneName']

feature is filtered

In [38]:
adata.var['feature_is_filtered'] = [False] * len(adata.var)

  adata.var['feature_is_filtered'] = [False] * len(adata.var)


In [39]:
adata.var

Unnamed: 0,means,dispersions,dispersions_norm,scvi_model_var,feature_is_filtered
ENSG00000000003,0.569293,9.358764,9.910923,False,False
ENSG00000000005,0.017808,3.547950,0.680063,True,False
ENSG00000000419,1.664187,2.988196,-0.571544,False,False
ENSG00000000457,0.539670,3.081378,-0.309733,False,False
ENSG00000000460,0.627239,2.894128,-0.614609,False,False
...,...,...,...,...,...
ENSG00000285454,0.022275,3.140733,-0.112168,False,False
ENSG00000285458,0.051582,3.025712,-0.335938,False,False
ENSG00000285486,0.028079,3.184949,-0.026146,True,False
ENSG00000285492,0.055600,3.359548,0.313531,False,False


In [40]:
araw.var

Unnamed: 0,means,dispersions,dispersions_norm,scvi_model_var
ENSG00000000003,0.569293,9.358764,9.910923,False
ENSG00000000005,0.017808,3.547950,0.680063,True
ENSG00000000419,1.664187,2.988196,-0.571544,False
ENSG00000000457,0.539670,3.081378,-0.309733,False
ENSG00000000460,0.627239,2.894128,-0.614609,False
...,...,...,...,...
ENSG00000285454,0.022275,3.140733,-0.112168,False
ENSG00000285458,0.051582,3.025712,-0.335938,False
ENSG00000285486,0.028079,3.184949,-0.026146,True
ENSG00000285492,0.055600,3.359548,0.313531,False


#### **obs (Cell metadata)**

In [41]:
#view obs

In [42]:
adata.obs

Unnamed: 0,n_counts,n_genes,file,mito,doublet_scores,predicted_doublets,old_annotation_uniform,organ,Sort_id,age,...,status_summary,productive_summary,isotype_summary,receptor_type,receptor_subtype,temp,dataset,predicted_anno,predicted_anno_probability,predicted_anno_prob
FCAImmP7579224-ATTACTCTCGATGAGG-panfetal,61563.0,6117,FCAImmP7579224,0.0363205187022686,0.0872210953346855,False,,SK,CD45P,12.0,...,,,,,,,reference,,,
FCAImmP7579224-CAGCCGAGTACATCCA-panfetal,61511.0,6658,FCAImmP7579224,0.0519094131886959,0.1253196930946291,False,,SK,CD45P,12.0,...,,,,,,,reference,,,
FCAImmP7579224-TGCTACCTCATGTAGC-panfetal,54545.0,6485,FCAImmP7579224,0.0452837124466896,0.1640866873065015,False,,SK,CD45P,12.0,...,,,,,,,reference,,,
FCAImmP7579224-ACGGCCACAAGCTGAG-panfetal,51992.0,5768,FCAImmP7579224,0.0400830879807472,0.0924369747899159,False,,SK,CD45P,12.0,...,,,,,,,reference,,,
FCAImmP7579224-CTAATGGCACTGTGTA-panfetal,51305.0,5492,FCAImmP7579224,0.0464672073721885,0.176470588235294,False,,SK,CD45P,12.0,...,,,,,,,reference,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTCACGGAGCCCTAAT-WSSS_F_LNG8713184-lung,9581.0,3120,,,0.3305785123966941,,,,,,...,,,,,,15.0,query,MAST_CELL,0.99,0.99
ACGGGCTGTTCAGCGC-WSSS_F_LNG8713185-lung,2706.0,1293,,,0.3942307692307692,,,,,,...,,,,,,15.0,query,EOSINOPHIL_BASOPHIL,0.65,0.65
CATCGAACATTAACCG-WSSS_F_LNG8713185-lung,7625.0,2721,,,0.3305785123966941,,,,,,...,,,,,,15.0,query,MAST_CELL,0.76,0.76
CTCAGAAGTACTCTCC-WSSS_F_LNG8713185-lung,13147.0,3708,,,0.24516129032258052,,,,,,...,,,,,,15.0,query,EOSINOPHIL_BASOPHIL,0.82,0.82


In [43]:
# view the column names in obs

In [44]:
adata.obs.columns

Index(['n_counts', 'n_genes', 'file', 'mito', 'doublet_scores',
       'predicted_doublets', 'old_annotation_uniform', 'organ', 'Sort_id',
       'age', 'method', 'donor', 'sex', 'Sample', 'scvi_clusters',
       'is_maternal_contaminant', 'anno_lvl_2_final_clean',
       'celltype_annotation', 'batch', 'status', 'assignment', 'stage',
       'percent_mito', 'bh_pval', 'leiden', 'phase', 'S_score', 'G2M_score',
       'project', 'domain', 'gender', 'leiden_R', 'celltype', 'status_summary',
       'productive_summary', 'isotype_summary', 'receptor_type',
       'receptor_subtype', 'temp', 'dataset', 'predicted_anno',
       'predicted_anno_probability', 'predicted_anno_prob'],
      dtype='object')

In [45]:
adata.obs

Unnamed: 0,n_counts,n_genes,file,mito,doublet_scores,predicted_doublets,old_annotation_uniform,organ,Sort_id,age,...,status_summary,productive_summary,isotype_summary,receptor_type,receptor_subtype,temp,dataset,predicted_anno,predicted_anno_probability,predicted_anno_prob
FCAImmP7579224-ATTACTCTCGATGAGG-panfetal,61563.0,6117,FCAImmP7579224,0.0363205187022686,0.0872210953346855,False,,SK,CD45P,12.0,...,,,,,,,reference,,,
FCAImmP7579224-CAGCCGAGTACATCCA-panfetal,61511.0,6658,FCAImmP7579224,0.0519094131886959,0.1253196930946291,False,,SK,CD45P,12.0,...,,,,,,,reference,,,
FCAImmP7579224-TGCTACCTCATGTAGC-panfetal,54545.0,6485,FCAImmP7579224,0.0452837124466896,0.1640866873065015,False,,SK,CD45P,12.0,...,,,,,,,reference,,,
FCAImmP7579224-ACGGCCACAAGCTGAG-panfetal,51992.0,5768,FCAImmP7579224,0.0400830879807472,0.0924369747899159,False,,SK,CD45P,12.0,...,,,,,,,reference,,,
FCAImmP7579224-CTAATGGCACTGTGTA-panfetal,51305.0,5492,FCAImmP7579224,0.0464672073721885,0.176470588235294,False,,SK,CD45P,12.0,...,,,,,,,reference,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTCACGGAGCCCTAAT-WSSS_F_LNG8713184-lung,9581.0,3120,,,0.3305785123966941,,,,,,...,,,,,,15.0,query,MAST_CELL,0.99,0.99
ACGGGCTGTTCAGCGC-WSSS_F_LNG8713185-lung,2706.0,1293,,,0.3942307692307692,,,,,,...,,,,,,15.0,query,EOSINOPHIL_BASOPHIL,0.65,0.65
CATCGAACATTAACCG-WSSS_F_LNG8713185-lung,7625.0,2721,,,0.3305785123966941,,,,,,...,,,,,,15.0,query,MAST_CELL,0.76,0.76
CTCAGAAGTACTCTCC-WSSS_F_LNG8713185-lung,13147.0,3708,,,0.24516129032258052,,,,,,...,,,,,,15.0,query,EOSINOPHIL_BASOPHIL,0.82,0.82


#### **assay_ontology_term_id**

In [46]:
adata.obs

Unnamed: 0,n_counts,n_genes,file,mito,doublet_scores,predicted_doublets,old_annotation_uniform,organ,Sort_id,age,...,status_summary,productive_summary,isotype_summary,receptor_type,receptor_subtype,temp,dataset,predicted_anno,predicted_anno_probability,predicted_anno_prob
FCAImmP7579224-ATTACTCTCGATGAGG-panfetal,61563.0,6117,FCAImmP7579224,0.0363205187022686,0.0872210953346855,False,,SK,CD45P,12.0,...,,,,,,,reference,,,
FCAImmP7579224-CAGCCGAGTACATCCA-panfetal,61511.0,6658,FCAImmP7579224,0.0519094131886959,0.1253196930946291,False,,SK,CD45P,12.0,...,,,,,,,reference,,,
FCAImmP7579224-TGCTACCTCATGTAGC-panfetal,54545.0,6485,FCAImmP7579224,0.0452837124466896,0.1640866873065015,False,,SK,CD45P,12.0,...,,,,,,,reference,,,
FCAImmP7579224-ACGGCCACAAGCTGAG-panfetal,51992.0,5768,FCAImmP7579224,0.0400830879807472,0.0924369747899159,False,,SK,CD45P,12.0,...,,,,,,,reference,,,
FCAImmP7579224-CTAATGGCACTGTGTA-panfetal,51305.0,5492,FCAImmP7579224,0.0464672073721885,0.176470588235294,False,,SK,CD45P,12.0,...,,,,,,,reference,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTCACGGAGCCCTAAT-WSSS_F_LNG8713184-lung,9581.0,3120,,,0.3305785123966941,,,,,,...,,,,,,15.0,query,MAST_CELL,0.99,0.99
ACGGGCTGTTCAGCGC-WSSS_F_LNG8713185-lung,2706.0,1293,,,0.3942307692307692,,,,,,...,,,,,,15.0,query,EOSINOPHIL_BASOPHIL,0.65,0.65
CATCGAACATTAACCG-WSSS_F_LNG8713185-lung,7625.0,2721,,,0.3305785123966941,,,,,,...,,,,,,15.0,query,MAST_CELL,0.76,0.76
CTCAGAAGTACTCTCC-WSSS_F_LNG8713185-lung,13147.0,3708,,,0.24516129032258052,,,,,,...,,,,,,15.0,query,EOSINOPHIL_BASOPHIL,0.82,0.82


In [47]:
adata.obs['barcodes'] = adata.obs_names

In [48]:
adata.obs['barcodes'] = adata.obs['barcodes'].str.extract(r'([ATGC]{10,16})', expand=False)

In [49]:
assay_info = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/covid_cell_atlas_ALI_organoids/10X_barcode_table_assay.csv')

In [50]:
mapping = dict(zip(assay_info['barcode'], assay_info['assay']))

In [51]:
adata.obs['assay'] = adata.obs['barcodes'].map(mapping)

In [52]:
list(adata.obs['assay'].unique())

['3pv2_5pv1_5pv2', '3pv2_5pv1_5pv2+3pv3', '3pv2_5pv1_5pv2+multiome', nan]

In [53]:
mapping= {'3pv2_5pv1_5pv2':'EFO:0030004', '3pv2_5pv1_5pv2,3pv3':'EFO:0030004', '3pv2_5pv1_5pv2,multiome':'EFO:0030004','nan':'EFO:0030004'}

In [54]:
import pandas as pd
import scanpy as sc

# Suppose 'adata' is your AnnData object already loaded.

# Step 1: Check unique values in 'assay'
unique_assays = adata.obs['method'].unique()
print("Unique values in 'assay':", unique_assays)

# Step 2: Display unique 'assays' for each unique 'assay'
for assay in unique_assays:
    unique_assays_for_group = adata.obs.loc[adata.obs['method'] == assay, 'assay'].unique()
    print(f"Unique 'assays' for assay {assay}:", unique_assays_for_group)

Unique values in 'assay': ['5GEX', '3GEX', 'nan']
Categories (3, object): ['3GEX', '5GEX', 'nan']
Unique 'assays' for assay 5GEX: ['3pv2_5pv1_5pv2' '3pv2_5pv1_5pv2+3pv3' '3pv2_5pv1_5pv2+multiome' nan]
Unique 'assays' for assay 3GEX: ['3pv2_5pv1_5pv2' '3pv2_5pv1_5pv2+3pv3' '3pv2_5pv1_5pv2+multiome' nan]
Unique 'assays' for assay nan: ['3pv2_5pv1_5pv2' '3pv2_5pv1_5pv2+3pv3' '3pv2_5pv1_5pv2+multiome' nan]


In [55]:
mapping= { '3GEX' :'EFO:0009899', '5GEX':'EFO:0030004' , 'nan' :'EFO:0030004'}

In [56]:
adata.obs['assay_ontology_term_id'] =  adata.obs['method'].map(mapping)

In [57]:
adata.obs.loc[adata.obs['batch'].str.startswith(('5891', 'WSSS')), 'assay_ontology_term_id'] = 'EFO:0011025'

In [58]:
import pandas as pd
import scanpy as sc

# Suppose 'adata' is your AnnData object already loaded.

# Step 1: Check unique values in 'assay'
unique_assays = adata.obs['assay_ontology_term_id'].unique()
print("Unique values in 'assay':", unique_assays)

# Step 2: Display unique 'assays' for each unique 'assay'
for assay in unique_assays:
    unique_assays_for_group = adata.obs.loc[adata.obs['assay_ontology_term_id'] == assay, 'assay'].unique()
    print(f"Unique 'assays' for assay {assay}:", unique_assays_for_group)

Unique values in 'assay': ['EFO:0030004' 'EFO:0009899' 'EFO:0011025']
Unique 'assays' for assay EFO:0030004: ['3pv2_5pv1_5pv2' '3pv2_5pv1_5pv2+3pv3' '3pv2_5pv1_5pv2+multiome' nan]
Unique 'assays' for assay EFO:0009899: ['3pv2_5pv1_5pv2' '3pv2_5pv1_5pv2+3pv3' '3pv2_5pv1_5pv2+multiome' nan]
Unique 'assays' for assay EFO:0011025: ['3pv2_5pv1_5pv2' '3pv2_5pv1_5pv2+3pv3' '3pv2_5pv1_5pv2+multiome']


In [59]:
unique_combinations = adata.obs[['method', 'assay', 'assay_ontology_term_id','batch']].drop_duplicates().reset_index(drop=True)

In [60]:
unique_combinations 

Unnamed: 0,method,assay,assay_ontology_term_id,batch
0,5GEX,3pv2_5pv1_5pv2,EFO:0030004,
1,5GEX,3pv2_5pv1_5pv2+3pv3,EFO:0030004,
2,5GEX,3pv2_5pv1_5pv2+multiome,EFO:0030004,
3,3GEX,3pv2_5pv1_5pv2,EFO:0009899,
4,3GEX,3pv2_5pv1_5pv2+3pv3,EFO:0009899,
...,...,...,...,...
116,,3pv2_5pv1_5pv2+3pv3,EFO:0011025,WSSS8012016
117,,3pv2_5pv1_5pv2+multiome,EFO:0011025,WSSS_F_LNG8713178
118,,3pv2_5pv1_5pv2+multiome,EFO:0011025,WSSS_F_LNG8713187
119,,3pv2_5pv1_5pv2+3pv3,EFO:0011025,WSSS_F_LNG8713188


In [61]:
unique_combinations.to_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/early_human_lung_immune/Suppl_info/unique_combinations.csv', index=False)

In [62]:
adata.obs['assay_ontology_term_id'] = adata.obs['assay_ontology_term_id'].astype('category')

In [63]:
# view adata.obs

In [64]:
adata.obs

Unnamed: 0,n_counts,n_genes,file,mito,doublet_scores,predicted_doublets,old_annotation_uniform,organ,Sort_id,age,...,receptor_type,receptor_subtype,temp,dataset,predicted_anno,predicted_anno_probability,predicted_anno_prob,barcodes,assay,assay_ontology_term_id
FCAImmP7579224-ATTACTCTCGATGAGG-panfetal,61563.0,6117,FCAImmP7579224,0.0363205187022686,0.0872210953346855,False,,SK,CD45P,12.0,...,,,,reference,,,,ATTACTCTCGATGAGG,3pv2_5pv1_5pv2,EFO:0030004
FCAImmP7579224-CAGCCGAGTACATCCA-panfetal,61511.0,6658,FCAImmP7579224,0.0519094131886959,0.1253196930946291,False,,SK,CD45P,12.0,...,,,,reference,,,,CAGCCGAGTACATCCA,3pv2_5pv1_5pv2,EFO:0030004
FCAImmP7579224-TGCTACCTCATGTAGC-panfetal,54545.0,6485,FCAImmP7579224,0.0452837124466896,0.1640866873065015,False,,SK,CD45P,12.0,...,,,,reference,,,,TGCTACCTCATGTAGC,3pv2_5pv1_5pv2,EFO:0030004
FCAImmP7579224-ACGGCCACAAGCTGAG-panfetal,51992.0,5768,FCAImmP7579224,0.0400830879807472,0.0924369747899159,False,,SK,CD45P,12.0,...,,,,reference,,,,ACGGCCACAAGCTGAG,3pv2_5pv1_5pv2,EFO:0030004
FCAImmP7579224-CTAATGGCACTGTGTA-panfetal,51305.0,5492,FCAImmP7579224,0.0464672073721885,0.176470588235294,False,,SK,CD45P,12.0,...,,,,reference,,,,CTAATGGCACTGTGTA,3pv2_5pv1_5pv2,EFO:0030004
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTCACGGAGCCCTAAT-WSSS_F_LNG8713184-lung,9581.0,3120,,,0.3305785123966941,,,,,,...,,,15.0,query,MAST_CELL,0.99,0.99,GTCACGGAGCCCTAAT,3pv2_5pv1_5pv2,EFO:0011025
ACGGGCTGTTCAGCGC-WSSS_F_LNG8713185-lung,2706.0,1293,,,0.3942307692307692,,,,,,...,,,15.0,query,EOSINOPHIL_BASOPHIL,0.65,0.65,ACGGGCTGTTCAGCGC,3pv2_5pv1_5pv2,EFO:0011025
CATCGAACATTAACCG-WSSS_F_LNG8713185-lung,7625.0,2721,,,0.3305785123966941,,,,,,...,,,15.0,query,MAST_CELL,0.76,0.76,CATCGAACATTAACCG,3pv2_5pv1_5pv2,EFO:0011025
CTCAGAAGTACTCTCC-WSSS_F_LNG8713185-lung,13147.0,3708,,,0.24516129032258052,,,,,,...,,,15.0,query,EOSINOPHIL_BASOPHIL,0.82,0.82,CTCAGAAGTACTCTCC,3pv2_5pv1_5pv2,EFO:0011025


#### **cell_type_ontology_term_id**

In [65]:
#identify the column in adata.obs related. to cell type annotation

In [66]:
adata.obs.columns

Index(['n_counts', 'n_genes', 'file', 'mito', 'doublet_scores',
       'predicted_doublets', 'old_annotation_uniform', 'organ', 'Sort_id',
       'age', 'method', 'donor', 'sex', 'Sample', 'scvi_clusters',
       'is_maternal_contaminant', 'anno_lvl_2_final_clean',
       'celltype_annotation', 'batch', 'status', 'assignment', 'stage',
       'percent_mito', 'bh_pval', 'leiden', 'phase', 'S_score', 'G2M_score',
       'project', 'domain', 'gender', 'leiden_R', 'celltype', 'status_summary',
       'productive_summary', 'isotype_summary', 'receptor_type',
       'receptor_subtype', 'temp', 'dataset', 'predicted_anno',
       'predicted_anno_probability', 'predicted_anno_prob', 'barcodes',
       'assay', 'assay_ontology_term_id'],
      dtype='object')

In [67]:
list(adata.obs['celltype'].unique())

['nan',
 'DC2',
 'Intermediate NK',
 'CD8 T',
 'Pro-B',
 'ILC3',
 'MΦ',
 'CD56bright NK',
 'CD16+ NK',
 'Cycling NK',
 'CD4 T',
 'pDC',
 'Pre-pDC/DC5',
 'ILCP',
 'Type 3 innate T',
 'CD5+ Mature B',
 'ILC2',
 'Type 1 innate T',
 'CD5- Mature B',
 'Alveolar fibro',
 'Mast',
 'Neutrophil',
 'TNC+ fibro',
 'S100A12-hi CD14+ mono',
 'Pro-B/Pre-B transition',
 'Late pro-B',
 'aNK',
 'DC1',
 'SMC',
 'Pre-pro-B',
 'CXCL9+ MΦ',
 'Large pre-B',
 'CD16+ mono',
 'S100A12-lo CD14+ mono',
 'Treg',
 'Megk progenitor',
 'κ small pre-B',
 'Immature B',
 'GMP',
 'Adventitial fibro',
 'Distal epithelial',
 'Late pre-B',
 'Secretory epithelial',
 'Pericyte',
 'aDC',
 'Endo',
 'Erythroid',
 'Pro-monocyte',
 'Baso/Eosino',
 'CMP',
 'LMPP/ELP',
 'MEP',
 'HSC/MPP',
 'λ small pre-B',
 'APOE+ MΦ',
 'Myelocyte-like',
 'Pro-myelocyte',
 'T progenitors',
 'Megk',
 'Ciliated']

In [68]:
# create a dictionary of cell type and ontology term

In [69]:
mapping= {
'DC2':'CL:0000990',
'Intermediate NK':'CL:0000623',
'CD8 T':'CL:0000625',
'Pro-B':'CL:0000826',
'ILC3':'CL:0001078',
'MΦ':'CL:0000235',
'CD56bright NK':'CL:0000938',
'CD16+ NK':'CL:0000939',
'Cycling NK':'CL:0000623',
'CD4 T':'CL:0000624',
'pDC':'CL:0000784',
'Pre-pDC/DC5':'CL:0000784',
'ILCP':'CL:0001065',
'Type 3 innate T':'CL:0001078',
'CD5+ Mature B':'CL:0000785',
'ILC2':'CL:0001069',
'Type 1 innate T':'CL:0001067',
'CD5- Mature B':'CL:0000785',
'Alveolar fibro':'CL:4028004',
'Mast':'CL:0000097',
'Neutrophil':'CL:0000775',
'TNC+ fibro':'CL:0000057',
'S100A12-hi CD14+ mono':'CL:0001054',
'Pro-B/Pre-B transition':'CL:0002045',
'Late pro-B':'CL:0002048',
'aNK':'CL:0000623',
'DC1':'CL:0000990',
'SMC':'CL:0000192',
'Pre-pro-B':'CL:0002046',
'CXCL9+ MΦ':'CL:0000235',
'Large pre-B':'CL:0000957',
'CD16+ mono':'CL:0002396',
'S100A12-lo CD14+ mono':'CL:0001054',
'Treg':'CL:0000815',
'Megk progenitor':'CL:0000553',
'κ small pre-B':'CL:0002053',
'Immature B':'CL:0000816',
'GMP':'CL:0000557',
'Adventitial fibro':'CL:4028006',
'Distal epithelial':'CL:0002305',
'Late pre-B':'CL:0000817',
'Secretory epithelial':'CL:1000272',
'Pericyte':'CL:0009089',
'aDC':'CL:0000451',
'Endo':'CL:0000115',
'Erythroid':'CL:0000764',
'Pro-monocyte':'CL:0000576',
'Baso/Eosino':'CL:0000094',
'CMP':'CL:0000049',
'LMPP/ELP':'CL:0000936',
'MEP':'CL:0000050',
'HSC/MPP':'CL:0000837',
'λ small pre-B':'CL:0002053',
'APOE+ MΦ':'CL:0000235',
'Myelocyte-like':'CL:0002193',
'Pro-myelocyte':'CL:0002193',
'T progenitors':'CL:0000827',
'Megk':'CL:0000556',
'Ciliated':'CL:0000064',
'nan':'unknown'}

In [70]:
# add the cell_type_ontology_term_id column

In [71]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['celltype'].map(mapping)

In [72]:
# change datatype of the column

In [73]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_type_ontology_term_id'].astype('category')

In [74]:
# view adata.obs

In [75]:
adata.obs

Unnamed: 0,n_counts,n_genes,file,mito,doublet_scores,predicted_doublets,old_annotation_uniform,organ,Sort_id,age,...,receptor_subtype,temp,dataset,predicted_anno,predicted_anno_probability,predicted_anno_prob,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id
FCAImmP7579224-ATTACTCTCGATGAGG-panfetal,61563.0,6117,FCAImmP7579224,0.0363205187022686,0.0872210953346855,False,,SK,CD45P,12.0,...,,,reference,,,,ATTACTCTCGATGAGG,3pv2_5pv1_5pv2,EFO:0030004,unknown
FCAImmP7579224-CAGCCGAGTACATCCA-panfetal,61511.0,6658,FCAImmP7579224,0.0519094131886959,0.1253196930946291,False,,SK,CD45P,12.0,...,,,reference,,,,CAGCCGAGTACATCCA,3pv2_5pv1_5pv2,EFO:0030004,unknown
FCAImmP7579224-TGCTACCTCATGTAGC-panfetal,54545.0,6485,FCAImmP7579224,0.0452837124466896,0.1640866873065015,False,,SK,CD45P,12.0,...,,,reference,,,,TGCTACCTCATGTAGC,3pv2_5pv1_5pv2,EFO:0030004,unknown
FCAImmP7579224-ACGGCCACAAGCTGAG-panfetal,51992.0,5768,FCAImmP7579224,0.0400830879807472,0.0924369747899159,False,,SK,CD45P,12.0,...,,,reference,,,,ACGGCCACAAGCTGAG,3pv2_5pv1_5pv2,EFO:0030004,unknown
FCAImmP7579224-CTAATGGCACTGTGTA-panfetal,51305.0,5492,FCAImmP7579224,0.0464672073721885,0.176470588235294,False,,SK,CD45P,12.0,...,,,reference,,,,CTAATGGCACTGTGTA,3pv2_5pv1_5pv2,EFO:0030004,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTCACGGAGCCCTAAT-WSSS_F_LNG8713184-lung,9581.0,3120,,,0.3305785123966941,,,,,,...,,15.0,query,MAST_CELL,0.99,0.99,GTCACGGAGCCCTAAT,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097
ACGGGCTGTTCAGCGC-WSSS_F_LNG8713185-lung,2706.0,1293,,,0.3942307692307692,,,,,,...,,15.0,query,EOSINOPHIL_BASOPHIL,0.65,0.65,ACGGGCTGTTCAGCGC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097
CATCGAACATTAACCG-WSSS_F_LNG8713185-lung,7625.0,2721,,,0.3305785123966941,,,,,,...,,15.0,query,MAST_CELL,0.76,0.76,CATCGAACATTAACCG,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097
CTCAGAAGTACTCTCC-WSSS_F_LNG8713185-lung,13147.0,3708,,,0.24516129032258052,,,,,,...,,15.0,query,EOSINOPHIL_BASOPHIL,0.82,0.82,CTCAGAAGTACTCTCC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000094


In [76]:
 list(adata.obs['cell_type_ontology_term_id'].unique())

['unknown',
 'CL:0000990',
 'CL:0000623',
 'CL:0000625',
 'CL:0000826',
 'CL:0001078',
 'CL:0000235',
 'CL:0000938',
 'CL:0000939',
 'CL:0000624',
 'CL:0000784',
 'CL:0001065',
 'CL:0000785',
 'CL:0001069',
 'CL:0001067',
 'CL:4028004',
 'CL:0000097',
 'CL:0000775',
 'CL:0000057',
 'CL:0001054',
 'CL:0002045',
 'CL:0002048',
 'CL:0000192',
 'CL:0002046',
 'CL:0000957',
 'CL:0002396',
 'CL:0000815',
 'CL:0000553',
 'CL:0002053',
 'CL:0000816',
 'CL:0000557',
 'CL:4028006',
 'CL:0002305',
 'CL:0000817',
 'CL:1000272',
 'CL:0009089',
 'CL:0000451',
 'CL:0000115',
 'CL:0000764',
 'CL:0000576',
 'CL:0000094',
 'CL:0000049',
 'CL:0000936',
 'CL:0000050',
 'CL:0000837',
 'CL:0002193',
 'CL:0000827',
 'CL:0000556',
 'CL:0000064']

#### **development_stage_ontology_term_id**

In [77]:
# identify the column in adata which corresponds to age

In [78]:
adata.obs.columns

Index(['n_counts', 'n_genes', 'file', 'mito', 'doublet_scores',
       'predicted_doublets', 'old_annotation_uniform', 'organ', 'Sort_id',
       'age', 'method', 'donor', 'sex', 'Sample', 'scvi_clusters',
       'is_maternal_contaminant', 'anno_lvl_2_final_clean',
       'celltype_annotation', 'batch', 'status', 'assignment', 'stage',
       'percent_mito', 'bh_pval', 'leiden', 'phase', 'S_score', 'G2M_score',
       'project', 'domain', 'gender', 'leiden_R', 'celltype', 'status_summary',
       'productive_summary', 'isotype_summary', 'receptor_type',
       'receptor_subtype', 'temp', 'dataset', 'predicted_anno',
       'predicted_anno_probability', 'predicted_anno_prob', 'barcodes',
       'assay', 'assay_ontology_term_id', 'cell_type_ontology_term_id'],
      dtype='object')

In [79]:
list(adata.obs['stage'].unique())

['nan',
 '12.0',
 '20.0',
 '9.0',
 '8.0',
 '18.0',
 '22.0',
 '15.0',
 '6.86',
 '5.0',
 '11.0']

In [80]:
mapping={'HDBR14732':'HsapDv:0000050',
'HDBR14774':'HsapDv:0000058',
'HDBR14776':'HsapDv:0000050',
'HDBR14787':'HsapDv:0000058',
'HDBR14794':'HsapDv:0000050',
'HDBR14806':'HsapDv:0000058',
'HDBR14815':'HsapDv:0000050',
'HDBR14944':'HsapDv:0000047',
'HDBR14969':'HsapDv:0000046',
'HDBR15024':'HsapDv:0000058',
'HDBR15084':'HsapDv:0000046',
'HDBR15111':'HsapDv:0000047',
'HDBR15204':'HsapDv:0000058',
'HDBR15246':'HsapDv:0000058',
'HDBR15279+15280':'unknown',
'HDBR15332':'HsapDv:0000050',
'HDBR15383':'HsapDv:0000058',
'HDBR15404':'HsapDv:0000047',
'HDBR15503':'HsapDv:0000047',
'HDBR14804':'HsapDv:0000059',
'HDBR15168':'HsapDv:0000059',
'HDBR14854':'HsapDv:0000057',
'HDBR15167':'HsapDv:0000057',
'HDBR14808':'HsapDv:0000055',
'HDBR15219':'HsapDv:0000055',
'F29':'HsapDv:0000054',
'F41':'HsapDv:0000054',
'F78':'HsapDv:0000054',
'F21':'HsapDv:0000053',
'F72':'HsapDv:0000053',
'F30':'HsapDv:0000052',
'F50':'HsapDv:0000052',
'F51':'HsapDv:0000052',
'F66':'HsapDv:0000052',
'F73':'HsapDv:0000052',
'HDBR14853':'HsapDv:0000052',
'HDBR15233':'HsapDv:0000052',
'F38':'HsapDv:0000051',
'F71':'HsapDv:0000051',
'F45':'HsapDv:0000050',
'F67':'HsapDv:0000049',
'F23':'HsapDv:0000048',
'F64':'HsapDv:0000048',
'F69':'HsapDv:0000048',
'HDBR15280':'HsapDv:0000048',
'F19':'HsapDv:0000047',
'F22':'HsapDv:0000047',
'F33':'HsapDv:0000046',
'HDBR15279':'HsapDv:0000046',
'F34':'HsapDv:0000030',
'F61':'HsapDv:0000027',
'BRC2192':'HsapDv:0000026',
'F32':'HsapDv:0000026',
'BRC2188':'HsapDv:0000023',
'F37':'HsapDv:0000020',
'F35':'HsapDv:0000002',
'HDBR14706':'HsapDv:0000058'}

In [81]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['donor'].map(mapping)

In [82]:
# change datatype of the column

In [83]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['development_stage_ontology_term_id'].astype('category')

In [84]:
list(adata.obs['development_stage_ontology_term_id'].unique())

['HsapDv:0000050',
 'HsapDv:0000052',
 'HsapDv:0000020',
 'HsapDv:0000048',
 'HsapDv:0000051',
 'HsapDv:0000054',
 'HsapDv:0000046',
 'HsapDv:0000027',
 'HsapDv:0000047',
 'HsapDv:0000030',
 'HsapDv:0000002',
 'HsapDv:0000053',
 'HsapDv:0000049',
 'HsapDv:0000026',
 'HsapDv:0000058',
 'HsapDv:0000055',
 'HsapDv:0000059',
 'HsapDv:0000057',
 'HsapDv:0000023',
 'unknown']

In [85]:
# view adata.obs

In [86]:
adata.obs

Unnamed: 0,n_counts,n_genes,file,mito,doublet_scores,predicted_doublets,old_annotation_uniform,organ,Sort_id,age,...,temp,dataset,predicted_anno,predicted_anno_probability,predicted_anno_prob,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id
FCAImmP7579224-ATTACTCTCGATGAGG-panfetal,61563.0,6117,FCAImmP7579224,0.0363205187022686,0.0872210953346855,False,,SK,CD45P,12.0,...,,reference,,,,ATTACTCTCGATGAGG,3pv2_5pv1_5pv2,EFO:0030004,unknown,HsapDv:0000050
FCAImmP7579224-CAGCCGAGTACATCCA-panfetal,61511.0,6658,FCAImmP7579224,0.0519094131886959,0.1253196930946291,False,,SK,CD45P,12.0,...,,reference,,,,CAGCCGAGTACATCCA,3pv2_5pv1_5pv2,EFO:0030004,unknown,HsapDv:0000050
FCAImmP7579224-TGCTACCTCATGTAGC-panfetal,54545.0,6485,FCAImmP7579224,0.0452837124466896,0.1640866873065015,False,,SK,CD45P,12.0,...,,reference,,,,TGCTACCTCATGTAGC,3pv2_5pv1_5pv2,EFO:0030004,unknown,HsapDv:0000050
FCAImmP7579224-ACGGCCACAAGCTGAG-panfetal,51992.0,5768,FCAImmP7579224,0.0400830879807472,0.0924369747899159,False,,SK,CD45P,12.0,...,,reference,,,,ACGGCCACAAGCTGAG,3pv2_5pv1_5pv2,EFO:0030004,unknown,HsapDv:0000050
FCAImmP7579224-CTAATGGCACTGTGTA-panfetal,51305.0,5492,FCAImmP7579224,0.0464672073721885,0.176470588235294,False,,SK,CD45P,12.0,...,,reference,,,,CTAATGGCACTGTGTA,3pv2_5pv1_5pv2,EFO:0030004,unknown,HsapDv:0000050
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTCACGGAGCCCTAAT-WSSS_F_LNG8713184-lung,9581.0,3120,,,0.3305785123966941,,,,,,...,15.0,query,MAST_CELL,0.99,0.99,GTCACGGAGCCCTAAT,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052
ACGGGCTGTTCAGCGC-WSSS_F_LNG8713185-lung,2706.0,1293,,,0.3942307692307692,,,,,,...,15.0,query,EOSINOPHIL_BASOPHIL,0.65,0.65,ACGGGCTGTTCAGCGC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052
CATCGAACATTAACCG-WSSS_F_LNG8713185-lung,7625.0,2721,,,0.3305785123966941,,,,,,...,15.0,query,MAST_CELL,0.76,0.76,CATCGAACATTAACCG,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052
CTCAGAAGTACTCTCC-WSSS_F_LNG8713185-lung,13147.0,3708,,,0.24516129032258052,,,,,,...,15.0,query,EOSINOPHIL_BASOPHIL,0.82,0.82,CTCAGAAGTACTCTCC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000094,HsapDv:0000052


#### **donor_id**

In [87]:
#identify the column in adata.obs which provides donor information

In [88]:
adata.obs.columns

Index(['n_counts', 'n_genes', 'file', 'mito', 'doublet_scores',
       'predicted_doublets', 'old_annotation_uniform', 'organ', 'Sort_id',
       'age', 'method', 'donor', 'sex', 'Sample', 'scvi_clusters',
       'is_maternal_contaminant', 'anno_lvl_2_final_clean',
       'celltype_annotation', 'batch', 'status', 'assignment', 'stage',
       'percent_mito', 'bh_pval', 'leiden', 'phase', 'S_score', 'G2M_score',
       'project', 'domain', 'gender', 'leiden_R', 'celltype', 'status_summary',
       'productive_summary', 'isotype_summary', 'receptor_type',
       'receptor_subtype', 'temp', 'dataset', 'predicted_anno',
       'predicted_anno_probability', 'predicted_anno_prob', 'barcodes',
       'assay', 'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id'],
      dtype='object')

In [89]:
# add the donor_id column

In [90]:
adata.obs['donor_id'] = adata.obs['donor']

In [91]:
#adata.obs['donor_id'] = ['unknown'] * len(adata.obs['names'])

In [92]:
# change datatype of the column

In [93]:
adata.obs['donor_id'] = adata.obs['donor_id'].astype('category')

In [94]:
# view unique values of donor_id column

In [95]:
list(adata.obs['donor_id'].unique())

['F45',
 'F51',
 'F37',
 'F23',
 'F30',
 'F38',
 'F41',
 'F73',
 'F78',
 'F66',
 'F29',
 'F69',
 'F33',
 'F61',
 'F22',
 'F71',
 'F34',
 'F35',
 'F19',
 'F72',
 'F67',
 'F50',
 'F21',
 'F32',
 'F64',
 'HDBR14732',
 'HDBR14706',
 'HDBR14806',
 'HDBR14794',
 'HDBR15246',
 'HDBR14774',
 'HDBR14787',
 'HDBR15204',
 'HDBR14815',
 'HDBR15111',
 'HDBR14969',
 'HDBR15084',
 'HDBR14944',
 'HDBR15024',
 'HDBR14776',
 'HDBR15332',
 'HDBR15383',
 'HDBR15503',
 'HDBR15404',
 'HDBR14808',
 'HDBR14804',
 'HDBR14853',
 'HDBR14854',
 'BRC2192',
 'BRC2188',
 'HDBR15167',
 'HDBR15168',
 'HDBR15233',
 'HDBR15219',
 'HDBR15279',
 'HDBR15280',
 'HDBR15279+15280']

In [96]:
adata.obs['donor_id'] = adata.obs['donor_id'].replace('HDBR15279+15280', 'pooled')

  adata.obs['donor_id'] = adata.obs['donor_id'].replace('HDBR15279+15280', 'pooled')


In [97]:
#view obs

In [98]:
adata.obs

Unnamed: 0,n_counts,n_genes,file,mito,doublet_scores,predicted_doublets,old_annotation_uniform,organ,Sort_id,age,...,dataset,predicted_anno,predicted_anno_probability,predicted_anno_prob,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id
FCAImmP7579224-ATTACTCTCGATGAGG-panfetal,61563.0,6117,FCAImmP7579224,0.0363205187022686,0.0872210953346855,False,,SK,CD45P,12.0,...,reference,,,,ATTACTCTCGATGAGG,3pv2_5pv1_5pv2,EFO:0030004,unknown,HsapDv:0000050,F45
FCAImmP7579224-CAGCCGAGTACATCCA-panfetal,61511.0,6658,FCAImmP7579224,0.0519094131886959,0.1253196930946291,False,,SK,CD45P,12.0,...,reference,,,,CAGCCGAGTACATCCA,3pv2_5pv1_5pv2,EFO:0030004,unknown,HsapDv:0000050,F45
FCAImmP7579224-TGCTACCTCATGTAGC-panfetal,54545.0,6485,FCAImmP7579224,0.0452837124466896,0.1640866873065015,False,,SK,CD45P,12.0,...,reference,,,,TGCTACCTCATGTAGC,3pv2_5pv1_5pv2,EFO:0030004,unknown,HsapDv:0000050,F45
FCAImmP7579224-ACGGCCACAAGCTGAG-panfetal,51992.0,5768,FCAImmP7579224,0.0400830879807472,0.0924369747899159,False,,SK,CD45P,12.0,...,reference,,,,ACGGCCACAAGCTGAG,3pv2_5pv1_5pv2,EFO:0030004,unknown,HsapDv:0000050,F45
FCAImmP7579224-CTAATGGCACTGTGTA-panfetal,51305.0,5492,FCAImmP7579224,0.0464672073721885,0.176470588235294,False,,SK,CD45P,12.0,...,reference,,,,CTAATGGCACTGTGTA,3pv2_5pv1_5pv2,EFO:0030004,unknown,HsapDv:0000050,F45
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTCACGGAGCCCTAAT-WSSS_F_LNG8713184-lung,9581.0,3120,,,0.3305785123966941,,,,,,...,query,MAST_CELL,0.99,0.99,GTCACGGAGCCCTAAT,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233
ACGGGCTGTTCAGCGC-WSSS_F_LNG8713185-lung,2706.0,1293,,,0.3942307692307692,,,,,,...,query,EOSINOPHIL_BASOPHIL,0.65,0.65,ACGGGCTGTTCAGCGC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233
CATCGAACATTAACCG-WSSS_F_LNG8713185-lung,7625.0,2721,,,0.3305785123966941,,,,,,...,query,MAST_CELL,0.76,0.76,CATCGAACATTAACCG,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233
CTCAGAAGTACTCTCC-WSSS_F_LNG8713185-lung,13147.0,3708,,,0.24516129032258052,,,,,,...,query,EOSINOPHIL_BASOPHIL,0.82,0.82,CTCAGAAGTACTCTCC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000094,HsapDv:0000052,HDBR15233


In [99]:
adata.obs.columns

Index(['n_counts', 'n_genes', 'file', 'mito', 'doublet_scores',
       'predicted_doublets', 'old_annotation_uniform', 'organ', 'Sort_id',
       'age', 'method', 'donor', 'sex', 'Sample', 'scvi_clusters',
       'is_maternal_contaminant', 'anno_lvl_2_final_clean',
       'celltype_annotation', 'batch', 'status', 'assignment', 'stage',
       'percent_mito', 'bh_pval', 'leiden', 'phase', 'S_score', 'G2M_score',
       'project', 'domain', 'gender', 'leiden_R', 'celltype', 'status_summary',
       'productive_summary', 'isotype_summary', 'receptor_type',
       'receptor_subtype', 'temp', 'dataset', 'predicted_anno',
       'predicted_anno_probability', 'predicted_anno_prob', 'barcodes',
       'assay', 'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'donor_id'],
      dtype='object')

#### **disease_ontology_term_id**

In [100]:
adata.obs['disease_ontology_term_id']= ['PATO:0000461'] * len(adata.obs)

In [101]:
adata.obs

Unnamed: 0,n_counts,n_genes,file,mito,doublet_scores,predicted_doublets,old_annotation_uniform,organ,Sort_id,age,...,predicted_anno,predicted_anno_probability,predicted_anno_prob,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id
FCAImmP7579224-ATTACTCTCGATGAGG-panfetal,61563.0,6117,FCAImmP7579224,0.0363205187022686,0.0872210953346855,False,,SK,CD45P,12.0,...,,,,ATTACTCTCGATGAGG,3pv2_5pv1_5pv2,EFO:0030004,unknown,HsapDv:0000050,F45,PATO:0000461
FCAImmP7579224-CAGCCGAGTACATCCA-panfetal,61511.0,6658,FCAImmP7579224,0.0519094131886959,0.1253196930946291,False,,SK,CD45P,12.0,...,,,,CAGCCGAGTACATCCA,3pv2_5pv1_5pv2,EFO:0030004,unknown,HsapDv:0000050,F45,PATO:0000461
FCAImmP7579224-TGCTACCTCATGTAGC-panfetal,54545.0,6485,FCAImmP7579224,0.0452837124466896,0.1640866873065015,False,,SK,CD45P,12.0,...,,,,TGCTACCTCATGTAGC,3pv2_5pv1_5pv2,EFO:0030004,unknown,HsapDv:0000050,F45,PATO:0000461
FCAImmP7579224-ACGGCCACAAGCTGAG-panfetal,51992.0,5768,FCAImmP7579224,0.0400830879807472,0.0924369747899159,False,,SK,CD45P,12.0,...,,,,ACGGCCACAAGCTGAG,3pv2_5pv1_5pv2,EFO:0030004,unknown,HsapDv:0000050,F45,PATO:0000461
FCAImmP7579224-CTAATGGCACTGTGTA-panfetal,51305.0,5492,FCAImmP7579224,0.0464672073721885,0.176470588235294,False,,SK,CD45P,12.0,...,,,,CTAATGGCACTGTGTA,3pv2_5pv1_5pv2,EFO:0030004,unknown,HsapDv:0000050,F45,PATO:0000461
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTCACGGAGCCCTAAT-WSSS_F_LNG8713184-lung,9581.0,3120,,,0.3305785123966941,,,,,,...,MAST_CELL,0.99,0.99,GTCACGGAGCCCTAAT,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461
ACGGGCTGTTCAGCGC-WSSS_F_LNG8713185-lung,2706.0,1293,,,0.3942307692307692,,,,,,...,EOSINOPHIL_BASOPHIL,0.65,0.65,ACGGGCTGTTCAGCGC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461
CATCGAACATTAACCG-WSSS_F_LNG8713185-lung,7625.0,2721,,,0.3305785123966941,,,,,,...,MAST_CELL,0.76,0.76,CATCGAACATTAACCG,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461
CTCAGAAGTACTCTCC-WSSS_F_LNG8713185-lung,13147.0,3708,,,0.24516129032258052,,,,,,...,EOSINOPHIL_BASOPHIL,0.82,0.82,CTCAGAAGTACTCTCC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000094,HsapDv:0000052,HDBR15233,PATO:0000461


In [102]:
# change datatype of the column

In [103]:
adata.obs['disease_ontology_term_id'] = adata.obs['disease_ontology_term_id'].astype('category')

In [104]:
# view obs

In [105]:
adata.obs

Unnamed: 0,n_counts,n_genes,file,mito,doublet_scores,predicted_doublets,old_annotation_uniform,organ,Sort_id,age,...,predicted_anno,predicted_anno_probability,predicted_anno_prob,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id
FCAImmP7579224-ATTACTCTCGATGAGG-panfetal,61563.0,6117,FCAImmP7579224,0.0363205187022686,0.0872210953346855,False,,SK,CD45P,12.0,...,,,,ATTACTCTCGATGAGG,3pv2_5pv1_5pv2,EFO:0030004,unknown,HsapDv:0000050,F45,PATO:0000461
FCAImmP7579224-CAGCCGAGTACATCCA-panfetal,61511.0,6658,FCAImmP7579224,0.0519094131886959,0.1253196930946291,False,,SK,CD45P,12.0,...,,,,CAGCCGAGTACATCCA,3pv2_5pv1_5pv2,EFO:0030004,unknown,HsapDv:0000050,F45,PATO:0000461
FCAImmP7579224-TGCTACCTCATGTAGC-panfetal,54545.0,6485,FCAImmP7579224,0.0452837124466896,0.1640866873065015,False,,SK,CD45P,12.0,...,,,,TGCTACCTCATGTAGC,3pv2_5pv1_5pv2,EFO:0030004,unknown,HsapDv:0000050,F45,PATO:0000461
FCAImmP7579224-ACGGCCACAAGCTGAG-panfetal,51992.0,5768,FCAImmP7579224,0.0400830879807472,0.0924369747899159,False,,SK,CD45P,12.0,...,,,,ACGGCCACAAGCTGAG,3pv2_5pv1_5pv2,EFO:0030004,unknown,HsapDv:0000050,F45,PATO:0000461
FCAImmP7579224-CTAATGGCACTGTGTA-panfetal,51305.0,5492,FCAImmP7579224,0.0464672073721885,0.176470588235294,False,,SK,CD45P,12.0,...,,,,CTAATGGCACTGTGTA,3pv2_5pv1_5pv2,EFO:0030004,unknown,HsapDv:0000050,F45,PATO:0000461
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTCACGGAGCCCTAAT-WSSS_F_LNG8713184-lung,9581.0,3120,,,0.3305785123966941,,,,,,...,MAST_CELL,0.99,0.99,GTCACGGAGCCCTAAT,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461
ACGGGCTGTTCAGCGC-WSSS_F_LNG8713185-lung,2706.0,1293,,,0.3942307692307692,,,,,,...,EOSINOPHIL_BASOPHIL,0.65,0.65,ACGGGCTGTTCAGCGC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461
CATCGAACATTAACCG-WSSS_F_LNG8713185-lung,7625.0,2721,,,0.3305785123966941,,,,,,...,MAST_CELL,0.76,0.76,CATCGAACATTAACCG,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461
CTCAGAAGTACTCTCC-WSSS_F_LNG8713185-lung,13147.0,3708,,,0.24516129032258052,,,,,,...,EOSINOPHIL_BASOPHIL,0.82,0.82,CTCAGAAGTACTCTCC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000094,HsapDv:0000052,HDBR15233,PATO:0000461


#### **is_primary_data**

In [106]:
import pandas as pd

# Assuming adata.obs is a pandas DataFrame
# Replace 'adata.obs' with the actual name of your DataFrame if different

# Set 'is_primary_data' to True for all entries initially
adata.obs['is_primary_data'] = True

# Set 'is_primary_data' to False for entries with 'batch' values starting with '5891' or 'WSSS'
adata.obs.loc[adata.obs['batch'].str.startswith(('5891', 'WSSS')), 'is_primary_data'] = False


In [107]:
list(adata.obs['is_primary_data'].unique())

[True, False]

In [108]:
adata.obs

Unnamed: 0,n_counts,n_genes,file,mito,doublet_scores,predicted_doublets,old_annotation_uniform,organ,Sort_id,age,...,predicted_anno_probability,predicted_anno_prob,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data
FCAImmP7579224-ATTACTCTCGATGAGG-panfetal,61563.0,6117,FCAImmP7579224,0.0363205187022686,0.0872210953346855,False,,SK,CD45P,12.0,...,,,ATTACTCTCGATGAGG,3pv2_5pv1_5pv2,EFO:0030004,unknown,HsapDv:0000050,F45,PATO:0000461,True
FCAImmP7579224-CAGCCGAGTACATCCA-panfetal,61511.0,6658,FCAImmP7579224,0.0519094131886959,0.1253196930946291,False,,SK,CD45P,12.0,...,,,CAGCCGAGTACATCCA,3pv2_5pv1_5pv2,EFO:0030004,unknown,HsapDv:0000050,F45,PATO:0000461,True
FCAImmP7579224-TGCTACCTCATGTAGC-panfetal,54545.0,6485,FCAImmP7579224,0.0452837124466896,0.1640866873065015,False,,SK,CD45P,12.0,...,,,TGCTACCTCATGTAGC,3pv2_5pv1_5pv2,EFO:0030004,unknown,HsapDv:0000050,F45,PATO:0000461,True
FCAImmP7579224-ACGGCCACAAGCTGAG-panfetal,51992.0,5768,FCAImmP7579224,0.0400830879807472,0.0924369747899159,False,,SK,CD45P,12.0,...,,,ACGGCCACAAGCTGAG,3pv2_5pv1_5pv2,EFO:0030004,unknown,HsapDv:0000050,F45,PATO:0000461,True
FCAImmP7579224-CTAATGGCACTGTGTA-panfetal,51305.0,5492,FCAImmP7579224,0.0464672073721885,0.176470588235294,False,,SK,CD45P,12.0,...,,,CTAATGGCACTGTGTA,3pv2_5pv1_5pv2,EFO:0030004,unknown,HsapDv:0000050,F45,PATO:0000461,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTCACGGAGCCCTAAT-WSSS_F_LNG8713184-lung,9581.0,3120,,,0.3305785123966941,,,,,,...,0.99,0.99,GTCACGGAGCCCTAAT,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461,False
ACGGGCTGTTCAGCGC-WSSS_F_LNG8713185-lung,2706.0,1293,,,0.3942307692307692,,,,,,...,0.65,0.65,ACGGGCTGTTCAGCGC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461,False
CATCGAACATTAACCG-WSSS_F_LNG8713185-lung,7625.0,2721,,,0.3305785123966941,,,,,,...,0.76,0.76,CATCGAACATTAACCG,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461,False
CTCAGAAGTACTCTCC-WSSS_F_LNG8713185-lung,13147.0,3708,,,0.24516129032258052,,,,,,...,0.82,0.82,CTCAGAAGTACTCTCC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000094,HsapDv:0000052,HDBR15233,PATO:0000461,False


In [109]:
#change data type of column

In [110]:
adata.obs['is_primary_data'] = adata.obs['is_primary_data'].astype('bool')

In [111]:
# view obs

In [112]:
adata.obs

Unnamed: 0,n_counts,n_genes,file,mito,doublet_scores,predicted_doublets,old_annotation_uniform,organ,Sort_id,age,...,predicted_anno_probability,predicted_anno_prob,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data
FCAImmP7579224-ATTACTCTCGATGAGG-panfetal,61563.0,6117,FCAImmP7579224,0.0363205187022686,0.0872210953346855,False,,SK,CD45P,12.0,...,,,ATTACTCTCGATGAGG,3pv2_5pv1_5pv2,EFO:0030004,unknown,HsapDv:0000050,F45,PATO:0000461,True
FCAImmP7579224-CAGCCGAGTACATCCA-panfetal,61511.0,6658,FCAImmP7579224,0.0519094131886959,0.1253196930946291,False,,SK,CD45P,12.0,...,,,CAGCCGAGTACATCCA,3pv2_5pv1_5pv2,EFO:0030004,unknown,HsapDv:0000050,F45,PATO:0000461,True
FCAImmP7579224-TGCTACCTCATGTAGC-panfetal,54545.0,6485,FCAImmP7579224,0.0452837124466896,0.1640866873065015,False,,SK,CD45P,12.0,...,,,TGCTACCTCATGTAGC,3pv2_5pv1_5pv2,EFO:0030004,unknown,HsapDv:0000050,F45,PATO:0000461,True
FCAImmP7579224-ACGGCCACAAGCTGAG-panfetal,51992.0,5768,FCAImmP7579224,0.0400830879807472,0.0924369747899159,False,,SK,CD45P,12.0,...,,,ACGGCCACAAGCTGAG,3pv2_5pv1_5pv2,EFO:0030004,unknown,HsapDv:0000050,F45,PATO:0000461,True
FCAImmP7579224-CTAATGGCACTGTGTA-panfetal,51305.0,5492,FCAImmP7579224,0.0464672073721885,0.176470588235294,False,,SK,CD45P,12.0,...,,,CTAATGGCACTGTGTA,3pv2_5pv1_5pv2,EFO:0030004,unknown,HsapDv:0000050,F45,PATO:0000461,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTCACGGAGCCCTAAT-WSSS_F_LNG8713184-lung,9581.0,3120,,,0.3305785123966941,,,,,,...,0.99,0.99,GTCACGGAGCCCTAAT,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461,False
ACGGGCTGTTCAGCGC-WSSS_F_LNG8713185-lung,2706.0,1293,,,0.3942307692307692,,,,,,...,0.65,0.65,ACGGGCTGTTCAGCGC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461,False
CATCGAACATTAACCG-WSSS_F_LNG8713185-lung,7625.0,2721,,,0.3305785123966941,,,,,,...,0.76,0.76,CATCGAACATTAACCG,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461,False
CTCAGAAGTACTCTCC-WSSS_F_LNG8713185-lung,13147.0,3708,,,0.24516129032258052,,,,,,...,0.82,0.82,CTCAGAAGTACTCTCC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000094,HsapDv:0000052,HDBR15233,PATO:0000461,False


#### **organism_ontology_term_id**

In [113]:
# assign organism id 

In [114]:
adata.obs['organism_ontology_term_id'] = ['NCBITaxon:9606'] * len(adata.obs)

In [115]:
#change data type of column

In [116]:
adata.obs['organism_ontology_term_id'] = adata.obs['organism_ontology_term_id'].astype('category')

In [117]:
# view obs

In [118]:
adata.obs

Unnamed: 0,n_counts,n_genes,file,mito,doublet_scores,predicted_doublets,old_annotation_uniform,organ,Sort_id,age,...,predicted_anno_prob,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id
FCAImmP7579224-ATTACTCTCGATGAGG-panfetal,61563.0,6117,FCAImmP7579224,0.0363205187022686,0.0872210953346855,False,,SK,CD45P,12.0,...,,ATTACTCTCGATGAGG,3pv2_5pv1_5pv2,EFO:0030004,unknown,HsapDv:0000050,F45,PATO:0000461,True,NCBITaxon:9606
FCAImmP7579224-CAGCCGAGTACATCCA-panfetal,61511.0,6658,FCAImmP7579224,0.0519094131886959,0.1253196930946291,False,,SK,CD45P,12.0,...,,CAGCCGAGTACATCCA,3pv2_5pv1_5pv2,EFO:0030004,unknown,HsapDv:0000050,F45,PATO:0000461,True,NCBITaxon:9606
FCAImmP7579224-TGCTACCTCATGTAGC-panfetal,54545.0,6485,FCAImmP7579224,0.0452837124466896,0.1640866873065015,False,,SK,CD45P,12.0,...,,TGCTACCTCATGTAGC,3pv2_5pv1_5pv2,EFO:0030004,unknown,HsapDv:0000050,F45,PATO:0000461,True,NCBITaxon:9606
FCAImmP7579224-ACGGCCACAAGCTGAG-panfetal,51992.0,5768,FCAImmP7579224,0.0400830879807472,0.0924369747899159,False,,SK,CD45P,12.0,...,,ACGGCCACAAGCTGAG,3pv2_5pv1_5pv2,EFO:0030004,unknown,HsapDv:0000050,F45,PATO:0000461,True,NCBITaxon:9606
FCAImmP7579224-CTAATGGCACTGTGTA-panfetal,51305.0,5492,FCAImmP7579224,0.0464672073721885,0.176470588235294,False,,SK,CD45P,12.0,...,,CTAATGGCACTGTGTA,3pv2_5pv1_5pv2,EFO:0030004,unknown,HsapDv:0000050,F45,PATO:0000461,True,NCBITaxon:9606
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTCACGGAGCCCTAAT-WSSS_F_LNG8713184-lung,9581.0,3120,,,0.3305785123966941,,,,,,...,0.99,GTCACGGAGCCCTAAT,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606
ACGGGCTGTTCAGCGC-WSSS_F_LNG8713185-lung,2706.0,1293,,,0.3942307692307692,,,,,,...,0.65,ACGGGCTGTTCAGCGC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606
CATCGAACATTAACCG-WSSS_F_LNG8713185-lung,7625.0,2721,,,0.3305785123966941,,,,,,...,0.76,CATCGAACATTAACCG,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606
CTCAGAAGTACTCTCC-WSSS_F_LNG8713185-lung,13147.0,3708,,,0.24516129032258052,,,,,,...,0.82,CTCAGAAGTACTCTCC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000094,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606


#### **self_reported_ethnicity_ontology_term_id**

In [119]:
adata.obs['self_reported_ethnicity_ontology_term_id'] = ['unknown'] * len(adata.obs)

In [120]:
# change data type

In [121]:
adata.obs['self_reported_ethnicity_ontology_term_id'] = adata.obs['self_reported_ethnicity_ontology_term_id'].astype('category')

In [122]:
# view obs

In [123]:
adata.obs

Unnamed: 0,n_counts,n_genes,file,mito,doublet_scores,predicted_doublets,old_annotation_uniform,organ,Sort_id,age,...,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id
FCAImmP7579224-ATTACTCTCGATGAGG-panfetal,61563.0,6117,FCAImmP7579224,0.0363205187022686,0.0872210953346855,False,,SK,CD45P,12.0,...,ATTACTCTCGATGAGG,3pv2_5pv1_5pv2,EFO:0030004,unknown,HsapDv:0000050,F45,PATO:0000461,True,NCBITaxon:9606,unknown
FCAImmP7579224-CAGCCGAGTACATCCA-panfetal,61511.0,6658,FCAImmP7579224,0.0519094131886959,0.1253196930946291,False,,SK,CD45P,12.0,...,CAGCCGAGTACATCCA,3pv2_5pv1_5pv2,EFO:0030004,unknown,HsapDv:0000050,F45,PATO:0000461,True,NCBITaxon:9606,unknown
FCAImmP7579224-TGCTACCTCATGTAGC-panfetal,54545.0,6485,FCAImmP7579224,0.0452837124466896,0.1640866873065015,False,,SK,CD45P,12.0,...,TGCTACCTCATGTAGC,3pv2_5pv1_5pv2,EFO:0030004,unknown,HsapDv:0000050,F45,PATO:0000461,True,NCBITaxon:9606,unknown
FCAImmP7579224-ACGGCCACAAGCTGAG-panfetal,51992.0,5768,FCAImmP7579224,0.0400830879807472,0.0924369747899159,False,,SK,CD45P,12.0,...,ACGGCCACAAGCTGAG,3pv2_5pv1_5pv2,EFO:0030004,unknown,HsapDv:0000050,F45,PATO:0000461,True,NCBITaxon:9606,unknown
FCAImmP7579224-CTAATGGCACTGTGTA-panfetal,51305.0,5492,FCAImmP7579224,0.0464672073721885,0.176470588235294,False,,SK,CD45P,12.0,...,CTAATGGCACTGTGTA,3pv2_5pv1_5pv2,EFO:0030004,unknown,HsapDv:0000050,F45,PATO:0000461,True,NCBITaxon:9606,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTCACGGAGCCCTAAT-WSSS_F_LNG8713184-lung,9581.0,3120,,,0.3305785123966941,,,,,,...,GTCACGGAGCCCTAAT,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown
ACGGGCTGTTCAGCGC-WSSS_F_LNG8713185-lung,2706.0,1293,,,0.3942307692307692,,,,,,...,ACGGGCTGTTCAGCGC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown
CATCGAACATTAACCG-WSSS_F_LNG8713185-lung,7625.0,2721,,,0.3305785123966941,,,,,,...,CATCGAACATTAACCG,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown
CTCAGAAGTACTCTCC-WSSS_F_LNG8713185-lung,13147.0,3708,,,0.24516129032258052,,,,,,...,CTCAGAAGTACTCTCC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000094,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown


In [124]:
adata.obs.columns

Index(['n_counts', 'n_genes', 'file', 'mito', 'doublet_scores',
       'predicted_doublets', 'old_annotation_uniform', 'organ', 'Sort_id',
       'age', 'method', 'donor', 'sex', 'Sample', 'scvi_clusters',
       'is_maternal_contaminant', 'anno_lvl_2_final_clean',
       'celltype_annotation', 'batch', 'status', 'assignment', 'stage',
       'percent_mito', 'bh_pval', 'leiden', 'phase', 'S_score', 'G2M_score',
       'project', 'domain', 'gender', 'leiden_R', 'celltype', 'status_summary',
       'productive_summary', 'isotype_summary', 'receptor_type',
       'receptor_subtype', 'temp', 'dataset', 'predicted_anno',
       'predicted_anno_probability', 'predicted_anno_prob', 'barcodes',
       'assay', 'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'donor_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id'],
      dtype='object')

#### **sex_ontology_term_id**

In [125]:
# identify the column in adata.obs which corresponds to sex

In [126]:
adata.obs.columns

Index(['n_counts', 'n_genes', 'file', 'mito', 'doublet_scores',
       'predicted_doublets', 'old_annotation_uniform', 'organ', 'Sort_id',
       'age', 'method', 'donor', 'sex', 'Sample', 'scvi_clusters',
       'is_maternal_contaminant', 'anno_lvl_2_final_clean',
       'celltype_annotation', 'batch', 'status', 'assignment', 'stage',
       'percent_mito', 'bh_pval', 'leiden', 'phase', 'S_score', 'G2M_score',
       'project', 'domain', 'gender', 'leiden_R', 'celltype', 'status_summary',
       'productive_summary', 'isotype_summary', 'receptor_type',
       'receptor_subtype', 'temp', 'dataset', 'predicted_anno',
       'predicted_anno_probability', 'predicted_anno_prob', 'barcodes',
       'assay', 'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'donor_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id'],
      dtype='object')

In [127]:
adata.obs

Unnamed: 0,n_counts,n_genes,file,mito,doublet_scores,predicted_doublets,old_annotation_uniform,organ,Sort_id,age,...,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id
FCAImmP7579224-ATTACTCTCGATGAGG-panfetal,61563.0,6117,FCAImmP7579224,0.0363205187022686,0.0872210953346855,False,,SK,CD45P,12.0,...,ATTACTCTCGATGAGG,3pv2_5pv1_5pv2,EFO:0030004,unknown,HsapDv:0000050,F45,PATO:0000461,True,NCBITaxon:9606,unknown
FCAImmP7579224-CAGCCGAGTACATCCA-panfetal,61511.0,6658,FCAImmP7579224,0.0519094131886959,0.1253196930946291,False,,SK,CD45P,12.0,...,CAGCCGAGTACATCCA,3pv2_5pv1_5pv2,EFO:0030004,unknown,HsapDv:0000050,F45,PATO:0000461,True,NCBITaxon:9606,unknown
FCAImmP7579224-TGCTACCTCATGTAGC-panfetal,54545.0,6485,FCAImmP7579224,0.0452837124466896,0.1640866873065015,False,,SK,CD45P,12.0,...,TGCTACCTCATGTAGC,3pv2_5pv1_5pv2,EFO:0030004,unknown,HsapDv:0000050,F45,PATO:0000461,True,NCBITaxon:9606,unknown
FCAImmP7579224-ACGGCCACAAGCTGAG-panfetal,51992.0,5768,FCAImmP7579224,0.0400830879807472,0.0924369747899159,False,,SK,CD45P,12.0,...,ACGGCCACAAGCTGAG,3pv2_5pv1_5pv2,EFO:0030004,unknown,HsapDv:0000050,F45,PATO:0000461,True,NCBITaxon:9606,unknown
FCAImmP7579224-CTAATGGCACTGTGTA-panfetal,51305.0,5492,FCAImmP7579224,0.0464672073721885,0.176470588235294,False,,SK,CD45P,12.0,...,CTAATGGCACTGTGTA,3pv2_5pv1_5pv2,EFO:0030004,unknown,HsapDv:0000050,F45,PATO:0000461,True,NCBITaxon:9606,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTCACGGAGCCCTAAT-WSSS_F_LNG8713184-lung,9581.0,3120,,,0.3305785123966941,,,,,,...,GTCACGGAGCCCTAAT,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown
ACGGGCTGTTCAGCGC-WSSS_F_LNG8713185-lung,2706.0,1293,,,0.3942307692307692,,,,,,...,ACGGGCTGTTCAGCGC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown
CATCGAACATTAACCG-WSSS_F_LNG8713185-lung,7625.0,2721,,,0.3305785123966941,,,,,,...,CATCGAACATTAACCG,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown
CTCAGAAGTACTCTCC-WSSS_F_LNG8713185-lung,13147.0,3708,,,0.24516129032258052,,,,,,...,CTCAGAAGTACTCTCC,3pv2_5pv1_5pv2,EFO:0011025,CL:0000094,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown


In [128]:
# list the unique values 

In [129]:
list(adata.obs['gender'].unique())

['nan', 'M', 'F', 'Unknown']

In [130]:
list(adata.obs['sex'].unique())

['female', 'male', 'nan']

In [131]:
# create a dictionary of sex and sex ontology term id

In [132]:
mapping= {'F': 'PATO:0000383', 'M': 'PATO:0000384', 'Unknown':'unknown','nan':'unknown'}

In [133]:
mapping={
'BRC2188':'PATO:0000383',
'BRC2192':'PATO:0000383',
'F19':'PATO:0000383',
'F22':'PATO:0000383',
'F29':'PATO:0000383',
'F32':'PATO:0000383',
'F33':'PATO:0000383',
'F34':'PATO:0000383',
'F35':'PATO:0000383',
'F37':'PATO:0000383',
'F41':'PATO:0000383',
'F45':'PATO:0000383',
'F50':'PATO:0000383',
'F51':'PATO:0000383',
'F67':'PATO:0000383',
'F69':'PATO:0000383',
'F71':'PATO:0000383',
'F72':'PATO:0000383',
'F73':'PATO:0000383',
'HDBR14853':'PATO:0000383',
'HDBR14854':'PATO:0000383',
'HDBR15279':'PATO:0000383',
'F21':'PATO:0000384',
'F23':'PATO:0000384',
'F30':'PATO:0000384',
'F38':'PATO:0000384',
'F61':'PATO:0000384',
'F64':'PATO:0000384',
'F66':'PATO:0000384',
'F78':'PATO:0000384',
'HDBR14804':'PATO:0000384',
'HDBR14808':'PATO:0000384',
'HDBR15167':'PATO:0000384',
'HDBR15168':'PATO:0000384',
'HDBR15219':'PATO:0000384',
'HDBR15233':'PATO:0000384',
'HDBR15280':'PATO:0000384',
'HDBR14706':'PATO:0000383',
'HDBR14732':'PATO:0000384',
'HDBR14774':'PATO:0000383',
'HDBR14776':'PATO:0000383',
'HDBR14787':'PATO:0000384',
'HDBR14794':'PATO:0000383',
'HDBR14806':'PATO:0000383',
'HDBR14815':'PATO:0000384',
'HDBR14944':'PATO:0000383',
'HDBR14969':'PATO:0000383',
'HDBR15024':'PATO:0000383',
'HDBR15084':'PATO:0000384',
'HDBR15111':'PATO:0000384',
'HDBR15204':'PATO:0000383',
'HDBR15246':'PATO:0000384',
'HDBR15279+15280':'unknown',
'HDBR15332':'PATO:0000384',
'HDBR15383':'PATO:0000384',
'HDBR15404':'PATO:0000383',
'HDBR15503':'PATO:0000384'}

In [134]:
# add sex_ontology_term_id column

In [135]:
adata.obs['sex_ontology_term_id'] = adata.obs['donor'].map(mapping)

In [136]:
# change data type

In [137]:
adata.obs['sex_ontology_term_id'] = adata.obs['sex_ontology_term_id'].astype('category')

In [138]:
list(adata.obs['sex_ontology_term_id'].unique())

['PATO:0000383', 'PATO:0000384', 'unknown']

In [139]:
adata.obs

Unnamed: 0,n_counts,n_genes,file,mito,doublet_scores,predicted_doublets,old_annotation_uniform,organ,Sort_id,age,...,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id
FCAImmP7579224-ATTACTCTCGATGAGG-panfetal,61563.0,6117,FCAImmP7579224,0.0363205187022686,0.0872210953346855,False,,SK,CD45P,12.0,...,3pv2_5pv1_5pv2,EFO:0030004,unknown,HsapDv:0000050,F45,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383
FCAImmP7579224-CAGCCGAGTACATCCA-panfetal,61511.0,6658,FCAImmP7579224,0.0519094131886959,0.1253196930946291,False,,SK,CD45P,12.0,...,3pv2_5pv1_5pv2,EFO:0030004,unknown,HsapDv:0000050,F45,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383
FCAImmP7579224-TGCTACCTCATGTAGC-panfetal,54545.0,6485,FCAImmP7579224,0.0452837124466896,0.1640866873065015,False,,SK,CD45P,12.0,...,3pv2_5pv1_5pv2,EFO:0030004,unknown,HsapDv:0000050,F45,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383
FCAImmP7579224-ACGGCCACAAGCTGAG-panfetal,51992.0,5768,FCAImmP7579224,0.0400830879807472,0.0924369747899159,False,,SK,CD45P,12.0,...,3pv2_5pv1_5pv2,EFO:0030004,unknown,HsapDv:0000050,F45,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383
FCAImmP7579224-CTAATGGCACTGTGTA-panfetal,51305.0,5492,FCAImmP7579224,0.0464672073721885,0.176470588235294,False,,SK,CD45P,12.0,...,3pv2_5pv1_5pv2,EFO:0030004,unknown,HsapDv:0000050,F45,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTCACGGAGCCCTAAT-WSSS_F_LNG8713184-lung,9581.0,3120,,,0.3305785123966941,,,,,,...,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384
ACGGGCTGTTCAGCGC-WSSS_F_LNG8713185-lung,2706.0,1293,,,0.3942307692307692,,,,,,...,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384
CATCGAACATTAACCG-WSSS_F_LNG8713185-lung,7625.0,2721,,,0.3305785123966941,,,,,,...,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384
CTCAGAAGTACTCTCC-WSSS_F_LNG8713185-lung,13147.0,3708,,,0.24516129032258052,,,,,,...,3pv2_5pv1_5pv2,EFO:0011025,CL:0000094,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384


#### **suspension_type**

In [140]:
adata.obs

Unnamed: 0,n_counts,n_genes,file,mito,doublet_scores,predicted_doublets,old_annotation_uniform,organ,Sort_id,age,...,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id
FCAImmP7579224-ATTACTCTCGATGAGG-panfetal,61563.0,6117,FCAImmP7579224,0.0363205187022686,0.0872210953346855,False,,SK,CD45P,12.0,...,3pv2_5pv1_5pv2,EFO:0030004,unknown,HsapDv:0000050,F45,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383
FCAImmP7579224-CAGCCGAGTACATCCA-panfetal,61511.0,6658,FCAImmP7579224,0.0519094131886959,0.1253196930946291,False,,SK,CD45P,12.0,...,3pv2_5pv1_5pv2,EFO:0030004,unknown,HsapDv:0000050,F45,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383
FCAImmP7579224-TGCTACCTCATGTAGC-panfetal,54545.0,6485,FCAImmP7579224,0.0452837124466896,0.1640866873065015,False,,SK,CD45P,12.0,...,3pv2_5pv1_5pv2,EFO:0030004,unknown,HsapDv:0000050,F45,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383
FCAImmP7579224-ACGGCCACAAGCTGAG-panfetal,51992.0,5768,FCAImmP7579224,0.0400830879807472,0.0924369747899159,False,,SK,CD45P,12.0,...,3pv2_5pv1_5pv2,EFO:0030004,unknown,HsapDv:0000050,F45,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383
FCAImmP7579224-CTAATGGCACTGTGTA-panfetal,51305.0,5492,FCAImmP7579224,0.0464672073721885,0.176470588235294,False,,SK,CD45P,12.0,...,3pv2_5pv1_5pv2,EFO:0030004,unknown,HsapDv:0000050,F45,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTCACGGAGCCCTAAT-WSSS_F_LNG8713184-lung,9581.0,3120,,,0.3305785123966941,,,,,,...,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384
ACGGGCTGTTCAGCGC-WSSS_F_LNG8713185-lung,2706.0,1293,,,0.3942307692307692,,,,,,...,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384
CATCGAACATTAACCG-WSSS_F_LNG8713185-lung,7625.0,2721,,,0.3305785123966941,,,,,,...,3pv2_5pv1_5pv2,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384
CTCAGAAGTACTCTCC-WSSS_F_LNG8713185-lung,13147.0,3708,,,0.24516129032258052,,,,,,...,3pv2_5pv1_5pv2,EFO:0011025,CL:0000094,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384


In [141]:
adata.obs['suspension_type'] = ['cell'] * len(adata.obs)

In [142]:
# change data type of column

In [143]:
adata.obs['suspension_type'] = adata.obs['suspension_type'].astype('category')

In [144]:
# view obs

In [145]:
adata.obs

Unnamed: 0,n_counts,n_genes,file,mito,doublet_scores,predicted_doublets,old_annotation_uniform,organ,Sort_id,age,...,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type
FCAImmP7579224-ATTACTCTCGATGAGG-panfetal,61563.0,6117,FCAImmP7579224,0.0363205187022686,0.0872210953346855,False,,SK,CD45P,12.0,...,EFO:0030004,unknown,HsapDv:0000050,F45,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell
FCAImmP7579224-CAGCCGAGTACATCCA-panfetal,61511.0,6658,FCAImmP7579224,0.0519094131886959,0.1253196930946291,False,,SK,CD45P,12.0,...,EFO:0030004,unknown,HsapDv:0000050,F45,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell
FCAImmP7579224-TGCTACCTCATGTAGC-panfetal,54545.0,6485,FCAImmP7579224,0.0452837124466896,0.1640866873065015,False,,SK,CD45P,12.0,...,EFO:0030004,unknown,HsapDv:0000050,F45,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell
FCAImmP7579224-ACGGCCACAAGCTGAG-panfetal,51992.0,5768,FCAImmP7579224,0.0400830879807472,0.0924369747899159,False,,SK,CD45P,12.0,...,EFO:0030004,unknown,HsapDv:0000050,F45,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell
FCAImmP7579224-CTAATGGCACTGTGTA-panfetal,51305.0,5492,FCAImmP7579224,0.0464672073721885,0.176470588235294,False,,SK,CD45P,12.0,...,EFO:0030004,unknown,HsapDv:0000050,F45,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTCACGGAGCCCTAAT-WSSS_F_LNG8713184-lung,9581.0,3120,,,0.3305785123966941,,,,,,...,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell
ACGGGCTGTTCAGCGC-WSSS_F_LNG8713185-lung,2706.0,1293,,,0.3942307692307692,,,,,,...,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell
CATCGAACATTAACCG-WSSS_F_LNG8713185-lung,7625.0,2721,,,0.3305785123966941,,,,,,...,EFO:0011025,CL:0000097,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell
CTCAGAAGTACTCTCC-WSSS_F_LNG8713185-lung,13147.0,3708,,,0.24516129032258052,,,,,,...,EFO:0011025,CL:0000094,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell


#### **tissue_type**

In [146]:
adata.obs['tissue_type'] = ['tissue'] * len(adata.obs)

In [147]:
adata.obs['tissue_type'] = adata.obs['tissue_type'].astype('category')

#### **tissue_ontology_term_id**

In [148]:
# identify the column in adata.obs which corresponds to tissue

In [149]:
adata.obs.columns

Index(['n_counts', 'n_genes', 'file', 'mito', 'doublet_scores',
       'predicted_doublets', 'old_annotation_uniform', 'organ', 'Sort_id',
       'age', 'method', 'donor', 'sex', 'Sample', 'scvi_clusters',
       'is_maternal_contaminant', 'anno_lvl_2_final_clean',
       'celltype_annotation', 'batch', 'status', 'assignment', 'stage',
       'percent_mito', 'bh_pval', 'leiden', 'phase', 'S_score', 'G2M_score',
       'project', 'domain', 'gender', 'leiden_R', 'celltype', 'status_summary',
       'productive_summary', 'isotype_summary', 'receptor_type',
       'receptor_subtype', 'temp', 'dataset', 'predicted_anno',
       'predicted_anno_probability', 'predicted_anno_prob', 'barcodes',
       'assay', 'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'donor_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'suspensio

In [150]:
list(adata.obs['receptor_type'].unique())

['nan', 'TCR', 'BCR']

In [151]:
# add 'tissue_ontology_term_id' column

In [152]:
adata.obs['tissue_ontology_term_id'] = ['UBERON:0002048'] * len(adata.obs)

In [153]:
# change data type of column

In [154]:
adata.obs['tissue_ontology_term_id'] = adata.obs['tissue_ontology_term_id'].astype('category')

In [155]:
#list the unique values in 'tissue_ontology_term_id' column

In [156]:
list(adata.obs['tissue_ontology_term_id'].unique())

['UBERON:0002048']

In [157]:
# view obs

In [158]:
adata.obs

Unnamed: 0,n_counts,n_genes,file,mito,doublet_scores,predicted_doublets,old_annotation_uniform,organ,Sort_id,age,...,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_type,tissue_ontology_term_id
FCAImmP7579224-ATTACTCTCGATGAGG-panfetal,61563.0,6117,FCAImmP7579224,0.0363205187022686,0.0872210953346855,False,,SK,CD45P,12.0,...,HsapDv:0000050,F45,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,tissue,UBERON:0002048
FCAImmP7579224-CAGCCGAGTACATCCA-panfetal,61511.0,6658,FCAImmP7579224,0.0519094131886959,0.1253196930946291,False,,SK,CD45P,12.0,...,HsapDv:0000050,F45,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,tissue,UBERON:0002048
FCAImmP7579224-TGCTACCTCATGTAGC-panfetal,54545.0,6485,FCAImmP7579224,0.0452837124466896,0.1640866873065015,False,,SK,CD45P,12.0,...,HsapDv:0000050,F45,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,tissue,UBERON:0002048
FCAImmP7579224-ACGGCCACAAGCTGAG-panfetal,51992.0,5768,FCAImmP7579224,0.0400830879807472,0.0924369747899159,False,,SK,CD45P,12.0,...,HsapDv:0000050,F45,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,tissue,UBERON:0002048
FCAImmP7579224-CTAATGGCACTGTGTA-panfetal,51305.0,5492,FCAImmP7579224,0.0464672073721885,0.176470588235294,False,,SK,CD45P,12.0,...,HsapDv:0000050,F45,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,tissue,UBERON:0002048
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTCACGGAGCCCTAAT-WSSS_F_LNG8713184-lung,9581.0,3120,,,0.3305785123966941,,,,,,...,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0002048
ACGGGCTGTTCAGCGC-WSSS_F_LNG8713185-lung,2706.0,1293,,,0.3942307692307692,,,,,,...,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0002048
CATCGAACATTAACCG-WSSS_F_LNG8713185-lung,7625.0,2721,,,0.3305785123966941,,,,,,...,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0002048
CTCAGAAGTACTCTCC-WSSS_F_LNG8713185-lung,13147.0,3708,,,0.24516129032258052,,,,,,...,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0002048


In [159]:
adata.obs.columns

Index(['n_counts', 'n_genes', 'file', 'mito', 'doublet_scores',
       'predicted_doublets', 'old_annotation_uniform', 'organ', 'Sort_id',
       'age', 'method', 'donor', 'sex', 'Sample', 'scvi_clusters',
       'is_maternal_contaminant', 'anno_lvl_2_final_clean',
       'celltype_annotation', 'batch', 'status', 'assignment', 'stage',
       'percent_mito', 'bh_pval', 'leiden', 'phase', 'S_score', 'G2M_score',
       'project', 'domain', 'gender', 'leiden_R', 'celltype', 'status_summary',
       'productive_summary', 'isotype_summary', 'receptor_type',
       'receptor_subtype', 'temp', 'dataset', 'predicted_anno',
       'predicted_anno_probability', 'predicted_anno_prob', 'barcodes',
       'assay', 'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'donor_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'suspensio

#### **obsm (Embeddings)**

In [160]:
# view obsm

In [161]:
# check whether all columns are prefixed with X

In [162]:
adata.obsm

AxisArrays with keys: X_bbknn_umap, X_pca, X_scvi, X_umap, X_umap_original

#### **uns (Dataset Metadata)**

In [163]:
# View

In [164]:
adata.uns

{'celltype_annotation_colors': array(['#ffff00', '#1ce6ff', '#ff34ff', '#ff4a46', '#008941', '#006fa6',
        '#a30059', '#ffdbe5', '#7a4900', '#0000a6', '#63ffac', '#b79762',
        '#004d43', '#8fb0ff', '#997d87', '#5a0007', '#809693', '#6a3a4c',
        '#1b4400', '#4fc601', '#3b5dff', '#4a3b53', '#ff2f80', '#61615a',
        '#ba0900', '#6b7900', '#00c2a0', '#ffaa92', '#ff90c9', '#b903aa',
        '#d16100', '#ddefff', '#000035', '#7b4f4b', '#a1c299', '#300018',
        '#0aa6d8', '#013349', '#00846f', '#372101', '#ffb500', '#c2ffed',
        '#a079bf', '#cc0744', '#c0b9b2', '#c2ff99', '#001e09', '#00489c',
        '#6f0062', '#0cbd66', '#eec3ff', '#456d75', '#b77b68', '#7a87a1',
        '#788d66', '#885578', '#fad09f', '#ff8a9a', '#d157a0', '#bec459',
        '#456648', '#0086ed', '#886f4c', '#34362d', '#b4a8bd', '#00a6aa',
        '#452c2c', '#636375', '#a3c8c9', '#ff913f', '#938a81', '#575329',
        '#00fecf', '#b05b6f', '#8cd0ff', '#3b9700', '#04f757', '#c8a1a1',
        

In [165]:
adata.uns.keys

<function dict.keys>

In [166]:
# Give a title for the dataset

In [167]:
adata.uns['title'] = 'Fetal lung + Pan-fetal immune'

In [168]:
# Set the default embedding

In [169]:
adata.uns['default_embedding'] = 'X_umap'

### **Final check**

In [170]:
# view anndata object

In [171]:
adata

AnnData object with n_obs × n_vars = 670749 × 24831
    obs: 'n_counts', 'n_genes', 'file', 'mito', 'doublet_scores', 'predicted_doublets', 'old_annotation_uniform', 'organ', 'Sort_id', 'age', 'method', 'donor', 'sex', 'Sample', 'scvi_clusters', 'is_maternal_contaminant', 'anno_lvl_2_final_clean', 'celltype_annotation', 'batch', 'status', 'assignment', 'stage', 'percent_mito', 'bh_pval', 'leiden', 'phase', 'S_score', 'G2M_score', 'project', 'domain', 'gender', 'leiden_R', 'celltype', 'status_summary', 'productive_summary', 'isotype_summary', 'receptor_type', 'receptor_subtype', 'temp', 'dataset', 'predicted_anno', 'predicted_anno_probability', 'predicted_anno_prob', 'barcodes', 'assay', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'donor_id', 'disease_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id', 'suspension_type', 'tissue_type', 'tissue_ontology_ter

In [172]:
# view obs and var data types

In [173]:
adata.obs.dtypes

n_counts                                    category
n_genes                                     category
file                                        category
mito                                        category
doublet_scores                              category
predicted_doublets                          category
old_annotation_uniform                      category
organ                                       category
Sort_id                                     category
age                                         category
method                                      category
donor                                       category
sex                                         category
Sample                                      category
scvi_clusters                               category
is_maternal_contaminant                     category
anno_lvl_2_final_clean                      category
celltype_annotation                         category
batch                                       ca

In [174]:
dty = pd.DataFrame(adata.var.dtypes, columns = ['dtype'])
for c in dty[dty['dtype'] == 'float64'].index.values:
    adata.var[c] = adata.var[c].astype('float32')
    print(f"changed {c} from float64 to float32")
for c in dty[dty['dtype'] == 'int64'].index.values:
    adata.var[c] = adata.var[c].astype('int32') 
    print(f"changed {c} from int64 to int32")

changed means from float64 to float32
changed dispersions from float64 to float32


In [175]:
dty = pd.DataFrame(adata.obs.dtypes, columns = ['dtype'])
for c in dty[dty['dtype'] == 'float64'].index.values:
    adata.obs[c] = adata.obs[c].astype('float32')
    print(f"changed {c} from float64 to float32")
for c in dty[dty['dtype'] == 'int64'].index.values:
    adata.obs[c] = adata.obs[c].astype('int32') 
    print(f"changed {c} from int64 to int32")
for c in dty[dty['dtype'] == 'object'].index.values:
    adata.obs[c] = adata.obs[c].astype('category') 
    print(f"changed {c} from object to category")

changed predicted_anno_probability from float64 to float32
changed predicted_anno_prob from float64 to float32
changed barcodes from object to category
changed assay from object to category


In [176]:
# view obs

In [177]:
adata.obs

Unnamed: 0,n_counts,n_genes,file,mito,doublet_scores,predicted_doublets,old_annotation_uniform,organ,Sort_id,age,...,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_type,tissue_ontology_term_id
FCAImmP7579224-ATTACTCTCGATGAGG-panfetal,61563.0,6117,FCAImmP7579224,0.0363205187022686,0.0872210953346855,False,,SK,CD45P,12.0,...,HsapDv:0000050,F45,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,tissue,UBERON:0002048
FCAImmP7579224-CAGCCGAGTACATCCA-panfetal,61511.0,6658,FCAImmP7579224,0.0519094131886959,0.1253196930946291,False,,SK,CD45P,12.0,...,HsapDv:0000050,F45,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,tissue,UBERON:0002048
FCAImmP7579224-TGCTACCTCATGTAGC-panfetal,54545.0,6485,FCAImmP7579224,0.0452837124466896,0.1640866873065015,False,,SK,CD45P,12.0,...,HsapDv:0000050,F45,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,tissue,UBERON:0002048
FCAImmP7579224-ACGGCCACAAGCTGAG-panfetal,51992.0,5768,FCAImmP7579224,0.0400830879807472,0.0924369747899159,False,,SK,CD45P,12.0,...,HsapDv:0000050,F45,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,tissue,UBERON:0002048
FCAImmP7579224-CTAATGGCACTGTGTA-panfetal,51305.0,5492,FCAImmP7579224,0.0464672073721885,0.176470588235294,False,,SK,CD45P,12.0,...,HsapDv:0000050,F45,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,tissue,UBERON:0002048
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTCACGGAGCCCTAAT-WSSS_F_LNG8713184-lung,9581.0,3120,,,0.3305785123966941,,,,,,...,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0002048
ACGGGCTGTTCAGCGC-WSSS_F_LNG8713185-lung,2706.0,1293,,,0.3942307692307692,,,,,,...,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0002048
CATCGAACATTAACCG-WSSS_F_LNG8713185-lung,7625.0,2721,,,0.3305785123966941,,,,,,...,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0002048
CTCAGAAGTACTCTCC-WSSS_F_LNG8713185-lung,13147.0,3708,,,0.24516129032258052,,,,,,...,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0002048


In [178]:
adata.obs.columns

Index(['n_counts', 'n_genes', 'file', 'mito', 'doublet_scores',
       'predicted_doublets', 'old_annotation_uniform', 'organ', 'Sort_id',
       'age', 'method', 'donor', 'sex', 'Sample', 'scvi_clusters',
       'is_maternal_contaminant', 'anno_lvl_2_final_clean',
       'celltype_annotation', 'batch', 'status', 'assignment', 'stage',
       'percent_mito', 'bh_pval', 'leiden', 'phase', 'S_score', 'G2M_score',
       'project', 'domain', 'gender', 'leiden_R', 'celltype', 'status_summary',
       'productive_summary', 'isotype_summary', 'receptor_type',
       'receptor_subtype', 'temp', 'dataset', 'predicted_anno',
       'predicted_anno_probability', 'predicted_anno_prob', 'barcodes',
       'assay', 'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'donor_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'suspensio

In [179]:
# delete unwanted columns in obs

In [180]:
del adata.obs['gender']
del adata.obs['barcodes']
del adata.obs['assay']
del adata.obs['sex']
del adata.obs['stage']
del adata.obs['age']
del adata.obs['organ']
del adata.obs['method']

In [181]:
# view obs

In [182]:
adata.obs

Unnamed: 0,n_counts,n_genes,file,mito,doublet_scores,predicted_doublets,old_annotation_uniform,Sort_id,donor,Sample,...,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_type,tissue_ontology_term_id
FCAImmP7579224-ATTACTCTCGATGAGG-panfetal,61563.0,6117,FCAImmP7579224,0.0363205187022686,0.0872210953346855,False,,CD45P,F45,F45_SK_CD45P_FCAImmP7579224,...,HsapDv:0000050,F45,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,tissue,UBERON:0002048
FCAImmP7579224-CAGCCGAGTACATCCA-panfetal,61511.0,6658,FCAImmP7579224,0.0519094131886959,0.1253196930946291,False,,CD45P,F45,F45_SK_CD45P_FCAImmP7579224,...,HsapDv:0000050,F45,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,tissue,UBERON:0002048
FCAImmP7579224-TGCTACCTCATGTAGC-panfetal,54545.0,6485,FCAImmP7579224,0.0452837124466896,0.1640866873065015,False,,CD45P,F45,F45_SK_CD45P_FCAImmP7579224,...,HsapDv:0000050,F45,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,tissue,UBERON:0002048
FCAImmP7579224-ACGGCCACAAGCTGAG-panfetal,51992.0,5768,FCAImmP7579224,0.0400830879807472,0.0924369747899159,False,,CD45P,F45,F45_SK_CD45P_FCAImmP7579224,...,HsapDv:0000050,F45,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,tissue,UBERON:0002048
FCAImmP7579224-CTAATGGCACTGTGTA-panfetal,51305.0,5492,FCAImmP7579224,0.0464672073721885,0.176470588235294,False,,CD45P,F45,F45_SK_CD45P_FCAImmP7579224,...,HsapDv:0000050,F45,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,tissue,UBERON:0002048
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTCACGGAGCCCTAAT-WSSS_F_LNG8713184-lung,9581.0,3120,,,0.3305785123966941,,,,HDBR15233,,...,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0002048
ACGGGCTGTTCAGCGC-WSSS_F_LNG8713185-lung,2706.0,1293,,,0.3942307692307692,,,,HDBR15233,,...,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0002048
CATCGAACATTAACCG-WSSS_F_LNG8713185-lung,7625.0,2721,,,0.3305785123966941,,,,HDBR15233,,...,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0002048
CTCAGAAGTACTCTCC-WSSS_F_LNG8713185-lung,13147.0,3708,,,0.24516129032258052,,,,HDBR15233,,...,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0002048


In [183]:
# view var

In [184]:
adata.var

Unnamed: 0,means,dispersions,dispersions_norm,scvi_model_var,feature_is_filtered
ENSG00000000003,0.569293,9.358765,9.910923,False,False
ENSG00000000005,0.017808,3.547950,0.680063,True,False
ENSG00000000419,1.664187,2.988196,-0.571544,False,False
ENSG00000000457,0.539670,3.081378,-0.309733,False,False
ENSG00000000460,0.627239,2.894128,-0.614609,False,False
...,...,...,...,...,...
ENSG00000285454,0.022275,3.140733,-0.112168,False,False
ENSG00000285458,0.051582,3.025712,-0.335938,False,False
ENSG00000285486,0.028079,3.184949,-0.026146,True,False
ENSG00000285492,0.055600,3.359548,0.313531,False,False


In [185]:
araw.var

Unnamed: 0,means,dispersions,dispersions_norm,scvi_model_var
ENSG00000000003,0.569293,9.358764,9.910923,False
ENSG00000000005,0.017808,3.547950,0.680063,True
ENSG00000000419,1.664187,2.988196,-0.571544,False
ENSG00000000457,0.539670,3.081378,-0.309733,False
ENSG00000000460,0.627239,2.894128,-0.614609,False
...,...,...,...,...
ENSG00000285454,0.022275,3.140733,-0.112168,False
ENSG00000285458,0.051582,3.025712,-0.335938,False
ENSG00000285486,0.028079,3.184949,-0.026146,True
ENSG00000285492,0.055600,3.359548,0.313531,False


In [186]:
#view uns

In [187]:
adata.uns

{'celltype_annotation_colors': array(['#ffff00', '#1ce6ff', '#ff34ff', '#ff4a46', '#008941', '#006fa6',
        '#a30059', '#ffdbe5', '#7a4900', '#0000a6', '#63ffac', '#b79762',
        '#004d43', '#8fb0ff', '#997d87', '#5a0007', '#809693', '#6a3a4c',
        '#1b4400', '#4fc601', '#3b5dff', '#4a3b53', '#ff2f80', '#61615a',
        '#ba0900', '#6b7900', '#00c2a0', '#ffaa92', '#ff90c9', '#b903aa',
        '#d16100', '#ddefff', '#000035', '#7b4f4b', '#a1c299', '#300018',
        '#0aa6d8', '#013349', '#00846f', '#372101', '#ffb500', '#c2ffed',
        '#a079bf', '#cc0744', '#c0b9b2', '#c2ff99', '#001e09', '#00489c',
        '#6f0062', '#0cbd66', '#eec3ff', '#456d75', '#b77b68', '#7a87a1',
        '#788d66', '#885578', '#fad09f', '#ff8a9a', '#d157a0', '#bec459',
        '#456648', '#0086ed', '#886f4c', '#34362d', '#b4a8bd', '#00a6aa',
        '#452c2c', '#636375', '#a3c8c9', '#ff913f', '#938a81', '#575329',
        '#00fecf', '#b05b6f', '#8cd0ff', '#3b9700', '#04f757', '#c8a1a1',
        

In [188]:
list(adata.uns.keys())

['celltype_annotation_colors',
 'celltype_colors',
 'dataset_colors',
 'hvg',
 'leiden',
 'neighbors',
 'scvi',
 'umap',
 'title',
 'default_embedding']

In [189]:
adata.obs.columns

Index(['n_counts', 'n_genes', 'file', 'mito', 'doublet_scores',
       'predicted_doublets', 'old_annotation_uniform', 'Sort_id', 'donor',
       'Sample', 'scvi_clusters', 'is_maternal_contaminant',
       'anno_lvl_2_final_clean', 'celltype_annotation', 'batch', 'status',
       'assignment', 'percent_mito', 'bh_pval', 'leiden', 'phase', 'S_score',
       'G2M_score', 'project', 'domain', 'leiden_R', 'celltype',
       'status_summary', 'productive_summary', 'isotype_summary',
       'receptor_type', 'receptor_subtype', 'temp', 'dataset',
       'predicted_anno', 'predicted_anno_probability', 'predicted_anno_prob',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'donor_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'suspension_type', 'tissue_type',
       'tissue_ontology_term_id'],
      dtype='object')

In [190]:
# Remove unwanted columns in uns

In [191]:
#check the format of expression matrix

In [192]:
adata.X

<670749x24831 sparse matrix of type '<class 'numpy.float32'>'
	with 1511093218 stored elements in Compressed Sparse Row format>

In [193]:
araw.X

<670749x24831 sparse matrix of type '<class 'numpy.float32'>'
	with 1511093218 stored elements in Compressed Sparse Row format>

In [194]:
#Copy raw counts to adata.raw

In [195]:
adata.raw = araw

In [196]:
obs_dtype = adata.obs.dtypes

In [197]:
obs_dtype

n_counts                                    category
n_genes                                     category
file                                        category
mito                                        category
doublet_scores                              category
predicted_doublets                          category
old_annotation_uniform                      category
Sort_id                                     category
donor                                       category
Sample                                      category
scvi_clusters                               category
is_maternal_contaminant                     category
anno_lvl_2_final_clean                      category
celltype_annotation                         category
batch                                       category
status                                      category
assignment                                  category
percent_mito                                category
bh_pval                                     ca

In [198]:
del adata.obs['donor']

In [199]:
adata.write('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/early_human_lung_immune/Final_objects/to_upload/scrnaseq/fetal_lung_pan_lung.h5ad', compression = 'gzip')

In [200]:
adata.obs

Unnamed: 0,n_counts,n_genes,file,mito,doublet_scores,predicted_doublets,old_annotation_uniform,Sort_id,Sample,scvi_clusters,...,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_type,tissue_ontology_term_id
FCAImmP7579224-ATTACTCTCGATGAGG-panfetal,61563.0,6117,FCAImmP7579224,0.0363205187022686,0.0872210953346855,False,,CD45P,F45_SK_CD45P_FCAImmP7579224,10,...,HsapDv:0000050,F45,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,tissue,UBERON:0002048
FCAImmP7579224-CAGCCGAGTACATCCA-panfetal,61511.0,6658,FCAImmP7579224,0.0519094131886959,0.1253196930946291,False,,CD45P,F45_SK_CD45P_FCAImmP7579224,4,...,HsapDv:0000050,F45,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,tissue,UBERON:0002048
FCAImmP7579224-TGCTACCTCATGTAGC-panfetal,54545.0,6485,FCAImmP7579224,0.0452837124466896,0.1640866873065015,False,,CD45P,F45_SK_CD45P_FCAImmP7579224,10,...,HsapDv:0000050,F45,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,tissue,UBERON:0002048
FCAImmP7579224-ACGGCCACAAGCTGAG-panfetal,51992.0,5768,FCAImmP7579224,0.0400830879807472,0.0924369747899159,False,,CD45P,F45_SK_CD45P_FCAImmP7579224,12,...,HsapDv:0000050,F45,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,tissue,UBERON:0002048
FCAImmP7579224-CTAATGGCACTGTGTA-panfetal,51305.0,5492,FCAImmP7579224,0.0464672073721885,0.176470588235294,False,,CD45P,F45_SK_CD45P_FCAImmP7579224,19,...,HsapDv:0000050,F45,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,tissue,UBERON:0002048
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTCACGGAGCCCTAAT-WSSS_F_LNG8713184-lung,9581.0,3120,,,0.3305785123966941,,,,,,...,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0002048
ACGGGCTGTTCAGCGC-WSSS_F_LNG8713185-lung,2706.0,1293,,,0.3942307692307692,,,,,,...,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0002048
CATCGAACATTAACCG-WSSS_F_LNG8713185-lung,7625.0,2721,,,0.3305785123966941,,,,,,...,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0002048
CTCAGAAGTACTCTCC-WSSS_F_LNG8713185-lung,13147.0,3708,,,0.24516129032258052,,,,,,...,HsapDv:0000052,HDBR15233,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0002048


In [201]:
adata.obs.columns

Index(['n_counts', 'n_genes', 'file', 'mito', 'doublet_scores',
       'predicted_doublets', 'old_annotation_uniform', 'Sort_id', 'Sample',
       'scvi_clusters', 'is_maternal_contaminant', 'anno_lvl_2_final_clean',
       'celltype_annotation', 'batch', 'status', 'assignment', 'percent_mito',
       'bh_pval', 'leiden', 'phase', 'S_score', 'G2M_score', 'project',
       'domain', 'leiden_R', 'celltype', 'status_summary',
       'productive_summary', 'isotype_summary', 'receptor_type',
       'receptor_subtype', 'temp', 'dataset', 'predicted_anno',
       'predicted_anno_probability', 'predicted_anno_prob',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'donor_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'suspension_type', 'tissue_type',
       'tissue_ontology_term_id'],
      dtype='object')

In [202]:
adata.raw.var

Unnamed: 0,means,dispersions,dispersions_norm,scvi_model_var
ENSG00000000003,0.569293,9.358764,9.910923,False
ENSG00000000005,0.017808,3.547950,0.680063,True
ENSG00000000419,1.664187,2.988196,-0.571544,False
ENSG00000000457,0.539670,3.081378,-0.309733,False
ENSG00000000460,0.627239,2.894128,-0.614609,False
...,...,...,...,...
ENSG00000285454,0.022275,3.140733,-0.112168,False
ENSG00000285458,0.051582,3.025712,-0.335938,False
ENSG00000285486,0.028079,3.184949,-0.026146,True
ENSG00000285492,0.055600,3.359548,0.313531,False


In [203]:
adata.var

Unnamed: 0,means,dispersions,dispersions_norm,scvi_model_var,feature_is_filtered
ENSG00000000003,0.569293,9.358765,9.910923,False,False
ENSG00000000005,0.017808,3.547950,0.680063,True,False
ENSG00000000419,1.664187,2.988196,-0.571544,False,False
ENSG00000000457,0.539670,3.081378,-0.309733,False,False
ENSG00000000460,0.627239,2.894128,-0.614609,False,False
...,...,...,...,...,...
ENSG00000285454,0.022275,3.140733,-0.112168,False,False
ENSG00000285458,0.051582,3.025712,-0.335938,False,False
ENSG00000285486,0.028079,3.184949,-0.026146,True,False
ENSG00000285492,0.055600,3.359548,0.313531,False,False
