# **Curating `tissue-immune` dataset:**

### T/ILC dataset

# <ins>STEPS:</ins>

## 1. Load the dataset 

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import scipy

---

## 2. Pre-prepare before schema adjustments

### Load raw matrix

In [2]:
adata = sc.read('/lustre/scratch117/cellgen/cellgeni/cxgportal_sets/tissueimmune/CountAdded_PIP_T_object_for_cellxgene.h5ad')

In [3]:
adata

AnnData object with n_obs × n_vars = 216611 × 36601
    obs: 'Organ', 'Donor', 'Chemistry', 'Predicted_labels_CellTypist', 'Majority_voting_CellTypist', 'Manually_curated_celltype', 'Sex', 'Age_range'
    uns: 'Age_range_colors', 'Sex_colors'
    obsm: 'X_umap'
    layers: 'counts'

In [4]:
adata.obs

Unnamed: 0,Organ,Donor,Chemistry,Predicted_labels_CellTypist,Majority_voting_CellTypist,Manually_curated_celltype,Sex,Age_range
Pan_T7935490_AAACCTGCAAATTGCC,ILE,A29,5v1,Type 17 helper T cells,Type 17 helper T cells,Trm_Th1/Th17,Female,65-70
Pan_T7935490_AAACGGGCATCTGGTA,ILE,A29,5v1,Cytotoxic T cells,Cytotoxic T cells,Trm_gut_CD8,Female,65-70
Pan_T7935490_AAACGGGTCTTGCATT,ILE,A29,5v1,Tem/Effector cytotoxic T cells,Cytotoxic T cells,Trm_gut_CD8,Female,65-70
Pan_T7935490_AAAGCAATCATCGCTC,ILE,A29,5v1,Cytotoxic T cells,Cytotoxic T cells,Trm_gut_CD8,Female,65-70
Pan_T7935490_AAAGTAGCAGTCACTA,ILE,A29,5v1,gamma-delta T cells,gamma-delta T cells,Trm_Tgd,Female,65-70
...,...,...,...,...,...,...,...,...
CZINY-0112_TTTGGTTAGGCCACTC,JEJEPI,D496,3,MAIT cells,Cytotoxic T cells,Trm_gut_CD8,Male,55-60
CZINY-0112_TTTGGTTCACAAAGTA,SPL,D496,3,Helper T cells,Helper T cells,Tfh,Male,55-60
CZINY-0112_TTTGGTTTCCATCACC,LNG,D496,3,Cytotoxic T cells,Cytotoxic T cells,Tem/emra_CD8,Male,55-60
CZINY-0112_TTTGTTGCAGCGGTTC,SPL,D496,3,Helper T cells,Follicular helper T cells,Tfh,Male,55-60


In [5]:
adata2 = sc.read_10x_mtx('/warehouse/cellgeni/tic-1168/GSE148073/GSM4453619/output/Gene/filtered')

In [6]:
adata.var

MIR1302-2HG
FAM138A
OR4F5
AL627309.1
AL627309.3
...
AC141272.1
AC023491.2
AC007325.1
AC007325.4
AC007325.2


In [7]:
adata.var = adata2.var

In [8]:
adata.var

Unnamed: 0,gene_ids,feature_types
MIR1302-2HG,ENSG00000243485,Gene Expression
FAM138A,ENSG00000237613,Gene Expression
OR4F5,ENSG00000186092,Gene Expression
AL627309.1,ENSG00000238009,Gene Expression
AL627309.3,ENSG00000239945,Gene Expression
...,...,...
AC141272.1,ENSG00000277836,Gene Expression
AC023491.2,ENSG00000278633,Gene Expression
AC007325.1,ENSG00000276017,Gene Expression
AC007325.4,ENSG00000278817,Gene Expression


### Remove the genes that do not match in gene-ensemblID file

In [9]:
adata.var

Unnamed: 0,gene_ids,feature_types
MIR1302-2HG,ENSG00000243485,Gene Expression
FAM138A,ENSG00000237613,Gene Expression
OR4F5,ENSG00000186092,Gene Expression
AL627309.1,ENSG00000238009,Gene Expression
AL627309.3,ENSG00000239945,Gene Expression
...,...,...
AC141272.1,ENSG00000277836,Gene Expression
AC023491.2,ENSG00000278633,Gene Expression
AC007325.1,ENSG00000276017,Gene Expression
AC007325.4,ENSG00000278817,Gene Expression


In [10]:
adata.var['gene_symbols'] = adata.var_names

In [11]:
adata.var_names = adata.var['gene_ids']

In [12]:
adata.var

Unnamed: 0_level_0,gene_ids,feature_types,gene_symbols
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000243485,ENSG00000243485,Gene Expression,MIR1302-2HG
ENSG00000237613,ENSG00000237613,Gene Expression,FAM138A
ENSG00000186092,ENSG00000186092,Gene Expression,OR4F5
ENSG00000238009,ENSG00000238009,Gene Expression,AL627309.1
ENSG00000239945,ENSG00000239945,Gene Expression,AL627309.3
...,...,...,...
ENSG00000277836,ENSG00000277836,Gene Expression,AC141272.1
ENSG00000278633,ENSG00000278633,Gene Expression,AC023491.2
ENSG00000276017,ENSG00000276017,Gene Expression,AC007325.1
ENSG00000278817,ENSG00000278817,Gene Expression,AC007325.4


In [13]:
genes = pd.read_csv('/home/jovyan/genes_homo_sapiens.csv', header=None)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [14]:
adata

AnnData object with n_obs × n_vars = 216611 × 36601
    obs: 'Organ', 'Donor', 'Chemistry', 'Predicted_labels_CellTypist', 'Majority_voting_CellTypist', 'Manually_curated_celltype', 'Sex', 'Age_range'
    var: 'gene_ids', 'feature_types', 'gene_symbols'
    uns: 'Age_range_colors', 'Sex_colors'
    obsm: 'X_umap'
    layers: 'counts'

In [15]:
adata = adata[:,adata.var_names.isin(genes[0])]

  res = method(*args, **kwargs)


In [16]:
adata

View of AnnData object with n_obs × n_vars = 216611 × 36503
    obs: 'Organ', 'Donor', 'Chemistry', 'Predicted_labels_CellTypist', 'Majority_voting_CellTypist', 'Manually_curated_celltype', 'Sex', 'Age_range'
    var: 'gene_ids', 'feature_types', 'gene_symbols'
    uns: 'Age_range_colors', 'Sex_colors'
    obsm: 'X_umap'
    layers: 'counts'

In [17]:
adata.var['feature_biotype'] = ['gene'] * len(adata.var.index)

Trying to set attribute `.var` of view, copying.


In [18]:
del adata.var['feature_types']
del adata.var['gene_ids']

In [19]:
raw = adata.copy()

In [20]:
raw

AnnData object with n_obs × n_vars = 216611 × 36503
    obs: 'Organ', 'Donor', 'Chemistry', 'Predicted_labels_CellTypist', 'Majority_voting_CellTypist', 'Manually_curated_celltype', 'Sex', 'Age_range'
    var: 'gene_symbols', 'feature_biotype'
    uns: 'Age_range_colors', 'Sex_colors'
    obsm: 'X_umap'
    layers: 'counts'

In [21]:
raw.X = adata.layers['counts']

In [22]:
del raw.obs
del raw.uns
del raw.obsm
del raw.layers

In [23]:
del raw.var['gene_symbols']

In [24]:
adata.raw = raw

In [25]:
del raw

---

## 3. Apply the schema requirements:

# <ins>X

adjusted!

# <ins>obs

In [26]:
adata.obs.columns

Index(['Organ', 'Donor', 'Chemistry', 'Predicted_labels_CellTypist',
       'Majority_voting_CellTypist', 'Manually_curated_celltype', 'Sex',
       'Age_range'],
      dtype='object')

In [27]:
adata.obs

Unnamed: 0,Organ,Donor,Chemistry,Predicted_labels_CellTypist,Majority_voting_CellTypist,Manually_curated_celltype,Sex,Age_range
Pan_T7935490_AAACCTGCAAATTGCC,ILE,A29,5v1,Type 17 helper T cells,Type 17 helper T cells,Trm_Th1/Th17,Female,65-70
Pan_T7935490_AAACGGGCATCTGGTA,ILE,A29,5v1,Cytotoxic T cells,Cytotoxic T cells,Trm_gut_CD8,Female,65-70
Pan_T7935490_AAACGGGTCTTGCATT,ILE,A29,5v1,Tem/Effector cytotoxic T cells,Cytotoxic T cells,Trm_gut_CD8,Female,65-70
Pan_T7935490_AAAGCAATCATCGCTC,ILE,A29,5v1,Cytotoxic T cells,Cytotoxic T cells,Trm_gut_CD8,Female,65-70
Pan_T7935490_AAAGTAGCAGTCACTA,ILE,A29,5v1,gamma-delta T cells,gamma-delta T cells,Trm_Tgd,Female,65-70
...,...,...,...,...,...,...,...,...
CZINY-0112_TTTGGTTAGGCCACTC,JEJEPI,D496,3,MAIT cells,Cytotoxic T cells,Trm_gut_CD8,Male,55-60
CZINY-0112_TTTGGTTCACAAAGTA,SPL,D496,3,Helper T cells,Helper T cells,Tfh,Male,55-60
CZINY-0112_TTTGGTTTCCATCACC,LNG,D496,3,Cytotoxic T cells,Cytotoxic T cells,Tem/emra_CD8,Male,55-60
CZINY-0112_TTTGTTGCAGCGGTTC,SPL,D496,3,Helper T cells,Follicular helper T cells,Tfh,Male,55-60


## assay_ontology_term_id

10x 3' v2	"EFO:0009899"  
10x 3' v3	"EFO:0009922"  
10x 5' v1	"EFO:0011025"  
Smart-seq	"EFO:0008930"  
Smart-seq2	"EFO:0008931"

> cDNA libraries were prepared from single-cell suspensions following the **10x Genomics 3′ v2 protocol**, and 2 samples per lane sequenced on HiSeq400 ...

In [28]:
adata.obs['assay_ontology_term_id'] = adata.obs['Chemistry']

In [29]:
adata.obs['assay_ontology_term_id']

Pan_T7935490_AAACCTGCAAATTGCC    5v1
Pan_T7935490_AAACGGGCATCTGGTA    5v1
Pan_T7935490_AAACGGGTCTTGCATT    5v1
Pan_T7935490_AAAGCAATCATCGCTC    5v1
Pan_T7935490_AAAGTAGCAGTCACTA    5v1
                                ... 
CZINY-0112_TTTGGTTAGGCCACTC        3
CZINY-0112_TTTGGTTCACAAAGTA        3
CZINY-0112_TTTGGTTTCCATCACC        3
CZINY-0112_TTTGTTGCAGCGGTTC        3
CZINY-0112_TTTGTTGGTGCGACAA        3
Name: assay_ontology_term_id, Length: 216611, dtype: category
Categories (3, object): ['3', '5v1', '5v2']

In [30]:
mapping = {'3': 'EFO:0009922', '5v1': 'EFO:0011025', '5v2': 'EFO:0009900'}

In [31]:
adata.obs['assay_ontology_term_id'] = adata.obs['assay_ontology_term_id'].map(mapping)

In [32]:
adata.obs['assay_ontology_term_id']

Pan_T7935490_AAACCTGCAAATTGCC    EFO:0011025
Pan_T7935490_AAACGGGCATCTGGTA    EFO:0011025
Pan_T7935490_AAACGGGTCTTGCATT    EFO:0011025
Pan_T7935490_AAAGCAATCATCGCTC    EFO:0011025
Pan_T7935490_AAAGTAGCAGTCACTA    EFO:0011025
                                    ...     
CZINY-0112_TTTGGTTAGGCCACTC      EFO:0009922
CZINY-0112_TTTGGTTCACAAAGTA      EFO:0009922
CZINY-0112_TTTGGTTTCCATCACC      EFO:0009922
CZINY-0112_TTTGTTGCAGCGGTTC      EFO:0009922
CZINY-0112_TTTGTTGGTGCGACAA      EFO:0009922
Name: assay_ontology_term_id, Length: 216611, dtype: category
Categories (3, object): ['EFO:0009922', 'EFO:0011025', 'EFO:0009900']

## cell_type_ontology_term_id

In [33]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['Manually_curated_celltype']

In [34]:
adata.obs['cell_type_ontology_term_id']

Pan_T7935490_AAACCTGCAAATTGCC    Trm_Th1/Th17
Pan_T7935490_AAACGGGCATCTGGTA     Trm_gut_CD8
Pan_T7935490_AAACGGGTCTTGCATT     Trm_gut_CD8
Pan_T7935490_AAAGCAATCATCGCTC     Trm_gut_CD8
Pan_T7935490_AAAGTAGCAGTCACTA         Trm_Tgd
                                     ...     
CZINY-0112_TTTGGTTAGGCCACTC       Trm_gut_CD8
CZINY-0112_TTTGGTTCACAAAGTA               Tfh
CZINY-0112_TTTGGTTTCCATCACC      Tem/emra_CD8
CZINY-0112_TTTGTTGCAGCGGTTC               Tfh
CZINY-0112_TTTGTTGGTGCGACAA        Trm/em_CD8
Name: cell_type_ontology_term_id, Length: 216611, dtype: category
Categories (18, object): ['Cycling T&NK', 'ILC3', 'MAIT', 'NK_CD16+', ..., 'Trm/em_CD8', 'Trm_Tgd', 'Trm_Th1/Th17', 'Trm_gut_CD8']

In [35]:
list(adata.obs['cell_type_ontology_term_id'].unique())

['Trm_Th1/Th17',
 'Trm_gut_CD8',
 'Trm_Tgd',
 'T_CD4/CD8',
 'Trm/em_CD8',
 'Cycling T&NK',
 'Tregs',
 'Teffector/EM_CD4',
 'Tem/emra_CD8',
 'Tfh',
 'ILC3',
 'Tnaive/CM_CD4',
 'Tgd_CRTAM+',
 'NK_CD16+',
 'Tnaive/CM_CD8',
 'MAIT',
 'NK_CD56bright_CD16-',
 'Tnaive/CM_CD4_activated']

In [36]:
mapping = {'Trm_Th1/Th17': 'CL:0000492', # not trm (not exist in CL) but group contains th1 th17
 'Trm_gut_CD8': 'CL:0000909', # not trm (not exist in CL) but memory CD8 T
 'Trm_Tgd': 'CL:0000798', # not trm (not exist in CL) but tgd
 'T_CD4/CD8': 'CL:0000789',
 'Trm/em_CD8': 'CL:0001203', # cd8 group that contains effector and central (resident?) memory t cells
 'Cycling T&NK': 'CL:0000542', # lymphocyte (group that contains both NK and T cells)
 'Tregs': 'CL:0000815',
 'Teffector/EM_CD4': 'CL:0000905',
 'Tem/emra_CD8': 'CL:0001062',
 'Tfh': 'CL:0002038',
 'ILC3': 'CL:0001071',
 'Tnaive/CM_CD4': 'CL:0000895',
 'Tgd_CRTAM+': 'CL:0000798', # not crtam+
 'NK_CD16+': 'CL:0000939',
 'Tnaive/CM_CD8': 'CL:0000900',
 'MAIT': 'CL:0000940',
 'NK_CD56bright_CD16-': 'CL:0000938',
 'Tnaive/CM_CD4_activated': 'CL:0000895'} # ?? not sure

In [37]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_type_ontology_term_id'].map(mapping)

In [38]:
adata.obs['cell_type_ontology_term_id']

Pan_T7935490_AAACCTGCAAATTGCC    CL:0000492
Pan_T7935490_AAACGGGCATCTGGTA    CL:0000909
Pan_T7935490_AAACGGGTCTTGCATT    CL:0000909
Pan_T7935490_AAAGCAATCATCGCTC    CL:0000909
Pan_T7935490_AAAGTAGCAGTCACTA    CL:0000798
                                    ...    
CZINY-0112_TTTGGTTAGGCCACTC      CL:0000909
CZINY-0112_TTTGGTTCACAAAGTA      CL:0002038
CZINY-0112_TTTGGTTTCCATCACC      CL:0001062
CZINY-0112_TTTGTTGCAGCGGTTC      CL:0002038
CZINY-0112_TTTGTTGGTGCGACAA      CL:0001203
Name: cell_type_ontology_term_id, Length: 216611, dtype: object

## development_stage_ontology_term_id

343B: 70-75(F)  
356C: 40-45(F)  
367C: 65-70(M)  
368C: 55-60(M)  
390C: 65-70(F)

In [39]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['Age_range']

In [40]:
adata.obs['development_stage_ontology_term_id']

Pan_T7935490_AAACCTGCAAATTGCC    65-70
Pan_T7935490_AAACGGGCATCTGGTA    65-70
Pan_T7935490_AAACGGGTCTTGCATT    65-70
Pan_T7935490_AAAGCAATCATCGCTC    65-70
Pan_T7935490_AAAGTAGCAGTCACTA    65-70
                                 ...  
CZINY-0112_TTTGGTTAGGCCACTC      55-60
CZINY-0112_TTTGGTTCACAAAGTA      55-60
CZINY-0112_TTTGGTTTCCATCACC      55-60
CZINY-0112_TTTGTTGCAGCGGTTC      55-60
CZINY-0112_TTTGTTGGTGCGACAA      55-60
Name: development_stage_ontology_term_id, Length: 216611, dtype: category
Categories (5, object): ['50-54', '55-60', '60-64', '65-70', '70-74']

In [41]:
mapping = {'50-54': 'HsapDv:0000240', # 50-60
           '55-60': 'HsapDv:0000240', # 50-60
           '60-64': 'HsapDv:0000241', # 60-70
           '65-70': 'HsapDv:0000241', # 60-70
           '70-74': 'HsapDv:0000242'} # 70-80

In [42]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['development_stage_ontology_term_id'].map(mapping)

In [43]:
adata.obs['development_stage_ontology_term_id']

Pan_T7935490_AAACCTGCAAATTGCC    HsapDv:0000241
Pan_T7935490_AAACGGGCATCTGGTA    HsapDv:0000241
Pan_T7935490_AAACGGGTCTTGCATT    HsapDv:0000241
Pan_T7935490_AAAGCAATCATCGCTC    HsapDv:0000241
Pan_T7935490_AAAGTAGCAGTCACTA    HsapDv:0000241
                                      ...      
CZINY-0112_TTTGGTTAGGCCACTC      HsapDv:0000240
CZINY-0112_TTTGGTTCACAAAGTA      HsapDv:0000240
CZINY-0112_TTTGGTTTCCATCACC      HsapDv:0000240
CZINY-0112_TTTGTTGCAGCGGTTC      HsapDv:0000240
CZINY-0112_TTTGTTGGTGCGACAA      HsapDv:0000240
Name: development_stage_ontology_term_id, Length: 216611, dtype: object

## disease_ontology_term_id

"PATO:0000461" if healthy

In [44]:
adata.obs['disease_ontology_term_id'] = ["PATO:0000461"] * len(adata.obs)

In [45]:
adata.obs['disease_ontology_term_id']

Pan_T7935490_AAACCTGCAAATTGCC    PATO:0000461
Pan_T7935490_AAACGGGCATCTGGTA    PATO:0000461
Pan_T7935490_AAACGGGTCTTGCATT    PATO:0000461
Pan_T7935490_AAAGCAATCATCGCTC    PATO:0000461
Pan_T7935490_AAAGTAGCAGTCACTA    PATO:0000461
                                     ...     
CZINY-0112_TTTGGTTAGGCCACTC      PATO:0000461
CZINY-0112_TTTGGTTCACAAAGTA      PATO:0000461
CZINY-0112_TTTGGTTTCCATCACC      PATO:0000461
CZINY-0112_TTTGTTGCAGCGGTTC      PATO:0000461
CZINY-0112_TTTGTTGGTGCGACAA      PATO:0000461
Name: disease_ontology_term_id, Length: 216611, dtype: object

## ethnicity_ontology_term_id

In [46]:
adata.obs['ethnicity_ontology_term_id'] = ['unknown'] * len(adata.obs)

In [47]:
adata.obs['ethnicity_ontology_term_id']

Pan_T7935490_AAACCTGCAAATTGCC    unknown
Pan_T7935490_AAACGGGCATCTGGTA    unknown
Pan_T7935490_AAACGGGTCTTGCATT    unknown
Pan_T7935490_AAAGCAATCATCGCTC    unknown
Pan_T7935490_AAAGTAGCAGTCACTA    unknown
                                  ...   
CZINY-0112_TTTGGTTAGGCCACTC      unknown
CZINY-0112_TTTGGTTCACAAAGTA      unknown
CZINY-0112_TTTGGTTTCCATCACC      unknown
CZINY-0112_TTTGTTGCAGCGGTTC      unknown
CZINY-0112_TTTGTTGGTGCGACAA      unknown
Name: ethnicity_ontology_term_id, Length: 216611, dtype: object

## is_primary_data

In [48]:
adata.obs['is_primary_data'] = [False] * len(adata.obs)

In [49]:
adata.obs['is_primary_data']

Pan_T7935490_AAACCTGCAAATTGCC    False
Pan_T7935490_AAACGGGCATCTGGTA    False
Pan_T7935490_AAACGGGTCTTGCATT    False
Pan_T7935490_AAAGCAATCATCGCTC    False
Pan_T7935490_AAAGTAGCAGTCACTA    False
                                 ...  
CZINY-0112_TTTGGTTAGGCCACTC      False
CZINY-0112_TTTGGTTCACAAAGTA      False
CZINY-0112_TTTGGTTTCCATCACC      False
CZINY-0112_TTTGTTGCAGCGGTTC      False
CZINY-0112_TTTGTTGGTGCGACAA      False
Name: is_primary_data, Length: 216611, dtype: bool

In [50]:
adata.obs['is_primary_data'] = adata.obs['is_primary_data'].astype('bool')

## organism_ontology_term_id

In [51]:
adata.obs['organism_ontology_term_id'] = ['NCBITaxon:9606'] * len(adata.obs)

In [52]:
adata.obs['organism_ontology_term_id']

Pan_T7935490_AAACCTGCAAATTGCC    NCBITaxon:9606
Pan_T7935490_AAACGGGCATCTGGTA    NCBITaxon:9606
Pan_T7935490_AAACGGGTCTTGCATT    NCBITaxon:9606
Pan_T7935490_AAAGCAATCATCGCTC    NCBITaxon:9606
Pan_T7935490_AAAGTAGCAGTCACTA    NCBITaxon:9606
                                      ...      
CZINY-0112_TTTGGTTAGGCCACTC      NCBITaxon:9606
CZINY-0112_TTTGGTTCACAAAGTA      NCBITaxon:9606
CZINY-0112_TTTGGTTTCCATCACC      NCBITaxon:9606
CZINY-0112_TTTGTTGCAGCGGTTC      NCBITaxon:9606
CZINY-0112_TTTGTTGGTGCGACAA      NCBITaxon:9606
Name: organism_ontology_term_id, Length: 216611, dtype: object

In [53]:
adata.obs['organism_ontology_term_id'] = pd.Categorical(adata.obs['organism_ontology_term_id'])

In [54]:
adata.obs['organism_ontology_term_id']

Pan_T7935490_AAACCTGCAAATTGCC    NCBITaxon:9606
Pan_T7935490_AAACGGGCATCTGGTA    NCBITaxon:9606
Pan_T7935490_AAACGGGTCTTGCATT    NCBITaxon:9606
Pan_T7935490_AAAGCAATCATCGCTC    NCBITaxon:9606
Pan_T7935490_AAAGTAGCAGTCACTA    NCBITaxon:9606
                                      ...      
CZINY-0112_TTTGGTTAGGCCACTC      NCBITaxon:9606
CZINY-0112_TTTGGTTCACAAAGTA      NCBITaxon:9606
CZINY-0112_TTTGGTTTCCATCACC      NCBITaxon:9606
CZINY-0112_TTTGTTGCAGCGGTTC      NCBITaxon:9606
CZINY-0112_TTTGTTGGTGCGACAA      NCBITaxon:9606
Name: organism_ontology_term_id, Length: 216611, dtype: category
Categories (1, object): ['NCBITaxon:9606']

## sex_ontology_term_id

In [55]:
adata.obs['sex_ontology_term_id'] = adata.obs['Sex']

In [56]:
adata.obs['sex_ontology_term_id']

Pan_T7935490_AAACCTGCAAATTGCC    Female
Pan_T7935490_AAACGGGCATCTGGTA    Female
Pan_T7935490_AAACGGGTCTTGCATT    Female
Pan_T7935490_AAAGCAATCATCGCTC    Female
Pan_T7935490_AAAGTAGCAGTCACTA    Female
                                  ...  
CZINY-0112_TTTGGTTAGGCCACTC        Male
CZINY-0112_TTTGGTTCACAAAGTA        Male
CZINY-0112_TTTGGTTTCCATCACC        Male
CZINY-0112_TTTGTTGCAGCGGTTC        Male
CZINY-0112_TTTGTTGGTGCGACAA        Male
Name: sex_ontology_term_id, Length: 216611, dtype: category
Categories (2, object): ['Female', 'Male']

**PATO_0000384 for male  
PATO_0000383 for female**

In [57]:
mapping = {'Female': 'PATO:0000383', 'Male': 'PATO:0000384'}

In [58]:
adata.obs['sex_ontology_term_id'] = adata.obs['sex_ontology_term_id'].map(mapping)

In [59]:
adata.obs['sex_ontology_term_id']

Pan_T7935490_AAACCTGCAAATTGCC    PATO:0000383
Pan_T7935490_AAACGGGCATCTGGTA    PATO:0000383
Pan_T7935490_AAACGGGTCTTGCATT    PATO:0000383
Pan_T7935490_AAAGCAATCATCGCTC    PATO:0000383
Pan_T7935490_AAAGTAGCAGTCACTA    PATO:0000383
                                     ...     
CZINY-0112_TTTGGTTAGGCCACTC      PATO:0000384
CZINY-0112_TTTGGTTCACAAAGTA      PATO:0000384
CZINY-0112_TTTGGTTTCCATCACC      PATO:0000384
CZINY-0112_TTTGTTGCAGCGGTTC      PATO:0000384
CZINY-0112_TTTGTTGGTGCGACAA      PATO:0000384
Name: sex_ontology_term_id, Length: 216611, dtype: category
Categories (2, object): ['PATO:0000383', 'PATO:0000384']

## tissue_ontology_term_id

> All tissue dissociation protocols are available on protocols.io [40]: spleen (UBERON:0002106), esophagus epithelium (UBERON:0001976), and lung parenchyma (UBERON:0008946).

In [60]:
adata.obs['tissue_ontology_term_id'] = adata.obs['Organ']

In [61]:
adata.obs['tissue_ontology_term_id']

Pan_T7935490_AAACCTGCAAATTGCC       ILE
Pan_T7935490_AAACGGGCATCTGGTA       ILE
Pan_T7935490_AAACGGGTCTTGCATT       ILE
Pan_T7935490_AAAGCAATCATCGCTC       ILE
Pan_T7935490_AAAGTAGCAGTCACTA       ILE
                                  ...  
CZINY-0112_TTTGGTTAGGCCACTC      JEJEPI
CZINY-0112_TTTGGTTCACAAAGTA         SPL
CZINY-0112_TTTGGTTTCCATCACC         LNG
CZINY-0112_TTTGTTGCAGCGGTTC         SPL
CZINY-0112_TTTGTTGGTGCGACAA         SPL
Name: tissue_ontology_term_id, Length: 216611, dtype: category
Categories (17, object): ['BLD', 'BMA', 'CAE', 'DUO', ..., 'SKM', 'SPL', 'TCL', 'THY']

In [62]:
list(adata.obs['tissue_ontology_term_id'].unique())

['ILE',
 'LNG',
 'LLN',
 'MLN',
 'BMA',
 'SKM',
 'LIV',
 'SPL',
 'OME',
 'CAE',
 'THY',
 'DUO',
 'BLD',
 'TCL',
 'SCL',
 'JEJLP',
 'JEJEPI']

In [63]:
mapping = {'ILE': 'UBERON:0002116',
 'LNG': 'UBERON:0002048',
 'LLN': 'UBERON:0007644',
 'MLN': 'UBERON:0002509',
 'BMA': 'UBERON:0002371',
 'SKM': 'UBERON:0001134',
 'LIV': 'UBERON:0002107',
 'SPL': 'UBERON:0002106',
 'OME': 'UBERON:0003688',
 'CAE': 'UBERON:0001153',
 'THY': 'UBERON:0002370',
 'DUO': 'UBERON:0002114',
 'BLD': 'UBERON:0000178',
 'TCL': 'UBERON:0001157',
 'SCL': 'UBERON:0001159',
 'JEJLP': 'UBERON:0000030',
 'JEJEPI': 'UBERON:0000400'}

In [64]:
adata.obs['tissue_ontology_term_id'] = adata.obs['tissue_ontology_term_id'].map(mapping)

In [65]:
adata.obs['tissue_ontology_term_id']

Pan_T7935490_AAACCTGCAAATTGCC    UBERON:0002116
Pan_T7935490_AAACGGGCATCTGGTA    UBERON:0002116
Pan_T7935490_AAACGGGTCTTGCATT    UBERON:0002116
Pan_T7935490_AAAGCAATCATCGCTC    UBERON:0002116
Pan_T7935490_AAAGTAGCAGTCACTA    UBERON:0002116
                                      ...      
CZINY-0112_TTTGGTTAGGCCACTC      UBERON:0000400
CZINY-0112_TTTGGTTCACAAAGTA      UBERON:0002106
CZINY-0112_TTTGGTTTCCATCACC      UBERON:0002048
CZINY-0112_TTTGTTGCAGCGGTTC      UBERON:0002106
CZINY-0112_TTTGTTGGTGCGACAA      UBERON:0002106
Name: tissue_ontology_term_id, Length: 216611, dtype: category
Categories (17, object): ['UBERON:0000178', 'UBERON:0002371', 'UBERON:0001153', 'UBERON:0002114', ..., 'UBERON:0001134', 'UBERON:0002106', 'UBERON:0001157', 'UBERON:0002370']

---

# <ins>var and raw.var

In [66]:
adata.var

Unnamed: 0_level_0,gene_symbols,feature_biotype
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000243485,MIR1302-2HG,gene
ENSG00000237613,FAM138A,gene
ENSG00000186092,OR4F5,gene
ENSG00000238009,AL627309.1,gene
ENSG00000239945,AL627309.3,gene
...,...,...
ENSG00000277836,AC141272.1,gene
ENSG00000278633,AC023491.2,gene
ENSG00000276017,AC007325.1,gene
ENSG00000278817,AC007325.4,gene


## feature_is_filtered

In [69]:
adata.var['feature_is_filtered'] = [False] * len(adata.var)

In [70]:
adata.var['feature_is_filtered']

gene_ids
ENSG00000243485    False
ENSG00000237613    False
ENSG00000186092    False
ENSG00000238009    False
ENSG00000239945    False
                   ...  
ENSG00000277836    False
ENSG00000278633    False
ENSG00000276017    False
ENSG00000278817    False
ENSG00000277196    False
Name: feature_is_filtered, Length: 36503, dtype: bool

# <ins>obsm

In [73]:
adata.obsm

AxisArrays with keys: X_umap

# <ins>uns

In [74]:
adata.uns['schema_version'] = '2.0.0'

In [75]:
adata.uns['title'] = 'T & innate lymphoid cells'

In [76]:
adata.uns['X_normalization'] = 'Scanpy'

In [77]:
adata.uns['default_embedding'] = 'X_umap'

---

---

# 4. Final adjustments

Final adjustments include the removal of the slots that were already re-created for the schema.

In [78]:
adata

AnnData object with n_obs × n_vars = 216611 × 36503
    obs: 'Organ', 'Donor', 'Chemistry', 'Predicted_labels_CellTypist', 'Majority_voting_CellTypist', 'Manually_curated_celltype', 'Sex', 'Age_range', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id'
    var: 'gene_symbols', 'feature_biotype', 'feature_is_filtered'
    uns: 'Age_range_colors', 'Sex_colors', 'schema_version', 'title', 'X_normalization', 'default_embedding'
    obsm: 'X_umap'
    layers: 'counts'

In [79]:
adata.uns['development_stage_ontology_term_id_colors'] = adata.uns['Age_range_colors']
adata.uns['sex_ontology_term_id_colors'] = adata.uns['Sex_colors']

In [80]:
# Removing the same slots created for the schema
del adata.uns['Age_range_colors']
del adata.uns['Sex_colors']
del adata.obs['Organ'] # already in tissue_ontology_term_id
del adata.obs['Sex'] # already in sex_ontology_term_id
del adata.obs['Age_range'] # already in developmental_stage_ontology_term_id
del adata.obs['Chemistry'] # already in assay_ontology_term_id
del adata.layers

In [81]:
adata

AnnData object with n_obs × n_vars = 216611 × 36503
    obs: 'Donor', 'Predicted_labels_CellTypist', 'Majority_voting_CellTypist', 'Manually_curated_celltype', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id'
    var: 'gene_symbols', 'feature_biotype', 'feature_is_filtered'
    uns: 'schema_version', 'title', 'X_normalization', 'default_embedding', 'development_stage_ontology_term_id_colors', 'sex_ontology_term_id_colors'
    obsm: 'X_umap'

In [82]:
adata.X

<216611x36503 sparse matrix of type '<class 'numpy.float32'>'
	with 373413588 stored elements in Compressed Sparse Row format>

In [83]:
adata.raw.X

<216611x36503 sparse matrix of type '<class 'numpy.float32'>'
	with 373413588 stored elements in Compressed Sparse Row format>

In [84]:
adata.write('imm_tilc.h5ad', compression = 'gzip')

... storing 'cell_type_ontology_term_id' as categorical
... storing 'development_stage_ontology_term_id' as categorical
... storing 'disease_ontology_term_id' as categorical
... storing 'ethnicity_ontology_term_id' as categorical
... storing 'feature_biotype' as categorical
... storing 'feature_biotype' as categorical


---