# **Curating `tissue-immune` dataset:**

### Myeloid dataset

# <ins>STEPS:</ins>

## 1. Load the dataset 

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import scipy

---

## 2. Pre-prepare before schema adjustments

### Load raw matrix

In [2]:
adata = sc.read('/lustre/scratch117/cellgen/cellgeni/cxgportal_sets/tissueimmune/CountAdded_PIP_myeloid_object_for_cellxgene.h5ad')

In [3]:
adata

AnnData object with n_obs × n_vars = 51552 × 36601
    obs: 'Organ', 'Donor', 'Chemistry', 'Predicted_labels_CellTypist', 'Majority_voting_CellTypist', 'Manually_curated_celltype', 'Sex', 'Age_range'
    uns: 'Age_range_colors', 'Sex_colors'
    obsm: 'X_umap'
    layers: 'counts'

In [4]:
adata.obs

Unnamed: 0,Organ,Donor,Chemistry,Predicted_labels_CellTypist,Majority_voting_CellTypist,Manually_curated_celltype,Sex,Age_range
Pan_T7935490_AACCATGTCTCGTTTA,ILE,A29,5v1,Macrophages,Macrophages,Erythrophagocytic macrophages,Female,65-70
Pan_T7935490_CACACCTGTCGACTAT,ILE,A29,5v1,Macrophages,Macrophages,Erythrophagocytic macrophages,Female,65-70
Pan_T7935490_CGTAGGCCAATGGACG,ILE,A29,5v1,Macrophages,Macrophages,Erythrophagocytic macrophages,Female,65-70
Pan_T7935490_CGTCAGGCAGGGATTG,ILE,A29,5v1,Macrophages,Macrophages,Erythrophagocytic macrophages,Female,65-70
Pan_T7935490_CTACACCAGCGCTCCA,ILE,A29,5v1,DC2,DC2,DC2,Female,65-70
...,...,...,...,...,...,...,...,...
CZINY-0112_TTTGACTGTTCAAGGG,LNG,D496,3,Macrophages,Macrophages,Alveolar macrophages,Male,55-60
CZINY-0112_TTTGATCCAACCCTCT,LNG,D496,3,Macrophages,Macrophages,Alveolar macrophages,Male,55-60
CZINY-0112_TTTGATCGTCTTAGTG,BMA,D496,3,Classical monocytes,Classical monocytes,Classical monocytes,Male,55-60
CZINY-0112_TTTGGAGGTCAGGCAA,SPL,D496,3,Classical monocytes,Classical monocytes,Classical monocytes,Male,55-60


In [5]:
adata2 = sc.read_10x_mtx('/warehouse/cellgeni/tic-1168/GSE148073/GSM4453619/output/Gene/filtered')

In [6]:
adata.var

MIR1302-2HG
FAM138A
OR4F5
AL627309.1
AL627309.3
...
AC141272.1
AC023491.2
AC007325.1
AC007325.4
AC007325.2


In [7]:
adata.var = adata2.var

In [8]:
adata.var

Unnamed: 0,gene_ids,feature_types
MIR1302-2HG,ENSG00000243485,Gene Expression
FAM138A,ENSG00000237613,Gene Expression
OR4F5,ENSG00000186092,Gene Expression
AL627309.1,ENSG00000238009,Gene Expression
AL627309.3,ENSG00000239945,Gene Expression
...,...,...
AC141272.1,ENSG00000277836,Gene Expression
AC023491.2,ENSG00000278633,Gene Expression
AC007325.1,ENSG00000276017,Gene Expression
AC007325.4,ENSG00000278817,Gene Expression


### Remove the genes that do not match in gene-ensemblID file

In [9]:
adata.var

Unnamed: 0,gene_ids,feature_types
MIR1302-2HG,ENSG00000243485,Gene Expression
FAM138A,ENSG00000237613,Gene Expression
OR4F5,ENSG00000186092,Gene Expression
AL627309.1,ENSG00000238009,Gene Expression
AL627309.3,ENSG00000239945,Gene Expression
...,...,...
AC141272.1,ENSG00000277836,Gene Expression
AC023491.2,ENSG00000278633,Gene Expression
AC007325.1,ENSG00000276017,Gene Expression
AC007325.4,ENSG00000278817,Gene Expression


In [10]:
adata.var['gene_symbols'] = adata.var_names

In [11]:
adata.var_names = adata.var['gene_ids']

In [12]:
adata.var

Unnamed: 0_level_0,gene_ids,feature_types,gene_symbols
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000243485,ENSG00000243485,Gene Expression,MIR1302-2HG
ENSG00000237613,ENSG00000237613,Gene Expression,FAM138A
ENSG00000186092,ENSG00000186092,Gene Expression,OR4F5
ENSG00000238009,ENSG00000238009,Gene Expression,AL627309.1
ENSG00000239945,ENSG00000239945,Gene Expression,AL627309.3
...,...,...,...
ENSG00000277836,ENSG00000277836,Gene Expression,AC141272.1
ENSG00000278633,ENSG00000278633,Gene Expression,AC023491.2
ENSG00000276017,ENSG00000276017,Gene Expression,AC007325.1
ENSG00000278817,ENSG00000278817,Gene Expression,AC007325.4


In [13]:
genes = pd.read_csv('/home/jovyan/genes_homo_sapiens.csv', header=None)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [14]:
adata

AnnData object with n_obs × n_vars = 51552 × 36601
    obs: 'Organ', 'Donor', 'Chemistry', 'Predicted_labels_CellTypist', 'Majority_voting_CellTypist', 'Manually_curated_celltype', 'Sex', 'Age_range'
    var: 'gene_ids', 'feature_types', 'gene_symbols'
    uns: 'Age_range_colors', 'Sex_colors'
    obsm: 'X_umap'
    layers: 'counts'

In [15]:
adata = adata[:,adata.var_names.isin(genes[0])]

  res = method(*args, **kwargs)


In [16]:
adata

View of AnnData object with n_obs × n_vars = 51552 × 36503
    obs: 'Organ', 'Donor', 'Chemistry', 'Predicted_labels_CellTypist', 'Majority_voting_CellTypist', 'Manually_curated_celltype', 'Sex', 'Age_range'
    var: 'gene_ids', 'feature_types', 'gene_symbols'
    uns: 'Age_range_colors', 'Sex_colors'
    obsm: 'X_umap'
    layers: 'counts'

In [17]:
adata.var['feature_biotype'] = ['gene'] * len(adata.var.index)

Trying to set attribute `.var` of view, copying.


In [18]:
del adata.var['feature_types']
del adata.var['gene_ids']

In [19]:
raw = adata.copy()

In [20]:
raw

AnnData object with n_obs × n_vars = 51552 × 36503
    obs: 'Organ', 'Donor', 'Chemistry', 'Predicted_labels_CellTypist', 'Majority_voting_CellTypist', 'Manually_curated_celltype', 'Sex', 'Age_range'
    var: 'gene_symbols', 'feature_biotype'
    uns: 'Age_range_colors', 'Sex_colors'
    obsm: 'X_umap'
    layers: 'counts'

In [21]:
raw.X = adata.layers['counts']

In [22]:
del raw.obs
del raw.uns
del raw.obsm
del raw.layers

In [23]:
del raw.var['gene_symbols']

In [24]:
adata.raw = raw

In [25]:
del raw

---

## 3. Apply the schema requirements:

# <ins>X

adjusted!

# <ins>obs

In [26]:
adata.obs.columns

Index(['Organ', 'Donor', 'Chemistry', 'Predicted_labels_CellTypist',
       'Majority_voting_CellTypist', 'Manually_curated_celltype', 'Sex',
       'Age_range'],
      dtype='object')

In [27]:
adata.obs

Unnamed: 0,Organ,Donor,Chemistry,Predicted_labels_CellTypist,Majority_voting_CellTypist,Manually_curated_celltype,Sex,Age_range
Pan_T7935490_AACCATGTCTCGTTTA,ILE,A29,5v1,Macrophages,Macrophages,Erythrophagocytic macrophages,Female,65-70
Pan_T7935490_CACACCTGTCGACTAT,ILE,A29,5v1,Macrophages,Macrophages,Erythrophagocytic macrophages,Female,65-70
Pan_T7935490_CGTAGGCCAATGGACG,ILE,A29,5v1,Macrophages,Macrophages,Erythrophagocytic macrophages,Female,65-70
Pan_T7935490_CGTCAGGCAGGGATTG,ILE,A29,5v1,Macrophages,Macrophages,Erythrophagocytic macrophages,Female,65-70
Pan_T7935490_CTACACCAGCGCTCCA,ILE,A29,5v1,DC2,DC2,DC2,Female,65-70
...,...,...,...,...,...,...,...,...
CZINY-0112_TTTGACTGTTCAAGGG,LNG,D496,3,Macrophages,Macrophages,Alveolar macrophages,Male,55-60
CZINY-0112_TTTGATCCAACCCTCT,LNG,D496,3,Macrophages,Macrophages,Alveolar macrophages,Male,55-60
CZINY-0112_TTTGATCGTCTTAGTG,BMA,D496,3,Classical monocytes,Classical monocytes,Classical monocytes,Male,55-60
CZINY-0112_TTTGGAGGTCAGGCAA,SPL,D496,3,Classical monocytes,Classical monocytes,Classical monocytes,Male,55-60


## assay_ontology_term_id

10x 3' v2	"EFO:0009899"  
10x 3' v3	"EFO:0009922"  
10x 5' v1	"EFO:0011025"  
Smart-seq	"EFO:0008930"  
Smart-seq2	"EFO:0008931"

> cDNA libraries were prepared from single-cell suspensions following the **10x Genomics 3′ v2 protocol**, and 2 samples per lane sequenced on HiSeq400 ...

In [28]:
adata.obs['assay_ontology_term_id'] = adata.obs['Chemistry']

In [29]:
adata.obs['assay_ontology_term_id']

Pan_T7935490_AACCATGTCTCGTTTA    5v1
Pan_T7935490_CACACCTGTCGACTAT    5v1
Pan_T7935490_CGTAGGCCAATGGACG    5v1
Pan_T7935490_CGTCAGGCAGGGATTG    5v1
Pan_T7935490_CTACACCAGCGCTCCA    5v1
                                ... 
CZINY-0112_TTTGACTGTTCAAGGG        3
CZINY-0112_TTTGATCCAACCCTCT        3
CZINY-0112_TTTGATCGTCTTAGTG        3
CZINY-0112_TTTGGAGGTCAGGCAA        3
CZINY-0112_TTTGGAGTCGTTAGTG        3
Name: assay_ontology_term_id, Length: 51552, dtype: category
Categories (3, object): ['3', '5v1', '5v2']

In [30]:
mapping = {'3': 'EFO:0009922', '5v1': 'EFO:0011025', '5v2': 'EFO:0009900'}

In [31]:
adata.obs['assay_ontology_term_id'] = adata.obs['assay_ontology_term_id'].map(mapping)

In [32]:
adata.obs['assay_ontology_term_id']

Pan_T7935490_AACCATGTCTCGTTTA    EFO:0011025
Pan_T7935490_CACACCTGTCGACTAT    EFO:0011025
Pan_T7935490_CGTAGGCCAATGGACG    EFO:0011025
Pan_T7935490_CGTCAGGCAGGGATTG    EFO:0011025
Pan_T7935490_CTACACCAGCGCTCCA    EFO:0011025
                                    ...     
CZINY-0112_TTTGACTGTTCAAGGG      EFO:0009922
CZINY-0112_TTTGATCCAACCCTCT      EFO:0009922
CZINY-0112_TTTGATCGTCTTAGTG      EFO:0009922
CZINY-0112_TTTGGAGGTCAGGCAA      EFO:0009922
CZINY-0112_TTTGGAGTCGTTAGTG      EFO:0009922
Name: assay_ontology_term_id, Length: 51552, dtype: category
Categories (3, object): ['EFO:0009922', 'EFO:0011025', 'EFO:0009900']

## cell_type_ontology_term_id

In [33]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['Manually_curated_celltype']

In [34]:
adata.obs['cell_type_ontology_term_id']

Pan_T7935490_AACCATGTCTCGTTTA    Erythrophagocytic macrophages
Pan_T7935490_CACACCTGTCGACTAT    Erythrophagocytic macrophages
Pan_T7935490_CGTAGGCCAATGGACG    Erythrophagocytic macrophages
Pan_T7935490_CGTCAGGCAGGGATTG    Erythrophagocytic macrophages
Pan_T7935490_CTACACCAGCGCTCCA                              DC2
                                             ...              
CZINY-0112_TTTGACTGTTCAAGGG               Alveolar macrophages
CZINY-0112_TTTGATCCAACCCTCT               Alveolar macrophages
CZINY-0112_TTTGATCGTCTTAGTG                Classical monocytes
CZINY-0112_TTTGGAGGTCAGGCAA                Classical monocytes
CZINY-0112_TTTGGAGTCGTTAGTG                Classical monocytes
Name: cell_type_ontology_term_id, Length: 51552, dtype: category
Categories (11, object): ['Alveolar macrophages', 'Classical monocytes', 'Cycling', 'DC1', ..., 'Intestinal macrophages', 'MNP/T doublets', 'Nonclassical monocytes', 'migDC']

In [35]:
list(adata.obs['cell_type_ontology_term_id'].unique())

['Erythrophagocytic macrophages',
 'DC2',
 'DC1',
 'Intermediate macrophages',
 'Alveolar macrophages',
 'Classical monocytes',
 'Nonclassical monocytes',
 'Cycling',
 'migDC',
 'MNP/T doublets',
 'Intestinal macrophages']

In [36]:
mapping = {'Erythrophagocytic macrophages': 'CL:0000235', # not intermediate
 'DC2': 'CL:0000990',
 'DC1': 'CL:0000990',
 'Intermediate macrophages': 'CL:0000235', # not intermediate
 'Alveolar macrophages': 'CL:0000583',
 'Classical monocytes': 'CL:0000860',
 'Nonclassical monocytes': 'CL:0000875',
 'Cycling': 'CL:0000548', # not cycling (animal cell)
 'migDC': 'CL:0001056',
 'MNP/T doublets': 'CL:0000548',  # no doublets (animal cell)
 'Intestinal macrophages': 'CL:0000235'} # not intestinal

In [37]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_type_ontology_term_id'].map(mapping)

In [38]:
adata.obs['cell_type_ontology_term_id']

Pan_T7935490_AACCATGTCTCGTTTA    CL:0000235
Pan_T7935490_CACACCTGTCGACTAT    CL:0000235
Pan_T7935490_CGTAGGCCAATGGACG    CL:0000235
Pan_T7935490_CGTCAGGCAGGGATTG    CL:0000235
Pan_T7935490_CTACACCAGCGCTCCA    CL:0000990
                                    ...    
CZINY-0112_TTTGACTGTTCAAGGG      CL:0000583
CZINY-0112_TTTGATCCAACCCTCT      CL:0000583
CZINY-0112_TTTGATCGTCTTAGTG      CL:0000860
CZINY-0112_TTTGGAGGTCAGGCAA      CL:0000860
CZINY-0112_TTTGGAGTCGTTAGTG      CL:0000860
Name: cell_type_ontology_term_id, Length: 51552, dtype: object

## development_stage_ontology_term_id

In [39]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['Age_range']

In [40]:
adata.obs['development_stage_ontology_term_id']

Pan_T7935490_AACCATGTCTCGTTTA    65-70
Pan_T7935490_CACACCTGTCGACTAT    65-70
Pan_T7935490_CGTAGGCCAATGGACG    65-70
Pan_T7935490_CGTCAGGCAGGGATTG    65-70
Pan_T7935490_CTACACCAGCGCTCCA    65-70
                                 ...  
CZINY-0112_TTTGACTGTTCAAGGG      55-60
CZINY-0112_TTTGATCCAACCCTCT      55-60
CZINY-0112_TTTGATCGTCTTAGTG      55-60
CZINY-0112_TTTGGAGGTCAGGCAA      55-60
CZINY-0112_TTTGGAGTCGTTAGTG      55-60
Name: development_stage_ontology_term_id, Length: 51552, dtype: category
Categories (5, object): ['50-54', '55-60', '60-64', '65-70', '70-74']

In [41]:
mapping = {'50-54': 'HsapDv:0000240', # 50-60
           '55-60': 'HsapDv:0000240', # 50-60
           '60-64': 'HsapDv:0000241', # 60-70
           '65-70': 'HsapDv:0000241', # 60-70
           '70-74': 'HsapDv:0000242'} # 70-80

In [42]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['development_stage_ontology_term_id'].map(mapping)

In [43]:
adata.obs['development_stage_ontology_term_id']

Pan_T7935490_AACCATGTCTCGTTTA    HsapDv:0000241
Pan_T7935490_CACACCTGTCGACTAT    HsapDv:0000241
Pan_T7935490_CGTAGGCCAATGGACG    HsapDv:0000241
Pan_T7935490_CGTCAGGCAGGGATTG    HsapDv:0000241
Pan_T7935490_CTACACCAGCGCTCCA    HsapDv:0000241
                                      ...      
CZINY-0112_TTTGACTGTTCAAGGG      HsapDv:0000240
CZINY-0112_TTTGATCCAACCCTCT      HsapDv:0000240
CZINY-0112_TTTGATCGTCTTAGTG      HsapDv:0000240
CZINY-0112_TTTGGAGGTCAGGCAA      HsapDv:0000240
CZINY-0112_TTTGGAGTCGTTAGTG      HsapDv:0000240
Name: development_stage_ontology_term_id, Length: 51552, dtype: object

## disease_ontology_term_id

"PATO:0000461" if healthy

In [44]:
adata.obs['disease_ontology_term_id'] = ["PATO:0000461"] * len(adata.obs)

In [45]:
adata.obs['disease_ontology_term_id']

Pan_T7935490_AACCATGTCTCGTTTA    PATO:0000461
Pan_T7935490_CACACCTGTCGACTAT    PATO:0000461
Pan_T7935490_CGTAGGCCAATGGACG    PATO:0000461
Pan_T7935490_CGTCAGGCAGGGATTG    PATO:0000461
Pan_T7935490_CTACACCAGCGCTCCA    PATO:0000461
                                     ...     
CZINY-0112_TTTGACTGTTCAAGGG      PATO:0000461
CZINY-0112_TTTGATCCAACCCTCT      PATO:0000461
CZINY-0112_TTTGATCGTCTTAGTG      PATO:0000461
CZINY-0112_TTTGGAGGTCAGGCAA      PATO:0000461
CZINY-0112_TTTGGAGTCGTTAGTG      PATO:0000461
Name: disease_ontology_term_id, Length: 51552, dtype: object

## ethnicity_ontology_term_id

In [46]:
adata.obs['ethnicity_ontology_term_id'] = ['unknown'] * len(adata.obs)

In [47]:
adata.obs['ethnicity_ontology_term_id']

Pan_T7935490_AACCATGTCTCGTTTA    unknown
Pan_T7935490_CACACCTGTCGACTAT    unknown
Pan_T7935490_CGTAGGCCAATGGACG    unknown
Pan_T7935490_CGTCAGGCAGGGATTG    unknown
Pan_T7935490_CTACACCAGCGCTCCA    unknown
                                  ...   
CZINY-0112_TTTGACTGTTCAAGGG      unknown
CZINY-0112_TTTGATCCAACCCTCT      unknown
CZINY-0112_TTTGATCGTCTTAGTG      unknown
CZINY-0112_TTTGGAGGTCAGGCAA      unknown
CZINY-0112_TTTGGAGTCGTTAGTG      unknown
Name: ethnicity_ontology_term_id, Length: 51552, dtype: object

## is_primary_data

In [48]:
adata.obs['is_primary_data'] = [False] * len(adata.obs)

In [49]:
adata.obs['is_primary_data']

Pan_T7935490_AACCATGTCTCGTTTA    False
Pan_T7935490_CACACCTGTCGACTAT    False
Pan_T7935490_CGTAGGCCAATGGACG    False
Pan_T7935490_CGTCAGGCAGGGATTG    False
Pan_T7935490_CTACACCAGCGCTCCA    False
                                 ...  
CZINY-0112_TTTGACTGTTCAAGGG      False
CZINY-0112_TTTGATCCAACCCTCT      False
CZINY-0112_TTTGATCGTCTTAGTG      False
CZINY-0112_TTTGGAGGTCAGGCAA      False
CZINY-0112_TTTGGAGTCGTTAGTG      False
Name: is_primary_data, Length: 51552, dtype: bool

In [50]:
adata.obs['is_primary_data'] = adata.obs['is_primary_data'].astype('bool')

## organism_ontology_term_id

In [51]:
adata.obs['organism_ontology_term_id'] = ['NCBITaxon:9606'] * len(adata.obs)

In [52]:
adata.obs['organism_ontology_term_id']

Pan_T7935490_AACCATGTCTCGTTTA    NCBITaxon:9606
Pan_T7935490_CACACCTGTCGACTAT    NCBITaxon:9606
Pan_T7935490_CGTAGGCCAATGGACG    NCBITaxon:9606
Pan_T7935490_CGTCAGGCAGGGATTG    NCBITaxon:9606
Pan_T7935490_CTACACCAGCGCTCCA    NCBITaxon:9606
                                      ...      
CZINY-0112_TTTGACTGTTCAAGGG      NCBITaxon:9606
CZINY-0112_TTTGATCCAACCCTCT      NCBITaxon:9606
CZINY-0112_TTTGATCGTCTTAGTG      NCBITaxon:9606
CZINY-0112_TTTGGAGGTCAGGCAA      NCBITaxon:9606
CZINY-0112_TTTGGAGTCGTTAGTG      NCBITaxon:9606
Name: organism_ontology_term_id, Length: 51552, dtype: object

In [53]:
adata.obs['organism_ontology_term_id'] = pd.Categorical(adata.obs['organism_ontology_term_id'])

In [54]:
adata.obs['organism_ontology_term_id']

Pan_T7935490_AACCATGTCTCGTTTA    NCBITaxon:9606
Pan_T7935490_CACACCTGTCGACTAT    NCBITaxon:9606
Pan_T7935490_CGTAGGCCAATGGACG    NCBITaxon:9606
Pan_T7935490_CGTCAGGCAGGGATTG    NCBITaxon:9606
Pan_T7935490_CTACACCAGCGCTCCA    NCBITaxon:9606
                                      ...      
CZINY-0112_TTTGACTGTTCAAGGG      NCBITaxon:9606
CZINY-0112_TTTGATCCAACCCTCT      NCBITaxon:9606
CZINY-0112_TTTGATCGTCTTAGTG      NCBITaxon:9606
CZINY-0112_TTTGGAGGTCAGGCAA      NCBITaxon:9606
CZINY-0112_TTTGGAGTCGTTAGTG      NCBITaxon:9606
Name: organism_ontology_term_id, Length: 51552, dtype: category
Categories (1, object): ['NCBITaxon:9606']

## sex_ontology_term_id

In [55]:
adata.obs['sex_ontology_term_id'] = adata.obs['Sex']

In [56]:
adata.obs['sex_ontology_term_id']

Pan_T7935490_AACCATGTCTCGTTTA    Female
Pan_T7935490_CACACCTGTCGACTAT    Female
Pan_T7935490_CGTAGGCCAATGGACG    Female
Pan_T7935490_CGTCAGGCAGGGATTG    Female
Pan_T7935490_CTACACCAGCGCTCCA    Female
                                  ...  
CZINY-0112_TTTGACTGTTCAAGGG        Male
CZINY-0112_TTTGATCCAACCCTCT        Male
CZINY-0112_TTTGATCGTCTTAGTG        Male
CZINY-0112_TTTGGAGGTCAGGCAA        Male
CZINY-0112_TTTGGAGTCGTTAGTG        Male
Name: sex_ontology_term_id, Length: 51552, dtype: category
Categories (2, object): ['Female', 'Male']

**PATO_0000384 for male  
PATO_0000383 for female**

In [57]:
mapping = {'Female': 'PATO:0000383', 'Male': 'PATO:0000384'}

In [58]:
adata.obs['sex_ontology_term_id'] = adata.obs['sex_ontology_term_id'].map(mapping)

In [59]:
adata.obs['sex_ontology_term_id']

Pan_T7935490_AACCATGTCTCGTTTA    PATO:0000383
Pan_T7935490_CACACCTGTCGACTAT    PATO:0000383
Pan_T7935490_CGTAGGCCAATGGACG    PATO:0000383
Pan_T7935490_CGTCAGGCAGGGATTG    PATO:0000383
Pan_T7935490_CTACACCAGCGCTCCA    PATO:0000383
                                     ...     
CZINY-0112_TTTGACTGTTCAAGGG      PATO:0000384
CZINY-0112_TTTGATCCAACCCTCT      PATO:0000384
CZINY-0112_TTTGATCGTCTTAGTG      PATO:0000384
CZINY-0112_TTTGGAGGTCAGGCAA      PATO:0000384
CZINY-0112_TTTGGAGTCGTTAGTG      PATO:0000384
Name: sex_ontology_term_id, Length: 51552, dtype: category
Categories (2, object): ['PATO:0000383', 'PATO:0000384']

## tissue_ontology_term_id

> All tissue dissociation protocols are available on protocols.io [40]: spleen (UBERON:0002106), esophagus epithelium (UBERON:0001976), and lung parenchyma (UBERON:0008946).

In [60]:
adata.obs['tissue_ontology_term_id'] = adata.obs['Organ']

In [61]:
adata.obs['tissue_ontology_term_id']

Pan_T7935490_AACCATGTCTCGTTTA    ILE
Pan_T7935490_CACACCTGTCGACTAT    ILE
Pan_T7935490_CGTAGGCCAATGGACG    ILE
Pan_T7935490_CGTCAGGCAGGGATTG    ILE
Pan_T7935490_CTACACCAGCGCTCCA    ILE
                                ... 
CZINY-0112_TTTGACTGTTCAAGGG      LNG
CZINY-0112_TTTGATCCAACCCTCT      LNG
CZINY-0112_TTTGATCGTCTTAGTG      BMA
CZINY-0112_TTTGGAGGTCAGGCAA      SPL
CZINY-0112_TTTGGAGTCGTTAGTG      SPL
Name: tissue_ontology_term_id, Length: 51552, dtype: category
Categories (17, object): ['BLD', 'BMA', 'CAE', 'DUO', ..., 'SKM', 'SPL', 'TCL', 'THY']

In [62]:
list(adata.obs['tissue_ontology_term_id'].unique())

['ILE',
 'LNG',
 'LLN',
 'MLN',
 'BMA',
 'SKM',
 'LIV',
 'SPL',
 'OME',
 'CAE',
 'THY',
 'DUO',
 'BLD',
 'TCL',
 'SCL',
 'JEJLP',
 'JEJEPI']

In [63]:
mapping = {'ILE': 'UBERON:0002116',
 'LNG': 'UBERON:0002048',
 'LLN': 'UBERON:0007644',
 'MLN': 'UBERON:0002509',
 'BMA': 'UBERON:0002371',
 'SKM': 'UBERON:0001134',
 'LIV': 'UBERON:0002107',
 'SPL': 'UBERON:0002106',
 'OME': 'UBERON:0003688',
 'CAE': 'UBERON:0001153',
 'THY': 'UBERON:0002370',
 'DUO': 'UBERON:0002114',
 'BLD': 'UBERON:0000178',
 'TCL': 'UBERON:0001157',
 'SCL': 'UBERON:0001159',
 'JEJLP': 'UBERON:0000030',
 'JEJEPI': 'UBERON:0000400'}

In [64]:
adata.obs['tissue_ontology_term_id'] = adata.obs['tissue_ontology_term_id'].map(mapping)

In [65]:
adata.obs['tissue_ontology_term_id']

Pan_T7935490_AACCATGTCTCGTTTA    UBERON:0002116
Pan_T7935490_CACACCTGTCGACTAT    UBERON:0002116
Pan_T7935490_CGTAGGCCAATGGACG    UBERON:0002116
Pan_T7935490_CGTCAGGCAGGGATTG    UBERON:0002116
Pan_T7935490_CTACACCAGCGCTCCA    UBERON:0002116
                                      ...      
CZINY-0112_TTTGACTGTTCAAGGG      UBERON:0002048
CZINY-0112_TTTGATCCAACCCTCT      UBERON:0002048
CZINY-0112_TTTGATCGTCTTAGTG      UBERON:0002371
CZINY-0112_TTTGGAGGTCAGGCAA      UBERON:0002106
CZINY-0112_TTTGGAGTCGTTAGTG      UBERON:0002106
Name: tissue_ontology_term_id, Length: 51552, dtype: category
Categories (17, object): ['UBERON:0000178', 'UBERON:0002371', 'UBERON:0001153', 'UBERON:0002114', ..., 'UBERON:0001134', 'UBERON:0002106', 'UBERON:0001157', 'UBERON:0002370']

---

# <ins>var and raw.var

In [66]:
adata.var

Unnamed: 0_level_0,gene_symbols,feature_biotype
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000243485,MIR1302-2HG,gene
ENSG00000237613,FAM138A,gene
ENSG00000186092,OR4F5,gene
ENSG00000238009,AL627309.1,gene
ENSG00000239945,AL627309.3,gene
...,...,...
ENSG00000277836,AC141272.1,gene
ENSG00000278633,AC023491.2,gene
ENSG00000276017,AC007325.1,gene
ENSG00000278817,AC007325.4,gene


## feature_biotype

Adjusted at the beginning!

In [67]:
#adata2.var['feature_biotype'] = ['gene'] * len(adata2.var.index)

## feature_id

Adjusted at the beginning!

In [68]:
#adata2.var_names = adata2.var["gene_ids"].copy()
#adata2.var_names.name = "feature_id"
#adata2.raw.var_names = adata2.raw.var["gene_ids"].copy()
#adata2.raw.var_names.name = "feature_id"

## feature_is_filtered

True if feature was filtered out in final matrix (X)  
Problem: This should be annotated in 'var', not 'raw.var'. __How to annotate the gene if it is already filtered out in the final matrix?__

In [69]:
adata.var['feature_is_filtered'] = [False] * len(adata.var)

In [70]:
adata.var['feature_is_filtered']

gene_ids
ENSG00000243485    False
ENSG00000237613    False
ENSG00000186092    False
ENSG00000238009    False
ENSG00000239945    False
                   ...  
ENSG00000277836    False
ENSG00000278633    False
ENSG00000276017    False
ENSG00000278817    False
ENSG00000277196    False
Name: feature_is_filtered, Length: 36503, dtype: bool

# <ins>obsm

In [73]:
adata.obsm

AxisArrays with keys: X_umap

# <ins>uns

In [74]:
adata.uns['schema_version'] = '2.0.0'

In [75]:
adata.uns['title'] = 'Myeloid compartment'

In [76]:
adata.uns['X_normalization'] = 'Scanpy'

In [77]:
adata.uns['default_embedding'] = 'X_umap'

---

---

# 4. Final adjustments

Final adjustments include the removal of the slots that were already re-created for the schema.

In [78]:
adata

AnnData object with n_obs × n_vars = 51552 × 36503
    obs: 'Organ', 'Donor', 'Chemistry', 'Predicted_labels_CellTypist', 'Majority_voting_CellTypist', 'Manually_curated_celltype', 'Sex', 'Age_range', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id'
    var: 'gene_symbols', 'feature_biotype', 'feature_is_filtered'
    uns: 'Age_range_colors', 'Sex_colors', 'schema_version', 'title', 'X_normalization', 'default_embedding'
    obsm: 'X_umap'
    layers: 'counts'

In [79]:
adata.uns['development_stage_ontology_term_id_colors'] = adata.uns['Age_range_colors']
adata.uns['sex_ontology_term_id_colors'] = adata.uns['Sex_colors']

In [80]:
# Removing the same slots created for the schema
del adata.uns['Age_range_colors']
del adata.uns['Sex_colors']
del adata.obs['Organ'] # already in tissue_ontology_term_id
del adata.obs['Sex'] # already in sex_ontology_term_id
del adata.obs['Age_range'] # already in developmental_stage_ontology_term_id
del adata.obs['Chemistry'] # already in assay_ontology_term_id
del adata.layers

In [81]:
adata

AnnData object with n_obs × n_vars = 51552 × 36503
    obs: 'Donor', 'Predicted_labels_CellTypist', 'Majority_voting_CellTypist', 'Manually_curated_celltype', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id'
    var: 'gene_symbols', 'feature_biotype', 'feature_is_filtered'
    uns: 'schema_version', 'title', 'X_normalization', 'default_embedding', 'development_stage_ontology_term_id_colors', 'sex_ontology_term_id_colors'
    obsm: 'X_umap'

In [82]:
adata.X

<51552x36503 sparse matrix of type '<class 'numpy.float32'>'
	with 144744532 stored elements in Compressed Sparse Row format>

In [83]:
adata.raw.X

<51552x36503 sparse matrix of type '<class 'numpy.float32'>'
	with 144744532 stored elements in Compressed Sparse Row format>

In [84]:
adata.write('imm_myelo.h5ad', compression = 'gzip')

... storing 'cell_type_ontology_term_id' as categorical
... storing 'development_stage_ontology_term_id' as categorical
... storing 'disease_ontology_term_id' as categorical
... storing 'ethnicity_ontology_term_id' as categorical
... storing 'feature_biotype' as categorical
... storing 'feature_biotype' as categorical


---