# **Curating `tissue-immune` dataset:**

### B-cell dataset

# <ins>STEPS:</ins>

## 1. Load the dataset 

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import scipy

---

## 2. Pre-prepare before schema adjustments

### Load raw matrix

In [2]:
adata = sc.read('/lustre/scratch117/cellgen/cellgeni/cxgportal_sets/tissueimmune/CountAdded_PIP_B_object_for_cellxgene.h5ad')

In [3]:
adata

AnnData object with n_obs × n_vars = 54934 × 36601
    obs: 'Organ', 'Donor', 'Chemistry', 'Predicted_labels_CellTypist', 'Majority_voting_CellTypist', 'Manually_curated_celltype', 'Sex', 'Age_range'
    uns: 'Age_range_colors', 'Sex_colors'
    obsm: 'X_umap'
    layers: 'counts'

In [4]:
adata.obs

Unnamed: 0,Organ,Donor,Chemistry,Predicted_labels_CellTypist,Majority_voting_CellTypist,Manually_curated_celltype,Sex,Age_range
Pan_T7935490_AGGGAGTAGTACGCGA,ILE,A29,5v1,Memory B cells,Memory B cells,Memory B cells,Female,65-70
Pan_T7935490_ATTCTACGTAAGGGAA,ILE,A29,5v1,Naive B cells,Naive B cells,Naive B cells,Female,65-70
Pan_T7935490_ATTGGTGGTCTAAAGA,ILE,A29,5v1,Plasma cells,Plasma cells,Plasma cells,Female,65-70
Pan_T7935490_CCGGGATCAGCTTAAC,ILE,A29,5v1,Plasma cells,Plasma cells,Plasma cells,Female,65-70
Pan_T7935490_CCTTCGAGTCAGAAGC,ILE,A29,5v1,Plasma cells,Plasma cells,Plasma cells,Female,65-70
...,...,...,...,...,...,...,...,...
CZINY-0112_TTTGACTAGGAGAATG,BMA,D496,3,Memory B cells,Memory B cells,Memory B cells,Male,55-60
CZINY-0112_TTTGACTTCGAACTCA,LLN,D496,3,Memory B cells,Memory B cells,Memory B cells,Male,55-60
CZINY-0112_TTTGGAGCACGACTAT,LLN,D496,3,Memory B cells,Memory B cells,Memory B cells,Male,55-60
CZINY-0112_TTTGGAGGTACGCGTC,SPL,D496,3,Memory B cells,Memory B cells,Memory B cells,Male,55-60


In [5]:
adata2 = sc.read_10x_mtx('/warehouse/cellgeni/tic-1168/GSE148073/GSM4453619/output/Gene/filtered')

In [6]:
adata.var

MIR1302-2HG
FAM138A
OR4F5
AL627309.1
AL627309.3
...
AC141272.1
AC023491.2
AC007325.1
AC007325.4
AC007325.2


In [7]:
adata.var = adata2.var

In [8]:
adata.var

Unnamed: 0,gene_ids,feature_types
MIR1302-2HG,ENSG00000243485,Gene Expression
FAM138A,ENSG00000237613,Gene Expression
OR4F5,ENSG00000186092,Gene Expression
AL627309.1,ENSG00000238009,Gene Expression
AL627309.3,ENSG00000239945,Gene Expression
...,...,...
AC141272.1,ENSG00000277836,Gene Expression
AC023491.2,ENSG00000278633,Gene Expression
AC007325.1,ENSG00000276017,Gene Expression
AC007325.4,ENSG00000278817,Gene Expression


### Remove the genes that do not match in gene-ensemblID file

In [9]:
adata.var

Unnamed: 0,gene_ids,feature_types
MIR1302-2HG,ENSG00000243485,Gene Expression
FAM138A,ENSG00000237613,Gene Expression
OR4F5,ENSG00000186092,Gene Expression
AL627309.1,ENSG00000238009,Gene Expression
AL627309.3,ENSG00000239945,Gene Expression
...,...,...
AC141272.1,ENSG00000277836,Gene Expression
AC023491.2,ENSG00000278633,Gene Expression
AC007325.1,ENSG00000276017,Gene Expression
AC007325.4,ENSG00000278817,Gene Expression


In [10]:
adata.var['gene_symbols'] = adata.var_names

In [11]:
adata.var_names = adata.var['gene_ids']

In [12]:
adata.var

Unnamed: 0_level_0,gene_ids,feature_types,gene_symbols
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000243485,ENSG00000243485,Gene Expression,MIR1302-2HG
ENSG00000237613,ENSG00000237613,Gene Expression,FAM138A
ENSG00000186092,ENSG00000186092,Gene Expression,OR4F5
ENSG00000238009,ENSG00000238009,Gene Expression,AL627309.1
ENSG00000239945,ENSG00000239945,Gene Expression,AL627309.3
...,...,...,...
ENSG00000277836,ENSG00000277836,Gene Expression,AC141272.1
ENSG00000278633,ENSG00000278633,Gene Expression,AC023491.2
ENSG00000276017,ENSG00000276017,Gene Expression,AC007325.1
ENSG00000278817,ENSG00000278817,Gene Expression,AC007325.4


In [13]:
genes = pd.read_csv('/home/jovyan/genes_homo_sapiens.csv', header=None)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [14]:
adata

AnnData object with n_obs × n_vars = 54934 × 36601
    obs: 'Organ', 'Donor', 'Chemistry', 'Predicted_labels_CellTypist', 'Majority_voting_CellTypist', 'Manually_curated_celltype', 'Sex', 'Age_range'
    var: 'gene_ids', 'feature_types', 'gene_symbols'
    uns: 'Age_range_colors', 'Sex_colors'
    obsm: 'X_umap'
    layers: 'counts'

In [15]:
adata = adata[:,adata.var_names.isin(genes[0])]

  res = method(*args, **kwargs)


In [16]:
adata

View of AnnData object with n_obs × n_vars = 54934 × 36503
    obs: 'Organ', 'Donor', 'Chemistry', 'Predicted_labels_CellTypist', 'Majority_voting_CellTypist', 'Manually_curated_celltype', 'Sex', 'Age_range'
    var: 'gene_ids', 'feature_types', 'gene_symbols'
    uns: 'Age_range_colors', 'Sex_colors'
    obsm: 'X_umap'
    layers: 'counts'

In [17]:
adata.var['feature_biotype'] = ['gene'] * len(adata.var.index)

Trying to set attribute `.var` of view, copying.


In [18]:
del adata.var['feature_types']
del adata.var['gene_ids']

In [19]:
raw = adata.copy()

In [20]:
raw

AnnData object with n_obs × n_vars = 54934 × 36503
    obs: 'Organ', 'Donor', 'Chemistry', 'Predicted_labels_CellTypist', 'Majority_voting_CellTypist', 'Manually_curated_celltype', 'Sex', 'Age_range'
    var: 'gene_symbols', 'feature_biotype'
    uns: 'Age_range_colors', 'Sex_colors'
    obsm: 'X_umap'
    layers: 'counts'

In [21]:
raw.X = adata.layers['counts']

In [22]:
del raw.obs
del raw.uns
del raw.obsm
del raw.layers

In [23]:
del raw.var['gene_symbols']

In [24]:
adata.raw = raw

In [25]:
del raw

---

## 3. Apply the schema requirements:

# <ins>X

adjusted!

# <ins>obs

In [26]:
adata.obs.columns

Index(['Organ', 'Donor', 'Chemistry', 'Predicted_labels_CellTypist',
       'Majority_voting_CellTypist', 'Manually_curated_celltype', 'Sex',
       'Age_range'],
      dtype='object')

In [27]:
adata.obs

Unnamed: 0,Organ,Donor,Chemistry,Predicted_labels_CellTypist,Majority_voting_CellTypist,Manually_curated_celltype,Sex,Age_range
Pan_T7935490_AGGGAGTAGTACGCGA,ILE,A29,5v1,Memory B cells,Memory B cells,Memory B cells,Female,65-70
Pan_T7935490_ATTCTACGTAAGGGAA,ILE,A29,5v1,Naive B cells,Naive B cells,Naive B cells,Female,65-70
Pan_T7935490_ATTGGTGGTCTAAAGA,ILE,A29,5v1,Plasma cells,Plasma cells,Plasma cells,Female,65-70
Pan_T7935490_CCGGGATCAGCTTAAC,ILE,A29,5v1,Plasma cells,Plasma cells,Plasma cells,Female,65-70
Pan_T7935490_CCTTCGAGTCAGAAGC,ILE,A29,5v1,Plasma cells,Plasma cells,Plasma cells,Female,65-70
...,...,...,...,...,...,...,...,...
CZINY-0112_TTTGACTAGGAGAATG,BMA,D496,3,Memory B cells,Memory B cells,Memory B cells,Male,55-60
CZINY-0112_TTTGACTTCGAACTCA,LLN,D496,3,Memory B cells,Memory B cells,Memory B cells,Male,55-60
CZINY-0112_TTTGGAGCACGACTAT,LLN,D496,3,Memory B cells,Memory B cells,Memory B cells,Male,55-60
CZINY-0112_TTTGGAGGTACGCGTC,SPL,D496,3,Memory B cells,Memory B cells,Memory B cells,Male,55-60


## assay_ontology_term_id

10x 3' v2	"EFO:0009899"  
10x 3' v3	"EFO:0009922"  
10x 5' v1	"EFO:0011025"  
Smart-seq	"EFO:0008930"  
Smart-seq2	"EFO:0008931"

In [28]:
adata.obs['assay_ontology_term_id'] = adata.obs['Chemistry']

In [29]:
adata.obs['assay_ontology_term_id']

Pan_T7935490_AGGGAGTAGTACGCGA    5v1
Pan_T7935490_ATTCTACGTAAGGGAA    5v1
Pan_T7935490_ATTGGTGGTCTAAAGA    5v1
Pan_T7935490_CCGGGATCAGCTTAAC    5v1
Pan_T7935490_CCTTCGAGTCAGAAGC    5v1
                                ... 
CZINY-0112_TTTGACTAGGAGAATG        3
CZINY-0112_TTTGACTTCGAACTCA        3
CZINY-0112_TTTGGAGCACGACTAT        3
CZINY-0112_TTTGGAGGTACGCGTC        3
CZINY-0112_TTTGGAGTCCTATTGT        3
Name: assay_ontology_term_id, Length: 54934, dtype: category
Categories (3, object): ['3', '5v1', '5v2']

In [30]:
mapping = {'3': 'EFO:0009922', '5v1': 'EFO:0011025', '5v2': 'EFO:0009900'}

In [31]:
adata.obs['assay_ontology_term_id'] = adata.obs['assay_ontology_term_id'].map(mapping)

In [32]:
adata.obs['assay_ontology_term_id']

Pan_T7935490_AGGGAGTAGTACGCGA    EFO:0011025
Pan_T7935490_ATTCTACGTAAGGGAA    EFO:0011025
Pan_T7935490_ATTGGTGGTCTAAAGA    EFO:0011025
Pan_T7935490_CCGGGATCAGCTTAAC    EFO:0011025
Pan_T7935490_CCTTCGAGTCAGAAGC    EFO:0011025
                                    ...     
CZINY-0112_TTTGACTAGGAGAATG      EFO:0009922
CZINY-0112_TTTGACTTCGAACTCA      EFO:0009922
CZINY-0112_TTTGGAGCACGACTAT      EFO:0009922
CZINY-0112_TTTGGAGGTACGCGTC      EFO:0009922
CZINY-0112_TTTGGAGTCCTATTGT      EFO:0009922
Name: assay_ontology_term_id, Length: 54934, dtype: category
Categories (3, object): ['EFO:0009922', 'EFO:0011025', 'EFO:0009900']

## cell_type_ontology_term_id

In [33]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['Manually_curated_celltype']

In [34]:
adata.obs['cell_type_ontology_term_id']

Pan_T7935490_AGGGAGTAGTACGCGA    Memory B cells
Pan_T7935490_ATTCTACGTAAGGGAA     Naive B cells
Pan_T7935490_ATTGGTGGTCTAAAGA      Plasma cells
Pan_T7935490_CCGGGATCAGCTTAAC      Plasma cells
Pan_T7935490_CCTTCGAGTCAGAAGC      Plasma cells
                                      ...      
CZINY-0112_TTTGACTAGGAGAATG      Memory B cells
CZINY-0112_TTTGACTTCGAACTCA      Memory B cells
CZINY-0112_TTTGGAGCACGACTAT      Memory B cells
CZINY-0112_TTTGGAGGTACGCGTC      Memory B cells
CZINY-0112_TTTGGAGTCCTATTGT      Memory B cells
Name: cell_type_ontology_term_id, Length: 54934, dtype: category
Categories (11, object): ['ABCs', 'GC_B (I)', 'GC_B (II)', 'MNP/B doublets', ..., 'Plasmablasts', 'Pre-B', 'Pro-B', 'T/B doublets']

In [35]:
list(adata.obs['cell_type_ontology_term_id'].unique())

['Memory B cells',
 'Naive B cells',
 'Plasma cells',
 'MNP/B doublets',
 'ABCs',
 'T/B doublets',
 'GC_B (I)',
 'GC_B (II)',
 'Plasmablasts',
 'Pro-B',
 'Pre-B']

In [36]:
mapping = {'Memory B cells': 'CL:0000787',
 'Naive B cells': 'CL:0000788',
 'Plasma cells': 'CL:0000786',
 'MNP/B doublets': 'CL:0000548', # ??
 'ABCs': 'CL:0000787', 
 'T/B doublets': 'CL:0000542', # lymphocyte (group that contains both B and T cells)
 'GC_B (I)': 'CL:0000844', # ??
 'GC_B (II)': 'CL:0000844', # ??
 'Plasmablasts': 'CL:0000980',
 'Pro-B': 'CL:0000826',
 'Pre-B': 'CL:0000817'} # precursor B cell?

In [37]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_type_ontology_term_id'].map(mapping)

In [38]:
adata.obs['cell_type_ontology_term_id']

Pan_T7935490_AGGGAGTAGTACGCGA    CL:0000787
Pan_T7935490_ATTCTACGTAAGGGAA    CL:0000788
Pan_T7935490_ATTGGTGGTCTAAAGA    CL:0000786
Pan_T7935490_CCGGGATCAGCTTAAC    CL:0000786
Pan_T7935490_CCTTCGAGTCAGAAGC    CL:0000786
                                    ...    
CZINY-0112_TTTGACTAGGAGAATG      CL:0000787
CZINY-0112_TTTGACTTCGAACTCA      CL:0000787
CZINY-0112_TTTGGAGCACGACTAT      CL:0000787
CZINY-0112_TTTGGAGGTACGCGTC      CL:0000787
CZINY-0112_TTTGGAGTCCTATTGT      CL:0000787
Name: cell_type_ontology_term_id, Length: 54934, dtype: object

## development_stage_ontology_term_id

In [39]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['Age_range']

In [40]:
adata.obs['development_stage_ontology_term_id']

Pan_T7935490_AGGGAGTAGTACGCGA    65-70
Pan_T7935490_ATTCTACGTAAGGGAA    65-70
Pan_T7935490_ATTGGTGGTCTAAAGA    65-70
Pan_T7935490_CCGGGATCAGCTTAAC    65-70
Pan_T7935490_CCTTCGAGTCAGAAGC    65-70
                                 ...  
CZINY-0112_TTTGACTAGGAGAATG      55-60
CZINY-0112_TTTGACTTCGAACTCA      55-60
CZINY-0112_TTTGGAGCACGACTAT      55-60
CZINY-0112_TTTGGAGGTACGCGTC      55-60
CZINY-0112_TTTGGAGTCCTATTGT      55-60
Name: development_stage_ontology_term_id, Length: 54934, dtype: category
Categories (5, object): ['50-54', '55-60', '60-64', '65-70', '70-74']

In [41]:
mapping = {'50-54': 'HsapDv:0000240', # 50-60
           '55-60': 'HsapDv:0000240', # 50-60
           '60-64': 'HsapDv:0000241', # 60-70
           '65-70': 'HsapDv:0000241', # 60-70
           '70-74': 'HsapDv:0000242'} # 70-80

In [42]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['development_stage_ontology_term_id'].map(mapping)

In [43]:
adata.obs['development_stage_ontology_term_id']

Pan_T7935490_AGGGAGTAGTACGCGA    HsapDv:0000241
Pan_T7935490_ATTCTACGTAAGGGAA    HsapDv:0000241
Pan_T7935490_ATTGGTGGTCTAAAGA    HsapDv:0000241
Pan_T7935490_CCGGGATCAGCTTAAC    HsapDv:0000241
Pan_T7935490_CCTTCGAGTCAGAAGC    HsapDv:0000241
                                      ...      
CZINY-0112_TTTGACTAGGAGAATG      HsapDv:0000240
CZINY-0112_TTTGACTTCGAACTCA      HsapDv:0000240
CZINY-0112_TTTGGAGCACGACTAT      HsapDv:0000240
CZINY-0112_TTTGGAGGTACGCGTC      HsapDv:0000240
CZINY-0112_TTTGGAGTCCTATTGT      HsapDv:0000240
Name: development_stage_ontology_term_id, Length: 54934, dtype: object

## disease_ontology_term_id

"PATO:0000461" if healthy

In [44]:
adata.obs['disease_ontology_term_id'] = ["PATO:0000461"] * len(adata.obs)

In [45]:
adata.obs['disease_ontology_term_id']

Pan_T7935490_AGGGAGTAGTACGCGA    PATO:0000461
Pan_T7935490_ATTCTACGTAAGGGAA    PATO:0000461
Pan_T7935490_ATTGGTGGTCTAAAGA    PATO:0000461
Pan_T7935490_CCGGGATCAGCTTAAC    PATO:0000461
Pan_T7935490_CCTTCGAGTCAGAAGC    PATO:0000461
                                     ...     
CZINY-0112_TTTGACTAGGAGAATG      PATO:0000461
CZINY-0112_TTTGACTTCGAACTCA      PATO:0000461
CZINY-0112_TTTGGAGCACGACTAT      PATO:0000461
CZINY-0112_TTTGGAGGTACGCGTC      PATO:0000461
CZINY-0112_TTTGGAGTCCTATTGT      PATO:0000461
Name: disease_ontology_term_id, Length: 54934, dtype: object

## ethnicity_ontology_term_id

In [46]:
adata.obs['ethnicity_ontology_term_id'] = ['unknown'] * len(adata.obs)

In [47]:
adata.obs['ethnicity_ontology_term_id']

Pan_T7935490_AGGGAGTAGTACGCGA    unknown
Pan_T7935490_ATTCTACGTAAGGGAA    unknown
Pan_T7935490_ATTGGTGGTCTAAAGA    unknown
Pan_T7935490_CCGGGATCAGCTTAAC    unknown
Pan_T7935490_CCTTCGAGTCAGAAGC    unknown
                                  ...   
CZINY-0112_TTTGACTAGGAGAATG      unknown
CZINY-0112_TTTGACTTCGAACTCA      unknown
CZINY-0112_TTTGGAGCACGACTAT      unknown
CZINY-0112_TTTGGAGGTACGCGTC      unknown
CZINY-0112_TTTGGAGTCCTATTGT      unknown
Name: ethnicity_ontology_term_id, Length: 54934, dtype: object

## is_primary_data

In [48]:
adata.obs['is_primary_data'] = [False] * len(adata.obs)

In [49]:
adata.obs['is_primary_data']

Pan_T7935490_AGGGAGTAGTACGCGA    False
Pan_T7935490_ATTCTACGTAAGGGAA    False
Pan_T7935490_ATTGGTGGTCTAAAGA    False
Pan_T7935490_CCGGGATCAGCTTAAC    False
Pan_T7935490_CCTTCGAGTCAGAAGC    False
                                 ...  
CZINY-0112_TTTGACTAGGAGAATG      False
CZINY-0112_TTTGACTTCGAACTCA      False
CZINY-0112_TTTGGAGCACGACTAT      False
CZINY-0112_TTTGGAGGTACGCGTC      False
CZINY-0112_TTTGGAGTCCTATTGT      False
Name: is_primary_data, Length: 54934, dtype: bool

In [50]:
adata.obs['is_primary_data'] = adata.obs['is_primary_data'].astype('bool')

## organism_ontology_term_id

In [51]:
adata.obs['organism_ontology_term_id'] = ['NCBITaxon:9606'] * len(adata.obs)

In [52]:
adata.obs['organism_ontology_term_id']

Pan_T7935490_AGGGAGTAGTACGCGA    NCBITaxon:9606
Pan_T7935490_ATTCTACGTAAGGGAA    NCBITaxon:9606
Pan_T7935490_ATTGGTGGTCTAAAGA    NCBITaxon:9606
Pan_T7935490_CCGGGATCAGCTTAAC    NCBITaxon:9606
Pan_T7935490_CCTTCGAGTCAGAAGC    NCBITaxon:9606
                                      ...      
CZINY-0112_TTTGACTAGGAGAATG      NCBITaxon:9606
CZINY-0112_TTTGACTTCGAACTCA      NCBITaxon:9606
CZINY-0112_TTTGGAGCACGACTAT      NCBITaxon:9606
CZINY-0112_TTTGGAGGTACGCGTC      NCBITaxon:9606
CZINY-0112_TTTGGAGTCCTATTGT      NCBITaxon:9606
Name: organism_ontology_term_id, Length: 54934, dtype: object

In [53]:
adata.obs['organism_ontology_term_id'] = pd.Categorical(adata.obs['organism_ontology_term_id'])

In [54]:
adata.obs['organism_ontology_term_id']

Pan_T7935490_AGGGAGTAGTACGCGA    NCBITaxon:9606
Pan_T7935490_ATTCTACGTAAGGGAA    NCBITaxon:9606
Pan_T7935490_ATTGGTGGTCTAAAGA    NCBITaxon:9606
Pan_T7935490_CCGGGATCAGCTTAAC    NCBITaxon:9606
Pan_T7935490_CCTTCGAGTCAGAAGC    NCBITaxon:9606
                                      ...      
CZINY-0112_TTTGACTAGGAGAATG      NCBITaxon:9606
CZINY-0112_TTTGACTTCGAACTCA      NCBITaxon:9606
CZINY-0112_TTTGGAGCACGACTAT      NCBITaxon:9606
CZINY-0112_TTTGGAGGTACGCGTC      NCBITaxon:9606
CZINY-0112_TTTGGAGTCCTATTGT      NCBITaxon:9606
Name: organism_ontology_term_id, Length: 54934, dtype: category
Categories (1, object): ['NCBITaxon:9606']

## sex_ontology_term_id

In [55]:
adata.obs['sex_ontology_term_id'] = adata.obs['Sex']

In [56]:
adata.obs['sex_ontology_term_id']

Pan_T7935490_AGGGAGTAGTACGCGA    Female
Pan_T7935490_ATTCTACGTAAGGGAA    Female
Pan_T7935490_ATTGGTGGTCTAAAGA    Female
Pan_T7935490_CCGGGATCAGCTTAAC    Female
Pan_T7935490_CCTTCGAGTCAGAAGC    Female
                                  ...  
CZINY-0112_TTTGACTAGGAGAATG        Male
CZINY-0112_TTTGACTTCGAACTCA        Male
CZINY-0112_TTTGGAGCACGACTAT        Male
CZINY-0112_TTTGGAGGTACGCGTC        Male
CZINY-0112_TTTGGAGTCCTATTGT        Male
Name: sex_ontology_term_id, Length: 54934, dtype: category
Categories (2, object): ['Female', 'Male']

**PATO_0000384 for male  
PATO_0000383 for female**

In [57]:
mapping = {'Female': 'PATO:0000383', 'Male': 'PATO:0000384'}

In [58]:
adata.obs['sex_ontology_term_id'] = adata.obs['sex_ontology_term_id'].map(mapping)

In [59]:
adata.obs['sex_ontology_term_id']

Pan_T7935490_AGGGAGTAGTACGCGA    PATO:0000383
Pan_T7935490_ATTCTACGTAAGGGAA    PATO:0000383
Pan_T7935490_ATTGGTGGTCTAAAGA    PATO:0000383
Pan_T7935490_CCGGGATCAGCTTAAC    PATO:0000383
Pan_T7935490_CCTTCGAGTCAGAAGC    PATO:0000383
                                     ...     
CZINY-0112_TTTGACTAGGAGAATG      PATO:0000384
CZINY-0112_TTTGACTTCGAACTCA      PATO:0000384
CZINY-0112_TTTGGAGCACGACTAT      PATO:0000384
CZINY-0112_TTTGGAGGTACGCGTC      PATO:0000384
CZINY-0112_TTTGGAGTCCTATTGT      PATO:0000384
Name: sex_ontology_term_id, Length: 54934, dtype: category
Categories (2, object): ['PATO:0000383', 'PATO:0000384']

## tissue_ontology_term_id

> All tissue dissociation protocols are available on protocols.io [40]: spleen (UBERON:0002106), esophagus epithelium (UBERON:0001976), and lung parenchyma (UBERON:0008946).

In [60]:
adata.obs['tissue_ontology_term_id'] = adata.obs['Organ']

In [61]:
adata.obs['tissue_ontology_term_id']

Pan_T7935490_AGGGAGTAGTACGCGA    ILE
Pan_T7935490_ATTCTACGTAAGGGAA    ILE
Pan_T7935490_ATTGGTGGTCTAAAGA    ILE
Pan_T7935490_CCGGGATCAGCTTAAC    ILE
Pan_T7935490_CCTTCGAGTCAGAAGC    ILE
                                ... 
CZINY-0112_TTTGACTAGGAGAATG      BMA
CZINY-0112_TTTGACTTCGAACTCA      LLN
CZINY-0112_TTTGGAGCACGACTAT      LLN
CZINY-0112_TTTGGAGGTACGCGTC      SPL
CZINY-0112_TTTGGAGTCCTATTGT      LLN
Name: tissue_ontology_term_id, Length: 54934, dtype: category
Categories (17, object): ['BLD', 'BMA', 'CAE', 'DUO', ..., 'SKM', 'SPL', 'TCL', 'THY']

In [62]:
list(adata.obs['tissue_ontology_term_id'].unique())

['ILE',
 'LNG',
 'LLN',
 'MLN',
 'BMA',
 'SKM',
 'LIV',
 'SPL',
 'OME',
 'CAE',
 'THY',
 'DUO',
 'BLD',
 'TCL',
 'SCL',
 'JEJEPI',
 'JEJLP']

In [63]:
mapping = {'ILE': 'UBERON:0002116',
 'LNG': 'UBERON:0002048',
 'LLN': 'UBERON:0007644',
 'MLN': 'UBERON:0002509',
 'BMA': 'UBERON:0002371',
 'SKM': 'UBERON:0001134',
 'LIV': 'UBERON:0002107',
 'SPL': 'UBERON:0002106',
 'OME': 'UBERON:0003688',
 'CAE': 'UBERON:0001153',
 'THY': 'UBERON:0002370',
 'DUO': 'UBERON:0002114',
 'BLD': 'UBERON:0000178',
 'TCL': 'UBERON:0001157',
 'SCL': 'UBERON:0001159',
 'JEJLP': 'UBERON:0000030',
 'JEJEPI': 'UBERON:0000400'}

In [64]:
adata.obs['tissue_ontology_term_id'] = adata.obs['tissue_ontology_term_id'].map(mapping)

In [65]:
adata.obs['tissue_ontology_term_id']

Pan_T7935490_AGGGAGTAGTACGCGA    UBERON:0002116
Pan_T7935490_ATTCTACGTAAGGGAA    UBERON:0002116
Pan_T7935490_ATTGGTGGTCTAAAGA    UBERON:0002116
Pan_T7935490_CCGGGATCAGCTTAAC    UBERON:0002116
Pan_T7935490_CCTTCGAGTCAGAAGC    UBERON:0002116
                                      ...      
CZINY-0112_TTTGACTAGGAGAATG      UBERON:0002371
CZINY-0112_TTTGACTTCGAACTCA      UBERON:0007644
CZINY-0112_TTTGGAGCACGACTAT      UBERON:0007644
CZINY-0112_TTTGGAGGTACGCGTC      UBERON:0002106
CZINY-0112_TTTGGAGTCCTATTGT      UBERON:0007644
Name: tissue_ontology_term_id, Length: 54934, dtype: category
Categories (17, object): ['UBERON:0000178', 'UBERON:0002371', 'UBERON:0001153', 'UBERON:0002114', ..., 'UBERON:0001134', 'UBERON:0002106', 'UBERON:0001157', 'UBERON:0002370']

---

# <ins>var and raw.var

In [66]:
adata.var

Unnamed: 0_level_0,gene_symbols,feature_biotype
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000243485,MIR1302-2HG,gene
ENSG00000237613,FAM138A,gene
ENSG00000186092,OR4F5,gene
ENSG00000238009,AL627309.1,gene
ENSG00000239945,AL627309.3,gene
...,...,...
ENSG00000277836,AC141272.1,gene
ENSG00000278633,AC023491.2,gene
ENSG00000276017,AC007325.1,gene
ENSG00000278817,AC007325.4,gene


## feature_is_filtered

In [67]:
adata.var['feature_is_filtered'] = [False] * len(adata.var)

In [68]:
adata.var['feature_is_filtered']

gene_ids
ENSG00000243485    False
ENSG00000237613    False
ENSG00000186092    False
ENSG00000238009    False
ENSG00000239945    False
                   ...  
ENSG00000277836    False
ENSG00000278633    False
ENSG00000276017    False
ENSG00000278817    False
ENSG00000277196    False
Name: feature_is_filtered, Length: 36503, dtype: bool

# <ins>obsm

In [71]:
adata.obsm

AxisArrays with keys: X_umap

# <ins>uns

In [72]:
adata.uns['schema_version'] = '2.0.0'

In [73]:
adata.uns['title'] = 'B cell compartment'

In [74]:
adata.uns['X_normalization'] = 'Scanpy'

In [75]:
adata.uns['default_embedding'] = 'X_umap'

---

---

# 4. Final adjustments

Final adjustments include the removal of the slots that were already re-created for the schema.

In [76]:
adata

AnnData object with n_obs × n_vars = 54934 × 36503
    obs: 'Organ', 'Donor', 'Chemistry', 'Predicted_labels_CellTypist', 'Majority_voting_CellTypist', 'Manually_curated_celltype', 'Sex', 'Age_range', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id'
    var: 'gene_symbols', 'feature_biotype', 'feature_is_filtered'
    uns: 'Age_range_colors', 'Sex_colors', 'schema_version', 'title', 'X_normalization', 'default_embedding'
    obsm: 'X_umap'
    layers: 'counts'

In [77]:
adata.uns['development_stage_ontology_term_id_colors'] = adata.uns['Age_range_colors']
adata.uns['sex_ontology_term_id_colors'] = adata.uns['Sex_colors']

In [78]:
# Removing the same slots created for the schema
del adata.uns['Age_range_colors']
del adata.uns['Sex_colors']
del adata.obs['Organ'] # already in tissue_ontology_term_id
del adata.obs['Sex'] # already in sex_ontology_term_id
del adata.obs['Age_range'] # already in developmental_stage_ontology_term_id
del adata.obs['Chemistry'] # already in assay_ontology_term_id
del adata.layers

In [79]:
adata

AnnData object with n_obs × n_vars = 54934 × 36503
    obs: 'Donor', 'Predicted_labels_CellTypist', 'Majority_voting_CellTypist', 'Manually_curated_celltype', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id'
    var: 'gene_symbols', 'feature_biotype', 'feature_is_filtered'
    uns: 'schema_version', 'title', 'X_normalization', 'default_embedding', 'development_stage_ontology_term_id_colors', 'sex_ontology_term_id_colors'
    obsm: 'X_umap'

In [80]:
adata.X

<54934x36503 sparse matrix of type '<class 'numpy.float32'>'
	with 103413359 stored elements in Compressed Sparse Row format>

In [81]:
adata.raw.X

<54934x36503 sparse matrix of type '<class 'numpy.float32'>'
	with 103413359 stored elements in Compressed Sparse Row format>

In [82]:
adata.write('imm_bcell.h5ad', compression = 'gzip')

... storing 'cell_type_ontology_term_id' as categorical
... storing 'development_stage_ontology_term_id' as categorical
... storing 'disease_ontology_term_id' as categorical
... storing 'ethnicity_ontology_term_id' as categorical
... storing 'feature_biotype' as categorical
... storing 'feature_biotype' as categorical


In [115]:
adata.var

Unnamed: 0_level_0,feature_types,gene_symbols,feature_is_filtered
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000243485,Gene Expression,MIR1302-2HG,True
ENSG00000237613,Gene Expression,FAM138A,True
ENSG00000186092,Gene Expression,OR4F5,True
ENSG00000238009,Gene Expression,AL627309.1,False
ENSG00000239945,Gene Expression,AL627309.3,False
...,...,...,...
ENSG00000277836,Gene Expression,AC141272.1,False
ENSG00000278633,Gene Expression,AC023491.2,False
ENSG00000276017,Gene Expression,AC007325.1,True
ENSG00000278817,Gene Expression,AC007325.4,False
