# **Curating `menon19.processed.h5ad` dataset:**

# <ins>STEPS:</ins>

## 1. Load the dataset 

In [7]:
import numpy as np
import pandas as pd
import scanpy as sc
import scipy
from tqdm import tqdm

Do not forget `mount-farm`

In [8]:
adata = sc.read('/lustre/scratch117/cellgen/cellgeni/covid19cellatlas/downloaded/menon19.processed.h5ad')

In [9]:
adata

AnnData object with n_obs × n_vars = 20091 × 19719
    obs: 'tissue', 'individual', 'CellType'
    var: 'name'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap', 'X_tsne'

In [10]:
adata.X[1:10,1:10].todense()

matrix([[1., 0., 0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.]], dtype=float32)

## 2. Pre-prepare before schema adjustments

### Load raw matrix

Matrix in the object is raw, therefore raw.X won't be used

### Convert gene symbols to ENSG ids by using the reference gene list of this dataset

In [11]:
sym2ensg = pd.read_table('../GRCh38-1.2.0.gene_names.tsv', header = None, names = ['ensg_id', 'gene_symbol'])

validlst = pd.read_csv('~/genes_homo_sapiens.csv', header = None, names = ['gene_ids', 'name','no'])

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [12]:
sym2ensg.index = sym2ensg['gene_symbol']

validlst.index = validlst['name']

In [13]:
adata.var['gene_ids'] = ['a'] * len(adata.var)

In [14]:
skippd = [] # genes not in sym2ensg
multensg = [] # genes that have multiple ensg's from sym2ensg
all_ensg = list(validlst['gene_ids'])
for i in tqdm(adata.var_names):
    if not i in sym2ensg['gene_symbol']:
        skippd.append(i)
        continue
        
    ensg_id = sym2ensg.at[i, 'ensg_id']
    
    if type(ensg_id) == pd.core.series.Series:
        multensg.append(i)
        poss = list(set(sym2ensg.at[i, 'ensg_id']).intersection(all_ensg))
        if len(poss) > 0:
            ensg_id = list(set(sym2ensg.at[i, 'ensg_id']).intersection(all_ensg))[0]
        else:
            continue

    adata.var.at[i, 'gene_ids'] = ensg_id

100%|██████████| 19719/19719 [00:12<00:00, 1594.35it/s]


### Remove the genes that do not match in gene-ensemblID file

In [15]:
notvalid = adata.var[adata.var['gene_ids'].isin(list(set(adata.var['gene_ids']).difference(set(validlst['gene_ids']))))]

In [16]:
notvalid

Unnamed: 0_level_0,name,gene_ids
index,Unnamed: 1_level_1,Unnamed: 2_level_1
RP4-635E18.8,RP4-635E18.8,ENSG00000271895
NPPA-AS1,NPPA-AS1,ENSG00000242349
UBXN10-AS1,UBXN10-AS1,ENSG00000225986
RP11-268J15.5,RP11-268J15.5,ENSG00000116883
RP11-69E11.4,RP11-69E11.4,ENSG00000182109
...,...,...
RP3-323A16.1,RP3-323A16.1,ENSG00000279184
SCO2,SCO2,ENSG00000130489
AF165138.7,AF165138.7,ENSG00000243440
AP000473.5,AP000473.5,ENSG00000228798


In [17]:
sum(notvalid['gene_ids'] == 'a')

7

In [18]:
sum(notvalid['gene_ids'] != 'a')

203

In [19]:
sum(~adata.var_names.isin(notvalid.index))

19509

In [20]:
adata = adata[:,~adata.var_names.isin(notvalid.index)]

  res = method(*args, **kwargs)


In [21]:
adata

View of AnnData object with n_obs × n_vars = 20091 × 19509
    obs: 'tissue', 'individual', 'CellType'
    var: 'name', 'gene_ids'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap', 'X_tsne'

### Updating gene names from the current reference to Gencode-v38

by checking ENSG ids in Gencode-v38 list, example:  
A -> ENSG01 (ref of the dataset)  
B -> ENSG01 (Gencode-v38)

In [24]:
validlst.index = validlst['gene_ids']

In [25]:
adata.var['gene_symbols'] = adata.var_names
adata.var_names = adata.var['gene_ids']

Trying to set attribute `.var` of view, copying.


In [26]:
validlst

Unnamed: 0_level_0,gene_ids,name,no
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000223972,ENSG00000223972,DDX11L1,5
ENST00000456328,ENST00000456328,DDX11L1-202,2
ENST00000450305,ENST00000450305,DDX11L1-201,2
ENSG00000227232,ENSG00000227232,WASH7P,5
ENST00000488147,ENST00000488147,WASH7P-201,1
...,...,...,...
ENST00000621424,ENST00000621424,ENSG110105435.4-001,4
ENSG00000278625,ENSG00000278625,U6,1
ENST00000616830,ENST00000616830,U6.88-201,1
ENSG00000277374,ENSG00000277374,U1,1


In [27]:
oldsymb = []
newsymb = []
listee = list(validlst['gene_ids'])
for i in tqdm(adata.var['gene_ids']):
    if i in listee:
        symb_val = list(validlst.loc[validlst['gene_ids'] == i, 'name'])
        if symb_val[0].startswith('ENSG'):
            continue
        symb_org = adata.var.at[i, 'gene_symbols']
        symb_dif = list(set(symb_val).difference(set(symb_org)))
        if len(symb_dif) > 0:
            oldsymb.append(symb_org)
            newsymb.append(symb_dif[0])

100%|██████████| 19509/19509 [05:16<00:00, 61.56it/s]


In [28]:
symbols = pd.DataFrame([oldsymb,newsymb]).T

In [29]:
drop_lines = []
for i in range(0,len(symbols)):
    if symbols.iloc[i,0] == symbols.iloc[i,1]:
        drop_lines.append(i)

In [30]:
symbols = symbols.drop(drop_lines).rename(columns = {0: 'oldsymb', 1: 'newsymb'})

In [31]:
symbols = symbols.reset_index().loc[:,['oldsymb', 'newsymb']]

In [32]:
symbols

Unnamed: 0,oldsymb,newsymb
0,FO538757.2,WASH9P
1,AP006222.2,AP006222.1
2,RP4-669L17.10,RP4-669L17.4
3,RP5-857K21.4,RP11-206L10.17
4,RP11-206L10.9,LINC01409
...,...,...
1419,AP001055.6,LINC01678
1420,C21orf2,CFAP410
1421,AL133493.2,LINC01694
1422,AL592528.1,PCBP3-AS1


In [33]:
adata.var

Unnamed: 0_level_0,name,gene_ids,gene_symbols
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000238009,RP11-34P13.7,ENSG00000238009,RP11-34P13.7
ENSG00000279457,FO538757.2,ENSG00000279457,FO538757.2
ENSG00000228463,AP006222.2,ENSG00000228463,AP006222.2
ENSG00000237094,RP4-669L17.10,ENSG00000237094,RP4-669L17.10
ENSG00000230021,RP5-857K21.4,ENSG00000230021,RP5-857K21.4
...,...,...,...
ENSG00000276256,AC011043.1,ENSG00000276256,AC011043.1
ENSG00000273748,AL592183.1,ENSG00000273748,AL592183.1
ENSG00000278817,AC007325.4,ENSG00000278817,AC007325.4
ENSG00000278384,AL354822.1,ENSG00000278384,AL354822.1


In [34]:
symbols

Unnamed: 0,oldsymb,newsymb
0,FO538757.2,WASH9P
1,AP006222.2,AP006222.1
2,RP4-669L17.10,RP4-669L17.4
3,RP5-857K21.4,RP11-206L10.17
4,RP11-206L10.9,LINC01409
...,...,...
1419,AP001055.6,LINC01678
1420,C21orf2,CFAP410
1421,AL133493.2,LINC01694
1422,AL592528.1,PCBP3-AS1


In [35]:
adata.var

Unnamed: 0_level_0,name,gene_ids,gene_symbols
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000238009,RP11-34P13.7,ENSG00000238009,RP11-34P13.7
ENSG00000279457,FO538757.2,ENSG00000279457,FO538757.2
ENSG00000228463,AP006222.2,ENSG00000228463,AP006222.2
ENSG00000237094,RP4-669L17.10,ENSG00000237094,RP4-669L17.10
ENSG00000230021,RP5-857K21.4,ENSG00000230021,RP5-857K21.4
...,...,...,...
ENSG00000276256,AC011043.1,ENSG00000276256,AC011043.1
ENSG00000273748,AL592183.1,ENSG00000273748,AL592183.1
ENSG00000278817,AC007325.4,ENSG00000278817,AC007325.4
ENSG00000278384,AL354822.1,ENSG00000278384,AL354822.1


In [36]:
adata.var['gene_symbols'] = adata.var['name']

In [37]:
for i in tqdm(range(0,len(symbols))):
    idxx = np.where(adata.var['gene_symbols'].isin([symbols['oldsymb'][i]]))[0][0]
    adata.var['gene_symbols'][idxx] = symbols['newsymb'][i]

100%|██████████| 1424/1424 [00:01<00:00, 1069.04it/s]


In [38]:
adata.var['feature_biotype'] = ["gene"] * len(adata.var)

---

---

## 3. Apply the schema requirements:

# <ins>X

Changing donor info:

In [39]:
adata.obs['individual'] = adata.obs['individual'].astype('object')

# <ins>obs

In [40]:
adata.obs.columns

Index(['tissue', 'individual', 'CellType'], dtype='object')

## assay_ontology_term_id

10x 3' v2	"EFO:0009899"  
10x 3' v3	"EFO:0009922"  
10x 5' v1	"EFO:0011025"  
Smart-seq	"EFO:0008930"  
Smart-seq2	"EFO:0008931"

> Single-cell libraries were prepared using the Chromium 3′ v3 platform (10X Genomics) following the manufacturer’s protocol (Menon)

In [41]:
adata.obs['assay_ontology_term_id'] = ["EFO:0009922"] * len(adata.obs)

In [42]:
adata.obs['assay_ontology_term_id']

index
AAACCCAAGTTACTCG-1    EFO:0009922
AAACCCAGTGGAACAC-1    EFO:0009922
AAACGAAAGAGAGGTA-1    EFO:0009922
AAACGAACAGGTTTAC-1    EFO:0009922
AAACGCTCAATGTCTG-1    EFO:0009922
                         ...     
TTTGGAGAGACTTAAG-2    EFO:0009922
TTTGGTTAGATCCCAT-2    EFO:0009922
TTTGGTTCAACCGATT-2    EFO:0009922
TTTGGTTGTTTGGAGG-2    EFO:0009922
TTTGTTGGTACCTTCC-2    EFO:0009922
Name: assay_ontology_term_id, Length: 20091, dtype: object

In [43]:
adata.obs['assay_ontology_term_id'] = pd.Categorical(adata.obs['assay_ontology_term_id'])

In [44]:
adata.obs['assay_ontology_term_id']

index
AAACCCAAGTTACTCG-1    EFO:0009922
AAACCCAGTGGAACAC-1    EFO:0009922
AAACGAAAGAGAGGTA-1    EFO:0009922
AAACGAACAGGTTTAC-1    EFO:0009922
AAACGCTCAATGTCTG-1    EFO:0009922
                         ...     
TTTGGAGAGACTTAAG-2    EFO:0009922
TTTGGTTAGATCCCAT-2    EFO:0009922
TTTGGTTCAACCGATT-2    EFO:0009922
TTTGGTTGTTTGGAGG-2    EFO:0009922
TTTGTTGGTACCTTCC-2    EFO:0009922
Name: assay_ontology_term_id, Length: 20091, dtype: category
Categories (1, object): ['EFO:0009922']

## cell_type_ontology_term_id

In [45]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['CellType']

In [46]:
adata.obs['cell_type_ontology_term_id']

index
AAACCCAAGTTACTCG-1         Rods
AAACCCAGTGGAACAC-1         Rods
AAACGAAAGAGAGGTA-1          ACs
AAACGAACAGGTTTAC-1    Macroglia
AAACGCTCAATGTCTG-1          BPs
                        ...    
TTTGGAGAGACTTAAG-2         Rods
TTTGGTTAGATCCCAT-2    Macroglia
TTTGGTTCAACCGATT-2    Macroglia
TTTGGTTGTTTGGAGG-2         Rods
TTTGTTGGTACCTTCC-2         Rods
Name: cell_type_ontology_term_id, Length: 20091, dtype: category
Categories (9, object): ['ACs', 'BPs', 'Cones', 'Endo', ..., 'Macroglia', 'Microglia', 'RGCs', 'Rods']

In [47]:
list(adata.obs['cell_type_ontology_term_id'].unique())

['Rods',
 'ACs',
 'Macroglia',
 'BPs',
 'HCs',
 'Cones',
 'RGCs',
 'Endo',
 'Microglia']

In [48]:
mapping = {'Rods': 'CL:0000604',
 'ACs': 'CL:0000561',
 'Macroglia': 'CL:0000126',
 'BPs': 'CL:0000748',
 'HCs': 'CL:0000745',
 'Cones': 'CL:0000573',
 'RGCs': 'CL:0000740',
 'Endo': 'CL:0002585',
 'Microglia': 'CL:0000129'}

In [49]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_type_ontology_term_id'].map(mapping)

In [50]:
adata.obs['cell_type_ontology_term_id']

index
AAACCCAAGTTACTCG-1    CL:0000604
AAACCCAGTGGAACAC-1    CL:0000604
AAACGAAAGAGAGGTA-1    CL:0000561
AAACGAACAGGTTTAC-1    CL:0000126
AAACGCTCAATGTCTG-1    CL:0000748
                         ...    
TTTGGAGAGACTTAAG-2    CL:0000604
TTTGGTTAGATCCCAT-2    CL:0000126
TTTGGTTCAACCGATT-2    CL:0000126
TTTGGTTGTTTGGAGG-2    CL:0000604
TTTGTTGGTACCTTCC-2    CL:0000604
Name: cell_type_ontology_term_id, Length: 20091, dtype: category
Categories (9, object): ['CL:0000561', 'CL:0000748', 'CL:0000573', 'CL:0002585', ..., 'CL:0000126', 'CL:0000129', 'CL:0000740', 'CL:0000604']

## developmental_stage_ontology_term_id

In [51]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['individual']

In [52]:
mapping = {555: 'HsapDv:0000162', # 68 years-old
           574: 'HsapDv:0000165', # 71 years-old
           607: 'HsapDv:0000212'} # 86 years-old

In [53]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['development_stage_ontology_term_id'].map(mapping)

In [54]:
adata.obs['development_stage_ontology_term_id']

index
AAACCCAAGTTACTCG-1    HsapDv:0000165
AAACCCAGTGGAACAC-1    HsapDv:0000165
AAACGAAAGAGAGGTA-1    HsapDv:0000165
AAACGAACAGGTTTAC-1    HsapDv:0000165
AAACGCTCAATGTCTG-1    HsapDv:0000165
                           ...      
TTTGGAGAGACTTAAG-2    HsapDv:0000162
TTTGGTTAGATCCCAT-2    HsapDv:0000162
TTTGGTTCAACCGATT-2    HsapDv:0000162
TTTGGTTGTTTGGAGG-2    HsapDv:0000162
TTTGTTGGTACCTTCC-2    HsapDv:0000162
Name: development_stage_ontology_term_id, Length: 20091, dtype: object

## donor_id

In [55]:
adata.obs['donor_id'] = adata.obs['individual']

In [56]:
mapping = {555: 'donor_1', # 68 years-old
           574: 'donor_2', # 71 years-old
           607: 'donor_3'} # 86 years-old

In [57]:
adata.obs['donor_id'] = adata.obs['donor_id'].map(mapping)

In [58]:
adata.obs['donor_id']

index
AAACCCAAGTTACTCG-1    donor_2
AAACCCAGTGGAACAC-1    donor_2
AAACGAAAGAGAGGTA-1    donor_2
AAACGAACAGGTTTAC-1    donor_2
AAACGCTCAATGTCTG-1    donor_2
                       ...   
TTTGGAGAGACTTAAG-2    donor_1
TTTGGTTAGATCCCAT-2    donor_1
TTTGGTTCAACCGATT-2    donor_1
TTTGGTTGTTTGGAGG-2    donor_1
TTTGTTGGTACCTTCC-2    donor_1
Name: donor_id, Length: 20091, dtype: object

## disease_ontology_term_id

In [59]:
adata.obs['disease_ontology_term_id'] =  ['PATO:0000461'] * len(adata.obs)

In [60]:
adata.obs['disease_ontology_term_id']

index
AAACCCAAGTTACTCG-1    PATO:0000461
AAACCCAGTGGAACAC-1    PATO:0000461
AAACGAAAGAGAGGTA-1    PATO:0000461
AAACGAACAGGTTTAC-1    PATO:0000461
AAACGCTCAATGTCTG-1    PATO:0000461
                          ...     
TTTGGAGAGACTTAAG-2    PATO:0000461
TTTGGTTAGATCCCAT-2    PATO:0000461
TTTGGTTCAACCGATT-2    PATO:0000461
TTTGGTTGTTTGGAGG-2    PATO:0000461
TTTGTTGGTACCTTCC-2    PATO:0000461
Name: disease_ontology_term_id, Length: 20091, dtype: object

## ethnicity_ontology_term_id

In [61]:
adata.obs['ethnicity_ontology_term_id'] = ['unknown'] * len(adata.obs)

In [62]:
adata.obs['ethnicity_ontology_term_id']

index
AAACCCAAGTTACTCG-1    unknown
AAACCCAGTGGAACAC-1    unknown
AAACGAAAGAGAGGTA-1    unknown
AAACGAACAGGTTTAC-1    unknown
AAACGCTCAATGTCTG-1    unknown
                       ...   
TTTGGAGAGACTTAAG-2    unknown
TTTGGTTAGATCCCAT-2    unknown
TTTGGTTCAACCGATT-2    unknown
TTTGGTTGTTTGGAGG-2    unknown
TTTGTTGGTACCTTCC-2    unknown
Name: ethnicity_ontology_term_id, Length: 20091, dtype: object

## is_primary_data

In [63]:
adata.obs['is_primary_data'] = [True] * len(adata.obs)

In [64]:
adata.obs['is_primary_data']

index
AAACCCAAGTTACTCG-1    True
AAACCCAGTGGAACAC-1    True
AAACGAAAGAGAGGTA-1    True
AAACGAACAGGTTTAC-1    True
AAACGCTCAATGTCTG-1    True
                      ... 
TTTGGAGAGACTTAAG-2    True
TTTGGTTAGATCCCAT-2    True
TTTGGTTCAACCGATT-2    True
TTTGGTTGTTTGGAGG-2    True
TTTGTTGGTACCTTCC-2    True
Name: is_primary_data, Length: 20091, dtype: bool

In [65]:
adata.obs['is_primary_data'] = adata.obs['is_primary_data'].astype('bool')

## organism_ontology_term_id

In [66]:
adata.obs['organism_ontology_term_id'] = ['NCBITaxon:9606'] * len(adata.obs)

In [67]:
adata.obs['organism_ontology_term_id']

index
AAACCCAAGTTACTCG-1    NCBITaxon:9606
AAACCCAGTGGAACAC-1    NCBITaxon:9606
AAACGAAAGAGAGGTA-1    NCBITaxon:9606
AAACGAACAGGTTTAC-1    NCBITaxon:9606
AAACGCTCAATGTCTG-1    NCBITaxon:9606
                           ...      
TTTGGAGAGACTTAAG-2    NCBITaxon:9606
TTTGGTTAGATCCCAT-2    NCBITaxon:9606
TTTGGTTCAACCGATT-2    NCBITaxon:9606
TTTGGTTGTTTGGAGG-2    NCBITaxon:9606
TTTGTTGGTACCTTCC-2    NCBITaxon:9606
Name: organism_ontology_term_id, Length: 20091, dtype: object

In [68]:
adata.obs['organism_ontology_term_id'] = pd.Categorical(adata.obs['organism_ontology_term_id'])

In [69]:
adata.obs['organism_ontology_term_id']

index
AAACCCAAGTTACTCG-1    NCBITaxon:9606
AAACCCAGTGGAACAC-1    NCBITaxon:9606
AAACGAAAGAGAGGTA-1    NCBITaxon:9606
AAACGAACAGGTTTAC-1    NCBITaxon:9606
AAACGCTCAATGTCTG-1    NCBITaxon:9606
                           ...      
TTTGGAGAGACTTAAG-2    NCBITaxon:9606
TTTGGTTAGATCCCAT-2    NCBITaxon:9606
TTTGGTTCAACCGATT-2    NCBITaxon:9606
TTTGGTTGTTTGGAGG-2    NCBITaxon:9606
TTTGTTGGTACCTTCC-2    NCBITaxon:9606
Name: organism_ontology_term_id, Length: 20091, dtype: category
Categories (1, object): ['NCBITaxon:9606']

## sex_ontology_term_id

In [70]:
adata.obs['sex_ontology_term_id'] = ['PATO:0000384'] * len(adata.obs)

**PATO:0000384 for male  
PATO:0000383 for female**

In [71]:
adata.obs['sex_ontology_term_id']

index
AAACCCAAGTTACTCG-1    PATO:0000384
AAACCCAGTGGAACAC-1    PATO:0000384
AAACGAAAGAGAGGTA-1    PATO:0000384
AAACGAACAGGTTTAC-1    PATO:0000384
AAACGCTCAATGTCTG-1    PATO:0000384
                          ...     
TTTGGAGAGACTTAAG-2    PATO:0000384
TTTGGTTAGATCCCAT-2    PATO:0000384
TTTGGTTCAACCGATT-2    PATO:0000384
TTTGGTTGTTTGGAGG-2    PATO:0000384
TTTGTTGGTACCTTCC-2    PATO:0000384
Name: sex_ontology_term_id, Length: 20091, dtype: object

## suspension_type

In [72]:
adata.obs['suspension_type'] = ['cell'] * len(adata.obs)

## tissue_ontology_term_id

In [73]:
adata.obs['tissue_ontology_term_id'] = adata.obs['tissue']

In [74]:
adata.obs['tissue_ontology_term_id']

index
AAACCCAAGTTACTCG-1    MR2
AAACCCAGTGGAACAC-1    MR2
AAACGAAAGAGAGGTA-1    MR2
AAACGAACAGGTTTAC-1    MR2
AAACGCTCAATGTCTG-1    MR2
                     ... 
TTTGGAGAGACTTAAG-2     PR
TTTGGTTAGATCCCAT-2     PR
TTTGGTTCAACCGATT-2     PR
TTTGGTTGTTTGGAGG-2     PR
TTTGTTGGTACCTTCC-2     PR
Name: tissue_ontology_term_id, Length: 20091, dtype: category
Categories (6, object): ['MR', 'MR2', 'MR3', 'PR', 'PR2', 'PR3']

In [75]:
mapping = {'MR': 'UBERON:0000053', 'MR2': 'UBERON:0000053', 'MR3': 'UBERON:0000053', 
           'PR': 'UBERON:0013682', 'PR2': 'UBERON:0013682', 'PR3': 'UBERON:0013682'}

In [76]:
adata.obs['tissue_ontology_term_id'] = adata.obs['tissue_ontology_term_id'].map(mapping)

In [77]:
adata.obs['tissue_ontology_term_id']

index
AAACCCAAGTTACTCG-1    UBERON:0000053
AAACCCAGTGGAACAC-1    UBERON:0000053
AAACGAAAGAGAGGTA-1    UBERON:0000053
AAACGAACAGGTTTAC-1    UBERON:0000053
AAACGCTCAATGTCTG-1    UBERON:0000053
                           ...      
TTTGGAGAGACTTAAG-2    UBERON:0013682
TTTGGTTAGATCCCAT-2    UBERON:0013682
TTTGGTTCAACCGATT-2    UBERON:0013682
TTTGGTTGTTTGGAGG-2    UBERON:0013682
TTTGTTGGTACCTTCC-2    UBERON:0013682
Name: tissue_ontology_term_id, Length: 20091, dtype: object

---

# <ins>var and raw.var

In [78]:
adata.var

Unnamed: 0_level_0,name,gene_ids,gene_symbols,feature_biotype
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000238009,RP11-34P13.7,ENSG00000238009,RP11-34P13.7,gene
ENSG00000279457,FO538757.2,ENSG00000279457,WASH9P,gene
ENSG00000228463,AP006222.2,ENSG00000228463,AP006222.1,gene
ENSG00000237094,RP4-669L17.10,ENSG00000237094,RP4-669L17.4,gene
ENSG00000230021,RP5-857K21.4,ENSG00000230021,RP11-206L10.17,gene
...,...,...,...,...
ENSG00000276256,AC011043.1,ENSG00000276256,AC011043.1,gene
ENSG00000273748,AL592183.1,ENSG00000273748,AL592183.1,gene
ENSG00000278817,AC007325.4,ENSG00000278817,AC007325.4,gene
ENSG00000278384,AL354822.1,ENSG00000278384,AL354822.1,gene


## feature_biotype

Adjusted at the beginning!

In [79]:
#adata2.var['feature_biotype'] = ['gene'] * len(adata2.var.index)

## feature_id

Adjusted at the beginning!

In [80]:
#adata2.var_names = adata2.var["gene_ids"].copy()
#adata2.var_names.name = "feature_id"
#adata2.raw.var_names = adata2.raw.var["gene_ids"].copy()
#adata2.raw.var_names.name = "feature_id"

## feature_is_filtered

True if feature was filtered out in final matrix (X)  
Problem: This should be annotated in 'var', not 'raw.var'. __How to annotate the gene if it is already filtered out in the final matrix?__

In [81]:
adata.var['feature_is_filtered'] = [False] * len(adata.var)

In [82]:
adata.var['feature_is_filtered']

gene_ids
ENSG00000238009    False
ENSG00000279457    False
ENSG00000228463    False
ENSG00000237094    False
ENSG00000230021    False
                   ...  
ENSG00000276256    False
ENSG00000273748    False
ENSG00000278817    False
ENSG00000278384    False
ENSG00000271254    False
Name: feature_is_filtered, Length: 19509, dtype: bool

# <ins>obsm

In [83]:
adata.obsm

AxisArrays with keys: X_pca, X_pca_harmony, X_umap, X_tsne

# <ins>uns

In [84]:
adata.uns['schema_version'] = '2.0.0'

In [85]:
adata.uns['title'] = 'Retina' # or Single-cell transcriptomic atlas of the human retina

In [96]:
adata.uns['X_normalization'] = 'none'

In [87]:
adata.uns['default_embedding'] = 'X_umap'

---

---

# 4. Final adjustments

Final adjustments include the removal of the slots that were already re-created for the schema.

In [88]:
adata

AnnData object with n_obs × n_vars = 20091 × 19509
    obs: 'tissue', 'individual', 'CellType', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'donor_id', 'disease_ontology_term_id', 'ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'suspension_type', 'tissue_ontology_term_id'
    var: 'name', 'gene_ids', 'gene_symbols', 'feature_biotype', 'feature_is_filtered'
    uns: 'schema_version', 'title', 'X_normalization', 'default_embedding'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap', 'X_tsne'

In [89]:
adata.obs

Unnamed: 0_level_0,tissue,individual,CellType,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,ethnicity_ontology_term_id,is_primary_data,organism_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_ontology_term_id
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
AAACCCAAGTTACTCG-1,MR2,574,Rods,EFO:0009922,CL:0000604,HsapDv:0000165,donor_2,PATO:0000461,unknown,True,NCBITaxon:9606,PATO:0000384,cell,UBERON:0000053
AAACCCAGTGGAACAC-1,MR2,574,Rods,EFO:0009922,CL:0000604,HsapDv:0000165,donor_2,PATO:0000461,unknown,True,NCBITaxon:9606,PATO:0000384,cell,UBERON:0000053
AAACGAAAGAGAGGTA-1,MR2,574,ACs,EFO:0009922,CL:0000561,HsapDv:0000165,donor_2,PATO:0000461,unknown,True,NCBITaxon:9606,PATO:0000384,cell,UBERON:0000053
AAACGAACAGGTTTAC-1,MR2,574,Macroglia,EFO:0009922,CL:0000126,HsapDv:0000165,donor_2,PATO:0000461,unknown,True,NCBITaxon:9606,PATO:0000384,cell,UBERON:0000053
AAACGCTCAATGTCTG-1,MR2,574,BPs,EFO:0009922,CL:0000748,HsapDv:0000165,donor_2,PATO:0000461,unknown,True,NCBITaxon:9606,PATO:0000384,cell,UBERON:0000053
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGGAGAGACTTAAG-2,PR,555,Rods,EFO:0009922,CL:0000604,HsapDv:0000162,donor_1,PATO:0000461,unknown,True,NCBITaxon:9606,PATO:0000384,cell,UBERON:0013682
TTTGGTTAGATCCCAT-2,PR,555,Macroglia,EFO:0009922,CL:0000126,HsapDv:0000162,donor_1,PATO:0000461,unknown,True,NCBITaxon:9606,PATO:0000384,cell,UBERON:0013682
TTTGGTTCAACCGATT-2,PR,555,Macroglia,EFO:0009922,CL:0000126,HsapDv:0000162,donor_1,PATO:0000461,unknown,True,NCBITaxon:9606,PATO:0000384,cell,UBERON:0013682
TTTGGTTGTTTGGAGG-2,PR,555,Rods,EFO:0009922,CL:0000604,HsapDv:0000162,donor_1,PATO:0000461,unknown,True,NCBITaxon:9606,PATO:0000384,cell,UBERON:0013682


In [90]:
# Removing the same slots created for the schema
del adata.obs['tissue']
del adata.obs['individual']

In [91]:
adata

AnnData object with n_obs × n_vars = 20091 × 19509
    obs: 'CellType', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'donor_id', 'disease_ontology_term_id', 'ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'suspension_type', 'tissue_ontology_term_id'
    var: 'name', 'gene_ids', 'gene_symbols', 'feature_biotype', 'feature_is_filtered'
    uns: 'schema_version', 'title', 'X_normalization', 'default_embedding'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap', 'X_tsne'

In [94]:
del adata.var['gene_ids']

In [92]:
adata.X = scipy.sparse.csr_matrix(adata.X)

In [97]:
adata.write('menon19_f.h5ad', compression = 'gzip')