# **Curating `voigt19.processed.h5ad` dataset:**

* includes ~4K cells
* retina

# <ins>STEPS:</ins>

## 1. Load the dataset 

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import scipy
from tqdm import tqdm

Do not forget `mount-farm`

In [2]:
adata = sc.read('/lustre/scratch117/cellgen/cellgeni/covid19cellatlas/downloaded/voigt19.processed.h5ad')

In [3]:
adata

AnnData object with n_obs × n_vars = 4335 × 21040
    obs: 'batch', 'final_cluster_labels', 'library', 'Location', 'CellType', 'Donor'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'leiden', 'neighbors_hm', 'pca'
    obsm: 'X_umap_hm'
    varm: 'PCs'

## 2. Pre-prepare before schema adjustments

### Load raw matrix

In [4]:
import glob

path = r'/lustre/scratch117/cellgen/cellgeni/covid19cellatlas/raw/voigt19/PNAS' # use your path
all_files = glob.glob(path + "/*.csv")
all_files.sort()

raw2 = []

for filename in tqdm(all_files):
    df = pd.read_csv(filename, index_col=None, header=0)
    if genenames == []:
        genenames = list(df.iloc[:,0])
    df = df.iloc[:,1:]
    cellnames.append(list(df.columns))
    df = scipy.sparse.csr_matrix(df)
    raw2.append(df)

raw = scipy.sparse.hstack(raw2)
raw = scipy.sparse.csr_matrix(raw)

100%|██████████| 6/6 [00:05<00:00,  1.04it/s]


In [7]:
adata.var

Unnamed: 0_level_0,highly_variable,means,dispersions,dispersions_norm
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
RP11-34P13.7,False,0.006743,1.271277,0.401708
RP11-34P13.8,False,0.000423,0.164355,-1.290357
AL627309.1,False,0.016197,0.941432,-0.102501
AP006222.2,False,0.114638,0.959162,-0.075398
RP4-669L17.10,False,0.012195,1.257201,0.380192
...,...,...,...,...
AC011841.1,False,0.002423,0.160377,-1.296439
AL354822.1,False,0.009853,0.862335,-0.223410
KIR2DL2,True,0.040955,1.787913,1.191449
PNRC2.1,False,0.103337,1.110076,0.155293


In [8]:
adata

AnnData object with n_obs × n_vars = 4335 × 21040
    obs: 'batch', 'final_cluster_labels', 'library', 'Location', 'CellType', 'Donor'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'leiden', 'neighbors_hm', 'pca'
    obsm: 'X_umap_hm'
    varm: 'PCs'

In [9]:
raw

<21040x4335 sparse matrix of type '<class 'numpy.int64'>'
	with 9811304 stored elements in Compressed Sparse Row format>

In [10]:
raw = sc.AnnData(X = raw.T, var = adata.var)

### Convert gene symbols to ENSG ids by using the reference gene list of this dataset

In [11]:
sym2ensg = pd.read_table('../hg19-3.0.0.gene_names.tsv', header = None, names = ['ensg_id', 'gene_symbol'])

validlst = pd.read_csv('~/genes_homo_sapiens.csv', header = None, names = ['gene_ids', 'name','no'])

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [12]:
sym2ensg.index = sym2ensg['gene_symbol']

validlst.index = validlst['name']

In [13]:
adata.var['gene_ids'] = ['a'] * len(adata.var)

In [14]:
adata.var

Unnamed: 0_level_0,highly_variable,means,dispersions,dispersions_norm,gene_ids
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
RP11-34P13.7,False,0.006743,1.271277,0.401708,a
RP11-34P13.8,False,0.000423,0.164355,-1.290357,a
AL627309.1,False,0.016197,0.941432,-0.102501,a
AP006222.2,False,0.114638,0.959162,-0.075398,a
RP4-669L17.10,False,0.012195,1.257201,0.380192,a
...,...,...,...,...,...
AC011841.1,False,0.002423,0.160377,-1.296439,a
AL354822.1,False,0.009853,0.862335,-0.223410,a
KIR2DL2,True,0.040955,1.787913,1.191449,a
PNRC2.1,False,0.103337,1.110076,0.155293,a


In [15]:
skippd = [] # genes not in sym2ensg
multensg = [] # genes that have multiple ensg's from sym2ensg
all_ensg = list(validlst['gene_ids'])
for i in tqdm(adata.var_names):
    if not i in list(sym2ensg['gene_symbol']):
        skippd.append(i)
        continue
        
    ensg_id = sym2ensg.at[i, 'ensg_id']
    
    if type(ensg_id) == pd.core.series.Series:
        multensg.append(i)
        poss = list(set(sym2ensg.at[i, 'ensg_id']).intersection(all_ensg))
        if len(poss) > 0:
            ensg_id = list(set(sym2ensg.at[i, 'ensg_id']).intersection(all_ensg))[0]
        else:
            continue

    adata.var.at[i, 'gene_ids'] = ensg_id

100%|██████████| 21040/21040 [00:52<00:00, 398.08it/s]


### Remove the genes that do not match in gene-ensemblID file

In [16]:
notvalid = adata.var[adata.var['gene_ids'].isin(list(set(adata.var['gene_ids']).difference(set(validlst['gene_ids']))))]

In [17]:
notvalid

Unnamed: 0_level_0,highly_variable,means,dispersions,dispersions_norm,gene_ids
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AL627309.1,False,0.016197,0.941432,-0.102501,ENSG00000237683
RP11-206L10.3,False,0.017483,0.941532,-0.102348,ENSG00000235373
AL590822.2,False,0.004417,0.529589,-0.732053,ENSG00000269554
AL590822.1,False,0.027071,0.853901,-0.236302,ENSG00000203301
AL589739.1,False,0.009814,0.747019,-0.399685,ENSG00000269753
...,...,...,...,...,...
AC011841.1,False,0.002423,0.160377,-1.296439,ENSG00000212884
AL354822.1,False,0.009853,0.862335,-0.223410,ENSG00000215615
KIR2DL2,True,0.040955,1.787913,1.191449,ENSG00000215764
PNRC2.1,False,0.103337,1.110076,0.155293,a


In [18]:
sum(notvalid['gene_ids'] == 'a')

47

In [19]:
sum(notvalid['gene_ids'] != 'a')

750

In [20]:
sum(~adata.var_names.isin(notvalid.index))

20243

In [21]:
adata = adata[:,~adata.var_names.isin(notvalid.index)]

  res = method(*args, **kwargs)


In [22]:
adata

View of AnnData object with n_obs × n_vars = 4335 × 20243
    obs: 'batch', 'final_cluster_labels', 'library', 'Location', 'CellType', 'Donor'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'gene_ids'
    uns: 'leiden', 'neighbors_hm', 'pca'
    obsm: 'X_umap_hm'
    varm: 'PCs'

In [23]:
raw

AnnData object with n_obs × n_vars = 4335 × 21040
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm'

In [31]:
raw = raw[:,adata.var_names]

In [32]:
raw

View of AnnData object with n_obs × n_vars = 4335 × 20243
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm'

### Updating gene names from the current reference to Gencode-v38

by checking ENSG ids in Gencode-v38 list, example:  
A -> ENSG01 (ref of the dataset)  
B -> ENSG01 (Gencode-v38)

In [33]:
validlst.index = validlst['gene_ids']

In [34]:
adata.var['gene_symbols'] = adata.var_names
adata.var_names = adata.var['gene_ids']

Trying to set attribute `.var` of view, copying.


In [35]:
validlst

Unnamed: 0_level_0,gene_ids,name,no
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000223972,ENSG00000223972,DDX11L1,5
ENST00000456328,ENST00000456328,DDX11L1-202,2
ENST00000450305,ENST00000450305,DDX11L1-201,2
ENSG00000227232,ENSG00000227232,WASH7P,5
ENST00000488147,ENST00000488147,WASH7P-201,1
...,...,...,...
ENST00000621424,ENST00000621424,ENSG110105435.4-001,4
ENSG00000278625,ENSG00000278625,U6,1
ENST00000616830,ENST00000616830,U6.88-201,1
ENSG00000277374,ENSG00000277374,U1,1


In [36]:
oldsymb = []
newsymb = []
listee = list(validlst['gene_ids'])
for i in tqdm(adata.var['gene_ids']):
    if i in listee:
        symb_val = list(validlst.loc[validlst['gene_ids'] == i, 'name'])
        symb_org = adata.var.at[i, 'gene_symbols']
        symb_dif = list(set(symb_val).difference(set(symb_org)))
        if len(symb_dif) > 0:
            oldsymb.append(symb_org)
            newsymb.append(symb_dif[0])

100%|██████████| 20243/20243 [05:18<00:00, 63.61it/s]


In [37]:
symbols = pd.DataFrame([oldsymb,newsymb]).T

In [38]:
drop_lines = []
for i in range(0,len(symbols)):
    if symbols.iloc[i,0] == symbols.iloc[i,1]:
        drop_lines.append(i)

In [39]:
symbols = symbols.drop(drop_lines).rename(columns = {0: 'oldsymb', 1: 'newsymb'})

In [40]:
symbols = symbols.reset_index().loc[:,['oldsymb', 'newsymb']]

In [41]:
symbols

Unnamed: 0,oldsymb,newsymb
0,AP006222.2,AP006222.1
1,RP4-669L17.10,RP4-669L17.4
2,RP5-857K21.4,RP11-206L10.17
3,RP11-206L10.9,LINC01409
4,RP11-54O7.3,LINC02593
...,...,...
2141,C21orf2,CFAP410
2142,AP001065.15,LINC02575
2143,C21orf90,TSPEAR-AS2
2144,C21orf67,LINC01547


In [42]:
adata.var

Unnamed: 0_level_0,highly_variable,means,dispersions,dispersions_norm,gene_ids,gene_symbols
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000238009,False,0.006743,1.271277,0.401708,ENSG00000238009,RP11-34P13.7
ENSG00000239945,False,0.000423,0.164355,-1.290357,ENSG00000239945,RP11-34P13.8
ENSG00000228463,False,0.114638,0.959162,-0.075398,ENSG00000228463,AP006222.2
ENSG00000237094,False,0.012195,1.257201,0.380192,ENSG00000237094,RP4-669L17.10
ENSG00000230021,False,0.001037,0.068055,-1.437565,ENSG00000230021,RP5-857K21.4
...,...,...,...,...,...,...
ENSG00000212907,False,1.958249,2.255139,-0.224573,ENSG00000212907,MT-ND4L
ENSG00000198886,False,4.737793,4.751992,-1.012088,ENSG00000198886,MT-ND4
ENSG00000198786,False,3.180222,3.193242,0.515346,ENSG00000198786,MT-ND5
ENSG00000198695,False,0.457886,1.415811,0.065355,ENSG00000198695,MT-ND6


In [43]:
del adata.var['gene_ids']

In [44]:
for i in tqdm(range(0,len(symbols))):
    idxx = np.where(adata.var['gene_symbols'].isin([symbols['oldsymb'][i]]))[0][0]
    adata.var['gene_symbols'][idxx] = symbols['newsymb'][i]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adata.var['gene_symbols'][idxx] = symbols['newsymb'][i]
100%|██████████| 2146/2146 [00:02<00:00, 1023.99it/s]


In [48]:
raw.X = scipy.sparse.csr_matrix(raw.X)

In [50]:
del raw.var

In [51]:
adata.var['feature_biotype'] = ["gene"] * len(adata.var)

In [52]:
raw.var['feature_biotype'] = ["gene"] * len(raw.var)

In [133]:
raw.X = scipy.sparse.csr_matrix(raw.X)

In [129]:
raw.var_names = adata.var_names

In [134]:
adata.raw = raw

---

---

## 3. Apply the schema requirements:

# <ins>X

Raw and normal count matrices are in the object, no additional adjustment is needed!

# <ins>obs

In [55]:
adata.obs.columns

Index(['batch', 'final_cluster_labels', 'library', 'Location', 'CellType',
       'Donor'],
      dtype='object')

## assay_ontology_term_id

> Single cells were barcoded with the chromium system using the v3 single-cell reagent kit (10x Genomics).  

v3 only exists in 10x 3'! -> "EFO:0009922"

In [56]:
adata.obs['assay_ontology_term_id'] = ["EFO:0009922"] * len(adata.obs)

In [57]:
adata.obs['assay_ontology_term_id']

index
AAACCCATCAAGGTGG-1-macula_donor_1        EFO:0009922
AAACCCATCTCTCTTC-1-macula_donor_1        EFO:0009922
AAAGGATAGTTCCGGC-1-macula_donor_1        EFO:0009922
AAAGGGCAGACAGCTG-1-macula_donor_1        EFO:0009922
AAAGGTATCTGGAAGG-1-macula_donor_1        EFO:0009922
                                            ...     
TTTCGATAGAGATTCA-6-peripheral_donor_3    EFO:0009922
TTTGACTCAACAGAGC-6-peripheral_donor_3    EFO:0009922
TTTGATCAGAGATGCC-6-peripheral_donor_3    EFO:0009922
TTTGTTGCAGTCCGTG-6-peripheral_donor_3    EFO:0009922
TTTGTTGCATTAAAGG-6-peripheral_donor_3    EFO:0009922
Name: assay_ontology_term_id, Length: 4335, dtype: object

In [58]:
adata.obs['assay_ontology_term_id'] = pd.Categorical(adata.obs['assay_ontology_term_id'])

In [59]:
adata.obs['assay_ontology_term_id']

index
AAACCCATCAAGGTGG-1-macula_donor_1        EFO:0009922
AAACCCATCTCTCTTC-1-macula_donor_1        EFO:0009922
AAAGGATAGTTCCGGC-1-macula_donor_1        EFO:0009922
AAAGGGCAGACAGCTG-1-macula_donor_1        EFO:0009922
AAAGGTATCTGGAAGG-1-macula_donor_1        EFO:0009922
                                            ...     
TTTCGATAGAGATTCA-6-peripheral_donor_3    EFO:0009922
TTTGACTCAACAGAGC-6-peripheral_donor_3    EFO:0009922
TTTGATCAGAGATGCC-6-peripheral_donor_3    EFO:0009922
TTTGTTGCAGTCCGTG-6-peripheral_donor_3    EFO:0009922
TTTGTTGCATTAAAGG-6-peripheral_donor_3    EFO:0009922
Name: assay_ontology_term_id, Length: 4335, dtype: category
Categories (1, object): ['EFO:0009922']

## cell_type_ontology_term_id

In [60]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['CellType']

In [61]:
adata.obs['cell_type_ontology_term_id']

index
AAACCCATCAAGGTGG-1-macula_donor_1         T/NK-cell
AAACCCATCTCTCTTC-1-macula_donor_1        Macrophage
AAAGGATAGTTCCGGC-1-macula_donor_1         T/NK-cell
AAAGGGCAGACAGCTG-1-macula_donor_1         T/NK-cell
AAAGGTATCTGGAAGG-1-macula_donor_1         T/NK-cell
                                            ...    
TTTCGATAGAGATTCA-6-peripheral_donor_3     T/NK-cell
TTTGACTCAACAGAGC-6-peripheral_donor_3      Pericyte
TTTGATCAGAGATGCC-6-peripheral_donor_3    Fibroblast
TTTGTTGCAGTCCGTG-6-peripheral_donor_3     T/NK-cell
TTTGTTGCATTAAAGG-6-peripheral_donor_3    Melanocyte
Name: cell_type_ontology_term_id, Length: 4335, dtype: category
Categories (11, object): ['Schwann1', 'Schwann2', 'Melanocyte', 'Endothelial', ..., 'B-cell', 'T/NK-cell', 'Macrophage', 'Mast-cell']

In [62]:
list(adata.obs['cell_type_ontology_term_id'].unique())

['T/NK-cell',
 'Macrophage',
 'Melanocyte',
 'Schwann2',
 'Pericyte',
 'Fibroblast',
 'Endothelial',
 'B-cell',
 'RPE',
 'Mast-cell',
 'Schwann1']

In [63]:
mapping = {'T/NK-cell': 'CL:0000814',
 'Macrophage': 'CL:0000235',
 'Melanocyte': 'CL:0000148',
 'Schwann2': 'CL:0002573',
 'Pericyte': 'CL:0000669',
 'Fibroblast': 'CL:0000057',
 'Endothelial': 'CL:0000115',
 'B-cell': 'CL:0000236',
 'RPE': 'CL:0002586',
 'Mast-cell': 'CL:0000097',
 'Schwann1': 'CL:0002573'}

In [64]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_type_ontology_term_id'].map(mapping)

In [65]:
adata.obs['cell_type_ontology_term_id']

index
AAACCCATCAAGGTGG-1-macula_donor_1        CL:0000814
AAACCCATCTCTCTTC-1-macula_donor_1        CL:0000235
AAAGGATAGTTCCGGC-1-macula_donor_1        CL:0000814
AAAGGGCAGACAGCTG-1-macula_donor_1        CL:0000814
AAAGGTATCTGGAAGG-1-macula_donor_1        CL:0000814
                                            ...    
TTTCGATAGAGATTCA-6-peripheral_donor_3    CL:0000814
TTTGACTCAACAGAGC-6-peripheral_donor_3    CL:0000669
TTTGATCAGAGATGCC-6-peripheral_donor_3    CL:0000057
TTTGTTGCAGTCCGTG-6-peripheral_donor_3    CL:0000814
TTTGTTGCATTAAAGG-6-peripheral_donor_3    CL:0000148
Name: cell_type_ontology_term_id, Length: 4335, dtype: object

## development_stage_ontology_term_id

In [109]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['Donor']

In [110]:
list(adata.obs['development_stage_ontology_term_id'].unique())

['donor_1', 'donor_2', 'donor_3']

In [111]:
mapping = {'donor_1': 'HsapDv:0000148',
 'donor_2': 'HsapDv:0000208',
 'donor_3': 'HsapDv:0000173'}

In [112]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['development_stage_ontology_term_id'].map(mapping)

In [113]:
adata.obs['development_stage_ontology_term_id']

index
AAACCCATCAAGGTGG-1-macula_donor_1        HsapDv:0000148
AAACCCATCTCTCTTC-1-macula_donor_1        HsapDv:0000148
AAAGGATAGTTCCGGC-1-macula_donor_1        HsapDv:0000148
AAAGGGCAGACAGCTG-1-macula_donor_1        HsapDv:0000148
AAAGGTATCTGGAAGG-1-macula_donor_1        HsapDv:0000148
                                              ...      
TTTCGATAGAGATTCA-6-peripheral_donor_3    HsapDv:0000173
TTTGACTCAACAGAGC-6-peripheral_donor_3    HsapDv:0000173
TTTGATCAGAGATGCC-6-peripheral_donor_3    HsapDv:0000173
TTTGTTGCAGTCCGTG-6-peripheral_donor_3    HsapDv:0000173
TTTGTTGCATTAAAGG-6-peripheral_donor_3    HsapDv:0000173
Name: development_stage_ontology_term_id, Length: 4335, dtype: category
Categories (3, object): ['HsapDv:0000148', 'HsapDv:0000208', 'HsapDv:0000173']

## donor_id

In [102]:
adata.obs['donor_id'] = adata.obs['Donor']

## disease_ontology_term_id

In [137]:
adata.obs['disease_ontology_term_id'] = adata.obs['Donor_id']

In [138]:
adata.obs['disease_ontology_term_id']

index
AAACCCATCAAGGTGG-1-macula_donor_1        donor_1
AAACCCATCTCTCTTC-1-macula_donor_1        donor_1
AAAGGATAGTTCCGGC-1-macula_donor_1        donor_1
AAAGGGCAGACAGCTG-1-macula_donor_1        donor_1
AAAGGTATCTGGAAGG-1-macula_donor_1        donor_1
                                          ...   
TTTCGATAGAGATTCA-6-peripheral_donor_3    donor_3
TTTGACTCAACAGAGC-6-peripheral_donor_3    donor_3
TTTGATCAGAGATGCC-6-peripheral_donor_3    donor_3
TTTGTTGCAGTCCGTG-6-peripheral_donor_3    donor_3
TTTGTTGCATTAAAGG-6-peripheral_donor_3    donor_3
Name: disease_ontology_term_id, Length: 4335, dtype: category
Categories (3, object): ['donor_1', 'donor_2', 'donor_3']

In [139]:
mapping = {'donor_1': 'MONDO:0005129', #ophthalmic note: mild cataract
 'donor_2': 'MONDO:0007472', #ophthalmic note: thick basal laminar deposits
 'donor_3': 'MONDO:0012419'} #ophthalmic note: neovascular amd

In [140]:
adata.obs['disease_ontology_term_id'] = adata.obs['disease_ontology_term_id'].map(mapping)

## ethnicity_ontology_term_id

In [75]:
adata.obs['ethnicity_ontology_term_id'] = ['unknown'] * len(adata.obs)

In [76]:
adata.obs['ethnicity_ontology_term_id']

index
AAACCCATCAAGGTGG-1-macula_donor_1        unknown
AAACCCATCTCTCTTC-1-macula_donor_1        unknown
AAAGGATAGTTCCGGC-1-macula_donor_1        unknown
AAAGGGCAGACAGCTG-1-macula_donor_1        unknown
AAAGGTATCTGGAAGG-1-macula_donor_1        unknown
                                          ...   
TTTCGATAGAGATTCA-6-peripheral_donor_3    unknown
TTTGACTCAACAGAGC-6-peripheral_donor_3    unknown
TTTGATCAGAGATGCC-6-peripheral_donor_3    unknown
TTTGTTGCAGTCCGTG-6-peripheral_donor_3    unknown
TTTGTTGCATTAAAGG-6-peripheral_donor_3    unknown
Name: ethnicity_ontology_term_id, Length: 4335, dtype: object

## is_primary_data

In [77]:
adata.obs['is_primary_data'] = [True] * len(adata.obs)

In [78]:
adata.obs['is_primary_data']

index
AAACCCATCAAGGTGG-1-macula_donor_1        True
AAACCCATCTCTCTTC-1-macula_donor_1        True
AAAGGATAGTTCCGGC-1-macula_donor_1        True
AAAGGGCAGACAGCTG-1-macula_donor_1        True
AAAGGTATCTGGAAGG-1-macula_donor_1        True
                                         ... 
TTTCGATAGAGATTCA-6-peripheral_donor_3    True
TTTGACTCAACAGAGC-6-peripheral_donor_3    True
TTTGATCAGAGATGCC-6-peripheral_donor_3    True
TTTGTTGCAGTCCGTG-6-peripheral_donor_3    True
TTTGTTGCATTAAAGG-6-peripheral_donor_3    True
Name: is_primary_data, Length: 4335, dtype: bool

In [79]:
adata.obs['is_primary_data'] = adata.obs['is_primary_data'].astype('bool')

## organism_ontology_term_id

In [80]:
adata.obs['organism_ontology_term_id'] = ['NCBITaxon:9606'] * len(adata.obs)

In [81]:
adata.obs['organism_ontology_term_id']

index
AAACCCATCAAGGTGG-1-macula_donor_1        NCBITaxon:9606
AAACCCATCTCTCTTC-1-macula_donor_1        NCBITaxon:9606
AAAGGATAGTTCCGGC-1-macula_donor_1        NCBITaxon:9606
AAAGGGCAGACAGCTG-1-macula_donor_1        NCBITaxon:9606
AAAGGTATCTGGAAGG-1-macula_donor_1        NCBITaxon:9606
                                              ...      
TTTCGATAGAGATTCA-6-peripheral_donor_3    NCBITaxon:9606
TTTGACTCAACAGAGC-6-peripheral_donor_3    NCBITaxon:9606
TTTGATCAGAGATGCC-6-peripheral_donor_3    NCBITaxon:9606
TTTGTTGCAGTCCGTG-6-peripheral_donor_3    NCBITaxon:9606
TTTGTTGCATTAAAGG-6-peripheral_donor_3    NCBITaxon:9606
Name: organism_ontology_term_id, Length: 4335, dtype: object

In [82]:
adata.obs['organism_ontology_term_id'] = pd.Categorical(adata.obs['organism_ontology_term_id'])

In [83]:
adata.obs['organism_ontology_term_id']

index
AAACCCATCAAGGTGG-1-macula_donor_1        NCBITaxon:9606
AAACCCATCTCTCTTC-1-macula_donor_1        NCBITaxon:9606
AAAGGATAGTTCCGGC-1-macula_donor_1        NCBITaxon:9606
AAAGGGCAGACAGCTG-1-macula_donor_1        NCBITaxon:9606
AAAGGTATCTGGAAGG-1-macula_donor_1        NCBITaxon:9606
                                              ...      
TTTCGATAGAGATTCA-6-peripheral_donor_3    NCBITaxon:9606
TTTGACTCAACAGAGC-6-peripheral_donor_3    NCBITaxon:9606
TTTGATCAGAGATGCC-6-peripheral_donor_3    NCBITaxon:9606
TTTGTTGCAGTCCGTG-6-peripheral_donor_3    NCBITaxon:9606
TTTGTTGCATTAAAGG-6-peripheral_donor_3    NCBITaxon:9606
Name: organism_ontology_term_id, Length: 4335, dtype: category
Categories (1, object): ['NCBITaxon:9606']

## sex_ontology_term_id

In [84]:
adata.obs['sex_ontology_term_id'] = adata.obs['Donor']

**PATO:0000384 for male  
PATO:0000383 for female**

In [85]:
mapping = {'donor_1': 'PATO:0000384',
 'donor_2': 'PATO:0000383',
 'donor_3': 'PATO:0000384'}

In [86]:
adata.obs['sex_ontology_term_id'] = adata.obs['sex_ontology_term_id'].map(mapping)

In [87]:
adata.obs['sex_ontology_term_id']

index
AAACCCATCAAGGTGG-1-macula_donor_1        PATO:0000384
AAACCCATCTCTCTTC-1-macula_donor_1        PATO:0000384
AAAGGATAGTTCCGGC-1-macula_donor_1        PATO:0000384
AAAGGGCAGACAGCTG-1-macula_donor_1        PATO:0000384
AAAGGTATCTGGAAGG-1-macula_donor_1        PATO:0000384
                                             ...     
TTTCGATAGAGATTCA-6-peripheral_donor_3    PATO:0000384
TTTGACTCAACAGAGC-6-peripheral_donor_3    PATO:0000384
TTTGATCAGAGATGCC-6-peripheral_donor_3    PATO:0000384
TTTGTTGCAGTCCGTG-6-peripheral_donor_3    PATO:0000384
TTTGTTGCATTAAAGG-6-peripheral_donor_3    PATO:0000384
Name: sex_ontology_term_id, Length: 4335, dtype: object

## suspension_type

In [103]:
adata.obs['suspension_type'] = ['cell'] * len(adata.obs)

## tissue_ontology_term_id

In [116]:
adata.obs['tissue_ontology_term_id'] = adata.obs['Location']

In [117]:
adata.obs['tissue_ontology_term_id']

index
AAACCCATCAAGGTGG-1-macula_donor_1            macula
AAACCCATCTCTCTTC-1-macula_donor_1            macula
AAAGGATAGTTCCGGC-1-macula_donor_1            macula
AAAGGGCAGACAGCTG-1-macula_donor_1            macula
AAAGGTATCTGGAAGG-1-macula_donor_1            macula
                                            ...    
TTTCGATAGAGATTCA-6-peripheral_donor_3    peripheral
TTTGACTCAACAGAGC-6-peripheral_donor_3    peripheral
TTTGATCAGAGATGCC-6-peripheral_donor_3    peripheral
TTTGTTGCAGTCCGTG-6-peripheral_donor_3    peripheral
TTTGTTGCATTAAAGG-6-peripheral_donor_3    peripheral
Name: tissue_ontology_term_id, Length: 4335, dtype: category
Categories (2, object): ['macula', 'peripheral']

In [118]:
mapping = {'macula': 'UBERON:0000053',
           'peripheral': 'UBERON:0013682'}

In [119]:
adata.obs['tissue_ontology_term_id'] = adata.obs['tissue_ontology_term_id'].map(mapping)

---

# <ins>var and raw.var

In [92]:
adata.var

Unnamed: 0_level_0,highly_variable,means,dispersions,dispersions_norm,gene_symbols,feature_biotype
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000238009,False,0.006743,1.271277,0.401708,RP11-34P13.7,gene
ENSG00000239945,False,0.000423,0.164355,-1.290357,RP11-34P13.8,gene
ENSG00000228463,False,0.114638,0.959162,-0.075398,AP006222.1,gene
ENSG00000237094,False,0.012195,1.257201,0.380192,RP4-669L17.4,gene
ENSG00000230021,False,0.001037,0.068055,-1.437565,RP11-206L10.17,gene
...,...,...,...,...,...,...
ENSG00000212907,False,1.958249,2.255139,-0.224573,MT-ND4L,gene
ENSG00000198886,False,4.737793,4.751992,-1.012088,MT-ND4,gene
ENSG00000198786,False,3.180222,3.193242,0.515346,MT-ND5,gene
ENSG00000198695,False,0.457886,1.415811,0.065355,MT-ND6,gene


## feature_is_filtered

In [93]:
adata.var['feature_is_filtered'] = [False] * len(adata.var)

In [94]:
adata.var['feature_is_filtered']

gene_ids
ENSG00000238009    False
ENSG00000239945    False
ENSG00000228463    False
ENSG00000237094    False
ENSG00000230021    False
                   ...  
ENSG00000212907    False
ENSG00000198886    False
ENSG00000198786    False
ENSG00000198695    False
ENSG00000198727    False
Name: feature_is_filtered, Length: 20243, dtype: bool

# <ins>obsm

In [95]:
adata.obsm

AxisArrays with keys: X_umap_hm

In [96]:
adata.obsm['X_umap'] = adata.obsm['X_umap_hm']

In [97]:
del adata.obsm['X_umap_hm']

# <ins>uns

In [98]:
adata.uns['schema_version'] = '2.0.0'

In [99]:
adata.uns['title'] = 'Retina'

In [100]:
adata.uns['X_normalization'] = 'Scanpy'

In [104]:
adata.uns['default_embedding'] = 'X_umap'

---

---

# 4. Final adjustments

Final adjustments include the removal of the slots that were already re-created for the schema.

In [105]:
adata

AnnData object with n_obs × n_vars = 4335 × 20243
    obs: 'batch', 'final_cluster_labels', 'library', 'Location', 'CellType', 'Donor', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'donor_id', 'suspension_type'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'gene_symbols', 'feature_biotype', 'feature_is_filtered'
    uns: 'leiden', 'neighbors_hm', 'pca', 'schema_version', 'title', 'X_normalization', 'default_embedding'
    obsm: 'X_umap'
    varm: 'PCs'

In [114]:
# adata.obs
print('obs')
dty = pd.DataFrame(adata.obs.dtypes, columns = ['dtype'])
for c in dty[dty['dtype'] == 'float64'].index.values:
    adata.obs[c] = adata.obs[c].astype('float32')
    print(f"changed {c} from float64 to float32")
for c in dty[dty['dtype'] == 'int64'].index.values:
    adata.obs[c] = adata.obs[c].astype('int32') 
    print(f"changed {c} from int64 to int32")

# adata.var
print('var')
dty = pd.DataFrame(adata.var.dtypes, columns = ['dtype'])
for c in dty[dty['dtype'] == 'float64'].index.values:
    adata.var[c] = adata.var[c].astype('float32')
    print(f"changed {c} from float64 to float32")
for c in dty[dty['dtype'] == 'int64'].index.values:
    adata.var[c] = adata.var[c].astype('int32') 
    print(f"changed {c} from int64 to int32")

obs
var


In [121]:
adata.obs

Unnamed: 0_level_0,batch,final_cluster_labels,library,Location,CellType,Donor,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,ethnicity_ontology_term_id,is_primary_data,organism_ontology_term_id,sex_ontology_term_id,tissue_ontology_term_id,donor_id,suspension_type
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
AAACCCATCAAGGTGG-1-macula_donor_1,macula_donor_1,9,macula_donor_1,macula,T/NK-cell,donor_1,EFO:0009922,CL:0000814,HsapDv:0000148,MONDO:0006969,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0000053,donor_1,cell
AAACCCATCTCTCTTC-1-macula_donor_1,macula_donor_1,10,macula_donor_1,macula,Macrophage,donor_1,EFO:0009922,CL:0000235,HsapDv:0000148,MONDO:0006969,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0000053,donor_1,cell
AAAGGATAGTTCCGGC-1-macula_donor_1,macula_donor_1,9,macula_donor_1,macula,T/NK-cell,donor_1,EFO:0009922,CL:0000814,HsapDv:0000148,MONDO:0006969,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0000053,donor_1,cell
AAAGGGCAGACAGCTG-1-macula_donor_1,macula_donor_1,9,macula_donor_1,macula,T/NK-cell,donor_1,EFO:0009922,CL:0000814,HsapDv:0000148,MONDO:0006969,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0000053,donor_1,cell
AAAGGTATCTGGAAGG-1-macula_donor_1,macula_donor_1,9,macula_donor_1,macula,T/NK-cell,donor_1,EFO:0009922,CL:0000814,HsapDv:0000148,MONDO:0006969,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0000053,donor_1,cell
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTCGATAGAGATTCA-6-peripheral_donor_3,peripheral_donor_3,9,peripheral_donor_3,peripheral,T/NK-cell,donor_3,EFO:0009922,CL:0000814,HsapDv:0000173,MONDO:0006969,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0013682,donor_3,cell
TTTGACTCAACAGAGC-6-peripheral_donor_3,peripheral_donor_3,5,peripheral_donor_3,peripheral,Pericyte,donor_3,EFO:0009922,CL:0000669,HsapDv:0000173,MONDO:0006969,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0013682,donor_3,cell
TTTGATCAGAGATGCC-6-peripheral_donor_3,peripheral_donor_3,6,peripheral_donor_3,peripheral,Fibroblast,donor_3,EFO:0009922,CL:0000057,HsapDv:0000173,MONDO:0006969,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0013682,donor_3,cell
TTTGTTGCAGTCCGTG-6-peripheral_donor_3,peripheral_donor_3,9,peripheral_donor_3,peripheral,T/NK-cell,donor_3,EFO:0009922,CL:0000814,HsapDv:0000173,MONDO:0006969,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0013682,donor_3,cell


In [122]:
adata.var

Unnamed: 0_level_0,highly_variable,means,dispersions,dispersions_norm,gene_symbols,feature_biotype,feature_is_filtered
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ENSG00000238009,False,0.006743,1.271277,0.401708,RP11-34P13.7,gene,False
ENSG00000239945,False,0.000423,0.164355,-1.290357,RP11-34P13.8,gene,False
ENSG00000228463,False,0.114638,0.959162,-0.075398,AP006222.1,gene,False
ENSG00000237094,False,0.012195,1.257201,0.380192,RP4-669L17.4,gene,False
ENSG00000230021,False,0.001037,0.068055,-1.437565,RP11-206L10.17,gene,False
...,...,...,...,...,...,...,...
ENSG00000212907,False,1.958249,2.255139,-0.224573,MT-ND4L,gene,False
ENSG00000198886,False,4.737793,4.751992,-1.012088,MT-ND4,gene,False
ENSG00000198786,False,3.180222,3.193242,0.515346,MT-ND5,gene,False
ENSG00000198695,False,0.457886,1.415811,0.065355,MT-ND6,gene,False


In [125]:
del adata.obs['library']
del adata.obs['Location']
del adata.obs['Donor']

In [141]:
adata.write('voigt19_f.h5ad', compression = 'gzip')