# **Curating `wang20_Rectum.processed.h5ad` dataset:**

* ~4K cells and 17K genes
* one of the three datasets in the publication
* publication link: https://rupress.org/jem/article/217/2/e20191130/132578/Single-cell-transcriptome-analysis-reveals

# <ins>STEPS:</ins>

## 1. Load the dataset 

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import scipy
from tqdm import tqdm

In [2]:
adata = sc.read('/lustre/scratch117/cellgen/cellgeni/covid19cellatlas/downloaded/wang20_rectum.processed.h5ad')

In [3]:
adata

AnnData object with n_obs × n_vars = 3797 × 17676
    obs: 'Sample_ID', 'CellType', 'Location', 'n_counts', 'log1p_n_counts', 'n_genes', 'log1p_n_genes', 'percent_mito', 'percent_ribo', 'percent_hb', 'percent_top50'
    var: 'mito', 'ribo', 'hb', 'n_counts', 'n_cells', 'n_genes', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'leiden', 'neighbors_hm', 'pca'
    obsm: 'X_umap_hm'
    varm: 'PCs'

In [4]:
adata.obs

Unnamed: 0_level_0,Sample_ID,CellType,Location,n_counts,log1p_n_counts,n_genes,log1p_n_genes,percent_mito,percent_ribo,percent_hb,percent_top50
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AAACCTGAGCTGGAAC_Rectum-1_Enterocyte,Rectum-1,Enterocyte,Rectum,10500.0,9.259226,2545,7.842279,28.838095,5.447619,0.009524,49.000000
AAACCTGGTTCCCGAG_Rectum-1_Enterocyte,Rectum-1,Enterocyte,Rectum,15281.0,9.634431,3375,8.124447,16.190041,9.619789,0.032720,44.244487
AAACCTGTCAGCGATT_Rectum-1_Enterocyte,Rectum-1,Enterocyte,Rectum,6529.0,8.784163,1574,7.362011,32.409252,5.176903,0.015316,59.013631
AAAGATGGTTCAGACT_Rectum-1_Enterocyte,Rectum-1,Enterocyte,Rectum,13501.0,9.510593,2989,8.003029,24.946301,3.814532,0.014814,46.737279
AAAGTAGAGAGTAAGG_Rectum-1_Enterocyte,Rectum-1,Enterocyte,Rectum,5287.0,8.573195,1582,7.367077,29.960281,5.239266,0.037829,51.030830
...,...,...,...,...,...,...,...,...,...,...,...
TCAGGATGTTCCACAA_Rectum-2_Stem Cell,Rectum-2,Stem Cell,Rectum,17962.0,9.796069,2953,7.990915,11.897339,45.763279,0.000000,47.260884
TCCACACAGCAATATG_Rectum-2_Stem Cell,Rectum-2,Stem Cell,Rectum,21999.0,9.998797,3771,8.235361,12.205100,38.410835,0.013637,40.510932
TCGAGGCCAAGGTGTG_Rectum-2_Stem Cell,Rectum-2,Stem Cell,Rectum,17397.0,9.764111,3101,8.039802,10.766224,41.392197,0.005748,41.346209
TCTATTGTCTTCGAGA_Rectum-2_Stem Cell,Rectum-2,Stem Cell,Rectum,4892.0,8.495561,1223,7.109879,26.185608,30.498774,0.020442,58.053966


## 2. Pre-prepare before schema adjustments

### Add row counts

In [5]:
raw = pd.read_table('/lustre/scratch117/cellgen/cellgeni/covid19cellatlas/raw/wang20/GSE125970_raw_UMIcounts.txt.gz', index_col=0)

In [6]:
raw

Unnamed: 0_level_0,AAATGCCCAATCTGCA_Ileum-1_Enterocyte,AACTCTTGTCTAGTCA_Ileum-1_Enterocyte,AAGACCTCACGGACAA_Ileum-1_Enterocyte,AAGCCGCGTCTTGCGG_Ileum-1_Enterocyte,AAGTCTGGTTGTCTTT_Ileum-1_Enterocyte,ACGAGGATCGGCCGAT_Ileum-1_Enterocyte,ACGGCCAGTCTAAACC_Ileum-1_Enterocyte,AGGCCGTTCGAGCCCA_Ileum-1_Enterocyte,AGTGAGGGTCGGCTCA_Ileum-1_Enterocyte,ATGAGGGAGGATATAC_Ileum-1_Enterocyte,...,TAGTTGGCAGTCCTTC_Rectum-2_Stem Cell,TAGTTGGTCTGCCCTA_Rectum-2_Stem Cell,TATCTCAGTGCGGTAA_Rectum-2_Stem Cell,TATTACCCAATAACGA_Rectum-2_Stem Cell,TCAATCTTCACGACTA_Rectum-2_Stem Cell,TCAGGATGTTCCACAA_Rectum-2_Stem Cell,TCCACACAGCAATATG_Rectum-2_Stem Cell,TCGAGGCCAAGGTGTG_Rectum-2_Stem Cell,TCTATTGTCTTCGAGA_Rectum-2_Stem Cell,TGCCCTAGTAGCCTCG_Rectum-2_Stem Cell
GENE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
RP11-34P13.7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
FO538757.2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
AP006222.2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
RP4-669L17.10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
RP11-206L10.9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GGT2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
RP3-510H16.3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
RP4-669P10.16,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BACH1-AS1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
raw.columns

Index(['AAATGCCCAATCTGCA_Ileum-1_Enterocyte',
       'AACTCTTGTCTAGTCA_Ileum-1_Enterocyte',
       'AAGACCTCACGGACAA_Ileum-1_Enterocyte',
       'AAGCCGCGTCTTGCGG_Ileum-1_Enterocyte',
       'AAGTCTGGTTGTCTTT_Ileum-1_Enterocyte',
       'ACGAGGATCGGCCGAT_Ileum-1_Enterocyte',
       'ACGGCCAGTCTAAACC_Ileum-1_Enterocyte',
       'AGGCCGTTCGAGCCCA_Ileum-1_Enterocyte',
       'AGTGAGGGTCGGCTCA_Ileum-1_Enterocyte',
       'ATGAGGGAGGATATAC_Ileum-1_Enterocyte',
       ...
       'TAGTTGGCAGTCCTTC_Rectum-2_Stem Cell',
       'TAGTTGGTCTGCCCTA_Rectum-2_Stem Cell',
       'TATCTCAGTGCGGTAA_Rectum-2_Stem Cell',
       'TATTACCCAATAACGA_Rectum-2_Stem Cell',
       'TCAATCTTCACGACTA_Rectum-2_Stem Cell',
       'TCAGGATGTTCCACAA_Rectum-2_Stem Cell',
       'TCCACACAGCAATATG_Rectum-2_Stem Cell',
       'TCGAGGCCAAGGTGTG_Rectum-2_Stem Cell',
       'TCTATTGTCTTCGAGA_Rectum-2_Stem Cell',
       'TGCCCTAGTAGCCTCG_Rectum-2_Stem Cell'],
      dtype='object', length=14537)

In [8]:
# raw = raw.loc[:,(raw.columns.isin(list(adata.obs.index)))]

In [9]:
raw = raw.loc[:,list(adata.obs.index)]

In [10]:
raw.shape

(19525, 3797)

In [11]:
adata

AnnData object with n_obs × n_vars = 3797 × 17676
    obs: 'Sample_ID', 'CellType', 'Location', 'n_counts', 'log1p_n_counts', 'n_genes', 'log1p_n_genes', 'percent_mito', 'percent_ribo', 'percent_hb', 'percent_top50'
    var: 'mito', 'ribo', 'hb', 'n_counts', 'n_cells', 'n_genes', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'leiden', 'neighbors_hm', 'pca'
    obsm: 'X_umap_hm'
    varm: 'PCs'

### Convert gene symbols to ENSG ids by using the reference gene list of this dataset

In [5]:
sym2ensg = pd.read_table('../GRCh38-1.2.0.gene_names.tsv', header = None, names = ['ensg_id', 'gene_symbol'])

validlst = pd.read_csv('~/genes_homo_sapiens.csv', header = None, names = ['gene_ids', 'name','no'])

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [6]:
sym2ensg.index = sym2ensg['gene_symbol']

validlst.index = validlst['name']

In [7]:
adata.var['gene_ids'] = ['a'] * len(adata.var)

In [8]:
skippd = [] # genes not in sym2ensg
multensg = [] # genes that have multiple ensg's from sym2ensg
all_ensg = list(validlst['gene_ids'])
for i in tqdm(adata.var_names):
    if not i in sym2ensg['gene_symbol']:
        skippd.append(i)
        continue
        
    ensg_id = sym2ensg.at[i, 'ensg_id']
    
    if type(ensg_id) == pd.core.series.Series:
        multensg.append(i)
        ensg_id = list(set(sym2ensg.at[i, 'ensg_id']).intersection(all_ensg))[0]
   
    adata.var.at[i, 'gene_ids'] = ensg_id

100%|██████████| 17676/17676 [00:10<00:00, 1620.16it/s]


### Remove the genes that do not match in gene-ensemblID file

In [9]:
notvalid = adata.var[adata.var['gene_ids'].isin(list(set(adata.var['gene_ids']).difference(set(validlst['gene_ids']))))]

In [10]:
notvalid

Unnamed: 0_level_0,mito,ribo,hb,n_counts,n_cells,n_genes,highly_variable,means,dispersions,dispersions_norm,gene_ids
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
RP4-635E18.9,False,False,False,3.0,3,3,False,0.000664,-0.159507,-0.228444,ENSG00000277726
RP4-635E18.8,False,False,False,39.0,41,39,False,0.006629,-0.168159,-0.253403,ENSG00000271895
NPPA-AS1,False,False,False,33.0,34,32,False,0.004985,-0.315745,-0.679140,ENSG00000242349
UBXN10-AS1,False,False,False,49.0,53,49,False,0.010247,0.163489,0.703292,ENSG00000225986
RP1-224A6.9,False,False,False,3.0,3,3,False,0.000303,-0.956615,-2.527840,ENSG00000271840
...,...,...,...,...,...,...,...,...,...,...,...
RP11-89G4.1,False,False,False,3.0,3,3,False,0.001102,0.501457,1.678219,ENSG00000277192
RP11-542K23.7,False,False,False,4.0,3,3,False,0.000429,-0.544465,-1.338924,ENSG00000229245
MIA2,False,False,False,7.0,7,7,False,0.002871,0.566389,1.865528,ENSG00000150526
AL928768.3,False,False,False,22.0,23,22,False,0.004706,0.035206,0.333237,ENSG00000253701


In [12]:
len(notvalid.loc[notvalid.gene_ids == 'a'])

7

In [18]:
sum(~adata.var_names.isin(notvalid.index))

17497

In [19]:
adata = adata[:,~adata.var_names.isin(notvalid.index)]

  res = method(*args, **kwargs)


In [20]:
adata

View of AnnData object with n_obs × n_vars = 3797 × 17497
    obs: 'Sample_ID', 'CellType', 'Location', 'n_counts', 'log1p_n_counts', 'n_genes', 'log1p_n_genes', 'percent_mito', 'percent_ribo', 'percent_hb', 'percent_top50'
    var: 'mito', 'ribo', 'hb', 'n_counts', 'n_cells', 'n_genes', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'gene_ids'
    uns: 'leiden', 'neighbors_hm', 'pca'
    obsm: 'X_umap_hm'
    varm: 'PCs'

In [21]:
raw = raw.loc[adata.var_names,:]

In [22]:
raw

Unnamed: 0_level_0,AAACCTGAGCTGGAAC_Rectum-1_Enterocyte,AAACCTGGTTCCCGAG_Rectum-1_Enterocyte,AAACCTGTCAGCGATT_Rectum-1_Enterocyte,AAAGATGGTTCAGACT_Rectum-1_Enterocyte,AAAGTAGAGAGTAAGG_Rectum-1_Enterocyte,AAATGCCCATCAGTAC_Rectum-1_Enterocyte,AACACGTGTGACAAAT_Rectum-1_Enterocyte,AACCGCGAGAGATGAG_Rectum-1_Enterocyte,AACTCCCCACCGAATT_Rectum-1_Enterocyte,AACTCTTTCGTGACAT_Rectum-1_Enterocyte,...,TAGTTGGCAGTCCTTC_Rectum-2_Stem Cell,TAGTTGGTCTGCCCTA_Rectum-2_Stem Cell,TATCTCAGTGCGGTAA_Rectum-2_Stem Cell,TATTACCCAATAACGA_Rectum-2_Stem Cell,TCAATCTTCACGACTA_Rectum-2_Stem Cell,TCAGGATGTTCCACAA_Rectum-2_Stem Cell,TCCACACAGCAATATG_Rectum-2_Stem Cell,TCGAGGCCAAGGTGTG_Rectum-2_Stem Cell,TCTATTGTCTTCGAGA_Rectum-2_Stem Cell,TGCCCTAGTAGCCTCG_Rectum-2_Stem Cell
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
RP11-34P13.7,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
FO538757.2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
AP006222.2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
RP4-669L17.10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
RP11-206L10.9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CLDN14,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AF129408.17,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
MX2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CBS,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Updating gene names from the current reference to Gencode-v38

by checking ENSG ids in Gencode-v38 list, example:  
A -> ENSG01 (ref of the dataset)  
B -> ENSG01 (Gencode-v38)

In [23]:
validlst.index = validlst['gene_ids']

In [24]:
adata.var['gene_symbols'] = adata.var_names
adata.var_names = adata.var['gene_ids']

Trying to set attribute `.var` of view, copying.


In [25]:
validlst

Unnamed: 0_level_0,gene_ids,name,no
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000223972,ENSG00000223972,DDX11L1,5
ENST00000456328,ENST00000456328,DDX11L1-202,2
ENST00000450305,ENST00000450305,DDX11L1-201,2
ENSG00000227232,ENSG00000227232,WASH7P,5
ENST00000488147,ENST00000488147,WASH7P-201,1
...,...,...,...
ENST00000621424,ENST00000621424,ENSG110105435.4-001,4
ENSG00000278625,ENSG00000278625,U6,1
ENST00000616830,ENST00000616830,U6.88-201,1
ENSG00000277374,ENSG00000277374,U1,1


In [26]:
oldsymb = []
newsymb = []
listee = list(validlst['gene_ids'])
for i in tqdm(adata.var['gene_ids']):
    if i in listee:
        symb_val = list(validlst.loc[validlst['gene_ids'] == i, 'name'])
        symb_org = adata.var.at[i, 'gene_symbols']
        symb_dif = list(set(symb_val).difference(set(symb_org)))
        if len(symb_dif) > 0:
            oldsymb.append(symb_org)
            newsymb.append(symb_dif[0])

100%|██████████| 17497/17497 [05:49<00:00, 50.06it/s]


In [27]:
symbols = pd.DataFrame([oldsymb,newsymb]).T

In [28]:
drop_lines = []
for i in range(0,len(symbols)):
    if symbols.iloc[i,0] == symbols.iloc[i,1]:
        drop_lines.append(i)

In [29]:
symbols = symbols.drop(drop_lines).rename(columns = {0: 'oldsymb', 1: 'newsymb'})

In [30]:
symbols = symbols.reset_index().loc[:,['oldsymb', 'newsymb']]

In [31]:
symbols

Unnamed: 0,oldsymb,newsymb
0,FO538757.2,WASH9P
1,AP006222.2,AP006222.1
2,RP4-669L17.10,RP4-669L17.4
3,RP11-206L10.9,LINC01409
4,FAM132A,C1QTNF12
...,...,...
1272,CCL3L3,CCL3L1
1273,CECR1,ADA2
1274,LINC01422,RP1-90L6.3
1275,SEPT3,SEPTIN3


In [32]:
adata.var

Unnamed: 0_level_0,mito,ribo,hb,n_counts,n_cells,n_genes,highly_variable,means,dispersions,dispersions_norm,gene_ids,gene_symbols
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
ENSG00000238009,False,False,False,25.0,25,23,False,0.003679,-0.360232,-0.807470,ENSG00000238009,RP11-34P13.7
ENSG00000279457,False,False,False,579.0,548,520,True,0.107308,0.118275,0.572867,ENSG00000279457,FO538757.2
ENSG00000228463,False,False,False,230.0,230,219,False,0.033655,-0.314175,-0.674611,ENSG00000228463,AP006222.2
ENSG00000237094,False,False,False,4.0,4,4,False,0.000481,-0.686255,-1.747941,ENSG00000237094,RP4-669L17.10
ENSG00000237491,False,False,False,171.0,172,166,False,0.028947,-0.087199,-0.019861,ENSG00000237491,RP11-206L10.9
...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000159261,False,False,False,3.0,3,3,False,0.000645,-0.009892,0.203147,ENSG00000159261,CLDN14
ENSG00000272991,False,False,False,6.0,6,6,False,0.001739,0.338456,1.208017,ENSG00000272991,AF129408.17
ENSG00000183486,False,False,False,4.0,4,4,False,0.001739,0.586054,1.922254,ENSG00000183486,MX2
ENSG00000160200,False,False,False,3.0,3,3,False,0.000536,-0.200734,-0.347372,ENSG00000160200,CBS


In [34]:
adata.var

Unnamed: 0_level_0,mito,ribo,hb,n_counts,n_cells,n_genes,highly_variable,means,dispersions,dispersions_norm,gene_ids,gene_symbols
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
ENSG00000238009,False,False,False,25.0,25,23,False,0.003679,-0.360232,-0.807470,ENSG00000238009,RP11-34P13.7
ENSG00000279457,False,False,False,579.0,548,520,True,0.107308,0.118275,0.572867,ENSG00000279457,FO538757.2
ENSG00000228463,False,False,False,230.0,230,219,False,0.033655,-0.314175,-0.674611,ENSG00000228463,AP006222.2
ENSG00000237094,False,False,False,4.0,4,4,False,0.000481,-0.686255,-1.747941,ENSG00000237094,RP4-669L17.10
ENSG00000237491,False,False,False,171.0,172,166,False,0.028947,-0.087199,-0.019861,ENSG00000237491,RP11-206L10.9
...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000159261,False,False,False,3.0,3,3,False,0.000645,-0.009892,0.203147,ENSG00000159261,CLDN14
ENSG00000272991,False,False,False,6.0,6,6,False,0.001739,0.338456,1.208017,ENSG00000272991,AF129408.17
ENSG00000183486,False,False,False,4.0,4,4,False,0.001739,0.586054,1.922254,ENSG00000183486,MX2
ENSG00000160200,False,False,False,3.0,3,3,False,0.000536,-0.200734,-0.347372,ENSG00000160200,CBS


In [36]:
for i in tqdm(range(0,len(symbols))):
    idxx = np.where(adata.var['gene_symbols'].isin([symbols['oldsymb'][i]]))[0][0]
    adata.var['gene_symbols'][idxx] = symbols['newsymb'][i]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adata.var['gene_symbols'][idxx] = symbols['newsymb'][i]
100%|██████████| 1277/1277 [00:01<00:00, 896.31it/s]


In [37]:
raw = raw.transpose()

In [38]:
raw = raw.set_axis(list(adata.var_names), axis = 1, inplace=False)

In [39]:
raw

Unnamed: 0,ENSG00000238009,ENSG00000279457,ENSG00000228463,ENSG00000237094,ENSG00000237491,ENSG00000225880,ENSG00000230368,ENSG00000187634,ENSG00000188976,ENSG00000187961,...,ENSG00000100167,ENSG00000100344,ENSG00000155307,ENSG00000233236,ENSG00000205929,ENSG00000159261,ENSG00000272991,ENSG00000183486,ENSG00000160200,ENSG00000160224
AAACCTGAGCTGGAAC_Rectum-1_Enterocyte,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AAACCTGGTTCCCGAG_Rectum-1_Enterocyte,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AAACCTGTCAGCGATT_Rectum-1_Enterocyte,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AAAGATGGTTCAGACT_Rectum-1_Enterocyte,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AAAGTAGAGAGTAAGG_Rectum-1_Enterocyte,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCAGGATGTTCCACAA_Rectum-2_Stem Cell,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCCACACAGCAATATG_Rectum-2_Stem Cell,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
TCGAGGCCAAGGTGTG_Rectum-2_Stem Cell,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
TCTATTGTCTTCGAGA_Rectum-2_Stem Cell,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [40]:
raww = adata.copy()

In [41]:
raww.X = scipy.sparse.csr_matrix(raw)

In [42]:
del raww.obs
del raww.uns
del raww.obsm
del raww.varm
del raww.var

In [43]:
adata.var['feature_biotype'] = ["gene"] * len(adata.var)

In [44]:
raww.var['feature_biotype'] = ["gene"] * len(raww.var)

In [45]:
adata.raw = raww

---

---

## 3. Apply the schema requirements:

# <ins>X

Raw and normal count matrices are in the object, no additional adjustment is needed!

# <ins>obs

In [46]:
adata.obs.columns

Index(['Sample_ID', 'CellType', 'Location', 'n_counts', 'log1p_n_counts',
       'n_genes', 'log1p_n_genes', 'percent_mito', 'percent_ribo',
       'percent_hb', 'percent_top50'],
      dtype='object')

## assay_ontology_term_id

At GEO:  
> "cDNA library establishment and sequencing: single cell suspensions were captured using the <ins>10X Genomics Chromium Single Cell 3' Library & Gel Bead Kit v2 (PN-120237)</ins> according to the manufacturers protocol."

In [47]:
adata.obs['assay_ontology_term_id'] = ["EFO:0009899"] * len(adata.obs)

In [48]:
adata.obs['assay_ontology_term_id']

index
AAACCTGAGCTGGAAC_Rectum-1_Enterocyte    EFO:0009899
AAACCTGGTTCCCGAG_Rectum-1_Enterocyte    EFO:0009899
AAACCTGTCAGCGATT_Rectum-1_Enterocyte    EFO:0009899
AAAGATGGTTCAGACT_Rectum-1_Enterocyte    EFO:0009899
AAAGTAGAGAGTAAGG_Rectum-1_Enterocyte    EFO:0009899
                                           ...     
TCAGGATGTTCCACAA_Rectum-2_Stem Cell     EFO:0009899
TCCACACAGCAATATG_Rectum-2_Stem Cell     EFO:0009899
TCGAGGCCAAGGTGTG_Rectum-2_Stem Cell     EFO:0009899
TCTATTGTCTTCGAGA_Rectum-2_Stem Cell     EFO:0009899
TGCCCTAGTAGCCTCG_Rectum-2_Stem Cell     EFO:0009899
Name: assay_ontology_term_id, Length: 3797, dtype: object

In [49]:
adata.obs['assay_ontology_term_id'] = pd.Categorical(adata.obs['assay_ontology_term_id'])

In [50]:
adata.obs['assay_ontology_term_id']

index
AAACCTGAGCTGGAAC_Rectum-1_Enterocyte    EFO:0009899
AAACCTGGTTCCCGAG_Rectum-1_Enterocyte    EFO:0009899
AAACCTGTCAGCGATT_Rectum-1_Enterocyte    EFO:0009899
AAAGATGGTTCAGACT_Rectum-1_Enterocyte    EFO:0009899
AAAGTAGAGAGTAAGG_Rectum-1_Enterocyte    EFO:0009899
                                           ...     
TCAGGATGTTCCACAA_Rectum-2_Stem Cell     EFO:0009899
TCCACACAGCAATATG_Rectum-2_Stem Cell     EFO:0009899
TCGAGGCCAAGGTGTG_Rectum-2_Stem Cell     EFO:0009899
TCTATTGTCTTCGAGA_Rectum-2_Stem Cell     EFO:0009899
TGCCCTAGTAGCCTCG_Rectum-2_Stem Cell     EFO:0009899
Name: assay_ontology_term_id, Length: 3797, dtype: category
Categories (1, object): ['EFO:0009899']

## cell_type_ontology_term_id

In [51]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['CellType']

In [52]:
list(adata.obs['CellType'].unique())

['Enterocyte',
 'Goblet',
 'Paneth-like',
 'Enteriendocrine',
 'Progenitor',
 'TA',
 'Stem Cell']

In [53]:
adata.obs['cell_type_ontology_term_id']

index
AAACCTGAGCTGGAAC_Rectum-1_Enterocyte    Enterocyte
AAACCTGGTTCCCGAG_Rectum-1_Enterocyte    Enterocyte
AAACCTGTCAGCGATT_Rectum-1_Enterocyte    Enterocyte
AAAGATGGTTCAGACT_Rectum-1_Enterocyte    Enterocyte
AAAGTAGAGAGTAAGG_Rectum-1_Enterocyte    Enterocyte
                                           ...    
TCAGGATGTTCCACAA_Rectum-2_Stem Cell      Stem Cell
TCCACACAGCAATATG_Rectum-2_Stem Cell      Stem Cell
TCGAGGCCAAGGTGTG_Rectum-2_Stem Cell      Stem Cell
TCTATTGTCTTCGAGA_Rectum-2_Stem Cell      Stem Cell
TGCCCTAGTAGCCTCG_Rectum-2_Stem Cell      Stem Cell
Name: cell_type_ontology_term_id, Length: 3797, dtype: category
Categories (7, object): ['Enteriendocrine', 'Enterocyte', 'Goblet', 'Paneth-like', 'Progenitor', 'Stem Cell', 'TA']

In [54]:
#list(adata.obs['cell_type_ontology_term_id'].unique())

In [55]:
mapping = {'Enterocyte': 'CL:1000347',
 'Goblet': 'CL:0009039',
 'Paneth-like': 'CL:0009009',
 'Enteriendocrine': 'CL:0009042',
 'Progenitor': 'CL:0011026',
 'TA': 'CL:0009011',
 'Stem Cell': 'CL:0000034'}

In [56]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_type_ontology_term_id'].map(mapping)

In [57]:
adata.obs['cell_type_ontology_term_id']

index
AAACCTGAGCTGGAAC_Rectum-1_Enterocyte    CL:1000347
AAACCTGGTTCCCGAG_Rectum-1_Enterocyte    CL:1000347
AAACCTGTCAGCGATT_Rectum-1_Enterocyte    CL:1000347
AAAGATGGTTCAGACT_Rectum-1_Enterocyte    CL:1000347
AAAGTAGAGAGTAAGG_Rectum-1_Enterocyte    CL:1000347
                                           ...    
TCAGGATGTTCCACAA_Rectum-2_Stem Cell     CL:0000034
TCCACACAGCAATATG_Rectum-2_Stem Cell     CL:0000034
TCGAGGCCAAGGTGTG_Rectum-2_Stem Cell     CL:0000034
TCTATTGTCTTCGAGA_Rectum-2_Stem Cell     CL:0000034
TGCCCTAGTAGCCTCG_Rectum-2_Stem Cell     CL:0000034
Name: cell_type_ontology_term_id, Length: 3797, dtype: category
Categories (7, object): ['CL:0009042', 'CL:1000347', 'CL:0009039', 'CL:0009009', 'CL:0011026', 'CL:0000034', 'CL:0009011']

## development_stage_ontology_term_id

In [58]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['Sample_ID']

In [59]:
adata.obs['development_stage_ontology_term_id']

index
AAACCTGAGCTGGAAC_Rectum-1_Enterocyte    Rectum-1
AAACCTGGTTCCCGAG_Rectum-1_Enterocyte    Rectum-1
AAACCTGTCAGCGATT_Rectum-1_Enterocyte    Rectum-1
AAAGATGGTTCAGACT_Rectum-1_Enterocyte    Rectum-1
AAAGTAGAGAGTAAGG_Rectum-1_Enterocyte    Rectum-1
                                          ...   
TCAGGATGTTCCACAA_Rectum-2_Stem Cell     Rectum-2
TCCACACAGCAATATG_Rectum-2_Stem Cell     Rectum-2
TCGAGGCCAAGGTGTG_Rectum-2_Stem Cell     Rectum-2
TCTATTGTCTTCGAGA_Rectum-2_Stem Cell     Rectum-2
TGCCCTAGTAGCCTCG_Rectum-2_Stem Cell     Rectum-2
Name: development_stage_ontology_term_id, Length: 3797, dtype: category
Categories (2, object): ['Rectum-1', 'Rectum-2']

In [60]:
adata.obs['development_stage_ontology_term_id'].unique()

['Rectum-1', 'Rectum-2']
Categories (2, object): ['Rectum-1', 'Rectum-2']

In [61]:
mapping = {'Rectum-1': 'HsapDv:0000160', 'Rectum-2': 'HsapDv:0000144'}

In [62]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['development_stage_ontology_term_id'].map(mapping)

In [63]:
adata.obs['development_stage_ontology_term_id']

index
AAACCTGAGCTGGAAC_Rectum-1_Enterocyte    HsapDv:0000160
AAACCTGGTTCCCGAG_Rectum-1_Enterocyte    HsapDv:0000160
AAACCTGTCAGCGATT_Rectum-1_Enterocyte    HsapDv:0000160
AAAGATGGTTCAGACT_Rectum-1_Enterocyte    HsapDv:0000160
AAAGTAGAGAGTAAGG_Rectum-1_Enterocyte    HsapDv:0000160
                                             ...      
TCAGGATGTTCCACAA_Rectum-2_Stem Cell     HsapDv:0000144
TCCACACAGCAATATG_Rectum-2_Stem Cell     HsapDv:0000144
TCGAGGCCAAGGTGTG_Rectum-2_Stem Cell     HsapDv:0000144
TCTATTGTCTTCGAGA_Rectum-2_Stem Cell     HsapDv:0000144
TGCCCTAGTAGCCTCG_Rectum-2_Stem Cell     HsapDv:0000144
Name: development_stage_ontology_term_id, Length: 3797, dtype: category
Categories (2, object): ['HsapDv:0000160', 'HsapDv:0000144']

## disease_ontology_term_id

**"MONDO:0002169"**: rectum adenocarcinoma (from the [supplementary pdf](https://cdn.rupress.org/rup/content_public/journal/jem/217/2/10.1084_jem.20191130/5/jem_20191130_sm.pdf?Expires=1648068738&Signature=JY-9hWfChqrh5-tLDB-9piypZmbEUwBM5mwYqTmdT~72eotklqYvMHtSWaZjWNuGFXHuxFrIKF2CpErN53JyrSMGnVeA19j4XnFkldFWTiTvIzc6pbeXbA36LR5i8e7mV9iSX0PF5xPf8X273SohGK9jRScJKL497JJP1Mz7fEJrrzzRXd2r4qQv1kjcrv-luL92La4aU8l8FribyqH0WGCQn7MrhW9gYkmbzx4bsaYkK~fNynn~h1B4gzPaDM~Cbs6trlIi~5T-hRa-Fe7c834INW~t4N7EqMKVfVOWqqQfpq3RDYBvZQ1z~bDfIrgWa-BMQz9vmcdlCT4y4SaTGA__&Key-Pair-Id=APKAIE5G5CRDK6RD3PGA))

In [64]:
adata.obs['disease_ontology_term_id'] = adata.obs['Sample_ID']

In [65]:
adata.obs['disease_ontology_term_id']

index
AAACCTGAGCTGGAAC_Rectum-1_Enterocyte    Rectum-1
AAACCTGGTTCCCGAG_Rectum-1_Enterocyte    Rectum-1
AAACCTGTCAGCGATT_Rectum-1_Enterocyte    Rectum-1
AAAGATGGTTCAGACT_Rectum-1_Enterocyte    Rectum-1
AAAGTAGAGAGTAAGG_Rectum-1_Enterocyte    Rectum-1
                                          ...   
TCAGGATGTTCCACAA_Rectum-2_Stem Cell     Rectum-2
TCCACACAGCAATATG_Rectum-2_Stem Cell     Rectum-2
TCGAGGCCAAGGTGTG_Rectum-2_Stem Cell     Rectum-2
TCTATTGTCTTCGAGA_Rectum-2_Stem Cell     Rectum-2
TGCCCTAGTAGCCTCG_Rectum-2_Stem Cell     Rectum-2
Name: disease_ontology_term_id, Length: 3797, dtype: category
Categories (2, object): ['Rectum-1', 'Rectum-2']

In [66]:
mapping = {'Rectum-1': 'MONDO:0004970', 'Rectum-2': 'MONDO:0004970'}

In [67]:
adata.obs['disease_ontology_term_id'] = adata.obs['disease_ontology_term_id'].map(mapping)

In [68]:
adata.obs['disease_ontology_term_id']

index
AAACCTGAGCTGGAAC_Rectum-1_Enterocyte    MONDO:0004970
AAACCTGGTTCCCGAG_Rectum-1_Enterocyte    MONDO:0004970
AAACCTGTCAGCGATT_Rectum-1_Enterocyte    MONDO:0004970
AAAGATGGTTCAGACT_Rectum-1_Enterocyte    MONDO:0004970
AAAGTAGAGAGTAAGG_Rectum-1_Enterocyte    MONDO:0004970
                                            ...      
TCAGGATGTTCCACAA_Rectum-2_Stem Cell     MONDO:0004970
TCCACACAGCAATATG_Rectum-2_Stem Cell     MONDO:0004970
TCGAGGCCAAGGTGTG_Rectum-2_Stem Cell     MONDO:0004970
TCTATTGTCTTCGAGA_Rectum-2_Stem Cell     MONDO:0004970
TGCCCTAGTAGCCTCG_Rectum-2_Stem Cell     MONDO:0004970
Name: disease_ontology_term_id, Length: 3797, dtype: object

## ethnicity_ontology_term_id

In [69]:
adata.obs['ethnicity_ontology_term_id'] = ['unknown'] * len(adata.obs)

In [70]:
adata.obs['ethnicity_ontology_term_id']

index
AAACCTGAGCTGGAAC_Rectum-1_Enterocyte    unknown
AAACCTGGTTCCCGAG_Rectum-1_Enterocyte    unknown
AAACCTGTCAGCGATT_Rectum-1_Enterocyte    unknown
AAAGATGGTTCAGACT_Rectum-1_Enterocyte    unknown
AAAGTAGAGAGTAAGG_Rectum-1_Enterocyte    unknown
                                         ...   
TCAGGATGTTCCACAA_Rectum-2_Stem Cell     unknown
TCCACACAGCAATATG_Rectum-2_Stem Cell     unknown
TCGAGGCCAAGGTGTG_Rectum-2_Stem Cell     unknown
TCTATTGTCTTCGAGA_Rectum-2_Stem Cell     unknown
TGCCCTAGTAGCCTCG_Rectum-2_Stem Cell     unknown
Name: ethnicity_ontology_term_id, Length: 3797, dtype: object

## is_primary_data

In [71]:
adata.obs['is_primary_data'] = [True] * len(adata.obs)

In [72]:
adata.obs['is_primary_data']

index
AAACCTGAGCTGGAAC_Rectum-1_Enterocyte    True
AAACCTGGTTCCCGAG_Rectum-1_Enterocyte    True
AAACCTGTCAGCGATT_Rectum-1_Enterocyte    True
AAAGATGGTTCAGACT_Rectum-1_Enterocyte    True
AAAGTAGAGAGTAAGG_Rectum-1_Enterocyte    True
                                        ... 
TCAGGATGTTCCACAA_Rectum-2_Stem Cell     True
TCCACACAGCAATATG_Rectum-2_Stem Cell     True
TCGAGGCCAAGGTGTG_Rectum-2_Stem Cell     True
TCTATTGTCTTCGAGA_Rectum-2_Stem Cell     True
TGCCCTAGTAGCCTCG_Rectum-2_Stem Cell     True
Name: is_primary_data, Length: 3797, dtype: bool

In [73]:
adata.obs['is_primary_data'] = adata.obs['is_primary_data'].astype('bool')

## organism_ontology_term_id

In [74]:
adata.obs['organism_ontology_term_id'] = ['NCBITaxon:9606'] * len(adata.obs)

In [75]:
adata.obs['organism_ontology_term_id']

index
AAACCTGAGCTGGAAC_Rectum-1_Enterocyte    NCBITaxon:9606
AAACCTGGTTCCCGAG_Rectum-1_Enterocyte    NCBITaxon:9606
AAACCTGTCAGCGATT_Rectum-1_Enterocyte    NCBITaxon:9606
AAAGATGGTTCAGACT_Rectum-1_Enterocyte    NCBITaxon:9606
AAAGTAGAGAGTAAGG_Rectum-1_Enterocyte    NCBITaxon:9606
                                             ...      
TCAGGATGTTCCACAA_Rectum-2_Stem Cell     NCBITaxon:9606
TCCACACAGCAATATG_Rectum-2_Stem Cell     NCBITaxon:9606
TCGAGGCCAAGGTGTG_Rectum-2_Stem Cell     NCBITaxon:9606
TCTATTGTCTTCGAGA_Rectum-2_Stem Cell     NCBITaxon:9606
TGCCCTAGTAGCCTCG_Rectum-2_Stem Cell     NCBITaxon:9606
Name: organism_ontology_term_id, Length: 3797, dtype: object

In [76]:
adata.obs['organism_ontology_term_id'] = pd.Categorical(adata.obs['organism_ontology_term_id'])

In [77]:
adata.obs['organism_ontology_term_id']

index
AAACCTGAGCTGGAAC_Rectum-1_Enterocyte    NCBITaxon:9606
AAACCTGGTTCCCGAG_Rectum-1_Enterocyte    NCBITaxon:9606
AAACCTGTCAGCGATT_Rectum-1_Enterocyte    NCBITaxon:9606
AAAGATGGTTCAGACT_Rectum-1_Enterocyte    NCBITaxon:9606
AAAGTAGAGAGTAAGG_Rectum-1_Enterocyte    NCBITaxon:9606
                                             ...      
TCAGGATGTTCCACAA_Rectum-2_Stem Cell     NCBITaxon:9606
TCCACACAGCAATATG_Rectum-2_Stem Cell     NCBITaxon:9606
TCGAGGCCAAGGTGTG_Rectum-2_Stem Cell     NCBITaxon:9606
TCTATTGTCTTCGAGA_Rectum-2_Stem Cell     NCBITaxon:9606
TGCCCTAGTAGCCTCG_Rectum-2_Stem Cell     NCBITaxon:9606
Name: organism_ontology_term_id, Length: 3797, dtype: category
Categories (1, object): ['NCBITaxon:9606']

## sex_ontology_term_id

In [78]:
adata.obs['sex_ontology_term_id'] = adata.obs['Sample_ID']

In [79]:
adata.obs['sex_ontology_term_id']

index
AAACCTGAGCTGGAAC_Rectum-1_Enterocyte    Rectum-1
AAACCTGGTTCCCGAG_Rectum-1_Enterocyte    Rectum-1
AAACCTGTCAGCGATT_Rectum-1_Enterocyte    Rectum-1
AAAGATGGTTCAGACT_Rectum-1_Enterocyte    Rectum-1
AAAGTAGAGAGTAAGG_Rectum-1_Enterocyte    Rectum-1
                                          ...   
TCAGGATGTTCCACAA_Rectum-2_Stem Cell     Rectum-2
TCCACACAGCAATATG_Rectum-2_Stem Cell     Rectum-2
TCGAGGCCAAGGTGTG_Rectum-2_Stem Cell     Rectum-2
TCTATTGTCTTCGAGA_Rectum-2_Stem Cell     Rectum-2
TGCCCTAGTAGCCTCG_Rectum-2_Stem Cell     Rectum-2
Name: sex_ontology_term_id, Length: 3797, dtype: category
Categories (2, object): ['Rectum-1', 'Rectum-2']

**PATO_0000384 for male  
PATO_0000383 for female**

In [80]:
mapping = {'Rectum-1': 'PATO:0000383', 'Rectum-2': 'PATO:0000383'}

In [81]:
adata.obs['sex_ontology_term_id'] = adata.obs['sex_ontology_term_id'].map(mapping)

In [82]:
adata.obs['sex_ontology_term_id']

index
AAACCTGAGCTGGAAC_Rectum-1_Enterocyte    PATO:0000383
AAACCTGGTTCCCGAG_Rectum-1_Enterocyte    PATO:0000383
AAACCTGTCAGCGATT_Rectum-1_Enterocyte    PATO:0000383
AAAGATGGTTCAGACT_Rectum-1_Enterocyte    PATO:0000383
AAAGTAGAGAGTAAGG_Rectum-1_Enterocyte    PATO:0000383
                                            ...     
TCAGGATGTTCCACAA_Rectum-2_Stem Cell     PATO:0000383
TCCACACAGCAATATG_Rectum-2_Stem Cell     PATO:0000383
TCGAGGCCAAGGTGTG_Rectum-2_Stem Cell     PATO:0000383
TCTATTGTCTTCGAGA_Rectum-2_Stem Cell     PATO:0000383
TGCCCTAGTAGCCTCG_Rectum-2_Stem Cell     PATO:0000383
Name: sex_ontology_term_id, Length: 3797, dtype: object

## tissue_ontology_term_id

> <ins>**UBERON:0001156:**</ins> ascending Rectum (according to supp pdf)

In [83]:
adata.obs['tissue_ontology_term_id'] = ['UBERON:0001052'] * len(adata.obs)

In [84]:
adata.obs['tissue_ontology_term_id']

index
AAACCTGAGCTGGAAC_Rectum-1_Enterocyte    UBERON:0001052
AAACCTGGTTCCCGAG_Rectum-1_Enterocyte    UBERON:0001052
AAACCTGTCAGCGATT_Rectum-1_Enterocyte    UBERON:0001052
AAAGATGGTTCAGACT_Rectum-1_Enterocyte    UBERON:0001052
AAAGTAGAGAGTAAGG_Rectum-1_Enterocyte    UBERON:0001052
                                             ...      
TCAGGATGTTCCACAA_Rectum-2_Stem Cell     UBERON:0001052
TCCACACAGCAATATG_Rectum-2_Stem Cell     UBERON:0001052
TCGAGGCCAAGGTGTG_Rectum-2_Stem Cell     UBERON:0001052
TCTATTGTCTTCGAGA_Rectum-2_Stem Cell     UBERON:0001052
TGCCCTAGTAGCCTCG_Rectum-2_Stem Cell     UBERON:0001052
Name: tissue_ontology_term_id, Length: 3797, dtype: object

---

# <ins>var and raw.var

In [85]:
adata.var

Unnamed: 0_level_0,mito,ribo,hb,n_counts,n_cells,n_genes,highly_variable,means,dispersions,dispersions_norm,gene_ids,gene_symbols,feature_biotype
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
ENSG00000238009,False,False,False,25.0,25,23,False,0.003679,-0.360232,-0.807470,ENSG00000238009,RP11-34P13.7,gene
ENSG00000279457,False,False,False,579.0,548,520,True,0.107308,0.118275,0.572867,ENSG00000279457,WASH9P,gene
ENSG00000228463,False,False,False,230.0,230,219,False,0.033655,-0.314175,-0.674611,ENSG00000228463,AP006222.1,gene
ENSG00000237094,False,False,False,4.0,4,4,False,0.000481,-0.686255,-1.747941,ENSG00000237094,RP4-669L17.4,gene
ENSG00000237491,False,False,False,171.0,172,166,False,0.028947,-0.087199,-0.019861,ENSG00000237491,LINC01409,gene
...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000159261,False,False,False,3.0,3,3,False,0.000645,-0.009892,0.203147,ENSG00000159261,CLDN14,gene
ENSG00000272991,False,False,False,6.0,6,6,False,0.001739,0.338456,1.208017,ENSG00000272991,AF129408.17,gene
ENSG00000183486,False,False,False,4.0,4,4,False,0.001739,0.586054,1.922254,ENSG00000183486,MX2,gene
ENSG00000160200,False,False,False,3.0,3,3,False,0.000536,-0.200734,-0.347372,ENSG00000160200,CBS,gene


## feature_biotype

Adjusted at the beginning!

In [86]:
#adata2.var['feature_biotype'] = ['gene'] * len(adata2.var.index)

## feature_is_filtered

True if feature was filtered out in final matrix (X)  
Problem: This should be annotated in 'var', not 'raw.var'. __How to annotate the gene if it is already filtered out in the final matrix?__

In [87]:
adata.var['feature_is_filtered'] = [False] * len(adata.var)

In [88]:
adata.var['feature_is_filtered']

gene_ids
ENSG00000238009    False
ENSG00000279457    False
ENSG00000228463    False
ENSG00000237094    False
ENSG00000237491    False
                   ...  
ENSG00000159261    False
ENSG00000272991    False
ENSG00000183486    False
ENSG00000160200    False
ENSG00000160224    False
Name: feature_is_filtered, Length: 17497, dtype: bool

# <ins>obsm

In [89]:
adata.obsm

AxisArrays with keys: X_umap_hm

In [90]:
adata.obsm['X_umap'] = adata.obsm['X_umap_hm']
del adata.obsm['X_umap_hm']

# <ins>uns

In [91]:
adata.uns['schema_version'] = '2.0.0'

In [92]:
adata.uns['title'] = 'Rectum'

In [93]:
adata.uns['X_normalization'] = 'Scanpy'

In [94]:
adata.uns['default_embedding'] = 'X_umap'

---

---

# 4. Final adjustments

Final adjustments include the removal of the slots that were already re-created for the schema.

In [95]:
adata.obs

Unnamed: 0_level_0,Sample_ID,CellType,Location,n_counts,log1p_n_counts,n_genes,log1p_n_genes,percent_mito,percent_ribo,percent_hb,percent_top50,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,ethnicity_ontology_term_id,is_primary_data,organism_ontology_term_id,sex_ontology_term_id,tissue_ontology_term_id
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
AAACCTGAGCTGGAAC_Rectum-1_Enterocyte,Rectum-1,Enterocyte,Rectum,10500.0,9.259226,2545,7.842279,28.838095,5.447619,0.009524,49.000000,EFO:0009899,CL:1000347,HsapDv:0000160,MONDO:0004970,unknown,True,NCBITaxon:9606,PATO:0000383,UBERON:0001052
AAACCTGGTTCCCGAG_Rectum-1_Enterocyte,Rectum-1,Enterocyte,Rectum,15281.0,9.634431,3375,8.124447,16.190041,9.619789,0.032720,44.244487,EFO:0009899,CL:1000347,HsapDv:0000160,MONDO:0004970,unknown,True,NCBITaxon:9606,PATO:0000383,UBERON:0001052
AAACCTGTCAGCGATT_Rectum-1_Enterocyte,Rectum-1,Enterocyte,Rectum,6529.0,8.784163,1574,7.362011,32.409252,5.176903,0.015316,59.013631,EFO:0009899,CL:1000347,HsapDv:0000160,MONDO:0004970,unknown,True,NCBITaxon:9606,PATO:0000383,UBERON:0001052
AAAGATGGTTCAGACT_Rectum-1_Enterocyte,Rectum-1,Enterocyte,Rectum,13501.0,9.510593,2989,8.003029,24.946301,3.814532,0.014814,46.737279,EFO:0009899,CL:1000347,HsapDv:0000160,MONDO:0004970,unknown,True,NCBITaxon:9606,PATO:0000383,UBERON:0001052
AAAGTAGAGAGTAAGG_Rectum-1_Enterocyte,Rectum-1,Enterocyte,Rectum,5287.0,8.573195,1582,7.367077,29.960281,5.239266,0.037829,51.030830,EFO:0009899,CL:1000347,HsapDv:0000160,MONDO:0004970,unknown,True,NCBITaxon:9606,PATO:0000383,UBERON:0001052
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCAGGATGTTCCACAA_Rectum-2_Stem Cell,Rectum-2,Stem Cell,Rectum,17962.0,9.796069,2953,7.990915,11.897339,45.763279,0.000000,47.260884,EFO:0009899,CL:0000034,HsapDv:0000144,MONDO:0004970,unknown,True,NCBITaxon:9606,PATO:0000383,UBERON:0001052
TCCACACAGCAATATG_Rectum-2_Stem Cell,Rectum-2,Stem Cell,Rectum,21999.0,9.998797,3771,8.235361,12.205100,38.410835,0.013637,40.510932,EFO:0009899,CL:0000034,HsapDv:0000144,MONDO:0004970,unknown,True,NCBITaxon:9606,PATO:0000383,UBERON:0001052
TCGAGGCCAAGGTGTG_Rectum-2_Stem Cell,Rectum-2,Stem Cell,Rectum,17397.0,9.764111,3101,8.039802,10.766224,41.392197,0.005748,41.346209,EFO:0009899,CL:0000034,HsapDv:0000144,MONDO:0004970,unknown,True,NCBITaxon:9606,PATO:0000383,UBERON:0001052
TCTATTGTCTTCGAGA_Rectum-2_Stem Cell,Rectum-2,Stem Cell,Rectum,4892.0,8.495561,1223,7.109879,26.185608,30.498774,0.020442,58.053966,EFO:0009899,CL:0000034,HsapDv:0000144,MONDO:0004970,unknown,True,NCBITaxon:9606,PATO:0000383,UBERON:0001052


In [96]:
adata

AnnData object with n_obs × n_vars = 3797 × 17497
    obs: 'Sample_ID', 'CellType', 'Location', 'n_counts', 'log1p_n_counts', 'n_genes', 'log1p_n_genes', 'percent_mito', 'percent_ribo', 'percent_hb', 'percent_top50', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id'
    var: 'mito', 'ribo', 'hb', 'n_counts', 'n_cells', 'n_genes', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'gene_ids', 'gene_symbols', 'feature_biotype', 'feature_is_filtered'
    uns: 'leiden', 'neighbors_hm', 'pca', 'schema_version', 'title', 'X_normalization', 'default_embedding'
    obsm: 'X_umap'
    varm: 'PCs'

In [97]:
del adata.obs['Location']

In [99]:
del adata.var['gene_ids']

In [100]:
adata.write('wang20_rectum_f.h5ad', compression = 'gzip')

... storing 'disease_ontology_term_id' as categorical
... storing 'ethnicity_ontology_term_id' as categorical
... storing 'sex_ontology_term_id' as categorical
... storing 'tissue_ontology_term_id' as categorical
... storing 'gene_symbols' as categorical
... storing 'feature_biotype' as categorical
... storing 'feature_biotype' as categorical


---

# 5. Fixes

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import scipy
from tqdm import tqdm

In [2]:
adata = sc.read('wang20_rectum_f.h5ad')

In [3]:
adata

AnnData object with n_obs × n_vars = 3797 × 17497
    obs: 'CellType', 'n_counts', 'log1p_n_counts', 'n_genes', 'log1p_n_genes', 'percent_mito', 'percent_ribo', 'percent_hb', 'percent_top50', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'donor_id'
    var: 'mito', 'ribo', 'hb', 'n_counts', 'n_cells', 'n_genes', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'gene_symbols', 'feature_biotype', 'feature_is_filtered'
    uns: 'X_normalization', 'default_embedding', 'leiden', 'neighbors_hm', 'pca', 'schema_version', 'title'
    obsm: 'X_umap'
    varm: 'PCs'

In [4]:
adata.obs['donor_id'] = adata.obs['Sample_ID']

In [4]:
adata.obs['suspension_type'] = ['cell'] * len(adata.obs)

In [5]:
del adata.obs['Sample_ID']

In [5]:
adata.write('wang20_rectum_f.h5ad', compression = 'gzip')

... storing 'suspension_type' as categorical
