# **Curating `wang20_colon.processed.h5ad` dataset:**

* ~4K cells and 17K genes
* one of the three datasets in the publication
* publication link: https://rupress.org/jem/article/217/2/e20191130/132578/Single-cell-transcriptome-analysis-reveals

# <ins>STEPS:</ins>

## 1. Load the dataset 

In [2]:
import numpy as np
import pandas as pd
import scanpy as sc
import scipy 
from tqdm import tqdm

In [2]:
adata = sc.read('/lustre/scratch117/cellgen/cellgeni/covid19cellatlas/downloaded/wang20_colon.processed.h5ad')

In [3]:
adata

AnnData object with n_obs × n_vars = 4329 × 17181
    obs: 'Sample_ID', 'CellType', 'Location', 'n_counts', 'log1p_n_counts', 'n_genes', 'log1p_n_genes', 'percent_mito', 'percent_ribo', 'percent_hb', 'percent_top50'
    var: 'mito', 'ribo', 'hb', 'n_counts', 'n_cells', 'n_genes', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'leiden', 'neighbors_hm', 'pca'
    obsm: 'X_umap_hm'
    varm: 'PCs'

In [4]:
adata.obs

Unnamed: 0_level_0,Sample_ID,CellType,Location,n_counts,log1p_n_counts,n_genes,log1p_n_genes,percent_mito,percent_ribo,percent_hb,percent_top50
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AAGGAGCAGTTAGGTA_Colon-1_Enterocyte,Colon-1,Enterocyte,Colon,27109.0,10.207658,3977,8.288534,19.266663,12.837066,0.003689,46.656092
ACCTTTAGTACTTGAC_Colon-1_Enterocyte,Colon-1,Enterocyte,Colon,16638.0,9.719504,2972,7.997327,28.753456,5.968265,0.000000,52.866931
ACGGGCTAGTCAAGCG_Colon-1_Enterocyte,Colon-1,Enterocyte,Colon,23970.0,10.084600,4110,8.321422,17.008760,15.640384,0.012516,42.515645
AGATCTGCAAGGACAC_Colon-1_Enterocyte,Colon-1,Enterocyte,Colon,20066.0,9.906832,3751,8.230044,16.246387,15.389215,0.014951,42.614373
ATGGGAGCAATGCCAT_Colon-1_Enterocyte,Colon-1,Enterocyte,Colon,23372.0,10.059337,4027,8.301025,21.303268,10.071881,0.017114,45.400479
...,...,...,...,...,...,...,...,...,...,...,...
TGTGGTACACCACCAG_Colon-2_Stem Cell,Colon-2,Stem Cell,Colon,21546.0,9.977992,3527,8.168486,11.073981,43.274853,0.013924,44.634735
TGTTCCGCAATGCCAT_Colon-2_Stem Cell,Colon-2,Stem Cell,Colon,7831.0,8.965973,1846,7.521318,21.210573,34.746521,0.000000,48.665560
TTAGGCAAGAGTGACC_Colon-2_Stem Cell,Colon-2,Stem Cell,Colon,11279.0,9.330787,2487,7.819234,9.752638,40.739426,0.000000,41.794485
TTATGCTTCGTCTGCT_Colon-2_Stem Cell,Colon-2,Stem Cell,Colon,8277.0,9.021357,1869,7.533694,16.261930,39.809113,0.000000,48.133382


## 2. Pre-prepare before schema adjustments

### Add row counts

In [5]:
raw = pd.read_table('/lustre/scratch117/cellgen/cellgeni/covid19cellatlas/raw/wang20/GSE125970_raw_UMIcounts.txt.gz', index_col=0)

In [6]:
raw

Unnamed: 0_level_0,AAATGCCCAATCTGCA_Ileum-1_Enterocyte,AACTCTTGTCTAGTCA_Ileum-1_Enterocyte,AAGACCTCACGGACAA_Ileum-1_Enterocyte,AAGCCGCGTCTTGCGG_Ileum-1_Enterocyte,AAGTCTGGTTGTCTTT_Ileum-1_Enterocyte,ACGAGGATCGGCCGAT_Ileum-1_Enterocyte,ACGGCCAGTCTAAACC_Ileum-1_Enterocyte,AGGCCGTTCGAGCCCA_Ileum-1_Enterocyte,AGTGAGGGTCGGCTCA_Ileum-1_Enterocyte,ATGAGGGAGGATATAC_Ileum-1_Enterocyte,...,TAGTTGGCAGTCCTTC_Rectum-2_Stem Cell,TAGTTGGTCTGCCCTA_Rectum-2_Stem Cell,TATCTCAGTGCGGTAA_Rectum-2_Stem Cell,TATTACCCAATAACGA_Rectum-2_Stem Cell,TCAATCTTCACGACTA_Rectum-2_Stem Cell,TCAGGATGTTCCACAA_Rectum-2_Stem Cell,TCCACACAGCAATATG_Rectum-2_Stem Cell,TCGAGGCCAAGGTGTG_Rectum-2_Stem Cell,TCTATTGTCTTCGAGA_Rectum-2_Stem Cell,TGCCCTAGTAGCCTCG_Rectum-2_Stem Cell
GENE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
RP11-34P13.7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
FO538757.2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
AP006222.2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
RP4-669L17.10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
RP11-206L10.9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GGT2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
RP3-510H16.3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
RP4-669P10.16,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BACH1-AS1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
raw = raw.loc[:,list(adata.obs.index)]

In [9]:
raw.shape

(19525, 4329)

In [10]:
adata

AnnData object with n_obs × n_vars = 4329 × 17181
    obs: 'Sample_ID', 'CellType', 'Location', 'n_counts', 'log1p_n_counts', 'n_genes', 'log1p_n_genes', 'percent_mito', 'percent_ribo', 'percent_hb', 'percent_top50'
    var: 'mito', 'ribo', 'hb', 'n_counts', 'n_cells', 'n_genes', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'leiden', 'neighbors_hm', 'pca'
    obsm: 'X_umap_hm'
    varm: 'PCs'

### Convert gene symbols to ENSG ids by using the reference gene list of this dataset

In [11]:
sym2ensg = pd.read_table('../GRCh38-1.2.0.gene_names.tsv', header = None, names = ['ensg_id', 'gene_symbol'])

validlst = pd.read_csv('~/genes_homo_sapiens.csv', header = None, names = ['gene_ids', 'name','no'])

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [12]:
sym2ensg.index = sym2ensg['gene_symbol']

validlst.index = validlst['name']

In [13]:
adata.var['gene_ids'] = ['a'] * len(adata.var)

In [14]:
skippd = [] # genes not in sym2ensg
multensg = [] # genes that have multiple ensg's from sym2ensg
all_ensg = list(validlst['gene_ids'])
for i in tqdm(adata.var_names):
    if not i in sym2ensg['gene_symbol']:
        skippd.append(i)
        continue
        
    ensg_id = sym2ensg.at[i, 'ensg_id']
    
    if type(ensg_id) == pd.core.series.Series:
        multensg.append(i)
        ensg_id = list(set(sym2ensg.at[i, 'ensg_id']).intersection(all_ensg))[0]
   
    adata.var.at[i, 'gene_ids'] = ensg_id

100%|██████████| 17181/17181 [00:11<00:00, 1478.16it/s]


### Remove the genes that do not match in gene-ensemblID file

In [15]:
notvalid = adata.var[adata.var['gene_ids'].isin(list(set(adata.var['gene_ids']).difference(set(validlst['gene_ids']))))]

In [16]:
notvalid

Unnamed: 0_level_0,mito,ribo,hb,n_counts,n_cells,n_genes,highly_variable,means,dispersions,dispersions_norm,gene_ids
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
RP4-635E18.8,False,False,False,19.0,22,19,False,0.003308,-0.191497,-0.282104,ENSG00000271895
NPPA-AS1,False,False,False,38.0,38,38,False,0.007177,0.194545,1.087477,ENSG00000242349
UBXN10-AS1,False,False,False,23.0,25,23,False,0.003669,0.025313,0.487085,ENSG00000225986
RP11-268J15.5,False,False,False,25.0,27,25,False,0.004058,-0.182576,-0.250454,ENSG00000116883
RP11-69E11.4,False,False,False,212.0,216,200,True,0.038396,0.203774,1.120221,ENSG00000182109
...,...,...,...,...,...,...,...,...,...,...,...
C10orf131,False,False,False,3.0,3,3,False,0.000287,-0.862585,-2.662959,ENSG00000173088
RP11-982M15.6,False,False,False,3.0,3,3,False,0.000442,-0.430415,-1.129726,ENSG00000256050
RP11-60L3.1,False,False,False,4.0,4,4,False,0.000577,-0.420502,-1.094556,ENSG00000259264
TMEM256-PLSCR3.1,False,False,False,3.0,3,3,False,0.000516,-0.278331,-0.590170,a


In [17]:
sum(~adata.var_names.isin(notvalid.index))

17015

In [18]:
adata = adata[:,~adata.var_names.isin(notvalid.index)]

  res = method(*args, **kwargs)


In [19]:
adata

View of AnnData object with n_obs × n_vars = 4329 × 17015
    obs: 'Sample_ID', 'CellType', 'Location', 'n_counts', 'log1p_n_counts', 'n_genes', 'log1p_n_genes', 'percent_mito', 'percent_ribo', 'percent_hb', 'percent_top50'
    var: 'mito', 'ribo', 'hb', 'n_counts', 'n_cells', 'n_genes', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'gene_ids'
    uns: 'leiden', 'neighbors_hm', 'pca'
    obsm: 'X_umap_hm'
    varm: 'PCs'

In [20]:
raw = raw.loc[adata.var_names,:]

In [21]:
raw

Unnamed: 0_level_0,AAGGAGCAGTTAGGTA_Colon-1_Enterocyte,ACCTTTAGTACTTGAC_Colon-1_Enterocyte,ACGGGCTAGTCAAGCG_Colon-1_Enterocyte,AGATCTGCAAGGACAC_Colon-1_Enterocyte,ATGGGAGCAATGCCAT_Colon-1_Enterocyte,ATTCTACTCCGTACAA_Colon-1_Enterocyte,CAACTAGGTATAGGTA_Colon-1_Enterocyte,CACAAACAGTGCGATG_Colon-1_Enterocyte,CACCTTGAGTTAGGTA_Colon-1_Enterocyte,CATGCCTCAAGCTGGA_Colon-1_Enterocyte,...,GTCGTAATCGTCGTTC_Colon-2_Stem Cell,TGAAAGATCCCATTTA_Colon-2_Stem Cell,TGACAACAGCATCATC_Colon-2_Stem Cell,TGACGGCAGAGCTGCA_Colon-2_Stem Cell,TGTGGTAAGTTTAGGA_Colon-2_Stem Cell,TGTGGTACACCACCAG_Colon-2_Stem Cell,TGTTCCGCAATGCCAT_Colon-2_Stem Cell,TTAGGCAAGAGTGACC_Colon-2_Stem Cell,TTATGCTTCGTCTGCT_Colon-2_Stem Cell,TTGCGTCCAATGGAAT_Colon-2_Stem Cell
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
RP11-34P13.7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
FO538757.2,0,0,0,0,0,0,1,0,0,0,...,0,0,0,1,1,0,0,0,0,0
AP006222.2,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
RP4-669L17.10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
RP11-206L10.9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CH507-39O4.2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CH507-154B10.2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CH507-254M2.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AP000431.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Updating gene names from the current reference to Gencode-v38

by checking ENSG ids in Gencode-v38 list, example:  
A -> ENSG01 (ref of the dataset)  
B -> ENSG01 (Gencode-v38)

In [22]:
validlst.index = validlst['gene_ids']

In [23]:
adata.var['gene_symbols'] = adata.var_names
adata.var_names = adata.var['gene_ids']

Trying to set attribute `.var` of view, copying.


In [24]:
validlst

Unnamed: 0_level_0,gene_ids,name,no
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000223972,ENSG00000223972,DDX11L1,5
ENST00000456328,ENST00000456328,DDX11L1-202,2
ENST00000450305,ENST00000450305,DDX11L1-201,2
ENSG00000227232,ENSG00000227232,WASH7P,5
ENST00000488147,ENST00000488147,WASH7P-201,1
...,...,...,...
ENST00000621424,ENST00000621424,ENSG110105435.4-001,4
ENSG00000278625,ENSG00000278625,U6,1
ENST00000616830,ENST00000616830,U6.88-201,1
ENSG00000277374,ENSG00000277374,U1,1


In [None]:
oldsymb = []
newsymb = []
listee = list(validlst['gene_ids'])
for i in tqdm(adata.var['gene_ids']):
    if i in listee:
        symb_val = list(validlst.loc[validlst['gene_ids'] == i, 'name'])
        symb_org = adata.var.at[i, 'gene_symbols']
        symb_dif = list(set(symb_val).difference(set(symb_org)))
        if len(symb_dif) > 0:
            oldsymb.append(symb_org)
            newsymb.append(symb_dif[0])

 73%|███████▎  | 12349/17015 [03:47<01:32, 50.50it/s]

In [33]:
symbols = pd.DataFrame([oldsymb,newsymb]).T

In [35]:
drop_lines = []
for i in range(0,len(symbols)):
    if symbols.iloc[i,0] == symbols.iloc[i,1]:
        drop_lines.append(i)

In [36]:
symbols = symbols.drop(drop_lines).rename(columns = {0: 'oldsymb', 1: 'newsymb'})

In [37]:
symbols = symbols.reset_index().loc[:,['oldsymb', 'newsymb']]

In [38]:
symbols

Unnamed: 0,oldsymb,newsymb
0,FO538757.2,WASH9P
1,AP006222.2,AP006222.1
2,RP4-669L17.10,RP4-669L17.4
3,RP11-206L10.9,LINC01409
4,FAM132A,C1QTNF12
...,...,...
1218,RP11-46C24.3,LINC02138
1219,RP11-63A1.1,GOSR2-DT
1220,RP11-973H7.1,LINC01882
1221,AC005775.2,MADCAM1-AS1


In [44]:
adata.var

Unnamed: 0_level_0,mito,ribo,hb,n_counts,n_cells,n_genes,highly_variable,means,dispersions,dispersions_norm,gene_symbols
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ENSG00000238009,False,False,False,9.0,10,9,False,0.001402,-0.223694,-0.396330,RP11-34P13.7
ENSG00000279457,False,False,False,620.0,637,572,False,0.092447,-0.075833,0.128244,WASH9P
ENSG00000228463,False,False,False,307.0,326,296,False,0.040123,-0.342289,-0.817075,AP006222.2
ENSG00000237094,False,False,False,3.0,3,3,False,0.000246,-1.017381,-3.212137,RP4-669L17.10
ENSG00000237491,False,False,False,120.0,130,118,False,0.018567,-0.113123,-0.004051,RP11-206L10.9
...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000279064,False,False,False,3.0,3,3,False,0.000405,-0.494990,-1.358822,CH507-39O4.2
ENSG00000280018,False,False,False,7.0,7,7,False,0.000980,-0.424253,-1.107865,CH507-154B10.2
ENSG00000277067,False,False,False,3.0,3,3,False,0.000386,-0.504515,-1.392615,CH507-254M2.1
ENSG00000235965,False,False,False,4.0,4,4,False,0.000723,0.047152,0.564565,AP000431.1


In [48]:
adata.var

Unnamed: 0_level_0,mito,ribo,hb,n_counts,n_cells,n_genes,highly_variable,means,dispersions,dispersions_norm,gene_symbols
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ENSG00000238009,False,False,False,9.0,10,9,False,0.001402,-0.223694,-0.396330,RP11-34P13.7
ENSG00000279457,False,False,False,620.0,637,572,False,0.092447,-0.075833,0.128244,WASH9P
ENSG00000228463,False,False,False,307.0,326,296,False,0.040123,-0.342289,-0.817075,AP006222.2
ENSG00000237094,False,False,False,3.0,3,3,False,0.000246,-1.017381,-3.212137,RP4-669L17.10
ENSG00000237491,False,False,False,120.0,130,118,False,0.018567,-0.113123,-0.004051,RP11-206L10.9
...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000279064,False,False,False,3.0,3,3,False,0.000405,-0.494990,-1.358822,CH507-39O4.2
ENSG00000280018,False,False,False,7.0,7,7,False,0.000980,-0.424253,-1.107865,CH507-154B10.2
ENSG00000277067,False,False,False,3.0,3,3,False,0.000386,-0.504515,-1.392615,CH507-254M2.1
ENSG00000235965,False,False,False,4.0,4,4,False,0.000723,0.047152,0.564565,AP000431.1


In [50]:
np.where(adata.var['gene_symbols'].isin([symbols['oldsymb'][i]]))[0]

array([], dtype=int64)

In [None]:
for i in tqdm(range(0,len(symbols))):
    idxx = np.where(adata.var['gene_symbols'].isin([symbols['oldsymb'][i]]))[0][0]
    adata.var['gene_symbols'][idxx] = symbols['newsymb'][i]

In [54]:
raw = raw.transpose()

In [55]:
raw = raw.set_axis(list(adata.var_names), axis = 1, inplace=False)

In [56]:
raw

Unnamed: 0,ENSG00000238009,ENSG00000279457,ENSG00000228463,ENSG00000237094,ENSG00000237491,ENSG00000225880,ENSG00000230368,ENSG00000187634,ENSG00000188976,ENSG00000187961,...,ENSG00000100156,ENSG00000281538,ENSG00000230319,ENSG00000188064,ENSG00000279494,ENSG00000279064,ENSG00000280018,ENSG00000277067,ENSG00000235965,ENSG00000261610
AAGGAGCAGTTAGGTA_Colon-1_Enterocyte,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ACCTTTAGTACTTGAC_Colon-1_Enterocyte,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ACGGGCTAGTCAAGCG_Colon-1_Enterocyte,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AGATCTGCAAGGACAC_Colon-1_Enterocyte,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
ATGGGAGCAATGCCAT_Colon-1_Enterocyte,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TGTGGTACACCACCAG_Colon-2_Stem Cell,0,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
TGTTCCGCAATGCCAT_Colon-2_Stem Cell,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TTAGGCAAGAGTGACC_Colon-2_Stem Cell,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TTATGCTTCGTCTGCT_Colon-2_Stem Cell,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [57]:
raww = adata.copy()

In [58]:
raww.X = scipy.sparse.csr_matrix(raw)

In [59]:
del raww.obs
del raww.uns
del raww.obsm
del raww.varm
del raww.var

In [60]:
adata.var['feature_biotype'] = ["gene"] * len(adata.var)

In [61]:
raww.var['feature_biotype'] = ["gene"] * len(raww.var)

In [62]:
adata.raw = raww

---

---

## 3. Apply the schema requirements:

# <ins>X

Raw and normal count matrices are in the object, no additional adjustment is needed!

# <ins>obs

In [63]:
adata.obs.columns

Index(['Sample_ID', 'CellType', 'Location', 'n_counts', 'log1p_n_counts',
       'n_genes', 'log1p_n_genes', 'percent_mito', 'percent_ribo',
       'percent_hb', 'percent_top50'],
      dtype='object')

## assay_ontology_term_id

At GEO:  
> "cDNA library establishment and sequencing: single cell suspensions were captured using the <ins>10X Genomics Chromium Single Cell 3' Library & Gel Bead Kit v2 (PN-120237)</ins> according to the manufacturers protoraw."

In [64]:
adata.obs['assay_ontology_term_id'] = ["EFO:0009899"] * len(adata.obs)

In [65]:
adata.obs['assay_ontology_term_id']

index
AAGGAGCAGTTAGGTA_Colon-1_Enterocyte    EFO:0009899
ACCTTTAGTACTTGAC_Colon-1_Enterocyte    EFO:0009899
ACGGGCTAGTCAAGCG_Colon-1_Enterocyte    EFO:0009899
AGATCTGCAAGGACAC_Colon-1_Enterocyte    EFO:0009899
ATGGGAGCAATGCCAT_Colon-1_Enterocyte    EFO:0009899
                                          ...     
TGTGGTACACCACCAG_Colon-2_Stem Cell     EFO:0009899
TGTTCCGCAATGCCAT_Colon-2_Stem Cell     EFO:0009899
TTAGGCAAGAGTGACC_Colon-2_Stem Cell     EFO:0009899
TTATGCTTCGTCTGCT_Colon-2_Stem Cell     EFO:0009899
TTGCGTCCAATGGAAT_Colon-2_Stem Cell     EFO:0009899
Name: assay_ontology_term_id, Length: 4329, dtype: object

In [66]:
adata.obs['assay_ontology_term_id'] = pd.Categorical(adata.obs['assay_ontology_term_id'])

In [67]:
adata.obs['assay_ontology_term_id']

index
AAGGAGCAGTTAGGTA_Colon-1_Enterocyte    EFO:0009899
ACCTTTAGTACTTGAC_Colon-1_Enterocyte    EFO:0009899
ACGGGCTAGTCAAGCG_Colon-1_Enterocyte    EFO:0009899
AGATCTGCAAGGACAC_Colon-1_Enterocyte    EFO:0009899
ATGGGAGCAATGCCAT_Colon-1_Enterocyte    EFO:0009899
                                          ...     
TGTGGTACACCACCAG_Colon-2_Stem Cell     EFO:0009899
TGTTCCGCAATGCCAT_Colon-2_Stem Cell     EFO:0009899
TTAGGCAAGAGTGACC_Colon-2_Stem Cell     EFO:0009899
TTATGCTTCGTCTGCT_Colon-2_Stem Cell     EFO:0009899
TTGCGTCCAATGGAAT_Colon-2_Stem Cell     EFO:0009899
Name: assay_ontology_term_id, Length: 4329, dtype: category
Categories (1, object): ['EFO:0009899']

## cell_type_ontology_term_id

In [68]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['CellType']

In [69]:
list(adata.obs['CellType'].unique())

['Enterocyte',
 'Goblet',
 'Paneth-like',
 'Enteriendocrine',
 'Progenitor',
 'TA',
 'Stem Cell']

In [70]:
adata.obs['cell_type_ontology_term_id']

index
AAGGAGCAGTTAGGTA_Colon-1_Enterocyte    Enterocyte
ACCTTTAGTACTTGAC_Colon-1_Enterocyte    Enterocyte
ACGGGCTAGTCAAGCG_Colon-1_Enterocyte    Enterocyte
AGATCTGCAAGGACAC_Colon-1_Enterocyte    Enterocyte
ATGGGAGCAATGCCAT_Colon-1_Enterocyte    Enterocyte
                                          ...    
TGTGGTACACCACCAG_Colon-2_Stem Cell      Stem Cell
TGTTCCGCAATGCCAT_Colon-2_Stem Cell      Stem Cell
TTAGGCAAGAGTGACC_Colon-2_Stem Cell      Stem Cell
TTATGCTTCGTCTGCT_Colon-2_Stem Cell      Stem Cell
TTGCGTCCAATGGAAT_Colon-2_Stem Cell      Stem Cell
Name: cell_type_ontology_term_id, Length: 4329, dtype: category
Categories (7, object): ['Enteriendocrine', 'Enterocyte', 'Goblet', 'Paneth-like', 'Progenitor', 'Stem Cell', 'TA']

In [72]:
mapping = {'Enterocyte': 'CL:1000347',
 'Goblet': 'CL:0009039',
 'Paneth-like': 'CL:0009009',
 'Enteriendocrine': 'CL:0009042',
 'Progenitor': 'CL:0011026',
 'TA': 'CL:0009011',
 'Stem Cell': 'CL:0000034'}

In [73]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_type_ontology_term_id'].map(mapping)

In [74]:
adata.obs['cell_type_ontology_term_id']

index
AAGGAGCAGTTAGGTA_Colon-1_Enterocyte    CL:1000347
ACCTTTAGTACTTGAC_Colon-1_Enterocyte    CL:1000347
ACGGGCTAGTCAAGCG_Colon-1_Enterocyte    CL:1000347
AGATCTGCAAGGACAC_Colon-1_Enterocyte    CL:1000347
ATGGGAGCAATGCCAT_Colon-1_Enterocyte    CL:1000347
                                          ...    
TGTGGTACACCACCAG_Colon-2_Stem Cell     CL:0000034
TGTTCCGCAATGCCAT_Colon-2_Stem Cell     CL:0000034
TTAGGCAAGAGTGACC_Colon-2_Stem Cell     CL:0000034
TTATGCTTCGTCTGCT_Colon-2_Stem Cell     CL:0000034
TTGCGTCCAATGGAAT_Colon-2_Stem Cell     CL:0000034
Name: cell_type_ontology_term_id, Length: 4329, dtype: category
Categories (7, object): ['CL:0009042', 'CL:1000347', 'CL:0009039', 'CL:0009009', 'CL:0011026', 'CL:0000034', 'CL:0009011']

## development_stage_ontology_term_id

In [75]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['Sample_ID']

In [76]:
adata.obs['development_stage_ontology_term_id']

index
AAGGAGCAGTTAGGTA_Colon-1_Enterocyte    Colon-1
ACCTTTAGTACTTGAC_Colon-1_Enterocyte    Colon-1
ACGGGCTAGTCAAGCG_Colon-1_Enterocyte    Colon-1
AGATCTGCAAGGACAC_Colon-1_Enterocyte    Colon-1
ATGGGAGCAATGCCAT_Colon-1_Enterocyte    Colon-1
                                        ...   
TGTGGTACACCACCAG_Colon-2_Stem Cell     Colon-2
TGTTCCGCAATGCCAT_Colon-2_Stem Cell     Colon-2
TTAGGCAAGAGTGACC_Colon-2_Stem Cell     Colon-2
TTATGCTTCGTCTGCT_Colon-2_Stem Cell     Colon-2
TTGCGTCCAATGGAAT_Colon-2_Stem Cell     Colon-2
Name: development_stage_ontology_term_id, Length: 4329, dtype: category
Categories (2, object): ['Colon-1', 'Colon-2']

In [77]:
adata.obs['development_stage_ontology_term_id'].unique()

['Colon-1', 'Colon-2']
Categories (2, object): ['Colon-1', 'Colon-2']

In [78]:
mapping = {'Colon-1': 'HsapDv:0000148', 'Colon-2': 'HsapDv:0000161'}

In [79]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['development_stage_ontology_term_id'].map(mapping)

In [80]:
adata.obs['development_stage_ontology_term_id']

index
AAGGAGCAGTTAGGTA_Colon-1_Enterocyte    HsapDv:0000148
ACCTTTAGTACTTGAC_Colon-1_Enterocyte    HsapDv:0000148
ACGGGCTAGTCAAGCG_Colon-1_Enterocyte    HsapDv:0000148
AGATCTGCAAGGACAC_Colon-1_Enterocyte    HsapDv:0000148
ATGGGAGCAATGCCAT_Colon-1_Enterocyte    HsapDv:0000148
                                            ...      
TGTGGTACACCACCAG_Colon-2_Stem Cell     HsapDv:0000161
TGTTCCGCAATGCCAT_Colon-2_Stem Cell     HsapDv:0000161
TTAGGCAAGAGTGACC_Colon-2_Stem Cell     HsapDv:0000161
TTATGCTTCGTCTGCT_Colon-2_Stem Cell     HsapDv:0000161
TTGCGTCCAATGGAAT_Colon-2_Stem Cell     HsapDv:0000161
Name: development_stage_ontology_term_id, Length: 4329, dtype: category
Categories (2, object): ['HsapDv:0000148', 'HsapDv:0000161']

## disease_ontology_term_id

**"PATO:0000461"** for normal or healthy.

In [81]:
adata.obs['disease_ontology_term_id'] = adata.obs['Sample_ID']

In [82]:
adata.obs['disease_ontology_term_id']

index
AAGGAGCAGTTAGGTA_Colon-1_Enterocyte    Colon-1
ACCTTTAGTACTTGAC_Colon-1_Enterocyte    Colon-1
ACGGGCTAGTCAAGCG_Colon-1_Enterocyte    Colon-1
AGATCTGCAAGGACAC_Colon-1_Enterocyte    Colon-1
ATGGGAGCAATGCCAT_Colon-1_Enterocyte    Colon-1
                                        ...   
TGTGGTACACCACCAG_Colon-2_Stem Cell     Colon-2
TGTTCCGCAATGCCAT_Colon-2_Stem Cell     Colon-2
TTAGGCAAGAGTGACC_Colon-2_Stem Cell     Colon-2
TTATGCTTCGTCTGCT_Colon-2_Stem Cell     Colon-2
TTGCGTCCAATGGAAT_Colon-2_Stem Cell     Colon-2
Name: disease_ontology_term_id, Length: 4329, dtype: category
Categories (2, object): ['Colon-1', 'Colon-2']

In [83]:
mapping = {'Colon-1': 'MONDO:0004970', 'Colon-2': 'MONDO:0004970'}

In [84]:
adata.obs['disease_ontology_term_id'] = adata.obs['disease_ontology_term_id'].map(mapping)

In [85]:
adata.obs['disease_ontology_term_id']

index
AAGGAGCAGTTAGGTA_Colon-1_Enterocyte    MONDO:0004970
ACCTTTAGTACTTGAC_Colon-1_Enterocyte    MONDO:0004970
ACGGGCTAGTCAAGCG_Colon-1_Enterocyte    MONDO:0004970
AGATCTGCAAGGACAC_Colon-1_Enterocyte    MONDO:0004970
ATGGGAGCAATGCCAT_Colon-1_Enterocyte    MONDO:0004970
                                           ...      
TGTGGTACACCACCAG_Colon-2_Stem Cell     MONDO:0004970
TGTTCCGCAATGCCAT_Colon-2_Stem Cell     MONDO:0004970
TTAGGCAAGAGTGACC_Colon-2_Stem Cell     MONDO:0004970
TTATGCTTCGTCTGCT_Colon-2_Stem Cell     MONDO:0004970
TTGCGTCCAATGGAAT_Colon-2_Stem Cell     MONDO:0004970
Name: disease_ontology_term_id, Length: 4329, dtype: object

## ethnicity_ontology_term_id

In [86]:
adata.obs['ethnicity_ontology_term_id'] = ['unknown'] * len(adata.obs)

In [87]:
adata.obs['ethnicity_ontology_term_id']

index
AAGGAGCAGTTAGGTA_Colon-1_Enterocyte    unknown
ACCTTTAGTACTTGAC_Colon-1_Enterocyte    unknown
ACGGGCTAGTCAAGCG_Colon-1_Enterocyte    unknown
AGATCTGCAAGGACAC_Colon-1_Enterocyte    unknown
ATGGGAGCAATGCCAT_Colon-1_Enterocyte    unknown
                                        ...   
TGTGGTACACCACCAG_Colon-2_Stem Cell     unknown
TGTTCCGCAATGCCAT_Colon-2_Stem Cell     unknown
TTAGGCAAGAGTGACC_Colon-2_Stem Cell     unknown
TTATGCTTCGTCTGCT_Colon-2_Stem Cell     unknown
TTGCGTCCAATGGAAT_Colon-2_Stem Cell     unknown
Name: ethnicity_ontology_term_id, Length: 4329, dtype: object

## is_primary_data

In [88]:
adata.obs['is_primary_data'] = [True] * len(adata.obs)

In [89]:
adata.obs['is_primary_data']

index
AAGGAGCAGTTAGGTA_Colon-1_Enterocyte    True
ACCTTTAGTACTTGAC_Colon-1_Enterocyte    True
ACGGGCTAGTCAAGCG_Colon-1_Enterocyte    True
AGATCTGCAAGGACAC_Colon-1_Enterocyte    True
ATGGGAGCAATGCCAT_Colon-1_Enterocyte    True
                                       ... 
TGTGGTACACCACCAG_Colon-2_Stem Cell     True
TGTTCCGCAATGCCAT_Colon-2_Stem Cell     True
TTAGGCAAGAGTGACC_Colon-2_Stem Cell     True
TTATGCTTCGTCTGCT_Colon-2_Stem Cell     True
TTGCGTCCAATGGAAT_Colon-2_Stem Cell     True
Name: is_primary_data, Length: 4329, dtype: bool

In [90]:
adata.obs['is_primary_data'] = adata.obs['is_primary_data'].astype('bool')

## organism_ontology_term_id

In [91]:
adata.obs['organism_ontology_term_id'] = ['NCBITaxon:9606'] * len(adata.obs)

In [92]:
adata.obs['organism_ontology_term_id']

index
AAGGAGCAGTTAGGTA_Colon-1_Enterocyte    NCBITaxon:9606
ACCTTTAGTACTTGAC_Colon-1_Enterocyte    NCBITaxon:9606
ACGGGCTAGTCAAGCG_Colon-1_Enterocyte    NCBITaxon:9606
AGATCTGCAAGGACAC_Colon-1_Enterocyte    NCBITaxon:9606
ATGGGAGCAATGCCAT_Colon-1_Enterocyte    NCBITaxon:9606
                                            ...      
TGTGGTACACCACCAG_Colon-2_Stem Cell     NCBITaxon:9606
TGTTCCGCAATGCCAT_Colon-2_Stem Cell     NCBITaxon:9606
TTAGGCAAGAGTGACC_Colon-2_Stem Cell     NCBITaxon:9606
TTATGCTTCGTCTGCT_Colon-2_Stem Cell     NCBITaxon:9606
TTGCGTCCAATGGAAT_Colon-2_Stem Cell     NCBITaxon:9606
Name: organism_ontology_term_id, Length: 4329, dtype: object

In [93]:
adata.obs['organism_ontology_term_id'] = pd.Categorical(adata.obs['organism_ontology_term_id'])

In [94]:
adata.obs['organism_ontology_term_id']

index
AAGGAGCAGTTAGGTA_Colon-1_Enterocyte    NCBITaxon:9606
ACCTTTAGTACTTGAC_Colon-1_Enterocyte    NCBITaxon:9606
ACGGGCTAGTCAAGCG_Colon-1_Enterocyte    NCBITaxon:9606
AGATCTGCAAGGACAC_Colon-1_Enterocyte    NCBITaxon:9606
ATGGGAGCAATGCCAT_Colon-1_Enterocyte    NCBITaxon:9606
                                            ...      
TGTGGTACACCACCAG_Colon-2_Stem Cell     NCBITaxon:9606
TGTTCCGCAATGCCAT_Colon-2_Stem Cell     NCBITaxon:9606
TTAGGCAAGAGTGACC_Colon-2_Stem Cell     NCBITaxon:9606
TTATGCTTCGTCTGCT_Colon-2_Stem Cell     NCBITaxon:9606
TTGCGTCCAATGGAAT_Colon-2_Stem Cell     NCBITaxon:9606
Name: organism_ontology_term_id, Length: 4329, dtype: category
Categories (1, object): ['NCBITaxon:9606']

## sex_ontology_term_id

In [95]:
adata.obs['sex_ontology_term_id'] = adata.obs['Sample_ID']

In [96]:
adata.obs['sex_ontology_term_id']

index
AAGGAGCAGTTAGGTA_Colon-1_Enterocyte    Colon-1
ACCTTTAGTACTTGAC_Colon-1_Enterocyte    Colon-1
ACGGGCTAGTCAAGCG_Colon-1_Enterocyte    Colon-1
AGATCTGCAAGGACAC_Colon-1_Enterocyte    Colon-1
ATGGGAGCAATGCCAT_Colon-1_Enterocyte    Colon-1
                                        ...   
TGTGGTACACCACCAG_Colon-2_Stem Cell     Colon-2
TGTTCCGCAATGCCAT_Colon-2_Stem Cell     Colon-2
TTAGGCAAGAGTGACC_Colon-2_Stem Cell     Colon-2
TTATGCTTCGTCTGCT_Colon-2_Stem Cell     Colon-2
TTGCGTCCAATGGAAT_Colon-2_Stem Cell     Colon-2
Name: sex_ontology_term_id, Length: 4329, dtype: category
Categories (2, object): ['Colon-1', 'Colon-2']

**PATO_0000384 for male  
PATO_0000383 for female**

In [97]:
mapping = {'Colon-1': 'PATO:0000384', 'Colon-2': 'PATO:0000383'}

In [98]:
adata.obs['sex_ontology_term_id'] = adata.obs['sex_ontology_term_id'].map(mapping)

In [99]:
adata.obs['sex_ontology_term_id']

index
AAGGAGCAGTTAGGTA_Colon-1_Enterocyte    PATO:0000384
ACCTTTAGTACTTGAC_Colon-1_Enterocyte    PATO:0000384
ACGGGCTAGTCAAGCG_Colon-1_Enterocyte    PATO:0000384
AGATCTGCAAGGACAC_Colon-1_Enterocyte    PATO:0000384
ATGGGAGCAATGCCAT_Colon-1_Enterocyte    PATO:0000384
                                           ...     
TGTGGTACACCACCAG_Colon-2_Stem Cell     PATO:0000383
TGTTCCGCAATGCCAT_Colon-2_Stem Cell     PATO:0000383
TTAGGCAAGAGTGACC_Colon-2_Stem Cell     PATO:0000383
TTATGCTTCGTCTGCT_Colon-2_Stem Cell     PATO:0000383
TTGCGTCCAATGGAAT_Colon-2_Stem Cell     PATO:0000383
Name: sex_ontology_term_id, Length: 4329, dtype: category
Categories (2, object): ['PATO:0000384', 'PATO:0000383']

## tissue_ontology_term_id

> <ins>**UBERON:0001156:**</ins> ascending rawon (according to supp pdf)

In [100]:
adata.obs['tissue_ontology_term_id'] = ['UBERON:0001156'] * len(adata.obs)

In [101]:
adata.obs['tissue_ontology_term_id']

index
AAGGAGCAGTTAGGTA_Colon-1_Enterocyte    UBERON:0001156
ACCTTTAGTACTTGAC_Colon-1_Enterocyte    UBERON:0001156
ACGGGCTAGTCAAGCG_Colon-1_Enterocyte    UBERON:0001156
AGATCTGCAAGGACAC_Colon-1_Enterocyte    UBERON:0001156
ATGGGAGCAATGCCAT_Colon-1_Enterocyte    UBERON:0001156
                                            ...      
TGTGGTACACCACCAG_Colon-2_Stem Cell     UBERON:0001156
TGTTCCGCAATGCCAT_Colon-2_Stem Cell     UBERON:0001156
TTAGGCAAGAGTGACC_Colon-2_Stem Cell     UBERON:0001156
TTATGCTTCGTCTGCT_Colon-2_Stem Cell     UBERON:0001156
TTGCGTCCAATGGAAT_Colon-2_Stem Cell     UBERON:0001156
Name: tissue_ontology_term_id, Length: 4329, dtype: object

---

# <ins>var and raw.var

In [102]:
adata.var

Unnamed: 0_level_0,mito,ribo,hb,n_counts,n_cells,n_genes,highly_variable,means,dispersions,dispersions_norm,gene_symbols,feature_biotype
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
ENSG00000238009,False,False,False,9.0,10,9,False,0.001402,-0.223694,-0.396330,RP11-34P13.7,gene
ENSG00000279457,False,False,False,620.0,637,572,False,0.092447,-0.075833,0.128244,WASH9P,gene
ENSG00000228463,False,False,False,307.0,326,296,False,0.040123,-0.342289,-0.817075,AP006222.2,gene
ENSG00000237094,False,False,False,3.0,3,3,False,0.000246,-1.017381,-3.212137,RP4-669L17.10,gene
ENSG00000237491,False,False,False,120.0,130,118,False,0.018567,-0.113123,-0.004051,RP11-206L10.9,gene
...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000279064,False,False,False,3.0,3,3,False,0.000405,-0.494990,-1.358822,CH507-39O4.2,gene
ENSG00000280018,False,False,False,7.0,7,7,False,0.000980,-0.424253,-1.107865,CH507-154B10.2,gene
ENSG00000277067,False,False,False,3.0,3,3,False,0.000386,-0.504515,-1.392615,CH507-254M2.1,gene
ENSG00000235965,False,False,False,4.0,4,4,False,0.000723,0.047152,0.564565,AP000431.1,gene


## feature_is_filtered

True if feature was filtered out in final matrix (X)  
Problem: This should be annotated in 'var', not 'raw.var'. __How to annotate the gene if it is already filtered out in the final matrix?__

In [103]:
adata.var['feature_is_filtered'] = [False] * len(adata.var)

In [104]:
adata.var['feature_is_filtered']

gene_ids
ENSG00000238009    False
ENSG00000279457    False
ENSG00000228463    False
ENSG00000237094    False
ENSG00000237491    False
                   ...  
ENSG00000279064    False
ENSG00000280018    False
ENSG00000277067    False
ENSG00000235965    False
ENSG00000261610    False
Name: feature_is_filtered, Length: 17015, dtype: bool

# <ins>obsm

In [105]:
adata.obsm

AxisArrays with keys: X_umap_hm

In [106]:
adata.obsm['X_umap'] = adata.obsm['X_umap_hm']
del adata.obsm['X_umap_hm']

# <ins>uns

In [107]:
adata.uns['schema_version'] = '2.0.0'

In [108]:
adata.uns['title'] = 'Colon'

In [109]:
adata.uns['X_normalization'] = 'Scanpy'

In [110]:
adata.uns['default_embedding'] = 'X_umap'

---

---

# 4. Final adjustments

Final adjustments include the removal of the slots that were already re-created for the schema.

In [111]:
adata.obs

Unnamed: 0_level_0,Sample_ID,CellType,Location,n_counts,log1p_n_counts,n_genes,log1p_n_genes,percent_mito,percent_ribo,percent_hb,percent_top50,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,ethnicity_ontology_term_id,is_primary_data,organism_ontology_term_id,sex_ontology_term_id,tissue_ontology_term_id
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
AAGGAGCAGTTAGGTA_Colon-1_Enterocyte,Colon-1,Enterocyte,Colon,27109.0,10.207658,3977,8.288534,19.266663,12.837066,0.003689,46.656092,EFO:0009899,CL:1000347,HsapDv:0000148,MONDO:0004970,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0001156
ACCTTTAGTACTTGAC_Colon-1_Enterocyte,Colon-1,Enterocyte,Colon,16638.0,9.719504,2972,7.997327,28.753456,5.968265,0.000000,52.866931,EFO:0009899,CL:1000347,HsapDv:0000148,MONDO:0004970,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0001156
ACGGGCTAGTCAAGCG_Colon-1_Enterocyte,Colon-1,Enterocyte,Colon,23970.0,10.084600,4110,8.321422,17.008760,15.640384,0.012516,42.515645,EFO:0009899,CL:1000347,HsapDv:0000148,MONDO:0004970,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0001156
AGATCTGCAAGGACAC_Colon-1_Enterocyte,Colon-1,Enterocyte,Colon,20066.0,9.906832,3751,8.230044,16.246387,15.389215,0.014951,42.614373,EFO:0009899,CL:1000347,HsapDv:0000148,MONDO:0004970,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0001156
ATGGGAGCAATGCCAT_Colon-1_Enterocyte,Colon-1,Enterocyte,Colon,23372.0,10.059337,4027,8.301025,21.303268,10.071881,0.017114,45.400479,EFO:0009899,CL:1000347,HsapDv:0000148,MONDO:0004970,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0001156
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TGTGGTACACCACCAG_Colon-2_Stem Cell,Colon-2,Stem Cell,Colon,21546.0,9.977992,3527,8.168486,11.073981,43.274853,0.013924,44.634735,EFO:0009899,CL:0000034,HsapDv:0000161,MONDO:0004970,unknown,True,NCBITaxon:9606,PATO:0000383,UBERON:0001156
TGTTCCGCAATGCCAT_Colon-2_Stem Cell,Colon-2,Stem Cell,Colon,7831.0,8.965973,1846,7.521318,21.210573,34.746521,0.000000,48.665560,EFO:0009899,CL:0000034,HsapDv:0000161,MONDO:0004970,unknown,True,NCBITaxon:9606,PATO:0000383,UBERON:0001156
TTAGGCAAGAGTGACC_Colon-2_Stem Cell,Colon-2,Stem Cell,Colon,11279.0,9.330787,2487,7.819234,9.752638,40.739426,0.000000,41.794485,EFO:0009899,CL:0000034,HsapDv:0000161,MONDO:0004970,unknown,True,NCBITaxon:9606,PATO:0000383,UBERON:0001156
TTATGCTTCGTCTGCT_Colon-2_Stem Cell,Colon-2,Stem Cell,Colon,8277.0,9.021357,1869,7.533694,16.261930,39.809113,0.000000,48.133382,EFO:0009899,CL:0000034,HsapDv:0000161,MONDO:0004970,unknown,True,NCBITaxon:9606,PATO:0000383,UBERON:0001156


In [112]:
del adata.obs['Location']

In [5]:
adata.obs['donor_id'] = adata.obs['Sample_ID']

In [5]:
adata.obs['suspension_type'] = ['cell'] * len(adata.obs)

In [6]:
del adata.obs['Sample_ID']

In [114]:
adata.write('wang20_colon_f.h5ad', compression = 'gzip')

... storing 'disease_ontology_term_id' as categorical
... storing 'ethnicity_ontology_term_id' as categorical
... storing 'tissue_ontology_term_id' as categorical
... storing 'feature_biotype' as categorical
... storing 'feature_biotype' as categorical
