# **Curating `wang20_ileum.processed.h5ad` dataset:**

* ~4K cells and 17K genes
* one of the three datasets in the publication
* publication link: https://rupress.org/jem/article/217/2/e20191130/132578/Single-cell-transcriptome-analysis-reveals

# <ins>STEPS:</ins>

## 1. Load the dataset 

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import scipy
from tqdm import tqdm

In [2]:
adata = sc.read('/lustre/scratch117/cellgen/cellgeni/covid19cellatlas/downloaded/wang20_ileum.processed.h5ad')

In [3]:
adata

AnnData object with n_obs × n_vars = 5980 × 16977
    obs: 'Sample_ID', 'CellType', 'Location', 'n_counts', 'log1p_n_counts', 'n_genes', 'log1p_n_genes', 'percent_mito', 'percent_ribo', 'percent_hb', 'percent_top50'
    var: 'mito', 'ribo', 'hb', 'n_counts', 'n_cells', 'n_genes', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'leiden', 'neighbors_hm', 'pca'
    obsm: 'X_umap_hm'
    varm: 'PCs'

In [4]:
adata.obs

Unnamed: 0_level_0,Sample_ID,CellType,Location,n_counts,log1p_n_counts,n_genes,log1p_n_genes,percent_mito,percent_ribo,percent_hb,percent_top50
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AAATGCCCAATCTGCA_Ileum-1_Enterocyte,Ileum-1,Enterocyte,Ileum,4956.0,8.508556,1621,7.391415,12.126715,6.799839,0.020178,44.915254
AACTCTTGTCTAGTCA_Ileum-1_Enterocyte,Ileum-1,Enterocyte,Ileum,8600.0,9.059634,2303,7.742402,14.465116,7.813953,0.011628,43.825581
AAGACCTCACGGACAA_Ileum-1_Enterocyte,Ileum-1,Enterocyte,Ileum,7236.0,8.886962,1833,7.514255,9.798231,9.369818,0.000000,43.352681
AAGCCGCGTCTTGCGG_Ileum-1_Enterocyte,Ileum-1,Enterocyte,Ileum,9623.0,9.172015,2371,7.771489,18.528526,8.677128,0.020784,44.840486
AAGTCTGGTTGTCTTT_Ileum-1_Enterocyte,Ileum-1,Enterocyte,Ileum,4644.0,8.443546,1488,7.305860,11.886305,7.278208,0.000000,44.142980
...,...,...,...,...,...,...,...,...,...,...,...
TGGCCAGTCAGCTTAG_Ileum-2_Stem Cell,Ileum-2,Stem Cell,Ileum,8427.0,9.039315,1904,7.552237,21.798979,35.291325,0.011867,50.492465
TGGTTAGCATCACGAT_Ileum-2_Stem Cell,Ileum-2,Stem Cell,Ileum,9821.0,9.192380,2207,7.699842,18.470625,37.694733,0.020365,48.416658
TGTCCCAGTTAAAGAC_Ileum-2_Stem Cell,Ileum-2,Stem Cell,Ileum,6560.0,8.788898,1443,7.275172,26.935976,34.192074,0.000000,55.533537
TTCTCCTAGTGGGCTA_Ileum-2_Stem Cell,Ileum-2,Stem Cell,Ileum,15412.0,9.642966,2941,7.986845,18.096289,21.749287,0.006488,44.835193


## 2. Pre-prepare before schema adjustments

### Add row counts

In [None]:
raw = pd.read_table('/lustre/scratch117/cellgen/cellgeni/covid19cellatlas/raw/wang20/GSE125970_raw_UMIcounts.txt.gz', index_col=0)

In [None]:
raw

In [None]:
raw.columns

In [None]:
# raw = raw.loc[:,(raw.columns.isin(list(adata.obs.index)))]

In [None]:
raw = raw.loc[:,list(adata.obs.index)]

In [None]:
raw.shape

In [None]:
adata

### Convert gene symbols to ENSG ids by using the reference gene list of this dataset

In [6]:
sym2ensg = pd.read_table('../GRCh38-1.2.0.gene_names.tsv', header = None, names = ['ensg_id', 'gene_symbol'])

validlst = pd.read_csv('~/genes_homo_sapiens.csv', header = None, names = ['gene_ids', 'name','no'])

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [7]:
sym2ensg.index = sym2ensg['gene_symbol']

validlst.index = validlst['name']

In [8]:
adata.var['gene_ids'] = ['a'] * len(adata.var)

In [9]:
skippd = [] # genes not in sym2ensg
multensg = [] # genes that have multiple ensg's from sym2ensg
all_ensg = list(validlst['gene_ids'])
for i in tqdm(adata.var_names):
    if not i in sym2ensg['gene_symbol']:
        skippd.append(i)
        continue
        
    ensg_id = sym2ensg.at[i, 'ensg_id']
    
    if type(ensg_id) == pd.core.series.Series:
        multensg.append(i)
        ensg_id = list(set(sym2ensg.at[i, 'ensg_id']).intersection(all_ensg))[0]
   
    adata.var.at[i, 'gene_ids'] = ensg_id

100%|██████████| 16977/16977 [00:10<00:00, 1623.23it/s]


### Remove the genes that do not match in gene-ensemblID file

In [10]:
notvalid = adata.var[adata.var['gene_ids'].isin(list(set(adata.var['gene_ids']).difference(set(validlst['gene_ids']))))]

In [11]:
notvalid

Unnamed: 0_level_0,mito,ribo,hb,n_counts,n_cells,n_genes,highly_variable,means,dispersions,dispersions_norm,gene_ids
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
RP4-635E18.8,False,False,False,24.0,27,24,False,0.004704,0.395262,0.443672,ENSG00000271895
NPPA-AS1,False,False,False,19.0,23,19,False,0.003779,0.444054,0.635517,ENSG00000242349
UBXN10-AS1,False,False,False,19.0,21,17,False,0.004685,1.069044,3.092903,ENSG00000225986
RP1-224A6.9,False,False,False,7.0,8,7,False,0.001168,0.180871,-0.399288,ENSG00000271840
RP11-268J15.5,False,False,False,165.0,171,158,False,0.030417,0.383252,0.396451,ENSG00000116883
...,...,...,...,...,...,...,...,...,...,...,...
LINC00202-1,False,False,False,4.0,4,4,False,0.000528,-0.014529,-1.167574,ENSG00000232224
RP11-407N17.3,False,False,False,20.0,21,20,False,0.004367,0.477098,0.765445,ENSG00000258941
FAM169B,False,False,False,6.0,6,6,False,0.001312,0.430753,0.583219,ENSG00000185087
UGT1A3,False,False,False,5.0,5,5,False,0.001161,0.439095,0.616019,ENSG00000243135


In [14]:
len(notvalid.loc[notvalid.gene_ids != 'a'])

150

In [18]:
sum(~adata.var_names.isin(notvalid.index))

16819

In [19]:
adata = adata[:,~adata.var_names.isin(notvalid.index)]

  res = method(*args, **kwargs)


In [20]:
adata

View of AnnData object with n_obs × n_vars = 5980 × 16819
    obs: 'Sample_ID', 'CellType', 'Location', 'n_counts', 'log1p_n_counts', 'n_genes', 'log1p_n_genes', 'percent_mito', 'percent_ribo', 'percent_hb', 'percent_top50'
    var: 'mito', 'ribo', 'hb', 'n_counts', 'n_cells', 'n_genes', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'gene_ids'
    uns: 'leiden', 'neighbors_hm', 'pca'
    obsm: 'X_umap_hm'
    varm: 'PCs'

In [21]:
raw = raw.loc[adata.var_names,:]

In [22]:
raw

Unnamed: 0_level_0,AAATGCCCAATCTGCA_Ileum-1_Enterocyte,AACTCTTGTCTAGTCA_Ileum-1_Enterocyte,AAGACCTCACGGACAA_Ileum-1_Enterocyte,AAGCCGCGTCTTGCGG_Ileum-1_Enterocyte,AAGTCTGGTTGTCTTT_Ileum-1_Enterocyte,ACGAGGATCGGCCGAT_Ileum-1_Enterocyte,ACGGCCAGTCTAAACC_Ileum-1_Enterocyte,AGGCCGTTCGAGCCCA_Ileum-1_Enterocyte,AGTGAGGGTCGGCTCA_Ileum-1_Enterocyte,ATGAGGGAGGATATAC_Ileum-1_Enterocyte,...,TCTTTCCGTTACGTCA_Ileum-2_Stem Cell,TGACGGCGTTCCACGG_Ileum-2_Stem Cell,TGAGAGGTCCTGCAGG_Ileum-2_Stem Cell,TGAGGGATCACAGGCC_Ileum-2_Stem Cell,TGCTGCTAGTTGAGTA_Ileum-2_Stem Cell,TGGCCAGTCAGCTTAG_Ileum-2_Stem Cell,TGGTTAGCATCACGAT_Ileum-2_Stem Cell,TGTCCCAGTTAAAGAC_Ileum-2_Stem Cell,TTCTCCTAGTGGGCTA_Ileum-2_Stem Cell,TTGACTTGTGACGGTA_Ileum-2_Stem Cell
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
RP11-34P13.7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
FO538757.2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AP006222.2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
RP4-669L17.10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
RP11-206L10.9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GGT2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
RP3-510H16.3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
RP4-669P10.16,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BACH1-AS1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Updating gene names from the current reference to Gencode-v38

by checking ENSG ids in Gencode-v38 list, example:  
A -> ENSG01 (ref of the dataset)  
B -> ENSG01 (Gencode-v38)

In [23]:
validlst.index = validlst['gene_ids']

In [24]:
adata.var['gene_symbols'] = adata.var_names
adata.var_names = adata.var['gene_ids']

Trying to set attribute `.var` of view, copying.


In [25]:
validlst

Unnamed: 0_level_0,gene_ids,name,no
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000223972,ENSG00000223972,DDX11L1,5
ENST00000456328,ENST00000456328,DDX11L1-202,2
ENST00000450305,ENST00000450305,DDX11L1-201,2
ENSG00000227232,ENSG00000227232,WASH7P,5
ENST00000488147,ENST00000488147,WASH7P-201,1
...,...,...,...
ENST00000621424,ENST00000621424,ENSG110105435.4-001,4
ENSG00000278625,ENSG00000278625,U6,1
ENST00000616830,ENST00000616830,U6.88-201,1
ENSG00000277374,ENSG00000277374,U1,1


In [26]:
oldsymb = []
newsymb = []
listee = list(validlst['gene_ids'])
for i in tqdm(adata.var['gene_ids']):
    if i in listee:
        symb_val = list(validlst.loc[validlst['gene_ids'] == i, 'name'])
        symb_org = adata.var.at[i, 'gene_symbols']
        symb_dif = list(set(symb_val).difference(set(symb_org)))
        if len(symb_dif) > 0:
            oldsymb.append(symb_org)
            newsymb.append(symb_dif[0])

100%|██████████| 16819/16819 [05:16<00:00, 53.20it/s]


In [27]:
symbols = pd.DataFrame([oldsymb,newsymb]).T

In [28]:
drop_lines = []
for i in range(0,len(symbols)):
    if symbols.iloc[i,0] == symbols.iloc[i,1]:
        drop_lines.append(i)

In [29]:
symbols = symbols.drop(drop_lines).rename(columns = {0: 'oldsymb', 1: 'newsymb'})

In [30]:
symbols = symbols.reset_index().loc[:,['oldsymb', 'newsymb']]

In [31]:
symbols

Unnamed: 0,oldsymb,newsymb
0,FO538757.2,WASH9P
1,AP006222.2,AP006222.1
2,RP4-669L17.10,RP4-669L17.4
3,RP11-206L10.9,LINC01409
4,FAM132A,C1QTNF12
...,...,...
1228,SDPR,CAVIN2
1229,RP11-83C7.1,LINC02513
1230,RP11-159K7.2,LINC02200
1231,RP11-43N5.1,LINC02421


In [32]:
adata.var

Unnamed: 0_level_0,mito,ribo,hb,n_counts,n_cells,n_genes,highly_variable,means,dispersions,dispersions_norm,gene_ids,gene_symbols
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
ENSG00000238009,False,False,False,10.0,13,10,False,0.001702,0.196076,-0.339504,ENSG00000238009,RP11-34P13.7
ENSG00000279457,False,False,False,558.0,587,520,False,0.093239,0.305601,0.091138,ENSG00000279457,FO538757.2
ENSG00000228463,False,False,False,181.0,210,179,False,0.030459,0.244459,-0.149265,ENSG00000228463,AP006222.2
ENSG00000237094,False,False,False,5.0,8,5,False,0.001044,0.417806,0.532314,ENSG00000237094,RP4-669L17.10
ENSG00000237491,False,False,False,83.0,106,83,False,0.014199,0.258878,-0.092572,ENSG00000237491,RP11-206L10.9
...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000133475,False,False,False,8.0,8,8,False,0.002016,0.457016,0.686485,ENSG00000133475,GGT2
ENSG00000273176,False,False,False,4.0,4,4,False,0.000818,0.436569,0.606087,ENSG00000273176,RP3-510H16.3
ENSG00000232710,False,False,False,20.0,20,20,False,0.004075,0.390260,0.424007,ENSG00000232710,RP4-669P10.16
ENSG00000232118,False,False,False,6.0,6,6,False,0.001167,0.265457,-0.066705,ENSG00000232118,BACH1-AS1


In [35]:
for i in tqdm(range(0,len(symbols))):
    idxx = np.where(adata.var['gene_symbols'].isin([symbols['oldsymb'][i]]))[0][0]
    adata.var['gene_symbols'][idxx] = symbols['newsymb'][i]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adata.var['gene_symbols'][idxx] = symbols['newsymb'][i]
100%|██████████| 1233/1233 [00:01<00:00, 919.08it/s]


In [36]:
raw = raw.transpose()

In [37]:
raw = raw.set_axis(list(adata.var_names), axis = 1, inplace=False)

In [38]:
raw

Unnamed: 0,ENSG00000238009,ENSG00000279457,ENSG00000228463,ENSG00000237094,ENSG00000237491,ENSG00000225880,ENSG00000230368,ENSG00000187634,ENSG00000188976,ENSG00000187961,...,ENSG00000267033,ENSG00000130307,ENSG00000269190,ENSG00000259108,ENSG00000104888,ENSG00000133475,ENSG00000273176,ENSG00000232710,ENSG00000232118,ENSG00000142182
AAATGCCCAATCTGCA_Ileum-1_Enterocyte,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AACTCTTGTCTAGTCA_Ileum-1_Enterocyte,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
AAGACCTCACGGACAA_Ileum-1_Enterocyte,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AAGCCGCGTCTTGCGG_Ileum-1_Enterocyte,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AAGTCTGGTTGTCTTT_Ileum-1_Enterocyte,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TGGCCAGTCAGCTTAG_Ileum-2_Stem Cell,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TGGTTAGCATCACGAT_Ileum-2_Stem Cell,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TGTCCCAGTTAAAGAC_Ileum-2_Stem Cell,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TTCTCCTAGTGGGCTA_Ileum-2_Stem Cell,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [39]:
raww = adata.copy()

In [40]:
raww.X = scipy.sparse.csr_matrix(raw)

In [41]:
del raww.obs
del raww.uns
del raww.obsm
del raww.varm
del raww.var

In [42]:
adata.var['feature_biotype'] = ["gene"] * len(adata.var)

In [43]:
raww.var['feature_biotype'] = ["gene"] * len(raww.var)

In [44]:
adata.raw = raww

---

---

## 3. Apply the schema requirements:

# <ins>X

Raw and normal count matrices are in the object, no additional adjustment is needed!

# <ins>obs

In [45]:
adata.obs.columns

Index(['Sample_ID', 'CellType', 'Location', 'n_counts', 'log1p_n_counts',
       'n_genes', 'log1p_n_genes', 'percent_mito', 'percent_ribo',
       'percent_hb', 'percent_top50'],
      dtype='object')

## assay_ontology_term_id

At GEO:  
> "cDNA library establishment and sequencing: single cell suspensions were captured using the <ins>10X Genomics Chromium Single Cell 3' Library & Gel Bead Kit v2 (PN-120237)</ins> according to the manufacturers protocol."

In [46]:
adata.obs['assay_ontology_term_id'] = ["EFO:0009899"] * len(adata.obs)

In [47]:
adata.obs['assay_ontology_term_id']

index
AAATGCCCAATCTGCA_Ileum-1_Enterocyte    EFO:0009899
AACTCTTGTCTAGTCA_Ileum-1_Enterocyte    EFO:0009899
AAGACCTCACGGACAA_Ileum-1_Enterocyte    EFO:0009899
AAGCCGCGTCTTGCGG_Ileum-1_Enterocyte    EFO:0009899
AAGTCTGGTTGTCTTT_Ileum-1_Enterocyte    EFO:0009899
                                          ...     
TGGCCAGTCAGCTTAG_Ileum-2_Stem Cell     EFO:0009899
TGGTTAGCATCACGAT_Ileum-2_Stem Cell     EFO:0009899
TGTCCCAGTTAAAGAC_Ileum-2_Stem Cell     EFO:0009899
TTCTCCTAGTGGGCTA_Ileum-2_Stem Cell     EFO:0009899
TTGACTTGTGACGGTA_Ileum-2_Stem Cell     EFO:0009899
Name: assay_ontology_term_id, Length: 5980, dtype: object

In [48]:
adata.obs['assay_ontology_term_id'] = pd.Categorical(adata.obs['assay_ontology_term_id'])

In [49]:
adata.obs['assay_ontology_term_id']

index
AAATGCCCAATCTGCA_Ileum-1_Enterocyte    EFO:0009899
AACTCTTGTCTAGTCA_Ileum-1_Enterocyte    EFO:0009899
AAGACCTCACGGACAA_Ileum-1_Enterocyte    EFO:0009899
AAGCCGCGTCTTGCGG_Ileum-1_Enterocyte    EFO:0009899
AAGTCTGGTTGTCTTT_Ileum-1_Enterocyte    EFO:0009899
                                          ...     
TGGCCAGTCAGCTTAG_Ileum-2_Stem Cell     EFO:0009899
TGGTTAGCATCACGAT_Ileum-2_Stem Cell     EFO:0009899
TGTCCCAGTTAAAGAC_Ileum-2_Stem Cell     EFO:0009899
TTCTCCTAGTGGGCTA_Ileum-2_Stem Cell     EFO:0009899
TTGACTTGTGACGGTA_Ileum-2_Stem Cell     EFO:0009899
Name: assay_ontology_term_id, Length: 5980, dtype: category
Categories (1, object): ['EFO:0009899']

## cell_type_ontology_term_id

In [50]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['CellType']

In [51]:
list(adata.obs['CellType'].unique())

['Enterocyte',
 'Goblet',
 'Paneth-like',
 'Enteriendocrine',
 'Progenitor',
 'TA',
 'Stem Cell']

In [52]:
adata.obs['cell_type_ontology_term_id']

index
AAATGCCCAATCTGCA_Ileum-1_Enterocyte    Enterocyte
AACTCTTGTCTAGTCA_Ileum-1_Enterocyte    Enterocyte
AAGACCTCACGGACAA_Ileum-1_Enterocyte    Enterocyte
AAGCCGCGTCTTGCGG_Ileum-1_Enterocyte    Enterocyte
AAGTCTGGTTGTCTTT_Ileum-1_Enterocyte    Enterocyte
                                          ...    
TGGCCAGTCAGCTTAG_Ileum-2_Stem Cell      Stem Cell
TGGTTAGCATCACGAT_Ileum-2_Stem Cell      Stem Cell
TGTCCCAGTTAAAGAC_Ileum-2_Stem Cell      Stem Cell
TTCTCCTAGTGGGCTA_Ileum-2_Stem Cell      Stem Cell
TTGACTTGTGACGGTA_Ileum-2_Stem Cell      Stem Cell
Name: cell_type_ontology_term_id, Length: 5980, dtype: category
Categories (7, object): ['Enteriendocrine', 'Enterocyte', 'Goblet', 'Paneth-like', 'Progenitor', 'Stem Cell', 'TA']

In [53]:
#list(adata.obs['cell_type_ontology_term_id'].unique())

In [54]:
mapping = {'Enterocyte': 'CL:1000347',
 'Goblet': 'CL:0009039',
 'Paneth-like': 'CL:0009009',
 'Enteriendocrine': 'CL:0009042',
 'Progenitor': 'CL:0011026',
 'TA': 'CL:0009011',
 'Stem Cell': 'CL:0000034'}

In [55]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_type_ontology_term_id'].map(mapping)

In [56]:
adata.obs['cell_type_ontology_term_id']

index
AAATGCCCAATCTGCA_Ileum-1_Enterocyte    CL:1000347
AACTCTTGTCTAGTCA_Ileum-1_Enterocyte    CL:1000347
AAGACCTCACGGACAA_Ileum-1_Enterocyte    CL:1000347
AAGCCGCGTCTTGCGG_Ileum-1_Enterocyte    CL:1000347
AAGTCTGGTTGTCTTT_Ileum-1_Enterocyte    CL:1000347
                                          ...    
TGGCCAGTCAGCTTAG_Ileum-2_Stem Cell     CL:0000034
TGGTTAGCATCACGAT_Ileum-2_Stem Cell     CL:0000034
TGTCCCAGTTAAAGAC_Ileum-2_Stem Cell     CL:0000034
TTCTCCTAGTGGGCTA_Ileum-2_Stem Cell     CL:0000034
TTGACTTGTGACGGTA_Ileum-2_Stem Cell     CL:0000034
Name: cell_type_ontology_term_id, Length: 5980, dtype: category
Categories (7, object): ['CL:0009042', 'CL:1000347', 'CL:0009039', 'CL:0009009', 'CL:0011026', 'CL:0000034', 'CL:0009011']

## development_stage_ontology_term_id

In [57]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['Sample_ID']

In [58]:
adata.obs['development_stage_ontology_term_id']

index
AAATGCCCAATCTGCA_Ileum-1_Enterocyte    Ileum-1
AACTCTTGTCTAGTCA_Ileum-1_Enterocyte    Ileum-1
AAGACCTCACGGACAA_Ileum-1_Enterocyte    Ileum-1
AAGCCGCGTCTTGCGG_Ileum-1_Enterocyte    Ileum-1
AAGTCTGGTTGTCTTT_Ileum-1_Enterocyte    Ileum-1
                                        ...   
TGGCCAGTCAGCTTAG_Ileum-2_Stem Cell     Ileum-2
TGGTTAGCATCACGAT_Ileum-2_Stem Cell     Ileum-2
TGTCCCAGTTAAAGAC_Ileum-2_Stem Cell     Ileum-2
TTCTCCTAGTGGGCTA_Ileum-2_Stem Cell     Ileum-2
TTGACTTGTGACGGTA_Ileum-2_Stem Cell     Ileum-2
Name: development_stage_ontology_term_id, Length: 5980, dtype: category
Categories (2, object): ['Ileum-1', 'Ileum-2']

In [59]:
adata.obs['development_stage_ontology_term_id'].unique()

['Ileum-1', 'Ileum-2']
Categories (2, object): ['Ileum-1', 'Ileum-2']

In [60]:
mapping = {'Ileum-1': 'HsapDv:0000154', 'Ileum-2': 'HsapDv:0000161'}

In [61]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['development_stage_ontology_term_id'].map(mapping)

In [62]:
adata.obs['development_stage_ontology_term_id']

index
AAATGCCCAATCTGCA_Ileum-1_Enterocyte    HsapDv:0000154
AACTCTTGTCTAGTCA_Ileum-1_Enterocyte    HsapDv:0000154
AAGACCTCACGGACAA_Ileum-1_Enterocyte    HsapDv:0000154
AAGCCGCGTCTTGCGG_Ileum-1_Enterocyte    HsapDv:0000154
AAGTCTGGTTGTCTTT_Ileum-1_Enterocyte    HsapDv:0000154
                                            ...      
TGGCCAGTCAGCTTAG_Ileum-2_Stem Cell     HsapDv:0000161
TGGTTAGCATCACGAT_Ileum-2_Stem Cell     HsapDv:0000161
TGTCCCAGTTAAAGAC_Ileum-2_Stem Cell     HsapDv:0000161
TTCTCCTAGTGGGCTA_Ileum-2_Stem Cell     HsapDv:0000161
TTGACTTGTGACGGTA_Ileum-2_Stem Cell     HsapDv:0000161
Name: development_stage_ontology_term_id, Length: 5980, dtype: category
Categories (2, object): ['HsapDv:0000154', 'HsapDv:0000161']

## disease_ontology_term_id

**"PATO:0000461"** for normal or healthy.

In [63]:
adata.obs['disease_ontology_term_id'] = adata.obs['Sample_ID']

In [64]:
adata.obs['disease_ontology_term_id']

index
AAATGCCCAATCTGCA_Ileum-1_Enterocyte    Ileum-1
AACTCTTGTCTAGTCA_Ileum-1_Enterocyte    Ileum-1
AAGACCTCACGGACAA_Ileum-1_Enterocyte    Ileum-1
AAGCCGCGTCTTGCGG_Ileum-1_Enterocyte    Ileum-1
AAGTCTGGTTGTCTTT_Ileum-1_Enterocyte    Ileum-1
                                        ...   
TGGCCAGTCAGCTTAG_Ileum-2_Stem Cell     Ileum-2
TGGTTAGCATCACGAT_Ileum-2_Stem Cell     Ileum-2
TGTCCCAGTTAAAGAC_Ileum-2_Stem Cell     Ileum-2
TTCTCCTAGTGGGCTA_Ileum-2_Stem Cell     Ileum-2
TTGACTTGTGACGGTA_Ileum-2_Stem Cell     Ileum-2
Name: disease_ontology_term_id, Length: 5980, dtype: category
Categories (2, object): ['Ileum-1', 'Ileum-2']

In [65]:
mapping = {'Ileum-1': 'MONDO:0002120', 'Ileum-2': 'MONDO:0004970'}

In [66]:
adata.obs['disease_ontology_term_id'] = adata.obs['disease_ontology_term_id'].map(mapping)

In [67]:
adata.obs['disease_ontology_term_id']

index
AAATGCCCAATCTGCA_Ileum-1_Enterocyte    MONDO:0002120
AACTCTTGTCTAGTCA_Ileum-1_Enterocyte    MONDO:0002120
AAGACCTCACGGACAA_Ileum-1_Enterocyte    MONDO:0002120
AAGCCGCGTCTTGCGG_Ileum-1_Enterocyte    MONDO:0002120
AAGTCTGGTTGTCTTT_Ileum-1_Enterocyte    MONDO:0002120
                                           ...      
TGGCCAGTCAGCTTAG_Ileum-2_Stem Cell     MONDO:0004970
TGGTTAGCATCACGAT_Ileum-2_Stem Cell     MONDO:0004970
TGTCCCAGTTAAAGAC_Ileum-2_Stem Cell     MONDO:0004970
TTCTCCTAGTGGGCTA_Ileum-2_Stem Cell     MONDO:0004970
TTGACTTGTGACGGTA_Ileum-2_Stem Cell     MONDO:0004970
Name: disease_ontology_term_id, Length: 5980, dtype: category
Categories (2, object): ['MONDO:0002120', 'MONDO:0004970']

## ethnicity_ontology_term_id

In [68]:
adata.obs['ethnicity_ontology_term_id'] = ['unknown'] * len(adata.obs)

In [69]:
adata.obs['ethnicity_ontology_term_id']

index
AAATGCCCAATCTGCA_Ileum-1_Enterocyte    unknown
AACTCTTGTCTAGTCA_Ileum-1_Enterocyte    unknown
AAGACCTCACGGACAA_Ileum-1_Enterocyte    unknown
AAGCCGCGTCTTGCGG_Ileum-1_Enterocyte    unknown
AAGTCTGGTTGTCTTT_Ileum-1_Enterocyte    unknown
                                        ...   
TGGCCAGTCAGCTTAG_Ileum-2_Stem Cell     unknown
TGGTTAGCATCACGAT_Ileum-2_Stem Cell     unknown
TGTCCCAGTTAAAGAC_Ileum-2_Stem Cell     unknown
TTCTCCTAGTGGGCTA_Ileum-2_Stem Cell     unknown
TTGACTTGTGACGGTA_Ileum-2_Stem Cell     unknown
Name: ethnicity_ontology_term_id, Length: 5980, dtype: object

## is_primary_data

In [70]:
adata.obs['is_primary_data'] = [True] * len(adata.obs)

In [71]:
adata.obs['is_primary_data']

index
AAATGCCCAATCTGCA_Ileum-1_Enterocyte    True
AACTCTTGTCTAGTCA_Ileum-1_Enterocyte    True
AAGACCTCACGGACAA_Ileum-1_Enterocyte    True
AAGCCGCGTCTTGCGG_Ileum-1_Enterocyte    True
AAGTCTGGTTGTCTTT_Ileum-1_Enterocyte    True
                                       ... 
TGGCCAGTCAGCTTAG_Ileum-2_Stem Cell     True
TGGTTAGCATCACGAT_Ileum-2_Stem Cell     True
TGTCCCAGTTAAAGAC_Ileum-2_Stem Cell     True
TTCTCCTAGTGGGCTA_Ileum-2_Stem Cell     True
TTGACTTGTGACGGTA_Ileum-2_Stem Cell     True
Name: is_primary_data, Length: 5980, dtype: bool

In [72]:
adata.obs['is_primary_data'] = adata.obs['is_primary_data'].astype('bool')

## organism_ontology_term_id

In [73]:
adata.obs['organism_ontology_term_id'] = ['NCBITaxon:9606'] * len(adata.obs)

In [74]:
adata.obs['organism_ontology_term_id']

index
AAATGCCCAATCTGCA_Ileum-1_Enterocyte    NCBITaxon:9606
AACTCTTGTCTAGTCA_Ileum-1_Enterocyte    NCBITaxon:9606
AAGACCTCACGGACAA_Ileum-1_Enterocyte    NCBITaxon:9606
AAGCCGCGTCTTGCGG_Ileum-1_Enterocyte    NCBITaxon:9606
AAGTCTGGTTGTCTTT_Ileum-1_Enterocyte    NCBITaxon:9606
                                            ...      
TGGCCAGTCAGCTTAG_Ileum-2_Stem Cell     NCBITaxon:9606
TGGTTAGCATCACGAT_Ileum-2_Stem Cell     NCBITaxon:9606
TGTCCCAGTTAAAGAC_Ileum-2_Stem Cell     NCBITaxon:9606
TTCTCCTAGTGGGCTA_Ileum-2_Stem Cell     NCBITaxon:9606
TTGACTTGTGACGGTA_Ileum-2_Stem Cell     NCBITaxon:9606
Name: organism_ontology_term_id, Length: 5980, dtype: object

In [75]:
adata.obs['organism_ontology_term_id'] = pd.Categorical(adata.obs['organism_ontology_term_id'])

In [76]:
adata.obs['organism_ontology_term_id']

index
AAATGCCCAATCTGCA_Ileum-1_Enterocyte    NCBITaxon:9606
AACTCTTGTCTAGTCA_Ileum-1_Enterocyte    NCBITaxon:9606
AAGACCTCACGGACAA_Ileum-1_Enterocyte    NCBITaxon:9606
AAGCCGCGTCTTGCGG_Ileum-1_Enterocyte    NCBITaxon:9606
AAGTCTGGTTGTCTTT_Ileum-1_Enterocyte    NCBITaxon:9606
                                            ...      
TGGCCAGTCAGCTTAG_Ileum-2_Stem Cell     NCBITaxon:9606
TGGTTAGCATCACGAT_Ileum-2_Stem Cell     NCBITaxon:9606
TGTCCCAGTTAAAGAC_Ileum-2_Stem Cell     NCBITaxon:9606
TTCTCCTAGTGGGCTA_Ileum-2_Stem Cell     NCBITaxon:9606
TTGACTTGTGACGGTA_Ileum-2_Stem Cell     NCBITaxon:9606
Name: organism_ontology_term_id, Length: 5980, dtype: category
Categories (1, object): ['NCBITaxon:9606']

## sex_ontology_term_id

In [77]:
adata.obs['sex_ontology_term_id'] = adata.obs['Sample_ID']

In [78]:
adata.obs['sex_ontology_term_id']

index
AAATGCCCAATCTGCA_Ileum-1_Enterocyte    Ileum-1
AACTCTTGTCTAGTCA_Ileum-1_Enterocyte    Ileum-1
AAGACCTCACGGACAA_Ileum-1_Enterocyte    Ileum-1
AAGCCGCGTCTTGCGG_Ileum-1_Enterocyte    Ileum-1
AAGTCTGGTTGTCTTT_Ileum-1_Enterocyte    Ileum-1
                                        ...   
TGGCCAGTCAGCTTAG_Ileum-2_Stem Cell     Ileum-2
TGGTTAGCATCACGAT_Ileum-2_Stem Cell     Ileum-2
TGTCCCAGTTAAAGAC_Ileum-2_Stem Cell     Ileum-2
TTCTCCTAGTGGGCTA_Ileum-2_Stem Cell     Ileum-2
TTGACTTGTGACGGTA_Ileum-2_Stem Cell     Ileum-2
Name: sex_ontology_term_id, Length: 5980, dtype: category
Categories (2, object): ['Ileum-1', 'Ileum-2']

**PATO_0000384 for male  
PATO_0000383 for female**

In [79]:
mapping = {'Ileum-1': 'PATO:0000384', 'Ileum-2': 'PATO:0000383'}

In [80]:
adata.obs['sex_ontology_term_id'] = adata.obs['sex_ontology_term_id'].map(mapping)

In [81]:
adata.obs['sex_ontology_term_id']

index
AAATGCCCAATCTGCA_Ileum-1_Enterocyte    PATO:0000384
AACTCTTGTCTAGTCA_Ileum-1_Enterocyte    PATO:0000384
AAGACCTCACGGACAA_Ileum-1_Enterocyte    PATO:0000384
AAGCCGCGTCTTGCGG_Ileum-1_Enterocyte    PATO:0000384
AAGTCTGGTTGTCTTT_Ileum-1_Enterocyte    PATO:0000384
                                           ...     
TGGCCAGTCAGCTTAG_Ileum-2_Stem Cell     PATO:0000383
TGGTTAGCATCACGAT_Ileum-2_Stem Cell     PATO:0000383
TGTCCCAGTTAAAGAC_Ileum-2_Stem Cell     PATO:0000383
TTCTCCTAGTGGGCTA_Ileum-2_Stem Cell     PATO:0000383
TTGACTTGTGACGGTA_Ileum-2_Stem Cell     PATO:0000383
Name: sex_ontology_term_id, Length: 5980, dtype: category
Categories (2, object): ['PATO:0000384', 'PATO:0000383']

## tissue_ontology_term_id

> <ins>**UBERON:0002116:**</ins> ileum

In [82]:
adata.obs['tissue_ontology_term_id'] = ['UBERON:0002116'] * len(adata.obs)

In [83]:
adata.obs['tissue_ontology_term_id']

index
AAATGCCCAATCTGCA_Ileum-1_Enterocyte    UBERON:0002116
AACTCTTGTCTAGTCA_Ileum-1_Enterocyte    UBERON:0002116
AAGACCTCACGGACAA_Ileum-1_Enterocyte    UBERON:0002116
AAGCCGCGTCTTGCGG_Ileum-1_Enterocyte    UBERON:0002116
AAGTCTGGTTGTCTTT_Ileum-1_Enterocyte    UBERON:0002116
                                            ...      
TGGCCAGTCAGCTTAG_Ileum-2_Stem Cell     UBERON:0002116
TGGTTAGCATCACGAT_Ileum-2_Stem Cell     UBERON:0002116
TGTCCCAGTTAAAGAC_Ileum-2_Stem Cell     UBERON:0002116
TTCTCCTAGTGGGCTA_Ileum-2_Stem Cell     UBERON:0002116
TTGACTTGTGACGGTA_Ileum-2_Stem Cell     UBERON:0002116
Name: tissue_ontology_term_id, Length: 5980, dtype: object

---

# <ins>var and raw.var

In [84]:
adata.var

Unnamed: 0_level_0,mito,ribo,hb,n_counts,n_cells,n_genes,highly_variable,means,dispersions,dispersions_norm,gene_ids,gene_symbols,feature_biotype
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
ENSG00000238009,False,False,False,10.0,13,10,False,0.001702,0.196076,-0.339504,ENSG00000238009,RP11-34P13.7,gene
ENSG00000279457,False,False,False,558.0,587,520,False,0.093239,0.305601,0.091138,ENSG00000279457,WASH9P,gene
ENSG00000228463,False,False,False,181.0,210,179,False,0.030459,0.244459,-0.149265,ENSG00000228463,AP006222.1,gene
ENSG00000237094,False,False,False,5.0,8,5,False,0.001044,0.417806,0.532314,ENSG00000237094,RP4-669L17.4,gene
ENSG00000237491,False,False,False,83.0,106,83,False,0.014199,0.258878,-0.092572,ENSG00000237491,LINC01409,gene
...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000133475,False,False,False,8.0,8,8,False,0.002016,0.457016,0.686485,ENSG00000133475,GGT2,gene
ENSG00000273176,False,False,False,4.0,4,4,False,0.000818,0.436569,0.606087,ENSG00000273176,RP3-510H16.3,gene
ENSG00000232710,False,False,False,20.0,20,20,False,0.004075,0.390260,0.424007,ENSG00000232710,RP4-669P10.16,gene
ENSG00000232118,False,False,False,6.0,6,6,False,0.001167,0.265457,-0.066705,ENSG00000232118,BACH1-AS1,gene


## feature_biotype

Adjusted at the beginning!

In [85]:
#adata2.var['feature_biotype'] = ['gene'] * len(adata2.var.index)

## feature_id

Adjusted at the beginning!

In [86]:
#adata2.var_names = adata2.var["gene_ids"].copy()
#adata2.var_names.name = "feature_id"
#adata2.raw.var_names = adata2.raw.var["gene_ids"].copy()
#adata2.raw.var_names.name = "feature_id"

## feature_is_filtered

True if feature was filtered out in final matrix (X)  
Problem: This should be annotated in 'var', not 'raw.var'. __How to annotate the gene if it is already filtered out in the final matrix?__

In [87]:
adata.var['feature_is_filtered'] = [False] * len(adata.var)

In [88]:
adata.var['feature_is_filtered']

gene_ids
ENSG00000238009    False
ENSG00000279457    False
ENSG00000228463    False
ENSG00000237094    False
ENSG00000237491    False
                   ...  
ENSG00000133475    False
ENSG00000273176    False
ENSG00000232710    False
ENSG00000232118    False
ENSG00000142182    False
Name: feature_is_filtered, Length: 16819, dtype: bool

# <ins>obsm

In [89]:
adata.obsm

AxisArrays with keys: X_umap_hm

In [90]:
adata.obsm['X_umap'] = adata.obsm['X_umap_hm']
del adata.obsm['X_umap_hm']

# <ins>uns

In [91]:
adata.uns['schema_version'] = '2.0.0'

In [92]:
adata.uns['title'] = 'Ileum'

In [93]:
adata.uns['X_normalization'] = 'Scanpy'

In [94]:
adata.uns['default_embedding'] = 'X_umap'

---

---

# 4. Final adjustments

Final adjustments include the removal of the slots that were already re-created for the schema.

In [95]:
adata.obs

Unnamed: 0_level_0,Sample_ID,CellType,Location,n_counts,log1p_n_counts,n_genes,log1p_n_genes,percent_mito,percent_ribo,percent_hb,percent_top50,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,ethnicity_ontology_term_id,is_primary_data,organism_ontology_term_id,sex_ontology_term_id,tissue_ontology_term_id
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
AAATGCCCAATCTGCA_Ileum-1_Enterocyte,Ileum-1,Enterocyte,Ileum,4956.0,8.508556,1621,7.391415,12.126715,6.799839,0.020178,44.915254,EFO:0009899,CL:1000347,HsapDv:0000154,MONDO:0002120,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0002116
AACTCTTGTCTAGTCA_Ileum-1_Enterocyte,Ileum-1,Enterocyte,Ileum,8600.0,9.059634,2303,7.742402,14.465116,7.813953,0.011628,43.825581,EFO:0009899,CL:1000347,HsapDv:0000154,MONDO:0002120,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0002116
AAGACCTCACGGACAA_Ileum-1_Enterocyte,Ileum-1,Enterocyte,Ileum,7236.0,8.886962,1833,7.514255,9.798231,9.369818,0.000000,43.352681,EFO:0009899,CL:1000347,HsapDv:0000154,MONDO:0002120,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0002116
AAGCCGCGTCTTGCGG_Ileum-1_Enterocyte,Ileum-1,Enterocyte,Ileum,9623.0,9.172015,2371,7.771489,18.528526,8.677128,0.020784,44.840486,EFO:0009899,CL:1000347,HsapDv:0000154,MONDO:0002120,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0002116
AAGTCTGGTTGTCTTT_Ileum-1_Enterocyte,Ileum-1,Enterocyte,Ileum,4644.0,8.443546,1488,7.305860,11.886305,7.278208,0.000000,44.142980,EFO:0009899,CL:1000347,HsapDv:0000154,MONDO:0002120,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0002116
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TGGCCAGTCAGCTTAG_Ileum-2_Stem Cell,Ileum-2,Stem Cell,Ileum,8427.0,9.039315,1904,7.552237,21.798979,35.291325,0.011867,50.492465,EFO:0009899,CL:0000034,HsapDv:0000161,MONDO:0004970,unknown,True,NCBITaxon:9606,PATO:0000383,UBERON:0002116
TGGTTAGCATCACGAT_Ileum-2_Stem Cell,Ileum-2,Stem Cell,Ileum,9821.0,9.192380,2207,7.699842,18.470625,37.694733,0.020365,48.416658,EFO:0009899,CL:0000034,HsapDv:0000161,MONDO:0004970,unknown,True,NCBITaxon:9606,PATO:0000383,UBERON:0002116
TGTCCCAGTTAAAGAC_Ileum-2_Stem Cell,Ileum-2,Stem Cell,Ileum,6560.0,8.788898,1443,7.275172,26.935976,34.192074,0.000000,55.533537,EFO:0009899,CL:0000034,HsapDv:0000161,MONDO:0004970,unknown,True,NCBITaxon:9606,PATO:0000383,UBERON:0002116
TTCTCCTAGTGGGCTA_Ileum-2_Stem Cell,Ileum-2,Stem Cell,Ileum,15412.0,9.642966,2941,7.986845,18.096289,21.749287,0.006488,44.835193,EFO:0009899,CL:0000034,HsapDv:0000161,MONDO:0004970,unknown,True,NCBITaxon:9606,PATO:0000383,UBERON:0002116


In [96]:
adata

AnnData object with n_obs × n_vars = 5980 × 16819
    obs: 'Sample_ID', 'CellType', 'Location', 'n_counts', 'log1p_n_counts', 'n_genes', 'log1p_n_genes', 'percent_mito', 'percent_ribo', 'percent_hb', 'percent_top50', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id'
    var: 'mito', 'ribo', 'hb', 'n_counts', 'n_cells', 'n_genes', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'gene_ids', 'gene_symbols', 'feature_biotype', 'feature_is_filtered'
    uns: 'leiden', 'neighbors_hm', 'pca', 'schema_version', 'title', 'X_normalization', 'default_embedding'
    obsm: 'X_umap'
    varm: 'PCs'

In [97]:
del adata.obs['Location']

In [98]:
adata

AnnData object with n_obs × n_vars = 5980 × 16819
    obs: 'Sample_ID', 'CellType', 'n_counts', 'log1p_n_counts', 'n_genes', 'log1p_n_genes', 'percent_mito', 'percent_ribo', 'percent_hb', 'percent_top50', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id'
    var: 'mito', 'ribo', 'hb', 'n_counts', 'n_cells', 'n_genes', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'gene_ids', 'gene_symbols', 'feature_biotype', 'feature_is_filtered'
    uns: 'leiden', 'neighbors_hm', 'pca', 'schema_version', 'title', 'X_normalization', 'default_embedding'
    obsm: 'X_umap'
    varm: 'PCs'

In [101]:
del adata.var['gene_ids']

In [103]:
adata.write('wang20_ileum_f.h5ad', compression = 'gzip')

---

# 5. Fixes

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import scipy
from tqdm import tqdm

In [2]:
adata = sc.read('wang20_ileum_f.h5ad')

In [3]:
adata

AnnData object with n_obs × n_vars = 5980 × 16819
    obs: 'CellType', 'n_counts', 'log1p_n_counts', 'n_genes', 'log1p_n_genes', 'percent_mito', 'percent_ribo', 'percent_hb', 'percent_top50', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'donor_id'
    var: 'mito', 'ribo', 'hb', 'n_counts', 'n_cells', 'n_genes', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'gene_symbols', 'feature_biotype', 'feature_is_filtered'
    uns: 'X_normalization', 'default_embedding', 'leiden', 'neighbors_hm', 'pca', 'schema_version', 'title'
    obsm: 'X_umap'
    varm: 'PCs'

In [4]:
adata.obs['donor_id'] = adata.obs['Sample_ID']

In [4]:
adata.obs['suspension_type'] = ['cell'] * len(adata.obs)

In [5]:
del adata.obs['Sample_ID']

In [5]:
adata.write('wang20_ileum_f.h5ad', compression = 'gzip')

... storing 'suspension_type' as categorical
