# **Curating `lukassen20-airway.processed.h5ad` dataset:**

* ~17K cells and 22K genes
* publication link: https://www.embopress.org/doi/full/10.15252/embj.20105114

# <ins>STEPS:</ins>

## 1. Load the dataset 

In [2]:
import numpy as np
import pandas as pd
import scanpy as sc
import scipy
from tqdm import tqdm

In [3]:
adata = sc.read('/lustre/scratch117/cellgen/cellgeni/covid19cellatlas/downloaded/lukassen20_airway_orig.processed.h5ad')

In [4]:
adata.var

Unnamed: 0_level_0,name
index,Unnamed: 1_level_1
AL627309.1,AL627309.1
AP006222.2,AP006222.2
RP4-669L17.10,RP4-669L17.10
RP11-206L10.3,RP11-206L10.3
RP11-206L10.2,RP11-206L10.2
...,...
AC008753.4,AC008753.4
XXbac-B476C20.17,XXbac-B476C20.17
AF127936.3,AF127936.3
AP000695.6,AP000695.6


In [5]:
adata.obs

Unnamed: 0_level_0,orig.ident,nCount_RNA,nFeature_RNA,sample,percent.mt,CellType,sex,age,smoking
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
p46c_AAACCTGAGACCCACC,p46c,11554.0,3040,p46c,2.432058,Secretory3,m,53.0,no
p46c_AAACCTGAGGACACCA,p46c,9517.0,3152,p46c,10.906798,Ciliated1,m,53.0,no
p46c_AAACCTGAGTAACCCT,p46c,5597.0,1878,p46c,12.274433,Goblet,m,53.0,no
p46c_AAACCTGGTCATCGGC,p46c,2367.0,1065,p46c,10.984368,Goblet,m,53.0,no
p46c_AAACCTGTCACCCGAG,p46c,6394.0,2269,p46c,5.395683,Ciliated2,m,53.0,no
...,...,...,...,...,...,...,...,...,...
p51c_TTTGTCACACTGCCAG,p51c,8984.0,2304,p51c,6.222173,Secretory3,m,67.0,former
p51c_TTTGTCAGTTCACGGC,p51c,4172.0,1364,p51c,11.241611,Secretory2,m,67.0,former
p51c_TTTGTCATCAACCAAC,p51c,3463.0,1358,p51c,6.092983,Basal1,m,67.0,former
p51c_TTTGTCATCCTTTCTC,p51c,3089.0,1028,p51c,5.373907,Goblet,m,67.0,former


In [6]:
adata.obs['age'].unique()

array([53., 55., 67.])

In [7]:
adata

AnnData object with n_obs × n_vars = 17451 × 22100
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'sample', 'percent.mt', 'CellType', 'sex', 'age', 'smoking'
    var: 'name'
    obsm: 'X_umap'

## 2. Pre-prepare before schema adjustments

### Add row counts

In [8]:
raw = pd.read_csv('/lustre/scratch117/cellgen/cellgeni/covid19cellatlas/raw/lukassen20/Counts_HBECs.csv')

In [9]:
raw

Unnamed: 0,p46c_AAACCTGAGACCCACC,p46c_AAACCTGAGGACACCA,p46c_AAACCTGAGTAACCCT,p46c_AAACCTGGTCATCGGC,p46c_AAACCTGTCACCCGAG,p46c_AAACGGGAGCCTTGAT,p46c_AAACGGGAGGTGTTAA,p46c_AAACGGGCAAGCCTAT,p46c_AAACGGGGTATAGTAG,p46c_AAACGGGTCAACGCTA,...,p51c_TTTGGTTTCTGGTGTA,p51c_TTTGTCAAGATAGTCA,p51c_TTTGTCAAGCAGGCTA,p51c_TTTGTCAAGGAGCGAG,p51c_TTTGTCACAAGGGTCA,p51c_TTTGTCACACTGCCAG,p51c_TTTGTCAGTTCACGGC,p51c_TTTGTCATCAACCAAC,p51c_TTTGTCATCCTTTCTC,p51c_TTTGTCATCTCCGGTT
AL627309.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AP006222.2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
RP4-669L17.10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
RP11-206L10.3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
RP11-206L10.2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AC008753.4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
XXbac-B476C20.17,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AF127936.3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AP000695.6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
adata.obs

Unnamed: 0_level_0,orig.ident,nCount_RNA,nFeature_RNA,sample,percent.mt,CellType,sex,age,smoking
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
p46c_AAACCTGAGACCCACC,p46c,11554.0,3040,p46c,2.432058,Secretory3,m,53.0,no
p46c_AAACCTGAGGACACCA,p46c,9517.0,3152,p46c,10.906798,Ciliated1,m,53.0,no
p46c_AAACCTGAGTAACCCT,p46c,5597.0,1878,p46c,12.274433,Goblet,m,53.0,no
p46c_AAACCTGGTCATCGGC,p46c,2367.0,1065,p46c,10.984368,Goblet,m,53.0,no
p46c_AAACCTGTCACCCGAG,p46c,6394.0,2269,p46c,5.395683,Ciliated2,m,53.0,no
...,...,...,...,...,...,...,...,...,...
p51c_TTTGTCACACTGCCAG,p51c,8984.0,2304,p51c,6.222173,Secretory3,m,67.0,former
p51c_TTTGTCAGTTCACGGC,p51c,4172.0,1364,p51c,11.241611,Secretory2,m,67.0,former
p51c_TTTGTCATCAACCAAC,p51c,3463.0,1358,p51c,6.092983,Basal1,m,67.0,former
p51c_TTTGTCATCCTTTCTC,p51c,3089.0,1028,p51c,5.373907,Goblet,m,67.0,former


In [11]:
raw = raw.loc[:,list(adata.obs.index)]

In [12]:
raw.shape

(22100, 17451)

In [13]:
adata

AnnData object with n_obs × n_vars = 17451 × 22100
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'sample', 'percent.mt', 'CellType', 'sex', 'age', 'smoking'
    var: 'name'
    obsm: 'X_umap'

### Convert gene symbols to ENSG ids by using the reference gene list of this dataset

In [14]:
sym2ensg = pd.read_table('../hg19-3.0.0.gene_names.tsv', header = None, names = ['ensg_id', 'gene_symbol'])

validlst = pd.read_csv('~/genes_homo_sapiens.csv', header = None, names = ['gene_ids', 'name','no'])

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [15]:
sym2ensg.index = sym2ensg['gene_symbol']

validlst.index = validlst['name']

In [41]:
adata.var['gene_ids'] = ['a'] * len(adata.var)

In [42]:
skippd = [] # genes not in sym2ensg
multensg = [] # genes that have multiple ensg's from sym2ensg
all_ensg = list(validlst['gene_ids'])
for i in tqdm(adata.var_names):
    if not i in sym2ensg['gene_symbol']:
        skippd.append(i)
        continue
        
    ensg_id = sym2ensg.at[i, 'ensg_id']
    
    if type(ensg_id) == pd.core.series.Series:
        multensg.append(i)
        poss = list(set(sym2ensg.at[i, 'ensg_id']).intersection(all_ensg))
        if len(poss) > 0:
            ensg_id = list(set(sym2ensg.at[i, 'ensg_id']).intersection(all_ensg))[0]
        else:
            continue

    adata.var.at[i, 'gene_ids'] = ensg_id

100%|██████████| 22100/22100 [00:13<00:00, 1619.34it/s]


### Remove the genes that do not match in gene-ensemblID file

In [45]:
notvalid = adata.var[adata.var['gene_ids'].isin(list(set(adata.var['gene_ids']).difference(set(validlst['gene_ids']))))]

In [46]:
notvalid

Unnamed: 0_level_0,name,gene_ids
index,Unnamed: 1_level_1,Unnamed: 2_level_1
AL627309.1,AL627309.1,ENSG00000237683
RP11-206L10.3,RP11-206L10.3,ENSG00000235373
AL590822.2,AL590822.2,ENSG00000269554
AL590822.1,AL590822.1,ENSG00000203301
RP4-635E18.8,RP4-635E18.8,ENSG00000271895
...,...,...
RP11-1M18.1,RP11-1M18.1,ENSG00000231217
AC027228.1,AC027228.1,ENSG00000268838
RP11-61A14.2,RP11-61A14.2,ENSG00000261705
TBC1D3B,TBC1D3B,ENSG00000224226


In [48]:
sum(notvalid['gene_ids'] == 'a')

51

In [49]:
sum(notvalid['gene_ids'] != 'a')

833

In [50]:
sum(~adata.var_names.isin(notvalid.index))

21216

In [51]:
adata = adata[:,~adata.var_names.isin(notvalid.index)]

  res = method(*args, **kwargs)


In [52]:
adata

View of AnnData object with n_obs × n_vars = 17451 × 21216
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'sample', 'percent.mt', 'CellType', 'sex', 'age', 'smoking'
    var: 'name', 'gene_ids'
    obsm: 'X_umap'

In [53]:
raw = raw.loc[adata.var_names,:]

In [54]:
raw

Unnamed: 0_level_0,p46c_AAACCTGAGACCCACC,p46c_AAACCTGAGGACACCA,p46c_AAACCTGAGTAACCCT,p46c_AAACCTGGTCATCGGC,p46c_AAACCTGTCACCCGAG,p46c_AAACGGGAGCCTTGAT,p46c_AAACGGGAGGTGTTAA,p46c_AAACGGGCAAGCCTAT,p46c_AAACGGGGTATAGTAG,p46c_AAACGGGTCAACGCTA,...,p51c_TTTGGTTTCTGGTGTA,p51c_TTTGTCAAGATAGTCA,p51c_TTTGTCAAGCAGGCTA,p51c_TTTGTCAAGGAGCGAG,p51c_TTTGTCACAAGGGTCA,p51c_TTTGTCACACTGCCAG,p51c_TTTGTCAGTTCACGGC,p51c_TTTGTCATCAACCAAC,p51c_TTTGTCATCCTTTCTC,p51c_TTTGTCATCTCCGGTT
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AP006222.2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
RP4-669L17.10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
RP11-206L10.2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
RP11-206L10.9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
LINC00115,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AC007193.8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AC008753.4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
XXbac-B476C20.17,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AF127936.3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Updating gene names from the current reference to Gencode-v38

by checking ENSG ids in Gencode-v38 list, example:  
A -> ENSG01 (ref of the dataset)  
B -> ENSG01 (Gencode-v38)

In [55]:
validlst.index = validlst['gene_ids']

In [56]:
adata.var['gene_symbols'] = adata.var_names
adata.var_names = adata.var['gene_ids']

Trying to set attribute `.var` of view, copying.


In [57]:
validlst

Unnamed: 0_level_0,gene_ids,name,no
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000223972,ENSG00000223972,DDX11L1,5
ENST00000456328,ENST00000456328,DDX11L1-202,2
ENST00000450305,ENST00000450305,DDX11L1-201,2
ENSG00000227232,ENSG00000227232,WASH7P,5
ENST00000488147,ENST00000488147,WASH7P-201,1
...,...,...,...
ENST00000621424,ENST00000621424,ENSG110105435.4-001,4
ENSG00000278625,ENSG00000278625,U6,1
ENST00000616830,ENST00000616830,U6.88-201,1
ENSG00000277374,ENSG00000277374,U1,1


In [58]:
oldsymb = []
newsymb = []
listee = list(validlst['gene_ids'])
for i in tqdm(adata.var['gene_ids']):
    if i in listee:
        symb_val = list(validlst.loc[validlst['gene_ids'] == i, 'name'])
        symb_org = adata.var.at[i, 'gene_symbols']
        symb_dif = list(set(symb_val).difference(set(symb_org)))
        if len(symb_dif) > 0:
            oldsymb.append(symb_org)
            newsymb.append(symb_dif[0])

100%|██████████| 21216/21216 [06:03<00:00, 58.32it/s]


In [59]:
symbols = pd.DataFrame([oldsymb,newsymb]).T

In [60]:
drop_lines = []
for i in range(0,len(symbols)):
    if symbols.iloc[i,0] == symbols.iloc[i,1]:
        drop_lines.append(i)

In [61]:
symbols = symbols.drop(drop_lines).rename(columns = {0: 'oldsymb', 1: 'newsymb'})

In [62]:
symbols = symbols.reset_index().loc[:,['oldsymb', 'newsymb']]

In [63]:
symbols

Unnamed: 0,oldsymb,newsymb
0,AP006222.2,AP006222.1
1,RP4-669L17.10,RP4-669L17.4
2,RP11-206L10.9,LINC01409
3,C1orf170,PERM1
4,FAM132A,C1QTNF12
...,...,...
2404,PTPN20A,PTPN20
2405,LINC00371,C13orf42
2406,MIR142,RP5-1171I10.5
2407,RP1-155G6.4,CSE1L-AS1


In [64]:
adata.var

Unnamed: 0_level_0,name,gene_ids,gene_symbols
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000228463,AP006222.2,ENSG00000228463,AP006222.2
ENSG00000237094,RP4-669L17.10,ENSG00000237094,RP4-669L17.10
ENSG00000228327,RP11-206L10.2,ENSG00000228327,RP11-206L10.2
ENSG00000237491,RP11-206L10.9,ENSG00000237491,RP11-206L10.9
ENSG00000225880,LINC00115,ENSG00000225880,LINC00115
...,...,...,...
ENSG00000269151,AC007193.8,ENSG00000269151,AC007193.8
ENSG00000269564,AC008753.4,ENSG00000269564,AC008753.4
ENSG00000235295,XXbac-B476C20.17,ENSG00000235295,XXbac-B476C20.17
ENSG00000232884,AF127936.3,ENSG00000232884,AF127936.3


In [65]:
adata.var

Unnamed: 0_level_0,name,gene_ids,gene_symbols
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000228463,AP006222.2,ENSG00000228463,AP006222.2
ENSG00000237094,RP4-669L17.10,ENSG00000237094,RP4-669L17.10
ENSG00000228327,RP11-206L10.2,ENSG00000228327,RP11-206L10.2
ENSG00000237491,RP11-206L10.9,ENSG00000237491,RP11-206L10.9
ENSG00000225880,LINC00115,ENSG00000225880,LINC00115
...,...,...,...
ENSG00000269151,AC007193.8,ENSG00000269151,AC007193.8
ENSG00000269564,AC008753.4,ENSG00000269564,AC008753.4
ENSG00000235295,XXbac-B476C20.17,ENSG00000235295,XXbac-B476C20.17
ENSG00000232884,AF127936.3,ENSG00000232884,AF127936.3


In [66]:
del adata.var['gene_ids']

In [67]:
for i in tqdm(range(0,len(symbols))):
    idxx = np.where(adata.var['gene_symbols'].isin([symbols['oldsymb'][i]]))[0][0]
    adata.var['gene_symbols'][idxx] = symbols['newsymb'][i]

100%|██████████| 2409/2409 [00:02<00:00, 963.79it/s]


In [68]:
raw = raw.transpose()

In [69]:
raw = raw.set_axis(list(adata.var_names), axis = 1, inplace=False)

In [70]:
raw

Unnamed: 0,ENSG00000228463,ENSG00000237094,ENSG00000228327,ENSG00000237491,ENSG00000225880,ENSG00000230368,ENSG00000188976,ENSG00000187961,ENSG00000187583,ENSG00000187642,...,ENSG00000265206,ENSG00000272625,ENSG00000227431,ENSG00000269480,ENSG00000267934,ENSG00000269151,ENSG00000269564,ENSG00000235295,ENSG00000232884,ENSG00000230479
p46c_AAACCTGAGACCCACC,1,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
p46c_AAACCTGAGGACACCA,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
p46c_AAACCTGAGTAACCCT,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
p46c_AAACCTGGTCATCGGC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
p46c_AAACCTGTCACCCGAG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
p51c_TTTGTCACACTGCCAG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
p51c_TTTGTCAGTTCACGGC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
p51c_TTTGTCATCAACCAAC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
p51c_TTTGTCATCCTTTCTC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [71]:
raww = adata.copy()

In [72]:
raww.X = scipy.sparse.csr_matrix(raw)

In [73]:
del raww.obs
del raww.uns
del raww.obsm
del raww.varm
del raww.var

In [74]:
adata.var['feature_biotype'] = ["gene"] * len(adata.var)

In [75]:
raww.var['feature_biotype'] = ["gene"] * len(raww.var)

In [76]:
adata.raw = raww

---

---

## 3. Apply the schema requirements:

# <ins>X

Raw and normal count matrices are in the object, no additional adjustment is needed!

# <ins>obs

In [77]:
adata.obs.columns

Index(['orig.ident', 'nCount_RNA', 'nFeature_RNA', 'sample', 'percent.mt',
       'CellType', 'sex', 'age', 'smoking'],
      dtype='object')

## assay_ontology_term_id

> "Single cell suspensions were loaded onto the 10× Chromium device using the 10× Genomics Single Cell 3’ Library Kit v2 (10× Genomics; PN-120237, PN-120236, PN-120262) to generate cell and gel bead emulsions" from Lukassen et al

In [78]:
adata.obs['assay_ontology_term_id'] = ["EFO:0009899"] * len(adata.obs)

In [79]:
adata.obs['assay_ontology_term_id']

index
p46c_AAACCTGAGACCCACC    EFO:0009899
p46c_AAACCTGAGGACACCA    EFO:0009899
p46c_AAACCTGAGTAACCCT    EFO:0009899
p46c_AAACCTGGTCATCGGC    EFO:0009899
p46c_AAACCTGTCACCCGAG    EFO:0009899
                            ...     
p51c_TTTGTCACACTGCCAG    EFO:0009899
p51c_TTTGTCAGTTCACGGC    EFO:0009899
p51c_TTTGTCATCAACCAAC    EFO:0009899
p51c_TTTGTCATCCTTTCTC    EFO:0009899
p51c_TTTGTCATCTCCGGTT    EFO:0009899
Name: assay_ontology_term_id, Length: 17451, dtype: object

In [80]:
adata.obs['assay_ontology_term_id'] = pd.Categorical(adata.obs['assay_ontology_term_id'])

In [81]:
adata.obs['assay_ontology_term_id']

index
p46c_AAACCTGAGACCCACC    EFO:0009899
p46c_AAACCTGAGGACACCA    EFO:0009899
p46c_AAACCTGAGTAACCCT    EFO:0009899
p46c_AAACCTGGTCATCGGC    EFO:0009899
p46c_AAACCTGTCACCCGAG    EFO:0009899
                            ...     
p51c_TTTGTCACACTGCCAG    EFO:0009899
p51c_TTTGTCAGTTCACGGC    EFO:0009899
p51c_TTTGTCATCAACCAAC    EFO:0009899
p51c_TTTGTCATCCTTTCTC    EFO:0009899
p51c_TTTGTCATCTCCGGTT    EFO:0009899
Name: assay_ontology_term_id, Length: 17451, dtype: category
Categories (1, object): ['EFO:0009899']

## cell_type_ontology_term_id

In [82]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['CellType']

In [83]:
list(adata.obs['CellType'].unique())

['Secretory3',
 'Ciliated1',
 'Goblet',
 'Ciliated2',
 'Club',
 'Secretory2',
 'FOXN4',
 'Basal1',
 'Secretory1',
 'Fibroblast',
 'Ionocyte',
 'Basal3',
 'Basal_Mitotic',
 'Basal2']

In [84]:
adata.obs['cell_type_ontology_term_id']

index
p46c_AAACCTGAGACCCACC    Secretory3
p46c_AAACCTGAGGACACCA     Ciliated1
p46c_AAACCTGAGTAACCCT        Goblet
p46c_AAACCTGGTCATCGGC        Goblet
p46c_AAACCTGTCACCCGAG     Ciliated2
                            ...    
p51c_TTTGTCACACTGCCAG    Secretory3
p51c_TTTGTCAGTTCACGGC    Secretory2
p51c_TTTGTCATCAACCAAC        Basal1
p51c_TTTGTCATCCTTTCTC        Goblet
p51c_TTTGTCATCTCCGGTT     Ciliated1
Name: cell_type_ontology_term_id, Length: 17451, dtype: category
Categories (14, object): ['Basal1', 'Basal2', 'Basal3', 'Basal_Mitotic', ..., 'Ciliated2', 'FOXN4', 'Ionocyte', 'Fibroblast']

In [85]:
#list(adata.obs['cell_type_ontology_term_id'].unique())

In [86]:
mapping = {'Secretory3': 'CL:1000272',
 'Ciliated1': 'CL:1000271',
 'Goblet': 'CL:1000329',
 'Ciliated2': 'CL:1000271',
 'Club': 'CL:0000158',
 'Secretory2': 'CL:1000272',
 'FOXN4':'CL:0000001',
 'Basal1': 'CL:0000646',
 'Secretory1': 'CL:1000272',
 'Fibroblast': 'CL:0002553',
 'Ionocyte': 'CL:0017000',
 'Basal3': 'CL:0000646',
 'Basal_Mitotic': 'CL:0000646',
 'Basal2': 'CL:0000646'}

In [87]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_type_ontology_term_id'].map(mapping)

In [88]:
adata.obs['cell_type_ontology_term_id']

index
p46c_AAACCTGAGACCCACC    CL:1000272
p46c_AAACCTGAGGACACCA    CL:1000271
p46c_AAACCTGAGTAACCCT    CL:1000329
p46c_AAACCTGGTCATCGGC    CL:1000329
p46c_AAACCTGTCACCCGAG    CL:1000271
                            ...    
p51c_TTTGTCACACTGCCAG    CL:1000272
p51c_TTTGTCAGTTCACGGC    CL:1000272
p51c_TTTGTCATCAACCAAC    CL:0000646
p51c_TTTGTCATCCTTTCTC    CL:1000329
p51c_TTTGTCATCTCCGGTT    CL:1000271
Name: cell_type_ontology_term_id, Length: 17451, dtype: object

## development_stage_ontology_term_id

In [89]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['age']

In [90]:
adata.obs['development_stage_ontology_term_id']

index
p46c_AAACCTGAGACCCACC    53.0
p46c_AAACCTGAGGACACCA    53.0
p46c_AAACCTGAGTAACCCT    53.0
p46c_AAACCTGGTCATCGGC    53.0
p46c_AAACCTGTCACCCGAG    53.0
                         ... 
p51c_TTTGTCACACTGCCAG    67.0
p51c_TTTGTCAGTTCACGGC    67.0
p51c_TTTGTCATCAACCAAC    67.0
p51c_TTTGTCATCCTTTCTC    67.0
p51c_TTTGTCATCTCCGGTT    67.0
Name: development_stage_ontology_term_id, Length: 17451, dtype: float64

In [91]:
adata.obs['development_stage_ontology_term_id'].unique()

array([53., 55., 67.])

In [92]:
mapping = {53.0: 'HsapDv:0000147', 55.0: 'HsapDv:0000149', 67.0: 'HsapDv:0000161'}

In [93]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['development_stage_ontology_term_id'].map(mapping)

In [94]:
adata.obs['development_stage_ontology_term_id']

index
p46c_AAACCTGAGACCCACC    HsapDv:0000147
p46c_AAACCTGAGGACACCA    HsapDv:0000147
p46c_AAACCTGAGTAACCCT    HsapDv:0000147
p46c_AAACCTGGTCATCGGC    HsapDv:0000147
p46c_AAACCTGTCACCCGAG    HsapDv:0000147
                              ...      
p51c_TTTGTCACACTGCCAG    HsapDv:0000161
p51c_TTTGTCAGTTCACGGC    HsapDv:0000161
p51c_TTTGTCATCAACCAAC    HsapDv:0000161
p51c_TTTGTCATCCTTTCTC    HsapDv:0000161
p51c_TTTGTCATCTCCGGTT    HsapDv:0000161
Name: development_stage_ontology_term_id, Length: 17451, dtype: object

## disease_ontology_term_id

**"PATO:0000461"** for normal or healthy.

In [95]:
adata.obs['disease_ontology_term_id'] = ['PATO:0000461'] * len(adata.obs)

In [96]:
adata.obs['disease_ontology_term_id']

index
p46c_AAACCTGAGACCCACC    PATO:0000461
p46c_AAACCTGAGGACACCA    PATO:0000461
p46c_AAACCTGAGTAACCCT    PATO:0000461
p46c_AAACCTGGTCATCGGC    PATO:0000461
p46c_AAACCTGTCACCCGAG    PATO:0000461
                             ...     
p51c_TTTGTCACACTGCCAG    PATO:0000461
p51c_TTTGTCAGTTCACGGC    PATO:0000461
p51c_TTTGTCATCAACCAAC    PATO:0000461
p51c_TTTGTCATCCTTTCTC    PATO:0000461
p51c_TTTGTCATCTCCGGTT    PATO:0000461
Name: disease_ontology_term_id, Length: 17451, dtype: object

In [97]:
adata.obs['disease_ontology_term_id']

index
p46c_AAACCTGAGACCCACC    PATO:0000461
p46c_AAACCTGAGGACACCA    PATO:0000461
p46c_AAACCTGAGTAACCCT    PATO:0000461
p46c_AAACCTGGTCATCGGC    PATO:0000461
p46c_AAACCTGTCACCCGAG    PATO:0000461
                             ...     
p51c_TTTGTCACACTGCCAG    PATO:0000461
p51c_TTTGTCAGTTCACGGC    PATO:0000461
p51c_TTTGTCATCAACCAAC    PATO:0000461
p51c_TTTGTCATCCTTTCTC    PATO:0000461
p51c_TTTGTCATCTCCGGTT    PATO:0000461
Name: disease_ontology_term_id, Length: 17451, dtype: object

## ethnicity_ontology_term_id

In [98]:
adata.obs['ethnicity_ontology_term_id'] = ['unknown'] * len(adata.obs)

In [99]:
adata.obs['ethnicity_ontology_term_id']

index
p46c_AAACCTGAGACCCACC    unknown
p46c_AAACCTGAGGACACCA    unknown
p46c_AAACCTGAGTAACCCT    unknown
p46c_AAACCTGGTCATCGGC    unknown
p46c_AAACCTGTCACCCGAG    unknown
                          ...   
p51c_TTTGTCACACTGCCAG    unknown
p51c_TTTGTCAGTTCACGGC    unknown
p51c_TTTGTCATCAACCAAC    unknown
p51c_TTTGTCATCCTTTCTC    unknown
p51c_TTTGTCATCTCCGGTT    unknown
Name: ethnicity_ontology_term_id, Length: 17451, dtype: object

## is_primary_data

In [100]:
adata.obs['is_primary_data'] = [True] * len(adata.obs)

In [101]:
adata.obs['is_primary_data']

index
p46c_AAACCTGAGACCCACC    True
p46c_AAACCTGAGGACACCA    True
p46c_AAACCTGAGTAACCCT    True
p46c_AAACCTGGTCATCGGC    True
p46c_AAACCTGTCACCCGAG    True
                         ... 
p51c_TTTGTCACACTGCCAG    True
p51c_TTTGTCAGTTCACGGC    True
p51c_TTTGTCATCAACCAAC    True
p51c_TTTGTCATCCTTTCTC    True
p51c_TTTGTCATCTCCGGTT    True
Name: is_primary_data, Length: 17451, dtype: bool

In [102]:
adata.obs['is_primary_data'] = adata.obs['is_primary_data'].astype('bool')

## organism_ontology_term_id

In [103]:
adata.obs['organism_ontology_term_id'] = ['NCBITaxon:9606'] * len(adata.obs)

In [104]:
adata.obs['organism_ontology_term_id']

index
p46c_AAACCTGAGACCCACC    NCBITaxon:9606
p46c_AAACCTGAGGACACCA    NCBITaxon:9606
p46c_AAACCTGAGTAACCCT    NCBITaxon:9606
p46c_AAACCTGGTCATCGGC    NCBITaxon:9606
p46c_AAACCTGTCACCCGAG    NCBITaxon:9606
                              ...      
p51c_TTTGTCACACTGCCAG    NCBITaxon:9606
p51c_TTTGTCAGTTCACGGC    NCBITaxon:9606
p51c_TTTGTCATCAACCAAC    NCBITaxon:9606
p51c_TTTGTCATCCTTTCTC    NCBITaxon:9606
p51c_TTTGTCATCTCCGGTT    NCBITaxon:9606
Name: organism_ontology_term_id, Length: 17451, dtype: object

In [105]:
adata.obs['organism_ontology_term_id'] = pd.Categorical(adata.obs['organism_ontology_term_id'])

In [106]:
adata.obs['organism_ontology_term_id']

index
p46c_AAACCTGAGACCCACC    NCBITaxon:9606
p46c_AAACCTGAGGACACCA    NCBITaxon:9606
p46c_AAACCTGAGTAACCCT    NCBITaxon:9606
p46c_AAACCTGGTCATCGGC    NCBITaxon:9606
p46c_AAACCTGTCACCCGAG    NCBITaxon:9606
                              ...      
p51c_TTTGTCACACTGCCAG    NCBITaxon:9606
p51c_TTTGTCAGTTCACGGC    NCBITaxon:9606
p51c_TTTGTCATCAACCAAC    NCBITaxon:9606
p51c_TTTGTCATCCTTTCTC    NCBITaxon:9606
p51c_TTTGTCATCTCCGGTT    NCBITaxon:9606
Name: organism_ontology_term_id, Length: 17451, dtype: category
Categories (1, object): ['NCBITaxon:9606']

## sex_ontology_term_id

In [107]:
adata.obs['sex_ontology_term_id'] = adata.obs['sex']

In [108]:
adata.obs['sex_ontology_term_id']

index
p46c_AAACCTGAGACCCACC    m
p46c_AAACCTGAGGACACCA    m
p46c_AAACCTGAGTAACCCT    m
p46c_AAACCTGGTCATCGGC    m
p46c_AAACCTGTCACCCGAG    m
                        ..
p51c_TTTGTCACACTGCCAG    m
p51c_TTTGTCAGTTCACGGC    m
p51c_TTTGTCATCAACCAAC    m
p51c_TTTGTCATCCTTTCTC    m
p51c_TTTGTCATCTCCGGTT    m
Name: sex_ontology_term_id, Length: 17451, dtype: category
Categories (2, object): ['f', 'm']

**PATO_0000384 for male  
PATO_0000383 for female**

In [109]:
mapping = {'f':'PATO:0000383', 'm':'PATO:0000384'}

In [110]:
adata.obs['sex_ontology_term_id'] = adata.obs['sex_ontology_term_id'].map(mapping)

In [111]:
adata.obs['sex_ontology_term_id']

index
p46c_AAACCTGAGACCCACC    PATO:0000384
p46c_AAACCTGAGGACACCA    PATO:0000384
p46c_AAACCTGAGTAACCCT    PATO:0000384
p46c_AAACCTGGTCATCGGC    PATO:0000384
p46c_AAACCTGTCACCCGAG    PATO:0000384
                             ...     
p51c_TTTGTCACACTGCCAG    PATO:0000384
p51c_TTTGTCAGTTCACGGC    PATO:0000384
p51c_TTTGTCATCAACCAAC    PATO:0000384
p51c_TTTGTCATCCTTTCTC    PATO:0000384
p51c_TTTGTCATCTCCGGTT    PATO:0000384
Name: sex_ontology_term_id, Length: 17451, dtype: category
Categories (2, object): ['PATO:0000383', 'PATO:0000384']

## tissue_ontology_term_id

> **UBERON:0001005:** respiratory airway

In [112]:
adata.obs['tissue_ontology_term_id'] = ['UBERON:0001005'] * len(adata.obs)

In [113]:
adata.obs['tissue_ontology_term_id']

index
p46c_AAACCTGAGACCCACC    UBERON:0001005
p46c_AAACCTGAGGACACCA    UBERON:0001005
p46c_AAACCTGAGTAACCCT    UBERON:0001005
p46c_AAACCTGGTCATCGGC    UBERON:0001005
p46c_AAACCTGTCACCCGAG    UBERON:0001005
                              ...      
p51c_TTTGTCACACTGCCAG    UBERON:0001005
p51c_TTTGTCAGTTCACGGC    UBERON:0001005
p51c_TTTGTCATCAACCAAC    UBERON:0001005
p51c_TTTGTCATCCTTTCTC    UBERON:0001005
p51c_TTTGTCATCTCCGGTT    UBERON:0001005
Name: tissue_ontology_term_id, Length: 17451, dtype: object

---

# <ins>var and raw.var

In [114]:
adata.var

Unnamed: 0_level_0,name,gene_symbols,feature_biotype
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000228463,AP006222.2,AP006222.1,gene
ENSG00000237094,RP4-669L17.10,RP4-669L17.4,gene
ENSG00000228327,RP11-206L10.2,RP11-206L10.2,gene
ENSG00000237491,RP11-206L10.9,LINC01409,gene
ENSG00000225880,LINC00115,LINC00115,gene
...,...,...,...
ENSG00000269151,AC007193.8,AC007193.8,gene
ENSG00000269564,AC008753.4,AC008753.4,gene
ENSG00000235295,XXbac-B476C20.17,LINC01634,gene
ENSG00000232884,AF127936.3,AF127936.3,gene


## feature_is_filtered

In [116]:
adata.var['feature_is_filtered'] = [False] * len(adata.var)

In [117]:
adata.var['feature_is_filtered']

gene_ids
ENSG00000228463    False
ENSG00000237094    False
ENSG00000228327    False
ENSG00000237491    False
ENSG00000225880    False
                   ...  
ENSG00000269151    False
ENSG00000269564    False
ENSG00000235295    False
ENSG00000232884    False
ENSG00000230479    False
Name: feature_is_filtered, Length: 21216, dtype: bool

# <ins>obsm

In [118]:
adata.obsm

AxisArrays with keys: X_umap

# <ins>uns

In [119]:
adata.uns['schema_version'] = '2.0.0'

In [120]:
adata.uns['title'] = 'Airway'

In [121]:
adata.uns['X_normalization'] = 'Scanpy'

In [122]:
adata.uns['default_embedding'] = 'X_umap'

---

---

# 4. Final adjustments

Final adjustments include the removal of the slots that were already re-created for the schema.

In [123]:
adata.obs

Unnamed: 0_level_0,orig.ident,nCount_RNA,nFeature_RNA,sample,percent.mt,CellType,sex,age,smoking,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,ethnicity_ontology_term_id,is_primary_data,organism_ontology_term_id,sex_ontology_term_id,tissue_ontology_term_id
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
p46c_AAACCTGAGACCCACC,p46c,11554.0,3040,p46c,2.432058,Secretory3,m,53.0,no,EFO:0009899,CL:1000272,HsapDv:0000147,PATO:0000461,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0001005
p46c_AAACCTGAGGACACCA,p46c,9517.0,3152,p46c,10.906798,Ciliated1,m,53.0,no,EFO:0009899,CL:1000271,HsapDv:0000147,PATO:0000461,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0001005
p46c_AAACCTGAGTAACCCT,p46c,5597.0,1878,p46c,12.274433,Goblet,m,53.0,no,EFO:0009899,CL:1000329,HsapDv:0000147,PATO:0000461,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0001005
p46c_AAACCTGGTCATCGGC,p46c,2367.0,1065,p46c,10.984368,Goblet,m,53.0,no,EFO:0009899,CL:1000329,HsapDv:0000147,PATO:0000461,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0001005
p46c_AAACCTGTCACCCGAG,p46c,6394.0,2269,p46c,5.395683,Ciliated2,m,53.0,no,EFO:0009899,CL:1000271,HsapDv:0000147,PATO:0000461,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0001005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
p51c_TTTGTCACACTGCCAG,p51c,8984.0,2304,p51c,6.222173,Secretory3,m,67.0,former,EFO:0009899,CL:1000272,HsapDv:0000161,PATO:0000461,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0001005
p51c_TTTGTCAGTTCACGGC,p51c,4172.0,1364,p51c,11.241611,Secretory2,m,67.0,former,EFO:0009899,CL:1000272,HsapDv:0000161,PATO:0000461,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0001005
p51c_TTTGTCATCAACCAAC,p51c,3463.0,1358,p51c,6.092983,Basal1,m,67.0,former,EFO:0009899,CL:0000646,HsapDv:0000161,PATO:0000461,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0001005
p51c_TTTGTCATCCTTTCTC,p51c,3089.0,1028,p51c,5.373907,Goblet,m,67.0,former,EFO:0009899,CL:1000329,HsapDv:0000161,PATO:0000461,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0001005


In [125]:
# Removing the same slots created for the schema
del adata.obs['sex']
del adata.obs['age']
del adata.obs['sample']
del adata.obs['orig.ident']
del adata.var['name']

In [126]:
adata

AnnData object with n_obs × n_vars = 17451 × 21216
    obs: 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'CellType', 'smoking', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'donor_id'
    var: 'gene_symbols', 'feature_biotype', 'feature_is_filtered'
    uns: 'schema_version', 'title', 'X_normalization', 'default_embedding'
    obsm: 'X_umap'

In [127]:
adata.obs

Unnamed: 0_level_0,nCount_RNA,nFeature_RNA,percent.mt,CellType,smoking,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,ethnicity_ontology_term_id,is_primary_data,organism_ontology_term_id,sex_ontology_term_id,tissue_ontology_term_id,donor_id
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
p46c_AAACCTGAGACCCACC,11554.0,3040,2.432058,Secretory3,no,EFO:0009899,CL:1000272,HsapDv:0000147,PATO:0000461,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0001005,p46c
p46c_AAACCTGAGGACACCA,9517.0,3152,10.906798,Ciliated1,no,EFO:0009899,CL:1000271,HsapDv:0000147,PATO:0000461,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0001005,p46c
p46c_AAACCTGAGTAACCCT,5597.0,1878,12.274433,Goblet,no,EFO:0009899,CL:1000329,HsapDv:0000147,PATO:0000461,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0001005,p46c
p46c_AAACCTGGTCATCGGC,2367.0,1065,10.984368,Goblet,no,EFO:0009899,CL:1000329,HsapDv:0000147,PATO:0000461,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0001005,p46c
p46c_AAACCTGTCACCCGAG,6394.0,2269,5.395683,Ciliated2,no,EFO:0009899,CL:1000271,HsapDv:0000147,PATO:0000461,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0001005,p46c
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
p51c_TTTGTCACACTGCCAG,8984.0,2304,6.222173,Secretory3,former,EFO:0009899,CL:1000272,HsapDv:0000161,PATO:0000461,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0001005,p51c
p51c_TTTGTCAGTTCACGGC,4172.0,1364,11.241611,Secretory2,former,EFO:0009899,CL:1000272,HsapDv:0000161,PATO:0000461,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0001005,p51c
p51c_TTTGTCATCAACCAAC,3463.0,1358,6.092983,Basal1,former,EFO:0009899,CL:0000646,HsapDv:0000161,PATO:0000461,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0001005,p51c
p51c_TTTGTCATCCTTTCTC,3089.0,1028,5.373907,Goblet,former,EFO:0009899,CL:1000329,HsapDv:0000161,PATO:0000461,unknown,True,NCBITaxon:9606,PATO:0000384,UBERON:0001005,p51c


In [128]:
adata.var

Unnamed: 0_level_0,gene_symbols,feature_biotype,feature_is_filtered
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000228463,AP006222.1,gene,False
ENSG00000237094,RP4-669L17.4,gene,False
ENSG00000228327,RP11-206L10.2,gene,False
ENSG00000237491,LINC01409,gene,False
ENSG00000225880,LINC00115,gene,False
...,...,...,...
ENSG00000269151,AC007193.8,gene,False
ENSG00000269564,AC008753.4,gene,False
ENSG00000235295,LINC01634,gene,False
ENSG00000232884,AF127936.3,gene,False


In [129]:
adata.obs.dtypes

nCount_RNA                             float64
nFeature_RNA                             int32
percent.mt                             float64
CellType                              category
smoking                               category
assay_ontology_term_id                category
cell_type_ontology_term_id              object
development_stage_ontology_term_id      object
disease_ontology_term_id                object
ethnicity_ontology_term_id              object
is_primary_data                           bool
organism_ontology_term_id             category
sex_ontology_term_id                  category
tissue_ontology_term_id                 object
donor_id                              category
dtype: object

In [130]:
# adata.obs
print('obs')
dty = pd.DataFrame(adata.obs.dtypes, columns = ['dtype'])
for c in dty[dty['dtype'] == 'float64'].index.values:
    adata.obs[c] = adata.obs[c].astype('float32')
    print(f"changed {c} from float64 to float32")
for c in dty[dty['dtype'] == 'int64'].index.values:
    adata.obs[c] = adata.obs[c].astype('int32') 
    print(f"changed {c} from int64 to int32")

# adata.var
print('var')
dty = pd.DataFrame(adata.var.dtypes, columns = ['dtype'])
for c in dty[dty['dtype'] == 'float64'].index.values:
    adata.var[c] = adata.var[c].astype('float32')
    print(f"changed {c} from float64 to float32")
for c in dty[dty['dtype'] == 'int64'].index.values:
    adata.var[c] = adata.var[c].astype('int32') 
    print(f"changed {c} from int64 to int32")

obs
changed nCount_RNA from float64 to float32
changed percent.mt from float64 to float32
var


In [131]:
adata.X = scipy.sparse.csr_matrix(adata.X)

In [124]:
adata.obs['donor_id'] = adata.obs['orig.ident']

In [5]:
adata.obs['suspension_type'] = ['cell'] * len(adata.obs)

In [132]:
adata.write('lukassen20_airway_f.h5ad', compression = 'gzip')

... storing 'cell_type_ontology_term_id' as categorical
... storing 'development_stage_ontology_term_id' as categorical
... storing 'disease_ontology_term_id' as categorical
... storing 'ethnicity_ontology_term_id' as categorical
... storing 'tissue_ontology_term_id' as categorical
... storing 'gene_symbols' as categorical
... storing 'feature_biotype' as categorical
... storing 'feature_biotype' as categorical
