# **Curating `lukassen20-lung.processed.h5ad` dataset:**

* includes ~17K cells and 22K genes
* two datasets (airway + lung)
* publication link: https://www.embopress.org/doi/full/10.15252/embj.20105114

# <ins>STEPS:</ins>

## 1. Load the dataset 

In [2]:
import numpy as np
import pandas as pd
import scanpy as sc
import scipy
from tqdm import tqdm

In [4]:
adata = sc.read('/lustre/scratch117/cellgen/cellgeni/covid19cellatlas/downloaded/lukassen20_lung_orig.processed.h5ad')

In [5]:
adata.X

<39778x32738 sparse matrix of type '<class 'numpy.float32'>'
	with 76186702 stored elements in Compressed Sparse Column format>

## 2. Pre-prepare before schema adjustments

### Add row counts

In [6]:
raw = pd.read_csv('/lustre/scratch117/cellgen/cellgeni/covid19cellatlas/raw/lukassen20/Counts_lung_cells.csv', low_memory=False)

In [7]:
raw

Unnamed: 0,X101CV0ng_AAACCTGAGGTTCCTA.1,X101CV0ng_AAACCTGCAAGGCTCC.1,X101CV0ng_AAACCTGCACGAAATA.1,X101CV0ng_AAACCTGCAGCTATTG.1,X101CV0ng_AAACCTGGTGATGCCC.1,X101CV0ng_AAACCTGTCATCATTC.1,X101CV0ng_AAACCTGTCCCAGGTG.1,X101CV0ng_AAACGGGAGCTGATAA.1,X101CV0ng_AAACGGGCACATTTCT.1,X101CV0ng_AAACGGGGTCTAAACC.1,...,ZM3KACng_TTTGGTTGTGGGTCAA.1,ZM3KACng_TTTGGTTGTTTAGGAA.1,ZM3KACng_TTTGGTTTCCGAATGT.1,ZM3KACng_TTTGGTTTCTTGCCGT.1,ZM3KACng_TTTGTCAAGACTCGGA.1,ZM3KACng_TTTGTCACAAATACAG.1,ZM3KACng_TTTGTCACAGAGTGTG.1,ZM3KACng_TTTGTCACAGTTTACG.1,ZM3KACng_TTTGTCACATCCGCGA.1,ZM3KACng_TTTGTCACATGGATGG.1
MIR1302-10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
FAM138A,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
OR4F5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
RP11-34P13.7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
RP11-34P13.8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AC145205.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BAGE5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CU459201.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AC002321.2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Changing cell names in raw data to match with the normalized h5ad

1. Remove X from some of the cellnames
2. Change . with -

In [8]:
adata.obs_names

Index(['101CV0ng_AAACCTGAGGTTCCTA-1', '101CV0ng_AAACCTGCAAGGCTCC-1',
       '101CV0ng_AAACCTGCACGAAATA-1', '101CV0ng_AAACCTGCAGCTATTG-1',
       '101CV0ng_AAACCTGGTGATGCCC-1', '101CV0ng_AAACCTGTCATCATTC-1',
       '101CV0ng_AAACCTGTCCCAGGTG-1', '101CV0ng_AAACGGGAGCTGATAA-1',
       '101CV0ng_AAACGGGCACATTTCT-1', '101CV0ng_AAACGGGGTCTAAACC-1',
       ...
       'ZM3KACng_TTTGGTTGTGGGTCAA-1', 'ZM3KACng_TTTGGTTGTTTAGGAA-1',
       'ZM3KACng_TTTGGTTTCCGAATGT-1', 'ZM3KACng_TTTGGTTTCTTGCCGT-1',
       'ZM3KACng_TTTGTCAAGACTCGGA-1', 'ZM3KACng_TTTGTCACAAATACAG-1',
       'ZM3KACng_TTTGTCACAGAGTGTG-1', 'ZM3KACng_TTTGTCACAGTTTACG-1',
       'ZM3KACng_TTTGTCACATCCGCGA-1', 'ZM3KACng_TTTGTCACATGGATGG-1'],
      dtype='object', name='index', length=39778)

In [9]:
raw.columns = [i[1:] if len(i.split('_')[0]) > 8 else i for i in raw.columns]

In [10]:
raw.columns = ['-'.join(i.split('.')) for i in raw.columns]

In [11]:
raw.columns

Index(['101CV0ng_AAACCTGAGGTTCCTA-1', '101CV0ng_AAACCTGCAAGGCTCC-1',
       '101CV0ng_AAACCTGCACGAAATA-1', '101CV0ng_AAACCTGCAGCTATTG-1',
       '101CV0ng_AAACCTGGTGATGCCC-1', '101CV0ng_AAACCTGTCATCATTC-1',
       '101CV0ng_AAACCTGTCCCAGGTG-1', '101CV0ng_AAACGGGAGCTGATAA-1',
       '101CV0ng_AAACGGGCACATTTCT-1', '101CV0ng_AAACGGGGTCTAAACC-1',
       ...
       'ZM3KACng_TTTGGTTGTGGGTCAA-1', 'ZM3KACng_TTTGGTTGTTTAGGAA-1',
       'ZM3KACng_TTTGGTTTCCGAATGT-1', 'ZM3KACng_TTTGGTTTCTTGCCGT-1',
       'ZM3KACng_TTTGTCAAGACTCGGA-1', 'ZM3KACng_TTTGTCACAAATACAG-1',
       'ZM3KACng_TTTGTCACAGAGTGTG-1', 'ZM3KACng_TTTGTCACAGTTTACG-1',
       'ZM3KACng_TTTGTCACATCCGCGA-1', 'ZM3KACng_TTTGTCACATGGATGG-1'],
      dtype='object', length=39778)

In [12]:
adata

AnnData object with n_obs × n_vars = 39778 × 32738
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'sex', 'age', 'smoking', 'packyears', 'percent.mito', 'CellType'
    var: 'name'
    obsm: 'X_pca', 'X_umap'

In [13]:
adata.obs

Unnamed: 0_level_0,orig.ident,nCount_RNA,nFeature_RNA,sex,age,smoking,packyears,percent.mito,CellType
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
101CV0ng_AAACCTGAGGTTCCTA-1,101CV0ng,2866.0,1448,F,75.0,NonSmoking,0,0.001396,AT2
101CV0ng_AAACCTGCAAGGCTCC-1,101CV0ng,4128.0,1909,F,75.0,NonSmoking,0,0.007752,AT2
101CV0ng_AAACCTGCACGAAATA-1,101CV0ng,2438.0,1634,F,75.0,NonSmoking,0,0.001231,Immuno_Monocytes
101CV0ng_AAACCTGCAGCTATTG-1,101CV0ng,3589.0,1821,F,75.0,NonSmoking,0,0.002508,AT2
101CV0ng_AAACCTGGTGATGCCC-1,101CV0ng,2556.0,1656,F,75.0,NonSmoking,0,0.001565,Immuno_Monocytes
...,...,...,...,...,...,...,...,...,...
ZM3KACng_TTTGTCACAAATACAG-1,ZM3KACng,6058.0,1793,F,79.0,NonSmoking,0,0.010399,Club
ZM3KACng_TTTGTCACAGAGTGTG-1,ZM3KACng,15761.0,4547,F,79.0,NonSmoking,0,0.023793,Immuno_Monocytes
ZM3KACng_TTTGTCACAGTTTACG-1,ZM3KACng,7096.0,3414,F,79.0,NonSmoking,0,0.007046,Immuno_TCells
ZM3KACng_TTTGTCACATCCGCGA-1,ZM3KACng,3213.0,1215,F,79.0,NonSmoking,0,0.000934,AT2


In [14]:
len(set(adata.obs_names).intersection(set(raw.columns)))

39778

In [15]:
raw = raw.loc[:,list(adata.obs.index)]

In [16]:
raw.shape

(32738, 39778)

In [17]:
adata

AnnData object with n_obs × n_vars = 39778 × 32738
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'sex', 'age', 'smoking', 'packyears', 'percent.mito', 'CellType'
    var: 'name'
    obsm: 'X_pca', 'X_umap'

In [18]:
raw

Unnamed: 0,101CV0ng_AAACCTGAGGTTCCTA-1,101CV0ng_AAACCTGCAAGGCTCC-1,101CV0ng_AAACCTGCACGAAATA-1,101CV0ng_AAACCTGCAGCTATTG-1,101CV0ng_AAACCTGGTGATGCCC-1,101CV0ng_AAACCTGTCATCATTC-1,101CV0ng_AAACCTGTCCCAGGTG-1,101CV0ng_AAACGGGAGCTGATAA-1,101CV0ng_AAACGGGCACATTTCT-1,101CV0ng_AAACGGGGTCTAAACC-1,...,ZM3KACng_TTTGGTTGTGGGTCAA-1,ZM3KACng_TTTGGTTGTTTAGGAA-1,ZM3KACng_TTTGGTTTCCGAATGT-1,ZM3KACng_TTTGGTTTCTTGCCGT-1,ZM3KACng_TTTGTCAAGACTCGGA-1,ZM3KACng_TTTGTCACAAATACAG-1,ZM3KACng_TTTGTCACAGAGTGTG-1,ZM3KACng_TTTGTCACAGTTTACG-1,ZM3KACng_TTTGTCACATCCGCGA-1,ZM3KACng_TTTGTCACATGGATGG-1
MIR1302-10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
FAM138A,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
OR4F5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
RP11-34P13.7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
RP11-34P13.8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AC145205.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BAGE5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CU459201.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AC002321.2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Convert gene symbols to ENSG ids by using the reference gene list of this dataset

In [19]:
sym2ensg = pd.read_table('../hg19-3.0.0.gene_names.tsv', header = None, names = ['ensg_id', 'gene_symbol'])

validlst = pd.read_csv('~/genes_homo_sapiens.csv', header = None, names = ['gene_ids', 'name','no'])

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [20]:
sym2ensg.index = sym2ensg['gene_symbol']

validlst.index = validlst['name']

In [21]:
adata.var['gene_ids'] = ['a'] * len(adata.var)

In [22]:
skippd = [] # genes not in sym2ensg
multensg = [] # genes that have multiple ensg's from sym2ensg
all_ensg = list(validlst['gene_ids'])
for i in tqdm(adata.var_names):
    if not i in sym2ensg['gene_symbol']:
        skippd.append(i)
        continue
        
    ensg_id = sym2ensg.at[i, 'ensg_id']
    
    if type(ensg_id) == pd.core.series.Series:
        multensg.append(i)
        poss = list(set(sym2ensg.at[i, 'ensg_id']).intersection(all_ensg))
        if len(poss) > 0:
            ensg_id = list(set(sym2ensg.at[i, 'ensg_id']).intersection(all_ensg))[0]
        else:
            continue
   
    adata.var.at[i, 'gene_ids'] = ensg_id

100%|██████████| 32738/32738 [00:20<00:00, 1630.20it/s]


### Remove the genes that do not match in gene-ensemblID file

In [23]:
notvalid = adata.var[adata.var['gene_ids'].isin(list(set(adata.var['gene_ids']).difference(set(validlst['gene_ids']))))]

In [24]:
notvalid

Unnamed: 0_level_0,name,gene_ids
index,Unnamed: 1_level_1,Unnamed: 2_level_1
AL627309.1,AL627309.1,ENSG00000237683
OR4F29,OR4F29,ENSG00000235249
RP5-857K21.15,RP5-857K21.15,ENSG00000236743
RP5-857K21.1,RP5-857K21.1,ENSG00000231709
RP5-857K21.3,RP5-857K21.3,ENSG00000239664
...,...,...
AC145205.1,AC145205.1,ENSG00000215635
BAGE5,BAGE5,ENSG00000268590
CU459201.1,CU459201.1,ENSG00000251180
AC002321.2,AC002321.2,ENSG00000215616


In [26]:
notvalid['gene_ids'].value_counts()

a                  116
ENSG00000269093      1
ENSG00000215346      1
ENSG00000229796      1
ENSG00000253753      1
                  ... 
ENSG00000260687      1
ENSG00000224226      1
ENSG00000269810      1
ENSG00000229510      1
ENSG00000273237      1
Name: gene_ids, Length: 1774, dtype: int64

In [27]:
sum(~adata.var_names.isin(notvalid.index))

30849

In [28]:
adata = adata[:,~adata.var_names.isin(notvalid.index)]

  res = method(*args, **kwargs)


In [29]:
adata

View of AnnData object with n_obs × n_vars = 39778 × 30849
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'sex', 'age', 'smoking', 'packyears', 'percent.mito', 'CellType'
    var: 'name', 'gene_ids'
    obsm: 'X_pca', 'X_umap'

In [30]:
raw = raw.loc[adata.var_names,:]

In [31]:
raw

Unnamed: 0_level_0,101CV0ng_AAACCTGAGGTTCCTA-1,101CV0ng_AAACCTGCAAGGCTCC-1,101CV0ng_AAACCTGCACGAAATA-1,101CV0ng_AAACCTGCAGCTATTG-1,101CV0ng_AAACCTGGTGATGCCC-1,101CV0ng_AAACCTGTCATCATTC-1,101CV0ng_AAACCTGTCCCAGGTG-1,101CV0ng_AAACGGGAGCTGATAA-1,101CV0ng_AAACGGGCACATTTCT-1,101CV0ng_AAACGGGGTCTAAACC-1,...,ZM3KACng_TTTGGTTGTGGGTCAA-1,ZM3KACng_TTTGGTTGTTTAGGAA-1,ZM3KACng_TTTGGTTTCCGAATGT-1,ZM3KACng_TTTGGTTTCTTGCCGT-1,ZM3KACng_TTTGTCAAGACTCGGA-1,ZM3KACng_TTTGTCACAAATACAG-1,ZM3KACng_TTTGTCACAGAGTGTG-1,ZM3KACng_TTTGTCACAGTTTACG-1,ZM3KACng_TTTGTCACATCCGCGA-1,ZM3KACng_TTTGTCACATGGATGG-1
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MIR1302-10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
FAM138A,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
OR4F5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
RP11-34P13.7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
RP11-34P13.8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MT-ND4L,0,0,0,0,0,0,0,0,0,0,...,2,0,0,0,0,0,3,0,0,0
MT-ND4,1,2,0,0,0,5,0,0,0,0,...,7,3,0,6,1,7,29,7,0,0
MT-ND5,0,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,3,1,0,0
MT-ND6,0,0,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0


### Updating gene names from the current reference to Gencode-v38

by checking ENSG ids in Gencode-v38 list, example:  
A -> ENSG01 (ref of the dataset)  
B -> ENSG01 (Gencode-v38)

In [32]:
validlst.index = validlst['gene_ids']

In [33]:
adata.var['gene_symbols'] = adata.var_names
adata.var_names = adata.var['gene_ids']

Trying to set attribute `.var` of view, copying.


In [34]:
validlst

Unnamed: 0_level_0,gene_ids,name,no
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000223972,ENSG00000223972,DDX11L1,5
ENST00000456328,ENST00000456328,DDX11L1-202,2
ENST00000450305,ENST00000450305,DDX11L1-201,2
ENSG00000227232,ENSG00000227232,WASH7P,5
ENST00000488147,ENST00000488147,WASH7P-201,1
...,...,...,...
ENST00000621424,ENST00000621424,ENSG110105435.4-001,4
ENSG00000278625,ENSG00000278625,U6,1
ENST00000616830,ENST00000616830,U6.88-201,1
ENSG00000277374,ENSG00000277374,U1,1


In [35]:
oldsymb = []
newsymb = []
listee = list(validlst['gene_ids'])
for i in tqdm(adata.var['gene_ids']):
    if i in listee:
        symb_val = list(validlst.loc[validlst['gene_ids'] == i, 'name'])
        symb_org = adata.var.at[i, 'gene_symbols']
        symb_dif = list(set(symb_val).difference(set(symb_org)))
        if len(symb_dif) > 0:
            oldsymb.append(symb_org)
            newsymb.append(symb_dif[0])

100%|██████████| 30849/30849 [08:20<00:00, 61.59it/s]


In [36]:
symbols = pd.DataFrame([oldsymb,newsymb]).T

In [37]:
drop_lines = []
for i in range(0,len(symbols)):
    if symbols.iloc[i,0] == symbols.iloc[i,1]:
        drop_lines.append(i)

In [38]:
symbols = symbols.drop(drop_lines).rename(columns = {0: 'oldsymb', 1: 'newsymb'})

In [39]:
symbols = symbols.reset_index().loc[:,['oldsymb', 'newsymb']]

In [40]:
symbols

Unnamed: 0,oldsymb,newsymb
0,MIR1302-10,MIR1302-2HG
1,AP006222.2,AP006222.1
2,RP4-669L17.10,RP4-669L17.4
3,RP5-857K21.4,RP11-206L10.17
4,RP11-206L10.9,LINC01409
...,...,...
4064,C21orf67,LINC01547
4065,AP001505.9,LINC00165
4066,PRED58,RP5-1023B21.1
4067,AL133493.2,LINC01694


In [41]:
adata.var

Unnamed: 0_level_0,name,gene_ids,gene_symbols
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000243485,MIR1302-10,ENSG00000243485,MIR1302-10
ENSG00000237613,FAM138A,ENSG00000237613,FAM138A
ENSG00000186092,OR4F5,ENSG00000186092,OR4F5
ENSG00000238009,RP11-34P13.7,ENSG00000238009,RP11-34P13.7
ENSG00000239945,RP11-34P13.8,ENSG00000239945,RP11-34P13.8
...,...,...,...
ENSG00000212907,MT-ND4L,ENSG00000212907,MT-ND4L
ENSG00000198886,MT-ND4,ENSG00000198886,MT-ND4
ENSG00000198786,MT-ND5,ENSG00000198786,MT-ND5
ENSG00000198695,MT-ND6,ENSG00000198695,MT-ND6


In [42]:
del adata.var['gene_ids']

In [43]:
for i in tqdm(range(0,len(symbols))):
    idxx = np.where(adata.var['gene_symbols'].isin([symbols['oldsymb'][i]]))[0][0]
    adata.var['gene_symbols'][idxx] = symbols['newsymb'][i]

100%|██████████| 4069/4069 [00:04<00:00, 852.63it/s]


In [44]:
raw = raw.transpose()

In [45]:
raw = raw.set_axis(list(adata.var_names), axis = 1, inplace=False)

In [46]:
raw

Unnamed: 0,ENSG00000243485,ENSG00000237613,ENSG00000186092,ENSG00000238009,ENSG00000239945,ENSG00000239906,ENSG00000241599,ENSG00000228463,ENSG00000237094,ENSG00000236601,...,ENSG00000198712,ENSG00000228253,ENSG00000198899,ENSG00000198938,ENSG00000198840,ENSG00000212907,ENSG00000198886,ENSG00000198786,ENSG00000198695,ENSG00000198727
101CV0ng_AAACCTGAGGTTCCTA-1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,2,0,0,1,0,0,0
101CV0ng_AAACCTGCAAGGCTCC-1,0,0,0,0,0,0,0,0,0,0,...,0,0,4,6,6,0,2,1,0,5
101CV0ng_AAACCTGCACGAAATA-1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,2,0,0,0,0,0,0
101CV0ng_AAACCTGCAGCTATTG-1,0,0,0,0,0,0,0,0,0,0,...,2,0,0,3,1,0,0,1,0,1
101CV0ng_AAACCTGGTGATGCCC-1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZM3KACng_TTTGTCACAAATACAG-1,0,0,0,0,0,0,0,0,0,0,...,6,0,3,10,7,0,7,0,0,7
ZM3KACng_TTTGTCACAGAGTGTG-1,0,0,0,0,0,0,0,0,0,0,...,50,0,14,73,39,3,29,3,0,21
ZM3KACng_TTTGTCACAGTTTACG-1,0,0,0,0,0,0,0,0,0,0,...,7,0,4,5,8,0,7,1,0,4
ZM3KACng_TTTGTCACATCCGCGA-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0


In [47]:
raww = adata.copy()

In [48]:
raww.X = scipy.sparse.csr_matrix(raw)

In [49]:
del raww.obs
del raww.uns
del raww.obsm
del raww.varm
del raww.var

In [50]:
adata.var['feature_biotype'] = ["gene"] * len(adata.var)

In [51]:
raww.var['feature_biotype'] = ["gene"] * len(raww.var)

In [52]:
adata.raw = raww

---

---

## 3. Apply the schema requirements:

# <ins>X

Raw and normal count matrices are in the object, no additional adjustment is needed!

# <ins>obs

In [53]:
adata.obs.columns

Index(['orig.ident', 'nCount_RNA', 'nFeature_RNA', 'sex', 'age', 'smoking',
       'packyears', 'percent.mito', 'CellType'],
      dtype='object')

## assay_ontology_term_id

Assay used in this study is 10x 5' v1. Publication

10x 3' v2	"EFO:0009899"  
10x 3' v3	"EFO:0009922"  
10x 5' v1	"EFO:0011025"  
Smart-seq	"EFO:0008930"  
Smart-seq2	"EFO:0008931"

In [54]:
adata.obs['assay_ontology_term_id'] = ["EFO:0009899"] * len(adata.obs)

In [55]:
adata.obs['assay_ontology_term_id']

index
101CV0ng_AAACCTGAGGTTCCTA-1    EFO:0009899
101CV0ng_AAACCTGCAAGGCTCC-1    EFO:0009899
101CV0ng_AAACCTGCACGAAATA-1    EFO:0009899
101CV0ng_AAACCTGCAGCTATTG-1    EFO:0009899
101CV0ng_AAACCTGGTGATGCCC-1    EFO:0009899
                                  ...     
ZM3KACng_TTTGTCACAAATACAG-1    EFO:0009899
ZM3KACng_TTTGTCACAGAGTGTG-1    EFO:0009899
ZM3KACng_TTTGTCACAGTTTACG-1    EFO:0009899
ZM3KACng_TTTGTCACATCCGCGA-1    EFO:0009899
ZM3KACng_TTTGTCACATGGATGG-1    EFO:0009899
Name: assay_ontology_term_id, Length: 39778, dtype: object

In [56]:
adata.obs['assay_ontology_term_id'] = pd.Categorical(adata.obs['assay_ontology_term_id'])

In [57]:
adata.obs['assay_ontology_term_id']

index
101CV0ng_AAACCTGAGGTTCCTA-1    EFO:0009899
101CV0ng_AAACCTGCAAGGCTCC-1    EFO:0009899
101CV0ng_AAACCTGCACGAAATA-1    EFO:0009899
101CV0ng_AAACCTGCAGCTATTG-1    EFO:0009899
101CV0ng_AAACCTGGTGATGCCC-1    EFO:0009899
                                  ...     
ZM3KACng_TTTGTCACAAATACAG-1    EFO:0009899
ZM3KACng_TTTGTCACAGAGTGTG-1    EFO:0009899
ZM3KACng_TTTGTCACAGTTTACG-1    EFO:0009899
ZM3KACng_TTTGTCACATCCGCGA-1    EFO:0009899
ZM3KACng_TTTGTCACATGGATGG-1    EFO:0009899
Name: assay_ontology_term_id, Length: 39778, dtype: category
Categories (1, object): ['EFO:0009899']

## cell_type_ontology_term_id

In [58]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['CellType']

In [59]:
list(adata.obs['CellType'].unique())

['AT2',
 'Immuno_Monocytes',
 'AT1',
 'Endothelial',
 'Fibroblasts',
 'Club',
 'LymphaticEndothelium',
 'Ciliated',
 'Immuno_TCells']

In [60]:
adata.obs['cell_type_ontology_term_id']

index
101CV0ng_AAACCTGAGGTTCCTA-1                 AT2
101CV0ng_AAACCTGCAAGGCTCC-1                 AT2
101CV0ng_AAACCTGCACGAAATA-1    Immuno_Monocytes
101CV0ng_AAACCTGCAGCTATTG-1                 AT2
101CV0ng_AAACCTGGTGATGCCC-1    Immuno_Monocytes
                                     ...       
ZM3KACng_TTTGTCACAAATACAG-1                Club
ZM3KACng_TTTGTCACAGAGTGTG-1    Immuno_Monocytes
ZM3KACng_TTTGTCACAGTTTACG-1       Immuno_TCells
ZM3KACng_TTTGTCACATCCGCGA-1                 AT2
ZM3KACng_TTTGTCACATGGATGG-1                 AT1
Name: cell_type_ontology_term_id, Length: 39778, dtype: category
Categories (9, object): ['AT2', 'Immuno_Monocytes', 'AT1', 'Endothelial', ..., 'Fibroblasts', 'Club', 'Immuno_TCells', 'LymphaticEndothelium']

In [61]:
mapping = {'AT2': 'CL:0002063',
 'Immuno_Monocytes': 'CL:0000576',
 'AT1': 'CL:0002062',
 'Endothelial': 'CL:0000115',
 'Fibroblasts': 'CL:0002553',
 'Club': 'CL:0000158',
 'LymphaticEndothelium': 'CL:0002138',
 'Ciliated': 'CL:1000271',
 'Immuno_TCells': 'CL:0000084'}

In [62]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_type_ontology_term_id'].map(mapping)

In [63]:
adata.obs['cell_type_ontology_term_id']

index
101CV0ng_AAACCTGAGGTTCCTA-1    CL:0002063
101CV0ng_AAACCTGCAAGGCTCC-1    CL:0002063
101CV0ng_AAACCTGCACGAAATA-1    CL:0000576
101CV0ng_AAACCTGCAGCTATTG-1    CL:0002063
101CV0ng_AAACCTGGTGATGCCC-1    CL:0000576
                                  ...    
ZM3KACng_TTTGTCACAAATACAG-1    CL:0000158
ZM3KACng_TTTGTCACAGAGTGTG-1    CL:0000576
ZM3KACng_TTTGTCACAGTTTACG-1    CL:0000084
ZM3KACng_TTTGTCACATCCGCGA-1    CL:0002063
ZM3KACng_TTTGTCACATGGATGG-1    CL:0002062
Name: cell_type_ontology_term_id, Length: 39778, dtype: category
Categories (9, object): ['CL:0002063', 'CL:0000576', 'CL:0002062', 'CL:0000115', ..., 'CL:0002553', 'CL:0000158', 'CL:0000084', 'CL:0002138']

## development_stage_ontology_term_id

In [64]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['age']

In [65]:
adata.obs['development_stage_ontology_term_id']

index
101CV0ng_AAACCTGAGGTTCCTA-1    75.0
101CV0ng_AAACCTGCAAGGCTCC-1    75.0
101CV0ng_AAACCTGCACGAAATA-1    75.0
101CV0ng_AAACCTGCAGCTATTG-1    75.0
101CV0ng_AAACCTGGTGATGCCC-1    75.0
                               ... 
ZM3KACng_TTTGTCACAAATACAG-1    79.0
ZM3KACng_TTTGTCACAGAGTGTG-1    79.0
ZM3KACng_TTTGTCACAGTTTACG-1    79.0
ZM3KACng_TTTGTCACATCCGCGA-1    79.0
ZM3KACng_TTTGTCACATGGATGG-1    79.0
Name: development_stage_ontology_term_id, Length: 39778, dtype: float64

In [66]:
adata.obs['development_stage_ontology_term_id'].unique()

array([75., 50., 47., 45., 49., 76., 44., 46., 56., 79.])

In [67]:
mapping = {75.0: 'HsapDv:0000169', 50.0: 'HsapDv:0000144', 47.0: 'HsapDv:0000141', 45.0: 'HsapDv:0000139', 49.0: 'HsapDv:0000143', 
           76.0: 'HsapDv:0000170', 44.0: 'HsapDv:0000138', 46.0: 'HsapDv:0000140', 56.0: 'HsapDv:0000150', 79.0: 'HsapDv:0000173'}

In [68]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['development_stage_ontology_term_id'].map(mapping)

In [69]:
adata.obs['development_stage_ontology_term_id']

index
101CV0ng_AAACCTGAGGTTCCTA-1    HsapDv:0000169
101CV0ng_AAACCTGCAAGGCTCC-1    HsapDv:0000169
101CV0ng_AAACCTGCACGAAATA-1    HsapDv:0000169
101CV0ng_AAACCTGCAGCTATTG-1    HsapDv:0000169
101CV0ng_AAACCTGGTGATGCCC-1    HsapDv:0000169
                                    ...      
ZM3KACng_TTTGTCACAAATACAG-1    HsapDv:0000173
ZM3KACng_TTTGTCACAGAGTGTG-1    HsapDv:0000173
ZM3KACng_TTTGTCACAGTTTACG-1    HsapDv:0000173
ZM3KACng_TTTGTCACATCCGCGA-1    HsapDv:0000173
ZM3KACng_TTTGTCACATGGATGG-1    HsapDv:0000173
Name: development_stage_ontology_term_id, Length: 39778, dtype: object

## disease_ontology_term_id

In [70]:
adata.obs['disease_ontology_term_id'] = ['PATO:0000461'] * len(adata.obs)

In [71]:
adata.obs['disease_ontology_term_id']

index
101CV0ng_AAACCTGAGGTTCCTA-1    PATO:0000461
101CV0ng_AAACCTGCAAGGCTCC-1    PATO:0000461
101CV0ng_AAACCTGCACGAAATA-1    PATO:0000461
101CV0ng_AAACCTGCAGCTATTG-1    PATO:0000461
101CV0ng_AAACCTGGTGATGCCC-1    PATO:0000461
                                   ...     
ZM3KACng_TTTGTCACAAATACAG-1    PATO:0000461
ZM3KACng_TTTGTCACAGAGTGTG-1    PATO:0000461
ZM3KACng_TTTGTCACAGTTTACG-1    PATO:0000461
ZM3KACng_TTTGTCACATCCGCGA-1    PATO:0000461
ZM3KACng_TTTGTCACATGGATGG-1    PATO:0000461
Name: disease_ontology_term_id, Length: 39778, dtype: object

In [72]:
adata.obs['disease_ontology_term_id']

index
101CV0ng_AAACCTGAGGTTCCTA-1    PATO:0000461
101CV0ng_AAACCTGCAAGGCTCC-1    PATO:0000461
101CV0ng_AAACCTGCACGAAATA-1    PATO:0000461
101CV0ng_AAACCTGCAGCTATTG-1    PATO:0000461
101CV0ng_AAACCTGGTGATGCCC-1    PATO:0000461
                                   ...     
ZM3KACng_TTTGTCACAAATACAG-1    PATO:0000461
ZM3KACng_TTTGTCACAGAGTGTG-1    PATO:0000461
ZM3KACng_TTTGTCACAGTTTACG-1    PATO:0000461
ZM3KACng_TTTGTCACATCCGCGA-1    PATO:0000461
ZM3KACng_TTTGTCACATGGATGG-1    PATO:0000461
Name: disease_ontology_term_id, Length: 39778, dtype: object

## ethnicity_ontology_term_id

In [73]:
adata.obs['ethnicity_ontology_term_id'] = ['unknown'] * len(adata.obs)

In [74]:
adata.obs['ethnicity_ontology_term_id']

index
101CV0ng_AAACCTGAGGTTCCTA-1    unknown
101CV0ng_AAACCTGCAAGGCTCC-1    unknown
101CV0ng_AAACCTGCACGAAATA-1    unknown
101CV0ng_AAACCTGCAGCTATTG-1    unknown
101CV0ng_AAACCTGGTGATGCCC-1    unknown
                                ...   
ZM3KACng_TTTGTCACAAATACAG-1    unknown
ZM3KACng_TTTGTCACAGAGTGTG-1    unknown
ZM3KACng_TTTGTCACAGTTTACG-1    unknown
ZM3KACng_TTTGTCACATCCGCGA-1    unknown
ZM3KACng_TTTGTCACATGGATGG-1    unknown
Name: ethnicity_ontology_term_id, Length: 39778, dtype: object

## is_primary_data

In [75]:
adata.obs['is_primary_data'] = [True] * len(adata.obs)

In [76]:
adata.obs['is_primary_data']

index
101CV0ng_AAACCTGAGGTTCCTA-1    True
101CV0ng_AAACCTGCAAGGCTCC-1    True
101CV0ng_AAACCTGCACGAAATA-1    True
101CV0ng_AAACCTGCAGCTATTG-1    True
101CV0ng_AAACCTGGTGATGCCC-1    True
                               ... 
ZM3KACng_TTTGTCACAAATACAG-1    True
ZM3KACng_TTTGTCACAGAGTGTG-1    True
ZM3KACng_TTTGTCACAGTTTACG-1    True
ZM3KACng_TTTGTCACATCCGCGA-1    True
ZM3KACng_TTTGTCACATGGATGG-1    True
Name: is_primary_data, Length: 39778, dtype: bool

In [77]:
adata.obs['is_primary_data'] = adata.obs['is_primary_data'].astype('bool')

## organism_ontology_term_id

In [78]:
adata.obs['organism_ontology_term_id'] = ['NCBITaxon:9606'] * len(adata.obs)

In [79]:
adata.obs['organism_ontology_term_id']

index
101CV0ng_AAACCTGAGGTTCCTA-1    NCBITaxon:9606
101CV0ng_AAACCTGCAAGGCTCC-1    NCBITaxon:9606
101CV0ng_AAACCTGCACGAAATA-1    NCBITaxon:9606
101CV0ng_AAACCTGCAGCTATTG-1    NCBITaxon:9606
101CV0ng_AAACCTGGTGATGCCC-1    NCBITaxon:9606
                                    ...      
ZM3KACng_TTTGTCACAAATACAG-1    NCBITaxon:9606
ZM3KACng_TTTGTCACAGAGTGTG-1    NCBITaxon:9606
ZM3KACng_TTTGTCACAGTTTACG-1    NCBITaxon:9606
ZM3KACng_TTTGTCACATCCGCGA-1    NCBITaxon:9606
ZM3KACng_TTTGTCACATGGATGG-1    NCBITaxon:9606
Name: organism_ontology_term_id, Length: 39778, dtype: object

In [80]:
adata.obs['organism_ontology_term_id'] = pd.Categorical(adata.obs['organism_ontology_term_id'])

In [81]:
adata.obs['organism_ontology_term_id']

index
101CV0ng_AAACCTGAGGTTCCTA-1    NCBITaxon:9606
101CV0ng_AAACCTGCAAGGCTCC-1    NCBITaxon:9606
101CV0ng_AAACCTGCACGAAATA-1    NCBITaxon:9606
101CV0ng_AAACCTGCAGCTATTG-1    NCBITaxon:9606
101CV0ng_AAACCTGGTGATGCCC-1    NCBITaxon:9606
                                    ...      
ZM3KACng_TTTGTCACAAATACAG-1    NCBITaxon:9606
ZM3KACng_TTTGTCACAGAGTGTG-1    NCBITaxon:9606
ZM3KACng_TTTGTCACAGTTTACG-1    NCBITaxon:9606
ZM3KACng_TTTGTCACATCCGCGA-1    NCBITaxon:9606
ZM3KACng_TTTGTCACATGGATGG-1    NCBITaxon:9606
Name: organism_ontology_term_id, Length: 39778, dtype: category
Categories (1, object): ['NCBITaxon:9606']

## sex_ontology_term_id

In [82]:
adata.obs['sex_ontology_term_id'] = adata.obs['sex']

In [83]:
adata.obs['sex_ontology_term_id']

index
101CV0ng_AAACCTGAGGTTCCTA-1    F
101CV0ng_AAACCTGCAAGGCTCC-1    F
101CV0ng_AAACCTGCACGAAATA-1    F
101CV0ng_AAACCTGCAGCTATTG-1    F
101CV0ng_AAACCTGGTGATGCCC-1    F
                              ..
ZM3KACng_TTTGTCACAAATACAG-1    F
ZM3KACng_TTTGTCACAGAGTGTG-1    F
ZM3KACng_TTTGTCACAGTTTACG-1    F
ZM3KACng_TTTGTCACATCCGCGA-1    F
ZM3KACng_TTTGTCACATGGATGG-1    F
Name: sex_ontology_term_id, Length: 39778, dtype: category
Categories (2, object): ['F', 'M']

**PATO_0000384 for male  
PATO_0000383 for female**

In [84]:
mapping = {'F':'PATO:0000383', 'M':'PATO:0000384'}

In [85]:
adata.obs['sex_ontology_term_id'] = adata.obs['sex_ontology_term_id'].map(mapping)

In [86]:
adata.obs['sex_ontology_term_id']

index
101CV0ng_AAACCTGAGGTTCCTA-1    PATO:0000383
101CV0ng_AAACCTGCAAGGCTCC-1    PATO:0000383
101CV0ng_AAACCTGCACGAAATA-1    PATO:0000383
101CV0ng_AAACCTGCAGCTATTG-1    PATO:0000383
101CV0ng_AAACCTGGTGATGCCC-1    PATO:0000383
                                   ...     
ZM3KACng_TTTGTCACAAATACAG-1    PATO:0000383
ZM3KACng_TTTGTCACAGAGTGTG-1    PATO:0000383
ZM3KACng_TTTGTCACAGTTTACG-1    PATO:0000383
ZM3KACng_TTTGTCACATCCGCGA-1    PATO:0000383
ZM3KACng_TTTGTCACATGGATGG-1    PATO:0000383
Name: sex_ontology_term_id, Length: 39778, dtype: category
Categories (2, object): ['PATO:0000383', 'PATO:0000384']

## tissue_ontology_term_id

In [87]:
adata.obs['tissue_ontology_term_id'] = ['UBERON:0002048'] * len(adata.obs)

In [88]:
adata.obs['tissue_ontology_term_id']

index
101CV0ng_AAACCTGAGGTTCCTA-1    UBERON:0002048
101CV0ng_AAACCTGCAAGGCTCC-1    UBERON:0002048
101CV0ng_AAACCTGCACGAAATA-1    UBERON:0002048
101CV0ng_AAACCTGCAGCTATTG-1    UBERON:0002048
101CV0ng_AAACCTGGTGATGCCC-1    UBERON:0002048
                                    ...      
ZM3KACng_TTTGTCACAAATACAG-1    UBERON:0002048
ZM3KACng_TTTGTCACAGAGTGTG-1    UBERON:0002048
ZM3KACng_TTTGTCACAGTTTACG-1    UBERON:0002048
ZM3KACng_TTTGTCACATCCGCGA-1    UBERON:0002048
ZM3KACng_TTTGTCACATGGATGG-1    UBERON:0002048
Name: tissue_ontology_term_id, Length: 39778, dtype: object

---

# <ins>var and raw.var

In [89]:
adata.var

Unnamed: 0_level_0,name,gene_symbols,feature_biotype
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000243485,MIR1302-10,MIR1302-2HG,gene
ENSG00000237613,FAM138A,FAM138A,gene
ENSG00000186092,OR4F5,OR4F5,gene
ENSG00000238009,RP11-34P13.7,RP11-34P13.7,gene
ENSG00000239945,RP11-34P13.8,RP11-34P13.8,gene
...,...,...,...
ENSG00000212907,MT-ND4L,MT-ND4L,gene
ENSG00000198886,MT-ND4,MT-ND4,gene
ENSG00000198786,MT-ND5,MT-ND5,gene
ENSG00000198695,MT-ND6,MT-ND6,gene


## feature_is_filtered

In [92]:
adata.var['feature_is_filtered'] = [False] * len(adata.var)

In [93]:
adata.var['feature_is_filtered']

gene_ids
ENSG00000243485    False
ENSG00000237613    False
ENSG00000186092    False
ENSG00000238009    False
ENSG00000239945    False
                   ...  
ENSG00000212907    False
ENSG00000198886    False
ENSG00000198786    False
ENSG00000198695    False
ENSG00000198727    False
Name: feature_is_filtered, Length: 30849, dtype: bool

# <ins>obsm

In [94]:
adata.obsm

AxisArrays with keys: X_pca, X_umap

# <ins>uns

In [95]:
adata.uns['schema_version'] = '2.0.0'

In [96]:
adata.uns['title'] = 'Lung'

In [97]:
adata.uns['X_normalization'] = 'Scanpy'

In [98]:
adata.uns['default_embedding'] = 'X_umap'

---

---

# 4. Final adjustments

Final adjustments include the removal of the slots that were already re-created for the schema.

In [99]:
adata

AnnData object with n_obs × n_vars = 39778 × 30849
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'sex', 'age', 'smoking', 'packyears', 'percent.mito', 'CellType', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id'
    var: 'name', 'gene_symbols', 'feature_biotype', 'feature_is_filtered'
    uns: 'schema_version', 'title', 'X_normalization', 'default_embedding'
    obsm: 'X_pca', 'X_umap'

In [100]:
# Removing the same slots created for the schema
del adata.obs['CellType']
del adata.obs['sex']
del adata.obs['age']

In [101]:
adata

AnnData object with n_obs × n_vars = 39778 × 30849
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'smoking', 'packyears', 'percent.mito', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id'
    var: 'name', 'gene_symbols', 'feature_biotype', 'feature_is_filtered'
    uns: 'schema_version', 'title', 'X_normalization', 'default_embedding'
    obsm: 'X_pca', 'X_umap'

In [102]:
adata.X = scipy.sparse.csr_matrix(adata.X)

In [7]:
adata.obs['donor_id'] = adata.obs['orig.ident']

In [4]:
adata.obs['suspension_type'] = ['cell'] * len(adata.obs)

In [8]:
del adata.obs['orig.ident']

In [103]:
adata.write('lukassen20_lung_f.h5ad', compression = 'gzip')

... storing 'development_stage_ontology_term_id' as categorical
... storing 'disease_ontology_term_id' as categorical
... storing 'ethnicity_ontology_term_id' as categorical
... storing 'tissue_ontology_term_id' as categorical
... storing 'gene_symbols' as categorical
... storing 'feature_biotype' as categorical
... storing 'feature_biotype' as categorical
