# **Curating `haniffa21.processed.h5ad` dataset:**

* includes ~648K cells
* biggest dataset in `covid19cellatlas.org`
* publication --> https://www.nature.com/articles/s41591-021-01329-2
* first dataset to be uploaded to cellxgene data portal!

# <ins>STEPS:</ins>

## 1. Load the dataset 

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc

In [None]:
adata = sc.read('./datasets/haniffa21.processed.h5ad')

In [3]:
adata

AnnData object with n_obs × n_vars = 647366 × 24929
    obs: 'sample_id', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'full_clustering', 'initial_clustering', 'Resample', 'Collection_Day', 'Sex', 'Age_interval', 'Swab_result', 'Status', 'Smoker', 'Status_on_day_collection', 'Status_on_day_collection_summary', 'Days_from_onset', 'Site', 'time_after_LPS', 'Worst_Clinical_Status', 'Outcome', 'patient_id'
    var: 'feature_types'
    uns: 'hvg', 'leiden', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap'
    layers: 'raw'

## 2. Pre-prepare before schema adjustments

### Remove 'Antibody Capture' rows

In [4]:
indices = [i for i, x in enumerate(adata.var['feature_types']) if x == "Gene Expression"] # Removing 'Antibody Capture'

In [5]:
adata = adata[:,indices]

  res = method(*args, **kwargs)


In [6]:
adata

View of AnnData object with n_obs × n_vars = 647366 × 24737
    obs: 'sample_id', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'full_clustering', 'initial_clustering', 'Resample', 'Collection_Day', 'Sex', 'Age_interval', 'Swab_result', 'Status', 'Smoker', 'Status_on_day_collection', 'Status_on_day_collection_summary', 'Days_from_onset', 'Site', 'time_after_LPS', 'Worst_Clinical_Status', 'Outcome', 'patient_id'
    var: 'feature_types'
    uns: 'hvg', 'leiden', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap'
    layers: 'raw'

### Remove the genes that do not match in gene-ensemblID file

In [7]:
# matching the genes in dataset with the genes in the file
geno = pd.read_csv('genes_homo_sapiens.csv', header = None, names = ['ENSEMBLid', 'gena','no'])
gena = pd.DataFrame(adata.var_names, columns = ['gena'])
gene = pd.merge(geno,gena, on='gena', how='inner')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [8]:
gene

Unnamed: 0,ENSEMBLid,gena,no
0,ENSG00000243485,MIR1302-2HG,5
1,ENSG00000177757,FAM87B,2
2,ENSG00000225880,LINC00115,5
3,ENSG00000230368,FAM41C,2
4,ENSG00000187634,SAMD11,13
...,...,...,...
19835,ENSG00000212907,MT-ND4L,2
19836,ENSG00000198886,MT-ND4,2
19837,ENSG00000198786,MT-ND5,2
19838,ENSG00000198695,MT-ND6,2


In [9]:
# finding the indices of existing genes and filtering nonexistent ones
x= [list(adata.var_names).index(i) for i in gene.loc[:,'gena']]

adata = adata[:,x]

In [15]:
del x

### Adjust index names for the schema

In [11]:
adata.var_names = gene['ENSEMBLid'].copy()
adata.var_names.name = 'feature_id'

adata.var = adata.var.rename(columns={'feature_types': 'feature_biotype'})
adata.var['feature_biotype'] = adata.var['feature_biotype'].map({'Gene Expression':'gene'})

### Remove non-unique genes (ask!)

In [16]:
adata.var_names_make_unique()

In [18]:
unique_genes_list = [name for name in adata.var_names if not name.endswith('-1')]
adata = adata[:, unique_genes_list]

### Copy the existing adata and prepare for raw!

In [19]:
adata2 = adata.copy()

In [20]:
adata

View of AnnData object with n_obs × n_vars = 647366 × 19821
    obs: 'sample_id', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'full_clustering', 'initial_clustering', 'Resample', 'Collection_Day', 'Sex', 'Age_interval', 'Swab_result', 'Status', 'Smoker', 'Status_on_day_collection', 'Status_on_day_collection_summary', 'Days_from_onset', 'Site', 'time_after_LPS', 'Worst_Clinical_Status', 'Outcome', 'patient_id'
    var: 'feature_biotype'
    uns: 'hvg', 'leiden', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap'
    layers: 'raw'

In [21]:
adata2

AnnData object with n_obs × n_vars = 647366 × 19821
    obs: 'sample_id', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'full_clustering', 'initial_clustering', 'Resample', 'Collection_Day', 'Sex', 'Age_interval', 'Swab_result', 'Status', 'Smoker', 'Status_on_day_collection', 'Status_on_day_collection_summary', 'Days_from_onset', 'Site', 'time_after_LPS', 'Worst_Clinical_Status', 'Outcome', 'patient_id'
    var: 'feature_biotype'
    uns: 'hvg', 'leiden', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap'
    layers: 'raw'

In [22]:
adata2.X = adata2.layers['raw'].copy()

In [23]:
del adata2.obs
del adata2.uns
del adata2.obsm
del adata2.layers

In [24]:
adata2

AnnData object with n_obs × n_vars = 647366 × 19821
    var: 'feature_biotype'

In [25]:
adata.raw = adata2

In [26]:
adata

AnnData object with n_obs × n_vars = 647366 × 19821
    obs: 'sample_id', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'full_clustering', 'initial_clustering', 'Resample', 'Collection_Day', 'Sex', 'Age_interval', 'Swab_result', 'Status', 'Smoker', 'Status_on_day_collection', 'Status_on_day_collection_summary', 'Days_from_onset', 'Site', 'time_after_LPS', 'Worst_Clinical_Status', 'Outcome', 'patient_id'
    var: 'feature_biotype'
    uns: 'hvg', 'leiden', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap'
    layers: 'raw'

In [None]:
adata.raw

---

### Example dataset that suits schema requirements:

In [27]:
adata1 = sc.read('./datasets/lung_portal_schema.h5ad')

In [28]:
adata1.obs

Unnamed: 0,Genes,Run,X,Y,Virus,Name,Treatment,Passch.Basal,Passch.Interm. basal.secr.,Passch.Secretory,...,Name3,tissue_ontology_term_id,assay_ontology_term_id,disease_ontology_term_id,development_stage_ontology_term_id,cell_type_ontology_term_id,ethnicity_ontology_term_id,sex_ontology_term_id,is_primary_data,organism_ontology_term_id
CV3_AAACGAACATAACAGA.1,962,CV3,-13.557582,-17.807835,,CIG_Ctrl,CIG,1.045434,0.0,0.457923,...,CS Basal,UBERON:0000115,EFO:0009922,PATO:0000461,HsapDv:0000087,CL:0000646,unknown,unknown,True,NCBITaxon:9606
CV3_AAACGAATCTCATGCC.1,1073,CV3,-10.283603,0.933573,,CIG_Ctrl,CIG,0.799446,0.0,0.573960,...,CS Basal,UBERON:0000115,EFO:0009922,PATO:0000461,HsapDv:0000087,CL:0000646,unknown,unknown,True,NCBITaxon:9606
CV3_AAACGCTCAAGCCTGC.1,1753,CV3,1.350025,-18.973487,,CIG_Ctrl,CIG,0.800865,0.0,0.387041,...,CS Basal,UBERON:0000115,EFO:0009922,PATO:0000461,HsapDv:0000087,CL:0000646,unknown,unknown,True,NCBITaxon:9606
CV3_AAACGCTCATGTAACC.1,837,CV3,4.078426,-24.264368,,CIG_Ctrl,CIG,0.690304,0.0,0.455314,...,CS Basal,UBERON:0000115,EFO:0009922,PATO:0000461,HsapDv:0000087,CL:0000646,unknown,unknown,True,NCBITaxon:9606
CV3_AAACGCTGTTGTCCCT.1,1308,CV3,-9.501638,-15.940682,,CIG_Ctrl,CIG,0.969025,0.0,0.373085,...,CS Basal,UBERON:0000115,EFO:0009922,PATO:0000461,HsapDv:0000087,CL:0000646,unknown,unknown,True,NCBITaxon:9606
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CV2_TTTGGTTAGTAGTCAA.1,962,CV2,-3.013199,12.830320,Covid,Mock_Covid,,0.366218,0.0,0.723494,...,No CS + Sars-Cov2 Secretory,UBERON:0000115,EFO:0009922,PATO:0000461,HsapDv:0000087,CL:0000151,unknown,unknown,True,NCBITaxon:9606
CV2_TTTGGTTAGTAGTGCG.1,844,CV2,11.066150,13.811140,Covid,Mock_Covid,,0.343824,0.0,0.820676,...,No CS + Sars-Cov2 Secretory,UBERON:0000115,EFO:0009922,PATO:0000461,HsapDv:0000087,CL:0000151,unknown,unknown,True,NCBITaxon:9606
CV2_TTTGGTTGTATCGCAT.1,1182,CV2,17.224610,10.050120,Covid,Mock_Covid,,0.625184,0.0,0.623866,...,No CS + Sars-Cov2 Secretory,UBERON:0000115,EFO:0009922,PATO:0000461,HsapDv:0000087,CL:0000151,unknown,unknown,True,NCBITaxon:9606
CV2_TTTGTTGAGGGCAACT.1,1096,CV2,-2.256609,12.197100,Covid,Mock_Covid,,0.509072,0.0,0.568154,...,No CS + Sars-Cov2 Secretory,UBERON:0000115,EFO:0009922,PATO:0000461,HsapDv:0000087,CL:0000151,unknown,unknown,True,NCBITaxon:9606


---

---

## 3. Apply the schema requirements:

# <ins>X

Raw and normal count matrices are in the object, no additional adjustment is needed!

# <ins>obs

In [29]:
adata.obs.columns

Index(['sample_id', 'n_genes', 'n_genes_by_counts', 'total_counts',
       'total_counts_mt', 'pct_counts_mt', 'full_clustering',
       'initial_clustering', 'Resample', 'Collection_Day', 'Sex',
       'Age_interval', 'Swab_result', 'Status', 'Smoker',
       'Status_on_day_collection', 'Status_on_day_collection_summary',
       'Days_from_onset', 'Site', 'time_after_LPS', 'Worst_Clinical_Status',
       'Outcome', 'patient_id'],
      dtype='object')

## assay_ontology_term_id

Assay used in this study is 10x 5' v1. Publication

In [30]:
adata.obs['assay_ontology_term_id'] = ["EFO:0011025"] * len(adata.obs)

In [31]:
adata.obs['assay_ontology_term_id']

covid_index
AAACCTGAGAAACCTA-MH9179824    EFO:0011025
AAACCTGAGAGTAATC-MH9179824    EFO:0011025
AAACCTGAGAGTGAGA-MH9179824    EFO:0011025
AAACCTGAGGAATCGC-MH9179824    EFO:0011025
AAACCTGAGTGTTGAA-MH9179824    EFO:0011025
                                 ...     
BGCV15_TTTGTCAAGGCGATAC-1     EFO:0011025
BGCV15_TTTGTCACAAGCCATT-1     EFO:0011025
BGCV15_TTTGTCACAGACACTT-1     EFO:0011025
BGCV15_TTTGTCAGTTACGGAG-1     EFO:0011025
BGCV15_TTTGTCATCGAATGGG-1     EFO:0011025
Name: assay_ontology_term_id, Length: 647366, dtype: object

In [32]:
adata.obs['assay_ontology_term_id'] = pd.Categorical(adata.obs['assay_ontology_term_id'])

In [33]:
adata.obs['assay_ontology_term_id']

covid_index
AAACCTGAGAAACCTA-MH9179824    EFO:0011025
AAACCTGAGAGTAATC-MH9179824    EFO:0011025
AAACCTGAGAGTGAGA-MH9179824    EFO:0011025
AAACCTGAGGAATCGC-MH9179824    EFO:0011025
AAACCTGAGTGTTGAA-MH9179824    EFO:0011025
                                 ...     
BGCV15_TTTGTCAAGGCGATAC-1     EFO:0011025
BGCV15_TTTGTCACAAGCCATT-1     EFO:0011025
BGCV15_TTTGTCACAGACACTT-1     EFO:0011025
BGCV15_TTTGTCAGTTACGGAG-1     EFO:0011025
BGCV15_TTTGTCATCGAATGGG-1     EFO:0011025
Name: assay_ontology_term_id, Length: 647366, dtype: category
Categories (1, object): ['EFO:0011025']

## cell_type_ontology_term_id

In [34]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['full_clustering']

In [35]:
adata.obs['cell_type_ontology_term_id']

covid_index
AAACCTGAGAAACCTA-MH9179824       CD8.TE
AAACCTGAGAGTAATC-MH9179824     CD4.IL22
AAACCTGAGAGTGAGA-MH9179824    CD8.Naive
AAACCTGAGGAATCGC-MH9179824    CD4.Naive
AAACCTGAGTGTTGAA-MH9179824       CD8.EM
                                ...    
BGCV15_TTTGTCAAGGCGATAC-1        CD4.CM
BGCV15_TTTGTCACAAGCCATT-1       NK_16hi
BGCV15_TTTGTCACAGACACTT-1        CD8.EM
BGCV15_TTTGTCAGTTACGGAG-1     CD4.Naive
BGCV15_TTTGTCATCGAATGGG-1       NK_16hi
Name: cell_type_ontology_term_id, Length: 647366, dtype: category
Categories (51, object): ['ASDC', 'B_exhausted', 'B_immature', 'B_malignant', ..., 'RBC', 'Treg', 'gdT', 'pDC']

In [36]:
#list(adata.obs['cell_type_ontology_term_id'].unique())

In [37]:
mapping = {'CD8.TE': 'CL:0001050', 'CD4.IL22': 'CL:0001042', 'CD8.Naive': 'CL:0000900', 'CD4.Naive': 'CL:0000895',
           'CD8.EM': 'CL:0000913', 'CD83_CD14_mono': 'CL:0001054', 'B_naive': 'CL:0000788', 'CD14_mono': 'CL:0001054', 
           'B_switched_memory': 'CL:0000972', 'NKT': 'CL:0000814', 'NK_16hi': 'CL:0000939', 'CD16_mono': 'CL:0000814', 
           'NK_prolif': 'CL:0000623', 'CD4.CM': 'CL:0000904', 'MAIT': 'CL:0000940', 'Plasma_cell_IgG':'CL:0000985',
           'CD4.Tfh': 'CL:0002038', 'NK_56hi': 'CL:0000938', 'C1_CD16_mono': 'CL:0002396', 'Plasmablast': 'CL:0000980',
           'gdT': 'CL:0000798', 'Platelets': 'CL:0000233', 'DC2': 'CL:0000782', 'Plasma_cell_IgA': 'CL:0000987',
           'B_non-switched_memory': 'CL:0000970', 
           'B_exhausted': 'CL:0000236',
 'CD4.EM': 'CL:0000905',
 'CD8.Prolif': 'CL:0000625',
 'B_immature': 'CL:0000816',
 'HSC_CD38pos': 'CL:0001024',
 'pDC': 'CL:0000784',
 'DC3': 'CL:0001056',
 'ASDC': 'CL:0000451',
 'Plasma_cell_IgM': 'CL:0000986',
 'ILC1_3': 'CL:0001077',
 'CD4.Prolif': 'CL:0000624',
 'DC1': 'CL:0000451',
 'Treg': 'CL:0000815',
 'CD4.Th1': 'CL:0000545',
 'RBC': 'CL:0000232',
 'CD4.Th2': 'CL:0000546',
 'DC_prolif': 'CL:0000451',
 'HSC_erythroid': 'CL:0001066',
 'ILC2': 'CL:0001081',
 'Mono_prolif': 'CL:0000576',
 'HSC_prolif': 'CL:0008001',
 'HSC_CD38neg': 'CL:0001024',
 'HSC_myeloid': 'CL:0000839',
 'HSC_MK': 'CL:0000556',
 'CD4.Th17': 'CL:0000899',
 'B_malignant': 'CL:0001064'}

In [39]:
# cell types from `initial_clustering`
['CD8',
 'CD4',
 'CD14',
 'B_cell',
 'NK_16hi',
 'Lymph_prolif',
 'Treg',
 'MAIT',
 'Plasmablast',
 'NK_56hi',
 'CD16',
 'Platelets',
 'DCs',
 'gdT',
 'HSC',
 'pDC',
 'RBC',
 'Mono_prolif']

['CD8',
 'CD4',
 'CD14',
 'B_cell',
 'NK_16hi',
 'Lymph_prolif',
 'Treg',
 'MAIT',
 'Plasmablast',
 'NK_56hi',
 'CD16',
 'Platelets',
 'DCs',
 'gdT',
 'HSC',
 'pDC',
 'RBC',
 'Mono_prolif']

In [40]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_type_ontology_term_id'].map(mapping)

In [41]:
adata.obs['cell_type_ontology_term_id']

covid_index
AAACCTGAGAAACCTA-MH9179824    CL:0001050
AAACCTGAGAGTAATC-MH9179824    CL:0001042
AAACCTGAGAGTGAGA-MH9179824    CL:0000900
AAACCTGAGGAATCGC-MH9179824    CL:0000895
AAACCTGAGTGTTGAA-MH9179824    CL:0000913
                                 ...    
BGCV15_TTTGTCAAGGCGATAC-1     CL:0000904
BGCV15_TTTGTCACAAGCCATT-1     CL:0000939
BGCV15_TTTGTCACAGACACTT-1     CL:0000913
BGCV15_TTTGTCAGTTACGGAG-1     CL:0000895
BGCV15_TTTGTCATCGAATGGG-1     CL:0000939
Name: cell_type_ontology_term_id, Length: 647366, dtype: object

## development_stage_ontology_term_id

In [42]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['Age_interval']

In [43]:
adata.obs['development_stage_ontology_term_id']

covid_index
AAACCTGAGAAACCTA-MH9179824    (50, 59]
AAACCTGAGAGTAATC-MH9179824    (50, 59]
AAACCTGAGAGTGAGA-MH9179824    (50, 59]
AAACCTGAGGAATCGC-MH9179824    (50, 59]
AAACCTGAGTGTTGAA-MH9179824    (50, 59]
                                ...   
BGCV15_TTTGTCAAGGCGATAC-1     (70, 79]
BGCV15_TTTGTCACAAGCCATT-1     (50, 59]
BGCV15_TTTGTCACAGACACTT-1     (50, 59]
BGCV15_TTTGTCAGTTACGGAG-1     (50, 59]
BGCV15_TTTGTCATCGAATGGG-1     (70, 79]
Name: development_stage_ontology_term_id, Length: 647366, dtype: category
Categories (8, object): ['(20, 29]', '(30, 39]', '(40, 49]', '(50, 59]', '(60, 69]', '(70, 79]', '(80, 89]', '(90, 99]']

In [44]:
mapping = {'(20, 29]': 'HsapDv:0000237', '(30, 39]': 'HsapDv:0000238', '(40, 49]': 'HsapDv:0000239', '(50, 59]': 'HsapDv:0000240', '(60, 69]': 'HsapDv:0000241', '(70, 79]': 'HsapDv:0000242', '(80, 89]': 'HsapDv:0000243', '(90, 99]': 'HsapDv:0000244'}

In [45]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['development_stage_ontology_term_id'].map(mapping)

In [46]:
adata.obs['development_stage_ontology_term_id']

covid_index
AAACCTGAGAAACCTA-MH9179824    HsapDv:0000240
AAACCTGAGAGTAATC-MH9179824    HsapDv:0000240
AAACCTGAGAGTGAGA-MH9179824    HsapDv:0000240
AAACCTGAGGAATCGC-MH9179824    HsapDv:0000240
AAACCTGAGTGTTGAA-MH9179824    HsapDv:0000240
                                   ...      
BGCV15_TTTGTCAAGGCGATAC-1     HsapDv:0000242
BGCV15_TTTGTCACAAGCCATT-1     HsapDv:0000240
BGCV15_TTTGTCACAGACACTT-1     HsapDv:0000240
BGCV15_TTTGTCAGTTACGGAG-1     HsapDv:0000240
BGCV15_TTTGTCATCGAATGGG-1     HsapDv:0000242
Name: development_stage_ontology_term_id, Length: 647366, dtype: category
Categories (8, object): ['HsapDv:0000237', 'HsapDv:0000238', 'HsapDv:0000239', 'HsapDv:0000240', 'HsapDv:0000241', 'HsapDv:0000242', 'HsapDv:0000243', 'HsapDv:0000244']

## disease_ontology_term_id

In [47]:
adata.obs['disease_ontology_term_id'] = adata.obs['Status']

In [48]:
adata.obs['disease_ontology_term_id']

covid_index
AAACCTGAGAAACCTA-MH9179824      Covid
AAACCTGAGAGTAATC-MH9179824      Covid
AAACCTGAGAGTGAGA-MH9179824      Covid
AAACCTGAGGAATCGC-MH9179824      Covid
AAACCTGAGTGTTGAA-MH9179824      Covid
                               ...   
BGCV15_TTTGTCAAGGCGATAC-1       Covid
BGCV15_TTTGTCACAAGCCATT-1     Healthy
BGCV15_TTTGTCACAGACACTT-1       Covid
BGCV15_TTTGTCAGTTACGGAG-1       Covid
BGCV15_TTTGTCATCGAATGGG-1       Covid
Name: disease_ontology_term_id, Length: 647366, dtype: category
Categories (4, object): ['Covid', 'Healthy', 'LPS', 'Non_covid']

In [49]:
mapping = {'Covid':'MONDO:0100096', 'Healthy':'PATO:0000461', 'LPS': 'PATO:0000461', 'Non_covid': 'MONDO:0000001'}

Since cells treated with LPS are healthy, they were assigned as healthy according to the reply from Slack.

In [50]:
adata.obs['disease_ontology_term_id'] = adata.obs['disease_ontology_term_id'].map(mapping)

In [51]:
adata.obs['disease_ontology_term_id']

covid_index
AAACCTGAGAAACCTA-MH9179824    MONDO:0100096
AAACCTGAGAGTAATC-MH9179824    MONDO:0100096
AAACCTGAGAGTGAGA-MH9179824    MONDO:0100096
AAACCTGAGGAATCGC-MH9179824    MONDO:0100096
AAACCTGAGTGTTGAA-MH9179824    MONDO:0100096
                                  ...      
BGCV15_TTTGTCAAGGCGATAC-1     MONDO:0100096
BGCV15_TTTGTCACAAGCCATT-1      PATO:0000461
BGCV15_TTTGTCACAGACACTT-1     MONDO:0100096
BGCV15_TTTGTCAGTTACGGAG-1     MONDO:0100096
BGCV15_TTTGTCATCGAATGGG-1     MONDO:0100096
Name: disease_ontology_term_id, Length: 647366, dtype: object

## ethnicity_ontology_term_id

In [52]:
adata.obs['ethnicity_ontology_term_id'] = ['unknown'] * len(adata.obs)

In [53]:
adata.obs['ethnicity_ontology_term_id']

covid_index
AAACCTGAGAAACCTA-MH9179824    unknown
AAACCTGAGAGTAATC-MH9179824    unknown
AAACCTGAGAGTGAGA-MH9179824    unknown
AAACCTGAGGAATCGC-MH9179824    unknown
AAACCTGAGTGTTGAA-MH9179824    unknown
                               ...   
BGCV15_TTTGTCAAGGCGATAC-1     unknown
BGCV15_TTTGTCACAAGCCATT-1     unknown
BGCV15_TTTGTCACAGACACTT-1     unknown
BGCV15_TTTGTCAGTTACGGAG-1     unknown
BGCV15_TTTGTCATCGAATGGG-1     unknown
Name: ethnicity_ontology_term_id, Length: 647366, dtype: object

## is_primary_data

In [54]:
adata.obs['is_primary_data'] = [True] * len(adata.obs)

In [55]:
adata.obs['is_primary_data']

covid_index
AAACCTGAGAAACCTA-MH9179824    True
AAACCTGAGAGTAATC-MH9179824    True
AAACCTGAGAGTGAGA-MH9179824    True
AAACCTGAGGAATCGC-MH9179824    True
AAACCTGAGTGTTGAA-MH9179824    True
                              ... 
BGCV15_TTTGTCAAGGCGATAC-1     True
BGCV15_TTTGTCACAAGCCATT-1     True
BGCV15_TTTGTCACAGACACTT-1     True
BGCV15_TTTGTCAGTTACGGAG-1     True
BGCV15_TTTGTCATCGAATGGG-1     True
Name: is_primary_data, Length: 647366, dtype: bool

In [56]:
adata.obs['is_primary_data'] = adata.obs['is_primary_data'].astype('bool')

## organism_ontology_term_id

In [57]:
adata.obs['organism_ontology_term_id'] = ['NCBITaxon:9606'] * len(adata.obs)

In [58]:
adata.obs['organism_ontology_term_id']

covid_index
AAACCTGAGAAACCTA-MH9179824    NCBITaxon:9606
AAACCTGAGAGTAATC-MH9179824    NCBITaxon:9606
AAACCTGAGAGTGAGA-MH9179824    NCBITaxon:9606
AAACCTGAGGAATCGC-MH9179824    NCBITaxon:9606
AAACCTGAGTGTTGAA-MH9179824    NCBITaxon:9606
                                   ...      
BGCV15_TTTGTCAAGGCGATAC-1     NCBITaxon:9606
BGCV15_TTTGTCACAAGCCATT-1     NCBITaxon:9606
BGCV15_TTTGTCACAGACACTT-1     NCBITaxon:9606
BGCV15_TTTGTCAGTTACGGAG-1     NCBITaxon:9606
BGCV15_TTTGTCATCGAATGGG-1     NCBITaxon:9606
Name: organism_ontology_term_id, Length: 647366, dtype: object

In [59]:
adata.obs['organism_ontology_term_id'] = pd.Categorical(adata.obs['organism_ontology_term_id'])

In [60]:
adata.obs['organism_ontology_term_id']

covid_index
AAACCTGAGAAACCTA-MH9179824    NCBITaxon:9606
AAACCTGAGAGTAATC-MH9179824    NCBITaxon:9606
AAACCTGAGAGTGAGA-MH9179824    NCBITaxon:9606
AAACCTGAGGAATCGC-MH9179824    NCBITaxon:9606
AAACCTGAGTGTTGAA-MH9179824    NCBITaxon:9606
                                   ...      
BGCV15_TTTGTCAAGGCGATAC-1     NCBITaxon:9606
BGCV15_TTTGTCACAAGCCATT-1     NCBITaxon:9606
BGCV15_TTTGTCACAGACACTT-1     NCBITaxon:9606
BGCV15_TTTGTCAGTTACGGAG-1     NCBITaxon:9606
BGCV15_TTTGTCATCGAATGGG-1     NCBITaxon:9606
Name: organism_ontology_term_id, Length: 647366, dtype: category
Categories (1, object): ['NCBITaxon:9606']

## sex_ontology_term_id

In [61]:
adata.obs['sex_ontology_term_id'] = adata.obs['Sex']

In [62]:
adata.obs['sex_ontology_term_id']

covid_index
AAACCTGAGAAACCTA-MH9179824    Female
AAACCTGAGAGTAATC-MH9179824    Female
AAACCTGAGAGTGAGA-MH9179824    Female
AAACCTGAGGAATCGC-MH9179824    Female
AAACCTGAGTGTTGAA-MH9179824    Female
                               ...  
BGCV15_TTTGTCAAGGCGATAC-1       Male
BGCV15_TTTGTCACAAGCCATT-1       Male
BGCV15_TTTGTCACAGACACTT-1       Male
BGCV15_TTTGTCAGTTACGGAG-1       Male
BGCV15_TTTGTCATCGAATGGG-1       Male
Name: sex_ontology_term_id, Length: 647366, dtype: category
Categories (2, object): ['Female', 'Male']

**PATO_0000384 for male  
PATO_0000383 for female**

In [63]:
mapping = {'Female':'PATO:0000383', 'Male':'PATO:0000384'}

In [64]:
adata.obs['sex_ontology_term_id'] = adata.obs['sex_ontology_term_id'].map(mapping)

In [65]:
adata.obs['sex_ontology_term_id']

covid_index
AAACCTGAGAAACCTA-MH9179824    PATO:0000383
AAACCTGAGAGTAATC-MH9179824    PATO:0000383
AAACCTGAGAGTGAGA-MH9179824    PATO:0000383
AAACCTGAGGAATCGC-MH9179824    PATO:0000383
AAACCTGAGTGTTGAA-MH9179824    PATO:0000383
                                  ...     
BGCV15_TTTGTCAAGGCGATAC-1     PATO:0000384
BGCV15_TTTGTCACAAGCCATT-1     PATO:0000384
BGCV15_TTTGTCACAGACACTT-1     PATO:0000384
BGCV15_TTTGTCAGTTACGGAG-1     PATO:0000384
BGCV15_TTTGTCATCGAATGGG-1     PATO:0000384
Name: sex_ontology_term_id, Length: 647366, dtype: category
Categories (2, object): ['PATO:0000383', 'PATO:0000384']

## tissue_ontology_term_id

In [66]:
adata.obs['tissue_ontology_term_id'] = ['UBERON:0000178'] * len(adata.obs)

In [67]:
adata.obs['tissue_ontology_term_id']

covid_index
AAACCTGAGAAACCTA-MH9179824    UBERON:0000178
AAACCTGAGAGTAATC-MH9179824    UBERON:0000178
AAACCTGAGAGTGAGA-MH9179824    UBERON:0000178
AAACCTGAGGAATCGC-MH9179824    UBERON:0000178
AAACCTGAGTGTTGAA-MH9179824    UBERON:0000178
                                   ...      
BGCV15_TTTGTCAAGGCGATAC-1     UBERON:0000178
BGCV15_TTTGTCACAAGCCATT-1     UBERON:0000178
BGCV15_TTTGTCACAGACACTT-1     UBERON:0000178
BGCV15_TTTGTCAGTTACGGAG-1     UBERON:0000178
BGCV15_TTTGTCATCGAATGGG-1     UBERON:0000178
Name: tissue_ontology_term_id, Length: 647366, dtype: object

---

# <ins>var and raw.var

In [68]:
adata.var

Unnamed: 0_level_0,feature_biotype
feature_id,Unnamed: 1_level_1
ENSG00000243485,gene
ENSG00000177757,gene
ENSG00000225880,gene
ENSG00000230368,gene
ENSG00000187634,gene
...,...
ENSG00000212907,gene
ENSG00000198886,gene
ENSG00000198786,gene
ENSG00000198695,gene


## feature_biotype

Adjusted at the beginning!

In [69]:
#adata2.var['feature_biotype'] = ['gene'] * len(adata2.var.index)

## feature_id

Adjusted at the beginning!

In [70]:
#adata2.var_names = adata2.var["gene_ids"].copy()
#adata2.var_names.name = "feature_id"
#adata2.raw.var_names = adata2.raw.var["gene_ids"].copy()
#adata2.raw.var_names.name = "feature_id"

In [71]:
adata2.var.index

Index(['ENSG00000243485', 'ENSG00000177757', 'ENSG00000225880',
       'ENSG00000230368', 'ENSG00000187634', 'ENSG00000188976',
       'ENSG00000187961', 'ENSG00000187583', 'ENSG00000187642',
       'ENSG00000188290',
       ...
       'ENSG00000198712', 'ENSG00000228253', 'ENSG00000198899',
       'ENSG00000198938', 'ENSG00000198840', 'ENSG00000212907',
       'ENSG00000198886', 'ENSG00000198786', 'ENSG00000198695',
       'ENSG00000198727'],
      dtype='object', name='feature_id', length=19821)

## feature_is_filtered

True if feature was filtered out in final matrix (X)  
Problem: This should be annotated in 'var', not 'raw.var'. __How to annotate the gene if it is already filtered out in the final matrix?__

In [72]:
adata.var['feature_is_filtered'] = [False] * len(adata.var)

In [73]:
adata.var['feature_is_filtered']

feature_id
ENSG00000243485    False
ENSG00000177757    False
ENSG00000225880    False
ENSG00000230368    False
ENSG00000187634    False
                   ...  
ENSG00000212907    False
ENSG00000198886    False
ENSG00000198786    False
ENSG00000198695    False
ENSG00000198727    False
Name: feature_is_filtered, Length: 19821, dtype: bool

# <ins>obsm

In [74]:
adata.obsm

AxisArrays with keys: X_pca, X_pca_harmony, X_umap

# <ins>uns

In [75]:
adata.uns['schema_version'] = '2.0.0'

In [76]:
adata.uns['title'] = 'COVID-19 PBMC Ncl-Cambridge-UCL'

In [77]:
adata.uns['X_normalization'] = 'Scanpy'

In [78]:
adata.uns['default_embedding'] = 'X_umap'

---

---

# 4. Final adjustments

Final adjustments include the removal of the slots that were already re-created for the schema.

In [79]:
adata

AnnData object with n_obs × n_vars = 647366 × 19821
    obs: 'sample_id', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'full_clustering', 'initial_clustering', 'Resample', 'Collection_Day', 'Sex', 'Age_interval', 'Swab_result', 'Status', 'Smoker', 'Status_on_day_collection', 'Status_on_day_collection_summary', 'Days_from_onset', 'Site', 'time_after_LPS', 'Worst_Clinical_Status', 'Outcome', 'patient_id', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id'
    var: 'feature_biotype', 'feature_is_filtered'
    uns: 'hvg', 'leiden', 'neighbors', 'pca', 'umap', 'schema_version', 'title', 'X_normalization', 'default_embedding'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap'
    layers: 'raw'

In [80]:
adata.obs['author_cell_type'] = adata.obs['full_clustering']

In [81]:
adata

AnnData object with n_obs × n_vars = 647366 × 19821
    obs: 'sample_id', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'full_clustering', 'initial_clustering', 'Resample', 'Collection_Day', 'Sex', 'Age_interval', 'Swab_result', 'Status', 'Smoker', 'Status_on_day_collection', 'Status_on_day_collection_summary', 'Days_from_onset', 'Site', 'time_after_LPS', 'Worst_Clinical_Status', 'Outcome', 'patient_id', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'author_cell_type'
    var: 'feature_biotype', 'feature_is_filtered'
    uns: 'hvg', 'leiden', 'neighbors', 'pca', 'umap', 'schema_version', 'title', 'X_normalization', 'default_embedding'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap'
    layers: 'raw'

In [82]:
# Removing the same slots created for the schema
del adata.obs['full_clustering']
del adata.obs['Sex']
del adata.obs['Age_interval']

In [83]:
adata

AnnData object with n_obs × n_vars = 647366 × 19821
    obs: 'sample_id', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'initial_clustering', 'Resample', 'Collection_Day', 'Swab_result', 'Smoker', 'Status_on_day_collection', 'Status_on_day_collection_summary', 'Days_from_onset', 'Site', 'time_after_LPS', 'Worst_Clinical_Status', 'Outcome', 'patient_id', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'author_cell_type'
    var: 'feature_biotype', 'feature_is_filtered'
    uns: 'hvg', 'leiden', 'neighbors', 'pca', 'umap', 'schema_version', 'title', 'X_normalization', 'default_embedding'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap'
    layers: 'raw'

In [84]:
adata.write('haniffa21_processed_curated2.h5ad')

... storing 'cell_type_ontology_term_id' as categorical
... storing 'disease_ontology_term_id' as categorical
... storing 'ethnicity_ontology_term_id' as categorical
... storing 'tissue_ontology_term_id' as categorical


# 5. Check if the file works

In [2]:
adata2 = sc.read('haniffa21_processed_curated.h5ad')

Variable names are not unique. To make them unique, call `.var_names_make_unique`.


In [3]:
adata2

AnnData object with n_obs × n_vars = 647366 × 19840
    obs: 'sample_id', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'initial_clustering', 'Resample', 'Collection_Day', 'Swab_result', 'Smoker', 'Status_on_day_collection', 'Status_on_day_collection_summary', 'Days_from_onset', 'Site', 'time_after_LPS', 'Worst_Clinical_Status', 'Outcome', 'patient_id', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'author_cell_type'
    var: 'feature_biotype', 'feature_is_filtered'
    uns: 'X_normalization', 'default_embedding', 'hvg', 'leiden', 'neighbors', 'pca', 'schema_version', 'title', 'umap'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap'
    layers: 'raw'