# Preparing adata files for cellxgene formatting

**Prerequisite: run notebooks 2 to 6 first.** This notebook loads the processed `adatas/*.h5` files from disk.

In [None]:
from cellassign import assign_cats
import gzip
import itertools as itl
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import os
import pandas as pd
import scanpy as sc
import scanpy.external as sce
import seaborn as sns
import triku as tk
import itertools

In [None]:
from functions import adata_plot_KOvsWT, stat_annot_gene
from functions import dict_WT_KO_colors

In [None]:
# Colormap for UMAP gene-expression plots: 80 evenly spaced samples of matplotlib's
# 'magma', with the first sample replaced by light grey and only the first 65
# samples kept.
magma = mpl.colors.LinearSegmentedColormap.from_list(
    "",
    [(0.88, 0.88, 0.88, 1)]
    + [plt.get_cmap('magma')(i) for i in np.linspace(0, 1, 80)][1:65],
)

In [None]:
# Notebook-wide display settings: figure resolution and untruncated
# DataFrame column display.
mpl.rcParams.update({'figure.dpi': 200})
pd.options.display.max_columns = None

## Load adata

In [None]:
# Per-compartment AnnData objects, read from disk in a fixed order.
adata_fb, adata_krt, adata_imm, adata_vasc = (
    sc.read(h5_path)
    for h5_path in (
        'adatas/adata_fibroblast.h5',
        'adatas/adata_keratinocyte.h5',
        'adatas/adata_immune.h5',
        'adatas/adata_vasculature.h5',
    )
)
# Whole-dataset AnnData.
adata_all_harmony = sc.read_h5ad('adatas/adata_all_harmony.h5')

In [None]:
# Every AnnData that the loops below annotate and clean up.
list_adatas = [
    adata_all_harmony,
    adata_fb,
    adata_krt,
    adata_imm,
    adata_vasc,
]

## Setting obs values

In [None]:
# CELLxGENE obs fields that take the same value for every cell of every dataset.
# The original literal listed 'assay_ontology_term_id' twice ("EFO:0009922", then
# 'EFO:0008913'). Python keeps the last duplicate key, so 'EFO:0008913' was the
# value actually written; only that effective entry is kept here.
# NOTE(review): confirm 'EFO:0008913' (rather than "EFO:0009922") is the intended assay.
dict_vals = {'organism_ontology_term_id': 'NCBITaxon:10090',
             'tissue_ontology_term_id': 'UBERON:0015790',
             'assay_ontology_term_id': 'EFO:0008913',
             'self_reported_ethnicity_ontology_term_id': 'na',
             'development_stage_ontology_term_id': "MmusDv:0000046",
             'sex_ontology_term_id': 'unknown',
             'suspension_type': 'cell'}

for adata in list_adatas:
    # Constant string fields are stored as categoricals.
    for key, val in dict_vals.items():
        adata.obs[key] = val
        adata.obs[key] = adata.obs[key].astype('category')

    # The CELLxGENE schema types is_primary_data as bool, so it is kept as a
    # plain boolean column instead of being cast to category.
    adata.obs['is_primary_data'] = True

    # Batches whose name starts with 'WT' get 'PATO:0000461'; all others get 'MONDO:0006541'.
    disease_ontology_term_id = ['PATO:0000461' if i[:2] == 'WT' else 'MONDO:0006541' for i in adata.obs['batch']]
    adata.obs['disease_ontology_term_id'] = disease_ontology_term_id
    adata.obs['disease_ontology_term_id'] = adata.obs['disease_ontology_term_id'].astype('category')

    # Each batch is reported as its own donor. The explicit cast guarantees a
    # categorical column, which the summary report relies on (`.cat.categories`).
    adata.obs['donor_id'] = adata.obs['batch'].astype('category')

    # 'merged_cell_type' is copied to 'cell_type'; the ontology lookup reads it from there.
    adata.obs['cell_type'] = adata.obs['merged_cell_type']

## Setting cell type column based on ontology term id (if possible)

In [None]:
# Broad cell type label -> ontology term id.
# Both the numbered cluster labels and the plain names are accepted.
dict_general_ct = {
    # numbered cluster labels
    '1: FB': 'CL:0000057',
    '0: KRT': 'CL:0000312',
    '2: CHFB': 'CL:0000138',
    '3: VEC': 'CL:0000115',
    '4: LEC': 'CL:0002554',
    '5: PVC': 'CL:0000669',
    '6: SCH': 'CL:0002573',
    '8: LYM': 'CL:0001065',
    '9: NEU': 'CL:0000775',
    '10: APC': 'CL:0000145',
    '7: MC': 'CL:0000097',
    # plain names
    'Fibroblast': 'CL:0000057',
    'Keratinocyte': 'CL:0000312',
    'Chondrocyte': 'CL:0000138',
    'Endothelial': 'CL:0000115',
    'Lymphatic': 'CL:0002554',
    'Perivascular cell': 'CL:0000669',
    'Schwann cell': 'CL:0002573',
    'Lymphoid': 'CL:0001065',
    'Neutrophil': 'CL:0000775',
    'APC': 'CL:0000145',
    'Mast cell': 'CL:0000097',
}

# Immune subtype label -> ontology term id ('M1' and 'M2' share one term).
dict_immune = {
    'Act. neutrophil': 'CL:0000096',
    'B cell': 'CL:0000236',
    'Basophil': 'CL:0000767',
    'Dendritic cell': 'CL:0000451',
    'Langerhans cell': 'CL:0000453',
    'M1': 'CL:0000235',
    'M2': 'CL:0000235',
    'Mast cell': 'CL:0000097',
    'Monocyte': 'CL:0000576',
    'NK cell': 'CL:0000814',
    'Neutrophil': 'CL:0000775',
    'Plasma cell': 'CL:0000786',
    'T cell': 'CL:0000084',
}

# Vasculature subtype label -> ontology term id (a mix of UBERON and CL terms).
dict_vasc = {
    'Endo 0 (vein)': 'UBERON:0001638',
    'Endo 1 (capillary)': 'UBERON:0001982',
    'Endo 2 (capillary)': 'UBERON:0001982',
    'Endo 3': 'CL:0000115',
    'Endo 4 (artery)': 'UBERON:0001637',
    'Endo 5': 'CL:0000115',
    'Lymph 0': 'CL:0002554',
    'Lymph 1': 'CL:0002554',
    'Peri 0': 'CL:0000669',
    'Peri 1': 'CL:0000669',
    'Peri 2': 'CL:0000669',
    'Peri 3': 'CL:0000669',
    'Peri 4': 'CL:0000669',
    'Schwann cell': 'CL:0002573',
}

In [None]:
# Broad cell type: every label in obs['cell_type'] must be a key of dict_general_ct
# (an unknown label raises KeyError).
for adata in list_adatas:
    adata.obs['cell_type_ontology_term_id'] = list(map(dict_general_ct.__getitem__, adata.obs['cell_type']))
    adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_type_ontology_term_id'].astype('category')

# Immune and vasculature subtypes are translated through their own lookup tables.
for subset, lookup in ((adata_imm, dict_immune), (adata_vasc, dict_vasc)):
    subset.obs['subtype_ontology_term_id'] = list(map(lookup.__getitem__, subset.obs['subtype']))
    subset.obs['subtype_ontology_term_id'] = subset.obs['subtype_ontology_term_id'].astype('category')

# Keratinocyte and fibroblast subtypes all receive one fixed term per dataset.
for subset, term in ((adata_krt, "CL:0000312"), (adata_fb, "CL:0000057")):
    subset.obs['subtype_ontology_term_id'] = term
    subset.obs['subtype_ontology_term_id'] = subset.obs['subtype_ontology_term_id'].astype('category')

## Setting var values

In [None]:
# Constant feature annotations written to both var and raw.var.
dict_vals = {'feature_biotype': 'gene',
             'feature_reference': 'NCBITaxon:10090'}

for adata in list_adatas:
    var_frames = (adata.var, adata.raw.var)

    for frame in var_frames:
        for key, val in dict_vals.items():
            frame[key] = val
            frame[key] = frame[key].astype('category')

    # feature_is_filtered: nothing in var is filtered; a raw feature is flagged when
    # its name is absent from var. This must run while both indexes still hold the
    # original feature names, i.e. before the index swap below.
    adata.var['feature_is_filtered'] = False
    adata.raw.var['feature_is_filtered'] = ~adata.raw.var.index.isin(adata.var.index)

    # Keep the current index as a string 'feature_name' column.
    for frame in var_frames:
        frame['feature_name'] = frame.index
        frame['feature_name'] = frame['feature_name'].astype(str)

    # Re-index features by the 'Accession' column (Ensembl IDs).
    adata.var.index = adata.var['Accession'].values
    adata.raw.var.index = adata.raw.var['Accession'].values

    # Only var is de-duplicated here; raw.var keeps its index as assigned.
    adata.var_names_make_unique()

## Setting uns values

In [None]:
# Dataset-level metadata, identical for every AnnData.
dict_vals = {
    'schema_version': '3.0.0',
    'default_embedding': 'X_umap',
    'X_approximate_distribution': 'count',
    'title': 'Single cell RNA sequencing of paw skin from healthy and Col7a1 knockout (RDEB) mice',
    'batch_condition': 'batch',
}

for adata in list_adatas:
    adata.uns.update(dict_vals)

## Deleting unwanted columns

In [None]:
# obs columns to drop before export. Names are matched exactly (including the
# LaTeX-style `$...$` markers and any trailing spaces inside the quotes), and a
# column is only removed from an AnnData if it is present there.
# 'merged_cell_type' is in this list; its values were copied to obs['cell_type'] earlier.
obs_cols_to_delete = ['condition', 'status', 'is_KOD11', 'is_KOD12', 'is_WT1', 'is_WT2', 'leiden', 'Keratinocyte $Krt5^+$', 
                      'Keratinocyte $Krt10^+$', 'Keratinocyte $Lor^+$', 'Keratinocyte $Tbx1^+$', 'Keratinocyte $Krt28^+$', 
                      'Keratinocyte $Krt75^+$', 'Keratinocyte $Defb6^+$', 'Keratinocyte $Anln^+$', 'Keratinocyte $Cidea^+$', 
                      'Fibroblast $Cxcl12^+$', 'Fibroblast $Thbs4^+$', 'Fibroblast $Cxcl1^+$', 'Fibroblast $Clec3b^+$', 
                      'Fibroblast $Col8a1^+$', 'Fibroblast $Coch^+$', 'Fibroblast $Rab37^+$', 'Fibroblast $Chf^+$', 
                      'Fibroblast $Ptgs2^+$', 'Fibroblast $Serpine2^+$', 'Chondrocyte', 'Endothelial', 'Lymphatic', 
                      'Perivascular cell $Inpp4b^+$', 'Perivascular cell $Il6^+$', 'Schwann cell', 'Glial cell', 'Melanocyte', 
                      'Skeletal muscle', 'Red blood cell', 'T cell', 'T cell (ILC/gd)?', 'B cell', 'Plasma cell', 'NK cell', 
                      'Macrophage', 'Monocyte', 'Neutrophil', 'Neutrophil*', 'Dendritic cell', 'Langerhans cell', 'Mast cell', 
                      'cell_type_whole_std', 'cell_type_whole_mean', 'cell_type_whole_max', 'cell_type_whole_CV', 
                      'cell_type_whole_Keratinocyte $Krt5^+$', 'cell_type_whole_Keratinocyte $Krt10^+$', 
                      'cell_type_whole_Keratinocyte $Lor^+$', 'cell_type_whole_Keratinocyte $Tbx1^+$', 
                      'cell_type_whole_Keratinocyte $Krt28^+$', 'cell_type_whole_Keratinocyte $Krt75^+$', 
                      'cell_type_whole_Keratinocyte $Defb6^+$', 'cell_type_whole_Keratinocyte $Anln^+$', 
                      'cell_type_whole_Keratinocyte $Cidea^+$', 'cell_type_whole_Fibroblast $Cxcl12^+$', 
                      'cell_type_whole_Fibroblast $Thbs4^+$', 'cell_type_whole_Fibroblast $Cxcl1^+$', 
                      'cell_type_whole_Fibroblast $Clec3b^+$', 'cell_type_whole_Fibroblast $Col8a1^+$', 
                      'cell_type_whole_Fibroblast $Coch^+$', 'cell_type_whole_Fibroblast $Rab37^+$', 
                      'cell_type_whole_Fibroblast $Chf^+$', 'cell_type_whole_Fibroblast $Ptgs2^+$', 
                      'cell_type_whole_Fibroblast $Serpine2^+$', 'cell_type_whole_Chondrocyte', 'cell_type_whole_Endothelial', 
                      'cell_type_whole_Lymphatic', 'cell_type_whole_Perivascular cell $Inpp4b^+$', 
                      'cell_type_whole_Perivascular cell $Il6^+$', 'cell_type_whole_Schwann cell', 
                      'cell_type_whole_Glial cell', 'cell_type_whole_Melanocyte', 'cell_type_whole_Skeletal muscle', 
                      'cell_type_whole_Red blood cell', 'cell_type_whole_T cell', 'cell_type_whole_T cell (ILC/gd)?', 
                      'cell_type_whole_B cell', 'cell_type_whole_Plasma cell', 'cell_type_whole_NK cell', 
                      'cell_type_whole_Macrophage', 'cell_type_whole_Monocyte', 'cell_type_whole_Neutrophil', 
                      'cell_type_whole_Neutrophil*', 'cell_type_whole_Dendritic cell', 'cell_type_whole_Langerhans cell', 
                      'cell_type_whole_Mast cell', 'cell_type_whole', 'merged_cell_type', 'conditon-cell_type', 
                      'Fibroblast Ptgs2$^+$', 'Fibroblast Hilpda$^+$', 'Fibroblast Cxcl1$^+$', 'Fibroblast Cxcl12$^+$', 
                      'Fibroblast Ccn5$^+$', 'Fibroblast Ltbp2$^+$', 'Fibroblast Clec3b$^+$', 'Fibroblast Cilp2$^+$', 
                      'Fibroblast Coch$^+$', 'Fibroblast Rab37$^+$', 'Fibroblast Cfh$^+$', 'Fibroblast Serpine2$^+$', 
                      'subtype_std', 'subtype_mean', 'subtype_max', 'subtype_CV', 'subtype_Fibroblast Ptgs2$^+$', 
                      'subtype_Fibroblast Hilpda$^+$', 'subtype_Fibroblast Cxcl1$^+$', 'subtype_Fibroblast Cxcl12$^+$', 
                      'subtype_Fibroblast Ccn5$^+$', 'subtype_Fibroblast Ltbp2$^+$', 'subtype_Fibroblast Clec3b$^+$', 
                      'subtype_Fibroblast Cilp2$^+$', 'subtype_Fibroblast Coch$^+$', 'subtype_Fibroblast Rab37$^+$', 
                      'subtype_Fibroblast Cfh$^+$', 'subtype_Fibroblast Serpine2$^+$', 'score_glycolysis', 
                      'score_Complement', 'score_Cytokine', 'score_ECM', 'score_Hypoxia', 'score_OXPHOS', 'score_Glycolysis', 
                      'score_Stress', 
                     'Krt Krt27$^+$ | IRS1', 'Krt Nkd2$^+$ | IRS2-6', 'Krt Krt35$^+$ | CX', 'Krt Shisa2$^+$ | SB OL', 'Krt Il11ra1$^+$ | B OL', 
                      'Krt Id3$^+$ | GL', 'Krt Anln$^+$ | IFE C', 'Krt Krt5$^+$ | IFE B (1)', 'Krt Ifi202b$^+$ | IFE B (2)', 'Krt Krt10$^+$ | SB1', 
                      'Krt Krt78$^+$ | SB2', 'Krt Lor$^+$ | GR', 'Krt Defb6$^+$ | uHF SB', 'Krt Sprr1b$^+$ | uHF ???', 'Krt Cidea$^+$ | SG', 
                      'Krt Cd74$^+$ | IMM', 'Krt Krt75$^+$ | CHN (1)??? ', 'Krt Gpx2$^+$ | CHN (2)??? ', 'Krt Fxyd1$^+$', 'Krt Myh11$^+$', 
                      'Krt Krt18$^+$', 'subtype_Krt Krt27$^+$ | IRS1', 'subtype_Krt Nkd2$^+$ | IRS2-6', 'subtype_Krt Krt35$^+$ | CX', 
                      'subtype_Krt Shisa2$^+$ | SB OL', 'subtype_Krt Il11ra1$^+$ | B OL', 'subtype_Krt Id3$^+$ | GL', 
                      'subtype_Krt Anln$^+$ | IFE C', 'subtype_Krt Krt5$^+$ | IFE B (1)', 'subtype_Krt Ifi202b$^+$ | IFE B (2)', 
                      'subtype_Krt Krt10$^+$ | SB1', 'subtype_Krt Krt78$^+$ | SB2', 'subtype_Krt Lor$^+$ | GR', 'subtype_Krt Defb6$^+$ | uHF SB', 
                      'subtype_Krt Sprr1b$^+$ | uHF ???', 'subtype_Krt Cidea$^+$ | SG', 'subtype_Krt Cd74$^+$ | IMM', 
                      'subtype_Krt Krt75$^+$ | CHN (1)??? ', 'subtype_Krt Gpx2$^+$ | CHN (2)??? ', 'subtype_Krt Fxyd1$^+$', 
                      'subtype_Krt Myh11$^+$', 'subtype_Krt Krt18$^+$',
                      'M1', 'M2', 'Act. neutrophil', 'Basophil', 'subtype_T cell', 'subtype_B cell', 'subtype_Plasma cell', 
                      'subtype_NK cell', 'subtype_Langerhans cell', 'subtype_Dendritic cell', 'subtype_M1', 'subtype_M2', 
                      'subtype_Monocyte', 'subtype_Neutrophil', 'subtype_Act. neutrophil', 'subtype_Mast cell', 
                      'subtype_Basophil',
                      'Endo 0 (vein)', 'Endo 1 (capillary)', 'Endo 2 (capillary)', 'Endo 3', 'Endo 4 (artery)', 'Endo 5', 
                      'Peri 0', 'Peri 1', 'Peri 2', 'Peri 3', 'Peri 4', 'Lymph 0', 'Lymph 1', 'subtype_Endo 0 (vein)', 
                      'subtype_Endo 1 (capillary)', 'subtype_Endo 2 (capillary)', 'subtype_Endo 3', 'subtype_Endo 4 (artery)', 
                      'subtype_Endo 5', 'subtype_Peri 0', 'subtype_Peri 1', 'subtype_Peri 2', 'subtype_Peri 3', 
                      'subtype_Peri 4', 'subtype_Lymph 0', 'subtype_Lymph 1', 'subtype_Schwann cell', ]

# var / raw.var columns to drop before export. Each name appears once (the
# original list repeated 'MirBaseID'). 'Accession' is intentionally not listed:
# it is the column used to re-index the features.
var_cols_to_delete = ['AccessionVersion', 'Aliases', 'CcdsID', 'CosmicID', 'DnaBindingDomain', 'HgncID', 'IsTFi (TcoF-DB)',
                      'Location', 'LocationSortable', 'LocusGroup', 'MgdID', 'MirBaseID', 'OmimID', 'PubmedID',
                      'RefseqID', 'Regulates (TRRUST)', 'RgdID', 'Strand', 'UcscID', 'VegaID', 'mt',
                      'triku_distance', 'triku_distance_uncorrected', 'triku_highly_variable']

In [None]:
# Remove the listed columns wherever they exist. `intersection` yields only the
# names actually present, so absent columns are skipped silently.
for adata in list_adatas:
    for stale in adata.obs.columns.intersection(obs_cols_to_delete):
        del adata.obs[stale]

    for frame in (adata.var, adata.raw.var):
        for stale in frame.columns.intersection(var_cols_to_delete):
            del frame[stale]

In [None]:
# Show the AnnData summary to check which obs/var/uns fields remain.
adata_all_harmony

In [None]:
# Show the fibroblast AnnData summary to check which fields remain.
adata_fb

In [None]:
# Show the keratinocyte AnnData summary to check which fields remain.
adata_krt

In [None]:
# Show the immune AnnData summary to check which fields remain.
adata_imm

In [None]:
# Show the vasculature AnnData summary to check which fields remain.
adata_vasc

## Saving adatas

In [None]:
def _describe_for_cellxgene(name, adatax):
    """Return a plain-text summary of the CELLxGENE-relevant fields of one AnnData.

    Parameters
    ----------
    name : str
        Heading printed on the first line of the summary.
    adatax : AnnData
        Object whose `uns`, `X`, `raw`, `obs`, `obsm` and `var` are reported.

    Returns
    -------
    str
        One field per line, with sections separated by a blank line. The
        original cell concatenated every field without any separator, which
        produced unreadable text such as 'FIBROBLASTDataset-level metadata in
        unsschema_version: ...'.
    """
    uns_keys = ['schema_version', 'title', 'batch_condition', 'default_embedding']
    # Every obs field listed here is read through `.cat`, so it must be categorical.
    obs_keys = ['organism_ontology_term_id', 'tissue_ontology_term_id', 'assay_ontology_term_id',
                'disease_ontology_term_id', 'cell_type_ontology_term_id',
                'self_reported_ethnicity_ontology_term_id', 'development_stage_ontology_term_id',
                'sex_ontology_term_id', 'donor_id', 'suspension_type']

    lines = [name, 'Dataset-level metadata in uns']
    lines += ['{}: {}'.format(key, str(adatax.uns[key])) for key in uns_keys]

    lines += ['', 'Data in .X and raw.X']
    lines.append('adata.X: ' + str(adatax.X.shape) + ' ' + str(adatax.X.sum()))
    lines.append('adata.raw.X: ' + str(adatax.raw.X.shape) + ' ' + str(adatax.raw.X.sum()))

    lines += ['', 'Cells metadata in obs']
    lines += ['{}: {}'.format(key, str(adatax.obs[key].cat.categories.values)) for key in obs_keys]

    lines += ['', 'Embeddings in obsm', str(adatax.obsm)]

    lines += ['', 'Features in var']
    lines.append('var.index: ' + str(adatax.var.index))
    lines.append('var.columns: ' + str(adatax.var.columns))

    lines += ['', 'Features in var.raw']
    lines.append('.raw.var.index: ' + str(adatax.raw.var.index))
    lines.append('.raw.var.columns: ' + str(adatax.raw.var.columns))

    return '\n'.join(lines)


# One summary per dataset, each followed by a blank line.
text_cell_by_gene = ''
for name, adatax in zip(['FIBROBLAST', 'KERATINOCYTE', 'IMMUNE', 'VASCULATURE', 'ALL'],
                        [adata_fb, adata_krt, adata_imm, adata_vasc, adata_all_harmony]):
    text_cell_by_gene += _describe_for_cellxgene(name, adatax) + '\n\n'

# The context manager closes the file even if the write fails.
with open("adatas/cellxgene_output.txt", "w") as text_file:
    text_file.write(text_cell_by_gene)

In [None]:
# Write each annotated AnnData to its *_cellxgene.h5 file, gzip-compressed.
for adata_out, out_path in (
    (adata_fb, 'adatas/adata_fibroblast_cellxgene.h5'),
    (adata_krt, 'adatas/adata_keratinocyte_cellxgene.h5'),
    (adata_imm, 'adatas/adata_immune_cellxgene.h5'),
    (adata_vasc, 'adatas/adata_vasculature_cellxgene.h5'),
    (adata_all_harmony, 'adatas/adata_all_harmony_cellxgene.h5'),
):
    adata_out.write_h5ad(out_path, compression='gzip')

In [None]:
list_md5sum = []
md5 = !md5sum adatas/adata_fibroblast_cellxgene.h5
list_md5sum.append(md5)
md5 = !md5sum adatas/adata_keratinocyte_cellxgene.h5
list_md5sum.append(md5)
md5 = !md5sum adatas/adata_immune_cellxgene.h5
list_md5sum.append(md5)
md5 = !md5sum adatas/adata_vasculature_cellxgene.h5
list_md5sum.append(md5)
md5 = !md5sum adatas/adata_all_harmony_cellxgene.h5
list_md5sum.append(md5)

In [None]:
# Keep the first output line of each `md5sum` call. Entries that are already
# plain strings are left untouched, so re-running this cell does not truncate
# each checksum line to its first character.
list_md5sum = [entry if isinstance(entry, str) else entry[0] for entry in list_md5sum]

# One checksum line per file, with the 'adatas/' directory prefix stripped from the paths.
text_list_md5sum = '\n'.join(list_md5sum).replace('adatas/', '')

In [None]:
# Write the checksum list; the context manager closes the file even if the write fails.
with open("adatas/md5sum.txt", "w") as text_file:
    text_file.write(text_list_md5sum)

In [154]:
# Bundle the checksum list, the metadata report and the five *_cellxgene.h5 files
# into adatas/adatas_cellxgene.tar.gz (paths inside the archive are relative to adatas/).
!cd adatas &&  tar -czf adatas_cellxgene.tar.gz md5sum.txt cellxgene_output.txt adata_fibroblast_cellxgene.h5 adata_keratinocyte_cellxgene.h5 adata_immune_cellxgene.h5 adata_vasculature_cellxgene.h5 adata_all_harmony_cellxgene.h5