# X Export datasets to cellxgene

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import scanpy as sc
import scanpy.external as sce
import pandas as pd
import numpy as np
import os
import triku as tk
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from tqdm.notebook import tqdm
import scipy.sparse as spr
import networkx as nx
from matplotlib import pylab

In [None]:
# local imports and imports from other notebooks
from cellassign import assign_cats
from fb_functions import make_gene_scoring_with_expr, plot_score_graph, plot_UMAPS_gene, plot_adata_cluster_properties, make_dicts_fraction_mean, plot_dotplot_gene, plot_dotplot_list_genes
%store -r dict_colors_mouse
%store -r dict_colors_human
%store -r seed
%store -r magma
%store -r data_dir
%store -r dict_cats_fb_mouse

In [None]:
%store -r plot_params

pylab.rcParams.update(plot_params)
pd.set_option('display.max_columns', None)
pd.options.display.float_format = "{:,.2f}".format

In [None]:
%store -r list_names_mouse
%store -r list_all_datasets_mouse
%store -r list_names_human
%store -r list_all_datasets_human

# Set normalised layer

In [None]:
# For every human and mouse dataset, store two copies of the current X matrix as
# layers: 'processed' is left as-is, 'scaled' is passed through sc.pp.scale
# (zero_center=True, max_value=1).
all_datasets = list_all_datasets_human + list_all_datasets_mouse
for dataset in all_datasets:
    for layer_name in ('scaled', 'processed'):
        dataset.layers[layer_name] = dataset.X.copy()
    sc.pp.scale(dataset, layer='scaled', zero_center=True, max_value=1)

# Setting values for human adatas

In [None]:
# Lookup tables from the free-text metadata labels stored in obs to ontology term ids.
# NOTE(review): the term ids themselves were not re-verified against the ontologies here.

# Ethnicity / race label -> HANCESTRO term (looked up with the 'Ethnicity' and 'Race' obs values).
dict_HANCESTRO = {'White': 'HANCESTRO:0005', 'Caucasian': 'HANCESTRO:0005', 'African American': 'HANCESTRO:0568', 'Han chinese': 'HANCESTRO:0027', 'Hispanic': 'HANCESTRO:0014', 'Asian': 'HANCESTRO:0008', }
# Sampling site label -> UBERON term (looked up with the 'Sample location' obs values).
dict_UBERON = {'Abdomen': 'UBERON:0001416', 'Axilla': 'UBERON:0015474', 'Back': 'UBERON:0001068', 'Breast': 'UBERON:0001868', 'Calf': 'UBERON:0004264', 'Cheek': 'UBERON:0008803', 'Chest': 'UBERON:0001868', 
              'Dorsal': 'UBERON:0001068', 'Dorsum': 'UBERON:0001068', 'Extremity': 'UBERON:0003532', 'Face': 'UBERON:1000021', 'Flank': 'UBERON:0001554', 'Foot': 'UBERON:0001513', 'Forearm': 'UBERON:0003403', 
              'Foreskin': 'UBERON:0001332', 'Head': 'UBERON:0001084', 'Hip': 'UBERON:0001554', 'Inguinoiliac': 'UBERON:8410021', 'Knee': 'UBERON:0001510', 'Left arm': 'UBERON:0002427', 
              'Left forearm': 'UBERON:0003403', 'Leg': 'UBERON:0001511', 'Lower back': 'UBERON:0009014', 'Lower extremity': 'UBERON:0001511', 'Lower leg': 'UBERON:0004264', 'Nail': 'UBERON:0001705', 
              'Neck': 'UBERON:0001417', 'Outer forearm': 'UBERON:0003403', 'Shoulder': 'UBERON:0001483', 'Skin of body': 'UBERON:0002097', 'Thigh': 'UBERON:0004262', 'Trunk': 'UBERON:0001085', 
              'Upper extremity': 'UBERON:0002427', 'Vulva': 'UBERON:0000997', 'Waist': 'UBERON:0037468', }
# Condition label -> disease term (looked up with the 'Condition' obs values).
# Most ids are MONDO terms, but some entries use HP: or PATO: ids (e.g. 'Healthy' -> 'PATO:0000461').
dict_MONDO = {'Acne': 'MONDO:0011438', 'AD': 'MONDO:0011292', 'Aging': 'HP:0040006', 'Alopecia': 'MONDO:0004907', 'Atopic dermatitis': 'MONDO:0011292', 'Atopic eczema': 'MONDO:0004980', 'Ctcl': 'MONDO:0000607', 
             'CTCL': 'MONDO:0000607', 'DM - non ulcer': 'MONDO:0005015', 'DM - ulcer': 'MONDO:0005015', 'DM - ulcer - healing': 'MONDO:0005015', 'DM - ulcer - nonhealing': 'MONDO:0005015', 'Dress': 'MONDO:0044876', 
             'DSSC': 'MONDO:0016356', 'Eosinophilic fasciitis': 'MONDO:0009175', 'Healthy': 'PATO:0000461', 'Hidradenitis Suppurativa': 'MONDO:0006559', 'Keloid': 'MONDO:0005348', 'Leprosy': 'MONDO:0005124', 
             'Localised scleroderma': 'MONDO:0019562', 'LSSC': 'MONDO:0016359', 'Lupus': 'MONDO:0004670', 'Psoriasis': 'MONDO:0005083', 'Scar': 'HP:0100699', 'Systemic sclerosis': 'MONDO:0016359', 
             'VIH': 'MONDO:0005109', 'Vitiligo': 'MONDO:0008661', 'Wounding': 'HP:0100699', 'OTHER': 'PATO:0000461', }
# Human age key '<whole years>y' -> HsapDv term. Not every age has an entry (e.g. there is no '19y').
dict_HsapDv = {'3y': 'HsapDv:0000097', '6y': 'HsapDv:0000100', '9y': 'HsapDv:0000103', '15y': 'HsapDv:0000109', '16y': 'HsapDv:0000110', '17y': 'HsapDv:0000111', '18y': 'HsapDv:0000112', '20y': 'HsapDv:0000114', 
               '21y': 'HsapDv:0000115', '22y': 'HsapDv:0000116', '23y': 'HsapDv:0000117', '24y': 'HsapDv:0000118', '25y': 'HsapDv:0000119', '26y': 'HsapDv:0000120', '27y': 'HsapDv:0000121', '28y': 'HsapDv:0000122', 
               '29y': 'HsapDv:0000123', '30y': 'HsapDv:0000124', '31y': 'HsapDv:0000125', '32y': 'HsapDv:0000126', '33y': 'HsapDv:0000127', '34y': 'HsapDv:0000128', '35y': 'HsapDv:0000129', '36y': 'HsapDv:0000130', 
               '37y': 'HsapDv:0000131', '38y': 'HsapDv:0000132', '39y': 'HsapDv:0000133', '40y': 'HsapDv:0000134', '41y': 'HsapDv:0000135', '42y': 'HsapDv:0000136', '43y': 'HsapDv:0000137', '44y': 'HsapDv:0000138', 
               '45y': 'HsapDv:0000139', '46y': 'HsapDv:0000140', '47y': 'HsapDv:0000141', '48y': 'HsapDv:0000142', '49y': 'HsapDv:0000143', '50y': 'HsapDv:0000144', '51y': 'HsapDv:0000145', '52y': 'HsapDv:0000146', 
               '53y': 'HsapDv:0000147', '54y': 'HsapDv:0000148', '55y': 'HsapDv:0000149', '56y': 'HsapDv:0000150', '57y': 'HsapDv:0000151', '58y': 'HsapDv:0000152', '59y': 'HsapDv:0000153', '60y': 'HsapDv:0000154', 
               '61y': 'HsapDv:0000155', '62y': 'HsapDv:0000156', '63y': 'HsapDv:0000157', '64y': 'HsapDv:0000158', '65y': 'HsapDv:0000159', '66y': 'HsapDv:0000160', '67y': 'HsapDv:0000161', '68y': 'HsapDv:0000162', 
               '69y': 'HsapDv:0000163', '70y': 'HsapDv:0000164', '71y': 'HsapDv:0000165', '72y': 'HsapDv:0000166', '73y': 'HsapDv:0000167', '74y': 'HsapDv:0000168', '75y': 'HsapDv:0000169', '76y': 'HsapDv:0000170', 
               '77y': 'HsapDv:0000171', '78y': 'HsapDv:0000172', '79y': 'HsapDv:0000173', '80y': 'HsapDv:0000206', '81y': 'HsapDv:0000207', '82y': 'HsapDv:0000208', '83y': 'HsapDv:0000209', }
# Mouse age key '<number><unit>' -> MmusDv term; the units appearing in the keys are d, w and m
# (presumably days, weeks and months - TODO confirm).
dict_MmusDv = {'2d': 'MmusDv:0000113', '9d': 'MmusDv:0000113', '21d': 'MmusDv:0000047', '22d': 'MmusDv:0000047', '28d': 'MmusDv:0000048', '4.5w': 'MmusDv:0000048', '5w': 'MmusDv:0000049', '7w': 'MmusDv:0000051', 
               '8w': 'MmusDv:0000052', '9w': 'MmusDv:0000053', '3m': 'MmusDv:0000063', '18m': 'MmusDv:0000089', }
# Gender code ('F' / 'M') -> PATO term (looked up with the 'Gender' obs values).
dict_sex = {'F': 'PATO:0000383', 'M': 'PATO:0000384'}

## obs (general)

In [None]:
# Per human dataset: write the obs fields that are constant for all cells, build a
# donor_id from the sample metadata and copy the cluster labels into 'cell_type'.
for adata in list_all_datasets_human:
    dict_vals = dict(
        organism_ontology_term_id='NCBITaxon:9606',
        assay_ontology_term_id='EFO:0008913',
        cell_type_ontology_term_id='CL:0002620',
        suspension_type='cell',
        is_primary_data=False,
    )

    # Broadcast each constant to every cell, then store the column as categorical.
    for key in dict_vals:
        adata.obs[key] = dict_vals[key]
        adata.obs[key] = adata.obs[key].astype('category')

    # donor_id = '<first 3 characters of Author>-<last 2 characters of Year>-<Internal sample identifier>'
    donor_ids = []
    for author, year, sample in zip(adata.obs['Author'], adata.obs['Year'], adata.obs['Internal sample identifier']):
        donor_ids.append(author[:3] + '-' + str(year)[-2:] + '-' + sample)
    adata.obs['donor_id'] = donor_ids

    adata.obs['cell_type'] = adata.obs['cluster_robust']

## obs (specific)

In [None]:
# Per human dataset: translate the free-text sample metadata into ontology term ids
# through the dict_* lookup tables. Labels without an entry get the fallback shown
# on each lookup; every resulting column is stored as categorical.
for adata in list_all_datasets_human:
    # Sampling site -> UBERON.
    tissue_terms = [dict_UBERON.get(site, 'UBERON:0002097') for site in adata.obs['Sample location']]
    adata.obs['tissue_ontology_term_id'] = tissue_terms
    adata.obs['tissue_ontology_term_id'] = adata.obs['tissue_ontology_term_id'].astype('category')

    # Ethnicity -> HANCESTRO, trying the 'Ethnicity' label first and the 'Race' label second.
    ethnicity_terms = []
    for ethnicity, race in zip(adata.obs['Ethnicity'], adata.obs['Race']):
        if ethnicity in dict_HANCESTRO:
            ethnicity_terms.append(dict_HANCESTRO[ethnicity])
        elif race in dict_HANCESTRO:
            ethnicity_terms.append(dict_HANCESTRO[race])
        else:
            ethnicity_terms.append('unknown')
    adata.obs['self_reported_ethnicity_ontology_term_id'] = ethnicity_terms
    adata.obs['self_reported_ethnicity_ontology_term_id'] = adata.obs['self_reported_ethnicity_ontology_term_id'].astype('category')

    # Age -> HsapDv. The lookup key is the integer part of 'Age (mean)' (0 when missing)
    # followed by the 'Age format (y/m)' unit, e.g. '35y'.
    age_numbers = adata.obs['Age (mean)'].astype(float).fillna(0).astype(int).astype(str)
    age_units = adata.obs['Age format (y/m)'].astype(str)
    stage_terms = [dict_HsapDv.get(age_key, 'unknown') for age_key in age_numbers + age_units]
    adata.obs['development_stage_ontology_term_id'] = stage_terms
    adata.obs['development_stage_ontology_term_id'] = adata.obs['development_stage_ontology_term_id'].astype('category')

    # Gender -> PATO.
    sex_terms = [dict_sex.get(gender, 'unknown') for gender in adata.obs['Gender']]
    adata.obs['sex_ontology_term_id'] = sex_terms
    adata.obs['sex_ontology_term_id'] = adata.obs['sex_ontology_term_id'].astype('category')

    # Condition -> disease term; unlisted conditions fall back to 'PATO:0000461'.
    disease_terms = [dict_MONDO.get(condition, 'PATO:0000461') for condition in adata.obs['Condition']]
    adata.obs['disease_ontology_term_id'] = disease_terms
    adata.obs['disease_ontology_term_id'] = adata.obs['disease_ontology_term_id'].astype('category')

## var

In [None]:
# Per human dataset: annotate both var and raw.var with the constant feature fields,
# a 'feature_is_filtered' flag and a string 'feature_name' taken from the index.
for adata in list_all_datasets_human:
    dict_vals = dict(feature_biotype='gene', feature_reference='NCBITaxon:9606')

    for key in dict_vals:
        adata.var[key] = dict_vals[key]
        adata.var[key] = adata.var[key].astype('category')
        adata.raw.var[key] = dict_vals[key]
        adata.raw.var[key] = adata.raw.var[key].astype('category')

    # feature_is_filtered: always False in var; in raw.var it is True for the genes
    # that are not present in var.
    adata.var['feature_is_filtered'] = False
    kept_genes = adata.var.index
    adata.raw.var['feature_is_filtered'] = [gene not in kept_genes for gene in adata.raw.var.index]

    # Feature name: the var / raw.var index, stored as strings.
    adata.var['feature_name'] = adata.var.index
    adata.raw.var['feature_name'] = adata.raw.var.index
    adata.var['feature_name'] = adata.var['feature_name'].astype(str)
    adata.raw.var['feature_name'] = adata.raw.var['feature_name'].astype(str)

# Setting values for mouse adatas

## obs (general)

In [None]:
# Per mouse dataset: write the obs fields that are constant for all cells, build a
# donor_id from the sample metadata and copy the cluster labels into 'cell_type'.
for adata in list_all_datasets_mouse:
    dict_vals = dict(
        organism_ontology_term_id='NCBITaxon:10090',
        assay_ontology_term_id='EFO:0008913',
        cell_type_ontology_term_id='CL:0002620',
        suspension_type='cell',
        self_reported_ethnicity_ontology_term_id='na',
        is_primary_data=False,
    )

    # Broadcast each constant to every cell, then store the column as categorical.
    for key in dict_vals:
        adata.obs[key] = dict_vals[key]
        adata.obs[key] = adata.obs[key].astype('category')

    # donor_id = '<first 3 characters of Author>-<last 2 characters of Year>-<Internal sample identifier>'
    donor_ids = []
    for author, year, sample in zip(adata.obs['Author'], adata.obs['Year'], adata.obs['Internal sample identifier']):
        donor_ids.append(author[:3] + '-' + str(year)[-2:] + '-' + sample)
    adata.obs['donor_id'] = donor_ids

    adata.obs['cell_type'] = adata.obs['cluster_robust']

## obs (specific)

In [None]:
# Per mouse dataset: translate the free-text sample metadata into ontology term ids
# through the dict_* lookup tables. Labels without an entry get the fallback shown
# on each lookup; every resulting column is stored as categorical.
for adata in list_all_datasets_mouse:
    # Sampling site -> UBERON; unlisted sites fall back to 'UBERON:0002097'.
    adata.obs['tissue_ontology_term_id'] = [dict_UBERON.get(site, 'UBERON:0002097') for site in adata.obs['Sample location']]
    adata.obs['tissue_ontology_term_id'] = adata.obs['tissue_ontology_term_id'].astype('category')

    # Age -> MmusDv. Mouse ages have to be looked up in the mouse table (dict_MmusDv):
    # the human table (dict_HsapDv) only has '<years>y' keys, so every mouse cell
    # would end up as 'unknown'.
    # The key is '<age><unit>'. The 'g' format prints whole numbers without a decimal
    # part ('21') and keeps fractional ages ('4.5'), so keys such as '4.5w' can match;
    # truncating to int would turn 4.5 into '4'. Missing ages become 0 and therefore
    # resolve to 'unknown'.
    age_numbers = adata.obs['Age (mean)'].astype(float).fillna(0)
    age_units = adata.obs['Age format (y/m)'].astype(str)
    age_keys = [format(age, 'g') + unit for age, unit in zip(age_numbers, age_units)]
    adata.obs['development_stage_ontology_term_id'] = [dict_MmusDv.get(age_key, 'unknown') for age_key in age_keys]
    adata.obs['development_stage_ontology_term_id'] = adata.obs['development_stage_ontology_term_id'].astype('category')

    # Gender -> PATO.
    adata.obs['sex_ontology_term_id'] = [dict_sex.get(gender, 'unknown') for gender in adata.obs['Gender']]
    adata.obs['sex_ontology_term_id'] = adata.obs['sex_ontology_term_id'].astype('category')

    # Condition -> disease term; unlisted conditions fall back to 'PATO:0000461'.
    adata.obs['disease_ontology_term_id'] = [dict_MONDO.get(condition, 'PATO:0000461') for condition in adata.obs['Condition']]
    adata.obs['disease_ontology_term_id'] = adata.obs['disease_ontology_term_id'].astype('category')

## var

In [None]:
# Per mouse dataset: annotate both var and raw.var with the constant feature fields,
# a 'feature_is_filtered' flag and a string 'feature_name' taken from the index.
for adata in list_all_datasets_mouse:
    dict_vals = dict(feature_biotype='gene', feature_reference='NCBITaxon:10090')

    for key in dict_vals:
        adata.var[key] = dict_vals[key]
        adata.var[key] = adata.var[key].astype('category')
        adata.raw.var[key] = dict_vals[key]
        adata.raw.var[key] = adata.raw.var[key].astype('category')

    # feature_is_filtered: always False in var; in raw.var it is True for the genes
    # that are not present in var.
    adata.var['feature_is_filtered'] = False
    kept_genes = adata.var.index
    adata.raw.var['feature_is_filtered'] = [gene not in kept_genes for gene in adata.raw.var.index]

    # Feature name: the var / raw.var index, stored as strings.
    adata.var['feature_name'] = adata.var.index
    adata.raw.var['feature_name'] = adata.raw.var.index
    adata.var['feature_name'] = adata.var['feature_name'].astype(str)
    adata.raw.var['feature_name'] = adata.raw.var['feature_name'].astype(str)

## Setting uns for mouse and human adatas

In [None]:
# Dataset-level metadata written to .uns of every human and mouse dataset.
dict_vals = {
    'schema_version': '3.0.0',
    'default_embedding': 'X_umap',
    'X_approximate_distribution': 'count',
    'title': 'Atlas of human and mouse dermal fibroblast heterogeneity',
    'batch_condition': 'donor_id',
}

for adata in list_all_datasets_human + list_all_datasets_mouse:
    for key in dict_vals:
        adata.uns[key] = dict_vals[key]

# Other dataset corrections 
These should be addressed in previous notebooks, but for now they are addressed here and will be translated to other iterations.


In [None]:
# Metadata corrections for the mouse dataset at position 5 (the Phan dataset, going by
# the 'Author' value set below). Some of its obs fields are incomplete - presumably
# missing for part of the cells (TODO confirm) - so they are either overwritten with a
# constant or, for the columns converted with astype(str), only their missing entries
# (which read 'nan' after the conversion) are filled in.
adata_phan = list_all_datasets_mouse[5]

adata_phan.obs['Author'] = 'Phan'
adata_phan.obs['Age'] = '21'
# Series.replace on the whole column is used instead of chained-indexing assignment
# (obs[col][mask] = value), which pandas does not guarantee to write into obs.
adata_phan.obs['Accession (Sample)'] = adata_phan.obs['Accession (Sample)'].astype(str).replace('nan', 'GSM4647789')
adata_phan.obs['Aligner'] = 'Cell Ranger (3.0.2)'
adata_phan.obs['Condition'] = 'Healthy'
adata_phan.obs['Donor identifier'] = adata_phan.obs['Donor identifier'].astype(str).replace('nan', 'P21_Un_2')
adata_phan.obs['Genome'] = 'GRCm38'
adata_phan.obs['Internal sample identifier'] = 'GSM4647789'
adata_phan.obs['Library preparation'] = '10X (v2)'
adata_phan.obs['Organism'] = 'Mus musculus'
adata_phan.obs['Race'] = 'C57BL/6J'
adata_phan.obs['Sample identifier'] = adata_phan.obs['Sample identifier'].astype(str).replace('nan', 'skin')
adata_phan.obs['Sample location'] = 'Dorsal'
adata_phan.obs['Sequencer'] = 'Illumina HiSeq 4000'

# The mouse "obs (general)" / "obs (specific)" cells run before this one and derive
# donor_id, tissue_ontology_term_id and disease_ontology_term_id from 'Author',
# 'Internal sample identifier', 'Sample location' and 'Condition'. Recompute them with
# the same rules so they reflect the corrected values instead of the stale ones.
adata_phan.obs['donor_id'] = [n[:3] + '-' + str(y)[-2:] + '-' + b for n, y, b in zip(adata_phan.obs['Author'], adata_phan.obs['Year'], adata_phan.obs['Internal sample identifier'])]
adata_phan.obs['tissue_ontology_term_id'] = [dict_UBERON.get(i, 'UBERON:0002097') for i in adata_phan.obs['Sample location']]
adata_phan.obs['tissue_ontology_term_id'] = adata_phan.obs['tissue_ontology_term_id'].astype('category')
adata_phan.obs['disease_ontology_term_id'] = [dict_MONDO.get(i, 'PATO:0000461') for i in adata_phan.obs['Condition']]
adata_phan.obs['disease_ontology_term_id'] = adata_phan.obs['disease_ontology_term_id'].astype('category')

In [None]:
# Overwrite the 'Library preparation' label for every cell of the mouse dataset at position 1.
# NOTE(review): the dataset is addressed by list position only - confirm it is the intended one.
list_all_datasets_mouse[1].obs['Library preparation'] = '10X (v2)'

# Merge adatas

In [None]:
def normalise_UMAP(adata, col_idx='ADATA NUMBER', w=0.25):
    """Tile the per-batch UMAPs of a merged object on a grid, in place.

    The cells of each batch (those whose ``adata.obs[col_idx]`` label equals ``'0'``,
    ``'1'``, ...) get their UMAP coordinates min-max rescaled to the unit square and
    are then shifted to their own grid tile, so that batches do not overlap when they
    are plotted together. Tiles are filled left to right, then downwards.

    Parameters
    ----------
    adata
        Object exposing ``obs`` (with a categorical ``col_idx`` column) and
        ``obsm['X_umap']`` (one row per cell; the first two columns are x and y).
    col_idx
        Name of the categorical obs column with the batch label. Batches are looked
        up as ``str(position)`` for position = 0 .. n_categories - 1.
    w
        Gap between neighbouring tiles, as a fraction of the tile side.

    Returns
    -------
    None. ``adata.obsm['X_umap']`` is replaced by the tiled coordinates.
    """
    n_adatas = len(adata.obs[col_idx].cat.categories)
    # Roughly square grid: floor(sqrt(n)) columns, and never fewer than one.
    n_cols = max(1, int(n_adatas ** 0.5))

    batch_labels = adata.obs[col_idx].astype(str).to_numpy()
    # Read from the untouched coordinates and write into a copy, so no AnnData
    # subset has to be built for each batch.
    umap_original = np.asarray(adata.obsm['X_umap'])
    umap_tiled = umap_original.copy()

    for idx in range(n_adatas):
        in_batch = batch_labels == str(idx)
        if not in_batch.any():
            # Category without cells: nothing to place (min/max of an empty array would raise).
            continue
        row, col = divmod(idx, n_cols)

        batch_xy = umap_original[in_batch, :2]
        lowest = batch_xy.min(axis=0)
        span = batch_xy.max(axis=0) - lowest
        # An axis with zero extent (e.g. a single-cell batch) cannot be rescaled;
        # centre it in the tile instead of dividing by zero.
        flat = span == 0
        unit_xy = (batch_xy - lowest) / np.where(flat, 1, span)
        unit_xy[:, flat] = 0.5

        # Columns grow to the right, rows grow downwards.
        unit_xy[:, 0] = unit_xy[:, 0] + col * (1 + w)
        unit_xy[:, 1] = unit_xy[:, 1] - row * (1 + w)

        umap_tiled[in_batch, :2] = unit_xy.astype(np.float32)

    # Single write-back once every batch has been placed.
    adata.obsm['X_umap'] = umap_tiled

In [None]:
# Stack all human datasets (outer join of the genes); 'ADATA NUMBER' records the source dataset.
adata_human = sc.AnnData.concatenate(*list_all_datasets_human, batch_key='ADATA NUMBER', join='outer')

# Cluster labels as categorical, with one colour per category taken from dict_colors_human.
human_clusters = adata_human.obs['cluster_robust'].astype('category')
adata_human.obs['cluster_robust'] = human_clusters
human_cluster_colors = []
for cluster_name in adata_human.obs['cluster_robust'].cat.categories:
    human_cluster_colors.append(dict_colors_human[cluster_name])
adata_human.uns['cluster_robust_colors'] = human_cluster_colors

In [None]:
normalise_UMAP(adata_human)

In [None]:
sc.pl.umap(adata_human, color='cluster_robust')

In [None]:
# Stack all mouse datasets (outer join of the genes); 'ADATA NUMBER' records the source dataset.
adata_mouse = sc.AnnData.concatenate(*list_all_datasets_mouse, batch_key='ADATA NUMBER', join='outer')

# Cluster labels as categorical, with one colour per category; clusters without an
# entry in dict_colors_mouse are drawn in grey ('#bcbcbc').
mouse_clusters = adata_mouse.obs['cluster_robust'].astype('category')
adata_mouse.obs['cluster_robust'] = mouse_clusters
mouse_cluster_colors = []
for cluster_name in adata_mouse.obs['cluster_robust'].cat.categories:
    mouse_cluster_colors.append(dict_colors_mouse.get(cluster_name, '#bcbcbc'))
adata_mouse.uns['cluster_robust_colors'] = mouse_cluster_colors

In [None]:
normalise_UMAP(adata_mouse)

In [None]:
sc.pl.umap(adata_mouse, color='cluster_robust')

# Clean adatas

In [None]:
# obs columns kept in the exported objects; the merged obs tables are subset with this
# list, so every column not named here is dropped. It holds the cluster labels, the
# ontology / schema fields set in the "Setting values" cells and the original sample metadata.
# NOTE(review): 'is_primary_data' and 'cell_type', which the "obs (general)" cells set,
# are not listed and are therefore dropped - confirm this is intended.
accepted_obs = ['cluster_robust', 
                'organism_ontology_term_id', 'donor_id', 'development_stage_ontology_term_id', 'sex_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 
                'disease_ontology_term_id', 'tissue_ontology_term_id', 'cell_type_ontology_term_id', 'assay_ontology_term_id', 'suspension_type', 
                'Author', 'Year', 'Accession (Sample)', 'Aligner', 'Genome', 'Donor identifier', 'Sample identifier', 
                'Internal sample identifier', 'Library preparation', 'Sequencer', 'Organism', 'Age',
                'Gender', 'Race', 'Ethnicity', 'Sample location', 'Condition', 'Condition (other)', 
                ]

In [None]:
# Keep only the accepted obs columns (in accepted_obs order) on both merged objects.
for merged_adata in (adata_human, adata_mouse):
    merged_adata.obs = merged_adata.obs[accepted_obs]

In [None]:
# adata_human.var holds several columns whose name contains 'Ensemble ID' and several
# whose name contains 'gene_symbol' (presumably one per source dataset after the outer
# concatenation - TODO confirm). Collapse each family into a single column that holds,
# for every gene, the first non-missing value in column order.
for var_field in ['Ensemble ID', 'gene_symbol']:
    field_columns = [column for column in adata_human.var.columns if var_field in column]
    # astype(str) turns missing entries into the literal string 'nan', which is skipped below.
    field_values = adata_human.var[field_columns].astype(str).values

    merged_values = []
    for gene_values in field_values:
        present = [value for value in gene_values if value != 'nan']
        # A gene with no annotation in any column gets 'NA' instead of raising
        # IndexError; 'NA' is the placeholder that the later "Ensemble ID != 'NA'"
        # filter removes.
        merged_values.append(present[0] if present else 'NA')

    adata_human.var[var_field] = merged_values

In [None]:
# adata_mouse.var holds several columns whose name contains 'Ensemble ID' and several
# whose name contains 'gene_symbol' (presumably one per source dataset after the outer
# concatenation - TODO confirm). Collapse each family into a single column that holds,
# for every gene, the first non-missing value in column order.
for var_field in ['Ensemble ID', 'gene_symbol']:
    field_columns = [column for column in adata_mouse.var.columns if var_field in column]
    # astype(str) turns missing entries into the literal string 'nan', which is skipped below.
    field_values = adata_mouse.var[field_columns].astype(str).values

    merged_values = []
    for gene_values in field_values:
        present = [value for value in gene_values if value != 'nan']
        # A gene with no annotation in any column gets 'NA' instead of raising
        # IndexError; 'NA' is the placeholder that the later "Ensemble ID != 'NA'"
        # filter removes.
        merged_values.append(present[0] if present else 'NA')

    adata_mouse.var[var_field] = merged_values

In [None]:
# Keep only the consolidated gene annotation columns on both merged objects.
accepted_vars = ['gene_symbol', 'Ensemble ID']

for merged_adata in (adata_human, adata_mouse):
    merged_adata.var = merged_adata.var[accepted_vars]

In [None]:
# Drop the genes whose consolidated 'Ensemble ID' is the placeholder 'NA'.
# Slicing returns a view of the parent object; the cells below delete obsm entries and
# layers in place, so take an explicit copy here rather than relying on the implicit
# view-to-copy conversion on first modification.
adata_human = adata_human[:, adata_human.var['Ensemble ID'] != 'NA'].copy()
adata_mouse = adata_mouse[:, adata_mouse.var['Ensemble ID'] != 'NA'].copy()

In [None]:
# Remove the embeddings that should not be exported, wherever they are present.
obsm_to_drop = ['X_pca', 'X_pca_harmony', 'X_triku', 'pca_cell_embeddings', 'tsne_cell_embeddings']
for merged_adata in (adata_human, adata_mouse):
    for obsm_key in obsm_to_drop:
        if obsm_key in merged_adata.obsm:
            del merged_adata.obsm[obsm_key]

In [None]:
adata_human

In [None]:
# Remove the layers that should not be exported, wherever they are present.
layers_del = ['spliced', 'unspliced', 'norm_data', 'scale_data', 'ambiguous', 'matrix', 'processed']
for merged_adata in (adata_human, adata_mouse):
    for layer_name in layers_del:
        if layer_name in merged_adata.layers:
            del merged_adata.layers[layer_name]

In [None]:
# Save the merged human object with write_h5ad, gzip-compressed.
# NOTE(review): the path is hard-coded under 'data/' (the restored `data_dir` is not
# used here) and the file suffix is '.h5' rather than '.h5ad' - confirm both are intended.
adata_human.write_h5ad('data/adata_human_cellxgene.h5', compression='gzip')

In [None]:
# Save the merged mouse object with write_h5ad, gzip-compressed.
# NOTE(review): the path is hard-coded under 'data/' (the restored `data_dir` is not
# used here) and the file suffix is '.h5' rather than '.h5ad' - confirm both are intended.
adata_mouse.write_h5ad('data/adata_mouse_cellxgene.h5', compression='gzip')

In [None]:
adata_mouse

In [None]:
# Render this notebook to HTML with nbconvert. The exit status returned by os.system
# is not checked, so a failed conversion does not raise.
os.system("jupyter nbconvert --to html X_export_to_cellxgene.ipynb")