# Identification of major cell populations
In this notebook we identify the major skin cell populations in each dataset. To make this easier, the A and M datasets from each day (0, 2 and 4) are concatenated, so that we work with, and save, one joined file per day.

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scanpy as sc
from anndata import AnnData
import scanpy.external as sce
import matplotlib as mpl
import seaborn as sns
import scipy
from tqdm.notebook import tqdm

import scvelo as scv

from scripts.batch_process import batch_process
from cellassign import assign_cats

In [None]:
# Random seed shared by the UMAP and Leiden steps further down.
seed = 10
# Figure resolution: 200 dpi on screen, 300 dpi for saved figures.
sc.set_figure_params(dpi_save=300, dpi=200)

In [None]:
# Selection of palettes for cluster coloring, and scatter values

# Continuous colormap for scatter values: light grey for the lowest value,
# followed by samples 1..42 of a 50-step sampling of matplotlib's 'magma'
# (the last 7 samples at the upper end of the map are dropped).
magma = mpl.colors.LinearSegmentedColormap.from_list(
    "",
    [(0.88, 0.88, 0.88, 1)]
    + [plt.get_cmap('magma')(pos) for pos in np.linspace(0, 1, 50)[1:43]],
)

# Discrete palettes (carto colors)
bold = ['#7F3C8D', '#11A579', '#3969AC', '#F2B701', '#E73F74', '#80BA5A',
        '#E68310', '#008695', '#CF1C90', '#f97b72', '#4b4b8f']
vivid = ['#E58606', '#5D69B1', '#52BCA3', '#99C945', '#CC61B0', '#24796C',
         '#DAA51B', '#2F8AC4', '#764E9F', '#ED645A', '#CC3A8E']
safe = ['#88CCEE', '#CC6677', '#DDCC77', '#117733', '#332288', '#AA4499',
        '#44AA99', '#999933', '#882255', '#661100', '#6699CC']

# Discrete palette [Combination of BOLD and VIVID from carto colors]
bold_and_vivid = bold + vivid

# PRISM, reordered so that the even-indexed colors come first and the
# odd-indexed colors follow.
prism = ['#5F4690', '#1D6996', '#38A6A5', '#0F8554', '#73AF48', '#EDAD08',
         '#E17C05', '#CC503E', '#94346E', '#6F4070', '#994E95']
prism = [color for start in (0, 1) for color in prism[start::2]]

# Diverging palettes
temps = ['#009392', '#39b185', '#9ccb86', '#e9e29c', '#eeb479', '#e88471', '#cf597e']

# Continuous palettes
teal = ['#d1eeea', '#a8dbd9', '#85c4c9', '#68abb8', '#4f90a6', '#3b738f', '#2a5674']

In [None]:
# Folder with the preprocessed inputs, resolved against the current working
# directory; the value ends with a trailing slash.
dir_preprocessed = f'{os.getcwd()}/data/preprocessed/'

In [None]:
def simple_processing(adata, hvg_min_mean, hvg_max_mean, hvg_min_disp, pca_ncomps, umap_min_dist, leiden_resolution):
    """Log-transform, embed and cluster `adata` in place, plotting along the way.

    Steps, in order: log1p, highly-variable-gene selection (with its
    diagnostic plot), PCA, BBKNN neighbour graph, UMAP, Leiden clustering and
    a summary UMAP coloured by 'log1p_total_counts', 'batch' and 'leiden'.

    Parameters
    ----------
    adata
        Annotated data object; it is modified in place and nothing is
        returned. Because `sc.pp.log1p` runs on every call, this function is
        not meant to be re-run on an object that was already processed.
    hvg_min_mean, hvg_max_mean, hvg_min_disp
        Passed to `sc.pp.highly_variable_genes` as `min_mean`, `max_mean`
        and `min_disp`.
    pca_ncomps
        Number of principal components to compute; the same number is handed
        to BBKNN as `n_pcs`.
    umap_min_dist
        `min_dist` for `sc.tl.umap`.
    leiden_resolution
        `resolution` for `sc.tl.leiden`.

    Notes
    -----
    Reads the module-level `seed`, `magma` and `bold_and_vivid`. The final
    plot expects `adata.obs` to contain 'log1p_total_counts' and 'batch'.
    """
    sc.pp.log1p(adata)
    sc.pp.highly_variable_genes(adata, min_mean=hvg_min_mean, max_mean=hvg_max_mean, min_disp=hvg_min_disp)
    sc.pl.highly_variable_genes(adata)
    sc.tl.pca(adata, n_comps=pca_ncomps)
    # This call used to pass approx='False'; a non-empty string is truthy, so
    # approximate neighbours were what actually ran. Keep that behaviour, but
    # spell it as a real boolean.
    # NOTE(review): 'angular' is a metric of the approximate search; switching
    # to approx=False would presumably also need a different metric - confirm.
    # n_pcs follows pca_ncomps instead of a hard-coded 50 so both stay in sync.
    sce.pp.bbknn(adata, metric='angular', approx=True, n_pcs=pca_ncomps)
    sc.tl.umap(adata, random_state=seed, min_dist=umap_min_dist)
    sc.tl.leiden(adata, resolution=leiden_resolution, random_state=seed)
    sc.pl.umap(adata, color=['log1p_total_counts', 'batch', 'leiden'], cmap=magma, palette=bold_and_vivid, ncols=3, alpha=0.9)

In [None]:
# Marker genes per major cell population. This dict is passed as `dict_cats`
# to assign_cats and is iterated to plot each marker set on the UMAP.
dict_markers = {
    'Keratinocytes': [
        'KRT5', 'KRT10', 'KRT6A', 'KRT16', 'KRT17', 'KRT35', 'MGST1', 'FLG',
    ],
    'Fibroblasts': ['PDGFRA', 'COL1A1', 'LUM'],
    'Perivascular': ['RGS5', 'MYL9', 'ACTA2', 'TPM2'],
    'Lymphatic': ['LYVE1', 'PROX1', 'FLT4', 'PDPN'],
    'Endothelial': [
        'VWF', 'PECAM1', 'CDH5', 'VEGFC', 'ENG', 'IFI27', 'PLVAP',
    ],
    'T cells': ['TRAC', 'CD3D', 'CD4', 'CD8A'],
    'B cells': ['CD19', 'MS4A1', 'CD34', 'CD38', 'CD79A'],
    'Mono/macro': ['AIF1', 'FCGR1A', 'CX3CR1', 'ITGAM', 'CD14', 'CD163'],
    'Dendritic': ['CD86', 'HLA-DRA', 'ITGAX'],
    'Plasma cell': ['IGKC', 'JCHAIN', 'IGHG2', 'IGHG1'],
    'Langerhans': ['CD1A', 'CD207', 'LY75', 'EPCAM'],
    'Schwann-melanocyte': [
        # Schwann
        'S100B', 'MPZ', 'SOX10', 'PLP1',
        # Melanocyte
        'PMEL', 'TYR', 'TYRP1', 'MLANA',
    ],
    'Sweat glands': ['FOXC1', 'TP63', 'SOX9', 'KRT18', 'KRT19', 'AQP5'],
    'Merkel': ['GPX2', 'ID2', 'TFAP4', 'FLT1'],
    'Fat cells': ['LPL', 'PPARG', 'CAV1', 'TBK1', 'CEBPB'],
}

# Alias for the default discrete palette.
# NOTE(review): `palette` is not referenced by any other cell visible in this notebook.
palette = bold_and_vivid

# Fixed colour per cell-type label, so that every day's UMAP uses the same
# colour for the same population. The plotting cells look up each label that
# is present in obs['cell_types'] in this dict, so every marker category
# ('Langerhans', 'Merkel' and 'Fat cells' included) plus the 'Not assigned'
# fallback needs an entry; a missing one raises KeyError there.
dict_markers_colors = {
    'Keratinocytes': '#399c30',
    'Fibroblasts': '#de7400',
    'Perivascular': '#dc1886',
    'Lymphatic': '#dd7bb4',
    'Endothelial': '#dc1f3d',
    'T cells': '#204a7d',
    'B cells': '#3d82e7',
    'Mono/macro': '#2c35e9',
    'Dendritic': '#8498ee',
    'Plasma cell': '#622fed',
    'Langerhans': '#00a6a6',
    'Schwann-melanocyte': '#733b00',
    'Sweat glands': '#c1d000',
    'Merkel': '#7a7a00',
    'Fat cells': '#e8c547',
    'Not assigned': '#bcbcbc'
}

# Day 0

In [None]:
# Load the preprocessed A and M samples for day 0 and join them into a single
# object whose batches are labelled 'A/0' and 'M/0'.
adata_A_D0 = sc.read_h5ad(f'{dir_preprocessed}/Ap11_processed.h5')
adata_M_D0 = sc.read_h5ad(f'{dir_preprocessed}/Mp11_processed.h5')

adata_D0 = adata_A_D0.concatenate(
    adata_M_D0,
    batch_categories=['A/0', 'M/0'],
    join='outer',
)

In [None]:
# Run the project's batch_process helper (scripts.batch_process) on the joined day-0 object.
# NOTE(review): the return value is discarded, so the helper presumably modifies `adata_D0` in place - confirm.
batch_process(adata_D0)

In [None]:
# Day-0 settings for HVG selection, PCA, UMAP and Leiden clustering.
simple_processing(
    adata_D0,
    hvg_min_mean=0.01,
    hvg_max_mean=8,
    hvg_min_disp=0.1,
    pca_ncomps=50,
    umap_min_dist=0.6,
    leiden_resolution=0.5,
)

In [None]:
# Label the day-0 Leiden clusters using the marker dictionary.
# NOTE(review): presumably writes the labels to obs['cell_types'] and uses
# 'Not assigned' for clusters scoring below min_score - confirm in cellassign.
assign_cats(
    adata_D0,
    column_groupby='leiden',
    key_added='cell_types',
    dict_cats=dict_markers,
    min_score=0.4,
    others_name='Not assigned',
)

In [None]:
# One colour per cell-type label present on day 0, listed in sorted label order.
adata_D0.uns['cell_types_colors'] = [
    dict_markers_colors[label]
    for label in sorted(set(adata_D0.obs['cell_types']))
]
sc.pl.umap(adata_D0, color=['cell_types'], ncols=2)

In [None]:
# For every marker set, show the assigned cell types next to the expression
# of each marker gene that exists in the day-0 object.
for cat, genes in dict_markers.items():
    print(cat)
    sc.pl.umap(
        adata_D0,
        color=['cell_types', *(gene for gene in genes if gene in adata_D0.var_names)],
        cmap=magma,
    )

# Day 2

In [None]:
# Load the preprocessed A and M samples for day 2 and join them into a single
# object whose batches are labelled 'A/2' and 'M/2'.
adata_A_D2 = sc.read_h5ad(f'{dir_preprocessed}/Ap13_processed.h5')
adata_M_D2 = sc.read_h5ad(f'{dir_preprocessed}/Mp13_processed.h5')

adata_D2 = adata_A_D2.concatenate(
    adata_M_D2,
    batch_categories=['A/2', 'M/2'],
    join='outer',
)

In [None]:
# Run the project's batch_process helper (scripts.batch_process) on the joined day-2 object.
# NOTE(review): the return value is discarded, so the helper presumably modifies `adata_D2` in place - confirm.
batch_process(adata_D2)

In [None]:
# Day-2 settings for HVG selection, PCA, UMAP and Leiden clustering.
simple_processing(
    adata_D2,
    hvg_min_mean=0.01,
    hvg_max_mean=8,
    hvg_min_disp=0.1,
    pca_ncomps=50,
    umap_min_dist=0.1,
    leiden_resolution=5,
)

In [None]:
# Label the day-2 Leiden clusters using the marker dictionary; this day also
# sets quantile_gene_sel explicitly.
# NOTE(review): presumably writes the labels to obs['cell_types'] and uses
# 'Not assigned' for clusters scoring below min_score - confirm in cellassign.
assign_cats(
    adata_D2,
    column_groupby='leiden',
    key_added='cell_types',
    dict_cats=dict_markers,
    min_score=0.45,
    quantile_gene_sel=0.7,
    others_name='Not assigned',
)

In [None]:
# One colour per cell-type label present on day 2, listed in sorted label order.
adata_D2.uns['cell_types_colors'] = [
    dict_markers_colors[label]
    for label in sorted(set(adata_D2.obs['cell_types']))
]
sc.pl.umap(adata_D2, color=['cell_types'], ncols=2)

In [None]:
# For every marker set, show the assigned cell types next to the expression
# of each marker gene that exists in the day-2 object.
for cat, genes in dict_markers.items():
    print(cat)
    sc.pl.umap(
        adata_D2,
        color=['cell_types', *(gene for gene in genes if gene in adata_D2.var_names)],
        cmap=magma,
    )

# Day 4

In [None]:
# Load the preprocessed A and M samples for day 4 and join them into a single
# object whose batches are labelled 'A/4' and 'M/4'.
adata_A_D4 = sc.read_h5ad(f'{dir_preprocessed}/Ap15_processed.h5')
adata_M_D4 = sc.read_h5ad(f'{dir_preprocessed}/Mp15_processed.h5')

adata_D4 = adata_A_D4.concatenate(
    adata_M_D4,
    batch_categories=['A/4', 'M/4'],
    join='outer',
)

In [None]:
# Run the project's batch_process helper (scripts.batch_process) on the joined day-4 object.
# NOTE(review): the return value is discarded, so the helper presumably modifies `adata_D4` in place - confirm.
batch_process(adata_D4)

In [None]:
# Day-4 settings for HVG selection, PCA, UMAP and Leiden clustering.
simple_processing(
    adata_D4,
    hvg_min_mean=0.01,
    hvg_max_mean=8,
    hvg_min_disp=0.05,
    pca_ncomps=50,
    umap_min_dist=0.1,
    leiden_resolution=3,
)

In [None]:
# Label the day-4 Leiden clusters using the marker dictionary.
# NOTE(review): presumably writes the labels to obs['cell_types'] and uses
# 'Not assigned' for clusters scoring below min_score - confirm in cellassign.
assign_cats(
    adata_D4,
    column_groupby='leiden',
    key_added='cell_types',
    dict_cats=dict_markers,
    min_score=0.37,
    others_name='Not assigned',
)

In [None]:
# One colour per cell-type label present on day 4, listed in sorted label order.
adata_D4.uns['cell_types_colors'] = [
    dict_markers_colors[label]
    for label in sorted(set(adata_D4.obs['cell_types']))
]
sc.pl.umap(adata_D4, color=['cell_types'], ncols=2)

In [None]:
# For every marker set, show the assigned cell types next to the expression
# of each marker gene that exists in the day-4 object.
# NOTE(review): unlike the day-0 and day-2 loops, this one passes use_raw=False.
for cat, genes in dict_markers.items():
    print(cat)
    sc.pl.umap(
        adata_D4,
        color=['cell_types', *(gene for gene in genes if gene in adata_D4.var_names)],
        cmap=magma,
        use_raw=False,
    )

# Adata Saving

In [None]:
# Final clean-up of the three per-day objects before they are written to disk.
# The objects are iterated directly instead of being looked up by name with
# eval(), which keeps the loop readable and avoids evaluating strings as code.

# obs columns kept in the saved files; selecting them raises KeyError if any
# of these columns is missing from an object.
obs_columns_to_keep = [
    'percent_mito', 'n_counts', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts',
    'log1p_total_counts', 'scrublet_doublet', 'size_factors', 'leiden', 'batch', 'donor', 'day', 'cell_types',
]

for adata_day in (adata_D0, adata_D2, adata_D4):
    # Batch labels look like 'A/0': the first character is the donor and the
    # last character is the day.
    batch_labels = adata_day.obs['batch']
    adata_day.obs['donor'] = [label[0] for label in batch_labels]
    adata_day.obs['day'] = [label[-1] for label in batch_labels]
    adata_day.obs = adata_day.obs[obs_columns_to_keep]
    # Discard the per-gene annotation table before saving.
    # NOTE(review): presumably anndata resets .var to an empty frame that
    # still carries var_names - confirm for the installed version.
    del adata_day.var

In [None]:
# Save the joined and annotated day-0 object in the preprocessed data folder.
adata_D0.write_h5ad(
    dir_preprocessed + 'A+M_0.h5ad'
)

In [None]:
# Save the joined and annotated day-2 object in the preprocessed data folder.
adata_D2.write_h5ad(
    dir_preprocessed + 'A+M_2.h5ad'
)

In [None]:
# Save the joined and annotated day-4 object in the preprocessed data folder.
adata_D4.write_h5ad(
    dir_preprocessed + 'A+M_4.h5ad'
)