# Obtaining robust cell population markers, and redefining/reassuring the biased cell populations

**TO RUN THIS NOTEBOOK YOU HAVE TO RUN 3H NOTEBOOK FULLY!!!**

## imports

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import scanpy as sc
import scanpy.external as sce
import pandas as pd
import numpy as np
import os
import triku as tk
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from tqdm.notebook import tqdm
import scipy.sparse as spr
import matplotlib.cm as cm
import networkx as nx

In [None]:
!pip install cellassign

In [None]:
# local imports and imports from other notebooks
from cellassign import assign_cats
from fb_functions import make_gene_scoring_with_expr, plot_score_graph, plot_UMAPS_gene, plot_adata_cluster_properties
%store -r dict_colors_mouse
%store -r seed
%store -r magma
%store -r data_dir

In [None]:
# Notebook-wide display settings: figure resolution and two-decimal float rendering in DataFrames.
mpl.rc('figure', dpi=120)
pd.set_option('display.float_format', "{:,.2f}".format)

**IMPORTANT: I am running this analysis on a computer with ~500 GB of RAM. I will load many datasets at once, which might be too much for some computers. I made this decision consciously, to have as much information available at any time as possible. If you cannot run the whole analysis at once, you can run it in parts.**

## Anndata loading

In [None]:
# Abbasi 2020: the `_processed` object and its `_fb_robust` counterpart.
abbasi_2020_dir = f"{data_dir}/abassi_2020"
abassi_2020_ctrl_mouse = sc.read(abbasi_2020_dir + '/abassi_2020_ctrl_mouse_processed.h5')
abassi_2020_ctrl_mouse_fb = sc.read(abbasi_2020_dir + '/abassi_2020_ctrl_mouse_fb_robust.h5')

In [None]:
# Boothby 2021: the `_processed` object and its `_fb_robust` counterpart.
boothby_2021_dir = f"{data_dir}/boothby_2021"
boothby_2021_ctrl_mouse = sc.read(f"{boothby_2021_dir}/boothby_2021_ctrl_mouse_processed.h5")
boothby_2021_ctrl_mouse_fb = sc.read(f"{boothby_2021_dir}/boothby_2021_ctrl_mouse_fb_robust.h5")

In [None]:
# Buechler 2021: the `_processed` object and its `_fb_robust` counterpart.
buechler_2021_dir = f"{data_dir}/buechler_2021"
buechler_2021_ctrl_mouse = sc.read(f"{buechler_2021_dir}/buechler_2021_ctrl_mouse_processed.h5")
buechler_2021_ctrl_mouse_fb = sc.read(f"{buechler_2021_dir}/buechler_2021_ctrl_mouse_fb_robust.h5")

In [None]:
# Haensel 2021 (wounding): the `_processed` object and its `_fb_robust` counterpart.
haensel_2021_dir = f"{data_dir}/haensel_2021"
haensel_2021_ctrl_mouse_wounding = sc.read(f"{haensel_2021_dir}/haensel_2021_ctrl_mouse_wounding_processed.h5")
haensel_2021_ctrl_mouse_wounding_fb = sc.read(f"{haensel_2021_dir}/haensel_2021_ctrl_mouse_wounding_fb_robust.h5")

In [None]:
# Joost 2020: the `_processed` object and its `_fb_robust` counterpart.
joost_2020_dir = f"{data_dir}/joost_2020"
joost_2020_ctrl_mouse = sc.read(f"{joost_2020_dir}/joost_2020_ctrl_mouse_processed.h5")
joost_2020_ctrl_mouse_fb = sc.read(f"{joost_2020_dir}/joost_2020_ctrl_mouse_fb_robust.h5")

In [None]:
# Phan 2020 (21d): the `_processed` object and its `_fb_robust` counterpart.
phan_2020_dir = f"{data_dir}/phan_2020"
phan_2020_ctrl_mouse_21d = sc.read(phan_2020_dir + '/phan_2020_ctrl_mouse_21d_processed.h5')
phan_2020_ctrl_mouse_21d_fb = sc.read(phan_2020_dir + '/phan_2020_ctrl_mouse_21d_fb_robust.h5')

In [None]:
# Shook 2020: the `_processed` object and its `_fb_robust` counterpart.
shook_2020_dir = f"{data_dir}/shook_2020"
shook_2020_ctrl_mouse = sc.read(shook_2020_dir + '/shook_2020_ctrl_mouse_processed.h5')
shook_2020_ctrl_mouse_fb = sc.read(shook_2020_dir + '/shook_2020_ctrl_mouse_fb_robust.h5')

In [None]:
# Vorstandlechner 2021: the `_processed` object and its `_fb_robust` counterpart.
vorstandlechner_2021_dir = f"{data_dir}/Vorstandlechner_2021"
vorstandlechner_2021_ctrl_mouse = sc.read(vorstandlechner_2021_dir + '/vorstandlechner_2021_ctrl_mouse_processed.h5')
vorstandlechner_2021_ctrl_mouse_fb = sc.read(vorstandlechner_2021_dir + '/vorstandlechner_2021_ctrl_mouse_fb_robust.h5')

# Getting the definitive list of genes
This list is not *really* definitive. The markers produced here are later filtered in the analysis table (outside the notebooks), based on gene function or pattern of expression. The *final* list is in the variable **genes** below.

In [None]:
# Restore variables persisted with `%store` by a previous notebook run
# (presumably the 3H notebook mentioned at the top — confirm).
%store -r list_all_datasets_mouse
%store -r list_datasets_mouse

%store -r list_accepted_clusters_mouse
%store -r list_accepted_axis_mouse
%store -r list_names_mouse

In [None]:
# Display the restored dataset labels as a quick sanity check.
list_names_mouse

In [None]:
# Gene scoring per `cluster_robust` group, restricted to the accepted clusters.
dict_make_gene_scoring_cluster_robust_mouse = make_gene_scoring_with_expr(
    list_datasets=list_datasets_mouse,
    list_clusters=list_accepted_clusters_mouse,
    group_name='cluster_robust',
    calculate_DEGs=True,
    value_ref='scores',
    select_method='pval',
    select_thres=0.05,
)

In [None]:
# Gene scoring per `axis_robust` group, restricted to the accepted axes.
dict_make_gene_scoring_axis_robust_mouse = make_gene_scoring_with_expr(
    list_datasets=list_datasets_mouse,
    list_clusters=list_accepted_axis_mouse,
    group_name='axis_robust',
    calculate_DEGs=True,
    value_ref='scores',
    select_method='pval',
    select_thres=0.05,
)

In [None]:
# Persist both scoring dictionaries so other notebooks can restore them with `%store -r`.
%store dict_make_gene_scoring_cluster_robust_mouse
%store dict_make_gene_scoring_axis_robust_mouse

In [None]:
# Clusters whose scoring tables are inspected, in the order they were originally shown
# (one copy-pasted cell per cluster).
clusters_to_inspect = ['a1', 'a2', 'a/d', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b/c',
                       'c1', 'c2', 'c/d', 'd1', 'd2', 'd3', 'e1']

# For each cluster, show the first 40 rows of its scoring table, sorted by index.
# `display` gives the same rich DataFrame rendering a bare last-line expression would.
for cluster in clusters_to_inspect:
    print(cluster)
    display(dict_make_gene_scoring_cluster_robust_mouse[cluster].iloc[:40].sort_index())

## Analyse the presence and expression pattern of marker genes (within fibroblast populations)

In this section we will plot UMAPs and dot plots of expression patterns of gene markers. This is useful because many gene markers tend to be markers of few clusters based on DE analysis but, in reality, they are also expressed in other clusters which might need to be considered.

In [None]:
def make_dicts_fraction_mean(genes, list_all_datasets_mouse, list_accepted_clusters_mouse, clusterby='cluster_robust',
                             list_names=None):
    """Summarise, per gene, how widely and how strongly it is expressed in each cluster.

    For every gene two tables are built, with one row per dataset and one column per
    accepted cluster:

    * fraction of cells in the cluster with a value > 0 for the gene;
    * mean value of the gene over the cells of the cluster whose value is non-zero.

    Cells stay NaN when the gene is absent from ``adata.var_names`` or the dataset has no
    cells in that cluster. Two extra rows, ``'Mean'`` and ``'Std'``, hold the column-wise
    mean and standard deviation across datasets (pandas defaults, i.e. NaNs are skipped).

    Parameters
    ----------
    genes : iterable of str
        Genes to summarise.
    list_all_datasets_mouse : sequence
        AnnData-like objects exposing ``var_names``, ``obs[clusterby]`` and
        ``adata[mask][:, genes].X`` (sparse or dense).
    list_accepted_clusters_mouse : sequence
        Cluster labels used as table columns; other labels found in the data are ignored.
    clusterby : str
        Column of ``adata.obs`` holding the cluster label of each cell.
    list_names : sequence of str, optional
        Row label for each dataset, in the same order as ``list_all_datasets_mouse``.
        Defaults to the notebook-level ``list_names_mouse`` (the previous, implicit behaviour).

    Returns
    -------
    (dict, dict)
        ``dict_fraction_cells`` and ``dict_mean_exp``, both mapping gene -> DataFrame.

    Raises
    ------
    ValueError
        If the number of names differs from the number of datasets (``zip`` would otherwise
        silently drop datasets and bias the Mean/Std rows).
    """
    if list_names is None:
        # Backward-compatible default: the global that was read implicitly before.
        list_names = list_names_mouse

    genes = list(genes)
    datasets = list(list_all_datasets_mouse)
    list_names = list(list_names)
    if len(datasets) != len(list_names):
        raise ValueError(
            f"Got {len(datasets)} datasets but {len(list_names)} dataset names; they must match one-to-one."
        )

    accepted_clusters = set(list_accepted_clusters_mouse)

    def _empty_table():
        return pd.DataFrame(np.nan, index=list_names, columns=list_accepted_clusters_mouse)

    dict_fraction_cells = {gene: _empty_table() for gene in genes}
    dict_mean_exp = {gene: _empty_table() for gene in genes}

    for adata, name in zip(datasets, list_names):
        genes_sub = [gene for gene in genes if gene in adata.var_names]
        if not genes_sub:
            continue

        cluster_labels = adata.obs[clusterby]
        for cluster in cluster_labels.dropna().unique():
            # Labels outside the accepted list would be dropped at the end anyway;
            # skipping them avoids enlarging every table with throw-away columns.
            if cluster not in accepted_clusters:
                continue

            X = adata[cluster_labels == cluster][:, genes_sub].X
            # Densify sparse matrices; cast to float so integer count matrices work too.
            counts = (X.toarray() if spr.issparse(X) else np.asarray(X)).astype(float)

            counts_frac = (counts > 0).sum(0) / counts.shape[0]

            # Mean over non-zero, non-NaN entries only. 0/0 (no qualifying cell) yields NaN,
            # which is the intended "no value" marker, so the warning is silenced.
            expressed = (counts != 0) & ~np.isnan(counts)
            with np.errstate(divide='ignore', invalid='ignore'):
                counts_mean_exp = np.where(expressed, counts, 0.0).sum(0) / expressed.sum(0)

            for idx, gene in enumerate(genes_sub):
                dict_fraction_cells[gene].loc[name, cluster] = counts_frac[idx]
                dict_mean_exp[gene].loc[name, cluster] = counts_mean_exp[idx]

    for tables in (dict_fraction_cells, dict_mean_exp):
        for gene in tables:
            table = tables[gene]
            # Compute both statistics before appending, so 'Mean' does not leak into 'Std'.
            across_mean, across_std = table.mean(), table.std()
            table.loc['Mean'] = across_mean
            table.loc['Std'] = across_std
            tables[gene] = table[list_accepted_clusters_mouse]

    return dict_fraction_cells, dict_mean_exp

In [None]:
def plot_dotplot_gene(gene, dict_fraction_cells, dict_mean_exp, rotate=False):
    """Draw a one-row dot plot of a gene across clusters.

    One dot is drawn per column of ``dict_fraction_cells[gene]``:

    * size  – ``'Mean'`` row of the fraction table raised to 0.66, times 550;
    * colour – ``'Mean'`` row of the expression table, min-max normalised and mapped with ``cm.OrRd``;
    * alpha – ``1 - Std ** 0.75`` of the fraction table, floored at 0 (NaN Std gives alpha 0).

    Parameters
    ----------
    gene : str
        Key into both dictionaries; also used as the y tick label.
    dict_fraction_cells, dict_mean_exp : dict of DataFrame
        Tables with ``'Mean'`` and ``'Std'`` rows, as returned by ``make_dicts_fraction_mean``.
    rotate : bool
        Rotate the x tick labels by 40 degrees (right-aligned) when True.
    """
    dfplot_frac = dict_fraction_cells[gene] ** 0.66
    mean_exp = dict_mean_exp[gene].loc['Mean'].astype(float)
    frac_std = dict_fraction_cells[gene].loc['Std'].astype(float)

    # pandas min/max skip NaN. The builtin min()/max() are order-dependent when NaN is
    # present (every comparison with NaN is False), which could turn the whole colour
    # scale into NaN whenever a cluster had no expression value.
    exp_min, exp_max = mean_exp.min(), mean_exp.max()
    exp_span = exp_max - exp_min
    if exp_span > 0:
        exp_norm_vals = (mean_exp - exp_min) / exp_span
    else:
        # Constant (or all-NaN) row: avoid 0/0 and use the middle of the colormap;
        # NaN entries stay NaN.
        exp_norm_vals = mean_exp * 0 + 0.5

    # Same rule as before: NaN Std -> alpha 0, negative values floored at 0.
    alphas = np.nan_to_num(1 - frac_std ** 0.75, nan=0.0).clip(0, 1)

    n_clusters = len(dfplot_frac.columns)
    x_positions = range(n_clusters)

    fig, ax = plt.subplots(1, 1, figsize=(10, 1))
    ax.set_xticks(x_positions)

    if rotate:
        ax.set_xticklabels(dfplot_frac.columns, rotation=40, ha='right')
    else:
        ax.set_xticklabels(dfplot_frac.columns)

    ax.set_yticks([0])
    ax.set_yticklabels([gene])
    ax.set_ylim([-0.1, 0.1])
    ax.scatter(x_positions, [0] * n_clusters, s=dfplot_frac.loc['Mean'] * 550,
               c=[cm.OrRd(value) for value in exp_norm_vals],
               linewidths=0.5, edgecolor='#878787', alpha=alphas)

    # Three faint horizontal guide lines behind the dots.
    for y_guide in (0, 0.025, -0.025):
        ax.plot([-0.3, n_clusters - 0.3], [y_guide, y_guide], c="#676767", linewidth=0.7, alpha=0.3)

In [None]:
# TODO: `[TO BE ADDED]` is a placeholder and not valid Python — replace it with the
# curated marker gene names before running. Duplicates are dropped and the list is sorted.
genes = sorted(list(set([TO BE ADDED])))

To create the dot plot we will use two variables: the percentage of cells expressing the marker in each cluster, and the mean expression value of the expressing cells in each cluster. To do that, we will create a DataFrame for each variable and gene, compile the values for each adata, and compute the mean/std across adatas. If a gene is not present in an adata, or the adata does not have a certain cluster, the registered value is NaN, not zero.

In [None]:
# Overview plot for 'cluster_robust' across all datasets
# (presumably one UMAP per dataset, coloured by that obs column — confirm in fb_functions).
plot_UMAPS_gene('cluster_robust', list_datasets_mouse=list_all_datasets_mouse, list_names_mouse=list_names_mouse)

In [None]:
# NOTE(review): this call is identical to the one in the previous cell. It may have been
# meant to plot a different key (e.g. 'axis_robust') — confirm, or drop the duplicate.
plot_UMAPS_gene('cluster_robust', list_datasets_mouse=list_all_datasets_mouse, list_names_mouse=list_names_mouse)

In [None]:
# Per-gene tables (expressing-cell fraction, mean expression) over the `cluster_robust` clusters.
dict_fraction_cells, dict_mean_exp = make_dicts_fraction_mean(
    genes=genes,
    list_all_datasets_mouse=list_all_datasets_mouse,
    list_accepted_clusters_mouse=list_accepted_clusters_mouse,
    clusterby='cluster_robust',
)

To make the dot plot we apply some adjustments. We raise the fraction of expressing cells to the power 0.66 ($\approx x^{2/3}$) to make it slightly more visual. We also apply an alpha of $1 - \mathrm{std}(\text{fraction of cells})^{0.75}$, so that markers whose expression is disparate across datasets are visually less prominent.


In [None]:
# For every marker gene: print its name, draw the cluster dot plot, then the per-dataset
# UMAP panels (laid out on a 5 x 5 grid), and flush the figures with plt.show().
for gene in genes:
    print(gene)
    plot_dotplot_gene(gene, dict_fraction_cells, dict_mean_exp)
    plot_UMAPS_gene(gene, list_datasets_mouse=list_all_datasets_mouse, list_names_mouse=list_names_mouse, n_cols=5, n_rows=5)
    plt.show()

## Analyse the presence and expression pattern of marker genes (within all populations)

In this section we will plot UMAPs and dot plots of expression patterns of gene markers. This is useful because many gene markers tend to be markers of few clusters based on DE analysis but, in reality, they are also expressed in other clusters which might need to be considered.

In [None]:
# TODO: `[TO BE ADDED]` is a placeholder and not valid Python — fill in the accepted
# population labels (values of the 'assigned_cats' obs column used below) before running.
list_accepted_clusters_mouse_full = [TO BE ADDED]

In [None]:
# Full (all populations) datasets, i.e. the `_processed` objects loaded in the
# "Anndata loading" section. The previous list referenced variables that are never
# defined in this notebook, which raises NameError on a fresh kernel.
# NOTE(review): confirm these eight objects are the intended members and order.
list_all_datasets_mouse_full = [
    abassi_2020_ctrl_mouse,
    boothby_2021_ctrl_mouse,
    buechler_2021_ctrl_mouse,
    haensel_2021_ctrl_mouse_wounding,
    joost_2020_ctrl_mouse,
    phan_2020_ctrl_mouse_21d,
    shook_2020_ctrl_mouse,
    vorstandlechner_2021_ctrl_mouse,
]

# Label each dataset as "<Author> <Year>", read from the first cell's metadata.
list_names_mouse_full = [adata.obs['Author'].values[0] + ' ' + str(int(adata.obs['Year'].values[0])) for adata in list_all_datasets_mouse_full]

In [None]:
# Persist the list of full datasets so other notebooks can restore it with `%store -r`.
%store list_all_datasets_mouse_full

In [None]:
# Overview plot for 'assigned_cats' across the full datasets, labelled with the full names.
plot_UMAPS_gene('assigned_cats', list_datasets_mouse=list_all_datasets_mouse_full, list_names_mouse=list_names_mouse_full)

In [None]:
# TODO: `[TO BE ADDED]` is a placeholder and not valid Python — fill in the gene names.
# NOTE: this rebinds `genes`, replacing the list defined in the fibroblast section above.
genes = [TO BE ADDED]

In [None]:
# NOTE(review): make_dicts_fraction_mean labels its rows with the global `list_names_mouse`,
# not `list_names_mouse_full`; the tables are only labelled correctly if both lists have
# the same length and dataset order — confirm.
dict_fraction_cells_full, dict_mean_exp_full = make_dicts_fraction_mean(genes, list_all_datasets_mouse_full, list_accepted_clusters_mouse_full, clusterby='assigned_cats')

In [None]:
# For every gene: print its name, draw the population dot plot (rotated labels), then the
# per-dataset UMAP panels on a 5 x 5 grid.
for gene in genes:
    print(gene)
    plot_dotplot_gene(gene, dict_fraction_cells_full, dict_mean_exp_full, rotate=True)
    # The panels show the *full* datasets, so they must be titled with the matching
    # `list_names_mouse_full` (previously the fibroblast-section `list_names_mouse`).
    plot_UMAPS_gene(gene, list_datasets_mouse=list_all_datasets_mouse_full, list_names_mouse=list_names_mouse_full, n_cols=5, n_rows=5)
    plt.show()