# Obtaining robust cell population markers, and redefining/reassigning the biased cell populations

**TO RUN THIS NOTEBOOK YOU FIRST HAVE TO RUN THE 3H NOTEBOOK FULLY!!!**

## imports

In [None]:
# WARNING: this cell blocks the kernel for 24 hours (24 * 3600 seconds) before
# any later cell can run.
# NOTE(review): presumably a delay so the prerequisite 3H notebook can finish
# first — confirm, and skip this cell when its outputs are already available.
import time
time.sleep(24*3600)

In [None]:
# Enable the IPython autoreload extension; mode 2 re-imports all modules before
# each cell execution, so edits to local modules are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [None]:
import scanpy as sc
import scanpy.external as sce
import pandas as pd
import numpy as np
import os
import triku as tk
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from tqdm.notebook import tqdm
import scipy.sparse as spr
import networkx as nx

In [None]:
!pip install cellassign

In [None]:
# local imports and imports from other notebooks
from cellassign import assign_cats
from fb_functions import make_gene_scoring_with_expr, plot_score_graph, plot_UMAPS_gene, plot_adata_cluster_properties, make_dicts_fraction_mean, plot_dotplot_gene
%store -r dict_colors_mouse
%store -r seed
%store -r magma
%store -r data_dir
%store -r dict_cats_fb_mouse

In [None]:
# Figures are rendered at 120 dpi; pandas floats are shown with thousands
# separators and two decimals.
mpl.rcParams.update({'figure.dpi': 120})
pd.set_option('display.float_format', "{:,.2f}".format)

**IMPORTANT: I am running this analysis on a computer with ~500 GB of RAM. I will load many datasets at once, which might be too much for some computers. I took this decision consciously, to have as much information available at any time as possible. If you cannot run the whole analysis at once, you can run it in parts.**

## Anndata loading

In [None]:
# Abbasi 2020: load the `_processed` AnnData file and its `_fb_robust` counterpart.
abbasi_2020_dir = f"{data_dir}/abassi_2020"
abassi_2020_ctrl_mouse = sc.read(
    f"{abbasi_2020_dir}/abassi_2020_ctrl_mouse_processed.h5"
)
abassi_2020_ctrl_mouse_fb = sc.read(
    f"{abbasi_2020_dir}/abassi_2020_ctrl_mouse_fb_robust.h5"
)

In [None]:
# Boothby 2021: load the `_processed` AnnData file and its `_fb_robust` counterpart.
boothby_2021_dir = f"{data_dir}/boothby_2021"
boothby_2021_ctrl_mouse = sc.read(
    f"{boothby_2021_dir}/boothby_2021_ctrl_mouse_processed.h5"
)
boothby_2021_ctrl_mouse_fb = sc.read(
    f"{boothby_2021_dir}/boothby_2021_ctrl_mouse_fb_robust.h5"
)

In [None]:
# Buechler 2021: load the `_processed` AnnData file and its `_fb_robust` counterpart.
buechler_2021_dir = f"{data_dir}/buechler_2021"
buechler_2021_ctrl_mouse = sc.read(
    f"{buechler_2021_dir}/buechler_2021_ctrl_mouse_processed.h5"
)
buechler_2021_ctrl_mouse_fb = sc.read(
    f"{buechler_2021_dir}/buechler_2021_ctrl_mouse_fb_robust.h5"
)

In [None]:
# Haensel 2021 (wounding): load the `_processed` AnnData file and its `_fb_robust` counterpart.
haensel_2021_dir = f"{data_dir}/haensel_2021"
haensel_2021_ctrl_mouse_wounding = sc.read(
    f"{haensel_2021_dir}/haensel_2021_ctrl_mouse_wounding_processed.h5"
)
haensel_2021_ctrl_mouse_wounding_fb = sc.read(
    f"{haensel_2021_dir}/haensel_2021_ctrl_mouse_wounding_fb_robust.h5"
)

In [None]:
# Joost 2020: load the `_processed` AnnData file and its `_fb_robust` counterpart.
joost_2020_dir = f"{data_dir}/joost_2020"
joost_2020_ctrl_mouse = sc.read(
    f"{joost_2020_dir}/joost_2020_ctrl_mouse_processed.h5"
)
joost_2020_ctrl_mouse_fb = sc.read(
    f"{joost_2020_dir}/joost_2020_ctrl_mouse_fb_robust.h5"
)

In [None]:
# Phan 2020 (21d): load the `_processed` AnnData file and its `_fb_robust` counterpart.
phan_2020_dir = f"{data_dir}/phan_2020"
phan_2020_ctrl_mouse_21d = sc.read(
    f"{phan_2020_dir}/phan_2020_ctrl_mouse_21d_processed.h5"
)
phan_2020_ctrl_mouse_21d_fb = sc.read(
    f"{phan_2020_dir}/phan_2020_ctrl_mouse_21d_fb_robust.h5"
)

In [None]:
# Shook 2020: load the `_processed` AnnData file and its `_fb_robust` counterpart.
shook_2020_dir = f"{data_dir}/shook_2020"
shook_2020_ctrl_mouse = sc.read(
    f"{shook_2020_dir}/shook_2020_ctrl_mouse_processed.h5"
)
shook_2020_ctrl_mouse_fb = sc.read(
    f"{shook_2020_dir}/shook_2020_ctrl_mouse_fb_robust.h5"
)

In [None]:
# Vorstandlechner 2021: load the `_processed` AnnData file and its `_fb_robust`
# counterpart. Note the directory name is capitalised, unlike the other datasets.
vorstandlechner_2021_dir = f"{data_dir}/Vorstandlechner_2021"
vorstandlechner_2021_ctrl_mouse = sc.read(
    f"{vorstandlechner_2021_dir}/vorstandlechner_2021_ctrl_mouse_processed.h5"
)
vorstandlechner_2021_ctrl_mouse_fb = sc.read(
    f"{vorstandlechner_2021_dir}/vorstandlechner_2021_ctrl_mouse_fb_robust.h5"
)

# Getting the definitive list of genes
This is not *really* definitive. We use the produced markers later on, in the analysis table (not in the notebooks), to make a selection based on gene function or pattern of expression. The *final* list is in the variable **genes** below.

In [None]:
# Restore the dataset lists previously saved with %store.
%store -r list_all_datasets_mouse
%store -r list_datasets_mouse

# Restore the accepted cluster/axis names and the dataset display names.
%store -r list_accepted_clusters_mouse
%store -r list_accepted_axis_mouse
%store -r list_names_mouse

In [None]:
# Gene scoring grouped by 'cluster_robust', with DEG calculation enabled and a
# 'pval' selection threshold of 0.05. The result is indexed by cluster name in
# the cells below.
dict_make_gene_scoring_cluster_robust_mouse = make_gene_scoring_with_expr(
    list_datasets=list_datasets_mouse,
    calculate_DEGs=True,
    group_name='cluster_robust',
    value_ref='scores',
    select_method='pval',
    list_clusters=list_accepted_clusters_mouse,
    select_thres=0.05,
)

In [None]:
# Same scoring as for the robust clusters, but grouped by 'axis_robust' and
# restricted to the accepted axes.
dict_make_gene_scoring_axis_robust_mouse = make_gene_scoring_with_expr(
    list_datasets=list_datasets_mouse,
    calculate_DEGs=True,
    group_name='axis_robust',
    value_ref='scores',
    select_method='pval',
    select_thres=0.05,
    list_clusters=list_accepted_axis_mouse,
)

In [None]:
# Persist both scoring dictionaries with %store so other notebooks can restore
# them with `%store -r`.
%store dict_make_gene_scoring_cluster_robust_mouse
%store dict_make_gene_scoring_axis_robust_mouse

In [None]:
# This cell and the ones that follow each display one cluster's scoring table:
# the first 40 rows (by position) are taken and then re-ordered by index label.
# NOTE(review): the rows are presumably ranked by score, so this shows the top 40
# markers in alphabetical order — confirm against make_gene_scoring_with_expr.
dict_make_gene_scoring_cluster_robust_mouse['a1'].iloc[:40].sort_index()

In [None]:
dict_make_gene_scoring_cluster_robust_mouse['a2'].iloc[:40].sort_index()

In [None]:
dict_make_gene_scoring_cluster_robust_mouse['a/d'].iloc[:40].sort_index()

In [None]:
dict_make_gene_scoring_cluster_robust_mouse['b1'].iloc[:40].sort_index()

In [None]:
dict_make_gene_scoring_cluster_robust_mouse['b2'].iloc[:40].sort_index()

In [None]:
dict_make_gene_scoring_cluster_robust_mouse['b3'].iloc[:40].sort_index()

In [None]:
dict_make_gene_scoring_cluster_robust_mouse['b4'].iloc[:40].sort_index()

In [None]:
dict_make_gene_scoring_cluster_robust_mouse['b5'].iloc[:40].sort_index()

In [None]:
dict_make_gene_scoring_cluster_robust_mouse['b6'].iloc[:40].sort_index()

In [None]:
dict_make_gene_scoring_cluster_robust_mouse['b/c'].iloc[:40].sort_index()

In [None]:
dict_make_gene_scoring_cluster_robust_mouse['c1'].iloc[:40].sort_index()

In [None]:
dict_make_gene_scoring_cluster_robust_mouse['c2'].iloc[:40].sort_index()

In [None]:
dict_make_gene_scoring_cluster_robust_mouse['c/d'].iloc[:40].sort_index()

In [None]:
dict_make_gene_scoring_cluster_robust_mouse['d1'].iloc[:40].sort_index()

In [None]:
dict_make_gene_scoring_cluster_robust_mouse['d2'].iloc[:40].sort_index()

In [None]:
dict_make_gene_scoring_cluster_robust_mouse['d3'].iloc[:40].sort_index()

In [None]:
dict_make_gene_scoring_cluster_robust_mouse['e1'].iloc[:40].sort_index()

## Analyse the presence and expression pattern of marker genes (within fibroblast populations)

In this section we will plot UMAPs and dot plots of expression patterns of gene markers. This is useful because many gene markers tend to be markers of few clusters based on DE analysis but, in reality, they are also expressed in other clusters which might need to be considered.

In [None]:
# `genes` is assigned three times in this cell; only the LAST assignment is used
# by the cells below. The first two lists are earlier candidate sets that are
# kept for reference and immediately superseded.

# Superseded candidate set 1 (sorted, de-duplicated).
genes = sorted({'Col8a1', 'Etv1', 'Cav2', 'Itga6', 'Itgb4', 'Krt19', 'Klf5', 'Ngfr', 'Pear1', 'Dusp5', 'Foxs1'})

# Superseded candidate set 2.
genes = ['6030408B16Rik', 'Vwa1', 'Cp', 'Vit', 'Apod', 'Rasgrp2', 'P2ry14',
        'Ccl11', 'Spp1', 'Ccl9', 'Smoc2', 'Matn2', 'Abca8a', 'Vtn', 'Trf',
        'Cpe', 'Gpc3', '1500009L16Rik', 'Nr2f2', 'Col8a1', 'Gfra1', 'Ebf2',
        'Cxcl1', 'Sparcl1', 'Steap4', 'Col15a1', 'Myoc', 'Pdrg1', 'Itm2a',
        'Mylk'] + ['Cldn1', 'Moxd1', 'Spint2', 'Itga6', 'Sbspon', 'F11r', 'Klf5',
        'Itgb4', 'Tenm2', 'Bcam', 'Etv4', 'Trp63', 'Cxadr', 'Gm973',
        'Bnc2', 'Ptprf', 'Fermt1', 'Npnt', 'Fxyd3', 'Igsf9', 'Cdh3',
        'Lhx2', 'Cav1', 'Trim29', 'Bcl11a', 'Ptprk', 'Ccdc3', 'Nr2f2',
        'Rassf9', 'Ptch1']

# Not referenced by any cell visible in this notebook; kept in case it is used elsewhere.
N = 25

# Final marker list used by the plotting cells below.
# 'Pltp' was listed twice in the original list; the second occurrence is removed
# so the gene is not plotted twice and does not inflate the dot-plot height.
# NOTE(review): 'a' is presumably the one-letter mouse gene symbol (nonagouti),
# not a typo — confirm.
genes = ['Ptgs2', 'Ptx3', 'Gm48942', 'Sfrp2', 'Ifi27l2a', 'Akr1c18', 'Birc5', 'Cdca3', 'Mki67', 'Fabp4', 'Cd36', 'Pltp', 'Hmcn2', 'Fbln7',
         'Thbs4', 'Fgf9', 'Clu', 'Igf1', 'C2', 'Gdf10', 'C4b', 'Cp', 'Vwa1', 'P2ry14', 'Il6', 'Mt2', 'Tnfaip2', 'Crp', 'Sectm1a',
         'Ndufa4l2', 'Ppp1r14a', 'Sulf2', 'Igfbp2', 'Grem1', 'F13a1', 'Coch', 'Tnmd', 'Fmod', 'a', 'Rasd1', 'Wnt10a', 'Scube3', 'Edn3',
         'Corin', 'Lrrc15', 'Tagln', 'Egfl6', 'Itgb4', 'Spint2', 'Itga6', ]

To create the dot plot we will use two variables: the percentage of cells expressing the marker in each cluster, and the mean expression value of the expressing-cells in each cluster. To do that, we will create a df for each case and gene, compile the info for each adata and get the mean/std across the adatas. If one gene is not expressed in an adata, or the adata does not have a certain cluster, the value registered is a NaN, not zero. 

In [None]:
# UMAPs of every dataset coloured by 'cluster_robust', three panels per row.
plot_UMAPS_gene(
    'cluster_robust',
    list_datasets=list_all_datasets_mouse,
    list_names=list_names_mouse,
    n_cols=3,
)

In [None]:
# Build the two inputs of the dot plots for the selected genes, grouped by
# 'cluster_robust' over the accepted clusters.
dict_fraction_cells, dict_mean_exp = make_dicts_fraction_mean(
    genes,
    list_all_datasets=list_all_datasets_mouse,
    list_accepted_clusters=list_accepted_clusters_mouse,
    clusterby='cluster_robust',
    list_names=list_names_mouse,
)

To draw the dot plot we make some adjustments. We rescale the fraction of expressing cells to $x^{2/3}$ to make it slightly more visual. We also apply an alpha of 1 - std(fraction of cells) to account for markers that are expressed disparately across datasets, so that they are visually less prominent.


In [None]:
# For each selected gene: print its name, draw its dot plot, then its UMAPs
# across all datasets (five panels per row), and flush the figures.
for gene in genes:
    print(gene)
    plot_dotplot_gene(gene, dict_fraction_cells, dict_mean_exp)
    plot_UMAPS_gene(
        gene,
        list_datasets=list_all_datasets_mouse,
        list_names=list_names_mouse,
        n_cols=5,
    )
    plt.show()

In [None]:
# plot_dotplot_list_genes is not among the names imported from fb_functions in
# the imports cell, so calling it raised NameError; import it here.
# NOTE(review): assumed to live in fb_functions next to plot_dotplot_gene — confirm.
from fb_functions import plot_dotplot_list_genes

# One combined dot plot for all selected genes; the figure height scales with
# the number of genes (0.41 per gene).
plot_dotplot_list_genes(
    genes,
    dict_fraction_cells,
    dict_mean_exp,
    rotate=False,
    figsize=(10, len(genes) * 0.41),
)

## Analyse the presence and expression pattern of marker genes (within all populations)

In this section we will plot UMAPs and dot plots of expression patterns of gene markers. This is useful because many gene markers tend to be markers of few clusters based on DE analysis but, in reality, they are also expressed in other clusters which might need to be considered.

In [None]:
# The accepted categories for the full datasets are the keys of dict_cats_fb_mouse.
list_accepted_clusters_mouse_full = list(dict_cats_fb_mouse)

In [None]:
# The `_processed` (non-`_fb`) AnnData objects loaded at the top of the notebook.
# NOTE(review): haensel_2021_ctrl_mouse_wounding is loaded earlier but is not
# listed here — confirm the omission is intentional.
list_all_datasets_mouse_full = [
    abassi_2020_ctrl_mouse,
    boothby_2021_ctrl_mouse,
    buechler_2021_ctrl_mouse,
    joost_2020_ctrl_mouse,
    phan_2020_ctrl_mouse_21d,
    shook_2020_ctrl_mouse,
    vorstandlechner_2021_ctrl_mouse,
]


def _dataset_label(adata):
    """Return '<Author> <Year> mouse', read from the first row of ``adata.obs``."""
    author = adata.obs['Author'].values[0]
    year = int(adata.obs['Year'].values[0])
    return author + ' ' + str(year) + ' mouse'


list_names_mouse_full = [_dataset_label(adata) for adata in list_all_datasets_mouse_full]

In [None]:
# UMAPs of every full dataset coloured by 'assigned_cats', three panels per row.
plot_UMAPS_gene(
    'assigned_cats',
    list_datasets=list_all_datasets_mouse_full,
    list_names=list_names_mouse_full,
    n_cols=3,
)

In [None]:
# Same dot-plot inputs as for the fibroblast-only analysis, but computed on the
# full datasets and grouped by 'assigned_cats'.
dict_fraction_cells_full, dict_mean_exp_full = make_dicts_fraction_mean(
    genes,
    list_all_datasets=list_all_datasets_mouse_full,
    list_names=list_names_mouse_full,
    list_accepted_clusters=list_accepted_clusters_mouse_full,
    clusterby='assigned_cats',
)

In [None]:
# For each selected gene: print its name, draw its dot plot over the full
# datasets (with rotate=True), then its UMAPs (five panels per row), and flush
# the figures.
for gene in genes:
    print(gene)
    plot_dotplot_gene(
        gene,
        dict_fraction_cells_full,
        dict_mean_exp_full,
        rotate=True,
    )
    plot_UMAPS_gene(
        gene,
        list_datasets=list_all_datasets_mouse_full,
        list_names=list_names_mouse_full,
        n_cols=5,
    )
    plt.show()