# Obtaining robust cell population markers, and redefining/reassuring the biased cell populations

**TO RUN THIS NOTEBOOK YOU HAVE TO RUN 3H NOTEBOOK FULLY!!!**

## imports

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import scanpy as sc
import scanpy.external as sce
import pandas as pd
import numpy as np
import os
import triku as tk
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from tqdm.notebook import tqdm
import scipy.sparse as spr
import networkx as nx
from kneed import KneeLocator

In [3]:
!pip install cellassign



In [4]:
# local imports and imports from other notebooks
from cellassign import assign_cats
from fb_functions import make_gene_scoring_with_expr, plot_score_graph, plot_UMAPS_gene, plot_adata_cluster_properties, make_dicts_fraction_mean, plot_dotplot_gene, plot_dotplot_list_genes
%store -r dict_colors_human
%store -r seed
%store -r magma
%store -r data_dir
%store -r dict_cats_fb_human

In [5]:
# Display configuration: figure resolution and pandas float formatting (thousands separator, 2 decimals).
mpl.rcParams.update({'figure.dpi': 120})
pd.set_option("display.float_format", "{:,.2f}".format)

**IMPORTANT: I am running this analysis on a computer with ~500 GB of RAM. I will load many datasets at once, which might be too much for some computers. I took this decision consciously, to have as much info available at any time as possible. If you cannot run all the analysis at once, you can run it by parts.**

## Anndata loading

In [6]:
# Ahlers 2022: load the `_fb_robust` object and the full `_processed` object.
ahlers_2022_dir = f'{data_dir}/ahlers_2022'
ahlers_2022_young_human_fb = sc.read(f'{ahlers_2022_dir}/ahlers_2022_young_human_fb_robust.h5')
ahlers_2022_young_human = sc.read(f'{ahlers_2022_dir}/ahlers_2022_young_human_processed.h5')

In [7]:
# Billi 2022: load the fibroblast object and the full `_processed` object.
# NOTE(review): this is the only dataset whose `_fb` object is read from `_fb_processed.h5`
# instead of `_fb_robust.h5` — confirm this is intentional.
billi_2022_dir = f'{data_dir}/billi_2022'
billi_2022_ctrl_human_fb = sc.read(f'{billi_2022_dir}/billi_2022_ctrl_human_fb_processed.h5')
billi_2022_ctrl_human = sc.read(f'{billi_2022_dir}/billi_2022_ctrl_human_processed.h5')

In [8]:
# Boothby 2021: load the `_fb_robust` object and the full `_processed` object.
boothby_2021_dir = f'{data_dir}/boothby_2021'
boothby_2021_ctrl_human_fb = sc.read(f'{boothby_2021_dir}/boothby_2021_ctrl_human_fb_robust.h5')
boothby_2021_ctrl_human = sc.read(f'{boothby_2021_dir}/boothby_2021_ctrl_human_processed.h5')

In [9]:
# Burja 2022: load the `_fb_robust` object and the full `_processed` object.
burja_2022_dir = f'{data_dir}/burja_2022'
burja_2022_ctrl_human_fb = sc.read(f'{burja_2022_dir}/burja_2022_ctrl_human_fb_robust.h5')
burja_2022_ctrl_human = sc.read(f'{burja_2022_dir}/burja_2022_ctrl_human_processed.h5')

In [10]:
# Deng 2021 (scar): load the `_fb_robust` object and the full `_processed` object.
deng_2021_dir = f'{data_dir}/deng_2021'
deng_2021_scar_fb = sc.read(f'{deng_2021_dir}/deng_2021_scar_fb_robust.h5')
deng_2021_scar = sc.read(f'{deng_2021_dir}/deng_2021_scar_processed.h5')

In [11]:
# Gao 2021: load the `_fb_robust` object and the full `_processed` object.
gao_2021_dir = f'{data_dir}/gao_2021'
gao_2021_ctrl_human_fb = sc.read(f'{gao_2021_dir}/gao_2021_ctrl_human_fb_robust.h5')
gao_2021_ctrl_human = sc.read(f'{gao_2021_dir}/gao_2021_ctrl_human_processed.h5')

In [12]:
# Gaydosik 2020: load the `_fb_robust` object and the full `_processed` object.
gaydosik_2020_dir = f'{data_dir}/gaydosik_2020'
gaydosik_2020_ctrl_human_fb = sc.read(f'{gaydosik_2020_dir}/gaydosik_2020_ctrl_human_fb_robust.h5')
gaydosik_2020_ctrl_human = sc.read(f'{gaydosik_2020_dir}/gaydosik_2020_ctrl_human_processed.h5')

In [13]:
# Gur 2022: load the `_fb_robust` object and the full `_processed` object.
gur_2022_dir = f'{data_dir}/gur_2022'
gur_2022_ctrl_human_fb = sc.read(f'{gur_2022_dir}/gur_2022_ctrl_human_fb_robust.h5')
gur_2022_ctrl_human = sc.read(f'{gur_2022_dir}/gur_2022_ctrl_human_processed.h5')

In [14]:
# He 2020: load the `_fb_robust` object and the full `_processed` object.
he_2020_dir = f'{data_dir}/He_2020'
he_2020_ctrl_human_fb = sc.read(f'{he_2020_dir}/he_2020_ctrl_human_fb_robust.h5')
he_2020_ctrl_human = sc.read(f'{he_2020_dir}/he_2020_ctrl_human_processed.h5')

In [15]:
# Hughes 2020: load the `_fb_robust` object and the full `_processed` object.
hughes_2020_dir = f'{data_dir}/hughes_2020'
hughes_2020_ctrl_human_fb = sc.read(f'{hughes_2020_dir}/hughes_2020_ctrl_human_fb_robust.h5')
hughes_2020_ctrl_human = sc.read(f'{hughes_2020_dir}/hughes_2020_ctrl_human_processed.h5')

In [16]:
# Kim 2020: load the `_fb_robust` object and the full `_processed` object.
kim_2020_dir = f'{data_dir}/Kim_2020'
kim_2020_ctrl_human_fb = sc.read(f'{kim_2020_dir}/kim_2020_ctrl_human_fb_robust.h5')
kim_2020_ctrl_human = sc.read(f'{kim_2020_dir}/kim_2020_ctrl_human_processed.h5')

In [17]:
# Liu 2021: load the `_fb_robust` object and the full `_processed` object.
liu_2021_dir = f'{data_dir}/liu_2021'
liu_2021_ctrl_human_fb = sc.read(f'{liu_2021_dir}/liu_2021_ctrl_human_fb_robust.h5')
liu_2021_ctrl_human = sc.read(f'{liu_2021_dir}/liu_2021_ctrl_human_processed.h5')

In [18]:
# Mariottoni 2021: load the `_fb_robust` object and the full `_processed` object.
mariottoni_2021_dir = f'{data_dir}/mariottoni_2021'
mariottoni_2021_ctrl_human_fb = sc.read(f'{mariottoni_2021_dir}/mariottoni_2021_ctrl_human_fb_robust.h5')
mariottoni_2021_ctrl_human = sc.read(f'{mariottoni_2021_dir}/mariottoni_2021_ctrl_human_processed.h5')

In [19]:
# Mirizio 2020 (scl): load the `_fb_robust` object and the full `_processed` object.
mirizio_2020_dir = f'{data_dir}/mirizio_2020'
mirizio_2020_scl_human_fb = sc.read(f'{mirizio_2020_dir}/mirizio_2020_scl_human_fb_robust.h5')
mirizio_2020_scl_human = sc.read(f'{mirizio_2020_dir}/mirizio_2020_scl_human_processed.h5')

In [20]:
# Reynolds 2021: only the `_fb_robust` object is loaded.
reynolds_2021_dir = f'{data_dir}/reynolds_2021'
reynolds_2021_ctrl_human_fb = sc.read(f'{reynolds_2021_dir}/reynolds_2021_ctrl_human_fb_robust.h5')
# The dataset with the rest of the cells is not available. It could be used, but since
# the dataset does not have good quality it is not fully relevant.

In [21]:
# Rindler 2021: load the `_fb_robust` object and the full `_processed` object.
rindler_2021_dir = f'{data_dir}/rindler_2021'
rindler_2021_ctrl_human_fb = sc.read(f'{rindler_2021_dir}/rindler_2021_ctrl_human_fb_robust.h5')
rindler_2021_ctrl_human = sc.read(f'{rindler_2021_dir}/rindler_2021_ctrl_human_processed.h5')

In [22]:
# Solé-Boldo 2020 (young): load the `_fb_robust` object and the full `_processed` object.
sole_2020_dir = f'{data_dir}/Sole-Boldo_2020'
sole_2020_young_human_fb = sc.read(f'{sole_2020_dir}/sole_2020_young_human_fb_robust.h5')
sole_2020_young_human = sc.read(f'{sole_2020_dir}/sole_2020_young_human_processed.h5')

In [23]:
# Tabib 2018: load the `_fb_robust` object and the full `_processed` object.
tabib_2018_dir = f'{data_dir}/Tabib_2018'
tabib_2018_ctrl_human_fb = sc.read(f'{tabib_2018_dir}/tabib_2018_ctrl_human_fb_robust.h5')
tabib_2018_ctrl_human = sc.read(f'{tabib_2018_dir}/tabib_2018_ctrl_human_processed.h5')

In [24]:
# Tabib 2021: load the `_fb_robust` object and the full `_processed` object.
tabib_2021_dir = f'{data_dir}/Tabib_2021'
tabib_2021_ctrl_human_fb = sc.read(f'{tabib_2021_dir}/tabib_2021_ctrl_human_fb_robust.h5')
tabib_2021_ctrl_human = sc.read(f'{tabib_2021_dir}/tabib_2021_ctrl_human_processed.h5')

In [25]:
# Theocharidis 2020: load the `_fb_robust` object and the full `_processed` object.
# The directory string already ends with '/', so the file names are joined with
# os.path.join to avoid a doubled separator ('//') in the resulting paths.
theo_2020_dir = data_dir + '/Theocharidis_2020/'
theo_2020_ctrl_human_dm_fb = sc.read(os.path.join(theo_2020_dir, 'theo_2020_ctrl_human_dm_fb_robust.h5'))
theo_2020_ctrl_human_dm = sc.read(os.path.join(theo_2020_dir, 'theo_2020_ctrl_human_dm_processed.h5'))

In [26]:
# Theocharidis 2021: load the `_fb_robust` object and the full `_processed` object.
# The directory string already ends with '/', so the file names are joined with
# os.path.join to avoid a doubled separator ('//') in the resulting paths.
theo_2021_dir = data_dir + '/Theocharidis_2021/'
theo_2021_ctrl_human_fb = sc.read(os.path.join(theo_2021_dir, 'theo_2021_ctrl_human_fb_robust.h5'))
theo_2021_ctrl_human = sc.read(os.path.join(theo_2021_dir, 'theo_2021_ctrl_human_processed.h5'))

In [27]:
# Vorstandlechner 2020: load the `_fb_robust` object and the full `_processed` object.
vors_2020_dir = f'{data_dir}/Vorstandlechner_2020'
vors_2020_ctrl_human_fb = sc.read(f'{vors_2020_dir}/vors_2020_ctrl_human_fb_robust.h5')
vors_2020_ctrl_human = sc.read(f'{vors_2020_dir}/vors_2020_ctrl_human_processed.h5')

In [28]:
# Vorstandlechner 2021: load the `_fb_robust` object and the full `_processed` object.
vors_2021_dir = f'{data_dir}/Vorstandlechner_2021'
vors_2021_ctrl_human_fb = sc.read(f'{vors_2021_dir}/vors_2021_ctrl_human_fb_robust.h5')
vors_2021_ctrl_human = sc.read(f'{vors_2021_dir}/vors_2021_ctrl_human_processed.h5')

In [29]:
# Xu 2021: load the `_fb_robust` object and the full `_processed` object.
xu_2021_dir = f'{data_dir}/xu_2021'
xu_2021_ctrl_human_fb = sc.read(f'{xu_2021_dir}/xu_2021_ctrl_human_fb_robust.h5')
xu_2021_ctrl_human = sc.read(f'{xu_2021_dir}/xu_2021_ctrl_human_processed.h5')

# Getting the definitive list of genes
This list is not *really* definitive. The markers produced here are later filtered in the analysis table (outside the notebooks) based on gene function or pattern of expression. The *final* list is in the variable **genes** below.

In [31]:
# Restore variables persisted with %store in a previous session/notebook
# (the header of this notebook states that notebook 3H must be run first).
# Dataset lists used by the scoring and plotting cells below.
%store -r list_all_datasets_human
%store -r list_datasets_human

# Accepted cluster / axis labels and the display names of the datasets.
%store -r list_accepted_clusters_human
%store -r list_accepted_axis_human
%store -r list_names_human

In [None]:
# Gene scoring grouped by `cluster_robust`, restricted to the accepted clusters;
# selection by p-value with a 0.05 threshold.
dict_make_gene_scoring_cluster_robust_human = make_gene_scoring_with_expr(
    list_datasets=list_datasets_human,
    calculate_DEGs=True,
    group_name='cluster_robust',
    value_ref='scores',
    select_method='pval',
    list_clusters=list_accepted_clusters_human,
    select_thres=0.05,
)

In [None]:
# Gene scoring grouped by `axis_robust`, restricted to the accepted axes;
# selection by p-value with a 0.05 threshold.
dict_make_gene_scoring_axis_robust_human = make_gene_scoring_with_expr(
    list_datasets=list_datasets_human,
    calculate_DEGs=True,
    group_name='axis_robust',
    value_ref='scores',
    select_method='pval',
    select_thres=0.05,
    list_clusters=list_accepted_axis_human,
)

In [None]:
# Persist both scoring dictionaries with %store so they can be restored (%store -r) elsewhere.
%store dict_make_gene_scoring_cluster_robust_human
%store dict_make_gene_scoring_axis_robust_human

In [None]:
# Index labels of the first 50 rows for cluster E1, sorted by index.
dict_make_gene_scoring_cluster_robust_human['E1'].head(50).sort_index().index

In [None]:
# First 50 rows for cluster A1, displayed sorted by index.
dict_make_gene_scoring_cluster_robust_human['A1'].head(50).sort_index()

In [None]:
# First 50 rows for cluster A2, displayed sorted by index.
dict_make_gene_scoring_cluster_robust_human['A2'].head(50).sort_index()

In [None]:
# First 50 rows for cluster A3, displayed sorted by index.
dict_make_gene_scoring_cluster_robust_human['A3'].head(50).sort_index()

In [None]:
# First 50 rows for cluster A4, displayed sorted by index.
dict_make_gene_scoring_cluster_robust_human['A4'].head(50).sort_index()

In [None]:
# First 50 rows for cluster B1, displayed sorted by index.
dict_make_gene_scoring_cluster_robust_human['B1'].head(50).sort_index()

In [None]:
# First 50 rows for cluster B2, displayed sorted by index.
dict_make_gene_scoring_cluster_robust_human['B2'].head(50).sort_index()

In [None]:
# First 50 rows for cluster B3, displayed sorted by index.
dict_make_gene_scoring_cluster_robust_human['B3'].head(50).sort_index()

In [None]:
# First 50 rows for cluster B4, displayed sorted by index.
dict_make_gene_scoring_cluster_robust_human['B4'].head(50).sort_index()

In [None]:
# First 50 rows for cluster C1, displayed sorted by index.
dict_make_gene_scoring_cluster_robust_human['C1'].head(50).sort_index()

In [None]:
# First 50 rows for cluster C2, displayed sorted by index.
dict_make_gene_scoring_cluster_robust_human['C2'].head(50).sort_index()

In [None]:
# First 50 rows for cluster C3, displayed sorted by index.
dict_make_gene_scoring_cluster_robust_human['C3'].head(50).sort_index()

In [None]:
# First 50 rows for cluster C5, displayed sorted by index.
dict_make_gene_scoring_cluster_robust_human['C5'].head(50).sort_index()

In [None]:
# First 50 rows for cluster D1, displayed sorted by index.
dict_make_gene_scoring_cluster_robust_human['D1'].head(50).sort_index()

In [None]:
# First 50 rows for cluster D2, displayed sorted by index.
dict_make_gene_scoring_cluster_robust_human['D2'].head(50).sort_index()

In [None]:
# First 50 rows for cluster D2, displayed sorted by index.
# NOTE(review): this cell repeats the 'D2' lookup of the preceding cell — confirm whether
# a different cluster key was intended here or the cell can be dropped.
dict_make_gene_scoring_cluster_robust_human['D2'].head(50).sort_index()

In [None]:
# First 50 rows for cluster E1, displayed sorted by index.
dict_make_gene_scoring_cluster_robust_human['E1'].head(50).sort_index()

In [None]:
# First 50 rows for cluster T1, displayed sorted by index.
dict_make_gene_scoring_cluster_robust_human['T1'].head(50).sort_index()

In [None]:
# This variable is useful for the Ligand-Receptor notebook, look all the details there
dict_knee_markers = {}

for clus in dict_make_gene_scoring_cluster_robust_human.keys():
    kneedle = KneeLocator(range(len(dict_make_gene_scoring_cluster_robust_human[clus])), dict_make_gene_scoring_cluster_robust_human[clus].Z, 
                          S=1.0, curve="convex", direction="decreasing")
    
    kneedle = min(max(70, int(kneedle.knee * 6)), 350)
    dict_knee_markers[clus] = dict_make_gene_scoring_cluster_robust_human[clus].iloc[:kneedle].index.values

%store dict_knee_markers

## Analyse the presence and expression pattern of marker genes (within fibroblast populations)

In this section we will plot UMAPs and dot plots of expression patterns of gene markers. This is useful because many gene markers tend to be markers of few clusters based on DE analysis but, in reality, they are also expressed in other clusters which might need to be considered.

In [34]:
# Curated list of marker genes used for the dot plots and UMAPs below.
# NOTE(review): the following entries do not look like single gene symbols and may not match
# any gene name in the datasets — confirm against the analysis table:
# 'C2orf40/ECRG4', 'PTGS1/COX1', 'SDPR/CAVIN2', 'WISP2 / CCN5', 'HLA-DP1', 'MDH', 'NKI67' (MKI67?).
genes = ['A2M', 'AADAC', 'ABCA10', 'ABCA6', 'ABCA8', 'ABCA9', 'ABCC9', 'ABI3BP', 'ACAN', 'ACE', 'ACHE', 'ACKR2', 'ACKR3', 'ACKR4', 'ACTN1', 'ADA', 'ADAM12', 'ADAMTS1', 'ADAMTS18', 'ADAMTS2', 'ADAMTS4', 'ADAMTS5', 'ADAMTS9', 'ADAMTSL1', 'ADAMTSL3', 'ADAMTSL4', 'ADAMTSL5', 'ADCYAP1R1', 'ADGRD1', 'ADGRE2', 'ADGRE5', 'ADGRG2', 'ADRA2A', 'AEBP1', 'AFAP1L2', 'AGTR1', 'AHRR', 'AIF1L', 'AKAP12', 'AKAP6', 'ALDH1A3', 'ALPL', 'ALX4', 'ANGPTL1', 'ANGPTL5', 'ANGPTL7', 'ANKRD29', 'ANOS1', 'ANTXR1', 'ANTXR2', 'ANXA1', 'AOPEP', 'APBB1IP', 'APCDD1', 'APELA', 'APOC1', 'APOD', 'APOE', 'APP', 'AQP1', 'AQP3', 'ARF4', 'ARHGAP15', 'ARHGAP28', 'ARHGDIB', 'ARL4A', 'ARPC1B', 'ASPN', 'ATP10A', 'ATP1A2', 'ATP1B1', 'AXIN2', 'AXL', 'BAMBI', 'BASP1', 'BCL11B', 'BCL3', 'BGN', 'BHLHE40', 'BHLHE41', 'BIRC3', 'BMP1', 'BMP7', 'BMPER', 'BNC2', 'BST1', 'BTBD11', 'C10orf105', 'C11orf96', 'C19orf33', 'C1QTNF3', 'C1QTNF6', 'C1S', 'C1orf198', 'C2orf40/ECRG4', 'C3', 'C4orf48', 'C6', 'C7', 'C9orf3', 'CA12', 'CADM1', 'CADM2', 'CADM3', 'CALD1', 'CALM2', 'CAPN6', 'CAV1', 'CAV2', 'CAVIN2', 'CBFA2T3', 'CCBE1', 'CCDC146', 'CCDC80', 'CCK', 'CCL13', 'CCL19', 'CCL2', 'CCND1', 'CD109', 'CD151', 'CD200', 'CD200R1', 'CD248', 'CD34', 'CD40', 'CD47', 'CD55', 'CD63', 'CD70', 'CD74', 'CD81', 'CD9', 'CD99', 'CDA', 'CDH11', 'CDKN1A', 'CDKN2B', 'CENPW', 'CERCAM', 'CES1', 'CFD', 'CFH', 'CFHR1', 'CFL1', 'CGNL1', 'CGREF1', 'CH25H', 'CHADL', 'CHCHD10', 'CHD1', 'CHN1', 'CHPF', 'CHRDL1', 'CHST15', 'CILP', 'CLDN1', 'CLEC14A', 'CLEC2A', 'CLEC3B', 'CLSTN3', 'CNN2', 'CNN3', 'CNTN1', 'CNTN4', 'COCH', 'COL11A1', 'COL13A1', 'COL14A1', 'COL15A1', 'COL16A1', 'COL18A1', 'COL1A2', 'COL21A1', 'COL23A1', 'COL24A1', 'COL27A1', 'COL28A1', 'COL3A1', 'COL4A1', 'COL4A2', 'COL4A4', 'COL5A1', 'COL5A2', 'COL6A1', 'COL6A2', 'COL6A3', 'COL6A5', 'COL7A1', 'COL8A1', 'COL8A2', 'COL9A3', 'COMP', 'COPZ2', 'CORIN', 'COX4I2', 'CPE', 'CPNE5', 'CPVL', 'CPXM2', 'CPZ', 'CRABP1', 'CRABP2', 'CREB5', 'CRIP1', 'CRISPLD1', 'CRLF1', 'CSF1', 'CSPG4', 'CSRNP1',
'CSRP1', 'CSRP2', 'CTHRC1', 'CTNNAL1', 'CTSB', 'CTSH', 'CTSK', 'CTTNBP2', 'CX3CL1', 'CXCL1', 'CXCL12', 'CXCL2', 'CXCL3', 'CXCR4', 'CYBRD1', 'CYGB', 'CYP1B1', 'CYP26B1', 'CYP4B1', 'CYP7B1', 'CYYR1', 'DAAM2', 'DACT2', 'DBN1', 'DCN', 'DCXR', 'DDAH2', 'DDIT4', 'DEPTOR', 'DHRS3', 'DIO2', 'DKK2', 'DMD', 'DNAJA1', 'DOCK9', 'DOK6', 'DPEP1', 'DPP4', 'DPT', 'DUSP10', 'DUSP5', 'EBF2', 'ECM1', 'EDIL3', 'EDN3', 'EDNRA', 'EDNRB', 'EEF1A1', 'EFEMP1', 'EFNA1', 'EFNA5', 'EFNB1', 'EGFL6', 'EGFLAM', 'EGFR', 'EGR1', 'EGR2', 'EGR3', 'ELL2', 'ELN', 'EMB', 'EMID1', 'EMILIN2', 'EMP2', 'EMP3', 'EMX2', 'ENDOD1', 'ENHO', 'ENTPD2', 'EPHA3', 'EPHB6', 'EPS8', 'ERRFI1', 'ETV1', 'ETV4', 'EVA1A', 'EZR', 'F13A1', 'F2R', 'F2RL2', 'F3', 'F5', 'FABP3', 'FABP4', 'FABP5', 'FAM13A', 'FAM180B', 'FAM3C', 'FBLN1', 'FBLN2', 'FBN1', 'FBXO32', 'FGF13', 'FGF7', 'FGFBP2', 'FGFR2', 'FGL2', 'FHL2', 'FIBIN', 'FKBP9', 'FLNC', 'FMO1', 'FMO2', 'FMO3', 'FMOD', 'FNBP1L', 'FNDC1', 'FOS', 'FOSL1', 'FOXC2', 'FOXD2', 'FOXP1', 'FOXS1', 'FRMD4B', 'FRMD6', 'FST', 'FSTL1', 'FXYD6', 'FZD1', 'FZD2', 'FZD4', 'FZD6', 'FZD7', 'GAB1', 'GALNT15', 'GAS6', 'GATA3', 'GCH1', 'GDF10', 'GDF15', 'GEM', 'GFPT2', 'GFRA1', 'GFRA2', 'GGT5', 'GHR', 'GLRB', 'GLT8D2', 'GNAO1', 'GPC1', 'GPC3', 'GPM6B', 'GPNMB', 'GPX3', 'GREB1L', 'GREM2', 'GRN', 'GSN', 'HAPLN1', 'HAS1', 'HAS2', 'HBEGF', 'HEG1', 'HGF', 'HHIP', 'HIC1', 'HLA-B', 'HLA-C', 'HLA-DP1', 'HLA-DRB1', 'HLA-F', 'HMCN1', 'HMGA1', 'HMGCLL1', 'HMOX1', 'HOPX', 'HPGD', 'HPSE2', 'HRH1', 'HS3ST6', 'HSD3B7', 'HSPB3', 'HSPB6', 'HSPG2', 'HTRA1', 'HTRA3', 'ICAM1', 'ICAM2', 'ID4', 'IER3', 'IFI16', 'IGF1', 'IGFBP2', 'IGFBP3', 'IGFBP5', 'IGFBP6', 'IGFBP7', 'IGSF10', 'IL11RA', 'IL15', 'IL15RA', 'IL32', 'IL33', 'IL34', 'IL6', 'INHBA', 'INMT', 'IRF1', 'IRF8', 'ISLR', 'ISM1', 'ISYNA1', 'ITGA6', 'ITGB1', 'ITGB4', 'ITGBL1', 'ITIH5', 'ITM2A', 'ITM2B', 'JAG1', 'JAK3', 'JAM3', 'JUNB', 'KCNJ8', 'KCNK2', 'KCNQ3', 'KDELR3', 'KDM6B', 'KIAA1217', 'KIF26B', 'KLF5', 'KLK1', 'KLK4', 'KPNA2', 'KRT15', 'KRT17', 'KRT19',
'KRT31', 'KRT6A', 'KRT6B', 'KRT85', 'KTN1', 'LAMP1', 'LBH', 'LDB2', 'LDLR', 'LEF1', 'LEPR', 'LGMN', 'LGR4', 'LIMS2', 'LINC00327', 'LINC01133', 'LMO4', 'LMO7', 'LOX', 'LOXL1', 'LOXL2', 'LOXL4', 'LPAR4', 'LRIG1', 'LRP1', 'LRRC15', 'LRRC17', 'LRRN3', 'LRRN4CL', 'LSAMP', 'LSP1', 'LTBP2', 'LTBP4', 'LUM', 'LUZP2', 'LXN', 'MAB21L2', 'MAFB', 'MAFF', 'MAGI1', 'MAMDC2', 'MAP2', 'MAP7', 'MARCKSL1', 'MARVELD1', 'MATN2', 'MATN4', 'MCTP1', 'MDFIC', 'MDH', 'MDK', 'MEF2C', 'MEGF6', 'MEIS2', 'MEOX2', 'METRNL', 'MFAP2', 'MFAP4', 'MFAP5', 'MGLL', 'MGP', 'MGST1', 'MICAL2', 'MKX', 'MME', 'MMP11', 'MMP14', 'MMP16', 'MMP2', 'MMP27', 'MRAS', 'MRPS6', 'MSC', 'MTCL1', 'MTSS1', 'MTUS1', 'MXRA5', 'MXRA8', 'MYC', 'MYH9', 'MYO10', 'MYOC', 'NBL1', 'NCAM1', 'NDNF', 'NDP', 'NDRG1', 'NDRG2', 'NECAB1', 'NFATC1', 'NFATC2', 'NFE2L2', 'NFIB', 'NFIL3', 'NFKB1', 'NFKBIA', 'NFKBIZ', 'NGFR', 'NHSL1', 'NID1', 'NID2', 'NKD1', 'NKD2', 'NKI67', 'NLGN4X', 'NOCT', 'NOTUM', 'NOVA1', 'NPM1', 'NPTX2', 'NPY1R', 'NR1H3', 'NR2F1', 'NR2F2', 'NR4A3', 'NREP', 'NRN1', 'NRP1', 'NRP2', 'NTM', 'NUAK1', 'NUCB2', 'NXN', 'OGN', 'OLFML2A', 'OLFML2B', 'OMD', 'OSBP2', 'OSMR', 'P2RY14', 'P3H2', 'P4HA2', 'P4HA3', 'PAFAH1B3', 'PALLD', 'PALMD', 'PAM', 'PAMR1', 'PAPPA', 'PARD6G', 'PAWR', 'PCDH19', 'PCOLCE', 'PCOLCE2', 'PCSK1N', 'PCSK6', 'PCSK9', 'PDE1A', 'PDGFC', 'PDGFD', 'PDGFRA', 'PDGFRB', 'PDGFRL', 'PEAR1', 'PERP', 'PFN2', 'PGF', 'PGM2L1', 'PHACTR3', 'PHGDH', 'PHLDA1', 'PI16', 'PIEZO2', 'PIM1', 'PIM3', 'PKP4', 'PLA2G2A', 'PLA2G5', 'PLAT', 'PLAUR', 'PLCB1', 'PLEKHA4', 'PLEKHA6', 'PLK2', 'PLPP1', 'PLPP5', 'PLSCR1', 'PLSCR4', 'PLTP', 'PLXDC1', 'PLXDC2', 'PLXNB2', 'PMEL', 'PMEPA1', 'PNP', 'PNRC1', 'PODN', 'PODNL1', 'POSTN', 'PPARG', 'PPDPFL', 'PPIB', 'PPIC', 'PPL', 'PPP1R14A', 'PPP1R15A', 'PPP1R15B', 'PRDM8', 'PRELP', 'PREX1', 'PREX2', 'PRG4', 'PRKG2', 'PRLR', 'PROCR', 'PROS1', 'PRR5L', 'PRSS23', 'PSAT1', 'PTCH1', 'PTCH2', 'PTGDS', 'PTGER3', 'PTGFR', 'PTGIS', 'PTGS1', 'PTGS1/COX1', 'PTGS2', 'PTH1R', 'PTK7', 'PTMA', 'PTN',
'PTPRD', 'PTPRE', 'PTPRK', 'PTX3', 'PXDNL', 'PXN', 'PYCR1', 'QPCT', 'RAB32', 'RAB34', 'RAMP1', 'RAMP2', 'RAMP3', 'RANBP3L', 'RARRES1', 'RARRES2', 'RBFOX1', 'RBP1', 'RBP4', 'RBP5', 'RCN3', 'REL', 'RGCC', 'RGS16', 'RHPN1', 'RNF152', 'ROBO1', 'ROBO2', 'ROR2', 'RRBP1', 'RSPO1', 'RSPO3', 'RSPO4', 'RUNX2', 'RUNX3', 'S100B', 'SATB2', 'SBSPON', 'SCARA3', 'SCARA5', 'SCARF2', 'SCN4B', 'SCN7A', 'SCRG1', 'SDC1', 'SDC2', 'SDC3', 'SDK1', 'SDPR/CAVIN2', 'SEMA3B', 'SEMA3C', 'SEMA3E', 'SERF2', 'SERPINA5', 'SERPINF1', 'SERPING1', 'SESN3', 'SFRP1', 'SFRP2', 'SFRP4', 'SGCA', 'SGIP1', 'SH3BGRL3', 'SHISA3', 'SHISAL1', 'SHOX2', 'SIRPA', 'SIX1', 'SLC16A3', 'SLC22A16', 'SLC22A3', 'SLC26A7', 'SLC29A1', 'SLC2A1', 'SLC2A3', 'SLC38A10', 'SLC40A1', 'SLC48A1', 'SLC5A3', 'SLC9A3R2', 'SLCO2B1', 'SLIT2', 'SLIT3', 'SLITRK6', 'SLPI', 'SMOC2', 'SMS', 'SMURF2', 'SNAI2', 'SNED1', 'SOCS3', 'SOD2', 'SORCS2', 'SOSTDC1', 'SOX4', 'SOX8', 'SOX9', 'SPARC', 'SPARCL1', 'SPHK1', 'SPON1', 'SPON2', 'SPRY1', 'SPRY2', 'SPSB1', 'SRPX', 'STC1', 'STC2', 'STMN1', 'STMN2', 'STXBP6', 'SULF2', 'SULT1A1', 'SUSD2', 'SVEP1', 'SYNE2', 'TAC1', 'TAGLN', 'TBX15', 'TBX3', 'TBXA2R', 'TCEAL2', 'TCF4', 'TCF7L2', 'TCHH', 'TENM2', 'TENM3', 'TFAP2A', 'TFAP2C', 'TGFB3', 'TGFBI', 'TGFBR2', 'TGFBR3', 'THBD', 'THBS2', 'THBS3', 'THBS4', 'THSD4', 'TIAM1', 'TIMP1', 'TIMP2', 'TIMP3', 'TJP1', 'TLN2', 'TM4SF1', 'TMEM119', 'TMEM135', 'TMEM150C', 'TMEM176A', 'TMEM176B', 'TMEM204', 'TMEM52', 'TMTC2', 'TNC', 'TNFAIP2', 'TNFAIP3', 'TNFAIP6', 'TNFRSF10B', 'TNFRSF11B', 'TNFRSF19', 'TNFSF10', 'TNFSF13B', 'TNFSF14', 'TNMD', 'TNN', 'TNNC1', 'TNNT3', 'TNS3', 'TNXB', 'TPD52', 'TPM2', 'TPPP3', 'TRAC', 'TRIB2', 'TRIL', 'TRIM47', 'TRIO', 'TRPM3', 'TRPS1', 'TSHZ3', 'TSPAN11', 'TSPAN13', 'TSPAN4', 'TSPAN7', 'TSPAN8', 'TTR', 'TUBA4A', 'TUBB4A', 'TWIST2', 'TXNIP', 'TYMP', 'UACA', 'UAP1', 'UBD', 'UCHL1', 'UGDH', 'UGT3A2', 'VASN', 'VCAM1', 'VCAN', 'VEGFA', 'VEGFB', 'VEGFD', 'VGLL3', 'VIPR2', 'VKORC1', 'VWA1', 'WFDC1', 'WIF1', 'WISP2', 'WISP2 / CCN5', 'WNT10A',
'WNT11', 'WNT2', 'WNT5A', 'WTAP', 'XG', 'YWHAH', 'ZC2HC1C', 'ZC3H12A', 'ZFP36', 'ZNF385A', 'ZNF608', 'ZYX']

To create the dot plot we will use two variables: the percentage of cells expressing the marker in each cluster, and the mean expression value of the expressing-cells in each cluster. To do that, we will create a df for each case and gene, compile the info for each adata and get the mean/std across the adatas. If one gene is not expressed in an adata, or the adata does not have a certain cluster, the value registered is a NaN, not zero. 

In [None]:
# UMAPs coloured by `cluster_robust` for every dataset, laid out in 5 columns.
plot_UMAPS_gene(
    'cluster_robust',
    list_datasets=list_all_datasets_human,
    list_names=list_names_human,
    n_cols=5,
)

In [None]:
# Per-gene fraction / mean-expression dictionaries by `cluster_robust`;
# the "T1" cluster is excluded from the accepted clusters.
dict_fraction_cells, dict_mean_exp = make_dicts_fraction_mean(
    genes,
    list_all_datasets=list_all_datasets_human,
    list_accepted_clusters=[clus for clus in list_accepted_clusters_human if clus != "T1"],
    list_names=list_names_human,
    clusterby='cluster_robust',
)

To draw the dot plot we make some adjustments. We transform the fraction of expressing cells to $x^{2/3}$ to make it slightly more visual. We also apply an alpha of 1 - std(fraction of cells) to account for markers whose expression is disparate across datasets, so that they become visually less prominent.


In [None]:
# For every marker gene: print its name, then draw its dot plot and its per-dataset UMAPs.
for gene in genes:
    print(gene)
    plot_dotplot_gene(gene, dict_fraction_cells, dict_mean_exp)
    plot_UMAPS_gene(
        gene,
        list_datasets=list_all_datasets_human,
        list_names=list_names_human,
        n_cols=5,
    )
    plt.show()

In [None]:
# Single dot plot with all genes; the figure height scales with the number of genes.
plot_dotplot_list_genes(
    genes,
    dict_fraction_cells,
    dict_mean_exp,
    rotate=False,
    figsize=(10, len(genes) * 0.41),
)

## Analyse the presence and expression pattern of marker genes (within all populations)

In this section we will plot UMAPs and dot plots of expression patterns of gene markers. This is useful because many gene markers tend to be markers of few clusters based on DE analysis but, in reality, they are also expressed in other clusters which might need to be considered.

In [None]:
# Accepted categories for the full datasets: the keys of `dict_cats_fb_human`, in order.
list_accepted_clusters_human_full = list(dict_cats_fb_human)

In [None]:
# Full `_processed` datasets used in this section.
# NOTE(review): `tabib_2018_ctrl_human` is loaded above but is not part of this list —
# confirm whether the omission is intentional.
list_all_datasets_human_full = [
    ahlers_2022_young_human,
    billi_2022_ctrl_human,
    boothby_2021_ctrl_human,
    burja_2022_ctrl_human,
    deng_2021_scar,
    gao_2021_ctrl_human,
    gaydosik_2020_ctrl_human,
    gur_2022_ctrl_human,
    he_2020_ctrl_human,
    hughes_2020_ctrl_human,
    kim_2020_ctrl_human,
    liu_2021_ctrl_human,
    mariottoni_2021_ctrl_human,
    mirizio_2020_scl_human,
    rindler_2021_ctrl_human,
    sole_2020_young_human,
    tabib_2021_ctrl_human,
    theo_2020_ctrl_human_dm,
    theo_2021_ctrl_human,
    vors_2020_ctrl_human,
    vors_2021_ctrl_human,
    xu_2021_ctrl_human,
]

# Display name per dataset: "<Author> <Year> human", read from the first row of `.obs`.
list_names_human_full = [
    f"{adata.obs['Author'].values[0]} {int(adata.obs['Year'].values[0])} human"
    for adata in list_all_datasets_human_full
]

In [None]:
# UMAPs coloured by `assigned_cats` for every full dataset, laid out in 5 columns.
plot_UMAPS_gene(
    'assigned_cats',
    list_datasets=list_all_datasets_human_full,
    list_names=list_names_human_full,
    n_cols=5,
)

In [None]:
# Per-gene fraction / mean-expression dictionaries by `assigned_cats` on the full datasets.
dict_fraction_cells_full, dict_mean_exp_full = make_dicts_fraction_mean(
    genes,
    list_all_datasets=list_all_datasets_human_full,
    list_names=list_names_human_full,
    list_accepted_clusters=list_accepted_clusters_human_full,
    clusterby='assigned_cats',
)

In [None]:
# For every marker gene: print its name, then draw its dot plot and its per-dataset UMAPs
# on the full datasets.
for gene in genes:
    print(gene)
    plot_dotplot_gene(gene, dict_fraction_cells_full, dict_mean_exp_full, rotate=True)
    # The names must be the ones built from `list_all_datasets_human_full`; `list_names_human`
    # belongs to the fibroblast dataset list and does not line up with the full list.
    plot_UMAPS_gene(gene, list_datasets=list_all_datasets_human_full, list_names=list_names_human_full, n_cols=5)
    plt.show()