# Obtaining robust cell population markers, and redefining/reassigning the biased cell populations

## imports

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import scanpy as sc
import scanpy.external as sce
import pandas as pd
import numpy as np
import os
import triku as tk
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from tqdm.notebook import tqdm
import scipy.sparse as spr
import matplotlib.cm as cm
import networkx as nx

In [None]:
!pip install cellassign

In [None]:
# local imports and imports from other notebooks
from cellassign import assign_cats
from fb_functions import make_gene_scoring_with_expr, plot_score_graph
%store -r dict_colors
%store -r seed
%store -r magma
%store -r data_dir

In [None]:
# Notebook-wide display settings: figure resolution and two-decimal,
# thousands-separated float formatting in pandas output.
mpl.rcParams.update({'figure.dpi': 120})
pd.set_option('display.float_format', "{:,.2f}".format)

**IMPORTANT: I am running this analysis on a computer with ~500 GB of RAM. I will load many datasets at once, which might be too much for some computers. I made this decision consciously, to have as much information available at any time as possible. If you cannot run the whole analysis at once, you can run it in parts.**

## Anndata loading

In [None]:
# Ahlers 2022: dataset folder under data_dir and the processed object stored in it.
ahlers_2022_dir = f"{data_dir}/ahlers_2022"
ahlers_2022_young_fb = sc.read(f"{ahlers_2022_dir}/ahlers_2022_young_fb_processed.h5")

In [None]:
# Boothby 2021: dataset folder under data_dir and the processed object stored in it.
boothby_2021_dir = f"{data_dir}/boothby_2021"
boothby_2021_ctrl_fb = sc.read(f"{boothby_2021_dir}/boothby_2021_ctrl_fb_processed.h5")

In [None]:
# Deng 2021: dataset folder under data_dir and the processed object stored in it.
deng_2021_dir = f"{data_dir}/deng_2021"
deng_2021_scar_fb = sc.read(f"{deng_2021_dir}/deng_2021_scar_fb_processed.h5")

In [None]:
# Gao 2021: dataset folder under data_dir and the processed object stored in it.
gao_2021_dir = f"{data_dir}/gao_2021"
gao_2021_ctrl_fb = sc.read(f"{gao_2021_dir}/gao_2021_ctrl_fb_processed.h5")

In [None]:
# Gaydosik 2020: dataset folder under data_dir and the processed object stored in it.
gaydosik_2020_dir = f"{data_dir}/gaydosik_2020"
gaydosik_2020_ctrl_fb = sc.read(f"{gaydosik_2020_dir}/gaydosik_2020_ctrl_fb_processed.h5")

In [None]:
# He 2020: dataset folder under data_dir (capitalised folder name) and the processed object stored in it.
he_2020_dir = f"{data_dir}/He_2020"
he_2020_ctrl_fb = sc.read(f"{he_2020_dir}/he_2020_ctrl_fb_processed.h5")

In [None]:
# Hughes 2020: dataset folder under data_dir and the processed object stored in it.
hughes_2020_dir = f"{data_dir}/hughes_2020"
hughes_2020_ctrl_fb = sc.read(f"{hughes_2020_dir}/hughes_2020_ctrl_fb_processed.h5")

In [None]:
# Kim 2020: dataset folder under data_dir (capitalised folder name) and the processed object stored in it.
kim_2020_dir = f"{data_dir}/Kim_2020"
kim_2020_ctrl_fb = sc.read(f"{kim_2020_dir}/kim_2020_ctrl_fb_processed.h5")

In [None]:
# Kim 2021: dataset folder under data_dir and the processed object stored in it.
kim_2021_dir = f"{data_dir}/kim_2021"
kim_2021_ctrl_fb = sc.read(f"{kim_2021_dir}/kim_2021_ctrl_fb_processed.h5")

In [None]:
# Liu 2021: dataset folder under data_dir and the processed object stored in it.
liu_2021_dir = f"{data_dir}/liu_2021"
liu_2021_ctrl_fb = sc.read(f"{liu_2021_dir}/liu_2021_ctrl_fb_processed.h5")

In [None]:
# Mariottoni 2021: dataset folder under data_dir and the processed object stored in it.
mariottoni_2021_dir = f"{data_dir}/mariottoni_2021"
mariottoni_2021_ctrl_fb = sc.read(f"{mariottoni_2021_dir}/mariottoni_2021_ctrl_fb_processed.h5")

In [None]:
# Mirizio 2020: dataset folder under data_dir and the processed object stored in it.
# The file name repeats the year ("..._scl_2020_fb_...").
mirizio_2020_dir = f"{data_dir}/mirizio_2020"
mirizio_2020_scl_fb = sc.read(f"{mirizio_2020_dir}/mirizio_2020_scl_2020_fb_processed.h5")

In [None]:
# Reynolds 2021: dataset folder under data_dir and the processed object stored in it.
reynolds_2021_dir = f"{data_dir}/reynolds_2021"
reynolds_2021_ctrl_fb = sc.read(f"{reynolds_2021_dir}/reynolds_2021_ctrl_fb_processed.h5")

In [None]:
# Rindler 2021: dataset folder under data_dir and the processed object stored in it.
rindler_2021_dir = f"{data_dir}/rindler_2021"
rindler_2021_ctrl_fb = sc.read(f"{rindler_2021_dir}/rindler_2021_ctrl_fb_processed.h5")

In [None]:
# Solé-Boldo 2020: dataset folder under data_dir and the processed object stored in it.
sole_2020_dir = f"{data_dir}/Sole-Boldo_2020"
sole_2020_young_fb = sc.read(f"{sole_2020_dir}/sole_2020_young_fb_processed.h5")

In [None]:
# Tabib 2018: dataset folder under data_dir and the processed object stored in it.
# The file name repeats the year ("..._ctrl_2018_fb_...").
tabib_2018_dir = f"{data_dir}/Tabib_2018"
tabib_2018_ctrl_fb = sc.read(f"{tabib_2018_dir}/tabib_2018_ctrl_2018_fb_processed.h5")

In [None]:
# Tabib 2021: dataset folder under data_dir and the processed object stored in it.
tabib_2021_dir = f"{data_dir}/Tabib_2021"
tabib_2021_ctrl_fb = sc.read(f"{tabib_2021_dir}/tabib_2021_ctrl_fb_processed.h5")

In [None]:
# Theocharidis 2020: dataset folder under data_dir and the processed object stored in it.
# The folder string ends with '/', so joining it with '/<file>' yields a double slash
# in the resulting path; the strings are kept exactly as they were.
theo_2020_dir = f"{data_dir}/Theocharidis_2020/"
theo_2020_ctrl_dm_fb = sc.read(f"{theo_2020_dir}/theo_2020_ctrl_dm_fb_processed.h5")

In [None]:
# Theocharidis 2021: dataset folder under data_dir and the processed object stored in it.
# The folder string ends with '/', so joining it with '/<file>' yields a double slash
# in the resulting path; the strings are kept exactly as they were.
theo_2021_dir = f"{data_dir}/Theocharidis_2021/"
theo_2021_ctrl_fb = sc.read(f"{theo_2021_dir}/theo_2021_ctrl_fb_processed.h5")

In [None]:
# Vorstandlechner 2020: dataset folder under data_dir and the processed object stored in it.
# The file name repeats the year ("..._fb_2020_processed").
vors_2020_dir = f"{data_dir}/Vorstandlechner_2020"
vors_2020_ctrl_fb = sc.read(f"{vors_2020_dir}/vors_2020_ctrl_fb_2020_processed.h5")

In [None]:
# Xu 2021: dataset folder under data_dir and the processed object stored in it.
xu_2021_dir = f"{data_dir}/xu_2021"
xu_2021_ctrl_fb = sc.read(f"{xu_2021_dir}/xu_2021_ctrl_fb_processed.h5")

## Evaluate marker score for adatas
All the **commented-out AnnData objects are discarded** for marker selection, either because they do not show enough marker consistency and might bias the result, or because they belong to datasets from non-control/healthy conditions.

In [None]:
# Datasets that feed the marker selection. Commented-out entries are deliberately
# excluded but kept visible so the selection stays explicit.
list_datasets = [
    ahlers_2022_young_fb,
    # boothby_2021_ctrl_fb,
    # deng_2021_scar_fb,
    gao_2021_ctrl_fb,
    gaydosik_2020_ctrl_fb,
    # he_2020_ctrl_fb,
    # hughes_2020_ctrl_fb,
    # kim_2020_ctrl_fb,
    liu_2021_ctrl_fb,
    mariottoni_2021_ctrl_fb,
    mirizio_2020_scl_fb,
    # reynolds_2021_ctrl_fb,
    rindler_2021_ctrl_fb,
    sole_2020_young_fb,
    tabib_2018_ctrl_fb,
    tabib_2021_ctrl_fb,
    theo_2020_ctrl_dm_fb,
    # theo_2021_ctrl_fb,
    vors_2020_ctrl_fb,
    xu_2021_ctrl_fb,
]

# Cluster and axis labels accepted for scoring, grouped by leading letter.
list_accepted_clusters = [
    'A1', 'A2', 'A3', 'A4',
    'B1', 'B2', 'B3', 'B4',
    'C1', 'C2', 'C3', 'C5',
    'D1', 'D2',
    'E1',
    'T1',
]
list_accepted_axis = ['A', 'B', 'C', 'D', 'E']

# manual_axis is used to create a robust set of markers: it keeps only the first
# character of every cell's `cluster` label.
for adata in list_datasets:
    adata.obs['manual_axis'] = [label[0] for label in adata.obs['cluster']]

In [None]:
# Per-cluster gene scoring over the selected datasets, restricted to the accepted
# cluster labels; the result is indexed by cluster name further down the notebook.
dict_make_gene_scoring = make_gene_scoring_with_expr(
    list_datasets=list_datasets,
    calculate_DEGs=True,
    group_name='cluster',
    value_ref='scores',
    select_method='pval',
    select_thres=0.05,
    list_clusters=list_accepted_clusters,
)

In [None]:
# Gene scoring grouped by the one-letter `manual_axis` label instead of the full
# cluster label. The accepted axes are taken from `list_accepted_axis` (defined next
# to `list_accepted_clusters`) rather than from a second hard-coded copy of the same
# list, so the two definitions cannot drift apart.
dict_make_gene_scoring_axis = make_gene_scoring_with_expr(
    list_datasets=list_datasets,
    calculate_DEGs=True,
    group_name='manual_axis',
    list_clusters=list_accepted_axis,
    value_ref='scores',
    select_method='pval',
    select_thres=0.05,
)

## Recalculating clusters in the datasets

In [None]:
# For every cluster / axis keep the first 30 index entries of its scoring table
# as the marker set handed to assign_cats.
dict_cats_clusters_robust = {
    group: np.array(table.index[:30])
    for group, table in dict_make_gene_scoring.items()
}
dict_cats_axes_robust = {
    group: np.array(table.index[:30])
    for group, table in dict_make_gene_scoring_axis.items()
}

In [None]:
# Display the marker arrays kept for each cluster (notebook cell output).
dict_cats_clusters_robust

In [None]:
# Display the marker arrays kept for each axis (notebook cell output).
dict_cats_axes_robust

### Ahlers 2022

In [None]:
# Two assign_cats passes: cluster labels first (read back from obs['cluster_robust']
# below), then axis labels grouped by those clusters via column_groupby.
assign_cats(
    ahlers_2022_young_fb, dict_cats=dict_cats_clusters_robust, min_score=0.5, quantile_gene_sel=0.8,
    key_added='cluster_robust', others_name='U', verbose=False,
)
assign_cats(
    ahlers_2022_young_fb, column_groupby='cluster_robust', dict_cats=dict_cats_axes_robust, min_score=0.4,
    key_added='axis_robust', quantile_gene_sel=0.95, intermediate_states=True, diff=0.05,
    others_name='U', verbose=False,
)

# One colour per distinct label, in sorted label order; labels missing from
# dict_colors fall back to grey.
ahlers_2022_young_fb.uns['cluster_robust_colors'] = [
    dict_colors.get(label, '#bcbcbc')
    for label in sorted(set(ahlers_2022_young_fb.obs['cluster_robust']))
]
sc.pl.umap(
    ahlers_2022_young_fb, color=['Internal sample identifier', 'leiden', 'axis_robust', 'cluster_robust'],
    legend_loc='on data', cmap=magma, use_raw=False, ncols=2,
)
plot_score_graph(ahlers_2022_young_fb, cluster_column='cluster_robust')

### Boothby 2021

In [None]:
# Two assign_cats passes: cluster labels first (read back from obs['cluster_robust']
# below), then axis labels grouped by those clusters via column_groupby.
assign_cats(
    boothby_2021_ctrl_fb, dict_cats=dict_cats_clusters_robust, min_score=0.5, quantile_gene_sel=0.95,
    key_added='cluster_robust', others_name='U', verbose=False,
)
assign_cats(
    boothby_2021_ctrl_fb, column_groupby='cluster_robust', dict_cats=dict_cats_axes_robust, min_score=0.4,
    key_added='axis_robust', quantile_gene_sel=0.95, intermediate_states=True, diff=0.05,
    others_name='U', verbose=False,
)

# One colour per distinct label, in sorted label order; labels missing from
# dict_colors fall back to grey.
boothby_2021_ctrl_fb.uns['cluster_robust_colors'] = [
    dict_colors.get(label, '#bcbcbc')
    for label in sorted(set(boothby_2021_ctrl_fb.obs['cluster_robust']))
]
sc.pl.umap(
    boothby_2021_ctrl_fb, color=['Internal sample identifier', 'leiden', 'axis_robust', 'cluster_robust'],
    legend_loc='on data', cmap=magma, use_raw=False, ncols=2,
)
plot_score_graph(boothby_2021_ctrl_fb, cluster_column='cluster_robust')

### Deng 2021

In [None]:
# Two assign_cats passes: cluster labels first (read back from obs['cluster_robust']
# below), then axis labels grouped by those clusters via column_groupby.
assign_cats(
    deng_2021_scar_fb, dict_cats=dict_cats_clusters_robust, min_score=0.5, quantile_gene_sel=0.99,
    key_added='cluster_robust', others_name='U', verbose=False,
)
assign_cats(
    deng_2021_scar_fb, column_groupby='cluster_robust', dict_cats=dict_cats_axes_robust, min_score=0.4,
    key_added='axis_robust', quantile_gene_sel=0.95, intermediate_states=True, diff=0.05,
    others_name='U', verbose=False,
)

# One colour per distinct label, in sorted label order; labels missing from
# dict_colors fall back to grey.
deng_2021_scar_fb.uns['cluster_robust_colors'] = [
    dict_colors.get(label, '#bcbcbc')
    for label in sorted(set(deng_2021_scar_fb.obs['cluster_robust']))
]
sc.pl.umap(
    deng_2021_scar_fb, color=['Internal sample identifier', 'leiden', 'axis_robust', 'cluster_robust'],
    legend_loc='on data', cmap=magma, use_raw=False, ncols=2,
)
plot_score_graph(deng_2021_scar_fb, cluster_column='cluster_robust')

### Gao 2021

In [None]:
# Two assign_cats passes: cluster labels first (read back from obs['cluster_robust']
# below), then axis labels grouped by those clusters via column_groupby.
assign_cats(
    gao_2021_ctrl_fb, dict_cats=dict_cats_clusters_robust, min_score=0.5, quantile_gene_sel=0.8,
    key_added='cluster_robust', others_name='U', verbose=False,
)
assign_cats(
    gao_2021_ctrl_fb, column_groupby='cluster_robust', dict_cats=dict_cats_axes_robust, min_score=0.4,
    key_added='axis_robust', quantile_gene_sel=0.95, intermediate_states=True, diff=0.05,
    others_name='U', verbose=False,
)

# One colour per distinct label, in sorted label order; labels missing from
# dict_colors fall back to grey.
gao_2021_ctrl_fb.uns['cluster_robust_colors'] = [
    dict_colors.get(label, '#bcbcbc')
    for label in sorted(set(gao_2021_ctrl_fb.obs['cluster_robust']))
]
sc.pl.umap(
    gao_2021_ctrl_fb, color=['Internal sample identifier', 'leiden', 'axis_robust', 'cluster_robust'],
    legend_loc='on data', cmap=magma, use_raw=False, ncols=2,
)
plot_score_graph(gao_2021_ctrl_fb, cluster_column='cluster_robust')

### Gaydosik 2020

In [None]:
# Two assign_cats passes: cluster labels first (read back from obs['cluster_robust']
# below), then axis labels grouped by those clusters via column_groupby.
assign_cats(
    gaydosik_2020_ctrl_fb, dict_cats=dict_cats_clusters_robust, min_score=0.5, quantile_gene_sel=0.9,
    key_added='cluster_robust', others_name='U', verbose=False,
)
assign_cats(
    gaydosik_2020_ctrl_fb, column_groupby='cluster_robust', dict_cats=dict_cats_axes_robust, min_score=0.4,
    key_added='axis_robust', quantile_gene_sel=0.95, intermediate_states=True, diff=0.05,
    others_name='U', verbose=False,
)

# One colour per distinct label, in sorted label order; labels missing from
# dict_colors fall back to grey.
gaydosik_2020_ctrl_fb.uns['cluster_robust_colors'] = [
    dict_colors.get(label, '#bcbcbc')
    for label in sorted(set(gaydosik_2020_ctrl_fb.obs['cluster_robust']))
]
sc.pl.umap(
    gaydosik_2020_ctrl_fb, color=['Internal sample identifier', 'leiden', 'axis_robust', 'cluster_robust'],
    legend_loc='on data', cmap=magma, use_raw=False, ncols=2,
)
plot_score_graph(gaydosik_2020_ctrl_fb, cluster_column='cluster_robust')

### He 2020

In [None]:
# Two assign_cats passes: cluster labels first (read back from obs['cluster_robust']
# below), then axis labels grouped by those clusters via column_groupby.
assign_cats(
    he_2020_ctrl_fb, dict_cats=dict_cats_clusters_robust, min_score=0.5, quantile_gene_sel=0.9,
    key_added='cluster_robust', others_name='U', verbose=False,
)
assign_cats(
    he_2020_ctrl_fb, column_groupby='cluster_robust', dict_cats=dict_cats_axes_robust, min_score=0.4,
    key_added='axis_robust', quantile_gene_sel=0.95, intermediate_states=True, diff=0.05,
    others_name='U', verbose=False,
)

# One colour per distinct label, in sorted label order; labels missing from
# dict_colors fall back to grey.
he_2020_ctrl_fb.uns['cluster_robust_colors'] = [
    dict_colors.get(label, '#bcbcbc')
    for label in sorted(set(he_2020_ctrl_fb.obs['cluster_robust']))
]
# NOTE(review): 'axis_robust' is not plotted for this dataset, unlike the other
# per-dataset cells in this notebook -- confirm that the omission is intended.
sc.pl.umap(
    he_2020_ctrl_fb, color=['Internal sample identifier', 'leiden', 'cluster_robust'],
    legend_loc='on data', cmap=magma, use_raw=False, ncols=2,
)
plot_score_graph(he_2020_ctrl_fb, cluster_column='cluster_robust')

### Hughes 2020

In [None]:
# Two assign_cats passes: cluster labels first (read back from obs['cluster_robust']
# below), then axis labels grouped by those clusters via column_groupby.
assign_cats(
    hughes_2020_ctrl_fb, dict_cats=dict_cats_clusters_robust, min_score=0.5, quantile_gene_sel=0.75,
    key_added='cluster_robust', others_name='U', verbose=False,
)
assign_cats(
    hughes_2020_ctrl_fb, column_groupby='cluster_robust', dict_cats=dict_cats_axes_robust, min_score=0.4,
    key_added='axis_robust', quantile_gene_sel=0.95, intermediate_states=True, diff=0.05,
    others_name='U', verbose=False,
)

# One colour per distinct label, in sorted label order; labels missing from
# dict_colors fall back to grey.
hughes_2020_ctrl_fb.uns['cluster_robust_colors'] = [
    dict_colors.get(label, '#bcbcbc')
    for label in sorted(set(hughes_2020_ctrl_fb.obs['cluster_robust']))
]
sc.pl.umap(
    hughes_2020_ctrl_fb, color=['Internal sample identifier', 'leiden', 'axis_robust', 'cluster_robust'],
    legend_loc='on data', cmap=magma, use_raw=False, ncols=2,
)
plot_score_graph(hughes_2020_ctrl_fb, cluster_column='cluster_robust')

### Kim 2020

In [None]:
# Two assign_cats passes: cluster labels first (read back from obs['cluster_robust']
# below), then axis labels grouped by those clusters via column_groupby.
assign_cats(
    kim_2020_ctrl_fb, dict_cats=dict_cats_clusters_robust, min_score=0.5, quantile_gene_sel=0.85,
    key_added='cluster_robust', others_name='U', verbose=False,
)
assign_cats(
    kim_2020_ctrl_fb, column_groupby='cluster_robust', dict_cats=dict_cats_axes_robust, min_score=0.4,
    key_added='axis_robust', quantile_gene_sel=0.95, intermediate_states=True, diff=0.05,
    others_name='U', verbose=False,
)

# One colour per distinct label, in sorted label order; labels missing from
# dict_colors fall back to grey.
kim_2020_ctrl_fb.uns['cluster_robust_colors'] = [
    dict_colors.get(label, '#bcbcbc')
    for label in sorted(set(kim_2020_ctrl_fb.obs['cluster_robust']))
]
sc.pl.umap(
    kim_2020_ctrl_fb, color=['Internal sample identifier', 'leiden', 'axis_robust', 'cluster_robust'],
    legend_loc='on data', cmap=magma, use_raw=False, ncols=2,
)
plot_score_graph(kim_2020_ctrl_fb, cluster_column='cluster_robust')

### Kim 2021

In [None]:
# Two assign_cats passes: cluster labels first (read back from obs['cluster_robust']
# below), then axis labels grouped by those clusters via column_groupby.
assign_cats(
    kim_2021_ctrl_fb, dict_cats=dict_cats_clusters_robust, min_score=0.5, quantile_gene_sel=0.9,
    key_added='cluster_robust', others_name='U', verbose=False,
)
assign_cats(
    kim_2021_ctrl_fb, column_groupby='cluster_robust', dict_cats=dict_cats_axes_robust, min_score=0.4,
    key_added='axis_robust', quantile_gene_sel=0.95, intermediate_states=True, diff=0.05,
    others_name='U', verbose=False,
)

# One colour per distinct label, in sorted label order; labels missing from
# dict_colors fall back to grey.
kim_2021_ctrl_fb.uns['cluster_robust_colors'] = [
    dict_colors.get(label, '#bcbcbc')
    for label in sorted(set(kim_2021_ctrl_fb.obs['cluster_robust']))
]
sc.pl.umap(
    kim_2021_ctrl_fb, color=['Internal sample identifier', 'leiden', 'axis_robust', 'cluster_robust'],
    legend_loc='on data', cmap=magma, use_raw=False, ncols=2,
)
plot_score_graph(kim_2021_ctrl_fb, cluster_column='cluster_robust')

### Liu 2021

In [None]:
# Two assign_cats passes: cluster labels first (read back from obs['cluster_robust']
# below), then axis labels grouped by those clusters via column_groupby.
assign_cats(
    liu_2021_ctrl_fb, dict_cats=dict_cats_clusters_robust, min_score=0.5, quantile_gene_sel=0.75,
    key_added='cluster_robust', others_name='U', verbose=False,
)
assign_cats(
    liu_2021_ctrl_fb, column_groupby='cluster_robust', dict_cats=dict_cats_axes_robust, min_score=0.4,
    key_added='axis_robust', quantile_gene_sel=0.95, intermediate_states=True, diff=0.05,
    others_name='U', verbose=False,
)

# One colour per distinct label, in sorted label order; labels missing from
# dict_colors fall back to grey.
liu_2021_ctrl_fb.uns['cluster_robust_colors'] = [
    dict_colors.get(label, '#bcbcbc')
    for label in sorted(set(liu_2021_ctrl_fb.obs['cluster_robust']))
]
sc.pl.umap(
    liu_2021_ctrl_fb, color=['Internal sample identifier', 'leiden', 'axis_robust', 'cluster_robust'],
    legend_loc='on data', cmap=magma, use_raw=False, ncols=2,
)
plot_score_graph(liu_2021_ctrl_fb, cluster_column='cluster_robust')

### Mariottoni 2021

In [None]:
# Two assign_cats passes: cluster labels first (read back from obs['cluster_robust']
# below), then axis labels grouped by those clusters via column_groupby.
assign_cats(
    mariottoni_2021_ctrl_fb, dict_cats=dict_cats_clusters_robust, min_score=0.5, quantile_gene_sel=0.9,
    key_added='cluster_robust', others_name='U', verbose=False,
)
assign_cats(
    mariottoni_2021_ctrl_fb, column_groupby='cluster_robust', dict_cats=dict_cats_axes_robust, min_score=0.4,
    key_added='axis_robust', quantile_gene_sel=0.95, intermediate_states=True, diff=0.05,
    others_name='U', verbose=False,
)

# One colour per distinct label, in sorted label order; labels missing from
# dict_colors fall back to grey.
mariottoni_2021_ctrl_fb.uns['cluster_robust_colors'] = [
    dict_colors.get(label, '#bcbcbc')
    for label in sorted(set(mariottoni_2021_ctrl_fb.obs['cluster_robust']))
]
sc.pl.umap(
    mariottoni_2021_ctrl_fb, color=['Internal sample identifier', 'leiden', 'axis_robust', 'cluster_robust'],
    legend_loc='on data', cmap=magma, use_raw=False, ncols=2,
)
plot_score_graph(mariottoni_2021_ctrl_fb, cluster_column='cluster_robust')

### Mirizio 2020

In [None]:
# Two assign_cats passes: cluster labels first (read back from obs['cluster_robust']
# below), then axis labels grouped by those clusters via column_groupby.
assign_cats(
    mirizio_2020_scl_fb, dict_cats=dict_cats_clusters_robust, min_score=0.5, quantile_gene_sel=0.9,
    key_added='cluster_robust', others_name='U', verbose=False,
)
assign_cats(
    mirizio_2020_scl_fb, column_groupby='cluster_robust', dict_cats=dict_cats_axes_robust, min_score=0.4,
    key_added='axis_robust', quantile_gene_sel=0.95, intermediate_states=True, diff=0.05,
    others_name='U', verbose=False,
)

# One colour per distinct label, in sorted label order; labels missing from
# dict_colors fall back to grey.
mirizio_2020_scl_fb.uns['cluster_robust_colors'] = [
    dict_colors.get(label, '#bcbcbc')
    for label in sorted(set(mirizio_2020_scl_fb.obs['cluster_robust']))
]
sc.pl.umap(
    mirizio_2020_scl_fb, color=['Internal sample identifier', 'leiden', 'axis_robust', 'cluster_robust'],
    legend_loc='on data', cmap=magma, use_raw=False, ncols=2,
)
plot_score_graph(mirizio_2020_scl_fb, cluster_column='cluster_robust')

### Reynolds 2021

In [None]:
# Two assign_cats passes: cluster labels first (read back from obs['cluster_robust']
# below), then axis labels grouped by those clusters via column_groupby.
# The axis pass uses quantile_gene_sel=0.85 and diff=0.15 for this dataset.
assign_cats(
    reynolds_2021_ctrl_fb, dict_cats=dict_cats_clusters_robust, min_score=0.5, quantile_gene_sel=0.9,
    key_added='cluster_robust', others_name='U', verbose=False,
)
assign_cats(
    reynolds_2021_ctrl_fb, column_groupby='cluster_robust', dict_cats=dict_cats_axes_robust, min_score=0.4,
    key_added='axis_robust', quantile_gene_sel=0.85, intermediate_states=True, diff=0.15,
    others_name='U', verbose=False,
)

# One colour per distinct label, in sorted label order; labels missing from
# dict_colors fall back to grey.
reynolds_2021_ctrl_fb.uns['cluster_robust_colors'] = [
    dict_colors.get(label, '#bcbcbc')
    for label in sorted(set(reynolds_2021_ctrl_fb.obs['cluster_robust']))
]
sc.pl.umap(
    reynolds_2021_ctrl_fb, color=['Internal sample identifier', 'leiden', 'axis_robust', 'cluster_robust'],
    legend_loc='on data', cmap=magma, use_raw=False, ncols=2,
)
plot_score_graph(reynolds_2021_ctrl_fb, cluster_column='cluster_robust')

### Rindler 2021

In [None]:
# Two assign_cats passes: cluster labels first (read back from obs['cluster_robust']
# below), then axis labels grouped by those clusters via column_groupby.
assign_cats(
    rindler_2021_ctrl_fb, dict_cats=dict_cats_clusters_robust, min_score=0.5, quantile_gene_sel=0.9,
    key_added='cluster_robust', others_name='U', verbose=False,
)
assign_cats(
    rindler_2021_ctrl_fb, column_groupby='cluster_robust', dict_cats=dict_cats_axes_robust, min_score=0.4,
    key_added='axis_robust', quantile_gene_sel=0.95, intermediate_states=True, diff=0.05,
    others_name='U', verbose=False,
)

# One colour per distinct label, in sorted label order; labels missing from
# dict_colors fall back to grey.
rindler_2021_ctrl_fb.uns['cluster_robust_colors'] = [
    dict_colors.get(label, '#bcbcbc')
    for label in sorted(set(rindler_2021_ctrl_fb.obs['cluster_robust']))
]
sc.pl.umap(
    rindler_2021_ctrl_fb, color=['Internal sample identifier', 'leiden', 'axis_robust', 'cluster_robust'],
    legend_loc='on data', cmap=magma, use_raw=False, ncols=2,
)
plot_score_graph(rindler_2021_ctrl_fb, cluster_column='cluster_robust')

### Solé-Boldo 2020

In [None]:
# Two assign_cats passes: cluster labels first (read back from obs['cluster_robust']
# below), then axis labels grouped by those clusters via column_groupby.
assign_cats(
    sole_2020_young_fb, dict_cats=dict_cats_clusters_robust, min_score=0.5, quantile_gene_sel=0.8,
    key_added='cluster_robust', others_name='U', verbose=False,
)
assign_cats(
    sole_2020_young_fb, column_groupby='cluster_robust', dict_cats=dict_cats_axes_robust, min_score=0.4,
    key_added='axis_robust', quantile_gene_sel=0.95, intermediate_states=True, diff=0.05,
    others_name='U', verbose=False,
)

# One colour per distinct label, in sorted label order; labels missing from
# dict_colors fall back to grey.
sole_2020_young_fb.uns['cluster_robust_colors'] = [
    dict_colors.get(label, '#bcbcbc')
    for label in sorted(set(sole_2020_young_fb.obs['cluster_robust']))
]
sc.pl.umap(
    sole_2020_young_fb, color=['Internal sample identifier', 'leiden', 'axis_robust', 'cluster_robust'],
    legend_loc='on data', cmap=magma, use_raw=False, ncols=2,
)
plot_score_graph(sole_2020_young_fb, cluster_column='cluster_robust')

### Tabib 2018

In [None]:
# Two assign_cats passes: cluster labels first (read back from obs['cluster_robust']
# below), then axis labels grouped by those clusters via column_groupby.
assign_cats(
    tabib_2018_ctrl_fb, dict_cats=dict_cats_clusters_robust, min_score=0.5, quantile_gene_sel=0.85,
    key_added='cluster_robust', others_name='U', verbose=False,
)
assign_cats(
    tabib_2018_ctrl_fb, column_groupby='cluster_robust', dict_cats=dict_cats_axes_robust, min_score=0.4,
    key_added='axis_robust', quantile_gene_sel=0.95, intermediate_states=True, diff=0.05,
    others_name='U', verbose=False,
)

# One colour per distinct label, in sorted label order; labels missing from
# dict_colors fall back to grey.
tabib_2018_ctrl_fb.uns['cluster_robust_colors'] = [
    dict_colors.get(label, '#bcbcbc')
    for label in sorted(set(tabib_2018_ctrl_fb.obs['cluster_robust']))
]
sc.pl.umap(
    tabib_2018_ctrl_fb, color=['Internal sample identifier', 'leiden', 'axis_robust', 'cluster_robust'],
    legend_loc='on data', cmap=magma, use_raw=False, ncols=2,
)
plot_score_graph(tabib_2018_ctrl_fb, cluster_column='cluster_robust')

### Tabib 2021

In [None]:
# Two assign_cats passes: cluster labels first (read back from obs['cluster_robust']
# below), then axis labels grouped by those clusters via column_groupby.
assign_cats(
    tabib_2021_ctrl_fb, dict_cats=dict_cats_clusters_robust, min_score=0.5, quantile_gene_sel=0.8,
    key_added='cluster_robust', others_name='U', verbose=False,
)
assign_cats(
    tabib_2021_ctrl_fb, column_groupby='cluster_robust', dict_cats=dict_cats_axes_robust, min_score=0.4,
    key_added='axis_robust', quantile_gene_sel=0.95, intermediate_states=True, diff=0.05,
    others_name='U', verbose=False,
)

# One colour per distinct label, in sorted label order; labels missing from
# dict_colors fall back to grey.
tabib_2021_ctrl_fb.uns['cluster_robust_colors'] = [
    dict_colors.get(label, '#bcbcbc')
    for label in sorted(set(tabib_2021_ctrl_fb.obs['cluster_robust']))
]
sc.pl.umap(
    tabib_2021_ctrl_fb, color=['Internal sample identifier', 'leiden', 'axis_robust', 'cluster_robust'],
    legend_loc='on data', cmap=magma, use_raw=False, ncols=2,
)
# Persist the re-annotated object next to the processed input file.
tabib_2021_ctrl_fb.write_h5ad(f"{tabib_2021_dir}/tabib_2021_ctrl_fb_robust.h5")
plot_score_graph(tabib_2021_ctrl_fb, cluster_column='cluster_robust')

### Theocharidis 2020

In [None]:
# Two assign_cats passes: cluster labels first (read back from obs['cluster_robust']
# below), then axis labels grouped by those clusters via column_groupby.
assign_cats(
    theo_2020_ctrl_dm_fb, dict_cats=dict_cats_clusters_robust, min_score=0.5, quantile_gene_sel=0.9,
    key_added='cluster_robust', others_name='U', verbose=False,
)
assign_cats(
    theo_2020_ctrl_dm_fb, column_groupby='cluster_robust', dict_cats=dict_cats_axes_robust, min_score=0.4,
    key_added='axis_robust', quantile_gene_sel=0.95, intermediate_states=True, diff=0.05,
    others_name='U', verbose=False,
)

# One colour per distinct label, in sorted label order; labels missing from
# dict_colors fall back to grey.
theo_2020_ctrl_dm_fb.uns['cluster_robust_colors'] = [
    dict_colors.get(label, '#bcbcbc')
    for label in sorted(set(theo_2020_ctrl_dm_fb.obs['cluster_robust']))
]
sc.pl.umap(
    theo_2020_ctrl_dm_fb, color=['Internal sample identifier', 'leiden', 'axis_robust', 'cluster_robust'],
    legend_loc='on data', cmap=magma, use_raw=False, ncols=2,
)
# Persist the re-annotated object next to the processed input file.
theo_2020_ctrl_dm_fb.write_h5ad(f"{theo_2020_dir}/theo_2020_ctrl_dm_fb_robust.h5")
plot_score_graph(theo_2020_ctrl_dm_fb, cluster_column='cluster_robust')

### Theocharidis 2021

In [None]:
# Two assign_cats passes: cluster labels first (read back from obs['cluster_robust']
# below), then axis labels grouped by those clusters via column_groupby.
assign_cats(
    theo_2021_ctrl_fb, dict_cats=dict_cats_clusters_robust, min_score=0.5, quantile_gene_sel=0.7,
    key_added='cluster_robust', others_name='U', verbose=False,
)
assign_cats(
    theo_2021_ctrl_fb, column_groupby='cluster_robust', dict_cats=dict_cats_axes_robust, min_score=0.4,
    key_added='axis_robust', quantile_gene_sel=0.95, intermediate_states=True, diff=0.05,
    others_name='U', verbose=False,
)

# One colour per distinct label, in sorted label order; labels missing from
# dict_colors fall back to grey.
theo_2021_ctrl_fb.uns['cluster_robust_colors'] = [
    dict_colors.get(label, '#bcbcbc')
    for label in sorted(set(theo_2021_ctrl_fb.obs['cluster_robust']))
]
sc.pl.umap(
    theo_2021_ctrl_fb, color=['Internal sample identifier', 'leiden', 'axis_robust', 'cluster_robust'],
    legend_loc='on data', cmap=magma, use_raw=False, ncols=2,
)
# Persist the re-annotated object next to the processed input file.
theo_2021_ctrl_fb.write_h5ad(f"{theo_2021_dir}/theo_2021_ctrl_fb_robust.h5")
plot_score_graph(theo_2021_ctrl_fb, cluster_column='cluster_robust')

### Vorstandlechner 2020

In [None]:
# Two assign_cats passes: cluster labels first (read back from obs['cluster_robust']
# below), then axis labels grouped by those clusters via column_groupby.
assign_cats(
    vors_2020_ctrl_fb, dict_cats=dict_cats_clusters_robust, min_score=0.5, quantile_gene_sel=0.95,
    key_added='cluster_robust', others_name='U', verbose=False,
)
assign_cats(
    vors_2020_ctrl_fb, column_groupby='cluster_robust', dict_cats=dict_cats_axes_robust, min_score=0.4,
    key_added='axis_robust', quantile_gene_sel=0.95, intermediate_states=True, diff=0.05,
    others_name='U', verbose=False,
)

# One colour per distinct label, in sorted label order; labels missing from
# dict_colors fall back to grey.
vors_2020_ctrl_fb.uns['cluster_robust_colors'] = [
    dict_colors.get(label, '#bcbcbc')
    for label in sorted(set(vors_2020_ctrl_fb.obs['cluster_robust']))
]
sc.pl.umap(
    vors_2020_ctrl_fb, color=['Internal sample identifier', 'leiden', 'axis_robust', 'cluster_robust'],
    legend_loc='on data', cmap=magma, use_raw=False, ncols=2,
)
# Persist the re-annotated object next to the processed input file.
vors_2020_ctrl_fb.write_h5ad(f"{vors_2020_dir}/vors_2020_ctrl_fb_2020_robust.h5")
plot_score_graph(vors_2020_ctrl_fb, cluster_column='cluster_robust')

### Xu 2021

In [None]:
# Two assign_cats passes: cluster labels first (read back from obs['cluster_robust']
# below), then axis labels grouped by those clusters via column_groupby.
# The cluster pass uses min_score=0.45 for this dataset.
assign_cats(
    xu_2021_ctrl_fb, dict_cats=dict_cats_clusters_robust, min_score=0.45, quantile_gene_sel=0.7,
    key_added='cluster_robust', others_name='U', verbose=False,
)
assign_cats(
    xu_2021_ctrl_fb, column_groupby='cluster_robust', dict_cats=dict_cats_axes_robust, min_score=0.4,
    key_added='axis_robust', quantile_gene_sel=0.95, intermediate_states=True, diff=0.05,
    others_name='U', verbose=False,
)

# One colour per distinct label, in sorted label order; labels missing from
# dict_colors fall back to grey.
xu_2021_ctrl_fb.uns['cluster_robust_colors'] = [
    dict_colors.get(label, '#bcbcbc')
    for label in sorted(set(xu_2021_ctrl_fb.obs['cluster_robust']))
]
sc.pl.umap(
    xu_2021_ctrl_fb, color=['Internal sample identifier', 'leiden', 'axis_robust', 'cluster_robust'],
    legend_loc='on data', cmap=magma, use_raw=False, ncols=2,
)
plot_score_graph(xu_2021_ctrl_fb, cluster_column='cluster_robust')

## Reevaluate the presence of clusters for each dataset

In [None]:
from fb_functions import plot_adata_cluster_properties

In [None]:
# Every dataset object shown in the cross-dataset plots (20 entries).
# NOTE(review): kim_2021_ctrl_fb is loaded and annotated earlier in this notebook
# but is not part of this list -- confirm that the omission is intended.
list_all_datasets = [
    ahlers_2022_young_fb,
    boothby_2021_ctrl_fb,
    deng_2021_scar_fb,
    gao_2021_ctrl_fb,
    gaydosik_2020_ctrl_fb,
    he_2020_ctrl_fb,
    hughes_2020_ctrl_fb,
    kim_2020_ctrl_fb,
    liu_2021_ctrl_fb,
    mariottoni_2021_ctrl_fb,
    mirizio_2020_scl_fb,
    reynolds_2021_ctrl_fb,
    rindler_2021_ctrl_fb,
    sole_2020_young_fb,
    tabib_2018_ctrl_fb,
    tabib_2021_ctrl_fb,
    theo_2020_ctrl_dm_fb,
    theo_2021_ctrl_fb,
    vors_2020_ctrl_fb,
    xu_2021_ctrl_fb,
]

# Display names of the form "<Author> <Year>", taken from the first row of each
# dataset's `Author` and `Year` columns.
list_names = [
    adata.obs['Author'].values[0] + ' ' + str(int(adata.obs['Year'].values[0]))
    for adata in list_all_datasets
]

In [None]:
# Rebuild the reduced dataset list; commented-out entries stay excluded.
# NOTE(review): kim_2020_ctrl_fb is active here but commented out in the
# marker-selection list defined earlier -- confirm which of the two is intended.
list_datasets = [
    ahlers_2022_young_fb,
    # boothby_2021_ctrl_fb,
    # deng_2021_scar_fb,
    gao_2021_ctrl_fb,
    gaydosik_2020_ctrl_fb,
    # he_2020_ctrl_fb,
    # hughes_2020_ctrl_fb,
    kim_2020_ctrl_fb,
    liu_2021_ctrl_fb,
    mariottoni_2021_ctrl_fb,
    mirizio_2020_scl_fb,
    # reynolds_2021_ctrl_fb,
    rindler_2021_ctrl_fb,
    sole_2020_young_fb,
    tabib_2018_ctrl_fb,
    tabib_2021_ctrl_fb,
    theo_2020_ctrl_dm_fb,
    # theo_2021_ctrl_fb,
    vors_2020_ctrl_fb,
    xu_2021_ctrl_fb,
]

In [None]:
# Cluster-property plot in 'presence' mode over all datasets.
plot_adata_cluster_properties(
    dict_cats_clusters=dict_cats_clusters_robust,
    list_datasets=list_all_datasets,
    what='presence',
    cluster_name='cluster_robust',
    axis_name='axis_robust',
)

In [None]:
# Cluster-property plot in 'percentage' mode over all datasets.
plot_adata_cluster_properties(
    dict_cats_clusters=dict_cats_clusters_robust,
    list_datasets=list_all_datasets,
    what='percentage',
    cluster_name='cluster_robust',
    axis_name='axis_robust',
)

In [None]:
# Cluster-property plot in 'axis' mode over all datasets.
plot_adata_cluster_properties(
    dict_cats_clusters=dict_cats_clusters_robust,
    list_datasets=list_all_datasets,
    what='axis',
    cluster_name='cluster_robust',
    axis_name='axis_robust',
)

## Plotting all Adatas

In [None]:
# 4 x 5 grid (20 panels): one UMAP per dataset, coloured by its robust cluster label.
fig, axs = plt.subplots(4, 5, figsize=(5 * 4, 4 * 4))

# Hide every axis frame up front so panels that are never drawn on stay blank.
for ax in axs.flat:
    ax.set_axis_off()

for idx, (adata, name) in enumerate(zip(list_all_datasets, list_names)):
    sc.pl.umap(
        adata,
        color=['cluster_robust'],
        legend_loc='on data',
        show=False,
        ax=axs.flat[idx],
        title=name,
        size=15,
        cmap=magma,
        frameon=False,
    )

# fig, axs = plt.subplots(4, 5, figsize=(5 * 4, 4 * 4))
    
# for adata, name, idx in zip(list_all_datasets, list_names, range(len(list_datasets))):
#     sc.tl.dendrogram(adata, groupby='cluster_robust')
#     sc.pl.dendrogram(adata, groupby='cluster_robust', show=False, ax = axs.ravel()[idx])

In [None]:
def plot_UMAPS_gene(gene):
    """Draw one UMAP per dataset, coloured by `gene`, on a 4 x 5 grid.

    The datasets and their titles are read from the notebook-level
    `list_all_datasets` and `list_names`. Every panel starts hidden; a dataset
    for which scanpy cannot resolve `gene` simply keeps its blank panel, so the
    grid position of each dataset is stable across genes.

    Parameters
    ----------
    gene : str
        Key forwarded to `sc.pl.umap(color=[gene], use_raw=False)`.
    """
    fig, axs = plt.subplots(4, 5, figsize=(5 * 4, 4 * 4))
    flat_axes = axs.ravel()

    for ax in flat_axes:
        ax.set_axis_off()

    for idx, (adata, name) in enumerate(zip(list_all_datasets, list_names)):
        try:
            sc.pl.umap(adata, color=[gene], legend_loc='on data', show=False, ax=flat_axes[idx],
                       title=name, size=15, cmap=magma, frameon=False, use_raw=False)
        except (KeyError, ValueError):
            # Best effort: skip datasets where the key cannot be plotted (e.g. the
            # gene is absent from this dataset). Only lookup-type errors are
            # swallowed, so interrupts and unrelated failures still surface.
            # NOTE(review): scanpy is expected to raise KeyError here (ValueError
            # in some older releases) -- confirm against the installed version.
            continue

In [None]:
genes = sorted(list(set(['A2M', 'AADAC', 'ABCA10', 'ABCA10', 'ABCA8', 'ABCA9', 'ABCB5', 'ABCC9', 'ACAN', 'ACE', 'ACHE', 'ACKR3', 'ACKR4', 'ADAM12', 'ADAMTS18', 'ADAMTS4', 'ADAMTS4', 'ADAMTSL5', 'ADGRE2', 'ADGRE2', 'ADH1B', 'ADRA2A', 'AEBP1', 'AHRR', 'AKAP6', 'ALDH1A3', 'ALX4', 'ANGPTL5', 'ANGPTL7', 'ANKRD29', 'ANOS1', 'ANTXR2', 'AOPEP', 'APCDD1', 'APOC1', 'APOD', 'APOE', 'AQP1', 'AQP3', 'ARHGAP15', 'ARID5B', 'ASPN', 'ATP1A2', 'AXIN2', 'BAMBI', 'BGN', 'BIRC3', 'BMP7', 'BNC2', 'BTBD11', 'C11orf96', 'C19orf33', 'C1QTNF3', 'C1orf198', 'C2orf40', 'C3', 'C6', 'C7', 'C9orf3', 'CA12', 'CADM2', 'CALD1', 'CAV2', 'CCDC146', 'CCK', 'CCL19', 'CCL2', 'CD200R1', 'CD248', 'CD34', 'CD55', 'CD70', 'CD74', 'CD9', 'CDA', 'CDH11', 'CDKN1A', 'CEBPB', 'CENPW', 'CERCAM', 'CES1', 'CFD', 'CFH', 'CFHR1', 'CH25H', 'CHADL', 'CHD1', 'CHN1', 'CHRDL1', 'CIRBP', 'CLDN1', 'CLEC14A', 'CLEC2A', 'CLEC3B', 'CLSTN3', 'CNTN4', 'COCH', 'COL10A1', 'COL11A1', 'COL11A1', 'COL13A1', 'COL14A1', 'COL15A1', 'COL18A1', 'COL1A2', 'COL21A1', 'COL23A1', 'COL24A1', 'COL28A1', 'COL3A1', 'COL5A2', 'COL6A1', 'COL6A2', 'COL6A3', 'COL6A5', 'COL8A1', 'COL9A3', 'COL9A3', 'COMP', 'CORIN', 'COX4I2', 'CPE', 'CPNE5', 'CPVL', 'CRABP1', 'CRIP1', 'CRISPLD1', 'CSRP1', 'CST3', 'CTHRC1', 'CTSH', 'CTSK', 'CX3CL1', 'CXCL1', 'CXCL12', 'CXCL2', 'CXCL3', 'CYBRD1', 'CYGB', 'CYP1B1', 'CYP26B1', 'CYP7B1', 'CYYR1', 'DBN1', 'DCN', 'DCXR', 'DIO2', 'DNAJA1', 'DOK6', 'DPEP1', 'DPP4', 'DPT', 'DUSP5', 'EBF1', 'EBF2', 'ECM1', 'EDIL3', 'EDNRA', 'EFEMP1', 'EGFLAM', 'EGR1', 'EGR2', 'ELL2', 'ELN', 'EMB', 'EMID1', 'EMP3', 'ENTPD2', 'EPHX1', 'ERRFI1', 'ETV1', 'ETV4', 'EVA1A', 'F13A1', 'F2R', 'F2RL2', 'F2RL2', 'FABP3', 'FAM180B', 'FBLN1', 'FBLN2', 'FBN1', 'FBXO32', 'FGF7', 'FGFBP2', 'FIBIN', 'FMO1', 'FMO2', 'FMO3', 'FOS', 'FOSL1', 'FOXC2', 'FOXS1', 'FXYD6', 'FZD1', 'GALNT15', 'GCH1', 'GDF10', 'GEM', 'GFRA2', 'GGT5', 'GLRB', 'GNAO1', 'GPC3', 'GPM6B', 'GPNMB', 'GSN', 'HAPLN1', 'HAS2', 'HHIP', 'HLA-B', 'HLA-F', 'HMGA1', 'HMGCLL1', 'HNRNPA1', 'HOPX', 'HPSE2', 
'HRH1', 'HSD3B7', 'HSPB3', 'HTRA1', 'ICAM1', 'ICAM2', 'IER3', 'IGF1', 'IGFBP2', 'IGFBP3', 'IGFBP5', 'IGFBP6', 'IGFBP7', 'IL32', 'IL33', 'IL34', 'IL6', 'INHBA', 'INMT', 'IRF1', 'IRF8', 'ISYNA1', 'ISYNA1', 'ITGA6', 'ITGB4', 'ITIH5', 'ITM2A', 'JAK3', 'JUNB', 'KCNQ3', 'KDM6B', 'KIAA1217', 'KIF26B', 'KLF5', 'KLK1', 'KLK4', 'KPNA2', 'KRT17', 'KRT19', 'KRT2', 'LAMP5', 'LEF1', 'LEPR', 'LINC00327', 'LINC01133', 'LMO4', 'LMO7', 'LOX', 'LOX', 'LOXL2', 'LPAR4', 'LRRC15', 'LRRC17', 'LRRN3', 'LSP1', 'LTBP2', 'MAB21L2', 'MAFF', 'MAGI1', 'MAP2', 'MAP7', 'MARCKSL1', 'MATN4', 'MCTP1', 'MDK', 'MEF2C', 'MFAP5', 'MGP', 'MGST1', 'MKX', 'MME', 'MMP11', 'MMP16', 'MMP2', 'MRAS', 'MRPS6', 'MSC', 'MXRA5', 'MYO10', 'MYOC', 'NBL1', 'NDNF', 'NECAB1', 'NFIB', 'NFIL3', 'NFKB1', 'NFKBIA', 'NGFR', 'NKD2', 'NLGN4X', 'NLGN4X', 'NPTX2', 'NR2F2', 'NR4A3', 'NRP2', 'NTM', 'OGN', 'OLFML2A', 'OSBP2', 'P2RY14', 'P3H2', 'P4HA3', 'PAFAH1B3', 'PALMD', 'PAMR1', 'PCDH19', 'PCOLCE2', 'PCSK1N', 'PCSK9', 'PDGFD', 'PDGFRL', 'PEAR1', 'PGM2L1', 'PHACTR3', 'PI16', 'PIEZO2', 'PIM3', 'PKP4', 'PLA2G2A', 'PLA2G5', 'PLAUR', 'PLEKHA4', 'PLK2', 'PLPP5', 'PLXDC1', 'PMEPA1', 'PNRC1', 'PODNL1', 'POSTN', 'PPARG', 'PPDPFL', 'PPIC', 'PPP1R14A', 'PPP1R14B', 'PPP1R15A', 'PPP1R15B', 'PRDM8', 'PRG4', 'PRKG2', 'PRSS23', 'PSAT1', 'PTCH1', 'PTGDS', 'PTGIS', 'PTGS1', 'PTK7', 'PTMA', 'PTN', 'PTPRD', 'PXDNL', 'QPCT', 'RAMP1', 'RANBP3L', 'RARRES2', 'RBFOX1', 'RBP4', 'RBP5', 'REL', 'RGCC', 'RGS16', 'RHPN1', 'ROBO2', 'RSPO1', 'RSPO3', 'RSPO4', 'RUNX2', 'S100B', 'SBSPON', 'SCARA5', 'SCN4B', 'SCN7A', 'SCRG1', 'SDC1', 'SDPR', 'SEMA3B', 'SEMA3C', 'SEMA3E', 'SERPINA5', 'SERPINF1', 'SFRP1', 'SFRP2', 'SFRP4', 'SGCA', 'SGIP1', 'SH3BGRL3', 'SHISA3', 'SHISAL1', 'SIX1', 'SLC22A16', 'SLC22A3', 'SLC26A7', 'SLC29A1', 'SLC2A1', 'SLC2A3', 'SLC5A3', 'SLC9A3R2', 'SLCO2B1', 'SLIT2', 'SLIT3', 'SLITRK6', 'SLPI', 'SNAI2', 'SNHG8', 'SOCS3', 'SOD2', 'SOD3', 'SOSTDC1', 'SOX8', 'SPARC', 'SPARCL1', 'SPON1', 'SPON2', 'SPRY1', 'SPRY2', 'SPSB1', 'STC1', 'STC2', 'STMN1', 
'STMN2', 'SULT1A1', 'SVEP1', 'TAC1', 'TAGLN', 'TBX3', 'TCEAL2', 'TCF7L2', 'TENM2', 'TENM3', 'TENM3', 'TFAP2A', 'TGFBI', 'THBS2', 'THBS2', 'THBS4', 'THSD4', 'TIAM1', 'TIMP2', 'TIMP3', 'TIMP3', 'TM4SF1', 'TMEM150C', 'TMEM176A', 'TMEM204', 'TMEM233', 'TMEM52', 'TMSB4X', 'TMTC2', 'TNC', 'TNFAIP3', 'TNFAIP6', 'TNFRSF19', 'TNFSF13B', 'TNFSF13B', 'TNFSF14', 'TNMD', 'TNN', 'TNNC1', 'TNNT3', 'TNXB', 'TPD52', 'TPPP3', 'TRAC', 'TRIL', 'TRPM3', 'TRPS1', 'TSPAN13', 'TSPAN8', 'TTR', 'TUBB4A', 'TWIST2', 'TXNIP', 'TYMP', 'UACA', 'UAP1', 'UBD', 'UGT3A2', 'VCAM1', 'VIPR2', 'WFDC1', 'WIF1', 'WISP2', 'WNT10A', 'WNT2', 'WNT5A', 'WTAP', 'XG', 'YWHAH', 'ZC2HC1C', 'ZC3H12A', 'ZFP36'])))

In [None]:
# genes = ['A2M', 'CCL19']

# One table per gene (rows: datasets, columns: accepted clusters), NaN until filled:
#   dict_fraction_cells[gene] -> fraction of the cluster's cells with expression > 0
#   dict_mean_exp[gene]       -> mean expression over the cells with a non-zero value
dict_fraction_cells = {gene: pd.DataFrame(np.nan, index=list_names, columns=list_accepted_clusters) for gene in genes}
dict_mean_exp = {gene: pd.DataFrame(np.nan, index=list_names, columns=list_accepted_clusters) for gene in genes}

for adata, name in zip(list_all_datasets, list_names):
    genes_sub = [i for i in genes if i in adata.var_names]
    for cluster in set(adata.obs['cluster_robust']):
        # Only the accepted clusters are kept at the end of this cell, so do not
        # spend time (and column enlargements) on any other label.
        if cluster not in list_accepted_clusters:
            continue

        counts = adata[adata.obs['cluster_robust'] == cluster][:, genes_sub].X
        # Accept both sparse and dense expression matrices.
        counts = counts.toarray() if spr.issparse(counts) else np.asarray(counts)

        counts_frac = (counts > 0).sum(0) / counts.shape[0]

        # Mean over the non-zero entries only. Zeros add nothing to the sum, so
        # sum / (number of non-zero entries) gives that mean without writing NaN
        # into the matrix (which fails for integer dtypes). A gene with no
        # non-zero entry yields 0 / 0 = NaN, silenced on purpose.
        n_nonzero = (counts != 0).sum(0)
        with np.errstate(divide='ignore', invalid='ignore'):
            counts_mean_exp = counts.sum(0) / n_nonzero

        for idx, gene in enumerate(genes_sub):
            dict_fraction_cells[gene].loc[name, cluster] = counts_frac[idx]
            dict_mean_exp[gene].loc[name, cluster] = counts_mean_exp[idx]

for gene in genes:
    # Missing values are deliberately left as NaN (not filled with 0) so that the
    # 'Mean' row averages only over datasets where the value exists.
#     dict_fraction_cells[gene] = dict_fraction_cells[gene].fillna(0)
#     dict_mean_exp[gene] = dict_mean_exp[gene].fillna(0)

    dict_fraction_cells[gene].loc['Mean'] = dict_fraction_cells[gene].mean()
    dict_mean_exp[gene].loc['Mean'] = dict_mean_exp[gene].mean()

    # Enforce the canonical column order.
    dict_fraction_cells[gene] = dict_fraction_cells[gene][list_accepted_clusters]
    dict_mean_exp[gene] = dict_mean_exp[gene][list_accepted_clusters]

In [None]:
def plot_dotplot_gene(gene):
    """Draw a one-row dot plot summarising `gene` across the accepted clusters.

    Reads the 'Mean' rows of the notebook-level `dict_fraction_cells[gene]` and
    `dict_mean_exp[gene]`. Dot area is 400 * (mean fraction of expressing
    cells) ** 0.75; dot colour is the min-max normalised mean expression mapped
    through the `OrRd` colormap.

    Parameters
    ----------
    gene : str
        Key present in both `dict_fraction_cells` and `dict_mean_exp`.
    """
    dfplot_frac = dict_fraction_cells[gene] ** 0.75
    dfplot_exp = dict_mean_exp[gene]

    mean_exp = dfplot_exp.loc['Mean']
    # Series.min()/max() skip NaN. The builtin min()/max() give order-dependent
    # results when a cluster has no value, which could turn every colour into NaN.
    exp_min, exp_max = mean_exp.min(), mean_exp.max()
    exp_range = exp_max - exp_min
    if exp_range > 0:
        exp_norm_vals = (mean_exp - exp_min) / exp_range
    else:
        # No contrast between clusters (or no data at all): avoid 0 / 0 and use
        # the middle of the colormap; NaN entries stay NaN.
        exp_norm_vals = mean_exp * 0 + 0.5

    n_clusters = len(dfplot_frac.columns)
    fig, ax = plt.subplots(1, 1, figsize=(10, 1))
    ax.set_xticks(range(n_clusters))
    ax.set_xticklabels(dfplot_frac.columns)
    ax.set_yticks([0])
    ax.set_yticklabels([gene])
    ax.scatter(range(n_clusters), [0] * n_clusters, s=dfplot_frac.loc['Mean'] * 400,
               c=[cm.OrRd(i) for i in exp_norm_vals], linewidths=0.5, edgecolor='#878787')

In [None]:
dict_fraction_cells[gene]

In [None]:
# Inspect a single gene: cluster-level dot plot first, then the per-dataset UMAP grid.
gene = 'ACKR3'
plot_dotplot_gene(gene)
plot_UMAPS_gene(gene)

In [None]:
list_genes = ['A2M', 
'AADAC', 
'ABCA10', 
'ABCA8', 
'ABCA9', 
'ABCB5', 
'ABCC9', 
'ACAN', 
'ACE', 
'ACHE', 
'ACKR3', 
'ACKR4', 
'ADAM12', 
'ADAMTS18', 
'ADAMTS4', 
'ADAMTSL5', 
'ADGRE2', 
'ADH1B', 
'ADRA2A', 
'AEBP1', 
'AHRR', 
'AKAP6', 
'ALDH1A3', 
'ALX4', 
'ANGPTL5', 
'ANGPTL7', 
'ANKRD29', 
'ANOS1', 
'ANTXR2', 
'AOPEP', 
'APCDD1', 
'APOC1', 
'APOD', 
'APOE', 
'AQP1', 
'AQP3', 
'ARHGAP15', 
'ARID5B', 
'ASPN', 
'ATP1A2', 
'AXIN2', 
'BAMBI', 
'BGN', 
'BIRC3', 
'BMP7', 
'BNC2', 
'BTBD11', 
'C11orf96', 
'C19orf33', 
'C1orf198', 
'C1QTNF3', 
'C2orf40',
'C3', 
'C6', 
'C7', 
'C9orf3', 
'CA12', 
'CADM2', 
'CALD1', 
'CAV2', 
'CCDC146', 
'CCK', 
'CCL19', 
'CCL2', 
'CD200R1', 
'CD248', 
'CD34', 
'CD55', 
'CD70', 
'CD74', 
'CD9', 
'CDA', 
'CDH11', 
'CDKN1A', 
'CEBPB', 
'CENPW', 
'CERCAM', 
'CES1', 
'CFD', 
'CFH', 
'CFHR1', 
'CH25H', 
'CHADL', 
'CHD1', 
'CHN1', 
'CHRDL1', 
'CIRBP', 
'CLDN1', 
'CLEC14A', 
'CLEC2A', 
'CLEC3B', 
'CLSTN3', 
'CNTN4', 
'COCH', 
'COL10A1', ]

# For each gene in list_genes, build its dot plot and its UMAP grid, then call
# plt.show() so the two figures of a gene are displayed before moving on.
for gene in list_genes:
    plot_dotplot_gene(gene)
    plot_UMAPS_gene(gene)
    plt.show()

## PAGA 

In [None]:
# PAGA graph of the robust clusters, one panel per dataset on a 4 x 5 grid.
fig, axs = plt.subplots(nrows=4, ncols=5, figsize=(5 * 4, 4 * 4))
flat_axes = axs.ravel()

# Hide all panels first; slots without a dataset stay blank.
for ax in flat_axes:
    ax.set_axis_off()

for idx, (adata, name) in enumerate(zip(list_all_datasets, list_names)):
    sc.tl.paga(adata, groups='cluster_robust')

    # Panel title is "<Author> <Year>", taken from the first cell's metadata.
    author = adata.obs['Author'].iloc[0]
    year = int(adata.obs['Year'].iloc[0])
    sc.pl.paga(
        adata,
        ax=flat_axes[idx],
        frameon=False,
        show=False,
        title=f"{author} {year}",
    )

In [None]:
# PAGA graph per dataset, drawing the spanning-tree edges ('connectivities_tree')
# as solid lines, on a 4 x 5 grid.
fig, axs = plt.subplots(4, 5, figsize=(5 * 4, 4 * 4))

# Hide only the panels that will stay empty. `len(axs)` is the number of grid
# rows (4), not the number of panels, so it cannot be used to locate them.
for ax in axs.ravel()[len(list_all_datasets):]:
    ax.set_axis_off()

for adata, name, idx in zip(list_all_datasets, list_names, range(len(list_all_datasets))):
    sc.tl.paga(adata, groups='cluster_robust')
    sc.pl.paga(adata, ax=axs.ravel()[idx], frameon=False, show=False, solid_edges='connectivities_tree',
               title=str(adata.obs['Author'].iloc[0]) + ' ' + str(int(adata.obs['Year'].iloc[0])))

## Combined PAGA 

### Using connectivities_tree

In [None]:
# Sum the PAGA 'connectivities_tree' matrices of all datasets into one
# cluster x cluster table. 'U' gets a slot so that datasets containing that label
# can be added; it is removed afterwards.
# The accumulator is created as float: the PAGA weights are floats, and writing
# them into an integer frame relies on a silent upcast that pandas deprecates.
df_all = pd.DataFrame(0.0, index=list_accepted_clusters + ['U'], columns=list_accepted_clusters + ['U'])

for adata in tqdm(list_all_datasets):
    categories = adata.obs['cluster_robust'].cat.categories
    dfsub = pd.DataFrame(adata.uns['paga']['connectivities_tree'].toarray(),
                         columns=categories,
                         index=categories)

    df_all.loc[dfsub.index, dfsub.index] = df_all.loc[dfsub.index, dfsub.index] + dfsub

# Remove U (last row/column). drop() returns a new frame, so the thresholding
# below does not operate on a slice of the accumulator.
df_all = df_all.drop(index='U', columns='U')

# Discard edges whose summed weight is below 1, then raise the rest to the power
# 1.25, which widens the gap between weakly and strongly supported edges.
df_all = df_all.mask(df_all < 1, 0)
df_all = df_all ** 1.25

In [None]:
# Draw the combined adjacency table as a weighted graph: edge width follows the
# edge weight, node colour follows the cluster colour palette.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(5, 4))
G = nx.from_pandas_adjacency(df_all)
pos = nx.spring_layout(G, seed=1)  # fixed seed -> reproducible layout

edges = G.edges()
weights = [edges[u, v]['weight'] for u, v in edges]
node_colors = [dict_colors[cluster] for cluster in df_all.index]

nx.draw_networkx(G, pos, width=weights, node_color=node_colors, ax=ax)
plt.axis('off')

### Using connectivities

In [None]:
# Sum the PAGA 'connectivities' matrices of all datasets into one
# cluster x cluster table. 'U' gets a slot so that datasets containing that label
# can be added; it is removed afterwards.
# The accumulator is created as float: the PAGA weights are floats, and writing
# them into an integer frame relies on a silent upcast that pandas deprecates.
df_all = pd.DataFrame(0.0, index=list_accepted_clusters + ['U'], columns=list_accepted_clusters + ['U'])

for adata in list_all_datasets:
    categories = adata.obs['cluster_robust'].cat.categories
    dfsub = pd.DataFrame(adata.uns['paga']['connectivities'].toarray(),
                         columns=categories,
                         index=categories)

    df_all.loc[dfsub.index, dfsub.index] = df_all.loc[dfsub.index, dfsub.index] + dfsub

# Remove U (last row/column). drop() returns a new frame, so the thresholding
# below does not operate on a slice of the accumulator.
df_all = df_all.drop(index='U', columns='U')

# Discard edges whose summed weight is below 1.9, then raise the rest to the
# power 1.1, which slightly widens the gap between weak and strong edges.
df_all = df_all.mask(df_all < 1.9, 0)
df_all = df_all ** 1.1


In [None]:
# Draw the combined adjacency table as a weighted graph: edge width follows the
# edge weight, node colour follows the cluster colour palette.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(5, 4))
G = nx.from_pandas_adjacency(df_all)
pos = nx.spring_layout(G, seed=1)  # fixed seed -> reproducible layout

edges = G.edges()
weights = [attrs['weight'] for _, _, attrs in edges(data=True)]
node_colors = [dict_colors[cluster] for cluster in df_all.index]

# The axes created above are the current axes, so they are passed explicitly.
nx.draw_networkx(G, pos, width=weights, node_color=node_colors, ax=ax)
plt.axis('off')

# Getting the definitive list of genes

In [None]:
# Gene scoring per robust cluster: DEGs are recalculated, grouped by
# 'cluster_robust', and selected on p-value with a 0.05 threshold.
dict_make_gene_scoring_robust = make_gene_scoring_with_expr(
    list_datasets=list_datasets,
    calculate_DEGs=True,
    group_name='cluster_robust',
    value_ref='scores',
    select_method='pval',
    list_clusters=list_accepted_clusters,
    select_thres=0.05,
)

In [None]:
# Gene scoring per robust axis: DEGs are recalculated, grouped by 'axis_robust',
# and selected on p-value with a 0.05 threshold.
dict_make_gene_scoring_axis_robust = make_gene_scoring_with_expr(
    list_datasets=list_datasets,
    calculate_DEGs=True,
    group_name='axis_robust',
    value_ref='scores',
    select_method='pval',
    select_thres=0.05,
    list_clusters=list_accepted_axis,
)

In [None]:
dict_make_gene_scoring_robust['A1'].iloc[:40].sort_index()

In [None]:
dict_make_gene_scoring_robust['A2'].iloc[:40].sort_index()

In [None]:
dict_make_gene_scoring_robust['A3'].iloc[:40].sort_index()

In [None]:
dict_make_gene_scoring_robust['A4'].iloc[:40].sort_index()

In [None]:
dict_make_gene_scoring_robust['B1'].iloc[:40].sort_index()

In [None]:
dict_make_gene_scoring_robust['B2'].iloc[:40].sort_index()

In [None]:
dict_make_gene_scoring_robust['B3'].iloc[:40].sort_index()

In [None]:
dict_make_gene_scoring_robust['B4'].iloc[:40].sort_index()

In [None]:
dict_make_gene_scoring_robust['C1'].iloc[:40].sort_index()

In [None]:
dict_make_gene_scoring_robust['C2'].iloc[:40].sort_index()

In [None]:
dict_make_gene_scoring_robust['C3'].iloc[:40].sort_index()

In [None]:
dict_make_gene_scoring_robust['C5'].iloc[:40].sort_index()

In [None]:
dict_make_gene_scoring_robust['D1'].iloc[:40].sort_index()

In [None]:
dict_make_gene_scoring_robust['D2'].iloc[:40].sort_index()

In [None]:
dict_make_gene_scoring_robust['E1'].iloc[:40].sort_index()

In [None]:
dict_make_gene_scoring_robust['T1']
