# Obtaining robust cell population markers, and redefining/reassuring the biased cell populations

**To run this notebook, you must first run the 3H notebook in full.**

## imports

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import scanpy as sc
import scanpy.external as sce
import pandas as pd
import numpy as np
import os
import triku as tk
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from tqdm.notebook import tqdm
import scipy.sparse as spr
import matplotlib.cm as cm
import networkx as nx

In [3]:
!pip install cellassign



In [4]:
# local imports and imports from other notebooks
from cellassign import assign_cats
from fb_functions import make_gene_scoring_with_expr, plot_score_graph, plot_UMAPS_gene, plot_adata_cluster_properties
%store -r dict_colors
%store -r seed
%store -r magma
%store -r data_dir

In [5]:
# Notebook-wide display defaults: figure resolution and two-decimal float formatting in tables.
mpl.rcParams.update({'figure.dpi': 120})
pd.set_option('display.float_format', "{:,.2f}".format)

**IMPORTANT: I am running this analysis on a computer with ~500 GB of RAM. I will load many datasets at once, which might be too much for some computers. I took this decision consciously, to have as much information available at any time as possible. If you cannot run the whole analysis at once, you can run it in parts.**

## Anndata loading

In [6]:
# Ahlers 2022: the `fb_robust` subset and the full `processed` dataset.
ahlers_2022_dir = f'{data_dir}/ahlers_2022'
ahlers_2022_young_fb = sc.read(f'{ahlers_2022_dir}/ahlers_2022_young_fb_robust.h5')
ahlers_2022_young = sc.read(f'{ahlers_2022_dir}/ahlers_2022_young_processed.h5')

In [7]:
# Boothby 2021: the `fb_robust` subset and the full `processed` dataset.
boothby_2021_dir = f'{data_dir}/boothby_2021'
boothby_2021_ctrl_fb = sc.read(f'{boothby_2021_dir}/boothby_2021_ctrl_fb_robust.h5')
boothby_2021_ctrl = sc.read(f'{boothby_2021_dir}/boothby_2021_ctrl_processed.h5')

In [8]:
# Deng 2021: the `fb_robust` subset and the full `processed` dataset.
deng_2021_dir = f'{data_dir}/deng_2021'
deng_2021_scar_fb = sc.read(f'{deng_2021_dir}/deng_2021_scar_fb_robust.h5')
deng_2021_scar = sc.read(f'{deng_2021_dir}/deng_2021_scar_processed.h5')

In [9]:
# Gao 2021: the `fb_robust` subset and the full `processed` dataset.
gao_2021_dir = f'{data_dir}/gao_2021'
gao_2021_ctrl_fb = sc.read(f'{gao_2021_dir}/gao_2021_ctrl_fb_robust.h5')
gao_2021_ctrl = sc.read(f'{gao_2021_dir}/gao_2021_ctrl_processed.h5')

In [10]:
# Gaydosik 2020: the `fb_robust` subset and the full `processed` dataset.
gaydosik_2020_dir = f'{data_dir}/gaydosik_2020'
gaydosik_2020_ctrl_fb = sc.read(f'{gaydosik_2020_dir}/gaydosik_2020_ctrl_fb_robust.h5')
gaydosik_2020_ctrl = sc.read(f'{gaydosik_2020_dir}/gaydosik_2020_ctrl_processed.h5')

In [11]:
# He 2020: the `fb_robust` subset and the full `processed` dataset.
he_2020_dir = f'{data_dir}/He_2020'
he_2020_ctrl_fb = sc.read(f'{he_2020_dir}/he_2020_ctrl_fb_robust.h5')
he_2020_ctrl = sc.read(f'{he_2020_dir}/he_2020_ctrl_processed.h5')

In [12]:
# Hughes 2020: the `fb_robust` subset and the full `processed` dataset.
hughes_2020_dir = f'{data_dir}/hughes_2020'
hughes_2020_ctrl_fb = sc.read(f'{hughes_2020_dir}/hughes_2020_ctrl_fb_robust.h5')
hughes_2020_ctrl = sc.read(f'{hughes_2020_dir}/hughes_2020_ctrl_processed.h5')

In [13]:
# Kim 2020: the `fb_robust` subset and the full `processed` dataset.
kim_2020_dir = f'{data_dir}/Kim_2020'
kim_2020_ctrl_fb = sc.read(f'{kim_2020_dir}/kim_2020_ctrl_fb_robust.h5')
kim_2020_ctrl = sc.read(f'{kim_2020_dir}/kim_2020_ctrl_processed.h5')

In [14]:
# Liu 2021: the `fb_robust` subset and the full `processed` dataset.
liu_2021_dir = f'{data_dir}/liu_2021'
liu_2021_ctrl_fb = sc.read(f'{liu_2021_dir}/liu_2021_ctrl_fb_robust.h5')
liu_2021_ctrl = sc.read(f'{liu_2021_dir}/liu_2021_ctrl_processed.h5')

In [15]:
# Mariottoni 2021: the `fb_robust` subset and the full `processed` dataset.
mariottoni_2021_dir = f'{data_dir}/mariottoni_2021'
mariottoni_2021_ctrl_fb = sc.read(f'{mariottoni_2021_dir}/mariottoni_2021_ctrl_fb_robust.h5')
mariottoni_2021_ctrl = sc.read(f'{mariottoni_2021_dir}/mariottoni_2021_ctrl_processed.h5')

In [16]:
# Mirizio 2020: the `fb_robust` subset and the full `processed` dataset.
mirizio_2020_dir = f'{data_dir}/mirizio_2020'
mirizio_2020_scl_fb = sc.read(f'{mirizio_2020_dir}/mirizio_2020_scl_fb_robust.h5')
mirizio_2020_scl = sc.read(f'{mirizio_2020_dir}/mirizio_2020_scl_processed.h5')

In [17]:
# Reynolds 2021: only the `fb_robust` subset is loaded.
reynolds_2021_dir = f'{data_dir}/reynolds_2021'
reynolds_2021_ctrl_fb = sc.read(f'{reynolds_2021_dir}/reynolds_2021_ctrl_fb_robust.h5')
# The dataset containing the rest of the cells is not available for this study. The subset
# above can still be used, but the dataset is not of good quality, so it is not fully relevant.

In [18]:
# Rindler 2021: the `fb_robust` subset and the full `processed` dataset.
rindler_2021_dir = f'{data_dir}/rindler_2021'
rindler_2021_ctrl_fb = sc.read(f'{rindler_2021_dir}/rindler_2021_ctrl_fb_robust.h5')
rindler_2021_ctrl = sc.read(f'{rindler_2021_dir}/rindler_2021_ctrl_processed.h5')

In [19]:
# Solé-Boldo 2020: the `fb_robust` subset and the full `processed` dataset.
sole_2020_dir = f'{data_dir}/Sole-Boldo_2020'
sole_2020_young_fb = sc.read(f'{sole_2020_dir}/sole_2020_young_fb_robust.h5')
sole_2020_young = sc.read(f'{sole_2020_dir}/sole_2020_young_processed.h5')

In [20]:
# Tabib 2018: the `fb_robust` subset and the full `processed` dataset.
tabib_2018_dir = f'{data_dir}/Tabib_2018'
tabib_2018_ctrl_fb = sc.read(f'{tabib_2018_dir}/tabib_2018_ctrl_fb_robust.h5')
tabib_2018_ctrl = sc.read(f'{tabib_2018_dir}/tabib_2018_ctrl_processed.h5')

In [21]:
# Tabib 2021: the `fb_robust` subset and the full `processed` dataset.
tabib_2021_dir = f'{data_dir}/Tabib_2021'
tabib_2021_ctrl_fb = sc.read(f'{tabib_2021_dir}/tabib_2021_ctrl_fb_robust.h5')
tabib_2021_ctrl = sc.read(f'{tabib_2021_dir}/tabib_2021_ctrl_processed.h5')

In [22]:
# Theocharidis 2020: the `fb_robust` subset and the full `processed` dataset.
# No trailing slash on the directory: the file names below already start with '/',
# so a trailing slash produced '//' in the path, unlike the other loading cells.
theo_2020_dir = data_dir + '/Theocharidis_2020'
theo_2020_ctrl_dm_fb = sc.read(theo_2020_dir + '/theo_2020_ctrl_dm_fb_robust.h5')
theo_2020_ctrl_dm = sc.read(theo_2020_dir + '/theo_2020_ctrl_dm_processed.h5')

In [23]:
# Theocharidis 2021: the `fb_robust` subset and the full `processed` dataset.
# No trailing slash on the directory: the file names below already start with '/',
# so a trailing slash produced '//' in the path, unlike the other loading cells.
theo_2021_dir = data_dir + '/Theocharidis_2021'
theo_2021_ctrl_fb = sc.read(theo_2021_dir + '/theo_2021_ctrl_fb_robust.h5')
theo_2021_ctrl = sc.read(theo_2021_dir + '/theo_2021_ctrl_processed.h5')

In [24]:
# Vorstandlechner 2020: the `fb_robust` subset and the full `processed` dataset.
vors_2020_dir = f'{data_dir}/Vorstandlechner_2020'
vors_2020_ctrl_fb = sc.read(f'{vors_2020_dir}/vors_2020_ctrl_fb_robust.h5')
vors_2020_ctrl = sc.read(f'{vors_2020_dir}/vors_2020_ctrl_processed.h5')

In [25]:
# Xu 2021: the `fb_robust` subset and the full `processed` dataset.
xu_2021_dir = f'{data_dir}/xu_2021'
xu_2021_ctrl_fb = sc.read(f'{xu_2021_dir}/xu_2021_ctrl_fb_robust.h5')
xu_2021_ctrl = sc.read(f'{xu_2021_dir}/xu_2021_ctrl_processed.h5')

# Getting the definitive list of genes
This list is not *really* definitive. The markers produced here are filtered later on, in the analysis table (not in the notebooks), based on gene function or pattern of expression. The *final* list is in the variable **genes** below.

In [26]:
%store -r list_all_datasets
%store -r list_datasets

%store -r list_accepted_clusters
%store -r list_accepted_axis
%store -r list_names

In [27]:
# Gene scoring per accepted cluster, grouping cells by the `cluster_robust` labels.
dict_make_gene_scoring_robust = make_gene_scoring_with_expr(
    list_datasets=list_datasets,
    list_clusters=list_accepted_clusters,
    group_name='cluster_robust',
    calculate_DEGs=True,
    value_ref='scores',
    select_method='pval',
    select_thres=0.05,
)

In [28]:
# Same scoring as for the clusters, but grouping cells by the `axis_robust` labels.
dict_make_gene_scoring_axis_robust = make_gene_scoring_with_expr(
    list_datasets=list_datasets,
    list_clusters=list_accepted_axis,
    group_name='axis_robust',
    calculate_DEGs=True,
    value_ref='scores',
    select_method='pval',
    select_thres=0.05,
)

In [52]:
%store dict_make_gene_scoring_robust
%store dict_make_gene_scoring_axis_robust

Stored 'dict_make_gene_scoring_robust' (dict)
Stored 'dict_make_gene_scoring_axis_robust' (dict)


In [29]:
# First 40 rows of the A1 scoring table, displayed alphabetically by gene.
dict_make_gene_scoring_robust['A1'].head(40).sort_index()

Unnamed: 0,Ahlers-2022-Healthy,Gao-2021-Healthy,Gaydosik-2019-Healthy,Kim-2020-Healthy,Liu-2021-Keloid,Mariottoni-2021-Healthy,Mirizio-2020-Scleroderma,Rindler-2021-Healthy,Solé-Boldo-2020-Healthy,Tabib-2018-Healthy,Tabib-2021-Healthy,Theocarditis-2020-DM – non ulcer/Healthy,Vorstandlechner-2020-Healthy,Xu-2021-Healthy,mean,dev,CV,expr,expr_pow,Z
ABCB5,3.45,8.55,0.0,0,0,6.13,0.0,3.87,1.34,0.0,0.0,0.0,0.6,0,3.22,1.47,2.19,0.1,0.56,5.73
ACKR3,1.46,17.72,2.24,0,0,7.93,2.23,2.91,2.4,2.17,4.55,1.42,0.24,0,5.66,1.81,3.13,1.06,1.01,5.59
AEBP1,12.67,17.92,3.53,0,0,17.43,3.15,7.81,9.17,1.46,1.91,2.7,3.8,0,9.46,1.85,5.12,9.59,1.76,5.38
ANGPTL5,4.14,15.47,6.32,0,0,10.73,0.59,4.4,1.67,1.91,7.3,1.96,0.53,0,6.6,1.72,3.84,2.98,1.31,5.02
CA12,2.82,10.59,1.11,0,0,6.94,0.73,5.35,1.34,0.0,1.27,0.56,1.18,0,4.09,1.54,2.65,0.48,0.83,4.92
CCN5,0.0,36.5,0.0,0,0,0.0,3.33,0.0,0.0,0.0,0.0,0.0,0.0,0,6.55,2.4,2.73,0.12,0.59,11.03
CD34,11.05,12.51,3.85,0,0,6.98,1.56,12.89,2.74,1.62,2.67,2.28,4.16,0,7.1,1.64,4.32,5.29,1.52,4.68
CD55,14.67,23.67,8.06,0,0,9.52,3.15,9.93,5.9,1.78,9.72,4.12,4.22,0,10.92,1.89,5.77,8.91,1.73,6.32
CHRDL1,6.93,13.73,6.12,0,0,10.72,2.09,10.92,5.43,2.48,5.07,0.91,3.08,0,7.6,1.6,4.74,2.93,1.31,5.81
CPVL,7.3,11.91,4.28,0,0,8.12,0.66,11.64,3.48,1.64,3.95,3.25,2.03,0,6.64,1.57,4.23,1.53,1.11,5.96


In [30]:
# First 40 rows of the A2 scoring table, displayed alphabetically by gene.
dict_make_gene_scoring_robust['A2'].head(40).sort_index()

Unnamed: 0,Ahlers-2022-Healthy,Gao-2021-Healthy,Gaydosik-2019-Healthy,Kim-2020-Healthy,Liu-2021-Keloid,Mariottoni-2021-Healthy,Mirizio-2020-Scleroderma,Rindler-2021-Healthy,Solé-Boldo-2020-Healthy,Tabib-2018-Healthy,Tabib-2021-Healthy,Theocarditis-2020-DM – non ulcer/Healthy,Vorstandlechner-2020-Healthy,Xu-2021-Healthy,mean,dev,CV,expr,expr_pow,Z
AHRR,10.16,4.99,4.79,4.67,1.9,2.89,2.25,6.02,4.31,4.31,6.13,5.55,1.41,2.67,4.94,1.34,3.68,0.32,0.75,6.55
AKAP6,7.09,5.97,7.63,4.53,2.19,1.73,4.64,7.09,4.04,3.94,7.85,2.34,2.78,5.82,5.13,1.26,4.07,0.31,0.74,6.9
APCDD1,10.37,13.6,19.34,7.27,8.52,13.13,4.39,20.45,5.37,13.23,23.65,28.61,9.57,11.41,13.81,1.92,7.21,3.56,1.37,10.05
AXIN2,14.98,10.28,6.7,7.66,2.44,3.38,5.51,8.72,7.35,4.9,8.23,8.79,3.94,2.28,7.56,1.53,4.93,1.05,1.01,7.47
C1orf198,21.27,7.25,7.36,13.33,3.32,3.56,5.2,10.62,6.96,5.75,8.98,7.68,4.37,2.41,8.84,1.76,5.02,2.3,1.23,7.18
CD9,36.24,11.07,7.91,18.48,5.57,3.77,10.16,11.86,13.65,6.25,12.55,12.63,9.08,10.12,13.92,2.09,6.68,20.53,2.13,6.54
CLEC2A,24.71,9.57,12.33,15.07,3.64,2.27,8.41,11.57,9.32,6.68,13.18,9.86,4.55,5.7,11.12,1.83,6.07,0.79,0.94,11.79
COL13A1,15.25,9.86,8.38,12.69,3.26,3.3,7.76,9.52,8.68,7.28,10.54,10.04,4.87,5.48,9.13,1.5,6.07,0.79,0.94,9.67
COL18A1,39.93,19.18,17.41,20.94,9.91,9.41,13.1,18.11,11.68,13.47,23.34,24.22,6.38,6.21,18.69,2.13,8.76,3.35,1.35,13.82
COL23A1,17.95,16.87,10.11,14.14,4.62,5.36,6.89,10.14,9.15,5.05,11.27,14.41,4.37,6.62,10.71,1.66,6.44,0.95,0.99,10.85


In [31]:
# First 40 rows of the A3 scoring table, displayed alphabetically by gene.
dict_make_gene_scoring_robust['A3'].head(40).sort_index()

Unnamed: 0,Ahlers-2022-Healthy,Gao-2021-Healthy,Gaydosik-2019-Healthy,Kim-2020-Healthy,Liu-2021-Keloid,Mariottoni-2021-Healthy,Mirizio-2020-Scleroderma,Rindler-2021-Healthy,Solé-Boldo-2020-Healthy,Tabib-2018-Healthy,Tabib-2021-Healthy,Theocarditis-2020-DM – non ulcer/Healthy,Vorstandlechner-2020-Healthy,Xu-2021-Healthy,mean,dev,CV,expr,expr_pow,Z
ACKR4,5.05,2.05,11.48,6.01,4.53,2.72,3.89,11.31,3.03,5.99,14.76,9.73,5.26,7.01,7.12,1.56,4.57,1.15,1.03,6.88
AEBP1,21.85,6.88,13.47,11.31,12.53,3.25,5.14,17.73,12.59,9.7,15.03,20.84,8.02,4.08,12.73,1.78,7.16,8.7,1.72,7.41
ANGPTL5,15.66,6.82,12.28,11.1,7.71,2.81,2.36,7.53,9.53,9.15,16.06,10.19,5.46,4.68,9.57,1.59,6.0,2.74,1.29,7.44
C1QTNF3,20.18,5.57,14.26,7.98,7.39,2.38,0.0,11.95,9.79,9.24,17.36,16.34,12.15,4.06,11.09,1.78,6.24,2.72,1.28,8.63
CERCAM,15.87,3.83,9.65,8.94,9.64,1.46,3.24,9.96,9.95,8.21,10.63,11.28,8.38,3.91,8.97,1.53,5.86,3.41,1.36,6.6
CES1,19.65,3.72,11.27,9.92,7.4,2.16,8.67,12.23,10.5,3.86,15.13,11.84,3.7,6.92,9.88,1.7,5.79,2.92,1.31,7.56
COL14A1,12.65,6.84,14.55,6.53,8.99,2.25,6.09,13.34,5.37,8.11,15.47,13.45,2.7,6.15,9.57,1.61,5.94,4.25,1.44,6.66
COL1A1,19.41,3.89,21.85,0.0,11.73,3.39,0.94,19.05,3.85,10.16,20.07,36.33,18.08,9.3,14.15,2.16,6.55,17.28,2.04,6.94
COL1A2,50.62,5.17,23.05,7.88,12.53,3.71,7.79,20.6,18.93,10.44,20.39,37.54,17.25,11.53,19.84,2.38,8.32,41.33,2.54,7.82
COL3A1,45.69,3.11,23.29,6.43,10.16,0.55,6.24,14.62,10.73,10.48,18.14,30.37,16.25,8.55,16.77,2.32,7.24,32.14,2.38,7.04


In [32]:
# First 40 rows of the A4 scoring table, displayed alphabetically by gene.
dict_make_gene_scoring_robust['A4'].head(40).sort_index()

Unnamed: 0,Ahlers-2022-Healthy,Gao-2021-Healthy,Gaydosik-2019-Healthy,Kim-2020-Healthy,Liu-2021-Keloid,Mariottoni-2021-Healthy,Mirizio-2020-Scleroderma,Rindler-2021-Healthy,Solé-Boldo-2020-Healthy,Tabib-2018-Healthy,Tabib-2021-Healthy,Theocarditis-2020-DM – non ulcer/Healthy,Vorstandlechner-2020-Healthy,Xu-2021-Healthy,mean,dev,CV,expr,expr_pow,Z
ACE,8.98,2.76,0,0,0,3.71,4.53,4.73,0.63,1.43,2.54,3.43,0,1.23,4.09,1.38,2.96,0.28,0.73,5.61
ACKR3,12.13,4.48,0,0,0,7.53,7.38,5.08,0.66,3.05,2.42,6.45,0,1.9,6.09,1.53,3.99,1.05,1.01,6.02
C1QTNF3,20.31,2.44,0,0,0,7.81,11.94,7.84,0.19,5.61,4.49,9.42,0,3.72,9.23,1.83,5.03,3.35,1.35,6.82
CA12,12.44,1.26,0,0,0,3.78,2.35,4.72,1.65,4.02,2.69,4.85,0,1.14,4.86,1.56,3.11,0.5,0.84,5.79
CD248,18.65,2.12,0,0,0,5.77,8.97,8.46,2.19,4.61,5.02,10.6,0,2.76,8.47,1.77,4.79,5.23,1.51,5.6
CD34,24.77,3.7,0,0,0,5.68,3.88,6.97,2.21,5.18,4.86,6.13,0,3.93,8.59,1.99,4.32,5.33,1.52,5.65
CD55,33.65,6.19,0,0,0,9.16,11.44,10.72,3.49,9.87,7.66,18.0,0,6.33,14.31,2.15,6.66,9.13,1.74,8.23
CD70,21.48,1.74,0,0,0,4.26,7.18,4.69,1.95,5.56,4.05,3.05,0,1.77,7.44,1.91,3.9,1.57,1.12,6.64
CDA,12.98,0.0,0,0,0,2.38,5.1,5.05,1.23,2.89,1.5,4.75,0,0.86,4.86,1.62,3.01,0.28,0.73,6.68
CLEC3B,16.49,4.87,0,0,0,9.13,3.38,10.77,1.16,4.79,4.21,8.65,0,3.42,7.78,1.7,4.56,2.69,1.28,6.07


In [33]:
# First 40 rows of the B1 scoring table, displayed alphabetically by gene.
dict_make_gene_scoring_robust['B1'].head(40).sort_index()

Unnamed: 0,Ahlers-2022-Healthy,Gao-2021-Healthy,Gaydosik-2019-Healthy,Kim-2020-Healthy,Liu-2021-Keloid,Mariottoni-2021-Healthy,Mirizio-2020-Scleroderma,Rindler-2021-Healthy,Solé-Boldo-2020-Healthy,Tabib-2018-Healthy,Tabib-2021-Healthy,Theocarditis-2020-DM – non ulcer/Healthy,Vorstandlechner-2020-Healthy,Xu-2021-Healthy,mean,dev,CV,expr,expr_pow,Z
ADAMTS4,4.95,11.33,2.81,3.35,6.69,0,3.49,3.84,3.75,2.76,2.52,6.23,3.6,0.4,4.9,1.39,3.52,0.3,0.74,6.62
ARID5B,28.77,12.97,1.98,9.74,14.05,0,2.42,5.27,17.18,5.68,4.52,16.67,11.37,0.0,11.95,1.97,6.05,7.01,1.63,7.35
C11orf96,29.41,16.21,4.26,9.96,12.48,0,8.47,3.51,16.2,5.14,4.98,11.73,2.61,0.0,11.4,1.99,5.74,4.5,1.46,7.83
CCL2,27.41,13.41,3.91,16.24,3.15,0,11.95,8.21,14.14,7.36,8.07,21.25,16.64,0.14,13.03,1.95,6.68,12.4,1.88,6.95
CDKN1A,32.93,19.17,6.44,15.79,6.38,0,11.3,13.16,13.49,6.62,7.1,13.15,10.19,0.0,13.55,2.0,6.77,8.66,1.72,7.9
CEBPB,18.98,16.2,6.14,15.34,12.08,0,9.71,3.76,18.35,4.85,5.41,17.29,13.46,0.0,12.51,1.77,7.07,10.54,1.8,6.94
CHD1,13.15,13.21,2.47,6.23,9.46,0,2.46,3.28,5.86,5.33,4.3,10.27,6.78,0.0,7.47,1.57,4.75,2.05,1.2,6.25
CXCL1,22.45,5.06,2.92,14.84,8.49,0,5.97,5.29,17.11,1.02,1.88,16.76,14.5,0.0,10.35,1.91,5.41,2.5,1.26,8.23
CXCL2,62.15,13.76,5.83,18.16,9.25,0,10.97,7.23,23.77,7.06,14.27,25.13,20.14,0.77,19.37,2.52,7.68,10.08,1.78,10.87
CXCL3,33.13,8.35,3.45,14.09,11.91,0,7.99,3.37,16.8,3.93,7.03,21.04,15.3,0.0,13.07,2.07,6.32,4.4,1.45,9.02


In [34]:
# First 40 rows of the B2 scoring table, displayed alphabetically by gene.
dict_make_gene_scoring_robust['B2'].head(40).sort_index()

Unnamed: 0,Ahlers-2022-Healthy,Gao-2021-Healthy,Gaydosik-2019-Healthy,Kim-2020-Healthy,Liu-2021-Keloid,Mariottoni-2021-Healthy,Mirizio-2020-Scleroderma,Rindler-2021-Healthy,Solé-Boldo-2020-Healthy,Tabib-2018-Healthy,Tabib-2021-Healthy,Theocarditis-2020-DM – non ulcer/Healthy,Vorstandlechner-2020-Healthy,Xu-2021-Healthy,mean,dev,CV,expr,expr_pow,Z
ABCC3,3.35,4.7,3.01,3.02,2.97,3.28,2.41,4.61,1.43,0.22,2.25,0.89,1.15,0.91,2.79,1.08,2.58,0.23,0.69,4.04
ADRA2A,3.5,4.93,4.25,4.81,2.19,2.17,2.99,4.4,7.76,1.51,3.65,1.19,2.17,0.0,3.67,1.2,3.05,0.39,0.79,4.65
ANKRD29,3.76,4.41,2.9,5.95,1.64,2.99,3.8,6.02,3.0,1.28,3.74,1.51,1.1,0.0,3.45,1.18,2.93,0.27,0.72,4.79
APOE,34.31,17.07,25.42,21.01,10.83,10.33,12.29,18.68,18.84,5.23,18.31,7.19,7.14,2.66,16.98,2.01,8.47,11.73,1.85,9.18
B2M,18.85,12.22,12.71,14.94,4.86,6.0,8.22,14.09,14.74,3.04,9.62,2.93,7.55,2.05,10.73,1.69,6.36,40.82,2.53,4.25
C3,9.58,8.83,13.84,10.72,9.36,2.16,8.25,9.1,10.12,3.91,8.5,1.66,3.37,2.07,8.08,1.51,5.37,4.9,1.49,5.43
C7,6.89,7.69,7.98,4.35,4.92,6.71,3.05,5.23,1.22,4.07,7.93,0.23,0.0,0.0,4.88,1.39,3.51,1.59,1.12,4.35
CCL19,15.29,12.23,12.52,14.22,6.01,6.11,10.3,15.09,12.58,3.71,12.02,1.38,3.97,0.0,10.36,1.66,6.25,2.17,1.21,8.54
CD200R1,2.22,4.01,2.47,3.72,1.64,0.95,1.93,3.7,2.01,0.0,2.42,0.62,0.0,0.0,2.19,1.08,2.02,0.08,0.54,4.07
CD74,9.86,7.69,7.19,7.68,3.15,1.59,8.45,10.98,6.77,3.09,6.93,1.59,6.19,0.47,6.59,1.44,4.58,3.28,1.35,4.9


In [35]:
# First 40 rows of the B3 scoring table, displayed alphabetically by gene.
dict_make_gene_scoring_robust['B3'].head(40).sort_index()

Unnamed: 0,Ahlers-2022-Healthy,Gao-2021-Healthy,Gaydosik-2019-Healthy,Kim-2020-Healthy,Liu-2021-Keloid,Mariottoni-2021-Healthy,Mirizio-2020-Scleroderma,Rindler-2021-Healthy,Solé-Boldo-2020-Healthy,Tabib-2018-Healthy,Tabib-2021-Healthy,Theocarditis-2020-DM – non ulcer/Healthy,Vorstandlechner-2020-Healthy,Xu-2021-Healthy,mean,dev,CV,expr,expr_pow,Z
ACHE,3.05,3.93,1.43,4.24,3.64,0,4.06,3.82,0.11,2.24,3.38,3.22,0.79,0.9,2.89,1.08,2.67,0.15,0.62,4.66
ADRA2A,4.02,4.5,1.31,3.94,6.35,0,3.1,5.67,1.91,1.29,1.78,2.38,0.82,3.76,3.4,1.18,2.88,0.41,0.8,4.24
ANKRD29,3.88,3.55,2.42,3.78,2.63,0,0.82,4.0,0.2,1.5,3.49,2.47,1.93,1.68,2.7,1.05,2.56,0.28,0.73,3.72
APOE,31.27,7.66,6.85,10.58,8.19,0,8.28,12.24,6.12,8.19,14.34,15.33,6.37,11.02,12.32,1.93,6.38,12.26,1.87,6.59
BIRC3,2.79,5.57,4.28,2.99,3.96,0,2.33,6.33,1.95,3.53,4.71,2.27,1.68,3.82,3.71,1.12,3.3,0.99,1.0,3.72
C3,11.33,6.43,2.61,4.01,7.74,0,6.24,7.93,5.37,3.74,7.58,9.35,0.81,7.31,6.64,1.39,4.76,4.91,1.49,4.46
CCL19,14.59,7.77,5.07,8.4,14.59,0,10.85,13.89,4.25,5.11,12.75,12.45,10.61,7.39,10.33,1.53,6.77,2.3,1.23,8.39
CCL2,25.98,6.97,6.8,14.11,5.63,0,5.9,12.63,7.72,4.96,6.42,4.05,4.21,8.32,9.76,1.86,5.24,12.5,1.88,5.19
CD74,8.15,4.51,3.82,3.78,5.14,0,5.56,13.46,3.24,3.54,8.14,6.1,3.53,6.08,6.3,1.45,4.34,3.48,1.37,4.61
CH25H,7.65,5.72,0.91,6.21,5.68,0,2.77,8.31,1.44,2.92,3.68,3.88,1.44,4.04,4.7,1.33,3.53,0.9,0.97,4.83


In [36]:
# First 40 rows of the B4 scoring table, displayed alphabetically by gene.
dict_make_gene_scoring_robust['B4'].head(40).sort_index()

Unnamed: 0,Ahlers-2022-Healthy,Gao-2021-Healthy,Gaydosik-2019-Healthy,Kim-2020-Healthy,Liu-2021-Keloid,Mariottoni-2021-Healthy,Mirizio-2020-Scleroderma,Rindler-2021-Healthy,Solé-Boldo-2020-Healthy,Tabib-2018-Healthy,Tabib-2021-Healthy,Theocarditis-2020-DM – non ulcer/Healthy,Vorstandlechner-2020-Healthy,Xu-2021-Healthy,mean,dev,CV,expr,expr_pow,Z
ABCA10,6.81,7.99,4.05,0.11,1.0,4.18,1.62,4.78,0.54,2.81,3.7,6.33,2.2,3.32,4.0,1.32,3.04,0.87,0.97,4.14
ABCA8,13.08,9.72,6.6,3.03,1.11,6.45,2.54,5.89,3.68,6.71,9.2,8.38,3.61,7.47,7.03,1.47,4.79,3.56,1.37,5.12
ABCA9,8.67,6.37,2.46,1.89,0.77,7.56,1.68,3.88,0.76,3.5,4.5,2.39,1.96,3.02,4.08,1.35,3.01,0.94,0.99,4.14
APOC1,16.56,9.08,7.33,2.23,0.82,5.86,1.35,4.76,2.67,5.98,10.83,6.05,1.16,3.75,6.64,1.65,4.02,2.11,1.2,5.51
APOD,38.17,14.32,11.4,7.29,4.38,11.61,5.08,9.88,7.59,5.63,16.6,17.94,7.99,2.77,13.38,2.13,6.28,26.06,2.26,5.92
APOE,29.95,12.43,14.45,4.69,1.05,11.48,5.02,11.64,6.75,7.04,21.31,9.14,7.54,5.91,12.29,1.99,6.18,11.85,1.86,6.62
ATP8B4,2.56,2.39,0.72,1.6,0.71,2.28,0.96,1.5,1.11,3.27,3.95,3.21,1.25,1.32,2.15,1.0,2.14,0.07,0.52,4.16
C3,16.73,8.91,5.54,3.47,0.0,7.34,3.2,5.02,0.42,3.55,11.44,13.8,1.05,5.24,7.36,1.72,4.29,4.95,1.49,4.93
C6,4.49,4.02,2.31,1.4,0.44,2.79,0.69,2.44,0.93,2.24,3.58,1.68,0.64,1.98,2.42,1.08,2.25,0.1,0.56,4.32
C7,10.42,8.29,8.2,2.87,1.32,9.91,3.23,6.04,3.19,6.13,12.11,12.38,5.17,6.45,7.69,1.5,5.15,1.61,1.13,6.83


In [37]:
# First 40 rows of the C1 scoring table, displayed alphabetically by gene.
dict_make_gene_scoring_robust['C1'].head(40).sort_index()

Unnamed: 0,Ahlers-2022-Healthy,Gao-2021-Healthy,Gaydosik-2019-Healthy,Kim-2020-Healthy,Liu-2021-Keloid,Mariottoni-2021-Healthy,Mirizio-2020-Scleroderma,Rindler-2021-Healthy,Solé-Boldo-2020-Healthy,Tabib-2018-Healthy,Tabib-2021-Healthy,Theocarditis-2020-DM – non ulcer/Healthy,Vorstandlechner-2020-Healthy,Xu-2021-Healthy,mean,dev,CV,expr,expr_pow,Z
ACAN,4.91,6.47,0.0,4.67,4.24,3.68,1.8,2.41,3.55,3.93,3.16,0,0,1.3,3.78,1.18,3.2,0.08,0.54,7.01
ADAMTS18,4.34,7.16,0.86,1.6,1.92,1.56,0.9,2.09,2.47,2.26,2.83,0,0,0.0,2.82,1.27,2.23,0.05,0.48,5.86
ALX4,7.27,8.94,1.99,2.2,0.69,6.96,1.38,2.69,2.31,3.58,3.94,0,0,0.71,4.28,1.42,3.02,0.19,0.66,6.52
ANOS1,6.26,6.06,1.59,1.34,1.9,2.43,0.0,2.2,3.16,0.0,3.14,0,0,1.58,2.97,1.28,2.31,0.1,0.56,5.34
CFHR1,9.52,5.51,1.13,3.47,1.29,3.18,4.03,0.95,3.7,2.94,3.53,0,0,1.76,3.98,1.36,2.93,0.16,0.63,6.34
COL11A1,21.29,15.76,6.14,7.79,11.64,15.28,3.53,3.88,7.81,7.33,12.89,0,0,3.15,11.17,1.79,6.24,0.64,0.89,12.49
COL21A1,9.95,10.4,3.32,6.76,4.31,5.23,2.42,3.74,4.39,5.71,7.03,0,0,0.75,6.12,1.42,4.31,1.05,1.01,6.05
CYYR1,8.16,6.13,0.07,4.81,1.73,3.89,3.48,2.26,3.2,3.98,2.64,0,0,2.71,4.11,1.29,3.19,0.31,0.75,5.49
DOK6,3.93,8.54,1.96,3.16,2.79,4.92,0.0,2.27,2.7,2.77,3.87,0,0,1.82,3.74,1.3,2.87,0.18,0.65,5.75
DPEP1,20.08,11.58,4.64,6.05,5.75,12.05,4.11,2.92,8.73,8.31,13.01,0,0,2.04,9.56,1.74,5.5,0.57,0.87,10.99


In [38]:
# First 40 rows of the C2 scoring table, displayed alphabetically by gene.
dict_make_gene_scoring_robust['C2'].head(40).sort_index()

Unnamed: 0,Ahlers-2022-Healthy,Gao-2021-Healthy,Gaydosik-2019-Healthy,Kim-2020-Healthy,Liu-2021-Keloid,Mariottoni-2021-Healthy,Mirizio-2020-Scleroderma,Rindler-2021-Healthy,Solé-Boldo-2020-Healthy,Tabib-2018-Healthy,Tabib-2021-Healthy,Theocarditis-2020-DM – non ulcer/Healthy,Vorstandlechner-2020-Healthy,Xu-2021-Healthy,mean,dev,CV,expr,expr_pow,Z
ARHGAP15,10.98,4.29,7.56,14.35,2.7,2.17,6.9,5.44,7.59,3.69,9.02,0,0,0,7.58,1.53,4.96,0.43,0.81,9.34
ASPN,26.49,12.15,24.28,39.14,3.97,4.03,27.71,12.12,19.67,7.48,27.37,0,0,0,21.08,2.2,9.56,1.93,1.18,17.89
BTBD11,3.87,6.95,4.23,4.66,1.83,1.42,1.64,2.96,3.32,1.08,4.3,0,0,0,3.57,1.16,3.08,0.04,0.45,8.0
CADM2,4.64,9.42,7.15,7.03,1.42,1.9,1.82,4.83,2.44,2.64,7.39,0,0,0,5.11,1.37,3.74,0.13,0.6,8.49
CCK,7.07,4.54,8.67,13.21,1.32,1.42,2.79,5.69,5.5,3.33,8.77,0,0,0,6.5,1.52,4.28,0.1,0.56,11.7
CHADL,6.43,4.58,9.49,13.77,2.24,1.9,4.87,2.41,4.59,2.56,9.62,0,0,0,6.57,1.56,4.2,0.44,0.81,8.07
CLEC14A,6.25,3.67,8.27,8.73,1.44,2.87,4.41,4.94,3.95,3.88,8.68,0,0,0,5.8,1.34,4.34,0.32,0.75,7.73
COCH,37.7,15.18,56.8,52.02,5.5,4.29,44.06,13.53,31.31,8.15,57.86,0,0,0,34.56,2.71,12.75,1.74,1.15,30.11
COL24A1,4.35,9.47,6.31,4.44,2.75,3.79,2.8,4.61,3.6,1.88,6.75,0,0,0,4.89,1.25,3.91,0.14,0.61,7.97
CPNE5,6.68,5.77,7.17,8.92,3.05,4.66,4.18,5.34,3.98,2.11,6.91,0,0,0,5.82,1.23,4.72,0.09,0.54,10.75


In [39]:
# First 40 rows of the C3 scoring table, displayed alphabetically by gene.
dict_make_gene_scoring_robust['C3'].head(40).sort_index()

Unnamed: 0,Ahlers-2022-Healthy,Gao-2021-Healthy,Gaydosik-2019-Healthy,Kim-2020-Healthy,Liu-2021-Keloid,Mariottoni-2021-Healthy,Mirizio-2020-Scleroderma,Rindler-2021-Healthy,Solé-Boldo-2020-Healthy,Tabib-2018-Healthy,Tabib-2021-Healthy,Theocarditis-2020-DM – non ulcer/Healthy,Vorstandlechner-2020-Healthy,Xu-2021-Healthy,mean,dev,CV,expr,expr_pow,Z
ACAN,1.85,1.89,1.39,0.31,4.75,1.58,0,2.4,0.0,0.7,1.28,0.0,0,0,1.63,1.11,1.47,0.08,0.54,3.03
ADAM12,0.72,0.56,1.82,4.26,6.01,0.19,0,6.38,1.06,0.0,1.71,2.72,0,0,2.67,1.32,2.02,0.08,0.54,4.96
ADGRE2,3.46,2.91,3.82,2.15,1.16,2.06,0,3.31,1.76,0.0,3.8,0.0,0,0,2.36,1.08,2.17,0.06,0.5,4.7
ASPN,6.59,2.94,2.74,2.07,7.31,1.3,0,8.97,1.61,3.22,6.11,1.05,0,0,4.42,1.4,3.15,1.95,1.18,3.74
BGN,7.84,4.81,5.04,6.42,4.96,4.27,0,5.0,4.06,3.43,7.19,6.2,0,0,5.48,1.08,5.07,9.01,1.73,3.16
COL11A1,3.43,1.68,2.63,1.84,5.69,2.31,0,7.02,1.5,0.0,2.5,0.0,0,0,2.97,1.29,2.3,0.7,0.92,3.25
COL5A2,1.02,3.76,1.63,3.17,7.44,1.08,0,9.8,0.99,0.0,1.69,5.78,0,0,3.76,1.47,2.56,1.72,1.15,3.28
COL6A3,6.22,4.22,5.26,7.19,6.16,1.38,0,11.03,4.3,2.0,4.19,6.48,0,0,5.72,1.38,4.16,9.32,1.75,3.27
EDIL3,2.51,1.1,2.84,2.53,6.42,2.92,0,7.4,2.27,0.68,2.64,1.43,0,0,3.32,1.29,2.57,0.45,0.82,4.05
EDNRA,5.44,2.65,5.1,2.96,2.37,2.9,0,7.84,2.99,2.01,6.86,0.0,0,0,4.01,1.32,3.05,0.57,0.87,4.62


In [40]:
# First 40 rows of the C5 scoring table, displayed alphabetically by gene.
dict_make_gene_scoring_robust['C5'].head(40).sort_index()

Unnamed: 0,Ahlers-2022-Healthy,Gao-2021-Healthy,Gaydosik-2019-Healthy,Kim-2020-Healthy,Liu-2021-Keloid,Mariottoni-2021-Healthy,Mirizio-2020-Scleroderma,Rindler-2021-Healthy,Solé-Boldo-2020-Healthy,Tabib-2018-Healthy,Tabib-2021-Healthy,Theocarditis-2020-DM – non ulcer/Healthy,Vorstandlechner-2020-Healthy,Xu-2021-Healthy,mean,dev,CV,expr,expr_pow,Z
ALX4,0,3.63,0.0,1.67,1.66,0,0,0.93,0,0,0.8,0,0,3.82,2.24,1.11,2.02,0.2,0.67,3.36
AOPEP,0,4.83,0.0,2.68,0.0,0,0,0.0,0,0,0.0,0,0,0.0,1.2,1.24,0.97,0.22,0.68,1.75
ASPN,0,0.67,1.54,0.17,0.0,0,0,1.04,0,0,1.86,0,0,5.4,1.92,1.28,1.5,1.22,1.05,1.83
BMP7,0,3.81,1.0,0.42,0.61,0,0,1.58,0,0,1.27,0,0,4.21,2.24,1.16,1.93,0.17,0.64,3.52
C9orf3,0,0.0,0.0,0.0,2.29,0,0,1.9,0,0,0.0,0,0,4.27,1.6,1.21,1.33,1.4,1.09,1.47
CENPW,0,2.94,0.0,2.38,3.02,0,0,0.0,0,0,0.91,0,0,4.27,2.43,1.15,2.12,0.59,0.87,2.78
CNTN4,0,5.08,0.0,1.4,1.96,0,0,1.45,0,0,0.0,0,0,1.28,1.83,1.17,1.57,0.22,0.69,2.67
CPE,0,3.95,0.0,5.48,5.22,0,0,1.02,0,0,0.66,0,0,7.59,4.25,1.39,3.07,2.35,1.24,3.44
CRABP1,0,2.76,0.0,0.0,1.15,0,0,6.45,0,0,0.5,0,0,5.27,2.73,1.33,2.04,0.56,0.87,3.15
DCXR,0,3.17,0.19,0.0,3.04,0,0,0.5,0,0,1.05,0,0,3.84,2.13,1.15,1.85,0.83,0.95,2.23


In [41]:
# First 40 rows of the D1 scoring table, displayed alphabetically by gene.
dict_make_gene_scoring_robust['D1'].head(40).sort_index()

Unnamed: 0,Ahlers-2022-Healthy,Gao-2021-Healthy,Gaydosik-2019-Healthy,Kim-2020-Healthy,Liu-2021-Keloid,Mariottoni-2021-Healthy,Mirizio-2020-Scleroderma,Rindler-2021-Healthy,Solé-Boldo-2020-Healthy,Tabib-2018-Healthy,Tabib-2021-Healthy,Theocarditis-2020-DM – non ulcer/Healthy,Vorstandlechner-2020-Healthy,Xu-2021-Healthy,mean,dev,CV,expr,expr_pow,Z
ALDH1A3,2.8,0.64,3.87,2.6,0.0,0,0,0,1.92,1.68,4.43,2.67,0,0,2.51,1.11,2.27,0.07,0.51,4.96
ANGPTL7,12.47,6.46,7.62,6.46,3.4,0,0,0,5.25,4.46,10.08,5.91,0,0,7.27,1.39,5.23,0.21,0.68,10.72
APOD,27.29,8.75,19.74,20.33,11.22,0,0,0,13.43,7.22,22.62,21.73,0,0,17.96,1.85,9.71,30.51,2.35,7.64
ATP1A2,3.49,4.25,4.04,3.64,2.38,0,0,0,2.14,2.02,4.93,1.94,0,0,3.35,1.02,3.29,0.05,0.47,7.05
BAMBI,5.11,3.09,4.03,3.76,2.66,0,0,0,1.62,2.1,4.49,3.57,0,0,3.55,1.01,3.52,0.18,0.66,5.42
C2orf40,7.01,0.0,10.48,10.79,4.59,0,0,0,5.38,4.67,10.35,4.73,0,0,6.91,1.51,4.59,0.41,0.8,8.64
CHN1,5.94,3.8,3.41,4.96,2.35,0,0,0,3.1,2.28,4.24,4.89,0,0,4.01,1.04,3.85,0.59,0.88,4.58
CLDN1,2.52,4.47,3.86,4.86,4.15,0,0,0,3.8,0.95,4.22,4.94,0,0,3.85,1.03,3.74,0.09,0.54,7.07
COL28A1,3.43,3.18,2.66,1.67,1.38,0,0,0,1.11,2.14,3.92,2.49,0,0,2.55,0.97,2.63,0.04,0.45,5.71
COL8A1,3.35,3.81,5.26,2.35,1.46,0,0,0,1.81,2.45,5.45,2.63,0,0,3.38,1.12,3.03,0.11,0.57,5.94


In [42]:
# First 40 rows of the D2 scoring table, displayed alphabetically by gene.
dict_make_gene_scoring_robust['D2'].head(40).sort_index()

Unnamed: 0,Ahlers-2022-Healthy,Gao-2021-Healthy,Gaydosik-2019-Healthy,Kim-2020-Healthy,Liu-2021-Keloid,Mariottoni-2021-Healthy,Mirizio-2020-Scleroderma,Rindler-2021-Healthy,Solé-Boldo-2020-Healthy,Tabib-2018-Healthy,Tabib-2021-Healthy,Theocarditis-2020-DM – non ulcer/Healthy,Vorstandlechner-2020-Healthy,Xu-2021-Healthy,mean,dev,CV,expr,expr_pow,Z
ADAMTSL5,0,2.21,4.06,2.15,0,1.15,1.11,0.59,0,1.9,3.42,2.93,0,0,2.53,1.03,2.46,0.05,0.47,5.36
AQP3,0,2.19,8.26,3.59,0,0.91,1.0,1.4,0,2.59,6.18,6.26,0,0,4.47,1.38,3.24,0.31,0.74,6.01
BNC2,0,10.89,7.06,3.39,0,0.82,2.61,1.28,0,2.09,4.49,4.16,0,0,4.5,1.39,3.23,0.22,0.68,6.6
C19orf33,0,2.22,6.27,3.15,0,1.28,1.36,1.52,0,1.88,5.33,8.49,0,0,4.28,1.37,3.13,0.07,0.51,8.43
C2orf40,0,0.0,8.21,3.2,0,0.75,0.0,1.8,0,2.41,5.7,8.04,0,0,4.34,1.47,2.95,0.4,0.8,5.45
CAV2,0,2.87,7.06,3.9,0,0.0,1.31,2.02,0,3.18,5.16,5.2,0,0,4.11,1.27,3.24,0.92,0.98,4.19
CLDN1,0,2.37,7.17,3.53,0,2.08,2.55,2.69,0,1.92,5.24,5.18,0,0,4.2,1.23,3.42,0.08,0.54,7.85
COL9A3,0,1.26,3.9,1.56,0,2.12,1.29,0.0,0,2.21,3.32,2.22,0,0,2.28,1.03,2.21,0.04,0.45,5.07
CSRP1,0,2.1,9.95,5.4,0,2.82,2.87,2.4,0,1.85,7.25,8.64,0,0,5.77,1.45,3.97,1.17,1.04,5.55
DUSP5,0,0.81,4.8,2.53,0,0.36,0.53,0.87,0,1.53,3.52,4.43,0,0,2.71,1.18,2.29,0.11,0.58,4.66


In [43]:
# First 40 rows of the E1 scoring table, displayed alphabetically by gene.
dict_make_gene_scoring_robust['E1'].head(40).sort_index()

Unnamed: 0,Ahlers-2022-Healthy,Gao-2021-Healthy,Gaydosik-2019-Healthy,Kim-2020-Healthy,Liu-2021-Keloid,Mariottoni-2021-Healthy,Mirizio-2020-Scleroderma,Rindler-2021-Healthy,Solé-Boldo-2020-Healthy,Tabib-2018-Healthy,Tabib-2021-Healthy,Theocarditis-2020-DM – non ulcer/Healthy,Vorstandlechner-2020-Healthy,Xu-2021-Healthy,mean,dev,CV,expr,expr_pow,Z
A2M,2.88,3.24,1.92,4.5,4.52,1.66,0,2.96,2.1,2.03,2.01,3.02,0.62,0,2.82,1.04,2.72,0.54,0.86,3.29
ABCA10,0.45,2.64,1.02,0.0,4.2,0.0,0,1.61,0.24,0.0,0.0,1.89,0.69,0,1.25,1.11,1.12,0.83,0.95,1.31
ALX4,1.12,4.33,0.82,2.88,3.09,1.5,0,1.35,1.58,0.0,0.0,2.39,0.0,0,1.87,1.1,1.69,0.14,0.61,3.08
ANTXR2,2.55,4.94,0.91,3.34,3.4,1.78,0,0.42,1.85,0.97,0.08,1.65,0.96,0,2.18,1.12,1.94,1.48,1.1,1.98
APOD,3.65,9.63,0.71,4.79,8.85,2.09,0,2.31,7.03,0.0,2.35,9.7,5.65,0,5.37,1.49,3.61,26.94,2.28,2.36
CALD1,1.69,1.87,1.28,3.16,3.72,1.77,0,1.35,1.89,0.0,0.53,1.82,0.0,0,1.8,1.03,1.76,12.22,1.87,0.96
COL15A1,2.26,4.11,2.35,1.01,4.16,2.37,0,1.63,0.61,0.1,0.79,0.0,0.0,0,1.8,1.15,1.57,2.26,1.23,1.46
CPE,3.0,8.17,2.04,14.42,8.46,2.8,0,1.28,5.18,2.16,2.92,8.32,2.49,0,5.85,1.58,3.7,10.48,1.8,3.25
EGR2,3.54,4.25,1.36,2.74,2.3,2.15,0,1.2,1.14,1.32,1.58,2.24,1.98,0,2.3,0.99,2.33,0.52,0.85,2.71
FGFBP2,4.94,6.1,2.34,6.76,5.24,2.54,0,1.48,5.71,2.38,3.55,4.37,3.06,0,4.37,1.18,3.71,2.89,1.3,3.35


In [44]:
# Full T1 scoring table in its stored order (not re-sorted by gene name, unlike the cells above).
dict_make_gene_scoring_robust['T1']

Unnamed: 0,Ahlers-2022-Healthy,Gao-2021-Healthy,Gaydosik-2019-Healthy,Kim-2020-Healthy,Liu-2021-Keloid,Mariottoni-2021-Healthy,Mirizio-2020-Scleroderma,Rindler-2021-Healthy,Solé-Boldo-2020-Healthy,Tabib-2018-Healthy,Tabib-2021-Healthy,Theocarditis-2020-DM – non ulcer/Healthy,Vorstandlechner-2020-Healthy,Xu-2021-Healthy,mean,dev,CV,expr,expr_pow,Z
ABCA10,2.18,4.73,3.73,3.80,7.30,5.25,2.29,6.50,3.87,0.88,5.41,0.00,0.00,0,4.01,1.28,3.13,0.79,0.94,4.25
GPC3,5.39,5.74,9.67,6.68,0.00,11.36,3.37,5.86,5.06,0.00,3.09,1.16,1.88,0,5.30,1.52,3.48,3.47,1.37,3.88
TRIL,0.00,1.04,3.80,2.54,2.19,3.57,1.97,0.52,3.52,0.00,4.89,0.00,0.00,0,2.14,1.18,1.81,0.11,0.57,3.72
DPT,7.39,4.64,7.64,7.19,2.33,6.72,1.00,3.33,5.90,3.81,14.79,4.93,0.00,0,5.94,1.53,3.88,8.17,1.69,3.51
TMEM176B,6.25,4.34,6.69,6.10,2.59,6.24,0.00,5.98,2.88,4.27,10.33,1.20,1.08,0,4.99,1.38,3.61,4.17,1.43,3.49
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PRDX1,3.62,0.00,0.00,0.00,0.00,0.61,0.00,0.00,0.00,0.00,0.00,0.07,1.76,0,0.49,1.02,0.48,10.07,1.78,0.27
UBB,0.93,0.00,0.00,0.00,0.00,3.15,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0,0.53,1.03,0.51,16.50,2.02,0.26
ACTG1,0.00,0.22,0.00,0.00,0.00,3.52,0.00,0.00,0.86,0.00,0.00,0.02,0.00,0,0.58,1.07,0.55,27.26,2.28,0.26
GNB2L1,0.00,0.00,5.25,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.78,0,0.42,1.11,0.38,8.16,1.69,0.25


## Analyse the presence and expression pattern of marker genes (within fibroblast populations)

In this section we will plot UMAPs and dot plots of the expression patterns of marker genes. This is useful because many genes appear as markers of only a few clusters in the DE analysis but are, in reality, also expressed in other clusters, which may need to be taken into account.

In [45]:
def make_dicts_fraction_mean(genes, list_all_datasets, list_accepted_clusters, clusterby='cluster_robust',
                             dataset_names=None):
    """Summarise, per gene, how widely and how strongly it is expressed in each cluster.

    For every dataset and every accepted cluster, two values are computed per gene:
    the fraction of cells with expression > 0, and the mean expression over the
    non-zero entries only. Two extra rows, 'Mean' and 'Std', are appended to each
    table with the column-wise mean and standard deviation across datasets.

    Parameters
    ----------
    genes : iterable of str
        Genes to summarise. Genes absent from a dataset keep NaN for that dataset.
    list_all_datasets : list
        AnnData-like objects exposing `.var_names`, `.obs[clusterby]` and `.X`, and
        supporting `adata[row_mask][:, gene_list]` indexing. `.X` may be sparse or dense.
    list_accepted_clusters : list
        Cluster labels used as table columns. Clusters outside this list are ignored.
    clusterby : str
        Column of `.obs` holding the cluster label of each cell.
    dataset_names : list of str, optional
        Row label for each dataset, paired positionally with `list_all_datasets`.
        Defaults to the notebook-level `list_names` variable.

    Returns
    -------
    (dict_fraction_cells, dict_mean_exp) : tuple of dict
        Both map gene -> DataFrame indexed by dataset name (plus 'Mean' and 'Std')
        with one column per accepted cluster.
    """
    import warnings  # local import: only needed to silence the all-NaN warning below

    if dataset_names is None:
        dataset_names = list_names  # notebook-level variable restored with `%store -r`
    dataset_names = list(dataset_names)
    accepted = set(list_accepted_clusters)

    def _empty_table():
        return pd.DataFrame(np.nan, index=dataset_names, columns=list_accepted_clusters)

    dict_fraction_cells = {gene: _empty_table() for gene in genes}
    dict_mean_exp = {gene: _empty_table() for gene in genes}

    for adata, name in zip(list_all_datasets, dataset_names):
        genes_sub = [gene for gene in genes if gene in adata.var_names]
        if not genes_sub:
            continue

        cluster_labels = adata.obs[clusterby]
        for cluster in set(cluster_labels):
            # Non-accepted clusters would only create columns that get discarded afterwards.
            if cluster not in accepted:
                continue

            X = adata[cluster_labels == cluster][:, genes_sub].X
            counts = X.toarray() if hasattr(X, 'toarray') else np.asarray(X)
            # Fresh float copy: NaN cannot be stored in an integer matrix, and the
            # masking below must not touch the dataset itself.
            counts = counts.astype(float).reshape(-1, len(genes_sub))

            n_cells = counts.shape[0]
            if n_cells == 0:
                continue

            counts_frac = (counts > 0).sum(0) / n_cells
            counts[counts == 0] = np.nan  # zeros must not dilute the mean expression
            with warnings.catch_warnings():
                # A gene with no expressing cell yields an all-NaN column; NaN is the
                # intended result, so the "Mean of empty slice" warning is just noise.
                warnings.simplefilter('ignore', category=RuntimeWarning)
                counts_mean_exp = np.nanmean(counts, 0)

            for idx, gene in enumerate(genes_sub):
                dict_fraction_cells[gene].loc[name, cluster] = counts_frac[idx]
                dict_mean_exp[gene].loc[name, cluster] = counts_mean_exp[idx]

    # Iterate over the dict keys rather than `genes`, so a gene listed twice does not
    # get its summary rows computed a second time on top of the first ones.
    for gene in dict_fraction_cells:
        for table in (dict_fraction_cells[gene], dict_mean_exp[gene]):
            # Compute both statistics before appending, so 'Mean' does not leak into 'Std'.
            col_mean, col_std = table.mean(), table.std()
            table.loc['Mean'] = col_mean
            table.loc['Std'] = col_std

    return dict_fraction_cells, dict_mean_exp

In [46]:
def plot_dotplot_gene(gene, dict_fraction_cells, dict_mean_exp, rotate=False):
    """Draw a one-row dot plot of ``gene`` across clusters.

    One dot is drawn per column (cluster) of ``dict_fraction_cells[gene]``:

    * size   - the ``'Mean'`` row of the fraction table raised to 0.66, times 550;
    * colour - the ``'Mean'`` row of ``dict_mean_exp[gene]`` rescaled to [0, 1] across
      clusters and mapped through the ``OrRd`` colormap;
    * alpha  - ``max(0, 1 - Std ** 0.75)`` using the ``'Std'`` row of the fraction table.

    Parameters
    ----------
    gene : str
        Key of both dictionaries; also used as the y tick label.
    dict_fraction_cells, dict_mean_exp : dict
        gene -> DataFrame containing at least the rows ``'Mean'`` and ``'Std'``
        (as returned by ``make_dicts_fraction_mean``).
    rotate : bool
        If True, the cluster labels on the x axis are rotated 40 degrees.

    Returns
    -------
    None. A new matplotlib figure is created and left open for the caller to show.
    """
    frac_table = dict_fraction_cells[gene]
    dfplot_frac = frac_table ** 0.66
    mean_exp = dict_mean_exp[gene].loc['Mean']

    # pandas min/max skip NaN. The builtin min()/max() are order-dependent when NaN is
    # present and could turn every normalised value (hence every colour) into NaN.
    exp_min, exp_max = mean_exp.min(), mean_exp.max()
    exp_span = exp_max - exp_min
    if exp_span > 0:
        exp_norm_vals = (mean_exp - exp_min) / exp_span
    else:
        # Flat (or entirely missing) expression: avoid 0/0 and use the mid colour for
        # every cluster that has a value; NaN entries stay NaN.
        exp_norm_vals = mean_exp * 0 + 0.5

    n_clusters = len(dfplot_frac.columns)
    x_positions = range(n_clusters)

    fig, ax = plt.subplots(1, 1, figsize=(10, 1))
    ax.set_xticks(x_positions)

    if rotate:
        ax.set_xticklabels(dfplot_frac.columns, rotation=40, ha='right')
    else:
        ax.set_xticklabels(dfplot_frac.columns)

    ax.set_yticks([0])
    ax.set_yticklabels([gene])
    ax.set_ylim([-0.1, 0.1])
    ax.scatter(x_positions, [0] * n_clusters, s=dfplot_frac.loc['Mean'] * 550, c=[cm.OrRd(i) for i in exp_norm_vals],
               linewidths=0.5, edgecolor='#878787', alpha=[max(0, i) for i in 1 - frac_table.loc['Std'] ** 0.75])

    # Three faint horizontal guide lines behind the dots.
    for y_guide in (0, 0.025, -0.025):
        ax.plot([-0.3, n_clusters - 0.3], [y_guide, y_guide], c="#676767", linewidth=0.7, alpha=0.3)

In [47]:
# Candidate marker genes, de-duplicated through a set literal and sorted alphabetically.
genes = sorted({
    'A2M', 'AADAC', 'ABCA10', 'ABCA8', 'ABCA9', 'ABCC9', 'ACAN', 'ACE', 'ACHE', 'ACKR3', 'ACKR4', 'ADAM12', 'ADAMTS18', 'ADAMTS4',
    'ADAMTSL5', 'ADGRE2', 'ADRA2A', 'AEBP1', 'AHRR', 'AKAP6', 'ALDH1A3', 'ALX4', 'ANGPTL5', 'ANGPTL7', 'ANKRD29', 'ANOS1', 'ANTXR2',
    'AOPEP', 'APCDD1', 'APOC1', 'APOD', 'APOE', 'AQP1', 'AQP3', 'ARHGAP15', 'ASPN', 'ATP1A2', 'AXIN2', 'BAMBI', 'BGN', 'BIRC3', 'BMP7',
    'BNC2', 'BTBD11', 'C11orf96', 'C19orf33', 'C1orf198', 'C1QTNF3', 'C2orf40/ECRG4', 'C3', 'C6', 'C7', 'C9orf3', 'CA12', 'CADM2', 'CAV2',
    'CCDC146', 'CCK', 'CCL19', 'CCL2', 'CD200R1', 'CD248', 'CD34', 'CD55', 'CD70', 'CD74', 'CD9', 'CDA', 'CDH11', 'CDKN1A', 'CENPW',
    'CERCAM', 'CES1', 'CFD', 'CFH', 'CFHR1', 'CH25H', 'CHADL', 'CHD1', 'CHN1', 'CHRDL1', 'CLDN1', 'CLEC14A', 'CLEC2A', 'CLEC3B', 'CLSTN3',
    'CNTN4', 'COCH', 'COL11A1', 'COL13A1', 'COL14A1', 'COL15A1', 'COL18A1', 'COL1A2', 'COL21A1', 'COL23A1', 'COL24A1', 'COL28A1', 'COL3A1',
    'COL5A2', 'COL6A1', 'COL6A2', 'COL6A3', 'COL6A5', 'COL8A1', 'COL9A3', 'COL9A3', 'COMP', 'CORIN', 'COX4I2', 'CPE', 'CPNE5', 'CPVL',
    'CRABP1', 'CRIP1', 'CRISPLD1', 'CSRP1', 'CTHRC1', 'CTSH', 'CTSK', 'CX3CL1', 'CXCL1', 'CXCL12', 'CXCL2', 'CXCL3', 'CYBRD1', 'CYGB',
    'CYP1B1', 'CYP26B1', 'CYP7B1', 'CYYR1', 'DBN1', 'DCN', 'DCXR', 'DIO2', 'DNAJA1', 'DOK6', 'DPEP1', 'DPP4', 'DPT', 'DUSP5', 'EBF2',
    'ECM1', 'EDIL3', 'EDNRA', 'EFEMP1', 'EGFLAM', 'EGR1', 'EGR2', 'ELL2', 'ELN', 'EMB', 'EMID1', 'EMP3', 'ENTPD2', 'ERRFI1', 'ETV1',
    'ETV4', 'EVA1A', 'F13A1', 'F2R', 'F2RL2', 'FABP3', 'FAM180B', 'FBLN1', 'FBLN2', 'FBN1', 'FBXO32', 'FGF7', 'FGFBP2', 'FIBIN', 'FMO1',
    'FMO2', 'FMO3', 'FOS', 'FOSL1', 'FOXC2', 'FOXS1', 'FXYD6', 'FZD1', 'GALNT15', 'GCH1', 'GDF10', 'GEM', 'GFRA2', 'GGT5', 'GLRB', 'GNAO1',
    'GPC3', 'GPM6B', 'GPNMB', 'GSN', 'HAPLN1', 'HAS2', 'HHIP', 'HLA-B', 'HLA-F', 'HMGA1', 'HMGCLL1', 'HOPX', 'HPSE2', 'HRH1', 'HSD3B7',
    'HSPB3', 'HTRA1', 'ICAM1', 'ICAM2', 'IER3', 'IGF1', 'IGFBP2', 'IGFBP3', 'IGFBP5', 'IGFBP6', 'IGFBP7', 'IL32', 'IL33', 'IL34', 'IL6',
    'INHBA', 'INMT', 'IRF1', 'IRF8', 'ISYNA1', 'ITGA6', 'ITGB4', 'ITIH5', 'ITM2A', 'JAK3', 'JUNB', 'KCNQ3', 'KDM6B', 'KIAA1217', 'KIF26B',
    'KLF5', 'KLK1', 'KLK4', 'KPNA2', 'KRT17', 'KRT19', 'LEF1', 'LEPR', 'LINC00327', 'LINC01133', 'LMO4', 'LMO7', 'LOX', 'LOXL2', 'LPAR4',
    'LRRC15', 'LRRC17', 'LRRN3', 'LSP1', 'LTBP2', 'MAB21L2', 'MAFF', 'MAGI1', 'MAP2', 'MAP7', 'MARCKSL1', 'MATN4', 'MCTP1', 'MDK', 'MEF2C',
    'MFAP5', 'MGP', 'MGST1', 'MKX', 'MME', 'MMP11', 'MMP16', 'MMP2', 'MRAS', 'MRPS6', 'MSC', 'MXRA5', 'MYO10', 'MYOC', 'NBL1', 'NDNF',
    'NECAB1', 'NFIB', 'NFIL3', 'NFKB1', 'NFKBIA', 'NGFR', 'NKD2', 'NLGN4X', 'NPTX2', 'NR2F2', 'NR4A3', 'NRP2', 'NTM', 'OGN', 'OLFML2A',
    'OSBP2', 'P2RY14', 'P3H2', 'P4HA3', 'PAFAH1B3', 'PALMD', 'PAMR1', 'PCDH19', 'PCOLCE2', 'PCSK1N', 'PCSK9', 'PDGFD', 'PDGFRL', 'PEAR1',
    'PGM2L1', 'PHACTR3', 'PI16', 'PIEZO2', 'PIM3', 'PKP4', 'PLA2G2A', 'PLA2G5', 'PLAUR', 'PLEKHA4', 'PLK2', 'PLPP5', 'PLXDC1', 'PMEPA1',
    'PNRC1', 'PODNL1', 'POSTN', 'PPARG', 'PPDPFL', 'PPIC', 'PPP1R14A', 'PPP1R15A', 'PPP1R15B', 'PRDM8', 'PRG4', 'PRKG2', 'PRSS23', 'PSAT1',
    'PTCH1', 'PTGDS', 'PTGIS', 'PTGS1/COX1', 'PTK7', 'PTN', 'PTPRD', 'PXDNL', 'QPCT', 'RAMP1', 'RANBP3L', 'RARRES2', 'RBFOX1', 'RBP4',
    'RBP5', 'REL', 'RGCC', 'RGS16', 'RHPN1', 'ROBO2', 'RSPO1', 'RSPO3', 'RSPO4', 'RUNX2', 'S100B', 'SBSPON', 'SCARA5', 'SCN4B', 'SCN7A',
    'SCRG1', 'SDC1', 'SDPR/CAVIN2', 'SEMA3B', 'SEMA3C', 'SEMA3E', 'SERPINA5', 'SERPINF1', 'SFRP1', 'SFRP2', 'SFRP4', 'SGCA', 'SGIP1',
    'SH3BGRL3', 'SHISA3', 'SHISAL1', 'SIX1', 'SLC22A16', 'SLC22A3', 'SLC26A7', 'SLC29A1', 'SLC2A1', 'SLC2A3', 'SLC5A3', 'SLC9A3R2',
    'SLCO2B1', 'SLIT2', 'SLITRK6', 'SLPI', 'SNAI2', 'SOCS3', 'SOD2', 'SOSTDC1', 'SOX8', 'SPARC', 'SPARCL1', 'SPON1', 'SPON2', 'SPRY1',
    'SPRY2', 'SPSB1', 'STC1', 'STC2', 'STMN1', 'STMN2', 'SULT1A1', 'SVEP1', 'TAC1', 'TAGLN', 'TBX3', 'TCEAL2', 'TCF7L2', 'TENM2', 'TENM3',
    'TFAP2A', 'TGFBI', 'THBS2', 'THBS4', 'THSD4', 'TIAM1', 'TIMP2', 'TIMP3', 'TM4SF1', 'TMEM150C', 'TMEM176A', 'TMEM52', 'TMTC2', 'TNC',
    'TNFAIP3', 'TNFAIP6', 'TNFRSF19', 'TNFSF13B', 'TNFSF14', 'TNMD', 'TNN', 'TNNC1', 'TNNT3', 'TNXB', 'TPD52', 'TPPP3', 'TRAC', 'TRIL',
    'TRPM3', 'TRPS1', 'TSPAN13', 'TSPAN8', 'TTR', 'TUBB4A', 'TWIST2', 'TXNIP', 'TYMP', 'UACA', 'UAP1', 'UBD', 'UGT3A2', 'VCAM1', 'VIPR2',
    'WFDC1', 'WIF1', 'WISP2 / CCN5', 'WNT10A', 'WNT2', 'WNT5A', 'WTAP', 'XG', 'YWHAH', 'ZC2HC1C', 'ZC3H12A', 'ZFP36',
})

To create the dot plot we will use two variables: the fraction of cells expressing the marker in each cluster, and the mean expression value of the expressing cells in each cluster. To do that, we will create a df for each case and gene, compile the info for each adata and get the mean/std across the adatas. If a gene is absent from an adata (it is not in its `var_names`), or the adata does not have a certain cluster, the value registered is a NaN, not zero. 

In [48]:
# Per-gene summary tables, grouping the cells of each dataset by `cluster_robust`.
dict_fraction_cells, dict_mean_exp = make_dicts_fraction_mean(
    genes,
    list_all_datasets,
    list_accepted_clusters,
    clusterby='cluster_robust',
)

To draw the dot plot we make some adjustments. We raise the fraction of expressing cells to $x^{2/3}$ to make it slightly more visual. We also set the dot transparency (alpha) to $1 - \mathrm{std}^{0.75}$, where std is the across-dataset standard deviation of that fraction, so that markers that are expressed disparately across datasets are visually less prominent.


In [None]:
# For every marker: print its name, draw the dot-plot row, then the per-dataset UMAPs.
for gene in genes:
    print(gene)
    plot_dotplot_gene(
        gene,
        dict_fraction_cells=dict_fraction_cells,
        dict_mean_exp=dict_mean_exp,
    )
    plot_UMAPS_gene(gene, list_names=list_names, list_datasets=list_all_datasets)
    plt.show()

In [53]:
# Second, short batch of markers to inspect with the same routine.
genes = [
    'DKK3',
    'SPINT2',
    'FGFR4',
    'GPM6B',
]

To create the dot plot we will use two variables: the fraction of cells expressing the marker in each cluster, and the mean expression value of the expressing cells in each cluster. To do that, we will create a df for each case and gene, compile the info for each adata and get the mean/std across the adatas. If a gene is absent from an adata (it is not in its `var_names`), or the adata does not have a certain cluster, the value registered is a NaN, not zero. 

In [54]:
# Rebuild the per-gene summary tables for the current `genes`, grouping by `cluster_robust`.
dict_fraction_cells, dict_mean_exp = make_dicts_fraction_mean(
    genes,
    list_all_datasets,
    list_accepted_clusters,
    clusterby='cluster_robust',
)

To draw the dot plot we make some adjustments. We raise the fraction of expressing cells to $x^{2/3}$ to make it slightly more visual. We also set the dot transparency (alpha) to $1 - \mathrm{std}^{0.75}$, where std is the across-dataset standard deviation of that fraction, so that markers that are expressed disparately across datasets are visually less prominent.


In [None]:
# For every marker: print its name, draw the dot-plot row, then the per-dataset UMAPs.
for gene in genes:
    print(gene)
    plot_dotplot_gene(
        gene,
        dict_fraction_cells=dict_fraction_cells,
        dict_mean_exp=dict_mean_exp,
    )
    plot_UMAPS_gene(gene, list_names=list_names, list_datasets=list_all_datasets)
    plt.show()

## Analyse the presence and expression pattern of marker genes (within all populations)

In this section we will plot UMAPs and dot plots of expression patterns of gene markers. This is useful because many gene markers tend to be markers of few clusters based on DE analysis but, in reality, they are also expressed in other clusters which might need to be considered.

In [None]:
# Labels used as columns of the full-dataset summary tables, in plotting order.
list_accepted_clusters_full = [
    'fibro', 'fibro - ANGPTL7', 'F',
    'melanocyte', 'neuro',
    'endo capillary', 'endo venule', 'endo artery', 'lymph',
    'peri - CYCS', 'peri - ZFP36', 'peri - RERGL', 'peri - RGS5',
    'krt channel', 'krt spinous', 'krt basal', 'krt gran',
    'muscle',
    'T CD4+', 'B cells', 'NK cell', 'APC', 'immune', 'mast cell',
]

In [None]:
# Complete (not fibroblast-only) AnnData objects, one per study.
list_all_datasets_full = [
    ahlers_2022_young,
    boothby_2021_ctrl,
    deng_2021_scar,
    gao_2021_ctrl,
    gaydosik_2020_ctrl,
    he_2020_ctrl,
    hughes_2020_ctrl,
    kim_2020_ctrl,
    liu_2021_ctrl,
    mariottoni_2021_ctrl,
    mirizio_2020_scl,
    rindler_2021_ctrl,
    sole_2020_young,
    tabib_2021_ctrl,
    theo_2020_ctrl_dm,
    theo_2021_ctrl,
    vors_2020_ctrl,
    xu_2021_ctrl,
]

# One "<Author> <Year>" label per dataset, read from the first cell's metadata.
list_names_full = [
    adata.obs['Author'].values[0] + ' ' + str(int(adata.obs['Year'].values[0]))
    for adata in list_all_datasets_full
]

In [None]:
# UMAPs of every full dataset for 'assigned_cats' (presumably an obs column holding
# the assigned cell-type labels; verify against plot_UMAPS_gene).
plot_UMAPS_gene(
    'assigned_cats',
    list_names=list_names_full,
    list_datasets=list_all_datasets_full,
)

In [None]:
# The marker subset has to be defined BEFORE the summary tables are built. With the
# previous cell order the tables were computed for whatever `genes` held at that point
# (the 4-gene list of the earlier section), and the plotting loop below then failed
# with a KeyError on the first gene of this list when the notebook was run top to bottom.
# NOTE(review): 'SDPR/CAVIN2' is still a composite label (unlike 'PTGS1' and 'WISP2',
# which were reduced to a single symbol here) - confirm which symbol the datasets use.
genes = [
    'PLA2G2A', 'PLA2G5', 'PLAUR', 'PLEKHA4', 'PLK2', 'PLPP5', 'PLXDC1', 'PMEPA1',
    'PNRC1', 'PODNL1', 'POSTN', 'PPARG', 'PPDPFL', 'PPIC', 'PPP1R14A', 'PPP1R15A', 'PPP1R15B', 'PRDM8', 'PRG4', 'PRKG2', 'PRSS23', 'PSAT1',
    'PTCH1', 'PTGDS', 'PTGIS', 'PTGS1', 'PTK7', 'PTN', 'PTPRD', 'PXDNL', 'QPCT', 'RAMP1', 'RANBP3L', 'RARRES2', 'RBFOX1', 'RBP4',
    'RBP5', 'REL', 'RGCC', 'RGS16', 'RHPN1', 'ROBO2', 'RSPO1', 'RSPO3', 'RSPO4', 'RUNX2', 'S100B', 'SBSPON', 'SCARA5', 'SCN4B', 'SCN7A',
    'SCRG1', 'SDC1', 'SDPR/CAVIN2', 'SEMA3B', 'SEMA3C', 'SEMA3E', 'SERPINA5', 'SERPINF1', 'SFRP1', 'SFRP2', 'SFRP4', 'SGCA', 'SGIP1',
    'SH3BGRL3', 'SHISA3', 'SHISAL1', 'SIX1', 'SLC22A16', 'SLC22A3', 'SLC26A7', 'SLC29A1', 'SLC2A1', 'SLC2A3', 'SLC5A3', 'SLC9A3R2',
    'SLCO2B1', 'SLIT2', 'SLITRK6', 'SLPI', 'SNAI2', 'SOCS3', 'SOD2', 'SOSTDC1', 'SOX8', 'SPARC', 'SPARCL1', 'SPON1', 'SPON2', 'SPRY1',
    'SPRY2', 'SPSB1', 'STC1', 'STC2', 'STMN1', 'STMN2', 'SULT1A1', 'SVEP1', 'TAC1', 'TAGLN', 'TBX3', 'TCEAL2', 'TCF7L2', 'TENM2', 'TENM3',
    'TFAP2A', 'TGFBI', 'THBS2', 'THBS4', 'THSD4', 'TIAM1', 'TIMP2', 'TIMP3', 'TM4SF1', 'TMEM150C', 'TMEM176A', 'TMEM52', 'TMTC2', 'TNC',
    'TNFAIP3', 'TNFAIP6', 'TNFRSF19', 'TNFSF13B', 'TNFSF14', 'TNMD', 'TNN', 'TNNC1', 'TNNT3', 'TNXB', 'TPD52', 'TPPP3', 'TRAC', 'TRIL',
    'TRPM3', 'TRPS1', 'TSPAN13', 'TSPAN8', 'TTR', 'TUBB4A', 'TWIST2', 'TXNIP', 'TYMP', 'UACA', 'UAP1', 'UBD', 'UGT3A2', 'VCAM1', 'VIPR2',
    'WFDC1', 'WIF1', 'WISP2', 'WNT10A', 'WNT2', 'WNT5A', 'WTAP', 'XG', 'YWHAH', 'ZC2HC1C', 'ZC3H12A', 'ZFP36',
]

# Summary tables over the full datasets, grouping cells by `assigned_cats`.
dict_fraction_cells_full, dict_mean_exp_full = make_dicts_fraction_mean(genes, list_all_datasets_full, list_accepted_clusters_full, clusterby='assigned_cats')

In [None]:
# Dot-plot row plus per-dataset UMAPs for every marker, over the full datasets.
for gene in genes:
    print(gene)
    plot_dotplot_gene(gene, dict_fraction_cells_full, dict_mean_exp_full, rotate=True)
    # The labels must belong to the datasets being plotted: use `list_names_full`
    # (built from `list_all_datasets_full`), not the labels of the other dataset list.
    plot_UMAPS_gene(gene, list_datasets=list_all_datasets_full, list_names=list_names_full)
    plt.show()