# Comparison of fibroblast populations

In this notebook we are going to extract and replicate the main populations from different papers where fibroblast populations are described, and find similarities and differences. The premise of this analysis is that many of the populations described in different papers seem not to match, or to be transcriptomically different, but in reality they are quite similar; that is, the main types of populations are indeed shared by the different papers, which should come as no surprise.

**After the publication in JID we will include the following papers, as confirmatory results**
* Tabib 2018
* Solé-Boldo 2020
* Vorstandlechner 2020
* He 2020
* Kim 2020
* Gaydosik 2020
* McCarthy 2020
* Mirizio 2020
* Gao 2021
* Reynolds 2021

Additionally, we will reanalyze the *classic 4* papers, to check that cell populations are assigned as expected. For these papers, UMAPs might vary compared to the ones in our paper, but the main results should still be the same.

## imports

In [None]:
import scanpy as sc
import scanpy.external as sce
import pandas as pd
import numpy as np
import os
import triku as tk
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from tqdm.notebook import tqdm
import scipy.sparse as spr
# import ray
# import subprocess
# import time
# import scvelo as scv
# import gc
# import gseapy as gp

In [None]:
!pip install cellassign

In [None]:
from cellassign import assign_cats

In [None]:
# To print versions of imports 

import types

def imports():
    for name, val in globals().items():
        if isinstance(val, types.ModuleType):
            yield val.__name__

excludes = ['builtins', 'types', 'sys']

imported_modules = [module for module in imports() if module not in excludes]

clean_modules = []

for module in imported_modules:

    sep = '.'  # to handle 'matplotlib.pyplot' cases
    rest = module.split(sep, 1)[0]
    clean_modules.append(rest)

changed_imported_modules = list(set(clean_modules))  # drop duplicates

pip_modules = !pip freeze  # you could also use `!conda list` with anaconda

for module in pip_modules:
    try:
        name, version = module.split('==')
        if name in changed_imported_modules:
            print(name + '\t' + version)
    except:
        pass

In [None]:
# Random seed passed as `random_state` to the PCA, UMAP and Leiden calls of this notebook
seed = 0

In [None]:
# Palettes for UMAP gene expression

# Sample 80 evenly spaced colours from matplotlib's 'magma' colormap
magma = [plt.get_cmap('magma')(i) for i in np.linspace(0,1, 80)]
# Replace the lowest colour by a light grey (RGBA)
magma[0] = (0.88, 0.88, 0.88, 1)
# Build the final colormap from the first 65 sampled colours only; the last 15 are dropped
magma = mpl.colors.LinearSegmentedColormap.from_list("", magma[:65])

In [None]:
# Marker genes of each broad cell type. Passed as `dict_cats` to `assign_cats` on the
# whole datasets; the resulting 'fibro' / 'fibro_2' labels are then used to select the fibroblasts.
dict_cats_fb = {'peri': ['Rgs5', 'Myl9', 'Ndufa4l2', 'Nrip2', 'Mylk', 'Rgs4', 'Acta2', 'Sncg', 'Tagln', 'Des', 'Ptp4a3', 'Myh11'], 
                'endo': ['Pecam1', 'Cdh5', 'Egfl7', 'Cd36', 'Srgn', 'Adgrf5', 'Ptprb', 'Scarb1', 'Plvap', 'Grrp1', 'C1qtnf9', 'Mmrn2', 'Flt1'], 
                'kerato': ['Krt14', 'Krt15', 'Perp', 'S100a14', 'Ccl27a', 'Gata3', 'Dapl1', 'Rab25', 'Ckmt1', 'Col17a1', 'Serpinb5'],
                'kerato Gjb2': ['Ucp2', 'Krt71', 'Gjb2', 'Ahcy', 'Acaa2', 'Cbs', 'Slc3a2', 'Serpina11', 'Lap3', 'Gss', 'Basp1', ],
                'fibro': ['Dcn', 'Pdgfra',  'Lum', 'Col1a1', 'Col1a2',],
                'fibro_2': ['Ncam1', 'Ptch1', 'Trps1', 'Col11a1', 'Wif1'],
                'T cell': ['Rac2', 'Ptprcap', 'Il2rg', 'Cd3g', 'Skap1', 'Hcst', 'Ctsw', 'Ets1', 'Cd3d', 'Ctla2a', 'Cd2'],
                'APC': ['Tyrobp', 'Cd74', 'H2-Aa', 'H2-Eb1', 'Ctss', 'Spi1', 'Napsa', 'Cd68', 'Lyz2', 'Csf2ra'],
                'lymph': ['Ccl21a', 'Egfl7', 'Mmrn1', 'Nsg1', 'Meox1', 'Gimap6', 'Kdr'],
                'melano / schwann': ['Syngr1', 'Pmel', 'Mlana'],
                'myo': ['Tnnt1', 'Tnnt2', 'Tnnt3', 'Tnnc2', 'Acta1', 'Myl1', 'Tnni2', 'Tcap', 'Eno3', 'Myoz1'],
                'neural': ['Itgb8', 'Plp1', 'Ptn', 'Egfl8', 'Chl1', 'Cadm4', 'Sox10', 'Cdh19', 'Snca']
               }

# Empty placeholders; NOTE(review): not used in the part of the notebook visible here
provisional_manual_dict_cats_axes = {}
provisional_manual_dict_cats_clusters = {}

In [None]:
# Marker genes of each fibroblast cluster (a1 ... d4). Passed as `dict_cats` to `assign_cats`
# with key_added='cluster' on the fibroblast subsets.
# NOTE(review): several lists contain the same gene more than once ('Cxcl14' in every c* list,
# 'Mgp' in c3, 'Igfbp3' in c6, 'Postn' in d1, 'Myoc' in d2 and d4, 'Dpp4' in d3 and d4).
# Whether a repeated gene is weighted twice depends on `assign_cats` - TODO confirm, and
# deduplicate if the repetition is unintended.
dict_cats_clusters = {'a1': ['Igfbp2', 'F13a1', 'Aldh3a1', 'Rspo1', 'Mmp3', 'Adh7', 'Apcdd1', 'Cd59', 'Ndufa4l2', 'Olfml3', 'Crispld2', 
                             'Sncg', 'Ltbp4', 'Fxyd5', 'Rarres1', 'Stc1', 'Cyp26b1', 'Spry1', 'Hmcn1', 'Grem1', 'Cilp', 'Ccnd1', 'Gch1', 
                             'Rasgrf1', 'C1qtnf3', ],
                      'a2': [ 'Myo7b', 'Nkain4', 'Tgif1', 'Arl4a', 'Has1', 'Sbno2', 'Ifrd1', 'Bcl3', 'Atp13a2', 'Mxd1', 'Clic3', 'Cav2', 
                             'Fosl1', 'Ret', 'Tff2', 'Tmem52', 'Igfbp2', 'F13a1', 'Aldh3a1', 'Rspo1', 'Mmp3', 'Adh7', 'Apcdd1', 'Cd59', 
                             'Ndufa4l2', 'Olfml3', 'Crispld2', ],
                     'a3': [ 'Ecm1', 'Lamb1', 'Hmcn1', 'Gfra2', 'Gpm6a', 'Etv1', 'Aqp1', 'Bin3', 'Sncg', 'Igfbp2', 'F13a1', 'Aldh3a1', 
                            'Rspo1', 'Mmp3', 'Adh7', 'Apcdd1', 'Cd59', 'Ndufa4l2', 'Olfml3', 'Crispld2', ],
                     'a4': ['Pltp', 'Cck', 'Sfrp1', 'Dpep1', 'Mmp14', 'Postn', 'Lpl', 'Angptl1', 'Enpp3', 'Pon3', 'Cxcl13', 'Dpp4', 'Angptl4', 
                            'Ccl11', 'Cthrc1', 'Hmgcs2', 'Cyp1b1', 'Vnn1', 'Lgi1', 'Cpxm1', 'Il13ra1', 'Ogn', 'Tes', 'Lmo2', 'Pola2', 
                            'Nectin2', 'Igfbp2', 'F13a1', 'Aldh3a1', 'Rspo1', 'Mmp3', 'Adh7', 'Apcdd1', 'Cd59', 'Ndufa4l2', 'Olfml3', 
                            'Crispld2', ],
                     'b1': ['Bgn', 'Tpm2', 'Igfbp4', 'Tsc22d1', 'Cdh11', 'Cdh13', 'Egflam', 'Tnmd', 'Wfdc1', 'Ednrb', 'Col8a2', 'Alpl', 
                            'Ptger3', 'Pmepa1', 'Col11a1', 'Acta2', 'Myl9', 'Tns3', 'Sgcg', 'Rgcc', 'Tagln', 'Ramp1', 'Cdc42ep3', 'Srpx2', 
                            'Actg2', 'Tnc', 'Crym', 'Hdac11', ],
                     'b2': ['Bgn', 'Tpm2', 'Igfbp4', 'Tsc22d1', 'Cdh11', 'Cdh13', 'Egflam', 'Tnmd', 'Wfdc1', 'Ednrb', 'Col8a2', 'Alpl', 
                            'Ptger3', 'Pmepa1', 'Col11a1', 'Acta2', 'Syt13', 'Enpp2', 'Ptch1', 'Adamts18', 'Heyl', 'Tek', ],
                     'b3': [ 'Bgn', 'Tpm2', 'Igfbp4', 'Tsc22d1', 'Cdh11', 'Cdh13', 'Egflam', 'Fxyd5', 'Gng2', 'Nradd', 'Ptch1', 
                            'Rasl11b', 'Ptn', 'Cox4i2', 'Crabp2', 'Prr7', 'Inhba', 'Nkd2', 'Bcl2', 'Wif1', 'Prss12', 'Slc26a7', 'Alx3', ],
                     'c1': ['Igfbp7', 'Cxcl14', 'Mgp', 'Igfbp3', 'Cxcl14', 'Gstt3', 'Slc2a1', 'Itgb4', 'Bcam', 'F11r', 'Itga6', 'Lad1', 
                            'Serpinb1a', 'Anxa3', 'Tm4sf1', 'Sostdc1', 'Sfrp5', 'Thbs4', 'Tenm2', 'Wnt6', 'Cntfr', 'Aqp1', 'Adrb2', 'Kcnj13', 
                            'Sbspon', 'Tmem176b', 'Rnd1', ],
                     'c2': ['Igfbp7', 'Cxcl14', 'Mgp', 'Igfbp3', 'Cxcl14', 'Gstt3', 'Postn', 'S1pr3', 'Aoc3', 'Mdk', 'Areg', 'Nrp1', 'Acp5', 
                            'Cited1', 'Olfml2a', 'Cacna1g', 'Wnt10a', ],
                     'c3': ['Igfbp7', 'Cxcl14', 'Mgp', 'Igfbp3', 'Cxcl14', 'Gstt3', 'Mgp', 'Cygb', 'Nrp1', 'Vcam1', 'Frmd6', 'Serpine2', 
                            'Gdf10', 'Meox2', 'Ech1', 'Tsc22d1', 'Gpsm3', 'Rprml', 'Cox6b2', 'Tmem9b', ],
                     'c4': ['Igfbp7', 'Cxcl14', 'Mgp', 'Igfbp3', 'Cxcl14', 'Gstt3', 'Serpine2', 'Tspan7', 'Stmn2', 'Cxcl9', 'Ninj2', 
                            'Serpine1', 'Lsp1', 'Kcnip4', 'Gadd45a', 'Tekt3', ],
                     'c5': ['Igfbp7', 'Cxcl14', 'Mgp', 'Igfbp3', 'Cxcl14', 'Gstt3', 'Serpine2', 'Zc3h12a', 'Perp', 'Vcam1', 'Fxyd3', 'Dlc1', 
                            'Fam25a', 'Adamts1', 'Tnfaip6', 'Epcam', 'Irf6', 'Tifa', 'Has2', 'Taf4b', 'Serpinb2', ],
                     'c6': ['Igfbp7', 'Cxcl14', 'Mgp', 'Igfbp3', 'Cxcl14', 'Gstt3', 'Pla2g2a', 'Mup4', 'Igfbp3', 'Wfdc1', 'Gdf10', 'Gsta1', 
                            'Rbp4', 'Cox6b2', 'Acp5', 'Cp', 'Ces1d', 'Shisa3', 'C7', 'Slit2', 'Sfrp2', 'Steap2', ],
                     'd1': ['Pi16', 'Mfap5', 'Fn1', 'Fbn1', 'Fndc1', 'Postn', 'Igfbp5', 'Dpp4', 'Oaf', 'Fstl1', 'Nov', 'Vat1', 'Axl', 'Myoc', 
                            'Postn', 'Itm2a', 'Col5a3', 'Phf7', 'C1qtnf6', 'Cfd', 'Thbs3', 'Thy1', 'Scn1b', 'Dpcd', 'Enpp2', 'Lsp1', ],
                     'd2': ['Pi16', 'Mfap5', 'Fn1', 'Fbn1', 'Fndc1', 'Postn', 'Igfbp5', 'Dpp4', 'Oaf', 'Fstl1', 'Nov', 'Vat1', 'Axl', 'Myoc', 
                            'Tnfaip6', 'Ptx3', 'Cxcl1', 'Has1', 'Myoc', 'Carhsp1', 'Elf1', 'Tfpi2', 'Tnfsf9', 'Angptl4', 'Pla2g2a', 'Ralgds', 
                            'Tes', ],
                     'd3': ['Pi16', 'Mfap5', 'Fn1', 'Fbn1', 'Fndc1', 'Postn', 'Igfbp5', 'Dpp4', 'Oaf', 'Fstl1', 'Nov', 'Vat1', 'Axl', 'Myoc', 
                            'Lrrn4cl', 'Daglb', 'Hs3st1', 'Efhd1', 'Akr1c3', 'Dpp4', 'Cotl1', 'Stmn4', 'Gap43', 'Galnt16', 'Ptges', 'Pcsk6', 
                            'Smpd3', 'Lurap1l', 'Dact2', 'Car8', 'Tnfrsf11b', 'Gnpnat1', 'Tek', 'Anxa3', ],
                     'd4': ['Pi16', 'Mfap5', 'Fn1', 'Fbn1', 'Fndc1', 'Postn', 'Igfbp5', 'Dpp4', 'Oaf', 'Fstl1', 'Nov', 'Vat1', 'Axl', 'Myoc', 
                            'Sfrp4', 'Mgp', 'Ogn', 'Ctgf', 'Eln', 'Fibin', 'Cryab', 'Cpxm2', 'Gas6', 'Clic2', 'Cilp', 'Rerg', 'Cfh', 'Dpp4', 
                            'Ltbp4', 'Gpc3', 'Ier3', 'Col12a1', 'Myoc', 'S100b', ],}
# Marker genes of each fibroblast axis (a-d). Passed as `dict_cats` to `assign_cats`
# with key_added='axis' (grouped by the 'cluster' column).
# NOTE(review): 'Cxcl14' is listed twice in axis 'c' as well.
dict_cats_axes = {'a': ['Il1r2', 'Fam180a', 'Clu', 'Entpd1', 'Sectm1b', 'Ccl19', 'Cgref1', 'Asip', 'Adgrv1', 'Tex264', 'Olfml3', 'Adamts2', 
                        'Mgst1', 'Apcdd1', 'Sulf2', 'Crabp1', 'Ctsh', 'Psap', 'Aebp1', 'Ctsl', 'Pdgfrl', 'Cpz', 'Cck', ],
                  'b': ['Bgn', 'Tpm2', 'Igfbp4', 'Tsc22d1', 'Cdh11', 'Cdh13', 'Egflam', 'Fxyd5', 'Gng2', 'Nradd', 'Ptch1', 'Rasl11b', 
                        'Ncam1', 'Nog', ],
                  'c': ['Igfbp7', 'Cxcl14', 'Mgp', 'Igfbp3', 'Cxcl14', 'Gstt3', 'Sparcl1', 'Il6', 'Tnfaip6', 'Sat1', ],
                  'd': ['Pi16', 'Mfap5', 'Fn1', 'Fbn1', 'Fndc1', 'Postn', 'Igfbp5', 'Dpp4', 'Oaf', 'Fstl1', 'Nov', 'Vat1', 'Axl', 
                        'Myoc', 'Arpc1a', 'Zfp385a', ],}

In [None]:
# Hex colour of each cluster label. Used to fill `uns['cluster_colors']` of the fibroblast
# subsets; labels without an entry here fall back to '#bcbcbc' there.
dict_colors = {'a1': '#c93038', 'a2': '#de6a38', 'a3': '#ffad3b', 'a4': '#852d66',
               'b1': '#b4d645', 'b2': '#51c43f', 'b3': '#309c63',
               'c1': '#93dfe4', 'c2': '#63c2c9', 'c3': '#4c93ad',  'c4': '#3c83bd', 'c5': '#264f6e', 'c6': '#1c735d',
               'd1': '#fcbf8a', 'd2': '#b58057', 'd3': '#956642',  'd4': '#754622',
               'T1': '#29c2a8', 'U': '#dedede'}

In [None]:
dict_rep = {'CCN5': 'WISP2', 'ECRG4': 'C2orf40'}

In [None]:
mpl.rcParams['figure.dpi'] = 150

In [None]:
def plot_score_graph(adatax):
    """Bar plot of the assignment score of the cells of each cluster.

    For every category of ``adatax.obs['cluster']`` the score of the cells carrying
    that label is read from ``adatax.obs['cluster_<label>']`` and drawn as one bar
    with ``seaborn.barplot``. Labels without a matching score column are left out
    of the plot.

    Parameters
    ----------
    adatax
        Annotated data object with a categorical ``obs['cluster']`` column, one
        ``obs['cluster_<label>']`` score column per scored label, and the colour
        list ``uns['cluster_colors']``.
        NOTE(review): the colours are assumed to follow the order of
        ``obs['cluster'].cat.categories`` - confirm against the caller.

    Returns
    -------
    None
    """
    obs = adatax.obs
    categories = list(obs['cluster'].cat.categories)

    per_cluster = []
    for cluster in categories:
        score_col = f'cluster_{cluster}'
        # Skip only the expected case (no score column for this label) instead of
        # silencing every possible exception with a bare `except`.
        if score_col not in obs.columns:
            continue
        in_cluster = (obs['cluster'] == cluster).values
        scores = obs.loc[in_cluster, score_col].astype(float)  # numeric dtype for the y axis
        per_cluster.append(pd.DataFrame({'clusters': cluster, 'score': scores}))

    if not per_cluster:
        return  # no cluster has a score column, nothing to plot

    df_cats_own = pd.concat(per_cluster).sort_values('clusters')

    # Map colours by label, not by position: if a label was skipped above, a positional
    # palette would shift the colours of all the following bars.
    color_of = dict(zip(categories, adatax.uns['cluster_colors']))
    palette = {label: color_of.get(label, '#bcbcbc') for label in df_cats_own['clusters'].unique()}

    sns.barplot(x='clusters', y='score', data=df_cats_own, palette=palette)

In [None]:
# All datasets are read from <current working directory>/data/
# NOTE(review): `data_dir` ends with '/' and the per-dataset paths below prepend another '/',
# so the resulting paths contain '//'.
data_dir = os.getcwd() + '/data/'
print(data_dir)

**IMPORTANT: I am running this analysis on a computer with ~500 GB of RAM. I will load many datasets at once, which might be too much for some computers. I took this decision consciously, to have as much info available at any time as possible. If you cannot run the whole analysis at once, you can run it in parts.**

## Buechler et al. 2021

In [None]:
buechler_dir = data_dir + '/buechler_2021'

In [None]:
adata_buechler = sc.read(buechler_dir + '/buechler_2021.loom')
adata_buechler.var_names_make_unique()

In [None]:
# Basic QC filtering
adata_buechler.var['mt'] = adata_buechler.var_names.str.startswith('mt-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata_buechler, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(adata_buechler, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(adata_buechler, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata_buechler, x='total_counts', y='n_genes_by_counts')

In [None]:
# QC filter: keep cells with 1850 < n_genes_by_counts < 4000 and less than 12 % mitochondrial counts.
# `.copy()` turns the filtered view into an independent AnnData, so that the in-place
# preprocessing below does not modify a view.
adata_buechler = adata_buechler[((adata_buechler.obs.n_genes_by_counts > 1850) &
                                 (adata_buechler.obs.n_genes_by_counts < 4000) &
                                 (adata_buechler.obs.pct_counts_mt < 12)).values, :].copy()

In [None]:
sc.pp.filter_genes(adata_buechler, min_counts=1)
sc.pp.normalize_total(adata_buechler)
sc.pp.log1p(adata_buechler)

In [None]:
# PCA with 50 components, using the notebook-wide seed
sc.pp.pca(adata_buechler, random_state=seed, n_comps=50)
# The number of neighbours scales with the number of cells: floor(0.5 * sqrt(n_cells) / 2)
sc.pp.neighbors(adata_buechler, n_neighbors=int(0.5 * len(adata_buechler) ** 0.5 // 2), metric='cosine')
# Run triku on the dataset (NOTE(review): presumably feature selection based on the neighbour graph above - confirm)
tk.tl.triku(adata_buechler)

In [None]:
sc.tl.umap(adata_buechler, min_dist=0.2, random_state=seed)
sc.tl.leiden(adata_buechler, resolution=4, random_state=seed)

In [None]:
# fraction=1 keeps every cell and only randomises their order
# (presumably so that the drawing order of the UMAP dots does not follow the original cell order).
# Use the notebook-wide `seed`, like every other random step, instead of a hard-coded 0.
sc.pp.subsample(adata_buechler, fraction=1, random_state=seed, copy=False)
sc.pl.umap(adata_buechler, color=['leiden'], legend_loc='on data')

In [None]:
assign_cats(adata_buechler, dict_cats=dict_cats_fb, min_score=0.35, quantile_gene_sel=0.85)

In [None]:
sc.pl.umap(adata_buechler, color=['leiden', 'assigned_cats'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# Keep only the cells labelled as fibroblasts ('fibro' or 'fibro_2').
# `.copy()` makes the subset an independent AnnData instead of a view of `adata_buechler`,
# because the following cells modify it in place.
adata_buechler_fb = adata_buechler[adata_buechler.obs['assigned_cats'].isin(['fibro', 'fibro_2'])].copy()

In [None]:
sc.pp.filter_genes(adata_buechler_fb, min_counts=1)

In [None]:
sc.pp.pca(adata_buechler_fb, random_state=seed, n_comps=25)
sc.pp.neighbors(adata_buechler_fb,  n_neighbors=int(0.5 * len(adata_buechler_fb) ** 0.5 // 7), metric='cosine')
tk.tl.triku(adata_buechler_fb)

In [None]:
sc.tl.umap(adata_buechler_fb, min_dist=0.1, random_state=seed)

In [None]:
sc.tl.leiden(adata_buechler_fb, resolution=0.5, random_state=seed)

In [None]:
sc.pl.umap(adata_buechler_fb, color=['leiden'], legend_loc='on data', cmap=magma, use_raw=False, ncols=2)

In [None]:
# Label the fibroblasts with the clusters of `dict_cats_clusters`; the result is read from obs['cluster'] below.
# NOTE(review): presumably cells that cannot be assigned get the label given by others_name ('U') - confirm in cellassign.
assign_cats(adata_buechler_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.95, key_added='cluster', others_name='U')
# Label the axes of `dict_cats_axes`, grouping the cells by the 'cluster' column computed above; the result is read from obs['axis'] below.
assign_cats(adata_buechler_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, key_added='axis', 
             intermediate_states=True, diff=0.15, others_name='U')

In [None]:
# One colour per cluster label present in the data, in sorted label order;
# labels without an entry in `dict_colors` get the grey '#bcbcbc'.
adata_buechler_fb.uns['cluster_colors'] = [
    dict_colors.get(label, '#bcbcbc')
    for label in sorted(set(adata_buechler_fb.obs['cluster']))
]

In [None]:
sc.pl.umap(adata_buechler_fb, color=['leiden', 'axis', 'cluster'], legend_loc='on data', 
           cmap=magma, use_raw=False, ncols=2)

In [None]:
sc.tl.rank_genes_groups(adata_buechler_fb, groupby='leiden', groups=['2'])
sc.pl.umap(adata_buechler_fb, color=adata_buechler_fb.uns['rank_genes_groups']['names']['2'][:200], legend_loc='on data', 
           cmap=magma, use_raw=False, ncols=3)

In [None]:
sc.pl.umap(adata_buechler_fb, color=[
'Masp1', 'Hmcn1', 'Akr1cl', 
    'Aldh3a1', 'Fgfr4', 'Grem1', 'Tnfrsf19', 'Pla2g5', 
    'Lrrc15', 'Col11a1', 'Hck', 'Acan', 'Actg2', 
    'Sfrp5', 'Itgb4', 'Kank4', 'Mcam', 'Cadm4', 'Pcp4l1', 
    'Hhip', 'Crabp1', 'Col23a1', 'Crabp2', 'Hey2', 'Ndp'

], legend_loc='on data', 
           cmap=magma, use_raw=False, ncols=3)

## Efremova, Mirjana (Panglao DB) 2018

In [None]:
efremova_dir = data_dir + '/efremova_2018'

In [None]:
adata_efremova_2018 = sc.read(f"{efremova_dir}/efremova_2018.h5")

In [None]:
# Basic QC filtering
adata_efremova_2018.var['mt'] = adata_efremova_2018.var_names.str.startswith('mt-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata_efremova_2018, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(adata_efremova_2018, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(adata_efremova_2018, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata_efremova_2018, x='total_counts', y='n_genes_by_counts')

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': adata_efremova_2018.obs['batch'], 'y': adata_efremova_2018.obs['log1p_n_genes_by_counts']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': adata_efremova_2018.obs['batch'], 'y': adata_efremova_2018.obs['pct_counts_mt']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
# QC filter: keep only cells of batch '0' with 6.5 < log1p_n_genes_by_counts < 8.3
# and less than 9 % mitochondrial counts.
# `.copy()` turns the filtered view into an independent AnnData, so that the in-place
# preprocessing below does not modify a view.
adata_efremova_2018 = adata_efremova_2018[((adata_efremova_2018.obs.batch == '0') &
                                           (adata_efremova_2018.obs.log1p_n_genes_by_counts > 6.5) &
                                           (adata_efremova_2018.obs.log1p_n_genes_by_counts < 8.3) &
                                           (adata_efremova_2018.obs.pct_counts_mt < 9)).values, :].copy()

In [None]:
sc.pp.filter_genes(adata_efremova_2018, min_counts=1)
sc.pp.normalize_total(adata_efremova_2018)
sc.pp.log1p(adata_efremova_2018)

In [None]:
sc.pp.pca(adata_efremova_2018, random_state=seed, n_comps=50)
sc.pp.neighbors(adata_efremova_2018, n_neighbors=int(0.5 * len(adata_efremova_2018) ** 0.5), metric='cosine')
tk.tl.triku(adata_efremova_2018)

In [None]:
sc.tl.umap(adata_efremova_2018, min_dist=0.3, random_state=seed)

In [None]:
sc.tl.leiden(adata_efremova_2018, resolution=2, random_state=seed)

In [None]:
assign_cats(adata_efremova_2018, dict_cats=dict_cats_fb, min_score=0.5)
sc.pl.umap(adata_efremova_2018, color=['leiden', 'batch', 'assigned_cats', 'Pdgfra', 'Lum', 'Col1a1', 'Coch', 'Vim'], legend_loc='on data', cmap=magma, ncols=2, 
           use_raw=False, )

In [None]:
# Keep only the cells labelled as fibroblasts ('fibro' or 'fibro_2').
# `.copy()` makes the subset an independent AnnData instead of a view of `adata_efremova_2018`,
# because the following cells modify it in place.
adata_efremova_2018_fb = adata_efremova_2018[adata_efremova_2018.obs['assigned_cats'].isin(['fibro', 'fibro_2'])].copy()

In [None]:
sc.pp.filter_genes(adata_efremova_2018_fb, min_counts=1)

In [None]:
sc.pp.pca(adata_efremova_2018_fb, random_state=seed, n_comps=25)
sc.pp.neighbors(adata_efremova_2018_fb, n_neighbors=int(0.5 * len(adata_efremova_2018_fb) ** 0.5), metric='cosine')
tk.tl.triku(adata_efremova_2018_fb)

In [None]:
sc.tl.umap(adata_efremova_2018_fb, random_state=seed)

In [None]:
sc.tl.leiden(adata_efremova_2018_fb, resolution=0.6, random_state=seed)

In [None]:
sc.pl.umap(adata_efremova_2018_fb, color=['leiden', 'batch'], 
           legend_loc='on data', cmap=magma, use_raw=False, ncols=2)

In [None]:
sc.tl.rank_genes_groups(adata_efremova_2018_fb, groupby='leiden', method='wilcoxon')

In [None]:
sc.pl.umap(adata_efremova_2018_fb, color=['leiden'] + list(adata_efremova_2018_fb.uns['rank_genes_groups']['names']['0'][:250]), 
           legend_loc='on data', cmap=magma, use_raw=False, ncols=3)

In [None]:
sc.pl.umap(adata_efremova_2018_fb, color=['leiden'] + list(adata_efremova_2018_fb.uns['rank_genes_groups']['names']['2'][:250]), 
           legend_loc='on data', cmap=magma, use_raw=False, ncols=3)

In [None]:
sc.pl.umap(adata_efremova_2018_fb, color=['leiden'] + list(adata_efremova_2018_fb.uns['rank_genes_groups']['names']['3'][:250]), 
           legend_loc='on data', cmap=magma, use_raw=False, ncols=3)

In [None]:
sc.pl.umap(adata_efremova_2018_fb, color=['leiden'] + list(adata_efremova_2018_fb.uns['rank_genes_groups']['names']['1'][:250]), 
           legend_loc='on data', cmap=magma, use_raw=False, ncols=3)

In [None]:
sc.pl.umap(adata_efremova_2018_fb, color=['leiden'] + list(adata_efremova_2018_fb.uns['rank_genes_groups']['names']['4'][:250]), 
           legend_loc='on data', cmap=magma, use_raw=False, ncols=3)

In [None]:
sc.pl.umap(adata_efremova_2018_fb, color=['leiden'] + list(adata_efremova_2018_fb.uns['rank_genes_groups']['names']['5'][:250]), 
           legend_loc='on data', cmap=magma, use_raw=False, ncols=3)

In [None]:
adata_efremova_2018

## Haensel et al. 2021

In [None]:
haensel_dir = data_dir + '/haensel_2021'

In [None]:
adata_haensel = sc.read(f"{haensel_dir}/adata_haensel.h5")

In [None]:
# Basic QC filtering
adata_haensel.var['mt'] = adata_haensel.var_names.str.startswith('mt-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata_haensel, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(adata_haensel, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(adata_haensel, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata_haensel, x='total_counts', y='n_genes_by_counts')

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': adata_haensel.obs['batch'], 'y': adata_haensel.obs['log1p_n_genes_by_counts']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': adata_haensel.obs['batch'], 'y': adata_haensel.obs['pct_counts_mt']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
# QC filter: keep cells with less than 10 % mitochondrial counts and
# 6.8 < log1p_n_genes_by_counts < 8.1.
# `.copy()` turns the filtered view into an independent AnnData, so that the in-place
# preprocessing below does not modify a view.
adata_haensel = adata_haensel[((adata_haensel.obs.pct_counts_mt < 10) &
                               (adata_haensel.obs.log1p_n_genes_by_counts > 6.8) &
                               (adata_haensel.obs.log1p_n_genes_by_counts < 8.1)).values, :].copy()

In [None]:
sc.pp.filter_genes(adata_haensel, min_counts=1)
sc.pp.normalize_total(adata_haensel)
sc.pp.log1p(adata_haensel)

In [None]:
sc.pp.pca(adata_haensel, random_state=seed, n_comps=50)
sce.pp.harmony_integrate(adata_haensel, key='batch', max_iter_harmony=50)
sc.pp.neighbors(adata_haensel, use_rep='X_pca_harmony', n_neighbors=int(0.5 * len(adata_haensel) ** 0.5 // 4), metric='cosine')
tk.tl.triku(adata_haensel)

In [None]:
sc.tl.umap(adata_haensel, min_dist=0.6, random_state=seed)

In [None]:
sc.tl.leiden(adata_haensel, resolution=1.5, random_state=seed)

In [None]:
assign_cats(adata_haensel, dict_cats=dict_cats_fb, min_score=0.4)
sc.pl.umap(adata_haensel, color=['leiden', 'assigned_cats'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# Keep only the cells labelled as fibroblasts ('fibro' or 'fibro_2').
# `.copy()` makes the subset an independent AnnData instead of a view of `adata_haensel`,
# because the following cells modify it in place.
adata_haensel_fb = adata_haensel[adata_haensel.obs['assigned_cats'].isin(['fibro', 'fibro_2'])].copy()

In [None]:
sc.pp.filter_genes(adata_haensel_fb, min_counts=1)

In [None]:
sc.pp.pca(adata_haensel_fb, random_state=seed, n_comps=25)
sce.pp.harmony_integrate(adata_haensel_fb, key='batch', max_iter_harmony=50)
sc.pp.neighbors(adata_haensel_fb, use_rep='X_pca_harmony', n_neighbors=int(0.5 * len(adata_haensel_fb) ** 0.5 // 5), metric='cosine')
tk.tl.triku(adata_haensel_fb)

In [None]:
sc.tl.umap(adata_haensel_fb, min_dist=0.25, random_state=seed)

In [None]:
sc.tl.leiden(adata_haensel_fb, resolution=4, random_state=seed)

In [None]:
sc.pl.umap(adata_haensel_fb, color=['leiden', 'batch'], legend_loc='on data', cmap=magma, use_raw=False, ncols=2)

In [None]:
sc.tl.rank_genes_groups(adata_haensel_fb, groupby='leiden')

In [None]:
sc.pl.umap(adata_haensel_fb, color=['leiden', 'batch',] + list(adata_haensel_fb.uns['rank_genes_groups']['names']['4'][:200]), legend_loc='on data', cmap=magma, use_raw=False, ncols=3)

In [None]:
sc.pl.umap(adata_haensel_fb, color=['leiden', 'batch',] + list(adata_haensel_fb.uns['rank_genes_groups']['names']['1'][:200]), legend_loc='on data', cmap=magma, use_raw=False, ncols=3)

In [None]:
assign_cats(adata_haensel_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.95, key_added='cluster', others_name='U')
assign_cats(adata_haensel_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, key_added='axis', 
             intermediate_states=True, diff=0.15, others_name='U')

In [None]:
# One colour per cluster label present in the data, in sorted label order;
# labels without an entry in `dict_colors` get the grey '#bcbcbc'.
adata_haensel_fb.uns['cluster_colors'] = [
    dict_colors.get(label, '#bcbcbc')
    for label in sorted(set(adata_haensel_fb.obs['cluster']))
]

In [None]:
sc.pl.umap(adata_haensel_fb, color=['leiden', 'axis', 'cluster'], legend_loc='on data', 
           cmap=magma, use_raw=False, ncols=2)

## Ma et al. 2020


In [None]:
ma_dir = data_dir + '/ma_2020'

In [None]:
adata_ma_Y = sc.read(f"{ma_dir}/adata_ma_Y.h5")

In [None]:
# Basic QC filtering
adata_ma_Y.var['mt'] = adata_ma_Y.var_names.str.startswith('mt-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata_ma_Y, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(adata_ma_Y, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(adata_ma_Y, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata_ma_Y, x='total_counts', y='n_genes_by_counts')

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': adata_ma_Y.obs['batch'], 'y': adata_ma_Y.obs['log1p_n_genes_by_counts']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
# QC filter: keep cells of batches '0' and '1' with 7.3 < log1p_n_genes_by_counts < 8.2
# (both batches use the same thresholds, so a single condition covers them).
# `.copy()` turns the filtered view into an independent AnnData, so that the in-place
# preprocessing below does not modify a view.
adata_ma_Y = adata_ma_Y[(adata_ma_Y.obs.batch.isin(['0', '1']) &
                         (adata_ma_Y.obs.log1p_n_genes_by_counts > 7.3) &
                         (adata_ma_Y.obs.log1p_n_genes_by_counts < 8.2)).values, :].copy()

In [None]:
sc.pp.filter_genes(adata_ma_Y, min_counts=1)
sc.pp.normalize_total(adata_ma_Y)
sc.pp.log1p(adata_ma_Y)

In [None]:
sc.pp.pca(adata_ma_Y, random_state=seed, n_comps=50)
sce.pp.harmony_integrate(adata_ma_Y, key='batch', max_iter_harmony=50)
sc.pp.neighbors(adata_ma_Y, use_rep='X_pca_harmony', n_neighbors=int(0.5 * len(adata_ma_Y) ** 0.5), metric='cosine')
tk.tl.triku(adata_ma_Y)

In [None]:
sc.tl.umap(adata_ma_Y, min_dist=0.4, random_state=seed)

In [None]:
sc.tl.leiden(adata_ma_Y, resolution=0.3, random_state=seed)

In [None]:
assign_cats(adata_ma_Y, dict_cats=dict_cats_fb, min_score=0.5, quantile_gene_sel=0.9)
sc.pl.umap(adata_ma_Y, color=['leiden', 'batch', 'assigned_cats', 'Pdgfra', 'Lum', 'Dcn', ], legend_loc='on data', cmap=magma, ncols=2, 
           use_raw=False, )

In [None]:
# Keep only the cells labelled 'fibro' (unlike other datasets, 'fibro_2' is not included here).
# `.copy()` makes the subset an independent AnnData instead of a view of `adata_ma_Y`,
# because the following cells modify it in place.
adata_ma_Y_fb = adata_ma_Y[adata_ma_Y.obs['assigned_cats'].isin(['fibro'])].copy()

In [None]:
sc.pp.filter_genes(adata_ma_Y_fb, min_counts=1)

In [None]:
sc.pp.pca(adata_ma_Y_fb, random_state=seed, n_comps=25)
sce.pp.harmony_integrate(adata_ma_Y_fb, key='batch', max_iter_harmony=50)
sc.pp.neighbors(adata_ma_Y_fb, use_rep='X_pca_harmony', n_neighbors=int(len(adata_ma_Y_fb) ** 0.5 // 3), metric='cosine')
tk.tl.triku(adata_ma_Y_fb)

In [None]:
sc.tl.umap(adata_ma_Y_fb, min_dist=0.25, random_state=seed)

In [None]:
sc.tl.leiden(adata_ma_Y_fb, resolution=1.5, random_state=seed)
# sc.tl.leiden(adata_ma_Y_fb, resolution=0.15, random_state=seed)

In [None]:
sc.pl.umap(adata_ma_Y_fb, color=['leiden', 'batch'], 
           legend_loc='on data', cmap=magma, use_raw=False, ncols=2)

In [None]:
sc.pl.umap(adata_ma_Y_fb, color=['leiden', 'batch', 
'Tnfaip6', 'Ptx3', 'Cxcl1', 'Has1', 'Myoc', 'Elf1', 'Tfpi2', 'Tes',

                                ], legend_loc='on data', cmap=magma, use_raw=False, ncols=3)

In [None]:
sc.tl.rank_genes_groups(adata_ma_Y_fb, groupby='leiden', method='wilcoxon')

In [None]:
sc.pl.umap(adata_ma_Y_fb, color=['leiden', 'batch'] + list(adata_ma_Y_fb.uns['rank_genes_groups']['names']['20'][:250]), legend_loc='on data', cmap=magma, use_raw=False, ncols=3)

In [None]:
assign_cats(adata_ma_Y_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.8, key_added='cluster', others_name='U')
assign_cats(adata_ma_Y_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, key_added='axis', 
             intermediate_states=True, diff=0.15, others_name='U')

In [None]:
# One colour per cluster label present in the data, in sorted label order;
# labels without an entry in `dict_colors` get the grey '#bcbcbc'.
adata_ma_Y_fb.uns['cluster_colors'] = [
    dict_colors.get(label, '#bcbcbc')
    for label in sorted(set(adata_ma_Y_fb.obs['cluster']))
]

In [None]:
sc.pl.umap(adata_ma_Y_fb, color=['leiden', 'batch', 'axis', 'cluster'], legend_loc='on data', 
           cmap=magma, use_raw=False, ncols=2)

## Shook 2020

In [None]:
shook_dir = data_dir + '/shook_2020'

In [None]:
adata_shook_NW = sc.read(f"{shook_dir}/adata_shook_NW.h5")

In [None]:
# Basic QC filtering
adata_shook_NW.var['mt'] = adata_shook_NW.var_names.str.startswith('mt-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata_shook_NW, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(adata_shook_NW, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(adata_shook_NW, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata_shook_NW, x='total_counts', y='n_genes_by_counts')

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': adata_shook_NW.obs['batch'], 'y': adata_shook_NW.obs['log1p_n_genes_by_counts']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': adata_shook_NW.obs['batch'], 'y': adata_shook_NW.obs['pct_counts_mt']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
# QC filter: keep cells of batches '3' and '4' with 7 < log1p_n_genes_by_counts < 8
# and less than 7 % mitochondrial counts.
# Batches '1' and '2' are excluded; the disabled filter for them used
# 6.2 < log1p_n_genes_by_counts < 7.3.
# `.copy()` turns the filtered view into an independent AnnData, so that the in-place
# preprocessing below does not modify a view.
adata_shook_NW = adata_shook_NW[(adata_shook_NW.obs.batch.isin(['3', '4']) &
                                 (adata_shook_NW.obs.log1p_n_genes_by_counts > 7) &
                                 (adata_shook_NW.obs.log1p_n_genes_by_counts < 8) &
                                 (adata_shook_NW.obs.pct_counts_mt < 7)).values, :].copy()

In [None]:
sc.pp.filter_genes(adata_shook_NW, min_counts=1)
sc.pp.normalize_total(adata_shook_NW)
sc.pp.log1p(adata_shook_NW)

In [None]:
sc.pp.pca(adata_shook_NW, random_state=seed, n_comps=50)
sce.pp.harmony_integrate(adata_shook_NW, key='batch', max_iter_harmony=50)
sc.pp.neighbors(adata_shook_NW, use_rep='X_pca_harmony', n_neighbors=int(0.5 * len(adata_shook_NW) ** 0.5), metric='cosine')
tk.tl.triku(adata_shook_NW)

In [None]:
sc.tl.umap(adata_shook_NW, min_dist=0.3, random_state=seed)

In [None]:
sc.tl.leiden(adata_shook_NW, resolution=2, random_state=seed)

In [None]:
assign_cats(adata_shook_NW, dict_cats=dict_cats_fb, min_score=0.5)
sc.pl.umap(adata_shook_NW, color=['leiden', 'batch', 'assigned_cats', 'Rgs5', 'Pecam1', 'Lyve1', 'Mlana'], legend_loc='on data', cmap=magma, ncols=2, 
           use_raw=False, )

In [None]:
# Keep the cells labelled 'fibro' together with the 'unassigned' ones.
# `.copy()` makes the subset an independent AnnData instead of a view of `adata_shook_NW`,
# because the following cells modify it in place.
adata_shook_NW_fb = adata_shook_NW[adata_shook_NW.obs['assigned_cats'].isin(['fibro', 'unassigned'])].copy()

In [None]:
sc.pp.filter_genes(adata_shook_NW_fb, min_counts=1)

In [None]:
sc.pp.pca(adata_shook_NW_fb, random_state=seed, n_comps=25)
sce.pp.harmony_integrate(adata_shook_NW_fb, key='batch', max_iter_harmony=50)
sc.pp.neighbors(adata_shook_NW_fb, use_rep='X_pca_harmony', n_neighbors=int(0.5 * len(adata_shook_NW_fb) ** 0.5 // 5), metric='cosine')
tk.tl.triku(adata_shook_NW_fb)

In [None]:
sc.tl.umap(adata_shook_NW_fb, min_dist=0.25, random_state=seed)

In [None]:
sc.tl.leiden(adata_shook_NW_fb, resolution=0.9, random_state=seed)

In [None]:
sc.tl.rank_genes_groups(adata_shook_NW_fb, groupby='leiden', method='wilcoxon')

In [None]:
sc.pl.umap(adata_shook_NW_fb, color=['leiden',                                      
'Fbln7',
'Thbs4',
'Ggt5',
'Nop58',
'Thbs1',
'Bmper',
'Casp4',
'G0s2',
                                     
'Angpt4', 'Fgf9', 'Nkain4', 'Npy1r', 'Ecrg4', 'Wfdc1'                                     
], 
           legend_loc='on data', cmap=magma, use_raw=False, ncols=3)

In [None]:
assign_cats(adata_shook_NW_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.95, key_added='cluster', others_name='U')
assign_cats(adata_shook_NW_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, key_added='axis', 
             intermediate_states=True, diff=0.15, others_name='U')

In [None]:
# One colour per cluster label present in the data, in sorted label order;
# labels without an entry in `dict_colors` get the grey '#bcbcbc'.
adata_shook_NW_fb.uns['cluster_colors'] = [
    dict_colors.get(label, '#bcbcbc')
    for label in sorted(set(adata_shook_NW_fb.obs['cluster']))
]

In [None]:
sc.pl.umap(adata_shook_NW_fb, color=['leiden', 'axis', 'cluster'], legend_loc='on data', 
           cmap=magma, use_raw=False, ncols=2)

In [None]:
dict_haensel = {'Ha': ['Col1a1', 'Col1a2', 'Col16a1', 'Cgref1', 'Pla2g5', 'Cib3', 'Gpha2', 'Tmem41a'], 
                'Hb': []}

In [None]:
gene_list = ['C2', 'C7', 'Vit', 'Nmb', 'C4b', 'Srpx', 'Sned1', 'Pltp', 'Cpe', 'Tent5c']


In [None]:
sc.pl.umap(adata_buechler_fb, color=[i for i in gene_list if i in adata_buechler_fb.var_names], legend_loc='on data', 
           cmap=magma, use_raw=False, ncols=4)

In [None]:
sc.pl.umap(adata_haensel_fb, color=[i for i in gene_list if i in adata_haensel_fb.var_names], legend_loc='on data', 
           cmap=magma, use_raw=False, ncols=4)

In [None]:
sc.pl.umap(adata_ma_Y_fb, color=[i for i in gene_list if i in adata_ma_Y_fb.var_names], legend_loc='on data', 
           cmap=magma, use_raw=False, ncols=4)

In [None]:
sc.pl.umap(adata_shook_NW_fb, color=[i for i in gene_list if i in adata_shook_NW_fb.var_names], legend_loc='on data', 
           cmap=magma, use_raw=False, ncols=4)