# Comparison of fibroblast populations

In this notebook we are going to extract and replicate the main populations from different papers where fibroblast populations are described, and find similarities and differences. The premise of this analysis is that many of the populations described in different papers seem not to match, or to be transcriptomically different, but in reality they are quite similar; that is, the main types of populations are indeed shared by the different papers, which should come as no surprise.

Additionally, we will reanalyze the *classic 4* papers, to check that cell populations are assigned as expected. For these papers, UMAPs might vary compared to the ones in our paper, but the main results should still be the same.

## imports

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import scanpy as sc
import scanpy.external as sce
import pandas as pd
import numpy as np
import os
import triku as tk
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from tqdm.notebook import tqdm
import scipy.sparse as spr
# import ray
# import subprocess
# import time
# import scvelo as scv
# import gc
import gseapy as gp

In [None]:
!pip install cellassign

In [None]:
from cellassign import assign_cats
from fb_functions import clear_adata
from fb_functions import plot_score_graph

In [None]:
# To print versions of imports 

import types

def imports():
    for name, val in globals().items():
        if isinstance(val, types.ModuleType):
            yield val.__name__

excludes = ['builtins', 'types', 'sys']

imported_modules = [module for module in imports() if module not in excludes]

clean_modules = []

for module in imported_modules:

    sep = '.'  # to handle 'matplotlib.pyplot' cases
    rest = module.split(sep, 1)[0]
    clean_modules.append(rest)

changed_imported_modules = list(set(clean_modules))  # drop duplicates

pip_modules = !pip freeze  # you could also use `!conda list` with anaconda

for module in pip_modules:
    try:
        name, version = module.split('==')
        if name in changed_imported_modules:
            print(name + '\t' + version)
    except:
        pass

In [None]:
# Single random seed passed as `random_state` to the stochastic steps below (PCA, UMAP, Leiden).
seed = 0
# Persist the value with IPython's %store magic (presumably read back with `%store -r`
# from companion notebooks — confirm).
%store seed

In [None]:
# Palettes for UMAP gene expression

magma = [plt.get_cmap('magma')(i) for i in np.linspace(0,1, 80)]
magma[0] = (0.88, 0.88, 0.88, 1)
magma = mpl.colors.LinearSegmentedColormap.from_list("", magma[:65])

%store magma

In [None]:
# Marker genes per broad cell type. Passed as `dict_cats` to assign_cats on the full datasets;
# the resulting labels are read from obs['assigned_cats'], and cells labelled 'fibro' / 'fibro_2'
# are the ones kept as fibroblasts further down.
dict_cats_fb = {'peri': ['Rgs5', 'Myl9', 'Ndufa4l2', 'Nrip2', 'Mylk', 'Rgs4', 'Acta2', 'Sncg', 'Tagln', 'Des', 'Ptp4a3', 'Myh11'], 
                'endo': ['Pecam1', 'Cdh5', 'Egfl7', 'Cd36', 'Srgn', 'Adgrf5', 'Ptprb', 'Scarb1', 'Plvap', 'Grrp1', 'C1qtnf9', 'Mmrn2', 'Flt1'], 
                'kerato': ['Krt14', 'Krt15', 'Perp', 'S100a14', 'Ccl27a', 'Gata3', 'Dapl1', 'Rab25', 'Ckmt1', 'Col17a1', 'Serpinb5'],
                'kerato Gjb2': ['Ucp2', 'Krt71', 'Gjb2', 'Ahcy', 'Acaa2', 'Cbs', 'Slc3a2', 'Serpina11', 'Lap3', 'Gss', 'Basp1', ],
                'fibro': ['Dcn', 'Pdgfra',  'Lum', 'Col1a1', 'Col1a2',],
                'fibro_2': ['Ncam1', 'Ptch1', 'Trps1', 'Col11a1', 'Wif1'],
                'T cell': ['Rac2', 'Ptprcap', 'Il2rg', 'Cd3g', 'Skap1', 'Hcst', 'Ctsw', 'Ets1', 'Cd3d', 'Ctla2a', 'Cd2'],
                'APC': ['Tyrobp', 'Cd74', 'H2-Aa', 'H2-Eb1', 'Ctss', 'Spi1', 'Napsa', 'Cd68', 'Lyz2', 'Csf2ra'],
                'lymph': ['Ccl21a', 'Egfl7', 'Mmrn1', 'Nsg1', 'Meox1', 'Gimap6', 'Kdr'],
                'melano / schwann': ['Syngr1', 'Pmel', 'Mlana'],
                'myo': ['Tnnt1', 'Tnnt2', 'Tnnt3', 'Tnnc2', 'Acta1', 'Myl1', 'Tnni2', 'Tcap', 'Eno3', 'Myoz1'],
                'neural': ['Itgb8', 'Plp1', 'Ptn', 'Egfl8', 'Chl1', 'Cadm4', 'Sox10', 'Cdh19', 'Snca']
               }

# Marker genes per fibroblast cluster (keys 'a1' ... 'f1', plus the mixed 'b/c' and 'c/d').
# Passed as `dict_cats` to assign_cats with key_added='cluster' on the fibroblast subsets.
# NOTE(review): the entry 'a' in 'd2' and 'd3' is presumably the nonagouti gene symbol,
# not a stray placeholder — confirm.
dict_cats_clusters = {'a1': ['Ptgs2', 'Gfpt2', 'Ugdh', 'Gm48942', 'Ccl2', 'Gm45551', 'Cxcl1',
                            'Anxa3', 'Gm12840', 'Has1', 'Uap1', 'Ptx3', 'Efhd1', 'Smpd3',
                            'Errfi1', 'Gm20186', 'Akr1c18', 'Plat', 'Ifi205', 'Procr',
                            'Wnt10b', 'Prss23', 'Irak3', 'Sbsn', 'Ccl7', 'Sema3c', 'Fndc1',
                            'Wnt2', 'Emilin2', 'Anxa1'],
                     'a2': ['Anxa3', 'Efhd1', 'Aldh1a3', 'Akr1c18', 'Wnt2', 'Smpd3', 'Dpp4',
                            'Pi16', 'Sbsn', 'Pla1a', 'Sema3c', 'Emilin2', 'Chst1', 'Il18',
                            'Limch1', 'Cd248', 'Mfap5', 'Igfbp4', 'Prss23', 'Sfrp2', 'Tek',
                            'Itgb7', 'Dact2', 'Tmem100', 'Efemp1', 'Mustn1', 'Thbd',
                            'Ifi27l2a', 'Slc4a10', 'Axl'],
                     'a3': ['Stmn1', 'Birc5', 'Cks2', 'Ccna2', 'Ube2c', 'Lockd', 'Spc25',
                            'Top2a', 'Cdca3', 'Cenpm', 'Tpx2', 'Tpm2', '2810417H13Rik', 'Cdk1',
                            'Spc24', 'Diaph3', 'Mki67', 'Pbk', 'Cdca8', 'Ckap2', 'H2afz',
                            'Cdkn3', 'Hmgb2', 'Cenpf', 'Col7a1', 'Prc1', 'Tk1', 'Ccnb2',
                            'Gmnn', 'Cdc20'],
                     'b1': ['Cxcl12', 'Postn', 'Pltp', 'Cd36', 'Agt', 'Cyp1b1', 'C6', 'Olfm2',
                            'Adam12', 'Cyp2f2', 'Phospho1', 'Ctsk', 'Slit2', 'Scg3', 'Txnip',
                            'Fabp4', 'Mlana', 'Lpl', 'Cthrc1', 'Mex3b', 'Mmp14', 'Pparg',
                            'Cygb', 'Hpgd', 'Pth1r', 'Aoc3', 'Fzd4', 'Slco2b1', 'Gas6', 'Ggt5'],
                     'b2': ['Hmcn2', 'Thbs4', 'Mgp', 'Fbln7', 'Meox1', 'Col8a1', 'Cilp',
                            'Smoc2', 'Cxcl14', 'Prss12', 'Cygb', 'Sparcl1', 'Col5a3',
                            'Angptl1', 'Ret', 'Fhl2', 'Abca8a', 'Col4a2', 'Fap', 'Col6a3',
                            'Myoc', 'Gfra1', 'Crlf1', 'Col6a2', 'Sept4', 'Rem1', 'Col4a1',
                            'Lsamp', 'Col6a6', 'Mest'],
                     'b3': ['Cilp', 'Fgf9', 'Igf1', 'F3', 'Nkain4', 'Fxyd6', 'Angpt4', 'Mgp',
                            'Ccn5', 'Gas6', 'Clu', 'Tgm2', 'Gm5084', 'Wfdc1', 'Arhgdib',
                            'Ccn2', 'Angptl7', 'Fmo2', 'Gpx3', 'Col12a1', 'Paqr6', 'Sfrp1',
                            'Ecrg4', 'Cygb', 'Smoc2', 'Ltbp4', 'Boc', 'Gas1', 'Npy1r', 'Mfap4'],
                     'b4': ['Steap4', 'Cygb', 'Inmt', 'Agt', 'C4b', 'C2', 'Ggt5', 'Sned1',
                            'Cxcl12', 'Nmb', 'Vit', 'Cp', 'Col4a2', 'Mmp3', 'Vtn', 'Igfbp7',
                            'Adamtsl3', 'Cfh', 'Bmper', 'Entpd2', 'C1s1', 'Col4a1', 'Aoc3',
                            'Lpl', 'Nrp1', 'Gdf10', 'Tmem176a', 'Tmem176b', 'Meox2', 'Nr2f2'],
                     'b5': ['Apod', 'Rasgrp2', 'Cp', 'Vwa1', 'Ccl9', 'Spp1', 'Vit', 'P2ry14',
                            'Abca8a', 'Trf', 'Smoc2', 'Ccl11', 'Matn2', 'Gpc3', 'Vtn',
                            '1500009L16Rik', 'Ebf2', 'Myoc', 'Col8a1', 'Nr2f2', 'Cpe', 'Gfra1',
                            'Pdrg1', 'Steap4', 'Sparcl1', 'Col15a1', 'Itm2a', 'Phgdh', 'Thbs4',
                            'Ecm1'],
                     'b/c': ['Cyp2f2', 'Masp1', 'Rtn4r', 'Pltp', 'Crp', 'Serpina3n', 'C1qtnf3',
                            'Tgfbi', 'Sectm1a', 'Lgr5', 'Lepr', 'Crip2', 'Pth1r', 'Gas6',
                            'Hpgd', 'Ppp2r2c', 'Slco2b1', 'Mkx', 'Sparcl1', 'Cyp4b1', 'Akr1cl',
                            'Inhbb', 'Bcl11b', 'Angptl1', 'Rasa3', 'Cpz', 'Gng13', 'Penk',
                            'mt-Rnr2', 'Zfp536'],
                     'c1': ['Ndufa4l2', 'Cpz', 'Ppp1r14a', 'Cldn10', 'Cdh4', 'Cgref1',
                            'Aldh3a1', 'Col1a1', 'Pla2g5', 'Col1a2', 'Csf1r', 'Creb3l3',
                            'Sulf2', 'Tgfbi', 'Fgfr4', 'Sparc', 'Col16a1', 'Cib3', 'Cyp2f2',
                            'Sema3b', 'Serpina3n', 'Creb3l1', 'Kazald1', 'Adcy1', 'Mmp27',
                            'Rassf4', 'Gpha2', 'Rcn3', 'P4ha2', 'Tmem150c'],
                     'c2': ['Igfbp2', 'Grem1', 'F13a1', 'Sema3a', 'Stc1', 'Mamdc2', 'Wnt5a',
                            'Serpina3g', 'Qpct', 'Kcnk2', 'Slc6a2', 'Col13a1', 'Notum',
                            'Ccbe1', 'Ccnd1', 'Fxyd6', 'Nt5e', 'Rarres1', 'Tmem132c', 'Meox2',
                            'Zfp385b', 'Rspo1', 'Nkd1', 'Ndufa4l2', 'Cdh4', 'Cldn10', 'Ccdc42',
                            'Miat', 'Ackr4', 'Adcy1'],
                     'c3': ['Ccl8', 'Slc10a6', 'Serpina3n', 'Ccl7', 'Gbp5', 'Casp4', 'Tnfaip6',
                            'Mmp3', 'Cpxm1', 'Ccl11', 'Fgl2', 'Sowahc', 'Vcam1', 'H2-Q7',
                            'H2-K1', 'Birc3', 'Gbp2', 'B2m', 'Ddah1', 'Mt1', 'Ccl19', 'Mt2',
                            'Il6', 'CR974586.5', 'Npc2', 'Ccl2', 'Postn', 'Gch1', 'Iigp1',
                            'AW112010'],
                     'c/d': ['Mfap4', 'Cpxm2', 'Dkk2', 'Ltbp2', 'Tnmd', 'Ccl19', 'Coch',
                            'Pianp', 'Slit2', 'Gas1', 'Wnt5b', 'Ackr4', 'Eln', 'Syt13',
                            'Mmp16', 'Il15', 'Cyp26b1', 'Fam180a', 'Nrep', 'Mafb', 'Enpp2',
                            'Ptgfr', 'Cyp1b1', 'Pth1r', 'Cmklr1', 'Atp1a2', 'Tmem204', 'Cd9',
                            'Gpm6b', 'Nrn1'],
                     'd1': ['Aqp1', 'Igfbp4', 'Tbx3', 'Nrp1', 'Hsd17b7', 'Mxra8',
                            '3300005D01Rik', 'Itga8', 'Hs3st6', 'Vcan', 'F2r', 'Cotl1',
                            'Spry1', 'Efna5', 'Shc4', 'Cdkn2b', 'Meox2', 'Mgst3', 'P2ry1',
                            'Dusp6', 'Bhlhe41', 'Tcf4', 'Tmem204', 'Sdc1', 'Foxs1', 'Wisp1',
                            'Fcgr2b', 'Calcrl', 'Egr2', 'Postn'],
                     'd2': ['Col11a1', 'Kif26b', 'Lrrc15', 'Coch', 'Tpm2', 'Kera', 'Fmod',
                            'Tagln', 'Robo2', 'Wif1', 'Col7a1', 'Aqp1', 'Cd200', 'Tnmd',
                            'Megf6', 'Ifitm1', 'Mylk', 'Rasgrp2', 'Ncam1', 'Pianp', 'Mafb',
                            'Trps1', 'Itga8', 'Emid1', 'Cox4i2', 'Stmn2', 'Enpp2', 'Tnn', 'a',
                            'Ednra'],
                     'd3': ['Trps1', 'Prlr', 'Col23a1', 'Lef1', 'Lamc3', 'Scube3', 'Inhba',
                            'Tmem176a', 'Crabp1', 'Enpp2', 'Wif1', 'Gm48159', 'a', 'Edn3',
                            'Hhip', 'Rspo1', 'Tmem176b', 'Corin', 'Chodl', 'Pappa2', 'Mdk',
                            'Crabp2', 'Ptger3', 'Cd24a', 'Rspo3', 'Ndnf', 'Runx3', 'Kctd1',
                            'Fgfr2', 'Sostdc1'],
                     'd4': ['Mmp11', 'Chchd10', 'Mamdc2', 'Lrrc15', 'Mif', 'Plxdc1', 'Postn',
                            'Mdk', 'Adam12', 'Dkk3', 'Sox4', 'Ednrb', 'F2r', 'Mafb', 'Ctsc',
                            'Pawr', 'Mylk', 'Lhfpl2', 'Adamts15', 'Cdh11', 'Bok', 'Ociad2',
                            'Ncam1', 'Tmem140', 'Pard6g', 'Corin', 'Etl4', 'Bcl11b', 'Nav2',
                            'Cdc42ep3'],
                     'e1': ['Cldn1', 'Ebf2', 'Klf5', 'Sbspon', 'Itga6', 'Nr2f2', 'Igfbp6',
                            'Itgb4', 'Mgp', 'Cav1', 'Gas6', 'Fxyd6', 'Tubb2b', 'Mfap5',
                            'Stxbp6', 'Phlda3', 'Lmo4', 'Lbp', 'Ptch1', 'Ccdc3', 'Mras',
                            'Arhgdib', 'Rgs16', 'Ndrg2', 'Acer3', 'Wnt6', 'Tln2', 'Akap12',
                            'Scd1', 'Homer2'],
                     'f1': ['Mbp', 'Mpz', 'Itga6', 'Cldn1', 'Sbspon', 'Cd59a', 'Ebf2', 'Itgb4',
                            'Gab1', 'Cryab', 'Fxyd6', 'Nr2f2', 'Cpe', 'Tmod2', 'Ndrg1',
                            'Igfbp6', 'Col5a3', 'Secisbp2l', 'Klf5', 'Scd1', 'Sfrp1', 'Dbi',
                            'Phlda3', 'Cxcl1', 'Pmp22', 'Adam10', 'Nr4a2', 'Utrn', 'Phlda1',
                            'Csrp1']}

# Marker genes per fibroblast axis (keys 'a' ... 'f').
# Passed as `dict_cats` to assign_cats with key_added='axis', grouping cells by obs['cluster'].
# The 'e' list is identical to dict_cats_clusters['e1'].
dict_cats_axes = {'a': ['Efhd1', 'Anxa3', 'Akr1c18', 'Smpd3', 'Aldh1a3', 'Sbsn', 'Wnt2',
                        'Chst1', 'Wnt10b', 'Sema3c', 'Limch1', 'Pla1a', 'Gfpt2', 'Emilin2',
                        'Il18', 'Dpp4', 'Itgb7', 'Ugdh', 'Prss23', 'Thbd', 'Sema3e',
                        'Pi16', 'Cd55', 'Tek', 'Fndc1', 'Aif1l', 'Ptgs2', 'Dbn1', 'Dmkn',
                        'Axl'],
                 'b': ['Cygb', 'Mgp', 'F3', 'Cxcl14', 'Hmcn2', 'Smoc2', 'Fxyd6', 'Meox1',
                        'Fbln7', 'Steap4', 'Cilp', 'Gpx3', 'Gas6', 'Ggt5', 'Abca8a',
                        'Podn', 'Thbs4', 'Igfbp7', 'Entpd2', 'Col8a1', 'Igf1', 'Nfib',
                        'Sfrp1', 'Bgn', 'Angpt4', 'Cxcl12', 'Ltbp4', 'Fzd4', 'Sparcl1',
                        'Col4a2'],
                 'c': ['Ppp1r14a', 'Ndufa4l2', 'Cdh4', 'Cldn10', 'Cpz', 'Aldh3a1',
                        'Pla2g5', 'Cgref1', 'Fgfr4', 'Csf1r', 'Grem1', 'Cib3', 'Sulf2',
                        'Pla2g2e', 'Adcy1', 'Tgfbi', 'Creb3l3', 'Col1a1', 'Rassf4',
                        'Col1a2', 'Miat', 'Igfbp2', 'Creb3l1', 'Gpha2', 'Tmem150c',
                        'Serpina3n', 'Sema3a', 'Rspo1', 'Sema3b', 'Col16a1'],
                 'd': ['Wif1', 'Trps1', 'Lamc3', 'Tpm2', 'Ncam1', 'Enpp2', 'a', 'Col23a1',
                        'Col11a1', 'Runx3', 'Ptger3', 'Robo2', 'Kif26b', 'Scube3', 'Myo1b',
                        'Crabp1', 'Lrrc15', 'Aqp1', 'Crabp2', 'Prlr', 'Prdm1', 'Mdk',
                        'Nav2', 'Inhba', 'Daam2', 'Edn3', 'Myo10', 'Aplp1', 'Kctd1',
                        'Gng2'],
                 'e': ['Cldn1', 'Ebf2', 'Klf5', 'Sbspon', 'Itga6', 'Nr2f2', 'Igfbp6',
                        'Itgb4', 'Mgp', 'Cav1', 'Gas6', 'Fxyd6', 'Tubb2b', 'Mfap5',
                        'Stxbp6', 'Phlda3', 'Lmo4', 'Lbp', 'Ptch1', 'Ccdc3', 'Mras',
                        'Arhgdib', 'Rgs16', 'Ndrg2', 'Acer3', 'Wnt6', 'Tln2', 'Akap12',
                        'Scd1', 'Homer2'],
                 'f': ['Mbp', 'Mpz', 'Itga6', 'Cldn1', 'Sbspon', 'Cd59a', 'Ebf2', 'Itgb4',
                        'Gab1', 'Cryab', 'Fxyd6', 'Nr2f2', 'Cpe', 'Tmod2', 'Ndrg1',
                        'Igfbp6', 'Col5a3', 'Secisbp2l', 'Klf5', 'Scd1', 'Sfrp1', 'Dbi',
                        'Phlda3', 'Cxcl1', 'Mras', 'Stmn1', 'Pmp22', 'Adam10', 'Nr4a2',
                        'Utrn']}

In [None]:
# Hex colour per cluster label; the keys are the same labels used in dict_cats_clusters.
# Used below to build uns['cluster_colors']; labels without an entry fall back to grey there.
dict_colors = {'a1': '#c93038', 'a2': '#e01f6c', 'a3': '#aa0044',
               'b1': '#ffa900', 'b2': '#ff6600', 'b3': '#d45500', 'b4': '#c87137', 'b5': '#ecaa87',
               'b/c': "#ddff55",
               'c1': '#b4d645', 'c2': '#51c43f', 'c3': '#309c63',
               'c/d': '#80ffb3',
               'd1': '#aaeeff', 'd2': '#2ad4ff', 'd3': '#5599ff', 'd4': '#0066ff',  
               'e1': '#ddafe9', 'f1': '#8d5fd3'
              }


# Persist the palette with IPython's %store magic.
%store dict_colors

In [None]:
# Gene-symbol substitutions (key -> value).
# NOTE(review): looks like current symbol -> older alias, for datasets annotated with legacy
# names; it is not used in the cells visible here — confirm where it is applied.
dict_rep = {'CCN5': 'WISP2', 'ECRG4': 'C2orf40'}

In [None]:
mpl.rcParams['figure.dpi'] = 150

In [None]:
# Root folder for all datasets: <current working directory>/data/ (note the trailing slash).
data_dir = os.getcwd() + '/data/'
print(data_dir)
# Persist the path with IPython's %store magic.
%store data_dir

**IMPORTANT: I am running this analysis on a computer with ~500 GB of RAM. I will load many datasets at once, which might be too much for some computers. I took this decision consciously, to have as much info available at any time as possible. If you cannot run the whole analysis at once, you can run it in parts.**

## Data extraction and processing

**Note: The notebook 1M also has a preprocessing of samples, but this preprocessing might be different in 2M. Also, preprocessing of new datasets will be included.**

## Abbasi et al. 2020

In [None]:
# Folder holding the Abbasi et al. 2020 files (the on-disk names are spelled 'abassi').
# os.path.join avoids the doubled separator that `data_dir + '/...'` produced,
# since data_dir already ends with '/'.
abbasi_2020_dir = os.path.join(data_dir, 'abassi_2020')

In [None]:
# Load the dataset (the file name suggests control samples) as an AnnData object.
abassi_2020_ctrl = sc.read(f"{abbasi_2020_dir}/abassi_2020_ctrl.h5")

In [None]:
# QC metrics: flag mitochondrial genes (symbols starting with 'mt-') and let scanpy add
# QC columns in place; n_genes_by_counts, total_counts and pct_counts_mt are used below.
abassi_2020_ctrl.var['mt'] = abassi_2020_ctrl.var_names.str.startswith('mt-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(abassi_2020_ctrl, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
# Visual QC used to choose the filtering thresholds of the next cell:
# distributions of genes per cell, total counts and mitochondrial percentage ...
sc.pl.violin(abassi_2020_ctrl, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

# ... and how the metrics relate to each other.
sc.pl.scatter(abassi_2020_ctrl, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(abassi_2020_ctrl, x='log1p_total_counts', y='log1p_n_genes_by_counts')

In [None]:
# Keep cells with 1100 < n_genes_by_counts < 3000 and less than 9 % mitochondrial counts.
# All criteria go into one boolean mask and the subset is materialised with .copy():
# chained slicing left a view of a view, which the in-place preprocessing that follows
# (filter_genes, normalize_total, log1p) would have to copy implicitly.
abassi_2020_keep = (
    (abassi_2020_ctrl.obs.n_genes_by_counts < 3000)
    & (abassi_2020_ctrl.obs.n_genes_by_counts > 1100)
    & (abassi_2020_ctrl.obs.pct_counts_mt < 9)
).values
abassi_2020_ctrl = abassi_2020_ctrl[abassi_2020_keep, :].copy()

In [None]:
# Drop genes without any count, then library-size normalise and log-transform, all in place.
sc.pp.filter_genes(abassi_2020_ctrl, min_counts=1)
# No target_sum is given, so scanpy's default target is used.
sc.pp.normalize_total(abassi_2020_ctrl)
sc.pp.log1p(abassi_2020_ctrl)

In [None]:
# PCA (50 components) followed by a cosine kNN graph; the number of neighbours scales
# with dataset size (half the square root of the number of cells).
sc.pp.pca(abassi_2020_ctrl, random_state=seed, n_comps=50)
sc.pp.neighbors(abassi_2020_ctrl, n_neighbors=int(0.5 * len(abassi_2020_ctrl) ** 0.5), metric='cosine')
# triku feature selection.
# NOTE(review): PCA and the neighbour graph are not recomputed after this call, so the
# UMAP/Leiden below appear to use the pre-selection graph — confirm this is intended.
tk.tl.triku(abassi_2020_ctrl)

In [None]:
sc.tl.umap(abassi_2020_ctrl, min_dist=0.3, random_state=seed)

In [None]:
# Leiden clustering at resolution 2 with the shared seed; plotted as 'leiden' in the next cell.
sc.tl.leiden(abassi_2020_ctrl, resolution=2, random_state=seed)

In [None]:
# Label cells with the broad cell types of dict_cats_fb; the labels are read from
# obs['assigned_cats'] below.
assign_cats(abassi_2020_ctrl, dict_cats=dict_cats_fb, min_score=0.5)
# Inspect the labels next to the Leiden clusters and a few genes
# (Pdgfra, Lum and Col1a1 are 'fibro' markers in dict_cats_fb).
sc.pl.umap(abassi_2020_ctrl, color=['leiden', 'assigned_cats', 'Pdgfra', 'Lum', 'Col1a1', 'Coch', 'Vim'], legend_loc='on data', cmap=magma, ncols=2, 
           use_raw=False, )

In [None]:
# Keep only the cells labelled as fibroblasts ('fibro' / 'fibro_2').
# .copy() makes the subset an independent AnnData rather than a view of the parent, so the
# in-place steps that follow (filter_genes, pca, ...) do not rely on an implicit view copy.
abassi_2020_ctrl_fb = abassi_2020_ctrl[abassi_2020_ctrl.obs['assigned_cats'].isin(['fibro', 'fibro_2'])].copy()

In [None]:
# Drop genes that are (nearly) absent from the fibroblast subset.
# NOTE(review): the data were already normalised and log-transformed at this point, so
# min_counts=1 is applied to transformed values rather than raw counts — confirm intent.
sc.pp.filter_genes(abassi_2020_ctrl_fb, min_counts=1)

In [None]:
# Same embedding recipe as for the full dataset, with 25 principal components.
# NOTE(review): triku was already run on the parent object, so var may carry its feature
# selection into this PCA (scanpy's pca can use a 'highly_variable' column when present)
# — confirm which genes enter the PCA.
sc.pp.pca(abassi_2020_ctrl_fb, random_state=seed, n_comps=25)
sc.pp.neighbors(abassi_2020_ctrl_fb, n_neighbors=int(0.5 * len(abassi_2020_ctrl_fb) ** 0.5), metric='cosine')
tk.tl.triku(abassi_2020_ctrl_fb)

In [None]:
sc.tl.umap(abassi_2020_ctrl_fb, random_state=seed)

In [None]:
# Leiden clustering of the fibroblasts at resolution 5; these 'leiden' groups are the
# `groupby` of rank_genes_groups below.
sc.tl.leiden(abassi_2020_ctrl_fb, resolution=5, random_state=seed)

In [None]:
sc.pl.umap(abassi_2020_ctrl_fb, color=['leiden'], 
           legend_loc='on data', cmap=magma, use_raw=False, ncols=2)

In [None]:
# Rank genes for each Leiden cluster with the Wilcoxon method.
sc.tl.rank_genes_groups(abassi_2020_ctrl_fb, groupby='leiden', method='wilcoxon')

In [None]:
sc.pl.umap(abassi_2020_ctrl_fb, color=['leiden'] + ['Ccl19', 'Ndufa4l2', 'Mfap4', 'Eln', 'Tnmd', 'Cyp2f2', 'Tsc22d3', 'Rtn4r', 'Ltbp2', 'Gpm6b', 'Fam180a', 'Lsamp', 'Pid1', 'Lgr5', 'Sectm1a', 'Prkcb', 'Entpd1',  'Serpina3c', 'Myo1b', 'H2-Q7', 'Bmp4', 'Cystm1', 'Cyp1b1', 'Wnt5b', 'Id1', 'Syt13', 'Etv1', 'Eya1'], 
           legend_loc='on data', cmap=magma, use_raw=False, ncols=3)

In [None]:
# Step 1: assign a fibroblast cluster from dict_cats_clusters; labels go to obs['cluster'].
# others_name='U' is presumably the label for cells matching no category — confirm.
assign_cats(abassi_2020_ctrl_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.7, key_added='cluster', others_name='U')
# Step 2: assign an axis from dict_cats_axes, grouping cells by the cluster labels from
# step 1; labels go to obs['axis'].
assign_cats(abassi_2020_ctrl_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, key_added='axis', intermediate_states=True, diff=0.15, others_name='U')

In [None]:
# One colour per cluster label present in the data, in sorted label order;
# labels without an entry in dict_colors fall back to grey.
abassi_2020_ctrl_fb.uns['cluster_colors'] = [
    dict_colors.get(cluster_label, '#bcbcbc')
    for cluster_label in sorted(set(abassi_2020_ctrl_fb.obs['cluster']))
]

In [None]:
sc.pl.umap(abassi_2020_ctrl_fb, color=['Internal sample identifier', 'leiden', 'axis', 'cluster'], legend_loc='on data', 
           cmap=magma, use_raw=False, ncols=2)

In [None]:
# Remove the obs columns 'a'-'d' (presumably per-axis columns written by assign_cats — confirm).
# The membership check lets the cell be re-run without a KeyError once the columns are gone.
# NOTE(review): dict_cats_axes also defines 'e' and 'f'; confirm whether those should be removed too.
for cluster in ['a', 'b', 'c', 'd']:
    if cluster in abassi_2020_ctrl_fb.obs.columns:
        del abassi_2020_ctrl_fb.obs[cluster]

In [None]:
# For every cluster of dict_cats_clusters, plot its marker genes (only those present
# in this dataset) next to the assigned cluster labels.
for cluster_name, marker_genes in dict_cats_clusters.items():
    print(cluster_name)
    genes_present = [gene for gene in marker_genes if gene in abassi_2020_ctrl_fb.var_names]
    sc.pl.umap(
        abassi_2020_ctrl_fb,
        color=['cluster'] + genes_present,
        legend_loc='on data',
        cmap=magma,
        use_raw=False,
        ncols=4,
    )

In [None]:
plot_score_graph(abassi_2020_ctrl_fb)

In [None]:
# clear_adata is a project helper imported from fb_functions (implementation not shown here).
clear_adata(abassi_2020_ctrl_fb)
# Save the processed fibroblast object; the next cell reads the same file back.
abassi_2020_ctrl_fb.write_h5ad(abbasi_2020_dir + '/abassi_2020_ctrl_fb_processed.h5')

In [None]:
abassi_2020_ctrl_fb = sc.read(abbasi_2020_dir + '/abassi_2020_ctrl_fb_processed.h5')

## Buechler et al. 2021

In [None]:
# Folder holding the Buechler et al. 2021 files.
# os.path.join avoids the doubled separator that `data_dir + '/...'` produced,
# since data_dir already ends with '/'.
buechler_2021_dir = os.path.join(data_dir, 'buechler_2021')

In [None]:
buechler_2021_ctrl = sc.read(buechler_2021_dir + '/buechler_2021_ctrl.h5')
buechler_2021_ctrl.var_names_make_unique()

In [None]:
# Basic QC filtering
buechler_2021_ctrl.var['mt'] = buechler_2021_ctrl.var_names.str.startswith('mt-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(buechler_2021_ctrl, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(buechler_2021_ctrl, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(buechler_2021_ctrl, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(buechler_2021_ctrl, x='total_counts', y='n_genes_by_counts')

In [None]:
# Keep cells with 2000 < n_genes_by_counts < 4000 and less than 12 % mitochondrial counts.
# All criteria go into one boolean mask and the subset is materialised with .copy():
# chained slicing left a view of a view, which the in-place preprocessing that follows
# (filter_genes, normalize_total, log1p) would have to copy implicitly.
buechler_2021_keep = (
    (buechler_2021_ctrl.obs.n_genes_by_counts < 4000)
    & (buechler_2021_ctrl.obs.n_genes_by_counts > 2000)
    & (buechler_2021_ctrl.obs.pct_counts_mt < 12)
).values
buechler_2021_ctrl = buechler_2021_ctrl[buechler_2021_keep, :].copy()

In [None]:
sc.pp.filter_genes(buechler_2021_ctrl, min_counts=1)
sc.pp.normalize_total(buechler_2021_ctrl)
sc.pp.log1p(buechler_2021_ctrl)

In [None]:
# PCA (50 components) followed by a cosine kNN graph.
sc.pp.pca(buechler_2021_ctrl, random_state=seed, n_comps=50)
# `0.5 * n ** 0.5 // 2` evaluates as floor((0.5 * sqrt(n)) / 2), i.e. about sqrt(n) / 4 neighbours.
sc.pp.neighbors(buechler_2021_ctrl, n_neighbors=int(0.5 * len(buechler_2021_ctrl) ** 0.5 // 2), metric='cosine')
# triku feature selection, run after the graph is built.
tk.tl.triku(buechler_2021_ctrl)

In [None]:
sc.tl.umap(buechler_2021_ctrl, min_dist=0.2, random_state=seed)
sc.tl.leiden(buechler_2021_ctrl, resolution=4, random_state=seed)

In [None]:
# fraction=1 keeps every cell but redraws them in random order, i.e. it shuffles the cells
# (presumably so that no cluster is systematically drawn on top in the UMAP — confirm).
# The shared `seed` replaces a hard-coded 0 so the notebook has a single seed to change.
sc.pp.subsample(buechler_2021_ctrl, fraction=1, random_state=seed, copy=False)
sc.pl.umap(buechler_2021_ctrl, color=['leiden'], legend_loc='on data')

In [None]:
assign_cats(buechler_2021_ctrl, dict_cats=dict_cats_fb, min_score=0.35, quantile_gene_sel=0.85)

In [None]:
sc.pl.umap(buechler_2021_ctrl, color=['leiden', 'assigned_cats'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# Keep only the cells labelled as fibroblasts ('fibro' / 'fibro_2').
# .copy() makes the subset an independent AnnData rather than a view of the parent, so the
# in-place steps that follow (filter_genes, pca, ...) do not rely on an implicit view copy.
buechler_2021_ctrl_fb = buechler_2021_ctrl[buechler_2021_ctrl.obs['assigned_cats'].isin(['fibro', 'fibro_2'])].copy()

In [None]:
sc.pp.filter_genes(buechler_2021_ctrl_fb, min_counts=1)

In [None]:
sc.pp.pca(buechler_2021_ctrl_fb, random_state=seed, n_comps=30)
sc.pp.neighbors(buechler_2021_ctrl_fb,  n_neighbors=int(len(buechler_2021_ctrl_fb) ** 0.5 // 6), metric='cosine')
tk.tl.triku(buechler_2021_ctrl_fb)

In [None]:
sc.tl.umap(buechler_2021_ctrl_fb, min_dist=0.1, random_state=seed)

In [None]:
# Leiden clustering of the fibroblast subset at resolution 10.
sc.tl.leiden(buechler_2021_ctrl_fb, resolution=10, random_state=seed)
# Coarser alternative, kept for reference but disabled:
# sc.tl.leiden(buechler_2021_ctrl_fb, resolution=0.2, random_state=seed)

In [None]:
sc.pl.umap(buechler_2021_ctrl_fb, color=['leiden', 'Itga6', 'Lamc3'], legend_loc='on data', cmap=magma, use_raw=False, ncols=2)

In [None]:
assign_cats(buechler_2021_ctrl_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.7, key_added='cluster', others_name='U')
assign_cats(buechler_2021_ctrl_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, key_added='axis', intermediate_states=True, diff=0.15, others_name='U')

In [None]:
# One colour per cluster label present in the data, in sorted label order;
# labels without an entry in dict_colors fall back to grey.
buechler_2021_ctrl_fb.uns['cluster_colors'] = [
    dict_colors.get(cluster_label, '#bcbcbc')
    for cluster_label in sorted(set(buechler_2021_ctrl_fb.obs['cluster']))
]

In [None]:
sc.pl.umap(buechler_2021_ctrl_fb, color=['Internal sample identifier', 'leiden', 'axis', 'cluster'], legend_loc='on data', 
           cmap=magma, use_raw=False, ncols=3)

In [None]:
# Remove the obs columns 'a'-'d' (presumably per-axis columns written by assign_cats — confirm).
# The membership check lets the cell be re-run without a KeyError once the columns are gone.
# NOTE(review): dict_cats_axes also defines 'e' and 'f'; confirm whether those should be removed too.
for cluster in ['a', 'b', 'c', 'd']:
    if cluster in buechler_2021_ctrl_fb.obs.columns:
        del buechler_2021_ctrl_fb.obs[cluster]

In [None]:
# For every cluster of dict_cats_clusters, plot its marker genes (only those present
# in this dataset) next to the assigned cluster labels.
for cluster_name, marker_genes in dict_cats_clusters.items():
    print(cluster_name)
    genes_present = [gene for gene in marker_genes if gene in buechler_2021_ctrl_fb.var_names]
    sc.pl.umap(
        buechler_2021_ctrl_fb,
        color=['cluster'] + genes_present,
        legend_loc='on data',
        cmap=magma,
        use_raw=False,
        ncols=4,
    )

In [None]:
plot_score_graph(buechler_2021_ctrl_fb)

In [None]:
clear_adata(buechler_2021_ctrl_fb)
buechler_2021_ctrl_fb.write_h5ad(buechler_2021_dir + '/buechler_2021_ctrl_fb_processed.h5')

In [None]:
buechler_2021_ctrl_fb = sc.read(buechler_2021_dir + '/buechler_2021_ctrl_fb_processed.h5')

## Efremova, Mirjana 2018

In [None]:
# Folder holding the Efremova 2018 files.
# os.path.join avoids the doubled separator that `data_dir + '/...'` produced,
# since data_dir already ends with '/'.
efremova_2018_dir = os.path.join(data_dir, 'efremova_2018')

In [None]:
efremova_2018_ctrl = sc.read(f"{efremova_2018_dir}/efremova_2018_ctrl.h5")

In [None]:
# Basic QC filtering
efremova_2018_ctrl.var['mt'] = efremova_2018_ctrl.var_names.str.startswith('mt-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(efremova_2018_ctrl, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(efremova_2018_ctrl, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(efremova_2018_ctrl, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(efremova_2018_ctrl, x='total_counts', y='n_genes_by_counts')

In [None]:
# Distribution of log1p(number of detected genes) for each sample.
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': efremova_2018_ctrl.obs['Internal sample identifier'], 'y': efremova_2018_ctrl.obs['log1p_n_genes_by_counts']})
sns.violinplot(x='x', y='y', data=df, ax=ax)
# Label the axes with the plotted obs columns instead of the placeholder names 'x' / 'y'.
ax.set(xlabel='Internal sample identifier', ylabel='log1p_n_genes_by_counts');

In [None]:
# Distribution of the mitochondrial count percentage for each sample.
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': efremova_2018_ctrl.obs['Internal sample identifier'], 'y': efremova_2018_ctrl.obs['pct_counts_mt']})
sns.violinplot(x='x', y='y', data=df, ax=ax)
# Label the axes with the plotted obs columns instead of the placeholder names 'x' / 'y'.
ax.set(xlabel='Internal sample identifier', ylabel='pct_counts_mt');

In [None]:
# Keep only sample '0', cells with 6.5 < log1p_n_genes_by_counts < 8.3 and less than
# 9 % mitochondrial counts.
# All criteria go into one boolean mask and the subset is materialised with .copy():
# chained slicing left a view of a view, which the in-place preprocessing that follows
# (filter_genes, normalize_total, log1p) would have to copy implicitly.
efremova_2018_keep = (
    (efremova_2018_ctrl.obs['Internal sample identifier'] == '0')
    & (efremova_2018_ctrl.obs.log1p_n_genes_by_counts < 8.3)
    & (efremova_2018_ctrl.obs.log1p_n_genes_by_counts > 6.5)
    & (efremova_2018_ctrl.obs.pct_counts_mt < 9)
).values
efremova_2018_ctrl = efremova_2018_ctrl[efremova_2018_keep, :].copy()

In [None]:
sc.pp.filter_genes(efremova_2018_ctrl, min_counts=1)
sc.pp.normalize_total(efremova_2018_ctrl)
sc.pp.log1p(efremova_2018_ctrl)

In [None]:
sc.pp.pca(efremova_2018_ctrl, random_state=seed, n_comps=50)
sc.pp.neighbors(efremova_2018_ctrl, n_neighbors=int(0.5 * len(efremova_2018_ctrl) ** 0.5), metric='cosine')
tk.tl.triku(efremova_2018_ctrl)

In [None]:
sc.tl.umap(efremova_2018_ctrl, min_dist=0.3, random_state=seed)

In [None]:
sc.tl.leiden(efremova_2018_ctrl, resolution=2, random_state=seed)

In [None]:
assign_cats(efremova_2018_ctrl, dict_cats=dict_cats_fb, min_score=0.5)
sc.pl.umap(efremova_2018_ctrl, color=['leiden', 'Internal sample identifier', 'assigned_cats', 'Pdgfra', 'Lum', 'Col1a1', 'Coch', 'Vim'], legend_loc='on data', cmap=magma, ncols=2, 
           use_raw=False, )

In [None]:
# Keep only the cells labelled as fibroblasts ('fibro' / 'fibro_2').
# .copy() makes the subset an independent AnnData rather than a view of the parent, so the
# in-place steps that follow (filter_genes, pca, ...) do not rely on an implicit view copy.
efremova_2018_ctrl_fb = efremova_2018_ctrl[efremova_2018_ctrl.obs['assigned_cats'].isin(['fibro', 'fibro_2'])].copy()

In [None]:
sc.pp.filter_genes(efremova_2018_ctrl_fb, min_counts=1)

In [None]:
sc.pp.pca(efremova_2018_ctrl_fb, random_state=seed, n_comps=25)
sc.pp.neighbors(efremova_2018_ctrl_fb, n_neighbors=int(0.5 * len(efremova_2018_ctrl_fb) ** 0.5), metric='cosine')
tk.tl.triku(efremova_2018_ctrl_fb)

In [None]:
sc.tl.umap(efremova_2018_ctrl_fb, random_state=seed)

In [None]:
sc.tl.leiden(efremova_2018_ctrl_fb, resolution=0.6, random_state=seed)

In [None]:
sc.pl.umap(efremova_2018_ctrl_fb, color=['leiden', 'Internal sample identifier'], 
           legend_loc='on data', cmap=magma, use_raw=False, ncols=2)

In [None]:
sc.tl.rank_genes_groups(efremova_2018_ctrl_fb, groupby='leiden', method='wilcoxon')

In [None]:
# Plot the top 250 ranked genes of each Leiden cluster next to the cluster map.
# One loop replaces six copy-pasted cells that differed only in the cluster id; the cluster
# order of the original cells is kept. Ids missing from the current clustering are skipped
# instead of raising, so the cell survives a change in the number of Leiden clusters.
efremova_ranked_names = efremova_2018_ctrl_fb.uns['rank_genes_groups']['names']
for leiden_group in ['0', '2', '3', '1', '4', '5']:
    if leiden_group not in efremova_ranked_names.dtype.names:
        continue
    print(leiden_group)
    sc.pl.umap(efremova_2018_ctrl_fb, color=['leiden'] + list(efremova_ranked_names[leiden_group][:250]),
               legend_loc='on data', cmap=magma, use_raw=False, ncols=3)

In [None]:
assign_cats(efremova_2018_ctrl_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.6, key_added='cluster', others_name='U')
assign_cats(efremova_2018_ctrl_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, key_added='axis', intermediate_states=True, diff=0.15, others_name='U')

In [None]:
# One colour per cluster label present in the data, in sorted label order;
# labels without an entry in dict_colors fall back to grey.
efremova_2018_ctrl_fb.uns['cluster_colors'] = [
    dict_colors.get(cluster_label, '#bcbcbc')
    for cluster_label in sorted(set(efremova_2018_ctrl_fb.obs['cluster']))
]

In [None]:
sc.pl.umap(efremova_2018_ctrl_fb, color=['Internal sample identifier', 'leiden', 'cluster', 'axis'], legend_loc='on data', 
           cmap=magma, use_raw=False, ncols=3)

In [None]:
# Remove the obs columns 'a'-'d' (presumably per-axis columns written by assign_cats — confirm).
# The membership check lets the cell be re-run without a KeyError once the columns are gone.
# NOTE(review): dict_cats_axes also defines 'e' and 'f'; confirm whether those should be removed too.
for cluster in ['a', 'b', 'c', 'd']:
    if cluster in efremova_2018_ctrl_fb.obs.columns:
        del efremova_2018_ctrl_fb.obs[cluster]

In [None]:
# For every cluster of dict_cats_clusters, plot its marker genes (only those present
# in this dataset) next to the assigned cluster labels.
for cluster_name, marker_genes in dict_cats_clusters.items():
    print(cluster_name)
    genes_present = [gene for gene in marker_genes if gene in efremova_2018_ctrl_fb.var_names]
    sc.pl.umap(
        efremova_2018_ctrl_fb,
        color=['cluster'] + genes_present,
        legend_loc='on data',
        cmap=magma,
        use_raw=False,
        ncols=4,
    )

In [None]:
plot_score_graph(efremova_2018_ctrl_fb)

In [None]:
clear_adata(efremova_2018_ctrl_fb)
efremova_2018_ctrl_fb.write_h5ad(efremova_2018_dir + '/efremova_2018_ctrl_fb_processed.h5')

In [None]:
efremova_2018_ctrl_fb = sc.read(efremova_2018_dir + '/efremova_2018_ctrl_fb_processed.h5')

## Haensel et al. 2021

**CAUTION** We include **wounded and unwounded** samples! We have seen that the B1 population (B general population) from this dataset was almost depleted in the unwounded samples, and appeared with more heterogeneity in the wounded state. However, these populations appear in other datasets such as Abbasi and Shook (at homeostatic states), so we are going to keep them.

In [None]:
# Folder holding the Haensel et al. 2021 files.
# os.path.join avoids the doubled separator that `data_dir + '/...'` produced,
# since data_dir already ends with '/'.
haensel_2021_dir = os.path.join(data_dir, 'haensel_2021')

In [None]:
haensel_2021_ctrl = sc.read(f"{haensel_2021_dir}/haensel_2021_ctrl.h5")
haensel_2021_wounding = sc.read(f"{haensel_2021_dir}/haensel_2021_wounding.h5")

In [None]:
# Stack the control and wounding datasets into one AnnData; the new obs column 'Condition'
# records the origin of each cell ('Healthy' / 'Wounding').
# NOTE(review): AnnData.concatenate is reported as deprecated in recent anndata releases in
# favour of anndata.concat — check against the installed version before upgrading.
haensel_2021_ctrl_wounding = sc.AnnData.concatenate(haensel_2021_ctrl, haensel_2021_wounding, batch_categories=['Healthy', 'Wounding'],
                                           batch_key='Condition')


In [None]:
# Basic QC filtering
haensel_2021_ctrl_wounding.var['mt'] = haensel_2021_ctrl_wounding.var_names.str.startswith('mt-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(haensel_2021_ctrl_wounding, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(haensel_2021_ctrl_wounding, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(haensel_2021_ctrl_wounding, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(haensel_2021_ctrl_wounding, x='total_counts', y='n_genes_by_counts')

In [None]:
# Distribution of log1p(number of detected genes) for each sample.
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': haensel_2021_ctrl_wounding.obs['Internal sample identifier'], 'y': haensel_2021_ctrl_wounding.obs['log1p_n_genes_by_counts']})
sns.violinplot(x='x', y='y', data=df, ax=ax)
# Label the axes with the plotted obs columns instead of the placeholder names 'x' / 'y'.
ax.set(xlabel='Internal sample identifier', ylabel='log1p_n_genes_by_counts');

In [None]:
# Distribution of the mitochondrial count percentage for each sample.
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': haensel_2021_ctrl_wounding.obs['Internal sample identifier'], 'y': haensel_2021_ctrl_wounding.obs['pct_counts_mt']})
sns.violinplot(x='x', y='y', data=df, ax=ax)
# Label the axes with the plotted obs columns instead of the placeholder names 'x' / 'y'.
ax.set(xlabel='Internal sample identifier', ylabel='pct_counts_mt');

In [None]:
# Keep cells with less than 10 % mitochondrial counts and 7.2 < log1p_n_genes_by_counts < 8.1.
# .copy() materialises the subset: without it the result is a view of the concatenated
# object, which the in-place preprocessing that follows (filter_genes, normalize_total,
# log1p) would have to copy implicitly.
haensel_2021_keep = (
    (haensel_2021_ctrl_wounding.obs.pct_counts_mt < 10)
    & (haensel_2021_ctrl_wounding.obs.log1p_n_genes_by_counts > 7.2)
    & (haensel_2021_ctrl_wounding.obs.log1p_n_genes_by_counts < 8.1)
).values
haensel_2021_ctrl_wounding = haensel_2021_ctrl_wounding[haensel_2021_keep, :].copy()

In [None]:
sc.pp.filter_genes(haensel_2021_ctrl_wounding, min_counts=1)
sc.pp.normalize_total(haensel_2021_ctrl_wounding)
sc.pp.log1p(haensel_2021_ctrl_wounding)

In [None]:
# PCA (50 components), then Harmony integration across samples (up to 50 iterations).
sc.pp.pca(haensel_2021_ctrl_wounding, random_state=seed, n_comps=50)
sce.pp.harmony_integrate(haensel_2021_ctrl_wounding, key='Internal sample identifier', max_iter_harmony=50)
# The neighbour graph is built on the Harmony-corrected components ('X_pca_harmony');
# `0.5 * n ** 0.5 // 4` evaluates as floor((0.5 * sqrt(n)) / 4), i.e. about sqrt(n) / 8 neighbours.
sc.pp.neighbors(haensel_2021_ctrl_wounding, use_rep='X_pca_harmony', n_neighbors=int(0.5 * len(haensel_2021_ctrl_wounding) ** 0.5 // 4), metric='cosine')
# triku feature selection, run after the graph is built.
tk.tl.triku(haensel_2021_ctrl_wounding)

In [None]:
sc.tl.umap(haensel_2021_ctrl_wounding, min_dist=0.2, random_state=seed)

In [None]:
sc.tl.leiden(haensel_2021_ctrl_wounding, resolution=0.2, random_state=seed)

In [None]:
# Score the cells against the marker sets in dict_cats_fb. The labels are read from
# obs['assigned_cats'] below — presumably assign_cats' default output key (TODO confirm).
assign_cats(haensel_2021_ctrl_wounding, dict_cats=dict_cats_fb, min_score=0.4, quantile_gene_sel=0.4)
sc.pl.umap(haensel_2021_ctrl_wounding, color=['leiden', 'assigned_cats', 'Internal sample identifier'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# Keep only the cells labelled as fibroblasts ('fibro' / 'fibro_2').
# `.copy()` turns the AnnData view into an independent object, because the following cells
# modify it in place (filter_genes, pca, ...), which on a view triggers an implicit copy and a warning.
haensel_2021_ctrl_wounding_fb = haensel_2021_ctrl_wounding[
    haensel_2021_ctrl_wounding.obs['assigned_cats'].isin(['fibro', 'fibro_2'])
].copy()

In [None]:
sc.pp.filter_genes(haensel_2021_ctrl_wounding_fb, min_counts=1)

In [None]:
# Same pipeline on the fibroblast subset: PCA -> Harmony -> kNN graph -> triku.
sc.pp.pca(haensel_2021_ctrl_wounding_fb, n_comps=35, random_state=seed)
sce.pp.harmony_integrate(
    haensel_2021_ctrl_wounding_fb,
    key='Internal sample identifier',
    max_iter_harmony=50,
)
# Neighbourhood size grows with the square root of the number of cells.
haensel_fb_n_neighbors = int(len(haensel_2021_ctrl_wounding_fb) ** 0.5 // 7)
sc.pp.neighbors(
    haensel_2021_ctrl_wounding_fb,
    n_neighbors=haensel_fb_n_neighbors,
    use_rep='X_pca_harmony',
    metric='cosine',
)
tk.tl.triku(haensel_2021_ctrl_wounding_fb)

In [None]:
sc.tl.umap(haensel_2021_ctrl_wounding_fb, min_dist=0.2, random_state=seed)

In [None]:
# Leiden clustering of the fibroblast subset.
# This cell used to run three resolutions in a row (0.2, 2.4, 5). No key_added was passed, so
# each call overwrote the same obs['leiden'] column and only the last one (resolution=5) was
# visible downstream. The two overwritten runs were dead computation and have been removed.
sc.tl.leiden(haensel_2021_ctrl_wounding_fb, resolution=5, random_state=seed)

In [None]:
sc.pl.umap(haensel_2021_ctrl_wounding_fb, color=['leiden'], cmap=magma, use_raw=False, legend_loc='on data', ncols=1)
sc.pl.umap(haensel_2021_ctrl_wounding_fb, color=['Internal sample identifier'], cmap=magma, use_raw=False, ncols=1)

In [None]:
# Rank marker genes for each Leiden cluster.
# NOTE(review): no `method` is passed here, so scanpy's default test is used, whereas the other
# datasets in this notebook pass method='wilcoxon' — confirm this difference is intended.
sc.tl.rank_genes_groups(haensel_2021_ctrl_wounding_fb, groupby='leiden')

In [None]:
# Flag cells of the fibroblast subset that match a keratinocyte-like or an immune-like
# signature; the label goes to obs['clusterx'], with 'U' for everything else. Only the 'U'
# cells are kept in a later cell.
# 'Sfn' was listed twice in the 'krt-like' signature; the duplicate is removed so each marker
# appears once. NOTE(review): if assign_cats weights genes by occurrence this can shift the
# score slightly — re-check the 'clusterx' UMAP after running.
assign_cats(
    haensel_2021_ctrl_wounding_fb,
    dict_cats={
        'krt-like': ['Lgals7', 'Fxyd3', 'Perp', 'Krt15', 'S100a14', 'Sfn', 'Krt5', 'Anxa8', 'Ly6d'],
        'immune-like': ['Fcer1g', 'Tyrobp', 'Srgn', 'Cd52', 'Cxcl2'],
    },
    min_score=0.99,
    quantile_gene_sel=0.99,
    key_added='clusterx',
    others_name='U',
)

In [None]:
sc.pl.umap(haensel_2021_ctrl_wounding_fb, color=['Internal sample identifier', 'leiden', 'clusterx'], legend_loc='on data', 
           cmap=magma, use_raw=False, ncols=3)

In [None]:
# Keep only the cells that matched neither contamination signature (clusterx == 'U').
# `.copy()` avoids continuing on an AnnData view: the next cell writes new obs columns through
# assign_cats, which on a view forces an implicit copy and raises ImplicitModificationWarning.
haensel_2021_ctrl_wounding_fb = haensel_2021_ctrl_wounding_fb[
    haensel_2021_ctrl_wounding_fb.obs['clusterx'] == 'U'
].copy()

In [None]:
# Two-step labelling with the shared signatures:
#   1. cluster signatures (dict_cats_clusters) -> obs['cluster']
#   2. axis signatures (dict_cats_axes), grouped by the cluster labels from step 1 -> obs['axis']
# 'U' is passed as others_name, presumably the label for whatever stays unassigned (TODO confirm).
assign_cats(haensel_2021_ctrl_wounding_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.85, key_added='cluster', others_name='U')
assign_cats(haensel_2021_ctrl_wounding_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, key_added='axis', intermediate_states=True, diff=0.15, others_name='U')

In [None]:
# One colour per cluster label, in sorted label order; labels without an entry in
# dict_colors fall back to grey.
haensel_2021_ctrl_wounding_fb.uns['cluster_colors'] = [
    dict_colors.get(cluster_label, '#bcbcbc')
    for cluster_label in sorted(set(haensel_2021_ctrl_wounding_fb.obs['cluster']))
]

In [None]:
sc.pl.umap(haensel_2021_ctrl_wounding_fb, color=['leiden', 'cluster', 'axis'], legend_loc='on data', cmap=magma, use_raw=False, ncols=3)

In [None]:
# Remove the obs columns named 'a'-'d' (presumably per-axis scores left by assign_cats — TODO confirm).
# The membership check makes the cell re-runnable: a bare `del` raises KeyError the second
# time it runs, or whenever one of the columns was never created.
for cluster in ['a', 'b', 'c', 'd']:
    if cluster in haensel_2021_ctrl_wounding_fb.obs:
        del haensel_2021_ctrl_wounding_fb.obs[cluster]

In [None]:
# For each cluster signature, show the cluster labels next to the signature genes
# that are present in this dataset.
for key, val in dict_cats_clusters.items():
    print(key)
    genes_present = [gene for gene in val if gene in haensel_2021_ctrl_wounding_fb.var_names]
    sc.pl.umap(
        haensel_2021_ctrl_wounding_fb,
        color=['cluster'] + genes_present,
        legend_loc='on data',
        cmap=magma,
        use_raw=False,
        ncols=4,
    )

In [None]:
plot_score_graph(haensel_2021_ctrl_wounding_fb)

In [None]:
# Clean the object with the project helper clear_adata (imported from fb_functions) and
# cache the processed fibroblast subset on disk; the next cell reloads it from this file.
clear_adata(haensel_2021_ctrl_wounding_fb)
haensel_2021_ctrl_wounding_fb.write_h5ad(haensel_2021_dir + '/haensel_2021_ctrl_wounding_fb_processed.h5')

In [None]:
haensel_2021_ctrl_wounding_fb = sc.read(haensel_2021_dir + '/haensel_2021_ctrl_wounding_fb_processed.h5')

## Phan 2020

In [None]:
phan_2020_dir = data_dir + '/phan_2020'

In [None]:
phan_2020_ctrl_21d = sc.read(f"{phan_2020_dir}/phan_2020_ctrl_21d.h5")

In [None]:
# Basic QC filtering
phan_2020_ctrl_21d.var['mt'] = phan_2020_ctrl_21d.var_names.str.startswith('mt-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(phan_2020_ctrl_21d, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(phan_2020_ctrl_21d, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(phan_2020_ctrl_21d, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(phan_2020_ctrl_21d, x='total_counts', y='n_genes_by_counts')

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': phan_2020_ctrl_21d.obs['Internal sample identifier'], 'y': phan_2020_ctrl_21d.obs['log1p_n_genes_by_counts']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': phan_2020_ctrl_21d.obs['Internal sample identifier'], 'y': phan_2020_ctrl_21d.obs['pct_counts_mt']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
# Per-cell QC: keep cells from samples P21_Un_1 / P21_Un_2 / P21_Un_3 with a log1p gene count
# in (6.2, 8) and < 5% mitochondrial counts. The three samples use identical bounds, so the
# per-sample clauses and the separate mitochondrial filter are folded into a single mask
# (the same cells are selected).
# `.copy()` materialises the subset instead of leaving a view of a view: the next cell modifies
# the object in place, which on an AnnData view triggers an implicit copy and a warning.
phan_2020_ctrl_21d = phan_2020_ctrl_21d[
    (
        phan_2020_ctrl_21d.obs['Internal sample identifier'].isin(['P21_Un_1', 'P21_Un_2', 'P21_Un_3'])
        & (phan_2020_ctrl_21d.obs.log1p_n_genes_by_counts > 6.2)
        & (phan_2020_ctrl_21d.obs.log1p_n_genes_by_counts < 8)
        & (phan_2020_ctrl_21d.obs.pct_counts_mt < 5)
    ).values,
    :,
].copy()

In [None]:
sc.pp.filter_genes(phan_2020_ctrl_21d, min_counts=1)
sc.pp.normalize_total(phan_2020_ctrl_21d)
sc.pp.log1p(phan_2020_ctrl_21d)

In [None]:
# PCA -> Harmony integration across samples -> kNN graph on the Harmony embedding -> triku.
sc.pp.pca(phan_2020_ctrl_21d, n_comps=50, random_state=seed)
sce.pp.harmony_integrate(
    phan_2020_ctrl_21d,
    key='Internal sample identifier',
    max_iter_harmony=50,
)
# Neighbourhood size: half the square root of the number of cells.
phan_n_neighbors = int(0.5 * len(phan_2020_ctrl_21d) ** 0.5)
sc.pp.neighbors(
    phan_2020_ctrl_21d,
    n_neighbors=phan_n_neighbors,
    use_rep='X_pca_harmony',
    metric='cosine',
)
tk.tl.triku(phan_2020_ctrl_21d)

In [None]:
sc.tl.umap(phan_2020_ctrl_21d, min_dist=0.3, random_state=seed)

In [None]:
sc.tl.leiden(phan_2020_ctrl_21d, resolution=2, random_state=seed)

In [None]:
assign_cats(phan_2020_ctrl_21d, dict_cats=dict_cats_fb, min_score=0.5)
sc.pl.umap(phan_2020_ctrl_21d, color=['leiden', 'Internal sample identifier', 'assigned_cats', 'Pdgfra', 'Lum', 'Dcn', 'Mpz', 'Plp1', 'Sfrp5', 'Chil1'], legend_loc='on data', cmap=magma, ncols=2, 
           use_raw=False, )

In [None]:
# Keep only the cells labelled as fibroblasts ('fibro' / 'fibro_2').
# `.copy()` turns the AnnData view into an independent object, because the following cells
# modify it in place (filter_genes, pca, ...), which on a view triggers an implicit copy and a warning.
phan_2020_ctrl_21d_fb = phan_2020_ctrl_21d[
    phan_2020_ctrl_21d.obs['assigned_cats'].isin(['fibro', 'fibro_2'])
].copy()

In [None]:
sc.pp.filter_genes(phan_2020_ctrl_21d_fb, min_counts=1)

In [None]:
# Same pipeline on the fibroblast subset: PCA -> Harmony -> kNN graph -> triku.
sc.pp.pca(phan_2020_ctrl_21d_fb, n_comps=25, random_state=seed)
sce.pp.harmony_integrate(
    phan_2020_ctrl_21d_fb,
    key='Internal sample identifier',
    max_iter_harmony=50,
)
# Neighbourhood size grows with the square root of the number of cells.
phan_fb_n_neighbors = int(len(phan_2020_ctrl_21d_fb) ** 0.5 // 4)
sc.pp.neighbors(
    phan_2020_ctrl_21d_fb,
    n_neighbors=phan_fb_n_neighbors,
    use_rep='X_pca_harmony',
    metric='cosine',
)
tk.tl.triku(phan_2020_ctrl_21d_fb)

In [None]:
sc.tl.umap(phan_2020_ctrl_21d_fb, min_dist=0.45, random_state=seed)

In [None]:
sc.tl.leiden(phan_2020_ctrl_21d_fb, resolution=9, random_state=seed)

In [None]:
sc.pl.umap(phan_2020_ctrl_21d_fb, color=['leiden',  'Internal sample identifier'], 
           legend_loc='on data', cmap=magma, use_raw=False, ncols=3)

In [None]:
sc.tl.rank_genes_groups(phan_2020_ctrl_21d_fb, groupby='leiden', method='wilcoxon')

In [None]:
sc.pl.umap(phan_2020_ctrl_21d_fb, color=['leiden', 'Eln', 'Pi16', 'Nrep', 'Gas1', 'Cilp', 'Osr1', 'Hmcn1', 'Mmp16', 'Slit2', 'Ccl19', 'Tspan11', 'Rnf112', 'Col8a1', 'Tspan18', 'Il15', 'Mme', 'Col6a6', 'Fam69a',] , 
           legend_loc='on data', cmap=magma, use_raw=False, ncols=3)

In [None]:
assign_cats(phan_2020_ctrl_21d_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.9, key_added='cluster', others_name='U')
assign_cats(phan_2020_ctrl_21d_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, key_added='axis', intermediate_states=True, diff=0.15, others_name='U')

In [None]:
# One colour per cluster label, in sorted label order; labels without an entry in
# dict_colors fall back to grey.
phan_2020_ctrl_21d_fb.uns['cluster_colors'] = [
    dict_colors.get(cluster_label, '#bcbcbc')
    for cluster_label in sorted(set(phan_2020_ctrl_21d_fb.obs['cluster']))
]

In [None]:
sc.pl.umap(phan_2020_ctrl_21d_fb, color=['Internal sample identifier', 'leiden', 'cluster', 'axis'], legend_loc='on data', 
           cmap=magma, use_raw=False, ncols=3)

In [None]:
# Remove the obs columns named 'a'-'d' (presumably per-axis scores left by assign_cats — TODO confirm).
# The membership check makes the cell re-runnable: a bare `del` raises KeyError the second
# time it runs, or whenever one of the columns was never created.
for cluster in ['a', 'b', 'c', 'd']:
    if cluster in phan_2020_ctrl_21d_fb.obs:
        del phan_2020_ctrl_21d_fb.obs[cluster]

In [None]:
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(phan_2020_ctrl_21d_fb, color=['cluster'] + [i for i in val if i in phan_2020_ctrl_21d_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
plot_score_graph(phan_2020_ctrl_21d_fb)

In [None]:
clear_adata(phan_2020_ctrl_21d_fb)
phan_2020_ctrl_21d_fb.write_h5ad(phan_2020_dir + '/phan_2020_ctrl_21d_fb_processed.h5')

In [None]:
phan_2020_ctrl_21d_fb = sc.read(phan_2020_dir + '/phan_2020_ctrl_21d_fb_processed.h5')

## Salzer 2018

In [None]:
salzer_2018_dir = data_dir + '/salzer_2018'

In [None]:
salzer_2018_young_old = sc.read(f"{salzer_2018_dir}/salzer_2018_young_old.h5")

In [None]:
# Basic QC filtering
salzer_2018_young_old.var['mt'] = salzer_2018_young_old.var_names.str.startswith('mt-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(salzer_2018_young_old, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(salzer_2018_young_old, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(salzer_2018_young_old, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(salzer_2018_young_old, x='total_counts', y='n_genes_by_counts')

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': salzer_2018_young_old.obs['Internal sample identifier'], 'y': salzer_2018_young_old.obs['log1p_n_genes_by_counts']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': salzer_2018_young_old.obs['Internal sample identifier'], 'y': salzer_2018_young_old.obs['pct_counts_mt']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
# Per-cell QC: keep cells with a log1p gene count in (6.2, 8.2) and < 5% mitochondrial counts.
# Both filters are applied in one mask (the same cells are selected), and `.copy()` materialises
# the result instead of leaving a view of a view: the next cell modifies the object in place,
# which on an AnnData view triggers an implicit copy and a warning.
salzer_2018_young_old = salzer_2018_young_old[
    (
        (salzer_2018_young_old.obs.log1p_n_genes_by_counts > 6.2)
        & (salzer_2018_young_old.obs.log1p_n_genes_by_counts < 8.2)
        & (salzer_2018_young_old.obs.pct_counts_mt < 5)
    ).values,
    :,
].copy()

In [None]:
sc.pp.filter_genes(salzer_2018_young_old, min_counts=1)
sc.pp.normalize_total(salzer_2018_young_old)
sc.pp.log1p(salzer_2018_young_old)

In [None]:
sc.pp.pca(salzer_2018_young_old, random_state=seed, n_comps=50)
sce.pp.harmony_integrate(salzer_2018_young_old, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(salzer_2018_young_old, use_rep='X_pca_harmony', n_neighbors=int(0.5 * len(salzer_2018_young_old) ** 0.5), metric='cosine')
tk.tl.triku(salzer_2018_young_old)

In [None]:
sc.tl.umap(salzer_2018_young_old, min_dist=0.3, random_state=seed)

In [None]:
sc.tl.leiden(salzer_2018_young_old, resolution=0.5, random_state=seed)

In [None]:
assign_cats(salzer_2018_young_old, dict_cats=dict_cats_fb, min_score=0.5)
sc.pl.umap(salzer_2018_young_old, color=['leiden', 'Internal sample identifier', 'assigned_cats', 'Pdgfra', 'Lum', 'Dcn',], legend_loc='on data', cmap=magma, ncols=2, 
           use_raw=False, )

In [None]:
# Keep only the cells labelled 'fibro'.
# NOTE(review): the other datasets in this notebook keep both 'fibro' and 'fibro_2' — confirm
# that leaving 'fibro_2' out is intended here.
# `.copy()` turns the AnnData view into an independent object, because the following cells
# modify it in place (filter_genes, pca, ...), which on a view triggers an implicit copy and a warning.
salzer_2018_young_old_fb = salzer_2018_young_old[
    salzer_2018_young_old.obs['assigned_cats'].isin(['fibro'])
].copy()

In [None]:
sc.pp.filter_genes(salzer_2018_young_old_fb, min_counts=1)

In [None]:
sc.pp.pca(salzer_2018_young_old_fb, random_state=seed, n_comps=50)
sce.pp.harmony_integrate(salzer_2018_young_old_fb, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(salzer_2018_young_old_fb, use_rep='X_pca_harmony', n_neighbors=int(0.5 * len(salzer_2018_young_old_fb) ** 0.5), metric='cosine')
tk.tl.triku(salzer_2018_young_old_fb)

In [None]:
sc.tl.umap(salzer_2018_young_old_fb, min_dist=0.45, random_state=seed)

In [None]:
sc.tl.leiden(salzer_2018_young_old_fb, resolution=2, random_state=seed)

In [None]:
sc.pl.umap(salzer_2018_young_old_fb, color=['leiden',  'Internal sample identifier'], 
           legend_loc='on data', cmap=magma, use_raw=False, ncols=3)

In [None]:
sc.tl.rank_genes_groups(salzer_2018_young_old_fb, groupby='leiden', method='wilcoxon')

In [None]:
assign_cats(salzer_2018_young_old_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.95, key_added='cluster', others_name='U')
assign_cats(salzer_2018_young_old_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, key_added='axis', intermediate_states=True, diff=0.15, others_name='U')

In [None]:
# One colour per cluster label, in sorted label order; labels without an entry in
# dict_colors fall back to grey.
salzer_2018_young_old_fb.uns['cluster_colors'] = [
    dict_colors.get(cluster_label, '#bcbcbc')
    for cluster_label in sorted(set(salzer_2018_young_old_fb.obs['cluster']))
]

In [None]:
sc.pl.umap(salzer_2018_young_old_fb, color=['Internal sample identifier', 'leiden', 'cluster', 'axis'], legend_loc='on data', 
           cmap=magma, use_raw=False, ncols=3)

In [None]:
# Remove the obs columns named 'a'-'d' (presumably per-axis scores left by assign_cats — TODO confirm).
# The membership check makes the cell re-runnable: a bare `del` raises KeyError the second
# time it runs, or whenever one of the columns was never created.
for cluster in ['a', 'b', 'c', 'd']:
    if cluster in salzer_2018_young_old_fb.obs:
        del salzer_2018_young_old_fb.obs[cluster]

In [None]:
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(salzer_2018_young_old_fb, color=['cluster'] + [i for i in val if i in salzer_2018_young_old_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
plot_score_graph(salzer_2018_young_old_fb)

In [None]:
clear_adata(salzer_2018_young_old_fb)
salzer_2018_young_old_fb.write_h5ad(salzer_2018_dir + '/salzer_2018_young_old_fb.h5')

In [None]:
salzer_2018_young_old_fb = sc.read(salzer_2018_dir + '/salzer_2018_young_old_fb.h5')

## Shook 2020

In [None]:
shook_2020_dir = data_dir + '/shook_2020'

In [None]:
shook_2020_ctrl = sc.read(f"{shook_2020_dir}/shook_2020_ctrl.h5")

In [None]:
# Basic QC filtering
shook_2020_ctrl.var['mt'] = shook_2020_ctrl.var_names.str.startswith('mt-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(shook_2020_ctrl, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(shook_2020_ctrl, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(shook_2020_ctrl, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(shook_2020_ctrl, x='total_counts', y='n_genes_by_counts')

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': shook_2020_ctrl.obs['Internal sample identifier'], 'y': shook_2020_ctrl.obs['log1p_n_genes_by_counts']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': shook_2020_ctrl.obs['Internal sample identifier'], 'y': shook_2020_ctrl.obs['pct_counts_mt']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
# Per-cell QC: keep cells from samples Ctrl_S4 / Ctrl_S5 with a log1p gene count in (7.35, 8)
# and < 7% mitochondrial counts. Both samples use identical bounds, so the per-sample clauses
# and the separate mitochondrial filter are folded into a single mask (the same cells are selected).
# The clauses for samples '1' and '2' (log1p gene count in (6.2, 7.3)) were commented out in
# the original cell and are not reinstated, so those samples stay excluded.
# `.copy()` materialises the subset instead of leaving a view of a view: the next cell modifies
# the object in place, which on an AnnData view triggers an implicit copy and a warning.
shook_2020_ctrl = shook_2020_ctrl[
    (
        shook_2020_ctrl.obs['Internal sample identifier'].isin(['Ctrl_S4', 'Ctrl_S5'])
        & (shook_2020_ctrl.obs.log1p_n_genes_by_counts > 7.35)
        & (shook_2020_ctrl.obs.log1p_n_genes_by_counts < 8)
        & (shook_2020_ctrl.obs.pct_counts_mt < 7)
    ).values,
    :,
].copy()

In [None]:
sc.pp.filter_genes(shook_2020_ctrl, min_counts=1)
sc.pp.normalize_total(shook_2020_ctrl)
sc.pp.log1p(shook_2020_ctrl)

In [None]:
sc.pp.pca(shook_2020_ctrl, random_state=seed, n_comps=50)
sce.pp.harmony_integrate(shook_2020_ctrl, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(shook_2020_ctrl, use_rep='X_pca_harmony', n_neighbors=int(0.5 * len(shook_2020_ctrl) ** 0.5), metric='cosine')
tk.tl.triku(shook_2020_ctrl)

In [None]:
sc.tl.umap(shook_2020_ctrl, min_dist=0.3, random_state=seed)

In [None]:
sc.tl.leiden(shook_2020_ctrl, resolution=2, random_state=seed)

In [None]:
assign_cats(shook_2020_ctrl, dict_cats=dict_cats_fb, min_score=0.45)
sc.pl.umap(shook_2020_ctrl, color=['leiden', 'Internal sample identifier', 'assigned_cats', 'Rgs5', 'Pecam1', 'Lyve1', 'Mlana'], legend_loc='on data', cmap=magma, ncols=2, 
           use_raw=False, )

In [None]:
# Keep only the cells labelled as fibroblasts ('fibro' / 'fibro_2').
# `.copy()` turns the AnnData view into an independent object, because the following cells
# modify it in place (filter_genes, pca, ...), which on a view triggers an implicit copy and a warning.
shook_2020_ctrl_fb = shook_2020_ctrl[
    shook_2020_ctrl.obs['assigned_cats'].isin(['fibro', 'fibro_2'])
].copy()

In [None]:
sc.pp.filter_genes(shook_2020_ctrl_fb, min_counts=1)

In [None]:
sc.pp.pca(shook_2020_ctrl_fb, random_state=seed, n_comps=50)
sce.pp.harmony_integrate(shook_2020_ctrl_fb, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(shook_2020_ctrl_fb, use_rep='X_pca_harmony', n_neighbors=int(len(shook_2020_ctrl_fb) ** 0.5 // 4), metric='cosine')
tk.tl.triku(shook_2020_ctrl_fb)

In [None]:
sc.tl.umap(shook_2020_ctrl_fb, min_dist=0.25, random_state=seed)

In [None]:
sc.tl.leiden(shook_2020_ctrl_fb, resolution=8, random_state=seed)
# sc.tl.leiden(shook_2020_ctrl_fb, resolution=0.4, random_state=seed)

In [None]:
sc.pl.umap(shook_2020_ctrl_fb, color=['leiden', 'Internal sample identifier',], legend_loc='on data', cmap=magma, use_raw=False, ncols=2)

In [None]:
sc.tl.rank_genes_groups(shook_2020_ctrl_fb, groupby='leiden', method='wilcoxon')

In [None]:
assign_cats(shook_2020_ctrl_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.85, key_added='cluster', others_name='U',)
assign_cats(shook_2020_ctrl_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, key_added='axis', intermediate_states=True, diff=0.15, others_name='U')

In [None]:
# One colour per cluster label, in sorted label order; labels without an entry in
# dict_colors fall back to grey.
shook_2020_ctrl_fb.uns['cluster_colors'] = [
    dict_colors.get(cluster_label, '#bcbcbc')
    for cluster_label in sorted(set(shook_2020_ctrl_fb.obs['cluster']))
]

In [None]:
sc.pl.umap(shook_2020_ctrl_fb, color=['Internal sample identifier', 'leiden', 'cluster', 'axis'], legend_loc='on data', 
           cmap=magma, use_raw=False, ncols=3)

In [None]:
# Remove the obs columns named 'a'-'d' (presumably per-axis scores left by assign_cats — TODO confirm).
# The membership check makes the cell re-runnable: a bare `del` raises KeyError the second
# time it runs, or whenever one of the columns was never created.
for cluster in ['a', 'b', 'c', 'd']:
    if cluster in shook_2020_ctrl_fb.obs:
        del shook_2020_ctrl_fb.obs[cluster]

In [None]:
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(shook_2020_ctrl_fb, color=['cluster'] + [i for i in val if i in shook_2020_ctrl_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
plot_score_graph(shook_2020_ctrl_fb)

In [None]:
clear_adata(shook_2020_ctrl_fb)
shook_2020_ctrl_fb.write_h5ad(shook_2020_dir + '/shook_2020_ctrl_fb_processed.h5')

In [None]:
shook_2020_ctrl_fb = sc.read(shook_2020_dir + '/shook_2020_ctrl_fb_processed.h5')

## Yanling 2022

In [None]:
yanling_2022_dir = data_dir + '/yanling_2022'

In [None]:
yanling_2022_ctrl = sc.read(f"{yanling_2022_dir}/adata_yanling_2022_ctrl.h5")

In [None]:
# Basic QC filtering
yanling_2022_ctrl.var['mt'] = yanling_2022_ctrl.var_names.str.startswith('mt-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(yanling_2022_ctrl, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(yanling_2022_ctrl, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(yanling_2022_ctrl, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(yanling_2022_ctrl, x='total_counts', y='n_genes_by_counts')

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': yanling_2022_ctrl.obs['Internal sample identifier'], 'y': yanling_2022_ctrl.obs['log1p_n_genes_by_counts']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': yanling_2022_ctrl.obs['Internal sample identifier'], 'y': yanling_2022_ctrl.obs['pct_counts_mt']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
# Per-cell QC: keep cells from samples WT1 / WT2 with a log1p gene count in (6.5, 8) and
# < 7% mitochondrial counts. Both samples use identical bounds, so the per-sample clauses and
# the separate mitochondrial filter are folded into a single mask (the same cells are selected).
# `.copy()` materialises the subset instead of leaving a view of a view: the next cell modifies
# the object in place, which on an AnnData view triggers an implicit copy and a warning.
yanling_2022_ctrl = yanling_2022_ctrl[
    (
        yanling_2022_ctrl.obs['Internal sample identifier'].isin(['WT1', 'WT2'])
        & (yanling_2022_ctrl.obs.log1p_n_genes_by_counts > 6.5)
        & (yanling_2022_ctrl.obs.log1p_n_genes_by_counts < 8)
        & (yanling_2022_ctrl.obs.pct_counts_mt < 7)
    ).values,
    :,
].copy()

In [None]:
sc.pp.filter_genes(yanling_2022_ctrl, min_counts=1)
sc.pp.normalize_total(yanling_2022_ctrl)
sc.pp.log1p(yanling_2022_ctrl)

In [None]:
sc.pp.pca(yanling_2022_ctrl, random_state=seed, n_comps=50)
sce.pp.harmony_integrate(yanling_2022_ctrl, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(yanling_2022_ctrl, use_rep='X_pca_harmony', n_neighbors=int(0.5 * len(yanling_2022_ctrl) ** 0.5), metric='cosine')
tk.tl.triku(yanling_2022_ctrl)

In [None]:
sc.tl.umap(yanling_2022_ctrl, min_dist=0.3, random_state=seed)

In [None]:
sc.tl.leiden(yanling_2022_ctrl, resolution=2, random_state=seed)

In [None]:
assign_cats(yanling_2022_ctrl, dict_cats=dict_cats_fb, min_score=0.45)
sc.pl.umap(yanling_2022_ctrl, color=['leiden', 'Internal sample identifier', 'assigned_cats', 'Pdgfra', 'Dcn', 'Lum', 'Rgs5', 'Pecam1', 'Lyve1', 'Mlana'], 
           legend_loc='on data', cmap=magma, ncols=3, use_raw=False, )

In [None]:
# Keep only the cells labelled as fibroblasts ('fibro' / 'fibro_2').
# `.copy()` turns the AnnData view into an independent object, because the following cells
# modify it in place (filter_genes, pca, ...), which on a view triggers an implicit copy and a warning.
yanling_2022_ctrl_fb = yanling_2022_ctrl[
    yanling_2022_ctrl.obs['assigned_cats'].isin(['fibro', 'fibro_2'])
].copy()

In [None]:
sc.pp.filter_genes(yanling_2022_ctrl_fb, min_counts=1)

In [None]:
sc.pp.pca(yanling_2022_ctrl_fb, random_state=seed, n_comps=50)
sce.pp.harmony_integrate(yanling_2022_ctrl_fb, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(yanling_2022_ctrl_fb, use_rep='X_pca_harmony', n_neighbors=int(len(yanling_2022_ctrl_fb) ** 0.5 // 2), metric='cosine')
tk.tl.triku(yanling_2022_ctrl_fb)

In [None]:
sc.tl.umap(yanling_2022_ctrl_fb, min_dist=0.4, random_state=seed)

In [None]:
# Leiden clustering of the fibroblast subset; the labels are read from obs['leiden'] by the cells below.
sc.tl.leiden(yanling_2022_ctrl_fb, resolution=5, random_state=seed)

In [None]:
sc.pl.umap(yanling_2022_ctrl_fb, color=['leiden', 'Internal sample identifier',], legend_loc='on data', cmap=magma, use_raw=False, ncols=2)

In [None]:
sc.tl.rank_genes_groups(yanling_2022_ctrl_fb, groupby='leiden', method='wilcoxon')

In [None]:
assign_cats(yanling_2022_ctrl_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.8, key_added='cluster', others_name='U')
assign_cats(yanling_2022_ctrl_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, key_added='axis', intermediate_states=True, diff=0.15, others_name='U')

In [None]:
# One colour per cluster label, in sorted label order; labels without an entry in
# dict_colors fall back to grey.
yanling_2022_ctrl_fb.uns['cluster_colors'] = [
    dict_colors.get(cluster_label, '#bcbcbc')
    for cluster_label in sorted(set(yanling_2022_ctrl_fb.obs['cluster']))
]

In [None]:
sc.pl.umap(yanling_2022_ctrl_fb, color=['Internal sample identifier', 'leiden', 'cluster', 'axis'], legend_loc='on data', 
           cmap=magma, use_raw=False, ncols=2)

In [None]:
# Remove the obs columns named 'a'-'d' (presumably per-axis scores left by assign_cats — TODO confirm).
# The membership check makes the cell re-runnable: a bare `del` raises KeyError the second
# time it runs, or whenever one of the columns was never created.
for cluster in ['a', 'b', 'c', 'd']:
    if cluster in yanling_2022_ctrl_fb.obs:
        del yanling_2022_ctrl_fb.obs[cluster]

In [None]:
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(yanling_2022_ctrl_fb, color=['cluster'] + [i for i in val if i in yanling_2022_ctrl_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
plot_score_graph(yanling_2022_ctrl_fb)

In [None]:
clear_adata(yanling_2022_ctrl_fb)
yanling_2022_ctrl_fb.write_h5ad(yanling_2022_dir + '/yanling_2022_ctrl_fb_processed.h5')

In [None]:
yanling_2022_ctrl_fb = sc.read(yanling_2022_dir + '/yanling_2022_ctrl_fb_processed.h5')

## Clusters to be studied in future iterations

In [None]:
# Re-cluster the Yanling fibroblasts at a lower resolution (2 instead of 5) for exploration.
# This overwrites obs['leiden'], so the gene ranking is recomputed right away: the tracksplot
# and the marker UMAP in the next cells read uns['rank_genes_groups'], which would otherwise
# be stale (computed for the resolution-5 clusters) or missing after the reload from disk.
sc.tl.leiden(yanling_2022_ctrl_fb, resolution=2, random_state=seed)
sc.tl.rank_genes_groups(yanling_2022_ctrl_fb, groupby='leiden', method='wilcoxon')

In [None]:
sc.pl.rank_genes_groups_tracksplot(yanling_2022_ctrl_fb, dendrogram=False, n_genes=35)

In [None]:
# Hand-curated gene lists for clusters flagged for future study. The keys look like Leiden
# cluster labels ('11/14' presumably merging two of them) — TODO confirm which clustering
# and resolution they refer to.
# NOTE(review): 'Plagl1' appears twice in the '0' list.
# NOTE(review): 'Tmp1' (list '17') and 'Bystm1' (list '16') may be misspelled gene symbols —
# verify against var_names before using these lists.
dict_clusters = {'3': ['Prg4', 'Fmod', 'Col2a1', 'Vit', 'Col11a1', 'Nt5e', 'Rgcc', 'Sox5', 'Cilp2', 'Comp', 'Hapln1', 'Col9a1', 'Chad', 'Ank', 'Trpv4', 'Rbp4', 'Fam180a', 'Crispld1', 'Slc38a1', 'Fibin', 'Ninj1', 'Cilp', 'Col9a2', 'Fxyd5', 'Cytl1', 'Col11a2', 'Clec3a', 
                       'Msmo1', 'Slc16a3', 'Hk1', 'Wif1', 'Smox', 'Clu', 'Cyp51', 'Mmp3', 'P3h2', 'Snorc', 'Ero1l', 'Stk26', 'Sec23b', 'Scrg1', 'Hmgcr', 'Ldlr'],
                 '11/14': ['Acan', 'Col9a3', 'Col11a2', 'Snorc', 'Col9a2', 'Col11a1', 'Hist1h2bc', 'Col9a1', 'Cnmd', 'Comp', 'Ucma', 'Hapln1', 'Fxyd2', 'Timp1', 'Scrg1', 'Col27a1', 'Hist1h1c', 'Gas5', 'Fdps', 'Prelp', 'Pmvk', 'Msmo1', 'Hmgcr', 'S100a1', 'Anxa8', 'Chst11', 
                           'Prkg2', 'Meltf', 'Col2a1', 'Loxl4', 'Cspg4', 'Chadl'], 
                 '0': ['Chad', 'Col2a1', 'Cst3', 'Snorc', 'Mia', 'Fam180a', 'Col9a1', 'Omd', 'Cox4i2', 'Cnmd', 'Comp', 'Ucma', 'Msmo1', 'Scrg1', 'Isg20', 'Cyp51', 'Ecrg4', 'Chadl', 'Hif1a', 'Lbhd2', 'Plagl1', 'Snhg1', 'Plagl1', 'Hmgcr', 'Cobll1', 'Spp1', 'Susd5', 'Mpp6', 
                       'Alpl', 'Nt5e', 'Ell2', 'Cilp2', 'Fxyd3', 'Rbp4', 'Frzb', 'Ero1l', 'Cd82', 'Slc4a7', 'Tiam2', 'Idi1', 'Phospho1', 'Hmgcs1'], 
                 '17': ['Cfh', 'Tmem176b', 'Spp1', 'Lifr', 'Tnc', 'Tmem176a', 'Hp', 'Tmp1', 'Alpl', 'Rgs3', 'Cp', 'Mmp13', 'Sp7', 'Runx2', 'Ptprd', 'Mef2c', 'Dlx5', 'Cd200', 'St3gal6', 'Plac8', 'Bglap', 'Rasl11a', 'Cadm1', 'Adamts9', 'Car3', 'Dapk2', 'Satb2', 'Cxcl12', 
                        'Vdr', 'Ibsp', 'Unc5b', 'Cdh2', 'Rasgrp2',],
                 '20': ['Srpx', 'Mdk', 'Coch', 'Kera', 'Angpt4', 'Cntn1', 'Igf1', 'Tnmd', 'Prss12', 'Meox2', 'Meis2', 'Adamts14', 'Col15a1', 'Saa1', 'Adgrl3', 'Aldh1a3', 'Prss35', 'Sulf1', 'Mllt3', 'Lrat', 'Pou3f4', 'Meis1', 'Col24a1', 'Thsd4', 'Tfpi'],
                 '16': ['Rgcc', 'Hbegf', 'F13a1', 'Cd55', 'Pcsk6', 'Col22a1', 'Htra4', 'Cdh13', 'Tspan15', 'Itga6', 'Aldh1a3', 'Plxdc2', 'Rgs5', 'Sema3d', 'Sema3a', 'Scara3', 'Rab37', 'Gfpt2', 'Ackr2', 'Gchfr', 'Prg4', 'Bystm1', 'Sbsn', 'Cmtm8', 'Sox5'],
                }

In [None]:
sc.pl.umap(yanling_2022_ctrl_fb, color=['leiden', 'Internal sample identifier', 'cluster'] + list(yanling_2022_ctrl_fb.uns['rank_genes_groups']['names']['16'][:100]), legend_loc='on data', cmap=magma, use_raw=False, ncols=6)

In [None]:
# Rank genes per Leiden cluster (Wilcoxon) and build one signature per cluster from its
# first 20 ranked gene names; keys are the cluster label suffixed with 'yl' (e.g. '3yl').
sc.tl.rank_genes_groups(yanling_2022_ctrl_fb, groupby='leiden', method='wilcoxon')
dict_yanling_clusters = {str(i) + 'yl': yanling_2022_ctrl_fb.uns['rank_genes_groups']['names'][i][:20] for i in yanling_2022_ctrl_fb.obs['leiden'].cat.categories}

In [None]:
# Project the Yanling cluster signatures onto another dataset: labels go to obs['cluster_yl']
# and are shown next to that dataset's own 'leiden' and 'cluster' labels. The same cell is
# repeated below for the other datasets.
assign_cats(abassi_2020_ctrl_fb, dict_cats=dict_yanling_clusters, min_score=0.5, quantile_gene_sel=0.7, key_added='cluster_yl', others_name='U', verbose=False)
sc.pl.umap(abassi_2020_ctrl_fb, color=['Internal sample identifier', 'leiden', 'cluster', 'cluster_yl'], legend_loc='on data', 
           cmap=magma, use_raw=False, ncols=2)

In [None]:
assign_cats(buechler_2021_ctrl_fb, dict_cats=dict_yanling_clusters, min_score=0.5, quantile_gene_sel=0.7, key_added='cluster_yl', others_name='U', verbose=False)
sc.pl.umap(buechler_2021_ctrl_fb, color=['Internal sample identifier', 'leiden', 'cluster', 'cluster_yl'], legend_loc='on data', 
           cmap=magma, use_raw=False, ncols=2)

In [None]:
assign_cats(haensel_2021_ctrl_wounding_fb, dict_cats=dict_yanling_clusters, min_score=0.5, quantile_gene_sel=0.7, key_added='cluster_yl', others_name='U', verbose=False)
sc.pl.umap(haensel_2021_ctrl_wounding_fb, color=['Internal sample identifier', 'leiden', 'cluster', 'cluster_yl'], legend_loc='on data', 
           cmap=magma, use_raw=False, ncols=2)

In [None]:
assign_cats(phan_2020_ctrl_21d_fb, dict_cats=dict_yanling_clusters, min_score=0.5, quantile_gene_sel=0.7, key_added='cluster_yl', others_name='U', verbose=False)
sc.pl.umap(phan_2020_ctrl_21d_fb, color=['Internal sample identifier', 'leiden', 'cluster', 'cluster_yl'], legend_loc='on data', 
           cmap=magma, use_raw=False, ncols=2)

In [None]:
assign_cats(shook_2020_ctrl_fb, dict_cats=dict_yanling_clusters, min_score=0.5, quantile_gene_sel=0.7, key_added='cluster_yl', others_name='U', verbose=False)
sc.pl.umap(shook_2020_ctrl_fb, color=['Internal sample identifier', 'leiden', 'cluster', 'cluster_yl'], legend_loc='on data', 
           cmap=magma, use_raw=False, ncols=2)

## Presence of clusters for each dataset

In [None]:
from fb_functions import plot_adata_cluster_properties

In [None]:
# Processed fibroblast AnnData objects compared in the summary plots below. This is a plain
# list; the earlier comment described a dict of [name, status, year], which is not what is built here.
# NOTE(review): salzer_2018_young_old_fb and yanling_2022_ctrl_fb are processed above but are
# not in this list — confirm the omission is intended.
list_datasets = [abassi_2020_ctrl_fb, buechler_2021_ctrl_fb, efremova_2018_ctrl_fb, haensel_2021_ctrl_wounding_fb, phan_2020_ctrl_21d_fb, shook_2020_ctrl_fb]

In [None]:
plot_adata_cluster_properties(dict_cats_clusters=dict_cats_clusters, list_datasets=list_datasets, what='presence', cluster_name='cluster', axis_name='axis')

In [None]:
plot_adata_cluster_properties(dict_cats_clusters=dict_cats_clusters, list_datasets=list_datasets, what='percentage', cluster_name='cluster', axis_name='axis')

In [None]:
plot_adata_cluster_properties(dict_cats_clusters=dict_cats_clusters, list_datasets=list_datasets, what='axis', cluster_name='cluster', axis_name='axis')