# Comparison of fibroblast populations

In this notebook we are going to extract and replicate the main populations from different papers where fibroblast populations are described, and find similarities and differences. The premise of this analysis is that many of the populations described in different papers seem not to match, or to be transcriptomically different, but in reality they are quite similar; that is, the main types of populations are indeed shared by the different papers, which should come as no surprise.

Additionally, we will reanalyze the *classic 4* papers, to check that cell populations are assigned as expected. For these papers, UMAPs might vary compared to the ones in our paper, but the main results should still be the same.

## imports

In [None]:
# Load IPython's autoreload extension in mode 2, so edits to imported local modules
# (e.g. fb_functions) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import scanpy as sc
import scanpy.external as sce
import pandas as pd
import numpy as np
import os
import triku as tk
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from tqdm.notebook import tqdm
import scipy.sparse as spr
# import ray
# import subprocess
# import time
# import scvelo as scv
# import gc
import gseapy as gp

In [None]:
!pip install cellassign

In [None]:
from cellassign import assign_cats
from fb_functions import clear_adata
from fb_functions import plot_score_graph

In [None]:
# To print versions of imports 

import types

def imports():
    """Yield the ``__name__`` of every module object bound in the global namespace."""
    yield from (
        obj.__name__
        for obj in globals().values()
        if isinstance(obj, types.ModuleType)
    )

excludes = ['builtins', 'types', 'sys']

imported_modules = [module for module in imports() if module not in excludes]

clean_modules = []

for module in imported_modules:

    sep = '.'  # to handle 'matplotlib.pyplot' cases
    rest = module.split(sep, 1)[0]
    clean_modules.append(rest)

changed_imported_modules = list(set(clean_modules))  # drop duplicates

pip_modules = !pip freeze  # you could also use `!conda list` with anaconda

for module in pip_modules:
    try:
        name, version = module.split('==')
        if name in changed_imported_modules:
            print(name + '\t' + version)
    except:
        pass

In [None]:
# Random seed passed as `random_state` to the PCA / UMAP / Leiden calls in this notebook.
seed = 0
# Persist it with IPython's %store magic (presumably so the companion notebooks can restore it).
%store seed

In [None]:
# Palettes for UMAP gene expression

magma = [plt.get_cmap('magma')(i) for i in np.linspace(0,1, 80)]
magma[0] = (0.88, 0.88, 0.88, 1)
magma = mpl.colors.LinearSegmentedColormap.from_list("", magma[:65])

%store magma

In [None]:
# Marker genes per coarse label. Passed as `dict_cats` to assign_cats() on the full
# datasets; the resulting obs['assigned_cats'] values 'fibro' and 'fibro_2' are the ones
# used further down to select the fibroblast subsets.
dict_cats_fb_mouse = {'peri': ['Rgs5', 'Myl9', 'Ndufa4l2', 'Nrip2', 'Mylk', 'Rgs4', 'Acta2', 'Sncg', 'Tagln', 'Des', 'Ptp4a3', 'Myh11'], 
                'endo': ['Pecam1', 'Cdh5', 'Egfl7', 'Cd36', 'Srgn', 'Adgrf5', 'Ptprb', 'Scarb1', 'Plvap', 'Grrp1', 'C1qtnf9', 'Mmrn2', 'Flt1'], 
                'kerato': ['Krt14', 'Krt15', 'Perp', 'S100a14', 'Ccl27a', 'Gata3', 'Dapl1', 'Rab25', 'Ckmt1', 'Col17a1', 'Serpinb5'],
                'kerato Gjb2': ['Ucp2', 'Krt71', 'Gjb2', 'Ahcy', 'Acaa2', 'Cbs', 'Slc3a2', 'Serpina11', 'Lap3', 'Gss', 'Basp1', ],
                'fibro': ['Dcn', 'Pdgfra',  'Lum', 'Col1a1', 'Col1a2',],
                'fibro_2': ['Ncam1', 'Ptch1', 'Trps1', 'Col11a1', 'Wif1'],
                'fibro_acan': ['Acan', 'Col2a1', 'Col11a1', 'Col9a1', 'Snorc', 'Col9a3', 'Mia', 'Cnmd', 'Ucma', 'Chad'],
                'T cell': ['Rac2', 'Ptprcap', 'Il2rg', 'Cd3g', 'Skap1', 'Hcst', 'Ctsw', 'Ets1', 'Cd3d', 'Ctla2a', 'Cd2'],
                'APC': ['Tyrobp', 'Cd74', 'H2-Aa', 'H2-Eb1', 'Ctss', 'Spi1', 'Napsa', 'Cd68', 'Lyz2', 'Csf2ra'],
                'lymph': ['Ccl21a', 'Egfl7', 'Mmrn1', 'Nsg1', 'Meox1', 'Gimap6', 'Kdr'],
                'melano / schwann': ['Syngr1', 'Pmel', 'Mlana'],
                'myo': ['Tnnt1', 'Tnnt2', 'Tnnt3', 'Tnnc2', 'Acta1', 'Myl1', 'Tnni2', 'Tcap', 'Eno3', 'Myoz1'],
                'neural': ['Itgb8', 'Plp1', 'Ptn', 'Egfl8', 'Chl1', 'Cadm4', 'Sox10', 'Cdh19', 'Snca']
               }

# We will use this in 4M
%store dict_cats_fb_mouse  

# Marker genes per fibroblast cluster. Passed as `dict_cats` to
# assign_cats(..., key_added='cluster') on the fibroblast subsets.
# Keys use the same labels as dict_colors_mouse ('x2' is commented out at the end of this dict).
# NOTE(review): in 'd4', 'Gpxm2' and 'Kamk4' may be misspellings (e.g. of 'Cpxm2' / 'Camk4') — confirm against the source list.
# NOTE(review): 'Stmn2' is listed twice in 'd3'; 'a' in 'd3' is presumably the mouse nonagouti gene symbol rather than a stray character — confirm.
dict_cats_clusters = { 
                      'a1': ['Ptgs2', 'Gfpt2', 'Ugdh', 'Gm48942', 'Ccl2', 'Gm45551', 'Cxcl1',
                            'Anxa3', 'Gm12840', 'Has1', 'Uap1', 'Ptx3', 'Efhd1', 'Smpd3',
                            'Errfi1', 'Gm20186', 'Akr1c18', 'Plat', 'Ifi205', 'Procr',
                            'Wnt10b', 'Prss23', 'Irak3', 'Sbsn', 'Ccl7', 'Sema3c', 'Fndc1',
                            'Wnt2', 'Emilin2', 'Anxa1'],
                     'a2': ['Anxa3', 'Efhd1', 'Aldh1a3', 'Akr1c18', 'Wnt2', 'Smpd3', 'Dpp4',
                            'Pi16', 'Sbsn', 'Pla1a', 'Sema3c', 'Emilin2', 'Chst1', 'Il18',
                            'Limch1', 'Cd248', 'Mfap5', 'Igfbp4', 'Prss23', 'Sfrp2', 'Tek',
                            'Itgb7', 'Dact2', 'Tmem100', 'Efemp1', 'Mustn1', 'Thbd',
                            'Ifi27l2a', 'Slc4a10', 'Axl'],
                     'b1': ['Cxcl12', 'Postn', 'Pltp', 'Cd36', 'Agt', 'Cyp1b1', 'C6', 'Olfm2',
                            'Adam12', 'Cyp2f2', 'Phospho1', 'Ctsk', 'Slit2', 'Scg3', 'Txnip',
                            'Fabp4', 'Mlana', 'Lpl', 'Cthrc1', 'Mex3b', 'Mmp14', 'Pparg',
                            'Cygb', 'Hpgd', 'Pth1r', 'Aoc3', 'Fzd4', 'Slco2b1', 'Gas6', 'Ggt5'],
                     'b2': ['Hmcn2', 'Thbs4', 'Mgp', 'Fbln7', 'Meox1', 'Col8a1', 'Cilp',
                            'Smoc2', 'Cxcl14', 'Prss12', 'Cygb', 'Sparcl1', 'Col5a3',
                            'Angptl1', 'Ret', 'Fhl2', 'Abca8a', 'Col4a2', 'Fap', 'Col6a3',
                            'Myoc', 'Gfra1', 'Crlf1', 'Col6a2', 'Sept4', 'Rem1', 'Col4a1',
                            'Lsamp', 'Col6a6', 'Mest'],
                     'b3': ['Cilp', 'Fgf9', 'Igf1', 'F3', 'Nkain4', 'Fxyd6', 'Angpt4', 'Mgp',
                            'Ccn5', 'Gas6', 'Clu', 'Tgm2', 'Gm5084', 'Wfdc1', 'Arhgdib',
                            'Ccn2', 'Angptl7', 'Fmo2', 'Gpx3', 'Col12a1', 'Paqr6', 'Sfrp1',
                            'Ecrg4', 'Cygb', 'Smoc2', 'Ltbp4', 'Boc', 'Gas1', 'Npy1r', 'Mfap4'],
                     'b4': ['Steap4', 'Cygb', 'Inmt', 'Agt', 'C4b', 'C2', 'Ggt5', 'Sned1',
                            'Cxcl12', 'Nmb', 'Vit', 'Cp', 'Col4a2', 'Mmp3', 'Vtn', 'Igfbp7',
                            'Adamtsl3', 'Cfh', 'Bmper', 'Entpd2', 'C1s1', 'Col4a1', 'Aoc3',
                            'Lpl', 'Nrp1', 'Gdf10', 'Tmem176a', 'Tmem176b', 'Meox2', 'Nr2f2'],
                     'b5': ['Apod', 'Rasgrp2', 'Cp', 'Vwa1', 'Ccl9', 'Spp1', 'Vit', 'P2ry14',
                            'Abca8a', 'Trf', 'Smoc2', 'Ccl11', 'Matn2', 'Gpc3', 'Vtn',
                            '1500009L16Rik', 'Ebf2', 'Myoc', 'Col8a1', 'Nr2f2', 'Cpe', 'Gfra1',
                            'Pdrg1', 'Steap4', 'Sparcl1', 'Col15a1', 'Itm2a', 'Phgdh', 'Thbs4',
                            'Ecm1'],
                     'b6': ['Ccl8', 'Slc10a6', 'Serpina3n', 'Ccl7', 'Gbp5', 'Casp4', 'Tnfaip6',
                            'Mmp3', 'Cpxm1', 'Ccl11', 'Fgl2', 'Sowahc', 'Vcam1', 'H2-Q7',
                            'H2-K1', 'Birc3', 'Gbp2', 'B2m', 'Ddah1', 'Mt1', 'Ccl19', 'Mt2',
                            'Il6', 'CR974586.5', 'Npc2', 'Ccl2', 'Postn', 'Gch1', 'Iigp1',
                            'AW112010'],
                     'b/c': ['Cyp2f2', 'Masp1', 'Rtn4r', 'Pltp', 'Crp', 'Serpina3n', 'C1qtnf3',
                            'Tgfbi', 'Sectm1a', 'Lgr5', 'Lepr', 'Crip2', 'Pth1r', 'Gas6',
                            'Hpgd', 'Ppp2r2c', 'Slco2b1', 'Mkx', 'Sparcl1', 'Cyp4b1', 'Akr1cl',
                            'Inhbb', 'Bcl11b', 'Angptl1', 'Rasa3', 'Cpz', 'Gng13', 'Penk',
                            'mt-Rnr2', 'Zfp536'],
                     'c1': ['Ndufa4l2', 'Cpz', 'Ppp1r14a', 'Cldn10', 'Cdh4', 'Cgref1',
                            'Aldh3a1', 'Col1a1', 'Pla2g5', 'Col1a2', 'Csf1r', 'Creb3l3',
                            'Sulf2', 'Tgfbi', 'Fgfr4', 'Sparc', 'Col16a1', 'Cib3', 'Cyp2f2',
                            'Sema3b', 'Serpina3n', 'Creb3l1', 'Kazald1', 'Adcy1', 'Mmp27',
                            'Rassf4', 'Gpha2', 'Rcn3', 'P4ha2', 'Tmem150c'],
                     'c2': ['Igfbp2', 'Grem1', 'F13a1', 'Sema3a', 'Mamdc2', 'Wnt5a',
                            'Serpina3g',  'Qpct', 'Kcnk2', 'Slc6a2', 'Sp5', 'Stk32c', 'Btc',  
                            'Ccbe1', 'Ccnd1', 'Fxyd6', 'Nt5e', 'Rarres1', 'Tmem132c', 'Meox2',
                            'Zfp385b', 'Rspo1', 'Nkd1', 'Cdh4', 'Cldn10', 'Ccdc42',
                            'Miat',  'Adcy1'],
                     'c/d': ['Mfap4', 'Cpxm2', 'Dkk2', 'Ltbp2', 'Tnmd', 'Ccl19', 'Coch',
                            'Pianp', 'Slit2', 'Gas1', 'Wnt5b', 'Ackr4', 'Eln', 'Syt13',
                            'Mmp16', 'Il15', 'Cyp26b1', 'Fam180a', 'Nrep', 'Mafb', 'Enpp2',
                            'Ptgfr', 'Cyp1b1', 'Pth1r', 'Cmklr1', 'Atp1a2', 'Tmem204', 'Cd9',
                            'Gpm6b', 'Nrn1'],
                      'd1': ['Sostdc1', 'Ltbp1', 'Sfrp2', 'S100b', 'Fam171b', 'Sparcl1', 'Slc26a7', 'Sox2', 'Ddit4l', 'Cxcl14', 'Gdf10', 'Unc5c', 'Fam210b', 'Pde1a', 'Luzp2'], # DP upper
                      'd2': ['Notum', 'Alpl', 'Gldn', 'Bambi', 'Emb', 'Chst8', 'Snhg11', 'Fap', 'Grin3a', 'Grb14', 'S100b', 'Lingo2', 'Sod3', 'Angpt4'] + \
                            ['Frzb', 'Wif1', 'Aldh1a3', 'Sdk2', 'Rorb', 'Ralgps2', 'Clstn2', 'Rhbdf2', 'Adrbk2', 'Lypd1', 'Hsd11b2', 'Brdt', 'Wnt5b', 'N4bp3', 'Sox18', 'Enc1'], # DP lower
                      'd3': ['Mgp', 'Stmn2', 'Csrp2', 'Corin', 'Cilp', 'Stx18', 'Csgalnact1', 'Npr3', 'Prss35', 'Msx1', 'Grem2', 'Pdlim3', 'Cck', 'Fmod', 'Megf11']  +  \
                            ['Mmp11', 'Ctsc', 'Chchd10', 'Gas1', 'a', 'Ociad2', 'Tnn', 'Thbs2', 'Mamdc2', 'Plxdc1', 'Nrep', 'Gadd45a', 'Fmo1', 'Mitf', 'Fmo2', 'Shc4', 'Bpgm', 'Stmn2', 'F2r'], # CTS Lower
                      'd4': ['Ednrb', 'Tnmd', 'Ramp1', 'Ccdc80', 'Matn4', 'Ntrk2', 'Gpxm2', 'Thbs4', 'Plxdc2', 'Sbspon', 'Kamk4', 'Casq2', 'Cpz', 'Col6a6', 'Lbh'], # CTS Upper
                      'd5': ['Birc5', 'Cks2', 'Ccna2', 'Ube2c', 'Lockd', 'Spc25', 'Top2a', 'Cdca3', 'Cenpm', 'Tpx2', '2810417H13Rik', 'Cdk1',
                             'Spc24', 'Diaph3', 'Mki67', 'Pbk', 'Cdca8', 'Ckap2', 'Cdkn3', 'Prc1', 'Tk1', 'Ccnb2', 'Gmnn', 'Cdc20'],  # DC/CTS Progenitor
                     'e1': ['Cldn1', 'Ebf2', 'Klf5', 'Sbspon', 'Itga6', 'Nr2f2', 'Igfbp6',
                            'Itgb4', 'Mgp', 'Cav1', 'Gas6', 'Fxyd6', 'Tubb2b', 'Mfap5',
                            'Stxbp6', 'Phlda3', 'Lmo4', 'Lbp', 'Ptch1', 'Ccdc3', 'Mras',
                            'Arhgdib', 'Rgs16', 'Ndrg2', 'Acer3', 'Wnt6', 'Tln2', 'Akap12',
                            'Scd1', 'Homer2'],
                     'x1': ['Mbp', 'Mpz', 'Itga6', 'Cldn1', 'Sbspon', 'Cd59a', 'Ebf2', 'Itgb4',
                            'Gab1', 'Cryab', 'Fxyd6', 'Nr2f2', 'Cpe', 'Tmod2', 'Ndrg1',
                            'Igfbp6', 'Col5a3', 'Secisbp2l', 'Klf5', 'Scd1', 'Sfrp1', 'Dbi',
                            'Phlda3', 'Cxcl1', 'Pmp22', 'Adam10', 'Nr4a2', 'Utrn', 'Phlda1',
                            'Csrp1'], 
#                     'x2': ['Igfbp4', 'Mafb', 'Mgst3', 'Itih5', 'Sept11', 'Phlda3', 'Sms', 'Adamts18', 
#                            'Slco3a1', 'Tpd52e1', 'Fam65b', 'Sphk1', 'Hmgcs2', 'Rarres1', 'Acot7', 'Kazn', 'Kif26b', 'Fhl2', 'Cyp1b1', 'Mustn1', 'Lmo1', 'Asgr2', 'Jup']
}

# Marker genes per axis ('a' to 'e'). Passed as `dict_cats` to
# assign_cats(..., key_added='axis') on the fibroblast subsets.
# The 'e' list is identical to dict_cats_clusters['e1'].
# NOTE(review): 'a' in the 'd' list is presumably the mouse nonagouti gene symbol — confirm.
dict_cats_axes = {'a': ['Efhd1', 'Anxa3', 'Akr1c18', 'Smpd3', 'Aldh1a3', 'Sbsn', 'Wnt2',
                        'Chst1', 'Wnt10b', 'Sema3c', 'Limch1', 'Pla1a', 'Gfpt2', 'Emilin2',
                        'Il18', 'Dpp4', 'Itgb7', 'Ugdh', 'Prss23', 'Thbd', 'Sema3e',
                        'Pi16', 'Cd55', 'Tek', 'Fndc1', 'Aif1l', 'Ptgs2', 'Dbn1', 'Dmkn',
                        'Axl'],
                 'b': ['Cygb', 'Mgp', 'F3', 'Cxcl14', 'Hmcn2', 'Smoc2', 'Fxyd6', 'Meox1',
                        'Fbln7', 'Steap4', 'Cilp', 'Gpx3', 'Gas6', 'Ggt5', 'Abca8a',
                        'Podn', 'Thbs4', 'Igfbp7', 'Entpd2', 'Col8a1', 'Igf1', 'Nfib',
                        'Sfrp1', 'Bgn', 'Angpt4', 'Cxcl12', 'Ltbp4', 'Fzd4', 'Sparcl1',
                        'Col4a2'],
                 'c': ['Ppp1r14a', 'Ndufa4l2', 'Cdh4', 'Cldn10', 'Cpz', 'Aldh3a1',
                        'Pla2g5', 'Cgref1', 'Fgfr4', 'Csf1r', 'Grem1', 'Cib3', 'Sulf2',
                        'Pla2g2e', 'Adcy1', 'Tgfbi', 'Creb3l3', 'Col1a1', 'Rassf4',
                        'Col1a2', 'Miat', 'Igfbp2', 'Creb3l1', 'Gpha2', 'Tmem150c',
                        'Serpina3n', 'Sema3a', 'Rspo1', 'Sema3b', 'Col16a1'],
                 'd': ['Wif1', 'Trps1', 'Lamc3', 'Tpm2', 'Ncam1', 'Enpp2', 'a', 'Col23a1',
                        'Col11a1', 'Runx3', 'Ptger3', 'Robo2', 'Kif26b', 'Scube3', 'Myo1b',
                        'Crabp1', 'Lrrc15', 'Aqp1', 'Crabp2', 'Prlr', 'Prdm1', 'Mdk',
                        'Nav2', 'Inhba', 'Daam2', 'Edn3', 'Myo10', 'Aplp1', 'Kctd1',
                        'Gng2'],
                 'e': ['Cldn1', 'Ebf2', 'Klf5', 'Sbspon', 'Itga6', 'Nr2f2', 'Igfbp6',
                        'Itgb4', 'Mgp', 'Cav1', 'Gas6', 'Fxyd6', 'Tubb2b', 'Mfap5',
                        'Stxbp6', 'Phlda3', 'Lmo4', 'Lbp', 'Ptch1', 'Ccdc3', 'Mras',
                        'Arhgdib', 'Rgs16', 'Ndrg2', 'Acer3', 'Wnt6', 'Tln2', 'Akap12',
                        'Scd1', 'Homer2'],
                 }

In [None]:
# Hex colour for each fibroblast cluster label; looked up when filling
# `uns['cluster_colors']` for the UMAP plots.
dict_colors_mouse = {
    # a
    'a1': '#c93038',
    'a2': '#e01f6c',
    # b
    'b1': '#ffa900',
    'b2': '#ff6600',
    'b3': '#d45500',
    'b4': '#c87137',
    'b5': '#ecaa87',
    'b6': '#aaa57a',
    'b/c': '#ddff55',
    # c
    'c1': '#b4d645',
    'c2': '#51c43f',
    'c/d': '#80ffb3',
    # d
    'd1': '#aaeeff',
    'd2': '#2ad4ff',
    'd3': '#5599ff',
    'd4': '#0066ff',
    'd5': '#9e62c7',
    # e / x
    'e1': '#ddafe9',
    'x1': '#8d5fd3',
    'x2': '#6d3fa3',
}


# Persist the colour map with IPython's %store magic (presumably for reuse from the companion notebooks).
%store dict_colors_mouse

In [None]:
# Gene-symbol replacements (symbol -> alternative name). Not referenced in the cells
# visible in this part of the notebook; presumably for datasets annotated with the other names.
dict_rep = dict(CCN5='WISP2', ECRG4='C2orf40')

In [None]:
# Render every figure at 150 dpi.
mpl.rcParams.update({'figure.dpi': 150})

In [None]:
# Data root: the `data/` folder under the current working directory (note the trailing '/').
data_dir = os.getcwd() + '/data/'
print(data_dir)
# Persist the path with IPython's %store magic.
%store data_dir

**IMPORTANT: I am running this analysis on a computer with ~500 GB of RAM. I will load many datasets at once, which might be too much for some computers. I took this decision consciously, to have as much info available at any time as possible. If you cannot run all the analysis at once, you can run it by parts.**

## Data extraction and processing

**Note: The notebook 1M also has a preprocessing of samples, but this preprocessing might be different in 2M. Also, preprocessing of new datasets will be included.**

## Abbasi et al. 2020

In [None]:
# Folder with the Abbasi et al. 2020 files (the on-disk name is spelled 'abassi_2020').
# os.path.join avoids the '//' that `data_dir + '/...'` produced, because data_dir
# already ends with '/'.
abbasi_2020_dir = os.path.join(data_dir, 'abassi_2020')

In [None]:
# Load the control-mouse dataset.
abassi_2020_ctrl_mouse = sc.read(abbasi_2020_dir + '/abassi_2020_ctrl_mouse.h5')

In [None]:
# QC metrics (no filtering happens in this cell): flag genes whose name starts with 'mt-'
# and compute the scanpy QC metrics in place, including the 'mt' percentage.
abassi_2020_ctrl_mouse.var['mt'] = abassi_2020_ctrl_mouse.var_names.str.startswith('mt-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(abassi_2020_ctrl_mouse, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
# QC overview: violins of genes per cell, total counts and mitochondrial percentage ...
sc.pl.violin(abassi_2020_ctrl_mouse, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

# ... plus scatter plots of the same metrics, used to pick the thresholds applied next.
sc.pl.scatter(abassi_2020_ctrl_mouse, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(abassi_2020_ctrl_mouse, x='log1p_total_counts', y='log1p_n_genes_by_counts')

In [None]:
# Keep cells with 1100 < n_genes_by_counts < 3000 and less than 9 % mitochondrial counts.
# One combined mask plus `.copy()` materialises the subset, so the in-place preprocessing
# that follows runs on a real AnnData rather than on a view of a view (which would
# trigger an implicit copy and an ImplicitModificationWarning).
abassi_2020_ctrl_mouse = abassi_2020_ctrl_mouse[(
    (abassi_2020_ctrl_mouse.obs.n_genes_by_counts < 3000)
    & (abassi_2020_ctrl_mouse.obs.n_genes_by_counts > 1100)
    & (abassi_2020_ctrl_mouse.obs.pct_counts_mt < 9)
).values, :].copy()

In [None]:
# Drop genes left without counts after cell filtering, then normalise per-cell totals
# (scanpy defaults) and log1p-transform. Order matters: normalisation precedes the log.
sc.pp.filter_genes(abassi_2020_ctrl_mouse, min_counts=1)
sc.pp.normalize_total(abassi_2020_ctrl_mouse)
sc.pp.log1p(abassi_2020_ctrl_mouse)

In [None]:
# 50-component PCA -> cosine kNN graph with k = int(0.5 * sqrt(n_cells)) -> triku feature selection.
# NOTE(review): unlike the fibroblast-subset cell further down, PCA and neighbors are not
# recomputed after triku here — confirm that this is intended.
sc.pp.pca(abassi_2020_ctrl_mouse, random_state=seed, n_comps=50)
sc.pp.neighbors(abassi_2020_ctrl_mouse, n_neighbors=int(0.5 * len(abassi_2020_ctrl_mouse) ** 0.5), metric='cosine')
tk.tl.triku(abassi_2020_ctrl_mouse)

In [None]:
# UMAP embedding of the whole dataset (seeded).
sc.tl.umap(abassi_2020_ctrl_mouse, min_dist=0.3, random_state=seed)

In [None]:
# Leiden clustering at resolution 2; plotted below as 'leiden'.
sc.tl.leiden(abassi_2020_ctrl_mouse, resolution=2, random_state=seed)

In [None]:
# Label cells with the coarse marker lists; the labels are read back from
# obs['assigned_cats'] in the plot and in the fibroblast selection below.
assign_cats(abassi_2020_ctrl_mouse, dict_cats=dict_cats_fb_mouse, min_score=0.5)
sc.pl.umap(abassi_2020_ctrl_mouse, color=['leiden', 'assigned_cats', 'Pdgfra', 'Lum', 'Col1a1', 'Coch', 'Vim'], legend_loc='on data', cmap=magma, ncols=2, 
           use_raw=False, )

In [None]:
# One figure per coarse label: the assigned labels next to every marker of that label
# that is present in this dataset.
for key, val in dict_cats_fb_mouse.items():
    print(key)
    sc.pl.umap(
        abassi_2020_ctrl_mouse,
        color=['assigned_cats', *(gene for gene in val if gene in abassi_2020_ctrl_mouse.var_names)],
        legend_loc='on data',
        cmap=magma,
        use_raw=False,
        ncols=4,
    )

In [None]:
# Fibroblast subset: cells labelled 'fibro' or 'fibro_2'.
# `.copy()` (as in the Boothby section) makes this an independent AnnData, so the in-place
# steps that follow do not operate on a view of the full dataset.
abassi_2020_ctrl_mouse_fb = abassi_2020_ctrl_mouse[abassi_2020_ctrl_mouse.obs['assigned_cats'].isin(['fibro', 'fibro_2'])].copy()

In [None]:
# Drop genes that have no counts within the fibroblast subset.
sc.pp.filter_genes(abassi_2020_ctrl_mouse_fb, min_counts=1)

In [None]:
# PCA (25 components) and the cosine kNN graph are computed twice: once before triku and
# once after it.
# NOTE(review): presumably the second pass is meant to use the features selected by
# triku — confirm against the scanpy / triku versions in use.
sc.pp.pca(abassi_2020_ctrl_mouse_fb, random_state=seed, n_comps=25)
sc.pp.neighbors(abassi_2020_ctrl_mouse_fb, n_neighbors=int(0.5 * len(abassi_2020_ctrl_mouse_fb) ** 0.5), metric='cosine')
tk.tl.triku(abassi_2020_ctrl_mouse_fb)
sc.pp.pca(abassi_2020_ctrl_mouse_fb, random_state=seed, n_comps=25)
sc.pp.neighbors(abassi_2020_ctrl_mouse_fb, n_neighbors=int(0.5 * len(abassi_2020_ctrl_mouse_fb) ** 0.5), metric='cosine')

In [None]:
# UMAP embedding of the fibroblast subset (seeded; scanpy's default min_dist).
sc.tl.umap(abassi_2020_ctrl_mouse_fb, random_state=seed)

In [None]:
# Leiden clustering of the fibroblast subset at resolution 8.
sc.tl.leiden(abassi_2020_ctrl_mouse_fb, resolution=8, random_state=seed)

In [None]:
# Show the Leiden clusters on the fibroblast UMAP.
sc.pl.umap(abassi_2020_ctrl_mouse_fb, color=['leiden'], 
           legend_loc='on data', cmap=magma, use_raw=False, ncols=2)

In [None]:
# Rank genes per Leiden cluster with the Wilcoxon method.
sc.tl.rank_genes_groups(abassi_2020_ctrl_mouse_fb, groupby='leiden', method='wilcoxon')

In [None]:
# Leiden clusters next to a hand-picked gene panel.
# NOTE(review): unlike the marker loops, this panel is not filtered against var_names, so
# a gene missing from this dataset would presumably make the plot call fail.
sc.pl.umap(abassi_2020_ctrl_mouse_fb, color=['leiden'] + ['Ccl19', 'Ndufa4l2', 'Mfap4', 'Eln', 'Tnmd', 'Cyp2f2', 'Tsc22d3', 'Rtn4r', 'Ltbp2', 'Gpm6b', 'Fam180a', 'Lsamp', 'Pid1', 'Lgr5', 'Sectm1a', 'Prkcb', 'Entpd1',  'Serpina3c', 'Myo1b', 'H2-Q7', 'Bmp4', 'Cystm1', 'Cyp1b1', 'Wnt5b', 'Id1', 'Syt13', 'Etv1', 'Eya1'], 
           legend_loc='on data', cmap=magma, use_raw=False, ncols=3)

In [None]:
# First pass: cluster labels from dict_cats_clusters, stored in obs['cluster']
# ('U' is the `others_name`, presumably the label for unassigned cells).
assign_cats(abassi_2020_ctrl_mouse_fb, dict_cats=dict_cats_clusters, min_score=0.45, quantile_gene_sel=0.65, key_added='cluster', others_name='U')
# Second pass: axis labels from dict_cats_axes, grouped by the 'cluster' column, stored in obs['axis'].
assign_cats(abassi_2020_ctrl_mouse_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, key_added='axis', intermediate_states=True, diff=0.15, others_name='U')
# Categorical dtype so `.cat.categories` can be used when building the colour list.
abassi_2020_ctrl_mouse_fb.obs['cluster'] = abassi_2020_ctrl_mouse_fb.obs['cluster'].astype('category')

In [None]:
# One colour per cluster category, in category order; labels without an entry in
# dict_colors_mouse fall back to grey.
abassi_2020_ctrl_mouse_fb.uns['cluster_colors'] = [
    dict_colors_mouse.get(label, '#bcbcbc')
    for label in abassi_2020_ctrl_mouse_fb.obs['cluster'].cat.categories
]

In [None]:
# Fibroblast UMAP coloured by sample, Leiden cluster, assigned axis and assigned cluster.
sc.pl.umap(abassi_2020_ctrl_mouse_fb, color=['Internal sample identifier', 'leiden', 'axis', 'cluster'], legend_loc='on data', 
           cmap=magma, use_raw=False, ncols=2)

In [None]:
# Remove the obs columns named 'a', 'b', 'c' and 'd'.
# NOTE(review): presumably per-axis columns written by the axis assign_cats() call; 'e'
# is also a key of dict_cats_axes but is not removed here — confirm.
for cluster in ('a', 'b', 'c', 'd'):
    abassi_2020_ctrl_mouse_fb.obs.pop(cluster)

In [None]:
# One figure per cluster label: the assigned clusters next to every marker of that
# cluster that is present in this dataset.
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(
        abassi_2020_ctrl_mouse_fb,
        color=['cluster', *(gene for gene in val if gene in abassi_2020_ctrl_mouse_fb.var_names)],
        legend_loc='on data',
        cmap=magma,
        use_raw=False,
        ncols=4,
    )

In [None]:
# Score graph for the fibroblast subset (helper imported from fb_functions).
plot_score_graph(abassi_2020_ctrl_mouse_fb)

In [None]:
# clear_adata (from fb_functions) is applied to the fibroblast object only; then both
# objects are written to the dataset folder.
clear_adata(abassi_2020_ctrl_mouse_fb)
abassi_2020_ctrl_mouse_fb.write_h5ad(f"{abbasi_2020_dir}/abassi_2020_ctrl_mouse_fb_processed.h5")
abassi_2020_ctrl_mouse.write_h5ad(f"{abbasi_2020_dir}/abassi_2020_ctrl_mouse_processed.h5")

In [None]:
# Reload the processed objects from disk.
abassi_2020_ctrl_mouse_fb = sc.read(f"{abbasi_2020_dir}/abassi_2020_ctrl_mouse_fb_processed.h5")
abassi_2020_ctrl_mouse = sc.read(f"{abbasi_2020_dir}/abassi_2020_ctrl_mouse_processed.h5")

## Boothby et al. 2021

In [None]:
# Folder with the Boothby et al. 2021 files.
# os.path.join avoids the '//' that `data_dir + '/...'` produced, because data_dir
# already ends with '/'.
boothby_2021_dir = os.path.join(data_dir, 'boothby_2021')

In [None]:
# Load the control-mouse dataset.
boothby_2021_ctrl_mouse = sc.read(f"{boothby_2021_dir}/boothby_2021_mouse_ctrl_mouse.h5")

In [None]:
# Quick look at the gene identifiers of this dataset.
boothby_2021_ctrl_mouse.var_names

In [None]:
# QC metrics (no filtering happens in this cell): flag genes whose name starts with 'mt-'
# and compute the scanpy QC metrics in place, including the 'mt' percentage.
boothby_2021_ctrl_mouse.var['mt'] = boothby_2021_ctrl_mouse.var_names.str.startswith('mt-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(boothby_2021_ctrl_mouse, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
# QC overview: violins of genes per cell, total counts and mitochondrial percentage ...
sc.pl.violin(boothby_2021_ctrl_mouse, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

# ... plus scatter plots of the same metrics, used to pick the thresholds applied next.
sc.pl.scatter(boothby_2021_ctrl_mouse, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(boothby_2021_ctrl_mouse, x='total_counts', y='n_genes_by_counts')

In [None]:
# Keep cells with 1500 < n_genes_by_counts < 3500 and less than 4 % mitochondrial counts.
# One combined mask plus `.copy()` materialises the subset, so the in-place preprocessing
# that follows runs on a real AnnData rather than on a view of a view (which would
# trigger an implicit copy and an ImplicitModificationWarning).
boothby_2021_ctrl_mouse = boothby_2021_ctrl_mouse[(
    (boothby_2021_ctrl_mouse.obs.n_genes_by_counts < 3500)
    & (boothby_2021_ctrl_mouse.obs.n_genes_by_counts > 1500)
    & (boothby_2021_ctrl_mouse.obs.pct_counts_mt < 4)
).values, :].copy()

In [None]:
# Drop genes left without counts after cell filtering, then normalise per-cell totals
# (scanpy defaults) and log1p-transform. Order matters: normalisation precedes the log.
sc.pp.filter_genes(boothby_2021_ctrl_mouse, min_counts=1)
sc.pp.normalize_total(boothby_2021_ctrl_mouse)
sc.pp.log1p(boothby_2021_ctrl_mouse)

In [None]:
# 50-component PCA -> cosine kNN graph with k = int((0.5 * sqrt(n_cells)) // 2) -> triku feature selection.
# NOTE(review): unlike the fibroblast-subset cell further down, PCA and neighbors are not
# recomputed after triku here — confirm that this is intended.
sc.pp.pca(boothby_2021_ctrl_mouse, random_state=seed, n_comps=50)
sc.pp.neighbors(boothby_2021_ctrl_mouse, n_neighbors=int(0.5 * len(boothby_2021_ctrl_mouse) ** 0.5 // 2), metric='cosine')
tk.tl.triku(boothby_2021_ctrl_mouse)

In [None]:
# UMAP embedding and Leiden clustering (resolution 4) of the whole dataset, both seeded.
sc.tl.umap(boothby_2021_ctrl_mouse, min_dist=0.2, random_state=seed)
sc.tl.leiden(boothby_2021_ctrl_mouse, resolution=4, random_state=seed)

In [None]:
# fraction=1 keeps every cell, so this call only reorders them in place.
# NOTE(review): presumably done so that plotting order is not tied to the original cell order — confirm.
# Uses the notebook-wide `seed` instead of a second hard-coded 0, so changing the seed
# in one place affects every stochastic step.
sc.pp.subsample(boothby_2021_ctrl_mouse, fraction=1, random_state=seed, copy=False)
sc.pl.umap(boothby_2021_ctrl_mouse, color=['leiden'], legend_loc='on data')

In [None]:
# Label cells with the coarse marker lists; labels are read back from obs['assigned_cats'].
assign_cats(boothby_2021_ctrl_mouse, dict_cats=dict_cats_fb_mouse, min_score=0.35, quantile_gene_sel=0.85)

In [None]:
# Leiden clusters next to the assigned coarse labels.
sc.pl.umap(boothby_2021_ctrl_mouse, color=['leiden', 'assigned_cats'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# One figure per coarse label: the assigned labels next to every marker of that label
# that is present in this dataset.
for key, val in dict_cats_fb_mouse.items():
    print(key)
    sc.pl.umap(
        boothby_2021_ctrl_mouse,
        color=['assigned_cats', *(gene for gene in val if gene in boothby_2021_ctrl_mouse.var_names)],
        legend_loc='on data',
        cmap=magma,
        use_raw=False,
        ncols=4,
    )

In [None]:
# Fibroblast subset: cells labelled 'fibro' or 'fibro_2', copied into an independent AnnData.
boothby_2021_ctrl_mouse_fb = boothby_2021_ctrl_mouse[boothby_2021_ctrl_mouse.obs['assigned_cats'].isin(['fibro', 'fibro_2'])].copy()

In [None]:
# Drop genes that have no counts within the fibroblast subset.
sc.pp.filter_genes(boothby_2021_ctrl_mouse_fb, min_counts=1)

In [None]:
# PCA (30 components) and the cosine kNN graph (k = int(sqrt(n_cells) // 9)) are computed
# twice: once before triku and once after it.
# NOTE(review): presumably the second pass is meant to use the features selected by
# triku — confirm against the scanpy / triku versions in use.
sc.pp.pca(boothby_2021_ctrl_mouse_fb, random_state=seed, n_comps=30)
sc.pp.neighbors(boothby_2021_ctrl_mouse_fb,  n_neighbors=int(len(boothby_2021_ctrl_mouse_fb) ** 0.5 // 9), metric='cosine')
tk.tl.triku(boothby_2021_ctrl_mouse_fb)
sc.pp.pca(boothby_2021_ctrl_mouse_fb, random_state=seed, n_comps=30)
sc.pp.neighbors(boothby_2021_ctrl_mouse_fb,  n_neighbors=int(len(boothby_2021_ctrl_mouse_fb) ** 0.5 // 9), metric='cosine')

In [None]:
# UMAP embedding of the fibroblast subset (seeded).
sc.tl.umap(boothby_2021_ctrl_mouse_fb, min_dist=0.3, random_state=seed)

In [None]:
# Leiden clustering of the fibroblast subset at resolution 7.
sc.tl.leiden(boothby_2021_ctrl_mouse_fb, resolution=7, random_state=seed)

In [None]:
# First pass: cluster labels from dict_cats_clusters, stored in obs['cluster']
# ('U' is the `others_name`, presumably the label for unassigned cells).
assign_cats(boothby_2021_ctrl_mouse_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.85, key_added='cluster', others_name='U')
# Second pass: axis labels from dict_cats_axes, grouped by the 'cluster' column, stored in obs['axis'].
assign_cats(boothby_2021_ctrl_mouse_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, key_added='axis', intermediate_states=True, diff=0.15, others_name='U')
# Categorical dtype so `.cat.categories` can be used when building the colour list.
boothby_2021_ctrl_mouse_fb.obs['cluster'] = boothby_2021_ctrl_mouse_fb.obs['cluster'].astype('category')

In [None]:
# One colour per cluster category, in category order; labels without an entry in
# dict_colors_mouse fall back to grey.
boothby_2021_ctrl_mouse_fb.uns['cluster_colors'] = [
    dict_colors_mouse.get(label, '#bcbcbc')
    for label in boothby_2021_ctrl_mouse_fb.obs['cluster'].cat.categories
]

In [None]:
# Fibroblast UMAP coloured by sample, Leiden cluster, assigned axis and assigned cluster.
sc.pl.umap(boothby_2021_ctrl_mouse_fb, color=['Internal sample identifier', 'leiden', 'axis', 'cluster'], legend_loc='on data', 
           cmap=magma, use_raw=False, ncols=3)

In [None]:
# Remove the obs columns named 'a', 'b', 'c' and 'd'.
# NOTE(review): presumably per-axis columns written by the axis assign_cats() call; 'e'
# is also a key of dict_cats_axes but is not removed here — confirm.
for cluster in ('a', 'b', 'c', 'd'):
    boothby_2021_ctrl_mouse_fb.obs.pop(cluster)

In [None]:
# One figure per cluster label: the assigned clusters next to every marker of that
# cluster that is present in this dataset.
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(
        boothby_2021_ctrl_mouse_fb,
        color=['cluster', *(gene for gene in val if gene in boothby_2021_ctrl_mouse_fb.var_names)],
        legend_loc='on data',
        cmap=magma,
        use_raw=False,
        ncols=4,
    )

In [None]:
# Rank genes for the 'x1' cluster only, then plot the genes ranked 220-349 for it
# (130 panels) on the UMAP.
# NOTE(review): this requires cells assigned to 'x1' in obs['cluster'] — confirm it holds for this dataset.
sc.tl.rank_genes_groups(boothby_2021_ctrl_mouse_fb, groupby='cluster', groups=['x1'])
sc.pl.umap(boothby_2021_ctrl_mouse_fb, color=boothby_2021_ctrl_mouse_fb.uns['rank_genes_groups']['names']['x1'][220:350], cmap=magma)

In [None]:
# Score graph for the fibroblast subset (helper imported from fb_functions).
plot_score_graph(boothby_2021_ctrl_mouse_fb)

In [None]:
# clear_adata (from fb_functions) is applied to the fibroblast object only; then both
# objects are written to the dataset folder.
clear_adata(boothby_2021_ctrl_mouse_fb)
boothby_2021_ctrl_mouse_fb.write_h5ad(f"{boothby_2021_dir}/boothby_2021_ctrl_mouse_fb_processed.h5")
boothby_2021_ctrl_mouse.write_h5ad(f"{boothby_2021_dir}/boothby_2021_ctrl_mouse_processed.h5")

In [None]:
# Reload the processed objects from disk.
boothby_2021_ctrl_mouse_fb = sc.read(f"{boothby_2021_dir}/boothby_2021_ctrl_mouse_fb_processed.h5")
boothby_2021_ctrl_mouse = sc.read(f"{boothby_2021_dir}/boothby_2021_ctrl_mouse_processed.h5")

## Buechler et al. 2021

In [None]:
# Folder with the Buechler et al. 2021 files.
# os.path.join avoids the '//' that `data_dir + '/...'` produced, because data_dir
# already ends with '/'.
buechler_2021_dir = os.path.join(data_dir, 'buechler_2021')

In [None]:
# Load the control-mouse dataset and de-duplicate its gene names.
buechler_2021_ctrl_mouse = sc.read(f"{buechler_2021_dir}/buechler_2021_ctrl_mouse.h5")
buechler_2021_ctrl_mouse.var_names_make_unique()

In [None]:
# QC metrics (no filtering happens in this cell): flag genes whose name starts with 'mt-'
# and compute the scanpy QC metrics in place, including the 'mt' percentage.
buechler_2021_ctrl_mouse.var['mt'] = buechler_2021_ctrl_mouse.var_names.str.startswith('mt-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(buechler_2021_ctrl_mouse, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
# QC overview: violins of genes per cell, total counts and mitochondrial percentage ...
sc.pl.violin(buechler_2021_ctrl_mouse, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

# ... plus scatter plots of the same metrics, used to pick the thresholds applied next.
sc.pl.scatter(buechler_2021_ctrl_mouse, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(buechler_2021_ctrl_mouse, x='total_counts', y='n_genes_by_counts')

In [None]:
# Keep cells with 2000 < n_genes_by_counts < 4000 and less than 12 % mitochondrial counts.
# One combined mask plus `.copy()` materialises the subset, so the in-place preprocessing
# that follows runs on a real AnnData rather than on a view of a view (which would
# trigger an implicit copy and an ImplicitModificationWarning).
buechler_2021_ctrl_mouse = buechler_2021_ctrl_mouse[(
    (buechler_2021_ctrl_mouse.obs.n_genes_by_counts < 4000)
    & (buechler_2021_ctrl_mouse.obs.n_genes_by_counts > 2000)
    & (buechler_2021_ctrl_mouse.obs.pct_counts_mt < 12)
).values, :].copy()

In [None]:
# Drop genes left without counts after cell filtering, then normalise per-cell totals
# (scanpy defaults) and log1p-transform. Order matters: normalisation precedes the log.
sc.pp.filter_genes(buechler_2021_ctrl_mouse, min_counts=1)
sc.pp.normalize_total(buechler_2021_ctrl_mouse)
sc.pp.log1p(buechler_2021_ctrl_mouse)

In [None]:
# 50-component PCA -> cosine kNN graph with k = int((0.5 * sqrt(n_cells)) // 2) -> triku feature selection.
# NOTE(review): unlike the fibroblast-subset cell further down, PCA and neighbors are not
# recomputed after triku here — confirm that this is intended.
sc.pp.pca(buechler_2021_ctrl_mouse, random_state=seed, n_comps=50)
sc.pp.neighbors(buechler_2021_ctrl_mouse, n_neighbors=int(0.5 * len(buechler_2021_ctrl_mouse) ** 0.5 // 2), metric='cosine')
tk.tl.triku(buechler_2021_ctrl_mouse)

In [None]:
# UMAP embedding and Leiden clustering (resolution 4) of the whole dataset, both seeded.
sc.tl.umap(buechler_2021_ctrl_mouse, min_dist=0.2, random_state=seed)
sc.tl.leiden(buechler_2021_ctrl_mouse, resolution=4, random_state=seed)

In [None]:
# fraction=1 keeps every cell, so this call only reorders them in place.
# NOTE(review): presumably done so that plotting order is not tied to the original cell order — confirm.
# Uses the notebook-wide `seed` instead of a second hard-coded 0, so changing the seed
# in one place affects every stochastic step.
sc.pp.subsample(buechler_2021_ctrl_mouse, fraction=1, random_state=seed, copy=False)
sc.pl.umap(buechler_2021_ctrl_mouse, color=['leiden'], legend_loc='on data')

In [None]:
# Label cells with the coarse marker lists; labels are read back from obs['assigned_cats'].
assign_cats(buechler_2021_ctrl_mouse, dict_cats=dict_cats_fb_mouse, min_score=0.35, quantile_gene_sel=0.85)

In [None]:
# Leiden clusters next to the assigned coarse labels.
sc.pl.umap(buechler_2021_ctrl_mouse, color=['leiden', 'assigned_cats'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# One figure per coarse label: the assigned labels next to every marker of that label
# that is present in this dataset.
for key, val in dict_cats_fb_mouse.items():
    print(key)
    sc.pl.umap(
        buechler_2021_ctrl_mouse,
        color=['assigned_cats', *(gene for gene in val if gene in buechler_2021_ctrl_mouse.var_names)],
        legend_loc='on data',
        cmap=magma,
        use_raw=False,
        ncols=4,
    )

In [None]:
# Fibroblast subset: cells labelled 'fibro' or 'fibro_2'.
# `.copy()` (as in the Boothby section) makes this an independent AnnData, so the in-place
# steps that follow do not operate on a view of the full dataset.
buechler_2021_ctrl_mouse_fb = buechler_2021_ctrl_mouse[buechler_2021_ctrl_mouse.obs['assigned_cats'].isin(['fibro', 'fibro_2'])].copy()

In [None]:
# Drop genes that have no counts within the fibroblast subset.
sc.pp.filter_genes(buechler_2021_ctrl_mouse_fb, min_counts=1)

In [None]:
# PCA (30 components) and the cosine kNN graph (k = int(sqrt(n_cells) // 6)) are computed
# twice: once before triku and once after it.
# NOTE(review): presumably the second pass is meant to use the features selected by
# triku — confirm against the scanpy / triku versions in use.
sc.pp.pca(buechler_2021_ctrl_mouse_fb, random_state=seed, n_comps=30)
sc.pp.neighbors(buechler_2021_ctrl_mouse_fb,  n_neighbors=int(len(buechler_2021_ctrl_mouse_fb) ** 0.5 // 6), metric='cosine')
tk.tl.triku(buechler_2021_ctrl_mouse_fb)
sc.pp.pca(buechler_2021_ctrl_mouse_fb, random_state=seed, n_comps=30)
sc.pp.neighbors(buechler_2021_ctrl_mouse_fb,  n_neighbors=int(len(buechler_2021_ctrl_mouse_fb) ** 0.5 // 6), metric='cosine')

In [None]:
# UMAP embedding of the fibroblast subset (seeded).
sc.tl.umap(buechler_2021_ctrl_mouse_fb, min_dist=0.1, random_state=seed)

In [None]:
# First pass: cluster labels from dict_cats_clusters, stored in obs['cluster']
# ('U' is the `others_name`, presumably the label for unassigned cells).
assign_cats(buechler_2021_ctrl_mouse_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.6, key_added='cluster', others_name='U')
# Second pass: axis labels from dict_cats_axes, grouped by the 'cluster' column, stored in obs['axis'].
assign_cats(buechler_2021_ctrl_mouse_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, key_added='axis', intermediate_states=True, diff=0.15, others_name='U')
# Categorical dtype so `.cat.categories` can be used when building the colour list.
buechler_2021_ctrl_mouse_fb.obs['cluster'] = buechler_2021_ctrl_mouse_fb.obs['cluster'].astype('category')

In [None]:
# Leiden clustering of the fibroblast subset at resolution 10.
sc.tl.leiden(buechler_2021_ctrl_mouse_fb, resolution=10, random_state=seed)
# Disabled alternative with a much lower resolution:
# sc.tl.leiden(buechler_2021_ctrl_mouse_fb, resolution=0.2, random_state=seed)

In [None]:
# One colour per cluster category, in category order; labels without an entry in
# dict_colors_mouse fall back to grey.
buechler_2021_ctrl_mouse_fb.uns['cluster_colors'] = [
    dict_colors_mouse.get(label, '#bcbcbc')
    for label in buechler_2021_ctrl_mouse_fb.obs['cluster'].cat.categories
]

In [None]:
# Fibroblast UMAP coloured by sample, Leiden cluster, assigned axis and assigned cluster.
sc.pl.umap(buechler_2021_ctrl_mouse_fb, color=['Internal sample identifier', 'leiden', 'axis', 'cluster'], legend_loc='on data', 
           cmap=magma, use_raw=False, ncols=3)

In [None]:
# Remove the obs columns named 'a', 'b', 'c' and 'd'.
# NOTE(review): presumably per-axis columns written by the axis assign_cats() call; 'e'
# is also a key of dict_cats_axes but is not removed here — confirm.
for cluster in ('a', 'b', 'c', 'd'):
    buechler_2021_ctrl_mouse_fb.obs.pop(cluster)

In [None]:
# One figure per cluster label: the assigned clusters next to every marker of that
# cluster that is present in this dataset.
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(
        buechler_2021_ctrl_mouse_fb,
        color=['cluster', *(gene for gene in val if gene in buechler_2021_ctrl_mouse_fb.var_names)],
        legend_loc='on data',
        cmap=magma,
        use_raw=False,
        ncols=4,
    )

In [None]:
# Score graph for the fibroblast subset (helper imported from fb_functions).
plot_score_graph(buechler_2021_ctrl_mouse_fb)

In [None]:
# clear_adata (from fb_functions) is applied to the fibroblast object only; then both
# objects are written to the dataset folder.
clear_adata(buechler_2021_ctrl_mouse_fb)
buechler_2021_ctrl_mouse_fb.write_h5ad(f"{buechler_2021_dir}/buechler_2021_ctrl_mouse_fb_processed.h5")
buechler_2021_ctrl_mouse.write_h5ad(f"{buechler_2021_dir}/buechler_2021_ctrl_mouse_processed.h5")

In [None]:
# Reload the processed objects from disk.
buechler_2021_ctrl_mouse_fb = sc.read(f"{buechler_2021_dir}/buechler_2021_ctrl_mouse_fb_processed.h5")
buechler_2021_ctrl_mouse = sc.read(f"{buechler_2021_dir}/buechler_2021_ctrl_mouse_processed.h5")

## Efremova, Mirjana 2018

In [None]:
# Folder with the Efremova 2018 files.
# os.path.join avoids the '//' that `data_dir + '/...'` produced, because data_dir
# already ends with '/'.
efremova_2018_dir = os.path.join(data_dir, 'efremova_2018')

In [None]:
# Load the control-mouse dataset.
efremova_2018_ctrl_mouse = sc.read(efremova_2018_dir + '/efremova_2018_ctrl_mouse.h5')

In [None]:
# QC metrics (no filtering happens in this cell): flag genes whose name starts with 'mt-'
# and compute the scanpy QC metrics in place, including the 'mt' percentage.
efremova_2018_ctrl_mouse.var['mt'] = efremova_2018_ctrl_mouse.var_names.str.startswith('mt-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(efremova_2018_ctrl_mouse, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
# QC overview: violins of genes per cell, total counts and mitochondrial percentage ...
sc.pl.violin(efremova_2018_ctrl_mouse, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

# ... plus scatter plots of the same metrics, used to pick the thresholds applied next.
sc.pl.scatter(efremova_2018_ctrl_mouse, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(efremova_2018_ctrl_mouse, x='total_counts', y='n_genes_by_counts')

In [None]:
# log1p(n_genes_by_counts) per 'Internal sample identifier', as one violin per sample.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 4))
df = (
    efremova_2018_ctrl_mouse.obs[['Internal sample identifier', 'log1p_n_genes_by_counts']]
    .rename(columns={'Internal sample identifier': 'x', 'log1p_n_genes_by_counts': 'y'})
)
sns.violinplot(data=df, x='x', y='y', ax=ax)

In [None]:
# pct_counts_mt per 'Internal sample identifier', as one violin per sample.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 4))
df = (
    efremova_2018_ctrl_mouse.obs[['Internal sample identifier', 'pct_counts_mt']]
    .rename(columns={'Internal sample identifier': 'x', 'pct_counts_mt': 'y'})
)
sns.violinplot(data=df, x='x', y='y', ax=ax)

In [None]:
# Keep only sample '0', cells with 6.5 < log1p_n_genes_by_counts < 8.3 and less than
# 9 % mitochondrial counts.
# One combined mask plus `.copy()` materialises the subset, so the in-place preprocessing
# that follows runs on a real AnnData rather than on a view of a view (which would
# trigger an implicit copy and an ImplicitModificationWarning).
efremova_2018_ctrl_mouse = efremova_2018_ctrl_mouse[(
    (efremova_2018_ctrl_mouse.obs['Internal sample identifier'] == '0')
    & (efremova_2018_ctrl_mouse.obs.log1p_n_genes_by_counts < 8.3)
    & (efremova_2018_ctrl_mouse.obs.log1p_n_genes_by_counts > 6.5)
    & (efremova_2018_ctrl_mouse.obs.pct_counts_mt < 9)
).values, :].copy()

In [None]:
# Drop genes left without counts after cell filtering, then normalise per-cell totals
# (scanpy defaults) and log1p-transform. Order matters: normalisation precedes the log.
sc.pp.filter_genes(efremova_2018_ctrl_mouse, min_counts=1)
sc.pp.normalize_total(efremova_2018_ctrl_mouse)
sc.pp.log1p(efremova_2018_ctrl_mouse)

In [None]:
# 50-component PCA -> cosine kNN graph with k = int(0.5 * sqrt(n_cells)) -> triku feature selection.
# NOTE(review): unlike the fibroblast-subset cell further down, PCA and neighbors are not
# recomputed after triku here — confirm that this is intended.
sc.pp.pca(efremova_2018_ctrl_mouse, random_state=seed, n_comps=50)
sc.pp.neighbors(efremova_2018_ctrl_mouse, n_neighbors=int(0.5 * len(efremova_2018_ctrl_mouse) ** 0.5), metric='cosine')
tk.tl.triku(efremova_2018_ctrl_mouse)

In [None]:
sc.tl.umap(efremova_2018_ctrl_mouse, min_dist=0.3, random_state=seed)

In [None]:
sc.tl.leiden(efremova_2018_ctrl_mouse, resolution=2, random_state=seed)

In [None]:
assign_cats(efremova_2018_ctrl_mouse, dict_cats=dict_cats_fb_mouse, min_score=0.5)
sc.pl.umap(efremova_2018_ctrl_mouse, color=['leiden', 'Internal sample identifier', 'assigned_cats', 'Pdgfra', 'Lum', 'Col1a1', 'Coch', 'Vim'], legend_loc='on data', cmap=magma, ncols=2, 
           use_raw=False, )

In [None]:
for key, val in dict_cats_fb_mouse.items():
    print(key)
    sc.pl.umap(efremova_2018_ctrl_mouse, color=['assigned_cats'] + [i for i in val if i in efremova_2018_ctrl_mouse.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
# Keep only the cells labelled 'fibro' / 'fibro_2' in obs['assigned_cats'].
# .copy() detaches the subset from the full dataset: the following cells filter genes and
# recompute embeddings in place, which should not happen on a view.
efremova_2018_ctrl_mouse_fb = efremova_2018_ctrl_mouse[efremova_2018_ctrl_mouse.obs['assigned_cats'].isin(['fibro', 'fibro_2'])].copy()

In [None]:
sc.pp.filter_genes(efremova_2018_ctrl_mouse_fb, min_counts=1)

In [None]:
# PCA + cosine kNN graph are computed twice, once before and once after triku.
# presumably the first pass gives triku the graph it needs and the second pass recomputes
# the embedding on the genes triku selected - TODO confirm
sc.pp.pca(efremova_2018_ctrl_mouse_fb, random_state=seed, n_comps=25)
sc.pp.neighbors(efremova_2018_ctrl_mouse_fb, n_neighbors=int(0.5 * len(efremova_2018_ctrl_mouse_fb) ** 0.5), metric='cosine')
tk.tl.triku(efremova_2018_ctrl_mouse_fb)
sc.pp.pca(efremova_2018_ctrl_mouse_fb, random_state=seed, n_comps=25)
sc.pp.neighbors(efremova_2018_ctrl_mouse_fb, n_neighbors=int(0.5 * len(efremova_2018_ctrl_mouse_fb) ** 0.5), metric='cosine')

In [None]:
sc.tl.umap(efremova_2018_ctrl_mouse_fb, random_state=seed)

In [None]:
sc.tl.leiden(efremova_2018_ctrl_mouse_fb, resolution=0.6, random_state=seed)

In [None]:
sc.pl.umap(efremova_2018_ctrl_mouse_fb, color=['leiden', 'Internal sample identifier'], 
           legend_loc='on data', cmap=magma, use_raw=False, ncols=2)

In [None]:
sc.tl.rank_genes_groups(efremova_2018_ctrl_mouse_fb, groupby='leiden', method='wilcoxon')

In [None]:
# Plot the 250 top-ranked genes of every Leiden cluster on the UMAP.
# The groups are read from the rank_genes_groups result instead of being hard-coded, so the
# cell keeps working when the clustering yields fewer or more than six clusters.
efremova_ranked_names = efremova_2018_ctrl_mouse_fb.uns['rank_genes_groups']['names']
for leiden_group in efremova_ranked_names.dtype.names:
    print(leiden_group)
    sc.pl.umap(efremova_2018_ctrl_mouse_fb, color=['leiden'] + list(efremova_ranked_names[leiden_group][:250]),
               legend_loc='on data', cmap=magma, use_raw=False, ncols=3)

In [None]:
# First call: label cells with the marker sets of dict_cats_clusters, stored in obs['cluster'].
# Second call: group by those cluster labels and assign the dict_cats_axes categories, stored in obs['axis'].
# others_name='U' is presumably the label given to cells that match no category - TODO confirm.
# 'cluster' is cast to categorical because the colour assignment in the next cell reads .cat.categories.
assign_cats(efremova_2018_ctrl_mouse_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.6, key_added='cluster', others_name='U')
assign_cats(efremova_2018_ctrl_mouse_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, key_added='axis', intermediate_states=True, diff=0.15, others_name='U')
efremova_2018_ctrl_mouse_fb.obs['cluster'] = efremova_2018_ctrl_mouse_fb.obs['cluster'].astype('category')

In [None]:
efremova_2018_ctrl_mouse_fb.uns['cluster_colors'] = [dict_colors_mouse[i] if i in dict_colors_mouse else '#bcbcbc' for i in efremova_2018_ctrl_mouse_fb.obs['cluster'].cat.categories]

In [None]:
sc.pl.umap(efremova_2018_ctrl_mouse_fb, color=['Internal sample identifier', 'leiden', 'cluster', 'axis'], legend_loc='on data', 
           cmap=magma, use_raw=False, ncols=3)

In [None]:
# Remove the obs columns 'a'-'d' (presumably per-axis scores left behind by assign_cats - confirm).
# The membership check makes the cell safe to re-run: a second execution no longer raises KeyError.
for cluster in ['a', 'b', 'c', 'd']:
    if cluster in efremova_2018_ctrl_mouse_fb.obs.columns:
        del efremova_2018_ctrl_mouse_fb.obs[cluster]

In [None]:
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(efremova_2018_ctrl_mouse_fb, color=['cluster'] + [i for i in val if i in efremova_2018_ctrl_mouse_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
plot_score_graph(efremova_2018_ctrl_mouse_fb)

In [None]:
clear_adata(efremova_2018_ctrl_mouse_fb)
efremova_2018_ctrl_mouse_fb.write_h5ad(efremova_2018_dir + '/efremova_2018_ctrl_mouse_fb_processed.h5')
efremova_2018_ctrl_mouse.write_h5ad(efremova_2018_dir + '/efremova_2018_ctrl_mouse_processed.h5')

In [None]:
efremova_2018_ctrl_mouse_fb = sc.read(efremova_2018_dir + '/efremova_2018_ctrl_mouse_fb_processed.h5')
efremova_2018_ctrl_mouse = sc.read(efremova_2018_dir + '/efremova_2018_ctrl_mouse_processed.h5')

## Haensel et al. 2021

**CAUTION** We include **wounded and unwounded** samples! We have seen that the B1 population (part of the general B population) of this dataset is almost depleted in the unwounded samples, and appears with more heterogeneity in the wounded state. However, these populations also appear in other datasets such as Abbasi and Shook (at homeostatic states), so we are going to keep them.

In [None]:
haensel_2021_dir = data_dir + '/haensel_2021'

In [None]:
haensel_2021_ctrl_mouse = sc.read(f"{haensel_2021_dir}/haensel_2021_ctrl_mouse.h5")
haensel_2021_wounding = sc.read(f"{haensel_2021_dir}/haensel_2021_wounding.h5")

In [None]:
haensel_2021_ctrl_mouse_wounding = sc.AnnData.concatenate(haensel_2021_ctrl_mouse, haensel_2021_wounding, batch_categories=['Healthy', 'Wounding'],
                                           batch_key='Condition')


In [None]:
# Basic QC filtering
haensel_2021_ctrl_mouse_wounding.var['mt'] = haensel_2021_ctrl_mouse_wounding.var_names.str.startswith('mt-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(haensel_2021_ctrl_mouse_wounding, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(haensel_2021_ctrl_mouse_wounding, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(haensel_2021_ctrl_mouse_wounding, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(haensel_2021_ctrl_mouse_wounding, x='total_counts', y='n_genes_by_counts')

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': haensel_2021_ctrl_mouse_wounding.obs['Internal sample identifier'], 'y': haensel_2021_ctrl_mouse_wounding.obs['log1p_n_genes_by_counts']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': haensel_2021_ctrl_mouse_wounding.obs['Internal sample identifier'], 'y': haensel_2021_ctrl_mouse_wounding.obs['pct_counts_mt']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
# QC filter: keep cells with < 10 % mitochondrial counts and 7.2 < log1p(n_genes) < 8.1.
# The subset is copied so that the in-place normalisation that follows works on a real
# AnnData object instead of a view of the unfiltered data.
haensel_2021_ctrl_mouse_wounding = haensel_2021_ctrl_mouse_wounding[(
    (haensel_2021_ctrl_mouse_wounding.obs.pct_counts_mt < 10)
    & (haensel_2021_ctrl_mouse_wounding.obs.log1p_n_genes_by_counts > 7.2)
    & (haensel_2021_ctrl_mouse_wounding.obs.log1p_n_genes_by_counts < 8.1)
).values, :].copy()

In [None]:
sc.pp.filter_genes(haensel_2021_ctrl_mouse_wounding, min_counts=1)
sc.pp.normalize_total(haensel_2021_ctrl_mouse_wounding)
sc.pp.log1p(haensel_2021_ctrl_mouse_wounding)

In [None]:
# Whole-dataset pass: 50-component PCA, Harmony integration keyed on 'Internal sample identifier'
# (the neighbours step reads its result through use_rep='X_pca_harmony'), a cosine kNN graph
# with k = int(0.5 * sqrt(n_cells) // 4), then triku.
# presumably triku selects the feature genes used downstream - TODO confirm
sc.pp.pca(haensel_2021_ctrl_mouse_wounding, random_state=seed, n_comps=50)
sce.pp.harmony_integrate(haensel_2021_ctrl_mouse_wounding, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(haensel_2021_ctrl_mouse_wounding, use_rep='X_pca_harmony', n_neighbors=int(0.5 * len(haensel_2021_ctrl_mouse_wounding) ** 0.5 // 4), metric='cosine')
tk.tl.triku(haensel_2021_ctrl_mouse_wounding)

In [None]:
sc.tl.umap(haensel_2021_ctrl_mouse_wounding, min_dist=0.2, random_state=seed)

In [None]:
sc.tl.leiden(haensel_2021_ctrl_mouse_wounding, resolution=0.2, random_state=seed)

In [None]:
assign_cats(haensel_2021_ctrl_mouse_wounding, dict_cats=dict_cats_fb_mouse, min_score=0.4, quantile_gene_sel=0.4)
sc.pl.umap(haensel_2021_ctrl_mouse_wounding, color=['leiden', 'assigned_cats', 'Internal sample identifier'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
for key, val in dict_cats_fb_mouse.items():
    print(key)
    sc.pl.umap(haensel_2021_ctrl_mouse_wounding, color=['assigned_cats'] + [i for i in val if i in haensel_2021_ctrl_mouse_wounding.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
# Keep only the cells labelled 'fibro' / 'fibro_2' in obs['assigned_cats'].
# .copy() detaches the subset from the full dataset before the in-place steps that follow.
haensel_2021_ctrl_mouse_wounding_fb = haensel_2021_ctrl_mouse_wounding[haensel_2021_ctrl_mouse_wounding.obs['assigned_cats'].isin(['fibro', 'fibro_2'])].copy()

In [None]:
sc.pp.filter_genes(haensel_2021_ctrl_mouse_wounding_fb, min_counts=1)

In [None]:
# PCA -> Harmony -> cosine kNN graph (k = int(sqrt(n_cells) // 7)) is computed twice,
# once before and once after triku.
# presumably the first pass gives triku the graph it needs and the second pass recomputes
# the embedding on the genes triku selected - TODO confirm
sc.pp.pca(haensel_2021_ctrl_mouse_wounding_fb, random_state=seed, n_comps=35)
sce.pp.harmony_integrate(haensel_2021_ctrl_mouse_wounding_fb, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(haensel_2021_ctrl_mouse_wounding_fb, use_rep='X_pca_harmony', n_neighbors=int(len(haensel_2021_ctrl_mouse_wounding_fb) ** 0.5 // 7), metric='cosine')
tk.tl.triku(haensel_2021_ctrl_mouse_wounding_fb)

sc.pp.pca(haensel_2021_ctrl_mouse_wounding_fb, random_state=seed, n_comps=35)
sce.pp.harmony_integrate(haensel_2021_ctrl_mouse_wounding_fb, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(haensel_2021_ctrl_mouse_wounding_fb, use_rep='X_pca_harmony', n_neighbors=int(len(haensel_2021_ctrl_mouse_wounding_fb) ** 0.5 // 7), metric='cosine')

In [None]:
sc.tl.umap(haensel_2021_ctrl_mouse_wounding_fb, min_dist=0.2, random_state=seed)
sc.tl.leiden(haensel_2021_ctrl_mouse_wounding_fb, resolution=5, random_state=seed)

In [None]:
sc.pl.umap(haensel_2021_ctrl_mouse_wounding_fb, color=['leiden'], cmap=magma, use_raw=False, legend_loc='on data', ncols=1)
sc.pl.umap(haensel_2021_ctrl_mouse_wounding_fb, color=['Internal sample identifier'], cmap=magma, use_raw=False, ncols=1)

In [None]:
sc.tl.rank_genes_groups(haensel_2021_ctrl_mouse_wounding_fb, groupby='leiden')

In [None]:
# Score the fibroblast subset against two non-fibroblast marker sets and store the label in
# obs['clusterx']; a later cell keeps only the cells labelled 'U'.
# NOTE(review): 'Sfn' appears twice in the 'krt-like' list - confirm whether that is intended.
assign_cats(
    haensel_2021_ctrl_mouse_wounding_fb,
    dict_cats={
        'krt-like': ['Lgals7', 'Fxyd3', 'Perp', 'Krt15', 'S100a14', 'Sfn', 'Krt5', 'Anxa8', 'Sfn', 'Ly6d'],
        'immune-like': ['Fcer1g', 'Tyrobp', 'Srgn', 'Cd52', 'Cxcl2'],
    },
    min_score=0.99,
    quantile_gene_sel=0.99,
    key_added='clusterx',
    others_name='U',
)

In [None]:
sc.pl.umap(haensel_2021_ctrl_mouse_wounding_fb, color=['Internal sample identifier', 'leiden', 'clusterx'], legend_loc='on data', 
           cmap=magma, use_raw=False, ncols=3)

In [None]:
# Keep only the cells labelled 'U' in obs['clusterx'], i.e. drop those assigned to
# 'krt-like' or 'immune-like'. The result is copied so the following in-place steps
# do not operate on a view.
haensel_2021_ctrl_mouse_wounding_fb = haensel_2021_ctrl_mouse_wounding_fb[haensel_2021_ctrl_mouse_wounding_fb.obs['clusterx'] == 'U'].copy()

In [None]:
sc.pp.filter_genes(haensel_2021_ctrl_mouse_wounding_fb, min_counts=1)

In [None]:
# Same two-pass embedding as before, repeated on the cleaned fibroblast subset:
# PCA -> Harmony -> cosine kNN graph (k = int(sqrt(n_cells) // 7)), triku, then the three steps again.
# presumably the second pass recomputes the embedding on the genes triku selected - TODO confirm
sc.pp.pca(haensel_2021_ctrl_mouse_wounding_fb, random_state=seed, n_comps=35)
sce.pp.harmony_integrate(haensel_2021_ctrl_mouse_wounding_fb, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(haensel_2021_ctrl_mouse_wounding_fb, use_rep='X_pca_harmony', n_neighbors=int(len(haensel_2021_ctrl_mouse_wounding_fb) ** 0.5 // 7), metric='cosine')
tk.tl.triku(haensel_2021_ctrl_mouse_wounding_fb)

sc.pp.pca(haensel_2021_ctrl_mouse_wounding_fb, random_state=seed, n_comps=35)
sce.pp.harmony_integrate(haensel_2021_ctrl_mouse_wounding_fb, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(haensel_2021_ctrl_mouse_wounding_fb, use_rep='X_pca_harmony', n_neighbors=int(len(haensel_2021_ctrl_mouse_wounding_fb) ** 0.5 // 7), metric='cosine')

In [None]:
sc.tl.umap(haensel_2021_ctrl_mouse_wounding_fb, min_dist=0.2, random_state=seed)
sc.tl.leiden(haensel_2021_ctrl_mouse_wounding_fb, resolution=5, random_state=seed)

In [None]:
# First call: label cells with the marker sets of dict_cats_clusters, stored in obs['cluster'].
# Second call: group by those cluster labels and assign the dict_cats_axes categories, stored in obs['axis'].
# others_name='U' is presumably the label given to cells that match no category - TODO confirm.
# 'cluster' is cast to categorical because the colour assignment in the next cell reads .cat.categories.
assign_cats(haensel_2021_ctrl_mouse_wounding_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.85, key_added='cluster', others_name='U')
assign_cats(haensel_2021_ctrl_mouse_wounding_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, key_added='axis', intermediate_states=True, diff=0.15, others_name='U')
haensel_2021_ctrl_mouse_wounding_fb.obs['cluster'] = haensel_2021_ctrl_mouse_wounding_fb.obs['cluster'].astype('category')

In [None]:
haensel_2021_ctrl_mouse_wounding_fb.uns['cluster_colors'] = [dict_colors_mouse[i] if i in dict_colors_mouse else '#bcbcbc' for i in haensel_2021_ctrl_mouse_wounding_fb.obs['cluster'].cat.categories]

In [None]:
sc.pl.umap(haensel_2021_ctrl_mouse_wounding_fb, color=['leiden', 'cluster', 'axis'], legend_loc='on data', cmap=magma, use_raw=False, ncols=3)

In [None]:
# Remove the obs columns 'a'-'d' (presumably per-axis scores left behind by assign_cats - confirm).
# The membership check makes the cell safe to re-run: a second execution no longer raises KeyError.
for cluster in ['a', 'b', 'c', 'd']:
    if cluster in haensel_2021_ctrl_mouse_wounding_fb.obs.columns:
        del haensel_2021_ctrl_mouse_wounding_fb.obs[cluster]

In [None]:
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(haensel_2021_ctrl_mouse_wounding_fb, color=['cluster'] + [i for i in val if i in haensel_2021_ctrl_mouse_wounding_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
plot_score_graph(haensel_2021_ctrl_mouse_wounding_fb)

In [None]:
clear_adata(haensel_2021_ctrl_mouse_wounding_fb)
haensel_2021_ctrl_mouse_wounding_fb.write_h5ad(haensel_2021_dir + '/haensel_2021_ctrl_mouse_wounding_fb_processed.h5')
haensel_2021_ctrl_mouse_wounding.write_h5ad(haensel_2021_dir + '/haensel_2021_ctrl_mouse_wounding_processed.h5')

In [None]:
haensel_2021_ctrl_mouse_wounding_fb = sc.read(haensel_2021_dir + '/haensel_2021_ctrl_mouse_wounding_fb_processed.h5')
haensel_2021_ctrl_mouse_wounding = sc.read(haensel_2021_dir + '/haensel_2021_ctrl_mouse_wounding_processed.h5')

## Joost et al. 2020

In [None]:
joost_2020_dir = data_dir + '/joost_2020'

In [None]:
joost_2020_ctrl_mouse = sc.read(f"{joost_2020_dir}/adata_joost_2020_ctrl_mouse.h5")

In [None]:
# Basic QC filtering
joost_2020_ctrl_mouse.var['mt'] = joost_2020_ctrl_mouse.var_names.str.startswith('mt-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(joost_2020_ctrl_mouse, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(joost_2020_ctrl_mouse, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(joost_2020_ctrl_mouse, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(joost_2020_ctrl_mouse, x='log1p_total_counts', y='log1p_n_genes_by_counts')

In [None]:
# QC filter: keep cells with 5.5 < log1p(n_genes) < 8.5 and 1 % < mitochondrial counts < 9 %.
# The three successive subsettings are merged into one mask and the result is copied, so
# the in-place normalisation that follows works on a real AnnData object instead of a
# view of a view of the unfiltered data.
joost_2020_ctrl_mouse = joost_2020_ctrl_mouse[(
    (joost_2020_ctrl_mouse.obs.log1p_n_genes_by_counts < 8.5)
    & (joost_2020_ctrl_mouse.obs.log1p_n_genes_by_counts > 5.5)
    & (joost_2020_ctrl_mouse.obs.pct_counts_mt < 9)
    & (joost_2020_ctrl_mouse.obs.pct_counts_mt > 1)
).values, :].copy()

In [None]:
sc.pp.filter_genes(joost_2020_ctrl_mouse, min_counts=1)
sc.pp.normalize_total(joost_2020_ctrl_mouse)
sc.pp.log1p(joost_2020_ctrl_mouse)

In [None]:
# Whole-dataset pass: 50-component PCA, Harmony integration keyed on 'Internal sample identifier'
# (the neighbours step reads its result through use_rep='X_pca_harmony'), a cosine kNN graph
# with k = int(0.5 * sqrt(n_cells) // 4), then triku.
# presumably triku selects the feature genes used downstream - TODO confirm
sc.pp.pca(joost_2020_ctrl_mouse, random_state=seed, n_comps=50)
sce.pp.harmony_integrate(joost_2020_ctrl_mouse, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(joost_2020_ctrl_mouse, use_rep='X_pca_harmony', n_neighbors=int(0.5 * len(joost_2020_ctrl_mouse) ** 0.5 // 4), metric='cosine')
tk.tl.triku(joost_2020_ctrl_mouse)

In [None]:
sc.tl.umap(joost_2020_ctrl_mouse, min_dist=0.3, random_state=seed)

In [None]:
sc.tl.leiden(joost_2020_ctrl_mouse, resolution=12, random_state=seed)

In [None]:
joost_2020_ctrl_mouse.obs['Internal sample identifier']

In [None]:
assign_cats(joost_2020_ctrl_mouse, dict_cats=dict_cats_fb_mouse, min_score=0.5)
sc.pl.umap(joost_2020_ctrl_mouse, color=['leiden', 'Age', 'assigned_cats', 'Pdgfra', 'Lum', 'Col1a1', 'Coch', 'Vim'], legend_loc='on data', cmap=magma, ncols=2, 
           use_raw=False, )

In [None]:
for key, val in dict_cats_fb_mouse.items():
    print(key)
    sc.pl.umap(joost_2020_ctrl_mouse, color=['assigned_cats'] + [i for i in val if i in joost_2020_ctrl_mouse.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
# Keep only the cells labelled 'fibro' / 'fibro_2' in obs['assigned_cats'].
# .copy() detaches the subset from the full dataset before the in-place steps that follow.
joost_2020_ctrl_mouse_fb = joost_2020_ctrl_mouse[joost_2020_ctrl_mouse.obs['assigned_cats'].isin(['fibro', 'fibro_2'])].copy()

In [None]:
sc.pp.filter_genes(joost_2020_ctrl_mouse_fb, min_counts=1)

In [None]:
# PCA -> Harmony -> cosine kNN graph is computed twice, once before and once after triku.
# presumably the second pass recomputes the embedding on the genes triku selected - TODO confirm
# NOTE(review): n_neighbors is derived from len(joost_2020_ctrl_mouse) (all cells), not from the
# fibroblast subset that is being embedded. Confirm whether this is intended; switching to the
# _fb object would change the graph and therefore the UMAP / Leiden results below.
sc.pp.pca(joost_2020_ctrl_mouse_fb, random_state=seed, n_comps=50)
sce.pp.harmony_integrate(joost_2020_ctrl_mouse_fb, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(joost_2020_ctrl_mouse_fb, use_rep='X_pca_harmony', n_neighbors=int(0.5 * len(joost_2020_ctrl_mouse) ** 0.5 // 4), metric='cosine')
tk.tl.triku(joost_2020_ctrl_mouse_fb) 

sc.pp.pca(joost_2020_ctrl_mouse_fb, random_state=seed, n_comps=50)
sce.pp.harmony_integrate(joost_2020_ctrl_mouse_fb, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(joost_2020_ctrl_mouse_fb, use_rep='X_pca_harmony', n_neighbors=int(0.5 * len(joost_2020_ctrl_mouse) ** 0.5 // 4), metric='cosine')

In [None]:
sc.tl.umap(joost_2020_ctrl_mouse_fb, random_state=seed, min_dist=1.15)

In [None]:
sc.tl.leiden(joost_2020_ctrl_mouse_fb, resolution=10, random_state=seed)

In [None]:
sc.pl.umap(joost_2020_ctrl_mouse_fb, color=['leiden'], 
           legend_loc='on data', cmap=magma, use_raw=False, ncols=2)

In [None]:
sc.pl.umap(joost_2020_ctrl_mouse_fb, color=['leiden'] + ['Ccl19', 'Ndufa4l2', 'Mfap4', 'Eln', 'Tnmd', 'Cyp2f2', 'Tsc22d3', 'Rtn4r', 'Ltbp2', 'Gpm6b', 'Fam180a', 'Lsamp', 'Pid1', 'Lgr5', 'Sectm1a', 'Prkcb', 'Entpd1',  'Serpina3c', 'Myo1b', 'H2-Q7', 'Bmp4', 'Cystm1', 'Cyp1b1', 'Wnt5b', 'Id1', 'Syt13', 'Etv1', 'Eya1'], 
           legend_loc='on data', cmap=magma, use_raw=False, ncols=3)

In [None]:
# First call: label cells with the marker sets of dict_cats_clusters, stored in obs['cluster'].
# Second call: group by those cluster labels and assign the dict_cats_axes categories, stored in obs['axis'].
# others_name='U' is presumably the label given to cells that match no category - TODO confirm.
# 'cluster' is cast to categorical because the colour assignment in a later cell reads .cat.categories.
assign_cats(joost_2020_ctrl_mouse_fb, dict_cats=dict_cats_clusters, min_score=0.4, quantile_gene_sel=0.9, key_added='cluster', others_name='U')
assign_cats(joost_2020_ctrl_mouse_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, key_added='axis', intermediate_states=True, diff=0.15, others_name='U')
joost_2020_ctrl_mouse_fb.obs['cluster'] = joost_2020_ctrl_mouse_fb.obs['cluster'].astype('category')

In [None]:
# Remove the obs columns 'a'-'d' (presumably per-axis scores left behind by assign_cats - confirm).
# The membership check makes the cell safe to re-run: a second execution no longer raises KeyError.
for cluster in ['a', 'b', 'c', 'd']:
    if cluster in joost_2020_ctrl_mouse_fb.obs.columns:
        del joost_2020_ctrl_mouse_fb.obs[cluster]

In [None]:
joost_2020_ctrl_mouse_fb.uns['cluster_colors'] = [dict_colors_mouse[i] if i in dict_colors_mouse else '#bcbcbc' for i in joost_2020_ctrl_mouse_fb.obs['cluster'].cat.categories]

In [None]:
sc.pl.umap(joost_2020_ctrl_mouse_fb, color=['Internal sample identifier', 'Age', 'leiden', 'axis', 'cluster'], legend_loc='on data', 
           cmap=magma, use_raw=False, ncols=2)

In [None]:
# Marker genes of the populations named FIB1-4, tDP, aDP, DS1 and DS2, used below to map
# Leiden clusters onto those labels. The trailing comments record the matching clusters
# of this notebook's own nomenclature.
# The stray 'a' entry (not a gene symbol) that was present in the 'tDP' list has been removed.
# NOTE(review): 'aDP' lists 'Nrg2' and 'Cntn1' twice; kept as-is because removing the
# duplicates could change the scores - confirm whether the repetition is intended.
dict_mapping_joost = {
    'FIB1': ['Sparc', 'Col1a1', 'Col1a2', 'Ndufa4l2', 'Cldn10', 'Cpz', 'Col3a1', 'Cgref1', 'Ppp1r14a',
             'Col16a1', 'Clec11a', 'Creb3l3'],  # c1, c2 > c3
    'FIB2': ['Dcn', 'Lum', 'Igfbp7', 'Cd63', 'Mfap4', 'Aebp1', 'Mt1', 'Fth1', 'Cyp2f2', 'Ccl19',
             'Rarres2', 'Mt2'],  # c3, b/c > c1, c2
    'FIB3': ['Gpx3', 'Cygb', 'F3', 'Gsn', 'Cxcl12', 'Dpt', 'Myoc', 'Tmeff2',
             'Hmcn2', 'Mgst1', 'Fxyd6', 'S100a6'],  # b4 > b1 > b3 > b2 > a
    'FIB4': ['Anxa3', 'Akr1c18', 'Plac8', 'Pla1a', 'Ifi27l2a', 'Ifi205', 'Sfrp4', 'Prss23', 'Mfap5',
             'Ackr3', 'Smpd3', 'Igfbp6'],  # a > b4
    'tDP': ['Crabp1', 'Notum', 'Pappa2', 'Rasd1', 'Ramp3', 'Slc26a7'],  # d3 > d2, d4
    'aDP': ['Corin', 'Nrg2', 'Cntn1', 'Nrg2', 'Cntn1', 'Ptprz1'],  # d3 > d4
    'DS1': ['Abi3bp', 'Ramp1', 'Mylk', 'Prelid2', 'Dusp14', 'Enpp2', 'Tpcn2'],  # d2 > d4, d3
    'DS2': ['Tagln', 'Lrrc15', 'Acta2', 'Wif1', 'Corin', 'Stmn2', 'Tpm2', 'F2r', 'Actg2', 'Myl9',
            'Ptger3', 'Adamts18'],  # d4
}

assign_cats(joost_2020_ctrl_mouse_fb, column_groupby='leiden', dict_cats=dict_mapping_joost, min_score=0.4, key_added='mapping_joost', 
            intermediate_states=False, diff=0.15, others_name='U', quantile_gene_sel=0.85)
joost_2020_ctrl_mouse_fb.obs['mapping_joost'] = joost_2020_ctrl_mouse_fb.obs['mapping_joost'].astype('category')

sc.pl.umap(joost_2020_ctrl_mouse_fb, color=['mapping_joost', 'cluster'], legend_loc='on data', frameon=False, cmap=magma)

# for cat, genes in dict_mapping_joost.items():
#     print(cat)
#     sc.pl.umap(joost_2020_ctrl_mouse_fb, color=['mapping_joost', 'cluster'] + genes, legend_loc='on data', frameon=False, cmap=magma)

In [None]:
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(joost_2020_ctrl_mouse_fb, color=['cluster'] + [i for i in val if i in joost_2020_ctrl_mouse_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
plot_score_graph(joost_2020_ctrl_mouse_fb)

In [None]:
clear_adata(joost_2020_ctrl_mouse_fb)
joost_2020_ctrl_mouse_fb.write_h5ad(joost_2020_dir + '/joost_2020_ctrl_mouse_fb_processed.h5')
joost_2020_ctrl_mouse.write_h5ad(joost_2020_dir + '/joost_2020_ctrl_mouse_processed.h5')

In [None]:
joost_2020_ctrl_mouse_fb = sc.read(joost_2020_dir + '/joost_2020_ctrl_mouse_fb_processed.h5')
joost_2020_ctrl_mouse = sc.read(joost_2020_dir + '/joost_2020_ctrl_mouse_processed.h5')

## Leyva-Castillo et al. 2022  [WARNING!!! BALB/C MICE, THERE ARE MAJOR POPULATION DIFFERENCES]

In [None]:
# NOTE(review): the folder is spelled 'leyca_castillo_2022' while the files read from and written
# to it use 'leyva_castillo_2022'; confirm the name of the directory on disk before changing either.
leyva_castillo_2022_dir = data_dir + '/leyca_castillo_2022'

In [None]:
leyva_castillo_2022_ctrl_mouse = sc.read(f"{leyva_castillo_2022_dir}/leyva_castillo_2022_SAL.h5")

In [None]:
# Basic QC filtering
leyva_castillo_2022_ctrl_mouse.var['mt'] = leyva_castillo_2022_ctrl_mouse.var_names.str.startswith('mt-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(leyva_castillo_2022_ctrl_mouse, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(leyva_castillo_2022_ctrl_mouse, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(leyva_castillo_2022_ctrl_mouse, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(leyva_castillo_2022_ctrl_mouse, x='log1p_total_counts', y='log1p_n_genes_by_counts')

In [None]:
# QC filter: keep cells with 5.7 < log1p(n_genes) < 8.2 and 1 % < mitochondrial counts < 20 %.
# The three successive subsettings are merged into one mask and the result is copied, so
# the in-place normalisation that follows works on a real AnnData object instead of a
# view of a view of the unfiltered data.
leyva_castillo_2022_ctrl_mouse = leyva_castillo_2022_ctrl_mouse[(
    (leyva_castillo_2022_ctrl_mouse.obs.log1p_n_genes_by_counts < 8.2)
    & (leyva_castillo_2022_ctrl_mouse.obs.log1p_n_genes_by_counts > 5.7)
    & (leyva_castillo_2022_ctrl_mouse.obs.pct_counts_mt < 20)
    & (leyva_castillo_2022_ctrl_mouse.obs.pct_counts_mt > 1)
).values, :].copy()

In [None]:
sc.pp.filter_genes(leyva_castillo_2022_ctrl_mouse, min_counts=1)
sc.pp.normalize_total(leyva_castillo_2022_ctrl_mouse)
sc.pp.log1p(leyva_castillo_2022_ctrl_mouse)

In [None]:
# Whole-dataset pass: 50-component PCA, Harmony integration keyed on 'Internal sample identifier'
# (the neighbours step reads its result through use_rep='X_pca_harmony'), a cosine kNN graph
# with k = int(0.5 * sqrt(n_cells) // 4), then triku.
# presumably triku selects the feature genes used downstream - TODO confirm
sc.pp.pca(leyva_castillo_2022_ctrl_mouse, random_state=seed, n_comps=50)
sce.pp.harmony_integrate(leyva_castillo_2022_ctrl_mouse, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(leyva_castillo_2022_ctrl_mouse, use_rep='X_pca_harmony', n_neighbors=int(0.5 * len(leyva_castillo_2022_ctrl_mouse) ** 0.5 // 4), metric='cosine')
tk.tl.triku(leyva_castillo_2022_ctrl_mouse)

In [None]:
sc.tl.umap(leyva_castillo_2022_ctrl_mouse, min_dist=0.3, random_state=seed)

In [None]:
sc.tl.leiden(leyva_castillo_2022_ctrl_mouse, resolution=12, random_state=seed)

In [None]:
leyva_castillo_2022_ctrl_mouse.obs['Internal sample identifier']

In [None]:
assign_cats(leyva_castillo_2022_ctrl_mouse, dict_cats=dict_cats_fb_mouse, min_score=0.5)
sc.pl.umap(leyva_castillo_2022_ctrl_mouse, color=['leiden', 'Age', 'Internal sample identifier', 'assigned_cats', 'Pdgfra', 'Lum', 'Col1a1', 'Coch', 'Vim'], legend_loc='on data', cmap=magma, ncols=2, 
           use_raw=False, )

In [None]:
for key, val in dict_cats_fb_mouse.items():
    print(key)
    sc.pl.umap(leyva_castillo_2022_ctrl_mouse, color=['assigned_cats'] + [i for i in val if i in leyva_castillo_2022_ctrl_mouse.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
# Keep only the cells labelled 'fibro', 'fibro_2' or 'fibro_acan' in obs['assigned_cats'].
# .copy() detaches the subset from the full dataset before the in-place steps that follow.
leyva_castillo_2022_ctrl_mouse_fb = leyva_castillo_2022_ctrl_mouse[leyva_castillo_2022_ctrl_mouse.obs['assigned_cats'].isin(['fibro', 'fibro_2', 'fibro_acan'])].copy()

In [None]:
sc.pp.filter_genes(leyva_castillo_2022_ctrl_mouse_fb, min_counts=1)

In [None]:
# PCA -> Harmony -> cosine kNN graph (k = int(0.5 * sqrt(n_cells) // 4)) is computed twice,
# once before and once after triku.
# presumably the first pass gives triku the graph it needs and the second pass recomputes
# the embedding on the genes triku selected - TODO confirm
sc.pp.pca(leyva_castillo_2022_ctrl_mouse_fb, random_state=seed, n_comps=50)
sce.pp.harmony_integrate(leyva_castillo_2022_ctrl_mouse_fb, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(leyva_castillo_2022_ctrl_mouse_fb, use_rep='X_pca_harmony', n_neighbors=int(0.5 * len(leyva_castillo_2022_ctrl_mouse_fb) ** 0.5 // 4), metric='cosine')
tk.tl.triku(leyva_castillo_2022_ctrl_mouse_fb) 

sc.pp.pca(leyva_castillo_2022_ctrl_mouse_fb, random_state=seed, n_comps=50)
sce.pp.harmony_integrate(leyva_castillo_2022_ctrl_mouse_fb, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(leyva_castillo_2022_ctrl_mouse_fb, use_rep='X_pca_harmony', n_neighbors=int(0.5 * len(leyva_castillo_2022_ctrl_mouse_fb) ** 0.5 // 4), metric='cosine')

In [None]:
sc.tl.umap(leyva_castillo_2022_ctrl_mouse_fb, random_state=seed)

In [None]:
sc.tl.leiden(leyva_castillo_2022_ctrl_mouse_fb, resolution=8, random_state=seed)

In [None]:
sc.pl.umap(leyva_castillo_2022_ctrl_mouse_fb, color=['leiden'], 
           legend_loc='on data', cmap=magma, use_raw=False, ncols=2)

In [None]:
sc.pl.umap(leyva_castillo_2022_ctrl_mouse_fb, color=['leiden'] + ['Ccl19', 'Ndufa4l2', 'Mfap4', 'Eln', 'Tnmd', 'Cyp2f2', 'Tsc22d3', 'Rtn4r', 'Ltbp2', 'Gpm6b', 'Fam180a', 'Lsamp', 'Pid1', 'Lgr5', 'Sectm1a', 'Prkcb', 'Entpd1',  'Serpina3c', 'Myo1b', 'H2-Q7', 'Bmp4', 'Cystm1', 'Cyp1b1', 'Wnt5b', 'Id1', 'Syt13', 'Etv1', 'Eya1'], 
           legend_loc='on data', cmap=magma, use_raw=False, ncols=3)

In [None]:
# First call: label cells with the marker sets of dict_cats_clusters, stored in obs['cluster'].
# Second call: group by those cluster labels and assign the dict_cats_axes categories, stored in obs['axis'].
# others_name='U' is presumably the label given to cells that match no category - TODO confirm.
# 'cluster' is cast to categorical because the colour assignment in a later cell reads .cat.categories.
assign_cats(leyva_castillo_2022_ctrl_mouse_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.75, key_added='cluster', others_name='U')
assign_cats(leyva_castillo_2022_ctrl_mouse_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, key_added='axis', intermediate_states=True, diff=0.15, others_name='U')
leyva_castillo_2022_ctrl_mouse_fb.obs['cluster'] = leyva_castillo_2022_ctrl_mouse_fb.obs['cluster'].astype('category')

In [None]:
# Remove the obs columns 'a'-'d' (presumably per-axis scores left behind by assign_cats - confirm).
# The membership check makes the cell safe to re-run: a second execution no longer raises KeyError.
for cluster in ['a', 'b', 'c', 'd']:
    if cluster in leyva_castillo_2022_ctrl_mouse_fb.obs.columns:
        del leyva_castillo_2022_ctrl_mouse_fb.obs[cluster]

In [None]:
leyva_castillo_2022_ctrl_mouse_fb.uns['cluster_colors'] = [dict_colors_mouse[i] if i in dict_colors_mouse else '#bcbcbc' for i in leyva_castillo_2022_ctrl_mouse_fb.obs['cluster'].cat.categories]

In [None]:
sc.pl.umap(leyva_castillo_2022_ctrl_mouse_fb, color=['Internal sample identifier', 'Age', 'leiden', 'axis', 'cluster'], legend_loc='on data', 
           cmap=magma, use_raw=False, ncols=2)

In [None]:
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(leyva_castillo_2022_ctrl_mouse_fb, color=['cluster'] + [i for i in val if i in leyva_castillo_2022_ctrl_mouse_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
plot_score_graph(leyva_castillo_2022_ctrl_mouse_fb)

In [None]:
clear_adata(leyva_castillo_2022_ctrl_mouse_fb)
leyva_castillo_2022_ctrl_mouse_fb.write_h5ad(leyva_castillo_2022_dir + '/leyva_castillo_2022_ctrl_mouse_fb_processed.h5')
leyva_castillo_2022_ctrl_mouse.write_h5ad(leyva_castillo_2022_dir + '/leyva_castillo_2022_ctrl_mouse_processed.h5')

In [None]:
leyva_castillo_2022_ctrl_mouse_fb = sc.read(leyva_castillo_2022_dir + '/leyva_castillo_2022_ctrl_mouse_fb_processed.h5')
leyva_castillo_2022_ctrl_mouse = sc.read(leyva_castillo_2022_dir + '/leyva_castillo_2022_ctrl_mouse_processed.h5')

## Lin et al. 2022 [WARNING!!! VERY FEW CELLS]

In [None]:
lin_2022_dir = data_dir + '/lin_2022'

In [None]:
lin_2022_ctrl_mouse = sc.read(lin_2022_dir + '/lin_2022_ctrl.h5')
lin_2022_ctrl_mouse.var_names_make_unique()

In [None]:
# Basic QC filtering
lin_2022_ctrl_mouse.var['mt'] = lin_2022_ctrl_mouse.var_names.str.startswith('mt-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(lin_2022_ctrl_mouse, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(lin_2022_ctrl_mouse, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(lin_2022_ctrl_mouse, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(lin_2022_ctrl_mouse, x='total_counts', y='n_genes_by_counts')

In [None]:
# QC filter: keep cells with 1000 < n_genes < 3700 and < 9 % mitochondrial counts.
# Both subsettings are merged into one mask and the result is copied, so the in-place
# normalisation that follows works on a real AnnData object instead of a view.
lin_2022_ctrl_mouse = lin_2022_ctrl_mouse[(
    (lin_2022_ctrl_mouse.obs.n_genes_by_counts < 3700)
    & (lin_2022_ctrl_mouse.obs.n_genes_by_counts > 1000)
    & (lin_2022_ctrl_mouse.obs.pct_counts_mt < 9)
).values, :].copy()

In [None]:
sc.pp.filter_genes(lin_2022_ctrl_mouse, min_counts=1)
sc.pp.normalize_total(lin_2022_ctrl_mouse)
sc.pp.log1p(lin_2022_ctrl_mouse)

In [None]:
# Whole-dataset pass: 50-component PCA, then a cosine kNN graph with k = int(0.5 * sqrt(n_cells) // 2),
# then triku. No Harmony step is applied to this dataset.
# presumably triku selects the feature genes used downstream - TODO confirm
sc.pp.pca(lin_2022_ctrl_mouse, random_state=seed, n_comps=50)
sc.pp.neighbors(lin_2022_ctrl_mouse, n_neighbors=int(0.5 * len(lin_2022_ctrl_mouse) ** 0.5 // 2), metric='cosine')
tk.tl.triku(lin_2022_ctrl_mouse)

In [None]:
sc.tl.umap(lin_2022_ctrl_mouse, min_dist=0.2, random_state=seed)
sc.tl.leiden(lin_2022_ctrl_mouse, resolution=4, random_state=seed)

In [None]:
# fraction=1 keeps every cell, so this call does not reduce the data; it is presumably used
# only to shuffle the cell order in place before plotting - TODO confirm.
# NOTE(review): uses random_state=0 rather than the notebook-wide `seed`.
sc.pp.subsample(lin_2022_ctrl_mouse, fraction=1, random_state=0, copy=False)
sc.pl.umap(lin_2022_ctrl_mouse, color=['leiden', 'Pdgfra'], legend_loc='on data', cmap=magma)

In [None]:
assign_cats(lin_2022_ctrl_mouse, dict_cats=dict_cats_fb_mouse, min_score=0.35, quantile_gene_sel=0.85)

In [None]:
sc.pl.umap(lin_2022_ctrl_mouse, color=['leiden', 'assigned_cats'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
for key, val in dict_cats_fb_mouse.items():
    print(key)
    sc.pl.umap(lin_2022_ctrl_mouse, color=['assigned_cats'] + [i for i in val if i in lin_2022_ctrl_mouse.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
# Keep only the cells labelled 'fibro' / 'fibro_2' in obs['assigned_cats'].
# .copy() detaches the subset from the full dataset before the in-place steps that follow.
lin_2022_ctrl_mouse_fb = lin_2022_ctrl_mouse[lin_2022_ctrl_mouse.obs['assigned_cats'].isin(['fibro', 'fibro_2'])].copy()

In [None]:
sc.pp.filter_genes(lin_2022_ctrl_mouse_fb, min_counts=1)

In [None]:
# PCA + cosine kNN graph (k = int(sqrt(n_cells) // 6)) are computed twice, once before and once after triku.
# presumably the first pass gives triku the graph it needs and the second pass recomputes
# the embedding on the genes triku selected - TODO confirm
sc.pp.pca(lin_2022_ctrl_mouse_fb, random_state=seed, n_comps=30)
sc.pp.neighbors(lin_2022_ctrl_mouse_fb,  n_neighbors=int(len(lin_2022_ctrl_mouse_fb) ** 0.5 // 6), metric='cosine')
tk.tl.triku(lin_2022_ctrl_mouse_fb)
sc.pp.pca(lin_2022_ctrl_mouse_fb, random_state=seed, n_comps=30)
sc.pp.neighbors(lin_2022_ctrl_mouse_fb,  n_neighbors=int(len(lin_2022_ctrl_mouse_fb) ** 0.5 // 6), metric='cosine')

In [None]:
sc.tl.umap(lin_2022_ctrl_mouse_fb, min_dist=0.1, random_state=seed)

In [None]:
# Leiden clustering of the fibroblast subset at resolution 10; the result is plotted as 'leiden' below.
sc.tl.leiden(lin_2022_ctrl_mouse_fb, resolution=10, random_state=seed)

In [None]:
# First call: label cells with the marker sets of dict_cats_clusters, stored in obs['cluster'].
# Second call: group by those cluster labels and assign the dict_cats_axes categories, stored in obs['axis'].
# others_name='U' is presumably the label given to cells that match no category - TODO confirm.
# 'cluster' is cast to categorical because the colour assignment in the next cell reads .cat.categories.
assign_cats(lin_2022_ctrl_mouse_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.6, key_added='cluster', others_name='U')
assign_cats(lin_2022_ctrl_mouse_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, key_added='axis', intermediate_states=True, diff=0.15, others_name='U')
lin_2022_ctrl_mouse_fb.obs['cluster'] = lin_2022_ctrl_mouse_fb.obs['cluster'].astype('category')

In [None]:
lin_2022_ctrl_mouse_fb.uns['cluster_colors'] = [dict_colors_mouse[i] if i in dict_colors_mouse else '#bcbcbc' for  i in lin_2022_ctrl_mouse_fb.obs['cluster'].cat.categories]

In [None]:
sc.pl.umap(lin_2022_ctrl_mouse_fb, color=['Internal sample identifier', 'leiden', 'axis', 'cluster'], legend_loc='on data', 
           cmap=magma, use_raw=False, ncols=3)

In [None]:
# Remove the obs columns 'a'-'d' (presumably per-axis scores left behind by assign_cats - confirm).
# The membership check makes the cell safe to re-run: a second execution no longer raises KeyError.
for cluster in ['a', 'b', 'c', 'd']:
    if cluster in lin_2022_ctrl_mouse_fb.obs.columns:
        del lin_2022_ctrl_mouse_fb.obs[cluster]

In [None]:
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(lin_2022_ctrl_mouse_fb, color=['cluster'] + [i for i in val if i in lin_2022_ctrl_mouse_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
plot_score_graph(lin_2022_ctrl_mouse_fb)

In [None]:
# Clean the fibroblast object with clear_adata, then persist both the fibroblast subset
# and the full dataset as h5ad files.
# NOTE(review): the lin_2022 objects are written into buechler_2021_dir rather than a
# lin_2022-specific directory - this looks like a copy-paste leftover; confirm the
# intended output location before relying on these files.
clear_adata(lin_2022_ctrl_mouse_fb)
lin_2022_ctrl_mouse_fb.write_h5ad(buechler_2021_dir + '/lin_2022_ctrl_mouse_fb_processed.h5')
lin_2022_ctrl_mouse.write_h5ad(buechler_2021_dir + '/lin_2022_ctrl_mouse_processed.h5')

In [None]:
# Reload the processed lin_2022 objects from disk.
# NOTE(review): the files are read from buechler_2021_dir, not a lin_2022-specific
# directory - confirm that this location is intended.
lin_2022_ctrl_mouse_fb = sc.read(buechler_2021_dir + '/lin_2022_ctrl_mouse_fb_processed.h5')
lin_2022_ctrl_mouse = sc.read(buechler_2021_dir + '/lin_2022_ctrl_mouse_processed.h5')

## Phan 2020

In [None]:
phan_2020_dir = data_dir + '/phan_2020'

In [None]:
phan_2020_ctrl_mouse_21d = sc.read(f"{phan_2020_dir}/phan_2020_ctrl_mouse_21d.h5")

In [None]:
phan_2020_ctrl_mouse_21d

In [None]:
# Basic QC filtering
phan_2020_ctrl_mouse_21d.var['mt'] = phan_2020_ctrl_mouse_21d.var_names.str.startswith('mt-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(phan_2020_ctrl_mouse_21d, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(phan_2020_ctrl_mouse_21d, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(phan_2020_ctrl_mouse_21d, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(phan_2020_ctrl_mouse_21d, x='total_counts', y='n_genes_by_counts')

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': phan_2020_ctrl_mouse_21d.obs['Internal sample identifier'], 'y': phan_2020_ctrl_mouse_21d.obs['log1p_n_genes_by_counts']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': phan_2020_ctrl_mouse_21d.obs['Internal sample identifier'], 'y': phan_2020_ctrl_mouse_21d.obs['pct_counts_mt']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
# QC on genes detected per cell: the three samples share the same bounds
# (6.2 < log1p_n_genes_by_counts < 8), so they are selected together.
phan_2020_ctrl_mouse_21d = phan_2020_ctrl_mouse_21d[(
    phan_2020_ctrl_mouse_21d.obs['Internal sample identifier'].isin(['P21_Un_1', 'P21_Un_2', 'P21_Un_3'])
    & (phan_2020_ctrl_mouse_21d.obs.log1p_n_genes_by_counts > 6.2)
    & (phan_2020_ctrl_mouse_21d.obs.log1p_n_genes_by_counts < 8)
).values, :]
# Then keep only the cells with pct_counts_mt below 5.
phan_2020_ctrl_mouse_21d = phan_2020_ctrl_mouse_21d[phan_2020_ctrl_mouse_21d.obs.pct_counts_mt < 5, :]

In [None]:
sc.pp.filter_genes(phan_2020_ctrl_mouse_21d, min_counts=1)
sc.pp.normalize_total(phan_2020_ctrl_mouse_21d)
sc.pp.log1p(phan_2020_ctrl_mouse_21d)

In [None]:
sc.pp.pca(phan_2020_ctrl_mouse_21d, random_state=seed, n_comps=50)
sce.pp.harmony_integrate(phan_2020_ctrl_mouse_21d, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(phan_2020_ctrl_mouse_21d, use_rep='X_pca_harmony', n_neighbors=int(0.5 * len(phan_2020_ctrl_mouse_21d) ** 0.5), metric='cosine')
tk.tl.triku(phan_2020_ctrl_mouse_21d)

In [None]:
sc.tl.umap(phan_2020_ctrl_mouse_21d, min_dist=0.3, random_state=seed)

In [None]:
sc.tl.leiden(phan_2020_ctrl_mouse_21d, resolution=2, random_state=seed)

In [None]:
assign_cats(phan_2020_ctrl_mouse_21d, dict_cats=dict_cats_fb_mouse, min_score=0.5)
sc.pl.umap(phan_2020_ctrl_mouse_21d, color=['leiden', 'Internal sample identifier', 'assigned_cats', 'Pdgfra', 'Lum', 'Dcn', 'Mpz', 'Plp1', 'Sfrp5', 'Chil1'], legend_loc='on data', cmap=magma, ncols=2, 
           use_raw=False, )

In [None]:
# For every entry of dict_cats_fb_mouse, show the assigned category next to the
# listed genes that are actually present in this dataset's var_names.
for key, val in dict_cats_fb_mouse.items():
    print(key)
    genes_in_dataset = [gene for gene in val if gene in phan_2020_ctrl_mouse_21d.var_names]
    sc.pl.umap(
        phan_2020_ctrl_mouse_21d,
        color=['assigned_cats'] + genes_in_dataset,
        legend_loc='on data',
        cmap=magma,
        use_raw=False,
        ncols=4,
    )

In [None]:
# Keep only the cells whose assigned category is 'fibro' or 'fibro_2'.
# `.copy()` turns the subset into an independent AnnData instead of a view of
# phan_2020_ctrl_mouse_21d, so the in-place steps that follow (gene filtering, PCA, ...)
# do not have to implicitly copy a view when they modify it.
phan_2020_ctrl_mouse_21d_fb = phan_2020_ctrl_mouse_21d[phan_2020_ctrl_mouse_21d.obs['assigned_cats'].isin(['fibro', 'fibro_2'])].copy()

In [None]:
sc.pp.filter_genes(phan_2020_ctrl_mouse_21d_fb, min_counts=1)

In [None]:
sc.pp.pca(phan_2020_ctrl_mouse_21d_fb, random_state=seed, n_comps=25)
sce.pp.harmony_integrate(phan_2020_ctrl_mouse_21d_fb, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(phan_2020_ctrl_mouse_21d_fb, use_rep='X_pca_harmony', n_neighbors=int(len(phan_2020_ctrl_mouse_21d_fb) ** 0.5 // 4), metric='cosine')
tk.tl.triku(phan_2020_ctrl_mouse_21d_fb)

In [None]:
sc.tl.umap(phan_2020_ctrl_mouse_21d_fb, min_dist=0.45, random_state=seed)

In [None]:
sc.tl.leiden(phan_2020_ctrl_mouse_21d_fb, resolution=9, random_state=seed)

In [None]:
sc.pl.umap(phan_2020_ctrl_mouse_21d_fb, color=['leiden',  'Internal sample identifier'], 
           legend_loc='on data', cmap=magma, use_raw=False, ncols=3)

In [None]:
sc.tl.rank_genes_groups(phan_2020_ctrl_mouse_21d_fb, groupby='leiden', method='wilcoxon')

In [None]:
sc.pl.umap(phan_2020_ctrl_mouse_21d_fb, color=['leiden', 'Eln', 'Pi16', 'Nrep', 'Gas1', 'Cilp', 'Osr1', 'Hmcn1', 'Mmp16', 'Slit2', 'Ccl19', 'Tspan11', 'Rnf112', 'Col8a1', 'Tspan18', 'Il15', 'Mme', 'Col6a6', 'Fam69a',] , 
           legend_loc='on data', cmap=magma, use_raw=False, ncols=3)

In [None]:
assign_cats(phan_2020_ctrl_mouse_21d_fb, dict_cats=dict_cats_clusters, min_score=0.35, quantile_gene_sel=0.75, key_added='cluster', others_name='U')
assign_cats(phan_2020_ctrl_mouse_21d_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, key_added='axis', intermediate_states=True, diff=0.15, others_name='U')
phan_2020_ctrl_mouse_21d_fb.obs['cluster'] = phan_2020_ctrl_mouse_21d_fb.obs['cluster'].astype('category')

In [None]:
sc.pl.umap(phan_2020_ctrl_mouse_21d_fb, color=['leiden',  'cluster'], legend_loc='on data', cmap=magma, use_raw=False, ncols=3)

In [None]:
# One colour per 'cluster' category, looked up in dict_colors_mouse;
# categories without an entry fall back to grey ('#bcbcbc').
phan_2020_ctrl_mouse_21d_fb.uns['cluster_colors'] = [
    dict_colors_mouse.get(category, '#bcbcbc')
    for category in phan_2020_ctrl_mouse_21d_fb.obs['cluster'].cat.categories
]

In [None]:
# We remove these clusters ('U' and 'x1') because they are more "neuro-like" and do not
# appear in any other dataset.
# `.copy()` makes the result an independent AnnData rather than a view, because the
# object is filtered and re-processed in place in the following cells.
phan_2020_ctrl_mouse_21d_fb = phan_2020_ctrl_mouse_21d_fb[~ phan_2020_ctrl_mouse_21d_fb.obs['cluster'].isin(['U', 'x1'])].copy()

In [None]:
sc.pp.filter_genes(phan_2020_ctrl_mouse_21d_fb, min_counts=1)

In [None]:
# First pass: 25-component PCA, Harmony integration keyed on 'Internal sample identifier',
# and a cosine kNN graph on the Harmony embedding ('X_pca_harmony'), computed before triku.
# n_neighbors is floor(sqrt(len(adata)) / 2).
sc.pp.pca(phan_2020_ctrl_mouse_21d_fb, random_state=seed, n_comps=25)
sce.pp.harmony_integrate(phan_2020_ctrl_mouse_21d_fb, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(phan_2020_ctrl_mouse_21d_fb, use_rep='X_pca_harmony', n_neighbors=int(len(phan_2020_ctrl_mouse_21d_fb) ** 0.5 // 2), metric='cosine')
tk.tl.triku(phan_2020_ctrl_mouse_21d_fb)

# Second pass: the same three steps are repeated with identical parameters after triku,
# presumably so that they are based on triku's selected features - TODO confirm.
sc.pp.pca(phan_2020_ctrl_mouse_21d_fb, random_state=seed, n_comps=25)
sce.pp.harmony_integrate(phan_2020_ctrl_mouse_21d_fb, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(phan_2020_ctrl_mouse_21d_fb, use_rep='X_pca_harmony', n_neighbors=int(len(phan_2020_ctrl_mouse_21d_fb) ** 0.5 // 2), metric='cosine')

In [None]:
sc.tl.umap(phan_2020_ctrl_mouse_21d_fb, min_dist=0.9, random_state=seed)

In [None]:
sc.tl.leiden(phan_2020_ctrl_mouse_21d_fb, resolution=8.5, random_state=seed)

In [None]:
sc.pl.umap(phan_2020_ctrl_mouse_21d_fb, color=['leiden',  'Internal sample identifier'], 
           legend_loc='on data', cmap=magma, use_raw=False, ncols=3)

In [None]:
sc.tl.rank_genes_groups(phan_2020_ctrl_mouse_21d_fb, groupby='leiden', method='wilcoxon')

In [None]:
assign_cats(phan_2020_ctrl_mouse_21d_fb, dict_cats=dict_cats_clusters, min_score=0.45, quantile_gene_sel=0.9, key_added='cluster', others_name='U')
assign_cats(phan_2020_ctrl_mouse_21d_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, key_added='axis', intermediate_states=True, diff=0.15, others_name='U')
phan_2020_ctrl_mouse_21d_fb.obs['cluster'] = phan_2020_ctrl_mouse_21d_fb.obs['cluster'].astype('category')

In [None]:
# One colour per 'cluster' category, looked up in dict_colors_mouse;
# categories without an entry fall back to grey ('#bcbcbc').
phan_2020_ctrl_mouse_21d_fb.uns['cluster_colors'] = [
    dict_colors_mouse.get(category, '#bcbcbc')
    for category in phan_2020_ctrl_mouse_21d_fb.obs['cluster'].cat.categories
]

In [None]:
sc.pl.umap(phan_2020_ctrl_mouse_21d_fb, color=['Internal sample identifier', 'leiden', 'cluster', 'axis'], legend_loc='on data', 
           cmap=magma, use_raw=False, ncols=3)

In [None]:
for cluster in ['a', 'b', 'c', 'd']:
    del phan_2020_ctrl_mouse_21d_fb.obs[cluster]

In [None]:
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(phan_2020_ctrl_mouse_21d_fb, color=['cluster'] + [i for i in val if i in phan_2020_ctrl_mouse_21d_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
plot_score_graph(phan_2020_ctrl_mouse_21d_fb)

In [None]:
clear_adata(phan_2020_ctrl_mouse_21d_fb)
phan_2020_ctrl_mouse_21d_fb.write_h5ad(phan_2020_dir + '/phan_2020_ctrl_mouse_21d_fb_processed.h5')
phan_2020_ctrl_mouse_21d.write_h5ad(phan_2020_dir + '/phan_2020_ctrl_mouse_21d_processed.h5')

In [None]:
phan_2020_ctrl_mouse_21d_fb = sc.read(phan_2020_dir + '/phan_2020_ctrl_mouse_21d_fb_processed.h5')
phan_2020_ctrl_mouse_21d = sc.read(phan_2020_dir + '/phan_2020_ctrl_mouse_21d_processed.h5')

## Salzer 2018

In [None]:
salzer_2018_dir = data_dir + '/salzer_2018'

In [None]:
salzer_2018_young_old_mouse = sc.read(f"{salzer_2018_dir}/salzer_2018_young_old_mouse.h5")

In [None]:
# Basic QC filtering
salzer_2018_young_old_mouse.var['mt'] = salzer_2018_young_old_mouse.var_names.str.startswith('mt-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(salzer_2018_young_old_mouse, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(salzer_2018_young_old_mouse, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(salzer_2018_young_old_mouse, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(salzer_2018_young_old_mouse, x='total_counts', y='n_genes_by_counts')

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': salzer_2018_young_old_mouse.obs['Internal sample identifier'], 'y': salzer_2018_young_old_mouse.obs['log1p_n_genes_by_counts']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': salzer_2018_young_old_mouse.obs['Internal sample identifier'], 'y': salzer_2018_young_old_mouse.obs['pct_counts_mt']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
# QC on genes detected per cell: keep cells with 6.2 < log1p_n_genes_by_counts < 8.2
# (same bounds for every sample).
salzer_2018_young_old_mouse = salzer_2018_young_old_mouse[
    (
        (salzer_2018_young_old_mouse.obs['log1p_n_genes_by_counts'] > 6.2)
        & (salzer_2018_young_old_mouse.obs['log1p_n_genes_by_counts'] < 8.2)
    ).values,
    :
]
# Then keep only the cells with pct_counts_mt below 5.
salzer_2018_young_old_mouse = salzer_2018_young_old_mouse[
    salzer_2018_young_old_mouse.obs['pct_counts_mt'] < 5, :
]

In [None]:
sc.pp.filter_genes(salzer_2018_young_old_mouse, min_counts=1)
sc.pp.normalize_total(salzer_2018_young_old_mouse)
sc.pp.log1p(salzer_2018_young_old_mouse)

In [None]:
sc.pp.pca(salzer_2018_young_old_mouse, random_state=seed, n_comps=50)
sce.pp.harmony_integrate(salzer_2018_young_old_mouse, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(salzer_2018_young_old_mouse, use_rep='X_pca_harmony', n_neighbors=int(0.5 * len(salzer_2018_young_old_mouse) ** 0.5), metric='cosine')
tk.tl.triku(salzer_2018_young_old_mouse)

In [None]:
sc.tl.umap(salzer_2018_young_old_mouse, min_dist=0.3, random_state=seed)

In [None]:
sc.tl.leiden(salzer_2018_young_old_mouse, resolution=0.5, random_state=seed)

In [None]:
assign_cats(salzer_2018_young_old_mouse, dict_cats=dict_cats_fb_mouse, min_score=0.5)
sc.pl.umap(salzer_2018_young_old_mouse, color=['leiden', 'Internal sample identifier', 'assigned_cats', 'Pdgfra', 'Lum', 'Dcn',], legend_loc='on data', cmap=magma, ncols=2, 
           use_raw=False, )

In [None]:
for key, val in dict_cats_fb_mouse.items():
    print(key)
    sc.pl.umap(salzer_2018_young_old_mouse, color=['assigned_cats'] + [i for i in val if i in salzer_2018_young_old_mouse.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
# Keep only the cells whose assigned category is 'fibro'.
# `.copy()` turns the subset into an independent AnnData instead of a view of
# salzer_2018_young_old_mouse, so the in-place steps that follow (gene filtering, PCA, ...)
# do not have to implicitly copy a view when they modify it.
salzer_2018_young_old_mouse_fb = salzer_2018_young_old_mouse[salzer_2018_young_old_mouse.obs['assigned_cats'].isin(['fibro'])].copy()

In [None]:
sc.pp.filter_genes(salzer_2018_young_old_mouse_fb, min_counts=1)

In [None]:
# First pass: 50-component PCA, Harmony integration keyed on 'Internal sample identifier',
# and a cosine kNN graph on the Harmony embedding ('X_pca_harmony'), computed before triku.
# n_neighbors is int(0.5 * sqrt(len(adata))).
sc.pp.pca(salzer_2018_young_old_mouse_fb, random_state=seed, n_comps=50)
sce.pp.harmony_integrate(salzer_2018_young_old_mouse_fb, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(salzer_2018_young_old_mouse_fb, use_rep='X_pca_harmony', n_neighbors=int(0.5 * len(salzer_2018_young_old_mouse_fb) ** 0.5), metric='cosine')
tk.tl.triku(salzer_2018_young_old_mouse_fb)

# Second pass: the same three steps are repeated with identical parameters after triku,
# presumably so that they are based on triku's selected features - TODO confirm.
sc.pp.pca(salzer_2018_young_old_mouse_fb, random_state=seed, n_comps=50)
sce.pp.harmony_integrate(salzer_2018_young_old_mouse_fb, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(salzer_2018_young_old_mouse_fb, use_rep='X_pca_harmony', n_neighbors=int(0.5 * len(salzer_2018_young_old_mouse_fb) ** 0.5), metric='cosine')

In [None]:
sc.tl.umap(salzer_2018_young_old_mouse_fb, min_dist=0.45, random_state=seed)

In [None]:
sc.tl.leiden(salzer_2018_young_old_mouse_fb, resolution=2, random_state=seed)

In [None]:
sc.pl.umap(salzer_2018_young_old_mouse_fb, color=['leiden',  'Internal sample identifier'], 
           legend_loc='on data', cmap=magma, use_raw=False, ncols=3)

In [None]:
sc.tl.rank_genes_groups(salzer_2018_young_old_mouse_fb, groupby='leiden', method='wilcoxon')

In [None]:
assign_cats(salzer_2018_young_old_mouse_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.95, key_added='cluster', others_name='U')
assign_cats(salzer_2018_young_old_mouse_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, key_added='axis', intermediate_states=True, diff=0.15, others_name='U')
salzer_2018_young_old_mouse_fb.obs['cluster'] = salzer_2018_young_old_mouse_fb.obs['cluster'].astype('category')

In [None]:
# One colour per 'cluster' category, looked up in dict_colors_mouse;
# categories without an entry fall back to grey ('#bcbcbc').
salzer_2018_young_old_mouse_fb.uns['cluster_colors'] = [
    dict_colors_mouse.get(category, '#bcbcbc')
    for category in salzer_2018_young_old_mouse_fb.obs['cluster'].cat.categories
]

In [None]:
sc.pl.umap(salzer_2018_young_old_mouse_fb, color=['Internal sample identifier', 'leiden', 'cluster', 'axis'], legend_loc='on data', 
           cmap=magma, use_raw=False, ncols=3)

In [None]:
for cluster in ['a', 'b', 'c', 'd']:
    del salzer_2018_young_old_mouse_fb.obs[cluster]

In [None]:
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(salzer_2018_young_old_mouse_fb, color=['cluster'] + [i for i in val if i in salzer_2018_young_old_mouse_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
plot_score_graph(salzer_2018_young_old_mouse_fb)

In [None]:
clear_adata(salzer_2018_young_old_mouse_fb)
salzer_2018_young_old_mouse_fb.write_h5ad(salzer_2018_dir + '/salzer_2018_young_old_mouse_fb.h5')
salzer_2018_young_old_mouse.write_h5ad(salzer_2018_dir + '/salzer_2018_young_old_mouse.h5')

In [None]:
salzer_2018_young_old_mouse_fb = sc.read(salzer_2018_dir + '/salzer_2018_young_old_mouse_fb.h5')
salzer_2018_young_old_mouse = sc.read(salzer_2018_dir + '/salzer_2018_young_old_mouse.h5')

## Shin et al. 2020

In [None]:
shin_2020_dir = data_dir + '/shin_2020'

In [None]:
shin_2020_ctrl_mouse = sc.read(f"{shin_2020_dir}/shin_2020_ctrl_mouse.h5")

In [None]:
# Basic QC filtering
shin_2020_ctrl_mouse.var['mt'] = shin_2020_ctrl_mouse.var_names.str.startswith('mt-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(shin_2020_ctrl_mouse, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(shin_2020_ctrl_mouse, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(shin_2020_ctrl_mouse, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(shin_2020_ctrl_mouse, x='log1p_total_counts', y='log1p_n_genes_by_counts')

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': shin_2020_ctrl_mouse.obs['Internal sample identifier'], 'y': shin_2020_ctrl_mouse.obs['log1p_n_genes_by_counts']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': shin_2020_ctrl_mouse.obs['Internal sample identifier'], 'y': shin_2020_ctrl_mouse.obs['pct_counts_mt']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
# QC on genes detected per cell, with sample-specific bounds:
# sample -> (lower, upper) limits on log1p_n_genes_by_counts (both exclusive).
_shin_n_genes_bounds = {
    '2mo1': (7.35, 8),
    '18mo': (7.4, 8.1),
    '2mo2': (7.5, 8.5),
    '12mo': (7.5, 8.5),
}
# A cell is kept when it belongs to one of the listed samples and lies within that
# sample's bounds; the per-sample boolean masks are OR-ed together.
_shin_keep = np.logical_or.reduce([
    (
        (shin_2020_ctrl_mouse.obs['Internal sample identifier'] == sample_id)
        & (shin_2020_ctrl_mouse.obs.log1p_n_genes_by_counts < upper)
        & (shin_2020_ctrl_mouse.obs.log1p_n_genes_by_counts > lower)
    ).values
    for sample_id, (lower, upper) in _shin_n_genes_bounds.items()
])
shin_2020_ctrl_mouse = shin_2020_ctrl_mouse[_shin_keep, :]
# Then keep only the cells with pct_counts_mt below 7.
shin_2020_ctrl_mouse = shin_2020_ctrl_mouse[shin_2020_ctrl_mouse.obs.pct_counts_mt < 7, :]

In [None]:
sc.pp.filter_genes(shin_2020_ctrl_mouse, min_counts=1)
sc.pp.normalize_total(shin_2020_ctrl_mouse)
sc.pp.log1p(shin_2020_ctrl_mouse)

In [None]:
sc.pp.pca(shin_2020_ctrl_mouse, random_state=seed, n_comps=50)
sce.pp.harmony_integrate(shin_2020_ctrl_mouse, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(shin_2020_ctrl_mouse, use_rep='X_pca_harmony', n_neighbors=int(0.5 * len(shin_2020_ctrl_mouse) ** 0.5 // 4), metric='cosine')
tk.tl.triku(shin_2020_ctrl_mouse) 

In [None]:
sc.tl.umap(shin_2020_ctrl_mouse, min_dist=0.3, random_state=seed)

In [None]:
sc.tl.leiden(shin_2020_ctrl_mouse, resolution=2, random_state=seed)

In [None]:
assign_cats(shin_2020_ctrl_mouse, dict_cats=dict_cats_fb_mouse, min_score=0.5)
sc.pl.umap(shin_2020_ctrl_mouse, color=['leiden', 'assigned_cats', 'Internal sample identifier', 'Pdgfra', 'Lum', 'Col1a1', 'Coch', 'Vim'], legend_loc='on data', cmap=magma, ncols=2, 
           use_raw=False, )

In [None]:
for key, val in dict_cats_fb_mouse.items():
    print(key)
    sc.pl.umap(shin_2020_ctrl_mouse, color=['assigned_cats'] + [i for i in val if i in shin_2020_ctrl_mouse.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
# Keep only the cells whose assigned category is 'fibro' or 'fibro_2'.
# `.copy()` turns the subset into an independent AnnData instead of a view of
# shin_2020_ctrl_mouse, so the in-place steps that follow (gene filtering, PCA, ...)
# do not have to implicitly copy a view when they modify it.
shin_2020_ctrl_mouse_fb = shin_2020_ctrl_mouse[shin_2020_ctrl_mouse.obs['assigned_cats'].isin(['fibro', 'fibro_2'])].copy()

In [None]:
sc.pp.filter_genes(shin_2020_ctrl_mouse_fb, min_counts=1)

In [None]:
sc.pp.pca(shin_2020_ctrl_mouse_fb, random_state=seed, n_comps=50)
sce.pp.harmony_integrate(shin_2020_ctrl_mouse_fb, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(shin_2020_ctrl_mouse_fb, use_rep='X_pca_harmony', n_neighbors=int(0.5 * len(shin_2020_ctrl_mouse_fb) ** 0.5 // 4), metric='cosine')
tk.tl.triku(shin_2020_ctrl_mouse_fb) 

sc.pp.pca(shin_2020_ctrl_mouse_fb, random_state=seed, n_comps=50)
sce.pp.harmony_integrate(shin_2020_ctrl_mouse_fb, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(shin_2020_ctrl_mouse_fb, use_rep='X_pca_harmony', n_neighbors=int(0.5 * len(shin_2020_ctrl_mouse_fb) ** 0.5 // 4), metric='cosine')

In [None]:
sc.tl.umap(shin_2020_ctrl_mouse_fb, min_dist=0.5, random_state=seed)

In [None]:
sc.tl.leiden(shin_2020_ctrl_mouse_fb, resolution=7, random_state=seed)

In [None]:
sc.pl.umap(shin_2020_ctrl_mouse_fb, color=['leiden'], 
           legend_loc='on data', cmap=magma, use_raw=False, ncols=2)

In [None]:
assign_cats(shin_2020_ctrl_mouse_fb, dict_cats=dict_cats_clusters, min_score=0.45, quantile_gene_sel=0.65, key_added='cluster', others_name='U')
assign_cats(shin_2020_ctrl_mouse_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, key_added='axis', intermediate_states=True, diff=0.15, others_name='U')
shin_2020_ctrl_mouse_fb.obs['cluster'] = shin_2020_ctrl_mouse_fb.obs['cluster'].astype('category')

In [None]:
# One colour per 'cluster' category, looked up in dict_colors_mouse;
# categories without an entry fall back to grey ('#bcbcbc').
shin_2020_ctrl_mouse_fb.uns['cluster_colors'] = [
    dict_colors_mouse.get(category, '#bcbcbc')
    for category in shin_2020_ctrl_mouse_fb.obs['cluster'].cat.categories
]

In [None]:
sc.pl.umap(shin_2020_ctrl_mouse_fb, color=['Internal sample identifier', 'leiden', 'axis', 'cluster'], legend_loc='on data', 
           cmap=magma, use_raw=False, ncols=4)

In [None]:
for cluster in ['a', 'b', 'c', 'd']:
    del shin_2020_ctrl_mouse_fb.obs[cluster]

In [None]:
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(shin_2020_ctrl_mouse_fb, color=['cluster'] + [i for i in val if i in shin_2020_ctrl_mouse_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
plot_score_graph(shin_2020_ctrl_mouse_fb)

In [None]:
clear_adata(shin_2020_ctrl_mouse_fb)
shin_2020_ctrl_mouse_fb.write_h5ad(shin_2020_dir + '/shin_2020_ctrl_mouse_fb_processed.h5')
shin_2020_ctrl_mouse.write_h5ad(shin_2020_dir + '/shin_2020_ctrl_mouse_processed.h5')

In [None]:
shin_2020_ctrl_mouse_fb = sc.read(shin_2020_dir + '/shin_2020_ctrl_mouse_fb_processed.h5')
shin_2020_ctrl_mouse = sc.read(shin_2020_dir + '/shin_2020_ctrl_mouse_processed.h5')

## Shook 2020

In [None]:
shook_2020_dir = data_dir + '/shook_2020'

In [None]:
shook_2020_ctrl_mouse = sc.read(f"{shook_2020_dir}/shook_2020_ctrl_mouse.h5")

In [None]:
# Basic QC filtering
shook_2020_ctrl_mouse.var['mt'] = shook_2020_ctrl_mouse.var_names.str.startswith('mt-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(shook_2020_ctrl_mouse, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(shook_2020_ctrl_mouse, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(shook_2020_ctrl_mouse, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(shook_2020_ctrl_mouse, x='total_counts', y='n_genes_by_counts')

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': shook_2020_ctrl_mouse.obs['Internal sample identifier'], 'y': shook_2020_ctrl_mouse.obs['log1p_n_genes_by_counts']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': shook_2020_ctrl_mouse.obs['Internal sample identifier'], 'y': shook_2020_ctrl_mouse.obs['pct_counts_mt']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
# QC on genes detected per cell: both control samples share the same bounds
# (7.35 < log1p_n_genes_by_counts < 8), so they are selected together.
# Disabled (previously commented-out) clauses for samples '1' and '2' used
# 6.2 < log1p_n_genes_by_counts < 7.3; they are not applied.
shook_2020_ctrl_mouse = shook_2020_ctrl_mouse[(
    shook_2020_ctrl_mouse.obs['Internal sample identifier'].isin(['Ctrl_S4', 'Ctrl_S5'])
    & (shook_2020_ctrl_mouse.obs.log1p_n_genes_by_counts > 7.35)
    & (shook_2020_ctrl_mouse.obs.log1p_n_genes_by_counts < 8)
).values, :]
# Then keep only the cells with pct_counts_mt below 7.
shook_2020_ctrl_mouse = shook_2020_ctrl_mouse[shook_2020_ctrl_mouse.obs.pct_counts_mt < 7, :]

In [None]:
sc.pp.filter_genes(shook_2020_ctrl_mouse, min_counts=1)
sc.pp.normalize_total(shook_2020_ctrl_mouse)
sc.pp.log1p(shook_2020_ctrl_mouse)

In [None]:
sc.pp.pca(shook_2020_ctrl_mouse, random_state=seed, n_comps=50)
sce.pp.harmony_integrate(shook_2020_ctrl_mouse, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(shook_2020_ctrl_mouse, use_rep='X_pca_harmony', n_neighbors=int(0.5 * len(shook_2020_ctrl_mouse) ** 0.5), metric='cosine')
tk.tl.triku(shook_2020_ctrl_mouse)

In [None]:
sc.tl.umap(shook_2020_ctrl_mouse, min_dist=0.3, random_state=seed)

In [None]:
sc.tl.leiden(shook_2020_ctrl_mouse, resolution=2, random_state=seed)

In [None]:
assign_cats(shook_2020_ctrl_mouse, dict_cats=dict_cats_fb_mouse, min_score=0.45)
sc.pl.umap(shook_2020_ctrl_mouse, color=['leiden', 'Internal sample identifier', 'assigned_cats', 'Rgs5', 'Pecam1', 'Lyve1', 'Mlana'], legend_loc='on data', cmap=magma, ncols=2, 
           use_raw=False, )

In [None]:
for key, val in dict_cats_fb_mouse.items():
    print(key)
    sc.pl.umap(shook_2020_ctrl_mouse, color=['assigned_cats'] + [i for i in val if i in shook_2020_ctrl_mouse.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
# Keep only the cells whose assigned category is 'fibro' or 'fibro_2'.
# `.copy()` turns the subset into an independent AnnData instead of a view of
# shook_2020_ctrl_mouse, so the in-place steps that follow (gene filtering, PCA, ...)
# do not have to implicitly copy a view when they modify it.
shook_2020_ctrl_mouse_fb = shook_2020_ctrl_mouse[shook_2020_ctrl_mouse.obs['assigned_cats'].isin(['fibro', 'fibro_2'])].copy()

In [None]:
sc.pp.filter_genes(shook_2020_ctrl_mouse_fb, min_counts=1)

In [None]:
sc.pp.pca(shook_2020_ctrl_mouse_fb, random_state=seed, n_comps=50)
sce.pp.harmony_integrate(shook_2020_ctrl_mouse_fb, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(shook_2020_ctrl_mouse_fb, use_rep='X_pca_harmony', n_neighbors=int(len(shook_2020_ctrl_mouse_fb) ** 0.5 // 4), metric='cosine')
tk.tl.triku(shook_2020_ctrl_mouse_fb)

sc.pp.pca(shook_2020_ctrl_mouse_fb, random_state=seed, n_comps=50)
sce.pp.harmony_integrate(shook_2020_ctrl_mouse_fb, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(shook_2020_ctrl_mouse_fb, use_rep='X_pca_harmony', n_neighbors=int(len(shook_2020_ctrl_mouse_fb) ** 0.5 // 4), metric='cosine')

In [None]:
sc.tl.umap(shook_2020_ctrl_mouse_fb, min_dist=0.25, random_state=seed)

In [None]:
sc.tl.leiden(shook_2020_ctrl_mouse_fb, resolution=10, random_state=seed)
# sc.tl.leiden(shook_2020_ctrl_mouse_fb, resolution=0.4, random_state=seed)

In [None]:
sc.pl.umap(shook_2020_ctrl_mouse_fb, color=['leiden', 'Internal sample identifier',], legend_loc='on data', cmap=magma, use_raw=False, ncols=2)

In [None]:
sc.tl.rank_genes_groups(shook_2020_ctrl_mouse_fb, groupby='leiden', method='wilcoxon')

In [None]:
assign_cats(shook_2020_ctrl_mouse_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.8, key_added='cluster', others_name='U',)
assign_cats(shook_2020_ctrl_mouse_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, key_added='axis', intermediate_states=True, diff=0.15, others_name='U')
shook_2020_ctrl_mouse_fb.obs['cluster'] = shook_2020_ctrl_mouse_fb.obs['cluster'].astype('category')

In [None]:
# One colour per 'cluster' category, looked up in dict_colors_mouse;
# categories without an entry fall back to grey ('#bcbcbc').
shook_2020_ctrl_mouse_fb.uns['cluster_colors'] = [
    dict_colors_mouse.get(category, '#bcbcbc')
    for category in shook_2020_ctrl_mouse_fb.obs['cluster'].cat.categories
]

In [None]:
sc.pl.umap(shook_2020_ctrl_mouse_fb, color=['Internal sample identifier', 'leiden', 'cluster', 'axis'], legend_loc='on data', 
           cmap=magma, use_raw=False, ncols=3)

In [None]:
# Same four panels as the previous UMAP, plus 'cluster_d1' ... 'cluster_d5'.
# NOTE(review): 'cluster_d1' ... 'cluster_d5' are not created in any cell visible in this
# section; presumably assign_cats adds them to .obs - confirm, otherwise this plot
# fails on a fresh run.
sc.pl.umap(shook_2020_ctrl_mouse_fb, color=['Internal sample identifier', 'leiden', 'cluster', 'axis', 'cluster_d1', 'cluster_d2', 'cluster_d3', 'cluster_d4', 'cluster_d5'], legend_loc='on data', 
           cmap=magma, use_raw=False, ncols=3)

In [None]:
for cluster in ['a', 'b', 'c', 'd']:
    del shook_2020_ctrl_mouse_fb.obs[cluster]

In [None]:
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(shook_2020_ctrl_mouse_fb, color=['cluster'] + [i for i in val if i in shook_2020_ctrl_mouse_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
plot_score_graph(shook_2020_ctrl_mouse_fb)

In [None]:
clear_adata(shook_2020_ctrl_mouse_fb)
shook_2020_ctrl_mouse_fb.write_h5ad(shook_2020_dir + '/shook_2020_ctrl_mouse_fb_processed.h5')
shook_2020_ctrl_mouse.write_h5ad(shook_2020_dir + '/shook_2020_ctrl_mouse_processed.h5')

In [None]:
shook_2020_ctrl_mouse_fb = sc.read(shook_2020_dir + '/shook_2020_ctrl_mouse_fb_processed.h5')
shook_2020_ctrl_mouse = sc.read(shook_2020_dir + '/shook_2020_ctrl_mouse_processed.h5')

## Vorstandlechner 2021

In [None]:
vorstandlechner_2021_dir = data_dir + '/Vorstandlechner_2021'

In [None]:
vorstandlechner_2021_ctrl_mouse = sc.read(f"{vorstandlechner_2021_dir}/vorstandlechner_2021_ctrl_mouse.h5")

In [None]:
# Basic QC filtering
vorstandlechner_2021_ctrl_mouse.var['mt'] = vorstandlechner_2021_ctrl_mouse.var_names.str.startswith('mt-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(vorstandlechner_2021_ctrl_mouse, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(vorstandlechner_2021_ctrl_mouse, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(vorstandlechner_2021_ctrl_mouse, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(vorstandlechner_2021_ctrl_mouse, x='total_counts', y='n_genes_by_counts')

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': vorstandlechner_2021_ctrl_mouse.obs['Internal sample identifier'], 'y': vorstandlechner_2021_ctrl_mouse.obs['log1p_n_genes_by_counts']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': vorstandlechner_2021_ctrl_mouse.obs['Internal sample identifier'], 'y': vorstandlechner_2021_ctrl_mouse.obs['pct_counts_mt']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
# QC on genes detected per cell: both healthy samples share the same bounds
# (5.9 < log1p_n_genes_by_counts < 7.2), so they are selected together.
vorstandlechner_2021_ctrl_mouse = vorstandlechner_2021_ctrl_mouse[(
    vorstandlechner_2021_ctrl_mouse.obs['Internal sample identifier'].isin(['mouse_healthy_1', 'mouse_healthy_2'])
    & (vorstandlechner_2021_ctrl_mouse.obs.log1p_n_genes_by_counts > 5.9)
    & (vorstandlechner_2021_ctrl_mouse.obs.log1p_n_genes_by_counts < 7.2)
).values, :]
# Then keep only the cells with pct_counts_mt below 7.
vorstandlechner_2021_ctrl_mouse = vorstandlechner_2021_ctrl_mouse[vorstandlechner_2021_ctrl_mouse.obs.pct_counts_mt < 7, :]

In [None]:
sc.pp.filter_genes(vorstandlechner_2021_ctrl_mouse, min_counts=1)
sc.pp.normalize_total(vorstandlechner_2021_ctrl_mouse)
sc.pp.log1p(vorstandlechner_2021_ctrl_mouse)

In [None]:
sc.pp.pca(vorstandlechner_2021_ctrl_mouse, random_state=seed, n_comps=50)
sce.pp.harmony_integrate(vorstandlechner_2021_ctrl_mouse, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(vorstandlechner_2021_ctrl_mouse, use_rep='X_pca_harmony', n_neighbors=int(0.5 * len(vorstandlechner_2021_ctrl_mouse) ** 0.5), metric='cosine')
tk.tl.triku(vorstandlechner_2021_ctrl_mouse)

In [None]:
sc.tl.umap(vorstandlechner_2021_ctrl_mouse, min_dist=0.3, random_state=seed)

In [None]:
sc.tl.leiden(vorstandlechner_2021_ctrl_mouse, resolution=0.1, random_state=seed)

In [None]:
# Label cells with the categories defined in dict_cats_fb_mouse; the result is read back
# below as obs['assigned_cats'].
assign_cats(vorstandlechner_2021_ctrl_mouse, dict_cats=dict_cats_fb_mouse, min_score=0.45, quantile_gene_sel=0.5)
# Overview UMAPs: clustering, sample of origin, assigned category and a few marker genes.
sc.pl.umap(vorstandlechner_2021_ctrl_mouse, color=['leiden', 'Internal sample identifier', 'assigned_cats', 'Pdgfra', 'Lum', 'Dcn', 'Rgs5', 'Pecam1', 'Lyve1', 'Mlana'], legend_loc='on data', cmap=magma, ncols=2, 
           use_raw=False, )

In [None]:
# For every category in dict_cats_fb_mouse, plot its marker genes (only those present in
# this dataset) next to the assigned category.
for cat_name, cat_markers in dict_cats_fb_mouse.items():
    print(cat_name)
    markers_present = [gene for gene in cat_markers if gene in vorstandlechner_2021_ctrl_mouse.var_names]
    sc.pl.umap(
        vorstandlechner_2021_ctrl_mouse,
        color=['assigned_cats'] + markers_present,
        legend_loc='on data',
        cmap=magma,
        use_raw=False,
        ncols=4,
    )

In [None]:
# Fibroblast subset: cells whose assigned category is 'fibro'. `.copy()` makes it an
# independent AnnData instead of a view of the full dataset.
vorstandlechner_2021_ctrl_mouse_fb = vorstandlechner_2021_ctrl_mouse[vorstandlechner_2021_ctrl_mouse.obs['assigned_cats'].isin(['fibro'])].copy()

In [None]:
# Drop genes that have no counts within the fibroblast subset.
sc.pp.filter_genes(vorstandlechner_2021_ctrl_mouse_fb, min_counts=1)

In [None]:
# Size the kNN graph from THIS dataset's cell count. The previous code used
# len(shook_2020_ctrl_mouse_fb), a copy-paste leftover that tied the graph to another
# dataset's size and required that dataset to be loaded. Floored at 2 so that a small
# subset still yields a usable graph.
# NOTE(review): n_neighbors may change with this fix; re-check the leiden resolution and
# assign_cats thresholds used further down.
vorstandlechner_fb_n_neighbors = max(2, int(len(vorstandlechner_2021_ctrl_mouse_fb) ** 0.5 // 7))

# First pass: PCA -> Harmony -> cosine kNN graph, which triku is then run on.
sc.pp.pca(vorstandlechner_2021_ctrl_mouse_fb, random_state=seed, n_comps=50)
sce.pp.harmony_integrate(vorstandlechner_2021_ctrl_mouse_fb, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(vorstandlechner_2021_ctrl_mouse_fb, use_rep='X_pca_harmony', n_neighbors=vorstandlechner_fb_n_neighbors, metric='cosine')
tk.tl.triku(vorstandlechner_2021_ctrl_mouse_fb)

# Second pass: same steps repeated after triku, presumably so that PCA and the graph are
# rebuilt on the triku-selected features — TODO confirm.
sc.pp.pca(vorstandlechner_2021_ctrl_mouse_fb, random_state=seed, n_comps=50)
sce.pp.harmony_integrate(vorstandlechner_2021_ctrl_mouse_fb, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(vorstandlechner_2021_ctrl_mouse_fb, use_rep='X_pca_harmony', n_neighbors=vorstandlechner_fb_n_neighbors, metric='cosine')

In [None]:
# UMAP of the fibroblast subset (seeded).
sc.tl.umap(vorstandlechner_2021_ctrl_mouse_fb, min_dist=0.25, random_state=seed)

In [None]:
# Leiden clustering of the fibroblast subset at resolution 1 (seeded); the following
# plotting cell colours the UMAP by the resulting 'leiden' labels.
sc.tl.leiden(vorstandlechner_2021_ctrl_mouse_fb, resolution=1, random_state=seed)

In [None]:
# Visual check of the clustering against the sample of origin.
sc.pl.umap(vorstandlechner_2021_ctrl_mouse_fb, color=['leiden', 'Internal sample identifier',], legend_loc='on data', cmap=magma, use_raw=False, ncols=2)

In [None]:
# Extra 'krt' category (keratin genes plus Lgals7) added to the fibroblast cluster
# signatures for this assignment round.
dict_krt = {'krt': ['Lgals7', 'Krt15', 'Krt5', 'Krt14']}
# First call writes obs['cluster']; second call groups by 'cluster' and writes obs['axis'].
# Unassigned cells get the label 'U' in both.
assign_cats(vorstandlechner_2021_ctrl_mouse_fb, dict_cats={**dict_cats_clusters, **dict_krt}, min_score=0.5, quantile_gene_sel=0.95, key_added='cluster', others_name='U',)
assign_cats(vorstandlechner_2021_ctrl_mouse_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, key_added='axis', intermediate_states=True, diff=0.15, others_name='U')
# Categorical dtype is needed below, where `.cat.categories` is used to assign colours.
vorstandlechner_2021_ctrl_mouse_fb.obs['cluster'] = vorstandlechner_2021_ctrl_mouse_fb.obs['cluster'].astype('category')

In [None]:
# One colour per cluster category, in category order; grey for clusters that have no
# entry in dict_colors_mouse.
vorstandlechner_2021_ctrl_mouse_fb.uns['cluster_colors'] = [
    dict_colors_mouse.get(cluster_label, '#bcbcbc')
    for cluster_label in vorstandlechner_2021_ctrl_mouse_fb.obs['cluster'].cat.categories
]

In [None]:
# UMAPs coloured by sample, leiden cluster, assigned cluster and assigned axis.
sc.pl.umap(vorstandlechner_2021_ctrl_mouse_fb, color=['Internal sample identifier', 'leiden', 'cluster', 'axis'], legend_loc='on data', 
           cmap=magma, use_raw=False, ncols=3)

In [None]:
# Remove the cells that were assigned to the 'krt' category; copy to detach from the view.
vorstandlechner_2021_ctrl_mouse_fb = vorstandlechner_2021_ctrl_mouse_fb[~ vorstandlechner_2021_ctrl_mouse_fb.obs['cluster'].isin(['krt'])].copy()

In [None]:
# Drop genes left without counts after removing the 'krt' cells.
sc.pp.filter_genes(vorstandlechner_2021_ctrl_mouse_fb, min_counts=1)

In [None]:
# Size the kNN graph from THIS dataset's cell count. The previous code used
# len(shook_2020_ctrl_mouse_fb), a copy-paste leftover that tied the graph to another
# dataset's size and required that dataset to be loaded. Floored at 2 so that a small
# subset still yields a usable graph.
# NOTE(review): n_neighbors may change with this fix; re-check the leiden resolution and
# assign_cats thresholds used further down.
vorstandlechner_fb_n_neighbors = max(2, int(len(vorstandlechner_2021_ctrl_mouse_fb) ** 0.5 // 7))

# First pass: PCA -> Harmony -> cosine kNN graph, which triku is then run on.
sc.pp.pca(vorstandlechner_2021_ctrl_mouse_fb, random_state=seed, n_comps=50)
sce.pp.harmony_integrate(vorstandlechner_2021_ctrl_mouse_fb, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(vorstandlechner_2021_ctrl_mouse_fb, use_rep='X_pca_harmony', n_neighbors=vorstandlechner_fb_n_neighbors, metric='cosine')
tk.tl.triku(vorstandlechner_2021_ctrl_mouse_fb)

# Second pass: same steps repeated after triku, presumably so that PCA and the graph are
# rebuilt on the triku-selected features — TODO confirm.
sc.pp.pca(vorstandlechner_2021_ctrl_mouse_fb, random_state=seed, n_comps=50)
sce.pp.harmony_integrate(vorstandlechner_2021_ctrl_mouse_fb, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(vorstandlechner_2021_ctrl_mouse_fb, use_rep='X_pca_harmony', n_neighbors=vorstandlechner_fb_n_neighbors, metric='cosine')

In [None]:
# Recompute the UMAP on the fibroblast subset without the 'krt' cells (seeded).
sc.tl.umap(vorstandlechner_2021_ctrl_mouse_fb, min_dist=0.25, random_state=seed)

In [None]:
# Leiden clustering at resolution 5 (seeded), i.e. a much higher resolution than the
# value of 1 used before the 'krt' cells were removed.
sc.tl.leiden(vorstandlechner_2021_ctrl_mouse_fb, resolution=5, random_state=seed)

In [None]:
# Visual check of the new clustering against the sample of origin.
sc.pl.umap(vorstandlechner_2021_ctrl_mouse_fb, color=['leiden', 'Internal sample identifier',], legend_loc='on data', cmap=magma, use_raw=False, ncols=2)

In [None]:
# Final assignment, this time without the extra 'krt' category: the first call writes
# obs['cluster'], the second groups by 'cluster' and writes obs['axis']. Unassigned
# cells get the label 'U' in both.
assign_cats(vorstandlechner_2021_ctrl_mouse_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.95, key_added='cluster', others_name='U',)
assign_cats(vorstandlechner_2021_ctrl_mouse_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, key_added='axis', intermediate_states=True, diff=0.15, others_name='U')
# Categorical dtype is needed below, where `.cat.categories` is used to assign colours.
vorstandlechner_2021_ctrl_mouse_fb.obs['cluster'] = vorstandlechner_2021_ctrl_mouse_fb.obs['cluster'].astype('category')

In [None]:
# One colour per cluster category, in category order; grey for clusters that have no
# entry in dict_colors_mouse.
vorstandlechner_2021_ctrl_mouse_fb.uns['cluster_colors'] = [
    dict_colors_mouse.get(cluster_label, '#bcbcbc')
    for cluster_label in vorstandlechner_2021_ctrl_mouse_fb.obs['cluster'].cat.categories
]

In [None]:
# UMAPs coloured by sample, leiden cluster, assigned cluster and assigned axis.
sc.pl.umap(vorstandlechner_2021_ctrl_mouse_fb, color=['Internal sample identifier', 'leiden', 'cluster', 'axis'], legend_loc='on data', 
           cmap=magma, use_raw=False, ncols=3)

In [None]:
# Remove the obs columns named 'a'-'d' (presumably per-axis columns left behind by the
# axis assignment — TODO confirm). The membership check makes the cell safe to re-run:
# an unconditional `del` raises KeyError once the columns are gone.
for cluster in ['a', 'b', 'c', 'd']:
    if cluster in vorstandlechner_2021_ctrl_mouse_fb.obs.columns:
        del vorstandlechner_2021_ctrl_mouse_fb.obs[cluster]

In [None]:
# For every fibroblast cluster signature, plot its genes (only those present in this
# dataset) next to the assigned cluster.
for cluster_name, cluster_markers in dict_cats_clusters.items():
    print(cluster_name)
    markers_present = [gene for gene in cluster_markers if gene in vorstandlechner_2021_ctrl_mouse_fb.var_names]
    sc.pl.umap(
        vorstandlechner_2021_ctrl_mouse_fb,
        color=['cluster'] + markers_present,
        legend_loc='on data',
        cmap=magma,
        use_raw=False,
        ncols=4,
    )

In [None]:
# Score graph for the fibroblast subset (helper imported from fb_functions).
plot_score_graph(vorstandlechner_2021_ctrl_mouse_fb)

In [None]:
# Persist the processed objects. Only the fibroblast subset goes through clear_adata
# (helper from fb_functions) before being written; the full dataset is written as is.
clear_adata(vorstandlechner_2021_ctrl_mouse_fb)
vorstandlechner_2021_ctrl_mouse_fb.write_h5ad(vorstandlechner_2021_dir + '/vorstandlechner_2021_ctrl_mouse_fb_processed.h5')
vorstandlechner_2021_ctrl_mouse.write_h5ad(vorstandlechner_2021_dir + '/vorstandlechner_2021_ctrl_mouse_processed.h5')

In [None]:
# Reload the processed objects written by the previous cell (entry point for resuming
# the analysis without recomputing).
vorstandlechner_2021_ctrl_mouse_fb = sc.read(vorstandlechner_2021_dir + '/vorstandlechner_2021_ctrl_mouse_fb_processed.h5')
vorstandlechner_2021_ctrl_mouse = sc.read(vorstandlechner_2021_dir + '/vorstandlechner_2021_ctrl_mouse_processed.h5')

## Yanling 2022

In [None]:
# Directory holding the Yanling 2022 input and processed files.
yanling_2022_dir = data_dir + '/yanling_2022'

In [None]:
# Load the control-mouse AnnData for this study.
yanling_2022_ctrl_mouse = sc.read(f"{yanling_2022_dir}/adata_yanling_2022_ctrl.h5")

In [None]:
# Basic QC metrics: flag mitochondrial genes (mouse 'mt-' prefix) and compute per-cell
# metrics, including the mitochondrial percentage used for filtering below.
yanling_2022_ctrl_mouse.var['mt'] = yanling_2022_ctrl_mouse.var_names.str.startswith('mt-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(yanling_2022_ctrl_mouse, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
# QC overview: distributions of genes per cell, total counts and mitochondrial percentage.
sc.pl.violin(yanling_2022_ctrl_mouse, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

# Total counts against mitochondrial percentage and against number of detected genes.
sc.pl.scatter(yanling_2022_ctrl_mouse, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(yanling_2022_ctrl_mouse, x='total_counts', y='n_genes_by_counts')

In [None]:
# Per-sample distribution of the log1p number of detected genes (QC inspection).
fig, ax = plt.subplots(figsize=(10, 4))
df = pd.DataFrame(
    {
        'x': yanling_2022_ctrl_mouse.obs['Internal sample identifier'],
        'y': yanling_2022_ctrl_mouse.obs['log1p_n_genes_by_counts'],
    }
)
sns.violinplot(data=df, x='x', y='y', ax=ax)

In [None]:
# Per-sample distribution of the mitochondrial count percentage (QC inspection).
fig, ax = plt.subplots(figsize=(10, 4))
df = pd.DataFrame(
    {
        'x': yanling_2022_ctrl_mouse.obs['Internal sample identifier'],
        'y': yanling_2022_ctrl_mouse.obs['pct_counts_mt'],
    }
)
sns.violinplot(data=df, x='x', y='y', ax=ax)

In [None]:
# Keep cells from samples WT1 and WT2 whose log1p gene count lies strictly between 6.5
# and 8 (both samples share the same bounds).
yanling_2022_ctrl_mouse = yanling_2022_ctrl_mouse[
    (
        yanling_2022_ctrl_mouse.obs['Internal sample identifier'].isin(['WT1', 'WT2'])
        & yanling_2022_ctrl_mouse.obs.log1p_n_genes_by_counts.gt(6.5)
        & yanling_2022_ctrl_mouse.obs.log1p_n_genes_by_counts.lt(8)
    ).values,
    :,
]
# Then keep only cells with less than 7 % mitochondrial counts.
yanling_2022_ctrl_mouse = yanling_2022_ctrl_mouse[
    yanling_2022_ctrl_mouse.obs.pct_counts_mt < 7, :
]

In [None]:
# Drop genes left without counts after cell filtering, then normalise total counts per
# cell and log1p-transform. All three calls modify the AnnData in place.
sc.pp.filter_genes(yanling_2022_ctrl_mouse, min_counts=1)
sc.pp.normalize_total(yanling_2022_ctrl_mouse)
sc.pp.log1p(yanling_2022_ctrl_mouse)

In [None]:
# PCA (50 components) -> Harmony integration across samples -> cosine kNN graph built on
# the Harmony-corrected embedding ('X_pca_harmony'). The number of neighbours is half the
# square root of the number of cells. triku is run last, on the computed graph.
sc.pp.pca(yanling_2022_ctrl_mouse, random_state=seed, n_comps=50)
sce.pp.harmony_integrate(yanling_2022_ctrl_mouse, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(yanling_2022_ctrl_mouse, use_rep='X_pca_harmony', n_neighbors=int(0.5 * len(yanling_2022_ctrl_mouse) ** 0.5), metric='cosine')
tk.tl.triku(yanling_2022_ctrl_mouse)

In [None]:
# UMAP of the whole dataset; seeded for reproducibility.
sc.tl.umap(yanling_2022_ctrl_mouse, min_dist=0.3, random_state=seed)

In [None]:
# Leiden clustering of the whole dataset at resolution 2 (seeded).
sc.tl.leiden(yanling_2022_ctrl_mouse, resolution=2, random_state=seed)

In [None]:
# Label cells with the categories defined in dict_cats_fb_mouse; the result is read back
# below as obs['assigned_cats'].
assign_cats(yanling_2022_ctrl_mouse, dict_cats=dict_cats_fb_mouse, min_score=0.45, quantile_gene_sel=0.5)
# Overview UMAPs: clustering, sample of origin, assigned category and a few marker genes.
sc.pl.umap(yanling_2022_ctrl_mouse, color=['leiden', 'Internal sample identifier', 'assigned_cats', 'Pdgfra', 'Dcn', 'Lum', 'Rgs5', 'Pecam1', 'Lyve1', 'Mlana'], 
           legend_loc='on data', cmap=magma, ncols=3, use_raw=False, )

In [None]:
# For every category in dict_cats_fb_mouse, plot its marker genes (only those present in
# this dataset) next to the assigned category.
for cat_name, cat_markers in dict_cats_fb_mouse.items():
    print(cat_name)
    markers_present = [gene for gene in cat_markers if gene in yanling_2022_ctrl_mouse.var_names]
    sc.pl.umap(
        yanling_2022_ctrl_mouse,
        color=['assigned_cats'] + markers_present,
        legend_loc='on data',
        cmap=magma,
        use_raw=False,
        ncols=4,
    )

In [None]:
# Fibroblast subset: cells assigned to 'fibro' or 'fibro_2'. `.copy()` makes it an
# independent AnnData rather than a view of the full dataset, so the in-place steps that
# follow (filter_genes, pca, ...) do not operate on a view; this matches how the other
# fibroblast subsets in this notebook are created.
yanling_2022_ctrl_mouse_fb = yanling_2022_ctrl_mouse[yanling_2022_ctrl_mouse.obs['assigned_cats'].isin(['fibro', 'fibro_2'])].copy()

In [None]:
# Drop genes that have no counts within the fibroblast subset.
sc.pp.filter_genes(yanling_2022_ctrl_mouse_fb, min_counts=1)

In [None]:
# First pass: PCA -> Harmony -> cosine kNN graph (neighbours = floor(sqrt(n_cells) / 2)),
# which triku is then run on.
sc.pp.pca(yanling_2022_ctrl_mouse_fb, random_state=seed, n_comps=50)
sce.pp.harmony_integrate(yanling_2022_ctrl_mouse_fb, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(yanling_2022_ctrl_mouse_fb, use_rep='X_pca_harmony', n_neighbors=int(len(yanling_2022_ctrl_mouse_fb) ** 0.5 // 2), metric='cosine')
tk.tl.triku(yanling_2022_ctrl_mouse_fb)

# Second pass: same steps repeated after triku, presumably so that PCA and the graph are
# rebuilt on the triku-selected features — TODO confirm.
sc.pp.pca(yanling_2022_ctrl_mouse_fb, random_state=seed, n_comps=50)
sce.pp.harmony_integrate(yanling_2022_ctrl_mouse_fb, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(yanling_2022_ctrl_mouse_fb, use_rep='X_pca_harmony', n_neighbors=int(len(yanling_2022_ctrl_mouse_fb) ** 0.5 // 2), metric='cosine')

In [None]:
# UMAP of the fibroblast subset (seeded).
sc.tl.umap(yanling_2022_ctrl_mouse_fb, min_dist=0.4, random_state=seed)

In [None]:
# Leiden clustering of the fibroblast subset at resolution 5 (seeded); the following
# plotting cell colours the UMAP by the resulting 'leiden' labels.
sc.tl.leiden(yanling_2022_ctrl_mouse_fb, resolution=5, random_state=seed)

In [None]:
# Visual check of the clustering against the sample of origin.
sc.pl.umap(yanling_2022_ctrl_mouse_fb, color=['leiden', 'Internal sample identifier',], legend_loc='on data', cmap=magma, use_raw=False, ncols=2)

In [None]:
# Rank genes per leiden cluster with the Wilcoxon method.
sc.tl.rank_genes_groups(yanling_2022_ctrl_mouse_fb, groupby='leiden', method='wilcoxon')

In [None]:
# The first call writes obs['cluster'] from the fibroblast cluster signatures; the second
# groups by 'cluster' and writes obs['axis']. Unassigned cells get the label 'U' in both.
assign_cats(yanling_2022_ctrl_mouse_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.8, key_added='cluster', others_name='U')
assign_cats(yanling_2022_ctrl_mouse_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, key_added='axis', intermediate_states=True, diff=0.15, others_name='U')
# Categorical dtype is needed below, where `.cat.categories` is used to assign colours.
yanling_2022_ctrl_mouse_fb.obs['cluster'] = yanling_2022_ctrl_mouse_fb.obs['cluster'].astype('category')

In [None]:
# NOTE(review): this repeats the categorical cast done at the end of the previous cell;
# it is harmless (the cast is idempotent) but redundant — consider removing this cell.
yanling_2022_ctrl_mouse_fb.obs['cluster'] = yanling_2022_ctrl_mouse_fb.obs['cluster'].astype('category')

In [None]:
# One colour per cluster category, in category order; grey for clusters that have no
# entry in dict_colors_mouse.
yanling_2022_ctrl_mouse_fb.uns['cluster_colors'] = [
    dict_colors_mouse.get(cluster_label, '#bcbcbc')
    for cluster_label in yanling_2022_ctrl_mouse_fb.obs['cluster'].cat.categories
]

In [None]:
# UMAPs coloured by sample, leiden cluster, assigned cluster and assigned axis.
sc.pl.umap(yanling_2022_ctrl_mouse_fb, color=['Internal sample identifier', 'leiden', 'cluster', 'axis'], legend_loc='on data', 
           cmap=magma, use_raw=False, ncols=2)

In [None]:
# Remove the obs columns named 'a'-'d' (presumably per-axis columns left behind by the
# axis assignment — TODO confirm). The membership check makes the cell safe to re-run:
# an unconditional `del` raises KeyError once the columns are gone.
for cluster in ['a', 'b', 'c', 'd']:
    if cluster in yanling_2022_ctrl_mouse_fb.obs.columns:
        del yanling_2022_ctrl_mouse_fb.obs[cluster]

In [None]:
# For every fibroblast cluster signature, plot its genes (only those present in this
# dataset) next to the assigned cluster.
for cluster_name, cluster_markers in dict_cats_clusters.items():
    print(cluster_name)
    markers_present = [gene for gene in cluster_markers if gene in yanling_2022_ctrl_mouse_fb.var_names]
    sc.pl.umap(
        yanling_2022_ctrl_mouse_fb,
        color=['cluster'] + markers_present,
        legend_loc='on data',
        cmap=magma,
        use_raw=False,
        ncols=4,
    )

In [None]:
# Score graph for the fibroblast subset (helper imported from fb_functions).
plot_score_graph(yanling_2022_ctrl_mouse_fb)

In [None]:
# Persist the processed objects. Only the fibroblast subset goes through clear_adata
# (helper from fb_functions) before being written; the full dataset is written as is.
clear_adata(yanling_2022_ctrl_mouse_fb)
yanling_2022_ctrl_mouse_fb.write_h5ad(yanling_2022_dir + '/yanling_2022_ctrl_mouse_fb_processed.h5')
yanling_2022_ctrl_mouse.write_h5ad(yanling_2022_dir + '/yanling_2022_ctrl_mouse_processed.h5')

In [None]:
# Reload the processed objects written by the previous cell (entry point for resuming
# the analysis without recomputing).
yanling_2022_ctrl_mouse_fb = sc.read(yanling_2022_dir + '/yanling_2022_ctrl_mouse_fb_processed.h5')
yanling_2022_ctrl_mouse = sc.read(yanling_2022_dir + '/yanling_2022_ctrl_mouse_processed.h5')

## Presence of clusters for each dataset

In [None]:
from fb_functions import plot_adata_cluster_properties

In [None]:
# Processed fibroblast AnnData objects compared in the sections below, one per study.
# NOTE(review): yanling_2022_ctrl_mouse_fb, processed just above, is not in this list —
# confirm whether that is intentional (the 3x3 figure grid further down has 9 panels).
list_datasets = [
    abassi_2020_ctrl_mouse_fb,
    boothby_2021_ctrl_mouse_fb,
    buechler_2021_ctrl_mouse_fb,
    haensel_2021_ctrl_mouse_wounding_fb,
    joost_2020_ctrl_mouse_fb,
    phan_2020_ctrl_mouse_21d_fb,
    shin_2020_ctrl_mouse_fb,
    shook_2020_ctrl_mouse_fb,
    vorstandlechner_2021_ctrl_mouse_fb,
]

# One label per dataset: the 'Author' value of its first cell.
list_names = [dataset.obs['Author'].values[0] for dataset in list_datasets]

In [None]:
# Cross-dataset cluster summary, 'presence' mode (helper from fb_functions).
plot_adata_cluster_properties(dict_cats_clusters=dict_cats_clusters, list_datasets=list_datasets, what='presence', cluster_name='cluster', axis_name='axis')

In [None]:
# Cross-dataset cluster summary, 'percentage' mode (helper from fb_functions).
plot_adata_cluster_properties(dict_cats_clusters=dict_cats_clusters, list_datasets=list_datasets, what='percentage', cluster_name='cluster', axis_name='axis')

In [None]:
# Cross-dataset cluster summary, 'axis' mode (helper from fb_functions).
plot_adata_cluster_properties(dict_cats_clusters=dict_cats_clusters, list_datasets=list_datasets, what='axis', cluster_name='cluster', axis_name='axis')

## Plotting all Adatas

In [None]:
# 3x3 grid of panels, one per dataset.
fig, axs = plt.subplots(3, 3, figsize=(3 * 4, 3 * 4))

# Hide only the panels that will NOT receive a dataset, i.e. those past the last dataset
# index. The previous slice start, `len(list_datasets) - len(axs)`, used the number of
# grid rows (len of a 2-D array) and so switched off panels that are drawn on afterwards.
for ax in axs.ravel()[len(list_datasets):]:
    ax.set_axis_off()

# Draw each dataset's UMAP, coloured by 'cluster', on its own panel of the grid. The
# panel title is the Author and the (integer) Year read from the first row of obs.
for adata, name, idx in zip(list_datasets, list_names, range(len(list_datasets))):
    sc.pl.umap(adata, color=['cluster'], legend_loc='on data', show=False, ax = axs.ravel()[idx], 
               title=str(adata.obs['Author'].iloc[0]) + ' ' + str(int(adata.obs['Year'].iloc[0])), size=15, cmap=magma, frameon=False)