# Comparison of fibroblast populations

In this notebook we are going to extract and replicate the main populations from different papers where fibroblast populations are described, and find similarities and differences. The premise of this analysis is that many of the populations described in different papers seem not to match, or to be transcriptomically different, but in reality they are quite similar; that is, the main types of populations are indeed shared by the different papers, which should come as no surprise.

Additionally, we will reanalyze the *classic 4* papers, to check that cell populations are assigned as expected. For these papers, UMAPs might vary compared to the ones in our paper, but the main results should still be the same.

## imports

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import scanpy as sc
import scanpy.external as sce
import pandas as pd
import numpy as np
import os
import triku as tk
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from tqdm.notebook import tqdm
import scipy.sparse as spr
# import ray
# import subprocess
# import time
# import scvelo as scv
# import gc
import gseapy as gp

In [None]:
!pip install cellassign

In [None]:
from cellassign import assign_cats
from fb_functions import clear_adata

In [None]:
# To print versions of imports 

import types

def imports():
    """Yield the ``__name__`` of every module object bound in the global namespace.

    Yields:
        str: the full dotted module name (e.g. ``'matplotlib.pyplot'``), one
        per global variable whose value is a module.
    """
    # Iterate over a snapshot: this is a lazy generator, and binding a new
    # global while it is suspended would otherwise raise
    # "RuntimeError: dictionary changed size during iteration".
    for _, val in list(globals().items()):
        if isinstance(val, types.ModuleType):
            yield val.__name__

excludes = ['builtins', 'types', 'sys']

imported_modules = [module for module in imports() if module not in excludes]

clean_modules = []

for module in imported_modules:

    sep = '.'  # to handle 'matplotlib.pyplot' cases
    rest = module.split(sep, 1)[0]
    clean_modules.append(rest)

changed_imported_modules = list(set(clean_modules))  # drop duplicates

pip_modules = !pip freeze  # you could also use `!conda list` with anaconda

for module in pip_modules:
    try:
        name, version = module.split('==')
        if name in changed_imported_modules:
            print(name + '\t' + version)
    except:
        pass

In [None]:
# Random seed passed as `random_state` to the PCA, UMAP and Leiden calls below.
seed = 0
# Persist the value with IPython's %store magic.
%store seed

In [None]:
# Palettes for UMAP gene expression

# Sample 80 evenly spaced colours from matplotlib's 'magma' colormap.
magma = [plt.get_cmap('magma')(i) for i in np.linspace(0,1, 80)]
# Replace the first colour (lowest values) with light grey.
magma[0] = (0.88, 0.88, 0.88, 1)
# Build a continuous colormap from the first 65 colours only, i.e. the upper end of magma is dropped.
magma = mpl.colors.LinearSegmentedColormap.from_list("", magma[:65])

%store magma

In [None]:
# Marker genes per broad cell type. Passed as `dict_cats` to `assign_cats` on the
# full datasets to fill `obs['assigned_cats']`, from which the fibroblasts are selected.
# NOTE(review): 'CTNNB1' appears twice in 'F' — confirm whether duplicates affect the score.
dict_cats_fb = {'fibro': ['LUM', 'PDGFRA', 'COL1A1', 'DCN', 'SFRP2', 'APOE', 'APOD', 'FN1'], 
                'fibro - ANGPTL7': ['ANGPTL7', 'ENTPD2', 'ETV1', 'C2orf40', 'SCN7A', 'SOX8'],
                'F': ['B4GALT1', 'TMSB4X', 'PPP1CB', 'WTAP', 'PTPRS', 'CTNNB1', 'INSR', 'BICC1', 'CTNNB1'], 
                'melanocyte': ['MLANA', 'PMEL', 'TRIM63', 'QPCT', 'PLP1', 'TYRP1'], 
                'neuro': ['GPM6B','PLP1','S100B','SCN7A','NRXN1','GFRA3','MPZ'],
                'secretory cell': ['KRT7', 'KRT8', 'KRT18', 'KRT19', 'DCD', 'SCGB2A2', 'PPP1R1B', 'MUCL1', 'AZGP1', 'SCGB1D2', 'PDCD4', 'TSPAN8'], 
                # PERI / MUSCLE
                'muscle': ['TAGLN', 'DES', 'PCP4', 'ACTG2', 'CNN1', 'CSRP1', 'TPM1', 'SYNPO2', 'PRUNE2', 'SORBS1', 'P2RX1'],
                'peri - CYCS': ['TAGLN', 'CRISPLD2',  'CYCS', 'VDAC1', 'RHOB', 'SORBS2', 'PLEKHO1', 'CNN1', 'DNAJB9', 'CSRP2'],
                'peri - RERGL': ['TAGLN', 'ACTA2', 'CRISPLD2',  'RERGL', 'BCAM', 'ADIRF', 'NET1', 'ARPC1A', 'PLN'],
                'peri - RGS5': ['ACTA2',  'RGS5', 'ABCC9', 'HOPX', 'ARHGDIB', 'KCNJ8', 'FXYD6'],
                'peri - ZFP36': [ 'RGS16', 'NR2F2', 'TGFBI', 'CCL8', 'RERG', 'HOPX'],
                #---------------------------------
                # ENDO INFO IS RECOLLECTED FROM LI et al 2021 - 10.7150/thno.54917
                'endo artery': ['PLVAP', 'CLDN5', 'PECAM1', 'IGFBP3', 'SRGN', 'SEMA3G', 'RHOB', 'HEY1',], 
                'endo capillary': ['PLVAP', 'CLDN5', 'PECAM1', 'SELE', 'SOCS3', 'CDKN1A', 'NFKBIA', 'DNAJB1', 'ATF3',], 
                'endo venule': ['PLVAP', 'CLDN5', 'PECAM1', 'CYP1B1', 'CLU', 'PERP', 'VWF', 'IER3', 'TSC22D3'],                 
                'lymph': ['CCL21', 'LYVE1', 'CLDN5', 'TFF3', 'MMRN1', 'EFEMP1', 'FGL2', 'TFPI', 'MAF'],
                #----------------------------------------------------------
                # KERATINOCYTE INFO - based on Cheng et al 2018 - 10.1016/j.celrep.2018.09.006
                'krt basal': ['KRT14', 'COL17A1', 'KRT5', 'KRT15', 'DST', 'PDLIM1'], 
                'krt channel': ['KRT23', 'GJB6', 'GJB2', 'CDA', 'MMP7', 'PNLIPRP3'], 
                'krt spinous': ['KRT1', 'KRT10', 'DMKN', 'KRTDAP', 'CHP2', 'LYPD3'],
                'krt gran': ['FLG', 'NCCRP1', 'CNFN', 'TGM1', 'CST6', 'KLK7'],
                #----------------------------------------------
                'immune': ['TPSB2', 'TPSAB1', 'HLA-DRA', 'FCER1G', 'CD74'], 
                'T CD4+': ['CD52', 'CD3D', 'TRAC', 'TCF7', 'CD4', 'IL7R', 'CD40LG'],
                'T CD8+': ['CD52', 'CD3D', 'TRAC', 'CD8B', 'THEMIS', 'CD8A', 'FOXP3', 'CCR4', 'RORC', 'TIGIT',],
                'B cells': ['IGHM','CD74','CD79A','NIBAN3','TCL1A','NCF1','MS4A1', 'BTK', 'CD19'],
                'plasma cell': ['SDC1', 'SLAMF7', 'TNFRSF17', 'PTPRC', 'CXCR4', 'MYH9', 'PRDM1', 'CD38', 'CD27', 'IGHG1'],
                'dendritic cell': ['GZMB', 'MRC1', 'XCR1', 'CLEC9A', 'IRF8', 'EPCAM', 'CD1B', 'STMN1', 'IDO1',],
                'APC': ['HLA-DQA1', 'HLA-DRB6', 'TYROBP', 'FCER1G', 'AIF1'], 
                'mast cell': ['IL1RL1', 'CPA3', 'HPGDS', 'TPSB2', 'HPGD', 'RGS13', 'CTSG',  'TPSAB1', 'GATA2'],
                'NK cell': ['NCAM1', 'XCL1', 'CD38', 'CD7', 'IL18R1', 'KLRF1', 'KLRK1'],
                'mt': ['MTND2P28', 'MTND4P12', 'MTCO1P40', 'ADAM33', 'RN7SL2', 'MTRNR2L6'], 
                'eritro': ['HBB', 'HBA2', 'HBA1', 'HBD'],
               }

# Marker genes of the three fibroblast axes (A, B, C). Passed as `dict_cats` to
# `assign_cats` with `column_groupby='cluster'` to fill `obs['axis']`.
# NOTE(review): some genes are listed twice ('PTGIS' and 'COL14A1' in 'A'; 'APOC1' and
# 'EBF1' in 'B') — confirm whether duplicates affect the score.
dict_cats_axes = {'A': ['PTGIS', 'SFRP2', 'MMP2', 'RGCC', 'COL14A1', 'AQP1', 'PTGIS', 'QPCT', 'ELN', 'COL14A1', 'LEPR', 
                        'ISM1', 'CES1', 'WIF1', 'RECK', 'SGCA', 'FBN1', 'NBL1', 'CTSB', 'COL5A1', 'SMOC2', 'SGCG'],
                  'B': ['GGT5', 'APOE', 'APOC1', 'CYGB', 'C7', 'IGFBP7', 'TNFSF13B', 'APOC1', 'RARRES2', 'CCDC146', 'CXCL12', 
                        'ITM2A', 'EBF1', 'CCL2', 'IGFBP3', 'EBF1', 'CXCL2', 'EFEMP1', 'TMEM176A', 'C3', 'EGR1', 'LGALS3BP', 
                        'BST2', 'ANGPTL4', 'ABCA8', ],
                  'C': ['TNN', 'TNMD', 'ASPN', 'CYP1B1', 'GPC1', 'PPP1R14A', 'GPM6B', 'COL11A1', 'DKK3', 'OGN', 'SDC1', 
                        'PDE1A', 'MDK', 'NRP2', 'POSTN', 'F2R', 'KIF26B', 'TENM3', 'ALX4', 'PMEPA1', 'FIBIN', 'PCDH15',]}

# provisional_manual_dict_cats_clusters
# Marker genes per fibroblast cluster (plus a few non-fibroblast populations). Passed as
# `dict_cats` to `assign_cats` on the fibroblast subsets to fill `obs['cluster']`.
# NOTE(review): 'A1' lists both 'WISP2' and 'CCN5', although `dict_rep` renames
# 'CCN5' to 'WISP2' in the datasets; a few lists also repeat a gene
# ('KDM6B' and 'ZC3H12A' in 'B1', 'PTGDS' in 'B2').
dict_cats_clusters= {
                     'A1': ['SLPI', 'C1QTNF3', 'CD70', 'SEMA3C', 'TSPAN8', 'DPP4', 'CHRDL1', 
                             'SEMA3B', 'CTHRC1', 'WISP2', 'CCN5', 'PI16', 'QPCT'],
                     'A2': ['CLEC2A', 'COL13A1', 'COL23A1', 'COL6A5', 'HSPB3', 'NPTX2', 'COMP', 'APCDD1', 'NKD2', 
                            'PHACTR3', 'AKAP6', 'LEPR', 'STC2', 'WIF1', 'CDC42EP3', 'COL18A1',], 
                     'A3': ['WIF1', 'HAS1', 'SGCG', 'CORIN', 'ACKR4', 'C1QTNF3', 'CFD', 'QPCT', 'SGCA', ], # 'HAS1', 'CORIN', 'SGCG', 'F13A1', 'RETREG1', 
                     'A4': ['SFRP4', 'HSD3B7', 'CD70', 'C1QTNF3', 'PRG4', 'GLIPR2', 'PAMR1', 'FNDC1',],
                     'B1': ['MEDAG', 'CXCL1', 'IL6', 'ERRFI1', 'SPSB1', 'C11orf96', 'KDM6B', 'EIF4A3', 'NR4A2', 'ZC3H12A', 
                            'KDM6B', 'ZC3H12A', 'UAP1', 'CXCL2', 'SOD2', 'CSRNP1', 'ICAM1', 'MAFF', 'KPNA2', ],
                     'B3': ['CCL19', 'CCL2', 'CDKN1A', 'CH25H', 'IL33', 'FOSB', 'EGR1', 'HES1', 'UBD', 'TNFAIP3', ],  # B0 = B1B2
                     'B2': ['EGR1', 'CD74', 'IGFBP3', 'SPARCL1', 'IGFBP7', 'TMEM150C', 'P2RY14', 'IL34', 'PTGDS', 'APOC1', 'CYGB', 
                            'CTSH', 'TNFSF13B', 'CCDC146', 'C7', 'RBP5', 'CXCL12', 'PTGDS', 'ICAM2', 'RARRES2', 'LPAR4', 'COX4I2', ],
                     'B4': ['ITM2A', 'GPC3', 'CHRDL1', 'GPX3', 'EFEMP1', 'MYOC', 'CXCL12', 'PLA2G2A', 'RARRES2',  'MGP', 'TSPAN8',
                            'GSN', 'APOD', 'PLA2G5', 'CFH', 'GGT5', 'SIX1', 'ADH1B', 'GDF10', 'MGST1', 
                             ], 
                     'C1': ['DPEP1', 'UGT3A2', 'COL11A1', 'MME', 'RBP4', 'MYL4', 'WFDC1', 'CYYR1', 'EDNRA', 
                            'MEF2C', 'TNMD', 'CDH11', 'COL21A1', 'GPC3', 'KIF26B', 'PPP1R14A', 'EDIL3', 'SLC26A7' ],
                     'C2': ['COCH', 'SLITRK6', 'MKX', 'CHADL', 'RSPO4', 'CRABP1', 'NDNF', 'SLC22A16', 'FIBIN', 'TNN', 'CCK', 
                            'RHPN1', 'MAB21L2', 'ASPN', 'OGN', 'PLXDC1', 'SPARCL1', 'PLPP5', 'WNT10A', 'NECAB1', 'TNMD' ], # 'SLITRK6', 'MKX'
                     'C3': ['BGN', 'F2R', 'TNN', 'POSTN', 'GPM6B', 'PRSS23', 'FAP', 'EMID1', ],
                     'C5': ['IGFBP3', 'LUZP2', 'CENPW', 'TFAP2A', 'TPD52', 'LEF1', 'CPE', 'BMP7', 'DIO3', 'ACTR3B', 
                            'BAMBI', 'INHBA', 'SERPINE2', 'WNT5A', 'BMP4', 'STMN1', 'SEMA3G', 'NOG', 'DCXR', 'EDNRA'],

                     'D1': ['ANGPTL7', 'SCN7A', 'C2orf40', 'APOD', 'CLDN1', 'CYP1B1', 'FGFBP2', 'MARCKSL1', 'PODNL1', 'KLK1', 'TM4SF1', 
                             'NRP2', 'COL8A1', 'P2RY14', 'SFRP4', 'MRAS', 'GPC3', 'ETV1', 'TIAM1', 'SPARCL1'], 
                     'D2': ['NGFR', 'CLDN1', 'SBSPON', 'TAGLN', 'TM4SF1', 'SLC2A1', 'SLC22A3', 'TNNC1', 'BNC2', 'KLF5', 'C2orf40', 'AQP3', 
                             'CSRP1', 'PALMD', 'SFRP4', 'PLEKHA4', 'NR2F2', 'ISYNA1', 'SCN7A', 'IGFBP6'],
                     'E1': ['FGFBP2', 'CPE', 'OLFML2A', 'SLC22A3', 'IGFBP2', 'SPON2', 'APOD', 'EGR2', 
                            'RAMP1', 'IGF1', 'KLK1', 'RGMA', 'PDGFD', 'PRSS23', 'TIMP3',], 

                     'T1': ['ASPN', 'MOB3B', 'PLEKHH2', 'PCDH15', 'RAI2', 'SPARCL1', 'TMEM176B', 'COL6A6', 'TMEM176A', 
                            'CYP1B1', 'MCTP2', 'BCL11A', 'LMO2', 'NTRK3', 'GLI2', 'PLPP4', 'CHN1', 'PDGFRL', 'IDE', 'LGR6', 'PI16', 'TELO2'],

                     # 'NRXN1' was misspelled as 'NRNX1', so this marker could never match a gene.
                     'Glial': ['SOX10', 'S100B', 'NRXN1', 'L1CAM', 'AATK', 'SCN7A', 'GFRA3'], 
                     'Melanocyte': ['MLANA', 'PMEL', 'TRIM63', 'QPCT', 'PLP1', 'TYRP1'],
                     'Endo': ['PLVAP', 'CLDN5', 'ACKR1', 'LMCD1', 'NPDC1', 'A2M', 
                         'PECAM1', 'CLU', 'VWF', 'CD74', 'RAMP2', 'IFI27', 'GNG11'], 
                      'Peri': ['RGS5', 'ITGA7', 'GJA4', 'MYH11', 'ANGPT2', 'LAMA5'], 
                    }  

In [None]:
# Hex colour per fibroblast cluster label; used below to fill `uns['cluster_colors']`.
# 'U' is the label passed as `others_name` to `assign_cats`.
dict_colors_human = {'A1': '#c93038', 'A2': '#efb241', 'A3': '#e38341', 'A4': '#740a2c',
               'B1': '#d4f561', 'B2': '#2bb844', 'B3': '#7bdf46', 'B4': '#158858',
               'C1': '#93dfe4', 'C2': '#63c2c9', 'C3': '#4c93ad', 'C5': '#264f6e',
               'D1': '#ea387f', 'D2': '#fa7eaf', 
               'E1': '#b58057',
               'T1': '#8e7cc3', 'U': '#dedede'}

%store dict_colors_human

In [None]:
# Gene-symbol replacements applied to the `var_names` of the datasets, so that they
# match the symbols used in the marker dictionaries ('WISP2', 'C2orf40').
dict_rep = {'CCN5': 'WISP2', 'ECRG4': 'C2orf40'}

In [None]:
mpl.rcParams['figure.dpi'] = 100

In [None]:
def plot_score_graph(adatax):
    """Draw a bar plot of the assignment score of each cluster's own cells.

    For every category of ``adatax.obs['cluster']``, the cells assigned to that
    cluster are selected and their value in ``adatax.obs[f'cluster_{cluster}']``
    is collected. Clusters without such a score column are skipped.

    Parameters
    ----------
    adatax
        AnnData-like object with a categorical ``obs['cluster']`` column, one
        ``obs['cluster_<name>']`` score column per scored cluster, and a list of
        colours in ``uns['cluster_colors']``.

    Returns
    -------
    None
        The plot is drawn on the current matplotlib axes.
    """
    df_cats_own = pd.DataFrame(index=adatax.obs_names, columns=['clusters', 'score'])
    for cluster in adatax.obs['cluster'].cat.categories:
        score_col = f'cluster_{cluster}'
        # Explicit check instead of a bare `except: pass`, so that only a
        # missing score column is skipped and real errors are not hidden.
        if score_col not in adatax.obs.columns:
            continue
        in_cluster = adatax.obs['cluster'] == cluster
        df_cats_own.loc[in_cluster, 'score'] = adatax.obs.loc[in_cluster, score_col]
        df_cats_own.loc[in_cluster, 'clusters'] = cluster

    df_cats_own = df_cats_own.sort_values('clusters')
    # NOTE(review): the palette is a plain list, so colours are matched to bars
    # by position — confirm the order still lines up when a cluster is skipped.
    sns.barplot(x='clusters', y='score', data=df_cats_own, palette=adatax.uns['cluster_colors'])

In [None]:
# Root data folder, relative to the current working directory. It ends with a slash and
# the dataset paths below are built as `data_dir + '/<name>'`, so they contain a double slash.
data_dir = os.getcwd() + '/data/'
print(data_dir)
%store data_dir

**IMPORTANT: I am running this analysis on a computer with ~500 GB of RAM. I will load many datasets at once, which might be too much for some computers. I took this decision consciously, to have as much info available at any time as possible. If you cannot run all the analysis at once, you can run it by parts.**

## data extraction and processing

### Ahlers et al. 2022

In [None]:
ahlers_2022_dir = data_dir + '/ahlers_2022'
os.makedirs(ahlers_2022_dir, exist_ok=True)

In [None]:
ahlers_2022_young_human = sc.read(ahlers_2022_dir + '/ahlers_2022_young_human.h5')

In [None]:
# Rename the gene symbols listed in `dict_rep` so that `var_names` match the symbols used in the marker dictionaries.
ahlers_2022_young_human.var_names = [dict_rep[i] if i in dict_rep else i for i in ahlers_2022_young_human.var_names ]

In [None]:
# Basic QC filtering
ahlers_2022_young_human.var['mt'] = ahlers_2022_young_human.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(ahlers_2022_young_human, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(ahlers_2022_young_human, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(ahlers_2022_young_human, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(ahlers_2022_young_human, x='total_counts', y='n_genes_by_counts')

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': ahlers_2022_young_human.obs['Internal sample identifier'], 'y': ahlers_2022_young_human.obs['log1p_n_genes_by_counts']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
# Per-sample QC on the number of detected genes: for samples P1_Y, P3_Y and P5_Y keep
# cells with 6.3 < log1p_n_genes_by_counts < 7.5. Cells from any other sample are dropped.
ahlers_2022_young_human = ahlers_2022_young_human[(((ahlers_2022_young_human.obs['Internal sample identifier'] == 'P1_Y') & (ahlers_2022_young_human.obs.log1p_n_genes_by_counts < 7.5) &
                                                (ahlers_2022_young_human.obs.log1p_n_genes_by_counts > 6.3)) |
                                              ((ahlers_2022_young_human.obs['Internal sample identifier'] == 'P3_Y') & (ahlers_2022_young_human.obs.log1p_n_genes_by_counts < 7.5) &
                                                (ahlers_2022_young_human.obs.log1p_n_genes_by_counts > 6.3)) |
                                              ((ahlers_2022_young_human.obs['Internal sample identifier'] == 'P5_Y') & (ahlers_2022_young_human.obs.log1p_n_genes_by_counts < 7.5) &
                                                (ahlers_2022_young_human.obs.log1p_n_genes_by_counts > 6.3))
                                             ).values, :]
# Keep cells with less than 7 % mitochondrial counts. `.copy()` turns the view into a real
# AnnData, because the following cells modify the object in place.
ahlers_2022_young_human = ahlers_2022_young_human[ahlers_2022_young_human.obs.pct_counts_mt < 7, :].copy()

In [None]:
sc.pp.filter_genes(ahlers_2022_young_human, min_counts=1)
sc.pp.normalize_total(ahlers_2022_young_human)
sc.pp.log1p(ahlers_2022_young_human)

In [None]:
# 30-component PCA followed by Harmony integration across samples; the neighbour graph
# below is built on the Harmony-corrected representation ('X_pca_harmony').
sc.pp.pca(ahlers_2022_young_human, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(ahlers_2022_young_human, key='Internal sample identifier', max_iter_harmony=50)
# The number of neighbours scales with the square root of the number of cells: int((0.5 * sqrt(n_cells)) // 2).
sc.pp.neighbors(ahlers_2022_young_human, use_rep='X_pca_harmony',  n_neighbors=int(0.5 * len(ahlers_2022_young_human) ** 0.5 // 2), metric='cosine')
# NOTE(review): triku is run after the neighbour graph is computed — presumably it relies on that graph; confirm against the triku documentation.
tk.tl.triku(ahlers_2022_young_human)

In [None]:
sc.tl.umap(ahlers_2022_young_human, min_dist=0.4, random_state=seed)
sc.tl.leiden(ahlers_2022_young_human, resolution=8, random_state=seed)

In [None]:
# `fraction=1` keeps every cell; presumably this is used only to shuffle the cell order
# before plotting — confirm. Note the hard-coded `random_state=0` instead of `seed`.
sc.pp.subsample(ahlers_2022_young_human, fraction=1, random_state=0, copy=False)
sc.pl.umap(ahlers_2022_young_human, color=['leiden', 'Internal sample identifier'], legend_loc='on data')

In [None]:
sc.pl.umap(ahlers_2022_young_human, color=['leiden', 'LUM', 'PDGFRA', 'COL1A1', 'DCN', 'RGS5', 'MYH11', 'MLANA', 'PMEL', 'HBB'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
assign_cats(ahlers_2022_young_human, dict_cats=dict_cats_fb, min_score=0.45, quantile_gene_sel=0.97)

In [None]:
sc.pl.umap(ahlers_2022_young_human, color=['leiden', 'assigned_cats'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
for key, val in dict_cats_fb.items():
    print(key)
    sc.pl.umap(ahlers_2022_young_human, color=['assigned_cats'] + [i for i in val if i in ahlers_2022_young_human.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
sc.pl.umap(ahlers_2022_young_human, color=['leiden', 'COL18A1', 'APCDD1', 'SLPI', 'CCL19', 'GGT5',  'CHRDL1', 'GPX3', 'BGN', 'ASPN', 'TNN', 'COL11A1', 'COCH', 
                                     'IGFBP3', 'ANGPTL7', 'SCN7A', 'C2orf40', 'NGFR', 'CLDN1', 'SBSPON', 'FGFBP2', 'DIO3', 'LUZP2'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# Keep only the cells assigned to the fibroblast categories. `.copy()` (as in the Boothby
# section) makes this an independent AnnData rather than a view, since the following cells
# modify it in place.
ahlers_2022_young_human_fb = ahlers_2022_young_human[ahlers_2022_young_human.obs['assigned_cats'].isin(['fibro', 'F', 'fibro - ANGPTL7'])].copy()

In [None]:
sc.pp.filter_genes(ahlers_2022_young_human_fb, min_counts=1)

In [None]:
sc.pp.pca(ahlers_2022_young_human_fb, random_state=seed, n_comps=50)
sce.pp.harmony_integrate(ahlers_2022_young_human_fb, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(ahlers_2022_young_human_fb, use_rep='X_pca_harmony',  n_neighbors=int(len(ahlers_2022_young_human_fb) ** 0.5), metric='cosine')
tk.tl.triku(ahlers_2022_young_human_fb)

In [None]:
sc.tl.umap(ahlers_2022_young_human_fb, min_dist=0.05, random_state=seed)
sc.tl.leiden(ahlers_2022_young_human_fb, resolution=12, random_state=seed)

In [None]:
# First pass: assign a fibroblast cluster label to `obs['cluster']`, with 'U' as the `others_name` label.
# NOTE(review): no `column_groupby` is given here, so the default grouping of `assign_cats` applies (presumably 'leiden') — confirm.
assign_cats(ahlers_2022_young_human_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.95, key_added='cluster', others_name='U')
# Second pass: group cells by the cluster label just assigned and assign an axis (A/B/C) to `obs['axis']`.
assign_cats(ahlers_2022_young_human_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.5, quantile_gene_sel=0.9,  key_added='axis', 
             intermediate_states=True, diff=0.15, others_name='U')

In [None]:
# One colour per cluster label present, in sorted label order; labels missing from
# `dict_colors_human` get grey ('#bcbcbc').
# NOTE(review): this assumes the categories of `obs['cluster']` are in the same sorted order — confirm.
ahlers_2022_young_human_fb.uns['cluster_colors'] = [dict_colors_human[i] if i in dict_colors_human else '#bcbcbc' for 
                                      i in sorted(set(ahlers_2022_young_human_fb.obs['cluster']))]

In [None]:
sc.pl.umap(ahlers_2022_young_human_fb, color=['Internal sample identifier', 'leiden', 'axis', 'cluster'], legend_loc='on data', cmap=magma, use_raw=False, ncols=2)

In [None]:
# Remove the 'C3' column from `.obs`.
# NOTE(review): presumably it clashes with the gene 'C3' when colouring plots by name — confirm.
# Guarded so that re-running this cell does not raise a KeyError.
if 'C3' in ahlers_2022_young_human_fb.obs:
    del ahlers_2022_young_human_fb.obs['C3']

In [None]:
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(ahlers_2022_young_human_fb, color=['cluster'] + [i for i in val if i in ahlers_2022_young_human_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
plot_score_graph(ahlers_2022_young_human_fb)

In [None]:
# `clear_adata` comes from the local `fb_functions` module; what it removes is not visible in this notebook.
clear_adata(ahlers_2022_young_human_fb)
# Save both the fibroblast subset and the full dataset; the next cell reloads them from disk.
ahlers_2022_young_human_fb.write_h5ad(ahlers_2022_dir + '/ahlers_2022_young_human_fb_processed.h5')
ahlers_2022_young_human.write_h5ad(ahlers_2022_dir + '/ahlers_2022_young_human_processed.h5')

In [None]:
ahlers_2022_young_human = sc.read(ahlers_2022_dir + '/ahlers_2022_young_human_processed.h5')
ahlers_2022_young_human_fb = sc.read(ahlers_2022_dir + '/ahlers_2022_young_human_fb_processed.h5')

### Boothby et al. 2021

In [None]:
boothby_2021_dir = data_dir + '/boothby_2021'

In [None]:
boothby_2021_ctrl_human = sc.read(boothby_2021_dir + '/boothby_2021_ctrl_human.h5')

In [None]:
# Rename the gene symbols listed in `dict_rep` so that `var_names` match the symbols used in the marker dictionaries.
boothby_2021_ctrl_human.var_names = [dict_rep[i] if i in dict_rep else i for i in boothby_2021_ctrl_human.var_names ]

In [None]:
# Basic QC filtering
boothby_2021_ctrl_human.var['mt'] = boothby_2021_ctrl_human.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(boothby_2021_ctrl_human, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(boothby_2021_ctrl_human, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(boothby_2021_ctrl_human, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(boothby_2021_ctrl_human, x='total_counts', y='n_genes_by_counts')

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': boothby_2021_ctrl_human.obs['Internal sample identifier'], 'y': boothby_2021_ctrl_human.obs['log1p_n_genes_by_counts']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
# Per-sample QC on the number of detected genes (log1p scale): HC01 and HC03 keep (7.3, 8),
# HC02 keeps (7.2, 8). Cells from any other sample are dropped.
boothby_2021_ctrl_human = boothby_2021_ctrl_human[(((boothby_2021_ctrl_human.obs['Internal sample identifier'] == 'HC01') & (boothby_2021_ctrl_human.obs.log1p_n_genes_by_counts < 8) &
                                                (boothby_2021_ctrl_human.obs.log1p_n_genes_by_counts > 7.3)) |
                                              ((boothby_2021_ctrl_human.obs['Internal sample identifier'] == 'HC02') & (boothby_2021_ctrl_human.obs.log1p_n_genes_by_counts < 8) &
                                                (boothby_2021_ctrl_human.obs.log1p_n_genes_by_counts > 7.2)) |
                                              ((boothby_2021_ctrl_human.obs['Internal sample identifier'] == 'HC03') & (boothby_2021_ctrl_human.obs.log1p_n_genes_by_counts < 8) &
                                                (boothby_2021_ctrl_human.obs.log1p_n_genes_by_counts > 7.3))
                                             ).values, :]
# Keep cells with less than 7 % mitochondrial counts. `.copy()` turns the view into a real
# AnnData, because the following cells modify the object in place.
boothby_2021_ctrl_human = boothby_2021_ctrl_human[boothby_2021_ctrl_human.obs.pct_counts_mt < 7, :].copy()

In [None]:
sc.pp.filter_genes(boothby_2021_ctrl_human, min_counts=1)
sc.pp.normalize_total(boothby_2021_ctrl_human)
sc.pp.log1p(boothby_2021_ctrl_human)

In [None]:
sc.pp.pca(boothby_2021_ctrl_human, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(boothby_2021_ctrl_human, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(boothby_2021_ctrl_human, use_rep='X_pca_harmony',  n_neighbors=int(0.5 * len(boothby_2021_ctrl_human) ** 0.5 // 2), metric='cosine')
tk.tl.triku(boothby_2021_ctrl_human)

In [None]:
sc.tl.umap(boothby_2021_ctrl_human, min_dist=0.4, random_state=seed)
sc.tl.leiden(boothby_2021_ctrl_human, resolution=5, random_state=seed)

In [None]:
sc.pp.subsample(boothby_2021_ctrl_human, fraction=1, random_state=0, copy=False)
sc.pl.umap(boothby_2021_ctrl_human, color=['leiden', 'Internal sample identifier'], legend_loc='on data')

In [None]:
sc.pl.umap(boothby_2021_ctrl_human, color=['leiden', 'LUM', 'PDGFRA', 'COL1A1', 'DCN', 'RGS5', 'MYH11', 'MLANA', 'PMEL', 'HBB', 'S100B'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
assign_cats(boothby_2021_ctrl_human, dict_cats=dict_cats_fb, min_score=0.4, quantile_gene_sel=0.6)

In [None]:
sc.pl.umap(boothby_2021_ctrl_human, color=['leiden', 'assigned_cats'] + dict_cats_fb['peri - CYCS'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
for key, val in dict_cats_fb.items():
    print(key)
    sc.pl.umap(boothby_2021_ctrl_human, color=['assigned_cats'] + [i for i in val if i in boothby_2021_ctrl_human.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
sc.pl.umap(boothby_2021_ctrl_human, color=['leiden', 'COL18A1', 'APCDD1', 'SLPI', 'CCL19', 'GGT5',  'CHRDL1', 'GPX3', 'BGN', 'ASPN', 'TNN', 'COL11A1', 'COCH', 
                                     'IGFBP3', 'ANGPTL7', 'SCN7A', 'C2orf40', 'NGFR', 'CLDN1', 'SBSPON', 'FGFBP2', 'DIO3', 'LUZP2'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
boothby_2021_ctrl_human_fb = boothby_2021_ctrl_human[boothby_2021_ctrl_human.obs['assigned_cats'].isin(['fibro', 'F', 'neuro'])].copy()

In [None]:
sc.pp.filter_genes(boothby_2021_ctrl_human_fb, min_counts=1)

In [None]:
sc.pp.pca(boothby_2021_ctrl_human_fb, random_state=seed, n_comps=50)
sce.pp.harmony_integrate(boothby_2021_ctrl_human_fb, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(boothby_2021_ctrl_human_fb, use_rep='X_pca_harmony',  n_neighbors=int(0.1 * len(boothby_2021_ctrl_human_fb) ** 0.5), metric='cosine')
tk.tl.triku(boothby_2021_ctrl_human_fb)

In [None]:
sc.tl.umap(boothby_2021_ctrl_human_fb, min_dist=0.25, random_state=seed)
sc.tl.leiden(boothby_2021_ctrl_human_fb, resolution=9, random_state=seed)  # leiden 9

In [None]:
assign_cats(boothby_2021_ctrl_human_fb, dict_cats=dict_cats_clusters, min_score=0.6, quantile_gene_sel=0.7, key_added='cluster', others_name='U')
assign_cats(boothby_2021_ctrl_human_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.5, quantile_gene_sel=0.9,  key_added='axis', 
             intermediate_states=True, diff=0.15, others_name='U')

In [None]:
boothby_2021_ctrl_human_fb.uns['cluster_colors'] = [dict_colors_human[i] if i in dict_colors_human else '#bcbcbc' for 
                                      i in sorted(set(boothby_2021_ctrl_human_fb.obs['cluster']))]

In [None]:
sc.pl.umap(boothby_2021_ctrl_human_fb, color=['Internal sample identifier', 'leiden', 'axis', 'cluster'], legend_loc='on data', cmap=magma, use_raw=False, ncols=2)

In [None]:
# Remove the 'C3' column from `.obs`.
# NOTE(review): presumably it clashes with the gene 'C3' when colouring plots by name — confirm.
# Guarded so that re-running this cell does not raise a KeyError.
if 'C3' in boothby_2021_ctrl_human_fb.obs:
    del boothby_2021_ctrl_human_fb.obs['C3']

In [None]:
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(boothby_2021_ctrl_human_fb, color=['cluster'] + [i for i in val if i in boothby_2021_ctrl_human_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
plot_score_graph(boothby_2021_ctrl_human_fb)

In [None]:
clear_adata(boothby_2021_ctrl_human_fb)
boothby_2021_ctrl_human_fb.write_h5ad(boothby_2021_dir + '/boothby_2021_ctrl_human_fb_processed.h5')
boothby_2021_ctrl_human.write_h5ad(boothby_2021_dir + '/boothby_2021_ctrl_human_processed.h5')

In [None]:
boothby_2021_ctrl_human_fb = sc.read(boothby_2021_dir + '/boothby_2021_ctrl_human_fb_processed.h5')

In the **U** cluster, the expression of *BNIP3*, *BNIP3L*, *HILPDA*, *ERO1A* and *ENO2* indicates a possible hypoxic state in these cells.

In [None]:
# Rank genes of Leiden cluster '5' against all remaining cells and plot the top 200.
# NOTE(review): the label '5' is hard-coded and depends on the Leiden run stored in the
# processed file — confirm it still corresponds to the cells of interest after a re-run.
sc.tl.rank_genes_groups(boothby_2021_ctrl_human_fb, groupby='leiden', groups=['5'], reference='rest')
sc.pl.rank_genes_groups_tracksplot(boothby_2021_ctrl_human_fb, dendrogram=False, n_genes=200)

### Deng et al. 2021

In [None]:
deng_dir = data_dir + '/deng_2021'

In [None]:
deng_2021_scar_human = sc.read(deng_dir + '/deng_2021_scar_human.h5')

In [None]:
# Rename the gene symbols listed in `dict_rep` so that `var_names` match the symbols used in the marker dictionaries.
deng_2021_scar_human.var_names = [dict_rep[i] if i in dict_rep else i for i in deng_2021_scar_human.var_names ]

In [None]:
# Basic QC filtering
deng_2021_scar_human.var['mt'] = deng_2021_scar_human.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(deng_2021_scar_human, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(deng_2021_scar_human, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(deng_2021_scar_human, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(deng_2021_scar_human, x='total_counts', y='n_genes_by_counts')

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': deng_2021_scar_human.obs['Internal sample identifier'], 'y': deng_2021_scar_human.obs['log1p_n_genes_by_counts']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
# QC: keep cells with between 1000 and 4500 detected genes (exclusive bounds)...
deng_2021_scar_human = deng_2021_scar_human[((deng_2021_scar_human.obs.n_genes_by_counts < 4500) &
                                    (deng_2021_scar_human.obs.n_genes_by_counts > 1000)).values, :]
# ...and less than 18 % mitochondrial counts. `.copy()` turns the view into a real AnnData,
# because the following cells modify the object in place.
deng_2021_scar_human = deng_2021_scar_human[deng_2021_scar_human.obs.pct_counts_mt < 18, :].copy()

In [None]:
sc.pp.filter_genes(deng_2021_scar_human, min_counts=1)
sc.pp.normalize_total(deng_2021_scar_human)
sc.pp.log1p(deng_2021_scar_human)

In [None]:
sc.pp.pca(deng_2021_scar_human, random_state=seed, n_comps=50)
sce.pp.harmony_integrate(deng_2021_scar_human, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(deng_2021_scar_human, use_rep='X_pca_harmony', n_neighbors=int(0.5 * len(deng_2021_scar_human) ** 0.5 // 2), metric='cosine')
tk.tl.triku(deng_2021_scar_human)

In [None]:
sc.tl.umap(deng_2021_scar_human, min_dist=0.2, random_state=seed)
sc.tl.leiden(deng_2021_scar_human, resolution=1.3, random_state=seed)

In [None]:
# `fraction=1` keeps every cell; presumably this is used only to shuffle the cell order before plotting — confirm.
sc.pp.subsample(deng_2021_scar_human, fraction=1, random_state=0, copy=False)
# NOTE(review): this plot colours by 'Sample identifier', whereas the other dataset sections
# use 'Internal sample identifier' — confirm this is intended.
sc.pl.umap(deng_2021_scar_human, color=['leiden', 'Sample identifier'], legend_loc='on data')

In [None]:
sc.pl.umap(deng_2021_scar_human, color=['leiden', 'LUM', 'PDGFRA', 'COL1A1', 'DCN', 'RGS5', 'KRT5', 'DMKN'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
sc.pl.umap(deng_2021_scar_human, color=['leiden', 'C2orf40', 'CDH19', 'ANGPTL7','PLEKHB1','ENTPD2', 
                                   'SLC2A1', 'CLDN1', 'TNNT2', 'C19orf33', 'SFRP5', 'WNT6', ], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
assign_cats(deng_2021_scar_human, dict_cats=dict_cats_fb, quantile_gene_sel=0.9)

In [None]:
for key, val in dict_cats_fb.items():
    print(key)
    sc.pl.umap(deng_2021_scar_human, color=['assigned_cats'] + [i for i in val if i in deng_2021_scar_human.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
sc.pl.umap(deng_2021_scar_human, color=['leiden', 'assigned_cats'] + dict_cats_fb['endo venule'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# Keep the fibroblast categories plus the cells labelled 'unassigned'. `.copy()` (as in the
# Boothby section) makes this an independent AnnData rather than a view, since the following
# cells modify it in place.
deng_2021_scar_human_fb = deng_2021_scar_human[deng_2021_scar_human.obs['assigned_cats'].isin(['fibro', 'unassigned', 'fibro - ANGPTL7'])].copy()

In [None]:
sc.pp.filter_genes(deng_2021_scar_human_fb, min_counts=1)

In [None]:
sc.pp.pca(deng_2021_scar_human_fb, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(deng_2021_scar_human_fb, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(deng_2021_scar_human_fb, use_rep='X_pca_harmony', n_neighbors=int(0.5 * len(deng_2021_scar_human_fb) ** 0.5 // 5), metric='cosine')
tk.tl.triku(deng_2021_scar_human_fb)

In [None]:
sc.tl.umap(deng_2021_scar_human_fb, min_dist=0.25, random_state=seed)
sc.tl.leiden(deng_2021_scar_human_fb, resolution=18, random_state=seed)

In [None]:
assign_cats(deng_2021_scar_human_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.75, key_added='cluster', others_name='U')
assign_cats(deng_2021_scar_human_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, key_added='axis', intermediate_states=True, diff=0.15, others_name='U')

In [None]:
deng_2021_scar_human_fb.uns['cluster_colors'] = [dict_colors_human[i] if i in dict_colors_human else '#bcbcbc' for 
                                      i in sorted(set(deng_2021_scar_human_fb.obs['cluster']))]

In [None]:
sc.pl.umap(deng_2021_scar_human_fb, color=['leiden', 'axis', 'cluster', 'Internal sample identifier'], legend_loc='on data', cmap=magma, use_raw=False, ncols=2)

In [None]:
# Remove the 'C3' column from `.obs`.
# NOTE(review): presumably it clashes with the gene 'C3' when colouring plots by name — confirm.
# Guarded so that re-running this cell does not raise a KeyError.
if 'C3' in deng_2021_scar_human_fb.obs:
    del deng_2021_scar_human_fb.obs['C3']

In [None]:
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(deng_2021_scar_human_fb, color=['cluster'] + [i for i in val if i in deng_2021_scar_human_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
plot_score_graph(deng_2021_scar_human_fb)

In [None]:
clear_adata(deng_2021_scar_human_fb)
deng_2021_scar_human_fb.write_h5ad(deng_dir + '/deng_2021_scar_human_fb_processed.h5')
deng_2021_scar_human.write_h5ad(deng_dir + '/deng_2021_scar_human_processed.h5')

In [None]:
deng_2021_scar_human_fb = sc.read(deng_dir + '/deng_2021_scar_human_fb_processed.h5')

### Gao et al. 2021

In [None]:
gao_dir = data_dir + '/gao_2021'

In [None]:
gao_2021_ctrl_human = sc.read(gao_dir + '/gao_2021_ctrl_human.h5')
# Keep only the three control samples. `.copy()` turns the view into a real AnnData,
# because the next cell writes to `.var`.
gao_2021_ctrl_human = gao_2021_ctrl_human[gao_2021_ctrl_human.obs['Internal sample identifier'].isin(['Ctrl1', 'Ctrl2', 'Ctrl3'])].copy()
# NOTE(review): unlike the other datasets, `var_names` are not renamed with `dict_rep` here —
# confirm that this dataset already uses the 'WISP2' / 'C2orf40' symbols.

In [None]:
# Basic QC filtering
gao_2021_ctrl_human.var['mt'] = gao_2021_ctrl_human.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(gao_2021_ctrl_human, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(gao_2021_ctrl_human, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(gao_2021_ctrl_human, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(gao_2021_ctrl_human, x='total_counts', y='n_genes_by_counts')

In [None]:
# QC: keep cells with between 500 and 7000 detected genes (exclusive bounds)...
gao_2021_ctrl_human = gao_2021_ctrl_human[((gao_2021_ctrl_human.obs.n_genes_by_counts < 7000) &
                                    (gao_2021_ctrl_human.obs.n_genes_by_counts > 500)).values, :]
# ...and less than 40 % mitochondrial counts. `.copy()` turns the view into a real AnnData,
# because the following cells modify the object in place.
gao_2021_ctrl_human = gao_2021_ctrl_human[gao_2021_ctrl_human.obs.pct_counts_mt < 40, :].copy()

In [None]:
sc.pp.filter_genes(gao_2021_ctrl_human, min_counts=1)
sc.pp.normalize_total(gao_2021_ctrl_human)
sc.pp.log1p(gao_2021_ctrl_human)

In [None]:
# 30-PC PCA -> Harmony integration across samples -> cosine kNN graph on 'X_pca_harmony' with
# n_neighbors = floor(sqrt(n_cells) / 4) -> triku (feature selection, per its docs) on that graph.
# NOTE(review): PCA/neighbors are not recomputed after triku in this cell — confirm intended.
sc.pp.pca(gao_2021_ctrl_human, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(gao_2021_ctrl_human, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(gao_2021_ctrl_human, use_rep='X_pca_harmony',  n_neighbors=int(0.5 * len(gao_2021_ctrl_human) ** 0.5 // 2), metric='cosine')
tk.tl.triku(gao_2021_ctrl_human)

In [None]:
# UMAP embedding and Leiden clustering (resolution 1.5) on the kNN graph, both seeded.
sc.tl.umap(gao_2021_ctrl_human, min_dist=0.1, random_state=seed)
sc.tl.leiden(gao_2021_ctrl_human, resolution=1.5, random_state=seed)

In [None]:
sc.pl.umap(gao_2021_ctrl_human, color=['leiden', 'Internal sample identifier'], legend_loc='on data')

In [None]:
sc.pl.umap(gao_2021_ctrl_human, color=['leiden', 'LUM', 'PDGFRA', 'COL1A1', 'DCN'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
assign_cats(gao_2021_ctrl_human, dict_cats=dict_cats_fb, min_score=0.4,  quantile_gene_sel=0.9)

In [None]:
# For every category in dict_cats_fb, print its name and plot the assignment next to those of
# its genes that are present in this dataset.
for key, val in dict_cats_fb.items():
    print(key)
    sc.pl.umap(
        gao_2021_ctrl_human,
        color=['assigned_cats', *(gene for gene in val if gene in gao_2021_ctrl_human.var_names)],
        legend_loc='on data',
        cmap=magma,
        use_raw=False,
        ncols=4,
    )

In [None]:
sc.pl.umap(gao_2021_ctrl_human, color=['leiden', 'assigned_cats'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# Keep only cells assigned to 'fibro'. .copy() detaches the subset from the parent AnnData:
# the following cells modify it in place, which on a view triggers an implicit copy and warning.
gao_2021_ctrl_human_fb = gao_2021_ctrl_human[gao_2021_ctrl_human.obs['assigned_cats'] == 'fibro'].copy()

In [None]:
sc.pp.filter_genes(gao_2021_ctrl_human_fb, min_counts=1)

In [None]:
# Re-embed the fibroblast subset: 30-PC PCA -> Harmony across samples -> cosine kNN graph on
# 'X_pca_harmony' with n_neighbors = floor(sqrt(n_cells) / 4) -> triku on that graph.
# NOTE(review): PCA/neighbors are not recomputed after triku in this cell — confirm intended.
sc.pp.pca(gao_2021_ctrl_human_fb, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(gao_2021_ctrl_human_fb, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(gao_2021_ctrl_human_fb, use_rep='X_pca_harmony',  n_neighbors=int(0.5 * len(gao_2021_ctrl_human_fb) ** 0.5 // 2), metric='cosine')
tk.tl.triku(gao_2021_ctrl_human_fb)

In [None]:
# UMAP of the fibroblasts and a deliberately fine Leiden partition (resolution 10), both seeded.
sc.tl.umap(gao_2021_ctrl_human_fb, min_dist=0.5, random_state=seed)
sc.tl.leiden(gao_2021_ctrl_human_fb, resolution=10, random_state=seed)

In [None]:
# Annotate fibroblasts with assign_cats: first a 'cluster' label from dict_cats_clusters, then an
# 'axis' label from dict_cats_axes computed over the 'cluster' groups (column_groupby).
# 'U' is presumably the label for unassigned groups; thresholds are tuned per dataset — confirm
# their exact semantics in cellassign.
assign_cats(gao_2021_ctrl_human_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.75, key_added='cluster', others_name='U')
assign_cats(gao_2021_ctrl_human_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.25, quantile_gene_sel=0.6,
            key_added='axis', intermediate_states=True, diff=0.15, others_name='U')

In [None]:
# Fix the 'cluster' palette: one colour per label in sorted order, grey (#bcbcbc) for labels
# missing from dict_colors_human.
# NOTE(review): assumes the category order of obs['cluster'] equals sorted(set(...)) — confirm.
gao_2021_ctrl_human_fb.uns['cluster_colors'] = [
    dict_colors_human.get(label, '#bcbcbc')
    for label in sorted(set(gao_2021_ctrl_human_fb.obs['cluster']))
]

In [None]:
sc.pl.umap(gao_2021_ctrl_human_fb, color=['Internal sample identifier', 'leiden', 'axis', 'cluster'], legend_loc='on data', 
           cmap=magma, use_raw=False, ncols=2)

The unassigned cluster (`U`) does not show any relevant differentially expressed genes (DEGs), so we treat it as noise rather than as a genuine population.

In [None]:
# Remove the 'C3' and 'C2' columns from .obs.
# NOTE(review): these columns are not created in the visible cells; presumably they are
# per-population outputs of assign_cats for populations not found in this dataset — confirm.
del gao_2021_ctrl_human_fb.obs['C3']
del gao_2021_ctrl_human_fb.obs['C2']

In [None]:
# For every population in dict_cats_clusters, print its name and plot the 'cluster' labels next
# to those of its genes that are present in this dataset.
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(
        gao_2021_ctrl_human_fb,
        color=['cluster', *(gene for gene in val if gene in gao_2021_ctrl_human_fb.var_names)],
        legend_loc='on data',
        cmap=magma,
        use_raw=False,
        ncols=4,
    )

In [None]:
plot_score_graph(gao_2021_ctrl_human_fb)

In [None]:
# Persist results. clear_adata (project helper) is applied to the fibroblast object only —
# presumably it strips bulky/unserialisable fields, confirm — then both objects are written.
clear_adata(gao_2021_ctrl_human_fb)
gao_2021_ctrl_human_fb.write_h5ad(gao_dir + '/gao_2021_ctrl_human_fb_processed.h5')
gao_2021_ctrl_human.write_h5ad(gao_dir + '/gao_2021_ctrl_human_processed.h5')

In [None]:
gao_2021_ctrl_human_fb = sc.read(gao_dir + '/gao_2021_ctrl_human_fb_processed.h5')

### Gaydosik et al. 2020

In [None]:
gaydosik_dir = data_dir + '/gaydosik_2020'

In [None]:
gaydosik_2020_ctrl_human = sc.read_h5ad(gaydosik_dir + '/gaydosik_2020_ctrl_human.h5')

In [None]:
# QC metrics: flag mitochondrial genes (names starting with 'MT-') in .var['mt'], then compute
# per-cell metrics (n_genes_by_counts, total_counts, pct_counts_mt) used by the filter cell below.
gaydosik_2020_ctrl_human.var['mt'] = gaydosik_2020_ctrl_human.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(gaydosik_2020_ctrl_human, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
# Inspect the QC metrics before filtering: per-metric violins, then total_counts against
# mitochondrial fraction and against number of detected genes.
sc.pl.violin(gaydosik_2020_ctrl_human, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(gaydosik_2020_ctrl_human, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(gaydosik_2020_ctrl_human, x='total_counts', y='n_genes_by_counts')

In [None]:
# QC filter: keep cells with 400 < n_genes_by_counts < 5500 and < 30 % mitochondrial counts.
gaydosik_2020_ctrl_human = gaydosik_2020_ctrl_human[((gaydosik_2020_ctrl_human.obs.n_genes_by_counts < 5500) &
                                    (gaydosik_2020_ctrl_human.obs.n_genes_by_counts > 400)).values, :]
# .copy() turns the filtered view into a standalone AnnData before the in-place preprocessing
# that follows (avoids the implicit view-to-copy conversion and its warning).
gaydosik_2020_ctrl_human = gaydosik_2020_ctrl_human[gaydosik_2020_ctrl_human.obs.pct_counts_mt < 30, :].copy()

In [None]:
# Drop genes with no counts left after cell filtering, normalise each cell's total counts
# (scanpy default target) and log1p-transform, all in place.
sc.pp.filter_genes(gaydosik_2020_ctrl_human, min_counts=1)
sc.pp.normalize_total(gaydosik_2020_ctrl_human)
sc.pp.log1p(gaydosik_2020_ctrl_human)

In [None]:
# 30-PC PCA -> Harmony integration across samples -> cosine kNN graph on 'X_pca_harmony' with
# n_neighbors = floor(sqrt(n_cells) / 4) -> triku (feature selection, per its docs) on that graph.
# NOTE(review): PCA/neighbors are not recomputed after triku in this cell — confirm intended.
sc.pp.pca(gaydosik_2020_ctrl_human, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(gaydosik_2020_ctrl_human, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(gaydosik_2020_ctrl_human, use_rep='X_pca_harmony',  n_neighbors=int(0.5 * len(gaydosik_2020_ctrl_human) ** 0.5 // 2), metric='cosine')
tk.tl.triku(gaydosik_2020_ctrl_human)

In [None]:
# UMAP embedding and Leiden clustering of the full dataset, both seeded.
# resolution=3 is the clustering that takes effect for the plots that follow.
sc.tl.umap(gaydosik_2020_ctrl_human, min_dist=0.1, random_state=seed)
sc.tl.leiden(gaydosik_2020_ctrl_human, resolution=3, random_state=seed)

In [None]:
sc.pl.umap(gaydosik_2020_ctrl_human, color=['leiden', 'Internal sample identifier'], legend_loc='on data')

In [None]:
sc.pl.umap(gaydosik_2020_ctrl_human, color=['leiden', 'LUM', 'PDGFRA', 'COL1A1', 'DCN'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
assign_cats(gaydosik_2020_ctrl_human, dict_cats=dict_cats_fb, quantile_gene_sel=0.7,)

In [None]:
# For every category in dict_cats_fb, print its name and plot the assignment next to those of
# its genes that are present in this dataset.
for key, val in dict_cats_fb.items():
    print(key)
    sc.pl.umap(
        gaydosik_2020_ctrl_human,
        color=['assigned_cats', *(gene for gene in val if gene in gaydosik_2020_ctrl_human.var_names)],
        legend_loc='on data',
        cmap=magma,
        use_raw=False,
        ncols=4,
    )

In [None]:
sc.pl.umap(gaydosik_2020_ctrl_human, color=['assigned_cats'] + ['KRT7', 'KRT8', 'KRT18', 'KRT19', 'PPARG'], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
sc.pl.umap(gaydosik_2020_ctrl_human, color=['leiden', 'assigned_cats'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# Keep cells assigned to 'fibro' or 'fibro - ANGPTL7'. .copy() detaches the subset from the
# parent: the following cells modify it in place, which on a view triggers an implicit copy.
gaydosik_2020_ctrl_human_fb = gaydosik_2020_ctrl_human[gaydosik_2020_ctrl_human.obs['assigned_cats'].isin(['fibro', 'fibro - ANGPTL7'])].copy()

In [None]:
sc.pp.filter_genes(gaydosik_2020_ctrl_human_fb, min_counts=1)

In [None]:
# Re-embed the fibroblast subset: 30-PC PCA -> Harmony across samples -> cosine kNN graph on
# 'X_pca_harmony' with n_neighbors = floor(sqrt(n_cells) / 4) -> triku on that graph.
# NOTE(review): PCA/neighbors are not recomputed after triku in this cell — confirm intended.
sc.pp.pca(gaydosik_2020_ctrl_human_fb, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(gaydosik_2020_ctrl_human_fb, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(gaydosik_2020_ctrl_human_fb, use_rep='X_pca_harmony',  n_neighbors=int(0.5 * len(gaydosik_2020_ctrl_human_fb) ** 0.5 // 2), metric='cosine')
tk.tl.triku(gaydosik_2020_ctrl_human_fb)

In [None]:
# UMAP of the fibroblasts and a fine Leiden partition (resolution 6), both seeded.
sc.tl.umap(gaydosik_2020_ctrl_human_fb, min_dist=0.25, random_state=seed)
sc.tl.leiden(gaydosik_2020_ctrl_human_fb, resolution=6, random_state=seed)

In [None]:
# Annotate fibroblasts with assign_cats: first a 'cluster' label from dict_cats_clusters, then an
# 'axis' label from dict_cats_axes computed over the 'cluster' groups (column_groupby).
# 'U' is presumably the label for unassigned groups; thresholds are tuned per dataset — confirm
# their exact semantics in cellassign.
assign_cats(gaydosik_2020_ctrl_human_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.9, key_added='cluster', others_name='U')
assign_cats(gaydosik_2020_ctrl_human_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, key_added='axis', 
             intermediate_states=True, diff=0.15, others_name='U')

In [None]:
# Fix the 'cluster' palette: one colour per label in sorted order, grey (#bcbcbc) for labels
# missing from dict_colors_human.
# NOTE(review): assumes the category order of obs['cluster'] equals sorted(set(...)) — confirm.
gaydosik_2020_ctrl_human_fb.uns['cluster_colors'] = [
    dict_colors_human.get(label, '#bcbcbc')
    for label in sorted(set(gaydosik_2020_ctrl_human_fb.obs['cluster']))
]

In [None]:
sc.pl.umap(gaydosik_2020_ctrl_human_fb, color=['Internal sample identifier', 'leiden', 'axis', 'cluster'], legend_loc='on data', 
           cmap=magma, use_raw=False, ncols=2)

In [None]:
del gaydosik_2020_ctrl_human_fb.obs['C3']

In [None]:
# For every population in dict_cats_clusters, print its name and plot the 'cluster' labels next
# to those of its genes that are present in this dataset.
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(
        gaydosik_2020_ctrl_human_fb,
        color=['cluster', *(gene for gene in val if gene in gaydosik_2020_ctrl_human_fb.var_names)],
        legend_loc='on data',
        cmap=magma,
        use_raw=False,
        ncols=4,
    )

In [None]:
plot_score_graph(gaydosik_2020_ctrl_human_fb)

In [None]:
# Persist results. clear_adata (project helper) is applied to the fibroblast object only —
# presumably it strips bulky/unserialisable fields, confirm — then both objects are written.
clear_adata(gaydosik_2020_ctrl_human_fb)
gaydosik_2020_ctrl_human_fb.write_h5ad(gaydosik_dir + '/gaydosik_2020_ctrl_human_fb_processed.h5')
gaydosik_2020_ctrl_human.write_h5ad(gaydosik_dir + '/gaydosik_2020_ctrl_human_processed.h5')

In [None]:
gaydosik_2020_ctrl_human_fb = sc.read(gaydosik_dir + '/gaydosik_2020_ctrl_human_fb_processed.h5')

### Gur et al. 2022

In [None]:
gur_2022_dir = data_dir + '/gur_2022'

In [None]:
gur_2022_ctrl_human = sc.read_h5ad(gur_2022_dir + '/gur_2022_ctrl_human.h5')

In [None]:
# QC metrics: flag mitochondrial genes (names starting with 'MT-') in .var['mt'], then compute
# per-cell metrics (n_genes_by_counts, total_counts, pct_counts_mt) used by the filter cell below.
gur_2022_ctrl_human.var['mt'] = gur_2022_ctrl_human.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(gur_2022_ctrl_human, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
# Inspect the QC metrics before filtering: per-metric violins, then total_counts against
# mitochondrial fraction and against number of detected genes.
sc.pl.violin(gur_2022_ctrl_human, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(gur_2022_ctrl_human, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(gur_2022_ctrl_human, x='total_counts', y='n_genes_by_counts')

In [None]:
# Per-sample violin of log1p_n_genes_by_counts (x: sample identifier, y: metric) to compare
# gene detection across samples before filtering.
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': gur_2022_ctrl_human.obs['Internal sample identifier'], 'y': gur_2022_ctrl_human.obs['log1p_n_genes_by_counts']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
# QC filter: keep cells with 300 < n_genes_by_counts < 1950 and 10 < pct_counts_mt < 40.
# NOTE(review): a lower bound on pct_counts_mt is unusual — confirm it is intentional.
gur_2022_ctrl_human = gur_2022_ctrl_human[((gur_2022_ctrl_human.obs.n_genes_by_counts < 1950) &
                                    (gur_2022_ctrl_human.obs.n_genes_by_counts > 300)).values, :]
# .copy() turns the filtered view into a standalone AnnData before the in-place preprocessing
# that follows (avoids the implicit view-to-copy conversion and its warning).
gur_2022_ctrl_human = gur_2022_ctrl_human[(gur_2022_ctrl_human.obs.pct_counts_mt < 40) & (gur_2022_ctrl_human.obs.pct_counts_mt > 10), :].copy()

In [None]:
# Drop genes with no counts left after cell filtering, normalise each cell's total counts
# (scanpy default target) and log1p-transform, all in place.
sc.pp.filter_genes(gur_2022_ctrl_human, min_counts=1)
sc.pp.normalize_total(gur_2022_ctrl_human)
sc.pp.log1p(gur_2022_ctrl_human)

In [None]:
# 30-PC PCA -> Harmony integration across samples -> cosine kNN graph on 'X_pca_harmony' with
# n_neighbors = floor(sqrt(n_cells) / 4) -> triku (feature selection, per its docs) on that graph.
# NOTE(review): PCA/neighbors are not recomputed after triku in this cell — confirm intended.
sc.pp.pca(gur_2022_ctrl_human, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(gur_2022_ctrl_human, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(gur_2022_ctrl_human, use_rep='X_pca_harmony',  n_neighbors=int(0.5 * len(gur_2022_ctrl_human) ** 0.5 // 2), metric='cosine')
tk.tl.triku(gur_2022_ctrl_human)

In [None]:
# UMAP embedding and Leiden clustering (resolution 4) on the kNN graph, both seeded.
sc.tl.umap(gur_2022_ctrl_human, min_dist=0.05, random_state=seed)
sc.tl.leiden(gur_2022_ctrl_human, resolution=4,  random_state=seed)

In [None]:
sc.pl.umap(gur_2022_ctrl_human, color=['leiden', 'Internal sample identifier'], legend_loc='on data')

In [None]:
sc.pl.umap(gur_2022_ctrl_human, color=['leiden', 'LUM', 'PDGFRA', 'COL1A1', 'DCN', 'KRT5', 'KRT14', 'C2orf40', 'ANGPTL7', 'SOX8', 'TIAM1',
                                 'GPM6B','PLP1','S100B','SCN7A','NRXN1','GFRA3','MPZ', 'IGFBP2'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
assign_cats(gur_2022_ctrl_human, dict_cats=dict_cats_fb, quantile_gene_sel=0.8, min_score=0.45)

In [None]:
sc.pl.umap(gur_2022_ctrl_human, color=['leiden', 'assigned_cats', 'PDGFRA', 'LUM', 'DCN'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# For every category in dict_cats_fb, print its name and plot the assignment next to those of
# its genes that are present in this dataset.
for key, val in dict_cats_fb.items():
    print(key)
    sc.pl.umap(
        gur_2022_ctrl_human,
        color=['assigned_cats', *(gene for gene in val if gene in gur_2022_ctrl_human.var_names)],
        legend_loc='on data',
        cmap=magma,
        use_raw=False,
        ncols=4,
    )

In [None]:
# Keep cells whose assigned category is 'fibro', 'F' or 'mt'. .copy() detaches the subset from
# the parent: the following cells modify it in place, which on a view triggers an implicit copy.
gur_2022_ctrl_human_fb = gur_2022_ctrl_human[gur_2022_ctrl_human.obs['assigned_cats'].isin(['fibro', 'F', 'mt'])].copy()

In [None]:
sc.pp.filter_genes(gur_2022_ctrl_human_fb, min_counts=1)

In [None]:
# Re-embed the fibroblast subset: 30-PC PCA -> Harmony across samples -> cosine kNN graph on
# 'X_pca_harmony' with n_neighbors = floor(sqrt(n_cells) / 4) -> triku on that graph.
# NOTE(review): PCA/neighbors are not recomputed after triku in this cell — confirm intended.
sc.pp.pca(gur_2022_ctrl_human_fb, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(gur_2022_ctrl_human_fb, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(gur_2022_ctrl_human_fb, use_rep='X_pca_harmony',  n_neighbors=int(0.5 * len(gur_2022_ctrl_human_fb) ** 0.5 // 2), metric='cosine')
tk.tl.triku(gur_2022_ctrl_human_fb)

In [None]:
# UMAP of the fibroblasts and Leiden clustering (resolution 3), both seeded.
sc.tl.umap(gur_2022_ctrl_human_fb, min_dist=0.35, random_state=seed)
sc.tl.leiden(gur_2022_ctrl_human_fb, resolution=3, random_state=seed)

In [None]:
# Annotate fibroblasts with assign_cats: first a 'cluster' label from dict_cats_clusters, then an
# 'axis' label from dict_cats_axes computed over the 'cluster' groups (column_groupby).
# 'U' is presumably the label for unassigned groups; thresholds are tuned per dataset — confirm
# their exact semantics in cellassign.
assign_cats(gur_2022_ctrl_human_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.9, key_added='cluster', others_name='U')
assign_cats(gur_2022_ctrl_human_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, key_added='axis', 
             intermediate_states=True, diff=0.15, others_name='U')

In [None]:
# Fix the 'cluster' palette: one colour per label in sorted order, grey (#bcbcbc) for labels
# missing from dict_colors_human.
# NOTE(review): assumes the category order of obs['cluster'] equals sorted(set(...)) — confirm.
gur_2022_ctrl_human_fb.uns['cluster_colors'] = [
    dict_colors_human.get(label, '#bcbcbc')
    for label in sorted(set(gur_2022_ctrl_human_fb.obs['cluster']))
]

In [None]:
sc.pl.umap(gur_2022_ctrl_human_fb, color=['Internal sample identifier', 'leiden', 'axis', 'cluster', 'log1p_n_genes_by_counts'], legend_loc='on data', 
           cmap=magma, use_raw=False, ncols=2)

In [None]:
del gur_2022_ctrl_human_fb.obs['C3']

In [None]:
# For every population in dict_cats_clusters, print its name and plot the 'cluster' labels next
# to those of its genes that are present in this dataset.
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(
        gur_2022_ctrl_human_fb,
        color=['cluster', *(gene for gene in val if gene in gur_2022_ctrl_human_fb.var_names)],
        legend_loc='on data',
        cmap=magma,
        use_raw=False,
        ncols=4,
    )

In [None]:
plot_score_graph(gur_2022_ctrl_human_fb)

In [None]:
# Persist results. clear_adata (project helper) is applied to the fibroblast object only —
# presumably it strips bulky/unserialisable fields, confirm — then both objects are written.
clear_adata(gur_2022_ctrl_human_fb)
gur_2022_ctrl_human_fb.write_h5ad(gur_2022_dir + '/gur_2022_ctrl_human_fb_processed.h5')
gur_2022_ctrl_human.write_h5ad(gur_2022_dir + '/gur_2022_ctrl_human_processed.h5')

In [None]:
gur_2022_ctrl_human_fb = sc.read(gur_2022_dir + '/gur_2022_ctrl_human_fb_processed.h5')

### He et al. 2020

In [None]:
he_dir = data_dir + '/He_2020'

In [None]:
# Load the He et al. 2020 control dataset and make duplicated gene names unique in place.
he_2020_ctrl_human = sc.read_h5ad(he_dir + '/adata_he_2020_ctrl_human.h5')
he_2020_ctrl_human.var_names_make_unique()

In [None]:
# Harmonise gene symbols through dict_rep (e.g. CCN5 -> WISP2, a key gene for the annotation);
# symbols without an entry are kept unchanged.
he_2020_ctrl_human.var_names = [dict_rep.get(gene, gene) for gene in he_2020_ctrl_human.var_names]

In [None]:
# QC metrics: flag mitochondrial genes (names starting with 'MT-') in .var['mt'], then compute
# per-cell metrics (n_genes_by_counts, total_counts, pct_counts_mt) used by the filter cell below.
he_2020_ctrl_human.var['mt'] = he_2020_ctrl_human.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(he_2020_ctrl_human, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
# Inspect the QC metrics before filtering: per-metric violins, then total_counts against
# mitochondrial fraction and against number of detected genes.
sc.pl.violin(he_2020_ctrl_human, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'], jitter=0.4, multi_panel=True)
sc.pl.scatter(he_2020_ctrl_human, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(he_2020_ctrl_human, x='total_counts', y='n_genes_by_counts')

In [None]:
# QC filter: keep cells with 600 < n_genes_by_counts < 5000 and < 30 % mitochondrial counts.
he_2020_ctrl_human = he_2020_ctrl_human[he_2020_ctrl_human.obs.n_genes_by_counts < 5000, :]
he_2020_ctrl_human = he_2020_ctrl_human[he_2020_ctrl_human.obs.n_genes_by_counts > 600, :]
# .copy() turns the filtered view into a standalone AnnData before the in-place preprocessing
# that follows (avoids the implicit view-to-copy conversion and its warning).
he_2020_ctrl_human = he_2020_ctrl_human[he_2020_ctrl_human.obs.pct_counts_mt < 30, :].copy()

In [None]:
# Drop genes with no counts left after cell filtering, normalise each cell's total counts
# (scanpy default target) and log1p-transform, all in place.
sc.pp.filter_genes(he_2020_ctrl_human, min_counts=1)
sc.pp.normalize_total(he_2020_ctrl_human)
sc.pp.log1p(he_2020_ctrl_human)

In [None]:
# 30-PC PCA -> cosine kNN graph (no Harmony step for this dataset) with
# n_neighbors = floor(sqrt(n_cells) / 8) -> triku (feature selection, per its docs) on that graph.
# NOTE(review): this is the only cell dividing by 4 instead of 2 in n_neighbors — confirm intended.
sc.pp.pca(he_2020_ctrl_human, random_state=seed, n_comps=30)
sc.pp.neighbors(he_2020_ctrl_human, random_state=seed, n_neighbors=int(0.5 * len(he_2020_ctrl_human) ** 0.5 // 4), metric='cosine')
tk.tl.triku(he_2020_ctrl_human)

In [None]:
# UMAP embedding and Leiden clustering (resolution 3) on the kNN graph, both seeded.
sc.tl.umap(he_2020_ctrl_human, min_dist=0.3, random_state=seed)
sc.tl.leiden(he_2020_ctrl_human, resolution=3, random_state=seed)

In [None]:
sc.pl.umap(he_2020_ctrl_human, color=['leiden', 'LUM', 'PDGFRA', 'COL1A1', 'DCN'], legend_loc='on data', cmap=magma, use_raw=False, ncols=2)

In [None]:
assign_cats(he_2020_ctrl_human, dict_cats=dict_cats_fb, min_score=0.3, quantile_gene_sel=0.2)

In [None]:
# For every category in dict_cats_fb, print its name and plot the assignment next to those of
# its genes that are present in this dataset.
for key, val in dict_cats_fb.items():
    print(key)
    sc.pl.umap(
        he_2020_ctrl_human,
        color=['assigned_cats', *(gene for gene in val if gene in he_2020_ctrl_human.var_names)],
        legend_loc='on data',
        cmap=magma,
        use_raw=False,
        ncols=4,
    )

In [None]:
sc.pl.umap(he_2020_ctrl_human, color=['leiden', 'assigned_cats', 'CDH19', 'ANGPTL7', 'PLEKHB1', 'ENTPD2', 'C2orf40', 
                           'SLC2A1', 'CLDN1', 'TNNT2', 'C19orf33', 'SFRP5'], legend_loc='on data', cmap=magma, use_raw=False, ncols=2)

In [None]:
sc.pl.umap(he_2020_ctrl_human, color=['leiden', 'assigned_cats'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# Keep cells assigned to 'fibro', 'neuro', 'muscle' or 'fibro - ANGPTL7' (non-fibroblast labels
# are removed again after the first fibroblast-level annotation below). .copy() detaches the
# subset from the parent, since the following cells modify it in place.
he_2020_ctrl_human_fb = he_2020_ctrl_human[he_2020_ctrl_human.obs['assigned_cats'].isin(['fibro', 'neuro', 'muscle', 'fibro - ANGPTL7'])].copy()

In [None]:
sc.pp.filter_genes(he_2020_ctrl_human_fb, min_counts=1)

In [None]:
# First-pass embedding of the candidate fibroblasts: 50-PC PCA -> cosine kNN graph with
# n_neighbors = floor(sqrt(n_cells) / 4) -> triku on that graph (no Harmony for this dataset).
sc.pp.pca(he_2020_ctrl_human_fb, random_state=seed, n_comps=50)
sc.pp.neighbors(he_2020_ctrl_human_fb, random_state=seed, n_neighbors=int(0.5 * len(he_2020_ctrl_human_fb) ** 0.5 // 2), metric='cosine')
tk.tl.triku(he_2020_ctrl_human_fb)

In [None]:
# First-pass UMAP and a very fine Leiden partition (resolution 15), both seeded.
sc.tl.umap(he_2020_ctrl_human_fb, min_dist=0.2, random_state=seed)
sc.tl.leiden(he_2020_ctrl_human_fb, resolution=15, random_state=seed)

In [None]:
# First-pass annotation with assign_cats: a 'cluster' label from dict_cats_clusters, then an
# 'axis' label from dict_cats_axes computed over the 'cluster' groups (column_groupby).
# others_name is left at its default here; the subsequent filter removes 'unassigned', so that
# is presumably the default label — confirm in cellassign.
assign_cats(he_2020_ctrl_human_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.95, key_added='cluster')
assign_cats(he_2020_ctrl_human_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.35, key_added='axis', 
            quantile_gene_sel=0.4, intermediate_states=True, diff=0.15)

In [None]:
# Fix the 'cluster' palette: one colour per label in sorted order, grey (#bcbcbc) for labels
# missing from dict_colors_human.
# NOTE(review): assumes the category order of obs['cluster'] equals sorted(set(...)) — confirm.
he_2020_ctrl_human_fb.uns['cluster_colors'] = [
    dict_colors_human.get(label, '#bcbcbc')
    for label in sorted(set(he_2020_ctrl_human_fb.obs['cluster']))
]

In [None]:
sc.pl.umap(he_2020_ctrl_human_fb, color=['leiden', 'axis', 'cluster'], legend_loc='on data', 
           cmap=magma, use_raw=False, )

In [None]:
sc.pl.umap(he_2020_ctrl_human_fb, color=['DMKN', 'KRT5', 'KRT14'], legend_loc='on data', 
           cmap=magma, use_raw=False, )

In [None]:
# Remove cells labelled 'Glial', 'unassigned' or 'endo' in the first-pass annotation.
# .copy() materialises the result so the in-place steps that follow do not operate on a view.
he_2020_ctrl_human_fb = he_2020_ctrl_human_fb[~ he_2020_ctrl_human_fb.obs['cluster'].isin(['Glial', 'unassigned', 'endo'])].copy()

In [None]:
sc.pp.filter_genes(he_2020_ctrl_human_fb, min_counts=1)

In [None]:
# Second-pass embedding after removing non-fibroblast cells: 50-PC PCA -> cosine kNN graph with
# n_neighbors = floor(sqrt(n_cells) / 4) -> triku on that graph.
sc.pp.pca(he_2020_ctrl_human_fb, random_state=seed, n_comps=50)
sc.pp.neighbors(he_2020_ctrl_human_fb, random_state=seed, n_neighbors=int(0.5 * len(he_2020_ctrl_human_fb) ** 0.5 // 2), metric='cosine')
tk.tl.triku(he_2020_ctrl_human_fb)

In [None]:
# Second-pass UMAP and Leiden clustering (resolution 7), both seeded.
sc.tl.umap(he_2020_ctrl_human_fb, min_dist=0.4, random_state=seed)
sc.tl.leiden(he_2020_ctrl_human_fb, resolution=7, random_state=seed)

In [None]:
# Final annotation with assign_cats: a 'cluster' label from dict_cats_clusters, then an 'axis'
# label from dict_cats_axes computed over the 'cluster' groups (column_groupby).
# 'U' is presumably the label for unassigned groups; thresholds are tuned per dataset — confirm
# their exact semantics in cellassign.
assign_cats(he_2020_ctrl_human_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.75, key_added='cluster', others_name='U')
assign_cats(he_2020_ctrl_human_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.35, key_added='axis', 
            quantile_gene_sel=0.4, intermediate_states=True, diff=0.15, others_name='U')

In [None]:
# Fix the 'cluster' palette: one colour per label in sorted order, grey (#bcbcbc) for labels
# missing from dict_colors_human.
# NOTE(review): assumes the category order of obs['cluster'] equals sorted(set(...)) — confirm.
he_2020_ctrl_human_fb.uns['cluster_colors'] = [
    dict_colors_human.get(label, '#bcbcbc')
    for label in sorted(set(he_2020_ctrl_human_fb.obs['cluster']))
]

In [None]:
sc.pl.umap(he_2020_ctrl_human_fb, color=['leiden', 'axis', 'cluster'], legend_loc='on data', 
           cmap=magma, use_raw=False, )

In [None]:
del he_2020_ctrl_human_fb.obs['C3']

In [None]:
# Rank genes for Leiden clusters '1', '2' and '3' (scanpy defaults for method and reference) and
# show the top 100 genes per group as a tracksplot.
sc.tl.rank_genes_groups(he_2020_ctrl_human_fb, groupby='leiden', groups=['1', '2', '3'])
sc.pl.rank_genes_groups_tracksplot(he_2020_ctrl_human_fb, dendrogram=False, n_genes=100)

In [None]:
# For every population in dict_cats_clusters, print its name and plot the 'cluster' labels next
# to those of its genes that are present in this dataset.
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(
        he_2020_ctrl_human_fb,
        color=['cluster', *(gene for gene in val if gene in he_2020_ctrl_human_fb.var_names)],
        legend_loc='on data',
        cmap=magma,
        use_raw=False,
        ncols=4,
    )

In [None]:
plot_score_graph(he_2020_ctrl_human_fb)

In [None]:
# Persist results. clear_adata (project helper) is applied to the fibroblast object only —
# presumably it strips bulky/unserialisable fields, confirm — then both objects are written.
clear_adata(he_2020_ctrl_human_fb)
he_2020_ctrl_human_fb.write_h5ad(he_dir + '/he_2020_ctrl_human_fb_processed.h5')
he_2020_ctrl_human.write_h5ad(he_dir + '/he_2020_ctrl_human_processed.h5')

In [None]:
he_2020_ctrl_human_fb = sc.read(he_dir + '/he_2020_ctrl_human_fb_processed.h5')

### Hughes et al. 2020

In [None]:
hughes_dir = data_dir + '/hughes_2020'

In [None]:
# Load the full Hughes et al. 2020 dataset and keep only the three normal-skin samples.
# .copy() materialises the subset: slicing an AnnData returns a view, and the next cells modify
# it in place, which would otherwise trigger an implicit copy (ImplicitModificationWarning).
hughes_2020_all = sc.read(hughes_dir + '/adata_hughes_2020_all.h5')
hughes_2020_ctrl_human = hughes_2020_all[hughes_2020_all.obs['Internal sample identifier'].isin(['Normal1', 'Normal2', 'Normal3'])].copy()

In [None]:
sc.pp.filter_genes(hughes_2020_ctrl_human, min_counts=1)

In [None]:
# QC metrics: flag mitochondrial genes (names starting with 'MT-') in .var['mt'], then compute
# per-cell metrics (n_genes_by_counts, total_counts, pct_counts_mt) used by the filter cell below.
hughes_2020_ctrl_human.var['mt'] = hughes_2020_ctrl_human.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(hughes_2020_ctrl_human, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
# Inspect the QC metrics before filtering: per-metric violins, then total_counts against
# mitochondrial fraction and against number of detected genes.
sc.pl.violin(hughes_2020_ctrl_human, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(hughes_2020_ctrl_human, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(hughes_2020_ctrl_human, x='total_counts', y='n_genes_by_counts')

In [None]:
# QC filter: keep cells with 200 < n_genes_by_counts < 3000 and < 15 % mitochondrial counts.
hughes_2020_ctrl_human = hughes_2020_ctrl_human[((hughes_2020_ctrl_human.obs.n_genes_by_counts < 3000) &
                                    (hughes_2020_ctrl_human.obs.n_genes_by_counts > 200)).values, :]
# .copy() turns the filtered view into a standalone AnnData before the in-place preprocessing
# that follows (avoids the implicit view-to-copy conversion and its warning).
hughes_2020_ctrl_human = hughes_2020_ctrl_human[hughes_2020_ctrl_human.obs.pct_counts_mt < 15, :].copy()

In [None]:
# Per-sample distribution of detected genes after QC, overlaid on one axis so that
# sample-specific differences in depth are visible.
fig, ax = plt.subplots(figsize=(8, 4))
batches = sorted(list(set(hughes_2020_ctrl_human.obs['Internal sample identifier'].values)))
for batch in batches:
    counts = hughes_2020_ctrl_human.obs['n_genes_by_counts'].loc[hughes_2020_ctrl_human.obs['Internal sample identifier'] == batch].values
    # histplot replaces the deprecated sns.distplot; density scaling plus KDE keeps samples with
    # different cell numbers comparable, and the label makes each sample identifiable.
    sns.histplot(counts, kde=True, stat='density', label=batch, ax=ax)
ax.set_xlabel('n_genes_by_counts')
ax.legend(title='Internal sample identifier')

In [None]:
# Drop genes with no counts left after cell filtering, normalise each cell's total counts
# (scanpy default target) and log1p-transform, all in place.
sc.pp.filter_genes(hughes_2020_ctrl_human, min_counts=1)
sc.pp.normalize_total(hughes_2020_ctrl_human)
sc.pp.log1p(hughes_2020_ctrl_human)

In [None]:
# 30-PC PCA -> Harmony integration across samples -> cosine kNN graph on 'X_pca_harmony' with
# n_neighbors = floor(sqrt(n_cells) / 4) -> triku (feature selection, per its docs) on that graph.
# NOTE(review): PCA/neighbors are not recomputed after triku in this cell — confirm intended.
sc.pp.pca(hughes_2020_ctrl_human, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(hughes_2020_ctrl_human, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(hughes_2020_ctrl_human, use_rep='X_pca_harmony',  n_neighbors=int(0.5 * len(hughes_2020_ctrl_human) ** 0.5 // 2), metric='cosine')
tk.tl.triku(hughes_2020_ctrl_human)

In [None]:
# UMAP embedding and Leiden clustering (resolution 2.5) on the kNN graph, both seeded.
sc.tl.umap(hughes_2020_ctrl_human, min_dist=0.1, random_state=seed)
sc.tl.leiden(hughes_2020_ctrl_human, resolution=2.5, random_state=seed)

In [None]:
sc.pl.umap(hughes_2020_ctrl_human, color=['leiden', 'Internal sample identifier'], legend_loc='on data')

In [None]:
sc.pl.umap(hughes_2020_ctrl_human, color=['leiden', 'LUM', 'PDGFRA', 'COL1A1', 'DCN', 'RGS5', 'PMEL'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
assign_cats(hughes_2020_ctrl_human, dict_cats=dict_cats_fb, min_score=0.3, quantile_gene_sel=0.99)

In [None]:
# For every category in dict_cats_fb, print its name and plot the assignment next to those of
# its genes that are present in this dataset.
for key, val in dict_cats_fb.items():
    print(key)
    sc.pl.umap(
        hughes_2020_ctrl_human,
        color=['assigned_cats', *(gene for gene in val if gene in hughes_2020_ctrl_human.var_names)],
        legend_loc='on data',
        cmap=magma,
        use_raw=False,
        ncols=4,
    )

In [None]:
sc.pl.umap(hughes_2020_ctrl_human, color=['leiden', 'assigned_cats'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# Keep cells whose assigned category is 'fibro' or 'F'. .copy() detaches the subset from the
# parent: the following cells modify it in place, which on a view triggers an implicit copy.
hughes_2020_ctrl_human_fb = hughes_2020_ctrl_human[hughes_2020_ctrl_human.obs['assigned_cats'].isin(['fibro', 'F'])].copy()

In [None]:
sc.pp.filter_genes(hughes_2020_ctrl_human_fb, min_counts=1)

In [None]:
# Re-embed the fibroblast subset: 30-PC PCA -> Harmony across samples -> cosine kNN graph on
# 'X_pca_harmony' with n_neighbors = floor(sqrt(n_cells) / 4) -> triku on that graph.
# NOTE(review): PCA/neighbors are not recomputed after triku in this cell — confirm intended.
sc.pp.pca(hughes_2020_ctrl_human_fb, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(hughes_2020_ctrl_human_fb, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(hughes_2020_ctrl_human_fb, use_rep='X_pca_harmony',  n_neighbors=int(0.5 * len(hughes_2020_ctrl_human_fb) ** 0.5 // 2), metric='cosine')
tk.tl.triku(hughes_2020_ctrl_human_fb)

In [None]:
# UMAP of the fibroblasts and a fine Leiden partition (resolution 9), both seeded.
sc.tl.umap(hughes_2020_ctrl_human_fb, min_dist=0.35, random_state=seed)
sc.tl.leiden(hughes_2020_ctrl_human_fb, resolution=9, random_state=seed)

In [None]:
# Annotate fibroblasts with assign_cats: first a 'cluster' label from dict_cats_clusters, then an
# 'axis' label from dict_cats_axes computed over the 'cluster' groups (column_groupby).
# 'U' is presumably the label for unassigned groups; thresholds are tuned per dataset — confirm
# their exact semantics in cellassign.
assign_cats(hughes_2020_ctrl_human_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.75, key_added='cluster', others_name='U')
assign_cats(hughes_2020_ctrl_human_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, key_added='axis', 
             intermediate_states=True, diff=0.15, others_name='U')

In [None]:
# Fix the 'cluster' palette: one colour per label in sorted order, grey (#bcbcbc) for labels
# missing from dict_colors_human.
# NOTE(review): assumes the category order of obs['cluster'] equals sorted(set(...)) — confirm.
hughes_2020_ctrl_human_fb.uns['cluster_colors'] = [
    dict_colors_human.get(label, '#bcbcbc')
    for label in sorted(set(hughes_2020_ctrl_human_fb.obs['cluster']))
]

In [None]:
sc.pl.umap(hughes_2020_ctrl_human_fb, color=['Internal sample identifier', 'leiden', 'axis', 'cluster'], legend_loc='on data', 
           cmap=magma, use_raw=False, ncols=2)

In [None]:
del hughes_2020_ctrl_human_fb.obs['C3']

In [None]:
# For every population in dict_cats_clusters, print its name and plot the 'cluster' labels next
# to those of its genes that are present in this dataset.
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(
        hughes_2020_ctrl_human_fb,
        color=['cluster', *(gene for gene in val if gene in hughes_2020_ctrl_human_fb.var_names)],
        legend_loc='on data',
        cmap=magma,
        use_raw=False,
        ncols=4,
    )

In [None]:
plot_score_graph(hughes_2020_ctrl_human_fb)

In [None]:
# Persist results. clear_adata (project helper) is applied to the fibroblast object only —
# presumably it strips bulky/unserialisable fields, confirm — then both objects are written.
clear_adata(hughes_2020_ctrl_human_fb)
hughes_2020_ctrl_human_fb.write_h5ad(hughes_dir + '/hughes_2020_ctrl_human_fb_processed.h5')
hughes_2020_ctrl_human.write_h5ad(hughes_dir + '/hughes_2020_ctrl_human_processed.h5')

In [None]:
hughes_2020_ctrl_human_fb = sc.read(hughes_dir + '/hughes_2020_ctrl_human_fb_processed.h5')

### Kim et al. 2020

In [None]:
kim_dir_2020 = data_dir + '/Kim_2020'

In [None]:
# Load the Kim et al. 2020 control dataset and make duplicated gene names unique in place.
kim_2020_ctrl_human = sc.read(kim_dir_2020 + '/adata_kim_2020_ctrl_human.h5')
kim_2020_ctrl_human.var_names_make_unique()

In [None]:
# Harmonise gene symbols through dict_rep; symbols without an entry are kept unchanged.
kim_2020_ctrl_human.var_names = [dict_rep.get(gene, gene) for gene in kim_2020_ctrl_human.var_names]

In [None]:
sc.pp.filter_genes(kim_2020_ctrl_human, min_counts=1)

In [None]:
# Convert the sparse expression matrix to a dense ndarray.
kim_2020_ctrl_human.X = kim_2020_ctrl_human.X.toarray()

In [None]:
# QC metrics: flag mitochondrial genes (names starting with 'MT-') in .var['mt'], then compute
# per-cell metrics (n_genes_by_counts, total_counts, pct_counts_mt) used by the filter cell below.
kim_2020_ctrl_human.var['mt'] = kim_2020_ctrl_human.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(kim_2020_ctrl_human, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
# Inspect the QC metrics before filtering: per-metric violins, then total_counts against
# mitochondrial fraction and against number of detected genes.
sc.pl.violin(kim_2020_ctrl_human, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(kim_2020_ctrl_human, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(kim_2020_ctrl_human, x='total_counts', y='n_genes_by_counts')

In [None]:
# QC filter: keep cells with 500 < n_genes_by_counts < 4000 and < 25 % mitochondrial counts.
kim_2020_ctrl_human = kim_2020_ctrl_human[((kim_2020_ctrl_human.obs.n_genes_by_counts < 4000) &
                                    (kim_2020_ctrl_human.obs.n_genes_by_counts > 500)).values, :]
# .copy() turns the filtered view into a standalone AnnData before the in-place preprocessing
# that follows (avoids the implicit view-to-copy conversion and its warning).
kim_2020_ctrl_human = kim_2020_ctrl_human[kim_2020_ctrl_human.obs.pct_counts_mt < 25, :].copy()

In [None]:
# Per-sample distribution of detected genes after QC, overlaid on one axis so that
# sample-specific differences in depth are visible.
fig, ax = plt.subplots(figsize=(8, 4))
batches = sorted(list(set(kim_2020_ctrl_human.obs['Internal sample identifier'].values)))
for batch in batches:
    counts = kim_2020_ctrl_human.obs['n_genes_by_counts'].loc[kim_2020_ctrl_human.obs['Internal sample identifier'] == batch].values
    # histplot replaces the deprecated sns.distplot; density scaling plus KDE keeps samples with
    # different cell numbers comparable, and the label makes each sample identifiable.
    sns.histplot(counts, kde=True, stat='density', label=batch, ax=ax)
ax.set_xlabel('n_genes_by_counts')
ax.legend(title='Internal sample identifier')

In [None]:
# Drop genes with no counts left after cell filtering, normalise each cell's total counts
# (scanpy default target) and log1p-transform, all in place.
sc.pp.filter_genes(kim_2020_ctrl_human, min_counts=1)
sc.pp.normalize_total(kim_2020_ctrl_human)
sc.pp.log1p(kim_2020_ctrl_human)

In [None]:
# 30-PC PCA -> Harmony integration across samples -> cosine kNN graph on 'X_pca_harmony' with
# n_neighbors = floor(sqrt(n_cells) / 4) -> triku (feature selection, per its docs) on that graph.
# NOTE(review): PCA/neighbors are not recomputed after triku in this cell — confirm intended.
sc.pp.pca(kim_2020_ctrl_human, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(kim_2020_ctrl_human, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(kim_2020_ctrl_human, use_rep='X_pca_harmony',  n_neighbors=int(0.5 * len(kim_2020_ctrl_human) ** 0.5 // 2), metric='cosine')
tk.tl.triku(kim_2020_ctrl_human)

In [None]:
# UMAP embedding and a coarse Leiden clustering (resolution 0.7) on the kNN graph, both seeded.
sc.tl.umap(kim_2020_ctrl_human, min_dist=0.1, random_state=seed)
sc.tl.leiden(kim_2020_ctrl_human, resolution=0.7, random_state=seed)

In [None]:
sc.pl.umap(kim_2020_ctrl_human, color=['leiden', 'Internal sample identifier'], legend_loc='on data')

In [None]:
sc.pl.umap(kim_2020_ctrl_human, color=['leiden', 'LUM', 'PDGFRA', 'COL1A1', 'DCN', 'S100B', 'MPZ', 'DMKN', 'RGS5', 'C2orf40'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
assign_cats(kim_2020_ctrl_human, dict_cats=dict_cats_fb)

In [None]:
sc.pl.umap(kim_2020_ctrl_human, color=['leiden', 'assigned_cats'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# For every category in dict_cats_fb, print its name and plot the assignment next to those of
# its genes that are present in this dataset.
for key, val in dict_cats_fb.items():
    print(key)
    sc.pl.umap(
        kim_2020_ctrl_human,
        color=['assigned_cats', *(gene for gene in val if gene in kim_2020_ctrl_human.var_names)],
        legend_loc='on data',
        cmap=magma,
        use_raw=False,
        ncols=4,
    )

In [None]:
kim_2020_ctrl_human_fb = kim_2020_ctrl_human[kim_2020_ctrl_human.obs['assigned_cats'] == 'fibro']

In [None]:
np.unique(kim_2020_ctrl_human_fb.obs['Internal sample identifier'].values, return_counts=True)

In [None]:
# Restrict the fibroblasts to samples HC1-HC4 and HC6 (selection follows the per-sample cell
# counts printed above). .copy() materialises the subset before the in-place steps that follow.
kim_2020_ctrl_human_fb = kim_2020_ctrl_human_fb[kim_2020_ctrl_human_fb.obs['Internal sample identifier'].isin(['HC1', 'HC2', 'HC3', 'HC4', 'HC6'])].copy()

In [None]:
sc.pp.filter_genes(kim_2020_ctrl_human_fb, min_counts=1)

In [None]:
# Re-embed the fibroblast subset: 30-PC PCA -> Harmony across samples -> cosine kNN graph on
# 'X_pca_harmony' with n_neighbors = floor(sqrt(n_cells) / 4) -> triku on that graph.
# NOTE(review): PCA/neighbors are not recomputed after triku in this cell — confirm intended.
sc.pp.pca(kim_2020_ctrl_human_fb, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(kim_2020_ctrl_human_fb, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(kim_2020_ctrl_human_fb, use_rep='X_pca_harmony',  n_neighbors=int(0.5 * len(kim_2020_ctrl_human_fb) ** 0.5 // 2), metric='cosine')
tk.tl.triku(kim_2020_ctrl_human_fb)

In [None]:
# UMAP of the fibroblasts and a deliberately fine Leiden partition (resolution 10), both seeded.
sc.tl.umap(kim_2020_ctrl_human_fb, min_dist=0.8, random_state=seed)
sc.tl.leiden(kim_2020_ctrl_human_fb, resolution=10, random_state=seed)

In [None]:
# Annotate fibroblasts with assign_cats: first a 'cluster' label from dict_cats_clusters, then an
# 'axis' label from dict_cats_axes computed over the 'cluster' groups (column_groupby).
# 'U' is presumably the label for unassigned groups; thresholds are tuned per dataset — confirm
# their exact semantics in cellassign.
assign_cats(kim_2020_ctrl_human_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.85, key_added='cluster', others_name='U')
assign_cats(kim_2020_ctrl_human_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.5, quantile_gene_sel=0.9, key_added='axis', 
             intermediate_states=True, diff=0.15, others_name='U')

In [None]:
# Fix the 'cluster' palette: one colour per label in sorted order, grey (#bcbcbc) for labels
# missing from dict_colors_human.
# NOTE(review): assumes the category order of obs['cluster'] equals sorted(set(...)) — confirm.
kim_2020_ctrl_human_fb.uns['cluster_colors'] = [
    dict_colors_human.get(label, '#bcbcbc')
    for label in sorted(set(kim_2020_ctrl_human_fb.obs['cluster']))
]

In [None]:
sc.pl.umap(kim_2020_ctrl_human_fb, color=['Internal sample identifier', 'leiden', 'axis', 'cluster'], legend_loc='on data', 
           cmap=magma, use_raw=False, ncols=2)

In [None]:
del kim_2020_ctrl_human_fb.obs['C3']

In [None]:
# Inspect the 'U' cluster: Wilcoxon-ranked genes for 'U' (scanpy default reference) with the
# top 100 shown as a tracksplot.
sc.tl.rank_genes_groups(kim_2020_ctrl_human_fb, groupby='cluster', groups=['U'], method='wilcoxon')
sc.pl.rank_genes_groups_tracksplot(kim_2020_ctrl_human_fb, dendrogram=False, n_genes=100)

In [None]:
# For every population in dict_cats_clusters, print its name and plot the 'cluster' labels next
# to those of its genes that are present in this dataset.
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(
        kim_2020_ctrl_human_fb,
        color=['cluster', *(gene for gene in val if gene in kim_2020_ctrl_human_fb.var_names)],
        legend_loc='on data',
        cmap=magma,
        use_raw=False,
        ncols=4,
    )

In [None]:
plot_score_graph(kim_2020_ctrl_human_fb)

In [None]:
# Persist results. clear_adata (project helper) is applied to the fibroblast object only —
# presumably it strips bulky/unserialisable fields, confirm — then both objects are written.
clear_adata(kim_2020_ctrl_human_fb)
kim_2020_ctrl_human_fb.write_h5ad(kim_dir_2020 + '/kim_2020_ctrl_human_fb_processed.h5')
kim_2020_ctrl_human.write_h5ad(kim_dir_2020 + '/kim_2020_ctrl_human_processed.h5')

In [None]:
kim_2020_ctrl_human_fb = sc.read(kim_dir_2020 + '/kim_2020_ctrl_human_fb_processed.h5')

### Kim et al. 2021 [NAIL, EXPECTING DIFFERENT RESULTS]

In [None]:
kim_dir_2021 = data_dir + '/kim_2021'

In [None]:
# Load the Kim et al. 2021 control dataset and make duplicated gene names unique in place.
kim_2021_ctrl_human = sc.read(kim_dir_2021 + '/kim_2021_ctrl_human.h5')
kim_2021_ctrl_human.var_names_make_unique()

In [None]:
# Harmonise gene symbols through dict_rep; symbols without an entry are kept unchanged.
kim_2021_ctrl_human.var_names = [dict_rep.get(gene, gene) for gene in kim_2021_ctrl_human.var_names]

In [None]:
sc.pp.filter_genes(kim_2021_ctrl_human, min_counts=1)

In [None]:
# Convert the sparse expression matrix to a dense ndarray.
kim_2021_ctrl_human.X = kim_2021_ctrl_human.X.toarray()

In [None]:
# QC metrics: flag mitochondrial genes (names starting with 'MT-') in .var['mt'], then compute
# per-cell metrics (n_genes_by_counts, total_counts, pct_counts_mt) used by the filter cell below.
kim_2021_ctrl_human.var['mt'] = kim_2021_ctrl_human.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(kim_2021_ctrl_human, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
# Inspect the QC metrics before filtering: per-metric violins, then total_counts against
# mitochondrial fraction and against number of detected genes.
sc.pl.violin(kim_2021_ctrl_human, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(kim_2021_ctrl_human, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(kim_2021_ctrl_human, x='total_counts', y='n_genes_by_counts')

In [None]:
# QC filter: keep cells with 1000 < n_genes_by_counts < 6000 and < 40 % mitochondrial counts.
# Both steps must be assigned back to `kim_2021_ctrl_human`; assigning the gene-count filter to
# any other name leaves it without effect on the downstream analysis.
kim_2021_ctrl_human = kim_2021_ctrl_human[((kim_2021_ctrl_human.obs.n_genes_by_counts < 6000) &
                                    (kim_2021_ctrl_human.obs.n_genes_by_counts > 1000)).values, :]
kim_2021_ctrl_human = kim_2021_ctrl_human[kim_2021_ctrl_human.obs.pct_counts_mt < 40, :]

In [None]:
kim_2021_ctrl_human

In [None]:
# Per-sample distribution of detected genes after QC, overlaid on one axis so that
# sample-specific differences in depth are visible.
fig, ax = plt.subplots(figsize=(8, 4))
batches = sorted(list(set(kim_2021_ctrl_human.obs['Internal sample identifier'].values)))
for batch in batches:
    counts = kim_2021_ctrl_human.obs['n_genes_by_counts'].loc[kim_2021_ctrl_human.obs['Internal sample identifier'] == batch].values
    # histplot replaces the deprecated sns.distplot; density scaling plus KDE keeps samples with
    # different cell numbers comparable, and the label makes each sample identifiable.
    sns.histplot(counts, kde=True, stat='density', label=batch, ax=ax)
ax.set_xlabel('n_genes_by_counts')
ax.legend(title='Internal sample identifier')

In [None]:
# Drop genes that are left without any count once the low-quality cells are removed.
sc.pp.filter_genes(kim_2021_ctrl_human, min_counts=1)
# Normalise each cell by its total counts (scanpy defaults) and log-transform with log(1 + x).
sc.pp.normalize_total(kim_2021_ctrl_human)
sc.pp.log1p(kim_2021_ctrl_human)

In [None]:
# 30-component PCA, then Harmony integration across samples (stored as 'X_pca_harmony').
sc.pp.pca(kim_2021_ctrl_human, n_comps=30, random_state=seed)
sce.pp.harmony_integrate(
    kim_2021_ctrl_human,
    key='Internal sample identifier',
    max_iter_harmony=50,
)

# kNN graph on the Harmony embedding; the neighbourhood size grows with sqrt(number of cells).
sc.pp.neighbors(
    kim_2021_ctrl_human,
    use_rep='X_pca_harmony',
    n_neighbors=int(0.5 * len(kim_2021_ctrl_human) ** 0.5 // 2),
    metric='cosine',
)

# triku is run after the neighbour graph has been computed.
tk.tl.triku(kim_2021_ctrl_human)

In [None]:
sc.tl.umap(kim_2021_ctrl_human, min_dist=0.1, random_state=seed)
sc.tl.leiden(kim_2021_ctrl_human, resolution=3, random_state=seed)

In [None]:
# fraction=1 keeps every cell, so this call only acts in place (copy=False) with a fixed
# random_state; presumably it is used to shuffle the cell order so that no sample is
# systematically drawn on top of the others in the UMAP — TODO confirm.
sc.pp.subsample(kim_2021_ctrl_human, fraction=1, random_state=0, copy=False)
# UMAP coloured by Leiden cluster and by sample of origin.
sc.pl.umap(kim_2021_ctrl_human, color=['leiden', 'Internal sample identifier'], legend_loc='on data')

In [None]:
sc.pl.umap(kim_2021_ctrl_human, color=['leiden', 'LUM', 'PDGFRA', 'COL1A1', 'DCN'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
assign_cats(kim_2021_ctrl_human, dict_cats=dict_cats_fb)

In [None]:
sc.pl.umap(kim_2021_ctrl_human, color=['leiden', 'assigned_cats'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
sc.pl.umap(kim_2021_ctrl_human, color=['leiden', 'COL18A1', 'APCDD1', 'SLPI'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# Keep only the cells whose 'assigned_cats' label is 'fibro'.
# `.copy()` turns the AnnData view into an independent object, because the following
# cells modify the subset in place (gene filtering, PCA, neighbours, clustering).
kim_2021_ctrl_human_fb = kim_2021_ctrl_human[kim_2021_ctrl_human.obs['assigned_cats'] == 'fibro'].copy()

In [None]:
sc.pp.filter_genes(kim_2021_ctrl_human_fb, min_counts=1)

In [None]:
sc.pp.pca(kim_2021_ctrl_human_fb, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(kim_2021_ctrl_human_fb, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(kim_2021_ctrl_human_fb, use_rep='X_pca_harmony',  n_neighbors=int(0.5 * len(kim_2021_ctrl_human_fb) ** 0.5 // 2), metric='cosine')
tk.tl.triku(kim_2021_ctrl_human_fb)

In [None]:
sc.tl.umap(kim_2021_ctrl_human_fb, min_dist=0.4, random_state=seed)
sc.tl.leiden(kim_2021_ctrl_human_fb, resolution=9, random_state=seed)

In [None]:
# First pass: score the categories in `dict_cats_clusters` and store the assigned label
# in obs['cluster']; cells that match no category are named 'U' (others_name).
assign_cats(kim_2021_ctrl_human_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.95, key_added='cluster', others_name='U')
# Second pass: same procedure with `dict_cats_axes`, grouping the cells by the 'cluster' labels
# obtained above and storing the result in obs['axis']. Intermediate states are enabled here
# (intermediate_states=True, diff=0.15).
assign_cats(kim_2021_ctrl_human_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.35, key_added='axis', 
             intermediate_states=True, diff=0.15, others_name='U')

In [None]:
# Palette for the 'cluster' annotation, listed in sorted label order.
# Labels without an entry in `dict_colors_human` are drawn in grey.
kim_2021_ctrl_human_fb.uns['cluster_colors'] = [
    dict_colors_human.get(label, '#bcbcbc')
    for label in sorted(set(kim_2021_ctrl_human_fb.obs['cluster']))
]

In [None]:
sc.pl.umap(kim_2021_ctrl_human_fb, color=['Internal sample identifier', 'leiden', 'axis', 'cluster'], legend_loc='on data', cmap=magma, use_raw=False, ncols=2)

In [None]:
# For every category in `dict_cats_clusters`, plot the assigned clusters next to the
# genes of that category that are present in this dataset.
for cluster_name, marker_genes in dict_cats_clusters.items():
    print(cluster_name)
    present_markers = [gene for gene in marker_genes if gene in kim_2021_ctrl_human_fb.var_names]
    sc.pl.umap(
        kim_2021_ctrl_human_fb,
        color=['cluster'] + present_markers,
        legend_loc='on data',
        cmap=magma,
        use_raw=False,
        ncols=4,
    )

In [None]:
plot_score_graph(kim_2021_ctrl_human_fb)

In [None]:
# `clear_adata` is a project helper imported from `fb_functions`; presumably it removes
# fields that should not be written to disk — TODO confirm against its definition.
clear_adata(kim_2021_ctrl_human_fb)
# Save the fibroblast subset and the full dataset in h5ad format (the files use a '.h5' extension).
kim_2021_ctrl_human_fb.write_h5ad(kim_dir_2021 + '/kim_2021_ctrl_human_fb_processed.h5')
kim_2021_ctrl_human.write_h5ad(kim_dir_2021 + '/kim_2021_ctrl_human_processed.h5')

In [None]:
kim_2021_ctrl_human_fb = sc.read(kim_dir_2021 + '/kim_2021_ctrl_human_fb_processed.h5')

### Liu et al. 2021

In [None]:
liu_dir = data_dir + '/liu_2021'
os.makedirs(liu_dir, exist_ok=True)

In [None]:
liu_2021_ctrl_human = sc.read(liu_dir + '/adata_liu_2021_ctrl_human.h5')

In [None]:
# Replace gene names that have an entry in `dict_rep` by their mapped name; all other names are kept as they are.
liu_2021_ctrl_human.var_names = [dict_rep.get(gene, gene) for gene in liu_2021_ctrl_human.var_names]

In [None]:
# Basic QC filtering
liu_2021_ctrl_human.var['mt'] = liu_2021_ctrl_human.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(liu_2021_ctrl_human, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(liu_2021_ctrl_human, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(liu_2021_ctrl_human, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(liu_2021_ctrl_human, x='total_counts', y='n_genes_by_counts')

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': liu_2021_ctrl_human.obs['Internal sample identifier'], 'y': liu_2021_ctrl_human.obs['log1p_n_genes_by_counts']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
# Per-sample QC window on log1p(n_genes_by_counts), as (lower, upper) with both bounds exclusive.
# Presumably the values were read off the per-sample violin plot — TODO confirm.
# Cells from samples that are not listed here are discarded.
liu_log1p_gene_bounds = {
    'K007CTRL': (6.5, 7.8),
    'K009CTRL': (6.5, 7.8),
    'K012CTRL': (6.5, 7.8),
    'K013CTRL': (6, 7.2),
}

liu_obs = liu_2021_ctrl_human.obs
liu_keep = np.zeros(len(liu_obs), dtype=bool)
for sample_id, (lower, upper) in liu_log1p_gene_bounds.items():
    liu_keep |= ((liu_obs['Internal sample identifier'] == sample_id)
                 & (liu_obs['log1p_n_genes_by_counts'] > lower)
                 & (liu_obs['log1p_n_genes_by_counts'] < upper)).values

# Additionally require less than 15% mitochondrial counts.
liu_keep &= (liu_obs['pct_counts_mt'] < 15).values

# A single boolean mask plus `.copy()` yields an independent AnnData instead of a view of a
# view, since the following cells modify the object in place.
liu_2021_ctrl_human = liu_2021_ctrl_human[liu_keep, :].copy()

In [None]:
sc.pp.filter_genes(liu_2021_ctrl_human, min_counts=1)
sc.pp.normalize_total(liu_2021_ctrl_human)
sc.pp.log1p(liu_2021_ctrl_human)

In [None]:
# 30-component PCA, then Harmony integration across samples (stored as 'X_pca_harmony').
sc.pp.pca(liu_2021_ctrl_human, n_comps=30, random_state=seed)
sce.pp.harmony_integrate(
    liu_2021_ctrl_human,
    key='Internal sample identifier',
    max_iter_harmony=50,
)

# kNN graph on the Harmony embedding; the neighbourhood size grows with sqrt(number of cells).
sc.pp.neighbors(
    liu_2021_ctrl_human,
    use_rep='X_pca_harmony',
    n_neighbors=int(0.5 * len(liu_2021_ctrl_human) ** 0.5 // 2),
    metric='cosine',
)

# triku is run after the neighbour graph has been computed.
tk.tl.triku(liu_2021_ctrl_human)

In [None]:
sc.tl.umap(liu_2021_ctrl_human, min_dist=0.2, random_state=seed)
sc.tl.leiden(liu_2021_ctrl_human, resolution=14, random_state=seed)

In [None]:
sc.pp.subsample(liu_2021_ctrl_human, fraction=1, random_state=0, copy=False)
sc.pl.umap(liu_2021_ctrl_human, color=['leiden', 'Internal sample identifier'], legend_loc='on data')

In [None]:
sc.pl.umap(liu_2021_ctrl_human, color=['leiden', 'LUM', 'PDGFRA', 'COL1A1', 'DCN', 'RGS5', 'MYH11', 'MLANA', 'PMEL'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
assign_cats(liu_2021_ctrl_human, dict_cats=dict_cats_fb, min_score=0.7)

In [None]:
sc.pl.umap(liu_2021_ctrl_human, color=['leiden', 'assigned_cats'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
for key, val in dict_cats_fb.items():
    print(key)
    sc.pl.umap(liu_2021_ctrl_human, color=['assigned_cats'] + [i for i in val if i in liu_2021_ctrl_human.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
sc.pl.umap(liu_2021_ctrl_human, color=['leiden', 'COL18A1', 'APCDD1', 'SLPI', 'CCL19'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# Keep only the cells whose 'assigned_cats' label is 'fibro'.
# `.copy()` turns the AnnData view into an independent object, because the following
# cells modify the subset in place (gene filtering, PCA, neighbours, clustering).
liu_2021_ctrl_human_fb = liu_2021_ctrl_human[liu_2021_ctrl_human.obs['assigned_cats'] == 'fibro'].copy()

In [None]:
sc.pp.filter_genes(liu_2021_ctrl_human_fb, min_counts=1)

In [None]:
sc.pp.pca(liu_2021_ctrl_human_fb, random_state=seed, n_comps=50)
sce.pp.harmony_integrate(liu_2021_ctrl_human_fb, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(liu_2021_ctrl_human_fb, use_rep='X_pca_harmony',  n_neighbors=int(0.5 * len(liu_2021_ctrl_human_fb) ** 0.5), metric='cosine')
tk.tl.triku(liu_2021_ctrl_human_fb)

In [None]:
sc.tl.umap(liu_2021_ctrl_human_fb, min_dist=0.4, random_state=seed)
sc.tl.leiden(liu_2021_ctrl_human_fb, resolution=5, random_state=seed)

In [None]:
assign_cats(liu_2021_ctrl_human_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.85, key_added='cluster', others_name='U')
assign_cats(liu_2021_ctrl_human_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.5, quantile_gene_sel=0.9,  key_added='axis', 
             intermediate_states=True, diff=0.15, others_name='U')

In [None]:
liu_2021_ctrl_human_fb.uns['cluster_colors'] = [dict_colors_human[i] if i in dict_colors_human else '#bcbcbc' for 
                                      i in sorted(set(liu_2021_ctrl_human_fb.obs['cluster']))]

In [None]:
sc.pl.umap(liu_2021_ctrl_human_fb, color=['Internal sample identifier', 'leiden', 'axis', 'cluster'], legend_loc='on data', cmap=magma, use_raw=False, ncols=2)

In [None]:
# Remove the 'C3' column from obs. Presumably it would otherwise clash with a gene of the
# same name when plotting by gene below — TODO confirm.
# The membership check keeps the cell re-runnable (a bare `del` raises KeyError the second time).
if 'C3' in liu_2021_ctrl_human_fb.obs:
    del liu_2021_ctrl_human_fb.obs['C3']

In [None]:
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(liu_2021_ctrl_human_fb, color=['cluster'] + [i for i in val if i in liu_2021_ctrl_human_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
plot_score_graph(liu_2021_ctrl_human_fb)

In [None]:
clear_adata(liu_2021_ctrl_human_fb)
liu_2021_ctrl_human_fb.write_h5ad(liu_dir + '/liu_2021_ctrl_human_fb_processed.h5')
liu_2021_ctrl_human.write_h5ad(liu_dir + '/liu_2021_ctrl_human_processed.h5')

In [None]:
liu_2021_ctrl_human_fb = sc.read(liu_dir + '/liu_2021_ctrl_human_fb_processed.h5')

### Mariottoni et al. 2021

In [None]:
mariottoni_2021_dir = data_dir + '/mariottoni_2021'

In [None]:
mariottoni_2021_ctrl_human = sc.read(mariottoni_2021_dir + '/mariottoni_2021_ctrl_human.h5')

In [None]:
# Replace gene names that have an entry in `dict_rep` by their mapped name; all other names are kept as they are.
mariottoni_2021_ctrl_human.var_names = [dict_rep.get(gene, gene) for gene in mariottoni_2021_ctrl_human.var_names]

In [None]:
# Basic QC filtering
mariottoni_2021_ctrl_human.var['mt'] = mariottoni_2021_ctrl_human.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(mariottoni_2021_ctrl_human, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(mariottoni_2021_ctrl_human, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(mariottoni_2021_ctrl_human, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(mariottoni_2021_ctrl_human, x='total_counts', y='n_genes_by_counts')

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': mariottoni_2021_ctrl_human.obs['Internal sample identifier'], 'y': mariottoni_2021_ctrl_human.obs['log1p_n_genes_by_counts']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
# Only cells of sample 'HC' are kept, and only if log1p(n_genes_by_counts) lies strictly
# between 6.3 and 8.3.
mariottoni_is_hc = mariottoni_2021_ctrl_human.obs['Internal sample identifier'] == 'HC'
mariottoni_log1p_genes = mariottoni_2021_ctrl_human.obs['log1p_n_genes_by_counts']
mariottoni_gene_mask = mariottoni_is_hc & (mariottoni_log1p_genes < 8.3) & (mariottoni_log1p_genes > 6.3)
mariottoni_2021_ctrl_human = mariottoni_2021_ctrl_human[mariottoni_gene_mask.values, :]

# Then discard cells with 7% or more mitochondrial counts.
mariottoni_mt_mask = mariottoni_2021_ctrl_human.obs['pct_counts_mt'] < 7
mariottoni_2021_ctrl_human = mariottoni_2021_ctrl_human[mariottoni_mt_mask, :]

In [None]:
sc.pp.filter_genes(mariottoni_2021_ctrl_human, min_counts=1)
sc.pp.normalize_total(mariottoni_2021_ctrl_human)
sc.pp.log1p(mariottoni_2021_ctrl_human)

In [None]:
sc.pp.pca(mariottoni_2021_ctrl_human, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(mariottoni_2021_ctrl_human, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(mariottoni_2021_ctrl_human, use_rep='X_pca_harmony',  n_neighbors=int(0.5 * len(mariottoni_2021_ctrl_human) ** 0.5 // 2), metric='cosine')
tk.tl.triku(mariottoni_2021_ctrl_human)

In [None]:
sc.tl.umap(mariottoni_2021_ctrl_human, min_dist=0.4, random_state=seed)
sc.tl.leiden(mariottoni_2021_ctrl_human, resolution=5, random_state=seed)

In [None]:
sc.pp.subsample(mariottoni_2021_ctrl_human, fraction=1, random_state=0, copy=False)
sc.pl.umap(mariottoni_2021_ctrl_human, color=['leiden', 'Internal sample identifier'], legend_loc='on data')

In [None]:
sc.pl.umap(mariottoni_2021_ctrl_human, color=['leiden', 'LUM', 'PDGFRA', 'COL1A1', 'DCN', 'RGS5', 'MYH11', 'MLANA', 'PMEL', 'HBB', 'S100B', 'CLDN5', 'PECAM1'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
assign_cats(mariottoni_2021_ctrl_human, dict_cats=dict_cats_fb, min_score=0.45, quantile_gene_sel=0.65)

In [None]:
for key, val in dict_cats_fb.items():
    print(key)
    sc.pl.umap(mariottoni_2021_ctrl_human, color=['assigned_cats'] + [i for i in val if i in mariottoni_2021_ctrl_human.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
sc.pl.umap(mariottoni_2021_ctrl_human, color=['leiden', 'assigned_cats'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
sc.pl.umap(mariottoni_2021_ctrl_human, color=['leiden', 'COL18A1', 'APCDD1', 'SLPI', 'CCL19', 'GGT5',  'CHRDL1', 'GPX3', 'BGN', 'ASPN', 'TNN', 'COL11A1', 'COCH', 
                                     'IGFBP3', 'ANGPTL7', 'SCN7A', 'C2orf40', 'NGFR', 'CLDN1', 'SBSPON', 'FGFBP2', 'DIO3', 'LUZP2'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
mariottoni_2021_ctrl_human_fb = mariottoni_2021_ctrl_human[mariottoni_2021_ctrl_human.obs['assigned_cats'].isin(['fibro'])].copy()

In [None]:
sc.pp.filter_genes(mariottoni_2021_ctrl_human_fb, min_counts=1)

In [None]:
sc.pp.pca(mariottoni_2021_ctrl_human_fb, random_state=seed, n_comps=50)
sce.pp.harmony_integrate(mariottoni_2021_ctrl_human_fb, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(mariottoni_2021_ctrl_human_fb, use_rep='X_pca_harmony',  n_neighbors=int(0.5 * len(mariottoni_2021_ctrl_human_fb) ** 0.5), metric='cosine')
tk.tl.triku(mariottoni_2021_ctrl_human_fb)

In [None]:
sc.tl.umap(mariottoni_2021_ctrl_human_fb, min_dist=0.5, random_state=seed)
sc.tl.leiden(mariottoni_2021_ctrl_human_fb, resolution=6, random_state=seed)  # leiden 9

In [None]:
assign_cats(mariottoni_2021_ctrl_human_fb, dict_cats=dict_cats_clusters, min_score=0.6, quantile_gene_sel=0.9, key_added='cluster', others_name='U')
assign_cats(mariottoni_2021_ctrl_human_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.5, quantile_gene_sel=0.9,  key_added='axis', 
             intermediate_states=True, diff=0.15, others_name='U')

In [None]:
mariottoni_2021_ctrl_human_fb.uns['cluster_colors'] = [dict_colors_human[i] if i in dict_colors_human else '#bcbcbc' for 
                                      i in sorted(set(mariottoni_2021_ctrl_human_fb.obs['cluster']))]

In [None]:
sc.pl.umap(mariottoni_2021_ctrl_human_fb, color=['Internal sample identifier', 'leiden', 'axis', 'cluster'], legend_loc='on data', cmap=magma, use_raw=False, ncols=2)

In [None]:
# Remove the 'C3' column from obs. Presumably it would otherwise clash with a gene of the
# same name when plotting by gene below — TODO confirm.
# The membership check keeps the cell re-runnable (a bare `del` raises KeyError the second time).
if 'C3' in mariottoni_2021_ctrl_human_fb.obs:
    del mariottoni_2021_ctrl_human_fb.obs['C3']

In [None]:
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(mariottoni_2021_ctrl_human_fb, color=['cluster'] + [i for i in val if i in mariottoni_2021_ctrl_human_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
plot_score_graph(mariottoni_2021_ctrl_human_fb)

In [None]:
clear_adata(mariottoni_2021_ctrl_human_fb)
mariottoni_2021_ctrl_human_fb.write_h5ad(mariottoni_2021_dir + '/mariottoni_2021_ctrl_human_fb_processed.h5')
mariottoni_2021_ctrl_human.write_h5ad(mariottoni_2021_dir + '/mariottoni_2021_ctrl_human_processed.h5')

In [None]:
mariottoni_2021_ctrl_human_fb = sc.read(mariottoni_2021_dir + '/mariottoni_2021_ctrl_human_fb_processed.h5')

### Mirizio et al. 2020

In [None]:
mirizio_dir = data_dir + '/mirizio_2020'

In [None]:
mirizio_2020_scl_human = sc.read(mirizio_dir + '/adata_mirizio_2020_scleroderma_human.h5')
mirizio_2020_scl_human.var_names_make_unique()

In [None]:
# Basic QC filtering
mirizio_2020_scl_human.var['mt'] = mirizio_2020_scl_human.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(mirizio_2020_scl_human, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(mirizio_2020_scl_human, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(mirizio_2020_scl_human, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(mirizio_2020_scl_human, x='total_counts', y='n_genes_by_counts')

In [None]:
# Keep cells with between 250 and 4000 detected genes (both bounds exclusive).
mirizio_n_genes = mirizio_2020_scl_human.obs['n_genes_by_counts']
mirizio_gene_mask = (mirizio_n_genes < 4000) & (mirizio_n_genes > 250)
mirizio_2020_scl_human = mirizio_2020_scl_human[mirizio_gene_mask.values, :]

# Then discard cells with 40% or more mitochondrial counts.
mirizio_mt_mask = mirizio_2020_scl_human.obs['pct_counts_mt'] < 40
mirizio_2020_scl_human = mirizio_2020_scl_human[mirizio_mt_mask, :]

In [None]:
sc.pp.filter_genes(mirizio_2020_scl_human, min_counts=1)
sc.pp.normalize_total(mirizio_2020_scl_human)
sc.pp.log1p(mirizio_2020_scl_human)

In [None]:
sc.pp.pca(mirizio_2020_scl_human, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(mirizio_2020_scl_human, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(mirizio_2020_scl_human, use_rep='X_pca_harmony',  n_neighbors=int(0.5 * len(mirizio_2020_scl_human) ** 0.5 // 2), metric='cosine')
tk.tl.triku(mirizio_2020_scl_human)

In [None]:
sc.tl.umap(mirizio_2020_scl_human, min_dist=0.1, random_state=seed)
sc.tl.leiden(mirizio_2020_scl_human, resolution=7, random_state=seed)

In [None]:
sc.pl.umap(mirizio_2020_scl_human, color=['leiden', 'Internal sample identifier'], legend_loc='on data')

In [None]:
sc.pl.umap(mirizio_2020_scl_human, color=['leiden', 'LUM', 'PDGFRA', 'COL1A1', 'DCN'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
assign_cats(mirizio_2020_scl_human, dict_cats=dict_cats_fb, min_score=0.4)

In [None]:
sc.pl.umap(mirizio_2020_scl_human, color=['leiden', 'assigned_cats'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
for key, val in dict_cats_fb.items():
    print(key)
    sc.pl.umap(mirizio_2020_scl_human, color=['assigned_cats'] + [i for i in val if i in mirizio_2020_scl_human.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
# Keep only the cells whose 'assigned_cats' label is 'fibro'.
# `.copy()` turns the AnnData view into an independent object, because the following
# cells modify the subset in place (gene filtering, PCA, neighbours, clustering).
mirizio_2020_scl_human_fb = mirizio_2020_scl_human[mirizio_2020_scl_human.obs['assigned_cats'] == 'fibro'].copy()

In [None]:
sc.pp.filter_genes(mirizio_2020_scl_human_fb, min_counts=1)

In [None]:
sc.pp.pca(mirizio_2020_scl_human_fb, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(mirizio_2020_scl_human_fb, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(mirizio_2020_scl_human_fb, use_rep='X_pca_harmony',  n_neighbors=int(0.5 * len(mirizio_2020_scl_human_fb) ** 0.5 // 2), metric='cosine')
tk.tl.triku(mirizio_2020_scl_human_fb)

In [None]:
sc.tl.umap(mirizio_2020_scl_human_fb, min_dist=0.5, random_state=seed)
sc.tl.leiden(mirizio_2020_scl_human_fb, resolution=8, random_state=seed)

In [None]:
assign_cats(mirizio_2020_scl_human_fb, dict_cats=dict_cats_clusters, min_score=0.45, quantile_gene_sel=0.75, key_added='cluster', others_name='U')
assign_cats(mirizio_2020_scl_human_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, quantile_gene_sel=0.75,
            key_added='axis', intermediate_states=True, diff=0.15, others_name='U')

In [None]:
mirizio_2020_scl_human_fb.uns['cluster_colors'] = [dict_colors_human[i] if i in dict_colors_human else '#bcbcbc' for 
                                      i in sorted(set(mirizio_2020_scl_human_fb.obs['cluster']))]

In [None]:
sc.pl.umap(mirizio_2020_scl_human_fb, color=['Internal sample identifier', 'leiden', 'axis', 'cluster'], legend_loc='on data', cmap=magma, use_raw=False, ncols=2)

In [None]:
# UNASSIGNED cells may refer to stress
# UNASSIGNED cells may refer to stress
# Rank genes for Leiden cluster '28' only (against the default reference).
# NOTE(review): the cluster id is hard-coded; presumably it is the cluster left as 'U' in the
# plot above. Leiden labels depend on the clustering run, so verify the id after re-running.
sc.tl.rank_genes_groups(mirizio_2020_scl_human_fb, groupby='leiden', groups=['28'])
# Tracksplot of the top 150 ranked genes.
sc.pl.rank_genes_groups_tracksplot(mirizio_2020_scl_human_fb, dendrogram=False, n_genes=150)

In [None]:
# Remove the 'C3' column from obs. Presumably it would otherwise clash with a gene of the
# same name when plotting by gene below — TODO confirm.
# The membership check keeps the cell re-runnable (a bare `del` raises KeyError the second time).
if 'C3' in mirizio_2020_scl_human_fb.obs:
    del mirizio_2020_scl_human_fb.obs['C3']

In [None]:
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(mirizio_2020_scl_human_fb, color=['cluster'] + [i for i in val if i in mirizio_2020_scl_human_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
plot_score_graph(mirizio_2020_scl_human_fb)

In [None]:
clear_adata(mirizio_2020_scl_human_fb)
mirizio_2020_scl_human_fb.write_h5ad(mirizio_dir + '/mirizio_2020_scl_human_fb_processed.h5')
mirizio_2020_scl_human.write_h5ad(mirizio_dir + '/mirizio_2020_scl_human_processed.h5')

In [None]:
mirizio_2020_scl_human_fb = sc.read(mirizio_dir + '/mirizio_2020_scl_human_fb_processed.h5')

### Reynolds et al. 2021 [Discarded because of bad quality cells]

In [None]:
reynolds_dir = data_dir + '/reynolds_2021'

In [None]:
reynolds_2021_ctrl_human_fb = sc.read(reynolds_dir + '/reynolds_2021_ctrl_human_fb.h5')

In [None]:
sc.pl.umap(reynolds_2021_ctrl_human_fb, color=['hypoxia_stress', 'Internal sample identifier', 'full_clustering'], legend_loc='on data')

In [None]:
# Keep only the cells annotated as 'Normal' in obs['hypoxia_stress'].
# `.copy()` turns the AnnData view into an independent object, because the following
# cells modify the subset in place (gene filtering, PCA, neighbours, clustering).
reynolds_2021_ctrl_human_fb = reynolds_2021_ctrl_human_fb[reynolds_2021_ctrl_human_fb.obs['hypoxia_stress'] == 'Normal'].copy()

In [None]:
sc.pp.filter_genes(reynolds_2021_ctrl_human_fb, min_counts=1)

In [None]:
sc.pp.pca(reynolds_2021_ctrl_human_fb, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(reynolds_2021_ctrl_human_fb, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(reynolds_2021_ctrl_human_fb, use_rep='X_pca_harmony',  n_neighbors=int(0.5 * len(reynolds_2021_ctrl_human_fb) ** 0.5 ), metric='cosine')
tk.tl.triku(reynolds_2021_ctrl_human_fb, use_raw=False)

In [None]:
sc.tl.umap(reynolds_2021_ctrl_human_fb, min_dist=0.3, random_state=seed)
sc.tl.leiden(reynolds_2021_ctrl_human_fb, resolution=7, random_state=seed)

In [None]:
assign_cats(reynolds_2021_ctrl_human_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.9, key_added='cluster', others_name='U')
assign_cats(reynolds_2021_ctrl_human_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, quantile_gene_sel=0.75,
            key_added='axis', intermediate_states=True, diff=0.15, others_name='U')

In [None]:
reynolds_2021_ctrl_human_fb.uns['cluster_colors'] = [dict_colors_human[i] if i in dict_colors_human else '#bcbcbc' for 
                                      i in sorted(set(reynolds_2021_ctrl_human_fb.obs['cluster']))]

In [None]:
sc.pl.umap(reynolds_2021_ctrl_human_fb, color=['Internal sample identifier', 'leiden', 'axis', 'cluster'], legend_loc='on data', cmap=magma, use_raw=False, ncols=2)

In [None]:
sc.pl.umap(reynolds_2021_ctrl_human_fb, color=['COL18A1', 'CLEC2A', 'COL13A1', 'COL6A5', 'NPTX2', 'HSPB3', 'COMP', 'APCDD1', 'NKD2', 'AKAP6'], 
           legend_loc='on data', cmap=magma, use_raw=False, palette='Dark2')

In [None]:
# Remove the 'C3' column from obs. Presumably it would otherwise clash with a gene of the
# same name when plotting by gene below — TODO confirm.
# The membership check keeps the cell re-runnable (a bare `del` raises KeyError the second time).
if 'C3' in reynolds_2021_ctrl_human_fb.obs:
    del reynolds_2021_ctrl_human_fb.obs['C3']

In [None]:
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(reynolds_2021_ctrl_human_fb, color=['cluster'] + [i for i in val if i in reynolds_2021_ctrl_human_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
plot_score_graph(reynolds_2021_ctrl_human_fb)

In [None]:
clear_adata(reynolds_2021_ctrl_human_fb)
reynolds_2021_ctrl_human_fb.write_h5ad(reynolds_dir + '/reynolds_2021_ctrl_human_fb_processed.h5')

In [None]:
reynolds_2021_ctrl_human_fb = sc.read(reynolds_dir + '/reynolds_2021_ctrl_human_fb_processed.h5')

### Rindler et al. 2021

In [None]:
rindler_2021_dir = data_dir + '/rindler_2021'

In [None]:
rindler_2021_ctrl_human = sc.read(rindler_2021_dir + '/rindler_2021_ctrl_human.h5')

In [None]:
# Replace gene names that have an entry in `dict_rep` by their mapped name; all other names are kept as they are.
rindler_2021_ctrl_human.var_names = [dict_rep.get(gene, gene) for gene in rindler_2021_ctrl_human.var_names]

In [None]:
# Basic QC filtering
rindler_2021_ctrl_human.var['mt'] = rindler_2021_ctrl_human.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(rindler_2021_ctrl_human, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(rindler_2021_ctrl_human, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(rindler_2021_ctrl_human, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(rindler_2021_ctrl_human, x='total_counts', y='n_genes_by_counts')

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': rindler_2021_ctrl_human.obs['Internal sample identifier'], 'y': rindler_2021_ctrl_human.obs['log1p_n_genes_by_counts']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
# Per-sample QC window on log1p(n_genes_by_counts), as (lower, upper) with both bounds exclusive.
# Presumably the values were read off the per-sample violin plot — TODO confirm.
# Cells from samples that are not listed here are discarded.
rindler_log1p_gene_bounds = {
    'P112 HC': (6.8, 8),
    'P115 HC': (6.8, 7.8),
    'P116 HC': (7.3, 8.25),
    'P121 HC': (6.5, 7),
}

rindler_obs = rindler_2021_ctrl_human.obs
rindler_keep = np.zeros(len(rindler_obs), dtype=bool)
for sample_id, (lower, upper) in rindler_log1p_gene_bounds.items():
    rindler_keep |= ((rindler_obs['Internal sample identifier'] == sample_id)
                     & (rindler_obs['log1p_n_genes_by_counts'] > lower)
                     & (rindler_obs['log1p_n_genes_by_counts'] < upper)).values

# Additionally require less than 7% mitochondrial counts.
rindler_keep &= (rindler_obs['pct_counts_mt'] < 7).values

# A single boolean mask plus `.copy()` yields an independent AnnData instead of a view of a
# view, since the following cells modify the object in place.
rindler_2021_ctrl_human = rindler_2021_ctrl_human[rindler_keep, :].copy()

In [None]:
sc.pp.filter_genes(rindler_2021_ctrl_human, min_counts=1)
sc.pp.normalize_total(rindler_2021_ctrl_human)
sc.pp.log1p(rindler_2021_ctrl_human)

In [None]:
sc.pp.pca(rindler_2021_ctrl_human, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(rindler_2021_ctrl_human, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(rindler_2021_ctrl_human, use_rep='X_pca_harmony',  n_neighbors=int(0.5 * len(rindler_2021_ctrl_human) ** 0.5 // 2), metric='cosine')
tk.tl.triku(rindler_2021_ctrl_human)

In [None]:
sc.tl.umap(rindler_2021_ctrl_human, min_dist=0.4, random_state=seed)
sc.tl.leiden(rindler_2021_ctrl_human, resolution=0.9, random_state=seed)

In [None]:
sc.pp.subsample(rindler_2021_ctrl_human, fraction=1, random_state=0, copy=False)
sc.pl.umap(rindler_2021_ctrl_human, color=['leiden', 'Internal sample identifier'], legend_loc='on data')

In [None]:
sc.pl.umap(rindler_2021_ctrl_human, color=['leiden', 'LUM', 'PDGFRA', 'COL1A1', 'DCN', 'RGS5', 'MYH11', 'MLANA', 'PMEL', 'HBB'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
assign_cats(rindler_2021_ctrl_human, dict_cats=dict_cats_fb, min_score=0.45, quantile_gene_sel=0.85)

In [None]:
sc.pl.umap(rindler_2021_ctrl_human, color=['leiden', 'assigned_cats'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
for key, val in dict_cats_fb.items():
    print(key)
    sc.pl.umap(rindler_2021_ctrl_human, color=['assigned_cats'] + [i for i in val if i in rindler_2021_ctrl_human.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
sc.pl.umap(rindler_2021_ctrl_human, color=['leiden', 'COL18A1', 'APCDD1', 'SLPI', 'CCL19', 'GGT5',  'CHRDL1', 'GPX3', 'BGN', 'ASPN', 'TNN', 'COL11A1', 'COCH', 
                                     'IGFBP3', 'ANGPTL7', 'SCN7A', 'C2orf40', 'NGFR', 'CLDN1', 'SBSPON', 'FGFBP2', 'DIO3', 'LUZP2'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# Keep the cells labelled 'fibro' or 'neuro' in obs['assigned_cats'].
# `.copy()` turns the AnnData view into an independent object, because the following
# cells modify the subset in place (gene filtering, PCA, neighbours, clustering).
rindler_2021_ctrl_human_fb = rindler_2021_ctrl_human[rindler_2021_ctrl_human.obs['assigned_cats'].isin(['fibro', 'neuro'])].copy()

In [None]:
sc.pp.filter_genes(rindler_2021_ctrl_human_fb, min_counts=1)

In [None]:
sc.pp.pca(rindler_2021_ctrl_human_fb, random_state=seed, n_comps=50)
sce.pp.harmony_integrate(rindler_2021_ctrl_human_fb, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(rindler_2021_ctrl_human_fb, use_rep='X_pca_harmony',  n_neighbors=int(0.1 * len(rindler_2021_ctrl_human_fb) ** 0.5), metric='cosine')
tk.tl.triku(rindler_2021_ctrl_human_fb)

In [None]:
sc.tl.umap(rindler_2021_ctrl_human_fb, min_dist=0.5, random_state=seed)
sc.tl.leiden(rindler_2021_ctrl_human_fb, resolution=15, random_state=seed) # 18

In [None]:
assign_cats(rindler_2021_ctrl_human_fb, dict_cats=dict_cats_clusters, min_score=0.7, quantile_gene_sel=0.85, key_added='cluster', others_name='U')
assign_cats(rindler_2021_ctrl_human_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.5, quantile_gene_sel=0.9,  key_added='axis', 
             intermediate_states=True, diff=0.15, others_name='U')

In [None]:
rindler_2021_ctrl_human_fb.uns['cluster_colors'] = [dict_colors_human[i] if i in dict_colors_human else '#bcbcbc' for 
                                      i in sorted(set(rindler_2021_ctrl_human_fb.obs['cluster']))]

In [None]:
sc.pl.umap(rindler_2021_ctrl_human_fb, color=['Internal sample identifier', 'leiden', 'axis', 'cluster'], legend_loc='on data', cmap=magma, use_raw=False, ncols=2)

**TODO: LOOK INTO CLUSTER C5 AND THE DEGS OF B1/B2**

In [None]:
# Remove the 'C3' column from obs. Presumably it would otherwise clash with a gene of the
# same name when plotting by gene below — TODO confirm.
# The membership check keeps the cell re-runnable (a bare `del` raises KeyError the second time).
if 'C3' in rindler_2021_ctrl_human_fb.obs:
    del rindler_2021_ctrl_human_fb.obs['C3']

In [None]:
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(rindler_2021_ctrl_human_fb, color=['cluster'] + [i for i in val if i in rindler_2021_ctrl_human_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
plot_score_graph(rindler_2021_ctrl_human_fb)

In [None]:
clear_adata(rindler_2021_ctrl_human_fb)
rindler_2021_ctrl_human_fb.write_h5ad(rindler_2021_dir + '/rindler_2021_ctrl_human_fb_processed.h5')
rindler_2021_ctrl_human.write_h5ad(rindler_2021_dir + '/rindler_2021_ctrl_human_processed.h5')

In [None]:
rindler_2021_ctrl_human_fb = sc.read(rindler_2021_dir + '/rindler_2021_ctrl_human_fb_processed.h5')

There are two small clusters that show the same patterns as A1/A4, but also express genes such as *ZFP36*, *FOS*, *HSPA1A*, *HSPA1B*, *ATF3*, *DNAJB1*, *IER3*, *IER2*, *GADD45B*, *JUNB*, *DUSP1*, *FOSB*, *NR4A2*, *EGR1*, *NFKBIA*, etc. These genes are commonly expressed in some populations (a subcluster of endothelial cells or, in some cases, cluster B1), but here the pattern is "aberrant" because this population does not appear with a secondary profile. The same pattern appeared in the Reynolds dataset and has been described in the literature before (https://doi.org/10.12688/f1000research.54864.2).

In [None]:
# Re-cluster at a lower resolution (2), overwriting obs['leiden'].
sc.tl.leiden(rindler_2021_ctrl_human_fb, resolution=2, random_state=seed)
# Drop any stored Leiden palette, presumably so that it is regenerated for the new set of
# clusters when plotting — TODO confirm. `pop` with a default does not fail when the key
# is absent (e.g. on a re-run, or if the palette was never stored).
rindler_2021_ctrl_human_fb.uns.pop('leiden_colors', None)

In [None]:
sc.pl.umap(rindler_2021_ctrl_human_fb, color=['Internal sample identifier', 'leiden', 'axis', 'cluster'], legend_loc='on data', cmap=magma, use_raw=False, ncols=2)

There are other clusters that are marked as **U** but, when looking at their DEGs, they do not show a clear transcriptomic profile. Considering that there are more "spurious" clusters, we are going to ignore these cells for now.

In [None]:
sc.tl.rank_genes_groups(rindler_2021_ctrl_human_fb, groups=['8', '24'], reference='rest', groupby='leiden')
sc.pl.rank_genes_groups_tracksplot(rindler_2021_ctrl_human_fb, dendrogram=False, n_genes=35)

In [None]:
sc.tl.rank_genes_groups(rindler_2021_ctrl_human_fb, groups=['21', '25', '30'], reference='rest', groupby='leiden')
sc.pl.rank_genes_groups_tracksplot(rindler_2021_ctrl_human_fb, dendrogram=False, n_genes=75)

### Solé-Boldo et al. 2020

In [None]:
sole_dir = data_dir + '/Sole-Boldo_2020'

In [None]:
sole_2020_young_human = sc.read(sole_dir + '/adata_sole_2020_young_human.h5')
sole_2020_young_human.var_names_make_unique()

In [None]:
# Replace gene names that have an entry in `dict_rep` by their mapped name; all other names are kept as they are.
sole_2020_young_human.var_names = [dict_rep.get(gene, gene) for gene in sole_2020_young_human.var_names]

In [None]:
sc.pp.filter_genes(sole_2020_young_human, min_counts=1)

In [None]:
# Basic QC metrics (no cells are removed here; filtering happens in a later cell).
# Genes whose name starts with 'MT-' are flagged so that calculate_qc_metrics reports
# the mitochondrial fraction per cell as obs['pct_counts_mt'].
sole_2020_young_human.var['mt'] = sole_2020_young_human.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(sole_2020_young_human, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(sole_2020_young_human, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(sole_2020_young_human, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(sole_2020_young_human, x='total_counts', y='n_genes_by_counts')

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': sole_2020_young_human.obs['Internal sample identifier'], 'y': sole_2020_young_human.obs['log1p_n_genes_by_counts']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
# QC filter: keep cells with 350 < n_genes_by_counts < 2500 and less than 15 % mitochondrial counts.
sole_2020_young_human = sole_2020_young_human[((sole_2020_young_human.obs.n_genes_by_counts < 2500) &
                                    (sole_2020_young_human.obs.n_genes_by_counts > 350)).values, :]
# Materialise the result with .copy(): the following cells modify the object in place
# (filter_genes, normalize_total, log1p), which should not be done on an AnnData view.
sole_2020_young_human = sole_2020_young_human[sole_2020_young_human.obs.pct_counts_mt < 15, :].copy()

In [None]:
sc.pp.filter_genes(sole_2020_young_human, min_counts=1)
sc.pp.normalize_total(sole_2020_young_human)
sc.pp.log1p(sole_2020_young_human)

In [None]:
# Dimensionality reduction and neighbor graph for the full Solé-Boldo dataset:
#   1. PCA with 30 components.
#   2. Harmony integration across samples ('Internal sample identifier'); the corrected
#      embedding is read back below as obsm['X_pca_harmony'].
#   3. Cosine kNN graph on the Harmony embedding; n_neighbors scales with sqrt(number of cells).
#   4. triku (presumably feature selection based on the neighbor graph — confirm against triku docs).
sc.pp.pca(sole_2020_young_human, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(sole_2020_young_human, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(sole_2020_young_human, use_rep='X_pca_harmony',  n_neighbors=int(0.5 * len(sole_2020_young_human) ** 0.5 // 2), metric='cosine')
tk.tl.triku(sole_2020_young_human)

In [None]:
sc.tl.umap(sole_2020_young_human, min_dist=0.6, random_state=seed)
sc.tl.leiden(sole_2020_young_human, resolution=10, random_state=seed)

In [None]:
sc.pl.umap(sole_2020_young_human, color=['Internal sample identifier', 'leiden', 'LUM', 'PDGFRA', 'COL1A1', 'DCN'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
sc.pl.umap(sole_2020_young_human, color=['leiden', 'S100B', 'MPZ', 'PLP1', 'MLANA', 'PMEL'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# Assign a cell-type category to each cell from the marker sets in dict_cats_fb; the result is
# read below from obs['assigned_cats'].
# NOTE(review): the trailing "0,2" looks like a record of a previously tried parameter value (0.2?) — confirm or remove.
assign_cats(sole_2020_young_human, dict_cats=dict_cats_fb, min_score=0.3, quantile_gene_sel=0.8) # 0,2

In [None]:
sc.pl.umap(sole_2020_young_human, color=['Internal sample identifier', 'leiden', 'assigned_cats'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# For each category in dict_cats_fb, print its name and plot the assigned categories next to
# those of its marker genes that are present in this dataset.
for key, val in dict_cats_fb.items():
    print(key)
    sc.pl.umap(
        sole_2020_young_human,
        color=['assigned_cats', *[gene for gene in val if gene in sole_2020_young_human.var_names]],
        legend_loc='on data',
        cmap=magma,
        use_raw=False,
        ncols=4,
    )

In [None]:
# Fibroblast subset. .copy() turns the view into an independent AnnData, because the next cells
# modify it in place (filter_genes, PCA, ...); this matches the Tabib 2018 section.
sole_2020_young_human_fb = sole_2020_young_human[sole_2020_young_human.obs['assigned_cats'].isin(['fibro', 'fibro - ANGPTL7'])].copy()

In [None]:
sc.pp.filter_genes(sole_2020_young_human_fb, min_counts=1)

In [None]:
sc.pp.pca(sole_2020_young_human_fb, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(sole_2020_young_human_fb, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(sole_2020_young_human_fb, use_rep='X_pca_harmony',  n_neighbors=int(0.5 * len(sole_2020_young_human_fb) ** 0.5 // 2), metric='cosine')
tk.tl.triku(sole_2020_young_human_fb)

In [None]:
sc.tl.umap(sole_2020_young_human_fb, min_dist=0.3, random_state=seed)
sc.tl.leiden(sole_2020_young_human_fb, resolution=8, random_state=seed)

In [None]:
# Two-level annotation of the fibroblast subset:
#   - first call: per-cluster labels from dict_cats_clusters, stored in obs['cluster'];
#   - second call: axis labels from dict_cats_axes, grouping cells by the 'cluster' column
#     just created, stored in obs['axis'].
# others_name='U' is presumably the label given to cells below min_score — confirm in cellassign.
assign_cats(sole_2020_young_human_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.9, key_added='cluster', others_name='U')
assign_cats(sole_2020_young_human_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.5, quantile_gene_sel=0.9,  key_added='axis', 
            intermediate_states=True, diff=0.15, others_name='U')

In [None]:
# One color per cluster label, in sorted label order; labels without an entry in
# dict_colors_human fall back to grey ('#bcbcbc').
sole_2020_young_human_fb.uns['cluster_colors'] = [
    dict_colors_human.get(label, '#bcbcbc')
    for label in sorted(set(sole_2020_young_human_fb.obs['cluster']))
]

In [None]:
sc.pl.umap(sole_2020_young_human_fb, color=['Internal sample identifier', 'leiden', 'axis', 'cluster'], legend_loc='on data', cmap=magma, use_raw=False, )

In [None]:
plot_score_graph(sole_2020_young_human_fb)

In [None]:
# Remove the leftover 'C3' column from obs (presumably written by an earlier scoring/plotting
# step — confirm). The membership check keeps the cell safe to re-run: a bare `del` raised
# KeyError the second time.
if 'C3' in sole_2020_young_human_fb.obs.columns:
    del sole_2020_young_human_fb.obs['C3']

In [None]:
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(sole_2020_young_human_fb, color=['cluster'] + [i for i in val if i in sole_2020_young_human_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
# Persist the processed objects. clear_adata comes from fb_functions (not shown here);
# presumably it strips bulky intermediate fields before writing — confirm.
# Only the fibroblast subset is cleared; the full dataset is written as is.
clear_adata(sole_2020_young_human_fb)
sole_2020_young_human_fb.write_h5ad(sole_dir + '/sole_2020_young_human_fb_processed.h5')
sole_2020_young_human.write_h5ad(sole_dir + '/sole_2020_young_human_processed.h5')

In [None]:
sole_2020_young_human_fb = sc.read(sole_dir + '/sole_2020_young_human_fb_processed.h5')

### Tabib et al. 2018

In [None]:
tabib_2018_dir = data_dir + '/Tabib_2018'

In [None]:
tabib_2018_ctrl_human = sc.read(tabib_2018_dir + '/adata_tabib_2018_ctrl_human.h5')

In [None]:
df_mettabib = pd.read_csv(tabib_2018_dir + '/Skin_6Control_Metadata.csv', index_col=0)

In [None]:
tabib_2018_ctrl_human

The metadata table (`df_mettabib`) has 8366 cells, although the paper states that 8522 cells were analyzed. The remaining cells are erythrocytes, which were filtered out of the analysis.

In [None]:
tabib_2018_ctrl_human.raw = tabib_2018_ctrl_human

In [None]:
# Cell type -> list of cluster ids taken from the 'res.0.6' column of the Tabib 2018 metadata.
dict_reverse_mappings = {'Fibroblast': ['0', '3', '4'], 
                 'Keratinocyte': ['1', '5', '7', '11', '14',], 
                 'Endothelial cell': ['2'], 
                 'Pericyte': ['6', '10'], 
                 'Macrophage/DC': ['8'], 
                 'Lymphocyte': ['9'], 
                 'Secretory Epith': ['12'], 
                 'Smooth Muscle': ['13'], 
                 'Melanocyte': ['15'], 
                 'Neural Cell': ['16'],
                 'Cornified Env': ['17'],
                 'B cell': ['18'], 
                 # Cells missing from the metadata (NaN cluster id). This label is ours, not the paper's.
                 # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
                 'Erithrocyte': [np.nan]}

# Inverse lookup: cluster id -> cell type.
dict_mappings = {cluster_id: cell_type
                 for cell_type, cluster_ids in dict_reverse_mappings.items()
                 for cluster_id in cluster_ids}

In [None]:
# Copy the published cluster ids into obs. Assignment aligns on the cell index, so cells that
# are absent from the metadata table end up with a missing value (NaN).
tabib_2018_ctrl_human.obs['res.0.6'] = df_mettabib['res.0.6'].astype(str)
# Translate cluster ids into cell types. Missing ids are handled explicitly: looking up a NaN key
# in a dict only succeeds by object identity (NaN != NaN), which is fragile.
# 'Erithrocyte' is the label used for those cells in dict_reverse_mappings.
tabib_2018_ctrl_human.obs['assigned_cats'] = [
    'Erithrocyte' if pd.isna(cluster_id) else dict_mappings[cluster_id]
    for cluster_id in tabib_2018_ctrl_human.obs['res.0.6']
]

In [None]:
# Fibroblast subset as an independent copy, with genes that have no counts in it removed.
tabib_2018_ctrl_human_fb = tabib_2018_ctrl_human[tabib_2018_ctrl_human.obs['assigned_cats'].isin(['Fibroblast']), :].copy()
sc.pp.filter_genes(tabib_2018_ctrl_human_fb, min_counts=1)
# Store X as CSR. The public spr.csr_matrix is used instead of spr.csr.csr_matrix, because the
# private scipy.sparse.csr namespace is deprecated.
tabib_2018_ctrl_human_fb.X = spr.csr_matrix(tabib_2018_ctrl_human_fb.X).copy()
# Keep the un-normalised matrix in .raw before normalisation in the next cell.
tabib_2018_ctrl_human_fb.raw = tabib_2018_ctrl_human_fb

In [None]:
sc.pp.normalize_total(tabib_2018_ctrl_human_fb)
sc.pp.log1p(tabib_2018_ctrl_human_fb)

In [None]:
sc.pp.pca(tabib_2018_ctrl_human_fb, random_state=seed, n_comps=50)
sce.pp.harmony_integrate(tabib_2018_ctrl_human_fb, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(tabib_2018_ctrl_human_fb, use_rep='X_pca_harmony',  n_neighbors=int(len(tabib_2018_ctrl_human_fb) ** 0.5), metric='cosine')
tk.tl.triku(tabib_2018_ctrl_human_fb)

In [None]:
sc.tl.umap(tabib_2018_ctrl_human_fb, min_dist=0.35, random_state=seed)
sc.tl.leiden(tabib_2018_ctrl_human_fb, resolution=16, random_state=seed)

In [None]:
assign_cats(tabib_2018_ctrl_human_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.7, key_added='cluster', others_name='U')
assign_cats(tabib_2018_ctrl_human_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.5, quantile_gene_sel=0.9, key_added='axis', 
            intermediate_states=True, diff=0.15, others_name='U')

In [None]:
# One color per cluster label, in sorted label order; labels without an entry in
# dict_colors_human fall back to grey ('#bcbcbc').
tabib_2018_ctrl_human_fb.uns['cluster_colors'] = [
    dict_colors_human.get(label, '#bcbcbc')
    for label in sorted(set(tabib_2018_ctrl_human_fb.obs['cluster']))
]

In [None]:
sc.pl.umap(tabib_2018_ctrl_human_fb, color=['Internal sample identifier', 'leiden', 'axis', 'cluster'], legend_loc='on data', 
           cmap=magma, use_raw=False, )

In [None]:
# Remove the leftover 'C3' column from obs (presumably written by an earlier scoring/plotting
# step — confirm). The membership check keeps the cell safe to re-run: a bare `del` raised
# KeyError the second time.
if 'C3' in tabib_2018_ctrl_human_fb.obs.columns:
    del tabib_2018_ctrl_human_fb.obs['C3']

In [None]:
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(tabib_2018_ctrl_human_fb, color=['cluster'] + [i for i in val if i in tabib_2018_ctrl_human_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
plot_score_graph(tabib_2018_ctrl_human_fb)

In [None]:
clear_adata(tabib_2018_ctrl_human_fb)
tabib_2018_ctrl_human_fb.write_h5ad(tabib_2018_dir + '/tabib_2018_ctrl_human_fb_processed.h5')
tabib_2018_ctrl_human.write_h5ad(tabib_2018_dir + '/tabib_2018_ctrl_human_processed.h5')

In [None]:
tabib_2018_ctrl_human_fb = sc.read(tabib_2018_dir + '/tabib_2018_ctrl_human_fb_processed.h5')

### Tabib et al. 2021

In [None]:
tabib_2021_dir = data_dir + '/Tabib_2021'

In [None]:
tabib_2021_ctrl_human = sc.read(tabib_2021_dir + '/adata_tabib_2021_ctrl_human.h5')

In [None]:
# Rename every variable (gene) that has an entry in dict_rep; all other names are kept unchanged.
tabib_2021_ctrl_human.var_names = [dict_rep.get(gene, gene) for gene in tabib_2021_ctrl_human.var_names]

In [None]:
# Basic QC filtering
tabib_2021_ctrl_human.var['mt'] = tabib_2021_ctrl_human.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(tabib_2021_ctrl_human, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(tabib_2021_ctrl_human, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(tabib_2021_ctrl_human, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(tabib_2021_ctrl_human, x='total_counts', y='n_genes_by_counts')

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': tabib_2021_ctrl_human.obs['Internal sample identifier'], 'y': tabib_2021_ctrl_human.obs['log1p_n_genes_by_counts']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': tabib_2021_ctrl_human.obs['Internal sample identifier'], 'y': tabib_2021_ctrl_human.obs['pct_counts_mt']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
# QC filter on the number of detected genes. All ten listed samples share the same bounds
# (6.5 < log1p_n_genes_by_counts < 7.8), so the per-sample clauses collapse into one membership
# test plus the common bounds. Cells from any sample not listed here are dropped, exactly as
# before.
tabib_2021_qc_samples = ['SC1', 'SC4', 'SC18', 'SC32', 'SC33', 'SC34', 'SC50', 'SC68', 'SC124', 'SC125']
tabib_2021_log_genes = tabib_2021_ctrl_human.obs.log1p_n_genes_by_counts
tabib_2021_keep = (tabib_2021_ctrl_human.obs['Internal sample identifier'].isin(tabib_2021_qc_samples) &
                   (tabib_2021_log_genes < 7.8) &
                   (tabib_2021_log_genes > 6.5)).values
tabib_2021_ctrl_human = tabib_2021_ctrl_human[tabib_2021_keep, :]
# Mitochondrial filter (< 15 %). .copy() materialises the view, because the following cells
# modify the object in place.
tabib_2021_ctrl_human = tabib_2021_ctrl_human[tabib_2021_ctrl_human.obs.pct_counts_mt < 15, :].copy()

In [None]:
sc.pp.filter_genes(tabib_2021_ctrl_human, min_counts=1)
sc.pp.normalize_total(tabib_2021_ctrl_human)
sc.pp.log1p(tabib_2021_ctrl_human)

In [None]:
sc.pp.pca(tabib_2021_ctrl_human, random_state=seed, n_comps=50)
sce.pp.harmony_integrate(tabib_2021_ctrl_human, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(tabib_2021_ctrl_human, use_rep='X_pca_harmony', n_neighbors=int(0.5 * len(tabib_2021_ctrl_human) ** 0.5 // 2), metric='cosine')
tk.tl.triku(tabib_2021_ctrl_human)

In [None]:
sc.tl.umap(tabib_2021_ctrl_human, min_dist=0.2, random_state=seed)
sc.tl.leiden(tabib_2021_ctrl_human, resolution=1.3, random_state=seed)

In [None]:
# Subsampling with fraction=1 keeps every cell but, in place (copy=False), reorders them randomly
# with a fixed seed. Presumably this is done so that no sample is systematically drawn on top
# of the others in the UMAP — confirm intent.
sc.pp.subsample(tabib_2021_ctrl_human, fraction=1, random_state=0, copy=False)
sc.pl.umap(tabib_2021_ctrl_human, color=['leiden', 'Internal sample identifier'], legend_loc='on data')

In [None]:
sc.pl.umap(tabib_2021_ctrl_human, color=['leiden', 'LUM', 'PDGFRA', 'COL1A1', 'DCN', 'RGS5', 'KRT5', 'DMKN'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
sc.pl.umap(tabib_2021_ctrl_human, color=['leiden', 'C2orf40', 'CDH19', 'ANGPTL7','PLEKHB1','ENTPD2', 
                                   'SLC2A1', 'CLDN1', 'TNNT2', 'C19orf33', 'SFRP5', 'WNT6', ], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
assign_cats(tabib_2021_ctrl_human, dict_cats=dict_cats_fb)

In [None]:
for key, val in dict_cats_fb.items():
    print(key)
    sc.pl.umap(tabib_2021_ctrl_human, color=['assigned_cats'] + [i for i in val if i in tabib_2021_ctrl_human.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
sc.pl.umap(tabib_2021_ctrl_human, color=['leiden', 'assigned_cats'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# Fibroblast subset. .copy() turns the view into an independent AnnData, because the next cells
# modify it in place (filter_genes, PCA, ...); this matches the Tabib 2018 section.
tabib_2021_ctrl_human_fb = tabib_2021_ctrl_human[tabib_2021_ctrl_human.obs['assigned_cats'].isin(['fibro', 'fibro - ANGPTL7'])].copy()

In [None]:
sc.pp.filter_genes(tabib_2021_ctrl_human_fb, min_counts=1)

In [None]:
sc.pp.pca(tabib_2021_ctrl_human_fb, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(tabib_2021_ctrl_human_fb, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(tabib_2021_ctrl_human_fb, use_rep='X_pca_harmony', n_neighbors=int(0.5 * len(tabib_2021_ctrl_human_fb) ** 0.5 // 5), metric='cosine')
tk.tl.triku(tabib_2021_ctrl_human_fb)

In [None]:
sc.tl.umap(tabib_2021_ctrl_human_fb, min_dist=0.25, random_state=seed)
sc.tl.leiden(tabib_2021_ctrl_human_fb, resolution=18, random_state=seed)

In [None]:
assign_cats(tabib_2021_ctrl_human_fb, dict_cats=dict_cats_clusters, min_score=0.4, quantile_gene_sel=0.7, key_added='cluster', others_name='U')
assign_cats(tabib_2021_ctrl_human_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.5, quantile_gene_sel=0.9,  key_added='axis', 
             intermediate_states=True, diff=0.15, others_name='U')

In [None]:
# One color per cluster label, in sorted label order; labels without an entry in
# dict_colors_human fall back to grey ('#bcbcbc').
tabib_2021_ctrl_human_fb.uns['cluster_colors'] = [
    dict_colors_human.get(label, '#bcbcbc')
    for label in sorted(set(tabib_2021_ctrl_human_fb.obs['cluster']))
]

In [None]:
sc.pl.umap(tabib_2021_ctrl_human_fb, color=['Internal sample identifier', 'leiden', 'axis', 'cluster'], legend_loc='on data', cmap=magma, use_raw=False, ncols=2)

In [None]:
# Remove the leftover 'C3' column from obs (presumably written by an earlier scoring/plotting
# step — confirm). The membership check keeps the cell safe to re-run: a bare `del` raised
# KeyError the second time.
if 'C3' in tabib_2021_ctrl_human_fb.obs.columns:
    del tabib_2021_ctrl_human_fb.obs['C3']

In [None]:
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(tabib_2021_ctrl_human_fb, color=['cluster'] + [i for i in val if i in tabib_2021_ctrl_human_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
plot_score_graph(tabib_2021_ctrl_human_fb)

In [None]:
clear_adata(tabib_2021_ctrl_human_fb)
tabib_2021_ctrl_human_fb.write_h5ad(tabib_2021_dir + '/tabib_2021_ctrl_human_fb_processed.h5')
tabib_2021_ctrl_human.write_h5ad(tabib_2021_dir + '/tabib_2021_ctrl_human_processed.h5')

In [None]:
tabib_2021_ctrl_human_fb = sc.read(tabib_2021_dir + '/tabib_2021_ctrl_human_fb_processed.h5')

### Tabula Sapiens Consortium 2021 [Not included because they do not yield good quality populations]

In [None]:
tsc_dir = data_dir + '/Tabula_Sapiens_Consortium_2021'

In [None]:
tsc_2021_ctrl_human = sc.read(tsc_dir + '/adata_tsc_2021_ctrl_human.h5')

In [None]:
sc.pp.filter_genes(tsc_2021_ctrl_human, min_counts=25)

In [None]:
# Rename every variable (gene) that has an entry in dict_rep; all other names are kept unchanged.
tsc_2021_ctrl_human.var_names = [dict_rep.get(gene, gene) for gene in tsc_2021_ctrl_human.var_names]

In [None]:
# Basic QC filtering
tsc_2021_ctrl_human.var['mt'] = tsc_2021_ctrl_human.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(tsc_2021_ctrl_human, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(tsc_2021_ctrl_human, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(tsc_2021_ctrl_human, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(tsc_2021_ctrl_human, x='total_counts', y='n_genes_by_counts')

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': tsc_2021_ctrl_human.obs['Internal sample identifier'], 'y': tsc_2021_ctrl_human.obs['log1p_n_genes_by_counts']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': tsc_2021_ctrl_human.obs['Internal sample identifier'], 'y': tsc_2021_ctrl_human.obs['pct_counts_mt']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
# Per-sample QC bounds on log1p_n_genes_by_counts: sample -> (exclusive lower, exclusive upper).
# Cells from samples that are not listed are dropped.
tsc_2021_qc_bounds = {'T10_S5': (6.4, 8),
                      'T10_S6': (6.4, 8),
                      'T14_S17': (6.7, 8.1),
                      'T14_S18': (6.7, 8.1)}
tsc_2021_sample_ids = tsc_2021_ctrl_human.obs['Internal sample identifier']
tsc_2021_log_genes = tsc_2021_ctrl_human.obs.log1p_n_genes_by_counts
# Union of the per-sample masks.
tsc_2021_keep = np.zeros(tsc_2021_ctrl_human.n_obs, dtype=bool)
for qc_sample, (qc_low, qc_high) in tsc_2021_qc_bounds.items():
    tsc_2021_keep |= ((tsc_2021_sample_ids == qc_sample) &
                      (tsc_2021_log_genes < qc_high) &
                      (tsc_2021_log_genes > qc_low)).values
tsc_2021_ctrl_human = tsc_2021_ctrl_human[tsc_2021_keep, :]
# Mitochondrial filter (< 15 %). .copy() materialises the view, because the following cells
# modify the object in place.
tsc_2021_ctrl_human = tsc_2021_ctrl_human[tsc_2021_ctrl_human.obs.pct_counts_mt < 15, :].copy()

In [None]:
sc.pp.filter_genes(tsc_2021_ctrl_human, min_counts=1)
sc.pp.normalize_total(tsc_2021_ctrl_human)
sc.pp.log1p(tsc_2021_ctrl_human)

In [None]:
sc.pp.pca(tsc_2021_ctrl_human, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(tsc_2021_ctrl_human, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(tsc_2021_ctrl_human, use_rep='X_pca_harmony', n_neighbors=int(0.5 * len(tsc_2021_ctrl_human) ** 0.5), metric='cosine')
tk.tl.triku(tsc_2021_ctrl_human)

In [None]:
sc.tl.umap(tsc_2021_ctrl_human, min_dist=0.3, random_state=seed)
sc.tl.leiden(tsc_2021_ctrl_human, resolution=1.8, random_state=seed)

In [None]:
sc.pl.umap(tsc_2021_ctrl_human, color=['leiden', 'LUM', 'PDGFRA', 'COL1A1', 'DCN', 'RGS5'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
assign_cats(tsc_2021_ctrl_human, dict_cats=dict_cats_fb, min_score=0.3, quantile_gene_sel=0.2)

In [None]:
for key, val in dict_cats_fb.items():
    print(key)
    sc.pl.umap(tsc_2021_ctrl_human, color=['assigned_cats'] + [i for i in val if i in tsc_2021_ctrl_human.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
sc.pl.umap(tsc_2021_ctrl_human, color=['Internal sample identifier', 'leiden', 'assigned_cats'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# Fibroblast subset (only the 'fibro' category for this dataset). .copy() turns the view into an
# independent AnnData, because the next cells modify it in place.
tsc_2021_ctrl_human_fb = tsc_2021_ctrl_human[tsc_2021_ctrl_human.obs['assigned_cats'] == 'fibro'].copy()

In [None]:
sc.pp.filter_genes(tsc_2021_ctrl_human_fb, min_counts=1)

In [None]:
sc.pp.pca(tsc_2021_ctrl_human_fb, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(tsc_2021_ctrl_human_fb, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(tsc_2021_ctrl_human_fb, use_rep='X_pca_harmony', n_neighbors=int(0.3 * len(tsc_2021_ctrl_human_fb) ** 0.5), metric='cosine')
tk.tl.triku(tsc_2021_ctrl_human_fb)

In [None]:
sc.tl.umap(tsc_2021_ctrl_human_fb, min_dist=0.25, random_state=seed)
sc.tl.leiden(tsc_2021_ctrl_human_fb, resolution=12, random_state=seed)

In [None]:
assign_cats(tsc_2021_ctrl_human_fb, dict_cats=dict_cats_clusters, min_score=0.6, quantile_gene_sel=0.99, key_added='cluster', others_name='U')
assign_cats(tsc_2021_ctrl_human_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, key_added='axis', 
            intermediate_states=True, diff=0.15, others_name='U')

In [None]:
# One color per cluster label, in sorted label order; labels without an entry in
# dict_colors_human fall back to grey ('#bcbcbc').
tsc_2021_ctrl_human_fb.uns['cluster_colors'] = [
    dict_colors_human.get(label, '#bcbcbc')
    for label in sorted(set(tsc_2021_ctrl_human_fb.obs['cluster']))
]

In [None]:
sc.pl.umap(tsc_2021_ctrl_human_fb, color=['Internal sample identifier', 'leiden', 'axis', 'cluster'], legend_loc='on data', 
           cmap=magma, use_raw=False, )

In [None]:
# Remove the leftover 'C3' column from obs (presumably written by an earlier scoring/plotting
# step — confirm). The membership check keeps the cell safe to re-run: a bare `del` raised
# KeyError the second time.
if 'C3' in tsc_2021_ctrl_human_fb.obs.columns:
    del tsc_2021_ctrl_human_fb.obs['C3']

In [None]:
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(tsc_2021_ctrl_human_fb, color=['cluster'] + [i for i in val if i in tsc_2021_ctrl_human_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
plot_score_graph(tsc_2021_ctrl_human_fb)

In [None]:
clear_adata(tsc_2021_ctrl_human_fb)
tsc_2021_ctrl_human_fb.write_h5ad(tsc_dir + '/tsc_2021_ctrl_human_fb_processed.h5')
tsc_2021_ctrl_human.write_h5ad(tsc_dir + '/tsc_2021_ctrl_human_processed.h5')

In [None]:
tsc_2021_ctrl_human_fb = sc.read(tsc_dir + '/tsc_2021_ctrl_human_fb_processed.h5')

### Theocharidis 2020

In [None]:
# Directory with the Theocharidis 2020 files. Unlike the other *_dir variables, this one ends in
# '/'; joined with the leading '/' of the file names below it yields a doubled separator ('//').
# NOTE(review): harmless on POSIX paths, but consider dropping the trailing '/' once all uses are checked.
theo_dir_2020 = data_dir + '/Theocharidis_2020/'

In [None]:
# Load the healthy-control and the diabetic (DM, no DFU) datasets.
theo_2020_ctrl_human = sc.read(theo_dir_2020 + '/adata_theo_ctrl_human.h5')
theo_2020_dm = sc.read(theo_dir_2020 + '/adata_theo_dm_noDFU.h5')

# Merge both into a single object; obs['condition'] records the origin ('healthy' or 'DM').
# NOTE(review): AnnData.concatenate is deprecated in recent anndata releases in favour of
# anndata.concat — consider migrating and checking that obs names / var columns stay the same.
theo_2020_ctrl_human_dm = sc.AnnData.concatenate(theo_2020_ctrl_human, theo_2020_dm, batch_key='condition', batch_categories=['healthy', 'DM'])

In [None]:
sc.pp.filter_genes(theo_2020_ctrl_human_dm, min_counts=1)

In [None]:
# Rename every variable (gene) that has an entry in dict_rep; all other names are kept unchanged.
theo_2020_ctrl_human_dm.var_names = [dict_rep.get(gene, gene) for gene in theo_2020_ctrl_human_dm.var_names]

In [None]:
# Basic QC filtering
theo_2020_ctrl_human_dm.var['mt'] = theo_2020_ctrl_human_dm.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(theo_2020_ctrl_human_dm, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
# Convert the sparse count matrix to a dense ndarray. toarray() returns the ndarray directly,
# avoiding the intermediate np.matrix (and extra copy) of np.array(X.todense()).
theo_2020_ctrl_human_dm.X = theo_2020_ctrl_human_dm.X.toarray()

In [None]:
sc.pl.violin(theo_2020_ctrl_human_dm, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(theo_2020_ctrl_human_dm, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(theo_2020_ctrl_human_dm, x='total_counts', y='n_genes_by_counts')

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': theo_2020_ctrl_human_dm.obs['Internal sample identifier'], 'y': theo_2020_ctrl_human_dm.obs['log1p_n_genes_by_counts']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': theo_2020_ctrl_human_dm.obs['Internal sample identifier'], 'y': theo_2020_ctrl_human_dm.obs['pct_counts_mt']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
# Per-sample QC bounds on log1p_n_genes_by_counts: sample -> (exclusive lower, exclusive upper).
# Cells from samples that are not listed are dropped.
theo_2020_qc_bounds = {'H1': (6.2, 7.6),
                       'H2': (6.3, 7.6),
                       'H3': (6.4, 7.6),
                       'H4': (6.4, 7.6),
                       'DM2': (6.7, 8),
                       'DM3': (6.5, 7.4),
                       'DM4': (6.5, 7.6)}
theo_2020_sample_ids = theo_2020_ctrl_human_dm.obs['Internal sample identifier']
theo_2020_log_genes = theo_2020_ctrl_human_dm.obs.log1p_n_genes_by_counts
# Union of the per-sample masks.
theo_2020_keep = np.zeros(theo_2020_ctrl_human_dm.n_obs, dtype=bool)
for qc_sample, (qc_low, qc_high) in theo_2020_qc_bounds.items():
    theo_2020_keep |= ((theo_2020_sample_ids == qc_sample) &
                       (theo_2020_log_genes < qc_high) &
                       (theo_2020_log_genes > qc_low)).values
theo_2020_ctrl_human_dm = theo_2020_ctrl_human_dm[theo_2020_keep, :]
# Mitochondrial filter (< 12 % for this dataset). .copy() materialises the view, because the
# following cells modify the object in place.
theo_2020_ctrl_human_dm = theo_2020_ctrl_human_dm[theo_2020_ctrl_human_dm.obs.pct_counts_mt < 12, :].copy()

In [None]:
# Recompute the QC metrics after the cell filtering above, so that the per-cell and per-gene
# statistics describe the retained cells only. Nothing is filtered in this cell.
theo_2020_ctrl_human_dm.var['mt'] = theo_2020_ctrl_human_dm.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(theo_2020_ctrl_human_dm, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pp.filter_genes(theo_2020_ctrl_human_dm, min_counts=1)
sc.pp.normalize_total(theo_2020_ctrl_human_dm)
sc.pp.log1p(theo_2020_ctrl_human_dm)

In [None]:
sc.pp.pca(theo_2020_ctrl_human_dm, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(theo_2020_ctrl_human_dm, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(theo_2020_ctrl_human_dm, use_rep='X_pca_harmony',  n_neighbors=int(0.5 * len(theo_2020_ctrl_human_dm) ** 0.5 // 4), metric='cosine')
tk.tl.triku(theo_2020_ctrl_human_dm)

In [None]:
sc.tl.umap(theo_2020_ctrl_human_dm, min_dist=0.3, random_state=seed)
sc.tl.leiden(theo_2020_ctrl_human_dm, resolution=3, random_state=seed)

In [None]:
sc.pl.umap(theo_2020_ctrl_human_dm, color=['leiden', 'LUM', 'PDGFRA', 'COL1A1', 'DCN', 'RGS5'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
assign_cats(theo_2020_ctrl_human_dm, dict_cats=dict_cats_fb, min_score=0.4, quantile_gene_sel=0.97)

In [None]:
for key, val in dict_cats_fb.items():
    print(key)
    sc.pl.umap(theo_2020_ctrl_human_dm, color=['assigned_cats'] + [i for i in val if i in theo_2020_ctrl_human_dm.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
sc.pl.umap(theo_2020_ctrl_human_dm, color=['leiden', 'Internal sample identifier', 'condition', 'assigned_cats'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
sc.pl.umap(theo_2020_ctrl_human_dm, color=['leiden', 'COL18A1', 'APCDD1', 'SLPI', 'CCL19', 'GGT5',  'CHRDL1', 'GPX3', 'BGN', 'ASPN', 'TNN', 'COL11A1', 'COCH', 
                                     'IGFBP3', 'ANGPTL7', 'SCN7A', 'C2orf40', 'NGFR', 'CLDN1', 'SBSPON', 'FGFBP2', 'DIO3', 'LUZP2'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# Fibroblast subset (this dataset additionally keeps the 'F' category). .copy() turns the view
# into an independent AnnData, because the next cells modify it in place.
theo_2020_ctrl_human_dm_fb = theo_2020_ctrl_human_dm[theo_2020_ctrl_human_dm.obs['assigned_cats'].isin(['fibro', 'fibro - ANGPTL7', 'F'])].copy()

In [None]:
sc.pp.filter_genes(theo_2020_ctrl_human_dm_fb, min_counts=1)

In [None]:
sc.pp.pca(theo_2020_ctrl_human_dm_fb, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(theo_2020_ctrl_human_dm_fb, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(theo_2020_ctrl_human_dm_fb, use_rep='X_pca_harmony',  n_neighbors=int(len(theo_2020_ctrl_human_dm_fb) ** 0.5 // 4), metric='cosine')
tk.tl.triku(theo_2020_ctrl_human_dm_fb)

In [None]:
sc.tl.umap(theo_2020_ctrl_human_dm_fb, min_dist=0.15, random_state=seed)
sc.tl.leiden(theo_2020_ctrl_human_dm_fb, resolution=7, random_state=seed)

In [None]:
assign_cats(theo_2020_ctrl_human_dm_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.8, key_added='cluster', others_name='U')
assign_cats(theo_2020_ctrl_human_dm_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, key_added='axis', 
            intermediate_states=True, diff=0.15, others_name='U')

In [None]:
# One color per cluster label, in sorted label order; labels without an entry in
# dict_colors_human fall back to grey ('#bcbcbc').
theo_2020_ctrl_human_dm_fb.uns['cluster_colors'] = [
    dict_colors_human.get(label, '#bcbcbc')
    for label in sorted(set(theo_2020_ctrl_human_dm_fb.obs['cluster']))
]

In [None]:
sc.pl.umap(theo_2020_ctrl_human_dm_fb, color=['Internal sample identifier', 'leiden', 'axis',  'cluster', 'condition'], legend_loc='on data', 
           cmap=magma, use_raw=False, )

In [None]:
# Remove the leftover 'C3' column from obs (presumably written by an earlier scoring/plotting
# step — confirm). The membership check keeps the cell safe to re-run: a bare `del` raised
# KeyError the second time.
if 'C3' in theo_2020_ctrl_human_dm_fb.obs.columns:
    del theo_2020_ctrl_human_dm_fb.obs['C3']

In [None]:
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(theo_2020_ctrl_human_dm_fb, color=['cluster'] + [i for i in val if i in theo_2020_ctrl_human_dm_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
plot_score_graph(theo_2020_ctrl_human_dm_fb)

In [None]:
clear_adata(theo_2020_ctrl_human_dm_fb)
theo_2020_ctrl_human_dm_fb.write_h5ad(theo_dir_2020 + '/theo_2020_ctrl_human_dm_fb_processed.h5')
theo_2020_ctrl_human_dm.write_h5ad(theo_dir_2020 + '/theo_2020_ctrl_human_dm_processed.h5')

In [None]:
theo_2020_ctrl_human_dm_fb = sc.read(theo_dir_2020 + '/theo_2020_ctrl_human_dm_fb_processed.h5')

### Theocharidis 2021

The dataset presents two B1 populations (CXCL8, CA12, COL7A1, CD82, TMEM158, WNT5A, C15orf48, SAT1, MT2A, IER3, PTGS2, SRGN, CXCL1, CXCL3, CCL3, HMGA1, STC1, UPP1, ATP13A3, POU2F2, BCL2A1, F3, S1PR3, PMAIP1, LAMB3, IL1B, MMP9, SLC7A5, CDCP1, EGLN3, GMFG) and (IGFBP4, A2M, IGF1, FGF7, MEDAG, CHI3L1, SFRP1, IGFBP2, LXN, GPC3, PALMD, ALPL, CXCL2, CCDC69). These two populations could not be replicated in the rest of the datasets, so we do not consider them to be two distinct populations, but rather a possible artifact.

In [None]:
theo_dir_2021 = data_dir + '/Theocharidis_2021/'

In [None]:
# Load the full Theocharidis 2021 dataset and keep only the healthy donors.
theo_2021_all = sc.read(theo_dir_2021 + '/adata_theo_2021.h5')
# .copy() makes the healthy subset independent of theo_2021_all: the following cells modify it
# in place (filter_genes, var_names renaming, QC annotations), which should not be done on a view.
theo_2021_ctrl_human = theo_2021_all[theo_2021_all.obs['Condition'] == 'Healthy'].copy()

In [None]:
sc.pp.filter_genes(theo_2021_ctrl_human, min_counts=1)

In [None]:
# Rename every variable (gene) that has an entry in dict_rep; all other names are kept unchanged.
theo_2021_ctrl_human.var_names = [dict_rep.get(gene, gene) for gene in theo_2021_ctrl_human.var_names]

In [None]:
theo_2021_ctrl_human

In [None]:
# Basic QC filtering
theo_2021_ctrl_human.var['mt'] = theo_2021_ctrl_human.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(theo_2021_ctrl_human, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(theo_2021_ctrl_human, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(theo_2021_ctrl_human, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(theo_2021_ctrl_human, x='total_counts', y='n_genes_by_counts')

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': theo_2021_ctrl_human.obs['Internal sample identifier'], 'y': theo_2021_ctrl_human.obs['log1p_n_genes_by_counts']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': theo_2021_ctrl_human.obs['Internal sample identifier'], 'y': theo_2021_ctrl_human.obs['pct_counts_mt']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
# Keep cells with 900 < n_genes_by_counts < 2750 and less than 15 % mitochondrial counts.
theo_2021_keep = ((theo_2021_ctrl_human.obs.n_genes_by_counts > 900) &
                  (theo_2021_ctrl_human.obs.n_genes_by_counts < 2750) &
                  (theo_2021_ctrl_human.obs.pct_counts_mt < 15)).values
# A single subsetting followed by .copy() yields an independent AnnData instead of a view of a view,
# which the following cells would otherwise modify in place.
theo_2021_ctrl_human = theo_2021_ctrl_human[theo_2021_keep, :].copy()

In [None]:
# Recompute the QC metrics on the cells that passed the filter of the previous cell
theo_2021_ctrl_human.var['mt'] = theo_2021_ctrl_human.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(theo_2021_ctrl_human, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pp.filter_genes(theo_2021_ctrl_human, min_counts=1)
sc.pp.normalize_total(theo_2021_ctrl_human)
sc.pp.log1p(theo_2021_ctrl_human)

In [None]:
# PCA -> Harmony correction across samples -> cosine kNN graph on the corrected PCs -> triku.
sc.pp.pca(theo_2021_ctrl_human, n_comps=30, random_state=seed)
sce.pp.harmony_integrate(
    theo_2021_ctrl_human,
    key='Internal sample identifier',
    max_iter_harmony=50,
)
sc.pp.neighbors(
    theo_2021_ctrl_human,
    n_neighbors=int(0.5 * len(theo_2021_ctrl_human) ** 0.5 // 4),  # scales with sqrt(number of cells)
    use_rep='X_pca_harmony',
    metric='cosine',
)
tk.tl.triku(theo_2021_ctrl_human)

In [None]:
sc.tl.umap(theo_2021_ctrl_human, min_dist=0.3, random_state=seed)
sc.tl.leiden(theo_2021_ctrl_human, resolution=3, random_state=seed)

In [None]:
sc.pl.umap(theo_2021_ctrl_human, color=['leiden', 'LUM', 'PDGFRA', 'COL1A1', 'DCN', 'RGS5'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
assign_cats(theo_2021_ctrl_human, dict_cats=dict_cats_fb, min_score=0.4, quantile_gene_sel=0.5)

In [None]:
# For each category of dict_cats_fb, plot the assigned categories next to those of its genes present in the data.
for key, val in dict_cats_fb.items():
    print(key)
    genes_present = [gene for gene in val if gene in theo_2021_ctrl_human.var_names]
    sc.pl.umap(
        theo_2021_ctrl_human,
        color=['assigned_cats'] + genes_present,
        legend_loc='on data',
        cmap=magma,
        use_raw=False,
        ncols=4,
    )

In [None]:
sc.pl.umap(theo_2021_ctrl_human, color=['Internal sample identifier', 'leiden', 'Condition', 'assigned_cats'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# Keep the cells assigned to 'fibro'. .copy() gives an independent AnnData, so the in-place
# processing of the fibroblast subset does not act on a view of the full dataset.
theo_2021_ctrl_human_fb = theo_2021_ctrl_human[theo_2021_ctrl_human.obs['assigned_cats'].isin(['fibro'])].copy()

In [None]:
sc.pp.filter_genes(theo_2021_ctrl_human_fb, min_counts=1)

In [None]:
# Same embedding steps as for the full dataset, now restricted to the fibroblast subset.
sc.pp.pca(theo_2021_ctrl_human_fb, n_comps=30, random_state=seed)
sce.pp.harmony_integrate(
    theo_2021_ctrl_human_fb,
    key='Internal sample identifier',
    max_iter_harmony=50,
)
sc.pp.neighbors(
    theo_2021_ctrl_human_fb,
    n_neighbors=int(len(theo_2021_ctrl_human_fb) ** 0.5 // 4),  # scales with sqrt(number of cells)
    use_rep='X_pca_harmony',
    metric='cosine',
)
tk.tl.triku(theo_2021_ctrl_human_fb)

In [None]:
sc.tl.umap(theo_2021_ctrl_human_fb, min_dist=0.15, random_state=seed)
sc.tl.leiden(theo_2021_ctrl_human_fb, resolution=4, random_state=seed)

In [None]:
assign_cats(theo_2021_ctrl_human_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.85, key_added='cluster', others_name='U')
assign_cats(theo_2021_ctrl_human_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, key_added='axis', 
            intermediate_states=True, diff=0.15, others_name='U')

In [None]:
# One colour per cluster label present (in sorted order); labels missing from dict_colors_human are drawn in grey.
theo_2021_cluster_labels = sorted(set(theo_2021_ctrl_human_fb.obs['cluster']))
theo_2021_ctrl_human_fb.uns['cluster_colors'] = [
    dict_colors_human.get(label, '#bcbcbc') for label in theo_2021_cluster_labels
]

In [None]:
sc.pl.umap(theo_2021_ctrl_human_fb, color=['Internal sample identifier', 'leiden', 'axis', 'cluster'], legend_loc='on data', 
           cmap=magma, use_raw=False, )
sc.pl.umap(theo_2021_ctrl_human_fb, color=['Condition', 'Sample location'], cmap=magma, use_raw=False, )

In [None]:
del theo_2021_ctrl_human_fb.obs['C3']

In [None]:
# For each cluster of dict_cats_clusters, plot the cluster labels next to those of its genes present in the data.
for key, val in dict_cats_clusters.items():
    print(key)
    genes_present = [gene for gene in val if gene in theo_2021_ctrl_human_fb.var_names]
    sc.pl.umap(
        theo_2021_ctrl_human_fb,
        color=['cluster'] + genes_present,
        legend_loc='on data',
        cmap=magma,
        use_raw=False,
        ncols=4,
    )

In [None]:
plot_score_graph(theo_2021_ctrl_human_fb)

In [None]:
clear_adata(theo_2021_ctrl_human_fb)
theo_2021_ctrl_human_fb.write_h5ad(theo_dir_2021 + '/theo_2021_ctrl_human_fb_processed.h5')
theo_2021_ctrl_human.write_h5ad(theo_dir_2021 + '/theo_2021_ctrl_human_processed.h5')

In [None]:
theo_2021_ctrl_human_fb = sc.read(theo_dir_2021 + '/theo_2021_ctrl_human_fb_processed.h5')

### Vorstandlechner et al. 2020

In [None]:
vors_dir = data_dir + '/Vorstandlechner_2020'

In [None]:
vors_2020_ctrl_human = sc.read(vors_dir + '/adata_vors_2020_ctrl_human.h5', cache=True)

In [None]:
sc.pp.filter_genes(vors_2020_ctrl_human, min_counts=1)

In [None]:
# Basic QC filtering
vors_2020_ctrl_human.var['mt'] = vors_2020_ctrl_human.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(vors_2020_ctrl_human, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(vors_2020_ctrl_human, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(vors_2020_ctrl_human, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(vors_2020_ctrl_human, x='total_counts', y='n_genes_by_counts')

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': vors_2020_ctrl_human.obs['Internal sample identifier'], 'y': vors_2020_ctrl_human.obs['log1p_n_genes_by_counts']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': vors_2020_ctrl_human.obs['Internal sample identifier'], 'y': vors_2020_ctrl_human.obs['pct_counts_mt']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
# Per-sample exclusive bounds on log1p(n_genes_by_counts): sample id -> (lower, upper).
vors_2020_log_genes_bounds = {'1': (5.6, 6.2), '2': (6, 7.4), '3': (5.8, 7.4)}

# A cell is kept when it lies inside the bounds of its own sample ...
vors_2020_keep = np.logical_or.reduce([
    ((vors_2020_ctrl_human.obs['Internal sample identifier'] == sample_id) &
     (vors_2020_ctrl_human.obs.log1p_n_genes_by_counts > lower) &
     (vors_2020_ctrl_human.obs.log1p_n_genes_by_counts < upper)).values
    for sample_id, (lower, upper) in vors_2020_log_genes_bounds.items()
])
# ... and has less than 10 % mitochondrial counts.
vors_2020_keep &= (vors_2020_ctrl_human.obs.pct_counts_mt < 10).values

# Subset once and copy, so the following in-place steps work on an independent AnnData rather than a view.
vors_2020_ctrl_human = vors_2020_ctrl_human[vors_2020_keep, :].copy()

In [None]:
sc.pp.filter_genes(vors_2020_ctrl_human, min_counts=1)
sc.pp.normalize_total(vors_2020_ctrl_human)
sc.pp.log1p(vors_2020_ctrl_human)

In [None]:
# PCA -> Harmony correction across samples -> cosine kNN graph on the corrected PCs -> triku.
sc.pp.pca(vors_2020_ctrl_human, n_comps=30, random_state=seed)
sce.pp.harmony_integrate(
    vors_2020_ctrl_human,
    key='Internal sample identifier',
    max_iter_harmony=50,
)
sc.pp.neighbors(
    vors_2020_ctrl_human,
    n_neighbors=int(0.5 * len(vors_2020_ctrl_human) ** 0.5),  # scales with sqrt(number of cells)
    use_rep='X_pca_harmony',
    metric='cosine',
)
tk.tl.triku(vors_2020_ctrl_human)

In [None]:
sc.tl.umap(vors_2020_ctrl_human, min_dist=0.6, random_state=seed)
sc.tl.leiden(vors_2020_ctrl_human, resolution=3, random_state=seed)

In [None]:
assign_cats(vors_2020_ctrl_human, dict_cats=dict_cats_fb, min_score=0.5, quantile_gene_sel=0.95)

In [None]:
# For each category of dict_cats_fb, plot the assigned categories next to those of its genes present in the data.
for key, val in dict_cats_fb.items():
    print(key)
    genes_present = [gene for gene in val if gene in vors_2020_ctrl_human.var_names]
    sc.pl.umap(
        vors_2020_ctrl_human,
        color=['assigned_cats'] + genes_present,
        legend_loc='on data',
        cmap=magma,
        use_raw=False,
        ncols=4,
    )

In [None]:
sc.pl.umap(vors_2020_ctrl_human, color=['PDGFRA', 'LUM', 'DCN', 'COL1A1'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
sc.pl.umap(vors_2020_ctrl_human, color=['Internal sample identifier', 'assigned_cats', 'leiden'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# Keep the cells assigned to 'fibro'. .copy() gives an independent AnnData, so the in-place
# processing of the fibroblast subset does not act on a view of the full dataset.
vors_2020_ctrl_human_fb = vors_2020_ctrl_human[vors_2020_ctrl_human.obs['assigned_cats'].isin(['fibro'])].copy()

In [None]:
sc.pp.filter_genes(vors_2020_ctrl_human_fb, min_counts=1)

In [None]:
# Same embedding steps as for the full dataset, now restricted to the fibroblast subset.
sc.pp.pca(vors_2020_ctrl_human_fb, n_comps=30, random_state=seed)
sce.pp.harmony_integrate(
    vors_2020_ctrl_human_fb,
    key='Internal sample identifier',
    max_iter_harmony=50,
)
sc.pp.neighbors(
    vors_2020_ctrl_human_fb,
    n_neighbors=int(len(vors_2020_ctrl_human_fb) ** 0.5 // 5),  # scales with sqrt(number of cells)
    use_rep='X_pca_harmony',
    metric='cosine',
)
tk.tl.triku(vors_2020_ctrl_human_fb, use_raw=False)

In [None]:
sc.tl.umap(vors_2020_ctrl_human_fb, min_dist=0.4, random_state=seed)
sc.tl.leiden(vors_2020_ctrl_human_fb, resolution=3, random_state=seed)

In [None]:
assign_cats(vors_2020_ctrl_human_fb, dict_cats=dict_cats_clusters, min_score=0.45, quantile_gene_sel=0.7, key_added='cluster', others_name='U')
assign_cats(vors_2020_ctrl_human_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, key_added='axis', 
            intermediate_states=True, diff=0.15, others_name='U')

In [None]:
# One colour per cluster label present (in sorted order); labels missing from dict_colors_human are drawn in grey.
vors_2020_cluster_labels = sorted(set(vors_2020_ctrl_human_fb.obs['cluster']))
vors_2020_ctrl_human_fb.uns['cluster_colors'] = [
    dict_colors_human.get(label, '#bcbcbc') for label in vors_2020_cluster_labels
]

In [None]:
sc.pl.umap(vors_2020_ctrl_human_fb, color=['Internal sample identifier', 'leiden', 'axis', 'cluster'], legend_loc='on data', cmap=magma, use_raw=False, )

In [None]:
del vors_2020_ctrl_human_fb.obs['C3']

In [None]:
# For each cluster of dict_cats_clusters, plot the cluster labels next to those of its genes present in the data.
for key, val in dict_cats_clusters.items():
    print(key)
    genes_present = [gene for gene in val if gene in vors_2020_ctrl_human_fb.var_names]
    sc.pl.umap(
        vors_2020_ctrl_human_fb,
        color=['cluster'] + genes_present,
        legend_loc='on data',
        cmap=magma,
        use_raw=False,
        ncols=4,
    )

In [None]:
plot_score_graph(vors_2020_ctrl_human_fb)

In [None]:
clear_adata(vors_2020_ctrl_human_fb)
vors_2020_ctrl_human_fb.write_h5ad(vors_dir + '/vors_2020_ctrl_human_fb_processed.h5')
vors_2020_ctrl_human.write_h5ad(vors_dir + '/vors_2020_ctrl_human_processed.h5')

In [None]:
vors_2020_ctrl_human_fb = sc.read(vors_dir + '/vors_2020_ctrl_human_fb_processed.h5')

### Vorstandlechner et al. 2021

In [None]:
vors_2021_dir = data_dir + '/Vorstandlechner_2021'

In [None]:
vors_2021_ctrl_human = sc.read(vors_2021_dir + '/vorstandlechner_2021_ctrl_human.h5', cache=True)

In [None]:
sc.pp.filter_genes(vors_2021_ctrl_human, min_counts=1)

In [None]:
# Basic QC filtering
vors_2021_ctrl_human.var['mt'] = vors_2021_ctrl_human.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(vors_2021_ctrl_human, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(vors_2021_ctrl_human, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(vors_2021_ctrl_human, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(vors_2021_ctrl_human, x='total_counts', y='n_genes_by_counts')

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': vors_2021_ctrl_human.obs['Internal sample identifier'], 'y': vors_2021_ctrl_human.obs['log1p_n_genes_by_counts']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': vors_2021_ctrl_human.obs['Internal sample identifier'], 'y': vors_2021_ctrl_human.obs['pct_counts_mt']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
# Per-sample exclusive bounds on log1p(n_genes_by_counts): sample id -> (lower, upper).
vors_2021_log_genes_bounds = {
    'human_skin_1': (4.9, 6),
    'human_skin_2': (6.3, 7.5),
    'human_skin_3': (6.3, 7.7),
}

# A cell is kept when it lies inside the bounds of its own sample ...
vors_2021_keep = np.logical_or.reduce([
    ((vors_2021_ctrl_human.obs['Internal sample identifier'] == sample_id) &
     (vors_2021_ctrl_human.obs.log1p_n_genes_by_counts > lower) &
     (vors_2021_ctrl_human.obs.log1p_n_genes_by_counts < upper)).values
    for sample_id, (lower, upper) in vors_2021_log_genes_bounds.items()
])
# ... and has less than 10 % mitochondrial counts.
vors_2021_keep &= (vors_2021_ctrl_human.obs.pct_counts_mt < 10).values

# Subset once and copy, so the following in-place steps work on an independent AnnData rather than a view.
vors_2021_ctrl_human = vors_2021_ctrl_human[vors_2021_keep, :].copy()

In [None]:
sc.pp.filter_genes(vors_2021_ctrl_human, min_counts=1)
sc.pp.normalize_total(vors_2021_ctrl_human)
sc.pp.log1p(vors_2021_ctrl_human)

In [None]:
# PCA -> Harmony correction across samples -> cosine kNN graph on the corrected PCs -> triku.
sc.pp.pca(vors_2021_ctrl_human, n_comps=30, random_state=seed)
sce.pp.harmony_integrate(
    vors_2021_ctrl_human,
    key='Internal sample identifier',
    max_iter_harmony=50,
)
sc.pp.neighbors(
    vors_2021_ctrl_human,
    n_neighbors=int(0.5 * len(vors_2021_ctrl_human) ** 0.5),  # scales with sqrt(number of cells)
    use_rep='X_pca_harmony',
    metric='cosine',
)
tk.tl.triku(vors_2021_ctrl_human)

In [None]:
sc.tl.umap(vors_2021_ctrl_human, min_dist=0.6, random_state=seed)
sc.tl.leiden(vors_2021_ctrl_human, resolution=3, random_state=seed)

In [None]:
assign_cats(vors_2021_ctrl_human, dict_cats=dict_cats_fb, min_score=0.5, quantile_gene_sel=0.95)

In [None]:
sc.pl.umap(vors_2021_ctrl_human, color=['PDGFRA', 'LUM', 'DCN', 'COL1A1'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# For each category of dict_cats_fb, plot the assigned categories next to those of its genes present in the data.
for key, val in dict_cats_fb.items():
    print(key)
    genes_present = [gene for gene in val if gene in vors_2021_ctrl_human.var_names]
    sc.pl.umap(
        vors_2021_ctrl_human,
        color=['assigned_cats'] + genes_present,
        legend_loc='on data',
        cmap=magma,
        use_raw=False,
        ncols=4,
    )

In [None]:
sc.pl.umap(vors_2021_ctrl_human, color=['Internal sample identifier', 'assigned_cats', 'leiden'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# Keep the cells assigned to 'fibro'. .copy() gives an independent AnnData, so the in-place
# processing of the fibroblast subset does not act on a view of the full dataset.
vors_2021_ctrl_human_fb = vors_2021_ctrl_human[vors_2021_ctrl_human.obs['assigned_cats'].isin(['fibro'])].copy()

In [None]:
sc.pp.filter_genes(vors_2021_ctrl_human_fb, min_counts=1)

In [None]:
# Same embedding steps as for the full dataset, now restricted to the fibroblast subset.
sc.pp.pca(vors_2021_ctrl_human_fb, n_comps=30, random_state=seed)
sce.pp.harmony_integrate(
    vors_2021_ctrl_human_fb,
    key='Internal sample identifier',
    max_iter_harmony=50,
)
sc.pp.neighbors(
    vors_2021_ctrl_human_fb,
    n_neighbors=int(len(vors_2021_ctrl_human_fb) ** 0.5 // 5),  # scales with sqrt(number of cells)
    use_rep='X_pca_harmony',
    metric='cosine',
)
tk.tl.triku(vors_2021_ctrl_human_fb, use_raw=False)

In [None]:
sc.tl.umap(vors_2021_ctrl_human_fb, min_dist=0.4, random_state=seed)
sc.tl.leiden(vors_2021_ctrl_human_fb, resolution=3, random_state=seed)

In [None]:
assign_cats(vors_2021_ctrl_human_fb, dict_cats=dict_cats_clusters, min_score=0.45, quantile_gene_sel=0.7, key_added='cluster', others_name='U')
assign_cats(vors_2021_ctrl_human_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, key_added='axis', 
            intermediate_states=True, diff=0.15, others_name='U')

In [None]:
# One colour per cluster label present (in sorted order); labels missing from dict_colors_human are drawn in grey.
vors_2021_cluster_labels = sorted(set(vors_2021_ctrl_human_fb.obs['cluster']))
vors_2021_ctrl_human_fb.uns['cluster_colors'] = [
    dict_colors_human.get(label, '#bcbcbc') for label in vors_2021_cluster_labels
]

In [None]:
sc.pl.umap(vors_2021_ctrl_human_fb, color=['Internal sample identifier', 'leiden', 'axis', 'cluster'], legend_loc='on data', cmap=magma, use_raw=False, )

In [None]:
del vors_2021_ctrl_human_fb.obs['C3']

In [None]:
# For each cluster of dict_cats_clusters, plot the cluster labels next to those of its genes present in the data.
for key, val in dict_cats_clusters.items():
    print(key)
    genes_present = [gene for gene in val if gene in vors_2021_ctrl_human_fb.var_names]
    sc.pl.umap(
        vors_2021_ctrl_human_fb,
        color=['cluster'] + genes_present,
        legend_loc='on data',
        cmap=magma,
        use_raw=False,
        ncols=4,
    )

In [None]:
plot_score_graph(vors_2021_ctrl_human_fb)

In [None]:
clear_adata(vors_2021_ctrl_human_fb)
vors_2021_ctrl_human_fb.write_h5ad(vors_2021_dir + '/vors_2021_ctrl_human_fb_processed.h5')
vors_2021_ctrl_human.write_h5ad(vors_2021_dir + '/vors_2021_ctrl_human_processed.h5')

In [None]:
vors_2021_ctrl_human_fb = sc.read(vors_2021_dir + '/vors_2021_ctrl_human_fb_processed.h5')

### Xu et al. 2021

In [None]:
xu_2021_dir = data_dir + '/xu_2021'
os.makedirs(xu_2021_dir, exist_ok=True)

In [None]:
xu_2021_ctrl_human = sc.read(xu_2021_dir + '/xu_2021_healthy.h5')

In [None]:
# Gene names listed in dict_rep are swapped for their replacement; every other name is kept as is.
xu_2021_ctrl_human.var_names = [dict_rep.get(gene, gene) for gene in xu_2021_ctrl_human.var_names]

In [None]:
# Basic QC filtering
xu_2021_ctrl_human.var['mt'] = xu_2021_ctrl_human.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(xu_2021_ctrl_human, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(xu_2021_ctrl_human, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(xu_2021_ctrl_human, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(xu_2021_ctrl_human, x='total_counts', y='n_genes_by_counts')

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': xu_2021_ctrl_human.obs['Internal sample identifier'], 'y': xu_2021_ctrl_human.obs['log1p_n_genes_by_counts']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
# Per-sample exclusive bounds on log1p(n_genes_by_counts): sample id -> (lower, upper).
# The previous hand-written expression had the closing parenthesis of the H03 clause misplaced
# (H04/H05 were nested inside it) and was only correct because `&` binds tighter than `|`.
xu_2021_log_genes_bounds = {
    'H01': (7.2, 8),
    'H02': (7.5, 8),
    'H03': (7.3, 8),
    'H04': (7.2, 8),
    'H05': (7.2, 8),
}

# A cell is kept when it lies inside the bounds of its own sample ...
xu_2021_keep = np.logical_or.reduce([
    ((xu_2021_ctrl_human.obs['Internal sample identifier'] == sample_id) &
     (xu_2021_ctrl_human.obs.log1p_n_genes_by_counts > lower) &
     (xu_2021_ctrl_human.obs.log1p_n_genes_by_counts < upper)).values
    for sample_id, (lower, upper) in xu_2021_log_genes_bounds.items()
])
# ... and has less than 7 % mitochondrial counts.
xu_2021_keep &= (xu_2021_ctrl_human.obs.pct_counts_mt < 7).values

# Subset once and copy, so the following in-place steps work on an independent AnnData rather than a view.
xu_2021_ctrl_human = xu_2021_ctrl_human[xu_2021_keep, :].copy()

In [None]:
sc.pp.filter_genes(xu_2021_ctrl_human, min_counts=1)
sc.pp.normalize_total(xu_2021_ctrl_human)
sc.pp.log1p(xu_2021_ctrl_human)

In [None]:
# PCA -> Harmony correction across samples -> cosine kNN graph on the corrected PCs -> triku.
sc.pp.pca(xu_2021_ctrl_human, n_comps=30, random_state=seed)
sce.pp.harmony_integrate(
    xu_2021_ctrl_human,
    key='Internal sample identifier',
    max_iter_harmony=50,
)
sc.pp.neighbors(
    xu_2021_ctrl_human,
    n_neighbors=int(0.5 * len(xu_2021_ctrl_human) ** 0.5 // 2),  # scales with sqrt(number of cells)
    use_rep='X_pca_harmony',
    metric='cosine',
)
tk.tl.triku(xu_2021_ctrl_human)

In [None]:
sc.tl.umap(xu_2021_ctrl_human, min_dist=0.4, random_state=seed)
sc.tl.leiden(xu_2021_ctrl_human, resolution=5, random_state=seed)

In [None]:
# fraction=1 keeps every cell and the call modifies the object in place (copy=False).
# NOTE(review): presumably this is only meant to shuffle the cell order so that no sample
# is systematically drawn on top of the others in the UMAP below — confirm.
sc.pp.subsample(xu_2021_ctrl_human, fraction=1, random_state=0, copy=False)
sc.pl.umap(xu_2021_ctrl_human, color=['leiden', 'Internal sample identifier'], legend_loc='on data')

In [None]:
sc.pl.umap(xu_2021_ctrl_human, color=['leiden', 'LUM', 'PDGFRA', 'COL1A1', 'DCN', 'RGS5', 'MYH11', 'MLANA', 'PMEL', 'HBB'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
assign_cats(xu_2021_ctrl_human, dict_cats=dict_cats_fb, min_score=0.45, quantile_gene_sel=0.95)

In [None]:
sc.pl.umap(xu_2021_ctrl_human, color=['leiden', 'assigned_cats'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
sc.pl.umap(xu_2021_ctrl_human, color=['leiden', 'COL18A1', 'APCDD1', 'SLPI', 'CCL19', 'GGT5',  'CHRDL1', 'GPX3', 'BGN', 'ASPN', 'TNN', 'COL11A1', 'COCH', 
                                     'IGFBP3', 'ANGPTL7', 'SCN7A', 'C2orf40', 'NGFR', 'CLDN1', 'SBSPON', 'FGFBP2', 'DIO3', 'LUZP2'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# For each category of dict_cats_fb, plot the assigned categories next to those of its genes present in the data.
for key, val in dict_cats_fb.items():
    print(key)
    genes_present = [gene for gene in val if gene in xu_2021_ctrl_human.var_names]
    sc.pl.umap(
        xu_2021_ctrl_human,
        color=['assigned_cats'] + genes_present,
        legend_loc='on data',
        cmap=magma,
        use_raw=False,
        ncols=4,
    )

In [None]:
# Keep the cells assigned to 'fibro'. .copy() gives an independent AnnData, so the in-place
# processing of the fibroblast subset does not act on a view of the full dataset.
xu_2021_ctrl_human_fb = xu_2021_ctrl_human[xu_2021_ctrl_human.obs['assigned_cats'].isin(['fibro'])].copy()

In [None]:
sc.pp.filter_genes(xu_2021_ctrl_human_fb, min_counts=1)

In [None]:
# Same embedding steps as for the full dataset, now restricted to the fibroblast subset (50 PCs here).
sc.pp.pca(xu_2021_ctrl_human_fb, n_comps=50, random_state=seed)
sce.pp.harmony_integrate(
    xu_2021_ctrl_human_fb,
    key='Internal sample identifier',
    max_iter_harmony=50,
)
sc.pp.neighbors(
    xu_2021_ctrl_human_fb,
    n_neighbors=int(len(xu_2021_ctrl_human_fb) ** 0.5),  # scales with sqrt(number of cells)
    use_rep='X_pca_harmony',
    metric='cosine',
)
tk.tl.triku(xu_2021_ctrl_human_fb)

In [None]:
sc.tl.umap(xu_2021_ctrl_human_fb, min_dist=0.5, random_state=seed)
sc.tl.leiden(xu_2021_ctrl_human_fb, resolution=7, random_state=seed)

In [None]:
assign_cats(xu_2021_ctrl_human_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.85, key_added='cluster', others_name='U')
assign_cats(xu_2021_ctrl_human_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.5, quantile_gene_sel=0.9,  key_added='axis', 
             intermediate_states=True, diff=0.15, others_name='U')

In [None]:
# One colour per cluster label present (in sorted order); labels missing from dict_colors_human are drawn in grey.
xu_2021_cluster_labels = sorted(set(xu_2021_ctrl_human_fb.obs['cluster']))
xu_2021_ctrl_human_fb.uns['cluster_colors'] = [
    dict_colors_human.get(label, '#bcbcbc') for label in xu_2021_cluster_labels
]

In [None]:
sc.pl.umap(xu_2021_ctrl_human_fb, color=['Internal sample identifier', 'leiden', 'axis', 'cluster'], legend_loc='on data', cmap=magma, use_raw=False, ncols=2)

In [None]:
del xu_2021_ctrl_human_fb.obs['C3']

In [None]:
# For each cluster of dict_cats_clusters, plot the cluster labels next to those of its genes present in the data.
for key, val in dict_cats_clusters.items():
    print(key)
    genes_present = [gene for gene in val if gene in xu_2021_ctrl_human_fb.var_names]
    sc.pl.umap(
        xu_2021_ctrl_human_fb,
        color=['cluster'] + genes_present,
        legend_loc='on data',
        cmap=magma,
        use_raw=False,
        ncols=4,
    )

In [None]:
plot_score_graph(xu_2021_ctrl_human_fb)

In [None]:
clear_adata(xu_2021_ctrl_human_fb)
xu_2021_ctrl_human_fb.write_h5ad(xu_2021_dir + '/xu_2021_ctrl_human_fb_processed.h5')
xu_2021_ctrl_human.write_h5ad(xu_2021_dir + '/xu_2021_ctrl_human_processed.h5')

In [None]:
xu_2021_ctrl_human_fb = sc.read(xu_2021_dir + '/xu_2021_ctrl_human_fb_processed.h5')

## Presence of clusters for each dataset
In this representation we exclude Kim 2021: being a nail dataset, it is interesting to study as a separate case, but not here.

In [None]:
from fb_functions import plot_adata_cluster_properties

In [None]:
# Processed fibroblast AnnData objects, one per dataset, passed to plot_adata_cluster_properties below.
# NOTE(review): vors_2021_ctrl_human_fb is processed above but is not part of this list — confirm this is intended.
list_datasets = [ahlers_2022_young_human_fb, boothby_2021_ctrl_human_fb, 
                 deng_2021_scar_human_fb, gao_2021_ctrl_human_fb, gaydosik_2020_ctrl_human_fb, gur_2022_ctrl_human_fb,
                 he_2020_ctrl_human_fb, hughes_2020_ctrl_human_fb, kim_2020_ctrl_human_fb, 
                 liu_2021_ctrl_human_fb, mariottoni_2021_ctrl_human_fb, 
                 mirizio_2020_scl_human_fb, reynolds_2021_ctrl_human_fb, rindler_2021_ctrl_human_fb,
                 sole_2020_young_human_fb, tabib_2018_ctrl_human_fb, tabib_2021_ctrl_human_fb,
                 theo_2020_ctrl_human_dm_fb, theo_2021_ctrl_human_fb,
                 vors_2020_ctrl_human_fb, xu_2021_ctrl_human_fb]

In [None]:
plot_adata_cluster_properties(dict_cats_clusters=dict_cats_clusters, list_datasets=list_datasets, what='presence', cluster_name='cluster', axis_name='axis')

In [None]:
plot_adata_cluster_properties(dict_cats_clusters=dict_cats_clusters, list_datasets=list_datasets, what='percentage', cluster_name='cluster', axis_name='axis')

In [None]:
plot_adata_cluster_properties(dict_cats_clusters=dict_cats_clusters, list_datasets=list_datasets, what='axis', cluster_name='cluster', axis_name='axis')

## Reevaluate the presence of clusters for each dataset

In [None]:
from fb_functions import plot_adata_cluster_properties

In [None]:
# Processed fibroblast AnnData objects, one per dataset, used by the overview figures below.
# NOTE(review): vors_2021_ctrl_human_fb is processed above but is not part of this list — confirm this is intended.
list_all_datasets = [
    ahlers_2022_young_human_fb, boothby_2021_ctrl_human_fb,
    deng_2021_scar_human_fb, gao_2021_ctrl_human_fb, gaydosik_2020_ctrl_human_fb, gur_2022_ctrl_human_fb,
    he_2020_ctrl_human_fb, hughes_2020_ctrl_human_fb, kim_2020_ctrl_human_fb,
    liu_2021_ctrl_human_fb, mariottoni_2021_ctrl_human_fb,
    mirizio_2020_scl_human_fb, reynolds_2021_ctrl_human_fb, rindler_2021_ctrl_human_fb,
    sole_2020_young_human_fb, tabib_2018_ctrl_human_fb, tabib_2021_ctrl_human_fb,
    theo_2020_ctrl_human_dm_fb, theo_2021_ctrl_human_fb,
    vors_2020_ctrl_human_fb, xu_2021_ctrl_human_fb,
]

# 'Author' label of each dataset, in the same order as list_all_datasets. It is derived from
# list_all_datasets itself (not from list_datasets) so that both lists cannot drift apart.
list_names = [adata.obs['Author'].values[0] for adata in list_all_datasets]

## Plotting all Adatas

In [None]:
fig, axs = plt.subplots(5, 5, figsize=(5 * 4, 5 * 4))
flat_axs = axs.ravel()  # the 25 panels in row-major order

# Blank only the trailing panels that receive no dataset. The previous slice was computed with
# len(axs), which is the number of rows (5) and not the number of panels (25).
for ax in flat_axs[len(list_all_datasets):]:
    ax.set_axis_off()

# One UMAP per dataset, coloured by cluster and titled "<Author> <Year>".
for idx, adata in enumerate(list_all_datasets):
    sc.pl.umap(adata, color=['cluster'], legend_loc='on data', show=False, ax=flat_axs[idx],
               title=str(adata.obs['Author'].iloc[0]) + ' ' + str(int(adata.obs['Year'].iloc[0])),
               size=15, cmap=magma, frameon=False)

## PAGA 

In [None]:
fig, axs = plt.subplots(5, 5, figsize=(5 * 4, 5 * 4))
flat_axs = axs.ravel()  # the 25 panels in row-major order

# Blank only the trailing panels that receive no dataset. The previous slice was computed with
# len(axs), which is the number of rows (5) and not the number of panels (25).
for ax in flat_axs[len(list_all_datasets):]:
    ax.set_axis_off()

# One PAGA graph per dataset, computed on the 'cluster' labels and titled "<Author> <Year>".
for idx, adata in enumerate(list_all_datasets):
    sc.tl.paga(adata, groups='cluster')
    sc.pl.paga(adata, ax=flat_axs[idx], frameon=False, show=False,
               title=str(adata.obs['Author'].iloc[0]) + ' ' + str(int(adata.obs['Year'].iloc[0])))

In [None]:
fig, axs = plt.subplots(5, 5, figsize=(5 * 4, 5 * 4))
flat_axs = axs.ravel()  # the 25 panels in row-major order

# Blank only the trailing panels that receive no dataset. The previous slice was computed with
# len(axs), which is the number of rows (5) and not the number of panels (25).
for ax in flat_axs[len(list_all_datasets):]:
    ax.set_axis_off()

# Same PAGA graphs as above, drawing the 'connectivities_tree' edges as solid lines.
for idx, adata in enumerate(list_all_datasets):
    sc.tl.paga(adata, groups='cluster')
    sc.pl.paga(adata, ax=flat_axs[idx], frameon=False, show=False, solid_edges='connectivities_tree',
               title=str(adata.obs['Author'].iloc[0]) + ' ' + str(int(adata.obs['Year'].iloc[0])))