# Comparison of fibroblast populations

In this notebook we are going to extract and replicate the main populations from different papers where fibroblast populations are described, and find similarities and differences. The premise of this analysis is that many of the populations described in different papers seem not to match, or to be transcriptomically different, but in reality they are quite similar; that is, the main types of populations are indeed shared by the different papers, which should come as no surprise.

Additionally, we will reanalyze the *classic 4* papers, to check that cell populations are assigned as expected. For these papers, UMAPs might vary compared to the ones in our paper, but the main results should still be the same.

## imports

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import scanpy as sc
import scanpy.external as sce
import pandas as pd
import numpy as np
import os
import triku as tk
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from tqdm.notebook import tqdm
import scipy.sparse as spr
# import ray
# import subprocess
# import time
# import scvelo as scv
# import gc
import gseapy as gp

In [None]:
!pip install cellassign

In [None]:
from cellassign import assign_cats

In [None]:
# To print versions of imports 

import types

def imports():
    """Yield the ``__name__`` of every module object bound in the global namespace."""
    module_objects = (
        obj for obj in globals().values()
        if isinstance(obj, types.ModuleType)
    )
    for module_obj in module_objects:
        yield module_obj.__name__

excludes = ['builtins', 'types', 'sys']

imported_modules = [module for module in imports() if module not in excludes]

clean_modules = []

for module in imported_modules:

    sep = '.'  # to handle 'matplotlib.pyplot' cases
    rest = module.split(sep, 1)[0]
    clean_modules.append(rest)

changed_imported_modules = list(set(clean_modules))  # drop duplicates

pip_modules = !pip freeze  # you could also use `!conda list` with anaconda

for module in pip_modules:
    try:
        name, version = module.split('==')
        if name in changed_imported_modules:
            print(name + '\t' + version)
    except:
        pass

In [None]:
seed = 0
%store seed

In [None]:
# Palettes for UMAP gene expression

# Sample 80 evenly spaced RGBA colours from matplotlib's 'magma' colormap.
magma = [plt.get_cmap('magma')(i) for i in np.linspace(0,1, 80)]
# Replace the first (lowest-value) colour with light grey.
magma[0] = (0.88, 0.88, 0.88, 1)
# Build the final colormap from the first 65 sampled colours only, i.e. the last
# 15 samples of the 'magma' range are dropped.
magma = mpl.colors.LinearSegmentedColormap.from_list("", magma[:65])

%store magma

In [None]:
# Marker genes per broad cell category. Passed as `dict_cats` to `assign_cats` on the
# whole datasets; the resulting `assigned_cats` labels are used to select fibroblasts.
# NOTE(review): 'CTNNB1' is listed twice under 'F' — confirm whether `assign_cats`
# weights repeated genes.
dict_cats_fb = {'fibro': ['LUM', 'PDGFRA', 'COL1A1', 'DCN', 'SFRP2', 'APOE'], 
                'melanocyte': ['MLANA', 'PMEL', 'TRIM63', 'QPCT', 'PLP1', 'TYRP1'], 
                'peri': ['RGS5', 'MYL9', 'NDUFA4L2'], 
                'eritro': ['HBB', 'HBA2', 'HBA1', 'HBD'],
                'muscle': ['DES', 'PCP4', 'ACTG2', 'SYNPO2', 'PRUNE2', 'SORBS1', 'P2RX1'],
                'immune': ['TPSB2', 'TPSAB1', 'HLA-DRA', 'FCER1G', 'CD74'], 
                'endo': ['PLVAP', 'CLDN5', 'ACKR1', 'LMCD1', 'NPDC1', 'A2M', 
                         'PECAM1', 'CLU', 'VWF', 'CD74', 'RAMP2', 'IFI27', 'GNG11'], 
                'lymph': ['CCL21', 'LYVE1', 'CLDN5'],
                'kerato': ['DMKN', 'KRT1', 'KRT5', 'KRT14', 'AQP3', 'SFN' ], 
                'krt7/8/19': ['S100A1','KRT19','PPP1R1B','KRT7','KRT8','SNORC','NCALD','CA6',
                              'AKR1C2','TPD52L1','PDK3','ROPN1B','QDPR'],
                'mt': ['MTND2P28', 'MTND4P12', 'MTCO1P40', 'ADAM33', 'RN7SL2', 'MTRNR2L6'], 
                'neuro': ['GPM6B','PLP1','S100B','SCN7A','NRXN1','GFRA3','MPZ'],
                'T cells': ['CD52', 'CD3D', 'TRAC'],
                'B cells': ['IGHM','CD74','CD79A','NIBAN3','TCL1A','NCF1','MS4A1', 'BTK', 'CD19'],
                'APC': ['HLA-DQA1', 'HLA-DRB6', 'TYROBP', 'FCER1G', 'AIF1'], 
                'mast_cells': ['IL1RL1', 'CPA3', 'HPGDS', 'TPSB2', 'HPGD', 'RGS13', 'CTSG', 
                               'TPSAB1', 'GATA2'],
                'F': ['B4GALT1', 'TMSB4X', 'PPP1CB', 'WTAP', 'PTPRS', 'CTNNB1', 'INSR', 'BICC1', 'CTNNB1'], 
               }

# Marker genes for the axes 'A', 'B' and 'C'. Passed as `dict_cats` to `assign_cats`
# (with `key_added='axis'`) on the fibroblast subsets.
# NOTE(review): some genes are repeated within a list ('PTGIS' and 'COL14A1' in 'A';
# 'APOC1' and 'EBF1' in 'B') — confirm whether `assign_cats` weights repeated genes.
dict_cats_axes = {'A': ['PTGIS', 'SFRP2', 'MMP2', 'RGCC', 'COL14A1', 'AQP1', 'PTGIS', 'QPCT', 'ELN', 'COL14A1', 'LEPR', 
                        'ISM1', 'CES1', 'WIF1', 'RECK', 'SGCA', 'FBN1', 'NBL1', 'CTSB', 'COL5A1', 'SMOC2', 'SGCG'],
                  'B': ['GGT5', 'APOE', 'APOC1', 'CYGB', 'C7', 'IGFBP7', 'TNFSF13B', 'APOC1', 'RARRES2', 'CCDC146', 'CXCL12', 
                        'ITM2A', 'EBF1', 'CCL2', 'IGFBP3', 'EBF1', 'CXCL2', 'EFEMP1', 'TMEM176A', 'C3', 'EGR1', 'LGALS3BP', 
                        'BST2', 'ANGPTL4', 'ABCA8', ],
                  'C': ['TNN', 'TNMD', 'ASPN', 'CYP1B1', 'GPC1', 'PPP1R14A', 'GPM6B', 'COL11A1', 'DKK3', 'OGN', 'SDC1', 
                        'PDE1A', 'MDK', 'NRP2', 'POSTN', 'F2R', 'KIF26B', 'TENM3', 'ALX4', 'PMEPA1', 'FIBIN', 'PCDH15',                        ]}

# provisional_manual_dict_cats_clusters
# Marker genes per cluster label. Passed as `dict_cats` to `assign_cats` (with
# `key_added='cluster'`) on the fibroblast subsets. The last four entries
# ('Glial', 'Melanocyte', 'Endo', 'Peri') are non-lettered labels kept alongside
# the lettered A/B/C/D/T clusters.
dict_cats_clusters = {
                     'A1': ['SLPI', 'C1QTNF3', 'CD70', 'SEMA3C', 'TSPAN8', 'DPP4', 'CHRDL1',
                             'SEMA3B', 'CTHRC1', 'WISP2', 'CCN5', 'PI16', 'QPCT'],
                     'A2': ['CLEC2A', 'COL13A1', 'COL23A1', 'COL6A5', 'HSPB3', 'NPTX2', 'COMP', 'APCDD1', 'NKD2',
                            'PHACTR3', 'AKAP6', 'LEPR', 'STC2', 'WIF1', 'CDC42EP3', 'COL18A1',],
                     'A3': ['WIF1', 'HAS1', 'SGCG', 'CORIN', 'ACKR4', 'C1QTNF3', 'CFD', 'QPCT', 'SGCA', ], # 'HAS1', 'CORIN', 'SGCG', 'F13A1', 'RETREG1',
                     'A4': ['SFRP4', 'HSD3B7', 'CD70', 'C1QTNF3', 'PRG4', 'GLIPR2', 'PAMR1', 'FNDC1',],

                     'B1': ['CXCL2', 'SPSB1', 'CXCL1', 'IL6', 'MYOC', 'CCL2', 'CXCL3', 'TNFSF14', 'MEDAG', 'ZC3H12A',
                            'C11orf96', 'IRF8', 'ITM2A', 'KDM6B', 'SOD2', 'CDKN1A', 'CSRNP1', 'CSRP2', 'ERRFI1', 'FMO1', ],
                     'B2': ['CCL19', 'C7', 'IGFBP3', 'RBP5', 'CCDC146', 'CH25H', 'TNFSF13B', 'CTSH', 'CD74' ],
                     'B3': ['CHRDL1', 'GPX3', 'GPC3', 'ITM2A', 'MGP', 'TSPAN8', 'ADH1B', 'C7',
                            'GGT5', 'MGST1', 'ROBO3', 'DNASE1L3', 'AADAC', 'HHIP-AS1', ],  # ITM2A, GPC3

                     'C1': ['DPEP1', 'UGT3A2', 'COL11A1', 'MME', 'RBP4', 'MYL4', 'WFDC1', 'CYYR1', 'EDNRA',
                            'MEF2C', 'TNMD', 'CDH11', 'COL21A1', 'GPC3', 'KIF26B', 'PPP1R14A', 'EDIL3', 'SLC26A7' ],
                     'C2': ['COCH', 'SLITRK6', 'MKX', 'CHADL', 'RSPO4', 'CRABP1', 'NDNF', 'SLC22A16', 'FIBIN', 'TNN', 'CCK',
                            'RHPN1', 'MAB21L2', 'ASPN', 'OGN', 'PLXDC1', 'SPARCL1', 'PLPP5', 'WNT10A', 'NECAB1', 'TNMD' ], # 'SLITRK6', 'MKX'
                     'C3': ['BGN', 'F2R', 'TNN', 'POSTN', 'GPM6B', 'PRSS23', 'FAP', 'EMID1', ],
                     'C5': ['IGFBP3', 'LUZP2', 'CENPW', 'TFAP2A', 'TPD52', 'LEF1', 'CPE', 'BMP7', 'DIO3', 'ACTR3B',
                            'BAMBI', 'INHBA', 'SERPINE2', 'WNT5A', 'BMP4', 'STMN1', 'SEMA3G', 'NOG', 'DCXR', 'EDNRA'],

                     'D1': ['ANGPTL7', 'SCN7A', 'C2orf40', 'APOD', 'CLDN1', 'CYP1B1', 'FGFBP2', 'MARCKSL1', 'PODNL1', 'KLK1', 'TM4SF1',
                             'NRP2', 'COL8A1', 'P2RY14', 'SFRP4', 'MRAS', 'GPC3', 'ETV1', 'TIAM1', 'SPARCL1'],
                     'D2': ['NGFR', 'CLDN1', 'SBSPON', 'TAGLN', 'TM4SF1', 'SLC2A1', 'SLC22A3', 'TNNC1', 'BNC2', 'KLF5', 'C2orf40', 'AQP3',
                             'CSRP1', 'PALMD', 'SFRP4', 'PLEKHA4', 'NR2F2', 'ISYNA1', 'SCN7A', 'IGFBP6'],
                     'D3': ['FGFBP2', 'CPE', 'OLFML2A', 'SLC22A3', 'IGFBP2', 'SPON2', 'APOD', 'EGR2',
                            'RAMP1', 'IGF1', 'KLK1', 'RGMA', 'PDGFD', 'PRSS23', 'TIMP3',],

                     'T1': ['ASPN', 'MOB3B', 'PLEKHH2', 'PCDH15', 'RAI2', 'SPARCL1', 'TMEM176B', 'COL6A6', 'TMEM176A',
                            'CYP1B1', 'MCTP2', 'BCL11A', 'LMO2', 'NTRK3', 'GLI2', 'PLPP4', 'CHN1', 'PDGFRL', 'IDE', 'LGR6', 'PI16', 'TELO2'],

                     # 'NRXN1' was misspelled as 'NRNX1', so it could never match a gene name.
                     'Glial': ['SOX10', 'S100B', 'NRXN1', 'L1CAM', 'AATK', 'SCN7A', 'GFRA3'],
                     'Melanocyte': ['MLANA', 'PMEL', 'TRIM63', 'QPCT', 'PLP1', 'TYRP1'],
                     'Endo': ['PLVAP', 'CLDN5', 'ACKR1', 'LMCD1', 'NPDC1', 'A2M',
                         'PECAM1', 'CLU', 'VWF', 'CD74', 'RAMP2', 'IFI27', 'GNG11'],
                      'Peri': ['RGS5', 'ITGA7', 'GJA4', 'MYH11', 'ANGPT2', 'LAMA5'],
                    }

In [None]:
# Hex colour per cluster label, looked up when `uns['cluster_colors']` is built for
# each dataset (labels without an entry here fall back to '#bcbcbc' at those call sites).
# 'U' is the label passed as `others_name='U'` to `assign_cats`.
dict_colors = {'A1': '#c93038', 'A2': '#de6a38', 'A3': '#ffad3b', 'A4': '#852d66',
               'B1': '#b4d645', 'B2': '#51c43f', 'B3': '#309c63',
               'C1': '#93dfe4', 'C2': '#63c2c9', 'C3': '#4c93ad', 'C5': '#264f6e',
               'D1': '#fcbf8a', 'D2': '#b58057', 'D3': '#956642', 
               'T1': '#29c297', 'U': '#dedede'}

%store dict_colors

In [None]:
dict_rep = {'CCN5': 'WISP2', 'ECRG4': 'C2orf40'}

In [None]:
mpl.rcParams['figure.dpi'] = 150

In [None]:
def plot_score_graph(adatax):
    """Bar-plot, for each assigned cluster, the score of the cells assigned to it.

    For every category of ``adatax.obs['cluster']``, the cells labelled with that
    category are paired with their value in ``adatax.obs[f'cluster_{category}']``.
    Categories without such a score column are skipped and their cells keep NaN
    in both output columns.

    Parameters
    ----------
    adatax
        AnnData-like object providing ``obs`` (with a categorical ``'cluster'``
        column and the per-cluster ``'cluster_<name>'`` score columns),
        ``obs_names`` and ``uns['cluster_colors']``.

    Returns
    -------
    None
        The plot is drawn by ``sns.barplot``.
    """
    obs = adatax.obs
    df_cats_own = pd.DataFrame(index=adatax.obs_names, columns=['clusters', 'score'])
    for cluster in obs['cluster'].cat.categories:
        score_column = f'cluster_{cluster}'
        if score_column not in obs.columns:
            # Explicit skip (instead of a bare `except: pass`) so that any
            # genuine error below still surfaces.
            continue
        # Select the cells straight from `obs`; no AnnData view is needed.
        cells = obs.index[(obs['cluster'] == cluster).values]
        df_cats_own.loc[cells, 'score'] = obs.loc[cells, score_column]
        df_cats_own.loc[cells, 'clusters'] = cluster

    # The frame is created with object columns; make the scores numeric for plotting.
    df_cats_own['score'] = df_cats_own['score'].astype(float)
    df_cats_own = df_cats_own.sort_values('clusters')
    sns.barplot(x='clusters', y='score', data=df_cats_own, palette=adatax.uns['cluster_colors'])

In [None]:
data_dir = os.getcwd() + '/data/'
print(data_dir)
%store data_dir

**IMPORTANT: I am running this analysis on a computer with ~500 GB of RAM. I will load many datasets at once, which might be too much for some computers. I took this decision consciously, to have as much info available at any time as possible. If you cannot run all the analysis at once, you can run it in parts.**

## data extraction and processing

### Deng et al. 2021

In [None]:
deng_dir = data_dir + '/deng_2021'

In [None]:
# Read the three 10x matrix folders (NF1-NF3) found under `deng_dir`.
adata_deng_scar_1 = sc.read_10x_mtx(deng_dir + '/NF1_matrix')
adata_deng_scar_2 = sc.read_10x_mtx(deng_dir + '/NF2_matrix')
adata_deng_scar_3 = sc.read_10x_mtx(deng_dir + '/NF3_matrix')

# Concatenate them into a single AnnData; the batch categories are Normal_1..Normal_3.
# NOTE(review): despite `scar` in the variable name, the samples are labelled as normal.
adata_deng_scar = sc.AnnData.concatenate(adata_deng_scar_1, adata_deng_scar_2, adata_deng_scar_3, 
                                         batch_categories=['Normal_1', 'Normal_2', 'Normal_3'])

In [None]:
# Rename gene symbols that have an entry in `dict_rep` (e.g. 'CCN5' -> 'WISP2'); all other names are kept.
adata_deng_scar.var_names = [dict_rep[i] if i in dict_rep else i for i in adata_deng_scar.var_names ]

In [None]:
# Basic QC filtering
adata_deng_scar.var['mt'] = adata_deng_scar.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata_deng_scar, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(adata_deng_scar, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(adata_deng_scar, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata_deng_scar, x='total_counts', y='n_genes_by_counts')

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': adata_deng_scar.obs['batch'], 'y': adata_deng_scar.obs['log1p_n_genes_by_counts']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
# Keep cells with 1000 < n_genes_by_counts < 4500 and less than 18 % mitochondrial counts.
# `.copy()` turns the filtered view into an independent AnnData, so the in-place
# scanpy steps that follow do not operate on a view of the unfiltered object.
adata_deng_scar = adata_deng_scar[((adata_deng_scar.obs.n_genes_by_counts < 4500) &
                                   (adata_deng_scar.obs.n_genes_by_counts > 1000) &
                                   (adata_deng_scar.obs.pct_counts_mt < 18)).values, :].copy()

In [None]:
sc.pp.filter_genes(adata_deng_scar, min_counts=1)
sc.pp.normalize_total(adata_deng_scar)
sc.pp.log1p(adata_deng_scar)

In [None]:
# 50-component PCA, seeded for reproducibility.
sc.pp.pca(adata_deng_scar, random_state=seed, n_comps=50)
# Harmony integration across the 'batch' annotation; the neighbours step below reads 'X_pca_harmony'.
sce.pp.harmony_integrate(adata_deng_scar, key='batch', max_iter_harmony=50)
# Cosine kNN graph on the Harmony embedding; n_neighbors evaluates to floor(0.5 * sqrt(n_cells) / 2).
sc.pp.neighbors(adata_deng_scar, use_rep='X_pca_harmony', n_neighbors=int(0.5 * len(adata_deng_scar) ** 0.5 // 2), metric='cosine')
# Run triku on the object (presumably feature selection based on the neighbour graph — confirm against triku docs).
tk.tl.triku(adata_deng_scar)

In [None]:
sc.tl.umap(adata_deng_scar, min_dist=0.2, random_state=seed)
sc.tl.leiden(adata_deng_scar, resolution=1.3, random_state=seed)

In [None]:
# Subsampling with fraction=1 keeps every cell (presumably used to shuffle the cell
# order before plotting — confirm). Use the notebook-wide `seed` like every other
# stochastic step instead of a separate hard-coded 0.
sc.pp.subsample(adata_deng_scar, fraction=1, random_state=seed, copy=False)
sc.pl.umap(adata_deng_scar, color=['leiden', 'batch'], legend_loc='on data')

In [None]:
sc.pl.umap(adata_deng_scar, color=['leiden', 'LUM', 'PDGFRA', 'COL1A1', 'DCN', 'RGS5', 'KRT5', 'DMKN'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
sc.pl.umap(adata_deng_scar, color=['leiden', 'C2orf40', 'CDH19', 'ANGPTL7','PLEKHB1','ENTPD2', 
                                   'SLC2A1', 'CLDN1', 'TNNT2', 'C19orf33', 'SFRP5', 'WNT6', ], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
assign_cats(adata_deng_scar, dict_cats=dict_cats_fb)

In [None]:
sc.pl.umap(adata_deng_scar, color=['leiden', 'assigned_cats'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# Keep cells labelled 'fibro' or 'unassigned'. `.copy()` makes the subset a standalone
# AnnData, because the following cells modify it in place (gene filtering, PCA, ...).
adata_deng_scar_fb = adata_deng_scar[adata_deng_scar.obs['assigned_cats'].isin(['fibro', 'unassigned'])].copy()

In [None]:
sc.pp.filter_genes(adata_deng_scar_fb, min_counts=1)

In [None]:
sc.pp.pca(adata_deng_scar_fb, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(adata_deng_scar_fb, key='batch', max_iter_harmony=50)
sc.pp.neighbors(adata_deng_scar_fb, use_rep='X_pca_harmony', n_neighbors=int(0.5 * len(adata_deng_scar_fb) ** 0.5 // 5), metric='cosine')
tk.tl.triku(adata_deng_scar_fb)

In [None]:
sc.tl.umap(adata_deng_scar_fb, min_dist=0.25, random_state=seed)
sc.tl.leiden(adata_deng_scar_fb, resolution=18, random_state=seed)

In [None]:
# NOTE(review): a bare name only displays the function's repr; this looks like a leftover inspection cell.
assign_cats

In [None]:
# First pass: label cells using the `dict_cats_clusters` marker sets; the result is read
# later from obs['cluster'], with 'U' as the label for the remaining cells.
assign_cats(adata_deng_scar_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.95, key_added='cluster', others_name='U')
# Second pass: grouped by obs['cluster'], label with the `dict_cats_axes` marker sets into obs['axis'].
# NOTE(review): the exact meaning of `intermediate_states` / `diff` is defined in cellassign — not visible here.
assign_cats(adata_deng_scar_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, key_added='axis', intermediate_states=True, diff=0.15, others_name='U')

In [None]:
# One colour per cluster label present in obs['cluster'], in sorted label order;
# labels missing from `dict_colors` get the grey '#bcbcbc'.
adata_deng_scar_fb.uns['cluster_colors'] = [dict_colors[i] if i in dict_colors else '#bcbcbc' for 
                                      i in sorted(set(adata_deng_scar_fb.obs['cluster']))]

In [None]:
sc.pl.umap(adata_deng_scar_fb, color=['leiden', 'axis', 'cluster', 'batch'], legend_loc='on data', cmap=magma, use_raw=False, ncols=2)

In [None]:
# Drop the 'C3' column from obs.
# NOTE(review): `del` raises KeyError if the column is absent, so this cell fails when re-run;
# where 'C3' is created is not visible here (presumably by assign_cats — confirm).
del adata_deng_scar_fb.obs['C3']

In [None]:
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(adata_deng_scar_fb, color=['cluster'] + [i for i in val if i in adata_deng_scar_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
adata_deng_scar_fb.write_h5ad(deng_dir + '/adata_deng_scar_fb.h5')

In [None]:
adata_deng_scar_fb = sc.read(deng_dir + '/adata_deng_scar_fb.h5')

### Gao et al. 2021

In [None]:
gao_dir = data_dir + '/gao_2021'

In [None]:
adata_gao = sc.read(gao_dir + '/gao_2021.loom')
adata_gao = adata_gao[adata_gao.obs['Patient'].isin(['Ctrl1', 'Ctrl2', 'Ctrl3'])]

In [None]:
# Basic QC filtering
adata_gao.var['mt'] = adata_gao.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata_gao, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(adata_gao, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(adata_gao, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata_gao, x='total_counts', y='n_genes_by_counts')

In [None]:
adata_gao = adata_gao[((adata_gao.obs.n_genes_by_counts < 7000) & 
                                    (adata_gao.obs.n_genes_by_counts > 500)).values, :]
adata_gao = adata_gao[adata_gao.obs.pct_counts_mt < 40, :]

In [None]:
sc.pp.filter_genes(adata_gao, min_counts=1)
sc.pp.normalize_total(adata_gao)
sc.pp.log1p(adata_gao)

In [None]:
sc.pp.pca(adata_gao, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(adata_gao, key='Patient', max_iter_harmony=50)
sc.pp.neighbors(adata_gao, use_rep='X_pca_harmony',  n_neighbors=int(0.5 * len(adata_gao) ** 0.5 // 2), metric='cosine')
tk.tl.triku(adata_gao)

In [None]:
sc.tl.umap(adata_gao, min_dist=0.1, random_state=seed)
sc.tl.leiden(adata_gao, resolution=1.5, random_state=seed)

In [None]:
sc.pl.umap(adata_gao, color=['leiden', 'Patient'], legend_loc='on data')

In [None]:
sc.pl.umap(adata_gao, color=['leiden', 'LUM', 'PDGFRA', 'COL1A1', 'DCN'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
assign_cats(adata_gao, dict_cats=dict_cats_fb, min_score=0.4,  quantile_gene_sel=0.4)

In [None]:
sc.pl.umap(adata_gao, color=['leiden', 'assigned_cats'], legend_loc='on data', 
           cmap=magma, use_raw=False)

In [None]:
adata_gao_fb = adata_gao[adata_gao.obs['assigned_cats'] == 'fibro']

In [None]:
sc.pp.filter_genes(adata_gao_fb, min_counts=1)

In [None]:
sc.pp.pca(adata_gao_fb, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(adata_gao_fb, key='Patient', max_iter_harmony=50)
sc.pp.neighbors(adata_gao_fb, use_rep='X_pca_harmony',  n_neighbors=int(0.5 * len(adata_gao_fb) ** 0.5 // 2), metric='cosine')
tk.tl.triku(adata_gao_fb)

In [None]:
sc.tl.umap(adata_gao_fb, min_dist=0.5, random_state=seed)
sc.tl.leiden(adata_gao_fb, resolution=10, random_state=seed)

In [None]:
assign_cats(adata_gao_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.75, key_added='cluster', others_name='U')
assign_cats(adata_gao_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.25, quantile_gene_sel=0.6,
            key_added='axis', intermediate_states=True, diff=0.15, others_name='U')

In [None]:
adata_gao_fb.uns['cluster_colors'] = [dict_colors[i] if i in dict_colors else '#bcbcbc' for 
                                      i in sorted(set(adata_gao_fb.obs['cluster']))]

In [None]:
sc.pl.umap(adata_gao_fb, color=['Patient', 'leiden', 'axis', 'cluster'], legend_loc='on data', 
           cmap=magma, use_raw=False, ncols=2)

The unassigned cluster doesn't show any relevant DEGs, so it is just "rubbish".

In [None]:
del adata_gao_fb.obs['C3']
del adata_gao_fb.obs['C2']

In [None]:
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(adata_gao_fb, color=['cluster'] + [i for i in val if i in adata_gao_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
plot_score_graph(adata_gao_fb)

In [None]:
adata_gao_fb.write_h5ad(gao_dir + '/gao_2021_fb.h5')

In [None]:
adata_gao_fb = sc.read(gao_dir + '/gao_2021_fb.h5')

### Gaydosik et al. 2020

In [None]:
gaydosik_dir = data_dir + '/gaydosik_2020'

In [None]:
adata_gaydosik_CTCL = sc.read_h5ad(gaydosik_dir + '/gaydosik_2020_CTCL.h5ad')
adata_gaydosik_HC = sc.read_h5ad(gaydosik_dir + '/gaydosik_2020_HC.h5ad')

In [None]:
# Basic QC filtering
adata_gaydosik_HC.var['mt'] = adata_gaydosik_HC.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata_gaydosik_HC, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(adata_gaydosik_HC, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(adata_gaydosik_HC, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata_gaydosik_HC, x='total_counts', y='n_genes_by_counts')

In [None]:
adata_gaydosik_HC = adata_gaydosik_HC[((adata_gaydosik_HC.obs.n_genes_by_counts < 5500) & 
                                    (adata_gaydosik_HC.obs.n_genes_by_counts > 400)).values, :]
adata_gaydosik_HC = adata_gaydosik_HC[adata_gaydosik_HC.obs.pct_counts_mt < 30, :]

In [None]:
sc.pp.filter_genes(adata_gaydosik_HC, min_counts=1)
sc.pp.normalize_total(adata_gaydosik_HC)
sc.pp.log1p(adata_gaydosik_HC)

In [None]:
sc.pp.pca(adata_gaydosik_HC, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(adata_gaydosik_HC, key='sample', max_iter_harmony=50)
sc.pp.neighbors(adata_gaydosik_HC, use_rep='X_pca_harmony',  n_neighbors=int(0.5 * len(adata_gaydosik_HC) ** 0.5 // 2), metric='cosine')
tk.tl.triku(adata_gaydosik_HC)

In [None]:
sc.tl.umap(adata_gaydosik_HC, min_dist=0.1, random_state=seed)
sc.tl.leiden(adata_gaydosik_HC, resolution=0.3, random_state=seed)

In [None]:
sc.pl.umap(adata_gaydosik_HC, color=['leiden', 'sample'], legend_loc='on data')

In [None]:
sc.pl.umap(adata_gaydosik_HC, color=['leiden', 'LUM', 'PDGFRA', 'COL1A1', 'DCN'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
assign_cats(adata_gaydosik_HC, dict_cats=dict_cats_fb)

In [None]:
sc.pl.umap(adata_gaydosik_HC, color=['leiden', 'assigned_cats'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
adata_gaydosik_HC_fb = adata_gaydosik_HC[adata_gaydosik_HC.obs['assigned_cats'] == 'fibro']

In [None]:
sc.pp.filter_genes(adata_gaydosik_HC_fb, min_counts=1)

In [None]:
sc.pp.pca(adata_gaydosik_HC_fb, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(adata_gaydosik_HC_fb, key='sample', max_iter_harmony=50)
sc.pp.neighbors(adata_gaydosik_HC_fb, use_rep='X_pca_harmony',  n_neighbors=int(0.5 * len(adata_gaydosik_HC_fb) ** 0.5 // 2), metric='cosine')
tk.tl.triku(adata_gaydosik_HC_fb)

In [None]:
sc.tl.umap(adata_gaydosik_HC_fb, min_dist=0.25, random_state=seed)
sc.tl.leiden(adata_gaydosik_HC_fb, resolution=6, random_state=seed)

In [None]:
assign_cats(adata_gaydosik_HC_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.9, key_added='cluster', others_name='U')
assign_cats(adata_gaydosik_HC_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, key_added='axis', 
             intermediate_states=True, diff=0.15, others_name='U')

In [None]:
adata_gaydosik_HC_fb.uns['cluster_colors'] = [dict_colors[i] if i in dict_colors else '#bcbcbc' for 
                                      i in sorted(set(adata_gaydosik_HC_fb.obs['cluster']))]

In [None]:
sc.pl.umap(adata_gaydosik_HC_fb, color=['leiden', 'sample', 'axis', 'cluster'], legend_loc='on data', 
           cmap=magma, use_raw=False, ncols=2)

In [None]:
del adata_gaydosik_HC_fb.obs['C3']

In [None]:
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(adata_gaydosik_HC_fb, color=['cluster'] + [i for i in val if i in adata_gaydosik_HC_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
plot_score_graph(adata_gaydosik_HC_fb)

In [None]:
adata_gaydosik_HC_fb.write_h5ad(gaydosik_dir + '/gaydosik_2020_HC_fb.h5ad')

In [None]:
adata_gaydosik_HC_fb = sc.read(gaydosik_dir + '/gaydosik_2020_HC_fb.h5ad')

### He et al. 2020

In [None]:
he_dir = data_dir + '/He_2020'

In [None]:
adata_he = sc.read_loom(he_dir + '/He2020.loom')
adata_he.var_names_make_unique()

In [None]:
# Replace CCN5 by WISP2 because it is a key gene
adata_he.var_names = [dict_rep[i] if i in dict_rep else i for i in adata_he.var_names]

In [None]:
# Basic QC filtering
adata_he.var['mt'] = adata_he.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata_he, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(adata_he, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(adata_he, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata_he, x='total_counts', y='n_genes_by_counts')

In [None]:
adata_he = adata_he[adata_he.obs.n_genes_by_counts < 5000, :]
adata_he = adata_he[adata_he.obs.n_genes_by_counts > 600, :]
adata_he = adata_he[adata_he.obs.pct_counts_mt < 30, :]

In [None]:
sc.pp.filter_genes(adata_he, min_counts=1)
sc.pp.normalize_total(adata_he)
sc.pp.log1p(adata_he)

In [None]:
sc.pp.pca(adata_he, random_state=seed, n_comps=30)
sc.pp.neighbors(adata_he, random_state=seed, n_neighbors=int(0.5 * len(adata_he) ** 0.5 // 4), metric='cosine')
tk.tl.triku(adata_he)

In [None]:
sc.tl.umap(adata_he, min_dist=0.3, random_state=seed)
sc.tl.leiden(adata_he, resolution=3, random_state=seed)

In [None]:
sc.pl.umap(adata_he, color=['leiden', 'LUM', 'PDGFRA', 'COL1A1', 'DCN'], legend_loc='on data', cmap=magma, use_raw=False, ncols=2)

In [None]:
assign_cats(adata_he, dict_cats=dict_cats_fb, min_score=0.3, quantile_gene_sel=0.2)

In [None]:
sc.pl.umap(adata_he, color=['leiden', 'assigned_cats', 'CDH19', 'ANGPTL7', 'PLEKHB1', 'ENTPD2', 'C2orf40', 
                           'SLC2A1', 'CLDN1', 'TNNT2', 'C19orf33', 'SFRP5'], legend_loc='on data', cmap=magma, use_raw=False, ncols=2)

In [None]:
sc.pl.umap(adata_he, color=['leiden', 'assigned_cats'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
adata_he_fb = adata_he[adata_he.obs['assigned_cats'].isin(['fibro', 'neuro', 'muscle'])]

In [None]:
sc.pp.filter_genes(adata_he_fb, min_counts=1)

In [None]:
sc.pp.pca(adata_he_fb, random_state=seed, n_comps=50)
sc.pp.neighbors(adata_he_fb, random_state=seed, n_neighbors=int(0.5 * len(adata_he_fb) ** 0.5 // 2), metric='cosine')
tk.tl.triku(adata_he_fb)

In [None]:
sc.tl.umap(adata_he_fb, min_dist=0.2, random_state=seed)
sc.tl.leiden(adata_he_fb, resolution=15, random_state=seed)

In [None]:
assign_cats(adata_he_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.95, key_added='cluster')
assign_cats(adata_he_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.35, key_added='axis', 
            quantile_gene_sel=0.4, intermediate_states=True, diff=0.15)

In [None]:
adata_he_fb.uns['cluster_colors'] = [dict_colors[i] if i in dict_colors else '#bcbcbc' for 
                                      i in sorted(set(adata_he_fb.obs['cluster']))]

In [None]:
sc.pl.umap(adata_he_fb, color=['leiden', 'axis', 'cluster'], legend_loc='on data', 
           cmap=magma, use_raw=False, )

In [None]:
sc.pl.umap(adata_he_fb, color=['DMKN', 'KRT5', 'KRT14'], legend_loc='on data', 
           cmap=magma, use_raw=False, )

In [None]:
# Remove the cells whose obs['cluster'] label is one of the listed labels.
# NOTE(review): `dict_cats_clusters` names its endothelial entry 'Endo' (capital E); if the
# 'cluster' labels come from those keys, 'endo' never matches and those cells are kept — confirm intent.
adata_he_fb = adata_he_fb[~ adata_he_fb.obs['cluster'].isin(['Glial', 'unassigned', 'endo'])]

In [None]:
sc.pp.filter_genes(adata_he_fb, min_counts=1)

In [None]:
sc.pp.pca(adata_he_fb, random_state=seed, n_comps=50)
sc.pp.neighbors(adata_he_fb, random_state=seed, n_neighbors=int(0.5 * len(adata_he_fb) ** 0.5 // 2), metric='cosine')
tk.tl.triku(adata_he_fb)

In [None]:
sc.tl.umap(adata_he_fb, min_dist=0.4, random_state=seed)
sc.tl.leiden(adata_he_fb, resolution=7, random_state=seed)

In [None]:
assign_cats(adata_he_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.75, key_added='cluster', others_name='U')
assign_cats(adata_he_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.35, key_added='axis', 
            quantile_gene_sel=0.4, intermediate_states=True, diff=0.15, others_name='U')

In [None]:
adata_he_fb.uns['cluster_colors'] = [dict_colors[i] if i in dict_colors else '#bcbcbc' for 
                                      i in sorted(set(adata_he_fb.obs['cluster']))]

In [None]:
sc.pl.umap(adata_he_fb, color=['leiden', 'axis', 'cluster'], legend_loc='on data', 
           cmap=magma, use_raw=False, )

In [None]:
del adata_he_fb.obs['C3']

In [None]:
sc.tl.rank_genes_groups(adata_he_fb, groupby='leiden', groups=['1', '2', '3'])
sc.pl.rank_genes_groups_tracksplot(adata_he_fb, dendrogram=False, n_genes=100)

In [None]:
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(adata_he_fb, color=['cluster'] + [i for i in val if i in adata_he_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
plot_score_graph(adata_he_fb)

In [None]:
adata_he_fb.write_h5ad(he_dir + '/He2020_fb.h5')

In [None]:
adata_he_fb = sc.read(he_dir + '/He2020_fb.h5')

### Hughes et al. 2020

In [None]:
hughes_dir = data_dir + '/hughes_2020'

In [None]:
adata_hughes = sc.read(hughes_dir + '/hughes_2020.h5ad')
# Keep the three listed donors only. `.copy()` materialises the subset so that the obs
# assignment below writes to a standalone AnnData rather than to a view.
adata_hughes = adata_hughes[adata_hughes.obs['donor_id'].isin(['Normal', 'Normal2', 'Normal3'])].copy()
# Expose the donor as 'batch', the key used by the integration and plotting cells.
adata_hughes.obs['batch'] = adata_hughes.obs['donor_id']

In [None]:
adata_hughes

In [None]:
sc.pp.filter_genes(adata_hughes, min_counts=1)

In [None]:
# Basic QC filtering
adata_hughes.var['mt'] = adata_hughes.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata_hughes, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(adata_hughes, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(adata_hughes, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata_hughes, x='total_counts', y='n_genes_by_counts')

In [None]:
adata_hughes = adata_hughes[((adata_hughes.obs.n_genes_by_counts < 3000) & 
                                    (adata_hughes.obs.n_genes_by_counts > 200)).values, :]
adata_hughes = adata_hughes[adata_hughes.obs.pct_counts_mt < 15, :]

In [None]:
# Overlay the distribution of detected genes per cell for each batch, one labelled
# curve per batch so they can be told apart.
# NOTE(review): `sns.distplot` is deprecated in recent seaborn releases; kept because the
# seaborn version used by this notebook is not visible here.
batches = sorted(set(adata_hughes.obs['batch'].values))
for batch in batches:
    counts = adata_hughes.obs['n_genes_by_counts'].loc[adata_hughes.obs['batch'] == batch].values
    sns.distplot(counts, label=str(batch))
plt.legend(title='batch');

In [None]:
sc.pp.filter_genes(adata_hughes, min_counts=1)
sc.pp.normalize_total(adata_hughes)
sc.pp.log1p(adata_hughes)

In [None]:
sc.pp.pca(adata_hughes, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(adata_hughes, key='batch', max_iter_harmony=50)
sc.pp.neighbors(adata_hughes, use_rep='X_pca_harmony',  n_neighbors=int(0.5 * len(adata_hughes) ** 0.5 // 2), metric='cosine')
tk.tl.triku(adata_hughes)

In [None]:
sc.tl.umap(adata_hughes, min_dist=0.1, random_state=seed)
sc.tl.leiden(adata_hughes, resolution=1.2, random_state=seed)

In [None]:
sc.pl.umap(adata_hughes, color=['leiden', 'batch'], legend_loc='on data')

In [None]:
sc.pl.umap(adata_hughes, color=['leiden', 'LUM', 'PDGFRA', 'COL1A1', 'DCN', 'RGS5', 'PMEL'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
assign_cats(adata_hughes, dict_cats=dict_cats_fb, min_score=0.3)

In [None]:
sc.pl.umap(adata_hughes, color=['leiden', 'assigned_cats'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
adata_hughes_fb = adata_hughes[adata_hughes.obs['assigned_cats'] == 'fibro']

In [None]:
sc.pp.filter_genes(adata_hughes_fb, min_counts=1)

In [None]:
sc.pp.pca(adata_hughes_fb, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(adata_hughes_fb, key='batch', max_iter_harmony=50)
sc.pp.neighbors(adata_hughes_fb, use_rep='X_pca_harmony',  n_neighbors=int(0.5 * len(adata_hughes_fb) ** 0.5 // 2), metric='cosine')
tk.tl.triku(adata_hughes_fb)

In [None]:
sc.tl.umap(adata_hughes_fb, min_dist=0.35, random_state=seed)
sc.tl.leiden(adata_hughes_fb, resolution=9, random_state=seed)

In [None]:
assign_cats(adata_hughes_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.75, key_added='cluster', others_name='U')
assign_cats(adata_hughes_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, key_added='axis', 
             intermediate_states=True, diff=0.15, others_name='U')

In [None]:
adata_hughes_fb.uns['cluster_colors'] = [dict_colors[i] if i in dict_colors else '#bcbcbc' for 
                                      i in sorted(set(adata_hughes_fb.obs['cluster']))]

In [None]:
sc.pl.umap(adata_hughes_fb, color=['leiden', 'axis', 'batch', 'cluster'], legend_loc='on data', 
           cmap=magma, use_raw=False, ncols=2)

In [None]:
del adata_hughes_fb.obs['C3']

In [None]:
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(adata_hughes_fb, color=['cluster'] + [i for i in val if i in adata_hughes_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
plot_score_graph(adata_hughes_fb)

In [None]:
adata_hughes_fb.write_h5ad(hughes_dir + '/hughes_2020_fb.h5')

In [None]:
adata_hughes_fb = sc.read(hughes_dir + '/hughes_2020_fb.h5')

### Kim et al. 2020

In [None]:
kim_dir_2020 = data_dir + '/Kim_2020'

In [None]:
adata_kim_2020 = sc.read(kim_dir_2020 + '/Kim_2020.h5ad')
adata_kim_2020.var_names_make_unique()

In [None]:
adata_kim_2020.var_names = [dict_rep[i] if i in dict_rep else i for i in adata_kim_2020.var_names ]

In [None]:
sc.pp.filter_genes(adata_kim_2020, min_counts=1)

In [None]:
# Densify X only while it is still sparse, so re-running this cell is harmless;
# `.toarray()` returns an ndarray directly instead of going through np.matrix.
if spr.issparse(adata_kim_2020.X):
    adata_kim_2020.X = adata_kim_2020.X.toarray()

In [None]:
# Basic QC filtering
adata_kim_2020.var['mt'] = adata_kim_2020.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata_kim_2020, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(adata_kim_2020, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(adata_kim_2020, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata_kim_2020, x='total_counts', y='n_genes_by_counts')

In [None]:
# Cell QC: keep cells with 500 < detected genes < 4000, then cells with < 25 % mitochondrial counts.
adata_kim_2020 = adata_kim_2020[((adata_kim_2020.obs.n_genes_by_counts < 4000) & 
                                    (adata_kim_2020.obs.n_genes_by_counts > 500)).values, :]
adata_kim_2020 = adata_kim_2020[adata_kim_2020.obs.pct_counts_mt < 25, :]

In [None]:
# Overlay the distribution of detected genes per cell for every batch.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases (histplot/displot replace it).
batches = sorted(list(set(adata_kim_2020.obs['batch'].values)))
for batch in batches:
    counts = adata_kim_2020.obs['n_genes_by_counts'].loc[adata_kim_2020.obs['batch'] == batch].values
    sns.distplot(counts)

In [None]:
# Drop genes left without counts after cell filtering, then normalise and log-transform.
sc.pp.filter_genes(adata_kim_2020, min_counts=1)
sc.pp.normalize_total(adata_kim_2020)
sc.pp.log1p(adata_kim_2020)

In [None]:
# PCA -> Harmony batch correction on 'batch' -> kNN graph on the corrected PCs ('X_pca_harmony').
# n_neighbors grows with the square root of the number of cells: int((0.5 * sqrt(n_cells)) // 2).
sc.pp.pca(adata_kim_2020, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(adata_kim_2020, key='batch', max_iter_harmony=50)
sc.pp.neighbors(adata_kim_2020, use_rep='X_pca_harmony',  n_neighbors=int(0.5 * len(adata_kim_2020) ** 0.5 // 2), metric='cosine')
# NOTE(review): triku presumably performs feature selection from the graph above — confirm in the triku docs.
tk.tl.triku(adata_kim_2020)

In [None]:
sc.tl.umap(adata_kim_2020, min_dist=0.1, random_state=seed)
sc.tl.leiden(adata_kim_2020, resolution=0.7, random_state=seed)

In [None]:
sc.pl.umap(adata_kim_2020, color=['leiden', 'batch'], legend_loc='on data')

In [None]:
# Marker genes plotted next to the Leiden clusters to inspect the cell types present.
sc.pl.umap(adata_kim_2020, color=['leiden', 'LUM', 'PDGFRA', 'COL1A1', 'DCN', 'S100B', 'MPZ', 'DMKN', 'RGS5'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# Label cells with the signatures in dict_cats_fb; the labels are read from obs['assigned_cats'] below.
assign_cats(adata_kim_2020, dict_cats=dict_cats_fb)

In [None]:
sc.pl.umap(adata_kim_2020, color=['leiden', 'assigned_cats'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
adata_kim_2020_fb = adata_kim_2020[adata_kim_2020.obs['assigned_cats'] == 'fibro']

In [None]:
np.unique(adata_kim_2020_fb.obs['batch'].values, return_counts=True)

In [None]:
adata_kim_2020_fb = adata_kim_2020_fb[adata_kim_2020_fb.obs['batch'].isin(['0', '1', '2', '3', '5'])]

In [None]:
sc.pp.filter_genes(adata_kim_2020_fb, min_counts=1)

In [None]:
sc.pp.pca(adata_kim_2020_fb, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(adata_kim_2020_fb, key='batch', max_iter_harmony=50)
sc.pp.neighbors(adata_kim_2020_fb, use_rep='X_pca_harmony',  n_neighbors=int(0.5 * len(adata_kim_2020_fb) ** 0.5 // 2), metric='cosine')
tk.tl.triku(adata_kim_2020_fb)

In [None]:
sc.tl.umap(adata_kim_2020_fb, min_dist=0.8, random_state=seed)
sc.tl.leiden(adata_kim_2020_fb, resolution=10, random_state=seed)

In [None]:
assign_cats(adata_kim_2020_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.7, key_added='cluster', others_name='U')
assign_cats(adata_kim_2020_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, key_added='axis', 
             intermediate_states=True, diff=0.15, others_name='U')

In [None]:
adata_kim_2020_fb.uns['cluster_colors'] = [dict_colors[i] if i in dict_colors else '#bcbcbc' for 
                                      i in sorted(set(adata_kim_2020_fb.obs['cluster']))]

In [None]:
sc.pl.umap(adata_kim_2020_fb, color=['leiden', 'batch', 'axis', 'cluster'], legend_loc='on data', 
           cmap=magma, use_raw=False, ncols=2)

In [None]:
# Drop the 'C3' column from obs before plotting and saving.
# NOTE(review): presumably a helper column created by an earlier step — confirm its origin.
del adata_kim_2020_fb.obs['C3']

In [None]:
# Wilcoxon ranking of genes for the unassigned ('U') cluster, shown for its top 100 genes.
sc.tl.rank_genes_groups(adata_kim_2020_fb, groupby='cluster', groups=['U'], method='wilcoxon')
sc.pl.rank_genes_groups_tracksplot(adata_kim_2020_fb, dendrogram=False, n_genes=100)

In [None]:
# For each cluster signature: print its name and plot the cluster labels next to the
# signature genes that are present in this dataset.
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(adata_kim_2020_fb, color=['cluster'] + [i for i in val if i in adata_kim_2020_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
plot_score_graph(adata_kim_2020_fb)

In [None]:
# Save the annotated fibroblast object to disk.
adata_kim_2020_fb.write_h5ad(kim_dir_2020 + '/Kim_2020_fb.h5ad')

In [None]:
# Reload the saved object (lets a later session start from this cell).
adata_kim_2020_fb = sc.read(kim_dir_2020 + '/Kim_2020_fb.h5ad')

### Kim et al. 2021 [NAIL, EXPECTING DIFFERENT RESULTS]

In [None]:
kim_dir_2021 = data_dir + '/kim_2021'

In [None]:
adata_kim_2021 = sc.read(kim_dir_2021 + '/kim_2021.h5ad')
adata_kim_2021.var_names_make_unique()

In [None]:
adata_kim_2021.var_names = [dict_rep[i] if i in dict_rep else i for i in adata_kim_2021.var_names ]

In [None]:
sc.pp.filter_genes(adata_kim_2021, min_counts=1)

In [None]:
adata_kim_2021.X = np.array(adata_kim_2021.X.todense())

In [None]:
# Basic QC filtering
adata_kim_2021.var['mt'] = adata_kim_2021.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata_kim_2021, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(adata_kim_2021, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(adata_kim_2021, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata_kim_2021, x='total_counts', y='n_genes_by_counts')

In [None]:
# Cell QC: keep cells with 1000 < detected genes < 6000, then cells with < 40 % mitochondrial counts.
# The gene-count subset must be assigned back to `adata_kim_2021`; assigning it to any other
# name leaves this filter without effect on the rest of the analysis.
adata_kim_2021 = adata_kim_2021[((adata_kim_2021.obs.n_genes_by_counts < 6000) & 
                                    (adata_kim_2021.obs.n_genes_by_counts > 1000)).values, :]
adata_kim_2021 = adata_kim_2021[adata_kim_2021.obs.pct_counts_mt < 40, :]

In [None]:
# Show the object summary after QC filtering.
adata_kim_2021

In [None]:
# Overlay the distribution of detected genes per cell for every batch.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases (histplot/displot replace it).
batches = sorted(list(set(adata_kim_2021.obs['batch'].values)))
for batch in batches:
    counts = adata_kim_2021.obs['n_genes_by_counts'].loc[adata_kim_2021.obs['batch'] == batch].values
    sns.distplot(counts)

In [None]:
# Drop genes left without counts after cell filtering, then normalise and log-transform.
sc.pp.filter_genes(adata_kim_2021, min_counts=1)
sc.pp.normalize_total(adata_kim_2021)
sc.pp.log1p(adata_kim_2021)

In [None]:
# PCA -> Harmony batch correction on 'batch' -> kNN graph on the corrected PCs ('X_pca_harmony').
# n_neighbors grows with the square root of the number of cells: int((0.5 * sqrt(n_cells)) // 2).
sc.pp.pca(adata_kim_2021, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(adata_kim_2021, key='batch', max_iter_harmony=50)
sc.pp.neighbors(adata_kim_2021, use_rep='X_pca_harmony',  n_neighbors=int(0.5 * len(adata_kim_2021) ** 0.5 // 2), metric='cosine')
# NOTE(review): triku presumably performs feature selection from the graph above — confirm in the triku docs.
tk.tl.triku(adata_kim_2021)

In [None]:
sc.tl.umap(adata_kim_2021, min_dist=0.1, random_state=seed)
sc.tl.leiden(adata_kim_2021, resolution=3, random_state=seed)

In [None]:
# fraction=1 keeps every cell. NOTE(review): presumably used only to shuffle the cell order
# so that no batch is systematically drawn on top in the UMAP — confirm.
sc.pp.subsample(adata_kim_2021, fraction=1, random_state=0, copy=False)
sc.pl.umap(adata_kim_2021, color=['leiden', 'batch'], legend_loc='on data')

In [None]:
# Marker genes plotted next to the Leiden clusters.
sc.pl.umap(adata_kim_2021, color=['leiden', 'LUM', 'PDGFRA', 'COL1A1', 'DCN'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# Label cells with the signatures in dict_cats_fb; the labels are read from obs['assigned_cats'] below.
assign_cats(adata_kim_2021, dict_cats=dict_cats_fb)

In [None]:
sc.pl.umap(adata_kim_2021, color=['leiden', 'assigned_cats'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
sc.pl.umap(adata_kim_2021, color=['leiden', 'COL18A1', 'APCDD1', 'SLPI'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
adata_kim_2021_fb = adata_kim_2021[adata_kim_2021.obs['assigned_cats'] == 'fibro']

In [None]:
sc.pp.filter_genes(adata_kim_2021_fb, min_counts=1)

In [None]:
sc.pp.pca(adata_kim_2021_fb, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(adata_kim_2021_fb, key='batch', max_iter_harmony=50)
sc.pp.neighbors(adata_kim_2021_fb, use_rep='X_pca_harmony',  n_neighbors=int(0.5 * len(adata_kim_2021_fb) ** 0.5 // 2), metric='cosine')
tk.tl.triku(adata_kim_2021_fb)

In [None]:
sc.tl.umap(adata_kim_2021_fb, min_dist=0.4, random_state=seed)
sc.tl.leiden(adata_kim_2021_fb, resolution=9, random_state=seed)

In [None]:
# Optional shortcut: after a kernel restart, start here to reload the previously saved
# fibroblast object instead of recomputing it. When the object already exists in this
# session it is kept, so a top-to-bottom run neither discards the embedding computed above
# nor fails because the file (written further down) does not exist yet.
if 'adata_kim_2021_fb' not in globals():
    adata_kim_2021_fb = sc.read(kim_dir_2021 + '/kim_2021_fb.h5')

In [None]:
assign_cats(adata_kim_2021_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.95, key_added='cluster', others_name='U')
assign_cats(adata_kim_2021_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.35, key_added='axis', 
             intermediate_states=True, diff=0.15, others_name='U')

In [None]:
adata_kim_2021_fb.uns['cluster_colors'] = [dict_colors[i] if i in dict_colors else '#bcbcbc' for 
                                      i in sorted(set(adata_kim_2021_fb.obs['cluster']))]

In [None]:
sc.pl.umap(adata_kim_2021_fb, color=['leiden', 'axis', 'cluster', 'batch'], legend_loc='on data', cmap=magma, use_raw=False, ncols=2)

In [None]:
# For each cluster signature: print its name and plot the cluster labels next to the
# signature genes that are present in this dataset.
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(adata_kim_2021_fb, color=['cluster'] + [i for i in val if i in adata_kim_2021_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
plot_score_graph(adata_kim_2021_fb)

In [None]:
# Save the annotated fibroblast object to disk.
adata_kim_2021_fb.write_h5ad(kim_dir_2021 + '/kim_2021_fb.h5')

In [None]:
# Reload the saved object (lets a later session start from this cell).
adata_kim_2021_fb = sc.read(kim_dir_2021 + '/kim_2021_fb.h5')

### Liu et al. 2021

In [None]:
liu_dir = data_dir + '/liu_2021'
os.makedirs(liu_dir, exist_ok=True)

In [None]:
adata_liu = sc.read(liu_dir + '/liu_2021.h5')
adata_liu_ctrl = adata_liu[adata_liu.obs['Group'] == 'CTRL']

In [None]:
adata_liu_ctrl.var_names = [dict_rep[i] if i in dict_rep else i for i in adata_liu_ctrl.var_names ]

In [None]:
# Basic QC filtering
adata_liu_ctrl.var['mt'] = adata_liu_ctrl.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata_liu_ctrl, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(adata_liu_ctrl, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(adata_liu_ctrl, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata_liu_ctrl, x='total_counts', y='n_genes_by_counts')

In [None]:
# Drop genes without counts, then normalise and log-transform.
# NOTE(review): unlike the other datasets, no per-cell QC thresholds are applied here — confirm this is intended.
sc.pp.filter_genes(adata_liu_ctrl, min_counts=1)
sc.pp.normalize_total(adata_liu_ctrl)
sc.pp.log1p(adata_liu_ctrl)

In [None]:
# PCA -> Harmony correction on 'Patient' -> kNN graph on the corrected PCs ('X_pca_harmony').
# n_neighbors grows with the square root of the number of cells: int((0.5 * sqrt(n_cells)) // 2).
sc.pp.pca(adata_liu_ctrl, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(adata_liu_ctrl, key='Patient', max_iter_harmony=50)
sc.pp.neighbors(adata_liu_ctrl, use_rep='X_pca_harmony',  n_neighbors=int(0.5 * len(adata_liu_ctrl) ** 0.5 // 2), metric='cosine')
# NOTE(review): triku presumably performs feature selection from the graph above — confirm in the triku docs.
tk.tl.triku(adata_liu_ctrl)

In [None]:
sc.tl.umap(adata_liu_ctrl, min_dist=0.2, random_state=seed)
sc.tl.leiden(adata_liu_ctrl, resolution=14, random_state=seed)

In [None]:
# fraction=1 keeps every cell. NOTE(review): presumably used only to shuffle the cell order
# so that no patient is systematically drawn on top in the UMAP — confirm.
sc.pp.subsample(adata_liu_ctrl, fraction=1, random_state=0, copy=False)
sc.pl.umap(adata_liu_ctrl, color=['leiden', 'Patient'], legend_loc='on data')

In [None]:
# Marker genes plotted next to the Leiden clusters.
sc.pl.umap(adata_liu_ctrl, color=['leiden', 'LUM', 'PDGFRA', 'COL1A1', 'DCN', 'RGS5', 'MYH11'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# Label cells with the signatures in dict_cats_fb; the labels are read from obs['assigned_cats'] below.
assign_cats(adata_liu_ctrl, dict_cats=dict_cats_fb, min_score=0.7)

In [None]:
sc.pl.umap(adata_liu_ctrl, color=['leiden', 'assigned_cats'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
sc.pl.umap(adata_liu_ctrl, color=['leiden', 'COL18A1', 'APCDD1', 'SLPI', 'CCL19'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
adata_liu_ctrl_fb = adata_liu_ctrl[adata_liu_ctrl.obs['assigned_cats'] == 'fibro']

In [None]:
sc.pp.filter_genes(adata_liu_ctrl_fb, min_counts=1)

In [None]:
sc.pp.pca(adata_liu_ctrl_fb, random_state=seed, n_comps=50)
sce.pp.harmony_integrate(adata_liu_ctrl_fb, key='Patient', max_iter_harmony=50)
sc.pp.neighbors(adata_liu_ctrl_fb, use_rep='X_pca_harmony',  n_neighbors=int(0.5 * len(adata_liu_ctrl_fb) ** 0.5 // 3), metric='cosine')
tk.tl.triku(adata_liu_ctrl_fb)

In [None]:
sc.tl.umap(adata_liu_ctrl_fb, min_dist=0.4, random_state=seed)
sc.tl.leiden(adata_liu_ctrl_fb, resolution=5, random_state=seed)

In [None]:
assign_cats(adata_liu_ctrl_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.85, key_added='cluster', others_name='U')
assign_cats(adata_liu_ctrl_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.35, key_added='axis', 
             intermediate_states=True, diff=0.15, others_name='U')

In [None]:
adata_liu_ctrl_fb.uns['cluster_colors'] = [dict_colors[i] if i in dict_colors else '#bcbcbc' for 
                                      i in sorted(set(adata_liu_ctrl_fb.obs['cluster']))]

In [None]:
sc.pl.umap(adata_liu_ctrl_fb, color=['leiden', 'axis', 'cluster', 'Patient'], legend_loc='on data', cmap=magma, use_raw=False, ncols=2)

In [None]:
# Remove peri population
adata_liu_ctrl_fb = adata_liu_ctrl_fb[adata_liu_ctrl_fb.obs['cluster'] != 'Peri']
sc.pp.filter_genes(adata_liu_ctrl_fb, min_counts=1)

In [None]:
sc.pp.pca(adata_liu_ctrl_fb, random_state=seed, n_comps=50)
sce.pp.harmony_integrate(adata_liu_ctrl_fb, key='Patient', max_iter_harmony=50)
sc.pp.neighbors(adata_liu_ctrl_fb, use_rep='X_pca_harmony',  n_neighbors=int(0.5 * len(adata_liu_ctrl_fb) ** 0.5 // 3), metric='cosine')
tk.tl.triku(adata_liu_ctrl_fb)

In [None]:
sc.tl.umap(adata_liu_ctrl_fb, min_dist=0.4, random_state=seed)
sc.tl.leiden(adata_liu_ctrl_fb, resolution=5, random_state=seed)

In [None]:
assign_cats(adata_liu_ctrl_fb, dict_cats=dict_cats_clusters, min_score=0.6, quantile_gene_sel=0.85, key_added='cluster', others_name='U')
assign_cats(adata_liu_ctrl_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.35, key_added='axis', 
             intermediate_states=True, diff=0.15, others_name='U')

In [None]:
adata_liu_ctrl_fb.uns['cluster_colors'] = [dict_colors[i] if i in dict_colors else '#bcbcbc' for 
                                      i in sorted(set(adata_liu_ctrl_fb.obs['cluster']))]

In [None]:
sc.pl.umap(adata_liu_ctrl_fb, color=['leiden', 'axis', 'cluster', 'Patient'], legend_loc='on data', cmap=magma, use_raw=False, ncols=2)

In [None]:
# Drop the 'C3' column from obs before plotting and saving.
# NOTE(review): presumably a helper column created by an earlier step — confirm its origin.
del adata_liu_ctrl_fb.obs['C3']

In [None]:
# Wilcoxon ranking of genes for the unassigned ('U') cluster, shown for its top 100 genes.
sc.tl.rank_genes_groups(adata_liu_ctrl_fb, groupby='cluster', groups=['U'], method='wilcoxon')
sc.pl.rank_genes_groups_tracksplot(adata_liu_ctrl_fb, dendrogram=False, n_genes=100)

In [None]:
# For each cluster signature: print its name and plot the cluster labels next to the
# signature genes that are present in this dataset.
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(adata_liu_ctrl_fb, color=['cluster'] + [i for i in val if i in adata_liu_ctrl_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
plot_score_graph(adata_liu_ctrl_fb)

In [None]:
# Save the annotated fibroblast object to disk.
adata_liu_ctrl_fb.write_h5ad(liu_dir + '/liu_2021_fb.h5')

In [None]:
# Reload the saved object (lets a later session start from this cell).
adata_liu_ctrl_fb = sc.read(liu_dir + '/liu_2021_fb.h5')

### Mirizio et al. 2020

In [None]:
# Folder with the Mirizio et al. 2020 files.
mirizio_dir = data_dir + '/mirizio_2020'

In [None]:
adata_mirizio = sc.read(mirizio_dir + '/Mirizio_2020.h5ad')
adata_mirizio.var_names_make_unique()

In [None]:
# Basic QC filtering
adata_mirizio.var['mt'] = adata_mirizio.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata_mirizio, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
# QC overview: per-cell distributions, then pairwise relations against total counts.
sc.pl.violin(adata_mirizio, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(adata_mirizio, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata_mirizio, x='total_counts', y='n_genes_by_counts')

In [None]:
# Cell QC: keep cells with 250 < detected genes < 4000, then cells with < 40 % mitochondrial counts.
adata_mirizio = adata_mirizio[((adata_mirizio.obs.n_genes_by_counts < 4000) & 
                                    (adata_mirizio.obs.n_genes_by_counts > 250)).values, :]
adata_mirizio = adata_mirizio[adata_mirizio.obs.pct_counts_mt < 40, :]

In [None]:
# Drop genes left without counts after cell filtering, then normalise and log-transform.
sc.pp.filter_genes(adata_mirizio, min_counts=1)
sc.pp.normalize_total(adata_mirizio)
sc.pp.log1p(adata_mirizio)

In [None]:
# PCA -> Harmony batch correction on 'batch' -> kNN graph on the corrected PCs ('X_pca_harmony').
# n_neighbors grows with the square root of the number of cells: int((0.5 * sqrt(n_cells)) // 2).
sc.pp.pca(adata_mirizio, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(adata_mirizio, key='batch', max_iter_harmony=50)
sc.pp.neighbors(adata_mirizio, use_rep='X_pca_harmony',  n_neighbors=int(0.5 * len(adata_mirizio) ** 0.5 // 2), metric='cosine')
# NOTE(review): triku presumably performs feature selection from the graph above — confirm in the triku docs.
tk.tl.triku(adata_mirizio)

In [None]:
sc.tl.umap(adata_mirizio, min_dist=0.1, random_state=seed)
sc.tl.leiden(adata_mirizio, resolution=7, random_state=seed)

In [None]:
sc.pl.umap(adata_mirizio, color=['leiden', 'batch'], legend_loc='on data')

In [None]:
# Marker genes plotted next to the Leiden clusters.
sc.pl.umap(adata_mirizio, color=['leiden', 'LUM', 'PDGFRA', 'COL1A1', 'DCN'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# Label cells with the signatures in dict_cats_fb; the labels are read from obs['assigned_cats'] below.
assign_cats(adata_mirizio, dict_cats=dict_cats_fb, min_score=0.4)

In [None]:
sc.pl.umap(adata_mirizio, color=['leiden', 'assigned_cats'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
adata_mirizio_fb = adata_mirizio[adata_mirizio.obs['assigned_cats'] == 'fibro']

In [None]:
sc.pp.filter_genes(adata_mirizio_fb, min_counts=1)

In [None]:
sc.pp.pca(adata_mirizio_fb, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(adata_mirizio_fb, key='batch', max_iter_harmony=50)
sc.pp.neighbors(adata_mirizio_fb, use_rep='X_pca_harmony',  n_neighbors=int(0.5 * len(adata_mirizio_fb) ** 0.5 // 2), metric='cosine')
tk.tl.triku(adata_mirizio_fb)

In [None]:
sc.tl.umap(adata_mirizio_fb, min_dist=0.5, random_state=seed)
sc.tl.leiden(adata_mirizio_fb, resolution=8, random_state=seed)

In [None]:
assign_cats(adata_mirizio_fb, dict_cats=dict_cats_clusters, min_score=0.45, quantile_gene_sel=0.75, key_added='cluster', others_name='U')
assign_cats(adata_mirizio_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, quantile_gene_sel=0.75,
            key_added='axis', intermediate_states=True, diff=0.15, others_name='U')

In [None]:
adata_mirizio_fb.uns['cluster_colors'] = [dict_colors[i] if i in dict_colors else '#bcbcbc' for 
                                      i in sorted(set(adata_mirizio_fb.obs['cluster']))]

In [None]:
sc.pl.umap(adata_mirizio_fb, color=['leiden', 'batch', 'axis', 'cluster'], legend_loc='on data', cmap=magma, use_raw=False, ncols=2)

In [None]:
# UNASSIGNED cells may refer to stress
# Rank genes for Leiden cluster '28' and show its top 150 genes.
# NOTE(review): the hard-coded cluster id depends on the Leiden run above — re-check it if clustering changes.
sc.tl.rank_genes_groups(adata_mirizio_fb, groupby='leiden', groups=['28'])
sc.pl.rank_genes_groups_tracksplot(adata_mirizio_fb, dendrogram=False, n_genes=150)

In [None]:
# Drop the 'C3' column from obs before plotting and saving.
# NOTE(review): presumably a helper column created by an earlier step — confirm its origin.
del adata_mirizio_fb.obs['C3']

In [None]:
# For each cluster signature: print its name and plot the cluster labels next to the
# signature genes that are present in this dataset.
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(adata_mirizio_fb, color=['cluster'] + [i for i in val if i in adata_mirizio_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
plot_score_graph(adata_mirizio_fb)

In [None]:
# Save the annotated fibroblast object to disk.
adata_mirizio_fb.write_h5ad(mirizio_dir + '/Mirizio_2020_fb.h5ad')

In [None]:
# Reload the saved object (lets a later session start from this cell).
adata_mirizio_fb = sc.read(mirizio_dir + '/Mirizio_2020_fb.h5ad')

### Reynolds et al. 2021 [Discarded because of bad quality cells]

In [None]:
# Folder with the Reynolds et al. 2021 files.
reynolds_dir = data_dir + '/reynolds_2021'

In [None]:
# Read the healthy-fibroblast object; backup_url points to the Zenodo copy of the same file.
adata_reynolds_healthy_fb = sc.read(reynolds_dir + '/adata_reynolds_healthy_fb.h5ad', 
                                    backup_url='https://zenodo.org/record/4708700/files/adata_reynolds_healthy_fb.h5ad?download=1')

# Discard the unstructured annotations (.uns) shipped with the file.
del adata_reynolds_healthy_fb.uns

In [None]:
# UMAP coloured by the obs columns shipped with the file.
sc.pl.umap(adata_reynolds_healthy_fb, color=['hypoxia_stress', 'sample_id', 'full_clustering'], legend_loc='on data')

In [None]:
adata_reynolds_healthy_fb = adata_reynolds_healthy_fb[adata_reynolds_healthy_fb.obs['hypoxia_stress'] == 'Normal']

In [None]:
adata_reynolds_healthy_fb

In [None]:
sc.pp.filter_genes(adata_reynolds_healthy_fb, min_counts=1)
sc.pp.highly_variable_genes(adata_reynolds_healthy_fb)

In [None]:
sc.pp.pca(adata_reynolds_healthy_fb, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(adata_reynolds_healthy_fb, key='sample_id', max_iter_harmony=50)
sc.pp.neighbors(adata_reynolds_healthy_fb, use_rep='X_pca_harmony',  n_neighbors=int(0.5 * len(adata_reynolds_healthy_fb) ** 0.5 // 4), metric='cosine')
# tk.tl.triku(adata_reynolds_healthy_fb)

In [None]:
sc.tl.umap(adata_reynolds_healthy_fb, min_dist=0.3, random_state=seed)
sc.tl.leiden(adata_reynolds_healthy_fb, resolution=7, random_state=seed)

In [None]:
assign_cats(adata_reynolds_healthy_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.7, key_added='cluster', others_name='U')
assign_cats(adata_reynolds_healthy_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, quantile_gene_sel=0.75,
            key_added='axis', intermediate_states=True, diff=0.15, others_name='U')

In [None]:
adata_reynolds_healthy_fb.uns['cluster_colors'] = [dict_colors[i] if i in dict_colors else '#bcbcbc' for 
                                      i in sorted(set(adata_reynolds_healthy_fb.obs['cluster']))]

In [None]:
sc.pl.umap(adata_reynolds_healthy_fb, color=['leiden', 'axis', 'cluster'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
sc.pl.umap(adata_reynolds_healthy_fb, color=['COL18A1', 'CLEC2A', 'COL13A1', 'COL6A5', 'NPTX2', 'HSPB3', 'COMP', 'APCDD1', 'NKD2', 'AKAP6'], 
           legend_loc='on data', cmap=magma, use_raw=False, palette='Dark2')

In [None]:
# Drop the 'C3' column from obs before plotting and saving.
# NOTE(review): presumably a helper column created by an earlier step — confirm its origin.
del adata_reynolds_healthy_fb.obs['C3']

In [None]:
# Rank genes for Leiden clusters '4' and '21' and show their top 50 genes.
# NOTE(review): the hard-coded cluster ids depend on the Leiden run above — re-check them if clustering changes.
sc.tl.rank_genes_groups(adata_reynolds_healthy_fb, groupby='leiden', groups=['4', '21'])
sc.pl.rank_genes_groups_tracksplot(adata_reynolds_healthy_fb, dendrogram=False, use_raw=False, n_genes=50)

In [None]:
# For each cluster signature: print its name and plot the cluster labels next to the
# signature genes that are present in this dataset.
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(adata_reynolds_healthy_fb, color=['cluster'] + [i for i in val if i in adata_reynolds_healthy_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
plot_score_graph(adata_reynolds_healthy_fb)

In [None]:
# Save the annotated fibroblast object to disk.
adata_reynolds_healthy_fb.write_h5ad(reynolds_dir + '/adata_reynolds_healthy_fb.h5')

In [None]:
# Reload the saved object (lets a later session start from this cell).
adata_reynolds_healthy_fb = sc.read(reynolds_dir + '/adata_reynolds_healthy_fb.h5')

### Solé-Boldo et al. 2020

In [None]:
sole_dir = data_dir + '/Sole-Boldo_2020'

In [None]:
adata_sole_young = sc.read_loom(sole_dir + '/SB2020.loom')
adata_sole_young.var_names_make_unique()

In [None]:
adata_sole_young.var_names = [dict_rep[i] if i in dict_rep else i for i in adata_sole_young.var_names ]

In [None]:
adata_sole_young.X = np.array(adata_sole_young.X.todense())

In [None]:
sc.pp.filter_genes(adata_sole_young, min_counts=1)

In [None]:
# Basic QC filtering
adata_sole_young.var['mt'] = adata_sole_young.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata_sole_young, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(adata_sole_young, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(adata_sole_young, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata_sole_young, x='total_counts', y='n_genes_by_counts')

In [None]:
# Cell QC: keep cells with 200 < detected genes < 2500, then cells with < 15 % mitochondrial counts.
adata_sole_young = adata_sole_young[((adata_sole_young.obs.n_genes_by_counts < 2500) & 
                                    (adata_sole_young.obs.n_genes_by_counts > 200)).values, :]
adata_sole_young = adata_sole_young[adata_sole_young.obs.pct_counts_mt < 15, :]

In [None]:
# Drop genes left without counts after cell filtering, then normalise and log-transform.
sc.pp.filter_genes(adata_sole_young, min_counts=1)
sc.pp.normalize_total(adata_sole_young)
sc.pp.log1p(adata_sole_young)

In [None]:
# PCA -> kNN graph (no Harmony step for this dataset).
# n_neighbors grows with the square root of the number of cells: int(sqrt(n_cells) // 2).
sc.pp.pca(adata_sole_young, random_state=seed, n_comps=35)
sc.pp.neighbors(adata_sole_young, random_state=seed, n_neighbors=int(len(adata_sole_young) ** 0.5 // 2), metric='cosine')
# NOTE(review): triku presumably performs feature selection from the graph above — confirm in the triku docs.
tk.tl.triku(adata_sole_young)

In [None]:
sc.tl.umap(adata_sole_young, min_dist=0.6, random_state=seed)
sc.tl.leiden(adata_sole_young, resolution=10, random_state=seed)

In [None]:
# Marker genes plotted next to the Leiden clusters.
sc.pl.umap(adata_sole_young, color=['leiden', 'LUM', 'PDGFRA', 'COL1A1', 'DCN'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
sc.pl.umap(adata_sole_young, color=['leiden', 'S100B', 'MPZ', 'PLP1', 'MLANA', 'PMEL'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# Label cells with the signatures in dict_cats_fb; the labels are read from obs['assigned_cats'] below.
assign_cats(adata_sole_young, dict_cats=dict_cats_fb, min_score=0.3, quantile_gene_sel=0.2)

In [None]:
sc.pl.umap(adata_sole_young, color=['leiden', 'assigned_cats'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
sc.pl.umap(adata_sole_young, color=['SAT1', 'RBFOX3', 'SELE', 'TPSAB1', 'CTSG', 'CST7'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
adata_sole_young_fb = adata_sole_young[adata_sole_young.obs['assigned_cats'] == 'fibro']

In [None]:
sc.pp.filter_genes(adata_sole_young_fb, min_counts=1)

In [None]:
sc.pp.pca(adata_sole_young_fb, random_state=seed, n_comps=50)
sc.pp.neighbors(adata_sole_young_fb, random_state=seed, n_neighbors=int(len(adata_sole_young_fb) ** 0.5 // 2), metric='cosine')
tk.tl.triku(adata_sole_young_fb)

In [None]:
sc.tl.umap(adata_sole_young_fb, min_dist=0.2, random_state=seed)
sc.tl.leiden(adata_sole_young_fb, resolution=8, random_state=seed)

In [None]:
assign_cats(adata_sole_young_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.95, key_added='cluster', others_name='U')
assign_cats(adata_sole_young_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, key_added='axis', 
            intermediate_states=True, diff=0.15, others_name='U')

In [None]:
adata_sole_young_fb.uns['cluster_colors'] = [dict_colors[i] if i in dict_colors else '#bcbcbc' for 
                                      i in sorted(set(adata_sole_young_fb.obs['cluster']))]

In [None]:
sc.pl.umap(adata_sole_young_fb, color=['leiden', 'axis', 'cluster'], legend_loc='on data', cmap=magma, use_raw=False, )

In [None]:
plot_score_graph(adata_sole_young_fb)

In [None]:
# Drop the 'C3' column from obs before plotting and saving.
# NOTE(review): presumably a helper column created by an earlier step — confirm its origin.
del adata_sole_young_fb.obs['C3']

In [None]:
# For each cluster signature: print its name and plot the cluster labels next to the
# signature genes that are present in this dataset.
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(adata_sole_young_fb, color=['cluster'] + [i for i in val if i in adata_sole_young_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
# Save the annotated fibroblast object to disk.
adata_sole_young_fb.write_h5ad(sole_dir + '/SB2020_fb.h5')

In [None]:
# Reload the saved object (lets a later session start from this cell).
adata_sole_young_fb = sc.read(sole_dir + '/SB2020_fb.h5')

### Tabib et al. 2018

In [None]:
# Folder with the Tabib et al. 2018 files.
tabib_dir = data_dir + '/Tabib_2018'

In [None]:
# Read the raw UMI table and transpose it.
# NOTE(review): presumably the CSV is genes x cells, so the transpose yields cells x genes — confirm.
adata_tabib = sc.read_csv(tabib_dir + '/Skin_6Control_rawUMI.csv')
adata_tabib = adata_tabib.transpose()

In [None]:
# The batch id is the part of the cell name before the first underscore.
adata_tabib.obs['batch'] = [i.split('_')[0] for i in adata_tabib.obs_names]

In [None]:
# Per-cell metadata; the first CSV column is used as the index.
df_metadata_tabib = pd.read_csv(tabib_dir + '/Skin_6Control_Metadata.csv', index_col=0)

`df_metadata_tabib` has 8366 cells, although the paper states that 8522 cells were analyzed. The remaining cells are erythrocytes, which were filtered out of the analysis.

In [None]:
# Keep a snapshot of the unprocessed counts in .raw.
adata_tabib.raw = adata_tabib

In [None]:
# Cell type -> cluster ids, where the ids are the string values of the 'res.0.6' metadata column.
# The last entry uses NaN as its key. NOTE(review): presumably this catches the cells that
# have no metadata row — confirm. `np.nan` is used because the `np.NaN` alias no longer
# exists in NumPy 2.
dict_reverse_mappings = {'Fibroblast': ['0', '3', '4'], 
                 'Keratinocyte': ['1', '5', '7', '11', '14',], 
                 'Endothelial cell': ['2'], 
                 'Pericyte': ['6', '10'], 
                 'Macrophage/DC': ['8'], 
                 'Lymphocyte': ['9'], 
                 'Secretory Epith': ['12'], 
                 'Smooth Muscle': ['13'], 
                 'Melanocyte': ['15'], 
                 'Neural Cell': ['16'],
                 'Cornified Env': ['17'],
                 'B cell': ['18'], 
                 'Erithrocyte': [np.nan]}  # This is ours!

# Inverted lookup: cluster id -> cell type.
dict_mappings = {
    cluster_id: cell_type
    for cell_type, cluster_ids in dict_reverse_mappings.items()
    for cluster_id in cluster_ids
}

In [None]:
# Attach the metadata clustering ('res.0.6', cast to str) to the cells, then translate each id to a cell type.
# NOTE(review): cells without a metadata row presumably end up as NaN here and are resolved
# through the NaN key of dict_mappings; that lookup is fragile — confirm it holds for your pandas/numpy versions.
adata_tabib.obs['res.0.6'] = df_metadata_tabib['res.0.6'].astype(str)
adata_tabib.obs['cluster'] = [dict_mappings[i] for i in adata_tabib.obs['res.0.6']]

Since we are interested in fibroblasts, we keep only the clusters annotated as fibroblasts.

In [None]:
adata_tabib_fb = adata_tabib[adata_tabib.obs['cluster'].isin(['Fibroblast']), :].copy()
sc.pp.filter_genes(adata_tabib_fb, min_counts=1)
adata_tabib_fb.X = spr.csr.csr_matrix(adata_tabib_fb.X).copy()
adata_tabib_fb.raw = adata_tabib_fb

In [None]:
sc.pp.normalize_total(adata_tabib_fb)
sc.pp.log1p(adata_tabib_fb)

In [None]:
sc.pp.pca(adata_tabib_fb, random_state=seed, n_comps=50)
sce.pp.harmony_integrate(adata_tabib_fb, key='batch', max_iter_harmony=50)
sc.pp.neighbors(adata_tabib_fb, use_rep='X_pca_harmony',  n_neighbors=int(len(adata_tabib_fb) ** 0.5 // 2), metric='cosine')
tk.tl.triku(adata_tabib_fb)

In [None]:
sc.tl.umap(adata_tabib_fb, min_dist=0.25, random_state=seed)
sc.tl.leiden(adata_tabib_fb, resolution=16, random_state=seed)

In [None]:
assign_cats(adata_tabib_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.7, key_added='cluster', others_name='U')
assign_cats(adata_tabib_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.45, key_added='axis', 
            intermediate_states=True, diff=0.15, others_name='U')

In [None]:
adata_tabib_fb.uns['cluster_colors'] = [dict_colors[i] if i in dict_colors else '#bcbcbc' for 
                                      i in sorted(set(adata_tabib_fb.obs['cluster']))]

In [None]:
sc.pl.umap(adata_tabib_fb, color=['leiden', 'axis', 'cluster', 'batch'], legend_loc='on data', 
           cmap=magma, use_raw=False, )

In [None]:
# Drop the 'C3' column from obs before plotting and saving.
# NOTE(review): presumably a helper column created by an earlier step — confirm its origin.
del adata_tabib_fb.obs['C3']

In [None]:
# For each cluster signature: print its name and plot the cluster labels next to the
# signature genes that are present in this dataset.
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(adata_tabib_fb, color=['cluster'] + [i for i in val if i in adata_tabib_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
plot_score_graph(adata_tabib_fb)

In [None]:
# Save the annotated fibroblast object to disk.
adata_tabib_fb.write_h5ad(tabib_dir + '/tabib_2018_fb.h5')

In [None]:
# Reload the saved object (lets a later session start from this cell).
adata_tabib_fb = sc.read(tabib_dir + '/tabib_2018_fb.h5')

### Tabib et al. 2021

In [None]:
tabib_2021_dir = data_dir + '/Tabib_2021'

In [None]:
adata_tabib_2021_ctrl = sc.read(tabib_2021_dir + '/adata_tabib_2021_ctrl.h5')

In [None]:
adata_tabib_2021_ctrl.var_names = [dict_rep[i] if i in dict_rep else i for i in adata_tabib_2021_ctrl.var_names ]

In [None]:
# Basic QC filtering
adata_tabib_2021_ctrl.var['mt'] = adata_tabib_2021_ctrl.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata_tabib_2021_ctrl, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(adata_tabib_2021_ctrl, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(adata_tabib_2021_ctrl, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata_tabib_2021_ctrl, x='total_counts', y='n_genes_by_counts')

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': adata_tabib_2021_ctrl.obs['batch'], 'y': adata_tabib_2021_ctrl.obs['log1p_n_genes_by_counts']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': adata_tabib_2021_ctrl.obs['batch'], 'y': adata_tabib_2021_ctrl.obs['pct_counts_mt']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
# Per-batch QC window on log1p(number of detected genes): batch -> (lower, upper), both bounds exclusive.
# All batches currently share the same window; edit a single entry to tune one batch on its own.
# Cells from batches that are not listed here are discarded.
tabib_2021_gene_window = {
    'SC1': (6.5, 7.8),
    'SC4': (6.5, 7.8),
    'SC18': (6.5, 7.8),
    'SC32': (6.5, 7.8),
    'SC33': (6.5, 7.8),
    'SC34': (6.5, 7.8),
    'SC50': (6.5, 7.8),
    'SC68': (6.5, 7.8),
    'SC124': (6.5, 7.8),
    'SC125': (6.5, 7.8),
}

# A cell is kept when it falls inside the window defined for its own batch.
tabib_2021_keep = np.zeros(adata_tabib_2021_ctrl.n_obs, dtype=bool)
for window_batch, (window_lower, window_upper) in tabib_2021_gene_window.items():
    tabib_2021_keep |= ((adata_tabib_2021_ctrl.obs['batch'] == window_batch) &
                        (adata_tabib_2021_ctrl.obs['log1p_n_genes_by_counts'] < window_upper) &
                        (adata_tabib_2021_ctrl.obs['log1p_n_genes_by_counts'] > window_lower)).values

adata_tabib_2021_ctrl = adata_tabib_2021_ctrl[tabib_2021_keep, :]
# Then keep cells with < 15 % mitochondrial counts.
adata_tabib_2021_ctrl = adata_tabib_2021_ctrl[adata_tabib_2021_ctrl.obs.pct_counts_mt < 15, :]

In [None]:
sc.pp.filter_genes(adata_tabib_2021_ctrl, min_counts=1)
sc.pp.normalize_total(adata_tabib_2021_ctrl)
sc.pp.log1p(adata_tabib_2021_ctrl)

In [None]:
# Embedding of the full control dataset: PCA -> Harmony correction per 'batch' -> cosine kNN graph -> triku.
# The neighbourhood size scales with the square root of the number of cells.
knn_tabib_2021_ctrl = int(0.5 * len(adata_tabib_2021_ctrl) ** 0.5 // 2)

sc.pp.pca(adata_tabib_2021_ctrl, n_comps=50, random_state=seed)
sce.pp.harmony_integrate(adata_tabib_2021_ctrl, key='batch', max_iter_harmony=50)
sc.pp.neighbors(adata_tabib_2021_ctrl, n_neighbors=knn_tabib_2021_ctrl, metric='cosine', use_rep='X_pca_harmony')
tk.tl.triku(adata_tabib_2021_ctrl)

In [None]:
sc.tl.umap(adata_tabib_2021_ctrl, min_dist=0.2, random_state=seed)
sc.tl.leiden(adata_tabib_2021_ctrl, resolution=1.3, random_state=seed)

In [None]:
# fraction=1 keeps every cell, so nothing is removed here.
# NOTE(review): presumably this is only meant to shuffle the cell order so that no batch is
# systematically drawn on top in the UMAP — confirm. It uses random_state=0 rather than `seed`.
sc.pp.subsample(adata_tabib_2021_ctrl, fraction=1, random_state=0, copy=False)
sc.pl.umap(adata_tabib_2021_ctrl, color=['leiden', 'batch'], legend_loc='on data')

In [None]:
sc.pl.umap(adata_tabib_2021_ctrl, color=['leiden', 'LUM', 'PDGFRA', 'COL1A1', 'DCN', 'RGS5', 'KRT5', 'DMKN'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
sc.pl.umap(adata_tabib_2021_ctrl, color=['leiden', 'C2orf40', 'CDH19', 'ANGPTL7','PLEKHB1','ENTPD2', 
                                   'SLC2A1', 'CLDN1', 'TNNT2', 'C19orf33', 'SFRP5', 'WNT6', ], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
assign_cats(adata_tabib_2021_ctrl, dict_cats=dict_cats_fb)

In [None]:
sc.pl.umap(adata_tabib_2021_ctrl, color=['leiden', 'assigned_cats'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# Keep the cells labelled 'fibro' or 'unassigned'. The explicit copy makes the subset an independent
# AnnData: the following cells modify it in place, which on a view would trigger an implicit copy
# (and an ImplicitModificationWarning) while keeping the parent object referenced.
adata_tabib_2021_ctrl_fb = adata_tabib_2021_ctrl[adata_tabib_2021_ctrl.obs['assigned_cats'].isin(['fibro', 'unassigned'])].copy()

In [None]:
sc.pp.filter_genes(adata_tabib_2021_ctrl_fb, min_counts=1)

In [None]:
sc.pp.pca(adata_tabib_2021_ctrl_fb, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(adata_tabib_2021_ctrl_fb, key='batch', max_iter_harmony=50)
sc.pp.neighbors(adata_tabib_2021_ctrl_fb, use_rep='X_pca_harmony', n_neighbors=int(0.5 * len(adata_tabib_2021_ctrl_fb) ** 0.5 // 5), metric='cosine')
tk.tl.triku(adata_tabib_2021_ctrl_fb)

In [None]:
sc.tl.umap(adata_tabib_2021_ctrl_fb, min_dist=0.25, random_state=seed)
sc.tl.leiden(adata_tabib_2021_ctrl_fb, resolution=18, random_state=seed)

In [None]:
assign_cats(adata_tabib_2021_ctrl_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.85, key_added='cluster', others_name='U')
assign_cats(adata_tabib_2021_ctrl_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.35, key_added='axis', 
             intermediate_states=True, diff=0.15, others_name='U')

In [None]:
# One colour per cluster label present in this dataset, in sorted label order;
# labels without an entry in dict_colors fall back to grey.
labels_tabib_2021_ctrl_fb = sorted(set(adata_tabib_2021_ctrl_fb.obs['cluster']))
adata_tabib_2021_ctrl_fb.uns['cluster_colors'] = [dict_colors.get(label, '#bcbcbc')
                                                  for label in labels_tabib_2021_ctrl_fb]

In [None]:
sc.pl.umap(adata_tabib_2021_ctrl_fb, color=['leiden', 'axis', 'cluster', 'batch'], legend_loc='on data', cmap=magma, use_raw=False, ncols=2)

In [None]:
# Remove the 'C3' column from .obs.
# NOTE(review): this column is not created in any cell visible here — presumably assign_cats leaves it
# in .obs, where it would clash with the C3 gene name when colouring the UMAP by gene; confirm.
# Raises KeyError if the column is absent (e.g. when this cell is run twice).
del adata_tabib_2021_ctrl_fb.obs['C3']

In [None]:
# For every cluster signature, plot the assigned clusters next to those signature genes
# that are actually present in this dataset.
for signature_name, signature_genes in dict_cats_clusters.items():
    print(signature_name)
    genes_in_dataset = [gene for gene in signature_genes if gene in adata_tabib_2021_ctrl_fb.var_names]
    sc.pl.umap(adata_tabib_2021_ctrl_fb, color=['cluster'] + genes_in_dataset,
               legend_loc='on data', cmap=magma, use_raw=False, ncols=4)

In [None]:
plot_score_graph(adata_tabib_2021_ctrl_fb)

In [None]:
adata_tabib_2021_ctrl_fb.write_h5ad(tabib_2021_dir + '/adata_tabib_2021_ctrl_fb.h5')

In [None]:
adata_tabib_2021_ctrl_fb = sc.read(tabib_2021_dir + '/adata_tabib_2021_ctrl_fb.h5')

### Tabula Sapiens Consortium 2021 [Not included because they do not yield good quality populations]

In [None]:
tsc_dir = data_dir + '/Tabula_Sapiens_Consortium_2021'

In [None]:
# Read the four loom files and make the gene names unique in each of them.
tsc_loom_files = ['/TSP10_S5.loom', '/TSP10_S6.loom', '/TSP14_S17.loom', '/TSP14_S18.loom']
tsc_loom_adatas = [sc.read(tsc_dir + loom_file) for loom_file in tsc_loom_files]
for tsc_loom_adata in tsc_loom_adatas:
    tsc_loom_adata.var_names_make_unique()

# Bind each sample to the name used by the concatenation step.
adata_tsc_T10_S5, adata_tsc_T10_S6, adata_tsc_T14_S17, adata_tsc_T14_S18 = tsc_loom_adatas

In [None]:
adata_tsc = sc.AnnData.concatenate(adata_tsc_T10_S5, adata_tsc_T10_S6, adata_tsc_T14_S17, adata_tsc_T14_S18, batch_categories=['T10_S5', 'T10_S6', 'T14_S17', 'T14_S18'])

In [None]:
sc.pp.filter_genes(adata_tsc, min_counts=25)

In [None]:
# Replace the gene names that have an entry in dict_rep; all other names are kept as they are.
adata_tsc.var_names = [dict_rep.get(gene_name, gene_name) for gene_name in adata_tsc.var_names]

In [None]:
# Basic QC filtering
adata_tsc.var['mt'] = adata_tsc.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata_tsc, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(adata_tsc, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(adata_tsc, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata_tsc, x='total_counts', y='n_genes_by_counts')

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': adata_tsc.obs['batch'], 'y': adata_tsc.obs['log1p_n_genes_by_counts']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': adata_tsc.obs['batch'], 'y': adata_tsc.obs['pct_counts_mt']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
# Per-batch QC window on log1p(number of detected genes): batch -> (lower, upper), both bounds exclusive.
# Cells whose batch is not listed in the table are discarded.
tsc_gene_bounds = {'T10_S5': (6.4, 8), 'T10_S6': (6.4, 8),
                   'T14_S17': (6.7, 8.1), 'T14_S18': (6.7, 8.1)}

tsc_obs = adata_tsc.obs
tsc_keep = np.zeros(adata_tsc.n_obs, dtype=bool)
for tsc_batch, (tsc_low, tsc_high) in tsc_gene_bounds.items():
    tsc_in_window = (tsc_obs.log1p_n_genes_by_counts > tsc_low) & (tsc_obs.log1p_n_genes_by_counts < tsc_high)
    tsc_keep |= ((tsc_obs.batch == tsc_batch) & tsc_in_window).values

# Additionally require less than 15 % mitochondrial counts.
tsc_keep &= (tsc_obs.pct_counts_mt < 15).values

# Single subsetting step with an explicit copy, so the in-place preprocessing that follows
# operates on an independent AnnData rather than on a view of a view.
adata_tsc = adata_tsc[tsc_keep, :].copy()

In [None]:
sc.pp.filter_genes(adata_tsc, min_counts=1)
sc.pp.normalize_total(adata_tsc)
sc.pp.log1p(adata_tsc)

In [None]:
sc.pp.pca(adata_tsc, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(adata_tsc, key='batch', max_iter_harmony=50)
sc.pp.neighbors(adata_tsc, use_rep='X_pca_harmony', n_neighbors=int(0.5 * len(adata_tsc) ** 0.5), metric='cosine')
tk.tl.triku(adata_tsc)

In [None]:
sc.tl.umap(adata_tsc, min_dist=0.3, random_state=seed)
sc.tl.leiden(adata_tsc, resolution=1.8, random_state=seed)

In [None]:
sc.pl.umap(adata_tsc, color=['leiden', 'LUM', 'PDGFRA', 'COL1A1', 'DCN', 'RGS5'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
assign_cats(adata_tsc, dict_cats=dict_cats_fb, min_score=0.3, quantile_gene_sel=0.2)

In [None]:
sc.pl.umap(adata_tsc, color=['leiden', 'assigned_cats', 'batch'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# Keep only the cells labelled 'fibro'. The explicit copy makes the subset an independent AnnData,
# because the following cells modify it in place (gene filtering, PCA, neighbours, ...).
adata_tsc_fb = adata_tsc[adata_tsc.obs['assigned_cats'] == 'fibro'].copy()

In [None]:
sc.pp.filter_genes(adata_tsc_fb, min_counts=1)

In [None]:
sc.pp.pca(adata_tsc_fb, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(adata_tsc_fb, key='batch', max_iter_harmony=50)
sc.pp.neighbors(adata_tsc_fb, use_rep='X_pca_harmony', n_neighbors=int(0.3 * len(adata_tsc_fb) ** 0.5), metric='cosine')
tk.tl.triku(adata_tsc_fb)

In [None]:
sc.tl.umap(adata_tsc_fb, min_dist=0.25, random_state=seed)
sc.tl.leiden(adata_tsc_fb, resolution=12, random_state=seed)

In [None]:
assign_cats(adata_tsc_fb, dict_cats=dict_cats_clusters, min_score=0.6, quantile_gene_sel=0.99, key_added='cluster', others_name='U')
assign_cats(adata_tsc_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, key_added='axis', 
            intermediate_states=True, diff=0.15, others_name='U')

In [None]:
adata_tsc_fb.uns['cluster_colors'] = [dict_colors[i] if i in dict_colors else '#bcbcbc' for 
                                      i in sorted(set(adata_tsc_fb.obs['cluster']))]

In [None]:
sc.pl.umap(adata_tsc_fb, color=['leiden', 'axis', 'cluster', 'batch'], legend_loc='on data', 
           cmap=magma, use_raw=False, )

In [None]:
# Remove the 'C3' column from .obs.
# NOTE(review): this column is not created in any cell visible here — presumably assign_cats leaves it
# in .obs, where it would clash with the C3 gene name when colouring the UMAP by gene; confirm.
# Raises KeyError if the column is absent (e.g. when this cell is run twice).
del adata_tsc_fb.obs['C3']

In [None]:
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(adata_tsc_fb, color=['cluster'] + [i for i in val if i in adata_tsc_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
plot_score_graph(adata_tsc_fb)

In [None]:
adata_tsc_fb.write_h5ad(tsc_dir + '/adata_tsc_fb.h5')

In [None]:
adata_tsc_fb = sc.read(tsc_dir + '/adata_tsc_fb.h5')

### Theocharidis 2020

In [None]:
theo_dir = data_dir + '/Theocharidis_2020/'

In [None]:
adata_theo_healthy = sc.read(theo_dir + '/adata_theo_healthy.h5')
adata_theo_dm = sc.read(theo_dir + '/adata_theo_DM.h5')

adata_theo = sc.AnnData.concatenate(adata_theo_healthy, adata_theo_dm, batch_key='condition', batch_categories=['healthy', 'DM'])

In [None]:
sc.pp.filter_genes(adata_theo, min_counts=1)

In [None]:
adata_theo.var_names = [dict_rep[i] if i in dict_rep else i for i in adata_theo.var_names ]

In [None]:
# Basic QC filtering
adata_theo.var['mt'] = adata_theo.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata_theo, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
# Store the expression matrix as a dense 2-D numpy array instead of a sparse matrix.
adata_theo.X = adata_theo.X.toarray()

In [None]:
sc.pl.violin(adata_theo, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(adata_theo, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata_theo, x='total_counts', y='n_genes_by_counts')

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': adata_theo.obs['batch'], 'y': adata_theo.obs['log1p_n_genes_by_counts']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': adata_theo.obs['batch'], 'y': adata_theo.obs['pct_counts_mt']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
# Per-batch QC window on log1p(number of detected genes): batch -> (lower, upper), both bounds exclusive.
# Cells whose batch is not listed in the table are discarded.
theo_gene_bounds = {'H1': (6.2, 7.6), 'H2': (6.3, 7.6), 'H3': (6.4, 7.6), 'H4': (6.4, 7.6),
                    'DM2': (6.7, 8), 'DM3': (6.5, 7.4), 'DM4': (6.5, 7.6)}

theo_obs = adata_theo.obs
theo_keep = np.zeros(adata_theo.n_obs, dtype=bool)
for theo_batch, (theo_low, theo_high) in theo_gene_bounds.items():
    theo_in_window = (theo_obs.log1p_n_genes_by_counts > theo_low) & (theo_obs.log1p_n_genes_by_counts < theo_high)
    theo_keep |= ((theo_obs.batch == theo_batch) & theo_in_window).values

# Additionally require less than 12 % mitochondrial counts.
theo_keep &= (theo_obs.pct_counts_mt < 12).values

# Single subsetting step with an explicit copy, so the in-place steps that follow
# operate on an independent AnnData rather than on a view of a view.
adata_theo = adata_theo[theo_keep, :].copy()

In [None]:
# Recompute the QC metrics on the already filtered object (no cells are removed in this cell);
# these are the same two calls that were run before the cell filtering above.
adata_theo.var['mt'] = adata_theo.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata_theo, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pp.filter_genes(adata_theo, min_counts=1)
sc.pp.normalize_total(adata_theo)
sc.pp.log1p(adata_theo)

In [None]:
sc.pp.pca(adata_theo, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(adata_theo, key='batch', max_iter_harmony=50)
sc.pp.neighbors(adata_theo, use_rep='X_pca_harmony',  n_neighbors=int(0.5 * len(adata_theo) ** 0.5 // 4), metric='cosine')
tk.tl.triku(adata_theo)

In [None]:
sc.tl.umap(adata_theo, min_dist=0.3, random_state=seed)
sc.tl.leiden(adata_theo, resolution=3, random_state=seed)

In [None]:
sc.pl.umap(adata_theo, color=['leiden', 'LUM', 'PDGFRA', 'COL1A1', 'DCN', 'RGS5'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
assign_cats(adata_theo, dict_cats=dict_cats_fb, min_score=0.4, quantile_gene_sel=0.5)

In [None]:
sc.pl.umap(adata_theo, color=['leiden', 'batch', 'condition', 'assigned_cats'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# Keep the cells labelled 'fibro', 'unassigned' or 'F'. The explicit copy makes the subset an
# independent AnnData, because the following cells modify it in place.
adata_theo_fb = adata_theo[adata_theo.obs['assigned_cats'].isin(['fibro', 'unassigned', 'F'])].copy()

In [None]:
sc.pp.filter_genes(adata_theo_fb, min_counts=1)

In [None]:
sc.pp.pca(adata_theo_fb, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(adata_theo_fb, key='batch', max_iter_harmony=50)
sc.pp.neighbors(adata_theo_fb, use_rep='X_pca_harmony',  n_neighbors=int(len(adata_theo_fb) ** 0.5 // 4), metric='cosine')
tk.tl.triku(adata_theo_fb)

In [None]:
sc.tl.umap(adata_theo_fb, min_dist=0.15, random_state=seed)
sc.tl.leiden(adata_theo_fb, resolution=7, random_state=seed)

In [None]:
assign_cats(adata_theo_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.8, key_added='cluster', others_name='U')
assign_cats(adata_theo_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, key_added='axis', 
            intermediate_states=True, diff=0.15, others_name='U')

In [None]:
adata_theo_fb.uns['cluster_colors'] = [dict_colors[i] if i in dict_colors else '#bcbcbc' for 
                                      i in sorted(set(adata_theo_fb.obs['cluster']))]

In [None]:
sc.pl.umap(adata_theo_fb, color=['leiden', 'axis', 'batch', 'cluster', 'condition'], legend_loc='on data', 
           cmap=magma, use_raw=False, )

In [None]:
# Remove the 'C3' column from .obs.
# NOTE(review): this column is not created in any cell visible here — presumably assign_cats leaves it
# in .obs, where it would clash with the C3 gene name when colouring the UMAP by gene; confirm.
# Raises KeyError if the column is absent (e.g. when this cell is run twice).
del adata_theo_fb.obs['C3']

In [None]:
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(adata_theo_fb, color=['cluster'] + [i for i in val if i in adata_theo_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
plot_score_graph(adata_theo_fb)

In [None]:
adata_theo_fb.write_h5ad(theo_dir + '/adata_theo_fb.h5')

In [None]:
adata_theo_fb = sc.read(theo_dir + '/adata_theo_fb.h5')

### Theocharidis 2021

The dataset presents two B1 populations, marked respectively by (CXCL8, CA12, COL7A1, CD82, TMEM158, WNT5A, C15orf48, SAT1, MT2A, IER3, PTGS2, SRGN, CXCL1, CXCL3, CCL3, HMGA1, STC1, UPP1, ATP13A3, POU2F2, BCL2A1, F3, S1PR3, PMAIP1, LAMB3, IL1B, MMP9, SLC7A5, CDCP1, EGLN3, GMFG) and by (IGFBP4, A2M, IGF1, FGF7, MEDAG, CHI3L1, SFRP1, IGFBP2, LXN, GPC3, PALMD, ALPL, CXCL2, CCDC69). These two populations could not be replicated in the rest of the datasets, so we do not consider them to be two distinct populations, but rather a possible artifact.

In [None]:
theo_dir_2021 = data_dir + '/Theocharidis_2021/'

In [None]:
# Load the dataset and keep only the cells whose 'Condition' is 'healthy'.
# The explicit copy turns the subset into an independent AnnData, so the in-place steps that follow
# (gene filtering, renaming of var_names, QC annotation) do not operate on a view.
adata_theo_2021 = sc.read(theo_dir_2021 + '/adata_theo_2021.h5')
adata_theo_2021 = adata_theo_2021[adata_theo_2021.obs['Condition'].isin(['healthy',])].copy()

In [None]:
adata_theo_2021.obs['Condition']

In [None]:
sc.pp.filter_genes(adata_theo_2021, min_counts=1)

In [None]:
adata_theo_2021.var_names = [dict_rep[i] if i in dict_rep else i for i in adata_theo_2021.var_names ]

In [None]:
adata_theo_2021

In [None]:
# Basic QC filtering
adata_theo_2021.var['mt'] = adata_theo_2021.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata_theo_2021, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(adata_theo_2021, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(adata_theo_2021, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata_theo_2021, x='total_counts', y='n_genes_by_counts')

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': adata_theo_2021.obs['batch'], 'y': adata_theo_2021.obs['log1p_n_genes_by_counts']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': adata_theo_2021.obs['batch'], 'y': adata_theo_2021.obs['pct_counts_mt']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
# QC on the number of detected genes (exclusive bounds 500 and 2750) and on the
# mitochondrial fraction (less than 15 %), applied as one mask.
theo_2021_obs = adata_theo_2021.obs
theo_2021_keep = ((theo_2021_obs.n_genes_by_counts > 500) &
                  (theo_2021_obs.n_genes_by_counts < 2750) &
                  (theo_2021_obs.pct_counts_mt < 15)).values

# Explicit copy, so the in-place steps that follow operate on an independent AnnData
# rather than on a view of a view.
adata_theo_2021 = adata_theo_2021[theo_2021_keep, :].copy()

In [None]:
# Recompute the QC metrics on the already filtered object (no cells are removed in this cell);
# these are the same two calls that were run before the cell filtering above.
adata_theo_2021.var['mt'] = adata_theo_2021.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata_theo_2021, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pp.filter_genes(adata_theo_2021, min_counts=1)
sc.pp.normalize_total(adata_theo_2021)
sc.pp.log1p(adata_theo_2021)

In [None]:
sc.pp.pca(adata_theo_2021, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(adata_theo_2021, key='batch', max_iter_harmony=50)
sc.pp.neighbors(adata_theo_2021, use_rep='X_pca_harmony',  n_neighbors=int(0.5 * len(adata_theo_2021) ** 0.5 // 4), metric='cosine')
tk.tl.triku(adata_theo_2021)

In [None]:
sc.tl.umap(adata_theo_2021, min_dist=0.3, random_state=seed)
sc.tl.leiden(adata_theo_2021, resolution=3, random_state=seed)

In [None]:
sc.pl.umap(adata_theo_2021, color=['leiden', 'LUM', 'PDGFRA', 'COL1A1', 'DCN', 'RGS5'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
assign_cats(adata_theo_2021, dict_cats=dict_cats_fb, min_score=0.4, quantile_gene_sel=0.5)

In [None]:
sc.pl.umap(adata_theo_2021, color=['leiden', 'batch', 'Condition', 'assigned_cats'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# Keep only the cells labelled 'fibro'. The explicit copy makes the subset an independent AnnData,
# because the following cells modify it in place.
adata_theo_2021_fb = adata_theo_2021[adata_theo_2021.obs['assigned_cats'].isin(['fibro'])].copy()

In [None]:
sc.pp.filter_genes(adata_theo_2021_fb, min_counts=1)

In [None]:
sc.pp.pca(adata_theo_2021_fb, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(adata_theo_2021_fb, key='batch', max_iter_harmony=50)
sc.pp.neighbors(adata_theo_2021_fb, use_rep='X_pca_harmony',  n_neighbors=int(len(adata_theo_2021_fb) ** 0.5 // 4), metric='cosine')
tk.tl.triku(adata_theo_2021_fb)

In [None]:
sc.tl.umap(adata_theo_2021_fb, min_dist=0.15, random_state=seed)
sc.tl.leiden(adata_theo_2021_fb, resolution=4, random_state=seed)

In [None]:
assign_cats(adata_theo_2021_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.75, key_added='cluster', others_name='U')
assign_cats(adata_theo_2021_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, key_added='axis', 
            intermediate_states=True, diff=0.15, others_name='U')

In [None]:
adata_theo_2021_fb.uns['cluster_colors'] = [dict_colors[i] if i in dict_colors else '#bcbcbc' for 
                                      i in sorted(set(adata_theo_2021_fb.obs['cluster']))]

In [None]:
sc.pl.umap(adata_theo_2021_fb, color=['leiden', 'axis', 'batch', 'cluster'], legend_loc='on data', 
           cmap=magma, use_raw=False, )
sc.pl.umap(adata_theo_2021_fb, color=['Condition', 'Sample location'], cmap=magma, use_raw=False, )

In [None]:
# First pass: remove the cells whose cluster label is 'Endo', 'Peri' or 'U' before re-running the
# embedding and the cluster assignment on the remaining cells.
# The explicit copy makes the result an independent AnnData instead of a view of the previous object.
adata_theo_2021_fb = adata_theo_2021_fb[~adata_theo_2021_fb.obs['cluster'].isin(['Endo', 'Peri', 'U'])].copy()

In [None]:
sc.pp.filter_genes(adata_theo_2021_fb, min_counts=1)

In [None]:
sc.pp.pca(adata_theo_2021_fb, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(adata_theo_2021_fb, key='batch', max_iter_harmony=50)
sc.pp.neighbors(adata_theo_2021_fb, use_rep='X_pca_harmony',  n_neighbors=int(len(adata_theo_2021_fb) ** 0.5 // 3), metric='cosine')
tk.tl.triku(adata_theo_2021_fb)

In [None]:
sc.tl.umap(adata_theo_2021_fb, min_dist=0.15, random_state=seed)
sc.tl.leiden(adata_theo_2021_fb, resolution=4, random_state=seed)

In [None]:
assign_cats(adata_theo_2021_fb, dict_cats=dict_cats_clusters, min_score=0.45, quantile_gene_sel=0.65, key_added='cluster', others_name='U')
assign_cats(adata_theo_2021_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, key_added='axis', 
            intermediate_states=True, diff=0.15, others_name='U')

In [None]:
adata_theo_2021_fb.uns['cluster_colors'] = [dict_colors[i] if i in dict_colors else '#bcbcbc' for 
                                      i in sorted(set(adata_theo_2021_fb.obs['cluster']))]

In [None]:
sc.pl.umap(adata_theo_2021_fb, color=['leiden', 'axis', 'batch', 'cluster'], legend_loc='on data', 
           cmap=magma, use_raw=False, )
sc.pl.umap(adata_theo_2021_fb, color=['Condition', 'Sample location'], cmap=magma, use_raw=False, )

In [None]:
# Remove the 'C3' column from .obs.
# NOTE(review): this column is not created in any cell visible here — presumably assign_cats leaves it
# in .obs, where it would clash with the C3 gene name when colouring the UMAP by gene; confirm.
# Raises KeyError if the column is absent (e.g. when this cell is run twice).
del adata_theo_2021_fb.obs['C3']

In [None]:
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(adata_theo_2021_fb, color=['cluster'] + [i for i in val if i in adata_theo_2021_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
plot_score_graph(adata_theo_2021_fb)

In [None]:
adata_theo_2021_fb.write_h5ad(theo_dir_2021 + '/adata_theo_2021_fb.h5')

In [None]:
adata_theo_2021_fb = sc.read(theo_dir_2021 + '/adata_theo_2021_fb.h5')

### Vorstandlechner et al. 2020

In [None]:
vors_dir = data_dir + '/Vorstandlechner_2020'

In [None]:
adata_vors = sc.read(vors_dir + '/skin_vorstandlechner.loom', cache=True)

In [None]:
# The batch label is the second '-'-separated field of each cell name.
adata_vors.obs['batch'] = [obs_name.split('-')[1] for obs_name in adata_vors.obs_names]
# astype returns a new Series, so it has to be assigned back for the column to become categorical.
adata_vors.obs['batch'] = adata_vors.obs['batch'].astype('category')

In [None]:
sc.pp.filter_genes(adata_vors, min_counts=1)

In [None]:
# Basic QC filtering
adata_vors.var['mt'] = adata_vors.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata_vors, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(adata_vors, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(adata_vors, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata_vors, x='total_counts', y='n_genes_by_counts')

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': adata_vors.obs['batch'], 'y': adata_vors.obs['log1p_n_genes_by_counts']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': adata_vors.obs['batch'], 'y': adata_vors.obs['pct_counts_mt']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
# Per-batch QC window on log1p(number of detected genes): batch -> (lower, upper), both bounds exclusive.
# Cells whose batch is not listed in the table are discarded.
vors_gene_bounds = {'1': (5.6, 6.2), '2': (6, 7.4), '3': (5.8, 7.4)}

vors_obs = adata_vors.obs
vors_keep = np.zeros(adata_vors.n_obs, dtype=bool)
for vors_batch, (vors_low, vors_high) in vors_gene_bounds.items():
    vors_in_window = (vors_obs.log1p_n_genes_by_counts > vors_low) & (vors_obs.log1p_n_genes_by_counts < vors_high)
    vors_keep |= ((vors_obs.batch == vors_batch) & vors_in_window).values

# Additionally require less than 10 % mitochondrial counts.
vors_keep &= (vors_obs.pct_counts_mt < 10).values

# Single subsetting step with an explicit copy, so the in-place preprocessing that follows
# operates on an independent AnnData rather than on a view of a view.
adata_vors = adata_vors[vors_keep, :].copy()

In [None]:
sc.pp.filter_genes(adata_vors, min_counts=1)
sc.pp.normalize_total(adata_vors)
sc.pp.log1p(adata_vors)

In [None]:
sc.pp.pca(adata_vors, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(adata_vors, key='batch', max_iter_harmony=50)
sc.pp.neighbors(adata_vors, use_rep='X_pca_harmony',  n_neighbors=int(0.5 * len(adata_vors) ** 0.5), metric='cosine')
tk.tl.triku(adata_vors)

In [None]:
sc.tl.umap(adata_vors, min_dist=0.6, random_state=seed)
sc.tl.leiden(adata_vors, resolution=3, random_state=seed)

In [None]:
assign_cats(adata_vors, dict_cats=dict_cats_fb, min_score=0.5, quantile_gene_sel=0.2)

In [None]:
sc.pl.umap(adata_vors, color=['PDGFRA', 'LUM', 'DCN', 'COL1A1'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
sc.pl.umap(adata_vors, color=['assigned_cats', 'leiden', 'batch'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# Keep only the cells labelled 'fibro'. The explicit copy makes the subset an independent AnnData,
# because the following cells modify it in place.
adata_vors_fb = adata_vors[adata_vors.obs['assigned_cats'].isin(['fibro'])].copy()
# Snapshot of the fibroblast subset taken before its gene filtering and re-embedding.
# Note that X has already been normalised and log-transformed at this point, despite the '_raw' suffix.
adata_vors_fb_raw = adata_vors_fb.copy()

In [None]:
sc.pp.filter_genes(adata_vors_fb, min_counts=1)

In [None]:
sc.pp.pca(adata_vors_fb, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(adata_vors_fb, key='batch', max_iter_harmony=50)
sc.pp.neighbors(adata_vors_fb, use_rep='X_pca_harmony',  n_neighbors=int(len(adata_vors_fb) ** 0.5 // 5), metric='cosine')
tk.tl.triku(adata_vors_fb)

In [None]:
sc.tl.umap(adata_vors_fb, min_dist=0.2, random_state=seed)
sc.tl.leiden(adata_vors_fb, resolution=3, random_state=seed)

In [None]:
assign_cats(adata_vors_fb, dict_cats=dict_cats_clusters, min_score=0.45, quantile_gene_sel=0.7, key_added='cluster', others_name='U')
assign_cats(adata_vors_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, key_added='axis', 
            intermediate_states=True, diff=0.15, others_name='U')

In [None]:
adata_vors_fb.uns['cluster_colors'] = [dict_colors[i] if i in dict_colors else '#bcbcbc' for 
                                      i in sorted(set(adata_vors_fb.obs['cluster']))]

In [None]:
sc.pl.umap(adata_vors_fb, color=['axis', 'cluster', 'batch'], legend_loc='on data', cmap=magma, use_raw=False, )

In [None]:
# Remove the 'C3' column from .obs.
# NOTE(review): this column is not created in any cell visible here — presumably assign_cats leaves it
# in .obs, where it would clash with the C3 gene name when colouring the UMAP by gene; confirm.
# Raises KeyError if the column is absent (e.g. when this cell is run twice).
del adata_vors_fb.obs['C3']

In [None]:
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(adata_vors_fb, color=['cluster'] + [i for i in val if i in adata_vors_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
plot_score_graph(adata_vors_fb)

In [None]:
adata_vors_fb.write_h5ad(vors_dir + '/adata_vors_fb_2020.h5')

In [None]:
adata_vors_fb = sc.read(vors_dir + '/adata_vors_fb_2020.h5')

## Presence of clusters for each dataset
In this representation we exclude Kim 2021: being a nail dataset, it is interesting to study in a different context, but not here.

In [None]:
from fb_functions import plot_adata_cluster_properties

In [None]:
# Each entry maps the variable name of a dataset to a list with four elements:
# [name of the source, status (Healthy, Young, Scar, ...), year, AnnData object with the fibroblast subset].
# NOTE(review): 'Theocarditis' looks like a misspelling of Theocharidis (cf. the Theocharidis_2020 and
# Theocharidis_2021 data directories); the strings are left unchanged because they may be used as labels
# downstream. Tabula Sapiens is listed here although its section heading says it is not included — confirm.
dict_datasets = {'adata_deng_scar_fb':           ['Deng', 'Scar', '2021', adata_deng_scar_fb],
                 'adata_gao_fb':                 ['Gao', 'Healthy', '2021', adata_gao_fb],
                 'adata_gaydosik_HC_fb':         ['Gaydosik', 'Healthy', '2020', adata_gaydosik_HC_fb],
                 'adata_he_fb':                  ['He', 'Healthy', '2020', adata_he_fb],
                 'adata_hughes_fb':              ['Hughes', 'Healthy', '2020', adata_hughes_fb],
                 'adata_kim_2020_fb':            ['Kim', 'Healthy', '2020', adata_kim_2020_fb],
                 'adata_liu_ctrl_fb':            ['Liu', 'Healthy', '2021', adata_liu_ctrl_fb],
                 'adata_mirizio_fb':             ['Mirizio', 'Healthy', '2020', adata_mirizio_fb],
                 'adata_reynolds_healthy_fb':    ['Reynolds', 'Healthy', '2021', adata_reynolds_healthy_fb],
                 'adata_sole_young_fb':          ['Solé-Boldo', 'Young', '2020', adata_sole_young_fb],
                 'adata_tabib_fb':               ['Tabib', 'Healthy', '2018', adata_tabib_fb],
                 'adata_tabib_2021_ctrl_fb':     ['Tabib', 'Healthy', '2021', adata_tabib_2021_ctrl_fb],
                 'adata_tsc_fb':                 ['Tabula Sapiens', 'Healthy', '2021', adata_tsc_fb],
                 'adata_theo_fb':                ['Theocarditis', 'Healthy', '2020', adata_theo_fb],
                 'adata_theo_2021_fb':           ['Theocarditis', 'Healthy', '2021', adata_theo_2021_fb], 
                 'adata_vors_fb':                ['Vorstandlechner', 'Healthy', '2020', adata_vors_fb],}

In [None]:
plot_adata_cluster_properties(dict_cats_clusters=dict_cats_clusters, dict_datasets=dict_datasets, what='presence', cluster_name='cluster', axis_name='axis')

In [None]:
plot_adata_cluster_properties(dict_cats_clusters=dict_cats_clusters, dict_datasets=dict_datasets, what='score', cluster_name='cluster', axis_name='axis')

In [None]:
plot_adata_cluster_properties(dict_cats_clusters=dict_cats_clusters, dict_datasets=dict_datasets, what='percentage', cluster_name='cluster', axis_name='axis')

In [None]:
plot_adata_cluster_properties(dict_cats_clusters=dict_cats_clusters, dict_datasets=dict_datasets, what='axis', cluster_name='cluster', axis_name='axis')