# ANALYSIS OF NON-SKIN FIBROBLAST DATASETS

In this analysis we are not going to go deep into the fibroblast characterization; rather, we want to see how the distinct populations from human skin overlap with the fibroblast populations from other organs. 

**YOU NEED TO RUN NOTEBOOK 4H FIRST**

## imports

In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are reloaded
# before each cell is executed.
%load_ext autoreload
%autoreload 2

In [None]:
import scanpy as sc
import scanpy.external as sce
import pandas as pd
import numpy as np
import os
import triku as tk
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from tqdm.notebook import tqdm
import scipy.sparse as spr
import matplotlib.cm as cm
import networkx as nx
from matplotlib import pylab

In [None]:
!pip install cellassign

In [None]:
# local imports and imports from other notebooks
from cellassign import assign_cats
from fb_functions import make_gene_scoring_with_expr, plot_score_graph, plot_UMAPS_gene, plot_adata_cluster_properties
%store -r seed
%store -r magma
%store -r data_dir

In [None]:
%store -r dict_make_gene_scoring_robust
%store -r dict_make_gene_scoring_axis_robust

In [None]:
%store -r plot_params

pylab.rcParams.update(plot_params)
pd.set_option('display.max_columns', None)
pd.options.display.float_format = "{:,.2f}".format

**IMPORTANT: I am running this analysis on a computer with ~500 GB of RAM. I will load many datasets at once, which might be too much for some computers. I took this decision consciously, to have as much info available at any time as possible. If you cannot run all the analysis at once, you can run it by parts.**

In [None]:
# For every group, keep the first 30 index entries of its scoring table as the marker
# list (the entries are later matched against var_names, i.e. they are gene names).
dict_cats_clusters_robust = {
    cluster_name: np.array(scoring_table.index[:30])
    for cluster_name, scoring_table in dict_make_gene_scoring_robust.items()
}
dict_cats_axes_robust = {
    axis_name: np.array(scoring_table.index[:30])
    for axis_name, scoring_table in dict_make_gene_scoring_axis_robust.items()
}

## Kuppe et al. 2021 [Kidney]

In [None]:
kuppe_2021_dir = data_dir + '/kuppe_2021'
kuppe_2021_kidney_ctrl = sc.read(kuppe_2021_dir + '/kupper_2021_kidney_pdgfrb_ctrl.h5')

In [None]:
kuppe_2021_kidney_ctrl.var_names

In [None]:
# Basic QC filtering
kuppe_2021_kidney_ctrl.var['mt'] = kuppe_2021_kidney_ctrl.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(kuppe_2021_kidney_ctrl, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(kuppe_2021_kidney_ctrl, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(kuppe_2021_kidney_ctrl, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(kuppe_2021_kidney_ctrl, x='total_counts', y='n_genes_by_counts')

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': kuppe_2021_kidney_ctrl.obs['Internal sample identifier'], 'y': kuppe_2021_kidney_ctrl.obs['log1p_n_genes_by_counts']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
# Per-sample QC: keep a cell only when log1p(n_genes_by_counts) lies strictly inside the
# (lower, upper) window of its sample. Cells from samples not listed here are dropped.
# The windows are kept in a single table so every threshold appears exactly once, and
# each per-sample mask is converted to a plain boolean array before being combined (the
# previous expression called `.values` on only part of the chain, mixing an ndarray with
# pandas Series).
kuppe_2021_kidney_gene_windows = {
    'Pb1': (7.3, 8.2),
    'Pb2': (7.3, 8.2),
    'Pb4': (7.8, 8.4),
    'Pb5': (8, 8.6),
    'Pb6': (8, 8.6),
    'Pb8': (7.5, 8.2),
    'Pb9': (7.5, 8.2),
}
kidney_sample_ids = kuppe_2021_kidney_ctrl.obs['Internal sample identifier']
kidney_log_genes = kuppe_2021_kidney_ctrl.obs['log1p_n_genes_by_counts']
kidney_keep = np.logical_or.reduce([
    ((kidney_sample_ids == sample) & (kidney_log_genes > lower) & (kidney_log_genes < upper)).values
    for sample, (lower, upper) in kuppe_2021_kidney_gene_windows.items()
])
kuppe_2021_kidney_ctrl = kuppe_2021_kidney_ctrl[kidney_keep, :]

# Additionally keep only cells with less than 15% mitochondrial counts.
kuppe_2021_kidney_ctrl = kuppe_2021_kidney_ctrl[kuppe_2021_kidney_ctrl.obs.pct_counts_mt < 15, :]

In [None]:
sc.pp.filter_genes(kuppe_2021_kidney_ctrl, min_counts=1)
sc.pp.normalize_total(kuppe_2021_kidney_ctrl)
sc.pp.log1p(kuppe_2021_kidney_ctrl)

In [None]:
sc.pp.pca(kuppe_2021_kidney_ctrl, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(kuppe_2021_kidney_ctrl, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(kuppe_2021_kidney_ctrl, use_rep='X_pca_harmony',  n_neighbors=int(0.5 * len(kuppe_2021_kidney_ctrl) ** 0.5 // 2), metric='cosine')
tk.tl.triku(kuppe_2021_kidney_ctrl)

In [None]:
sc.tl.umap(kuppe_2021_kidney_ctrl, min_dist=0.2, random_state=seed)
sc.tl.leiden(kuppe_2021_kidney_ctrl, resolution=1, random_state=seed)

In [None]:
sc.pp.subsample(kuppe_2021_kidney_ctrl, fraction=1, random_state=0, copy=False)
sc.pl.umap(kuppe_2021_kidney_ctrl, color=['leiden', 'Internal sample identifier'], legend_loc='on data')

In [None]:
sc.pl.umap(kuppe_2021_kidney_ctrl, color=['leiden', 'LUM', 'PDGFRA', 'COL1A1', 'DCN', 'RGS5', 'MYH11', 'MLANA', 'PMEL'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
sc.pl.umap(kuppe_2021_kidney_ctrl, color=['leiden', 'Annotation.Level.1', 'Annotation.Level.2', 'Annotation.Level.3'], legend_loc='on data', ncols=2)

In [None]:
sc.pl.umap(kuppe_2021_kidney_ctrl, color=['leiden', 'COL18A1', 'APCDD1', 'SLPI', 'CCL19'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
kuppe_2021_kidney_ctrl_fb = kuppe_2021_kidney_ctrl[kuppe_2021_kidney_ctrl.obs['Annotation.Level.2'].isin(['Myofibroblasts', 'Fibroblasts'])].copy()

In [None]:
sc.pp.filter_genes(kuppe_2021_kidney_ctrl_fb, min_counts=1)

In [None]:
sc.pp.pca(kuppe_2021_kidney_ctrl_fb, random_state=seed, n_comps=50)
sce.pp.harmony_integrate(kuppe_2021_kidney_ctrl_fb, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(kuppe_2021_kidney_ctrl_fb, use_rep='X_pca_harmony',  n_neighbors=int(0.1 * len(kuppe_2021_kidney_ctrl_fb) ** 0.5), metric='cosine')
tk.tl.triku(kuppe_2021_kidney_ctrl_fb)

In [None]:
sc.tl.umap(kuppe_2021_kidney_ctrl_fb, min_dist=0.2, random_state=seed)
sc.tl.leiden(kuppe_2021_kidney_ctrl_fb, resolution=1, random_state=seed)

In [None]:
sc.pl.umap(kuppe_2021_kidney_ctrl_fb, color=['Internal sample identifier', 'Annotation.Level.3', 'leiden'], legend_loc='on data', cmap=magma, use_raw=False, ncols=2)

In [None]:
sc.tl.rank_genes_groups(kuppe_2021_kidney_ctrl_fb, groupby='leiden')
sc.tl.dendrogram(kuppe_2021_kidney_ctrl_fb, groupby='leiden')
sc.pl.rank_genes_groups_tracksplot(kuppe_2021_kidney_ctrl_fb, dendrogram=True, n_genes=35)

In [None]:
# Transfer the robust skin-fibroblast cluster and axis labels onto the kidney fibroblasts.
# Nothing earlier in this section creates `cluster_robust` / `axis_robust`, so on a freshly
# loaded dataset the lines below failed with a KeyError. The guards leave labels that
# already exist untouched.
# NOTE(review): the thresholds are copied from the intestine and liver sections of this
# notebook; confirm that they are appropriate for the kidney data.
if 'cluster_robust' not in kuppe_2021_kidney_ctrl_fb.obs:
    assign_cats(kuppe_2021_kidney_ctrl_fb, dict_cats=dict_cats_clusters_robust, min_score=0.5,
                quantile_gene_sel=0.95, key_added='cluster_robust', others_name='U')
if 'axis_robust' not in kuppe_2021_kidney_ctrl_fb.obs:
    assign_cats(kuppe_2021_kidney_ctrl_fb, column_groupby='cluster_robust', dict_cats=dict_cats_axes_robust,
                min_score=0.5, quantile_gene_sel=0.9, key_added='axis_robust',
                intermediate_states=True, diff=0.15, others_name='U')

# One colour per cluster_robust category (sorted); categories without an entry in
# `dict_colors` are drawn in grey.
# NOTE(review): `dict_colors` is not defined in the visible part of this notebook — it
# must already exist in the session.
kuppe_2021_kidney_ctrl_fb.uns['cluster_robust_colors'] = [dict_colors[i] if i in dict_colors else '#bcbcbc' for i in sorted(set(kuppe_2021_kidney_ctrl_fb.obs['cluster_robust']))]
sc.pl.umap(kuppe_2021_kidney_ctrl_fb, color=['Internal sample identifier', 'leiden', 'axis_robust', 'cluster_robust'], legend_loc='on data', cmap=magma, use_raw=False, ncols=2)

In [None]:
del kuppe_2021_kidney_ctrl_fb.obs['C3']

In [None]:
for key, val in dict_cats_clusters_robust.items():
    print(key)
    sc.pl.umap(kuppe_2021_kidney_ctrl_fb, color=['cluster_robust'] + [i for i in val if i in kuppe_2021_kidney_ctrl_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
plot_score_graph(kuppe_2021_kidney_ctrl_fb, cluster_column='cluster_robust')

In [None]:
# NOTE(review): `clear_adata` is neither imported nor defined in the visible part of this
# notebook; confirm where it comes from (a NameError is raised here otherwise).
clear_adata(kuppe_2021_kidney_ctrl_fb)
# Write the fibroblast subset and the full kidney dataset to the Kuppe 2021 data folder.
kuppe_2021_kidney_ctrl_fb.write_h5ad(kuppe_2021_dir + '/kuppe_2021_kidney_ctrl_fb_processed.h5')
kuppe_2021_kidney_ctrl.write_h5ad(kuppe_2021_dir + '/kuppe_2021_kidney_ctrl.h5')

In [None]:
# Reload the processed kidney fibroblasts from disk. The result used to be bound to
# `liu_2021_ctrl_fb`, a name that belongs to a different dataset and that nothing else in
# the visible notebook reads; bind it to the kidney variable instead.
kuppe_2021_kidney_ctrl_fb = sc.read(kuppe_2021_dir + '/kuppe_2021_kidney_ctrl_fb_processed.h5')

## Travaglini et al. 2020 [Lung]

In [None]:
travaglini_2020_dir = data_dir + '/travaglini_2020'
travaglini_2020_lung_ctrl = sc.read(travaglini_2020_dir + '/travaglini_2020_lung_ctrl.h5')

In [None]:
# Basic QC filtering
travaglini_2020_lung_ctrl.var['mt'] = travaglini_2020_lung_ctrl.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(travaglini_2020_lung_ctrl, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(travaglini_2020_lung_ctrl, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(travaglini_2020_lung_ctrl, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(travaglini_2020_lung_ctrl, x='total_counts', y='n_genes_by_counts')

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': travaglini_2020_lung_ctrl.obs['Internal sample identifier'], 'y': travaglini_2020_lung_ctrl.obs['log1p_n_genes_by_counts']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
# Keep cells from samples P1, P2 and P3 whose log1p(n_genes_by_counts) lies strictly
# between 6.8 and 8.5 (all three samples share the same window).
lung_sample_ids = travaglini_2020_lung_ctrl.obs['Internal sample identifier']
lung_log_genes = travaglini_2020_lung_ctrl.obs['log1p_n_genes_by_counts']
lung_keep = (
    lung_sample_ids.isin(['P1', 'P2', 'P3'])
    & (lung_log_genes > 6.8)
    & (lung_log_genes < 8.5)
)
travaglini_2020_lung_ctrl = travaglini_2020_lung_ctrl[lung_keep, :]

In [None]:
sc.pp.filter_genes(travaglini_2020_lung_ctrl, min_counts=1)
sc.pp.normalize_total(travaglini_2020_lung_ctrl)
sc.pp.log1p(travaglini_2020_lung_ctrl)

In [None]:
sc.pp.pca(travaglini_2020_lung_ctrl, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(travaglini_2020_lung_ctrl, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(travaglini_2020_lung_ctrl, use_rep='X_pca_harmony',  n_neighbors=int(0.5 * len(travaglini_2020_lung_ctrl) ** 0.5 // 2), metric='cosine')
tk.tl.triku(travaglini_2020_lung_ctrl)

In [None]:
sc.tl.umap(travaglini_2020_lung_ctrl, min_dist=0.2, random_state=seed)
sc.tl.leiden(travaglini_2020_lung_ctrl, resolution=1, random_state=seed)

In [None]:
sc.pp.subsample(travaglini_2020_lung_ctrl, fraction=1, random_state=0, copy=False)
sc.pl.umap(travaglini_2020_lung_ctrl, color=['leiden', 'Internal sample identifier'], legend_loc='on data')

In [None]:
sc.pl.umap(travaglini_2020_lung_ctrl, color=['leiden', 'LUM', 'PDGFRA', 'DCN', 'ELN',
                                             'RGS5', 'MYH11', 'NDUFA4L2', 
                                             'PECAM1', 'CLDN5', 'VWF', 
                                             'EPCAM', 'PTPRC'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
dict_cats_fb = {'fibro': ['LUM', 'PDGFRA', 'DCN'], 
                'endo': ['PECAM1', 'CLDN5', 'VWF'],
                'peri': ['RGS5', 'MYH11', 'NDUFA4L2'],
                'epithelial': ['EPCAM', ], 
                'immune': ['PTPRC']
               }

In [None]:
assign_cats(travaglini_2020_lung_ctrl, dict_cats=dict_cats_fb, min_score=0.6, quantile_gene_sel=0.999)

In [None]:
sc.pl.umap(travaglini_2020_lung_ctrl, color=['leiden', 'assigned_cats',], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
travaglini_2020_lung_ctrl_fb = travaglini_2020_lung_ctrl[travaglini_2020_lung_ctrl.obs['assigned_cats'].isin(['fibro'])].copy()

In [None]:
sc.pp.filter_genes(travaglini_2020_lung_ctrl_fb, min_counts=1)

In [None]:
sc.pp.pca(travaglini_2020_lung_ctrl_fb, random_state=seed, n_comps=50)
sce.pp.harmony_integrate(travaglini_2020_lung_ctrl_fb, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(travaglini_2020_lung_ctrl_fb, use_rep='X_pca_harmony',  n_neighbors=int(0.3 * len(travaglini_2020_lung_ctrl_fb) ** 0.5), metric='cosine')
tk.tl.triku(travaglini_2020_lung_ctrl_fb)

In [None]:
sc.tl.umap(travaglini_2020_lung_ctrl_fb, min_dist=0.2, random_state=seed)
sc.tl.leiden(travaglini_2020_lung_ctrl_fb, resolution=1, random_state=seed)

In [None]:
sc.pl.umap(travaglini_2020_lung_ctrl_fb, color=['Internal sample identifier', 'leiden', 
                                                'PDGFRA', 'BGN', 
                                                'SFRP2', 'PDLIM4', 'PDGFRL', 'IGFBP4', 
                                                'DKK3', 'SPINT2', 'FGFR4', 'GPM6B'], legend_loc='on data', cmap=magma, use_raw=False, ncols=2)

In [None]:
assign_cats(travaglini_2020_lung_ctrl_fb, dict_cats=dict_cats_clusters_robust, min_score=0.4, quantile_gene_sel=0.6, key_added='cluster_robust', others_name='U', verbose=False)
assign_cats(travaglini_2020_lung_ctrl_fb, column_groupby='cluster_robust', dict_cats=dict_cats_axes_robust, min_score=0.4, key_added='axis_robust', 
            quantile_gene_sel=0.95, intermediate_states=True, diff=0.05, others_name='U', verbose=False)

travaglini_2020_lung_ctrl_fb.uns['cluster_robust_colors'] = [dict_colors[i] if i in dict_colors else '#bcbcbc' for i in sorted(set(travaglini_2020_lung_ctrl_fb.obs['cluster_robust']))]
sc.pl.umap(travaglini_2020_lung_ctrl_fb, color=['Internal sample identifier', 'leiden', 'axis_robust', 'cluster_robust'], legend_loc='on data', cmap=magma, use_raw=False, ncols=2)

In [None]:
del travaglini_2020_lung_ctrl_fb.obs['C3']

In [None]:
for key, val in dict_cats_clusters_robust.items():
    print(key)
    sc.pl.umap(travaglini_2020_lung_ctrl_fb, color=['cluster_robust'] + [i for i in val if i in travaglini_2020_lung_ctrl_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
sc.tl.rank_genes_groups(travaglini_2020_lung_ctrl_fb, groupby='leiden')
sc.tl.dendrogram(travaglini_2020_lung_ctrl_fb, groupby='leiden')
sc.pl.rank_genes_groups_tracksplot(travaglini_2020_lung_ctrl_fb, dendrogram=True, n_genes=35)

## Litviňuková et al. 2020 [Heart]

In [None]:
litvinukova_2020_dir = data_dir + '/litvinukova_2020'
litvinukova_2020_heart_ctrl = sc.read(litvinukova_2020_dir + '/litvinukova_2020_heart_ctrl.h5')

In [None]:
# Basic QC filtering
litvinukova_2020_heart_ctrl.var['mt'] = litvinukova_2020_heart_ctrl.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(litvinukova_2020_heart_ctrl, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(litvinukova_2020_heart_ctrl, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(litvinukova_2020_heart_ctrl, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(litvinukova_2020_heart_ctrl, x='total_counts', y='n_genes_by_counts')

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': litvinukova_2020_heart_ctrl.obs['Internal sample identifier'], 'y': litvinukova_2020_heart_ctrl.obs['log1p_n_genes_by_counts']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
# Per-sample QC on log1p(n_genes_by_counts). The samples fall into two groups that share
# a window (bounds are exclusive); cells from any other sample are dropped.
heart_sample_ids = litvinukova_2020_heart_ctrl.obs['Internal sample identifier']
heart_log_genes = litvinukova_2020_heart_ctrl.obs['log1p_n_genes_by_counts']
heart_keep_low_window = (
    heart_sample_ids.isin(['D1', 'D4', 'D5'])
    & (heart_log_genes > 6.3)
    & (heart_log_genes < 7.1)
)
heart_keep_high_window = (
    heart_sample_ids.isin(['D3', 'D6', 'D7', 'D11'])
    & (heart_log_genes > 7)
    & (heart_log_genes < 7.7)
)
litvinukova_2020_heart_ctrl = litvinukova_2020_heart_ctrl[heart_keep_low_window | heart_keep_high_window, :]

In [None]:
sc.pp.filter_genes(litvinukova_2020_heart_ctrl, min_counts=1)
sc.pp.normalize_total(litvinukova_2020_heart_ctrl)
sc.pp.log1p(litvinukova_2020_heart_ctrl)

In [None]:
sc.pp.pca(litvinukova_2020_heart_ctrl, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(litvinukova_2020_heart_ctrl, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(litvinukova_2020_heart_ctrl, use_rep='X_pca_harmony',  n_neighbors=int(0.5 * len(litvinukova_2020_heart_ctrl) ** 0.5 // 2), metric='cosine')
tk.tl.triku(litvinukova_2020_heart_ctrl)

In [None]:
sc.tl.umap(litvinukova_2020_heart_ctrl, min_dist=0.2, random_state=seed)
sc.tl.leiden(litvinukova_2020_heart_ctrl, resolution=1, random_state=seed)

In [None]:
sc.pp.subsample(litvinukova_2020_heart_ctrl, fraction=1, random_state=0, copy=False)
sc.pl.umap(litvinukova_2020_heart_ctrl, color=['leiden', 'Internal sample identifier'], legend_loc='on data')

In [None]:
sc.pl.umap(litvinukova_2020_heart_ctrl, color=['leiden', 'LUM', 'PDGFRA', 'DCN', 'ELN',
                                             'RGS5', 'MYH11', 'NDUFA4L2', 
                                             'PECAM1', 'CLDN5', 'VWF', 
                                             'EPCAM', 'PTPRC'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
dict_cats_fb = {'fibro': ['LUM', 'PDGFRA', 'DCN'], 
                'endo': ['PECAM1', 'CLDN5', 'VWF'],
                'peri': ['RGS5', 'MYH11', 'NDUFA4L2'],
                'epithelial': ['EPCAM', ], 
                'immune': ['PTPRC']
               }

In [None]:
assign_cats(litvinukova_2020_heart_ctrl, dict_cats=dict_cats_fb, min_score=0.6, quantile_gene_sel=0.999)

In [None]:
sc.pl.umap(litvinukova_2020_heart_ctrl, color=['leiden', 'assigned_cats',], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
litvinukova_2020_heart_ctrl_fb = litvinukova_2020_heart_ctrl[litvinukova_2020_heart_ctrl.obs['assigned_cats'].isin(['fibro'])].copy()

In [None]:
sc.pp.filter_genes(litvinukova_2020_heart_ctrl_fb, min_counts=1)

In [None]:
sc.pp.pca(litvinukova_2020_heart_ctrl_fb, random_state=seed, n_comps=50)
sce.pp.harmony_integrate(litvinukova_2020_heart_ctrl_fb, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(litvinukova_2020_heart_ctrl_fb, use_rep='X_pca_harmony',  n_neighbors=int(0.3 * len(litvinukova_2020_heart_ctrl_fb) ** 0.5), metric='cosine')
tk.tl.triku(litvinukova_2020_heart_ctrl_fb)

In [None]:
sc.tl.umap(litvinukova_2020_heart_ctrl_fb, min_dist=0.6, random_state=seed)
sc.tl.leiden(litvinukova_2020_heart_ctrl_fb, resolution=0.5, random_state=seed)

In [None]:
sc.pl.umap(litvinukova_2020_heart_ctrl_fb, color=['Internal sample identifier', 'leiden', 
                                               ], legend_loc='on data', cmap=magma, use_raw=False, ncols=2)

## Elmentaite et al. 2021 [Intestinal tract]

In [None]:
elmentaite_2021_dir = data_dir + '/elmentaite_2021'
elmentaite_2021_intestine_ctrl = sc.read(elmentaite_2021_dir + '/elmentaite_2021_intestine_ctrl.h5')

In [None]:
# Basic QC filtering
elmentaite_2021_intestine_ctrl.var['mt'] = elmentaite_2021_intestine_ctrl.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(elmentaite_2021_intestine_ctrl, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(elmentaite_2021_intestine_ctrl, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(elmentaite_2021_intestine_ctrl, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(elmentaite_2021_intestine_ctrl, x='total_counts', y='n_genes_by_counts')

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': elmentaite_2021_intestine_ctrl.obs['Internal sample identifier'], 'y': elmentaite_2021_intestine_ctrl.obs['log1p_n_genes_by_counts']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
# Keep cells with strictly between 1000 and 3000 detected genes and less than 8%
# mitochondrial counts (the same thresholds for every sample).
intestine_obs = elmentaite_2021_intestine_ctrl.obs
intestine_keep = (
    (intestine_obs['n_genes_by_counts'] > 1000)
    & (intestine_obs['n_genes_by_counts'] < 3000)
    & (intestine_obs['pct_counts_mt'] < 8)
)
elmentaite_2021_intestine_ctrl = elmentaite_2021_intestine_ctrl[intestine_keep, :]

In [None]:
sc.pp.filter_genes(elmentaite_2021_intestine_ctrl, min_counts=1)
sc.pp.normalize_total(elmentaite_2021_intestine_ctrl)
sc.pp.log1p(elmentaite_2021_intestine_ctrl)

In [None]:
sc.pp.pca(elmentaite_2021_intestine_ctrl, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(elmentaite_2021_intestine_ctrl, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(elmentaite_2021_intestine_ctrl, use_rep='X_pca_harmony',  n_neighbors=int(0.5 * len(elmentaite_2021_intestine_ctrl) ** 0.5 // 2), metric='cosine')
tk.tl.triku(elmentaite_2021_intestine_ctrl)

In [None]:
sc.tl.umap(elmentaite_2021_intestine_ctrl, min_dist=0.2, random_state=seed)
sc.tl.leiden(elmentaite_2021_intestine_ctrl, resolution=1, random_state=seed)

In [None]:
sc.pp.subsample(elmentaite_2021_intestine_ctrl, fraction=1, random_state=0, copy=False)
sc.pl.umap(elmentaite_2021_intestine_ctrl, color=['leiden', 'Internal sample identifier'], legend_loc='on data')

In [None]:
sc.pl.umap(elmentaite_2021_intestine_ctrl, color=['leiden', 'LUM', 'PDGFRA', 'DCN', 'ELN',
                                             'RGS5', 'MYH11', 'NDUFA4L2', 
                                             'PECAM1', 'CLDN5', 'VWF', 
                                             'EPCAM', 'PTPRC'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
dict_cats_fb = {'fibro': ['LUM', 'PDGFRA', 'DCN'], 
                'endo': ['PECAM1', 'CLDN5', 'VWF'],
                'peri': ['RGS5', 'MYH11', 'NDUFA4L2'],
                'epithelial': ['EPCAM', ], 
                'immune': ['PTPRC']
               }

In [None]:
assign_cats(elmentaite_2021_intestine_ctrl, dict_cats=dict_cats_fb, min_score=0.6, quantile_gene_sel=0.999)

In [None]:
sc.pl.umap(elmentaite_2021_intestine_ctrl, color=['leiden', 'assigned_cats',], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
elmentaite_2021_intestine_ctrl_fb = elmentaite_2021_intestine_ctrl[elmentaite_2021_intestine_ctrl.obs['assigned_cats'].isin(['fibro'])].copy()

In [None]:
sc.pp.filter_genes(elmentaite_2021_intestine_ctrl_fb, min_counts=1)

In [None]:
sc.pp.pca(elmentaite_2021_intestine_ctrl_fb, random_state=seed, n_comps=50)
sce.pp.harmony_integrate(elmentaite_2021_intestine_ctrl_fb, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(elmentaite_2021_intestine_ctrl_fb, use_rep='X_pca_harmony',  n_neighbors=int(0.3 * len(elmentaite_2021_intestine_ctrl_fb) ** 0.5), metric='cosine')
tk.tl.triku(elmentaite_2021_intestine_ctrl_fb)

In [None]:
sc.tl.umap(elmentaite_2021_intestine_ctrl_fb, min_dist=0.6, random_state=seed)
sc.tl.leiden(elmentaite_2021_intestine_ctrl_fb, resolution=0.5, random_state=seed)

In [None]:
sc.pl.umap(elmentaite_2021_intestine_ctrl_fb, color=['Internal sample identifier', 'leiden', 
                                               ], legend_loc='on data', cmap=magma, use_raw=False, ncols=2)

In [None]:
sc.pl.umap(elmentaite_2021_intestine_ctrl_fb, color=['Internal sample identifier', 'leiden', 
                                               ], legend_loc='on data', cmap=magma, use_raw=False, ncols=2)

In [None]:
assign_cats(elmentaite_2021_intestine_ctrl_fb, dict_cats=dict_cats_clusters_robust, min_score=0.5, quantile_gene_sel=0.95, key_added='cluster_robust', others_name='U')
assign_cats(elmentaite_2021_intestine_ctrl_fb, column_groupby='cluster_robust', dict_cats=dict_cats_axes_robust, min_score=0.5, quantile_gene_sel=0.9,  key_added='axis_robust', 
             intermediate_states=True, diff=0.15, others_name='U')

In [None]:
sc.pl.umap(elmentaite_2021_intestine_ctrl_fb, color=['Internal sample identifier', 'leiden', 'axis_robust', 'cluster_robust'], legend_loc='on data', cmap=magma, use_raw=False, ncols=2)

In [None]:
del elmentaite_2021_intestine_ctrl_fb.obs['C3']

In [None]:
for key, val in dict_cats_clusters_robust.items():
    print(key)
    sc.pl.umap(elmentaite_2021_intestine_ctrl_fb, color=['cluster_robust'] + [i for i in val if i in elmentaite_2021_intestine_ctrl_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

## Ramachandran et al. 2019 [Liver]

In [None]:
ramachandran_2019_dir = data_dir + '/ramachandran_2019'
ramachandran_2019_liver_ctrl = sc.read(ramachandran_2019_dir + '/ramachandran_2019_liver_ctrl.h5')

In [None]:
# Basic QC filtering
ramachandran_2019_liver_ctrl.var['mt'] = ramachandran_2019_liver_ctrl.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(ramachandran_2019_liver_ctrl, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(ramachandran_2019_liver_ctrl, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(ramachandran_2019_liver_ctrl, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(ramachandran_2019_liver_ctrl, x='total_counts', y='n_genes_by_counts')

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': ramachandran_2019_liver_ctrl.obs['Internal sample identifier'], 'y': ramachandran_2019_liver_ctrl.obs['log1p_n_genes_by_counts']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
# Keep cells with strictly between 1000 and 3000 detected genes and less than 8%
# mitochondrial counts (the same thresholds for every sample).
liver_obs = ramachandran_2019_liver_ctrl.obs
liver_keep = (
    (liver_obs['n_genes_by_counts'] > 1000)
    & (liver_obs['n_genes_by_counts'] < 3000)
    & (liver_obs['pct_counts_mt'] < 8)
)
ramachandran_2019_liver_ctrl = ramachandran_2019_liver_ctrl[liver_keep, :]

In [None]:
sc.pp.filter_genes(ramachandran_2019_liver_ctrl, min_counts=1)
sc.pp.normalize_total(ramachandran_2019_liver_ctrl)
sc.pp.log1p(ramachandran_2019_liver_ctrl)

In [None]:
sc.pp.pca(ramachandran_2019_liver_ctrl, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(ramachandran_2019_liver_ctrl, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(ramachandran_2019_liver_ctrl, use_rep='X_pca_harmony',  n_neighbors=int(0.5 * len(ramachandran_2019_liver_ctrl) ** 0.5 // 2), metric='cosine')
tk.tl.triku(ramachandran_2019_liver_ctrl)

In [None]:
sc.tl.umap(ramachandran_2019_liver_ctrl, min_dist=0.2, random_state=seed)
sc.tl.leiden(ramachandran_2019_liver_ctrl, resolution=1, random_state=seed)

In [None]:
sc.pp.subsample(ramachandran_2019_liver_ctrl, fraction=1, random_state=0, copy=False)
sc.pl.umap(ramachandran_2019_liver_ctrl, color=['leiden', 'Internal sample identifier'], legend_loc='on data')

In [None]:
sc.pl.umap(ramachandran_2019_liver_ctrl, color=['leiden', 'LUM', 'PDGFRA', 'DCN', 'ELN',
                                             'RGS5', 'MYH11', 'NDUFA4L2', 
                                             'PECAM1', 'CLDN5', 'VWF', 
                                             'EPCAM', 'PTPRC'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
dict_cats_fb = {'fibro': ['LUM', 'PDGFRA', 'DCN'], 
                'endo': ['PECAM1', 'CLDN5', 'VWF'],
                'peri': ['RGS5', 'MYH11', 'NDUFA4L2'],
                'epithelial': ['EPCAM', ], 
                'immune': ['PTPRC']
               }

In [None]:
assign_cats(ramachandran_2019_liver_ctrl, dict_cats=dict_cats_fb, min_score=0.6, quantile_gene_sel=0.999)

In [None]:
sc.pl.umap(ramachandran_2019_liver_ctrl, color=['leiden', 'assigned_cats',], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
ramachandran_2019_liver_ctrl_fb = ramachandran_2019_liver_ctrl[ramachandran_2019_liver_ctrl.obs['assigned_cats'].isin(['fibro'])].copy()

In [None]:
sc.pp.filter_genes(ramachandran_2019_liver_ctrl_fb, min_counts=1)

In [None]:
sc.pp.pca(ramachandran_2019_liver_ctrl_fb, random_state=seed, n_comps=50)
sce.pp.harmony_integrate(ramachandran_2019_liver_ctrl_fb, key='Internal sample identifier', max_iter_harmony=50)
sc.pp.neighbors(ramachandran_2019_liver_ctrl_fb, use_rep='X_pca_harmony',  n_neighbors=int(len(ramachandran_2019_liver_ctrl_fb) ** 0.5), metric='cosine')
tk.tl.triku(ramachandran_2019_liver_ctrl_fb)

In [None]:
sc.tl.umap(ramachandran_2019_liver_ctrl_fb, min_dist=0.6, random_state=seed)
sc.tl.leiden(ramachandran_2019_liver_ctrl_fb, resolution=0.5, random_state=seed)

In [None]:
sc.pl.umap(ramachandran_2019_liver_ctrl_fb, color=['Internal sample identifier', 'leiden', 
                                               ], legend_loc='on data', cmap=magma, use_raw=False, ncols=2)

In [None]:
sc.pl.umap(ramachandran_2019_liver_ctrl_fb, color=['Internal sample identifier', 'leiden', 
                                               ], legend_loc='on data', cmap=magma, use_raw=False, ncols=2)

In [None]:
assign_cats(ramachandran_2019_liver_ctrl_fb, dict_cats=dict_cats_clusters_robust, min_score=0.5, quantile_gene_sel=0.95, key_added='cluster_robust', others_name='U')
assign_cats(ramachandran_2019_liver_ctrl_fb, column_groupby='cluster_robust', dict_cats=dict_cats_axes_robust, min_score=0.5, quantile_gene_sel=0.9,  key_added='axis_robust', 
             intermediate_states=True, diff=0.15, others_name='U')

In [None]:
sc.pl.umap(ramachandran_2019_liver_ctrl_fb, color=['Internal sample identifier', 'leiden', 'axis_robust', 'cluster_robust'], legend_loc='on data', cmap=magma, use_raw=False, ncols=2)

In [None]:
del ramachandran_2019_liver_ctrl_fb.obs['C3']

In [None]:
for key, val in dict_cats_clusters_robust.items():
    print(key)
    sc.pl.umap(ramachandran_2019_liver_ctrl_fb, color=['cluster_robust'] + [i for i in val if i in ramachandran_2019_liver_ctrl_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 