# Comparison of fibroblast populations (review after JID)

In this notebook we are going to extract and replicate the main populations from different papers where fibroblast populations are described, and find similarities and differences. The premise of this analysis is that many of the populations described in different papers seem not to match, or to be transcriptomically different, when in reality they are quite similar; that is, the main types of populations are indeed shared across the different papers, which should come as no surprise.

**After the publication in JID we will include the following papers, as confirmatory results**
* Kim et al. 
* Gaydosik et al.
* McCarthy et al.

## imports

In [None]:
import scanpy as sc
import scanpy.external as sce
import pandas as pd
import numpy as np
import os
import triku as tk
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from tqdm.notebook import tqdm
import ray
import subprocess
import time
import scvelo as scv
import gc

In [None]:
!pip install cellassign

In [None]:
from cellassign import assign_cats

In [None]:
# To print versions of imports 

import types

def imports():
    """Yield the ``__name__`` of every module object bound in this namespace's globals."""
    yield from (
        obj.__name__
        for obj in globals().values()
        if isinstance(obj, types.ModuleType)
    )

excludes = ['builtins', 'types', 'sys']

imported_modules = [module for module in imports() if module not in excludes]

clean_modules = []

for module in imported_modules:

    sep = '.'  # to handle 'matplotlib.pyplot' cases
    rest = module.split(sep, 1)[0]
    clean_modules.append(rest)

changed_imported_modules = list(set(clean_modules))  # drop duplicates

pip_modules = !pip freeze  # you could also use `!conda list` with anaconda

for module in pip_modules:
    try:
        name, version = module.split('==')
        if name in changed_imported_modules:
            print(name + '\t' + version)
    except:
        pass

In [None]:
# Random seed passed as `random_state` to the stochastic steps below (PCA, triku, UMAP, leiden).
seed = 0

In [None]:
# Colormap for UMAP gene-expression plots: the first 65 of 80 evenly spaced 'magma'
# samples, with the lowest entry replaced by light grey.
_magma_base = plt.get_cmap('magma')
_magma_colors = [_magma_base(frac) for frac in np.linspace(0, 1, 80)][:65]
_magma_colors[0] = (0.88, 0.88, 0.88, 1)
magma = mpl.colors.LinearSegmentedColormap.from_list("", _magma_colors)

In [None]:
# Marker genes passed to assign_cats to label fibroblasts among all cells.
dict_cats_fb = {'fibro': ['LUM', 'PDGFRA', 'COL1A1', 'DCN']}
# Marker genes per fibroblast subpopulation (A1-A4, B1-B2, C1-C5), passed to assign_cats
# on the fibroblast-only subsets.
# NOTE(review): 'A3' lists 'CCN5', but the Kim et al. gene names are rewritten through
# dict_rep (CCN5 -> WISP2) before scoring, so this marker may not be found there — confirm
# which symbol is intended.
# NOTE(review): 'CDK' in 'C2' does not look like a complete gene symbol — confirm.
dict_cats_subpops = {'A1': ['PI16', 'QPCT', 'SLPI', 'CPE', 'CTHRC1', 'MFAP5', 'PCOLCE2', 'SCARA5', 'TSPAN8'],
                     'A2': ['APCDD1', 'COL18A1', 'COMP', 'NKD2', 'F13A1', 'HSPB3', 'LEPR', 'TGFBI'], 
                     'A3': ['CD9', 'COL6A1', 'LEPR', 'WIF1', 'CCN5', 'RGCC'],
                     'A4': ['SFRP4', 'PCOLCE2', 'C1QTNF3', 'PRG4', 'IGFBP6'],
                     'B1': ['CXCL2', 'MYC', 'C7', 'SPSB1', 'ITM2A'], 
                     'B2': ['SOCS3', 'CCL19', 'CD74', 'RARRES2', 'CCDC146', 'IGFBP3', 'TNFSF13B'], 
                     'C1': ['SPARC', 'GPC3', 'COL11A1', 'DPEP1', 'MEF2C', 'MME'],
                     'C2': ['CDK', 'COCH', 'CRABP1', 'DKK2', 'HSPA2', 'OGN', 'RSPO4', 'SLITRK6', 'MKX'],
                     'C3': ['ASPN', 'F2R', 'LRRC15', 'POSTN'],
                     'C4': ['ANGPTL7', 'APOD', 'C2orf40', 'SCN7A', 'SFRP4', 'TM4SF1'], 
                     'C5': ['CPE', 'SPON2', 'SPRY1', 'ITM2A', 'FGFBP2', 'IGFBP2', 'APOD', 'A2M', 'RGMA', 'ALDH1A1', 'PRSS23', 'KLK1', 'PLA2G5'], 
                     }

In [None]:
# Gene-symbol replacements applied to dataset var_names: each key found in a dataset is
# renamed to its value, so that the markers 'WISP2' and 'C2orf40' can be looked up.
dict_rep = {'CCN5': 'WISP2', 'ECRG4': 'C2orf40'}

In [None]:
# Render all matplotlib figures at 150 dpi.
mpl.rcParams['figure.dpi'] = 150

**IMPORTANT: I am running this analysis on a computer with ~500 GB of RAM. I will load many datasets at once, which might be too much for some computers. I took this decision consciously, to have as much information available at any time as possible. If you cannot run the whole analysis at once, you can run it in parts.**

## data extraction and processing

In [None]:
# Dataset folders are resolved relative to the current working directory.
data_dir = os.getcwd()
print(data_dir)

### Kim et al. 2020

In [None]:
# Folder from which the Kim et al. 2020 AnnData file is read.
kim_dir = data_dir + '/Kim_2020'

In [None]:
# Load the Kim et al. dataset and make duplicated gene names unique.
adata_kim = sc.read(kim_dir + '/Kim_2020.h5ad')
adata_kim.var_names_make_unique()

In [None]:
# Rename the gene symbols listed in dict_rep; every other name is kept unchanged.
adata_kim.var_names = [dict_rep.get(gene, gene) for gene in adata_kim.var_names]

In [None]:
# Remove genes whose total count across cells is below 1.
sc.pp.filter_genes(adata_kim, min_counts=1)

In [None]:
# Replace the sparse count matrix with a dense ndarray (memory-hungry on large datasets).
adata_kim.X = adata_kim.X.toarray()

In [None]:
# Basic QC metrics: flag mitochondrial genes (names starting with 'MT-') in var['mt'],
# then compute the per-cell metrics (n_genes_by_counts, total_counts, pct_counts_mt)
# that the following cells plot and filter on.
adata_kim.var['mt'] = adata_kim.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata_kim, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
# QC diagnostics: violin plots of detected genes, total counts and mitochondrial
# percentage, followed by scatter plots of total counts against the other two metrics.
sc.pl.violin(adata_kim, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(adata_kim, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata_kim, x='total_counts', y='n_genes_by_counts')

In [None]:
# Keep cells with 400 < n_genes_by_counts < 4500 and less than 25 % mitochondrial counts.
# All criteria are combined into a single boolean mask and the result is materialised
# with .copy(), so the in-place preprocessing that follows operates on a real AnnData
# object instead of a view of a view (which triggers implicit copies and warnings).
kim_keep_cells = (
    (adata_kim.obs['n_genes_by_counts'] < 4500)
    & (adata_kim.obs['n_genes_by_counts'] > 400)
    & (adata_kim.obs['pct_counts_mt'] < 25)
).values
adata_kim = adata_kim[kim_keep_cells, :].copy()

In [None]:
# Show the AnnData summary after cell filtering.
adata_kim

In [None]:
# Overlay the distribution of detected genes per batch. Each curve is labelled and a
# legend is drawn, so the batches can be told apart in the overlaid plot.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; consider
# sns.histplot / sns.kdeplot once the minimum seaborn version is confirmed.
batches = sorted(set(adata_kim.obs['batch'].values))
for batch in batches:
    counts = adata_kim.obs['n_genes_by_counts'].loc[adata_kim.obs['batch'] == batch].values
    sns.distplot(counts, label=str(batch))
plt.legend(title='batch')

In [None]:
# Re-filter genes now that cells have been removed, then normalise total counts per cell
# and log-transform (log1p).
sc.pp.filter_genes(adata_kim, min_counts=1)
sc.pp.normalize_total(adata_kim)
sc.pp.log1p(adata_kim)

In [None]:
# Show the AnnData summary after gene filtering and normalisation.
adata_kim

In [None]:
# Order matters here: PCA (30 components) first, then the bbknn neighbour graph
# (batch key 'batch', 3 neighbours within each batch, angular metric), and finally
# triku feature selection with use_adata_knn=True, i.e. asked to use the graph already
# stored in adata.
sc.pp.pca(adata_kim, random_state=seed, n_comps=30)
sce.pp.bbknn(adata_kim, metric='angular', batch_key='batch', neighbors_within_batch=3)
tk.tl.triku(adata_kim, n_procs=1, random_state=seed, use_adata_knn=True)

In [None]:
# UMAP embedding and leiden clustering (resolution 1.5), both seeded.
sc.tl.umap(adata_kim, min_dist=0.1, random_state=seed)
sc.tl.leiden(adata_kim, resolution=1.5, random_state=seed)

In [None]:
# UMAP coloured by leiden cluster and by batch.
sc.pl.umap(adata_kim, color=['leiden', 'batch'], legend_loc='on data')

In [None]:
# UMAP coloured by leiden cluster and by the four markers listed in dict_cats_fb['fibro'].
sc.pl.umap(adata_kim, color=['leiden', 'LUM', 'PDGFRA', 'COL1A1', 'DCN'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# Label cells with the fibroblast marker set; the following cells read the result
# from obs['assigned_cats'].
assign_cats(adata_kim, dict_cats=dict_cats_fb)

In [None]:
# UMAP coloured by leiden cluster and by the assigned category.
sc.pl.umap(adata_kim, color=['leiden', 'assigned_cats'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# Keep only the cells labelled 'fibro'. .copy() turns the subset into an independent
# AnnData object, so the in-place processing that follows does not act on a view of adata_kim.
adata_kim_fb = adata_kim[adata_kim.obs['assigned_cats'] == 'fibro'].copy()

In [None]:
# Remove genes whose total count within the fibroblast subset is below 1.
sc.pp.filter_genes(adata_kim_fb, min_counts=1)

In [None]:
# Same PCA -> bbknn -> triku sequence as for the full Kim dataset, recomputed on the
# fibroblast subset. The order of the three calls matters.
sc.pp.pca(adata_kim_fb, random_state=seed, n_comps=30)
sce.pp.bbknn(adata_kim_fb, metric='angular', batch_key='batch', neighbors_within_batch=3)
tk.tl.triku(adata_kim_fb, n_procs=1, random_state=seed, use_adata_knn=True)

In [None]:
# UMAP embedding and leiden clustering of the fibroblasts; the resolution (2.5) is higher
# than the 1.5 used on the full dataset.
sc.tl.umap(adata_kim_fb, min_dist=0.1, random_state=seed)
sc.tl.leiden(adata_kim_fb, resolution=2.5, random_state=seed)

In [None]:
# Label fibroblasts with the subpopulation marker sets, using min_score=0.6.
assign_cats(adata_kim_fb, dict_cats=dict_cats_subpops, min_score=0.6)

In [None]:
# UMAP of the fibroblasts coloured by leiden cluster, batch and assigned subpopulation.
sc.pl.umap(adata_kim_fb, color=['leiden', 'batch', 'assigned_cats'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# Marker panel: one row of four genes per subpopulation (see the trailing comments),
# drawn four plots per row. 'WISP2' and 'C2orf40' are the names produced by dict_rep.
sc.pl.umap(adata_kim_fb, color=['WISP2', 'SLPI', 'PI16', 'IGFBP6',  # A1
                                'APCDD1', 'COMP', 'NKD2', 'COL18A1',  # A2
                                'SFRP4', 'PCOLCE2', 'PRG4', 'FBN1',  # A4
                                'WIF1', 'RGCC', 'ELN', 'SGCA',  # A3
                                'CCL2', 'ITM2A', 'SPSB1', 'TNFAIP6',  # B1
                                'CCL19', 'CCDC146', 'CD74', 'TNFSF13B',  # B2
                                'COL11A1', 'DPEP1', 'TNMD', 'WFDC1',   # C1
                                'COCH', 'FIBIN', 'CRABP1', 'RSPO4',   # C2
                                'ASPN', 'F2R', 'GPM6B', 'POSTN',   # C3
                                'ANGPTL7', 'APOD', 'C2orf40', 'TM4SF1',   # C4
                                'CPE', 'SPON2', 'SPRY1', 'FGFBP2' # C5
                               ],
           legend_loc='on data', cmap=magma, use_raw=False, ncols=4)

### Gaydosik et al. 2020

In [None]:
# Folder from which the Gaydosik et al. 2020 AnnData files are read.
gaydosik_dir = data_dir + '/gaydosik_2020'

In [None]:
# Load the two Gaydosik et al. files (CTCL and HC).
# NOTE(review): only adata_HC is processed in the cells visible below; adata_CTCL stays
# in memory — confirm it is used later.
adata_CTCL = sc.read_h5ad(gaydosik_dir + '/gaydosik_2020_CTCL.h5ad')
adata_HC = sc.read_h5ad(gaydosik_dir + '/gaydosik_2020_HC.h5ad')

In [None]:
# Basic QC metrics: flag mitochondrial genes (names starting with 'MT-') in var['mt'],
# then compute the per-cell metrics that the following cells plot and filter on.
adata_HC.var['mt'] = adata_HC.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata_HC, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
# QC diagnostics: violin plots of detected genes, total counts and mitochondrial
# percentage, followed by scatter plots of total counts against the other two metrics.
sc.pl.violin(adata_HC, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(adata_HC, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata_HC, x='total_counts', y='n_genes_by_counts')

In [None]:
# Keep cells with 400 < n_genes_by_counts < 5500 and less than 30 % mitochondrial counts.
# All criteria are combined into a single boolean mask and the result is materialised
# with .copy(), so the in-place preprocessing that follows operates on a real AnnData
# object instead of a view of a view (which triggers implicit copies and warnings).
hc_keep_cells = (
    (adata_HC.obs['n_genes_by_counts'] < 5500)
    & (adata_HC.obs['n_genes_by_counts'] > 400)
    & (adata_HC.obs['pct_counts_mt'] < 30)
).values
adata_HC = adata_HC[hc_keep_cells, :].copy()

In [None]:
# Gene filtering followed by log1p and total-count normalisation.
# NOTE(review): log1p is applied BEFORE normalize_total here, whereas the Kim et al.
# section normalises first and log-transforms afterwards — confirm this order is intended.
# Swapping it will change the clustering, and the leiden cluster ids hard-coded in the
# fibroblast selection further down depend on the current result.
sc.pp.filter_genes(adata_HC, min_counts=1)
sc.pp.log1p(adata_HC)
sc.pp.normalize_total(adata_HC)

In [None]:
# Order matters here: PCA (30 components), then the bbknn neighbour graph with 'sample'
# as batch key, then triku feature selection asked to use the graph stored in adata.
sc.pp.pca(adata_HC, random_state=seed, n_comps=30)
sce.pp.bbknn(adata_HC, metric='angular', batch_key='sample')
tk.tl.triku(adata_HC, n_procs=1, random_state=seed, use_adata_knn=True)

In [None]:
# UMAP embedding and leiden clustering (resolution 1.5), both seeded.
sc.tl.umap(adata_HC, min_dist=0.1, random_state=seed)
sc.tl.leiden(adata_HC, resolution=1.5, random_state=seed)

In [None]:
# UMAP coloured by leiden cluster and by sample.
sc.pl.umap(adata_HC, color=['leiden', 'sample'], legend_loc='on data')

In [None]:
# UMAP coloured by leiden cluster and by the four markers listed in dict_cats_fb['fibro'].
sc.pl.umap(adata_HC, color=['leiden', 'LUM', 'PDGFRA', 'COL1A1', 'DCN'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# Select the fibroblasts by leiden cluster id. .copy() turns the subset into an
# independent AnnData object instead of a view of adata_HC.
# NOTE(review): the cluster ids are hard-coded and only valid for the clustering produced
# by the exact preprocessing, seed and library versions used above — re-check them against
# the marker UMAP whenever an upstream step changes.
adata_HC_fb = adata_HC[adata_HC.obs['leiden'].isin(['0', '1', '3', '8', '9', '31'])].copy()

In [None]:
# Re-filter genes on the fibroblast subset, then repeat the PCA -> bbknn -> triku
# sequence on it. The order of the calls matters.
sc.pp.filter_genes(adata_HC_fb, min_counts=1)
sc.pp.pca(adata_HC_fb, random_state=seed, n_comps=30)
sce.pp.bbknn(adata_HC_fb, metric='angular', batch_key='sample')
tk.tl.triku(adata_HC_fb, n_procs=1, random_state=seed, use_adata_knn=True)

In [None]:
# UMAP embedding and leiden clustering (resolution 1.5) of the fibroblast subset.
sc.tl.umap(adata_HC_fb, min_dist=0.1, random_state=seed)
sc.tl.leiden(adata_HC_fb, resolution=1.5, random_state=seed)

In [None]:
# UMAP of the fibroblasts coloured by leiden cluster.
sc.pl.umap(adata_HC_fb, color=['leiden'], legend_loc='on data')

In [None]:
# Expression of markers taken from the A1-A3 sets of dict_cats_subpops.
sc.pl.umap(adata_HC_fb, color=['leiden', 'COL18A1', 'COMP', 'APCDD1', 'SLPI', 'WIF1'], legend_loc='on data', ncols=2, cmap=magma)

In [None]:
# Expression of CCL19 and CD74 (both in the B2 marker set) and of APOE.
sc.pl.umap(adata_HC_fb, color=['leiden', 'CCL19', 'CD74', 'APOE', 
                              ], legend_loc='on data', ncols=2, cmap=magma)

In [None]:
# Expression of markers taken from the C1-C4 sets of dict_cats_subpops.
sc.pl.umap(adata_HC_fb, color=['leiden', 'COL11A1', 'DPEP1', 'COCH', 'CRABP1', 
                               'ASPN', 'POSTN', 'ANGPTL7', 'C2orf40'], legend_loc='on data', ncols=2, cmap=magma)

### McCarthy et al. 2020

In [None]:
# `mccarthy_dir` was never assigned in the notebook, so this cell raised NameError on a
# fresh kernel. It is now defined like the Kim and Gaydosik folders above.
# NOTE(review): the folder name is assumed from the loom file name — confirm it.
mccarthy_dir = data_dir + '/mccarthy_2020'
adata_mccarthy = sc.read_loom(mccarthy_dir + '/mccarthy_2020.loom')

In [None]:
# Basic QC metrics: flag mitochondrial genes (names starting with 'MT-') in var['mt'],
# then compute the per-cell metrics that the following cells plot and filter on.
adata_mccarthy.var['mt'] = adata_mccarthy.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata_mccarthy, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
# QC diagnostics before filtering: violin plots of the three metrics, followed by
# scatter plots of total counts against the other two.
sc.pl.violin(adata_mccarthy, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(adata_mccarthy, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata_mccarthy, x='total_counts', y='n_genes_by_counts')

In [None]:
# Keep cells with less than 17 % mitochondrial counts, fewer than 3,000,000 total counts
# and more than 4000 detected genes. The three criteria are combined into one boolean mask
# and the result is materialised with .copy(), so later steps that write to the object
# operate on a real AnnData instead of a chain of three nested views.
mccarthy_keep_cells = (
    (adata_mccarthy.obs['pct_counts_mt'] < 17)
    & (adata_mccarthy.obs['total_counts'] < 3_000_000)
    & (adata_mccarthy.obs['n_genes_by_counts'] > 4000)
).values
adata_mccarthy = adata_mccarthy[mccarthy_keep_cells].copy()

In [None]:
# The same QC diagnostics again, now on the filtered cells.
sc.pl.violin(adata_mccarthy, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(adata_mccarthy, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata_mccarthy, x='total_counts', y='n_genes_by_counts')

In [None]:
# Gene list passed as `s_genes` to sc.tl.score_genes_cell_cycle in the next cell.
G1S_genes = ['MCM5','PCNA','TYMS','FEN1','MCM2','MCM4','RRM1','UNG','GINS2','MCM6','CDCA7','DTL','PRIM1','UHRF1','MLF1IP',
'HELLS','RFC2','RPA2','NASP','RAD51AP1','GMNN','WDR76','SLBP','CCNE2','UBR7','POLD3','MSH2','ATAD2','RAD51',
'RRM2','CDC45','CDC6','EXO1','TIPIN','DSCC1','BLM','CASP8AP2','USP1','CLSPN','POLA1','CHAF1B','BRIP1','E2F8',]

# Gene list passed as `g2m_genes` to sc.tl.score_genes_cell_cycle in the next cell.
G2M_genes = ['HMGB2','CDK1','NUSAP1','UBE2C','BIRC5','TPX2','TOP2A','NDC80','CKS2','NUF2','CKS1B','MKI67',
'TMPO','CENPF','TACC3','FAM64A','SMC4','CCNB2','CKAP2L','CKAP2','AURKB','BUB1','KIF11','ANP32E','TUBB4B',
'GTSE1','KIF20B','HJURP','CDCA3','HN1','CDC20','TTK','CDC25C','KIF2C','RANGAP1','NCAPD2','DLGAP5','CDCA2',
'CDCA8','ECT2','KIF23','HMMR', 'AURKA','PSRC1','ANLN','LBR','CKAP5','CENPE','CTCF','NEK2','G2E3','GAS2L3','CBX5','CENPA',]


In [None]:
# Score cell-cycle phase from the two gene lists; the 'phase' column is plotted further down.
# NOTE(review): scoring runs on the matrix as loaded (log1p and normalisation only happen
# in a later cell) — confirm that scoring un-normalised values is intended.
sc.tl.score_genes_cell_cycle(adata_mccarthy, s_genes=G1S_genes, g2m_genes=G2M_genes)

In [None]:
# Copy the per-cell sample annotations from df_meta into adata.obs. Each obs column `x`
# is read from the df_meta column 'Characteristics[x]', looked up by cell name.
# NOTE(review): df_meta is not defined in any cell visible above — confirm where it is loaded.
for field in ('sex', 'individual', 'age', 'organism part',
              'cell type', 'disease', 'single cell quality'):
    adata_mccarthy.obs[field] = df_meta[f'Characteristics[{field}]'].loc[adata_mccarthy.obs_names.values]

In [None]:
sc.pp.filter_genes(adata_mccarthy, min_counts=1)
# NOTE(review): log1p runs before per-cell normalisation here, whereas the Kim et al.
# section normalises first and log-transforms afterwards — confirm this order is intended.
sc.pp.log1p(adata_mccarthy)
sc.pp.normalize_per_cell(adata_mccarthy)
tk.tl.triku(adata_mccarthy, n_procs=1, random_state=seed)
sc.pp.pca(adata_mccarthy, random_state=seed, n_comps=30)
# Neighbourhood size follows the sqrt(n_cells) / 2 heuristic. It has to be passed as the
# integer `n_neighbors`: `knn` is a boolean switch in sc.pp.neighbors, so passing the number
# there only set knn to a truthy value and left n_neighbors at its default.
# The lower bound of 2 keeps the call valid on very small subsets.
# The graph, and therefore the UMAP and leiden results below, may shift slightly with this fix.
mccarthy_n_neighbors = max(2, int(len(adata_mccarthy) ** 0.5 // 2))
sc.pp.neighbors(adata_mccarthy, random_state=seed, n_neighbors=mccarthy_n_neighbors, metric='cosine')

In [None]:
# UMAP embedding and a very coarse leiden clustering (resolution 0.01), then a UMAP
# coloured by cluster, QC metrics and cell-cycle phase.
sc.tl.umap(adata_mccarthy, min_dist=0.3, random_state=seed)
sc.tl.leiden(adata_mccarthy, resolution=0.01, random_state=seed)
sc.pl.umap(adata_mccarthy, color=['leiden', 'n_genes_by_counts', 'pct_counts_mt', 'phase'], 
           legend_loc='on data', ncols=2)

In [None]:
# UMAP coloured by leiden cluster and by the sample annotations copied from df_meta.
sc.pl.umap(adata_mccarthy, color=['leiden', 'sex', 'individual', 'age', 
                                 'organism part', 'cell type', 'disease', 'single cell quality'], legend_loc='on data', ncols=2)

In [None]:
# Rank genes per leiden cluster (350 genes kept per cluster) and show a tracksplot of the
# top 50 for each cluster.
sc.tl.rank_genes_groups(adata_mccarthy, groupby='leiden', n_genes=350)
sc.pl.rank_genes_groups_tracksplot(adata_mccarthy, dendrogram=False, n_genes=50)

In [None]:
# Print the ranked gene names stored for leiden cluster '0', one per line.
for i in adata_mccarthy.uns['rank_genes_groups']['names']['0']:
    print(i)

In [None]:
# Print the ranked gene names stored for leiden cluster '1', one per line.
for i in adata_mccarthy.uns['rank_genes_groups']['names']['1']:
    print(i)

In [None]:
# Expression of DPP4, SFRP2 and markers taken from the A1-A3 sets of dict_cats_subpops.
# NOTE(review): 'CCN5' is queried here, whereas the Kim et al. plots use the alias 'WISP2'
# (no dict_rep renaming is applied to this dataset in the visible cells) — confirm which
# symbol this dataset contains.
sc.pl.umap(adata_mccarthy, color=['DPP4', 'SFRP2', 'PI16', 'CCN5', 'SLPI', 'COL18A1', 'APCDD1', 'COMP'], 
           legend_loc='on data', ncols=3, cmap=magma)

In [None]:
# Expression of a second marker panel (APOE, CCL2, ITM2A, MYC, GPC3, SOD2, IGFBP3),
# three plots per row.
sc.pl.umap(adata_mccarthy, color=['APOE', 'CCL2', 'ITM2A', 'MYC', 'GPC3', 'SOD2', 'IGFBP3'], 
           legend_loc='on data', ncols=3, cmap=magma)

In [None]:
# Expression of markers taken from the C1-C4 sets of dict_cats_subpops, three plots per row.
sc.pl.umap(adata_mccarthy, color=['COL11A1', 'POSTN', 'CRABP1', 'COCH', 'APOD', 'SFRP4'], 
           legend_loc='on data', ncols=3, cmap=magma)