# Comparison of fibroblast populations (review after JID)

In this notebook we are going to extract and replicate the main populations from different papers where fibroblast populations are described, and find similarities and differences. The premise of this analysis is that many of the populations described in different papers seem not to match, or to be transcriptomically different, but in reality they are quite similar; that is, the main types of populations are indeed shared by the different papers, which should come as no surprise.

**After the publication in JID we will include the following papers, as confirmatory results**
* Kim et al. 
* Gaydosik et al.
* McCarthy et al.
* Gao et al.
* Mirizio et al.

Additionally, we will reanalyze the *classic 4* papers, to check that cell populations are assigned as expected. For these papers, UMAPs might vary compared to the ones in our paper, but the main results should still be the same.

## imports

In [None]:
import scanpy as sc
import scanpy.external as sce
import pandas as pd
import numpy as np
import os
import triku as tk
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from tqdm.notebook import tqdm
import ray
import subprocess
import time
import scvelo as scv
import gc
import gseapy as gp

In [None]:
!pip install cellassign

In [None]:
from cellassign import assign_cats

In [None]:
# To print versions of imports 

import types

def imports():
    """Yield the ``__name__`` of every module object bound in this namespace's globals."""
    for candidate in globals().values():
        if isinstance(candidate, types.ModuleType):
            yield candidate.__name__

# Modules that are not worth reporting a version for.
excludes = ['builtins', 'types', 'sys']

imported_modules = [mod_name for mod_name in imports() if mod_name not in excludes]

# Keep only the top-level package of dotted names ('matplotlib.pyplot' -> 'matplotlib').
clean_modules = [mod_name.partition('.')[0] for mod_name in imported_modules]

changed_imported_modules = list(set(clean_modules))  # drop duplicates

pip_modules = !pip freeze  # you could also use `!conda list` with anaconda

# Print "<package>\t<version>" for every imported package that pip reports.
for module in pip_modules:
    # `pip freeze` also emits lines without '==' (e.g. editable installs or `pkg @ url`);
    # skip those explicitly instead of hiding every possible error behind a bare `except`.
    name, separator, version = module.partition('==')
    if separator and name in changed_imported_modules:
        print(name + '\t' + version)

In [None]:
seed = 0

In [None]:
# Palette for UMAP gene expression: 80 evenly spaced samples of 'magma', with the lowest
# colour swapped for light grey, truncated to the first 65 colours.
magma_base = plt.get_cmap('magma')
magma = [magma_base(fraction) for fraction in np.linspace(0, 1, 80)]
magma[0] = (0.88, 0.88, 0.88, 1)
magma = mpl.colors.LinearSegmentedColormap.from_list("", magma[:65])

In [None]:
# Marker genes per broad skin cell type. Passed to assign_cats on the whole-skin datasets,
# after which only the cells labelled 'fibro' are kept for the fibroblast analysis.
dict_cats_fb = {'fibro': ['LUM', 'PDGFRA', 'COL1A1', 'DCN', 'SFRP2', 'APOE'],
                'melanocyte': ['MLANA', 'PMEL', 'TRIM63', 'QPCT', 'PLP1', 'TYRP1'],
                'peri': ['RGS5', 'MYL9', 'NDUFA4L2'],
                'eritro': ['HBB', 'HBA2', 'HBA1', 'HBD'],
                'muscle': ['DES', 'PCP4', 'ACTG2', 'SYNPO2', 'PRUNE2', 'SORBS1', 'P2RX1'],
                'immune': ['TPSB2', 'TPSAB1', 'HLA-DRA', 'FCER1G', 'CD74'],
                'endo': ['PLVAP', 'CLDN5', 'ACKR1', 'LMCD1', 'NPDC1', 'A2M',
                         'PECAM1', 'CLU', 'VWF', 'CD74', 'RAMP2', 'IFI27', 'GNG11'],
                'lymph': ['CCL21', 'LYVE1', 'CLDN5'],
                'kerato': ['DMKN', 'KRT1', 'KRT5', 'KRT14', 'AQP3', 'SFN'],
                'krt7/8/19': ['S100A1', 'KRT19', 'PPP1R1B', 'KRT7', 'KRT8', 'SNORC', 'NCALD', 'CA6',
                              'AKR1C2', 'TPD52L1', 'PDK3', 'ROPN1B', 'QDPR'],
                'mt': ['MTND2P28', 'MTND4P12', 'MTCO1P40', 'ADAM33', 'RN7SL2', 'MTRNR2L6'],
                'neuro': ['GPM6B', 'PLP1', 'S100B', 'SCN7A', 'NRXN1', 'GFRA3', 'MPZ'],
                'stress': ['JUNB', 'ZFP36', 'FOS', 'SOCS3', 'GADD45B', 'DNAJB1', 'IER2',
                           'FOSB', 'JUN', 'ATF3', 'IER3'],
                'T cells': ['CD52', 'CD3D', 'TRAC'],
                'B cells': ['IGHM', 'CD74', 'CD79A', 'NIBAN3', 'TCL1A', 'NCF1', 'MS4A1', 'BTK', 'CD19'],
                'APC': ['HLA-DQA1', 'HLA-DRB6', 'TYROBP', 'FCER1G', 'AIF1'],
                'mast_cells': ['IL1RL1', 'CPA3', 'HPGDS', 'TPSB2', 'HPGD', 'RGS13', 'CTSG',
                               'TPSAB1', 'GATA2'],
                # 'CTNNB1' was listed twice here; each marker now appears once per category.
                'F': ['B4GALT1', 'TMSB4X', 'PPP1CB', 'WTAP', 'PTPRS', 'CTNNB1', 'INSR', 'BICC1'],
               }

# Marker genes of the three fibroblast axes (A, B, C). Passed to assign_cats with
# key_added='axis' on each fibroblast subset. Note that 'COL1A2' is a marker of both A and C.
dict_cats_axes = {'A': ['AEBP1', 'AQP1', 'CD9', 'COL1A2', 'COL1A1', 'COL6A1', 'ELN', 
                        'FBN1', 'MMP2', 'NBL1', 'PAM', 'QPCT', 'RGCC', 'SFRP2', 'THBS2'],
                  'B': ['APOE', 'C3', 'C7', 'CXCL12', 'CYGB', 'GGT5', 'IGFBP7', 'RARRES2',
                        'TMEM176A', 'TMEM176B', 'TNFSF13B'],
                  'C': ['DKK3', 'EMID1', 'COL1A2', 'GPM6B', 'INHBA', 'SPARCL1', 'TNMD', 'TNN']}

# Marker genes of the individual fibroblast populations. Passed to assign_cats with
# key_added='cluster' on each fibroblast subset, and looped over to plot each marker set.
# NOTE(review): 'CCN5' (A3) is a key of dict_rep, so datasets renamed with dict_rep expose
# this gene as 'WISP2' and will not contain 'CCN5'; C4 lists both aliases of its renamed
# gene ('C2orf40' and 'ECRG4'). Confirm whether 'WISP2' should be listed in A3 as well.
dict_cats_clusters = {'A1': ['PI16', 'QPCT', 'SLPI', 'CPE', 'CTHRC1', 'MFAP5', 'PCOLCE2', 'SCARA5', 'TSPAN8'],
                      'A2': ['APCDD1', 'COL18A1', 'COMP', 'NKD2', 'F13A1', 'HSPB3', 'LEPR', 'TGFBI'],
                      'A3': ['LEPR', 'WIF1', 'CCN5', 'RGCC', 'QPCT', 'SGCA'],  # 'HAS1', 'CORIN', 'SGCG', 'F13A1', 'RETREG1',
                      'A4': ['SFRP4', 'PCOLCE2', 'C1QTNF3', 'PRG4', 'IGFBP6'],
                      'B1': ['CXCL2', 'MYC', 'C7', 'SPSB1', 'ITM2A'],
                      'B2': ['SOCS3', 'CCL19', 'CD74', 'RARRES2', 'CCDC146', 'IGFBP3', 'TNFSF13B'],
                      'C1': ['SPARC', 'GPC3', 'COL11A1', 'DPEP1', 'MEF2C', 'MME'],
                      # NOTE(review): 'CDK' does not look like a complete gene symbol — confirm the intended gene.
                      'C2': ['CDK', 'COCH', 'CRABP1', 'DKK2', 'HSPA2', 'OGN', 'RSPO4'],  # 'SLITRK6', 'MKX'
                      'C3': ['ASPN', 'F2R', 'LRRC15', 'POSTN'],
                      'C4': ['ANGPTL7', 'APOD', 'C2orf40', 'ECRG4', 'SCN7A', 'SFRP4', 'TM4SF1'],
                      'C4*': ['MFAP5', 'TNNC1', 'PDLIM1', 'CLTB', 'CSRP1', 'SLC2A1', 'CAV1', 'NBL1', 'MGP', 'LMO7',
                              'CTSH', 'C19orf33'],
                      'C5': ['CPE', 'SPON2', 'SPRY1', 'IFITM1', 'FGFBP2', 'IGFBP2', 'APOD', 'A2M', 'RGMA',
                             'ALDH1A1', 'PRSS23', 'KLK1', 'PLA2G5', 'TIMP3', 'LSP1',
                             'IGF1', 'PDGFD', 'HTRA3', 'BBX', 'PGF'],
                      'C6': ['LEF1', 'TFAP2A', 'IQGAP2', 'LUZP2', 'WNT5A', 'GRIK1'],
                      'D1': ['AKAP12', 'FGL2', 'GDF10', 'SULT1A1'],  # ITM2A, GPC3
                      'U1': ['SAT1', 'HNRNPH1', 'RBFOX3', 'NEAT1', 'TM4SF1', 'CLDN5', 'RNASE1',
                             'SELE', 'TPSB2', 'CCNL2', 'CTSG'],
                      'U2': ['APOD', 'CXCL1', 'CXCL3', 'PTGES', 'FGFBP2', 'NR4A3',
                             'EIF4A3', 'TNFAIP2', 'MLLT11'],
                      'U3': ['C11orf96', 'IFI16', 'NFKB1', 'POSTN', 'GPC3', 'HSPH1', 'PIM1', 'HES4',
                             'HILPDA', 'VCL', 'GBP1', 'DIO2', 'CNKSR3'],
                      'U4': ['APOD', 'GPC3', 'PLA2G2A', 'ITM2A', 'CPNE3', 'ABCA8', 'NBEAL1',
                             'THY1', 'SH3KBP1', 'PDCD1LG2', 'ABCA6', 'GREM1', 'C5orf28'],
                      # 'SHNG15' was a typo for 'SNHG15'.
                      # NOTE(review): 'ADAMTS' does not look like a complete gene symbol — confirm the intended gene.
                      'U5': ['RGS16', 'ACTA2', 'S100A2', 'BIRC3', 'SEPT2', 'ADRA2A', 'TACC1',
                             'RERG', 'SNHG15', 'ADAMTS', 'DDR2'],
                      'U6': ['GPC3', 'WSB1', 'IGFBP5', 'AMD1', 'TCF7L2', 'CHD2', 'MME', 'IFI16', 'HSPH1', 'NEGR1',
                             'ADAMTS5', 'REL', 'KPNA1', 'MIR22HG', 'GJA1', 'PDE4C', 'COL5A2', 'WAC'],
                      'U7': ['SOX10', 'S100B']
                     }

In [None]:
# Gene-symbol replacements: a dataset var_name equal to a key is renamed to the mapped value
# (e.g. CCN5 -> WISP2) when the datasets are loaded.
dict_rep = {'CCN5': 'WISP2', 'ECRG4': 'C2orf40'}

In [None]:
mpl.rcParams['figure.dpi'] = 150

**IMPORTANT: I am running this analysis on a computer with ~500 GB of RAM. I will load many datasets at once, which might be too much for some computers. I took this decision consciously, to have as much info available at any time as possible. If you cannot run all the analysis at once, you can run it by parts.**

## data extraction and processing

In [None]:
data_dir = os.getcwd()
print(data_dir)

### Tabib et al. 2018

In [None]:
tabib_dir = data_dir + '/Tabib_2018'

In [None]:
adata_tabib = sc.read_csv(tabib_dir + '/Skin_6Control_rawUMI.csv')
adata_tabib = adata_tabib.transpose()

In [None]:
df_metadata_tabib = pd.read_csv(tabib_dir + '/Skin_6Control_Metadata.csv', index_col=0)

`df_metadata_tabib` has 8366 cells, although the paper states that 8522 cells were analyzed. The remaining cells are erythrocytes, which were filtered out from the analysis.

In [None]:
adata_tabib.raw = adata_tabib

In [None]:
# Cell-type label -> list of `res.0.6` cluster ids (as strings) from the Tabib metadata.
dict_reverse_mappings = {'Fibroblast': ['0', '3', '4'],
                         'Keratinocyte': ['1', '5', '7', '11', '14'],
                         'Endothelial cell': ['2'],
                         'Pericyte': ['6', '10'],
                         'Macrophage/DC': ['8'],
                         'Lymphocyte': ['9'],
                         'Secretory Epith': ['12'],
                         'Smooth Muscle': ['13'],
                         'Melanocyte': ['15'],
                         'Neural Cell': ['16'],
                         'Cornified Env': ['17'],
                         'B cell': ['18'],
                         # This is ours! `np.nan` is the same object as the old `np.NaN`
                         # alias, which NumPy 2.0 removed.
                         'Erithrocyte': [np.nan]}

# Invert the table: cluster id -> cell-type label.
dict_mappings = {cluster_id: cell_type
                 for cell_type, cluster_ids in dict_reverse_mappings.items()
                 for cluster_id in cluster_ids}

In [None]:
# Attach the published cluster id of every cell (as a string), then translate it into a
# cell-type name through dict_mappings.
# NOTE(review): cells missing from df_metadata_tabib presumably end up as NaN after this
# assignment and are then resolved through the NaN key of dict_mappings ('Erithrocyte') — confirm.
adata_tabib.obs['res.0.6'] = df_metadata_tabib['res.0.6'].astype(str)
adata_tabib.obs['cluster'] = [dict_mappings[i] for i in adata_tabib.obs['res.0.6']]

Since we are interested in fibros, we are going to filter their specific populations (0, 3, 4)

In [None]:
adata_tabib_fb = adata_tabib[adata_tabib.obs['cluster'].isin(['Fibroblast']), :].copy()
adata_tabib_fb_raw = adata_tabib_fb.copy()

In [None]:
sc.pp.filter_genes(adata_tabib_fb, min_counts=1)
sc.pp.normalize_total(adata_tabib_fb)
sc.pp.log1p(adata_tabib_fb)

In [None]:
sc.pp.pca(adata_tabib_fb, random_state=seed, n_comps=30)
sc.pp.neighbors(adata_tabib_fb, random_state=seed, n_neighbors=int(3.5 * len(adata_tabib_fb) ** 0.5), metric='cosine')
tk.tl.triku(adata_tabib_fb, n_procs=1, random_state=seed, use_adata_knn=True)

In [None]:
sc.tl.umap(adata_tabib_fb, min_dist=0.25, random_state=seed)
sc.tl.leiden(adata_tabib_fb, resolution=5, random_state=seed)

In [None]:
assign_cats(adata_tabib_fb, dict_cats=dict_cats_axes, min_score=0.4, key_added='axis')
assign_cats(adata_tabib_fb, dict_cats=dict_cats_clusters, min_score=0.6, 
            quantile_gene_sel=0.7, key_added='cluster')

In [None]:
sc.pl.umap(adata_tabib_fb, color=['leiden', 'axis', 'cluster'], legend_loc='on data', 
           cmap=magma, use_raw=False, )

In [None]:
sc.pl.umap(adata_tabib_fb, color=['CAV1', 'NBL1', 'C19orf33'], legend_loc='on data', 
           cmap=magma, use_raw=False, )

In [None]:
# Marker genes of leiden cluster '1' against all other fibroblasts.
# rank_genes_groups defaults to .raw when it is set, and adata_tabib_fb.raw holds the
# un-normalised counts (it was set before normalisation). use_raw=False runs the test on the
# log-normalised .X, which is also what the tracksplot displays.
sc.tl.rank_genes_groups(adata_tabib_fb, groupby='leiden', groups=['1'], reference='rest', use_raw=False)
sc.pl.rank_genes_groups_tracksplot(adata_tabib_fb, dendrogram=False, n_genes=150, use_raw=False)

In [None]:
# One UMAP panel set per population: the assigned 'cluster' labels followed by every marker
# of that population that is present in this dataset.
for population, markers in dict_cats_clusters.items():
    print(population)
    markers_present = [gene for gene in markers if gene in adata_tabib_fb.var_names]
    sc.pl.umap(adata_tabib_fb, color=['cluster'] + markers_present,
               legend_loc='on data', cmap=magma, use_raw=False, ncols=4)

In [None]:
sc.pl.umap(adata_tabib_fb, color=['cluster'] + ['TNNC1', 'PDLIM1', 'CLTB', 'C2orf40', 'ANGPTL7'], 
           legend_loc='on data', cmap=magma, use_raw=False, ncols=3)


### Solé-Boldo et al. 2020

In [None]:
sole_dir = data_dir + '/Sole-Boldo_2020'

In [None]:
adata_sole_young = sc.read_loom(sole_dir + '/SB2020.loom')
adata_sole_young.var_names_make_unique()

In [None]:
# Rename the gene symbols listed in dict_rep; every other symbol is kept as is.
adata_sole_young.var_names = [dict_rep.get(gene, gene) for gene in adata_sole_young.var_names]

In [None]:
# Store the expression matrix as a dense ndarray.
adata_sole_young.X = adata_sole_young.X.toarray()

In [None]:
sc.pp.filter_genes(adata_sole_young, min_counts=1)

In [None]:
# Basic QC filtering
adata_sole_young.var['mt'] = adata_sole_young.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata_sole_young, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(adata_sole_young, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(adata_sole_young, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata_sole_young, x='total_counts', y='n_genes_by_counts')

In [None]:
# Keep cells with 200 < n_genes_by_counts < 2500 and less than 15 % mitochondrial counts.
# One combined mask plus .copy() yields a standalone AnnData rather than a view of a view,
# so the in-place preprocessing that follows does not run on a view.
adata_sole_young = adata_sole_young[((adata_sole_young.obs.n_genes_by_counts < 2500) &
                                     (adata_sole_young.obs.n_genes_by_counts > 200) &
                                     (adata_sole_young.obs.pct_counts_mt < 15)).values, :].copy()

In [None]:
sc.pp.filter_genes(adata_sole_young, min_counts=1)
sc.pp.normalize_total(adata_sole_young)
sc.pp.log1p(adata_sole_young)

In [None]:
sc.pp.pca(adata_sole_young, random_state=seed, n_comps=30)
sc.pp.neighbors(adata_sole_young, random_state=seed, n_neighbors=int(len(adata_sole_young) ** 0.5 // 2), metric='cosine')
tk.tl.triku(adata_sole_young, n_procs=1, random_state=seed, use_adata_knn=True)

In [None]:
sc.tl.umap(adata_sole_young, min_dist=0.1, random_state=seed)
sc.tl.leiden(adata_sole_young, resolution=6, random_state=seed)

In [None]:
sc.pl.umap(adata_sole_young, color=['leiden', 'LUM', 'PDGFRA', 'COL1A1', 'DCN'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
sc.pl.umap(adata_sole_young, color=['leiden', 'S100B', 'MPZ', 'PLP1', 'MLANA', 'PMEL'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
assign_cats(adata_sole_young, dict_cats=dict_cats_fb, min_score=0.3, quantile_gene_sel=0.2)

In [None]:
sc.pl.umap(adata_sole_young, color=['leiden', 'assigned_cats'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# Keep the cells labelled as fibroblasts. .copy() makes this a standalone object (as done
# for adata_tabib_fb), because the next steps modify it in place.
adata_sole_young_fb = adata_sole_young[adata_sole_young.obs['assigned_cats'] == 'fibro'].copy()

In [None]:
sc.pp.filter_genes(adata_sole_young_fb, min_counts=1)

In [None]:
sc.pp.pca(adata_sole_young_fb, random_state=seed, n_comps=30)
sc.pp.neighbors(adata_sole_young_fb, random_state=seed, n_neighbors=int(len(adata_sole_young_fb) ** 0.5), metric='cosine')
tk.tl.triku(adata_sole_young_fb, n_procs=1, random_state=seed, use_adata_knn=True)

In [None]:
sc.tl.umap(adata_sole_young_fb, min_dist=0.2, random_state=seed)
sc.tl.leiden(adata_sole_young_fb, resolution=5, random_state=seed)

In [None]:
assign_cats(adata_sole_young_fb, dict_cats=dict_cats_axes, min_score=0.35, key_added='axis')
assign_cats(adata_sole_young_fb, dict_cats=dict_cats_clusters, min_score=0.6, quantile_gene_sel=0.5, key_added='cluster')

In [None]:
sc.pl.umap(adata_sole_young_fb, color=['leiden', 'axis', 'cluster'], legend_loc='on data', 
           cmap=magma, use_raw=False, )

In [None]:
sc.pl.umap(adata_sole_young_fb, color=['SLC2A1', 'CAV1', 'NBL1', 'LMO7', 'CTSH', 'C19orf33'], legend_loc='on data', 
           cmap=magma, use_raw=False, )

In [None]:
# NOTE(review): this cell sits in the Solé-Boldo section but ranks genes on adata_tabib_fb —
# confirm that the Tabib object (and not adata_sole_young_fb) is the intended one.
# rank_genes_groups defaults to .raw when it is set, and adata_tabib_fb.raw holds the
# un-normalised counts. use_raw=False runs the test on the log-normalised .X, which is also
# what the tracksplot displays.
sc.tl.rank_genes_groups(adata_tabib_fb, groupby='leiden', groups=['3'], reference='rest', use_raw=False)
sc.pl.rank_genes_groups_tracksplot(adata_tabib_fb, dendrogram=False, n_genes=150, use_raw=False)

In [None]:
A3_genes = ['WIF1', 'IGFBP4', 'RGCC', 'LY6E', 'PTGDS', 'NKD2', 'COMP', 'APCDD1', 'BAX', 'HSPB3', 'DUSP4',  'SGCA', 'LEPR']
sc.pl.umap(adata_sole_young_fb, color=['cluster'] + A3_genes, legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(adata_sole_young_fb, color=['cluster'] + [i for i in val if i in adata_sole_young_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

### Vorstandlechner et al. 2020

In [None]:
vors_dir = data_dir + '/Vorstandlechner_2020'

In [None]:
adata_vors = sc.read(vors_dir + '/skin_vorstandlechner.loom', cache=True)

In [None]:
sc.pp.filter_genes(adata_vors, min_counts=1)

In [None]:
# Basic QC filtering
adata_vors.var['mt'] = adata_vors.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata_vors, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(adata_vors, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(adata_vors, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata_vors, x='total_counts', y='n_genes_by_counts')

In [None]:
# Keep cells with 250 < n_genes_by_counts < 1700 and less than 15 % mitochondrial counts.
# One combined mask plus .copy() yields a standalone AnnData rather than a view of a view,
# so the in-place preprocessing that follows does not run on a view.
adata_vors = adata_vors[((adata_vors.obs.n_genes_by_counts < 1700) &
                         (adata_vors.obs.n_genes_by_counts > 250) &
                         (adata_vors.obs.pct_counts_mt < 15)).values, :].copy()

In [None]:
sc.pp.filter_genes(adata_vors, min_counts=1)
sc.pp.normalize_total(adata_vors)
sc.pp.log1p(adata_vors)

In [None]:
sc.pp.pca(adata_vors, random_state=seed, n_comps=30)
sc.pp.neighbors(adata_vors, random_state=seed, n_neighbors=int(len(adata_vors) ** 0.5 // 2), metric='cosine')
tk.tl.triku(adata_vors, n_procs=1, random_state=seed, use_adata_knn=True)

In [None]:
sc.tl.umap(adata_vors, min_dist=1, random_state=seed)
sc.tl.leiden(adata_vors, resolution=0.8, random_state=seed)

In [None]:
assign_cats(adata_vors, dict_cats=dict_cats_fb, min_score=0.5, quantile_gene_sel=0.2)

In [None]:
sc.pl.umap(adata_vors, color=['assigned_cats', 'leiden'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
sc.tl.rank_genes_groups(adata_vors, groupby='leiden', groups=['8', '16', '17', '18'])
sc.pl.rank_genes_groups_tracksplot(adata_vors, dendrogram=False, n_genes=50)

In [None]:
# Keep the cells labelled as fibroblasts. .copy() makes this a standalone object (as done
# for adata_tabib_fb), because the next steps modify it in place.
adata_vors_fb = adata_vors[adata_vors.obs['assigned_cats'].isin(['fibro'])].copy()
# Snapshot taken before the fibroblast-level gene filtering. Despite the name, adata_vors.X
# has already been normalised and log-transformed at this point.
adata_vors_fb_raw = adata_vors_fb.copy()

In [None]:
sc.pp.filter_genes(adata_vors_fb, min_counts=1)

In [None]:
sc.pp.pca(adata_vors_fb, random_state=seed, n_comps=30)
sc.pp.neighbors(adata_vors_fb, random_state=seed, n_neighbors=int(len(adata_vors_fb) ** 0.5 // 3), metric='cosine')
tk.tl.triku(adata_vors_fb, n_procs=1, random_state=seed, use_adata_knn=True)

In [None]:
sc.tl.umap(adata_vors_fb, min_dist=0.3, random_state=seed)
sc.tl.leiden(adata_vors_fb, resolution=5, random_state=seed)

In [None]:
assign_cats(adata_vors_fb, dict_cats=dict_cats_axes, min_score=0.35, key_added='axis')
assign_cats(adata_vors_fb, dict_cats=dict_cats_clusters, min_score=0.4, quantile_gene_sel=0.5, key_added='cluster')

In [None]:
sc.pl.umap(adata_vors_fb, color=['leiden', 'axis', 'cluster'], legend_loc='on data', 
           cmap=magma, use_raw=False, )

In [None]:
sc.pl.umap(adata_vors_fb, color=['DMKN', 'KRT5', 'KRT14'], legend_loc='on data', 
           cmap=magma, use_raw=False, )

In [None]:
# Drop the 'C3' obs column when present. The guard keeps a fresh Restart & Run All from
# stopping with KeyError if no earlier step created the column.
# NOTE(review): no visible cell creates obs['C3'] — confirm where it comes from.
if 'C3' in adata_vors_fb.obs:
    del adata_vors_fb.obs['C3']

In [None]:
sc.tl.rank_genes_groups(adata_vors_fb, groupby='leiden', groups=['1', '2', '3'])
sc.pl.rank_genes_groups_tracksplot(adata_vors_fb, dendrogram=False, n_genes=100)

In [None]:
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(adata_vors_fb, color=['cluster'] + [i for i in val if i in adata_vors_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

### He et al. 2020

In [None]:
# Resolve against data_dir like every other dataset, so moving the data root only requires
# changing data_dir.
he_dir = data_dir + '/He_2020'

In [None]:
adata_he = sc.read_loom(he_dir + '/He2020.loom')
adata_he.var_names_make_unique()

In [None]:
# Rename the gene symbols listed in dict_rep (CCN5 becomes WISP2, which is a key gene);
# every other symbol is kept as is.
adata_he.var_names = [dict_rep.get(gene, gene) for gene in adata_he.var_names]

In [None]:
# Basic QC filtering
adata_he.var['mt'] = adata_he.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata_he, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(adata_he, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(adata_he, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata_he, x='total_counts', y='n_genes_by_counts')

In [None]:
# Keep cells with 400 < n_genes_by_counts < 5000 and less than 30 % mitochondrial counts.
# One combined mask plus .copy() yields a standalone AnnData rather than a chain of views,
# so the in-place preprocessing that follows does not run on a view.
adata_he = adata_he[((adata_he.obs.n_genes_by_counts < 5000) &
                     (adata_he.obs.n_genes_by_counts > 400) &
                     (adata_he.obs.pct_counts_mt < 30)).values, :].copy()

In [None]:
sc.pp.filter_genes(adata_he, min_counts=1)
sc.pp.normalize_total(adata_he)
sc.pp.log1p(adata_he)

In [None]:
sc.pp.pca(adata_he, random_state=seed, n_comps=30)
sc.pp.neighbors(adata_he, random_state=seed, n_neighbors=int(len(adata_he) ** 0.5 // 2), metric='cosine')
tk.tl.triku(adata_he, n_procs=1, random_state=seed, use_adata_knn=True)

In [None]:
sc.tl.umap(adata_he, min_dist=0.2, random_state=seed)
sc.tl.leiden(adata_he, resolution=1.5, random_state=seed)

In [None]:
sc.pl.umap(adata_he, color=['leiden', 'LUM', 'PDGFRA', 'COL1A1', 'DCN'], legend_loc='on data', 
           cmap=magma, use_raw=False)

In [None]:
assign_cats(adata_he, dict_cats=dict_cats_fb, min_score=0.3, quantile_gene_sel=0.2)

In [None]:
sc.pl.umap(adata_he, color=['leiden', 'assigned_cats'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# Keep the cells labelled as fibroblasts. .copy() makes this a standalone object (as done
# for adata_tabib_fb), because the next steps modify it in place.
adata_he_fb = adata_he[adata_he.obs['assigned_cats'] == 'fibro'].copy()

In [None]:
sc.pp.filter_genes(adata_he_fb, min_counts=1)

In [None]:
sc.pp.pca(adata_he_fb, random_state=seed, n_comps=30)
sc.pp.neighbors(adata_he_fb, random_state=seed, n_neighbors=int(len(adata_he_fb) ** 0.5 // 2), metric='cosine')
tk.tl.triku(adata_he_fb, n_procs=1, random_state=seed, use_adata_knn=True)

In [None]:
sc.tl.umap(adata_he_fb, min_dist=0.2, random_state=seed)
sc.tl.leiden(adata_he_fb, resolution=15, random_state=seed)

In [None]:
assign_cats(adata_he_fb, dict_cats=dict_cats_axes, min_score=0.4, key_added='axis')
assign_cats(adata_he_fb, dict_cats=dict_cats_clusters, min_score=0.6, quantile_gene_sel=0.5, key_added='cluster')

In [None]:
sc.pl.umap(adata_he_fb, color=['leiden', 'axis', 'cluster'], legend_loc='on data', 
           cmap=magma, use_raw=False, )

In [None]:
sc.pl.umap(adata_he_fb, color=['DMKN', 'KRT5', 'KRT14'], legend_loc='on data', 
           cmap=magma, use_raw=False, )

In [None]:
# Drop the 'C3' obs column when present. The guard keeps a fresh Restart & Run All from
# stopping with KeyError if no earlier step created the column.
# NOTE(review): no visible cell creates obs['C3'] — confirm where it comes from.
if 'C3' in adata_he_fb.obs:
    del adata_he_fb.obs['C3']

The unassigned region is overexpressing AMFR, GPX3, **XIST**, MTND6P4, PTX3.

In [None]:
sc.tl.rank_genes_groups(adata_he_fb, groupby='leiden', groups=['1', '2', '3'])
sc.pl.rank_genes_groups_tracksplot(adata_he_fb, dendrogram=False, n_genes=100)

In [None]:
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(adata_he_fb, color=['cluster'] + [i for i in val if i in adata_he_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

### Kim et al. 2020

In [None]:
kim_dir = data_dir + '/Kim_2020'

In [None]:
adata_kim = sc.read(kim_dir + '/Kim_2020.h5ad')
adata_kim.var_names_make_unique()

In [None]:
adata_kim.var_names = [dict_rep[i] if i in dict_rep else i for i in adata_kim.var_names ]

In [None]:
sc.pp.filter_genes(adata_kim, min_counts=1)

In [None]:
# Store the expression matrix as a dense ndarray.
adata_kim.X = adata_kim.X.toarray()

In [None]:
# Basic QC filtering
adata_kim.var['mt'] = adata_kim.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata_kim, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(adata_kim, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(adata_kim, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata_kim, x='total_counts', y='n_genes_by_counts')

In [None]:
# Keep cells with 500 < n_genes_by_counts < 4000 and less than 25 % mitochondrial counts.
# One combined mask plus .copy() yields a standalone AnnData rather than a view of a view,
# so the in-place preprocessing that follows does not run on a view.
adata_kim = adata_kim[((adata_kim.obs.n_genes_by_counts < 4000) &
                       (adata_kim.obs.n_genes_by_counts > 500) &
                       (adata_kim.obs.pct_counts_mt < 25)).values, :].copy()

In [None]:
adata_kim

In [None]:
# Overlay, for every batch, the distribution of the number of detected genes per cell.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases — consider
# sns.histplot / sns.kdeplot once the seaborn version in use is confirmed.
batches = sorted(set(adata_kim.obs['batch'].values))
for batch in batches:
    in_batch = adata_kim.obs['batch'] == batch
    counts = adata_kim.obs.loc[in_batch, 'n_genes_by_counts'].values
    sns.distplot(counts)

In [None]:
sc.pp.filter_genes(adata_kim, min_counts=1)
sc.pp.normalize_total(adata_kim)
sc.pp.log1p(adata_kim)

In [None]:
adata_kim

In [None]:
sc.pp.pca(adata_kim, random_state=seed, n_comps=30)
sce.pp.bbknn(adata_kim, metric='angular', batch_key='batch', neighbors_within_batch=3)
tk.tl.triku(adata_kim, n_procs=1, random_state=seed, use_adata_knn=True)

In [None]:
sc.tl.umap(adata_kim, min_dist=0.1, random_state=seed)
sc.tl.leiden(adata_kim, resolution=0.7, random_state=seed)

In [None]:
sc.pl.umap(adata_kim, color=['leiden', 'batch'], legend_loc='on data')

In [None]:
sc.pl.umap(adata_kim, color=['leiden', 'LUM', 'PDGFRA', 'COL1A1', 'DCN'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
assign_cats(adata_kim, dict_cats=dict_cats_fb)

In [None]:
sc.pl.umap(adata_kim, color=['leiden', 'assigned_cats'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
adata_kim_fb = adata_kim[adata_kim.obs['assigned_cats'] == 'fibro']

In [None]:
np.unique(adata_kim_fb.obs['batch'].values, return_counts=True)

In [None]:
# Restrict the fibroblasts to batches '0'-'4'. .copy() makes this a standalone object
# instead of a view of a view, because the next steps modify it in place.
adata_kim_fb = adata_kim_fb[adata_kim_fb.obs['batch'].isin(['0', '1', '2', '3', '4'])].copy()

In [None]:
sc.pp.filter_genes(adata_kim_fb, min_counts=1)

In [None]:
sc.pp.pca(adata_kim_fb, random_state=seed, n_comps=30)
sce.pp.bbknn(adata_kim_fb, metric='angular', batch_key='batch', neighbors_within_batch=3)
tk.tl.triku(adata_kim_fb, n_procs=1, random_state=seed, use_adata_knn=True)

In [None]:
sc.tl.umap(adata_kim_fb, min_dist=0.4, random_state=seed)
sc.tl.leiden(adata_kim_fb, resolution=7, random_state=seed)

In [None]:
assign_cats(adata_kim_fb, dict_cats=dict_cats_axes, min_score=0.35, key_added='axis')
assign_cats(adata_kim_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.5, key_added='cluster')

In [None]:
sc.pl.umap(adata_kim_fb, color=['leiden', 'axis', 'cluster'], legend_loc='on data', 
           cmap=magma, use_raw=False, )

In [None]:
# Drop the 'C3' obs column when present. The guard keeps a fresh Restart & Run All from
# stopping with KeyError if no earlier step created the column.
# NOTE(review): no visible cell creates obs['C3'] — confirm where it comes from.
if 'C3' in adata_kim_fb.obs:
    del adata_kim_fb.obs['C3']

The unassigned region is overexpressing AMFR, GPX3, **XIST**, MTND6P4, PTX3.

In [None]:
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(adata_kim_fb, color=['cluster'] + [i for i in val if i in adata_kim_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

### Gaydosik et al. 2020

In [None]:
gaydosik_dir = data_dir + '/gaydosik_2020'

In [None]:
adata_gaydosik_CTCL = sc.read_h5ad(gaydosik_dir + '/gaydosik_2020_CTCL.h5ad')
adata_gaydosik_HC = sc.read_h5ad(gaydosik_dir + '/gaydosik_2020_HC.h5ad')

In [None]:
# Basic QC filtering
adata_gaydosik_HC.var['mt'] = adata_gaydosik_HC.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata_gaydosik_HC, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(adata_gaydosik_HC, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(adata_gaydosik_HC, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata_gaydosik_HC, x='total_counts', y='n_genes_by_counts')

In [None]:
# Keep cells with 400 < n_genes_by_counts < 5500 and less than 30 % mitochondrial counts.
# One combined mask plus .copy() yields a standalone AnnData rather than a view of a view,
# so the in-place preprocessing that follows does not run on a view.
adata_gaydosik_HC = adata_gaydosik_HC[((adata_gaydosik_HC.obs.n_genes_by_counts < 5500) &
                                       (adata_gaydosik_HC.obs.n_genes_by_counts > 400) &
                                       (adata_gaydosik_HC.obs.pct_counts_mt < 30)).values, :].copy()

In [None]:
sc.pp.filter_genes(adata_gaydosik_HC, min_counts=1)
sc.pp.normalize_total(adata_gaydosik_HC)
sc.pp.log1p(adata_gaydosik_HC)

In [None]:
sc.pp.pca(adata_gaydosik_HC, random_state=seed, n_comps=30)
sce.pp.bbknn(adata_gaydosik_HC, metric='angular', batch_key='sample', neighbors_within_batch=3)
tk.tl.triku(adata_gaydosik_HC, n_procs=1, random_state=seed, use_adata_knn=True)

In [None]:
sc.tl.umap(adata_gaydosik_HC, min_dist=0.1, random_state=seed)
sc.tl.leiden(adata_gaydosik_HC, resolution=0.3, random_state=seed)

In [None]:
sc.pl.umap(adata_gaydosik_HC, color=['leiden', 'sample'], legend_loc='on data')

In [None]:
sc.pl.umap(adata_gaydosik_HC, color=['leiden', 'LUM', 'PDGFRA', 'COL1A1', 'DCN'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
assign_cats(adata_gaydosik_HC, dict_cats=dict_cats_fb)

In [None]:
sc.pl.umap(adata_gaydosik_HC, color=['leiden', 'assigned_cats'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# Keep the cells labelled as fibroblasts. .copy() makes this a standalone object (as done
# for adata_tabib_fb), because the next steps modify it in place.
adata_gaydosik_HC_fb = adata_gaydosik_HC[adata_gaydosik_HC.obs['assigned_cats'] == 'fibro'].copy()

In [None]:
sc.pp.filter_genes(adata_gaydosik_HC_fb, min_counts=1)

In [None]:
sc.pp.pca(adata_gaydosik_HC_fb, random_state=seed, n_comps=30)
sce.pp.bbknn(adata_gaydosik_HC_fb, metric='angular', batch_key='sample', neighbors_within_batch=3)
tk.tl.triku(adata_gaydosik_HC_fb, n_procs=1, random_state=seed, use_adata_knn=True)

In [None]:
sc.tl.umap(adata_gaydosik_HC_fb, min_dist=0.1, random_state=seed)
sc.tl.leiden(adata_gaydosik_HC_fb, resolution=4, random_state=seed)

In [None]:
assign_cats(adata_gaydosik_HC_fb, dict_cats=dict_cats_axes, min_score=0.35, key_added='axis')
assign_cats(adata_gaydosik_HC_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.5, key_added='cluster')

In [None]:
sc.pl.umap(adata_gaydosik_HC_fb, color=['leiden', 'axis', 'cluster'], legend_loc='on data', 
           cmap=magma, use_raw=False, )

In [None]:
# Drop the 'C3' obs column when present. The guard keeps a fresh Restart & Run All from
# stopping with KeyError if no earlier step created the column.
# NOTE(review): no visible cell creates obs['C3'] — confirm where it comes from.
if 'C3' in adata_gaydosik_HC_fb.obs:
    del adata_gaydosik_HC_fb.obs['C3']

The unassigned region is overexpressing AMFR, GPX3, **XIST**, MTND6P4, PTX3.

In [None]:
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(adata_gaydosik_HC_fb, color=['cluster'] + [i for i in val if i in adata_gaydosik_HC_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

### McCarthy et al. 2020

In [None]:
mccarthy_dir = data_dir + '/McCarthy_2020'

In [None]:
df_meta = pd.read_csv(mccarthy_dir + '/E-MTAB-7167.sdrf.txt', sep='\t')
df_meta = df_meta.drop_duplicates('Comment[ENA_RUN]').set_index('Comment[ENA_RUN]')

In [None]:
adata_mccarthy = sc.read_loom(mccarthy_dir + '/mccarthy_2020.loom')

In [None]:
# Basic QC filtering
adata_mccarthy.var['mt'] = adata_mccarthy.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata_mccarthy, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(adata_mccarthy, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(adata_mccarthy, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata_mccarthy, x='total_counts', y='n_genes_by_counts')

In [None]:
# Keep cells with less than 17 % mitochondrial counts, fewer than 3,000,000 total counts
# and more than 4000 detected genes. One combined mask plus .copy() yields a standalone
# AnnData rather than a chain of views, so later in-place steps do not run on a view.
adata_mccarthy = adata_mccarthy[((adata_mccarthy.obs['pct_counts_mt'] < 17) &
                                 (adata_mccarthy.obs['total_counts'] < 3_000_000) &
                                 (adata_mccarthy.obs['n_genes_by_counts'] > 4000)).values].copy()

In [None]:
sc.pl.violin(adata_mccarthy, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(adata_mccarthy, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata_mccarthy, x='total_counts', y='n_genes_by_counts')

In [None]:
G1S_genes = ['MCM5','PCNA','TYMS','FEN1','MCM2','MCM4','RRM1','UNG','GINS2','MCM6','CDCA7','DTL','PRIM1','UHRF1','MLF1IP',
'HELLS','RFC2','RPA2','NASP','RAD51AP1','GMNN','WDR76','SLBP','CCNE2','UBR7','POLD3','MSH2','ATAD2','RAD51',
'RRM2','CDC45','CDC6','EXO1','TIPIN','DSCC1','BLM','CASP8AP2','USP1','CLSPN','POLA1','CHAF1B','BRIP1','E2F8',]

G2M_genes = ['HMGB2','CDK1','NUSAP1','UBE2C','BIRC5','TPX2','TOP2A','NDC80','CKS2','NUF2','CKS1B','MKI67',
'TMPO','CENPF','TACC3','FAM64A','SMC4','CCNB2','CKAP2L','CKAP2','AURKB','BUB1','KIF11','ANP32E','TUBB4B',
'GTSE1','KIF20B','HJURP','CDCA3','HN1','CDC20','TTK','CDC25C','KIF2C','RANGAP1','NCAPD2','DLGAP5','CDCA2',
'CDCA8','ECT2','KIF23','HMMR', 'AURKA','PSRC1','ANLN','LBR','CKAP5','CENPE','CTCF','NEK2','G2E3','GAS2L3','CBX5','CENPA',]


In [None]:
# Score every cell for the S-phase (G1S_genes) and G2/M-phase (G2M_genes) gene sets; the
# resulting 'phase' annotation is plotted on the UMAP further down.
# NOTE(review): this runs before the filter_genes / normalisation / log1p cell below, i.e.
# presumably on un-normalised counts — confirm that this ordering is intended.
sc.tl.score_genes_cell_cycle(adata_mccarthy, s_genes=G1S_genes, g2m_genes=G2M_genes)

In [None]:
# Copy the sample annotations from the SDRF table into adata.obs. df_meta is indexed by ENA
# run id, and each cell is looked up by its obs_name. Every obs column is named after the
# text inside the brackets of its 'Characteristics[...]' column.
for obs_key in ['sex', 'individual', 'age', 'organism part', 'cell type', 'disease',
                'single cell quality']:
    adata_mccarthy.obs[obs_key] = df_meta['Characteristics[' + obs_key + ']'].loc[adata_mccarthy.obs_names.values]

In [None]:
# Drop genes without counts, scale every cell to the median library size and log-transform.
# normalize_total replaces the deprecated normalize_per_cell and matches the other datasets.
# key_added='n_counts' keeps the per-cell total in obs['n_counts'], as normalize_per_cell did.
sc.pp.filter_genes(adata_mccarthy, min_counts=1)
sc.pp.normalize_total(adata_mccarthy, key_added='n_counts')
sc.pp.log1p(adata_mccarthy)

In [None]:
# Neighbourhood size scales with the dataset: floor(sqrt(n_cells) / 2).
_n_neighbors = int(len(adata_mccarthy) ** 0.5 // 2)

sc.pp.pca(adata_mccarthy, n_comps=30, random_state=seed)
sc.pp.neighbors(adata_mccarthy, n_neighbors=_n_neighbors, metric='cosine', random_state=seed)
# triku is run on the kNN graph already stored in adata (use_adata_knn=True)
tk.tl.triku(adata_mccarthy, n_procs=1, random_state=seed, use_adata_knn=True)

In [None]:
# UMAP embedding plus a very coarse Leiden clustering (resolution=0.01), shown next to
# QC covariates and the cell-cycle phase.
sc.tl.umap(adata_mccarthy, min_dist=0.3, random_state=seed)
sc.tl.leiden(adata_mccarthy, resolution=0.01, random_state=seed)
sc.pl.umap(adata_mccarthy, color=['leiden', 'n_genes_by_counts', 'pct_counts_mt', 'phase'], 
           legend_loc='on data', ncols=2)

In [None]:
# Overlay the coarse clusters and every imported metadata field on the UMAP.
_mccarthy_meta_keys = ['sex', 'individual', 'age', 'organism part',
                       'cell type', 'disease', 'single cell quality']
sc.pl.umap(adata_mccarthy, color=['leiden'] + _mccarthy_meta_keys,
           legend_loc='on data', ncols=2)

In [None]:
# Rank marker genes per Leiden cluster (350 genes kept per group) and plot the top 50 of each.
sc.tl.rank_genes_groups(adata_mccarthy, groupby='leiden', n_genes=350)
sc.pl.rank_genes_groups_tracksplot(adata_mccarthy, dendrogram=False, n_genes=50)

In [None]:
def _enrichr_go_bp(adata, group):
    """Run Enrichr (GO Biological Process 2018) on the ranked marker genes of one group.

    Parameters
    ----------
    adata : AnnData holding `uns['rank_genes_groups']` from a previous rank_genes_groups call.
    group : name of the group (here a Leiden cluster label) whose gene names are submitted.

    Returns
    -------
    The object returned by `gp.enrichr`; its `.results` table is inspected in the next cells.

    Each group gets its own description and output directory, so the reports of one
    cluster no longer overwrite those of the other.
    """
    return gp.enrichr(
        gene_list=list(adata.uns['rank_genes_groups']['names'][group]),
        gene_sets=['GO_Biological_Process_2018'],
        organism='Human',
        description=f'leiden_{group}',
        outdir=f'test/enrichr_go_bp/leiden_{group}',
        # NOTE(review): loose cutoff kept from the original call; the tables below are
        # filtered again at adjusted p < 0.01.
        cutoff=0.5,
    )


enr0 = _enrichr_go_bp(adata_mccarthy, '0')
enr1 = _enrichr_go_bp(adata_mccarthy, '1')

In [None]:
# Enriched terms for cluster 0 with adjusted p-value below 0.01
enr0.results.loc[lambda res: res['Adjusted P-value'] < 0.01]

In [None]:
# Enriched terms for cluster 1 with adjusted p-value below 0.01
enr1.results.loc[lambda res: res['Adjusted P-value'] < 0.01]

In [None]:
# Re-cluster at a much finer resolution; this overwrites the coarse 'leiden' labels used above.
sc.tl.leiden(adata_mccarthy, resolution=2, random_state=seed)

In [None]:
# Label cells with the fibroblast axis and cluster signatures; the labels are plotted below
# as 'axis' and 'cluster' (key_added).
# NOTE(review): assign_cats comes from the external cellassign package; the meaning of
# min_score / quantile_gene_sel is not visible in this notebook — see its documentation.
assign_cats(adata_mccarthy, dict_cats=dict_cats_axes, min_score=0.4, key_added='axis')
assign_cats(adata_mccarthy, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.5, key_added='cluster')

In [None]:
# Compare the fine Leiden clusters with the assigned axis / cluster labels.
sc.pl.umap(adata_mccarthy, color=['leiden', 'axis', 'cluster'], legend_loc='on data', 
           cmap=magma, use_raw=False, )

In [None]:
# Remove the 'C3' obs column before plotting genes.
# NOTE(review): presumably written by assign_cats and removed because it clashes with the
# gene C3 when passed to `color=` — confirm.
# Guarded so that re-running the cell does not raise KeyError.
if 'C3' in adata_mccarthy.obs:
    del adata_mccarthy.obs['C3']

In [None]:
# One UMAP grid per cluster signature: the assigned label followed by every signature gene
# that is present in this dataset.
for cluster_name, signature in dict_cats_clusters.items():
    print(cluster_name)
    genes_present = [gene for gene in signature if gene in adata_mccarthy.var_names]
    sc.pl.umap(adata_mccarthy, color=['cluster'] + genes_present,
               legend_loc='on data', cmap=magma, use_raw=False, ncols=4)

### Gao et al. 2021

In [None]:
# Directory with the Gao et al. 2021 files, located under data_dir
gao_dir = data_dir + '/gao_2021'

In [None]:
# Load the Gao et al. 2021 dataset and keep only patients Ctrl1-Ctrl3.
# .copy() materialises the subset: the following cells write to .var / .obs, which on an
# AnnData view triggers an implicit copy and an ImplicitModificationWarning.
adata_gao = sc.read(gao_dir + '/gao_2021.loom')
adata_gao = adata_gao[adata_gao.obs['Patient'].isin(['Ctrl1', 'Ctrl2', 'Ctrl3'])].copy()

In [None]:
# Basic QC metrics (nothing is filtered in this cell; thresholds are applied further down)
adata_gao.var['mt'] = adata_gao.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata_gao, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
# QC overview: distribution of each metric, then each metric against sequencing depth.
_qc_metrics = ['n_genes_by_counts', 'total_counts', 'pct_counts_mt']
sc.pl.violin(adata_gao, _qc_metrics, jitter=0.4, multi_panel=True)

for _y_metric in ('pct_counts_mt', 'n_genes_by_counts'):
    sc.pl.scatter(adata_gao, x='total_counts', y=_y_metric)

In [None]:
# Keep cells with 500 < detected genes < 7000 and less than 40 % mitochondrial counts.
# A single boolean mask replaces the two chained subsets, and .copy() turns the result into
# a real AnnData so the in-place preprocessing below does not operate on a view.
_gao_keep = (
    (adata_gao.obs['n_genes_by_counts'] > 500)
    & (adata_gao.obs['n_genes_by_counts'] < 7000)
    & (adata_gao.obs['pct_counts_mt'] < 40)
).values
adata_gao = adata_gao[_gao_keep, :].copy()

In [None]:
# Drop genes without counts, then depth-normalise and log1p-transform in place.
sc.pp.filter_genes(adata_gao, min_counts=1)
sc.pp.normalize_total(adata_gao)
sc.pp.log1p(adata_gao)

In [None]:
# 30-component PCA, then a batch-balanced kNN graph across patients (BBKNN, 3 neighbours per batch).
sc.pp.pca(adata_gao, random_state=seed, n_comps=30)
sce.pp.bbknn(adata_gao, metric='angular', batch_key='Patient', neighbors_within_batch=3)
# triku is run on the graph stored by BBKNN (use_adata_knn=True)
tk.tl.triku(adata_gao, n_procs=1, random_state=seed, use_adata_knn=True)

In [None]:
# Embed and cluster the whole Gao dataset (all cell types).
sc.tl.umap(adata_gao, min_dist=0.1, random_state=seed)
sc.tl.leiden(adata_gao, resolution=1.5, random_state=seed)

In [None]:
# Check how clusters and patients are distributed over the embedding.
sc.pl.umap(adata_gao, color=['leiden', 'Patient'], legend_loc='on data')

In [None]:
# Expression of LUM, PDGFRA, COL1A1 and DCN next to the clusters
# (presumably used as fibroblast markers to locate the fibroblast clusters — confirm).
sc.pl.umap(adata_gao, color=['leiden', 'LUM', 'PDGFRA', 'COL1A1', 'DCN'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# Label cells using dict_cats_fb; no key_added is given, and the result is read from
# obs['assigned_cats'] in the following cells.
assign_cats(adata_gao, dict_cats=dict_cats_fb, min_score=0.4,  quantile_gene_sel=0.4)

In [None]:
# Visual check of the assignment against the Leiden clusters.
sc.pl.umap(adata_gao, color=['leiden', 'assigned_cats'], legend_loc='on data', 
           cmap=magma, use_raw=False)

In [None]:
# Fibroblast-only subset. .copy() makes it an independent AnnData instead of a view of
# adata_gao, since it is modified in place (filter_genes, PCA, obs edits) in the next cells.
adata_gao_fb = adata_gao[adata_gao.obs['assigned_cats'] == 'fibro'].copy()

In [None]:
# Remove genes that have no counts left within the fibroblast subset.
sc.pp.filter_genes(adata_gao_fb, min_counts=1)

In [None]:
# Recompute PCA, the patient-balanced kNN graph and triku on the fibroblast subset.
sc.pp.pca(adata_gao_fb, random_state=seed, n_comps=30)
sce.pp.bbknn(adata_gao_fb, metric='angular', batch_key='Patient', neighbors_within_batch=3)
tk.tl.triku(adata_gao_fb, n_procs=1, random_state=seed, use_adata_knn=True)

In [None]:
# Embed the fibroblasts and over-cluster them (resolution=8).
sc.tl.umap(adata_gao_fb, min_dist=0.2, random_state=seed)
sc.tl.leiden(adata_gao_fb, resolution=8, random_state=seed)

In [None]:
# Assign fibroblast axis and cluster labels; results are plotted below as 'axis' and 'cluster'.
assign_cats(adata_gao_fb, dict_cats=dict_cats_axes, min_score=0.4, key_added='axis')
assign_cats(adata_gao_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.5, key_added='cluster')

In [None]:
# Compare the fine Leiden clusters with the assigned axis / cluster labels.
sc.pl.umap(adata_gao_fb, color=['leiden', 'axis', 'cluster'], legend_loc='on data', 
           cmap=magma, use_raw=False, )

In [None]:
# Expression of a hand-picked set of genes on the fibroblast UMAP.
sc.pl.umap(adata_gao_fb, color=['ECRG4', 'TNNC1', 'PDLIM1', 'CLTB', 'CSRP1', 'SLC2A1', 'CAV1', 'ANGPTL7'], legend_loc='on data', 
           cmap=magma, use_raw=False, )

The unassigned cluster does not show any relevant DEGs, so we consider it uninformative ("rubbish") rather than a distinct population.

In [None]:
# Remove the 'C3' and 'C6' obs columns before plotting genes.
# NOTE(review): presumably written by assign_cats and removed because they clash with the
# genes C3 / C6 when passed to `color=` — confirm.
# Guarded so that re-running the cell does not raise KeyError.
for _obs_key in ('C3', 'C6'):
    if _obs_key in adata_gao_fb.obs:
        del adata_gao_fb.obs[_obs_key]

In [None]:
# Marker genes of each assigned cluster; tracksplot of the top 150 per cluster.
sc.tl.rank_genes_groups(adata_gao_fb, groupby='cluster')
sc.pl.rank_genes_groups_tracksplot(adata_gao_fb, dendrogram=False, n_genes=150)

In [None]:
# Marker genes of Leiden cluster '4' only; this overwrites the per-'cluster' ranking stored above.
sc.tl.rank_genes_groups(adata_gao_fb, groupby='leiden', groups=['4'])
sc.pl.rank_genes_groups_tracksplot(adata_gao_fb, dendrogram=False, n_genes=100)

In [None]:
# One UMAP grid per cluster signature: the assigned label followed by every signature gene
# that is present in this dataset.
for cluster_name, signature in dict_cats_clusters.items():
    print(cluster_name)
    genes_present = [gene for gene in signature if gene in adata_gao_fb.var_names]
    sc.pl.umap(adata_gao_fb, color=['cluster'] + genes_present,
               legend_loc='on data', cmap=magma, use_raw=False, ncols=4)

### Mirizio et al. 2020

In [None]:
# Directory with the Mirizio et al. 2020 files, located under data_dir
mirizio_dir = data_dir + '/mirizio_2020'

In [None]:
# Load the Mirizio et al. 2020 AnnData and de-duplicate gene names.
adata_mirizio = sc.read(mirizio_dir + '/Mirizio_2020.h5ad')
adata_mirizio.var_names_make_unique()

In [None]:
# Basic QC metrics (nothing is filtered in this cell; thresholds are applied further down)
adata_mirizio.var['mt'] = adata_mirizio.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata_mirizio, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
# QC overview: distribution of each metric, then each metric against sequencing depth.
_qc_metrics = ['n_genes_by_counts', 'total_counts', 'pct_counts_mt']
sc.pl.violin(adata_mirizio, _qc_metrics, jitter=0.4, multi_panel=True)

for _y_metric in ('pct_counts_mt', 'n_genes_by_counts'):
    sc.pl.scatter(adata_mirizio, x='total_counts', y=_y_metric)

In [None]:
# Keep cells with 250 < detected genes < 4000 and less than 40 % mitochondrial counts.
# A single boolean mask replaces the two chained subsets, and .copy() turns the result into
# a real AnnData so the in-place preprocessing below does not operate on a view.
_mirizio_keep = (
    (adata_mirizio.obs['n_genes_by_counts'] > 250)
    & (adata_mirizio.obs['n_genes_by_counts'] < 4000)
    & (adata_mirizio.obs['pct_counts_mt'] < 40)
).values
adata_mirizio = adata_mirizio[_mirizio_keep, :].copy()

In [None]:
# Drop genes without counts, then depth-normalise and log1p-transform in place.
sc.pp.filter_genes(adata_mirizio, min_counts=1)
sc.pp.normalize_total(adata_mirizio)
sc.pp.log1p(adata_mirizio)

In [None]:
# 30-component PCA, then a batch-balanced kNN graph over the 'batch' column (3 neighbours per batch).
sc.pp.pca(adata_mirizio, random_state=seed, n_comps=30)
sce.pp.bbknn(adata_mirizio, metric='angular', batch_key='batch', neighbors_within_batch=3)
# triku is run on the graph stored by BBKNN (use_adata_knn=True)
tk.tl.triku(adata_mirizio, n_procs=1, random_state=seed, use_adata_knn=True)

In [None]:
# Embed and cluster the whole Mirizio dataset (all cell types).
sc.tl.umap(adata_mirizio, min_dist=0.1, random_state=seed)
sc.tl.leiden(adata_mirizio, resolution=3, random_state=seed)

In [None]:
# Check how clusters and batches are distributed over the embedding.
sc.pl.umap(adata_mirizio, color=['leiden', 'batch'], legend_loc='on data')

In [None]:
# Expression of LUM, PDGFRA, COL1A1 and DCN next to the clusters
# (presumably used as fibroblast markers to locate the fibroblast clusters — confirm).
sc.pl.umap(adata_mirizio, color=['leiden', 'LUM', 'PDGFRA', 'COL1A1', 'DCN'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# Label cells using dict_cats_fb; no key_added is given, and the result is read from
# obs['assigned_cats'] in the following cells.
assign_cats(adata_mirizio, dict_cats=dict_cats_fb, min_score=0.4)

In [None]:
# Visual check of the assignment against the Leiden clusters.
sc.pl.umap(adata_mirizio, color=['leiden', 'assigned_cats'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# Fibroblast-only subset. .copy() makes it an independent AnnData instead of a view of
# adata_mirizio, since it is modified in place (filter_genes, PCA, obs edits) in the next cells.
adata_mirizio_fb = adata_mirizio[adata_mirizio.obs['assigned_cats'] == 'fibro'].copy()

In [None]:
# Remove genes that have no counts left within the fibroblast subset.
sc.pp.filter_genes(adata_mirizio_fb, min_counts=1)

In [None]:
# Recompute PCA, the batch-balanced kNN graph and triku on the fibroblast subset.
sc.pp.pca(adata_mirizio_fb, random_state=seed, n_comps=30)
sce.pp.bbknn(adata_mirizio_fb, metric='angular', batch_key='batch', neighbors_within_batch=3)
tk.tl.triku(adata_mirizio_fb, n_procs=1, random_state=seed, use_adata_knn=True)

In [None]:
# Embed the fibroblasts and over-cluster them (resolution=5).
sc.tl.umap(adata_mirizio_fb, min_dist=0.1, random_state=seed)
sc.tl.leiden(adata_mirizio_fb, resolution=5, random_state=seed)

In [None]:
# Assign fibroblast axis and cluster labels; results are plotted below as 'axis' and 'cluster'.
# Note that the cluster assignment uses min_score=0.4 here, whereas the McCarthy and Gao cells use 0.5.
assign_cats(adata_mirizio_fb, dict_cats=dict_cats_axes, min_score=0.4, key_added='axis')
assign_cats(adata_mirizio_fb, dict_cats=dict_cats_clusters, min_score=0.4, quantile_gene_sel=0.5, key_added='cluster')

In [None]:
# Compare the fine Leiden clusters with the assigned axis / cluster labels.
sc.pl.umap(adata_mirizio_fb, color=['leiden', 'axis', 'cluster'], legend_loc='on data', 
           cmap=magma, use_raw=False, )

In [None]:
# Expression of a hand-picked set of genes on the fibroblast UMAP.
sc.pl.umap(adata_mirizio_fb, color=['ECRG4', 'ANGPTL7', 'SLC2A1', 'C19orf33'], legend_loc='on data', 
           cmap=magma, use_raw=False, )

In [None]:
# Remove the 'C3' obs column before plotting genes.
# NOTE(review): presumably written by assign_cats and removed because it clashes with the
# gene C3 when passed to `color=` — confirm.
# Guarded so that re-running the cell does not raise KeyError.
if 'C3' in adata_mirizio_fb.obs:
    del adata_mirizio_fb.obs['C3']

In [None]:
# Marker genes of Leiden clusters '5' and '6' only; tracksplot of the top 100 per group.
sc.tl.rank_genes_groups(adata_mirizio_fb, groupby='leiden', groups=['5', '6'])
sc.pl.rank_genes_groups_tracksplot(adata_mirizio_fb, dendrogram=False, n_genes=100)

In [None]:
# One UMAP grid per cluster signature: the assigned label followed by every signature gene
# that is present in this dataset.
for cluster_name, signature in dict_cats_clusters.items():
    print(cluster_name)
    genes_present = [gene for gene in signature if gene in adata_mirizio_fb.var_names]
    sc.pl.umap(adata_mirizio_fb, color=['cluster'] + genes_present,
               legend_loc='on data', cmap=magma, use_raw=False, ncols=4)

## Dataset merge

In [None]:
# Merge the fibroblast subsets of all datasets. Keeping label and object side by side in one
# mapping guarantees that `batch_categories` stays aligned with the concatenation order.
_fb_datasets = {
    'Tabib': adata_tabib_fb,
    'Solé-Boldo': adata_sole_young_fb,
    'Vorstandlechner': adata_vors_fb,
    'He': adata_he_fb,
    'Kim': adata_kim_fb,
    'Gaydosik': adata_gaydosik_HC_fb,
    'Gao': adata_gao_fb,
    'Mirizio': adata_mirizio_fb,
}

# join='outer' keeps the union of genes across datasets.
adata_all = sc.AnnData.concatenate(*_fb_datasets.values(),
                                   batch_key='dataset',
                                   batch_categories=list(_fb_datasets),
                                   join='outer')

# Drop cells without a cluster label. .copy() avoids leaving adata_all as a view, which the
# in-place steps below (filter_genes, obs assignment, PCA) would otherwise have to copy implicitly.
adata_all = adata_all[adata_all.obs['cluster'] != 'unassigned'].copy()

In [None]:
# Remove genes with no counts in the merged object.
sc.pp.filter_genes(adata_all, min_counts=1)

In [None]:
# Combined batch key: the dataset label concatenated with each dataset's own 'batch' value.
# NOTE(review): datasets that have no 'batch' obs column (Gao is integrated on 'Patient' above)
# would presumably contribute the string 'nan' here and collapse into one batch — confirm.
adata_all.obs['dataset_batch'] = adata_all.obs['dataset'].astype(str) + \
                                    adata_all.obs['batch'].astype(str)

In [None]:
# Integration of the merged object: PCA followed by BBKNN over 'dataset_batch'.
# The commented-out lines are an alternative Harmony-based integration that is not executed.
# sce.pp.harmony_integrate(adata_all, key='dataset_batch', max_iter_harmony = 30, epsilon_harmony = 5e-5,)
sc.pp.pca(adata_all, random_state=seed, n_comps=30)
# sc.pp.neighbors(adata_all, n_neighbors=int(len(adata_all) ** 0.5 // 2), use_rep='X_pca_harmony')
sce.pp.bbknn(adata_all, metric='angular', batch_key='dataset_batch', neighbors_within_batch=4)
tk.tl.triku(adata_all, n_procs=1, random_state=seed, use_adata_knn=True)

In [None]:
# Embed and cluster the merged fibroblasts.
sc.tl.umap(adata_all, min_dist=0.05, random_state=seed)
sc.tl.leiden(adata_all, resolution=5, random_state=seed)

In [None]:
# Merged embedding with labels drawn on top of the points.
sc.pl.umap(adata_all, color=['leiden', 'axis', 'cluster'], 
           legend_loc='on data', 
           cmap=magma, use_raw=False, )

In [None]:
# Same embedding with the default legend placement instead of on-data labels.
sc.pl.umap(adata_all, color=['axis', 'cluster'], cmap=magma, use_raw=False, )

In [None]:
# For every cluster label add an obs column `is_<label>` that keeps the label for member cells
# and '-' for all others, plus a two-colour palette (grey / red) for plotting it.
for cluster_label in dict.fromkeys(adata_all.obs['cluster'].values):
    flag_key = f'is_{cluster_label}'
    adata_all.obs[flag_key] = [
        label if label == cluster_label else '-'
        for label in adata_all.obs['cluster'].values
    ]
    adata_all.uns[f'{flag_key}_colors'] = ['#bcbcbc', '#bc0000']

In [None]:
# One panel per cluster, highlighting its cells on the merged embedding.
_flag_keys = sorted(f'is_{label}' for label in dict.fromkeys(adata_all.obs['cluster'].values))
sc.pl.umap(adata_all, color=_flag_keys, cmap=magma, use_raw=False, ncols=2)

In [None]:
# Dataset of origin on the merged embedding, to judge how well the datasets mix.
sc.pl.umap(adata_all, color=['dataset'], cmap=magma, use_raw=False, )

In [None]:
# Labels plus selected genes on the merged embedding.
# NOTE(review): 'C2orf40' is used here while the per-dataset cells above plot 'ECRG4'
# (presumably two symbols for the same gene) — confirm which one is present in adata_all.var_names.
sc.pl.umap(adata_all, color=['axis', 'cluster', 'C2orf40', 'ANGPTL7', 'SFRP4', ], cmap=magma, use_raw=False, )

In [None]:
# Remove the 'C2' obs column.
# NOTE(review): presumably it clashes with the gene C2 when passed to `color=` — confirm.
# Guarded so that re-running the cell does not raise KeyError.
if 'C2' in adata_sole_young_fb.obs:
    del adata_sole_young_fb.obs['C2']

In [None]:
# Marker genes per fibroblast axis in the merged object; tracksplot of the top 100 per axis.
sc.tl.rank_genes_groups(adata_all, groupby='axis')
sc.pl.rank_genes_groups_tracksplot(adata_all, dendrogram=False, n_genes=100)

In [None]:
# Marker genes per cluster in the merged object (overwrites the per-axis ranking stored above).
sc.tl.rank_genes_groups(adata_all, groupby='cluster')
sc.pl.rank_genes_groups_tracksplot(adata_all, dendrogram=False, n_genes=100)

In [None]:
# Leiden clusters of the Vorstandlechner fibroblasts, to pick the cluster inspected next.
sc.pl.umap(adata_vors_fb, color='leiden')

In [None]:
# Wilcoxon markers of Leiden cluster '1' in the Vorstandlechner fibroblasts; top 250 plotted.
sc.tl.rank_genes_groups(adata_vors_fb, groupby='leiden', groups=['1'], method='wilcoxon')
sc.pl.rank_genes_groups_tracksplot(adata_vors_fb, dendrogram=False, n_genes=250, use_raw=False)

In [None]:
# Remove the 'C6' obs column.
# NOTE(review): presumably it clashes with the gene C6 when passed to `color=` — confirm.
# Guarded so that re-running the cell does not raise KeyError.
if 'C6' in adata_tabib_fb.obs:
    del adata_tabib_fb.obs['C6']

In [None]:
# Remove the 'C3' obs column if it is still there. The same column is already deleted in the
# Mirizio section above, so an unguarded `del` raises KeyError on a top-to-bottom run.
if 'C3' in adata_mirizio_fb.obs:
    del adata_mirizio_fb.obs['C3']

In [None]:
# Reference dataset for the marker UMAP further down (alias of the same object, not a copy).
adata_base = adata_sole_young_fb

In [None]:
# Cluster label(s) whose markers are ranked in every dataset by the cells below.
groups = ['D1']

In [None]:
# Tabib: Wilcoxon markers of the clusters in `groups`; tracksplot of the top 250.
sc.tl.rank_genes_groups(adata_tabib_fb, groupby='cluster', groups=groups, method='wilcoxon')
sc.pl.rank_genes_groups_tracksplot(adata_tabib_fb, dendrogram=False, n_genes=250, use_raw=False)

In [None]:
# Solé-Boldo (young): Wilcoxon markers of the clusters in `groups`; tracksplot of the top 250.
sc.tl.rank_genes_groups(adata_sole_young_fb, groupby='cluster', groups=groups, method='wilcoxon')
sc.pl.rank_genes_groups_tracksplot(adata_sole_young_fb, dendrogram=False, n_genes=250, use_raw=False)

In [None]:
# Vorstandlechner: Wilcoxon markers of the clusters in `groups`; tracksplot of the top 250.
sc.tl.rank_genes_groups(adata_vors_fb, groupby='cluster', groups=groups, method='wilcoxon')
sc.pl.rank_genes_groups_tracksplot(adata_vors_fb, dendrogram=False, n_genes=250, use_raw=False)

In [None]:
# He: Wilcoxon markers of the clusters in `groups`; tracksplot of the top 250.
sc.tl.rank_genes_groups(adata_he_fb, groupby='cluster', groups=groups, method='wilcoxon')
sc.pl.rank_genes_groups_tracksplot(adata_he_fb, dendrogram=False, n_genes=250, use_raw=False)

In [None]:
# Kim: Wilcoxon markers of the clusters in `groups`; tracksplot of the top 250.
sc.tl.rank_genes_groups(adata_kim_fb, groupby='cluster', groups=groups, method='wilcoxon')
sc.pl.rank_genes_groups_tracksplot(adata_kim_fb, dendrogram=False, n_genes=250, use_raw=False)

In [None]:
# Gaydosik (HC): Wilcoxon markers of the clusters in `groups`; tracksplot of the top 250.
sc.tl.rank_genes_groups(adata_gaydosik_HC_fb, groupby='cluster', groups=groups, method='wilcoxon')
sc.pl.rank_genes_groups_tracksplot(adata_gaydosik_HC_fb, dendrogram=False, n_genes=250, use_raw=False)

In [None]:
# Gao: Wilcoxon markers of the clusters in `groups`; tracksplot of the top 250.
sc.tl.rank_genes_groups(adata_gao_fb, groupby='cluster', groups=groups, method='wilcoxon')
sc.pl.rank_genes_groups_tracksplot(adata_gao_fb, dendrogram=False, n_genes=250, use_raw=False)

In [None]:
# Mirizio: Wilcoxon markers of the clusters in `groups`; tracksplot of the top 250.
sc.tl.rank_genes_groups(adata_mirizio_fb, groupby='cluster', groups=groups, method='wilcoxon')
sc.pl.rank_genes_groups_tracksplot(adata_mirizio_fb, dendrogram=False, n_genes=250, use_raw=False)

In [None]:
# Display the AnnData summary of the Mirizio fibroblast object.
adata_mirizio_fb

In [None]:
# Plot the top 250 ranked genes of the group of interest on the reference dataset.
# The group name is taken from `groups` instead of a hard-coded 'A3': rank_genes_groups was
# run above with groups=groups, so the stored 'names' table only has fields for those groups
# and indexing any other label fails.
_top_markers = list(adata_base.uns['rank_genes_groups']['names'][groups[0]][:250])
sc.pl.umap(adata_base, color=['axis', 'cluster'] + _top_markers, cmap=magma, use_raw=False, )

In [None]:
# Curated gene panel used in the plots below: de-duplicated and alphabetically sorted.
list_genes = sorted({
    'ABCA6', 'ASPN', 'BGN', 'BOLA3', 'C12orf75', 'CAV2',
    'COLEC12', 'COTL1', 'CRABP1', 'EMID1', 'F2R', 'FAP',
    'FIBIN', 'FN1', 'GAS1', 'GPM6B', 'IGF1', 'ITIH5',
    'LOXL1', 'LRRC15', 'LRRC17', 'LTBP2', 'OGN', 'PALLD',
    'PDE1A', 'PLEKHH2', 'PMEPA1', 'POSTN', 'PRSS23', 'PTN',
    'RASL11B', 'RSPO4', 'S100A3', 'TNN',
})

In [None]:
# Print the gene panel, one symbol per line
print(*list_genes, sep='\n')

In [None]:
# Select the dataset inspected by the cells below: keep exactly one assignment uncommented.
# adata_base = adata_tabib_fb
# adata_base = adata_sole_young_fb
# adata_base = adata_vors_fb
# adata_base = adata_he_fb
# adata_base = adata_kim_fb
# adata_base = adata_gaydosik_HC_fb
adata_base = adata_gao_fb
# adata_base = adata_mirizio_fb
# adata_base = adata_mirizio_fb

In [None]:
# Remove the 'C3' obs column from the selected dataset if it is still present.
# Several datasets (e.g. Gao, the current adata_base) already had it deleted in their own
# section, so an unguarded `del` raises KeyError on a top-to-bottom run.
if 'C3' in adata_base.obs:
    del adata_base.obs['C3']

In [None]:
# Tracksplot of the gene panel per cluster, restricted to genes present in this dataset.
# Missing genes are reported and skipped instead of being replaced by a placeholder gene
# ('SOX2'), which produced duplicated tracks and failed whenever the placeholder was absent.
_panel_present = [gene for gene in list_genes if gene in adata_base.var_names]
_panel_missing = [gene for gene in list_genes if gene not in adata_base.var_names]
if _panel_missing:
    print('Genes not found in this dataset (skipped):', ', '.join(_panel_missing))

sc.pl.tracksplot(adata_base, var_names=_panel_present,
                 groupby='cluster', use_raw=False)

In [None]:
# UMAPs of the labels and of every panel gene present in this dataset.
# Missing genes are reported and skipped instead of being replaced by a placeholder gene
# ('SOX10'), which produced duplicated panels and failed whenever the placeholder was absent.
_panel_present = [gene for gene in list_genes if gene in adata_base.var_names]
_panel_missing = [gene for gene in list_genes if gene not in adata_base.var_names]
if _panel_missing:
    print('Genes not found in this dataset (skipped):', ', '.join(_panel_missing))

sc.pl.umap(adata_base, color=['axis', 'cluster', 'leiden'] + _panel_present,
           cmap=magma, use_raw=False, ncols=3, )

## C4*

In [None]:
# Cells labelled 'C4' in the merged object. .copy() makes the subset independent of adata_all,
# since it is filtered and re-embedded in place by the next cells.
adata_all_C4 = adata_all[adata_all.obs['cluster'] == 'C4'].copy()

In [None]:
# Re-run gene filtering, PCA, BBKNN and triku on the C4 subset only.
# Batches are balanced on 'dataset' here, whereas the merged object above uses 'dataset_batch'.
sc.pp.filter_genes(adata_all_C4, min_counts=1)
sc.pp.pca(adata_all_C4, random_state=seed, n_comps=30)
sce.pp.bbknn(adata_all_C4, metric='angular', batch_key='dataset', neighbors_within_batch=3)
tk.tl.triku(adata_all_C4, n_procs=1, random_state=seed, use_adata_knn=True)

In [None]:
# Embed the C4 cells and cluster them coarsely (resolution=0.3).
sc.tl.umap(adata_all_C4, min_dist=0.1, random_state=seed)
sc.tl.leiden(adata_all_C4, resolution=0.3, random_state=seed)

In [None]:
# Labels, dataset of origin and the new sub-clusters on the C4 embedding.
sc.pl.umap(adata_all_C4, color=['axis', 'cluster', 'dataset',  'leiden'], cmap=magma, use_raw=False, )

In [None]:
# Gene panel on the C4 embedding. Only genes still present after filter_genes on this subset
# are plotted; passing an absent gene to `color=` raises a KeyError.
_panel_in_C4 = [gene for gene in list_genes if gene in adata_all_C4.var_names]
sc.pl.umap(adata_all_C4, color=['axis', 'cluster', 'leiden'] + _panel_in_C4, cmap=magma, use_raw=False, )

In [None]:
# Wilcoxon markers of each C4 sub-cluster; tracksplot of the top 250 per sub-cluster.
sc.tl.rank_genes_groups(adata_all_C4, groupby='leiden', method='wilcoxon')
sc.pl.rank_genes_groups_tracksplot(adata_all_C4, dendrogram=False, n_genes=250, use_raw=False)