# Comparison of fibroblast populations

In this notebook we are going to extract and replicate the main populations from different papers where fibroblast populations are described, and find similarities and differences. The premise of this analysis is that many of the populations described in different papers seem not to match, or to be transcriptomically different, but in reality they are quite similar; that is, the main types of populations are indeed shared by the different papers, which should come as no surprise.

**After the publication in JID we will include the following papers, as confirmatory results**
* Tabib 2018
* Solé-Boldo 2020
* Vorstandlechner 2020
* He 2020
* Kim 2020
* Gaydosik 2020
* McCarthy 2020
* Mirizio 2020
* Gao 2021
* Reynolds 2021

Additionally, we will reanalyze the *classic 4* papers, to check that cell populations are assigned as expected. For these papers, UMAPs might vary compared to the ones in our paper, but the main results should still be the same.

## imports

In [None]:
import scanpy as sc
import scanpy.external as sce
import pandas as pd
import numpy as np
import os
import triku as tk
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from tqdm.notebook import tqdm
import scipy.sparse as spr
# import ray
# import subprocess
# import time
# import scvelo as scv
# import gc
# import gseapy as gp

In [None]:
!pip install cellassign

In [None]:
from cellassign import assign_cats

In [None]:
# To print versions of imports 

import types

def imports():
    """Yield the ``__name__`` of every module object bound in the global namespace.

    The global namespace is snapshotted before iterating. Because this is a generator,
    the caller may bind new global names while consuming it (e.g. the target of a
    top-level ``for`` loop); iterating the live dict would then fail with
    "dictionary changed size during iteration".

    Yields
    ------
    str
        ``__name__`` of each global value that is a ``types.ModuleType`` (for instance
        ``'matplotlib.pyplot'`` for ``plt``).
    """
    for val in list(globals().values()):
        if isinstance(val, types.ModuleType):
            yield val.__name__

excludes = ['builtins', 'types', 'sys']

imported_modules = [module for module in imports() if module not in excludes]

clean_modules = []

for module in imported_modules:

    sep = '.'  # to handle 'matplotlib.pyplot' cases
    rest = module.split(sep, 1)[0]
    clean_modules.append(rest)

changed_imported_modules = list(set(clean_modules))  # drop duplicates

pip_modules = !pip freeze  # you could also use `!conda list` with anaconda

for module in pip_modules:
    try:
        name, version = module.split('==')
        if name in changed_imported_modules:
            print(name + '\t' + version)
    except:
        pass

In [None]:
seed = 0

In [None]:
# Palettes for UMAP gene expression

# Sample 80 evenly spaced colors from matplotlib's 'magma' colormap.
magma = [plt.get_cmap('magma')(i) for i in np.linspace(0,1, 80)]
# Replace the lowest color with light grey, so the bottom of the scale is drawn in grey.
magma[0] = (0.88, 0.88, 0.88, 1)
# Build the final colormap from the first 65 colors only, dropping the upper end of 'magma'.
magma = mpl.colors.LinearSegmentedColormap.from_list("", magma[:65])

In [None]:
# Marker genes of the broad cell categories. Passed as `dict_cats` to `assign_cats` on each
# full dataset; the resulting 'assigned_cats' label is then used to select the fibroblasts.
# NOTE(review): 'CTNNB1' is listed twice under 'F' — confirm whether the repetition is intended.
dict_cats_fb = {'fibro': ['LUM', 'PDGFRA', 'COL1A1', 'DCN', 'SFRP2', 'APOE'], 
                'melanocyte': ['MLANA', 'PMEL', 'TRIM63', 'QPCT', 'PLP1', 'TYRP1'], 
                'peri': ['RGS5', 'MYL9', 'NDUFA4L2'], 
                'eritro': ['HBB', 'HBA2', 'HBA1', 'HBD'],
                'muscle': ['DES', 'PCP4', 'ACTG2', 'SYNPO2', 'PRUNE2', 'SORBS1', 'P2RX1'],
                'immune': ['TPSB2', 'TPSAB1', 'HLA-DRA', 'FCER1G', 'CD74'], 
                'endo': ['PLVAP', 'CLDN5', 'ACKR1', 'LMCD1', 'NPDC1', 'A2M', 
                         'PECAM1', 'CLU', 'VWF', 'CD74', 'RAMP2', 'IFI27', 'GNG11'], 
                'lymph': ['CCL21', 'LYVE1', 'CLDN5'],
                'kerato': ['DMKN', 'KRT1', 'KRT5', 'KRT14', 'AQP3', 'SFN' ], 
                'krt7/8/19': ['S100A1','KRT19','PPP1R1B','KRT7','KRT8','SNORC','NCALD','CA6',
                              'AKR1C2','TPD52L1','PDK3','ROPN1B','QDPR'],
                'mt': ['MTND2P28', 'MTND4P12', 'MTCO1P40', 'ADAM33', 'RN7SL2', 'MTRNR2L6'], 
                'neuro': ['GPM6B','PLP1','S100B','SCN7A','NRXN1','GFRA3','MPZ'],
                'T cells': ['CD52', 'CD3D', 'TRAC'],
                'B cells': ['IGHM','CD74','CD79A','NIBAN3','TCL1A','NCF1','MS4A1', 'BTK', 'CD19'],
                'APC': ['HLA-DQA1', 'HLA-DRB6', 'TYROBP', 'FCER1G', 'AIF1'], 
                'mast_cells': ['IL1RL1', 'CPA3', 'HPGDS', 'TPSB2', 'HPGD', 'RGS13', 'CTSG', 
                               'TPSAB1', 'GATA2'],
                'F': ['B4GALT1', 'TMSB4X', 'PPP1CB', 'WTAP', 'PTPRS', 'CTNNB1', 'INSR', 'BICC1', 'CTNNB1'], 
               }

# Marker genes of the fibroblast axes A, B and C. Passed as `dict_cats` to `assign_cats`
# (grouped by the 'cluster' column) to fill the 'axis' annotation.
# NOTE(review): some genes are repeated within a list ('PTGIS' and 'COL14A1' in 'A';
# 'APOC1' and 'EBF1' in 'B') — confirm whether the repetition is intended.
dict_cats_axes = {'A': ['PTGIS', 'SFRP2', 'MMP2', 'RGCC', 'COL14A1', 'AQP1', 'PTGIS', 'QPCT', 'ELN', 'COL14A1', 'LEPR', 
                        'ISM1', 'CES1', 'WIF1', 'RECK', 'SGCA', 'FBN1', 'NBL1', 'CTSB', 'COL5A1', 'SMOC2', 'SGCG'],
                  'B': ['GGT5', 'APOE', 'APOC1', 'CYGB', 'C7', 'IGFBP7', 'TNFSF13B', 'APOC1', 'RARRES2', 'CCDC146', 'CXCL12', 
                        'ITM2A', 'EBF1', 'CCL2', 'IGFBP3', 'EBF1', 'CXCL2', 'EFEMP1', 'TMEM176A', 'C3', 'EGR1', 'LGALS3BP', 
                        'BST2', 'ANGPTL4', 'ABCA8', ],
                  'C': ['TNN', 'TNMD', 'ASPN', 'CYP1B1', 'GPC1', 'PPP1R14A', 'GPM6B', 'COL11A1', 'DKK3', 'OGN', 'SDC1', 
                        'PDE1A', 'MDK', 'NRP2', 'POSTN', 'F2R', 'KIF26B', 'TENM3', 'ALX4', 'PMEPA1', 'FIBIN', 'PCDH15',                        ]}

# Marker genes of the fibroblast clusters (A1-A4, B1-B3, C1-C4, D1-D3, U1-U3) plus a few
# non-fibroblast categories. Passed as `dict_cats` to `assign_cats` on the fibroblast
# subsets to fill the 'cluster' annotation, and iterated to plot the markers on the UMAPs.
# Both symbols of the pairs in `dict_rep` (WISP2/CCN5, C2orf40/ECRG4) are listed.
# 'Glial' uses 'NRXN1' (previously misspelt 'NRNX1', which matches no gene and was therefore
# silently ignored); this is the spelling already used in dict_cats_fb['neuro'].
# NOTE(review): 'TRFL' in 'U3' does not look like a standard gene symbol — please verify.
dict_cats_clusters = {'A1': ['SLPI', 'C1QTNF3', 'CD70', 'SEMA3C', 'TSPAN8', 'DPP4', 'CHRDL1', 
                             'SEMA3B', 'CTHRC1', 'WISP2', 'CCN5', 'PI16', 'QPCT'],
                     'A2': ['CLEC2A', 'COL13A1', 'COL23A1', 'COL6A5', 'HSPB3', 'NPTX2', 'COMP', 'APCDD1', 'NKD2', 
                            'PHACTR3', 'AKAP6', 'LEPR', 'STC2', 'WIF1', 'CDC42EP3', 'COL18A1',], 
                     'A3': ['WIF1', 'HAS1', 'SGCG', 'CORIN', 'ACKR4', 'C1QTNF3', 'CFD', 'QPCT', 'SGCA', ], # 'HAS1', 'CORIN', 'SGCG', 'F13A1', 'RETREG1', 
                     'A4': ['SFRP4', 'HSD3B7', 'CD70', 'C1QTNF3', 'PRG4', 'GLIPR2', 'PAMR1', 'FNDC1',],
                      
                     'B1': ['CXCL2', 'SPSB1', 'CXCL1', 'IL6', 'MYOC', 'CCL2', 'CXCL3', 'TNFSF14', 'MEDAG', 'ZC3H12A', 
                            'C11orf96', 'IRF8', 'ITM2A', 'KDM6B', 'SOD2', 'CDKN1A', 'CSRNP1', 'CSRP2', 'ERRFI1', 'FMO1', ], 
                     'B2': ['CCL19', 'C7', 'IGFBP3', 'RBP5', 'CCDC146', 'CH25H', 'TNFSF13B', 'CTSH', 'CD74' ], 
                     'B3': ['CHRDL1', 'GPX3', 'GPC3', 'ITM2A', 'MGP', 'TSPAN8', 'ADH1B', 'C7', 
                            'GGT5', 'MGST1', 'ROBO3', 'DNASE1L3', 'AADAC', 'HHIP-AS1', ],  # ITM2A, GPC3      
                      
                     'C1': ['DPEP1', 'UGT3A2', 'COL11A1', 'MME', 'RBP4', 'MYL4', 'WFDC1', 'CYYR1', 'EDNRA', 
                            'MEF2C', 'TNMD', 'CDH11', 'COL21A1', 'GPC3', 'KIF26B', 'PPP1R14A', 'EDIL3', 'SLC26A7' ],
                     'C2': ['COCH', 'SLITRK6', 'MKX', 'CHADL', 'RSPO4', 'CRABP1', 'NDNF', 'SLC22A16', 'FIBIN', 'TNN', 'CCK', 
                            'RHPN1', 'MAB21L2', 'ASPN', 'OGN', 'PLXDC1', 'SPARCL1', 'PLPP5', 'WNT10A', 'NECAB1', 'TNMD' ], # 'SLITRK6', 'MKX'
                     'C3': ['BGN', 'F2R', 'TNN', 'POSTN', 'GPM6B', 'PRSS23', 'FAP', 'EMID1', ],
                     'C4': ['IGFBP3', 'LUZP2', 'CENPW', 'TFAP2A', 'TPD52', 'LEF1', 'CPE', 'BMP7', 'DIO3', 'ACTR3B', 
                            'BAMBI', 'INHBA', 'SERPINE2', 'WNT5A', 'BMP4', 'STMN1', 'SEMA3G', 'NOG', 'DCXR', 'EDNRA'],
                      
                     'D1': ['CDH19', 'ANGPTL7', 'PLEKHB1', 'ENTPD2', 'SOX8', 'FGFBP2', 'SCRG1', 'TTYH1', 'KANK4', 'C2orf40', 
                            'ECRG4', 'COL9A3', 'SCN7A', 'FMO2', 'MCTP1', 'P2RY14', 'MIA', 'TIAM1', 'ATP1A2', 'APOD'], 
                     'D2': ['TNNT2', 'SLC2A1', 'SFRP5', 'WNT6', 'C19orf33', 'LMO7', 'NGFR', 'SBSPON', 'SLC22A3', 'DACT1', 
                            'TNNC1', 'EBF2', 'TENM2', 'ALLC', 'AQP3', 'NDUFA4L2', 'ITGA6', 'ITGB4', 'TJP1', 'ZBTB7C'],
                     'D3': ['FGFBP2', 'CPE', 'OLFML2A', 'SLC22A3', 'IGFBP2', 'SPON2', 'APOD', 'EGR2', 
                            'RAMP1', 'IGF1', 'KLK1', 'RGMA', 'PDGFD', 'PRSS23', 'TIMP3',], 
                     'U1': ['CHD1', 'MME', 'HSPH1', 'ADAMTS5', 'SESN3', 'REL'],
                     'U2-B/C': ['ICAM1', 'C11orf96', 'GPC3', 'HILPDA', 'NFKB1', 'PIM1', 'DAB2', 'STIP1', 'MFHAS1', 'DIO2', 'GBP1', 'JAG1', 'CXCL1', 'ASPN'],
                     'U3': ['ASPN', 'HTRA1', 'EGR1', 'MIR99AHG', 'HTRA3', 'TCF7L2', 'SOCS3', 'MAFB', 'COL6A6', 'CDKN2C', 'TRFL', 'CPXM1', 'ASB3', 
                            'SLC7A8', 'HPGD'],
                     'Glial': ['SOX10', 'S100B', 'NRXN1', 'L1CAM', 'AATK', 'SCN7A', 'GFRA3'], 
                     'melanocyte': ['MLANA', 'PMEL', 'TRIM63', 'QPCT', 'PLP1', 'TYRP1'],
                     'endo': ['PLVAP', 'CLDN5', 'ACKR1', 'LMCD1', 'NPDC1', 'A2M', 
                         'PECAM1', 'CLU', 'VWF', 'CD74', 'RAMP2', 'IFI27', 'GNG11'], 
                    }

In [None]:
# Hex color of each fibroblast cluster, used to build `uns['cluster_colors']` for the plots.
# Clusters missing from this mapping are given a grey fallback where the palette is built.
dict_colors = {'A1': '#c93038', 'A2': '#de6a38', 'A3': '#ffad3b', 'A4': '#852d66',
               'B1': '#b4d645', 'B2': '#51c43f', 'B3': '#309c63',
               'C1': '#63c2c9', 'C2': '#4c93ad', 'C3': '#264f6e', 'C4': '#233663',
               'D1': '#fcbf8a', 'D2': '#b58057', 'D3': '#8a503e',
               'U1': '#eb088d', 'U2-B/C': '#cb086d', 'U3': '#ab083d'
               }

In [None]:
# Gene symbol replacements (key -> value) applied to the `var_names` of some datasets, so
# that genes stored under the key symbol are renamed to the value symbol.
dict_rep = {'CCN5': 'WISP2', 'ECRG4': 'C2orf40'}

In [None]:
mpl.rcParams['figure.dpi'] = 150

In [None]:
def plot_score_graph(adatax):
    """Bar plot of the score each cell obtained for the cluster it was assigned to.

    For every category of ``adatax.obs['cluster']`` the values of the matching
    ``adatax.obs['cluster_<category>']`` column are collected for the cells of that
    category, and the per-cluster scores are drawn with ``sns.barplot``.

    Parameters
    ----------
    adatax
        Annotated data object providing ``obs`` (with a categorical ``'cluster'`` column
        and ``'cluster_<category>'`` score columns), ``obs_names`` and
        ``uns['cluster_colors']``. The colors are taken to follow the order of the
        categories of ``obs['cluster']``.

    Notes
    -----
    Categories without a score column are left out of the plot. Colors are passed to
    seaborn as an explicit ``{cluster: color}`` mapping, so leaving a category out does
    not shift the colors of the remaining bars.
    """
    obs = adatax.obs
    categories = list(obs['cluster'].cat.categories)
    color_of = dict(zip(categories, adatax.uns['cluster_colors']))

    df_cats_own = pd.DataFrame(index=adatax.obs_names, columns=['clusters', 'score'])
    for cluster in categories:
        score_col = f'cluster_{cluster}'
        if score_col not in obs.columns:
            # No score was stored for this category: skip it explicitly instead of
            # relying on a bare `except` that would also hide unrelated errors.
            continue
        cells = adatax.obs_names[(obs['cluster'] == cluster).values]
        if len(cells) == 0:
            continue
        df_cats_own.loc[cells, 'score'] = obs.loc[cells, score_col]
        df_cats_own.loc[cells, 'clusters'] = cluster

    # Cells of skipped categories have no cluster/score and are not plotted.
    df_cats_own = df_cats_own.dropna(subset=['clusters']).sort_values('clusters')
    # The frame is created empty (object dtype); make the plotted variable numeric.
    df_cats_own['score'] = df_cats_own['score'].astype(float)
    palette = {cluster: color_of.get(cluster, '#bcbcbc') for cluster in df_cats_own['clusters'].unique()}
    sns.barplot(x='clusters', y='score', data=df_cats_own, palette=palette)

**IMPORTANT: I am running this analysis on a computer with ~500 GB of RAM. I will load many datasets at once, which might be too much for some computers. I made this decision consciously, to have as much information available at any time as possible. If you cannot run the whole analysis at once, you can run it in parts.**

## data extraction and processing

In [None]:
# Root folder of the per-paper datasets, located under the current working directory.
# Dataset folders are appended with a leading '/', so the trailing slash here only
# produces a harmless double slash in the resulting paths.
data_dir = '{}/data/'.format(os.getcwd())
print(data_dir)

### Gao et al. 2021

In [None]:
gao_dir = data_dir + '/gao_2021'

In [None]:
adata_gao = sc.read(gao_dir + '/gao_2021.loom')
# Keep patients 'Ctrl1', 'Ctrl2' and 'Ctrl3' only. `.copy()` turns the subset view into an
# independent AnnData, so the in-place annotations that follow do not act on a view
# (which would make anndata copy it implicitly, with a warning).
adata_gao = adata_gao[adata_gao.obs['Patient'].isin(['Ctrl1', 'Ctrl2', 'Ctrl3'])].copy()

In [None]:
# Basic QC filtering
adata_gao.var['mt'] = adata_gao.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata_gao, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(adata_gao, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(adata_gao, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata_gao, x='total_counts', y='n_genes_by_counts')

In [None]:
# Keep cells with 500 < n_genes_by_counts < 7000 ...
adata_gao = adata_gao[((adata_gao.obs.n_genes_by_counts < 7000) & 
                                    (adata_gao.obs.n_genes_by_counts > 500)).values, :]
# ... and less than 40% of mitochondrial counts. The result is copied so that the in-place
# gene filtering and normalization that follow work on real data instead of a view.
adata_gao = adata_gao[adata_gao.obs.pct_counts_mt < 40, :].copy()

In [None]:
sc.pp.filter_genes(adata_gao, min_counts=1)
sc.pp.normalize_total(adata_gao)
sc.pp.log1p(adata_gao)

In [None]:
sc.pp.pca(adata_gao, random_state=seed, n_comps=30)
sce.pp.bbknn(adata_gao, metric='angular', batch_key='Patient', neighbors_within_batch=3)
tk.tl.triku(adata_gao)

In [None]:
sc.tl.umap(adata_gao, min_dist=0.1, random_state=seed)
sc.tl.leiden(adata_gao, resolution=1.5, random_state=seed)

In [None]:
sc.pl.umap(adata_gao, color=['leiden', 'Patient'], legend_loc='on data')

In [None]:
sc.pl.umap(adata_gao, color=['leiden', 'LUM', 'PDGFRA', 'COL1A1', 'DCN'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
assign_cats(adata_gao, dict_cats=dict_cats_fb, min_score=0.4,  quantile_gene_sel=0.4)

In [None]:
sc.pl.umap(adata_gao, color=['leiden', 'assigned_cats'], legend_loc='on data', 
           cmap=magma, use_raw=False)

In [None]:
# Fibroblast subset, copied so that it is an independent object (not a view of `adata_gao`)
# before it is filtered and re-embedded in place.
adata_gao_fb = adata_gao[adata_gao.obs['assigned_cats'] == 'fibro'].copy()

In [None]:
sc.pp.filter_genes(adata_gao_fb, min_counts=1)

In [None]:
sc.pp.pca(adata_gao_fb, random_state=seed, n_comps=30)
sce.pp.bbknn(adata_gao_fb, metric='angular', batch_key='Patient', neighbors_within_batch=2)
tk.tl.triku(adata_gao_fb)

In [None]:
sc.tl.umap(adata_gao_fb, min_dist=0.2, random_state=seed)
sc.tl.leiden(adata_gao_fb, resolution=4, random_state=seed)

In [None]:
assign_cats(adata_gao_fb, dict_cats=dict_cats_clusters, min_score=0.4, quantile_gene_sel=0.45, key_added='cluster')
assign_cats(adata_gao_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.35, quantile_gene_sel=0.6,
            key_added='axis', intermediate_states=True, diff=0.15)

In [None]:
adata_gao_fb.uns['cluster_colors'] = [dict_colors[i] if i in dict_colors else '#bcbcbc' for 
                                      i in sorted(set(adata_gao_fb.obs['cluster']))]

In [None]:
sc.pl.umap(adata_gao_fb, color=['Patient', 'leiden', 'axis', 'cluster'], legend_loc='on data', 
           cmap=magma, use_raw=False, )

In [None]:
sc.pl.umap(adata_gao_fb, color=['cluster'], cmap=magma, use_raw=False, )

The unassigned cluster doesn't show any relevant DEGs, so it is just "rubbish".

In [None]:
# Remove the 'C3' and 'C6' columns from `obs`. `errors='ignore'` keeps the cell re-runnable,
# whereas `del` raises KeyError once the columns are gone.
adata_gao_fb.obs = adata_gao_fb.obs.drop(columns=['C3', 'C6'], errors='ignore')

In [None]:
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(adata_gao_fb, color=['cluster'] + [i for i in val if i in adata_gao_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
plot_score_graph(adata_gao_fb)

### Gaydosik et al. 2020

In [None]:
gaydosik_dir = data_dir + '/gaydosik_2020'

In [None]:
adata_gaydosik_CTCL = sc.read_h5ad(gaydosik_dir + '/gaydosik_2020_CTCL.h5ad')
adata_gaydosik_HC = sc.read_h5ad(gaydosik_dir + '/gaydosik_2020_HC.h5ad')

In [None]:
# Basic QC filtering
adata_gaydosik_HC.var['mt'] = adata_gaydosik_HC.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata_gaydosik_HC, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(adata_gaydosik_HC, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(adata_gaydosik_HC, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata_gaydosik_HC, x='total_counts', y='n_genes_by_counts')

In [None]:
# Keep cells with 400 < n_genes_by_counts < 5500 ...
adata_gaydosik_HC = adata_gaydosik_HC[((adata_gaydosik_HC.obs.n_genes_by_counts < 5500) & 
                                    (adata_gaydosik_HC.obs.n_genes_by_counts > 400)).values, :]
# ... and less than 30% of mitochondrial counts. The result is copied so that the in-place
# gene filtering and normalization that follow work on real data instead of a view.
adata_gaydosik_HC = adata_gaydosik_HC[adata_gaydosik_HC.obs.pct_counts_mt < 30, :].copy()

In [None]:
sc.pp.filter_genes(adata_gaydosik_HC, min_counts=1)
sc.pp.normalize_total(adata_gaydosik_HC)
sc.pp.log1p(adata_gaydosik_HC)

In [None]:
sc.pp.pca(adata_gaydosik_HC, random_state=seed, n_comps=50)
sce.pp.bbknn(adata_gaydosik_HC, metric='angular', batch_key='sample', neighbors_within_batch=3)
tk.tl.triku(adata_gaydosik_HC)

In [None]:
sc.tl.umap(adata_gaydosik_HC, min_dist=0.1, random_state=seed)
sc.tl.leiden(adata_gaydosik_HC, resolution=0.3, random_state=seed)

In [None]:
sc.pl.umap(adata_gaydosik_HC, color=['leiden', 'sample'], legend_loc='on data')

In [None]:
sc.pl.umap(adata_gaydosik_HC, color=['leiden', 'LUM', 'PDGFRA', 'COL1A1', 'DCN'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
assign_cats(adata_gaydosik_HC, dict_cats=dict_cats_fb)

In [None]:
sc.pl.umap(adata_gaydosik_HC, color=['leiden', 'assigned_cats'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# Fibroblast subset, copied so that it is an independent object (not a view of
# `adata_gaydosik_HC`) before it is filtered and re-embedded in place.
adata_gaydosik_HC_fb = adata_gaydosik_HC[adata_gaydosik_HC.obs['assigned_cats'] == 'fibro'].copy()

In [None]:
sc.pp.filter_genes(adata_gaydosik_HC_fb, min_counts=1)

In [None]:
sc.pp.pca(adata_gaydosik_HC_fb, random_state=seed, n_comps=50)
sce.pp.bbknn(adata_gaydosik_HC_fb, metric='angular', batch_key='sample', neighbors_within_batch=3)
tk.tl.triku(adata_gaydosik_HC_fb)

In [None]:
sc.tl.umap(adata_gaydosik_HC_fb, min_dist=0.15, random_state=seed)
sc.tl.leiden(adata_gaydosik_HC_fb, resolution=6, random_state=seed)

In [None]:
assign_cats(adata_gaydosik_HC_fb, dict_cats=dict_cats_clusters, min_score=0.35, quantile_gene_sel=0.4, key_added='cluster')
assign_cats(adata_gaydosik_HC_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, key_added='axis', 
             intermediate_states=True, diff=0.15)

In [None]:
adata_gaydosik_HC_fb.uns['cluster_colors'] = [dict_colors[i] if i in dict_colors else '#bcbcbc' for 
                                      i in sorted(set(adata_gaydosik_HC_fb.obs['cluster']))]

In [None]:
sc.pl.umap(adata_gaydosik_HC_fb, color=['leiden', 'axis', 'cluster'], legend_loc='on data', 
           cmap=magma, use_raw=False, )

In [None]:
# Remove the 'C3' column from `obs`. `errors='ignore'` keeps the cell re-runnable, whereas
# `del` raises KeyError once the column is gone.
adata_gaydosik_HC_fb.obs = adata_gaydosik_HC_fb.obs.drop(columns=['C3'], errors='ignore')

In [None]:
# Differential expression (Wilcoxon) for Leiden cluster '24', plotting its top 200 genes.
# NOTE(review): the label '24' is hard-coded and depends on the Leiden partition computed
# earlier; re-check it if the clustering parameters or package versions change.
sc.tl.rank_genes_groups(adata_gaydosik_HC_fb, groupby='leiden', groups=['24'], method='wilcoxon')
sc.pl.rank_genes_groups_tracksplot(adata_gaydosik_HC_fb, dendrogram=False, n_genes=200)

In [None]:
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(adata_gaydosik_HC_fb, color=['cluster'] + [i for i in val if i in adata_gaydosik_HC_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
plot_score_graph(adata_gaydosik_HC_fb)

### He et al. 2020

In [None]:
he_dir = data_dir + '/He_2020'

In [None]:
adata_he = sc.read_loom(he_dir + '/He2020.loom')
adata_he.var_names_make_unique()

In [None]:
# Rename gene symbols following `dict_rep` (CCN5 -> WISP2, ECRG4 -> C2orf40); WISP2 is a key gene
adata_he.var_names = [dict_rep.get(gene, gene) for gene in adata_he.var_names]

In [None]:
# Basic QC filtering
adata_he.var['mt'] = adata_he.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata_he, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(adata_he, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(adata_he, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata_he, x='total_counts', y='n_genes_by_counts')

In [None]:
# Keep cells with 400 < n_genes_by_counts < 5000 and less than 30% of mitochondrial counts.
adata_he = adata_he[adata_he.obs.n_genes_by_counts < 5000, :]
adata_he = adata_he[adata_he.obs.n_genes_by_counts > 400, :]
# Copy the final subset so that the in-place gene filtering and normalization that follow
# work on real data instead of a view.
adata_he = adata_he[adata_he.obs.pct_counts_mt < 30, :].copy()

In [None]:
sc.pp.filter_genes(adata_he, min_counts=1)
sc.pp.normalize_total(adata_he)
sc.pp.log1p(adata_he)

In [None]:
sc.pp.pca(adata_he, random_state=seed, n_comps=30)
# This dataset uses the plain scanpy neighbor graph (cosine metric) instead of bbknn.
# The number of neighbors scales with the dataset size: `**` binds tighter than `*` and `//`,
# which are evaluated left to right, so with n = len(adata_he) the expression is
# int((0.5 * n ** 0.5) // 2), i.e. floor(sqrt(n) / 4).
sc.pp.neighbors(adata_he, random_state=seed, n_neighbors=int(0.5 * len(adata_he) ** 0.5 // 2), metric='cosine')
tk.tl.triku(adata_he)

In [None]:
sc.tl.umap(adata_he, min_dist=0.4, random_state=seed)
sc.tl.leiden(adata_he, resolution=3, random_state=seed)

In [None]:
sc.pl.umap(adata_he, color=['leiden', 'LUM', 'PDGFRA', 'COL1A1', 'DCN'], legend_loc='on data', cmap=magma, use_raw=False, ncols=2)

In [None]:
assign_cats(adata_he, dict_cats=dict_cats_fb, min_score=0.3, quantile_gene_sel=0.2)

In [None]:
sc.pl.umap(adata_he, color=['leiden', 'assigned_cats', 'CDH19', 'ANGPTL7', 'PLEKHB1', 'ENTPD2', 'C2orf40', 
                           'SLC2A1', 'CLDN1', 'TNNT2', 'C19orf33', 'SFRP5'], legend_loc='on data', cmap=magma, use_raw=False, ncols=2)

In [None]:
sc.pl.umap(adata_he, color=['leiden', 'assigned_cats'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# Subset of cells labelled 'fibro', 'neuro' or 'muscle', copied so that it is an independent
# object (not a view of `adata_he`) before it is filtered and re-embedded in place.
adata_he_fb = adata_he[adata_he.obs['assigned_cats'].isin(['fibro', 'neuro', 'muscle'])].copy()

In [None]:
sc.pp.filter_genes(adata_he_fb, min_counts=1)

In [None]:
sc.pp.pca(adata_he_fb, random_state=seed, n_comps=50)
sc.pp.neighbors(adata_he_fb, random_state=seed, n_neighbors=int(0.5 * len(adata_he_fb) ** 0.5 // 2), metric='cosine')
tk.tl.triku(adata_he_fb)

In [None]:
sc.tl.umap(adata_he_fb, min_dist=0.2, random_state=seed)
sc.tl.leiden(adata_he_fb, resolution=15, random_state=seed)

In [None]:
assign_cats(adata_he_fb, dict_cats=dict_cats_clusters, min_score=0.4, quantile_gene_sel=0.5, key_added='cluster')
assign_cats(adata_he_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.35, key_added='axis', 
            quantile_gene_sel=0.4, intermediate_states=True, diff=0.15)

In [None]:
adata_he_fb.uns['cluster_colors'] = [dict_colors[i] if i in dict_colors else '#bcbcbc' for 
                                      i in sorted(set(adata_he_fb.obs['cluster']))]

In [None]:
sc.pl.umap(adata_he_fb, color=['leiden', 'axis', 'cluster'], legend_loc='on data', 
           cmap=magma, use_raw=False, )

In [None]:
sc.pl.umap(adata_he_fb, color=['DMKN', 'KRT5', 'KRT14'], legend_loc='on data', 
           cmap=magma, use_raw=False, )

In [None]:
# Discard the cells assigned to 'Glial', 'unassigned' or 'endo'. The result is copied so
# that the in-place steps that follow work on real data instead of a view.
adata_he_fb = adata_he_fb[~ adata_he_fb.obs['cluster'].isin(['Glial', 'unassigned', 'endo'])].copy()

In [None]:
sc.pp.filter_genes(adata_he_fb, min_counts=1)

In [None]:
sc.pp.pca(adata_he_fb, random_state=seed, n_comps=50)
sc.pp.neighbors(adata_he_fb, random_state=seed, n_neighbors=int(0.5 * len(adata_he_fb) ** 0.5 // 2), metric='cosine')
tk.tl.triku(adata_he_fb)

In [None]:
sc.tl.umap(adata_he_fb, min_dist=0.2, random_state=seed)
sc.tl.leiden(adata_he_fb, resolution=7, random_state=seed)

In [None]:
assign_cats(adata_he_fb, dict_cats=dict_cats_clusters, min_score=0.4, quantile_gene_sel=0.7, key_added='cluster')
assign_cats(adata_he_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.35, key_added='axis', 
            quantile_gene_sel=0.4, intermediate_states=True, diff=0.15)

In [None]:
adata_he_fb.uns['cluster_colors'] = [dict_colors[i] if i in dict_colors else '#bcbcbc' for 
                                      i in sorted(set(adata_he_fb.obs['cluster']))]

In [None]:
sc.pl.umap(adata_he_fb, color=['leiden', 'axis', 'cluster'], legend_loc='on data', 
           cmap=magma, use_raw=False, )

In [None]:
# Remove the 'C3' column from `obs`. `errors='ignore'` keeps the cell re-runnable, whereas
# `del` raises KeyError once the column is gone.
adata_he_fb.obs = adata_he_fb.obs.drop(columns=['C3'], errors='ignore')

In [None]:
# Differential expression for Leiden clusters '1', '2' and '3', plotting the top 100 genes.
# NOTE(review): no `method` is passed here, unlike the other `rank_genes_groups` calls in
# this notebook, which use method='wilcoxon' — confirm this is intended.
# NOTE(review): the cluster labels are hard-coded and depend on the Leiden partition
# computed earlier.
sc.tl.rank_genes_groups(adata_he_fb, groupby='leiden', groups=['1', '2', '3'])
sc.pl.rank_genes_groups_tracksplot(adata_he_fb, dendrogram=False, n_genes=100)

In [None]:
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(adata_he_fb, color=['cluster'] + [i for i in val if i in adata_he_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
plot_score_graph(adata_he_fb)

### Hughes et al. 2020

In [None]:
hughes_dir = data_dir + '/hughes_2020'

In [None]:
adata_hughes = sc.read(hughes_dir + '/hughes_2020.h5ad')
# Keep donors 'Normal', 'Normal2' and 'Normal3' only. The subset is copied because a new
# `obs` column is written right after: assigning to a view would make anndata copy it
# implicitly (with a warning).
adata_hughes = adata_hughes[adata_hughes.obs['donor_id'].isin(['Normal', 'Normal2', 'Normal3'])].copy()
# The donor is used as the batch key for this dataset.
adata_hughes.obs['batch'] = adata_hughes.obs['donor_id']

In [None]:
adata_hughes

In [None]:
sc.pp.filter_genes(adata_hughes, min_counts=1)

In [None]:
# Basic QC filtering
adata_hughes.var['mt'] = adata_hughes.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata_hughes, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(adata_hughes, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(adata_hughes, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata_hughes, x='total_counts', y='n_genes_by_counts')

In [None]:
# Keep cells with 200 < n_genes_by_counts < 3000 ...
adata_hughes = adata_hughes[((adata_hughes.obs.n_genes_by_counts < 3000) & 
                                    (adata_hughes.obs.n_genes_by_counts > 200)).values, :]
# ... and less than 15% of mitochondrial counts. The result is copied so that the in-place
# gene filtering and normalization that follow work on real data instead of a view.
adata_hughes = adata_hughes[adata_hughes.obs.pct_counts_mt < 15, :].copy()

In [None]:
# Overlay the distribution of detected genes per cell of every batch. Each curve is
# labelled with its batch and a legend is drawn, so the overlaid curves can be told apart.
# NOTE(review): `sns.distplot` is deprecated in recent seaborn releases; consider
# `sns.histplot` / `sns.kdeplot` when the environment is updated.
batches = sorted(set(adata_hughes.obs['batch'].values))
for batch in batches:
    counts = adata_hughes.obs['n_genes_by_counts'].loc[adata_hughes.obs['batch'] == batch].values
    sns.distplot(counts, label=str(batch))
plt.legend(title='batch')

In [None]:
sc.pp.filter_genes(adata_hughes, min_counts=1)
sc.pp.normalize_total(adata_hughes)
sc.pp.log1p(adata_hughes)

In [None]:
sc.pp.pca(adata_hughes, random_state=seed, n_comps=30)
sce.pp.bbknn(adata_hughes, metric='angular', batch_key='batch', neighbors_within_batch=3)
tk.tl.triku(adata_hughes)

In [None]:
sc.tl.umap(adata_hughes, min_dist=0.1, random_state=seed)
sc.tl.leiden(adata_hughes, resolution=0.7, random_state=seed)

In [None]:
sc.pl.umap(adata_hughes, color=['leiden', 'batch'], legend_loc='on data')

In [None]:
sc.pl.umap(adata_hughes, color=['leiden', 'LUM', 'PDGFRA', 'COL1A1', 'DCN'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
assign_cats(adata_hughes, dict_cats=dict_cats_fb, min_score=0.3)

In [None]:
sc.pl.umap(adata_hughes, color=['leiden', 'assigned_cats'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# Fibroblast subset, copied so that it is an independent object (not a view of
# `adata_hughes`) before it is filtered and re-embedded in place.
adata_hughes_fb = adata_hughes[adata_hughes.obs['assigned_cats'] == 'fibro'].copy()

In [None]:
sc.pp.filter_genes(adata_hughes_fb, min_counts=1)

In [None]:
sc.pp.pca(adata_hughes_fb, random_state=seed, n_comps=50)
sce.pp.bbknn(adata_hughes_fb, metric='angular', batch_key='batch', neighbors_within_batch=3)
tk.tl.triku(adata_hughes_fb)

In [None]:
sc.tl.umap(adata_hughes_fb, min_dist=0.1, random_state=seed)
sc.tl.leiden(adata_hughes_fb, resolution=9, random_state=seed)

In [None]:
assign_cats(adata_hughes_fb, dict_cats=dict_cats_clusters, min_score=0.35, quantile_gene_sel=0.3, key_added='cluster')
assign_cats(adata_hughes_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, key_added='axis', 
             intermediate_states=True, diff=0.15)

In [None]:
adata_hughes_fb.uns['cluster_colors'] = [dict_colors[i] if i in dict_colors else '#bcbcbc' for 
                                      i in sorted(set(adata_hughes_fb.obs['cluster']))]

In [None]:
sc.pl.umap(adata_hughes_fb, color=['leiden', 'axis', 'batch', 'cluster'], legend_loc='on data', 
           cmap=magma, use_raw=False, ncols=2)

In [None]:
sc.tl.rank_genes_groups(adata_hughes_fb, groupby='cluster', groups=['C3'], method='wilcoxon')
sc.pl.rank_genes_groups_tracksplot(adata_hughes_fb, dendrogram=False, n_genes=100)

In [None]:
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(adata_hughes_fb, color=['cluster'] + [i for i in val if i in adata_hughes_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
plot_score_graph(adata_hughes_fb)

### Kim et al. 2020

In [None]:
kim_dir = data_dir + '/Kim_2020'

In [None]:
adata_kim = sc.read(kim_dir + '/Kim_2020.h5ad')
adata_kim.var_names_make_unique()

In [None]:
# Rename gene symbols following `dict_rep` (CCN5 -> WISP2, ECRG4 -> C2orf40)
adata_kim.var_names = [dict_rep.get(gene, gene) for gene in adata_kim.var_names]

In [None]:
sc.pp.filter_genes(adata_kim, min_counts=1)

In [None]:
# Densify the expression matrix. The sparsity check makes the cell safe to re-run (a dense
# array has no `.todense()`), and `.toarray()` returns an ndarray directly instead of going
# through an intermediate `np.matrix`.
if spr.issparse(adata_kim.X):
    adata_kim.X = adata_kim.X.toarray()

In [None]:
# Basic QC filtering
adata_kim.var['mt'] = adata_kim.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata_kim, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(adata_kim, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(adata_kim, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata_kim, x='total_counts', y='n_genes_by_counts')

In [None]:
# Keep cells with 500 < n_genes_by_counts < 4000 ...
adata_kim = adata_kim[((adata_kim.obs.n_genes_by_counts < 4000) & 
                                    (adata_kim.obs.n_genes_by_counts > 500)).values, :]
# ... and less than 25% of mitochondrial counts. The result is copied so that the in-place
# gene filtering and normalization that follow work on real data instead of a view.
adata_kim = adata_kim[adata_kim.obs.pct_counts_mt < 25, :].copy()

In [None]:
adata_kim

In [None]:
# Overlay the distribution of detected genes per cell of every batch. Each curve is
# labelled with its batch and a legend is drawn, so the overlaid curves can be told apart.
# NOTE(review): `sns.distplot` is deprecated in recent seaborn releases; consider
# `sns.histplot` / `sns.kdeplot` when the environment is updated.
batches = sorted(set(adata_kim.obs['batch'].values))
for batch in batches:
    counts = adata_kim.obs['n_genes_by_counts'].loc[adata_kim.obs['batch'] == batch].values
    sns.distplot(counts, label=str(batch))
plt.legend(title='batch')

In [None]:
sc.pp.filter_genes(adata_kim, min_counts=1)
sc.pp.normalize_total(adata_kim)
sc.pp.log1p(adata_kim)

In [None]:
adata_kim

In [None]:
sc.pp.pca(adata_kim, random_state=seed, n_comps=30)
sce.pp.bbknn(adata_kim, metric='angular', batch_key='batch', neighbors_within_batch=3)
tk.tl.triku(adata_kim)

In [None]:
sc.tl.umap(adata_kim, min_dist=0.1, random_state=seed)
sc.tl.leiden(adata_kim, resolution=0.7, random_state=seed)

In [None]:
sc.pl.umap(adata_kim, color=['leiden', 'batch'], legend_loc='on data')

In [None]:
sc.pl.umap(adata_kim, color=['leiden', 'LUM', 'PDGFRA', 'COL1A1', 'DCN'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
assign_cats(adata_kim, dict_cats=dict_cats_fb)

In [None]:
sc.pl.umap(adata_kim, color=['leiden', 'assigned_cats'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# Fibroblast subset, copied so that it is an independent object rather than a view of
# `adata_kim`.
adata_kim_fb = adata_kim[adata_kim.obs['assigned_cats'] == 'fibro'].copy()

In [None]:
np.unique(adata_kim_fb.obs['batch'].values, return_counts=True)

In [None]:
# Keep batches '0' to '4' only. The result is copied so that the in-place gene filtering
# and embedding steps that follow work on real data instead of a view.
adata_kim_fb = adata_kim_fb[adata_kim_fb.obs['batch'].isin(['0', '1', '2', '3', '4'])].copy()

In [None]:
sc.pp.filter_genes(adata_kim_fb, min_counts=1)

In [None]:
sc.pp.pca(adata_kim_fb, random_state=seed, n_comps=50)
sce.pp.bbknn(adata_kim_fb, metric='angular', batch_key='batch', neighbors_within_batch=3)
tk.tl.triku(adata_kim_fb)

In [None]:
sc.tl.umap(adata_kim_fb, min_dist=0.4, random_state=seed)
sc.tl.leiden(adata_kim_fb, resolution=9, random_state=seed)

In [None]:
assign_cats(adata_kim_fb, dict_cats=dict_cats_clusters, min_score=0.35, quantile_gene_sel=0.3, key_added='cluster')
assign_cats(adata_kim_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.6, key_added='axis', 
             intermediate_states=True, diff=0.15)

In [None]:
adata_kim_fb.uns['cluster_colors'] = [dict_colors[i] if i in dict_colors else '#bcbcbc' for 
                                      i in sorted(set(adata_kim_fb.obs['cluster']))]

In [None]:
sc.pl.umap(adata_kim_fb, color=['leiden', 'axis', 'cluster'], legend_loc='on data', 
           cmap=magma, use_raw=False, )

In [None]:
sc.tl.rank_genes_groups(adata_kim_fb, groupby='cluster', groups=['unassigned'], method='wilcoxon')
sc.pl.rank_genes_groups_tracksplot(adata_kim_fb, dendrogram=False, n_genes=100)

In [None]:
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(adata_kim_fb, color=['cluster'] + [i for i in val if i in adata_kim_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
plot_score_graph(adata_kim_fb)

### Kim et al. 2021 [NAIL, EXPECTING DIFFERENT RESULTS]

In [None]:
# NOTE(review): from here on `kim_dir`, `adata_kim` and `adata_kim_fb` are reassigned, so the
# Kim et al. 2020 objects created in the previous section are no longer available.
kim_dir = data_dir + '/kim_2021'

In [None]:
adata_kim = sc.read(kim_dir + '/kim_2021.h5ad')
adata_kim.var_names_make_unique()

In [None]:
# Rename gene symbols following `dict_rep` (CCN5 -> WISP2, ECRG4 -> C2orf40)
adata_kim.var_names = [dict_rep.get(gene, gene) for gene in adata_kim.var_names]

In [None]:
sc.pp.filter_genes(adata_kim, min_counts=1)

In [None]:
# Densify the expression matrix. The sparsity check makes the cell safe to re-run (a dense
# array has no `.todense()`), and `.toarray()` returns an ndarray directly instead of going
# through an intermediate `np.matrix`.
if spr.issparse(adata_kim.X):
    adata_kim.X = adata_kim.X.toarray()

In [None]:
# Basic QC filtering
adata_kim.var['mt'] = adata_kim.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata_kim, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(adata_kim, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(adata_kim, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata_kim, x='total_counts', y='n_genes_by_counts')

In [None]:
# Keep cells with 1000 < n_genes_by_counts < 6000 ...
adata_kim = adata_kim[((adata_kim.obs.n_genes_by_counts < 6000) & 
                                    (adata_kim.obs.n_genes_by_counts > 1000)).values, :]
# ... and less than 40% of mitochondrial counts. The result is copied so that the in-place
# gene filtering and normalization that follow work on real data instead of a view.
adata_kim = adata_kim[adata_kim.obs.pct_counts_mt < 40, :].copy()

In [None]:
adata_kim

In [None]:
# Overlay the distribution of detected genes per cell of every batch. Each curve is
# labelled with its batch and a legend is drawn, so the overlaid curves can be told apart.
# NOTE(review): `sns.distplot` is deprecated in recent seaborn releases; consider
# `sns.histplot` / `sns.kdeplot` when the environment is updated.
batches = sorted(set(adata_kim.obs['batch'].values))
for batch in batches:
    counts = adata_kim.obs['n_genes_by_counts'].loc[adata_kim.obs['batch'] == batch].values
    sns.distplot(counts, label=str(batch))
plt.legend(title='batch')

In [None]:
sc.pp.filter_genes(adata_kim, min_counts=1)
sc.pp.normalize_total(adata_kim)
sc.pp.log1p(adata_kim)

In [None]:
adata_kim

In [None]:
sc.pp.pca(adata_kim, random_state=seed, n_comps=30)
sce.pp.bbknn(adata_kim, metric='angular', batch_key='batch', neighbors_within_batch=3)
tk.tl.triku(adata_kim)

In [None]:
sc.tl.umap(adata_kim, min_dist=0.1, random_state=seed)
sc.tl.leiden(adata_kim, resolution=0.7, random_state=seed)

In [None]:
# In-place subsample with fraction=1, i.e. no cell is removed.
# NOTE(review): presumably done to randomize the order in which cells are drawn, so that
# no batch is systematically plotted on top — confirm.
# The shared `seed` is used instead of a separate literal, like every other stochastic
# step of the notebook.
sc.pp.subsample(adata_kim, fraction=1, random_state=seed, copy=False)
sc.pl.umap(adata_kim, color=['leiden', 'batch'], legend_loc='on data')

In [None]:
sc.pl.umap(adata_kim, color=['leiden', 'LUM', 'PDGFRA', 'COL1A1', 'DCN'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
assign_cats(adata_kim, dict_cats=dict_cats_fb)

In [None]:
sc.pl.umap(adata_kim, color=['leiden', 'assigned_cats'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
sc.pl.umap(adata_kim, color=['leiden', 'COL18A1', 'APCDD1', 'SLPI'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
adata_kim_fb = adata_kim[adata_kim.obs['assigned_cats'] == 'fibro']

In [None]:
sc.pp.filter_genes(adata_kim_fb, min_counts=1)

In [None]:
# Recompute PCA / batch-balanced neighbours / triku on the fibroblast subset only.
sc.pp.pca(adata_kim_fb, random_state=seed, n_comps=50)
sce.pp.bbknn(adata_kim_fb, metric='angular', batch_key='batch', neighbors_within_batch=4)
tk.tl.triku(adata_kim_fb)

In [None]:
sc.tl.umap(adata_kim_fb, min_dist=0.4, random_state=seed)
sc.tl.leiden(adata_kim_fb, resolution=9, random_state=seed)

In [None]:
# First call writes obs['cluster']; the second call groups by that 'cluster' column
# (column_groupby='cluster') and writes obs['axis'].
assign_cats(adata_kim_fb, dict_cats=dict_cats_clusters, min_score=0.35, quantile_gene_sel=0.3, key_added='cluster')
assign_cats(adata_kim_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, key_added='axis', 
             intermediate_states=True, diff=0.15)

In [None]:
# One colour per cluster label, in sorted label order; labels missing from dict_colors
# fall back to grey.
adata_kim_fb.uns['cluster_colors'] = [dict_colors[i] if i in dict_colors else '#bcbcbc' for 
                                      i in sorted(set(adata_kim_fb.obs['cluster']))]

In [None]:
sc.pl.umap(adata_kim_fb, color=['leiden', 'axis', 'cluster', 'batch'], legend_loc='on data', cmap=magma, use_raw=False, ncols=2)

In [None]:
sc.pl.umap(adata_kim_fb, color=['leiden', 'axis', 'cluster', 'SLPI', 'APCDD1'], legend_loc='on data', cmap=magma, use_raw=False, )

In [None]:
# Marker genes of the cells labelled 'unassigned' in obs['cluster'].
sc.tl.rank_genes_groups(adata_kim_fb, groupby='cluster', groups=['unassigned'], method='wilcoxon')
sc.pl.rank_genes_groups_tracksplot(adata_kim_fb, dendrogram=False, n_genes=100)

In [None]:
# For every cluster in dict_cats_clusters, plot those of its marker genes that are
# present in this dataset.
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(adata_kim_fb, color=['cluster'] + [i for i in val if i in adata_kim_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
plot_score_graph(adata_kim_fb)

### McCarthy et al. 2020

In [None]:
mccarthy_dir = data_dir + '/McCarthy_2020'

In [None]:
# Sample/run annotations (SDRF table); one row per ENA run, indexed by run accession.
df_meta = pd.read_csv(mccarthy_dir + '/E-MTAB-7167.sdrf.txt', sep='\t')
df_meta = df_meta.drop_duplicates('Comment[ENA_RUN]').set_index('Comment[ENA_RUN]')

In [None]:
adata_mccarthy = sc.read_loom(mccarthy_dir + '/mccarthy_2020.loom')

In [None]:
# Basic QC filtering
adata_mccarthy.var['mt'] = adata_mccarthy.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata_mccarthy, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
# QC plots before filtering.
sc.pl.violin(adata_mccarthy, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(adata_mccarthy, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata_mccarthy, x='total_counts', y='n_genes_by_counts')

In [None]:
# QC thresholds: < 17 % mitochondrial counts, < 3,000,000 total counts, > 4000 detected genes.
adata_mccarthy = adata_mccarthy[adata_mccarthy.obs['pct_counts_mt'] < 17]
adata_mccarthy = adata_mccarthy[adata_mccarthy.obs['total_counts'] < 3_000_000]
# .copy() materialises the chained views into a standalone AnnData before later cells
# write to .obs and transform .X in place.
adata_mccarthy = adata_mccarthy[adata_mccarthy.obs['n_genes_by_counts'] > 4000].copy()

In [None]:
# Same QC plots as above, after filtering.
sc.pl.violin(adata_mccarthy, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(adata_mccarthy, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata_mccarthy, x='total_counts', y='n_genes_by_counts')

In [None]:
# Gene sets passed to score_genes_cell_cycle below as s_genes (G1S_genes) and
# g2m_genes (G2M_genes).
G1S_genes = ['MCM5','PCNA','TYMS','FEN1','MCM2','MCM4','RRM1','UNG','GINS2','MCM6','CDCA7','DTL','PRIM1','UHRF1','MLF1IP',
'HELLS','RFC2','RPA2','NASP','RAD51AP1','GMNN','WDR76','SLBP','CCNE2','UBR7','POLD3','MSH2','ATAD2','RAD51',
'RRM2','CDC45','CDC6','EXO1','TIPIN','DSCC1','BLM','CASP8AP2','USP1','CLSPN','POLA1','CHAF1B','BRIP1','E2F8',]

G2M_genes = ['HMGB2','CDK1','NUSAP1','UBE2C','BIRC5','TPX2','TOP2A','NDC80','CKS2','NUF2','CKS1B','MKI67',
'TMPO','CENPF','TACC3','FAM64A','SMC4','CCNB2','CKAP2L','CKAP2','AURKB','BUB1','KIF11','ANP32E','TUBB4B',
'GTSE1','KIF20B','HJURP','CDCA3','HN1','CDC20','TTK','CDC25C','KIF2C','RANGAP1','NCAPD2','DLGAP5','CDCA2',
'CDCA8','ECT2','KIF23','HMMR', 'AURKA','PSRC1','ANLN','LBR','CKAP5','CENPE','CTCF','NEK2','G2E3','GAS2L3','CBX5','CENPA',]


In [None]:
# The 'phase' column produced here is plotted on the UMAP further down.
# NOTE(review): scoring runs before the normalisation / log1p cell below -- confirm
# that scoring on the matrix as loaded is intended.
sc.tl.score_genes_cell_cycle(adata_mccarthy, s_genes=G1S_genes, g2m_genes=G2M_genes)

In [None]:
# Copy the per-run sample annotations from the SDRF table into adata.obs.
# Each obs column X is taken from the SDRF column 'Characteristics[X]', looked up by
# the cell names (ENA run accessions).
run_ids = adata_mccarthy.obs_names.values
for obs_key in ['sex', 'individual', 'age', 'organism part',
                'cell type', 'disease', 'single cell quality']:
    adata_mccarthy.obs[obs_key] = df_meta[f'Characteristics[{obs_key}]'].loc[run_ids]

In [None]:
# Drop genes left without counts after the cell filtering, then normalise and log-transform.
# normalize_total replaces the deprecated normalize_per_cell and matches the call used
# for every other dataset in this notebook; with default arguments both scale each cell
# to the median total count. Unlike normalize_per_cell it does not add obs['n_counts']
# (per-cell totals are already stored in obs['total_counts'] by calculate_qc_metrics).
sc.pp.filter_genes(adata_mccarthy, min_counts=1)
sc.pp.normalize_total(adata_mccarthy)
sc.pp.log1p(adata_mccarthy)

In [None]:
# No batch correction here: plain kNN graph with cosine metric, with
# n_neighbors = floor(sqrt(n_cells) / 2); triku then reuses that graph (use_adata_knn=True).
sc.pp.pca(adata_mccarthy, random_state=seed, n_comps=30)
sc.pp.neighbors(adata_mccarthy, random_state=seed, n_neighbors=int(len(adata_mccarthy) ** 0.5 // 2), metric='cosine')
tk.tl.triku(adata_mccarthy, n_procs=1, random_state=seed, use_adata_knn=True)

In [None]:
# Very low Leiden resolution (0.01) for a coarse partition; groups '0' and '1' of this
# partition are the ones sent to enrichr below.
sc.tl.umap(adata_mccarthy, min_dist=0.3, random_state=seed)
sc.tl.leiden(adata_mccarthy, resolution=0.01, random_state=seed)
sc.pl.umap(adata_mccarthy, color=['leiden', 'n_genes_by_counts', 'pct_counts_mt', 'phase'], 
           legend_loc='on data', ncols=2)

In [None]:
sc.pl.umap(adata_mccarthy, color=['leiden', 'sex', 'individual', 'age', 
                                 'organism part', 'cell type', 'disease', 'single cell quality'], legend_loc='on data', ncols=2)

In [None]:
# Top 350 ranked genes per coarse Leiden group (the tracksplot shows the first 50).
sc.tl.rank_genes_groups(adata_mccarthy, groupby='leiden', n_genes=350)
sc.pl.rank_genes_groups_tracksplot(adata_mccarthy, dendrogram=False, n_genes=50)

In [None]:
# GO Biological Process enrichment of the ranked genes of Leiden groups '0' and '1'.
# NOTE(review): `gp` is only brought in by `import gseapy as gp`, which appears
# commented out in the imports cell at the top of the notebook -- confirm it is
# imported before this cell, otherwise a fresh Restart & Run All raises NameError here.
enr0 = gp.enrichr(gene_list=list(adata_mccarthy.uns['rank_genes_groups']['names']['0']),
                 gene_sets=['GO_Biological_Process_2018'],
                 organism='Human', # don't forget to set organism to the one you desired! e.g. Yeast
                 description='test_name',
                 outdir='test/enrichr_kegg',
                 # no_plot=True,
                 cutoff=0.5 # test dataset, use lower value from range(0,1)
                )

enr1 = gp.enrichr(gene_list=list(adata_mccarthy.uns['rank_genes_groups']['names']['1']),
                 gene_sets=['GO_Biological_Process_2018'],
                 organism='Human', # don't forget to set organism to the one you desired! e.g. Yeast
                 description='test_name',
                 outdir='test/enrichr_kegg',
                 # no_plot=True,
                 cutoff=0.5 # test dataset, use lower value from range(0,1)
                )

In [None]:
# Terms with adjusted p-value below 0.01 for group '0'.
enr0.results[enr0.results['Adjusted P-value'] < 0.01]

In [None]:
# Terms with adjusted p-value below 0.01 for group '1'.
enr1.results[enr1.results['Adjusted P-value'] < 0.01]

In [None]:
sc.tl.leiden(adata_mccarthy, resolution=2, random_state=seed)

In [None]:
assign_cats(adata_mccarthy, dict_cats=dict_cats_axes, min_score=0.4, key_added='axis')
assign_cats(adata_mccarthy, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.5, key_added='cluster')

In [None]:
sc.pl.umap(adata_mccarthy, color=['leiden', 'axis', 'cluster'], legend_loc='on data', 
           cmap=magma, use_raw=False, )

In [None]:
del adata_mccarthy.obs['C3']

In [None]:
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(adata_mccarthy, color=['cluster'] + [i for i in val if i in adata_mccarthy.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
plot_score_graph(adata_mccarthy)

### Mirizio et al. 2020

In [None]:
mirizio_dir = data_dir + '/mirizio_2020'

In [None]:
adata_mirizio = sc.read(mirizio_dir + '/Mirizio_2020.h5ad')
# Disambiguate duplicated gene names.
adata_mirizio.var_names_make_unique()

In [None]:
# Basic QC filtering
adata_mirizio.var['mt'] = adata_mirizio.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata_mirizio, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
# QC plots used to choose the thresholds applied in the next cell.
sc.pl.violin(adata_mirizio, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(adata_mirizio, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata_mirizio, x='total_counts', y='n_genes_by_counts')

In [None]:
# Keep cells with 250 < n_genes_by_counts < 4000 and less than 40 % mitochondrial counts.
adata_mirizio = adata_mirizio[((adata_mirizio.obs.n_genes_by_counts < 4000) &
                               (adata_mirizio.obs.n_genes_by_counts > 250)).values, :]
# .copy() turns the view-of-a-view into a standalone AnnData before the in-place
# preprocessing that follows.
adata_mirizio = adata_mirizio[adata_mirizio.obs.pct_counts_mt < 40, :].copy()

In [None]:
# Remove genes left without counts after the cell filtering, then normalise per cell
# and log-transform.
sc.pp.filter_genes(adata_mirizio, min_counts=1)
sc.pp.normalize_total(adata_mirizio)
sc.pp.log1p(adata_mirizio)

In [None]:
# PCA (30 components) -> batch-balanced neighbour graph over obs['batch'] -> triku.
sc.pp.pca(adata_mirizio, random_state=seed, n_comps=30)
sce.pp.bbknn(adata_mirizio, metric='angular', batch_key='batch', neighbors_within_batch=4)
tk.tl.triku(adata_mirizio)

In [None]:
sc.tl.umap(adata_mirizio, min_dist=0.1, random_state=seed)
sc.tl.leiden(adata_mirizio, resolution=7, random_state=seed)

In [None]:
sc.pl.umap(adata_mirizio, color=['leiden', 'batch'], legend_loc='on data')

In [None]:
sc.pl.umap(adata_mirizio, color=['leiden', 'LUM', 'PDGFRA', 'COL1A1', 'DCN'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# No key_added is given here; the result is read back from obs['assigned_cats'] below.
assign_cats(adata_mirizio, dict_cats=dict_cats_fb, min_score=0.4)

In [None]:
sc.pl.umap(adata_mirizio, color=['leiden', 'assigned_cats'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
adata_mirizio_fb = adata_mirizio[adata_mirizio.obs['assigned_cats'] == 'fibro']

In [None]:
sc.pp.filter_genes(adata_mirizio_fb, min_counts=1)

In [None]:
# Recompute PCA / batch-balanced neighbours / triku on the fibroblast subset only.
sc.pp.pca(adata_mirizio_fb, random_state=seed, n_comps=50)
sce.pp.bbknn(adata_mirizio_fb, metric='angular', batch_key='batch', neighbors_within_batch=3)
tk.tl.triku(adata_mirizio_fb)

In [None]:
sc.tl.umap(adata_mirizio_fb, min_dist=0.1, random_state=seed)
sc.tl.leiden(adata_mirizio_fb, resolution=7, random_state=seed)

In [None]:
# First call writes obs['cluster']; the second call groups by that 'cluster' column
# (column_groupby='cluster') and writes obs['axis'].
assign_cats(adata_mirizio_fb, dict_cats=dict_cats_clusters, min_score=0.5, quantile_gene_sel=0.45, key_added='cluster')
assign_cats(adata_mirizio_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, quantile_gene_sel=0.75,
            key_added='axis', intermediate_states=True, diff=0.15)

In [None]:
# One colour per cluster label, in sorted label order; labels missing from dict_colors
# fall back to grey.
adata_mirizio_fb.uns['cluster_colors'] = [dict_colors[i] if i in dict_colors else '#bcbcbc' for 
                                      i in sorted(set(adata_mirizio_fb.obs['cluster']))]

In [None]:
sc.pl.umap(adata_mirizio_fb, color=['leiden', 'batch', 'axis', 'cluster'], legend_loc='on data', cmap=magma, use_raw=False, )

In [None]:
sc.pl.umap(adata_mirizio_fb, color=['cluster'], cmap=magma, use_raw=False, )

In [None]:
# UNASSIGNED cells may refer to stress
sc.tl.rank_genes_groups(adata_mirizio_fb, groupby='cluster', groups=['unassigned'])
sc.pl.rank_genes_groups_tracksplot(adata_mirizio_fb, dendrogram=False, n_genes=150)

In [None]:
del adata_mirizio_fb.obs['C3']

In [None]:
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(adata_mirizio_fb, color=['cluster'] + [i for i in val if i in adata_mirizio_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
plot_score_graph(adata_mirizio_fb)

### Reynolds et al. 2021

In [None]:
reynolds_dir = data_dir + '/reynolds_2021'

In [None]:
# Read the h5ad file; backup_url points to the Zenodo copy of the same file.
adata_reynolds_healthy_fb = sc.read(reynolds_dir + '/adata_reynolds_healthy_fb.h5ad', 
                                    backup_url='https://zenodo.org/record/4708700/files/adata_reynolds_healthy_fb.h5ad?download=1')

In [None]:
# The file already carries a UMAP and these obs columns; nothing has been computed on it yet.
sc.pl.umap(adata_reynolds_healthy_fb, color=['hypoxia_stress', 'sample_id', 'full_clustering'], legend_loc='on data')

In [None]:
# Keep only cells whose hypoxia_stress label is 'Normal'. .copy() makes the subset an
# independent AnnData so the in-place preprocessing below does not operate on a view.
adata_reynolds_healthy_fb = adata_reynolds_healthy_fb[adata_reynolds_healthy_fb.obs['hypoxia_stress'] == 'Normal'].copy()

In [None]:
adata_reynolds_healthy_fb

In [None]:
# Unlike the other datasets, no normalize_total/log1p and no triku are run here, and
# highly_variable_genes is used instead -- presumably the file is already normalised; confirm.
sc.pp.filter_genes(adata_reynolds_healthy_fb, min_counts=1)
sc.pp.highly_variable_genes(adata_reynolds_healthy_fb)
sc.pp.pca(adata_reynolds_healthy_fb, random_state=seed, n_comps=30)
sce.pp.bbknn(adata_reynolds_healthy_fb, metric='angular', batch_key='sample_id', neighbors_within_batch=2, )
sc.tl.umap(adata_reynolds_healthy_fb, min_dist=0.05, random_state=seed)

In [None]:
# Axis and cluster labels are written to obs['axis'] and obs['cluster'].
assign_cats(adata_reynolds_healthy_fb, dict_cats=dict_cats_axes, min_score=0.4, key_added='axis')
assign_cats(adata_reynolds_healthy_fb, dict_cats=dict_cats_clusters, min_score=0.4, quantile_gene_sel=0.4, key_added='cluster')

In [None]:
# One colour per cluster label, in sorted label order; labels missing from dict_colors
# fall back to grey.
adata_reynolds_healthy_fb.uns['cluster_colors'] = [dict_colors[i] if i in dict_colors else '#bcbcbc' for 
                                      i in sorted(set(adata_reynolds_healthy_fb.obs['cluster']))]

In [None]:
# NOTE(review): sc.tl.leiden is not run on this dataset in this section, so
# obs['leiden'] presumably comes with the h5ad file -- confirm.
sc.pl.umap(adata_reynolds_healthy_fb, color=['leiden', 'axis', 'cluster'], legend_loc='on data', cmap=magma, use_raw=False, palette='Dark2')

In [None]:
sc.pl.umap(adata_reynolds_healthy_fb, color=['COL18A1', 'CLEC2A', 'COL13A1', 'COL6A5', 'NPTX2', 'HSPB3', 'COMP', 'APCDD1', 'NKD2', 'AKAP6'], 
           legend_loc='on data', cmap=magma, use_raw=False, palette='Dark2')

In [None]:
# Marker genes for leiden groups '4' and '21' only.
sc.tl.rank_genes_groups(adata_reynolds_healthy_fb, groupby='leiden', groups=['4', '21'])
sc.pl.rank_genes_groups_tracksplot(adata_reynolds_healthy_fb, dendrogram=False, use_raw=False, n_genes=50)

In [None]:
plot_score_graph(adata_reynolds_healthy_fb)

### Solé-Boldo et al. 2020

In [None]:
sole_dir = data_dir + '/Sole-Boldo_2020'

In [None]:
adata_sole_young = sc.read_loom(sole_dir + '/SB2020.loom')
# Disambiguate duplicated gene names.
adata_sole_young.var_names_make_unique()

In [None]:
# Rename genes through dict_rep; names without an entry are kept as they are.
adata_sole_young.var_names = [dict_rep[i] if i in dict_rep else i for i in adata_sole_young.var_names ]

In [None]:
# Replace X by a dense ndarray (X must expose .todense() at this point).
adata_sole_young.X = np.array(adata_sole_young.X.todense())

In [None]:
# Drop genes that have no counts in any cell.
sc.pp.filter_genes(adata_sole_young, min_counts=1)

In [None]:
# Basic QC filtering
adata_sole_young.var['mt'] = adata_sole_young.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata_sole_young, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
# QC plots used to choose the thresholds applied in the next cell.
sc.pl.violin(adata_sole_young, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(adata_sole_young, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata_sole_young, x='total_counts', y='n_genes_by_counts')

In [None]:
# Keep cells with 200 < n_genes_by_counts < 2500 and less than 15 % mitochondrial counts.
adata_sole_young = adata_sole_young[((adata_sole_young.obs.n_genes_by_counts < 2500) &
                                     (adata_sole_young.obs.n_genes_by_counts > 200)).values, :]
# .copy() turns the view-of-a-view into a standalone AnnData before the in-place
# preprocessing that follows.
adata_sole_young = adata_sole_young[adata_sole_young.obs.pct_counts_mt < 15, :].copy()

In [None]:
# Remove genes left without counts after the cell filtering, then normalise per cell
# and log-transform.
sc.pp.filter_genes(adata_sole_young, min_counts=1)
sc.pp.normalize_total(adata_sole_young)
sc.pp.log1p(adata_sole_young)

In [None]:
# No batch key is used for this dataset: plain cosine kNN graph, with
# n_neighbors = int(0.5 * sqrt(n_cells) // 2), followed by triku.
sc.pp.pca(adata_sole_young, random_state=seed, n_comps=50)
sc.pp.neighbors(adata_sole_young, random_state=seed, n_neighbors=int(0.5 * len(adata_sole_young) ** 0.5 // 2), metric='cosine')
tk.tl.triku(adata_sole_young)

In [None]:
sc.tl.umap(adata_sole_young, min_dist=0.3, random_state=seed)
sc.tl.leiden(adata_sole_young, resolution=6.5, random_state=seed)

In [None]:
sc.pl.umap(adata_sole_young, color=['leiden', 'LUM', 'PDGFRA', 'COL1A1', 'DCN'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
sc.pl.umap(adata_sole_young, color=['leiden', 'S100B', 'MPZ', 'PLP1', 'MLANA', 'PMEL'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# No key_added is given here; the result is read back from obs['assigned_cats'] below.
assign_cats(adata_sole_young, dict_cats=dict_cats_fb, min_score=0.3, quantile_gene_sel=0.2)

In [None]:
sc.pl.umap(adata_sole_young, color=['leiden', 'assigned_cats'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
sc.pl.umap(adata_sole_young, color=['SAT1', 'RBFOX3', 'SELE'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
adata_sole_young_fb = adata_sole_young[adata_sole_young.obs['assigned_cats'] == 'fibro']

In [None]:
sc.pp.filter_genes(adata_sole_young_fb, min_counts=1)

In [None]:
# Recompute PCA / cosine kNN graph (n_neighbors = int(sqrt(n_cells))) / triku on the
# fibroblast subset only.
sc.pp.pca(adata_sole_young_fb, random_state=seed, n_comps=50)
sc.pp.neighbors(adata_sole_young_fb, random_state=seed, n_neighbors=int(len(adata_sole_young_fb) ** 0.5), metric='cosine')
tk.tl.triku(adata_sole_young_fb)

In [None]:
sc.tl.umap(adata_sole_young_fb, min_dist=0.2, random_state=seed)
sc.tl.leiden(adata_sole_young_fb, resolution=5, random_state=seed)

In [None]:
# First call writes obs['cluster']; the second call groups by that 'cluster' column
# (column_groupby='cluster') and writes obs['axis'].
assign_cats(adata_sole_young_fb, dict_cats=dict_cats_clusters, min_score=0.4, quantile_gene_sel=0.7, key_added='cluster')
assign_cats(adata_sole_young_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, key_added='axis', 
            intermediate_states=True, diff=0.15,)

In [None]:
# One colour per cluster label, in sorted label order; labels missing from dict_colors
# fall back to grey.
adata_sole_young_fb.uns['cluster_colors'] = [dict_colors[i] if i in dict_colors else '#bcbcbc' for 
                                      i in sorted(set(adata_sole_young_fb.obs['cluster']))]

In [None]:
sc.pl.umap(adata_sole_young_fb, color=['leiden', 'axis', 'cluster'], legend_loc='on data', 
           cmap=magma, use_raw=False, )

In [None]:
# For every cluster in dict_cats_clusters, plot those of its marker genes that are
# present in this dataset.
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(adata_sole_young_fb, color=['cluster'] + [i for i in val if i in adata_sole_young_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
plot_score_graph(adata_sole_young_fb)

### Tabib et al. 2018

In [None]:
tabib_dir = data_dir + '/Tabib_2018'

In [None]:
# The CSV is read and then transposed so that rows become cells (obs) -- presumably the
# file is stored genes x cells; confirm.
adata_tabib = sc.read_csv(tabib_dir + '/Skin_6Control_rawUMI.csv')
adata_tabib = adata_tabib.transpose()

In [None]:
# Per-cell metadata; its first column is used as the index.
df_metadata_tabib = pd.read_csv(tabib_dir + '/Skin_6Control_Metadata.csv', index_col=0)

The metadata table has 8366 cells, although the paper states that 8522 cells were analyzed. The remaining cells are erythrocytes, which were filtered out of the analysis.

In [None]:
# Keep a snapshot of the unprocessed matrix in .raw before any filtering or normalisation.
adata_tabib.raw = adata_tabib

In [None]:
# Cell type -> list of 'res.0.6' cluster ids (as strings) from the Tabib metadata.
# Cells without a metadata entry get NaN as cluster id and are labelled as erythrocytes.
# np.nan is used instead of np.NaN: the capitalised alias was removed in NumPy 2.0.
dict_reverse_mappings = {'Fibroblast': ['0', '3', '4'],
                 'Keratinocyte': ['1', '5', '7', '11', '14',],
                 'Endothelial cell': ['2'],
                 'Pericyte': ['6', '10'],
                 'Macrophage/DC': ['8'],
                 'Lymphocyte': ['9'],
                 'Secretory Epith': ['12'],
                 'Smooth Muscle': ['13'],
                 'Melanocyte': ['15'],
                 'Neural Cell': ['16'],
                 'Cornified Env': ['17'],
                 'B cell': ['18'],
                 'Erithrocyte': [np.nan]}  # This is ours!

# Inverted lookup: cluster id -> cell type.
dict_mappings = {cluster_id: cell_type
                 for cell_type, cluster_ids in dict_reverse_mappings.items()
                 for cluster_id in cluster_ids}

In [None]:
# Attach the published cluster id to every cell; the assignment aligns on cell names,
# so cells missing from the metadata table end up with NaN.
adata_tabib.obs['res.0.6'] = df_metadata_tabib['res.0.6'].astype(str)
# Translate cluster ids to cell types. Missing ids are detected with pd.isna and looked
# up through the np.nan key explicitly, rather than relying on the NaN object stored in
# the column being the very same object as the dictionary key.
adata_tabib.obs['cluster'] = [dict_mappings[np.nan] if pd.isna(i) else dict_mappings[i]
                              for i in adata_tabib.obs['res.0.6']]

Since we are interested in fibroblasts, we are going to keep only their specific populations.

In [None]:
# Independent copy of the cells labelled 'Fibroblast'.
adata_tabib_fb = adata_tabib[adata_tabib.obs['cluster'].isin(['Fibroblast']), :].copy()
# Drop genes that have no counts within the fibroblast subset.
sc.pp.filter_genes(adata_tabib_fb, min_counts=1)
# Store X as CSR. csr_matrix is taken from the public scipy.sparse namespace:
# the scipy.sparse.csr submodule is a deprecated private path.
adata_tabib_fb.X = spr.csr_matrix(adata_tabib_fb.X).copy()
# Snapshot of the filtered, still unnormalised matrix.
adata_tabib_fb.raw = adata_tabib_fb

In [None]:
# Normalise per cell and log-transform.
sc.pp.normalize_total(adata_tabib_fb)
sc.pp.log1p(adata_tabib_fb)

In [None]:
# Cosine kNN graph with n_neighbors = int(0.5 * sqrt(n_cells)); triku is disabled
# for this dataset (call left commented out).
sc.pp.pca(adata_tabib_fb, random_state=seed, n_comps=50)
sc.pp.neighbors(adata_tabib_fb, random_state=seed, n_neighbors=int(0.5 * len(adata_tabib_fb) ** 0.5), metric='cosine')
# tk.tl.triku(adata_tabib_fb)

In [None]:
sc.tl.umap(adata_tabib_fb, min_dist=0.4, random_state=seed)
sc.tl.leiden(adata_tabib_fb, resolution=15, random_state=seed)

In [None]:
# First call writes obs['cluster'] (replacing the cell-type labels stored under the
# same key earlier); the second call groups by that column and writes obs['axis'].
assign_cats(adata_tabib_fb, dict_cats=dict_cats_clusters, min_score=0.25, quantile_gene_sel=0.7, key_added='cluster')
assign_cats(adata_tabib_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.45, key_added='axis', 
            intermediate_states=True, diff=0.15,)

In [None]:
# One colour per cluster label, in sorted label order; labels missing from dict_colors
# fall back to grey.
adata_tabib_fb.uns['cluster_colors'] = [dict_colors[i] if i in dict_colors else '#bcbcbc' for 
                                      i in sorted(set(adata_tabib_fb.obs['cluster']))]

In [None]:
sc.pl.umap(adata_tabib_fb, color=['leiden', 'axis', 'cluster'], legend_loc='on data', 
           cmap=magma, use_raw=False, )

In [None]:
# For every cluster in dict_cats_clusters, plot those of its marker genes that are
# present in this dataset.
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(adata_tabib_fb, color=['cluster'] + [i for i in val if i in adata_tabib_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
plot_score_graph(adata_tabib_fb)

### Vorstandlechner et al. 2020

In [None]:
vors_dir = data_dir + '/Vorstandlechner_2020'

In [None]:
# cache=True asks scanpy to reuse a cached copy of the file on later reads.
adata_vors = sc.read(vors_dir + '/skin_vorstandlechner.loom', cache=True)

In [None]:
# Drop genes that have no counts in any cell.
sc.pp.filter_genes(adata_vors, min_counts=1)

In [None]:
# Basic QC filtering
adata_vors.var['mt'] = adata_vors.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata_vors, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
# QC plots used to choose the thresholds applied in the next cell.
sc.pl.violin(adata_vors, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(adata_vors, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata_vors, x='total_counts', y='n_genes_by_counts')

In [None]:
# Keep cells with 250 < n_genes_by_counts < 1700 and less than 15 % mitochondrial counts.
adata_vors = adata_vors[((adata_vors.obs.n_genes_by_counts < 1700) &
                         (adata_vors.obs.n_genes_by_counts > 250)).values, :]
# .copy() turns the view-of-a-view into a standalone AnnData before the in-place
# preprocessing that follows.
adata_vors = adata_vors[adata_vors.obs.pct_counts_mt < 15, :].copy()

In [None]:
# Remove genes left without counts after the cell filtering, then normalise per cell
# and log-transform.
sc.pp.filter_genes(adata_vors, min_counts=1)
sc.pp.normalize_total(adata_vors)
sc.pp.log1p(adata_vors)

In [None]:
# Cosine kNN graph with n_neighbors = int(0.5 * sqrt(n_cells)), followed by triku.
sc.pp.pca(adata_vors, random_state=seed, n_comps=50)
sc.pp.neighbors(adata_vors, random_state=seed, n_neighbors=int(0.5 * len(adata_vors) ** 0.5), metric='cosine')
tk.tl.triku(adata_vors)

In [None]:
sc.tl.umap(adata_vors, min_dist=0.6, random_state=seed)
sc.tl.leiden(adata_vors, resolution=3, random_state=seed)

In [None]:
# No key_added is given here; the result is read back from obs['assigned_cats'] below.
assign_cats(adata_vors, dict_cats=dict_cats_fb, min_score=0.5, quantile_gene_sel=0.2)

In [None]:
sc.pl.umap(adata_vors, color=['assigned_cats', 'leiden'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
adata_vors_fb = adata_vors[adata_vors.obs['assigned_cats'].isin(['fibro'])]
adata_vors_fb_raw = adata_vors_fb.copy()

In [None]:
sc.pp.filter_genes(adata_vors_fb, min_counts=1)

In [None]:
# Recompute PCA / cosine kNN graph (n_neighbors = int(sqrt(n_cells) // 3)) / triku on
# the fibroblast subset only.
sc.pp.pca(adata_vors_fb, random_state=seed, n_comps=50)
sc.pp.neighbors(adata_vors_fb, random_state=seed, n_neighbors=int(len(adata_vors_fb) ** 0.5 // 3), metric='cosine')
tk.tl.triku(adata_vors_fb)

In [None]:
sc.tl.umap(adata_vors_fb, min_dist=0.3, random_state=seed)
sc.tl.leiden(adata_vors_fb, resolution=3, random_state=seed)

In [None]:
# First call writes obs['cluster']; the second call groups by that 'cluster' column
# (column_groupby='cluster') and writes obs['axis'].
assign_cats(adata_vors_fb, dict_cats=dict_cats_clusters, min_score=0.4, quantile_gene_sel=0.7, key_added='cluster')
assign_cats(adata_vors_fb, column_groupby='cluster', dict_cats=dict_cats_axes, min_score=0.4, key_added='axis', 
            intermediate_states=True, diff=0.15)

In [None]:
# One colour per cluster label, in sorted label order; labels missing from dict_colors
# fall back to grey.
adata_vors_fb.uns['cluster_colors'] = [dict_colors[i] if i in dict_colors else '#bcbcbc' for 
                                      i in sorted(set(adata_vors_fb.obs['cluster']))]

In [None]:
sc.pl.umap(adata_vors_fb, color=['leiden', 'axis', 'cluster'], legend_loc='on data', 
           cmap=magma, use_raw=False, )

In [None]:
del adata_vors_fb.obs['C3']

In [None]:
# sc.tl.rank_genes_groups(adata_vors_fb, groupby='cluster', groups=['unassigned'])
# sc.pl.rank_genes_groups_tracksplot(adata_vors_fb, dendrogram=False, n_genes=200)

In [None]:
for key, val in dict_cats_clusters.items():
    print(key)
    sc.pl.umap(adata_vors_fb, color=['cluster'] + [i for i in val if i in adata_vors_fb.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

In [None]:
plot_score_graph(adata_vors_fb)

## Dataset merge

We will remove He from the list of datasets to reduce noise. Even so, the merging is still not good.

In [None]:
adata_all = sc.AnnData.concatenate(adata_tabib_fb, adata_sole_young_fb, adata_vors_fb, 
                           adata_kim_fb, adata_gaydosik_HC_fb, 
                           adata_gao_fb, adata_mirizio_fb, 
                          batch_key='dataset', 
                          batch_categories=['Tabib', 'Solé-Boldo', 'Vorstandlechner', 
                                            'Kim', 'Gaydosik', 'Gao', 'Mirizio'], 
                          join='outer')

adata_all = adata_all[adata_all.obs['cluster'] != 'unassigned']

In [None]:
sc.pp.filter_genes(adata_all, min_counts=1)

In [None]:
# Integration key: dataset name concatenated with the within-dataset batch label.
adata_all.obs['dataset_batch'] = adata_all.obs['dataset'].astype(str) + \
                                    adata_all.obs['batch'].astype(str)

In [None]:
# NOTE(review): harmony_integrate corrects an existing PCA embedding (obsm['X_pca'] by
# default), but the PCA on adata_all is commented out below. Any X_pca present here
# would presumably be the per-dataset PCAs carried over by concatenate, which do not
# share a common basis -- confirm; this may contribute to the poor integration.
sce.pp.harmony_integrate(adata_all, key='dataset_batch', max_iter_harmony = 30, epsilon_harmony = 5e-5,)
# sc.pp.pca(adata_all, random_state=seed, n_comps=30)
# Neighbour graph on the harmony-corrected embedding, n_neighbors = floor(sqrt(n_cells) / 2).
sc.pp.neighbors(adata_all, n_neighbors=int(len(adata_all) ** 0.5 // 2), use_rep='X_pca_harmony')
# sce.pp.bbknn(adata_all, metric='angular', batch_key='dataset_batch', neighbors_within_batch=4)
tk.tl.triku(adata_all, n_procs=1, random_state=seed, use_adata_knn=True)

In [None]:
sc.tl.umap(adata_all, min_dist=0.05, random_state=seed)
sc.tl.leiden(adata_all, resolution=5, random_state=seed)

In [None]:
sc.pl.umap(adata_all, color=['leiden', 'axis', 'cluster'], legend_loc='on data', cmap=magma, use_raw=False, )

In [None]:
# Same plot with the legend beside the panel instead of on the data.
sc.pl.umap(adata_all, color=['axis', 'cluster'], cmap=magma, use_raw=False, )

In [None]:
# One-vs-rest columns: obs['is_<cluster>'] holds the cluster name for its own cells and
# '-' for every other cell; the matching two-colour palette is stored in uns.
cluster_labels = list(adata_all.obs['cluster'].values)
for cluster in dict.fromkeys(cluster_labels):  # unique labels, in order of first appearance
    adata_all.obs[f'is_{cluster}'] = [cluster if label == cluster else '-' for label in cluster_labels]
    adata_all.uns[f'is_{cluster}_colors'] = ['#bcbcbc', '#bc0000']

In [None]:
# One panel per 'is_<cluster>' column, in sorted column-name order.
sc.pl.umap(adata_all, color=sorted([f'is_{i}' for i in list(dict.fromkeys(adata_all.obs['cluster'].values))]), 
           cmap=magma, use_raw=False, ncols=2)

In [None]:
# Cells coloured by dataset of origin.
sc.pl.umap(adata_all, color=['dataset'], cmap=magma, use_raw=False, )

We see that the integration is not good enough to see the main clusters together.

## Detecting the genes that drive the axes (2021/06/16)

Right now, the characterization of the clusters is the following.

<style type="text/css">
.tg  {border-collapse:collapse;border-spacing:0;}
.tg td{border-color:black;border-style:solid;border-width:1px;font-family:Arial, sans-serif;font-size:14px;
  overflow:hidden;padding:10px 5px;word-break:normal;}
.tg th{border-color:black;border-style:solid;border-width:1px;font-family:Arial, sans-serif;font-size:14px;
  font-weight:normal;overflow:hidden;padding:10px 5px;word-break:normal;}
.tg .tg-1wig{font-weight:bold;text-align:left;vertical-align:top}
.tg .tg-uca5{text-align:left;text-decoration:underline;vertical-align:top}
.tg .tg-0lax{text-align:left;vertical-align:top}
.tg .tg-akbm{font-weight:bold;text-align:left;text-decoration:underline;vertical-align:top}
.tg .tg-6t3r{font-style:italic;font-weight:bold;text-align:left;vertical-align:top}
.tg .tg-8zwo{font-style:italic;text-align:left;vertical-align:top}
</style>
<table class="tg">
<thead>
  <tr>
    <th class="tg-0lax"></th>
    <th class="tg-1wig">A1</th>
    <th class="tg-1wig">A2</th>
    <th class="tg-1wig">A3</th>
    <th class="tg-1wig">A4</th>
    <th class="tg-1wig">B1</th>
    <th class="tg-1wig">B2</th>
    <th class="tg-1wig">D1</th>
    <th class="tg-1wig">C1</th>
    <th class="tg-1wig">C2</th>
    <th class="tg-1wig">C3</th>
    <th class="tg-akbm">C4</th>
    <th class="tg-akbm">C4*</th>
    <th class="tg-akbm">C5</th>
    <th class="tg-6t3r">C6</th>
  </tr>
</thead>
<tbody>
  <tr>
    <td class="tg-1wig">Tabib</td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax">B</td>
    <td class="tg-0lax">B</td>
    <td class="tg-0lax">B</td>
    <td class="tg-0lax">C</td>
    <td class="tg-0lax">C</td>
    <td class="tg-0lax">C</td>
    <td class="tg-uca5">C</td>
    <td class="tg-uca5">C</td>
    <td class="tg-uca5"></td>
    <td class="tg-8zwo"></td>
  </tr>
  <tr>
    <td class="tg-1wig">Solé-Boldo</td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax"></td>
    <td class="tg-0lax">B</td>
    <td class="tg-0lax">B</td>
    <td class="tg-0lax">B</td>
    <td class="tg-0lax">C</td>
    <td class="tg-0lax">C</td>
    <td class="tg-0lax">C</td>
    <td class="tg-uca5">C</td>
    <td class="tg-uca5"></td>
    <td class="tg-uca5">B</td>
    <td class="tg-8zwo"></td>
  </tr>
  <tr>
    <td class="tg-1wig">Vorstandlechner</td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax">B</td>
    <td class="tg-0lax">B</td>
    <td class="tg-0lax">B</td>
    <td class="tg-0lax"></td>
    <td class="tg-0lax">C</td>
    <td class="tg-0lax"></td>
    <td class="tg-uca5"></td>
    <td class="tg-uca5"></td>
    <td class="tg-uca5">C</td>
    <td class="tg-8zwo"></td>
  </tr>
  <tr>
    <td class="tg-1wig">He</td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax"></td>
    <td class="tg-0lax"></td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax"></td>
    <td class="tg-0lax">B</td>
    <td class="tg-0lax">B</td>
    <td class="tg-0lax">C</td>
    <td class="tg-0lax">C</td>
    <td class="tg-0lax"></td>
    <td class="tg-uca5">B</td>
    <td class="tg-uca5">C</td>
    <td class="tg-uca5"></td>
    <td class="tg-8zwo">C</td>
  </tr>
  <tr>
    <td class="tg-1wig">Kim</td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax"></td>
    <td class="tg-0lax">B</td>
    <td class="tg-0lax">B</td>
    <td class="tg-0lax">B</td>
    <td class="tg-0lax">C</td>
    <td class="tg-0lax">C</td>
    <td class="tg-0lax">C</td>
    <td class="tg-uca5">C</td>
    <td class="tg-uca5">B</td>
    <td class="tg-uca5">A</td>
    <td class="tg-8zwo">C</td>
  </tr>
  <tr>
    <td class="tg-1wig">Gaydosik</td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax"></td>
    <td class="tg-0lax">B</td>
    <td class="tg-0lax">B</td>
    <td class="tg-0lax">B</td>
    <td class="tg-0lax">C</td>
    <td class="tg-0lax">C</td>
    <td class="tg-0lax">C</td>
    <td class="tg-uca5">B</td>
    <td class="tg-uca5">C</td>
    <td class="tg-uca5"></td>
    <td class="tg-8zwo"></td>
  </tr>
  <tr>
    <td class="tg-1wig">Gao</td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax">B</td>
    <td class="tg-0lax">B</td>
    <td class="tg-0lax">B</td>
    <td class="tg-0lax">C</td>
    <td class="tg-0lax">C</td>
    <td class="tg-0lax">C</td>
    <td class="tg-uca5">C</td>
    <td class="tg-uca5">C</td>
    <td class="tg-uca5">B</td>
    <td class="tg-8zwo">C</td>
  </tr>
  <tr>
    <td class="tg-1wig">Mirizio</td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax">B</td>
    <td class="tg-0lax">B</td>
    <td class="tg-0lax">B</td>
    <td class="tg-0lax">C</td>
    <td class="tg-0lax">C</td>
    <td class="tg-0lax"></td>
    <td class="tg-uca5"></td>
    <td class="tg-uca5"></td>
    <td class="tg-uca5"></td>
    <td class="tg-8zwo"></td>
  </tr>
</tbody>
</table>

A: AEBP1, AQP1, CD9, COL1A2, COL1A1, COL6A1, ELN, FBN1, MMP2, NBL1, PAM, QPCT, RGCC, SFRP2, THBS2

B: APOE, C3, C7, CXCL12, CYGB, GGT5, IGFBP7, RARRES2, TMEM176A, TMEM176B, TNFSF13B

C: DKK3, EMID1, COL1A2, GPM6B, INHBA, SPARCL1, TNMD, TNN

We see that cluster D1, originally from Tabib et al., appears in the rest of the datasets with higher or lower levels of confidence. Compared with the original axis naming convention (based on A1, A2, A3, A4, B1, B2, C1, C2, C3 and C4), the new clusters C4*, C5, C6, and D1 are not completely assigned. D1 is fully assigned to B, and C6 is fully assigned to C; but C4, C4*, and C5 are assigned to A, B and C to varying degrees. 

We are going to find genes that assign the C4/C5/C6 clusters reliably to one axis and, in case that does not work, assign them to their own axis.

The main idea based on the UMAPs is that:
* C5 and D1 are always near B1/B2. Most surely they will be B-like.
* C6 is always near C1 or C-like clusters. 
* C4 and C4* are sometimes near B or C clusters. Most likely they will be independent, a D axis.

To do that, we will first get the genes, assign the putative axes A (A1, A2, A3, A4), B (B1, B2, D1), and C (C1, C2, C3, C6), and see where C4/C4* and C5 fall.

In [None]:
# Draw one UMAP per fibroblast dataset, coloured by axis and by cluster.
# The variable name is printed first so each figure can be told apart.
for adata_str in [
    'adata_tabib_fb',
    'adata_sole_young_fb',
    'adata_vors_fb',
    'adata_he_fb',
    'adata_kim_fb',
    'adata_gaydosik_HC_fb',
    'adata_gao_fb',
    'adata_mirizio_fb',
]:
    print(adata_str)
    # Resolve the notebook-level variable from its name.
    sc.pl.umap(
        globals()[adata_str],
        color=['axis', 'cluster'],
        legend_loc='on data',
        cmap=magma,
        use_raw=False,
    )

In [None]:
# Give every cell a coarse "putative axis" label (A, B or C) derived from its
# cluster. Cells whose cluster is in none of the three groups (e.g. C4, C4*, C5)
# are left as NaN so they can be placed afterwards.
for adata_name in ['adata_tabib_fb', 'adata_sole_young_fb', 'adata_vors_fb', 'adata_he_fb', 'adata_kim_fb',
                   'adata_gaydosik_HC_fb', 'adata_gao_fb', 'adata_mirizio_fb']:
    # Look the dataset up once instead of calling eval() on every statement.
    adata_obj = globals()[adata_name]
    cluster_labels = adata_obj.obs['cluster']

    # Explicit object dtype: the Series holds string labels, and relying on the
    # implicit default dtype triggers dtype warnings/errors depending on the
    # pandas version.
    df_adata = pd.Series(np.nan, index=adata_obj.obs_names, dtype=object)
    df_adata.loc[cluster_labels.isin(['A1', 'A2', 'A3', 'A4'])] = 'A'
    df_adata.loc[cluster_labels.isin(['B1', 'B2', 'D1'])] = 'B'
    df_adata.loc[cluster_labels.isin(['C1', 'C2', 'C3', 'C6'])] = 'C'

    adata_obj.obs['putative_axis'] = df_adata

In [None]:
# Dataset used by the following cells.
# NOTE(review): despite its name, `adata_name` now holds the AnnData object itself,
# not a variable-name string as in the loop of the previous cell.
adata_name = adata_sole_young_fb

In [None]:
# Remove the 'C3' column from .obs. Guarded so that re-running the cell (or running
# it after the column has already been removed) does not raise KeyError.
# NOTE(review): presumably dropped so that 'C3' resolves to the gene rather than to
# an obs annotation when plotting - confirm.
if 'C3' in adata_name.obs.columns:
    del adata_name.obs['C3']

In [None]:
# Rank genes per putative axis with a Wilcoxon test, then show the top 150 genes
# of each group as a tracksplot of the non-raw expression values.
sc.tl.rank_genes_groups(
    adata_name,
    groupby='putative_axis',
    method='wilcoxon',
)
sc.pl.rank_genes_groups_tracksplot(
    adata_name,
    dendrogram=False,
    n_genes=150,
    use_raw=False,
)

In [None]:
# UMAPs of the clustering annotations followed by the 200 top-ranked genes of
# group 'B' taken from the stored rank_genes_groups result.
sc.pl.umap(
    adata_name,
    color=[
        'leiden',
        'axis',
        'cluster',
        *adata_name.uns['rank_genes_groups']['names']['B'][:200],
    ],
    legend_loc='on data',
    cmap=magma,
    use_raw=False,
)

## Rearranging the genes

After gene selection, the new axes identities are

A: PTGIS, SFRP2, MMP2, RGCC, COL14A1, AQP1, PTGIS, QPCT, ELN, COL14A1, LEPR, ISM1, CES1, WIF1, RECK, SGCA, FBN1, NBL1, CTSB, COL5A1, SMOC2, SGCG

B: GGT5, APOE, APOC1, CYGB, C7, IGFBP7, TNFSF13B, APOC1, RARRES2, CCDC146, CXCL12, ITM2A, EBF1, CCL2, IGFBP3, EBF1, CXCL2, EFEMP1, TMEM176A, C3, EGR1, LGALS3BP, BST2, ANGPTL4, ABCA8

C: TNN, TNMD, ASPN, CYP1B1, GPC1, PPP1R14A, GPM6B, COL11A1, DKK3, OGN, SDC1, PDE1A, MDK, NRP2, POSTN, F2R, KIF26B, TENM3, ALX4, PMEPA1, FIBIN, PCDH15

With that gene selection, the initial table remains the same. D1 belongs to B, C6 belongs to C, and C4/C4*/C5 do not have a clear gene set from any axis. Therefore, we will assign the table as:

<style type="text/css">
.tg  {border-collapse:collapse;border-spacing:0;}
.tg td{border-color:black;border-style:solid;border-width:1px;font-family:Arial, sans-serif;font-size:14px;
  overflow:hidden;padding:10px 5px;word-break:normal;}
.tg th{border-color:black;border-style:solid;border-width:1px;font-family:Arial, sans-serif;font-size:14px;
  font-weight:normal;overflow:hidden;padding:10px 5px;word-break:normal;}
.tg .tg-1wig{font-weight:bold;text-align:left;vertical-align:top}
.tg .tg-uca5{text-align:left;text-decoration:underline;vertical-align:top}
.tg .tg-0lax{text-align:left;vertical-align:top}
.tg .tg-akbm{font-weight:bold;text-align:left;text-decoration:underline;vertical-align:top}
.tg .tg-bc3m{font-style:italic;text-align:left;text-decoration:underline;vertical-align:top}
</style>
<table class="tg">
<thead>
  <tr>
    <th class="tg-0lax"></th>
    <th class="tg-1wig">A1</th>
    <th class="tg-1wig">A2</th>
    <th class="tg-1wig">A3</th>
    <th class="tg-1wig">A4</th>
    <th class="tg-1wig">B1</th>
    <th class="tg-1wig">B2</th>
    <th class="tg-1wig">B3 (D1)</th>
    <th class="tg-1wig">C1</th>
    <th class="tg-1wig">C2</th>
    <th class="tg-1wig">C3</th>
    <th class="tg-1wig">C4 (C6)</th>
    <th class="tg-1wig">D1 (C4)<br></th>
    <th class="tg-1wig">D2 (C4*)<br></th>
    <th class="tg-1wig">D3 (C5)<br></th>
  </tr>
</thead>
<tbody>
  <tr>
    <td class="tg-1wig">Tabib</td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax">B</td>
    <td class="tg-0lax">B</td>
    <td class="tg-0lax">B</td>
    <td class="tg-0lax">C</td>
    <td class="tg-0lax">C</td>
    <td class="tg-0lax">C</td>
    <td class="tg-0lax"></td>
    <td class="tg-0lax">B</td>
    <td class="tg-0lax">(B)</td>
    <td class="tg-0lax"></td>
  </tr>
  <tr>
    <td class="tg-1wig">Solé-Boldo</td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax"></td>
    <td class="tg-0lax">B</td>
    <td class="tg-0lax">B</td>
    <td class="tg-0lax">B</td>
    <td class="tg-0lax">C</td>
    <td class="tg-0lax">C</td>
    <td class="tg-0lax">C</td>
    <td class="tg-0lax"></td>
    <td class="tg-bc3m">B</td>
    <td class="tg-bc3m"></td>
    <td class="tg-uca5">B/C</td>
  </tr>
  <tr>
    <td class="tg-1wig">Vorstandlechner</td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax">B</td>
    <td class="tg-0lax">B</td>
    <td class="tg-0lax">B</td>
    <td class="tg-0lax"></td>
    <td class="tg-0lax">C</td>
    <td class="tg-0lax"></td>
    <td class="tg-0lax"></td>
    <td class="tg-bc3m"></td>
    <td class="tg-bc3m"></td>
    <td class="tg-uca5">A/B/C</td>
  </tr>
  <tr>
    <td class="tg-1wig">He</td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax"></td>
    <td class="tg-0lax"></td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax"></td>
    <td class="tg-0lax">B</td>
    <td class="tg-0lax">B</td>
    <td class="tg-0lax">C</td>
    <td class="tg-0lax">C</td>
    <td class="tg-0lax"></td>
    <td class="tg-0lax">C</td>
    <td class="tg-bc3m">B/C</td>
    <td class="tg-bc3m">B/C</td>
    <td class="tg-uca5"></td>
  </tr>
  <tr>
    <td class="tg-1wig">Kim</td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax"></td>
    <td class="tg-0lax">B</td>
    <td class="tg-0lax">B</td>
    <td class="tg-0lax">B</td>
    <td class="tg-0lax">C</td>
    <td class="tg-0lax">C</td>
    <td class="tg-0lax">C</td>
    <td class="tg-0lax">C</td>
    <td class="tg-bc3m">B</td>
    <td class="tg-bc3m">B</td>
    <td class="tg-uca5">A/B/C</td>
  </tr>
  <tr>
    <td class="tg-1wig">Gaydosik</td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax"></td>
    <td class="tg-0lax">B</td>
    <td class="tg-0lax">B</td>
    <td class="tg-0lax">B</td>
    <td class="tg-0lax">C</td>
    <td class="tg-0lax">C</td>
    <td class="tg-0lax">C</td>
    <td class="tg-0lax"></td>
    <td class="tg-bc3m">B</td>
    <td class="tg-bc3m">B/C</td>
    <td class="tg-uca5"></td>
  </tr>
  <tr>
    <td class="tg-1wig">Gao</td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax">B</td>
    <td class="tg-0lax">B</td>
    <td class="tg-0lax">B</td>
    <td class="tg-0lax">C</td>
    <td class="tg-0lax">C</td>
    <td class="tg-0lax">C</td>
    <td class="tg-0lax">C</td>
    <td class="tg-bc3m">B/C</td>
    <td class="tg-bc3m">B/C</td>
    <td class="tg-uca5">B</td>
  </tr>
  <tr>
    <td class="tg-1wig">Mirizio</td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax">B</td>
    <td class="tg-0lax">B</td>
    <td class="tg-0lax">B</td>
    <td class="tg-0lax">C</td>
    <td class="tg-0lax">C</td>
    <td class="tg-0lax"></td>
    <td class="tg-0lax"></td>
    <td class="tg-uca5"></td>
    <td class="tg-uca5"></td>
    <td class="tg-uca5"></td>
  </tr>
  <tr>
    <td class="tg-1wig">Reynolds</td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax">A</td>
    <td class="tg-0lax"></td>
    <td class="tg-0lax">B</td>
    <td class="tg-0lax">B</td>
    <td class="tg-0lax"></td>
    <td class="tg-0lax"></td>
    <td class="tg-0lax"></td>
    <td class="tg-0lax"></td>
    <td class="tg-0lax"></td>
    <td class="tg-0lax"></td>
    <td class="tg-0lax"></td>
    <td class="tg-0lax"></td>
  </tr>
</tbody>
</table>

In [None]:
# For each candidate gene, plot its expression next to the cluster annotation on the
# UMAP of every dataset. The symbols are upper-cased before the lookup.
# NOTE(review): the capitalisation suggests the symbols come from a mouse gene list
# - confirm.
for gene in [i.upper() for i in ['Bmp5',
'Tmem158',
'Adamdec1',
'Nbl1',
'Bmp4',
'Ednrb',
'Tmem119',
'Aldh1a1',
'Nkx2-3',
'Tspan13',
'Emid1',
'Cald1',
'Rgs10',
'Fhl1',
'Tgm2',
'Fhl2',
'Sept4',
'Tpm2',
'Bmp7',
'Myl9', ]]:
    print('\n\n\n\n', gene)
    for adata in [adata_tabib_fb, adata_sole_young_fb, adata_vors_fb, adata_he_fb, adata_kim_fb,
                  adata_gaydosik_HC_fb, adata_gao_fb, adata_mirizio_fb]:
        # Test for the key explicitly instead of using a bare `except:`, which would
        # also hide unrelated failures (and KeyboardInterrupt) behind 'NOT FOUND!'.
        if gene not in adata.var_names and gene not in adata.obs.columns:
            print('NOT FOUND!')
            continue
        sc.pl.umap(adata, color=['cluster', gene], cmap=magma, use_raw=False)

In [None]:
# Rank genes for cluster 'C5' only (Wilcoxon test) and display its top 150 genes
# as a tracksplot.
sc.tl.rank_genes_groups(
    adata_name,
    groupby='cluster',
    groups=['C5'],
    method='wilcoxon',
)
sc.pl.rank_genes_groups_tracksplot(
    adata_name,
    dendrogram=False,
    n_genes=150,
    use_raw=False,
)

In [None]:
# Show the 'leiden' annotation of the Vorstandlechner dataset on its UMAP.
sc.pl.umap(adata_vors_fb, color='leiden')

In [None]:
# Rank genes for leiden group '1' of the Vorstandlechner dataset (Wilcoxon test)
# and display its top 250 genes as a tracksplot.
sc.tl.rank_genes_groups(
    adata_vors_fb,
    groupby='leiden',
    groups=['1'],
    method='wilcoxon',
)
sc.pl.rank_genes_groups_tracksplot(
    adata_vors_fb,
    dendrogram=False,
    n_genes=250,
    use_raw=False,
)

In [None]:
# Remove the 'C6' column from .obs of the Tabib dataset. Guarded so that re-running
# the cell does not raise KeyError once the column is gone.
# NOTE(review): presumably dropped so that 'C6' resolves to the gene rather than to
# an obs annotation when plotting - confirm.
if 'C6' in adata_tabib_fb.obs.columns:
    del adata_tabib_fb.obs['C6']

In [None]:
# Dataset used as the reference (`adata_base`) by the cells that follow.
adata_base = adata_sole_young_fb

In [None]:
# Remove the 'C3' column from .obs of the reference dataset. The guard is needed
# because the column may already have been deleted from this same object earlier in
# the notebook, in which case a plain `del` raises KeyError.
if 'C3' in adata_base.obs.columns:
    del adata_base.obs['C3']

In [None]:
# Cluster label(s) passed as `groups=` to sc.tl.rank_genes_groups in the cells below.
groups = ['A3']

In [None]:
# Rank genes (Wilcoxon test) for the selected cluster(s) in every dataset and show
# the top 250 genes as a tracksplot. One loop replaces a run of copy-pasted cells.
# The order matters: each call overwrites .uns['rank_genes_groups'] of its dataset,
# so the Vorstandlechner dataset ends up holding the 'D1' result.
rank_jobs = [
    ('adata_tabib_fb', adata_tabib_fb, groups),
    ('adata_sole_young_fb', adata_sole_young_fb, groups),
    ('adata_vors_fb', adata_vors_fb, groups),
    ('adata_vors_fb', adata_vors_fb, ['D1']),
    ('adata_he_fb', adata_he_fb, groups),
    ('adata_kim_fb', adata_kim_fb, groups),
    ('adata_gaydosik_HC_fb', adata_gaydosik_HC_fb, groups),
    ('adata_gao_fb', adata_gao_fb, groups),
    ('adata_mirizio_fb', adata_mirizio_fb, groups),
]

for job_label, job_adata, job_groups in rank_jobs:
    print(job_label, job_groups)

    # Not every dataset contains every cluster; skip those lacking the requested
    # one(s) instead of letting a single failure abort the remaining datasets.
    # A skipped dataset keeps whatever rank_genes_groups result it already had.
    present_clusters = set(job_adata.obs['cluster'].unique())
    missing_groups = [g for g in job_groups if g not in present_clusters]
    if missing_groups:
        print('Skipping: cluster(s) not present in this dataset:', missing_groups)
        continue

    sc.tl.rank_genes_groups(job_adata, groupby='cluster', groups=job_groups, method='wilcoxon')
    sc.pl.rank_genes_groups_tracksplot(job_adata, dendrogram=False, n_genes=250, use_raw=False)

In [None]:
# UMAPs of the axis/cluster annotations followed by the 250 top-ranked genes of
# group 'A3' taken from the stored rank_genes_groups result of the reference dataset.
sc.pl.umap(
    adata_base,
    color=[
        'axis',
        'cluster',
        *adata_base.uns['rank_genes_groups']['names']['A3'][:250],
    ],
    cmap=magma,
    use_raw=False,
)

In [None]:
# Candidate gene list, kept in the order in which it was written down
# (neither sorted nor de-duplicated).
list_genes = [
    'POSTN',
    'TNMD',
    'ASPN',
    'TNN',
    'MDK',
    'GPM6B',
    'F2R',
    'FIBIN',
    'PCOLCE',
    'AOPEP',
    'CYP1B1',
    'LIMCH1',
    'OGN',
    'PPP1R14A',
    'PMEPA1',
    'SDC1',
    'GPC1',
    'EDNRA',
    'KIF26B',
    'COL11A1',
    'PCDH15',
    'TENM3',
    'ALX4',
    'PDE1A',
    'NRP2',
]

In [None]:
# Gene list actually used by the plotting cells below; it replaces any value
# `list_genes` held before this cell.
list_genes = [
    'DKK3',
    'EMID1',
    'COL1A2',
    'GPM6B',
    'INHBA',
    'SPARCL1',
    'TNMD',
    'TNN',
]

In [None]:
# Display the current gene list as the cell output.
list_genes

In [None]:
# Print the gene list one symbol per line (convenient for copy-pasting).
print(*list_genes, sep='\n')

In [None]:
# Manual dataset selector: exactly one assignment is left uncommented, and it decides
# which dataset the following cells operate on through `adata_base`.
# adata_base = adata_tabib_fb
# adata_base = adata_sole_young_fb
# adata_base = adata_vors_fb
# adata_base = adata_he_fb
# adata_base = adata_kim_fb
# adata_base = adata_gaydosik_HC_fb
adata_base = adata_gao_fb
# adata_base = adata_mirizio_fb

In [None]:
# Remove the 'C3' column from .obs of the selected dataset. Guarded so that the cell
# can be re-run, or run for a dataset without that column, without raising KeyError.
if 'C3' in adata_base.obs.columns:
    del adata_base.obs['C3']

In [None]:
# Tracksplot of the gene list grouped by cluster. A gene that is absent from this
# dataset is replaced by 'SOX2', so the number and position of tracks stay the same.
sc.pl.tracksplot(
    adata_base,
    var_names=[gene if gene in adata_base.var_names else 'SOX2' for gene in list_genes],
    groupby='cluster',
    use_raw=False,
)

In [None]:
# UMAPs of the annotations followed by every gene in the list, three panels per row.
# A gene that is absent from this dataset is replaced by 'SOX2' as a placeholder.
sc.pl.umap(
    adata_base,
    color=[
        'axis',
        'cluster',
        'leiden',
        *(gene if gene in adata_base.var_names else 'SOX2' for gene in list_genes),
    ],
    cmap=magma,
    use_raw=False,
    ncols=3,
)

In [None]:
# Directory with the Liu 2021 files, located under the notebook's data directory.
liu_dir = f'{data_dir}/liu_2021'


In [None]:
# Quick look at one Liu 2021 sample: load the loom file, apply basic filtering and
# compute a UMAP embedding.
adata = sc.read_loom(f'{liu_dir}/prueba/K007CASE_1.loom')
adata.var_names_make_unique()

# Drop cells with fewer than 100 counts and genes with fewer than 50 counts.
sc.pp.filter_cells(adata, min_counts=100)
sc.pp.filter_genes(adata, min_counts=50)

# Normalise per cell first and log-transform afterwards: log1p is meant to be applied
# to depth-normalised counts, not the other way round.
sc.pp.normalize_per_cell(adata)
sc.pp.log1p(adata)

sc.pp.pca(adata)
sc.pp.neighbors(adata)

sc.tl.umap(adata, min_dist=0.05)

In [None]:
# Plot a fixed panel of genes on the UMAP of the sample loaded above.
# NOTE(review): these look like broad cell-type markers used to locate fibroblasts
# among the other populations - confirm.
sc.pl.umap(adata, color=['ASPN', 'LUM', 'PDGFRA', 'COL1A1', 'VWF', 'KRT5', 'RGS5', 'PTPRC'], cmap=magma, use_raw=False)

In [None]:
# Display the AnnData summary as the cell output.
adata

In [None]:
# Same quick look for the 'K007CASE_1_FR' loom file: load, filter, normalise and
# compute a UMAP embedding. This rebinds `adata`.
adata = sc.read_loom(f'{liu_dir}/prueba/K007CASE_1_FR.loom')
adata.var_names_make_unique()

# Drop cells with fewer than 100 counts and genes with fewer than 50 counts.
sc.pp.filter_cells(adata, min_counts=100)
sc.pp.filter_genes(adata, min_counts=50)

# Normalise per cell first and log-transform afterwards: log1p is meant to be applied
# to depth-normalised counts, not the other way round.
sc.pp.normalize_per_cell(adata)
sc.pp.log1p(adata)

sc.pp.pca(adata)
sc.pp.neighbors(adata)

sc.tl.umap(adata, min_dist=0.05)

In [None]:
# Plot the same fixed panel of genes on the UMAP of the 'K007CASE_1_FR' sample.
sc.pl.umap(adata, color=['ASPN', 'LUM', 'PDGFRA', 'COL1A1', 'VWF', 'KRT5', 'RGS5', 'PTPRC'], cmap=magma, use_raw=False)

In [None]:
# Display the AnnData summary as the cell output.
adata