In [None]:
# !wget https://raw.githubusercontent.com/alexmascension/revisit_reynolds_fb/master/requirements.txt
# !pip install -r requirements.txt

In [None]:
import scanpy as sc
import scanpy.external as sce

import pandas as pd
import numpy as np

import os

import triku as tk

import matplotlib.pyplot as plt
import matplotlib as mpl

from functools import reduce

In [None]:
# To print versions of imports 

import types

def imports():
    """Yield the ``__name__`` of every module object bound in the global namespace."""
    yield from (
        obj.__name__
        for obj in globals().values()
        if isinstance(obj, types.ModuleType)
    )

# Module names left out of the version report.
excludes = ['builtins', 'types', 'sys']

imported_modules = [module for module in imports() if module not in excludes]

# Reduce dotted module names to their top-level package name.
clean_modules = []

for module in imported_modules:

    sep = '.'  # to handle 'matplotlib.pyplot' cases
    rest = module.split(sep, 1)[0]
    clean_modules.append(rest)

changed_imported_modules = list(set(clean_modules))  # drop duplicates

pip_modules = !pip freeze  # you could also use `!conda list` with anaconda

# Print "<package>\t<version>" for every pinned line of `pip freeze` whose package name is
# one of the imported top-level modules.
for module in pip_modules:
    # Lines without '==' (editable installs, "pkg @ url" entries) carry no pinned version;
    # skip them explicitly instead of hiding every possible error behind a bare except.
    name, pinned, version = module.partition('==')
    if pinned and name in changed_imported_modules:
        print(name + '\t' + version)

In [None]:
# Passed as `random_state` to the PCA, triku, UMAP and leiden calls in this notebook.
seed = 0

In [None]:
# Palettes for UMAP gene expression
# Sample 80 colours from matplotlib's 'magma', replace the lowest one with light grey
# (presumably so that unexpressed cells read as background) and build the colormap from the
# first 65 samples only. `magma` is first a list of RGBA tuples and is then rebound to the
# resulting colormap object.

magma = [plt.get_cmap('magma')(i) for i in np.linspace(0,1, 80)]
magma[0] = (0.88, 0.88, 0.88, 1)
magma = mpl.colors.LinearSegmentedColormap.from_list("", magma[:65])

In [None]:
mpl.rcParams['figure.dpi'] = 150

In [None]:
def assign_cats(adata, dict_cats, column_groupby='leiden', per=98, do_return=False):
    """
    Assign each cluster of cells to the marker-defined category it scores highest for.

    A set of marker genes is given for each category (for example fibroblasts, pericytes,
    endothelial cells, or cells with high mitochondrial content). Scores are aggregated per
    cluster rather than used cell by cell because the per-cell score, although efficient,
    can be noisy due to the noisiness of single-cell datasets.

    Scoring:
      1. Per gene: expression is divided by the `per`-th percentile of its positive values.
      2. Per category: the scaled genes are summed per cell and the sum is divided by the
         `per`-th percentile of its positive values; stored in ``adata.obs[<category>]``.
      3. Per cluster: the 0.8 quantile of every category score is taken and the category
         with the highest value becomes the cluster's assignment.

    Parameters
    ----------
    adata : AnnData
        Object to annotate; ``adata.X`` may be sparse or dense. ``adata.obs`` is modified in place.
    dict_cats : dict
        Category name -> list of marker genes. Genes missing from ``adata.var_names`` are
        reported and ignored.
    column_groupby : str
        Column of ``adata.obs`` that holds the cluster labels.
    per : float
        Percentile (0-100) used in both normalisation steps.
    do_return : bool
        If True, return the cluster x category score table.

    Returns
    -------
    pandas.DataFrame or None
        The per-cluster score table when `do_return` is True, otherwise None.

    Notes
    -----
    Columns written to ``adata.obs``: one per category, 'assigned_cats', 'assigned_cats_std',
    'assigned_cats_mean', 'assigned_cats_CV' and 'score_assign_<category>' per category.
    """

    categories = list(dict_cats.keys())

    for cat in categories:
        # Use the argument, not a notebook-level global that happens to share a similar name.
        genes = dict_cats[cat]
        mat_cat = np.zeros((len(adata), len(genes)), dtype=float)

        for gene_idx, gene in enumerate(genes):
            if gene not in adata.var_names:
                print(f'Gene {gene} is not on the list')
                continue

            expr = adata[:, gene].X
            if hasattr(expr, 'toarray'):  # sparse matrix -> dense array
                expr = expr.toarray()
            expr = np.asarray(expr, dtype=float).ravel()

            mat_cat[:, gene_idx] = expr
            positive = expr[expr > 0]
            if positive.size:
                mat_cat[:, gene_idx] /= np.percentile(positive, per)
            else:
                # Nothing to scale by; the column is kept as is instead of raising.
                print(f'Gene {gene} has no positive values; left unscaled')

        sum_mat_cat = mat_cat.sum(axis=1)
        positive_sums = sum_mat_cat[sum_mat_cat > 0]
        if positive_sums.size:
            sum_mat_cat = sum_mat_cat / np.percentile(positive_sums, per)

        adata.obs[cat] = sum_mat_cat

    groups = adata.obs[column_groupby]
    # observed=True: only clusters that actually contain cells are scored.
    score_per_cluster = (
        adata.obs[[column_groupby] + categories]
        .groupby(column_groupby, observed=True)
        .quantile(.8)
    )

    def _per_cell(per_cluster):
        # Broadcast one value per cluster back to every cell of that cluster.
        lookup = dict(zip(per_cluster.index, per_cluster.values))
        return [lookup[g] for g in groups]

    adata.obs['assigned_cats'] = _per_cell(score_per_cluster.idxmax(axis=1))
    adata.obs['assigned_cats_std'] = _per_cell(score_per_cluster.std(1))
    adata.obs['assigned_cats_mean'] = _per_cell(score_per_cluster.mean(1))
    # NOTE(review): despite the column name this is mean / std, i.e. the inverse of the usual
    # coefficient of variation. Kept unchanged because downstream use is not visible here.
    adata.obs['assigned_cats_CV'] = adata.obs['assigned_cats_mean'] / adata.obs['assigned_cats_std']

    for cat in score_per_cluster.columns:
        adata.obs[f'score_assign_{cat}'] = _per_cell(score_per_cluster[cat])

    if do_return:
        return score_per_cluster

# Reynolds et al. 2020 dataset download and preprocess
In this section we are going to download and process the anndata files for the 5 healthy samples (S1 to S5). We are first going to use the processed files from Reynolds et al., because they contain the main fb populations, which we are interested in (FB1-3). With that, we can partially replicate the analysis. However, the data richness is not that good (the UMAPs are more *blobby*), so we preprocessed the FASTQ files on our own and used the resulting adatas.

Once we have our own anndatas, the preprocessing is the following:
* Assign FB types based on Reynolds adatas to our adatas. Some cells will be unassigned.
* QC metrics 
* Filter genes (min counts = 30)
* Set raw X
* Normalize and log1p
* PCA + triku + neighbors + umap + leiden
* Use `assign_cats` to assign leiden clusters to cell populations with selected markers (markers for each dataset may vary!)
* Filter adata to retain only fibroblasts
* Filter genes to remove 0 counts
* PCA + triku + neighbors + umap + leiden [leiden is not used here but may be used later]
* Check if strange populations appear and filter them in `assign_cats`, then repeat the last steps.

In [None]:
os.getcwd()

In [None]:
# Relative directory used for the Reynolds et al. files; created if it does not exist yet.
reynolds_dir = 'reynolds_2020'
os.makedirs(reynolds_dir, exist_ok=True)

### Making and saving the fb healthy dataset to zenodo

In [None]:
# adata_reynolds = sc.read('submission_210120.h5ad', backup_url='https://zenodo.org/record/4536165/files/submission_210120.h5ad')
# adata_reynolds_fb = adata_reynolds[(adata_reynolds.obs['full_clustering'].isin(['F1', 'F2', 'F3'])) & 
#                                    (adata_reynolds.obs['Status'] == 'Healthy')]
# sc.pp.filter_genes(adata_reynolds_fb, min_counts=100)
# del adata_reynolds_fb.var

# for obs in ['mad_prd', 'Status', 'Site', 'Tissue', 'Enrichment', 'Location', 'Sex', 'Age', 'stage']:
#     del adata_reynolds_fb.obs[obs]
    
# adata_reynolds_fb.write_h5ad(reynolds_dir + '/reynolds_2020_fb_healthy.h5ad')

### Direct h5ad download

In [None]:
# Read the healthy-fibroblast h5ad from the local directory; `backup_url` gives sc.read the
# Zenodo location of the same file.
adata_reynolds_fb_healthy = sc.read(reynolds_dir + '/reynolds_2020_fb_healthy.h5ad', 
                                    backup_url='https://zenodo.org/record/4605340/files/reynolds_2020_fb_healthy.h5ad?download=1')

In [None]:
sc.pp.filter_genes(adata_reynolds_fb_healthy, min_counts=50)

In [None]:
# Normalise per-cell totals first and log-transform afterwards. The original cell applied
# log1p before normalize_total, which rescales already-logged values and disagrees with the
# "Normalize and log1p" order used for every loom object in this notebook.
sc.pp.normalize_total(adata_reynolds_fb_healthy)
sc.pp.log1p(adata_reynolds_fb_healthy)

In [None]:
# Cells per sample: after the transpose, column 0 holds the sample_id and column 1 its cell count.
df_batches = pd.DataFrame(np.unique(adata_reynolds_fb_healthy.obs['sample_id'], return_counts=True)).transpose()

In [None]:
df_batches.sort_values(by=1, ascending=False)

In [None]:
# Keep the sample ids (column 0) that have more than 50 cells (column 1).
selected_samples = df_batches[df_batches[1] > 50][0].values

In [None]:
# Restrict to the selected samples. `.copy()` turns the slice into a standalone AnnData, so the
# in-place filtering and embedding steps that follow do not operate on a view.
adata_reynolds_fb_healthy = adata_reynolds_fb_healthy[adata_reynolds_fb_healthy.obs['sample_id'].isin(selected_samples)].copy()

In [None]:
adata_reynolds_fb_healthy

In [None]:
sc.pp.filter_genes(adata_reynolds_fb_healthy, min_counts=1)

In [None]:
# 30-component PCA, then a bbknn neighbour graph batched on 'sample_id', then triku feature
# selection with use_adata_knn=True (presumably reusing the graph bbknn has just stored).
sc.pp.pca(adata_reynolds_fb_healthy, random_state=seed, n_comps=30)
sce.pp.bbknn(adata_reynolds_fb_healthy, metric='angular', batch_key='sample_id')
tk.tl.triku(adata_reynolds_fb_healthy, n_procs=1, random_state=seed, use_adata_knn=True)

In [None]:
sc.tl.umap(adata_reynolds_fb_healthy, min_dist=0.1, random_state=seed)

In [None]:
sc.tl.leiden(adata_reynolds_fb_healthy, resolution=1.5, random_state=seed)

In [None]:
sc.pl.umap(adata_reynolds_fb_healthy, color=['leiden', 'sample_id', 'full_clustering'], legend_loc='on data')

In [None]:
sc.pl.umap(adata_reynolds_fb_healthy, color=['APCDD1', 'COL18A1', 'COMP', 'SLPI', 'WIF1'], cmap=magma, use_raw=False)

In [None]:
sc.pl.umap(adata_reynolds_fb_healthy, color=['MT2A', 'CCL19', 'CCL2', 'CD46'], cmap=magma, use_raw=False)

In [None]:
sc.pl.umap(adata_reynolds_fb_healthy, color=['POSTN', 'COMP', 'COCH'], cmap=magma, use_raw=False)

## 4820STDY7388991 (S1)

### Reynolds preprocessed

In [None]:
# Cells of sample 4820STDY7388991 only; copied so that the in-place steps below act on a
# standalone object instead of a view of the combined dataset.
adata_reynolds_fb_4820STDY7388991 = adata_reynolds_fb_healthy[adata_reynolds_fb_healthy.obs['sample_id'].isin(['4820STDY7388991'])].copy()

In [None]:
adata_reynolds_fb_4820STDY7388991

In [None]:
sc.pp.filter_genes(adata_reynolds_fb_4820STDY7388991, min_counts=1)

In [None]:
# NOTE(review): triku is given use_adata_knn=True but runs before sc.pp.neighbors in this cell,
# so any kNN graph it reads must already be on the object (e.g. carried over from the combined
# dataset) — confirm this ordering is intended. The combined-dataset cell builds its graph first.
sc.pp.pca(adata_reynolds_fb_4820STDY7388991, random_state=seed, n_comps=30)
tk.tl.triku(adata_reynolds_fb_4820STDY7388991, n_procs=1, random_state=seed, use_adata_knn=True)
sc.pp.neighbors(adata_reynolds_fb_4820STDY7388991, metric='cosine')

In [None]:
sc.tl.umap(adata_reynolds_fb_4820STDY7388991, min_dist=0.3, random_state=seed)

In [None]:
sc.tl.leiden(adata_reynolds_fb_4820STDY7388991, resolution=1, random_state=seed)

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7388991, color=['leiden', 'sample_id', 'full_clustering'], legend_loc='on data')

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7388991, color=['APCDD1', 'COL18A1', 'COMP', 'SLPI', 'WIF1'], cmap=magma, use_raw=False)

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7388991, color=['MT2A', 'CCL19', 'CCL2', 'CD46'], cmap=magma, use_raw=False)

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7388991, color=['POSTN', 'COMP', 'COCH'], cmap=magma, use_raw=False)

### Direct loom

In [None]:
# Read the S1 loom file from the local directory (`backup_url` gives its Zenodo location),
# then make the gene names unique.
adata_reynolds_fb_4820STDY7388991_loom = sc.read('reynolds_2020/reynolds_2020_0_4820STDY7388991_s1_dermis_fibroblasts.loom', 
                                                 backup_url='https://zenodo.org/record/4596375/files/reynolds_2020_0_4820STDY7388991_s1_dermis_fibroblasts.loom')
adata_reynolds_fb_4820STDY7388991_loom.var_names_make_unique()

In [None]:
# Rebuild every cell name as "<last '_' field>-1-<second '_' field>" of the loom name, so the
# names can be matched against the obs_names of the Reynolds object in the next cell.
adata_reynolds_fb_4820STDY7388991_loom.obs_names = [f"{i.split('_')[-1]}-1-{i.split('_')[1]}" for i in adata_reynolds_fb_4820STDY7388991_loom.obs_names]

In [None]:
# Transfer the published labels ('full_clustering') to the cells shared with the Reynolds
# object; cells without a match keep the placeholder '-'.
df_fb_type = pd.Series('-', index=adata_reynolds_fb_4820STDY7388991_loom.obs_names)
# Index.intersection replaces `index & index`, whose set semantics were removed from pandas.
shared_cells = adata_reynolds_fb_4820STDY7388991.obs_names.intersection(adata_reynolds_fb_4820STDY7388991_loom.obs_names)
df_fb_type.loc[shared_cells] = adata_reynolds_fb_4820STDY7388991.obs.loc[shared_cells, 'full_clustering']
adata_reynolds_fb_4820STDY7388991_loom.obs['science_clustering'] = df_fb_type

In [None]:
sc.pp.calculate_qc_metrics(adata_reynolds_fb_4820STDY7388991_loom, inplace=True)

In [None]:
sc.pl.violin(adata_reynolds_fb_4820STDY7388991_loom, ['n_genes_by_counts'])
sc.pl.violin(adata_reynolds_fb_4820STDY7388991_loom, ['log1p_total_counts'])

In [None]:
# Keep genes with at least 30 total counts, then store the filtered object in .raw before the
# normalisation and log1p of the next cell are applied to X.
sc.pp.filter_genes(adata_reynolds_fb_4820STDY7388991_loom, min_counts=30)
adata_reynolds_fb_4820STDY7388991_loom.raw = adata_reynolds_fb_4820STDY7388991_loom

In [None]:
# Per-cell normalisation followed by log1p.
# NOTE(review): scanpy documents normalize_per_cell as deprecated in favour of normalize_total;
# consider migrating after confirming the results do not change.
sc.pp.normalize_per_cell(adata_reynolds_fb_4820STDY7388991_loom)
sc.pp.log1p(adata_reynolds_fb_4820STDY7388991_loom)

In [None]:
# NOTE(review): triku (use_adata_knn=True) runs before sc.pp.neighbors and no earlier cell
# builds a graph for this object; whether the loom file already carries one cannot be told
# from here — confirm the ordering is intended.
sc.pp.pca(adata_reynolds_fb_4820STDY7388991_loom, random_state=seed, n_comps=30)
tk.tl.triku(adata_reynolds_fb_4820STDY7388991_loom, n_procs=1, random_state=seed, use_adata_knn=True)
sc.pp.neighbors(adata_reynolds_fb_4820STDY7388991_loom, metric='cosine')

In [None]:
sc.tl.umap(adata_reynolds_fb_4820STDY7388991_loom, min_dist=0.3, random_state=seed)
sc.tl.leiden(adata_reynolds_fb_4820STDY7388991_loom, resolution=3.2, random_state=seed)

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7388991_loom, color=['leiden', 'science_clustering'], legend_loc='on data')

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7388991_loom, color=['LUM', 'PDGFRA', 'VIM', 'DCN', 'COL1A1', 'SFRP2', 'APOE', 'POSTN'], legend_loc='on data', cmap=magma)

In [None]:
sc.tl.rank_genes_groups(adata_reynolds_fb_4820STDY7388991_loom, groupby='leiden')
sc.pl.rank_genes_groups(adata_reynolds_fb_4820STDY7388991_loom)

In [None]:
# Marker genes per cell category for this sample. assign_cats scores the leiden clusters
# against each set and writes the winning category to .obs['assigned_cats'].
dict_cat = {'fb': ['LUM', 'PDGFRA', 'COL1A1'], 'peri': ['RGS5', 'MYL9', 'NDUFA4L2'], 'eritro': ['HBB', 'HBA2', 'HBA1'], 
            'immune': ['TPSB2', 'TPSAB1', 'HLA-DRA', 'FCER1G', 'CD74'], 'melanocyte': ['PMEL', 'MLANA'], 
            'mt': ['MTND2P28', 'MTND4P12', 'MTCO1P40', 'ADAM33', 'RN7SL2', 'MTRNR2L6']}

assign_cats(adata_reynolds_fb_4820STDY7388991_loom, dict_cat)

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7388991_loom, color=['leiden', 'science_clustering', 'assigned_cats'], legend_loc='on data', cmap=magma)

In [None]:
# Keep only the cells whose cluster was assigned to 'fb'; copied so that the in-place steps
# below act on a standalone object instead of a view.
adata_reynolds_fb_4820STDY7388991_loom_fb = adata_reynolds_fb_4820STDY7388991_loom[adata_reynolds_fb_4820STDY7388991_loom.obs['assigned_cats'] == 'fb'].copy()

In [None]:
sc.pp.filter_genes(adata_reynolds_fb_4820STDY7388991_loom_fb, min_counts=1)

In [None]:
# Second pass on the fibroblast-only object.
# NOTE(review): triku (use_adata_knn=True) again runs before sc.pp.neighbors, so any graph it
# reads at this point predates the fibroblast filter — confirm this is intended.
sc.pp.pca(adata_reynolds_fb_4820STDY7388991_loom_fb, random_state=seed, n_comps=30)
tk.tl.triku(adata_reynolds_fb_4820STDY7388991_loom_fb, n_procs=1, random_state=seed, use_adata_knn=True)
sc.pp.neighbors(adata_reynolds_fb_4820STDY7388991_loom_fb, metric='cosine')

In [None]:
sc.tl.umap(adata_reynolds_fb_4820STDY7388991_loom_fb, min_dist=0.2, random_state=seed)
sc.tl.leiden(adata_reynolds_fb_4820STDY7388991_loom_fb, resolution=1, random_state=seed)

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7388991_loom_fb, color=['leiden', 'science_clustering'], cmap=magma, use_raw=False, legend_loc='on data')

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7388991_loom_fb, color=['leiden', 'SFRP2', 'PI16', 'SLPI', 'WIF1', 'COL18A1', 'COMP', 
                                               'APOE', 'CCL2',  'ITM2A', 'SPSB1', 
                                               'CCDC146', 'CCL19', 'CD74', 
                                               'DKK3', 'TNN', 'SFRP1', 'POSTN'], cmap=magma, use_raw=False, legend_loc='on data')

## 4820STDY7388999 (S2)

### Reynolds preprocessed

In [None]:
# Cells of sample 4820STDY7388999 only; copied so that the in-place steps below act on a
# standalone object instead of a view of the combined dataset.
adata_reynolds_fb_4820STDY7388999 = adata_reynolds_fb_healthy[adata_reynolds_fb_healthy.obs['sample_id'].isin(['4820STDY7388999'])].copy()

In [None]:
adata_reynolds_fb_4820STDY7388999

In [None]:
sc.pp.filter_genes(adata_reynolds_fb_4820STDY7388999, min_counts=1)

In [None]:
# NOTE(review): triku is given use_adata_knn=True but runs before sc.pp.neighbors in this cell,
# so any kNN graph it reads must already be on the object — confirm this ordering is intended.
sc.pp.pca(adata_reynolds_fb_4820STDY7388999, random_state=seed, n_comps=30)
tk.tl.triku(adata_reynolds_fb_4820STDY7388999, n_procs=1, random_state=seed, use_adata_knn=True)
sc.pp.neighbors(adata_reynolds_fb_4820STDY7388999, metric='cosine')

In [None]:
sc.tl.umap(adata_reynolds_fb_4820STDY7388999, min_dist=0.3, random_state=seed)

In [None]:
sc.tl.leiden(adata_reynolds_fb_4820STDY7388999, resolution=1, random_state=seed)

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7388999, color=['leiden', 'sample_id', 'full_clustering'], legend_loc='on data')

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7388999, color=['APCDD1', 'COL18A1', 'COMP', 'SLPI', 'WIF1'], cmap=magma, use_raw=False)

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7388999, color=['MT2A', 'CCL19', 'CCL2', 'CD46'], cmap=magma, use_raw=False)

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7388999, color=['POSTN', 'COMP', 'COCH'], cmap=magma, use_raw=False)

### Direct loom

In [None]:
adata_reynolds_fb_4820STDY7388999_loom = sc.read('reynolds_2020/reynolds_2020_8_4820STDY7388999_s2_dermis_fibroblasts.loom', 
                                                 backup_url='https://zenodo.org/record/4596375/files/reynolds_2020_8_4820STDY7388999_s2_dermis_fibroblasts.loom')

adata_reynolds_fb_4820STDY7388999_loom.var_names_make_unique()

In [None]:
adata_reynolds_fb_4820STDY7388999_loom.obs_names = [f"{i.split('_')[-1]}-1-{i.split('_')[1]}" for i in adata_reynolds_fb_4820STDY7388999_loom.obs_names]

In [None]:
# Transfer the published labels ('full_clustering') to the cells shared with the Reynolds
# object; cells without a match keep the placeholder 'Other'.
df_fb_type = pd.Series('Other', index=adata_reynolds_fb_4820STDY7388999_loom.obs_names)
# Index.intersection replaces `index & index`, whose set semantics were removed from pandas.
shared_cells = adata_reynolds_fb_4820STDY7388999.obs_names.intersection(adata_reynolds_fb_4820STDY7388999_loom.obs_names)
df_fb_type.loc[shared_cells] = adata_reynolds_fb_4820STDY7388999.obs.loc[shared_cells, 'full_clustering']
adata_reynolds_fb_4820STDY7388999_loom.obs['science_clustering'] = df_fb_type

In [None]:
sc.pp.calculate_qc_metrics(adata_reynolds_fb_4820STDY7388999_loom, inplace=True)

In [None]:
sc.pl.violin(adata_reynolds_fb_4820STDY7388999_loom, ['n_genes_by_counts'])
sc.pl.violin(adata_reynolds_fb_4820STDY7388999_loom, ['log1p_total_counts'])

In [None]:
sc.pp.filter_genes(adata_reynolds_fb_4820STDY7388999_loom, min_counts=30)
adata_reynolds_fb_4820STDY7388999_loom.raw = adata_reynolds_fb_4820STDY7388999_loom

In [None]:
# Per-cell normalisation followed by log1p.
# NOTE(review): scanpy documents normalize_per_cell as deprecated in favour of normalize_total;
# consider migrating after confirming the results do not change.
sc.pp.normalize_per_cell(adata_reynolds_fb_4820STDY7388999_loom)
sc.pp.log1p(adata_reynolds_fb_4820STDY7388999_loom)

In [None]:
# NOTE(review): triku (use_adata_knn=True) runs before sc.pp.neighbors and no earlier cell
# builds a graph for this object — confirm the ordering is intended.
sc.pp.pca(adata_reynolds_fb_4820STDY7388999_loom, random_state=seed, n_comps=30)
tk.tl.triku(adata_reynolds_fb_4820STDY7388999_loom, n_procs=1, random_state=seed, use_adata_knn=True)
sc.pp.neighbors(adata_reynolds_fb_4820STDY7388999_loom, metric='cosine')

In [None]:
sc.tl.umap(adata_reynolds_fb_4820STDY7388999_loom, min_dist=0.3, random_state=seed)
sc.tl.leiden(adata_reynolds_fb_4820STDY7388999_loom, resolution=0.5, random_state=seed)

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7388999_loom, color=['leiden', 'science_clustering'], legend_loc='on data')

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7388999_loom, color=['LUM', 'PDGFRA', 'VIM', 'COL1A1', 'SFRP2', 'APOE', 'POSTN'], legend_loc='on data', cmap=magma)

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7388999_loom, color=['leiden', 'SFRP2', 'PI16', 'SLPI', 'WIF1', 'COL18A1', 'COMP', 
                                               'APOE', 'CCL2',  'ITM2A', 'SPSB1', 
                                               'CCDC146', 'CCL19', 'CD74', 
                                               'DKK3', 'TNN', 'SFRP1'], cmap=magma, use_raw=False, legend_loc='on data')

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7388999_loom, color=['leiden', 
                                                          'DMKN', 'KRT1'], cmap=magma, use_raw=False, legend_loc='on data')

In [None]:
# Marker genes per cell category for this sample. assign_cats scores the leiden clusters
# against each set and writes the winning category to .obs['assigned_cats'].
dict_cat = {'fb': ['LUM', 'PDGFRA', 'COL1A1', 'SFRP2', 'CCL19'], 'peri': ['RGS5', 'MYL9', 'NDUFA4L2'], 'eritro': ['HBB', 'HBA2', 'HBA1'], 
            'immune': ['TPSB2', 'TRAC', 'TPSAB1', 'HLA-DRA', 'FCER1G', 'CD74'], 'endo': ['CLDN5', 'PECAM1'], 'kerato': ['DMKN', 'KRT1'],
            'mt': ['MTND2P28', 'MTND4P12', 'MTCO1P40', 'ADAM33', 'RN7SL2', 'MTRNR2L6']}

assign_cats(adata_reynolds_fb_4820STDY7388999_loom, dict_cat)

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7388999_loom, color=['leiden', 'science_clustering', 'assigned_cats'], legend_loc='on data', cmap=magma)

In [None]:
sc.tl.rank_genes_groups(adata_reynolds_fb_4820STDY7388999_loom, groupby='leiden')
sc.pl.rank_genes_groups(adata_reynolds_fb_4820STDY7388999_loom)

In [None]:
# Keep only the cells whose cluster was assigned to 'fb'; copied so that the in-place steps
# below act on a standalone object instead of a view.
adata_reynolds_fb_4820STDY7388999_loom = adata_reynolds_fb_4820STDY7388999_loom[adata_reynolds_fb_4820STDY7388999_loom.obs['assigned_cats'] == 'fb'].copy()

In [None]:
sc.pp.filter_genes(adata_reynolds_fb_4820STDY7388999_loom, min_counts=1)

In [None]:
# Second pass on the fibroblast-only object.
# NOTE(review): triku (use_adata_knn=True) again runs before sc.pp.neighbors, so any graph it
# reads at this point predates the fibroblast filter — confirm this is intended.
sc.pp.pca(adata_reynolds_fb_4820STDY7388999_loom, random_state=seed, n_comps=30)
tk.tl.triku(adata_reynolds_fb_4820STDY7388999_loom, n_procs=1, random_state=seed, use_adata_knn=True)
sc.pp.neighbors(adata_reynolds_fb_4820STDY7388999_loom, metric='cosine')

In [None]:
sc.tl.umap(adata_reynolds_fb_4820STDY7388999_loom, min_dist=0.05, random_state=seed)
sc.tl.leiden(adata_reynolds_fb_4820STDY7388999_loom, resolution=1.2, random_state=seed)

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7388999_loom, color=['leiden', 'science_clustering'], cmap=magma, use_raw=False, legend_loc='on data')

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7388999_loom, color=['leiden', 'SFRP2', 'PI16', 'SLPI', 'WIF1', 'COL18A1', 'COMP', 
                                               'APOE', 'CCL2',  'ITM2A', 'SPSB1', 
                                               'CCDC146', 'CCL19', 'CD74', 
                                               'DKK3', 'TNN', 'SFRP1', 'EDNRB', 'IGFBP7'], cmap=magma, use_raw=False, legend_loc='on data')

In [None]:
sc.tl.rank_genes_groups(adata_reynolds_fb_4820STDY7388999_loom, groupby='leiden')

In [None]:
sc.pl.rank_genes_groups(adata_reynolds_fb_4820STDY7388999_loom)

## 4820STDY7389007 (S3)

### Reynolds preprocessed

In [None]:
# Cells of sample 4820STDY7389007 only; copied so that the in-place steps below act on a
# standalone object instead of a view of the combined dataset.
adata_reynolds_fb_4820STDY7389007 = adata_reynolds_fb_healthy[adata_reynolds_fb_healthy.obs['sample_id'].isin(['4820STDY7389007'])].copy()

In [None]:
adata_reynolds_fb_4820STDY7389007

In [None]:
sc.pp.filter_genes(adata_reynolds_fb_4820STDY7389007, min_counts=1)

In [None]:
# NOTE(review): triku is given use_adata_knn=True but runs before sc.pp.neighbors in this cell,
# so any kNN graph it reads must already be on the object — confirm this ordering is intended.
sc.pp.pca(adata_reynolds_fb_4820STDY7389007, random_state=seed, n_comps=30)
tk.tl.triku(adata_reynolds_fb_4820STDY7389007, n_procs=1, random_state=seed, use_adata_knn=True)
sc.pp.neighbors(adata_reynolds_fb_4820STDY7389007, metric='cosine')

In [None]:
sc.tl.umap(adata_reynolds_fb_4820STDY7389007, min_dist=0.3, random_state=seed)

In [None]:
sc.tl.leiden(adata_reynolds_fb_4820STDY7389007, resolution=1, random_state=seed)

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7389007, color=['leiden', 'sample_id', 'full_clustering'], legend_loc='on data')

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7389007, color=['APCDD1', 'COL18A1', 'COMP', 'SLPI', 'WIF1'], cmap=magma, use_raw=False)

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7389007, color=['MT2A', 'CCL19', 'CCL2', 'CD46'], cmap=magma, use_raw=False)

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7389007, color=['POSTN', 'COMP', 'COCH'], cmap=magma, use_raw=False)

### Direct loom

In [None]:
adata_reynolds_fb_4820STDY7389007_loom = sc.read('reynolds_2020/reynolds_2020_16_4820STDY7389007_s3_dermis_fibroblasts.loom', 
                                                 backup_url='https://zenodo.org/record/4596375/files/reynolds_2020_16_4820STDY7389007_s3_dermis_fibroblasts.loom')

adata_reynolds_fb_4820STDY7389007_loom.var_names_make_unique()

In [None]:
adata_reynolds_fb_4820STDY7389007_loom.obs_names = [f"{i.split('_')[-1]}-1-{i.split('_')[1]}" for i in adata_reynolds_fb_4820STDY7389007_loom.obs_names]

In [None]:
# Transfer the published labels ('full_clustering') to the cells shared with the Reynolds
# object; cells without a match keep the placeholder '-'.
df_fb_type = pd.Series('-', index=adata_reynolds_fb_4820STDY7389007_loom.obs_names)
# Index.intersection replaces `index & index`, whose set semantics were removed from pandas.
shared_cells = adata_reynolds_fb_4820STDY7389007.obs_names.intersection(adata_reynolds_fb_4820STDY7389007_loom.obs_names)
df_fb_type.loc[shared_cells] = adata_reynolds_fb_4820STDY7389007.obs.loc[shared_cells, 'full_clustering']
adata_reynolds_fb_4820STDY7389007_loom.obs['science_clustering'] = df_fb_type

In [None]:
sc.pp.calculate_qc_metrics(adata_reynolds_fb_4820STDY7389007_loom, inplace=True)

In [None]:
sc.pl.violin(adata_reynolds_fb_4820STDY7389007_loom, ['n_genes_by_counts'])
sc.pl.violin(adata_reynolds_fb_4820STDY7389007_loom, ['log1p_total_counts'])

In [None]:
sc.pp.filter_genes(adata_reynolds_fb_4820STDY7389007_loom, min_counts=30)
adata_reynolds_fb_4820STDY7389007_loom.raw = adata_reynolds_fb_4820STDY7389007_loom

In [None]:
# Per-cell normalisation followed by log1p.
# NOTE(review): scanpy documents normalize_per_cell as deprecated in favour of normalize_total;
# consider migrating after confirming the results do not change.
sc.pp.normalize_per_cell(adata_reynolds_fb_4820STDY7389007_loom)
sc.pp.log1p(adata_reynolds_fb_4820STDY7389007_loom)

In [None]:
# NOTE(review): triku (use_adata_knn=True) runs before sc.pp.neighbors and no earlier cell
# builds a graph for this object — confirm the ordering is intended.
sc.pp.pca(adata_reynolds_fb_4820STDY7389007_loom, random_state=seed, n_comps=30)
tk.tl.triku(adata_reynolds_fb_4820STDY7389007_loom, n_procs=1, random_state=seed, use_adata_knn=True)
sc.pp.neighbors(adata_reynolds_fb_4820STDY7389007_loom, metric='cosine')

In [None]:
sc.tl.umap(adata_reynolds_fb_4820STDY7389007_loom, min_dist=0.3, random_state=seed)
sc.tl.leiden(adata_reynolds_fb_4820STDY7389007_loom, resolution=0.8, random_state=seed)

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7389007_loom, color=['leiden', 'science_clustering'], legend_loc='on data')

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7389007_loom, color=['LUM', 'PDGFRA', 'VIM', 'COL1A1', 'SFRP2', 'APOE', 'POSTN', 'RGS5', 'MYL9', 'NDUFA4L2', 'HBB'], legend_loc='on data', cmap=magma)

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7389007_loom, color=['leiden', 'SFRP2', 'PI16', 'SLPI', 'WIF1', 'COL18A1', 'COMP', 
                                               'APOE', 'CCL2',  'ITM2A', 'SPSB1', 
                                               'CCDC146', 'CCL19', 'CD74'], cmap=magma, use_raw=False, legend_loc='on data')

In [None]:
# Marker genes per cell category for this sample. assign_cats scores the leiden clusters
# against each set and writes the winning category to .obs['assigned_cats'].
dict_cat = {'fb': ['LUM', 'PDGFRA', 'COL1A1', 'SFRP2', 'CCL19'], 'peri': ['RGS5', 'MYL9', 'NDUFA4L2'], 'eritro': ['HBB', 'HBA2', 'HBA1'], 
            'immune': ['TPSB2', 'TRAC', 'TPSAB1', 'HLA-DRA', 'FCER1G'], 'kerato': ['DMKN', 'KRT1'],
            'mt': ['MTND2P28', 'MTND4P12', 'MTCO1P40', 'ADAM33', 'RN7SL2', 'MTRNR2L6']}

assign_cats(adata_reynolds_fb_4820STDY7389007_loom, dict_cat)

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7389007_loom, color=['leiden', 'science_clustering', 'assigned_cats'], legend_loc='on data', cmap=magma)

In [None]:
sc.tl.rank_genes_groups(adata_reynolds_fb_4820STDY7389007_loom, groupby='leiden')
sc.pl.rank_genes_groups(adata_reynolds_fb_4820STDY7389007_loom)

In [None]:
# Keep only the cells whose cluster was assigned to 'fb'; copied so that the in-place steps
# below act on a standalone object instead of a view.
adata_reynolds_fb_4820STDY7389007_loom = adata_reynolds_fb_4820STDY7389007_loom[adata_reynolds_fb_4820STDY7389007_loom.obs['assigned_cats'] == 'fb'].copy()

In [None]:
sc.pp.filter_genes(adata_reynolds_fb_4820STDY7389007_loom, min_counts=1)

In [None]:
# Second pass on the fibroblast-only object.
# NOTE(review): triku (use_adata_knn=True) again runs before sc.pp.neighbors, so any graph it
# reads at this point predates the fibroblast filter — confirm this is intended.
sc.pp.pca(adata_reynolds_fb_4820STDY7389007_loom, random_state=seed, n_comps=30)
tk.tl.triku(adata_reynolds_fb_4820STDY7389007_loom, n_procs=1, random_state=seed, use_adata_knn=True)
sc.pp.neighbors(adata_reynolds_fb_4820STDY7389007_loom, metric='cosine')

In [None]:
sc.tl.umap(adata_reynolds_fb_4820STDY7389007_loom, min_dist=0.05, random_state=seed)
sc.tl.leiden(adata_reynolds_fb_4820STDY7389007_loom, resolution=1.2, random_state=seed)

In [None]:
adata_reynolds_fb_4820STDY7389007_loom

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7389007_loom, color=['leiden', 'science_clustering'], cmap=magma, use_raw=False, legend_loc='on data')

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7389007_loom, color=['leiden', 'SFRP2', 'PI16', 'SLPI', 'WIF1', 'COL18A1', 'COMP', 
                                               'APOE', 'CCL2',  'ITM2A', 'SPSB1', 
                                               'CCDC146', 'CCL19', 'CD74'], cmap=magma, use_raw=False, legend_loc='on data')

## SKN8104899 (S4)

### Reynolds preprocessed

In [None]:
# Cells of sample SKN8104899 only; copied so that the in-place steps below act on a
# standalone object instead of a view of the combined dataset.
adata_reynolds_fb_SKN8104899 = adata_reynolds_fb_healthy[adata_reynolds_fb_healthy.obs['sample_id'].isin(['SKN8104899'])].copy()

In [None]:
adata_reynolds_fb_SKN8104899

In [None]:
sc.pp.filter_genes(adata_reynolds_fb_SKN8104899, min_counts=1)

In [None]:
# NOTE(review): triku is given use_adata_knn=True but runs before sc.pp.neighbors in this cell,
# so any kNN graph it reads must already be on the object — confirm this ordering is intended.
sc.pp.pca(adata_reynolds_fb_SKN8104899, random_state=seed, n_comps=30)
tk.tl.triku(adata_reynolds_fb_SKN8104899, n_procs=1, random_state=seed, use_adata_knn=True)
sc.pp.neighbors(adata_reynolds_fb_SKN8104899, metric='cosine')

In [None]:
sc.tl.umap(adata_reynolds_fb_SKN8104899, min_dist=0.3, random_state=seed)

In [None]:
sc.tl.leiden(adata_reynolds_fb_SKN8104899, resolution=1, random_state=seed)

In [None]:
sc.pl.umap(adata_reynolds_fb_SKN8104899, color=['leiden', 'sample_id', 'full_clustering'], legend_loc='on data')

In [None]:
sc.pl.umap(adata_reynolds_fb_SKN8104899, color=['APCDD1', 'COL18A1', 'COMP', 'SLPI', 'WIF1'], cmap=magma, use_raw=False)

In [None]:
sc.pl.umap(adata_reynolds_fb_SKN8104899, color=['MT2A', 'CCL19', 'CCL2', 'CD46'], cmap=magma, use_raw=False)

### Direct loom

In [None]:
adata_reynolds_fb_SKN8104899_loom = sc.read('reynolds_2020/reynolds_2020_84_SKN8104899_S4_dermis_fibroblasts.loom', 
                                                 backup_url='https://zenodo.org/record/4596375/files/reynolds_2020_84_SKN8104899_S4_dermis_fibroblasts.loom')

adata_reynolds_fb_SKN8104899_loom.var_names_make_unique()

In [None]:
adata_reynolds_fb_SKN8104899_loom.obs_names = [f"{i.split('_')[-1]}-1-{i.split('_')[1]}" for i in adata_reynolds_fb_SKN8104899_loom.obs_names]

In [None]:
# Transfer the published labels ('full_clustering') to the cells shared with the Reynolds
# object; cells without a match keep the placeholder '-'.
df_fb_type = pd.Series('-', index=adata_reynolds_fb_SKN8104899_loom.obs_names)
# Index.intersection replaces `index & index`, whose set semantics were removed from pandas.
shared_cells = adata_reynolds_fb_SKN8104899.obs_names.intersection(adata_reynolds_fb_SKN8104899_loom.obs_names)
df_fb_type.loc[shared_cells] = adata_reynolds_fb_SKN8104899.obs.loc[shared_cells, 'full_clustering']
adata_reynolds_fb_SKN8104899_loom.obs['science_clustering'] = df_fb_type

In [None]:
sc.pp.calculate_qc_metrics(adata_reynolds_fb_SKN8104899_loom, inplace=True)

In [None]:
sc.pl.violin(adata_reynolds_fb_SKN8104899_loom, ['n_genes_by_counts'])
sc.pl.violin(adata_reynolds_fb_SKN8104899_loom, ['log1p_total_counts'])

In [None]:
sc.pp.filter_genes(adata_reynolds_fb_SKN8104899_loom, min_counts=30)
adata_reynolds_fb_SKN8104899_loom.raw = adata_reynolds_fb_SKN8104899_loom

In [None]:
# Per-cell normalisation followed by log1p.
# NOTE(review): scanpy documents normalize_per_cell as deprecated in favour of normalize_total;
# consider migrating after confirming the results do not change.
sc.pp.normalize_per_cell(adata_reynolds_fb_SKN8104899_loom)
sc.pp.log1p(adata_reynolds_fb_SKN8104899_loom)

In [None]:
# NOTE(review): triku (use_adata_knn=True) runs before sc.pp.neighbors and no earlier cell
# builds a graph for this object — confirm the ordering is intended.
sc.pp.pca(adata_reynolds_fb_SKN8104899_loom, random_state=seed, n_comps=30)
tk.tl.triku(adata_reynolds_fb_SKN8104899_loom, n_procs=1, random_state=seed, use_adata_knn=True)
sc.pp.neighbors(adata_reynolds_fb_SKN8104899_loom, metric='cosine')

In [None]:
sc.tl.umap(adata_reynolds_fb_SKN8104899_loom, min_dist=0.3, random_state=seed)
sc.tl.leiden(adata_reynolds_fb_SKN8104899_loom, resolution=0.8, random_state=seed)

In [None]:
sc.pl.umap(adata_reynolds_fb_SKN8104899_loom, color=['leiden', 'science_clustering'], legend_loc='on data')

In [None]:
sc.pl.umap(adata_reynolds_fb_SKN8104899_loom, color=['LUM', 'PDGFRA', 'VIM', 'COL1A1', 'SFRP2', 'APOE', 'POSTN', 'RGS5', 'MYL9', 'NDUFA4L2', 'HBB'], legend_loc='on data', cmap=magma)

In [None]:
sc.pl.umap(adata_reynolds_fb_SKN8104899_loom, color=['leiden', 'SFRP2', 'PI16', 'SLPI', 'WIF1', 'COL18A1', 'COMP', 
                                               'APOE', 'CCL2',  'ITM2A', 'SPSB1', 
                                               'CCDC146', 'CCL19', 'CD74', 'POSTN'], cmap=magma, use_raw=False, legend_loc='on data')

In [None]:
# Marker genes per cell category for this sample. assign_cats scores the leiden clusters
# against each set and writes the winning category to .obs['assigned_cats'].
dict_cat = {'fb': ['LUM', 'PDGFRA', 'COL1A1', 'SFRP2', 'CCL19'], 'peri': ['RGS5', 'MYL9', 'NDUFA4L2'], 'eritro': ['HBB', 'HBA2', 'HBA1'], 
            'immune': ['TRAC', 'HLA-DRA', 'FCER1G'], 'kerato': ['DMKN', 'KRT1'],
            'mt': ['MTND4P12', 'ADAM33', 'RN7SL2', ]}

assign_cats(adata_reynolds_fb_SKN8104899_loom, dict_cat)

In [None]:
sc.pl.umap(adata_reynolds_fb_SKN8104899_loom, color=['leiden', 'science_clustering', 'assigned_cats'], legend_loc='on data', cmap=magma)

In [None]:
sc.tl.rank_genes_groups(adata_reynolds_fb_SKN8104899_loom, groupby='leiden')
sc.pl.rank_genes_groups(adata_reynolds_fb_SKN8104899_loom)

In [None]:
# Keep only the cells whose cluster was assigned to 'fb'; copied so that the in-place steps
# below act on a standalone object instead of a view.
adata_reynolds_fb_SKN8104899_loom = adata_reynolds_fb_SKN8104899_loom[adata_reynolds_fb_SKN8104899_loom.obs['assigned_cats'] == 'fb'].copy()

In [None]:
sc.pp.filter_genes(adata_reynolds_fb_SKN8104899_loom, min_counts=1)

In [None]:
# Second pass on the fibroblast-only object.
# NOTE(review): triku (use_adata_knn=True) again runs before sc.pp.neighbors, so any graph it
# reads at this point predates the fibroblast filter — confirm this is intended.
sc.pp.pca(adata_reynolds_fb_SKN8104899_loom, random_state=seed, n_comps=30)
tk.tl.triku(adata_reynolds_fb_SKN8104899_loom, n_procs=1, random_state=seed, use_adata_knn=True)
sc.pp.neighbors(adata_reynolds_fb_SKN8104899_loom, metric='cosine')

In [None]:
sc.tl.umap(adata_reynolds_fb_SKN8104899_loom, min_dist=0.05, random_state=seed)
sc.tl.leiden(adata_reynolds_fb_SKN8104899_loom, resolution=1.2, random_state=seed)

In [None]:
sc.pl.umap(adata_reynolds_fb_SKN8104899_loom, color=['leiden', 'science_clustering'], cmap=magma, use_raw=False, legend_loc='on data')

In [None]:
sc.pl.umap(adata_reynolds_fb_SKN8104899_loom, color=['leiden', 'SFRP2', 'PI16', 'SLPI', 'WIF1', 'COL18A1', 'COMP', 
                                               'APOE', 'CCL2',  'ITM2A', 'SPSB1', 
                                               'CCDC146', 'CCL19', 'CD74'], cmap=magma, use_raw=False, legend_loc='on data')

In [None]:
sc.tl.rank_genes_groups(adata_reynolds_fb_SKN8104899_loom, groupby='leiden')
sc.pl.rank_genes_groups(adata_reynolds_fb_SKN8104899_loom)

## SKN8105197 (S5)

### Reynolds preprocessed

In [None]:
# Cells of sample SKN8105197 only; copied so that the in-place steps below act on a
# standalone object instead of a view of the combined dataset.
adata_reynolds_fb_SKN8105197 = adata_reynolds_fb_healthy[adata_reynolds_fb_healthy.obs['sample_id'].isin(['SKN8105197'])].copy()

In [None]:
adata_reynolds_fb_SKN8105197

In [None]:
sc.pp.filter_genes(adata_reynolds_fb_SKN8105197, min_counts=1)

In [None]:
# NOTE(review): triku is given use_adata_knn=True but runs before sc.pp.neighbors in this cell,
# so any kNN graph it reads must already be on the object — confirm this ordering is intended.
sc.pp.pca(adata_reynolds_fb_SKN8105197, random_state=seed, n_comps=30)
tk.tl.triku(adata_reynolds_fb_SKN8105197, n_procs=1, random_state=seed, use_adata_knn=True)
sc.pp.neighbors(adata_reynolds_fb_SKN8105197, metric='cosine')

In [None]:
sc.tl.umap(adata_reynolds_fb_SKN8105197, min_dist=0.3, random_state=seed)

In [None]:
sc.tl.leiden(adata_reynolds_fb_SKN8105197, resolution=1, random_state=seed)

In [None]:
sc.pl.umap(adata_reynolds_fb_SKN8105197, color=['leiden', 'sample_id', 'full_clustering'], legend_loc='on data')

In [None]:
sc.pl.umap(adata_reynolds_fb_SKN8105197, color=['APCDD1', 'COL18A1', 'COMP', 'SLPI', 'WIF1'], cmap=magma, use_raw=False)

In [None]:
sc.pl.umap(adata_reynolds_fb_SKN8105197, color=['MT2A', 'CCL19', 'CCL2', 'CD46'], cmap=magma, use_raw=False)

### Direct loom

In [None]:
# Read the loom file of sample SKN8105197 (S5); the Zenodo address is passed as backup_url.
# Gene names are then made unique.
adata_reynolds_fb_SKN8105197_loom = sc.read('reynolds_2020/reynolds_2020_92_SKN8105197_S5_dermis_fibroblasts.loom', 
                                                 backup_url='https://zenodo.org/record/4596375/files/reynolds_2020_92_SKN8105197_S5_dermis_fibroblasts.loom')
adata_reynolds_fb_SKN8105197_loom.var_names_make_unique()

In [None]:
# Rewrite each cell name as '<last field>-1-<second field>' of its '_'-separated loom name.
# NOTE(review): presumably this reproduces the naming used by the preprocessed Reynolds object,
# since the names are intersected with it in the following cell — confirm.
adata_reynolds_fb_SKN8105197_loom.obs_names = [f"{i.split('_')[-1]}-1-{i.split('_')[1]}" for i in adata_reynolds_fb_SKN8105197_loom.obs_names]

In [None]:
# Copy the 'full_clustering' labels of the preprocessed object onto the loom cells.
# Cells that are not present in the preprocessed object keep the placeholder '-'.
# Index.intersection() replaces the `&` operator: `&` between two pandas Index objects is
# deprecated as a set operation in pandas 1.x and no longer computes an intersection in 2.x.
shared_cells_SKN8105197 = adata_reynolds_fb_SKN8105197.obs_names.intersection(adata_reynolds_fb_SKN8105197_loom.obs_names)

df_fb_type = pd.Series('-', index=adata_reynolds_fb_SKN8105197_loom.obs_names)
df_fb_type.loc[shared_cells_SKN8105197] = adata_reynolds_fb_SKN8105197[shared_cells_SKN8105197].obs['full_clustering']
adata_reynolds_fb_SKN8105197_loom.obs['science_clustering'] = df_fb_type

In [None]:
# Add the QC metric columns to the object in place.
sc.pp.calculate_qc_metrics(adata_reynolds_fb_SKN8105197_loom, inplace=True)

In [None]:
# Distribution of two of the QC metrics computed above.
sc.pl.violin(adata_reynolds_fb_SKN8105197_loom, ['n_genes_by_counts'])
sc.pl.violin(adata_reynolds_fb_SKN8105197_loom, ['log1p_total_counts'])

In [None]:
# Keep genes with at least 30 counts, then snapshot the object into .raw
# (i.e. after gene filtering but before normalisation and log-transform).
sc.pp.filter_genes(adata_reynolds_fb_SKN8105197_loom, min_counts=30)
adata_reynolds_fb_SKN8105197_loom.raw = adata_reynolds_fb_SKN8105197_loom

In [None]:
# Per-cell normalisation followed by log1p.
# NOTE(review): sc.pp.normalize_per_cell is a legacy scanpy function (sc.pp.normalize_total is
# its documented successor); the two are not drop-in equivalents, so it is left unchanged.
sc.pp.normalize_per_cell(adata_reynolds_fb_SKN8105197_loom)
sc.pp.log1p(adata_reynolds_fb_SKN8105197_loom)

In [None]:
# PCA (30 components), triku, then a cosine-metric neighbour graph.
# NOTE(review): triku is called with use_adata_knn=True before sc.pp.neighbors has been run in
# this section — confirm which neighbour graph it is expected to use.
sc.pp.pca(adata_reynolds_fb_SKN8105197_loom, random_state=seed, n_comps=30)
tk.tl.triku(adata_reynolds_fb_SKN8105197_loom, n_procs=1, random_state=seed, use_adata_knn=True)
sc.pp.neighbors(adata_reynolds_fb_SKN8105197_loom, metric='cosine')

In [None]:
# Seeded UMAP embedding and Leiden clustering.
sc.tl.umap(adata_reynolds_fb_SKN8105197_loom, min_dist=0.3, random_state=seed)
sc.tl.leiden(adata_reynolds_fb_SKN8105197_loom, resolution=1, random_state=seed)

In [None]:
# Leiden clusters next to the transferred 'science_clustering' labels.
sc.pl.umap(adata_reynolds_fb_SKN8105197_loom, color=['leiden', 'science_clustering'], legend_loc='on data')

In [None]:
# Genes that are also used as category markers in `dict_cat` two cells further down
# (fibroblast, pericyte and erythrocyte entries), plus VIM and APOE.
sc.pl.umap(adata_reynolds_fb_SKN8105197_loom, color=['LUM', 'PDGFRA', 'VIM', 'COL1A1', 'SFRP2', 'APOE', 'RGS5', 'MYL9', 'NDUFA4L2', 'HBB'], legend_loc='on data', cmap=magma)

In [None]:
# Fibroblast-population marker genes (use_raw=False: do not plot from adata.raw).
sc.pl.umap(adata_reynolds_fb_SKN8105197_loom, color=['leiden', 'SFRP2', 'SLPI', 'WIF1', 'COL18A1', 'COMP', 
                                               'APOE', 'CCL2',  'ITM2A', 'SPSB1', 
                                               'CCDC146', 'CCL19', 'CD74'], cmap=magma, use_raw=False, legend_loc='on data')

In [None]:
# Marker sets for the coarse categories; assign_cats writes its result to
# adata.obs['assigned_cats'], which is plotted and filtered on ('fb') in later cells.
dict_cat = {'fb': ['LUM', 'PDGFRA', 'COL1A1', 'SFRP2', 'CCL19'], 'peri': ['RGS5', 'MYL9', 'NDUFA4L2'], 'eritro': ['HBB', 'HBA2', 'HBA1'], 
            'immune': ['HLA-DRA', 'FCER1G'], 'melano':['PMEL', 'MLANA']}

assign_cats(adata_reynolds_fb_SKN8105197_loom, dict_cat)

In [None]:
# Clusters, transferred labels and the newly assigned categories side by side.
sc.pl.umap(adata_reynolds_fb_SKN8105197_loom, color=['leiden', 'science_clustering', 'assigned_cats'], legend_loc='on data', cmap=magma)

In [None]:
# Differential expression per Leiden cluster, followed by the summary plot.
sc.tl.rank_genes_groups(adata_reynolds_fb_SKN8105197_loom, groupby='leiden')
sc.pl.rank_genes_groups(adata_reynolds_fb_SKN8105197_loom)

In [None]:
# Keep only the cells assigned to the 'fb' category.
# .copy() turns the slice (a view) into an independent object before the in-place steps that
# follow (filter_genes, pca, ...), matching how the per-sample subsets are created later on.
adata_reynolds_fb_SKN8105197_loom = adata_reynolds_fb_SKN8105197_loom[adata_reynolds_fb_SKN8105197_loom.obs['assigned_cats'] == 'fb'].copy()

In [None]:
# Drop genes whose total falls below min_counts=1 in the fibroblast-only subset.
sc.pp.filter_genes(adata_reynolds_fb_SKN8105197_loom, min_counts=1)

In [None]:
# Recompute PCA, triku and the cosine-metric neighbour graph on the subset.
sc.pp.pca(adata_reynolds_fb_SKN8105197_loom, random_state=seed, n_comps=30)
tk.tl.triku(adata_reynolds_fb_SKN8105197_loom, n_procs=1, random_state=seed, use_adata_knn=True)
sc.pp.neighbors(adata_reynolds_fb_SKN8105197_loom, metric='cosine')

In [None]:
# Seeded UMAP and Leiden clustering (tighter min_dist and higher resolution than before the subset).
sc.tl.umap(adata_reynolds_fb_SKN8105197_loom, min_dist=0.05, random_state=seed)
sc.tl.leiden(adata_reynolds_fb_SKN8105197_loom, resolution=1.2, random_state=seed)

In [None]:
# Leiden clusters next to the transferred 'science_clustering' labels.
sc.pl.umap(adata_reynolds_fb_SKN8105197_loom, color=['leiden', 'science_clustering'], cmap=magma, use_raw=False, legend_loc='on data')

In [None]:
# Fibroblast-population marker genes on the fibroblast-only embedding.
sc.pl.umap(adata_reynolds_fb_SKN8105197_loom, color=['leiden', 'SFRP2', 'SLPI', 'WIF1', 'COL18A1', 'COMP', 
                                               'APOE', 'CCL2',  'ITM2A', 'SPSB1', 
                                               'CCDC146', 'CCL19', 'CD74'], cmap=magma, use_raw=False, legend_loc='on data')

### SKN8105197 exclusion
We do not see expression of SFRP2, which, together with APOE, has been a consistent fb marker throughout. We will not include this sample in the analysis.

# Preparing adatas for FB1+3 and FB2

We've realized that FB1+3 and FB2 show the same main populations based on JID (mainly A1, A2 and B2). We want to obtain distinct populations based on marker expression, and also be able to separate these two populations. The problem is that the FB1+3 and FB2 populations, although not fully overlapping, do show some overlap, and we would like to separate them based only on marker genes and clustering. These markers should be shared between all datasets. To do that, we are going to do a preliminary DEG analysis and find markers that separate these populations in an unbiased manner.

## Finding markers to separate 1+3 from 2

**This part may not be reproducible in other notebooks. This part is to get markers that will be reproducible later on**

In [None]:
# S1 -- fine-grained Leiden clustering, then every cluster is labelled '1+3' or '2'.
sc.tl.leiden(
    adata_reynolds_fb_4820STDY7388991_loom_fb,
    resolution=3,
    random_state=seed,
)
sc.pl.umap(
    adata_reynolds_fb_4820STDY7388991_loom_fb,
    color=['leiden', 'science_clustering'],
    cmap=magma,
    use_raw=False,
    legend_loc='on data',
)
# Hard-coded cluster ids: they only make sense for the Leiden run just above.
adata_reynolds_fb_4820STDY7388991_loom_fb.obs['fb_type'] = [
    '2' if leiden_id not in ('3', '16', '12', '19', '6', '17', '8', '2', '9', '11', '14', '0', '21', '22') else '1+3'
    for leiden_id in adata_reynolds_fb_4820STDY7388991_loom_fb.obs['leiden']
]
sc.pl.umap(
    adata_reynolds_fb_4820STDY7388991_loom_fb,
    color=['fb_type', 'science_clustering'],
    cmap=magma,
    use_raw=False,
    legend_loc='on data',
)

In [None]:
# S1: rank genes for '1+3' against '2' (ranked by absolute score), then store the gene names
# in a frame indexed by score and sorted from the highest to the lowest score.
sc.tl.rank_genes_groups(
    adata_reynolds_fb_4820STDY7388991_loom_fb,
    groupby='fb_type',
    groups=['1+3'],
    reference='2',
    rankby_abs=True,
)
df_rank_s1 = pd.DataFrame(
    data=adata_reynolds_fb_4820STDY7388991_loom_fb.uns['rank_genes_groups']['names']['1+3'],
    index=adata_reynolds_fb_4820STDY7388991_loom_fb.uns['rank_genes_groups']['scores']['1+3'],
).sort_index(ascending=False)

In [None]:
# S2 -- fine-grained Leiden clustering, then every cluster is labelled '1+3' or '2'.
sc.tl.leiden(
    adata_reynolds_fb_4820STDY7388999_loom,
    resolution=3,
    random_state=seed,
)
sc.pl.umap(
    adata_reynolds_fb_4820STDY7388999_loom,
    color=['leiden', 'science_clustering'],
    cmap=magma,
    use_raw=False,
    legend_loc='on data',
)
# Hard-coded cluster ids: they only make sense for the Leiden run just above.
adata_reynolds_fb_4820STDY7388999_loom.obs['fb_type'] = [
    '1+3' if leiden_id not in ('15', '16', '19') else '2'
    for leiden_id in adata_reynolds_fb_4820STDY7388999_loom.obs['leiden']
]
sc.pl.umap(
    adata_reynolds_fb_4820STDY7388999_loom,
    color=['fb_type', 'science_clustering'],
    cmap=magma,
    use_raw=False,
    legend_loc='on data',
)

In [None]:
# S2: rank genes for '1+3' against '2' (ranked by absolute score), then store the gene names
# in a frame indexed by score and sorted from the highest to the lowest score.
sc.tl.rank_genes_groups(
    adata_reynolds_fb_4820STDY7388999_loom,
    groupby='fb_type',
    groups=['1+3'],
    reference='2',
    rankby_abs=True,
)
df_rank_s2 = pd.DataFrame(
    data=adata_reynolds_fb_4820STDY7388999_loom.uns['rank_genes_groups']['names']['1+3'],
    index=adata_reynolds_fb_4820STDY7388999_loom.uns['rank_genes_groups']['scores']['1+3'],
).sort_index(ascending=False)

In [None]:
# S3 -- fine-grained Leiden clustering, then every cluster is labelled '1+3' or '2'.
sc.tl.leiden(
    adata_reynolds_fb_4820STDY7389007_loom,
    resolution=2.8,
    random_state=seed,
)
sc.pl.umap(
    adata_reynolds_fb_4820STDY7389007_loom,
    color=['leiden', 'science_clustering'],
    cmap=magma,
    use_raw=False,
    legend_loc='on data',
)
# Hard-coded cluster ids: they only make sense for the Leiden run just above.
adata_reynolds_fb_4820STDY7389007_loom.obs['fb_type'] = [
    '1+3' if leiden_id not in ('0', '17', '16', '13', '4', '1', '19') else '2'
    for leiden_id in adata_reynolds_fb_4820STDY7389007_loom.obs['leiden']
]
sc.pl.umap(
    adata_reynolds_fb_4820STDY7389007_loom,
    color=['fb_type', 'science_clustering'],
    cmap=magma,
    use_raw=False,
    legend_loc='on data',
)

In [None]:
# S3: rank genes for '1+3' against '2' (ranked by absolute score), then store the gene names
# in a frame indexed by score and sorted from the highest to the lowest score.
sc.tl.rank_genes_groups(
    adata_reynolds_fb_4820STDY7389007_loom,
    groupby='fb_type',
    groups=['1+3'],
    reference='2',
    rankby_abs=True,
)
df_rank_s3 = pd.DataFrame(
    data=adata_reynolds_fb_4820STDY7389007_loom.uns['rank_genes_groups']['names']['1+3'],
    index=adata_reynolds_fb_4820STDY7389007_loom.uns['rank_genes_groups']['scores']['1+3'],
).sort_index(ascending=False)

In [None]:
# S4 -- fine-grained Leiden clustering, then every cluster is labelled '1+3' or '2'.
sc.tl.leiden(
    adata_reynolds_fb_SKN8104899_loom,
    resolution=3,
    random_state=seed,
)
sc.pl.umap(
    adata_reynolds_fb_SKN8104899_loom,
    color=['leiden', 'science_clustering'],
    cmap=magma,
    use_raw=False,
    legend_loc='on data',
)
# Hard-coded cluster ids: they only make sense for the Leiden run just above.
adata_reynolds_fb_SKN8104899_loom.obs['fb_type'] = [
    '1+3' if leiden_id not in ('8', '6', '14', '27') else '2'
    for leiden_id in adata_reynolds_fb_SKN8104899_loom.obs['leiden']
]
sc.pl.umap(
    adata_reynolds_fb_SKN8104899_loom,
    color=['fb_type', 'science_clustering'],
    cmap=magma,
    use_raw=False,
    legend_loc='on data',
)

In [None]:
# S4: rank genes for '1+3' against '2' (ranked by absolute score), then store the gene names
# in a frame indexed by score and sorted from the highest to the lowest score.
sc.tl.rank_genes_groups(
    adata_reynolds_fb_SKN8104899_loom,
    groupby='fb_type',
    groups=['1+3'],
    reference='2',
    rankby_abs=True,
)
df_rank_s4 = pd.DataFrame(
    data=adata_reynolds_fb_SKN8104899_loom.uns['rank_genes_groups']['names']['1+3'],
    index=adata_reynolds_fb_SKN8104899_loom.uns['rank_genes_groups']['scores']['1+3'],
).sort_index(ascending=False)

In [None]:
# Number of genes taken from each end of every per-sample ranking.
N = 30

# Per-sample rankings, in sample order S1..S4.
dfs_rank = (df_rank_s1, df_rank_s2, df_rank_s3, df_rank_s4)

# Top-N genes (highest scores) of each sample; keep the genes found in at least 3 samples.
genes_13, counts_13 = np.unique(
    [gene for df_rank in dfs_rank for gene in df_rank.iloc[:N, 0].values],
    return_counts=True,
)
genes_13_shared = genes_13[counts_13 >= 3]

# Bottom-N genes (lowest scores) of each sample; same "at least 3 samples" rule.
genes_2, counts_2 = np.unique(
    [gene for df_rank in dfs_rank for gene in df_rank.iloc[-N:, 0].values],
    return_counts=True,
)
genes_2_shared = genes_2[counts_2 >= 3]

In [None]:
# Display the shared '1+3' candidate genes computed above.
genes_13_shared

In [None]:
# Display the shared '2' candidate genes computed above.
genes_2_shared

## Separate the 1+3 from 2

In [None]:
# Fixed '1+3' marker list. It overwrites the array computed in the marker-finding section, so
# the rest of the notebook does not depend on that (explicitly non-reproducible) step.
genes_13_shared = ['BNIP3L', 'BNIP3P1', 'CEMIP', 'ENO1', 'GSTO1', 'MEDAG', 'NAMPT', 'NAMPTP1', 'NRN1', ]

In [None]:
# Fixed '2' marker list; likewise overwrites the computed array of the same name.
genes_2_shared = ['ANXA2', 'CALM1', 'FOSB', 'LMNA', 'MTRNR2L12', 'S100A10', 'S100A4', 'S100A6', 'TMSB4X', 'TPPP3']

In [None]:
# Marker sets for the two populations, scored on each of the four retained samples.
# assign_cats stores its result in adata.obs (e.g. 'assigned_cats'), which later cells plot.
dict_cat = {
    '1+3': genes_13_shared,
    '2': genes_2_shared,
}

for adata in (
    adata_reynolds_fb_4820STDY7388991_loom_fb,
    adata_reynolds_fb_4820STDY7388999_loom,
    adata_reynolds_fb_4820STDY7389007_loom,
    adata_reynolds_fb_SKN8104899_loom,
):
    assign_cats(adata, dict_cat)

In [None]:
# S1: manual labels ('fb_type') against the marker-based assignment and its per-cluster scores.
sc.pl.umap(adata_reynolds_fb_4820STDY7388991_loom_fb, color=['leiden', 'science_clustering', 'fb_type', 'assigned_cats',  'score_assign_1+3', 'score_assign_2', 'assigned_cats_CV'], legend_loc='on data', cmap=magma)

In [None]:
# S1: expression of the '1+3' markers.
sc.pl.umap(adata_reynolds_fb_4820STDY7388991_loom_fb, color=['leiden'] + genes_13_shared, legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# S1: expression of the '2' markers.
sc.pl.umap(adata_reynolds_fb_4820STDY7388991_loom_fb, color=['leiden'] + genes_2_shared, legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# S2: manual labels against the marker-based assignment and its per-cluster scores.
sc.pl.umap(adata_reynolds_fb_4820STDY7388999_loom, color=['leiden', 'science_clustering', 'fb_type', 'assigned_cats', 'score_assign_1+3', 'score_assign_2', 'assigned_cats_CV'], legend_loc='on data', cmap=magma)

In [None]:
# S2: expression of the '1+3' markers.
sc.pl.umap(adata_reynolds_fb_4820STDY7388999_loom, color=['leiden'] + genes_13_shared, legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# S2: expression of the '2' markers.
sc.pl.umap(adata_reynolds_fb_4820STDY7388999_loom, color=['leiden'] + genes_2_shared, legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# S3: manual labels against the marker-based assignment and its per-cluster scores.
sc.pl.umap(adata_reynolds_fb_4820STDY7389007_loom, color=['leiden', 'science_clustering', 'fb_type', 'assigned_cats',  'score_assign_1+3', 'score_assign_2', 'assigned_cats_CV'], legend_loc='on data', cmap=magma)

In [None]:
# S3: expression of the '1+3' markers.
sc.pl.umap(adata_reynolds_fb_4820STDY7389007_loom, color=['leiden'] + genes_13_shared, legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# S3: expression of the '2' markers.
sc.pl.umap(adata_reynolds_fb_4820STDY7389007_loom, color=['leiden'] + genes_2_shared, legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# S4: manual labels against the marker-based assignment and its per-cluster scores.
sc.pl.umap(adata_reynolds_fb_SKN8104899_loom, color=['leiden', 'science_clustering', 'fb_type', 'assigned_cats',  'score_assign_1+3', 'score_assign_2', 'assigned_cats_CV'], legend_loc='on data', cmap=magma)

In [None]:
# S4: expression of the '1+3' markers.
sc.pl.umap(adata_reynolds_fb_SKN8104899_loom, color=['leiden'] + genes_13_shared, legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# S4: expression of the '2' markers.
sc.pl.umap(adata_reynolds_fb_SKN8104899_loom, color=['leiden'] + genes_2_shared, legend_loc='on data', cmap=magma, use_raw=False)

## Create the adatas

In [None]:
# Split every sample into its '1+3' and '2' cells according to obs['assigned_cats'].
# Each subset is an independent copy (.copy()), not a view of the per-sample object.
# S1
adata_s1_13 = adata_reynolds_fb_4820STDY7388991_loom_fb[adata_reynolds_fb_4820STDY7388991_loom_fb.obs['assigned_cats'] == '1+3'].copy()
adata_s1_2 = adata_reynolds_fb_4820STDY7388991_loom_fb[adata_reynolds_fb_4820STDY7388991_loom_fb.obs['assigned_cats'] == '2'].copy()

# S2
adata_s2_13 = adata_reynolds_fb_4820STDY7388999_loom[adata_reynolds_fb_4820STDY7388999_loom.obs['assigned_cats'] == '1+3'].copy()
adata_s2_2 = adata_reynolds_fb_4820STDY7388999_loom[adata_reynolds_fb_4820STDY7388999_loom.obs['assigned_cats'] == '2'].copy()

# S3
adata_s3_13 = adata_reynolds_fb_4820STDY7389007_loom[adata_reynolds_fb_4820STDY7389007_loom.obs['assigned_cats'] == '1+3'].copy()
adata_s3_2 = adata_reynolds_fb_4820STDY7389007_loom[adata_reynolds_fb_4820STDY7389007_loom.obs['assigned_cats'] == '2'].copy()

# S4
adata_s4_13 = adata_reynolds_fb_SKN8104899_loom[adata_reynolds_fb_SKN8104899_loom.obs['assigned_cats'] == '1+3'].copy()
adata_s4_2 = adata_reynolds_fb_SKN8104899_loom[adata_reynolds_fb_SKN8104899_loom.obs['assigned_cats'] == '2'].copy()

In [None]:
# Re-run gene filtering, PCA, triku, the neighbour graph and UMAP on every '1+3' / '2' subset.
# The subsets are addressed by variable name (the original author reports that looping over the
# objects themselves did not work). Each name is resolved once per iteration through the
# notebook namespace instead of calling eval() for every step.
# `adata_s_name` is kept as the loop variable so its final value stays available afterwards.
for adata_s_name in [
    'adata_s1_13', 'adata_s1_2',
    'adata_s2_13', 'adata_s2_2',
    'adata_s3_13', 'adata_s3_2',
    'adata_s4_13', 'adata_s4_2',
]:
    adata_subset = globals()[adata_s_name]
    sc.pp.filter_genes(adata_subset, min_counts=1)
    sc.pp.pca(adata_subset, random_state=seed, n_comps=30)
    tk.tl.triku(adata_subset, n_procs=1, random_state=seed)
    sc.pp.neighbors(adata_subset, metric='cosine')
    sc.tl.umap(adata_subset, min_dist=0.05, random_state=seed)

In [None]:
# S1: 'science_clustering' on the '1+3' subset (left) and the '2' subset (right).
# show=False lets both panels be drawn into the same figure.
fig, axs = plt.subplots(1, 2, figsize=(6, 3))
sc.pl.umap(adata_s1_13, color=['science_clustering'], legend_loc='on data', cmap=magma, ax=axs[0], show=False)
sc.pl.umap(adata_s1_2, color=['science_clustering'], legend_loc='on data', cmap=magma, ax=axs[1], show=False)

In [None]:
# S2: same pair of panels.
fig, axs = plt.subplots(1, 2, figsize=(6, 3))
sc.pl.umap(adata_s2_13, color=['science_clustering'], legend_loc='on data', cmap=magma, ax=axs[0], show=False)
sc.pl.umap(adata_s2_2, color=['science_clustering'], legend_loc='on data', cmap=magma, ax=axs[1], show=False)

In [None]:
# S3: same pair of panels.
fig, axs = plt.subplots(1, 2, figsize=(6, 3))
sc.pl.umap(adata_s3_13, color=['science_clustering'], legend_loc='on data', cmap=magma, ax=axs[0], show=False)
sc.pl.umap(adata_s3_2, color=['science_clustering'], legend_loc='on data', cmap=magma, ax=axs[1], show=False)

In [None]:
# S4: same pair of panels.
fig, axs = plt.subplots(1, 2, figsize=(6, 3))
sc.pl.umap(adata_s4_13, color=['science_clustering'], legend_loc='on data', cmap=magma, ax=axs[0], show=False)
sc.pl.umap(adata_s4_2, color=['science_clustering'], legend_loc='on data', cmap=magma, ax=axs[1], show=False)

## Assigning JID categories
Based on this we do not see any discrepancies in the dataset. We are going to apply the category assignment to assign the JID clusters to the dataset.

In [None]:
def assign_cats(adata, dict_cats, column_groupby='leiden', quantile_gene_sel=0.85, per_outlier=98, do_return=False, intermediate_states=False, diff=0.05):
    """
    Assign every cluster of `adata` to the marker-defined category it scores highest for.

    This function uses a set of genes assigned to different categories so that clusters
    (e.g. leiden) can be assigned to one of these categories, for example to tell fibroblasts
    from pericytes, endothelial cells, or cells with high mitochondrial content. It could be
    done per cell, but clusters are used because single-cell data is noisy.

    This definition replaces the earlier `assign_cats` of this notebook, which took `per`
    instead of `per_outlier` and had no `quantile_gene_sel`, `intermediate_states` or `diff`.

    Parameters
    ----------
    adata
        Object providing `obsp['connectivities']`, `obs`, `len()` and `adata[:, gene].X`.
    dict_cats : dict
        Maps a category name to the list of its marker genes.
    column_groupby : str
        Column of `adata.obs` holding the cluster labels.
    quantile_gene_sel : float
        Quantile of the per-cell category scores used as the score of a cluster.
    per_outlier : float
        Percentile (0-100) of the positive values used as scaling reference, both per gene
        and per category.
    do_return : bool
        If True, return the cluster x category score table.
    intermediate_states : bool
        If True, a cluster is labelled with every category whose score lies within `diff` of
        its best score, joined with '/'. Otherwise only the best category is used.
    diff : float
        Score tolerance used when `intermediate_states` is True.

    Returns
    -------
    pandas.DataFrame or None
        The per-cluster score table when `do_return` is True, otherwise None.

    Notes
    -----
    Columns written to `adata.obs`: one column per category (named after the category),
    'assigned_cats_std', 'assigned_cats_mean', 'assigned_cats_CV', one 'score_assign_<cat>'
    column per category, and 'assigned_cats'.
    'assigned_cats_CV' is computed as mean / std of the cluster scores, i.e. the inverse of
    the usual coefficient of variation.
    """

    categories = list(dict_cats.keys())

    for cat in categories:
        # Read the markers from the `dict_cats` argument. The previous version looked them up
        # in the notebook-level variable `dict_cat`, so it only worked when the caller passed
        # exactly that global.
        cat_genes = dict_cats[cat]
        mat_cat = np.zeros((len(adata), len(cat_genes)), dtype=float)

        for gene_idx, gene in enumerate(cat_genes):
            try:
                # Expression of the gene multiplied by the connectivity matrix.
                mat_cat[:, gene_idx] = np.asarray(np.dot(adata.obsp['connectivities'], adata[:, gene].X).todense()).ravel()
                # We normalize the expression to the maximum (percentile `per_outlier`) of expression ** 0.5
                # to account to some extent for the expression of the gene.
                # In the end, a gene highly expressed must account for more than a gene with lower expression.
                mat_cat[:, gene_idx] /= np.percentile(mat_cat[:, gene_idx][mat_cat[:, gene_idx] > 0], per_outlier) ** 0.5
            except Exception:
                # Best effort: a gene that cannot be scored is reported and skipped.
                # `Exception` (not a bare except) keeps KeyboardInterrupt/SystemExit working.
                print(f'Gene {gene} is not on the list')

        sum_mat_cat = np.asarray(mat_cat.sum(1)).ravel()
        # Scale the category score by its own high percentile, but only when there is at least
        # one positive value to take the percentile of; otherwise the scores are left as they are.
        positive_scores = sum_mat_cat[sum_mat_cat > 0]
        if positive_scores.size > 0:
            sum_mat_cat /= np.percentile(positive_scores, per_outlier, axis=0)

        adata.obs[cat] = sum_mat_cat

    # One row per cluster, one column per category.
    score_per_cluster = adata.obs[[column_groupby] + categories].groupby(column_groupby).quantile(quantile_gene_sel)

    # Spread and level of the category scores of each cluster, broadcast back to the cells.
    std_per_cluster = score_per_cluster.std(axis=1)
    max_cat_dict_std = dict(zip(std_per_cluster.index, std_per_cluster.values))
    adata.obs['assigned_cats_std'] = [max_cat_dict_std[i] for i in adata.obs[column_groupby]]
    mean_per_cluster = score_per_cluster.mean(axis=1)
    max_cat_dict_mean = dict(zip(mean_per_cluster.index, mean_per_cluster.values))
    adata.obs['assigned_cats_mean'] = [max_cat_dict_mean[i] for i in adata.obs[column_groupby]]
    adata.obs['assigned_cats_CV'] = adata.obs['assigned_cats_mean'] / adata.obs['assigned_cats_std']

    for cat in score_per_cluster.columns:
        max_cat_dict = dict(zip(score_per_cluster.index, score_per_cluster[cat].values))
        adata.obs[f'score_assign_{cat}'] = [max_cat_dict[i] for i in adata.obs[column_groupby]]

    if intermediate_states:  # For each cluster we will identify which categories are close to the highest one, and merge their names.
        list_names_cats_per_cluster = []
        for cluster in score_per_cluster.index:
            scores_cluster = score_per_cluster.loc[cluster]
            scores_cluster = scores_cluster[scores_cluster > scores_cluster.max() - diff]
            list_names_cats_per_cluster.append('/'.join(scores_cluster.index))

        inter_cat_dict = dict(zip(score_per_cluster.idxmax(axis=1).index, list_names_cats_per_cluster))
        adata.obs['assigned_cats'] = [str(inter_cat_dict[i]) for i in adata.obs[column_groupby]]
    else:
        best_cat_per_cluster = score_per_cluster.idxmax(axis=1)
        max_cat_dict = dict(zip(best_cat_per_cluster.index, best_cat_per_cluster.values))
        adata.obs['assigned_cats'] = [str(max_cat_dict[i]) for i in adata.obs[column_groupby]]

    if do_return:
        return score_per_cluster

In [None]:
## MAP JID populations

# Marker genes per JID population; used as the `dict_cats` argument of assign_cats below.
# A few genes appear in more than one population (C3, F13A1, GPM6B).
# NOTE(review): 'EMID', 'MEOX' and 'TRSP1' do not look like complete gene symbols
# (EMID1? MEOX1/MEOX2? TRPS1?) — confirm; assign_cats only prints a message for genes it
# cannot score, so a misspelled symbol is silently left out of the score.
dict_cat = {'A1': ['PI16', 'QPCT', 'SLPI', 'WISP2', 'CPE', 'CTHRC1', 'MFAP5', 'PCOLCE2', 'SCARA5', 'TSPAN8'], 
            'A2': ['APCDD1', 'COL18A1', 'COMP', 'NKD2', 'F13A1', 'HSPB3', 'LEPR', 'TGFBI'], 
            'B1': ['CCL2', 'ITM2A', 'SPSB1', 'SOD2', 'APOD', 'C3', 'C7', 'CXCL2', 'CXCL12', 'DNAJA1', 'GGT5', 'IGFBP7', 'MYC'], 
            'B2': ['SOCS3', 'CCL19', 'CD74', 'RARRES2', 'C3', 'CCDC146', 'IGFBP3', 'TNFSF13B'], 
            'C1': ['COL11A1', 'DPEP1', 'EDNRA', 'TMEM119'],
            'C2': ['COCH', 'CRABP1', 'EMID', 'ARHGAP15', 'PLXDC1', 'FIBIN', 'F13A1', 'GPM6B', 'HSPA2', 'MEOX',  'RSPO4', 'SERTAD4-AS1', 'SLITRK6', 'TNN', 'TRSP1'], 
            'C3': ['ASPN', 'F2R', 'POSTN', 'PTN', 'GPM6B'], 
            'C4': ['ANGPTL7', 'C2orf40', 'SCN7A', 'TM4SF1', 'CLDN1', 'CYP1B1', 'FGFBP2', 'NGFR', 'NRP2', 'SLC22A3']}

- A1: TSPAN8 and SCARA5 are less expressed in 1+3 datasets.
- A2: F13A1 and HSPB3 are less expressed in 1+3 datasets. APCDD1 is not a good marker for all samples.
- B2: CCDC146 and TNFSF13B are not expressed in 1+3 datasets.

### S1 1+3

In [None]:
# Select the subset that the following cells of this section work on (S1, population 1+3).
adata_s = adata_s1_13

In [None]:
# Genes to inspect across all eight subsets.
# NOTE(review): the original cell had an empty right-hand side (`genes = `), which is a syntax
# error. The list below is a stand-in built from the genes discussed in the markdown notes
# above (A1 / A2 / B2 observations) — replace it with the intended list.
genes = ['TSPAN8', 'SCARA5', 'F13A1', 'HSPB3', 'APCDD1', 'CCDC146', 'TNFSF13B']

# Same plot for every subset: first the four '1+3' subsets, then the four '2' subsets.
# Genes missing from a subset's var_names are skipped for that subset.
for adata_panel in (
    adata_s1_13, adata_s2_13, adata_s3_13, adata_s4_13,
    adata_s1_2, adata_s2_2, adata_s3_2, adata_s4_2,
):
    sc.pl.umap(adata_panel, color=['science_clustering'] + [i for i in genes if i in adata_panel.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=5)

In [None]:
for key, val in dict_cat.items():
        print(key, val)
        sc.pl.umap(adata_s, color=['science_clustering'] + [i for i in val[:4] if i in eval(adata_s_name).var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=5)

In [None]:
sc.tl.leiden(adata_s, resolution=2)
assign_cats(adata_s, dict_cats=dict_cat, column_groupby='leiden', intermediate_states=True, diff=0.1)
sc.pl.umap(adata_s, color=['leiden', 'assigned_cats', 'score_assign_A1', 'score_assign_A2', 'score_assign_B1', 'score_assign_B2', 'score_assign_C2', 'score_assign_C3'], legend_loc='on data', vmax=1, cmap=magma, use_raw=False)

### S1 2

In [None]:
adata_s = adata_s1_2

In [None]:
for key, val in dict_cat.items():
        print(key, val)
        sc.pl.umap(adata_s, color=['science_clustering'] + [i for i in val if i in eval(adata_s_name).var_names], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
sc.tl.leiden(adata_s, resolution=2)
assign_cats(adata_s, dict_cats=dict_cat, column_groupby='leiden', intermediate_states=True, diff=0.1)
sc.pl.umap(adata_s, color=['leiden', 'assigned_cats', 'score_assign_A1', 'score_assign_A2', 'score_assign_B1', 'score_assign_B2', 'score_assign_C2', 'score_assign_C3'], legend_loc='on data', vmax=1, cmap=magma, use_raw=False)

### S2 1+3

In [None]:
adata_s = adata_s2_13

In [None]:
for key, val in dict_cat.items():
        print(key, val)
        sc.pl.umap(adata_s, color=['science_clustering'] + [i for i in val if i in eval(adata_s_name).var_names], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
sc.tl.leiden(adata_s, resolution=2)
assign_cats(adata_s, dict_cats=dict_cat, column_groupby='leiden', intermediate_states=True, diff=0.1)
sc.pl.umap(adata_s, color=['leiden', 'assigned_cats', 'score_assign_A1', 'score_assign_A2', 'score_assign_B1', 'score_assign_B2', 'score_assign_C2', 'score_assign_C3'], legend_loc='on data', vmax=1, cmap=magma, use_raw=False)

### S2 2

In [None]:
adata_s = adata_s2_2

In [None]:
for key, val in dict_cat.items():
        print(key, val)
        sc.pl.umap(adata_s, color=['science_clustering'] + [i for i in val if i in eval(adata_s_name).var_names], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
sc.tl.leiden(adata_s, resolution=1)
assign_cats(adata_s, dict_cats=dict_cat, column_groupby='leiden', intermediate_states=True, diff=0.1)
sc.pl.umap(adata_s, color=['leiden', 'assigned_cats', 'score_assign_A1', 'score_assign_A2', 'score_assign_B1', 'score_assign_B2', 'score_assign_C2', 'score_assign_C3'], legend_loc='on data', vmax=1, cmap=magma, use_raw=False)

### S3 1+3

In [None]:
adata_s = adata_s3_13

In [None]:
for key, val in dict_cat.items():
        print(key, val)
        sc.pl.umap(adata_s, color=['science_clustering'] + [i for i in val if i in eval(adata_s_name).var_names], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
sc.tl.leiden(adata_s, resolution=2)
assign_cats(adata_s, dict_cats=dict_cat, column_groupby='leiden', intermediate_states=True, diff=0.1)
sc.pl.umap(adata_s, color=['leiden', 'assigned_cats', 'score_assign_A1', 'score_assign_A2', 'score_assign_B1', 'score_assign_B2', 'score_assign_C2', 'score_assign_C3'], legend_loc='on data', vmax=1, cmap=magma, use_raw=False)

### S3 2

In [None]:
adata_s = adata_s3_2

In [None]:
for key, val in dict_cat.items():
        print(key, val)
        sc.pl.umap(adata_s, color=['science_clustering'] + [i for i in val if i in eval(adata_s_name).var_names], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
sc.tl.leiden(adata_s, resolution=2)
assign_cats(adata_s, dict_cats=dict_cat, column_groupby='leiden', intermediate_states=True, diff=0.1)
sc.pl.umap(adata_s, color=['leiden', 'assigned_cats', 'score_assign_A1', 'score_assign_A2', 'score_assign_B1', 'score_assign_B2', 'score_assign_C2', 'score_assign_C3'], legend_loc='on data', vmax=1, cmap=magma, use_raw=False)

### S4 1+3

In [None]:
adata_s = adata_s4_13

In [None]:
for key, val in dict_cat.items():
        print(key, val)
        sc.pl.umap(adata_s, color=['science_clustering'] + [i for i in val if i in eval(adata_s_name).var_names], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
sc.tl.leiden(adata_s, resolution=2)
assign_cats(adata_s, dict_cats=dict_cat, column_groupby='leiden', intermediate_states=True, diff=0.1)
sc.pl.umap(adata_s, color=['leiden', 'assigned_cats', 'score_assign_A1', 'score_assign_A2', 'score_assign_B1', 'score_assign_B2', 'score_assign_C2', 'score_assign_C3'], legend_loc='on data', vmax=1, cmap=magma, use_raw=False)

### S4 2

In [None]:
adata_s = adata_s4_2

In [None]:
for key, val in dict_cat.items():
        print(key, val)
        sc.pl.umap(adata_s, color=['science_clustering'] + [i for i in val if i in eval(adata_s_name).var_names], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
sc.tl.leiden(adata_s, resolution=2)
assign_cats(adata_s, dict_cats=dict_cat, column_groupby='leiden', intermediate_states=True, diff=0.1)
sc.pl.umap(adata_s, color=['leiden', 'assigned_cats', 'score_assign_A1', 'score_assign_A2', 'score_assign_B1', 'score_assign_B2', 'score_assign_C2', 'score_assign_C3'], legend_loc='on data', vmax=1, cmap=magma, use_raw=False)