In [None]:
# Uncomment the two lines below to download and install the required packages.
# !wget https://raw.githubusercontent.com/alexmascension/revisit_reynolds_fb/master/requirements.txt
# !pip install -r requirements.txt

In [None]:
import scanpy as sc
import scanpy.external as sce

import pandas as pd
import numpy as np
import os
from functools import reduce
import gseapy as gp

import triku as tk

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

import scipy.stats as sts

from scipy.sparse import csr_matrix

In [None]:
# To print versions of imports 

import types

def imports():
    """Yield the `__name__` of every module object bound in the global namespace."""
    module_names = (
        obj.__name__
        for obj in globals().values()
        if isinstance(obj, types.ModuleType)
    )
    yield from module_names

# Module names that should never be reported.
excludes = ['builtins', 'types', 'sys']

imported_modules = list(filter(lambda mod_name: mod_name not in excludes, imports()))

# Keep only the top-level package name, to handle 'matplotlib.pyplot' cases.
clean_modules = [mod_name.partition('.')[0] for mod_name in imported_modules]

changed_imported_modules = list(set(clean_modules))  # drop duplicates

pip_modules = !pip freeze  # you could also use `!conda list` with anaconda

for module in pip_modules:
    try:
        name, version = module.split('==')
        if name in changed_imported_modules:
            print(name + '\t' + version)
    except:
        pass

In [None]:
# Random seed for the notebook.
# NOTE(review): `seed` is not referenced anywhere in the visible part of the notebook — confirm it is still needed.
seed = 0

In [None]:
# Colormap for UMAP gene expression: the first 65 of 80 evenly spaced 'magma'
# colours, with the lowest colour replaced by light grey.

base_cmap = plt.get_cmap('magma')
magma_colors = list(map(base_cmap, np.linspace(0, 1, 80)))
magma_colors[0] = (0.88, 0.88, 0.88, 1)
magma = mpl.colors.LinearSegmentedColormap.from_list("", magma_colors[:65])

In [None]:
# presumably the columns to keep from enrichment result tables — not used in the visible part of the notebook; TODO confirm
selected_enr_cols = ['Gene_set', 'Term', 'Adjusted P-value', 'Odds Ratio', 'Genes']

In [None]:
# Global matplotlib setting: applies to figures created after this cell runs.
mpl.rcParams['figure.dpi'] = 150  # Set this to make higher quality pictures

In [None]:
def assign_cats(adata, dict_cats, column_groupby='leiden', quantile_gene_sel=0.7, do_return=False, intermediate_states=False, diff=0.05,
                key_added='assigned_cats', min_score=0.6, others_name='unassigned'):
    """
    Assign every cluster of `adata` to one of the categories of `dict_cats` using marker-gene scores.

    For example, to categorize fibroblasts from pericytes, endothelial cells, or cells with high mitochondrial content.
    It could be done with each cell individually, but clusters are used to discern the categories because
    the method, although efficient, can sometimes be noisy due to the noisiness of the sc datasets.

    Procedure
    ---------
    1. For each gene of a category, the expression column is multiplied by
       `adata.obsp['connectivities']` and divided by
       `adata.uns['neighbors']['params']['n_neighbors']`. Positive values are replaced by their
       0-based rank and the column is divided by its maximum.
    2. The per-cell score of a category is the mean over ALL genes listed for it (genes that are
       not in `adata.var_names` contribute a column of zeros) and is stored in `adata.obs[<category>]`.
    3. For each cluster of `column_groupby`, the `quantile_gene_sel` quantile of every category
       score is computed.
    4. Each cluster is labelled with its best category (or, with `intermediate_states`, with all
       categories within `diff` of the best one joined by '/'). Clusters whose best score is
       below `min_score` are labelled `others_name`.

    Parameters
    ----------
    adata : AnnData-like object
        Must provide `obs`, `var_names`, `obsp['connectivities']` and
        `uns['neighbors']['params']['n_neighbors']`. `adata[:, gene].X` may be sparse or dense.
    dict_cats : dict
        Maps a category name to an iterable of gene names.
    column_groupby : str
        Column of `adata.obs` holding the cluster labels.
    quantile_gene_sel : float
        Quantile of the per-cell scores used as the cluster score.
    do_return : bool
        If True, return the cluster x category score table.
    intermediate_states : bool
        If True, merge the names of all categories whose score is within `diff` of the maximum.
    diff : float
        Tolerance used by `intermediate_states`.
    key_added : str
        Name of the `adata.obs` column receiving the labels; also the prefix of the columns
        `{key_added}_std`, `_mean`, `_max`, `_CV` and `{key_added}_{category}`.
    min_score : float
        Minimum best-category score required to keep a label.
    others_name : str
        Label given to clusters below `min_score`.

    Returns
    -------
    pandas.DataFrame or None
        The cluster x category score table when `do_return` is True, otherwise None.
    """
    from scipy.sparse import issparse

    # Loop-invariant inputs; a missing neighbour graph now fails loudly here instead of being
    # reported as "gene not on the list" for every gene.
    connectivities = adata.obsp['connectivities']
    n_neighbors = adata.uns['neighbors']['params']['n_neighbors']
    cats = list(dict_cats.keys())

    for cat in cats:
        genes = dict_cats[cat]
        mat_cat = np.zeros((len(adata), len(genes)), dtype=float)

        for gene_idx, gene in enumerate(genes):
            if gene not in adata.var_names:
                # Missing genes keep their all-zero column (and still count in the mean below).
                print(f'Gene {gene} is not on the list')
                continue

            # `@` works for sparse and dense expression matrices alike.
            smoothed = connectivities @ adata[:, gene].X
            if issparse(smoothed):
                smoothed = smoothed.toarray()
            gene_scores = np.asarray(smoothed, dtype=float).ravel() / n_neighbors

            # Rank-transform the positive values only (double argsort = 0-based ranks).
            is_positive = gene_scores > 0
            gene_scores[is_positive] = np.argsort(np.argsort(gene_scores[is_positive]))

            # Scale by the top rank; skip the division when it is 0 (gene with at most one
            # positive cell) so the whole category score is not turned into NaN by 0/0.
            top_rank = gene_scores.max() if gene_scores.size else 0
            if top_rank > 0:
                gene_scores /= top_rank

            mat_cat[:, gene_idx] = gene_scores

        adata.obs[cat] = np.asarray(mat_cat.mean(1)).ravel()

    # observed=True: for a categorical cluster column, only clusters that actually contain cells
    # are scored (empty categories would otherwise produce all-NaN rows).
    score_per_cluster = adata.obs[[column_groupby] + cats].groupby(column_groupby, observed=True).quantile(quantile_gene_sel)

    cluster_per_cell = list(adata.obs[column_groupby])

    def _per_cell(per_cluster_values):
        """Broadcast one value per cluster to one value per cell."""
        lookup = dict(zip(score_per_cluster.index, per_cluster_values))
        return [lookup[cluster] for cluster in cluster_per_cell]

    adata.obs[f'{key_added}_std'] = _per_cell(score_per_cluster.std(axis=1).values)
    adata.obs[f'{key_added}_mean'] = _per_cell(score_per_cluster.mean(axis=1).values)
    adata.obs[f'{key_added}_max'] = _per_cell(score_per_cluster.max(axis=1).values)
    # NOTE(review): this is mean / std, i.e. the inverse of the usual coefficient of variation
    # (std / mean). Kept as in the original because downstream use is not visible — confirm.
    adata.obs[f'{key_added}_CV'] = adata.obs[f'{key_added}_mean'] / adata.obs[f'{key_added}_std']

    for cat in score_per_cluster.columns:
        adata.obs[f'{key_added}_{cat}'] = _per_cell(score_per_cluster[cat].values)

    if intermediate_states:  # For each cluster, merge the names of the categories close to the highest one.
        names_per_cluster = []
        for cluster in score_per_cluster.index:
            scores_cluster = score_per_cluster.loc[cluster]
            close_to_best = scores_cluster[scores_cluster > scores_cluster.max() - diff]
            names_per_cluster.append('/'.join(close_to_best.index))
        final_cat_dict = dict(zip(score_per_cluster.index, names_per_cluster))
    else:
        best_cat = score_per_cluster.idxmax(axis=1)
        final_cat_dict = dict(zip(best_cat.index, best_cat.values))

    # Build the final labels in a plain array and assign the column once; the original chained
    # assignment (`obs[col][mask] = ...`) does not write back under pandas copy-on-write.
    labels = np.array([str(final_cat_dict[cluster]) for cluster in cluster_per_cell], dtype=object)
    below_min_score = np.asarray(adata.obs[f'{key_added}_max'] < min_score)
    labels[below_min_score] = others_name
    adata.obs[f'{key_added}'] = labels

    if do_return:
        return score_per_cluster

# Comparison to other datasets
In this notebook we use the datasets from our publication to compare the levels of stress and hypoxia in their fibroblasts with those in the dataset of Reynolds et al.

### Get the datasets

In [None]:
# Each dataset is read from its local path; `backup_url` is the Zenodo location passed to
# `sc.read` for that file.
reynolds_fb = sc.read(
    'reynolds_2020/adata_reynolds_healthy_fb.h5ad',
    backup_url='https://zenodo.org/record/4709059/files/adata_reynolds_healthy_fb.h5ad',
)
tabib_fb = sc.read(
    'other/tabib_fb.h5ad',
    backup_url='https://zenodo.org/record/4455850/files/tabib_fb.h5ad',
)
he_fb = sc.read(
    'other/he_fb.h5ad',
    backup_url='https://zenodo.org/record/4455850/files/he_fb.h5ad',
)
vors_fb = sc.read(
    'other/vorstandlechner_fb.h5ad',
    backup_url='https://zenodo.org/record/4455850/files/vorstandlechner_fb.h5ad',
)
sb_fb = sc.read(
    'other/sole_fb.h5ad',
    backup_url='https://zenodo.org/record/4455850/files/sole_fb.h5ad',
)

We need to convert some of the expression matrices to sparse (CSR) format to run the `assign_cats` function.

In [None]:
# Store the expression matrix of these two datasets in CSR format.
for dense_dataset in (tabib_fb, sb_fb):
    dense_dataset.X = csr_matrix(dense_dataset.X)

We will assign the categories to the Reynolds dataset to obtain categories similar to those in the rest of the datasets.

In [None]:
# Marker genes of each fibroblast category.
dict_cats = {'A1': ['PI16', 'QPCT', 'SLPI', 'CCN5', 'CPE', 'CTHRC1', 'MFAP5', 'PCOLCE2', 'SCARA5', 'TSPAN8'], 
            'A2': ['APCDD1', 'COL18A1', 'COMP', 'NKD2', 'F13A1', 'HSPB3', 'LEPR', 'TGFBI'], 
            'B1': ['CXCL2', 'MYC', 'C7', 'SPSB1', 'ITM2A'], 
            'B2': ['SOCS3', 'CCL19', 'CD74', 'RARRES2', 'CCDC146', 'IGFBP3', 'TNFSF13B'], 
            'C': ['CRABP1', 'PLXDC1', 'RSPO4', 'ASPN', 'F2R', 'POSTN', 'TNN']}

# Colour of each label that `assign_cats` may produce (single or '/'-merged categories).
dict_colors = {'A1': '#e14b67', 'A2': '#d98c58', 'B1': '#009f61', 'B2': '#54ab4c', 'B1/B2': '#2AA557', 
               'A2/B2': '#979C52', 'A1/A2': '#DD6C60', 'A2/B1': '#6D965D', 'A2/B1/B2': '#819857', 'other': '#bcbcbc'}

assign_cats(reynolds_fb, dict_cats=dict_cats, column_groupby='leiden', intermediate_states=True, diff=0.05, key_added='clusters', 
            others_name='other', min_score=0.45)

# One colour per label, in sorted label order. Labels without an entry in `dict_colors`
# (e.g. 'C', or a merged name not listed above) fall back to the 'other' colour instead of
# raising a KeyError; they are reported so the palette can be completed.
cluster_labels = sorted(set(reynolds_fb.obs['clusters'].values))
labels_without_color = [label for label in cluster_labels if label not in dict_colors]
if labels_without_color:
    print(f"No colour defined for {labels_without_color}; using the 'other' colour instead")

reynolds_fb.uns['clusters_colors'] = [dict_colors.get(label, dict_colors['other']) for label in cluster_labels]

In [None]:
# UMAP of the Reynolds fibroblasts coloured by the assigned categories.
sc.pl.umap(
    reynolds_fb,
    color=['clusters'],
    cmap=magma,
    use_raw=False,
)

### Get the gene lists

In [None]:
def _load_gene_list(file_name):
    """Load a gene list from the local `papers_genes_bad_quality` folder, or from the GitHub repository if the local file cannot be opened.

    Only `OSError` (which includes a missing file) triggers the fallback, so other errors are
    no longer hidden. `ndmin=1` keeps the result 1-D even for a file with a single gene.
    """
    relative_path = f'papers_genes_bad_quality/{file_name}'
    try:
        return np.loadtxt(relative_path, dtype=str, ndmin=1)
    except OSError:
        return np.loadtxt(f'https://raw.githubusercontent.com/alexmascension/revisit_reynolds_fb/master/{relative_path}', dtype=str, ndmin=1)


stress_genes = _load_gene_list('stress_genes.txt')
hypoxia_genes = _load_gene_list('hypoxia_genes.txt')

In [None]:
# Gene sets scored by `assign_cats` in the next cell.
# Note: this rebinds `dict_cats`, which previously held the fibroblast category markers.
dict_cats = {'Stress': stress_genes, 'Hypoxia': hypoxia_genes}

In [None]:
# Score the gene sets of `dict_cats` on every dataset. The datasets are looked up in an explicit
# mapping rather than with `eval` on their variable names.
datasets_by_name = {'reynolds_fb': reynolds_fb, 'tabib_fb': tabib_fb, 'he_fb': he_fb, 'vors_fb': vors_fb, 'sb_fb': sb_fb}

for adata_name, adata in datasets_by_name.items():
    print(f"|||| {adata_name.upper()} ||||")
    assign_cats(adata, dict_cats=dict_cats, column_groupby='leiden', intermediate_states=True, min_score=0.5,
                key_added='hypoxia_stress')

In [None]:
# Same three panels (clusters, stress score, hypoxia score) for every dataset; one loop replaces
# five copy-pasted cells. The printed header identifies the dataset of each figure.
score_plot_datasets = {'reynolds_fb': reynolds_fb, 'tabib_fb': tabib_fb, 'he_fb': he_fb, 'vors_fb': vors_fb, 'sb_fb': sb_fb}

for dataset_name, dataset in score_plot_datasets.items():
    print(f"|||| {dataset_name.upper()} ||||")
    sc.pl.umap(dataset, color=['clusters', 'hypoxia_stress_Stress', 'hypoxia_stress_Hypoxia'],
               cmap=magma, use_raw=False, ncols=4, legend_loc='on data')

In [None]:
# Replace the categories written by `assign_cats` (key_added='clusters') with the labels stored
# in the `full_clustering` column.
# NOTE(review): this overwrites `clusters` in place, so re-running earlier plotting cells after
# this one shows different labels. `reynolds_fb.uns['clusters_colors']` was built from the old
# labels and is not updated here — confirm the palette still matches.
reynolds_fb.obs['clusters'] = reynolds_fb.obs['full_clustering']

# Figure 2

In [None]:
# Figure 2: one column per dataset, one row per plotted `.obs` column.
fig2_datasets = {'Reynolds': reynolds_fb, 'Tabib': tabib_fb, 'He': he_fb, 'Vorstandlechner': vors_fb, 'Solé-Boldo': sb_fb}
fig2_rows = {'clusters': 'clusters', 'Stress': 'hypoxia_stress_Stress', 'Hypoxia': 'hypoxia_stress_Hypoxia'}

fig, axs = plt.subplots(len(fig2_rows), len(fig2_datasets), figsize=(len(fig2_datasets)*2.5, len(fig2_rows)*2.5))

for col_idx, (dataset_title, dataset) in enumerate(fig2_datasets.items()):
    for row_idx, obs_key in enumerate(fig2_rows.values()):
        # The hypoxia row uses its own colour range; the other rows share one.
        if obs_key == 'hypoxia_stress_Hypoxia':
            vmin, vmax = 0.2, 0.7
        else:
            vmin, vmax = 0.25, 0.8

        sc.pl.umap(dataset, color=obs_key, ax=axs[row_idx, col_idx], title='', show=False, legend_loc='on data',
                   use_raw=False, frameon=False, vmin=vmin, vmax=vmax, cmap='magma')

    # Column header, set after plotting so it replaces the empty title.
    axs[0, col_idx].set_title(dataset_title)

for row_idx, row_label in enumerate(fig2_rows):
    # `frameon=False` switched the axes off; turn them back on for the first column so the
    # row label is drawn, then hide the frame again.
    axs[row_idx, 0].axis('on')
    axs[row_idx, 0].set_ylabel(row_label)
    axs[row_idx, 0].set_xlabel('')
    axs[row_idx, 0].set_frame_on(False)

# savefig does not create missing directories, so make sure the output folder exists.
os.makedirs('figures', exist_ok=True)
fig.savefig('figures/umapF2.png', dpi=250)

In [None]:
# Re-apply the row labels on the first-column axes of Figure 2 (the same labels are already set
# in the cell that builds the figure).
for row_idx, cat in enumerate(['clusters', 'Stress', 'Hypoxia']):
    axs[row_idx, 0].set_ylabel(cat)

In [None]:
# Display the currently open figures.
# NOTE(review): with the inline backend figures are usually closed at the end of the cell that
# created them, so this may display nothing — confirm.
plt.show()

# Plot hypoxia genes

In [None]:
# UMAP of every hypoxia gene, per dataset. Genes are filtered against the var_names of the
# dataset being plotted (the Reynolds cell previously filtered against `tabib_fb.var_names`,
# which could request genes absent from `reynolds_fb` or drop genes present in it).
hypoxia_plot_datasets = {'reynolds_fb': reynolds_fb, 'tabib_fb': tabib_fb, 'he_fb': he_fb, 'vors_fb': vors_fb, 'sb_fb': sb_fb}

for dataset_name, dataset in hypoxia_plot_datasets.items():
    print(f"|||| {dataset_name.upper()} ||||")
    hypoxia_genes_present = [gene for gene in hypoxia_genes if gene in dataset.var_names]
    sc.pl.umap(dataset, color=hypoxia_genes_present, cmap=magma, use_raw=False, legend_loc='on data')

# Plot stress genes

In [None]:
# UMAP of every stress gene, per dataset. Genes are filtered against the var_names of the
# dataset being plotted (the Reynolds cell previously filtered against `tabib_fb.var_names`,
# which could request genes absent from `reynolds_fb` or drop genes present in it).
stress_plot_datasets = {'reynolds_fb': reynolds_fb, 'tabib_fb': tabib_fb, 'he_fb': he_fb, 'vors_fb': vors_fb, 'sb_fb': sb_fb}

for dataset_name, dataset in stress_plot_datasets.items():
    print(f"|||| {dataset_name.upper()} ||||")
    stress_genes_present = [gene for gene in stress_genes if gene in dataset.var_names]
    sc.pl.umap(dataset, color=stress_genes_present, cmap=magma, use_raw=False, legend_loc='on data')