In [None]:
# !wget https://raw.githubusercontent.com/alexmascension/revisit_reynolds_fb/master/requirements.txt
# !pip install -r requirements.txt

In [None]:
import scanpy as sc
import scanpy.external as sce

import pandas as pd
import numpy as np
import os
from functools import reduce
import gseapy as gp

import triku as tk

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

import scipy.stats as sts

import gc

from cellassign import assign_cats

In [None]:
# To print versions of imports 

import types

def imports():
    """Yield the ``__name__`` of every module object bound in the global namespace.

    The global values are snapshotted before the first name is yielded, so a
    name bound while the generator is being consumed cannot trigger
    ``RuntimeError: dictionary changed size during iteration``.
    """
    for val in list(globals().values()):
        if isinstance(val, types.ModuleType):
            yield val.__name__

excludes = ['builtins', 'types', 'sys']

imported_modules = [module for module in imports() if module not in excludes]

clean_modules = []

for module in imported_modules:

    sep = '.'  # to handle 'matplotlib.pyplot' cases
    rest = module.split(sep, 1)[0]
    clean_modules.append(rest)

changed_imported_modules = list(set(clean_modules))  # drop duplicates

pip_modules = !pip freeze  # you could also use `!conda list` with anaconda

for module in pip_modules:
    try:
        name, version = module.split('==')
        if name in changed_imported_modules:
            print(name + '\t' + version)
    except:
        pass

In [None]:
# Random seed, passed as `random_state` to the PCA, UMAP and Leiden calls below.
seed = 0

In [None]:
# Palettes for UMAP gene expression

# Sample 80 evenly spaced colours from matplotlib's 'magma' colormap.
_magma_source = plt.get_cmap('magma')
_magma_colors = list(map(_magma_source, np.linspace(0, 1, 80)))
# Replace the first colour with light grey.
_magma_colors[0] = (0.88, 0.88, 0.88, 1)
# Build the final colormap from the first 65 sampled colours only.
magma = mpl.colors.LinearSegmentedColormap.from_list("", _magma_colors[:65])

In [None]:
# Default figure resolution; raise this value for higher-quality figures.
plt.rcParams['figure.dpi'] = 70

# Regressing out hypoxia and stress signals

Approaches for regressing out the signal:
* Remove the genes of "interest" and rerun the processing
* Treat each condition as a batch and apply batch-effect correction
* Use a function like `regress_out`

In [None]:
# Working directories; they are created if missing and left untouched otherwise.
reynolds_dir, papers_dir = 'reynolds_2020', 'papers_genes_bad_quality'
for _directory in (reynolds_dir, papers_dir):
    os.makedirs(_directory, exist_ok=True)

In [None]:
def _load_gene_list(filename):
    """Return the gene symbols listed in ``filename`` as a NumPy array of strings.

    The file is read from the local ``papers_genes_bad_quality`` folder when it
    can be opened there; otherwise it is read from the GitHub repository.
    Only ``OSError`` (missing or unreadable file) triggers the fallback, so
    other problems such as a malformed file are no longer silently hidden.
    """
    try:
        return np.loadtxt('papers_genes_bad_quality/' + filename, dtype=str)
    except OSError:
        return np.loadtxt('https://raw.githubusercontent.com/alexmascension/revisit_reynolds_fb/master/papers_genes_bad_quality/' + filename, dtype=str)


# Each list falls back to the remote copy independently of the other.
stress_genes = _load_gene_list('stress_genes.txt')
hypoxia_genes = _load_gene_list('hypoxia_genes.txt')

In [None]:
def _read_reynolds_healthy(subset):
    """Read ``adata_reynolds_healthy_<subset>.h5ad`` from ``reynolds_dir``.

    The matching file of Zenodo record 4708700 is passed to ``sc.read`` as
    ``backup_url``.
    """
    filename = 'adata_reynolds_healthy_' + subset + '.h5ad'
    return sc.read(
        reynolds_dir + '/' + filename,
        backup_url='https://zenodo.org/record/4708700/files/' + filename + '?download=1',
    )


adata_reynolds_healthy_fb = _read_reynolds_healthy('fb')
adata_reynolds_healthy_ve = _read_reynolds_healthy('ve')
adata_reynolds_healthy_per = _read_reynolds_healthy('per')
adata_reynolds_healthy_krt = _read_reynolds_healthy('krt')
adata_reynolds_healthy_lymphoid = _read_reynolds_healthy('lymphoid')
adata_reynolds_healthy_APC = _read_reynolds_healthy('APC')

## Removing genes of interest

In [None]:
# Read the healthy-fibroblast object for the gene-removal approach
# (the Zenodo copy is given to scanpy as `backup_url`).
adata_reynolds_healthy_fb = sc.read(
    f'{reynolds_dir}/adata_reynolds_healthy_fb.h5ad',
    backup_url='https://zenodo.org/record/4708700/files/adata_reynolds_healthy_fb.h5ad?download=1',
)

In [None]:
# Rank genes for each `hypoxia_stress` group; the results are read back from
# `.uns['rank_genes_groups']` a few cells below.
sc.tl.rank_genes_groups(
    adata_reynolds_healthy_fb,
    groupby='hypoxia_stress',
)

In [None]:
# Plot the gene ranking stored on the object by `sc.tl.rank_genes_groups`.
sc.pl.rank_genes_groups(adata_reynolds_healthy_fb)

In [None]:
# Results stored by `sc.tl.rank_genes_groups`.
_ranked = adata_reynolds_healthy_fb.uns['rank_genes_groups']

# Keep the genes with a positive log fold change in the group and an adjusted
# p-value below the threshold (note: the literal 10e-15 equals 1e-14).
_hypoxia_selected = (_ranked['pvals_adj']['Hypoxia'] < 10e-15) & (_ranked['logfoldchanges']['Hypoxia'] > 0)
DEGs_hypoxia = _ranked['names']['Hypoxia'][_hypoxia_selected]

_stress_selected = (_ranked['pvals_adj']['Stress'] < 10e-15) & (_ranked['logfoldchanges']['Stress'] > 0)
DEGs_stress = _ranked['names']['Stress'][_stress_selected]

# Union of the loaded stress/hypoxia gene lists and the DEGs selected here.
list_renes_remove = set(stress_genes).union(hypoxia_genes, DEGs_hypoxia, DEGs_stress)

In [None]:
# Display how many distinct genes are flagged for removal.
len(list_renes_remove)

In [None]:
# Drop every gene flagged for removal. A boolean mask over `var_names` is used
# instead of a list of names, and `.copy()` turns the resulting view into an
# independent AnnData, because the following cell modifies the object in place.
_keep_gene = ~adata_reynolds_healthy_fb.var_names.isin(list(list_renes_remove))
adata_reynolds_healthy_fb = adata_reynolds_healthy_fb[:, _keep_gene].copy()

In [None]:
# Reprocess the gene-filtered object. The object is referenced directly
# instead of through `eval()` on its variable name inside a one-element loop.
# Every step below modifies `adata_reynolds_healthy_fb` in place.
sc.pp.filter_genes(adata_reynolds_healthy_fb, min_counts=1)
sc.pp.highly_variable_genes(adata_reynolds_healthy_fb)
sc.pp.pca(adata_reynolds_healthy_fb, random_state=seed, n_comps=30)
# Harmony integration across samples; writes the 'X_pca_harmony' representation.
sce.pp.harmony_integrate(adata_reynolds_healthy_fb, key='sample_id', epsilon_cluster=1e-4, epsilon_harmony=1e-5, max_iter_harmony=30)
# Neighbourhood size: half the square root of the number of cells.
sc.pp.neighbors(adata_reynolds_healthy_fb, n_neighbors=int(adata_reynolds_healthy_fb.n_obs ** 0.5 // 2), use_rep='X_pca_harmony')
sc.tl.umap(adata_reynolds_healthy_fb, min_dist=0.05, random_state=seed)

In [None]:
# UMAP coloured by the `hypoxia_stress` label and by `sample_id`.
sc.pl.umap(adata_reynolds_healthy_fb, color=['hypoxia_stress', 'sample_id'])

## Removing batch effects

### Harmony for batch, bbknn for sample

In [None]:
# Read the healthy-fibroblast object again for the Harmony + BBKNN approach.
adata_reynolds_healthy_fb = sc.read(
    f'{reynolds_dir}/adata_reynolds_healthy_fb.h5ad',
    backup_url='https://zenodo.org/record/4708700/files/adata_reynolds_healthy_fb.h5ad?download=1',
)

In [None]:
# Harmony on the `hypoxia_stress` label, then BBKNN across samples. The object
# is referenced directly instead of through `eval()` on its variable name
# inside a one-element loop. Every step modifies the object in place.
sc.pp.filter_genes(adata_reynolds_healthy_fb, min_counts=1)
sc.pp.highly_variable_genes(adata_reynolds_healthy_fb)
sc.pp.pca(adata_reynolds_healthy_fb, random_state=seed, n_comps=30)
# Harmony treats each `hypoxia_stress` category as a batch.
sce.pp.harmony_integrate(adata_reynolds_healthy_fb, key='hypoxia_stress', epsilon_cluster=1e-6, epsilon_harmony=1e-6, max_iter_harmony=30)
# BBKNN builds the neighbour graph per sample on the Harmony-corrected PCs.
sce.pp.bbknn(adata_reynolds_healthy_fb, metric='angular', batch_key='sample_id', neighbors_within_batch=2, use_rep='X_pca_harmony')
sc.tl.umap(adata_reynolds_healthy_fb, min_dist=0.05, random_state=seed)

In [None]:
# UMAP coloured by the `hypoxia_stress` label.
sc.pl.umap(adata_reynolds_healthy_fb, color=['hypoxia_stress'])

### Harmony for batch and for sample

In [None]:
# Read the healthy-fibroblast object again for the Harmony-only approach.
adata_reynolds_healthy_fb = sc.read(
    f'{reynolds_dir}/adata_reynolds_healthy_fb.h5ad',
    backup_url='https://zenodo.org/record/4708700/files/adata_reynolds_healthy_fb.h5ad?download=1',
)

In [None]:
# Combined batch key '<sample_id>_<hypoxia_stress>', one value per cell.
_obs = adata_reynolds_healthy_fb.obs
adata_reynolds_healthy_fb.obs['sample_stress'] = _obs['sample_id'].astype(str).str.cat(_obs['hypoxia_stress'].astype(str), sep='_')

In [None]:
# Harmony on the combined sample/condition key. The object is referenced
# directly instead of through `eval()` on its variable name inside a
# one-element loop. Every step modifies the object in place.
sc.pp.filter_genes(adata_reynolds_healthy_fb, min_counts=1)
sc.pp.highly_variable_genes(adata_reynolds_healthy_fb)
sc.pp.pca(adata_reynolds_healthy_fb, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(adata_reynolds_healthy_fb, key='sample_stress', epsilon_cluster=1e-4, epsilon_harmony=1e-5, max_iter_harmony=30)
# Neighbourhood size: half the square root of the number of cells.
sc.pp.neighbors(adata_reynolds_healthy_fb, n_neighbors=int(adata_reynolds_healthy_fb.n_obs ** 0.5 // 2), use_rep='X_pca_harmony')
sc.tl.umap(adata_reynolds_healthy_fb, min_dist=0.05, random_state=seed)

In [None]:
# UMAP coloured by the `hypoxia_stress` label.
sc.pl.umap(adata_reynolds_healthy_fb, color=['hypoxia_stress'])

### bbknn for batch and for sample

In [None]:
# Read the healthy-fibroblast object again for the BBKNN-only approach.
adata_reynolds_healthy_fb = sc.read(
    f'{reynolds_dir}/adata_reynolds_healthy_fb.h5ad',
    backup_url='https://zenodo.org/record/4708700/files/adata_reynolds_healthy_fb.h5ad?download=1',
)

In [None]:
# Combined batch key '<sample_id>_<hypoxia_stress>', one value per cell.
_obs = adata_reynolds_healthy_fb.obs
adata_reynolds_healthy_fb.obs['sample_stress'] = _obs['sample_id'].astype(str).str.cat(_obs['hypoxia_stress'].astype(str), sep='_')

In [None]:
# BBKNN on the combined sample/condition key. The object is referenced
# directly instead of through `eval()` on its variable name inside a
# one-element loop. Every step modifies the object in place.
sc.pp.filter_genes(adata_reynolds_healthy_fb, min_counts=1)
sc.pp.highly_variable_genes(adata_reynolds_healthy_fb)
sc.pp.pca(adata_reynolds_healthy_fb, random_state=seed, n_comps=30)
# No `use_rep` is passed here, unlike the Harmony + BBKNN section.
sce.pp.bbknn(adata_reynolds_healthy_fb, metric='angular', batch_key='sample_stress', neighbors_within_batch=2, set_op_mix_ratio=0.9)
sc.tl.umap(adata_reynolds_healthy_fb, min_dist=0.05, random_state=seed)

In [None]:
# UMAP coloured by the `hypoxia_stress` label.
sc.pl.umap(adata_reynolds_healthy_fb, color=['hypoxia_stress'])

## Regress out

In [None]:
# Read the healthy-fibroblast object again for the regress-out approach.
adata_reynolds_healthy_fb = sc.read(
    f'{reynolds_dir}/adata_reynolds_healthy_fb.h5ad',
    backup_url='https://zenodo.org/record/4708700/files/adata_reynolds_healthy_fb.h5ad?download=1',
)

In [None]:
# Keep only genes with at least one count; modifies the object in place.
sc.pp.filter_genes(adata_reynolds_healthy_fb, min_counts=1)

In [None]:
# Regress the two listed keys out of the expression matrix, single-threaded.
# NOTE(review): these keys are not created anywhere in this notebook, so they
# are assumed to already exist as `.obs` columns in the loaded file — confirm.
sc.pp.regress_out(adata_reynolds_healthy_fb, keys=['hypoxia_stress_Stress', 'hypoxia_stress_Hypoxia'], n_jobs=1)

In [None]:
# Embed the regressed data. The object is referenced directly instead of
# through `eval()` on its variable name inside a one-element loop.
# Every step modifies the object in place.
sc.pp.pca(adata_reynolds_healthy_fb, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(adata_reynolds_healthy_fb, key='sample_id', epsilon_cluster=1e-4, epsilon_harmony=1e-5, max_iter_harmony=30)
# Neighbourhood size: half the square root of the number of cells.
sc.pp.neighbors(adata_reynolds_healthy_fb, n_neighbors=int(adata_reynolds_healthy_fb.n_obs ** 0.5 // 2), use_rep='X_pca_harmony')
sc.tl.umap(adata_reynolds_healthy_fb, min_dist=0.05, random_state=seed)

In [None]:
# UMAP coloured by the `hypoxia_stress` label and by `sample_id`.
sc.pl.umap(adata_reynolds_healthy_fb, color=['hypoxia_stress', 'sample_id'])

## Regress out of normal / stress populations

In [None]:
# Read the healthy-fibroblast object again for the Normal / Stress comparison.
adata_reynolds_healthy_fb = sc.read(
    f'{reynolds_dir}/adata_reynolds_healthy_fb.h5ad',
    backup_url='https://zenodo.org/record/4708700/files/adata_reynolds_healthy_fb.h5ad?download=1',
)

In [None]:
# Keep only the cells labelled 'Normal' or 'Stress'. `.copy()` turns the view
# returned by the subsetting into an independent AnnData, because the
# following cells modify the object in place.
_is_normal_or_stress = adata_reynolds_healthy_fb.obs['hypoxia_stress'].isin(['Normal', 'Stress'])
adata_reynolds_healthy_fb = adata_reynolds_healthy_fb[_is_normal_or_stress].copy()

In [None]:
# Keep only genes with at least one count in the remaining cells.
sc.pp.filter_genes(adata_reynolds_healthy_fb, min_counts=1)

In [None]:
# Regress the listed key out of the expression matrix, single-threaded.
# NOTE(review): this key is not created anywhere in this notebook, so it is
# assumed to already exist as an `.obs` column in the loaded file — confirm.
sc.pp.regress_out(adata_reynolds_healthy_fb, keys=['hypoxia_stress_Stress'], n_jobs=1)

In [None]:
# Embed the regressed Normal / Stress subset. The object is referenced
# directly instead of through `eval()` on its variable name inside a
# one-element loop. Every step modifies the object in place.
sc.pp.pca(adata_reynolds_healthy_fb, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(adata_reynolds_healthy_fb, key='sample_id', epsilon_cluster=1e-4, epsilon_harmony=1e-5, max_iter_harmony=30)
# Neighbourhood size: half the square root of the number of cells.
sc.pp.neighbors(adata_reynolds_healthy_fb, n_neighbors=int(adata_reynolds_healthy_fb.n_obs ** 0.5 // 2), use_rep='X_pca_harmony')
sc.tl.umap(adata_reynolds_healthy_fb, min_dist=0.05, random_state=seed)

In [None]:
# UMAP coloured by the `hypoxia_stress` label and by `sample_id`.
sc.pl.umap(adata_reynolds_healthy_fb, color=['hypoxia_stress', 'sample_id'])

## Regress out of normal / hypoxia populations

In [None]:
# Read the healthy-fibroblast object again for the Normal / Hypoxia comparison.
adata_reynolds_healthy_fb = sc.read(
    f'{reynolds_dir}/adata_reynolds_healthy_fb.h5ad',
    backup_url='https://zenodo.org/record/4708700/files/adata_reynolds_healthy_fb.h5ad?download=1',
)

In [None]:
# Keep only the cells labelled 'Normal' or 'Hypoxia'. `.copy()` turns the view
# returned by the subsetting into an independent AnnData, because the
# following cells modify the object in place.
_is_normal_or_hypoxia = adata_reynolds_healthy_fb.obs['hypoxia_stress'].isin(['Normal', 'Hypoxia'])
adata_reynolds_healthy_fb = adata_reynolds_healthy_fb[_is_normal_or_hypoxia].copy()

In [None]:
# Keep only genes with at least one count in the remaining cells.
sc.pp.filter_genes(adata_reynolds_healthy_fb, min_counts=1)

In [None]:
# Regress the listed key out of the expression matrix, single-threaded.
# NOTE(review): this key is not created anywhere in this notebook, so it is
# assumed to already exist as an `.obs` column in the loaded file — confirm.
sc.pp.regress_out(adata_reynolds_healthy_fb, keys=['hypoxia_stress_Hypoxia'], n_jobs=1)

In [None]:
# Embed the regressed Normal / Hypoxia subset. The object is referenced
# directly instead of through `eval()` on its variable name inside a
# one-element loop. Every step modifies the object in place.
sc.pp.pca(adata_reynolds_healthy_fb, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(adata_reynolds_healthy_fb, key='sample_id', epsilon_cluster=1e-4, epsilon_harmony=1e-5, max_iter_harmony=30)
# Neighbourhood size: half the square root of the number of cells.
sc.pp.neighbors(adata_reynolds_healthy_fb, n_neighbors=int(adata_reynolds_healthy_fb.n_obs ** 0.5 // 2), use_rep='X_pca_harmony')
sc.tl.umap(adata_reynolds_healthy_fb, min_dist=0.05, random_state=seed)

In [None]:
# UMAP coloured by the `hypoxia_stress` label and by `sample_id`.
sc.pl.umap(adata_reynolds_healthy_fb, color=['hypoxia_stress', 'sample_id'])

In [None]:
# Marker genes per category (A1, A2, B1, B2, C), passed to `assign_cats` as
# `dict_cats` below.
# NOTE(review): 'WIF2' may be a typo (possibly 'WIF1') — confirm against the
# source of these marker lists.
dict_cat = {'A1': ['PI16', 'QPCT', 'SLPI', 'CCN5', 'WIF2', 'CPE', 'CTHRC1', 'MFAP5', 'PCOLCE2', 'SCARA5', 'TSPAN8'], 
            'A2': ['APCDD1', 'COL18A1', 'COMP', 'NKD2', 'F13A1', 'HSPB3', 'LEPR', 'TGFBI'], 
            'B1': ['CXCL2', 'MYC', 'C7', 'SPSB1', 'ITM2A'], 
            'B2': ['SOCS3', 'CCL19', 'CD74', 'RARRES2', 'CCDC146', 'IGFBP3', 'TNFSF13B'], 
            'C': ['CRABP1', 'PLXDC1', 'RSPO4', 'ASPN', 'F2R', 'POSTN', 'TNN']}

In [None]:
# Delete the per-gene annotation table of the object.
# NOTE(review): the reason is not visible here — presumably needed before the
# `assign_cats` / plotting cells below; confirm.
del adata_reynolds_healthy_fb.var

In [None]:
# Leiden clustering with a fixed seed; the 'leiden' labels are used as
# `column_groupby` and as a plot colour in the next cell.
# NOTE(review): in file order this runs on the object left by the
# Normal / Hypoxia section above — confirm that is the intended input.
sc.tl.leiden(adata_reynolds_healthy_fb, resolution=1.4, random_state=seed)

In [None]:
# Run `assign_cats` on the Leiden clusters with the marker sets in `dict_cat`;
# results are keyed by 'fb_JID'.
assign_cats(
    adata_reynolds_healthy_fb,
    dict_cats=dict_cat,
    column_groupby='leiden',
    intermediate_states=True,
    diff=0.05,
    min_score=0.4,
    key_added='fb_JID',
)

# Clusters, assigned category and the per-category 'fb_JID_*' columns
# (presumably added by `assign_cats` — verify).
_umap_colors = [
    'leiden', 'fb_JID', 'fb_JID_max',
    'fb_JID_A1', 'fb_JID_A2', 'fb_JID_B1', 'fb_JID_B2', 'fb_JID_C',
]
sc.pl.umap(adata_reynolds_healthy_fb, color=_umap_colors, legend_loc='on data', cmap=magma, use_raw=False)

# Seeing the percentage of counts assigned to hypoxia / stress populations

In [None]:
# Read the healthy-fibroblast object again for the count-percentage analysis.
adata_reynolds_healthy_fb = sc.read(
    f'{reynolds_dir}/adata_reynolds_healthy_fb.h5ad',
    backup_url='https://zenodo.org/record/4708700/files/adata_reynolds_healthy_fb.h5ad?download=1',
)

In [None]:
# Boolean masks over the genes, aligned with `var_names`: True where the gene
# is in the stress / hypoxia list. `Index.isin` replaces the per-gene `in`
# scan over the arrays.
bool_stress = adata_reynolds_healthy_fb.var_names.isin(stress_genes)
bool_hypoxia = adata_reynolds_healthy_fb.var_names.isin(hypoxia_genes)
# Genes in neither list. This is now a real boolean mask rather than the
# integer 0/1 array produced by `1 - (...)`, so it can be used for indexing;
# `.sum()` still gives the number of such genes.
bool_normal = ~(bool_stress | bool_hypoxia)

In [None]:
# Number of genes that are in neither the stress nor the hypoxia list.
bool_normal.sum()

In [None]:
# Subset of the object restricted to the stress genes (shown as the cell
# output). The original comprehension placed `else` after the `for` clause,
# which is a SyntaxError; a boolean mask over `var_names` selects the same genes.
adata_reynolds_healthy_fb[:, adata_reynolds_healthy_fb.var_names.isin(stress_genes)]

In [None]:
# TODO: check what percentage of the counts in the stress / hypoxia cells
# belong to the marker genes.