In [None]:
# UNCOMMENT THIS TO INSTALL STUFF!
# !wget https://raw.githubusercontent.com/alexmascension/revisit_reynolds_fb/master/requirements.txt
# !pip install -r requirements.txt

In [None]:
import scanpy as sc
import scanpy.external as sce

import pandas as pd
import numpy as np
import os
from functools import reduce
import gseapy as gp

import triku as tk

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

import scipy.stats as sts

In [None]:
# To print versions of imports 

import types

def imports():
    """Yield the ``__name__`` of every module object bound in the global namespace."""
    module_objects = (obj for obj in globals().values() if isinstance(obj, types.ModuleType))
    for module_obj in module_objects:
        yield module_obj.__name__

excludes = ['builtins', 'types', 'sys']

imported_modules = [module for module in imports() if module not in excludes]

clean_modules = []

for module in imported_modules:

    sep = '.'  # to handle 'matplotlib.pyplot' cases
    rest = module.split(sep, 1)[0]
    clean_modules.append(rest)

changed_imported_modules = list(set(clean_modules))  # drop duplicates

pip_modules = !pip freeze  # you could also use `!conda list` with anaconda

for module in pip_modules:
    try:
        name, version = module.split('==')
        if name in changed_imported_modules:
            print(name + '\t' + version)
    except:
        pass

In [None]:
seed = 0

In [None]:
# Palettes for UMAP gene expression

# Sample 80 evenly spaced RGBA colours from matplotlib's 'magma' colormap.
_magma_base = plt.get_cmap('magma')
magma = [_magma_base(position) for position in np.linspace(0, 1, 80)]
# The lowest value is drawn in light grey instead of the first magma colour.
magma[0] = (0.88, 0.88, 0.88, 1)
# Only the first 65 of the 80 sampled colours are kept in the final colormap.
magma = mpl.colors.LinearSegmentedColormap.from_list("", magma[:65])

In [None]:
selected_enr_cols = ['Gene_set', 'Term', 'Adjusted P-value', 'Odds Ratio', 'Genes']

In [None]:
mpl.rcParams['figure.dpi'] = 70  # Set this to make higher quality pictures

In [None]:
def assign_cats(adata, dict_cats, column_groupby='leiden', quantile_gene_sel=0.7, do_return=False, intermediate_states=False, diff=0.05, 
                key_added='assigned_cats', min_score=0.6, others_name='unassigned'):
    """
    Assign each cluster in ``adata.obs[column_groupby]`` to one of the categories of ``dict_cats``.

    This function uses a set of genes assigned to different categories so that leiden clusters can be
    assigned to one of these categories. For example, to categorize fibroblasts from pericytes,
    endothelial cells, or cells with high mitochondrial content.
    It could be done with each cell individually, but it is better to use clusters to discern the
    different categories because the method, although efficient, can sometimes be noisy due to the
    noisiness of the sc datasets.

    For every gene the expression is multiplied by ``adata.obsp['connectivities']`` and divided by
    ``n_neighbors``; the positive values are then replaced by their rank and scaled by the largest rank.
    The per-cell score of a category is the mean over its genes (genes missing from ``adata`` count
    as 0), and the per-cluster score is the ``quantile_gene_sel`` quantile of the per-cell scores.

    Parameters
    ----------
    adata
        Annotated data object. ``adata.obsp['connectivities']`` and
        ``adata.uns['neighbors']['params']['n_neighbors']`` must exist.
    dict_cats
        Mapping ``category name -> list of gene names``.
    column_groupby
        Column of ``adata.obs`` holding the cluster labels.
    quantile_gene_sel
        Quantile of the per-cell scores used as the score of a cluster.
    do_return
        If True, return the cluster x category score table.
    intermediate_states
        If True, every category whose score is within ``diff`` of the best one is kept and the
        names are joined with ``'/'``.
    diff
        Tolerance used when ``intermediate_states`` is True.
    key_added
        Name of the ``adata.obs`` column with the final label; also the prefix of the auxiliary
        columns (``_std``, ``_mean``, ``_max``, ``_CV`` and one per category).
    min_score
        Clusters whose best score is below this value are labelled ``others_name``.
    others_name
        Label given to clusters below ``min_score``.

    Returns
    -------
    The per-cluster score table (``pandas.DataFrame``) if ``do_return`` is True, otherwise None.
    ``adata.obs`` is modified in place in both cases.
    """

    categories = list(dict_cats.keys())
    connectivities = adata.obsp['connectivities']
    # Loop-invariant, so it is looked up once; a missing neighbour graph now raises a clear
    # KeyError instead of being reported as a missing gene.
    n_neighbors = adata.uns['neighbors']['params']['n_neighbors']

    for cat in categories:
        mat_cat = np.zeros((len(adata), len(dict_cats[cat])), dtype=float)

        for gene_idx, gene in enumerate(dict_cats[cat]):
            if gene not in adata.var_names:
                # The column stays at 0, so a missing gene lowers the category mean.
                print(f'Gene {gene} is not on the list')
                continue

            # Neighbourhood-smoothed expression of the gene.
            smoothed = connectivities @ adata[:, gene].X
            if hasattr(smoothed, 'toarray'):  # sparse result -> dense
                smoothed = smoothed.toarray()
            gene_score = np.asarray(smoothed, dtype=float).ravel() / n_neighbors

            # Replace positive values by their rank (0 .. k-1) and scale by the largest rank.
            expressed = gene_score > 0
            gene_score[expressed] = np.argsort(np.argsort(gene_score[expressed]))
            # Guard against 0/0: with no (or a single) positive cell the largest rank is 0 and
            # the division would fill the whole column with NaN.
            if gene_score.size and gene_score.max() > 0:
                gene_score /= gene_score.max()

            mat_cat[:, gene_idx] = gene_score

        adata.obs[cat] = mat_cat.mean(axis=1)

    score_per_cluster = adata.obs[[column_groupby] + categories].groupby(column_groupby).quantile(quantile_gene_sel)
    cluster_of_cell = adata.obs[column_groupby]

    def _per_cell(per_cluster):
        """Broadcast a per-cluster Series to one value per cell."""
        lookup = dict(zip(per_cluster.index, per_cluster.values))
        return [lookup[cluster] for cluster in cluster_of_cell]

    adata.obs[f'{key_added}_std'] = _per_cell(score_per_cluster.std(axis=1))
    adata.obs[f'{key_added}_mean'] = _per_cell(score_per_cluster.mean(axis=1))
    adata.obs[f'{key_added}_max'] = _per_cell(score_per_cluster.max(axis=1))
    # NOTE(review): this is mean / std, i.e. the inverse of the usual coefficient of variation
    # (std / mean). Kept as is so existing values of the column do not change -- confirm intent.
    adata.obs[f'{key_added}_CV'] = adata.obs[f'{key_added}_mean'] / adata.obs[f'{key_added}_std']

    for cat in score_per_cluster.columns:
        adata.obs[f'{key_added}_{cat}'] = _per_cell(score_per_cluster[cat])

    if intermediate_states:
        # For each cluster, identify which categories are close to the highest one and merge their names.
        list_names_cats_per_cluster = []
        for cluster in score_per_cluster.index:
            scores_cluster = score_per_cluster.loc[cluster]
            close_to_best = scores_cluster[scores_cluster > scores_cluster.max() - diff]
            list_names_cats_per_cluster.append('/'.join(close_to_best.index))

        final_cat_dict = dict(zip(score_per_cluster.index, list_names_cats_per_cluster))
    else:
        best_category = score_per_cluster.idxmax(axis=1)
        final_cat_dict = dict(zip(best_category.index, best_category.values))

    adata.obs[key_added] = [str(final_cat_dict[cluster]) for cluster in cluster_of_cell]

    # Single .loc assignment instead of chained indexing, which is a silent no-op under
    # pandas copy-on-write.
    adata.obs.loc[adata.obs[f'{key_added}_max'] < min_score, key_added] = others_name

    if do_return:
        return score_per_cluster

# Reynolds et al. 2020 dataset download and preprocess
In this section we are going to download and process the anndata files for the 5 healthy samples (S1 to S5). We first use the processed files from Reynolds et al., because they contain the main fb populations we are interested in (FB1-3). With those, we can partially replicate the analysis. However, the data richness is not that good (the UMAPs are more *blobby*), so we also preprocessed the FASTQ files on our own and use the resulting adatas.

Once we have our own anndatas, the preprocessing is the following:
* Assign FB types based on Reynolds adatas to our adatas. Some cells will be unassigned.
* QC metrics 
* Filter genes (min counts = 30)
* Set raw X
* Normalize and log1p
* PCA + triku + neighbors + umap + leiden
* Use  `assign_cats` to assign leiden clusters to cell populations with selected markers (markers for each dataset may vary!)
* Filter adata to retain only fibroblasts
* Filter genes to remove 0 counts
* PCA + triku + neighbors + umap + leiden [leiden is not used here but may be used later]
* Check if strange populations appear and filter them in `assign_cats`, then repeat the last steps.

In [None]:
os.getcwd()

In [None]:
reynolds_dir = 'reynolds_2020'
os.makedirs(reynolds_dir, exist_ok=True)


papers_dir = 'papers_genes_bad_quality'
os.makedirs(papers_dir, exist_ok=True)

### Making and saving the fb healthy dataset to zenodo

In [None]:
# adata_reynolds = sc.read('submission_210120.h5ad', backup_url='https://zenodo.org/record/4536165/files/submission_210120.h5ad')
# adata_reynolds_fb = adata_reynolds[(adata_reynolds.obs['full_clustering'].isin(['F1', 'F2', 'F3'])) & 
#                                    (adata_reynolds.obs['Status'] == 'Healthy')]
# sc.pp.filter_genes(adata_reynolds_fb, min_counts=100)
# del adata_reynolds_fb.var

# for obs in ['mad_prd', 'Status', 'Site', 'Tissue', 'Enrichment', 'Location', 'Sex', 'Age', 'stage']:
#     del adata_reynolds_fb.obs[obs]
    
# adata_reynolds_fb.write_h5ad(reynolds_dir + '/reynolds_2020_fb_healthy.h5ad')

### Direct h5ad download

In [None]:
adata_reynolds_fb_healthy = sc.read(reynolds_dir + '/reynolds_2020_fb_healthy.h5ad', 
                                    backup_url='https://zenodo.org/record/4605340/files/reynolds_2020_fb_healthy.h5ad?download=1')

In [None]:
sc.pp.filter_genes(adata_reynolds_fb_healthy, min_counts=50)

In [None]:
# NOTE(review): log1p is applied BEFORE normalize_total here, whereas the loom-based
# branches of this notebook normalise first and log-transform afterwards. The order is
# left untouched because every downstream result of this object depends on it --
# confirm whether it is intentional (e.g. depending on what X of the downloaded file contains).
sc.pp.log1p(adata_reynolds_fb_healthy)
sc.pp.normalize_total(adata_reynolds_fb_healthy)

In [None]:
df_batches = pd.DataFrame(np.unique(adata_reynolds_fb_healthy.obs['sample_id'], return_counts=True)).transpose()

In [None]:
df_batches.sort_values(by=1, ascending=False)

In [None]:
selected_samples = df_batches[df_batches[1] > 50][0].values

In [None]:
# Keep only the selected samples. `.copy()` turns the resulting view into a real AnnData,
# because later cells modify this object in place (filter_genes, pca, bbknn, ...).
adata_reynolds_fb_healthy = adata_reynolds_fb_healthy[adata_reynolds_fb_healthy.obs['sample_id'].isin(selected_samples)].copy()

In [None]:
adata_reynolds_fb_healthy

In [None]:
sc.pp.filter_genes(adata_reynolds_fb_healthy, min_counts=1)

In [None]:
sc.pp.pca(adata_reynolds_fb_healthy, random_state=seed, n_comps=30)
sce.pp.bbknn(adata_reynolds_fb_healthy, metric='angular', batch_key='sample_id')
tk.tl.triku(adata_reynolds_fb_healthy, use_raw=False)

In [None]:
sc.tl.umap(adata_reynolds_fb_healthy, min_dist=0.1, random_state=seed)

In [None]:
sc.tl.leiden(adata_reynolds_fb_healthy, resolution=1.5, random_state=seed)

In [None]:
sc.pl.umap(adata_reynolds_fb_healthy, color=['leiden', 'sample_id', 'full_clustering'], legend_loc='on data')

In [None]:
sc.pl.umap(adata_reynolds_fb_healthy, color=['APCDD1', 'COL18A1', 'COMP', 'SLPI', 'WIF1'], cmap=magma, use_raw=False)

In [None]:
sc.pl.umap(adata_reynolds_fb_healthy, color=['MT2A', 'CCL19', 'CCL2', 'CD46'], cmap=magma, use_raw=False)

In [None]:
sc.pl.umap(adata_reynolds_fb_healthy, color=['POSTN', 'COMP', 'COCH'], cmap=magma, use_raw=False)

## 4820STDY7388991 (S1)

### Reynolds preprocessed

In [None]:
# Cells of sample S1 only. `.copy()` avoids working on a view of the pooled object,
# since the next cells filter genes and store PCA/neighbours on this subset.
adata_reynolds_fb_4820STDY7388991 = adata_reynolds_fb_healthy[adata_reynolds_fb_healthy.obs['sample_id'].isin(['4820STDY7388991'])].copy()

In [None]:
adata_reynolds_fb_4820STDY7388991

In [None]:
sc.pp.filter_genes(adata_reynolds_fb_4820STDY7388991, min_counts=1)
# Drop the gene 'C3'; `.copy()` materialises the result so the following
# in-place steps (pca, neighbors, umap, leiden) do not write into a view.
adata_reynolds_fb_4820STDY7388991 = adata_reynolds_fb_4820STDY7388991[:, adata_reynolds_fb_4820STDY7388991.var_names != 'C3'].copy()

In [None]:
sc.pp.pca(adata_reynolds_fb_4820STDY7388991, random_state=seed, n_comps=30)
# Neighbours are computed BEFORE triku, as in the other samples: triku relies on the
# neighbour graph, and running it first would use the graph inherited from the pooled
# object instead of the sample-specific cosine graph.
sc.pp.neighbors(adata_reynolds_fb_4820STDY7388991, metric='cosine', random_state=seed)
tk.tl.triku(adata_reynolds_fb_4820STDY7388991, use_raw=False)

In [None]:
sc.tl.umap(adata_reynolds_fb_4820STDY7388991, min_dist=0.3, random_state=seed)

In [None]:
sc.tl.leiden(adata_reynolds_fb_4820STDY7388991, resolution=1, random_state=seed)

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7388991, color=['leiden', 'sample_id', 'full_clustering'], legend_loc='on data')

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7388991, color=['APCDD1', 'COL18A1', 'COMP', 'SLPI', 'WIF1'], cmap=magma, use_raw=False)

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7388991, color=['MT2A', 'CCL19', 'CCL2', 'CD46'], cmap=magma, use_raw=False)

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7388991, color=['POSTN', 'COMP', 'COCH'], cmap=magma, use_raw=False)

### Direct loom

In [None]:
# Read the S1 loom file (downloaded from Zenodo if it is not available locally).
adata_reynolds_fb_4820STDY7388991_loom = sc.read('reynolds_2020/reynolds_2020_0_4820STDY7388991_s1_dermis_fibroblasts.loom',
                                                 backup_url='https://zenodo.org/record/4596375/files/reynolds_2020_0_4820STDY7388991_s1_dermis_fibroblasts.loom')
adata_reynolds_fb_4820STDY7388991_loom.var_names_make_unique()

# Drop the gene 'C3'; `.copy()` materialises the result because obs_names and other
# attributes of this object are overwritten in the following cells.
adata_reynolds_fb_4820STDY7388991_loom = adata_reynolds_fb_4820STDY7388991_loom[:, adata_reynolds_fb_4820STDY7388991_loom.var_names != 'C3'].copy()

In [None]:
adata_reynolds_fb_4820STDY7388991_loom.obs_names = [f"{i.split('_')[-1]}-1-{i.split('_')[1]}" for i in adata_reynolds_fb_4820STDY7388991_loom.obs_names]

In [None]:
# Transfer the Reynolds et al. labels ('full_clustering') to the cells that are present in
# both objects; the remaining cells keep the placeholder '-'.
# Index.intersection replaces `Index & Index`, whose set-operation behaviour is deprecated
# and no longer available in pandas 2.
shared_cells = adata_reynolds_fb_4820STDY7388991.obs_names.intersection(adata_reynolds_fb_4820STDY7388991_loom.obs_names)
df_fb_type = pd.Series('-', index=adata_reynolds_fb_4820STDY7388991_loom.obs_names)
df_fb_type.loc[shared_cells] = adata_reynolds_fb_4820STDY7388991.obs['full_clustering'].loc[shared_cells]
adata_reynolds_fb_4820STDY7388991_loom.obs['science_clustering'] = df_fb_type

In [None]:
sc.pp.calculate_qc_metrics(adata_reynolds_fb_4820STDY7388991_loom, inplace=True)

In [None]:
sc.pl.violin(adata_reynolds_fb_4820STDY7388991_loom, ['n_genes_by_counts'])
sc.pl.violin(adata_reynolds_fb_4820STDY7388991_loom, ['log1p_total_counts'])

In [None]:
sc.pp.filter_genes(adata_reynolds_fb_4820STDY7388991_loom, min_counts=30)
adata_reynolds_fb_4820STDY7388991_loom.raw = adata_reynolds_fb_4820STDY7388991_loom

In [None]:
sc.pp.normalize_per_cell(adata_reynolds_fb_4820STDY7388991_loom)
sc.pp.log1p(adata_reynolds_fb_4820STDY7388991_loom)

In [None]:
sc.pp.pca(adata_reynolds_fb_4820STDY7388991_loom, random_state=seed, n_comps=30)
sc.pp.neighbors(adata_reynolds_fb_4820STDY7388991_loom, metric='cosine', random_state=seed)
tk.tl.triku(adata_reynolds_fb_4820STDY7388991_loom, use_raw=False)

In [None]:
sc.tl.umap(adata_reynolds_fb_4820STDY7388991_loom, min_dist=0.3, random_state=seed)
sc.tl.leiden(adata_reynolds_fb_4820STDY7388991_loom, resolution=3.2, random_state=seed)

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7388991_loom, color=['leiden', 'science_clustering'], legend_loc='on data')

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7388991_loom, color=['LUM', 'PDGFRA', 'VIM', 'DCN', 'COL1A1', 'SFRP2', 'APOE', 'POSTN'], legend_loc='on data', cmap=magma)

In [None]:
sc.tl.rank_genes_groups(adata_reynolds_fb_4820STDY7388991_loom, groupby='leiden')
sc.pl.rank_genes_groups(adata_reynolds_fb_4820STDY7388991_loom)

In [None]:
dict_cat = {'fb': ['LUM', 'PDGFRA', 'COL1A1'], 'peri': ['RGS5', 'MYL9', 'NDUFA4L2'], 'eritro': ['HBB', 'HBA2', 'HBA1'], 
            'immune': ['TPSB2', 'TPSAB1', 'HLA-DRA', 'FCER1G', 'CD74'], 'melanocyte': ['PMEL', 'MLANA'], 
            'mt': ['MTND2P28', 'MTND4P12', 'MTCO1P40', 'ADAM33', 'RN7SL2', 'MTRNR2L6']}

assign_cats(adata_reynolds_fb_4820STDY7388991_loom, dict_cat, min_score=0.05)

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7388991_loom, color=['leiden', 'science_clustering', 'assigned_cats'], legend_loc='on data', cmap=magma)

In [None]:
# Keep only the cells whose cluster was labelled 'fb' by assign_cats. `.copy()` detaches the
# subset from the full object before it is filtered and re-embedded in place.
adata_reynolds_fb_4820STDY7388991_loom_fb = adata_reynolds_fb_4820STDY7388991_loom[adata_reynolds_fb_4820STDY7388991_loom.obs['assigned_cats'] == 'fb'].copy()

In [None]:
sc.pp.filter_genes(adata_reynolds_fb_4820STDY7388991_loom_fb, min_counts=1)

In [None]:
sc.pp.pca(adata_reynolds_fb_4820STDY7388991_loom_fb, random_state=seed, n_comps=30)
sc.pp.neighbors(adata_reynolds_fb_4820STDY7388991_loom_fb, metric='cosine', random_state=seed)
tk.tl.triku(adata_reynolds_fb_4820STDY7388991_loom_fb, use_raw=False)

In [None]:
sc.tl.umap(adata_reynolds_fb_4820STDY7388991_loom_fb, min_dist=0.2, random_state=seed)
sc.tl.leiden(adata_reynolds_fb_4820STDY7388991_loom_fb, resolution=1, random_state=seed)

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7388991_loom_fb, color=['leiden', 'science_clustering'], cmap=magma, use_raw=False, legend_loc='on data')

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7388991_loom_fb, color=['leiden', 'SFRP2', 'PI16', 'SLPI', 'WIF1', 'COL18A1', 'COMP', 
                                               'APOE', 'CCL2',  'ITM2A', 'SPSB1', 
                                               'CCDC146', 'CCL19', 'CD74', 
                                               'DKK3', 'TNN', 'SFRP1', 'POSTN'], cmap=magma, use_raw=False, legend_loc='on data')

## 4820STDY7388999 (S2)

### Reynolds preprocessed

In [None]:
# Cells of sample S2 only. `.copy()` avoids working on a view of the pooled object,
# since the next cells filter genes and store PCA/neighbours on this subset.
adata_reynolds_fb_4820STDY7388999 = adata_reynolds_fb_healthy[adata_reynolds_fb_healthy.obs['sample_id'].isin(['4820STDY7388999'])].copy()

In [None]:
adata_reynolds_fb_4820STDY7388999

In [None]:
sc.pp.filter_genes(adata_reynolds_fb_4820STDY7388999, min_counts=1)
# Drop the gene 'C3'; `.copy()` materialises the result so the following
# in-place steps (pca, neighbors, umap, leiden) do not write into a view.
adata_reynolds_fb_4820STDY7388999 = adata_reynolds_fb_4820STDY7388999[:, adata_reynolds_fb_4820STDY7388999.var_names != 'C3'].copy()

In [None]:
sc.pp.pca(adata_reynolds_fb_4820STDY7388999, random_state=seed, n_comps=30)
sc.pp.neighbors(adata_reynolds_fb_4820STDY7388999, metric='cosine', random_state=seed)
tk.tl.triku(adata_reynolds_fb_4820STDY7388999, use_raw=False)

In [None]:
sc.tl.umap(adata_reynolds_fb_4820STDY7388999, min_dist=0.3, random_state=seed)

In [None]:
sc.tl.leiden(adata_reynolds_fb_4820STDY7388999, resolution=1, random_state=seed)

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7388999, color=['leiden', 'sample_id', 'full_clustering'], legend_loc='on data')

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7388999, color=['APCDD1', 'COL18A1', 'COMP', 'SLPI', 'WIF1'], cmap=magma, use_raw=False)

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7388999, color=['MT2A', 'CCL19', 'CCL2', 'CD46'], cmap=magma, use_raw=False)

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7388999, color=['POSTN', 'COMP', 'COCH'], cmap=magma, use_raw=False)

### Direct loom

In [None]:
# Read the S2 loom file (downloaded from Zenodo if it is not available locally).
adata_reynolds_fb_4820STDY7388999_loom = sc.read('reynolds_2020/reynolds_2020_8_4820STDY7388999_s2_dermis_fibroblasts.loom',
                                                 backup_url='https://zenodo.org/record/4596375/files/reynolds_2020_8_4820STDY7388999_s2_dermis_fibroblasts.loom')

adata_reynolds_fb_4820STDY7388999_loom.var_names_make_unique()
# Drop the gene 'C3'; `.copy()` materialises the result because obs_names and other
# attributes of this object are overwritten in the following cells.
adata_reynolds_fb_4820STDY7388999_loom = adata_reynolds_fb_4820STDY7388999_loom[:, adata_reynolds_fb_4820STDY7388999_loom.var_names != 'C3'].copy()

In [None]:
adata_reynolds_fb_4820STDY7388999_loom.obs_names = [f"{i.split('_')[-1]}-1-{i.split('_')[1]}" for i in adata_reynolds_fb_4820STDY7388999_loom.obs_names]

In [None]:
# Transfer the Reynolds et al. labels ('full_clustering') to the cells that are present in
# both objects; the remaining cells keep the placeholder 'Other'.
# Index.intersection replaces `Index & Index`, whose set-operation behaviour is deprecated
# and no longer available in pandas 2.
shared_cells = adata_reynolds_fb_4820STDY7388999.obs_names.intersection(adata_reynolds_fb_4820STDY7388999_loom.obs_names)
df_fb_type = pd.Series('Other', index=adata_reynolds_fb_4820STDY7388999_loom.obs_names)
df_fb_type.loc[shared_cells] = adata_reynolds_fb_4820STDY7388999.obs['full_clustering'].loc[shared_cells]
adata_reynolds_fb_4820STDY7388999_loom.obs['science_clustering'] = df_fb_type

In [None]:
sc.pp.calculate_qc_metrics(adata_reynolds_fb_4820STDY7388999_loom, inplace=True)

In [None]:
sc.pl.violin(adata_reynolds_fb_4820STDY7388999_loom, ['n_genes_by_counts'])
sc.pl.violin(adata_reynolds_fb_4820STDY7388999_loom, ['log1p_total_counts'])

In [None]:
sc.pp.filter_genes(adata_reynolds_fb_4820STDY7388999_loom, min_counts=30)
adata_reynolds_fb_4820STDY7388999_loom.raw = adata_reynolds_fb_4820STDY7388999_loom

In [None]:
sc.pp.normalize_per_cell(adata_reynolds_fb_4820STDY7388999_loom)
sc.pp.log1p(adata_reynolds_fb_4820STDY7388999_loom)

In [None]:
sc.pp.pca(adata_reynolds_fb_4820STDY7388999_loom, random_state=seed, n_comps=30)
sc.pp.neighbors(adata_reynolds_fb_4820STDY7388999_loom, metric='cosine', random_state=seed)
tk.tl.triku(adata_reynolds_fb_4820STDY7388999_loom, use_raw=False)

In [None]:
sc.tl.umap(adata_reynolds_fb_4820STDY7388999_loom, min_dist=0.3, random_state=seed)
sc.tl.leiden(adata_reynolds_fb_4820STDY7388999_loom, resolution=0.5, random_state=seed)

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7388999_loom, color=['leiden', 'science_clustering'], legend_loc='on data')

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7388999_loom, color=['LUM', 'PDGFRA', 'VIM', 'COL1A1', 'SFRP2', 'APOE', 'POSTN'], legend_loc='on data', cmap=magma)

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7388999_loom, color=['leiden', 'SFRP2', 'PI16', 'SLPI', 'WIF1', 'COL18A1', 'COMP', 
                                               'APOE', 'CCL2',  'ITM2A', 'SPSB1', 
                                               'CCDC146', 'CCL19', 'CD74', 
                                               'DKK3', 'TNN', 'SFRP1'], cmap=magma, use_raw=False, legend_loc='on data')

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7388999_loom, color=['leiden', 
                                                          'DMKN', 'KRT1'], cmap=magma, use_raw=False, legend_loc='on data')

In [None]:
dict_cat = {'fb': ['LUM', 'PDGFRA', 'COL1A1', 'SFRP2', 'CCL19'], 'peri': ['RGS5', 'MYL9', 'NDUFA4L2'], 'eritro': ['HBB', 'HBA2', 'HBA1'], 
            'immune': ['TPSB2', 'TRAC', 'TPSAB1', 'HLA-DRA', 'FCER1G', 'CD74'], 'endo': ['CLDN5', 'PECAM1'], 'kerato': ['DMKN', 'KRT1'],
            'mt': ['MTND2P28', 'MTND4P12', 'MTCO1P40', 'ADAM33', 'RN7SL2', 'MTRNR2L6']}

assign_cats(adata_reynolds_fb_4820STDY7388999_loom, dict_cat, min_score=0.05)

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7388999_loom, color=['leiden', 'science_clustering', 'assigned_cats'], legend_loc='on data', cmap=magma)

In [None]:
sc.tl.rank_genes_groups(adata_reynolds_fb_4820STDY7388999_loom, groupby='leiden')
sc.pl.rank_genes_groups(adata_reynolds_fb_4820STDY7388999_loom)

In [None]:
# Keep only the cells whose cluster was labelled 'fb' by assign_cats. `.copy()` materialises
# the subset before it is filtered and re-embedded in place.
adata_reynolds_fb_4820STDY7388999_loom = adata_reynolds_fb_4820STDY7388999_loom[adata_reynolds_fb_4820STDY7388999_loom.obs['assigned_cats'] == 'fb'].copy()

In [None]:
sc.pp.filter_genes(adata_reynolds_fb_4820STDY7388999_loom, min_counts=1)

In [None]:
sc.pp.pca(adata_reynolds_fb_4820STDY7388999_loom, random_state=seed, n_comps=30)
sc.pp.neighbors(adata_reynolds_fb_4820STDY7388999_loom, metric='cosine', random_state=seed)
tk.tl.triku(adata_reynolds_fb_4820STDY7388999_loom, use_raw=False)

In [None]:
sc.tl.umap(adata_reynolds_fb_4820STDY7388999_loom, min_dist=0.05, random_state=seed)
sc.tl.leiden(adata_reynolds_fb_4820STDY7388999_loom, resolution=1.2, random_state=seed)

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7388999_loom, color=['leiden', 'science_clustering'], cmap=magma, use_raw=False, legend_loc='on data')

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7388999_loom, color=['leiden', 'SFRP2', 'PI16', 'SLPI', 'WIF1', 'COL18A1', 'COMP', 
                                               'APOE', 'CCL2',  'ITM2A', 'SPSB1', 
                                               'CCDC146', 'CCL19', 'CD74', 
                                               'DKK3', 'TNN', 'SFRP1', 'EDNRB', 'IGFBP7'], cmap=magma, use_raw=False, legend_loc='on data')

In [None]:
sc.tl.rank_genes_groups(adata_reynolds_fb_4820STDY7388999_loom, groupby='leiden')

In [None]:
sc.pl.rank_genes_groups(adata_reynolds_fb_4820STDY7388999_loom)

## 4820STDY7389007 (S3)

### Reynolds preprocessed

In [None]:
# Cells of sample S3 only. `.copy()` avoids working on a view of the pooled object,
# since the next cells filter genes and store PCA/neighbours on this subset.
adata_reynolds_fb_4820STDY7389007 = adata_reynolds_fb_healthy[adata_reynolds_fb_healthy.obs['sample_id'].isin(['4820STDY7389007'])].copy()

In [None]:
adata_reynolds_fb_4820STDY7389007

In [None]:
sc.pp.filter_genes(adata_reynolds_fb_4820STDY7389007, min_counts=1)
# Drop the gene 'C3'; `.copy()` materialises the result so the following
# in-place steps (pca, neighbors, umap, leiden) do not write into a view.
adata_reynolds_fb_4820STDY7389007 = adata_reynolds_fb_4820STDY7389007[:, adata_reynolds_fb_4820STDY7389007.var_names != 'C3'].copy()

In [None]:
sc.pp.pca(adata_reynolds_fb_4820STDY7389007, random_state=seed, n_comps=30)
sc.pp.neighbors(adata_reynolds_fb_4820STDY7389007, metric='cosine', random_state=seed)
tk.tl.triku(adata_reynolds_fb_4820STDY7389007, use_raw=False)

In [None]:
sc.tl.umap(adata_reynolds_fb_4820STDY7389007, min_dist=0.3, random_state=seed)

In [None]:
sc.tl.leiden(adata_reynolds_fb_4820STDY7389007, resolution=1, random_state=seed)

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7389007, color=['leiden', 'sample_id', 'full_clustering'], legend_loc='on data')

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7389007, color=['APCDD1', 'COL18A1', 'COMP', 'SLPI', 'WIF1'], cmap=magma, use_raw=False)

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7389007, color=['MT2A', 'CCL19', 'CCL2', 'CD46'], cmap=magma, use_raw=False)

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7389007, color=['POSTN', 'COMP', 'COCH'], cmap=magma, use_raw=False)

### Direct loom

In [None]:
# Read the S3 loom file (downloaded from Zenodo if it is not available locally).
adata_reynolds_fb_4820STDY7389007_loom = sc.read('reynolds_2020/reynolds_2020_16_4820STDY7389007_s3_dermis_fibroblasts.loom',
                                                 backup_url='https://zenodo.org/record/4596375/files/reynolds_2020_16_4820STDY7389007_s3_dermis_fibroblasts.loom')

adata_reynolds_fb_4820STDY7389007_loom.var_names_make_unique()

# Drop the gene 'C3'; `.copy()` materialises the result because obs_names and other
# attributes of this object are overwritten in the following cells.
adata_reynolds_fb_4820STDY7389007_loom = adata_reynolds_fb_4820STDY7389007_loom[:, adata_reynolds_fb_4820STDY7389007_loom.var_names != 'C3'].copy()

In [None]:
adata_reynolds_fb_4820STDY7389007_loom.obs_names = [f"{i.split('_')[-1]}-1-{i.split('_')[1]}" for i in adata_reynolds_fb_4820STDY7389007_loom.obs_names]

In [None]:
# Transfer the Reynolds et al. labels ('full_clustering') to the cells that are present in
# both objects; the remaining cells keep the placeholder '-'.
# Index.intersection replaces `Index & Index`, whose set-operation behaviour is deprecated
# and no longer available in pandas 2.
shared_cells = adata_reynolds_fb_4820STDY7389007.obs_names.intersection(adata_reynolds_fb_4820STDY7389007_loom.obs_names)
df_fb_type = pd.Series('-', index=adata_reynolds_fb_4820STDY7389007_loom.obs_names)
df_fb_type.loc[shared_cells] = adata_reynolds_fb_4820STDY7389007.obs['full_clustering'].loc[shared_cells]
adata_reynolds_fb_4820STDY7389007_loom.obs['science_clustering'] = df_fb_type

In [None]:
sc.pp.calculate_qc_metrics(adata_reynolds_fb_4820STDY7389007_loom, inplace=True)

In [None]:
sc.pl.violin(adata_reynolds_fb_4820STDY7389007_loom, ['n_genes_by_counts'])
sc.pl.violin(adata_reynolds_fb_4820STDY7389007_loom, ['log1p_total_counts'])

In [None]:
sc.pp.filter_genes(adata_reynolds_fb_4820STDY7389007_loom, min_counts=30)
adata_reynolds_fb_4820STDY7389007_loom.raw = adata_reynolds_fb_4820STDY7389007_loom

In [None]:
sc.pp.normalize_per_cell(adata_reynolds_fb_4820STDY7389007_loom)
sc.pp.log1p(adata_reynolds_fb_4820STDY7389007_loom)

In [None]:
sc.pp.pca(adata_reynolds_fb_4820STDY7389007_loom, random_state=seed, n_comps=30)
sc.pp.neighbors(adata_reynolds_fb_4820STDY7389007_loom, metric='cosine', random_state=seed)
tk.tl.triku(adata_reynolds_fb_4820STDY7389007_loom, use_raw=False)

In [None]:
sc.tl.umap(adata_reynolds_fb_4820STDY7389007_loom, min_dist=0.3, random_state=seed)
sc.tl.leiden(adata_reynolds_fb_4820STDY7389007_loom, resolution=0.8, random_state=seed)

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7389007_loom, color=['leiden', 'science_clustering'], legend_loc='on data')

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7389007_loom, color=['LUM', 'PDGFRA', 'VIM', 'COL1A1', 'SFRP2', 'APOE', 
                                                          'POSTN', 'RGS5', 'MYL9', 'NDUFA4L2', 'HBB', 
                                                          'DMKN', 'KRT1', 'KRT5'], legend_loc='on data', cmap=magma)

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7389007_loom, color=['leiden', 'SFRP2', 'PI16', 'SLPI', 'WIF1', 'COL18A1', 'COMP', 
                                               'APOE', 'CCL2',  'ITM2A', 'SPSB1', 
                                               'CCDC146', 'CCL19', 'CD74'], cmap=magma, use_raw=False, legend_loc='on data')

In [None]:
dict_cat = {'fb': ['LUM', 'PDGFRA', 'COL1A1', 'SFRP2', 'CCL19'], 'peri': ['RGS5', 'MYL9', 'NDUFA4L2'], 'eritro': ['HBB', 'HBA2', 'HBA1'], 
            'immune': ['TPSB2', 'TRAC', 'TPSAB1', 'HLA-DRA', 'FCER1G'], 'kerato': ['DMKN', 'KRT1', 'KRT5'],
            'mt': ['MTND2P28', 'MTND4P12', 'MTCO1P40', 'ADAM33', 'RN7SL2', 'MTRNR2L6']}

assign_cats(adata_reynolds_fb_4820STDY7389007_loom, dict_cat, quantile_gene_sel=0.85, min_score=0.05)

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7389007_loom, color=['leiden', 'science_clustering', 'assigned_cats'], legend_loc='on data', cmap=magma)

In [None]:
sc.tl.rank_genes_groups(adata_reynolds_fb_4820STDY7389007_loom, groupby='leiden')
sc.pl.rank_genes_groups(adata_reynolds_fb_4820STDY7389007_loom)

In [None]:
# Keep only the cells whose cluster was labelled 'fb' by assign_cats. `.copy()` materialises
# the subset before it is filtered and re-embedded in place.
adata_reynolds_fb_4820STDY7389007_loom = adata_reynolds_fb_4820STDY7389007_loom[adata_reynolds_fb_4820STDY7389007_loom.obs['assigned_cats'] == 'fb'].copy()

In [None]:
sc.pp.filter_genes(adata_reynolds_fb_4820STDY7389007_loom, min_counts=1)

In [None]:
sc.pp.pca(adata_reynolds_fb_4820STDY7389007_loom, random_state=seed, n_comps=30)
sc.pp.neighbors(adata_reynolds_fb_4820STDY7389007_loom, metric='cosine', random_state=seed)
tk.tl.triku(adata_reynolds_fb_4820STDY7389007_loom, use_raw=False)

In [None]:
sc.tl.umap(adata_reynolds_fb_4820STDY7389007_loom, min_dist=0.05, random_state=seed)
sc.tl.leiden(adata_reynolds_fb_4820STDY7389007_loom, resolution=1.2, random_state=seed)

In [None]:
adata_reynolds_fb_4820STDY7389007_loom

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7389007_loom, color=['leiden', 'science_clustering'], cmap=magma, use_raw=False, legend_loc='on data')

In [None]:
sc.pl.umap(adata_reynolds_fb_4820STDY7389007_loom, color=['leiden', 'SFRP2', 'PI16', 'SLPI', 'WIF1', 'COL18A1', 'COMP', 
                                               'APOE', 'CCL2',  'ITM2A', 'SPSB1', 
                                               'CCDC146', 'CCL19', 'CD74'], cmap=magma, use_raw=False, legend_loc='on data')

## SKN8104899 (S4)

### Reynolds preprocessed

In [None]:
# Cells of sample S4 only. `.copy()` avoids working on a view of the pooled object,
# since the next cells filter genes and store PCA/neighbours on this subset.
adata_reynolds_fb_SKN8104899 = adata_reynolds_fb_healthy[adata_reynolds_fb_healthy.obs['sample_id'].isin(['SKN8104899'])].copy()

In [None]:
adata_reynolds_fb_SKN8104899

In [None]:
sc.pp.filter_genes(adata_reynolds_fb_SKN8104899, min_counts=1)
# Drop the gene 'C3'; `.copy()` materialises the result so the following
# in-place steps (pca, neighbors, umap, leiden) do not write into a view.
adata_reynolds_fb_SKN8104899 = adata_reynolds_fb_SKN8104899[:, adata_reynolds_fb_SKN8104899.var_names != 'C3'].copy()

In [None]:
sc.pp.pca(adata_reynolds_fb_SKN8104899, random_state=seed, n_comps=30)
sc.pp.neighbors(adata_reynolds_fb_SKN8104899, metric='cosine', random_state=seed)
tk.tl.triku(adata_reynolds_fb_SKN8104899, use_raw=False)

In [None]:
sc.tl.umap(adata_reynolds_fb_SKN8104899, min_dist=0.3, random_state=seed)

In [None]:
sc.tl.leiden(adata_reynolds_fb_SKN8104899, resolution=1, random_state=seed)

In [None]:
sc.pl.umap(adata_reynolds_fb_SKN8104899, color=['leiden', 'sample_id', 'full_clustering'], legend_loc='on data')

In [None]:
sc.pl.umap(adata_reynolds_fb_SKN8104899, color=['APCDD1', 'COL18A1', 'COMP', 'SLPI', 'WIF1'], cmap=magma, use_raw=False)

In [None]:
sc.pl.umap(adata_reynolds_fb_SKN8104899, color=['MT2A', 'CCL19', 'CCL2', 'CD46'], cmap=magma, use_raw=False)

### Direct loom

In [None]:
# Read the S4 loom file (downloaded from Zenodo if it is not available locally).
adata_reynolds_fb_SKN8104899_loom = sc.read('reynolds_2020/reynolds_2020_84_SKN8104899_S4_dermis_fibroblasts.loom',
                                            backup_url='https://zenodo.org/record/4596375/files/reynolds_2020_84_SKN8104899_S4_dermis_fibroblasts.loom')

adata_reynolds_fb_SKN8104899_loom.var_names_make_unique()

# Drop the gene 'C3'; `.copy()` materialises the result because obs_names and other
# attributes of this object are overwritten in the following cells.
adata_reynolds_fb_SKN8104899_loom = adata_reynolds_fb_SKN8104899_loom[:, adata_reynolds_fb_SKN8104899_loom.var_names != 'C3'].copy()

In [None]:
adata_reynolds_fb_SKN8104899_loom.obs_names = [f"{i.split('_')[-1]}-1-{i.split('_')[1]}" for i in adata_reynolds_fb_SKN8104899_loom.obs_names]

In [None]:
# Transfer the Reynolds et al. labels ('full_clustering') to the cells that are present in
# both objects; the remaining cells keep the placeholder '-'.
# Index.intersection replaces `Index & Index`, whose set-operation behaviour is deprecated
# and no longer available in pandas 2.
shared_cells = adata_reynolds_fb_SKN8104899.obs_names.intersection(adata_reynolds_fb_SKN8104899_loom.obs_names)
df_fb_type = pd.Series('-', index=adata_reynolds_fb_SKN8104899_loom.obs_names)
df_fb_type.loc[shared_cells] = adata_reynolds_fb_SKN8104899.obs['full_clustering'].loc[shared_cells]
adata_reynolds_fb_SKN8104899_loom.obs['science_clustering'] = df_fb_type

In [None]:
sc.pp.calculate_qc_metrics(adata_reynolds_fb_SKN8104899_loom, inplace=True)

In [None]:
sc.pl.violin(adata_reynolds_fb_SKN8104899_loom, ['n_genes_by_counts'])
sc.pl.violin(adata_reynolds_fb_SKN8104899_loom, ['log1p_total_counts'])

In [None]:
sc.pp.filter_genes(adata_reynolds_fb_SKN8104899_loom, min_counts=30)
adata_reynolds_fb_SKN8104899_loom.raw = adata_reynolds_fb_SKN8104899_loom

In [None]:
sc.pp.normalize_per_cell(adata_reynolds_fb_SKN8104899_loom)
sc.pp.log1p(adata_reynolds_fb_SKN8104899_loom)

In [None]:
sc.pp.pca(adata_reynolds_fb_SKN8104899_loom, random_state=seed, n_comps=30)
sc.pp.neighbors(adata_reynolds_fb_SKN8104899_loom, metric='cosine', random_state=seed)
tk.tl.triku(adata_reynolds_fb_SKN8104899_loom, use_raw=False)

In [None]:
sc.tl.umap(adata_reynolds_fb_SKN8104899_loom, min_dist=0.3, random_state=seed)
sc.tl.leiden(adata_reynolds_fb_SKN8104899_loom, resolution=0.8, random_state=seed)

In [None]:
sc.pl.umap(adata_reynolds_fb_SKN8104899_loom, color=['leiden', 'science_clustering'], legend_loc='on data')

In [None]:
sc.pl.umap(adata_reynolds_fb_SKN8104899_loom, color=['LUM', 'PDGFRA', 'VIM', 'COL1A1', 'SFRP2', 'APOE', 
                                                     'POSTN', 'RGS5', 'MYL9', 'NDUFA4L2', 'HBB'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
sc.pl.umap(adata_reynolds_fb_SKN8104899_loom, color=['leiden', 'SFRP2', 'PI16', 'SLPI', 'WIF1', 'COL18A1', 'COMP', 
                                               'APOE', 'CCL2',  'ITM2A', 'SPSB1', 
                                               'CCDC146', 'CCL19', 'CD74', 'POSTN'], cmap=magma, use_raw=False, legend_loc='on data')

In [None]:
# Marker genes per broad category; assign_cats uses them to label each Leiden cluster with one category.
dict_cat = {'fb': ['LUM', 'PDGFRA', 'COL1A1', 'SFRP2', 'CCL19'], 'peri': ['RGS5', 'MYL9', 'NDUFA4L2'], 'eritro': ['HBB', 'HBA2', 'HBA1'], 
            'immune': ['TRAC', 'HLA-DRA', 'FCER1G'], 'kerato': ['DMKN', 'KRT1'],
            'mt': ['MTND4P12', 'ADAM33', 'RN7SL2', ]}

# min_score is lowered from the function default (0.6) to 0.05; results go to obs['assigned_cats'] (default key_added).
assign_cats(adata_reynolds_fb_SKN8104899_loom, dict_cat, min_score=0.05)

In [None]:
# Visual check of the category assigned to each cluster against the Leiden and published labels.
sc.pl.umap(adata_reynolds_fb_SKN8104899_loom, color=['leiden', 'science_clustering', 'assigned_cats'], legend_loc='on data', cmap=magma)

In [None]:
# Rank marker genes for every Leiden cluster (scanpy default settings) and plot the top-scoring genes.
sc.tl.rank_genes_groups(adata_reynolds_fb_SKN8104899_loom, groupby='leiden')
sc.pl.rank_genes_groups(adata_reynolds_fb_SKN8104899_loom)

In [None]:
# Keep only the cells whose cluster was assigned the 'fb' category; the name is rebound to the subset.
adata_reynolds_fb_SKN8104899_loom = adata_reynolds_fb_SKN8104899_loom[adata_reynolds_fb_SKN8104899_loom.obs['assigned_cats'] == 'fb']

In [None]:
# After subsetting, drop genes that have no counts left (min_counts=1).
sc.pp.filter_genes(adata_reynolds_fb_SKN8104899_loom, min_counts=1)

In [None]:
# Recompute PCA, the cosine kNN graph and triku feature selection on the 'fb'-only subset.
sc.pp.pca(adata_reynolds_fb_SKN8104899_loom, random_state=seed, n_comps=30)
sc.pp.neighbors(adata_reynolds_fb_SKN8104899_loom, metric='cosine', random_state=seed)
tk.tl.triku(adata_reynolds_fb_SKN8104899_loom, use_raw=False)

In [None]:
# Re-embed (tighter UMAP, min_dist=0.05) and re-cluster (Leiden resolution=1.2) the subset.
sc.tl.umap(adata_reynolds_fb_SKN8104899_loom, min_dist=0.05, random_state=seed)
sc.tl.leiden(adata_reynolds_fb_SKN8104899_loom, resolution=1.2, random_state=seed)

In [None]:
# New Leiden clusters of the subset next to the published labels.
sc.pl.umap(adata_reynolds_fb_SKN8104899_loom, color=['leiden', 'science_clustering'], cmap=magma, use_raw=False, legend_loc='on data')

In [None]:
# Same gene panel as before the subsetting (without POSTN), now on the 'fb'-only embedding.
sc.pl.umap(adata_reynolds_fb_SKN8104899_loom, color=['leiden', 'SFRP2', 'PI16', 'SLPI', 'WIF1', 'COL18A1', 'COMP', 
                                               'APOE', 'CCL2',  'ITM2A', 'SPSB1', 
                                               'CCDC146', 'CCL19', 'CD74'], cmap=magma, use_raw=False, legend_loc='on data')

In [None]:
# Marker genes of the new Leiden clusters in the 'fb'-only subset.
sc.tl.rank_genes_groups(adata_reynolds_fb_SKN8104899_loom, groupby='leiden')
sc.pl.rank_genes_groups(adata_reynolds_fb_SKN8104899_loom)

## SKN8105197 (S5)

### Reynolds preprocessed

In [None]:
# Select the cells whose 'sample_id' is SKN8105197 from adata_reynolds_fb_healthy.
adata_reynolds_fb_SKN8105197 = adata_reynolds_fb_healthy[adata_reynolds_fb_healthy.obs['sample_id'].isin(['SKN8105197'])]  #selected_samples)]

In [None]:
# Display the AnnData summary of the selected sample.
adata_reynolds_fb_SKN8105197

In [None]:
# Remove genes without counts in this sample, then drop the gene 'C3'.
# NOTE(review): the reason for excluding C3 is not stated in this part of the notebook.
sc.pp.filter_genes(adata_reynolds_fb_SKN8105197, min_counts=1)
adata_reynolds_fb_SKN8105197 = adata_reynolds_fb_SKN8105197[:, adata_reynolds_fb_SKN8105197.var_names != 'C3']

In [None]:
# 30-component PCA -> cosine kNN graph -> triku feature selection on the preprocessed S5 object.
sc.pp.pca(adata_reynolds_fb_SKN8105197, random_state=seed, n_comps=30)
sc.pp.neighbors(adata_reynolds_fb_SKN8105197, metric='cosine', random_state=seed)
tk.tl.triku(adata_reynolds_fb_SKN8105197, use_raw=False)

In [None]:
# Seeded UMAP embedding.
sc.tl.umap(adata_reynolds_fb_SKN8105197, min_dist=0.3, random_state=seed)

In [None]:
# Seeded Leiden clustering at resolution 1.
sc.tl.leiden(adata_reynolds_fb_SKN8105197, resolution=1, random_state=seed)

In [None]:
# Leiden clusters next to the sample id and the 'full_clustering' labels stored in obs.
sc.pl.umap(adata_reynolds_fb_SKN8105197, color=['leiden', 'sample_id', 'full_clustering'], legend_loc='on data')

In [None]:
# Expression (adata.X) of a first marker panel.
sc.pl.umap(adata_reynolds_fb_SKN8105197, color=['APCDD1', 'COL18A1', 'COMP', 'SLPI', 'WIF1'], cmap=magma, use_raw=False)

In [None]:
# Expression (adata.X) of a second marker panel.
sc.pl.umap(adata_reynolds_fb_SKN8105197, color=['MT2A', 'CCL19', 'CCL2', 'CD46'], cmap=magma, use_raw=False)

### Direct loom

In [None]:
# Read the S5 loom file (backup_url is the Zenodo location of the same file),
# make gene names unique and drop the gene 'C3', as done for the preprocessed object.
adata_reynolds_fb_SKN8105197_loom = sc.read('reynolds_2020/reynolds_2020_92_SKN8105197_S5_dermis_fibroblasts.loom', 
                                                 backup_url='https://zenodo.org/record/4596375/files/reynolds_2020_92_SKN8105197_S5_dermis_fibroblasts.loom')
adata_reynolds_fb_SKN8105197_loom.var_names_make_unique()

adata_reynolds_fb_SKN8105197_loom = adata_reynolds_fb_SKN8105197_loom[:, adata_reynolds_fb_SKN8105197_loom.var_names != 'C3']

In [None]:
# Rebuild each barcode as '<last field>-1-<second field>' (fields split on '_') so that it can be
# intersected with the obs_names of the preprocessed object in the next cell.
adata_reynolds_fb_SKN8105197_loom.obs_names = [f"{i.split('_')[-1]}-1-{i.split('_')[1]}" for i in adata_reynolds_fb_SKN8105197_loom.obs_names]

In [None]:
# Transfer the published 'full_clustering' labels to the loom object as 'science_clustering'.
# Cells that are absent from the preprocessed object keep the placeholder '-'.
df_fb_type = pd.Series('-', index=adata_reynolds_fb_SKN8105197_loom.obs_names)
# Index.intersection replaces `Index & Index`: the operator form is deprecated as a set
# operation and no longer performs one in pandas >= 2.0.
shared_obs_names = adata_reynolds_fb_SKN8105197.obs_names.intersection(adata_reynolds_fb_SKN8105197_loom.obs_names)
df_fb_type.loc[shared_obs_names] = adata_reynolds_fb_SKN8105197[shared_obs_names].obs['full_clustering']
adata_reynolds_fb_SKN8105197_loom.obs['science_clustering'] = df_fb_type

In [None]:
# Compute scanpy QC metrics in place; the violin plots in the next cell read 'n_genes_by_counts' and 'log1p_total_counts'.
sc.pp.calculate_qc_metrics(adata_reynolds_fb_SKN8105197_loom, inplace=True)

In [None]:
# QC inspection: per-cell number of detected genes and log1p of total counts.
sc.pl.violin(adata_reynolds_fb_SKN8105197_loom, ['n_genes_by_counts'])
sc.pl.violin(adata_reynolds_fb_SKN8105197_loom, ['log1p_total_counts'])

In [None]:
# Drop genes with fewer than 30 total counts, then snapshot the matrix in .raw
# (taken before the normalization / log1p applied in the next cell).
sc.pp.filter_genes(adata_reynolds_fb_SKN8105197_loom, min_counts=30)
adata_reynolds_fb_SKN8105197_loom.raw = adata_reynolds_fb_SKN8105197_loom

In [None]:
# Per-cell count normalization followed by log1p; both act on adata.X.
sc.pp.normalize_per_cell(adata_reynolds_fb_SKN8105197_loom)
sc.pp.log1p(adata_reynolds_fb_SKN8105197_loom)

In [None]:
# 30-component PCA -> cosine kNN graph -> triku feature selection (triku reads adata.X, not .raw).
sc.pp.pca(adata_reynolds_fb_SKN8105197_loom, random_state=seed, n_comps=30)
sc.pp.neighbors(adata_reynolds_fb_SKN8105197_loom, metric='cosine', random_state=seed)
tk.tl.triku(adata_reynolds_fb_SKN8105197_loom, use_raw=False)

In [None]:
# Seeded UMAP embedding (min_dist=0.3) and Leiden clustering (resolution=1).
sc.tl.umap(adata_reynolds_fb_SKN8105197_loom, min_dist=0.3, random_state=seed)
sc.tl.leiden(adata_reynolds_fb_SKN8105197_loom, resolution=1, random_state=seed)

In [None]:
# Compare the new Leiden clusters with the labels transferred from the preprocessed object.
sc.pl.umap(adata_reynolds_fb_SKN8105197_loom, color=['leiden', 'science_clustering'], legend_loc='on data')

In [None]:
# Expression of broad cell-type marker genes (several are reused in dict_cat below).
# use_raw=False is passed explicitly: .raw was set before normalization, so scanpy's default
# would plot the raw snapshot instead of the log-normalized values shown in every other
# gene plot of this notebook (e.g. the equivalent S4 cell).
sc.pl.umap(adata_reynolds_fb_SKN8105197_loom, color=['LUM', 'PDGFRA', 'VIM', 'COL1A1', 'SFRP2', 'APOE', 'RGS5', 'MYL9', 'NDUFA4L2', 'HBB'], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# Leiden clusters alongside a panel of candidate fibroblast-subpopulation genes (log-normalized expression).
sc.pl.umap(adata_reynolds_fb_SKN8105197_loom, color=['leiden', 'SFRP2', 'SLPI', 'WIF1', 'COL18A1', 'COMP', 
                                               'APOE', 'CCL2',  'ITM2A', 'SPSB1', 
                                               'CCDC146', 'CCL19', 'CD74'], cmap=magma, use_raw=False, legend_loc='on data')

In [None]:
# Marker genes per broad category for this sample (includes a 'melano' category);
# assign_cats uses them to label each Leiden cluster with one category.
dict_cat = {'fb': ['LUM', 'PDGFRA', 'COL1A1', 'SFRP2', 'CCL19'], 'peri': ['RGS5', 'MYL9', 'NDUFA4L2'], 'eritro': ['HBB', 'HBA2', 'HBA1'], 
            'immune': ['HLA-DRA', 'FCER1G'], 'melano':['PMEL', 'MLANA']}

# min_score is lowered from the function default (0.6) to 0.05; results go to obs['assigned_cats'] (default key_added).
assign_cats(adata_reynolds_fb_SKN8105197_loom, dict_cat, min_score=0.05)

In [None]:
# Visual check of the category assigned to each cluster against the Leiden and published labels.
sc.pl.umap(adata_reynolds_fb_SKN8105197_loom, color=['leiden', 'science_clustering', 'assigned_cats'], legend_loc='on data', cmap=magma)

In [None]:
# Rank marker genes for every Leiden cluster (scanpy default settings) and plot the top-scoring genes.
sc.tl.rank_genes_groups(adata_reynolds_fb_SKN8105197_loom, groupby='leiden')
sc.pl.rank_genes_groups(adata_reynolds_fb_SKN8105197_loom)

In [None]:
# Keep only the cells whose cluster was assigned the 'fb' category; the name is rebound to the subset.
adata_reynolds_fb_SKN8105197_loom = adata_reynolds_fb_SKN8105197_loom[adata_reynolds_fb_SKN8105197_loom.obs['assigned_cats'] == 'fb']

In [None]:
# After subsetting, drop genes that have no counts left (min_counts=1).
sc.pp.filter_genes(adata_reynolds_fb_SKN8105197_loom, min_counts=1)

In [None]:
# Recompute PCA, the cosine kNN graph and triku feature selection on the 'fb'-only subset.
sc.pp.pca(adata_reynolds_fb_SKN8105197_loom, random_state=seed, n_comps=30)
sc.pp.neighbors(adata_reynolds_fb_SKN8105197_loom, metric='cosine', random_state=seed)
tk.tl.triku(adata_reynolds_fb_SKN8105197_loom, use_raw=False)

In [None]:
# Re-embed (tighter UMAP, min_dist=0.05) and re-cluster (Leiden resolution=1.2) the subset.
sc.tl.umap(adata_reynolds_fb_SKN8105197_loom, min_dist=0.05, random_state=seed)
sc.tl.leiden(adata_reynolds_fb_SKN8105197_loom, resolution=1.2, random_state=seed)

In [None]:
# New Leiden clusters of the subset next to the published labels.
sc.pl.umap(adata_reynolds_fb_SKN8105197_loom, color=['leiden', 'science_clustering'], cmap=magma, use_raw=False, legend_loc='on data')

In [None]:
# Same gene panel as before the subsetting, now on the 'fb'-only embedding.
sc.pl.umap(adata_reynolds_fb_SKN8105197_loom, color=['leiden', 'SFRP2', 'SLPI', 'WIF1', 'COL18A1', 'COMP', 
                                               'APOE', 'CCL2',  'ITM2A', 'SPSB1', 
                                               'CCDC146', 'CCL19', 'CD74'], cmap=magma, use_raw=False, legend_loc='on data')

## SKN8105197 exclusion
We do not see expression of SFRP2, which, together with APOE, has so far been a robust fibroblast (fb) marker. We will therefore not include this sample in the analysis.

In [None]:
# Plot the published labels and three marker genes for every sample, so that SFRP2 / APOE / CCL19
# expression in S5 (last plot) can be compared with the other four samples.
genes = ['science_clustering', 'SFRP2', 'APOE', 'CCL19']

sc.pl.umap(adata_reynolds_fb_4820STDY7388991_loom_fb, color=genes, cmap=magma, use_raw=False, legend_loc='on data')
sc.pl.umap(adata_reynolds_fb_4820STDY7388999_loom, color=genes, cmap=magma, use_raw=False, legend_loc='on data')
sc.pl.umap(adata_reynolds_fb_4820STDY7389007_loom, color=genes, cmap=magma, use_raw=False, legend_loc='on data')

sc.pl.umap(adata_reynolds_fb_SKN8104899_loom, color=genes, cmap=magma, use_raw=False, legend_loc='on data')
sc.pl.umap(adata_reynolds_fb_SKN8105197_loom, color=genes, cmap=magma, use_raw=False, legend_loc='on data')

# Preparing adatas for FB1+3 and FB2

We've realized that FB1+3 and FB2 show the same main populations based on JID (mainly A1, A2 and B2). We want to obtain distinct populations based on marker expression, and also be able to separate these two populations. The problem is that the FB1+3 and FB2 populations, although not fully overlapping, do show some overlap, and we would like to separate them based only on marker genes and clustering. These markers should be shared across all datasets. To do that, we are going to run a preliminary DEG analysis and find markers that separate these populations in an unbiased manner.

## Finding markers to separate 1+3 from 2

**This part may not be reproducible in other notebooks. This part is to get markers that will be reproducible later on**

In [None]:
# S1: over-cluster (Leiden resolution=3) and label each cluster by hand as '1+3' or '2'.
# The cluster ids are hard-coded, so they are only valid for this exact clustering.
# Note: '12' appears twice in the list below, which is harmless.
sc.tl.leiden(adata_reynolds_fb_4820STDY7388991_loom_fb, resolution=3, random_state=seed)
sc.pl.umap(adata_reynolds_fb_4820STDY7388991_loom_fb, color=['leiden', 'science_clustering'], cmap=magma, use_raw=False, legend_loc='on data')
adata_reynolds_fb_4820STDY7388991_loom_fb.obs['fb_type'] = ['1+3' if i in ['3', '16', '12', '19', '6', '17', '8', '2', '9', '11', '12', '14', '0', '21', '22'] else '2' for i in adata_reynolds_fb_4820STDY7388991_loom_fb.obs['leiden']]
sc.pl.umap(adata_reynolds_fb_4820STDY7388991_loom_fb, color=['fb_type', 'science_clustering'], cmap=magma, use_raw=False, legend_loc='on data')

In [None]:
# Differential expression of '1+3' against '2' (genes ranked by absolute score), then a
# table of gene names indexed by their score and sorted from the highest to the lowest score.
sc.tl.rank_genes_groups(
    adata_reynolds_fb_4820STDY7388991_loom_fb,
    groupby='fb_type',
    groups=['1+3'],
    reference='2',
    rankby_abs=True,
)
df_rank_s1 = pd.DataFrame(
    data=adata_reynolds_fb_4820STDY7388991_loom_fb.uns['rank_genes_groups']['names']['1+3'],
    index=adata_reynolds_fb_4820STDY7388991_loom_fb.uns['rank_genes_groups']['scores']['1+3'],
).sort_index(ascending=False)

In [None]:
# S2: over-cluster (Leiden resolution=3) and label each cluster by hand as '2' or '1+3'.
# The cluster ids are hard-coded, so they are only valid for this exact clustering.
sc.tl.leiden(adata_reynolds_fb_4820STDY7388999_loom, resolution=3, random_state=seed)
sc.pl.umap(adata_reynolds_fb_4820STDY7388999_loom, color=['leiden', 'science_clustering'], cmap=magma, use_raw=False, legend_loc='on data')
adata_reynolds_fb_4820STDY7388999_loom.obs['fb_type'] = ['2' if i in ['15', '16', '19'] else '1+3' for i in adata_reynolds_fb_4820STDY7388999_loom.obs['leiden']]
sc.pl.umap(adata_reynolds_fb_4820STDY7388999_loom, color=['fb_type', 'science_clustering'], cmap=magma, use_raw=False, legend_loc='on data')

In [None]:
# Differential expression of '1+3' against '2' (genes ranked by absolute score), then a
# table of gene names indexed by their score and sorted from the highest to the lowest score.
sc.tl.rank_genes_groups(
    adata_reynolds_fb_4820STDY7388999_loom,
    groupby='fb_type',
    groups=['1+3'],
    reference='2',
    rankby_abs=True,
)
df_rank_s2 = pd.DataFrame(
    data=adata_reynolds_fb_4820STDY7388999_loom.uns['rank_genes_groups']['names']['1+3'],
    index=adata_reynolds_fb_4820STDY7388999_loom.uns['rank_genes_groups']['scores']['1+3'],
).sort_index(ascending=False)

In [None]:
# S3: over-cluster (Leiden resolution=2.8) and label each cluster by hand as '2' or '1+3'.
# The cluster ids are hard-coded, so they are only valid for this exact clustering.
sc.tl.leiden(adata_reynolds_fb_4820STDY7389007_loom, resolution=2.8, random_state=seed)
sc.pl.umap(adata_reynolds_fb_4820STDY7389007_loom, color=['leiden', 'science_clustering'], cmap=magma, use_raw=False, legend_loc='on data')
adata_reynolds_fb_4820STDY7389007_loom.obs['fb_type'] = ['2' if i in ['0', '17', '16', '13', '4', '1', '19'] else '1+3' for i in adata_reynolds_fb_4820STDY7389007_loom.obs['leiden']]
sc.pl.umap(adata_reynolds_fb_4820STDY7389007_loom, color=['fb_type', 'science_clustering'], cmap=magma, use_raw=False, legend_loc='on data')

In [None]:
# Differential expression of '1+3' against '2' (genes ranked by absolute score), then a
# table of gene names indexed by their score and sorted from the highest to the lowest score.
sc.tl.rank_genes_groups(
    adata_reynolds_fb_4820STDY7389007_loom,
    groupby='fb_type',
    groups=['1+3'],
    reference='2',
    rankby_abs=True,
)
df_rank_s3 = pd.DataFrame(
    data=adata_reynolds_fb_4820STDY7389007_loom.uns['rank_genes_groups']['names']['1+3'],
    index=adata_reynolds_fb_4820STDY7389007_loom.uns['rank_genes_groups']['scores']['1+3'],
).sort_index(ascending=False)

In [None]:
# S4: over-cluster (Leiden resolution=3) and label each cluster by hand as '2' or '1+3'.
# The cluster ids are hard-coded, so they are only valid for this exact clustering.
sc.tl.leiden(adata_reynolds_fb_SKN8104899_loom, resolution=3, random_state=seed)
sc.pl.umap(adata_reynolds_fb_SKN8104899_loom, color=['leiden', 'science_clustering'], cmap=magma, use_raw=False, legend_loc='on data')
adata_reynolds_fb_SKN8104899_loom.obs['fb_type'] = ['2' if i in ['8', '6', '14', '27'] else '1+3' for i in adata_reynolds_fb_SKN8104899_loom.obs['leiden']]
sc.pl.umap(adata_reynolds_fb_SKN8104899_loom, color=['fb_type', 'science_clustering'], cmap=magma, use_raw=False, legend_loc='on data')

In [None]:
# Differential expression of '1+3' against '2' (genes ranked by absolute score), then a
# table of gene names indexed by their score and sorted from the highest to the lowest score.
sc.tl.rank_genes_groups(
    adata_reynolds_fb_SKN8104899_loom,
    groupby='fb_type',
    groups=['1+3'],
    reference='2',
    rankby_abs=True,
)
df_rank_s4 = pd.DataFrame(
    data=adata_reynolds_fb_SKN8104899_loom.uns['rank_genes_groups']['names']['1+3'],
    index=adata_reynolds_fb_SKN8104899_loom.uns['rank_genes_groups']['scores']['1+3'],
).sort_index(ascending=False)

In [None]:
# Number of genes taken from each end of every per-sample ranking.
N = 100

# The rankings are sorted by descending score, so the first N rows are the genes most
# associated with '1+3' and the last N rows the genes most associated with '2'.
# A gene is kept when it shows up in at least 3 of the 4 samples.
genes_13, counts_13 = np.unique(
    [gene
     for df_rank in (df_rank_s1, df_rank_s2, df_rank_s3, df_rank_s4)
     for gene in df_rank.iloc[:N, 0].values],
    return_counts=True,
)
genes_13_shared = genes_13[counts_13 >= 3]

genes_2, counts_2 = np.unique(
    [gene
     for df_rank in (df_rank_s1, df_rank_s2, df_rank_s3, df_rank_s4)
     for gene in df_rank.iloc[-N:, 0].values],
    return_counts=True,
)
genes_2_shared = genes_2[counts_2 >= 3]

In [None]:
# Display the genes shared by at least 3 samples on the '1+3' side.
genes_13_shared

In [None]:
# Display the genes shared by at least 3 samples on the '2' side.
genes_2_shared

## Separate the 1+3 from 2

In [None]:
# Hard-coded '1+3' marker list; it replaces the value computed above so that later steps do not depend on re-running the marker search.
genes_13_shared = ['BNIP3L', 'BNIP3P1', 'CEMIP', 'ENO1', 'GSTO1', 'MEDAG', 'NAMPT', 'NAMPTP1', 'NRN1', ]

In [None]:
# Hard-coded '2' marker list; it replaces the value computed above so that later steps do not depend on re-running the marker search.
genes_2_shared = ['ANXA2', 'CALM1', 'FOSB', 'LMNA', 'MTRNR2L12', 'S100A10', 'S100A4', 'S100A6', 'TMSB4X', 'TPPP3']

In [None]:
# Label the Leiden clusters of the four retained samples (S1-S4) as '1+3' or '2' from the shared markers.
# This overwrites obs['assigned_cats'] (default key_added) written by the earlier cell-type assignment.
dict_cat = {'1+3': genes_13_shared, '2': genes_2_shared}

for adata in [adata_reynolds_fb_4820STDY7388991_loom_fb, adata_reynolds_fb_4820STDY7388999_loom, adata_reynolds_fb_4820STDY7389007_loom, adata_reynolds_fb_SKN8104899_loom]:
    assign_cats(adata, dict_cat, min_score=0.3)

In [None]:
# S1: manual labels ('fb_type') against the marker-based assignment and the per-category / CV columns written by assign_cats.
sc.pl.umap(adata_reynolds_fb_4820STDY7388991_loom_fb, color=['leiden', 'science_clustering', 'fb_type', 'assigned_cats',  
                                                             'assigned_cats_1+3', 'assigned_cats_2', 'assigned_cats_CV'], legend_loc='on data', cmap=magma)

In [None]:
# S1: expression of the '1+3' markers.
sc.pl.umap(adata_reynolds_fb_4820STDY7388991_loom_fb, color=['leiden'] + genes_13_shared, legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# S1: expression of the '2' markers.
sc.pl.umap(adata_reynolds_fb_4820STDY7388991_loom_fb, color=['leiden'] + genes_2_shared, legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# S2: manual labels ('fb_type') against the marker-based assignment and the per-category / CV columns written by assign_cats.
sc.pl.umap(adata_reynolds_fb_4820STDY7388999_loom, color=['leiden', 'science_clustering', 'fb_type', 'assigned_cats', 
                                                          'assigned_cats_1+3', 'assigned_cats_2', 'assigned_cats_CV'], legend_loc='on data', cmap=magma)

In [None]:
# S2: expression of the '1+3' markers.
sc.pl.umap(adata_reynolds_fb_4820STDY7388999_loom, color=['leiden'] + genes_13_shared, legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# S2: expression of the '2' markers.
sc.pl.umap(adata_reynolds_fb_4820STDY7388999_loom, color=['leiden'] + genes_2_shared, legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# S3: manual labels ('fb_type') against the marker-based assignment and the per-category / CV columns written by assign_cats.
sc.pl.umap(adata_reynolds_fb_4820STDY7389007_loom, color=['leiden', 'science_clustering', 'fb_type', 'assigned_cats',  
                                                          'assigned_cats_1+3', 'assigned_cats_2', 'assigned_cats_CV'], legend_loc='on data', cmap=magma)

In [None]:
# S3: expression of the '1+3' markers.
sc.pl.umap(adata_reynolds_fb_4820STDY7389007_loom, color=['leiden'] + genes_13_shared, legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# S3: expression of the '2' markers.
sc.pl.umap(adata_reynolds_fb_4820STDY7389007_loom, color=['leiden'] + genes_2_shared, legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# S4: manual labels ('fb_type') against the marker-based assignment and the per-category / CV columns written by assign_cats.
sc.pl.umap(adata_reynolds_fb_SKN8104899_loom, color=['leiden', 'science_clustering', 'fb_type', 'assigned_cats',  
                                                     'assigned_cats_1+3', 'assigned_cats_2', 'assigned_cats_CV'], legend_loc='on data', cmap=magma)

In [None]:
# S4: expression of the '1+3' markers.
sc.pl.umap(adata_reynolds_fb_SKN8104899_loom, color=['leiden'] + genes_13_shared, legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# S4: expression of the '2' markers.
sc.pl.umap(adata_reynolds_fb_SKN8104899_loom, color=['leiden'] + genes_2_shared, legend_loc='on data', cmap=magma, use_raw=False)

## Create the adatas

In [None]:
# Split every sample into independent copies: cells assigned to '1+3' and cells assigned to '2'.
# Cells with any other value in 'assigned_cats' end up in neither object.
adata_s1_13 = adata_reynolds_fb_4820STDY7388991_loom_fb[adata_reynolds_fb_4820STDY7388991_loom_fb.obs['assigned_cats'] == '1+3'].copy()
adata_s1_2 = adata_reynolds_fb_4820STDY7388991_loom_fb[adata_reynolds_fb_4820STDY7388991_loom_fb.obs['assigned_cats'] == '2'].copy()

adata_s2_13 = adata_reynolds_fb_4820STDY7388999_loom[adata_reynolds_fb_4820STDY7388999_loom.obs['assigned_cats'] == '1+3'].copy()
adata_s2_2 = adata_reynolds_fb_4820STDY7388999_loom[adata_reynolds_fb_4820STDY7388999_loom.obs['assigned_cats'] == '2'].copy()

adata_s3_13 = adata_reynolds_fb_4820STDY7389007_loom[adata_reynolds_fb_4820STDY7389007_loom.obs['assigned_cats'] == '1+3'].copy()
adata_s3_2 = adata_reynolds_fb_4820STDY7389007_loom[adata_reynolds_fb_4820STDY7389007_loom.obs['assigned_cats'] == '2'].copy()

adata_s4_13 = adata_reynolds_fb_SKN8104899_loom[adata_reynolds_fb_SKN8104899_loom.obs['assigned_cats'] == '1+3'].copy()
adata_s4_2 = adata_reynolds_fb_SKN8104899_loom[adata_reynolds_fb_SKN8104899_loom.obs['assigned_cats'] == '2'].copy()

In [None]:
# Re-process every split object: drop empty genes, PCA, cosine kNN graph, triku and UMAP.
# The objects are looked up by name with eval() on every call because, according to the
# original author, looping over the AnnData objects themselves did not work.
for adata_s_name in ['adata_s1_13', 'adata_s1_2', 'adata_s2_13', 'adata_s2_2', 'adata_s3_13', 'adata_s3_2', 'adata_s4_13', 'adata_s4_2']:
    sc.pp.filter_genes(eval(adata_s_name), min_counts=1)
    sc.pp.pca(eval(adata_s_name), random_state=seed, n_comps=30)
    # random_state is passed explicitly, like in every other neighbors call of the notebook.
    sc.pp.neighbors(eval(adata_s_name), metric='cosine', random_state=seed)
    tk.tl.triku(eval(adata_s_name), use_raw=False)
    sc.tl.umap(eval(adata_s_name), min_dist=0.05, random_state=seed)

In [None]:
# S1: published labels on the '1+3' (left) and '2' (right) embeddings; show=False draws on the supplied axes.
fig, axs = plt.subplots(1, 2, figsize=(6, 3))
sc.pl.umap(adata_s1_13, color=['science_clustering'], legend_loc='on data', cmap=magma, ax=axs[0], show=False)
sc.pl.umap(adata_s1_2, color=['science_clustering'], legend_loc='on data', cmap=magma, ax=axs[1], show=False)

In [None]:
# S2: published labels on the '1+3' (left) and '2' (right) embeddings; show=False draws on the supplied axes.
fig, axs = plt.subplots(1, 2, figsize=(6, 3))
sc.pl.umap(adata_s2_13, color=['science_clustering'], legend_loc='on data', cmap=magma, ax=axs[0], show=False)
sc.pl.umap(adata_s2_2, color=['science_clustering'], legend_loc='on data', cmap=magma, ax=axs[1], show=False)

In [None]:
# S3: published labels on the '1+3' (left) and '2' (right) embeddings; show=False draws on the supplied axes.
fig, axs = plt.subplots(1, 2, figsize=(6, 3))
sc.pl.umap(adata_s3_13, color=['science_clustering'], legend_loc='on data', cmap=magma, ax=axs[0], show=False)
sc.pl.umap(adata_s3_2, color=['science_clustering'], legend_loc='on data', cmap=magma, ax=axs[1], show=False)

In [None]:
# S4: published labels on the '1+3' (left) and '2' (right) embeddings; show=False draws on the supplied axes.
fig, axs = plt.subplots(1, 2, figsize=(6, 3))
sc.pl.umap(adata_s4_13, color=['science_clustering'], legend_loc='on data', cmap=magma, ax=axs[0], show=False)
sc.pl.umap(adata_s4_2, color=['science_clustering'], legend_loc='on data', cmap=magma, ax=axs[1], show=False)

## Assigning JID categories
Based on this, we do not see any discrepancies in the dataset. We are going to apply the category assignment to map the JID clusters onto the dataset.

In [None]:
## MAP JID populations
# Marker genes of each JID population (A1, A2, B1, B2, C). dict_cat is rebound here and is
# the dictionary used by every assign_cats call in the per-sample sections below.

dict_cat = {'A1': ['PI16', 'QPCT', 'SLPI', 'CCN5', 'CPE', 'CTHRC1', 'MFAP5', 'PCOLCE2', 'SCARA5', 'TSPAN8'], 
            'A2': ['APCDD1', 'COL18A1', 'COMP', 'NKD2', 'F13A1', 'HSPB3', 'LEPR', 'TGFBI'], 
            'B1': ['CXCL2', 'MYC', 'C7', 'SPSB1', 'ITM2A'], 
            'B2': ['SOCS3', 'CCL19', 'CD74', 'RARRES2', 'CCDC146', 'IGFBP3', 'TNFSF13B'], 
            'C': ['CRABP1', 'PLXDC1', 'RSPO4', 'ASPN', 'F2R', 'POSTN', 'TNN']}

### S1 1+3

In [None]:
# Working alias for this section (same object, not a copy).
adata_s = adata_s1_13

In [None]:
# For every JID population, plot up to its first four markers that are present in this
# subset, next to the published labels.
for key, val in dict_cat.items():
    print(key, val)
    # Filter on adata_s itself (not on the leftover `adata_s_name` loop variable, which
    # always points at the last split object): a gene missing from the plotted object
    # makes sc.pl.umap fail.
    sc.pl.umap(adata_s, color=['science_clustering'] + [i for i in val[:4] if i in adata_s.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=5)

In [None]:
# Cluster (Leiden resolution=1.4), map every cluster to a JID population with assign_cats
# (intermediate_states=True, diff=0.05, min_score=0.6) and plot the assignment together with
# the per-population 'assigned_cats_<name>' columns (colour scale fixed to [0, 1]).
sc.tl.leiden(adata_s, resolution=1.4, random_state=seed)
assign_cats(adata_s, dict_cats=dict_cat, column_groupby='leiden', intermediate_states=True, diff=0.05, min_score=0.6)
sc.pl.umap(adata_s, color=['leiden', 'assigned_cats', 'assigned_cats_max',
                           'assigned_cats_A1', 'assigned_cats_A2', 'assigned_cats_B1', 'assigned_cats_B2', 'assigned_cats_C'], legend_loc='on data', vmax=1, vmin=0, cmap=magma, use_raw=False)

### S1 2

In [None]:
# Working alias for this section (same object, not a copy).
adata_s = adata_s1_2

In [None]:
# For every JID population, plot its markers that are present in this subset, next to the
# published labels.
for key, val in dict_cat.items():
    print(key, val)
    # Filter on adata_s itself (not on the leftover `adata_s_name` loop variable, which
    # always points at the last split object): a gene missing from the plotted object
    # makes sc.pl.umap fail.
    sc.pl.umap(adata_s, color=['science_clustering'] + [i for i in val if i in adata_s.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=5)

In [None]:
# Cluster (Leiden resolution=1.5), map every cluster to a JID population with assign_cats
# (intermediate_states=True, diff=0.07, default min_score) and plot the assignment together
# with the per-population 'assigned_cats_<name>' columns.
sc.tl.leiden(adata_s, resolution=1.5, random_state=seed)
assign_cats(adata_s, dict_cats=dict_cat, column_groupby='leiden', intermediate_states=True, diff=0.07)
sc.pl.umap(adata_s, color=['leiden', 'assigned_cats', 
                           'assigned_cats_A1', 'assigned_cats_A2', 'assigned_cats_B1', 'assigned_cats_B2', 'assigned_cats_C'], legend_loc='on data', vmax=1, cmap=magma, use_raw=False)

### S2 1+3

In [None]:
# Working alias for this section (same object, not a copy).
adata_s = adata_s2_13

In [None]:
# For every JID population, plot its markers that are present in this subset, next to the
# published labels.
for key, val in dict_cat.items():
    print(key, val)
    # Filter on adata_s itself (not on the leftover `adata_s_name` loop variable, which
    # always points at the last split object): a gene missing from the plotted object
    # makes sc.pl.umap fail.
    sc.pl.umap(adata_s, color=['science_clustering'] + [i for i in val if i in adata_s.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=5)

In [None]:
# Cluster (Leiden resolution=1.5), map every cluster to a JID population with assign_cats
# (intermediate_states=True, diff=0.07, default min_score) and plot the assignment together
# with the per-population 'assigned_cats_<name>' columns.
sc.tl.leiden(adata_s, resolution=1.5, random_state=seed)
assign_cats(adata_s, dict_cats=dict_cat, column_groupby='leiden', intermediate_states=True, diff=0.07)
sc.pl.umap(adata_s, color=['leiden', 'assigned_cats', 
                           'assigned_cats_A1', 'assigned_cats_A2', 'assigned_cats_B1', 'assigned_cats_B2', 'assigned_cats_C'], legend_loc='on data', vmax=1, cmap=magma, use_raw=False)

### S2 2

In [None]:
# Working alias for this section (same object, not a copy).
adata_s = adata_s2_2

In [None]:
# For every JID population, plot its markers that are present in this subset, next to the
# published labels.
for key, val in dict_cat.items():
    print(key, val)
    # Filter on adata_s itself (not on the leftover `adata_s_name` loop variable, which
    # always points at the last split object): a gene missing from the plotted object
    # makes sc.pl.umap fail.
    sc.pl.umap(adata_s, color=['science_clustering'] + [i for i in val if i in adata_s.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=5)

In [None]:
# Cluster (Leiden resolution=1), map every cluster to a JID population with assign_cats
# (intermediate_states=True, diff=0.07, default min_score) and plot the assignment together
# with the per-population 'assigned_cats_<name>' columns.
sc.tl.leiden(adata_s, resolution=1, random_state=seed)
assign_cats(adata_s, dict_cats=dict_cat, column_groupby='leiden', intermediate_states=True, diff=0.07)
sc.pl.umap(adata_s, color=['leiden', 'assigned_cats', 
                           'assigned_cats_A1', 'assigned_cats_A2', 'assigned_cats_B1', 'assigned_cats_B2', 'assigned_cats_C'], legend_loc='on data', vmax=1, cmap=magma, use_raw=False)

### S3 1+3

In [None]:
# Working alias for this section (same object, not a copy).
adata_s = adata_s3_13

In [None]:
# For every JID population, plot its markers that are present in this subset
# (filtered on adata_s.var_names), next to the published labels.
for key, val in dict_cat.items():
        print(key, val)
        sc.pl.umap(adata_s, color=['science_clustering'] + [i for i in val if i in adata_s.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=5)

In [None]:
# Cluster (Leiden resolution=2), map every cluster to a JID population with assign_cats
# (intermediate_states=True, diff=0.05, default min_score) and plot the assignment together
# with the per-population 'assigned_cats_<name>' columns.
sc.tl.leiden(adata_s, resolution=2, random_state=seed)
assign_cats(adata_s, dict_cats=dict_cat, column_groupby='leiden', intermediate_states=True, diff=0.05)
sc.pl.umap(adata_s, color=['leiden', 'assigned_cats', 
                           'assigned_cats_A1', 'assigned_cats_A2', 'assigned_cats_B1', 'assigned_cats_B2', 'assigned_cats_C'], legend_loc='on data', vmax=1, cmap=magma, use_raw=False)

### S3 2

In [None]:
# Working alias for this section (same object, not a copy).
adata_s = adata_s3_2

In [None]:
# For every JID population, plot its markers that are present in this subset
# (filtered on adata_s.var_names), next to the published labels.
for key, val in dict_cat.items():
        print(key, val)
        sc.pl.umap(adata_s, color=['science_clustering'] + [i for i in val if i in adata_s.var_names], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# Cluster (Leiden resolution=1.4), map every cluster to a JID population with assign_cats
# (intermediate_states=True, diff=0.07, default min_score) and plot the assignment together
# with the per-population 'assigned_cats_<name>' columns.
sc.tl.leiden(adata_s, resolution=1.4, random_state=seed)
assign_cats(adata_s, dict_cats=dict_cat, column_groupby='leiden', intermediate_states=True, diff=0.07)
sc.pl.umap(adata_s, color=['leiden', 'assigned_cats', 
                           'assigned_cats_A1', 'assigned_cats_A2', 'assigned_cats_B1', 'assigned_cats_B2', 'assigned_cats_C'], legend_loc='on data', vmax=1, cmap=magma, use_raw=False)

### S4 1+3

In [None]:
# Working alias for this section (same object, not a copy).
adata_s = adata_s4_13

In [None]:
# For every JID population, plot its markers that are present in this subset
# (filtered on adata_s.var_names), next to the published labels.
for key, val in dict_cat.items():
        print(key, val)
        sc.pl.umap(adata_s, color=['science_clustering'] + [i for i in val if i in adata_s.var_names], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# Cluster (Leiden resolution=2), map every cluster to a JID population with assign_cats
# (intermediate_states=True, diff=0.07, default min_score) and plot the assignment together
# with the per-population 'assigned_cats_<name>' columns.
sc.tl.leiden(adata_s, resolution=2, random_state=seed)
assign_cats(adata_s, dict_cats=dict_cat, column_groupby='leiden', intermediate_states=True, diff=0.07)
sc.pl.umap(adata_s, color=['leiden', 'assigned_cats', 
                           'assigned_cats_A1', 'assigned_cats_A2', 'assigned_cats_B1', 'assigned_cats_B2', 'assigned_cats_C'], legend_loc='on data', vmax=1, cmap=magma, use_raw=False)

### S4 2

In [None]:
# Working alias for this section (same object, not a copy).
adata_s = adata_s4_2

In [None]:
# For every JID population, plot its markers that are present in this subset
# (filtered on adata_s.var_names), next to the published labels.
for key, val in dict_cat.items():
        print(key, val)
        sc.pl.umap(adata_s, color=['science_clustering'] + [i for i in val if i in adata_s.var_names], legend_loc='on data', cmap=magma, use_raw=False)

In [None]:
# Cluster (Leiden resolution=2), map every cluster to a JID population with assign_cats
# (intermediate_states=True, diff=0.07, default min_score) and plot the assignment together
# with the per-population 'assigned_cats_<name>' columns.
sc.tl.leiden(adata_s, resolution=2, random_state=seed)
assign_cats(adata_s, dict_cats=dict_cat, column_groupby='leiden', intermediate_states=True, diff=0.07)
sc.pl.umap(adata_s, color=['leiden', 'assigned_cats', 
                           'assigned_cats_A1', 'assigned_cats_A2', 'assigned_cats_B1', 'assigned_cats_B2', 'assigned_cats_C'], legend_loc='on data', vmax=1, cmap=magma, use_raw=False)

## Joining 1+3 and 2 adatas
Now we are going to join all adatas into two: one for each category. We expect the predicted categories to match across adatas.

**OUTER JOIN IS APPLIED TO KEEP GENES NOT EXPRESSED IN ALL ADATAS**

### No batch effect correction

In [None]:
# Concatenate the four samples per category; obs['s_dataset'] records the sample of origin and
# join='outer' keeps genes that are missing from some samples.
adata_all_13 = sc.AnnData.concatenate(adata_s1_13, adata_s2_13, adata_s3_13, adata_s4_13, batch_categories=['s1', 's2', 's3', 's4'], batch_key='s_dataset', join='outer')
adata_all_2 = sc.AnnData.concatenate(adata_s1_2, adata_s2_2, adata_s3_2, adata_s4_2, batch_categories=['s1', 's2', 's3', 's4'], batch_key='s_dataset', join='outer')

In [None]:
# Baseline without batch correction: drop empty genes, PCA, plain cosine kNN graph, triku and UMAP.
# The objects are looked up by name with eval() on every call.
for adata_name in ['adata_all_13', 'adata_all_2']:
    sc.pp.filter_genes(eval(adata_name), min_counts=1)
    sc.pp.pca(eval(adata_name), random_state=seed, n_comps=30)
    sc.pp.neighbors(eval(adata_name), metric='cosine', random_state=seed)
    tk.tl.triku(eval(adata_name), use_raw=False)
    sc.tl.umap(eval(adata_name), min_dist=0.3, random_state=seed)

In [None]:
# Sample of origin and per-sample JID assignment on the uncorrected joint embedding.
for adata_name in ['adata_all_13', 'adata_all_2']:
    sc.pl.umap(eval(adata_name),  color=['s_dataset', 'assigned_cats'])

### bbknn

#### kNN within batch: 2

In [None]:
# Rebuild the joint objects from the per-sample ones so that this run starts from a clean state.
adata_all_13 = sc.AnnData.concatenate(adata_s1_13, adata_s2_13, adata_s3_13, adata_s4_13, batch_categories=['s1', 's2', 's3', 's4'], batch_key='s_dataset', join='outer')
adata_all_2 = sc.AnnData.concatenate(adata_s1_2, adata_s2_2, adata_s3_2, adata_s4_2, batch_categories=['s1', 's2', 's3', 's4'], batch_key='s_dataset', join='outer')

In [None]:
# bbknn builds the neighbour graph per batch ('s_dataset') in place of sc.pp.neighbors,
# here with 2 neighbours within each batch; genes with fewer than 30 counts are dropped first.
for adata_name in ['adata_all_13', 'adata_all_2']:
    sc.pp.filter_genes(eval(adata_name), min_counts=30)
    sc.pp.pca(eval(adata_name), random_state=seed, n_comps=30)
    sce.pp.bbknn(eval(adata_name), metric='angular', batch_key='s_dataset', neighbors_within_batch=2)
    tk.tl.triku(eval(adata_name), use_raw=False)
    sc.tl.umap(eval(adata_name), min_dist=0.05, random_state=seed)

In [None]:
# Cluster the batch-balanced graph and compare the clusters with the sample of origin and
# the per-sample JID assignment.
for adata_name in ['adata_all_13', 'adata_all_2']:
    # random_state is passed explicitly, like in every other Leiden call of the notebook.
    sc.tl.leiden(eval(adata_name), resolution=1.5, random_state=seed)
    sc.pl.umap(eval(adata_name),  color=['s_dataset', 'leiden', 'assigned_cats'])

#### kNN within batch: 3

In [None]:
# Rebuild the joint objects from the per-sample ones so that this run starts from a clean state.
adata_all_13 = sc.AnnData.concatenate(adata_s1_13, adata_s2_13, adata_s3_13, adata_s4_13, batch_categories=['s1', 's2', 's3', 's4'], batch_key='s_dataset', join='outer')
adata_all_2 = sc.AnnData.concatenate(adata_s1_2, adata_s2_2, adata_s3_2, adata_s4_2, batch_categories=['s1', 's2', 's3', 's4'], batch_key='s_dataset', join='outer')

In [None]:
# Same bbknn pipeline as the previous run, with 3 neighbours within each batch.
for adata_name in ['adata_all_13', 'adata_all_2']:
    sc.pp.filter_genes(eval(adata_name), min_counts=30)
    sc.pp.pca(eval(adata_name), random_state=seed, n_comps=30)
    sce.pp.bbknn(eval(adata_name), metric='angular', batch_key='s_dataset', neighbors_within_batch=3)
    tk.tl.triku(eval(adata_name), use_raw=False)
    sc.tl.umap(eval(adata_name), min_dist=0.05, random_state=seed)

In [None]:
# Cluster the batch-balanced graph and compare the clusters with the sample of origin and
# the per-sample JID assignment.
for adata_name in ['adata_all_13', 'adata_all_2']:
    # random_state is passed explicitly, like in every other Leiden call of the notebook.
    sc.tl.leiden(eval(adata_name), resolution=1.5, random_state=seed)
    sc.pl.umap(eval(adata_name),  color=['s_dataset', 'leiden', 'assigned_cats'])

#### kNN within batch: 6

In [None]:
# Rebuild the joint objects from the per-sample ones so that this run starts from a clean state.
adata_all_13 = sc.AnnData.concatenate(adata_s1_13, adata_s2_13, adata_s3_13, adata_s4_13, batch_categories=['s1', 's2', 's3', 's4'], batch_key='s_dataset', join='outer')
adata_all_2 = sc.AnnData.concatenate(adata_s1_2, adata_s2_2, adata_s3_2, adata_s4_2, batch_categories=['s1', 's2', 's3', 's4'], batch_key='s_dataset', join='outer')

In [None]:
# Same bbknn pipeline as the previous runs, with 6 neighbours within each batch.
for adata_name in ['adata_all_13', 'adata_all_2']:
    sc.pp.filter_genes(eval(adata_name), min_counts=30)
    sc.pp.pca(eval(adata_name), random_state=seed, n_comps=30)
    sce.pp.bbknn(eval(adata_name), metric='angular', batch_key='s_dataset', neighbors_within_batch=6)
    tk.tl.triku(eval(adata_name), use_raw=False)
    sc.tl.umap(eval(adata_name), min_dist=0.05, random_state=seed)

In [None]:
# Sample of origin and per-sample JID assignment on the bbknn embedding (no re-clustering in this run).
for adata_name in ['adata_all_13', 'adata_all_2']:
    sc.pl.umap(eval(adata_name),  color=['s_dataset', 'assigned_cats'])

#### kNN within batch: 8

In [None]:
# Rebuild the joint objects from the per-sample ones so that this run starts from a clean state.
adata_all_13 = sc.AnnData.concatenate(adata_s1_13, adata_s2_13, adata_s3_13, adata_s4_13, batch_categories=['s1', 's2', 's3', 's4'], batch_key='s_dataset', join='outer')
adata_all_2 = sc.AnnData.concatenate(adata_s1_2, adata_s2_2, adata_s3_2, adata_s4_2, batch_categories=['s1', 's2', 's3', 's4'], batch_key='s_dataset', join='outer')

In [None]:
# Same bbknn pipeline as the previous runs, with 8 neighbours within each batch.
for adata_name in ['adata_all_13', 'adata_all_2']:
    sc.pp.filter_genes(eval(adata_name), min_counts=30)
    sc.pp.pca(eval(adata_name), random_state=seed, n_comps=30)
    sce.pp.bbknn(eval(adata_name), metric='angular', batch_key='s_dataset', neighbors_within_batch=8)
    tk.tl.triku(eval(adata_name), use_raw=False)
    sc.tl.umap(eval(adata_name), min_dist=0.05, random_state=seed)

In [None]:
# Sample of origin and per-sample JID assignment on the bbknn embedding (no re-clustering in this run).
for adata_name in ['adata_all_13', 'adata_all_2']:
    sc.pl.umap(eval(adata_name),  color=['s_dataset', 'assigned_cats'])

### harmonypy

#### sigma = 0.1

In [None]:
# Rebuild the joint objects from the per-sample ones so that this run starts from a clean state.
adata_all_13 = sc.AnnData.concatenate(adata_s1_13, adata_s2_13, adata_s3_13, adata_s4_13, batch_categories=['s1', 's2', 's3', 's4'], batch_key='s_dataset', join='outer')
adata_all_2 = sc.AnnData.concatenate(adata_s1_2, adata_s2_2, adata_s3_2, adata_s4_2, batch_categories=['s1', 's2', 's3', 's4'], batch_key='s_dataset', join='outer')

In [None]:
# Harmony (sigma=0.1) corrects the PCA coordinates by 's_dataset' and stores them in
# obsm['X_pca_harmony']; the kNN graph is then built on that corrected representation.
for adata_name in ['adata_all_13', 'adata_all_2']:
    sc.pp.filter_genes(eval(adata_name), min_counts=30)
    sc.pp.pca(eval(adata_name), random_state=seed, n_comps=30)
    sce.pp.harmony_integrate(eval(adata_name), key='s_dataset', basis='X_pca', 
                             adjusted_basis='X_pca_harmony', random_state=seed, 
                             epsilon_cluster=1e-06, epsilon_harmony=0.00001, 
                             max_iter_harmony=25, sigma=0.1)
    sc.pp.neighbors(eval(adata_name), metric='cosine', use_rep='X_pca_harmony', random_state=seed)
    tk.tl.triku(eval(adata_name), use_raw=False)
    sc.tl.umap(eval(adata_name), min_dist=0.3, random_state=seed)

In [None]:
# Coarse clustering of the Harmony-corrected graph, Wilcoxon marker ranking per cluster and
# a comparison with the sample of origin and the per-sample JID assignment.
for adata_name in ['adata_all_13', 'adata_all_2']:
    # random_state is passed explicitly, like in every other Leiden call of the notebook.
    sc.tl.leiden(eval(adata_name), resolution=0.2, random_state=seed)
    sc.tl.rank_genes_groups(eval(adata_name), method='wilcoxon', groupby='leiden')
    sc.pl.umap(eval(adata_name),  color=['s_dataset', 'leiden', 'assigned_cats',])

In [None]:
# Same '1+3' plot as produced inside the loop above, repeated on its own.
sc.pl.umap(adata_all_13,  color=['s_dataset', 'leiden', 'assigned_cats',])

#### sigma = 0.25

In [None]:
# Rebuild the joint objects from the per-sample ones so that this run starts from a clean state.
adata_all_13 = sc.AnnData.concatenate(adata_s1_13, adata_s2_13, adata_s3_13, adata_s4_13, batch_categories=['s1', 's2', 's3', 's4'], batch_key='s_dataset', join='outer')
adata_all_2 = sc.AnnData.concatenate(adata_s1_2, adata_s2_2, adata_s3_2, adata_s4_2, batch_categories=['s1', 's2', 's3', 's4'], batch_key='s_dataset', join='outer')

In [None]:
# Same Harmony pipeline as the previous run, with sigma=0.25.
for adata_name in ['adata_all_13', 'adata_all_2']:
    sc.pp.filter_genes(eval(adata_name), min_counts=30)
    sc.pp.pca(eval(adata_name), random_state=seed, n_comps=30)
    sce.pp.harmony_integrate(eval(adata_name), key='s_dataset', basis='X_pca', 
                             adjusted_basis='X_pca_harmony', random_state=seed, 
                             epsilon_cluster=1e-06, epsilon_harmony=0.00001, 
                             max_iter_harmony=25, sigma=0.25)
    sc.pp.neighbors(eval(adata_name), metric='cosine', use_rep='X_pca_harmony', random_state=seed)
    tk.tl.triku(eval(adata_name), use_raw=False)
    sc.tl.umap(eval(adata_name), min_dist=0.3, random_state=seed)

In [None]:
# Cluster, rank marker genes per cluster and plot each merged object (sigma=0.25 run).
# Objects are iterated directly rather than resolved with eval().
for adata_merged in (adata_all_13, adata_all_2):
    sc.tl.leiden(adata_merged, resolution=0.6)
    sc.tl.rank_genes_groups(adata_merged, method='wilcoxon', groupby='leiden')
    sc.pl.umap(adata_merged, color=['s_dataset', 'leiden', 'assigned_cats'])

In [None]:
# FB1+FB3 (sigma=0.25): UMAP overview, then the top 20 ranked genes per Leiden cluster.
sc.pl.umap(
    adata_all_13,
    color=['s_dataset', 'leiden', 'assigned_cats'],
)
sc.pl.rank_genes_groups_tracksplot(
    adata_all_13,
    n_genes=20,
    use_raw=False,
    dendrogram=False,
)

In [None]:
# FB2 (sigma=0.25): UMAP overview, then the top 20 ranked genes per Leiden cluster.
sc.pl.umap(
    adata_all_2,
    color=['s_dataset', 'leiden', 'assigned_cats'],
)
sc.pl.rank_genes_groups_tracksplot(
    adata_all_2,
    n_genes=20,
    use_raw=False,
    dendrogram=False,
)

#### sigma = 0.5

In [None]:
# Re-create the merged objects (outer join; dataset of origin recorded under 's_dataset').
adata_all_13 = sc.AnnData.concatenate(
    adata_s1_13, adata_s2_13, adata_s3_13, adata_s4_13,
    batch_key='s_dataset', batch_categories=['s1', 's2', 's3', 's4'], join='outer',
)
adata_all_2 = sc.AnnData.concatenate(
    adata_s1_2, adata_s2_2, adata_s3_2, adata_s4_2,
    batch_key='s_dataset', batch_categories=['s1', 's2', 's3', 's4'], join='outer',
)

In [None]:
# Harmony integration (sigma=0.5) of the merged FB1+FB3 and FB2 objects.
# The AnnData objects are iterated directly instead of being looked up by name with eval();
# every step below modifies the object in place.
for adata_merged in (adata_all_13, adata_all_2):
    sc.pp.filter_genes(adata_merged, min_counts=30)
    sc.pp.pca(adata_merged, random_state=seed, n_comps=30)
    # Correct the PCA space across the 's_dataset' batches; result stored in obsm['X_pca_harmony'].
    sce.pp.harmony_integrate(adata_merged, key='s_dataset', basis='X_pca',
                             adjusted_basis='X_pca_harmony', random_state=seed,
                             epsilon_cluster=1e-06, epsilon_harmony=0.00001,
                             max_iter_harmony=25, sigma=0.5)
    # Neighbour graph built on the Harmony-corrected representation.
    sc.pp.neighbors(adata_merged, metric='cosine', use_rep='X_pca_harmony', random_state=seed)
    tk.tl.triku(adata_merged, use_raw=False)
    sc.tl.umap(adata_merged, min_dist=0.3, random_state=seed)

In [None]:
# Cluster, rank marker genes per cluster and plot each merged object (sigma=0.5 run).
# Objects are iterated directly rather than resolved with eval().
for adata_merged in (adata_all_13, adata_all_2):
    sc.tl.leiden(adata_merged, resolution=0.6)
    sc.tl.rank_genes_groups(adata_merged, method='wilcoxon', groupby='leiden')
    sc.pl.umap(adata_merged, color=['s_dataset', 'leiden', 'assigned_cats'])

In [None]:
# FB1+FB3 (sigma=0.5): UMAP overview, then the top 20 ranked genes per Leiden cluster.
sc.pl.umap(
    adata_all_13,
    color=['s_dataset', 'leiden', 'assigned_cats'],
)
sc.pl.rank_genes_groups_tracksplot(
    adata_all_13,
    n_genes=20,
    use_raw=False,
    dendrogram=False,
)

In [None]:
# FB2 (sigma=0.5): UMAP overview, then the top 20 ranked genes per Leiden cluster.
sc.pl.umap(
    adata_all_2,
    color=['s_dataset', 'leiden', 'assigned_cats'],
)
sc.pl.rank_genes_groups_tracksplot(
    adata_all_2,
    n_genes=20,
    use_raw=False,
    dendrogram=False,
)

### Selecting the definitive option with bbknn with k=2

In [None]:
# Re-create the merged objects (outer join; dataset of origin recorded under 's_dataset').
adata_all_13 = sc.AnnData.concatenate(
    adata_s1_13, adata_s2_13, adata_s3_13, adata_s4_13,
    batch_key='s_dataset', batch_categories=['s1', 's2', 's3', 's4'], join='outer',
)
adata_all_2 = sc.AnnData.concatenate(
    adata_s1_2, adata_s2_2, adata_s3_2, adata_s4_2,
    batch_key='s_dataset', batch_categories=['s1', 's2', 's3', 's4'], join='outer',
)

In [None]:
# Definitive integration: BBKNN with 2 neighbours per batch, followed by triku and UMAP.
# The AnnData objects are iterated directly instead of being looked up by name with eval();
# every step below modifies the object in place.
for adata_merged in (adata_all_13, adata_all_2):
    sc.pp.filter_genes(adata_merged, min_counts=30)
    sc.pp.pca(adata_merged, random_state=seed, n_comps=30)
    # Batch-balanced neighbour graph across the 's_dataset' batches.
    sce.pp.bbknn(adata_merged, metric='angular', batch_key='s_dataset', neighbors_within_batch=2)
    tk.tl.triku(adata_merged, use_raw=False)
    sc.tl.umap(adata_merged, min_dist=0.05, random_state=seed)

In [None]:
# Cluster and plot each BBKNN-integrated object.
# Objects are iterated directly rather than resolved with eval().
for adata_merged in (adata_all_13, adata_all_2):
    sc.tl.leiden(adata_merged, resolution=1.5)
    sc.pl.umap(adata_merged, color=['s_dataset', 'leiden', 'assigned_cats'])

## Analysis
We observe that both the FB1+FB3 and the FB2 cells show an A1, an A2 and a B1/B2 population. Moreover, in the FB1+FB3 clusters there are two A2 clusters, and the A1 and B1/B2 clusters seem "enlarged", that is, they can be differentiated along a "differentiation" axis. We are going to try to understand why there are 3 copies of the A and B populations. To do that we are going to answer two questions:
* What genes separate FB2 from FB1+FB3 clusters?
* What genes separate the two axes from the FB2 cluster?

## What genes separate clusters FB2 from FB1+FB3?
To answer that question we are going to get the DEGs from FB2 and FB1+FB3 clusters, and get ontology terms to discern any pattern.

In [None]:
# Merge the FB2 and FB1+FB3 objects; the object of origin ('2' or '1+3') is recorded under 'a_dataset'.
adata_all_123 = sc.AnnData.concatenate(
    adata_all_2, adata_all_13,
    batch_key='a_dataset', batch_categories=['2', '1+3'], join='outer',
)

In [None]:
# Same BBKNN pipeline as for the separate objects, now on the combined FB2 / FB1+FB3 object.
# All steps modify adata_all_123 in place, so the order matters.
sc.pp.filter_genes(adata_all_123, min_counts=30)
sc.pp.pca(adata_all_123, random_state=seed, n_comps=30)
# Batches are balanced over the original datasets ('s_dataset'), not over 'a_dataset'.
sce.pp.bbknn(adata_all_123, metric='angular', batch_key='s_dataset', neighbors_within_batch=2)
tk.tl.triku(adata_all_123, use_raw=False)
sc.tl.umap(adata_all_123, min_dist=0.05, random_state=seed)

In [None]:
# Cluster the combined object and show it by object of origin, original clustering, dataset,
# Leiden cluster and assigned category.
sc.tl.leiden(adata_all_123, resolution=1.5)
sc.pl.umap(adata_all_123,  color=['a_dataset', 'science_clustering', 's_dataset', 'leiden', 'assigned_cats'])

In [None]:
# Differential expression between the two objects of origin ('2' vs '1+3'); stored in uns['rank_genes_groups'].
sc.tl.rank_genes_groups(adata_all_123, groupby='a_dataset', method='wilcoxon', use_raw=False)

In [None]:
# Top 150 ranked genes for the FB2 ('2') group.
DEGs_123_2 = adata_all_123.uns['rank_genes_groups']['names']['2'][:150]
# there are many MT and RP, so we are going to remove them to make GOs easier to interpret
DEGs_123_2 = np.array([i for i in DEGs_123_2 if (i[:2] != 'RP') & (i[:3] != 'MT-')])

# Top 150 ranked genes for the FB1+FB3 ('1+3') group (no RP/MT filtering is applied here).
# np.array() takes a copy: indexing the stored record array returns a view, so sorting the
# result in place would otherwise reorder the names kept in adata_all_123.uns.
DEGs_123_13 = np.array(adata_all_123.uns['rank_genes_groups']['names']['1+3'][:150])

In [None]:
# Sort alphabetically (in place) so the gene list is easier to read, then display it.
DEGs_123_2.sort()
DEGs_123_2

In [None]:
# Sort alphabetically (in place) so the gene list is easier to read, then display it.
DEGs_123_13.sort()
DEGs_123_13

In DEGs from the 2 cluster dataset we see genes like FOS, JUN, JUND, GADD45B, IER2, IRF, which are classic stress-related genes. 

In the DEGs from the 1+3 cluster we see glycolysis-related genes (ALDOA, LDHA, PGK1, GAPDH) and some hypoxia-related genes (BNIP3, BNIP3L), so it might be related to hypoxia.

**We are going to focus on the DEGs / GOs associated to the cluster 2.**

In [None]:
# Enrichr query (GO Biological Process 2018, human) for the FB2 DEGs.
enr_123_2 = gp.enrichr(
    gene_list=list(DEGs_123_2),
    gene_sets=['GO_Biological_Process_2018'],
    organism='Human',
    description='test_name',
    cutoff=0.5,
)

In [None]:
# 15 terms with the lowest adjusted p-value, restricted to the selected columns.
enr_123_2.results.sort_values(by='Adjusted P-value').head(15)[selected_enr_cols]

In [None]:
# Enrichr query (GO Biological Process 2018, human) for the FB1+FB3 DEGs.
enr_123_13 = gp.enrichr(
    gene_list=list(DEGs_123_13),
    gene_sets=['GO_Biological_Process_2018'],
    organism='Human',
    description='test_name',
    cutoff=0.5,
)

In [None]:
# 15 terms with the lowest adjusted p-value, restricted to the selected columns.
enr_123_13.results.sort_values(by='Adjusted P-value').head(15)[selected_enr_cols]

The GO terms clearly replicate the results of the *de visu* analysis. DEGs from the FB2 dataset reveal GO terms related to stress (*response to unfolded protein*, *regulation of apoptotic process*, *mRNA catabolic process*). To confirm these results, we are going to search the literature for stress-related genes and map them to our dataset. We expect cluster FB2 to show an increased expression of many of the genes reported in the literature.

### Mapping stress-related genes to populations
In this section we are going to use a set of genes from the following references:
* [van den Brink et al. (2017)](https://www.nature.com/articles/nmeth.4437) Table S1
* [O'Flanagan et al. (2019)](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1830-0) Figure 3, and [Github](https://github.com/kieranrcampbell/scrnaseq-digestion-paper/blob/master/data/deliverables/coregene_df-FALSE-v3.csv)
* [Denisenko et al. (2020)](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02048-6) Tables S1 and S3
* [Adam et al. (2017)](https://journals.biologists.com/dev/article/144/19/3625/48196/Psychrophilic-proteases-dramatically-reduce-single) Tables S1 and S2

These tables include genes overexpressed in different tissues (kidney, tumours, etc.) under hot (37 ºC) and cold conditions. Most of the authors state that these expression changes are not tissue-specific and appear, overall, in all samples that have undergone a certain degree of processing. Therefore, we are going to use these gene lists and map their genes to the dataset to see how apparent the gene expression patterns are.

In [None]:
# Adam et al. (2017) gene lists (s1/s2, hot and cold).
# Read the local copies first; if a file cannot be opened, read all of them from the GitHub repository.
try:
    gene_list_adam_2017_s1_hot = np.loadtxt('papers_genes_bad_quality/adam_2017_s1_hot.txt', dtype=str)
    gene_list_adam_2017_s1_cold = np.loadtxt('papers_genes_bad_quality/adam_2017_s1_cold.txt', dtype=str)

    gene_list_adam_2017_s2_hot = np.loadtxt('papers_genes_bad_quality/adam_2017_s2_hot.txt', dtype=str)
    gene_list_adam_2017_s2_cold = np.loadtxt('papers_genes_bad_quality/adam_2017_s2_cold.txt', dtype=str)
except OSError:  # only a missing/unreadable local file triggers the remote fallback
    gene_list_adam_2017_s1_hot = np.loadtxt('https://raw.githubusercontent.com/alexmascension/revisit_reynolds_fb/master/papers_genes_bad_quality/adam_2017_s1_hot.txt', dtype=str)
    gene_list_adam_2017_s1_cold = np.loadtxt('https://raw.githubusercontent.com/alexmascension/revisit_reynolds_fb/master/papers_genes_bad_quality/adam_2017_s1_cold.txt', dtype=str)

    gene_list_adam_2017_s2_hot = np.loadtxt('https://raw.githubusercontent.com/alexmascension/revisit_reynolds_fb/master/papers_genes_bad_quality/adam_2017_s2_hot.txt', dtype=str)
    gene_list_adam_2017_s2_cold = np.loadtxt('https://raw.githubusercontent.com/alexmascension/revisit_reynolds_fb/master/papers_genes_bad_quality/adam_2017_s2_cold.txt', dtype=str)

In [None]:
# Denisenko et al. (2020) gene lists (s1 hot/cold, s3 hot).
# Read the local copies first; if a file cannot be opened, read all of them from the GitHub repository.
try:
    gene_list_denisenko_2020_s1_hot = np.loadtxt('papers_genes_bad_quality/denisenko_2020_s1.txt', dtype=str)
    gene_list_denisenko_2020_s1_cold = np.loadtxt('papers_genes_bad_quality/denisenko_2020_s1_cold.txt', dtype=str)

    gene_list_denisenko_2020_s3_hot = np.loadtxt('papers_genes_bad_quality/denisenko_2020_s3.txt', dtype=str)
except OSError:  # only a missing/unreadable local file triggers the remote fallback
    gene_list_denisenko_2020_s1_hot = np.loadtxt('https://raw.githubusercontent.com/alexmascension/revisit_reynolds_fb/master/papers_genes_bad_quality/denisenko_2020_s1.txt', dtype=str)
    gene_list_denisenko_2020_s1_cold = np.loadtxt('https://raw.githubusercontent.com/alexmascension/revisit_reynolds_fb/master/papers_genes_bad_quality/denisenko_2020_s1_cold.txt', dtype=str)

    gene_list_denisenko_2020_s3_hot = np.loadtxt('https://raw.githubusercontent.com/alexmascension/revisit_reynolds_fb/master/papers_genes_bad_quality/denisenko_2020_s3.txt', dtype=str)

In [None]:
# O'Flanagan et al. (2019) gene lists (supplementary hot/cold, figure 3 hot).
# Read the local copies first; if a file cannot be opened, read all of them from the GitHub repository.
try:
    gene_list_oflanagan_2019_supp_hot = np.loadtxt('papers_genes_bad_quality/oflanagan_2019_gene_list_supp_hot.txt', dtype=str)
    gene_list_oflanagan_2019_supp_cold = np.loadtxt('papers_genes_bad_quality/oflanagan_2019_gene_list_supp_cold.txt', dtype=str)

    gene_list_oflanagan_2019_fig3_hot = np.loadtxt('papers_genes_bad_quality/oflanagan_2019_gene_list_fig3.txt', dtype=str)
except OSError:  # only a missing/unreadable local file triggers the remote fallback
    gene_list_oflanagan_2019_supp_hot = np.loadtxt('https://raw.githubusercontent.com/alexmascension/revisit_reynolds_fb/master/papers_genes_bad_quality/oflanagan_2019_gene_list_supp_hot.txt', dtype=str)
    gene_list_oflanagan_2019_supp_cold = np.loadtxt('https://raw.githubusercontent.com/alexmascension/revisit_reynolds_fb/master/papers_genes_bad_quality/oflanagan_2019_gene_list_supp_cold.txt', dtype=str)

    gene_list_oflanagan_2019_fig3_hot = np.loadtxt('https://raw.githubusercontent.com/alexmascension/revisit_reynolds_fb/master/papers_genes_bad_quality/oflanagan_2019_gene_list_fig3.txt', dtype=str)

In [None]:
# van den Brink et al. (2017) gene list.
# Read the local copy first; if it cannot be opened, read it from the GitHub repository.
try:
    gene_list_vandenbrick_2017_hot = np.loadtxt('papers_genes_bad_quality/vandenbrink_2017_gene_list.txt', dtype=str)
except OSError:  # only a missing/unreadable local file triggers the remote fallback
    gene_list_vandenbrick_2017_hot = np.loadtxt('https://raw.githubusercontent.com/alexmascension/revisit_reynolds_fb/master/papers_genes_bad_quality/vandenbrink_2017_gene_list.txt', dtype=str)

In [None]:
# Pool every "hot" list into one array (duplicates are kept on purpose; they are counted later).
hot_genes = np.concatenate([
    gene_list_adam_2017_s1_hot,
    gene_list_adam_2017_s2_hot,
    gene_list_denisenko_2020_s1_hot,
    gene_list_denisenko_2020_s3_hot,
    gene_list_oflanagan_2019_supp_hot,
    gene_list_oflanagan_2019_fig3_hot,
    gene_list_vandenbrick_2017_hot,
])

# Same pooling for the "cold" lists.
cold_genes = np.concatenate([
    gene_list_adam_2017_s1_cold,
    gene_list_adam_2017_s2_cold,
    gene_list_denisenko_2020_s1_cold,
    gene_list_oflanagan_2019_supp_cold,
])

In [None]:
# Collapse duplicates; the *_counts arrays hold how many times each gene occurs in the pooled lists.
# Note: hot_genes / cold_genes are overwritten, so this cell is not idempotent on re-run
# (a second run would give counts of 1 everywhere).
hot_genes, hot_counts = np.unique(hot_genes, return_counts=True)
cold_genes, cold_counts = np.unique(cold_genes, return_counts=True)

We are going to keep genes that appear at least twice across the different *cold*-related gene lists, and at least three times across the *hot*-related gene lists.

In [None]:
# `> 2` keeps genes that occur at least three times across the pooled hot lists.
hot_genes_good = hot_genes[hot_counts > 2]
hot_genes_good

In [None]:
# Save the selected stress genes. The folder is created first because it may not exist
# when the gene lists were read from the remote repository instead of a local checkout.
os.makedirs('papers_genes_bad_quality', exist_ok=True)
np.savetxt('papers_genes_bad_quality/stress_genes.txt', hot_genes_good, fmt='%s')

In [None]:
# `> 1` keeps genes that occur at least twice across the pooled cold lists.
cold_genes_good = cold_genes[cold_counts > 1]
cold_genes_good

In [None]:
# Genes present both in the FB2 DEGs and in the selected stress (hot) genes.
np.intersect1d(DEGs_123_2, hot_genes_good)

We see that there is a certain degree of overlap between the selected genes and the DEGs from Reynolds cluster FB2.

In [None]:
# Expression of every selected stress gene that is present in the combined object.
sc.pl.umap(
    adata_all_123,
    color=['a_dataset', 'assigned_cats']
          + [gene for gene in hot_genes_good if gene in adata_all_123.var_names],
    cmap=magma,
    use_raw=False,
)

We see that most of the genes are more expressed in the FB2 cluster. Moreover, some of them (BTG2, FOSB, KLF2, PHLDA2, HSPA1B) are mainly expressed in cluster FB2!

In [None]:
# Hard-coded stress gene set used to label Leiden clusters as 'Stress' or 'Non-stress'.
# NOTE(review): presumably a snapshot of hot_genes_good from the cell above — confirm they still match.
dict_cats = {'Stress': ['ATF3', 'BTG2', 'CEBPB', 'CEBPD', 'CLDN4', 'CSRNP1', 'CTGF',
       'CXCL1', 'CXCL2', 'CYR61', 'DNAJA1', 'DNAJB1', 'DUSP1', 'DUSP2',
       'DUSP5', 'EGR1', 'ELF3', 'FOS', 'FOSB', 'GADD45B', 'GADD45G',
       'HSP90AA1', 'HSPA1A', 'HSPA1B', 'HSPB1', 'IER2', 'IER3', 'IFRD1',
       'IRF1', 'JUN', 'JUNB', 'JUND', 'KLF2', 'KLF4', 'KLF6', 'MAFF',
       'NFKBIA', 'NFKBIZ', 'NR4A1', 'NR4A2', 'PHLDA2', 'PIM1', 'PLAUR',
       'PLK3', 'PPP1R15A', 'RASD1', 'RHOB', 'SOCS3', 'TNFAIP3', 'UBC',
       'ZFP36']}
# key_added keeps the result separate from the 'assigned_cats' labels plotted elsewhere.
assign_cats(adata_all_123, dict_cats=dict_cats, column_groupby='leiden', intermediate_states=True, 
            others_name='Non-stress', min_score=0.55, key_added='assigned_cats_stress')

In [None]:
# Compare object of origin, assigned category and stress label on the same embedding.
sc.pl.umap(
    adata_all_123,
    color=['a_dataset', 'assigned_cats', 'assigned_cats_stress'],
    legend_loc='on data',
    cmap=magma,
    use_raw=False,
)

## What genes separate A1/A2/B fibroblasts within FB1+FB3?
To answer that question we are going to get the DEGs from A2 clusters within FB1+FB3 cells, and get ontology terms to discern any pattern.

In [None]:
# Re-cluster FB1+FB3 at a higher resolution (overwrites the existing 'leiden' labels).
sc.tl.leiden(adata_all_13, resolution=2)

sc.pl.umap(adata_all_13,  color=['s_dataset', 'leiden', 'assigned_cats'])
# Marker genes for every new cluster (each cluster against the rest).
sc.tl.rank_genes_groups(adata_all_13, groupby='leiden', method='wilcoxon', use_raw=False)

In [None]:
# Top 50 ranked genes per Leiden cluster.
sc.pl.rank_genes_groups_tracksplot(adata_all_13, dendrogram=False, use_raw=False, n_genes=50)

In [None]:
# THIS PART MIGHT NOT BE REPLICABLE OUTSIDE!!! Clusters are set to detect differences in A2 left VS right
# Cluster '1' is compared against cluster '0'; the cluster numbers come from the Leiden run above
# and may differ in another environment. This overwrites uns['rank_genes_groups'].
# NOTE(review): use_raw is not passed here, unlike in the earlier rank_genes_groups call — confirm this is intended.
sc.tl.rank_genes_groups(adata_all_13, groupby='leiden', method='wilcoxon', groups=['1'], reference='0')
sc.pl.rank_genes_groups_tracksplot(adata_all_13, dendrogram=False, use_raw=False, n_genes=150)

In [None]:
# UMAP of the top 150 genes ranked for cluster '1' (reads the current uns['rank_genes_groups']).
sc.pl.umap(adata_all_13, color=['leiden'] + list(adata_all_13.uns['rank_genes_groups']['names']['1'][:150]), cmap=magma, use_raw=False)

In [None]:
# THIS PART MIGHT NOT BE REPLICABLE OUTSIDE!!! Clusters are set to detect differences in A2 left VS right
# Reverse comparison: cluster '0' against cluster '1'. This overwrites uns['rank_genes_groups'],
# so afterwards only the '0' group is available there.
sc.tl.rank_genes_groups(adata_all_13, groupby='leiden', method='wilcoxon', groups=['0'], reference='1')
sc.pl.rank_genes_groups_tracksplot(adata_all_13, dendrogram=False, use_raw=False, n_genes=150)

In [None]:
# Recompute the '1' vs '0' comparison (the previous cell replaced it with '0' vs '1') and plot its top 150 genes.
# NOTE(review): this repeats the comparison and the plot made two cells above — confirm whether
# the genes of group '0' were meant to be shown here instead.
sc.tl.rank_genes_groups(adata_all_13, groupby='leiden', method='wilcoxon', groups=['1'], reference='0')
sc.pl.umap(adata_all_13, color=['leiden'] + list(adata_all_13.uns['rank_genes_groups']['names']['1'][:150]), cmap=magma, use_raw=False)

In [None]:
# Manually curated gene sets for the two A2 sub-clusters; they are later used as the
# 'Hypoxia' (A) and 'Normoxia' (B) categories.
# NOTE(review): presumably picked by hand from the rank_genes_groups results above — confirm.
genes_cluster_A2_A = ['HILPDA', 'VIM', 'FAM162A', 'VEGFA', 'LDHA', 'ADM', 'SNHG7', 'SYNPO', 'INSIG2', 'ERO1A', 'ZNF395', 'ANGPTL4', 
                      'NRN1', 'SLC2A1', 'EHD2', 'DDIT4', 'P4HA2', 'DPYSL2', 'WSB1', 'SNHG1', 'SNHG8', 'SLC16A3', 'BNIP3', 'RAB20', 'VKORC1',
                      'LOXL2', 'KLF6', 'PGM1', 'SLC2A14', 'PPP1R18', 'BAIAP2', 'CDON', 'CLK1', 'PDK1', 'PYGL', 'NFIX', 'AK4', 'RORA',
                      'NGLY1', 'SCD', 'GPRC5A', 'RASSF5', 'COPS2', 'COL27A1', 'SH3BP5', 'EGLN3', 'JUN', 'PGF', 'CXCR4', 'PPP1R3B', 
                      'HSD3B7', 'SNX33', 'KCTD11', 'MAFF', 'BEND5', 'SEPTIN9', 'PPP1R3C', 'SMIM3', 'SFXN3', 'SNHG18', 'OSBPL5',
                      ]
genes_cluster_A2_B = ['HSPE1', 'HMGB1', 'CEMIP', 'SNRPG', 'ERH', 'PRDX1', 'AKR1C1', 'PSMB3', 'DDX5', 'ATP5MF', 'CCT7', 'NDUFS6', 
                      'COX6B1', 'PSMA4', 'SPON1', 'NDUFAF3', 'ACTB', 'ADAMTS5', 'SLIRP', 'SSB', 'MRPL41', 'POLR2K', 'NQO2']

From the gene sets, we are going to assume that A gene set represents hypoxic conditions ('hypoxia') and B represents normoxic conditions ('normoxia')

In [None]:
# Expression of the A ("hypoxia") gene set on the FB1+FB3 embedding.
sc.pl.umap(
    adata_all_13,
    color=genes_cluster_A2_A,
    cmap=magma,
    use_raw=False,
)

In [None]:
# Expression of the B ("normoxia") gene set on the FB1+FB3 embedding.
sc.pl.umap(
    adata_all_13,
    color=genes_cluster_A2_B,
    cmap=magma,
    use_raw=False,
)

In [None]:
# Label the clusters as 'Hypoxia' or 'Normoxia' from the two curated gene sets.
assign_cats(
    adata_all_13,
    dict_cats={'Hypoxia': genes_cluster_A2_A, 'Normoxia': genes_cluster_A2_B},
    key_added='cats_hypoxia',
    min_score=0.3,
)

In [None]:
# Assigned category, Leiden cluster and hypoxia/normoxia label side by side.
sc.pl.umap(
    adata_all_13,
    color=['assigned_cats', 'leiden', 'cats_hypoxia'],
    cmap=magma,
    use_raw=False,
)

In [None]:
# Differential expression between the 'cats_hypoxia' groups (overwrites uns['rank_genes_groups']),
# followed by the top 150 genes per group.
sc.tl.rank_genes_groups(adata_all_13, groupby='cats_hypoxia', method='wilcoxon')
sc.pl.rank_genes_groups_tracksplot(adata_all_13, dendrogram=False, use_raw=False, n_genes=150)

In [None]:
# Top 150 ranked genes per group, dropping names that start with 'RP' or 'MT-'.
DEGs_hypoxia = list(adata_all_13.uns['rank_genes_groups']['names']['Hypoxia'][:150])
DEGs_hypoxia = np.array([gene for gene in DEGs_hypoxia if not gene.startswith(('RP', 'MT-'))])

DEGs_normoxia = list(adata_all_13.uns['rank_genes_groups']['names']['Normoxia'][:150])
DEGs_normoxia = np.array([gene for gene in DEGs_normoxia if not gene.startswith(('RP', 'MT-'))])

In [None]:
# Sort alphabetically (in place) so the gene list is easier to read, then display it.
DEGs_hypoxia.sort()
DEGs_hypoxia

In [None]:
# Sort alphabetically (in place) so the gene list is easier to read, then display it.
DEGs_normoxia.sort()
DEGs_normoxia

In [None]:
# Enrichr query (GO Biological Process 2018, human) for the hypoxia DEGs.
enr_hypoxia = gp.enrichr(
    gene_list=list(DEGs_hypoxia),
    gene_sets=['GO_Biological_Process_2018'],
    organism='Human',
    description='test_name',
    cutoff=0.5,
)

In [None]:
# 15 terms with the lowest adjusted p-value, restricted to the selected columns.
enr_hypoxia.results.sort_values(by='Adjusted P-value').head(15)[selected_enr_cols]

In [None]:
# Enrichr query (GO Biological Process 2018, human) for the normoxia DEGs.
enr_normoxia = gp.enrichr(
    gene_list=list(DEGs_normoxia),
    gene_sets=['GO_Biological_Process_2018'],
    organism='Human',
    description='test_name',
    cutoff=0.5,
)

In [None]:
# 15 terms with the lowest adjusted p-value, restricted to the selected columns.
enr_normoxia.results.sort_values(by='Adjusted P-value').head(15)[selected_enr_cols]

We confirm that the cell set with genes related to hypoxic conditions shows a clear enrichment of glycolysis- and hypoxia-related genes. It has been observed that cells upregulate glycolysis to produce energy under hypoxic conditions. Therefore, these cells will underexpress genes related to energy production with oxygen, such as the NDUFA proteins, subunits of the mitochondrial complex I; PSMAs (proteasome subunits), which are involved in general metabolism; or the ATPase subunits.

Interestingly, normoxic cells overexpress *HIF1A*. Maybe it is an underexpression of *HIF1A* because hypoxic cells are already under hypoxia, and normoxic cells are starting to express it to signal hypoxic conditions?

In [None]:
# Leiden clusters next to VEGFA, HIF1A and ATR expression.
sc.pl.umap(
    adata_all_13,
    color=['leiden', 'VEGFA', 'HIF1A', 'ATR'],
    cmap=magma,
    use_raw=False,
)

### Mapping hypoxia-related genes to populations
In this section we are going to use a set of genes from the following references:
* HALLMARK_HYPOXIA and HALLMARK_GLYCOLYSIS gene lists from GSEA. The use of these lists is supported by different references ([Evers et al. (2019)](https://doi.org/10.1021/acs.analchem.9b02410), 
[Xiao et al. (2019)](https://doi.org/10.1038/s41467-019-11738-0), 
[Mohyeldin et al. (2010)](https://doi.org/10.1016/j.stem.2010.07.007), 
[Simon et al. (2008)](https://doi.org/10.1038/nrm2354))
* [Wu et al. (2021)](https://doi.org/10.7554/eLife.63003) 
* [Guo et al. (2021)](https://doi.org/10.3389/fcell.2021.624711) Supplementary File 1

In [None]:
# GSEA hallmark hypoxia and glycolysis gene lists.
# Read the local copies first; if a file cannot be opened, read both from the GitHub repository.
try:
    gene_list_GSEA_hallmark_hypoxia = np.loadtxt('papers_genes_bad_quality/GSEA_hallmark_hypoxia.txt', dtype=str)
    gene_list_GSEA_hallmark_glicolysis = np.loadtxt('papers_genes_bad_quality/GSEA_hallmark_glicolysis.txt', dtype=str)
except OSError:  # only a missing/unreadable local file triggers the remote fallback
    gene_list_GSEA_hallmark_hypoxia = np.loadtxt('https://raw.githubusercontent.com/alexmascension/revisit_reynolds_fb/master/papers_genes_bad_quality/GSEA_hallmark_hypoxia.txt', dtype=str)
    gene_list_GSEA_hallmark_glicolysis = np.loadtxt('https://raw.githubusercontent.com/alexmascension/revisit_reynolds_fb/master/papers_genes_bad_quality/GSEA_hallmark_glicolysis.txt', dtype=str)

In [None]:
# Guo et al. brain hypoxia gene list.
# Read the local copy first; if it cannot be opened, read it from the GitHub repository.
try:
    gene_list_guo_2021 = np.loadtxt('papers_genes_bad_quality/guo_2021_brain_hypoxia.txt', dtype=str)
except OSError:  # only a missing/unreadable local file triggers the remote fallback
    gene_list_guo_2021 = np.loadtxt('https://raw.githubusercontent.com/alexmascension/revisit_reynolds_fb/master/papers_genes_bad_quality/guo_2021_brain_hypoxia.txt', dtype=str)

In [None]:
# Wu et al. (2021) lung hypoxia gene lists.
# Read the local copies first; if a file cannot be opened, read them from the GitHub repository.
# NOTE(review): the `endo` list is read from the same *_bulk.txt file as the `bulk` list, so
# every gene of that file is counted twice when the lists are pooled. This looks like a
# copy-paste slip — confirm the intended file name for the endothelial list.
try:
    gene_list_wu_2021_bulk = np.loadtxt('papers_genes_bad_quality/wu_2021_lung_hypoxia_bulk.txt', dtype=str)
    gene_list_wu_2021_endo = np.loadtxt('papers_genes_bad_quality/wu_2021_lung_hypoxia_bulk.txt', dtype=str)
except OSError:  # only a missing/unreadable local file triggers the remote fallback
    gene_list_wu_2021_bulk = np.loadtxt('https://raw.githubusercontent.com/alexmascension/revisit_reynolds_fb/master/papers_genes_bad_quality/wu_2021_lung_hypoxia_bulk.txt', dtype=str)
    gene_list_wu_2021_endo = np.loadtxt('https://raw.githubusercontent.com/alexmascension/revisit_reynolds_fb/master/papers_genes_bad_quality/wu_2021_lung_hypoxia_bulk.txt', dtype=str)

In [None]:
# Ferreira (2018) gene lists (s1 all, s1 filtered, s2 blood).
# Read the local copies first; if a file cannot be opened, read all of them from the GitHub repository.
try:
    gene_list_ferreira_2018_s1_all = np.loadtxt('papers_genes_bad_quality/ferreira_2018_s1_all.txt', dtype=str)
    gene_list_ferreira_2018_s1_filtered = np.loadtxt('papers_genes_bad_quality/ferreira_2018_s1_filtered.txt', dtype=str)
    gene_list_ferreira_2018_s2_blood = np.loadtxt('papers_genes_bad_quality/ferreira_2018_s2_blood.txt', dtype=str)
except OSError:  # only a missing/unreadable local file triggers the remote fallback
    gene_list_ferreira_2018_s1_all = np.loadtxt('https://raw.githubusercontent.com/alexmascension/revisit_reynolds_fb/master/papers_genes_bad_quality/ferreira_2018_s1_all.txt', dtype=str)
    gene_list_ferreira_2018_s1_filtered = np.loadtxt('https://raw.githubusercontent.com/alexmascension/revisit_reynolds_fb/master/papers_genes_bad_quality/ferreira_2018_s1_filtered.txt', dtype=str)
    gene_list_ferreira_2018_s2_blood = np.loadtxt('https://raw.githubusercontent.com/alexmascension/revisit_reynolds_fb/master/papers_genes_bad_quality/ferreira_2018_s2_blood.txt', dtype=str)

In [None]:
# Manually selected hypoxia genes; they are added to the final hypoxia list regardless of
# how often they occur in the literature lists.
manual_list_hypoxia_genes = ['ADM', 'ANGPTL4', 'ANKRD37', 'ARID5B', 'ARRDC3', 'BHLHE40', 'BNIP3', 'BNIP3L', 'C4orf3', 'CD44', 'CLN8', 
'COPS2', 'DDIT3', 'DDIT4', 'DHRS3', 'EHD2', 'EIF4A2', 'ENO1', 'ENO2', 'ERO1A', 'FAM162A', 'FGF11', 'FGFR1', 'FNBP1', 
'GBE1', 'GPI', 'HIF1A-AS2', 'HILPDA', 'HK2', 'INSIG2', 'LDHA', 'LOXL2', 'NDRG1', 'NGLY1', 'NRN1', 'P4HA1', 'PDE2A', 
'PGK1', 'PLIN2', 'PLOD2', 'PLP2', 'RBPJ', 'SERPINE1', 'SLC2A1', 'SLC2A3', 'SNHG12', 'SNHG7', 'SYNPO', 'TAF1D', 'TNFAIP8', 
'TNIP1', 'TPI1', 'VEGFA', 'ZFAS1', 'ZNF267', 'ZNF395', ]

In [None]:
# Pool all literature lists, count how often each gene occurs, and keep the genes that occur
# more than twice together with the manually selected ones (sorted alphabetically).
hypoxia_genes = np.concatenate([
    gene_list_GSEA_hallmark_hypoxia,
    gene_list_GSEA_hallmark_glicolysis,
    gene_list_guo_2021,
    gene_list_wu_2021_bulk,
    gene_list_wu_2021_endo,
    gene_list_ferreira_2018_s1_all,
    gene_list_ferreira_2018_s1_filtered,
    gene_list_ferreira_2018_s2_blood,
])
hypoxia_genes, hypoxia_counts = np.unique(hypoxia_genes, return_counts=True)
hypoxia_genes_good = np.array(
    sorted(set(hypoxia_genes[hypoxia_counts > 2]).union(manual_list_hypoxia_genes))
)
hypoxia_genes_good

In [None]:
# Expression of every selected hypoxia gene that is present in the FB1+FB3 object.
sc.pl.umap(
    adata_all_13,
    color=['leiden'] + [gene for gene in hypoxia_genes_good if gene in adata_all_13.var_names],
    cmap=magma,
    use_raw=False,
)

In [None]:
# Save the selected hypoxia genes. The folder is created first because it may not exist
# when the gene lists were read from the remote repository instead of a local checkout.
os.makedirs('papers_genes_bad_quality', exist_ok=True)
np.savetxt('papers_genes_bad_quality/hypoxia_genes.txt', hypoxia_genes_good, fmt='%s')

In [None]:
# Category -> gene set mapping used to flag stressed and hypoxic clusters.
dict_cats = {
    'Stress': hot_genes_good,
    'Hypoxia': hypoxia_genes_good,
}

In [None]:
# Re-cluster the combined object (overwrites the existing 'leiden' labels) and label each
# cluster as 'Stress', 'Hypoxia' or, when neither applies, 'Normal'.
sc.tl.leiden(adata_all_123, resolution=1.7)
assign_cats(adata_all_123, dict_cats=dict_cats, column_groupby='leiden', intermediate_states=False, min_score=0.5,
            key_added='hypoxia_stress', others_name='Normal')
sc.pl.umap(adata_all_123, color=['hypoxia_stress', 'assigned_cats'], 
           cmap=magma, use_raw=False)

In [None]:
# Cluster labels together with the 'hypoxia_stress_Stress' / 'hypoxia_stress_Hypoxia' columns.
sc.pl.umap(
    adata_all_123,
    color=['leiden', 'hypoxia_stress', 'assigned_cats', 'hypoxia_stress_Stress', 'hypoxia_stress_Hypoxia'],
    ncols=3,
    legend_loc='on data',
    cmap=magma,
    use_raw=False,
)

# Preparing Figure 1A

In [None]:
# Keep only the cells labelled 'Normal' (neither stressed nor hypoxic).
# .copy() turns the subset into an independent AnnData: the following cells modify it in place,
# which should not be done on a view of adata_all_123.
adata_fig = adata_all_123[adata_all_123.obs['hypoxia_stress'] == 'Normal'].copy()

In [None]:
# Re-run the BBKNN pipeline on the 'Normal' subset; all steps modify adata_fig in place.
sc.pp.filter_genes(adata_fig, min_counts=30)
sc.pp.pca(adata_fig, random_state=seed, n_comps=30)
# Batches are balanced over the original datasets ('s_dataset').
sce.pp.bbknn(adata_fig, metric='angular', batch_key='s_dataset', neighbors_within_batch=2)
tk.tl.triku(adata_fig, use_raw=False)
sc.tl.umap(adata_fig, min_dist=0.05, random_state=seed)

In [None]:
# Fine-grained clustering of the 'Normal' subset.
sc.tl.leiden(adata_fig, resolution=2.5)

# NOTE(review): `dict_cat` is not the Stress/Hypoxia `dict_cats`; it is not defined in this part of
# the notebook — presumably the fibroblast population marker dictionary defined earlier. Confirm.
# No key_added is passed, so the function's default key ('assigned_cats') is used.
assign_cats(adata_fig, dict_cats=dict_cat, column_groupby='leiden', intermediate_states=True, diff=0.02, min_score=0)

# Fixed colour per (possibly intermediate) population label.
dict_colors = {'A1': '#e14b67', 'A2': '#d98c58', 'B1': '#009f61', 'B2': '#54ab4c', 'B1/B2': '#2AA557', 
               'A2/B2': '#979C52', 'A1/A2': '#DD6C60', 'A2/B1': '#6D965D', 'A2/B1/B2': '#819857'}

# Colours are listed in the sorted order of the labels actually present; a label missing
# from dict_colors raises KeyError here.
adata_fig.uns['assigned_cats_colors'] = [dict_colors[i] for i in sorted(set(adata_fig.obs['assigned_cats'].values))]

# save='F1A.pdf' also writes the figure to disk.
sc.pl.umap(adata_fig,  color=['assigned_cats'], legend_loc='on data', frameon=False, legend_fontsize=13, title='', save='F1A.pdf')

In [None]:
# For each category: print its genes, then plot the original clustering next to the
# category genes that are present in adata_fig.
for key, val in dict_cat.items():
    print(key, val)
    sc.pl.umap(
        adata_fig,
        color=['science_clustering'] + [gene for gene in val if gene in adata_fig.var_names],
        legend_loc='on data',
        cmap=magma,
        use_raw=False,
        ncols=5,
    )