## DE for broad and granular annotations

In [1]:
import sys
import io
from pathlib import Path
import pickle

import scanpy as sc
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from pathlib import Path
import seaborn as sns

In [None]:
sys.path.append(str(Path.home() / 'Code/sctoolkit')) # https://github.com/gokceneraslan/sctoolkit/

In [2]:
from sctoolkit.utils import run_spring, summarized_expression_df, bin_pval

In [3]:
# Global scanpy plotting defaults for this notebook: 100 dpi figures on a
# white, non-transparent background, with vector-friendly output enabled.
sc.set_figure_params(dpi=100, vector_friendly=True, facecolor='white', transparent=False)
# Set scanpy's logging verbosity to 'info'.
sc.settings.verbosity = 'info'

In [4]:
from typing import Optional, Iterable, Tuple, Union
from anndata import AnnData

# Similar to sc.get.rank_genes_groups_df, but also returns the `pts` and
# `pts_rest` columns when they are present in the stored results.
# Newer scanpy versions can compute these with sc.tl.rank_genes_groups(..., pts=True).
def rank_genes_groups_df(
    adata: "AnnData",
    group: Optional[Union[str, Iterable[str]]] = None,
    *,
    key: str = "rank_genes_groups",
    pval_cutoff: Optional[float] = None,
    log2fc_min: Optional[float] = None,
    log2fc_max: Optional[float] = None,
    gene_symbols: Optional[str] = None,
) -> pd.DataFrame:
    """Return stored rank_genes_groups results as one long-format data frame.

    Parameters
    ----------
    adata
        Object whose ``uns[key]`` holds the per-group result arrays
        (``names``, ``scores``, ``logfoldchanges``, ``pvals``, ``pvals_adj``)
        and optionally the ``pts`` / ``pts_rest`` tables.
    group
        A single group name, any iterable of group names, or ``None`` to use
        every group found in ``adata.uns[key]['names']``.
    key
        Key in ``adata.uns`` under which the results are stored.
    pval_cutoff
        If given, keep only rows with ``pvals_adj`` strictly below this value.
    log2fc_min
        If given, keep only rows with ``logfoldchanges`` strictly above this value.
    log2fc_max
        If given, keep only rows with ``logfoldchanges`` strictly below this value.
    gene_symbols
        If given, name of a column in ``adata.var`` that is joined onto the
        result using the ``names`` column.

    Returns
    -------
    A data frame with one row per (group, gene), ordered by group and then by
    the rank stored in the results. ``pts`` and ``pts_rest`` columns are added
    when the corresponding tables exist in ``adata.uns[key]``.
    """
    result = adata.uns[key]

    if group is None:
        group = list(result['names'].dtype.names)
    elif isinstance(group, str):
        group = [group]
    else:
        # Materialise tuples/generators: a tuple used as a DataFrame key would
        # be read as a single column label instead of a list of labels.
        group = list(group)

    colnames = ['names', 'scores', 'logfoldchanges', 'pvals', 'pvals_adj']

    d = [pd.DataFrame(result[c])[group] for c in colnames]
    d = pd.concat(d, axis=1, names=[None, 'group'], keys=colnames)
    d = d.stack(level=1).reset_index()
    d['group'] = pd.Categorical(d['group'], categories=group)
    # 'level_0' is the original row position, i.e. the rank within each group.
    d = d.sort_values(['group', 'level_0']).drop(columns='level_0')

    if pval_cutoff is not None:
        d = d[d["pvals_adj"] < pval_cutoff]
    if log2fc_min is not None:
        d = d[d["logfoldchanges"] > log2fc_min]
    if log2fc_max is not None:
        d = d[d["logfoldchanges"] < log2fc_max]
    if gene_symbols is not None:
        d = d.join(adata.var[gene_symbols], on="names")

    def _long_fraction(table: pd.DataFrame, value_name: str) -> pd.DataFrame:
        # Reshape a genes x groups table to (names, group, value) rows. The
        # index is renamed explicitly so this works whether or not the gene
        # index carries a name of its own.
        return (
            table[group]
            .rename_axis(index='names')
            .reset_index()
            .melt(id_vars='names', var_name='group', value_name=value_name)
        )

    for fraction_key in ('pts', 'pts_rest'):
        if fraction_key in result:
            d = d.merge(
                _long_fraction(result[fraction_key], fraction_key),
                on=['names', 'group'],
            )

    return d.reset_index(drop=True)

## Literature-curated marker list

In [5]:
# Collect the literature-curated markers from one Excel sheet per tissue.
marker_sheets = ('Breast', 'Esophagus', 'Heart', 'Lung', 'Skeletal Muscle', 'Prostate', 'Skin')
marker_frames = []

for sheet in tqdm(marker_sheets):
    sheet_df = pd.read_excel('../markers.xlsx', sheet_name=sheet)
    sheet_df = sheet_df[['Gene', 'Cell-Type']].rename(columns={'Gene': 'genes', 'Cell-Type': 'marker'})

    # Tissue labels are the sheet names lower-cased with spaces removed.
    tissue_label = sheet.replace(' ', '').lower()

    # The single 'Esophagus' sheet is reused for both esophagus tissue labels.
    if tissue_label == 'esophagus':
        tissue_labels = ('esophagusmuscularis', 'esophagusmucosa')
    else:
        tissue_labels = (tissue_label,)

    for label in tissue_labels:
        marker_frames.append(sheet_df.assign(tissue=label))

markers = pd.concat(marker_frames, axis=0).drop_duplicates().reset_index(drop=True)
markers

  0%|          | 0/7 [00:00<?, ?it/s]

Unnamed: 0,genes,marker,tissue
0,CD19,B cell,breast
1,FCER2,B cell_mature-naïve,breast
2,CD69,B cell_mature-naïve,breast
3,CD80,B cell_mature-naïve,breast
4,CD86,B cell_mature-naïve,breast
...,...,...,...
1325,PTPRB,Vascular Endothelial,skin
1326,EGLF7,Vascular Endothelial,skin
1327,SCGB3A1,Secretory Cell,skin
1328,SCGB3A2,Secretory Cell,skin


In [6]:
def _join_unique(values):
    """Return the distinct values of one group as a sorted, comma-separated string."""
    return ','.join(sorted(set(values)))


# One row per (gene, tissue); the remaining columns are collapsed with _join_unique.
markers = (
    markers
    .groupby(['genes', 'tissue'])
    .agg(_join_unique)
    .reset_index()
)

In [7]:
markers

Unnamed: 0,genes,tissue,marker
0,ABCG2,skeletalmuscle,Side Population (SP) Cell
1,ACPP,prostate,Luminal Epithelial
2,ACSL1,breast,Adipocyte
3,ACSL1,heart,Adipocyte
4,ACSL1,lung,Adipocyte
...,...,...,...
1220,VWF,heart,Vascular Endothelial
1221,VWF,lung,Vascular Endothelial
1222,VWF,prostate,Vascular Endothelial
1223,VWF,skeletalmuscle,Vascular Endothelial


## Load data

In [None]:
# Load the AnnData object used by the per-tissue DE loop below, which reads
# `adata.obs.Tissue` to split the cells by tissue.
adata = sc.read('../alltissue_v5__myocytes_20210204.h5ad')

In [10]:
def _build_de_table(ad, groupby, de_key, spring_df, tissue, markers):
    """Assemble the long-format DE table for one annotation column of `ad`."""
    df = rank_genes_groups_df(ad, key=de_key)

    df_exp = (
        summarized_expression_df(ad, groupby=groupby, ops=['mean_expressed'], use_raw=False)
        .reset_index()
        .rename(columns={groupby: 'group', 'gene': 'names'})
    )
    # Assign the filled column back explicitly: `df.col.fillna(0, inplace=True)`
    # acts on a temporary and is not guaranteed to update `df` (it is a no-op
    # under pandas Copy-on-Write).
    df_exp['mean_expressed'] = df_exp['mean_expressed'].fillna(0)

    df = df.merge(df_exp, how='left')
    # Rows without a match in df_exp get NaN from the left merge.
    df['mean_expressed'] = df['mean_expressed'].fillna(0)

    df = df.merge(spring_df, how='left')

    # 'pts' / 'pts_rest' are renamed only if present in the table.
    df = df.rename(columns={
        'names': 'genes',
        'scores': 'tstat',
        'pts': 'fraction_group',
        'pts_rest': 'fraction_rest',
        'logfoldchanges': 'log2FC',
        'pvals_adj': 'pvals_fdr',
    })

    if markers is not None:
        # Left merge on the columns shared with `markers`; genes without a
        # curated marker entry end up with an empty string.
        df = df.assign(tissue=tissue).merge(markers, how='left')
        df['marker'] = df['marker'].fillna('').astype(str)

    return df


def celltype_de(ad, tissue, correct=('prep', 'Sex'), markers=None):
    """Run DE for the 'Broad cell type' and 'Granular cell type' annotations.

    Parameters
    ----------
    ad
        AnnData object for one tissue. It is modified by this function:
        ``sc.pp.combat`` is called on it once per entry of ``correct`` and the
        DE results are stored under ``ad.uns['broad_de']`` and
        ``ad.uns['granular_de']``.
    tissue
        Tissue label written to the ``tissue`` column before merging with
        ``markers``. Only used when ``markers`` is given.
    correct
        Names of the ``ad.obs`` columns passed to ``sc.pp.combat``, one call
        per column, in order.
    markers
        Optional marker table that is left-merged onto both result tables and
        must provide a ``marker`` column.
        NOTE(review): presumably it shares the ``genes`` and ``tissue`` columns
        with the result tables -- confirm against the marker loading cell.

    Returns
    -------
    ``(df_broad, df_granular, ad)`` -- the two DE tables and the same ``ad``
    object that was passed in.
    """
    for var in correct:
        sc.pp.combat(ad, var)

    # The return values of run_spring are merged into the DE tables below.
    spring_df_broad = run_spring(ad, 'Broad cell type')
    spring_df_granular = run_spring(ad, 'Granular cell type')

    # NOTE(review): no `pts=True` is passed here, so 'pts' / 'pts_rest' are
    # presumably absent from the results -- confirm whether the fraction
    # columns are wanted in the output tables.
    sc.tl.rank_genes_groups(ad, groupby='Broad cell type', key_added='broad_de', use_raw=False)
    sc.tl.rank_genes_groups(ad, groupby='Granular cell type', key_added='granular_de', use_raw=False)

    df_broad = _build_de_table(ad, 'Broad cell type', 'broad_de', spring_df_broad, tissue, markers)
    df_granular = _build_de_table(ad, 'Granular cell type', 'granular_de', spring_df_granular, tissue, markers)

    return df_broad, df_granular, ad

In [11]:
# Not populated by any of the cells shown in this notebook section.
de_dict_adatas = {}

# Per-tissue DE tables, keyed by tissue name; filled in the DE loop below.
de_dict_broad = {}
de_dict_granular = {}

## DE

In [12]:
def move_spring_to_de(ad, spring_key, de_key, old_de_key, obs_key):
    """Store a copy of a DE result whose gene names are ordered by spring_score.

    ``ad.uns[old_de_key]`` is copied to ``ad.uns[de_key]`` and the copy's
    ``'names'`` entry is replaced by a record array with one field per
    category of ``ad.obs[obs_key]``. Each field lists the ``genes`` of that
    category from the table ``ad.uns[spring_key]``, sorted by descending
    ``spring_score``.
    """
    ad.uns[de_key] = ad.uns[old_de_key].copy()
    categories = ad.obs[obs_key].cat.categories

    ranked_genes = []
    for category in categories:
        spring_table = ad.uns[spring_key]
        in_category = spring_table[spring_table['group'] == category]
        by_score = in_category.sort_values('spring_score', ascending=False)
        ranked_genes.append(by_score[['genes']].reset_index(drop=True))

    # Place the per-category gene columns side by side and label them.
    gene_table = pd.concat(ranked_genes, axis=1, ignore_index=True)
    gene_table.columns = categories

    ad.uns[de_key]['names'] = gene_table.to_records(index=False)

In [None]:
%%time

# obs columns handed to celltype_de as ComBat covariates.
correct = ['prep', 'Sex']

# Run the DE pipeline once per tissue and write one h5ad file per tissue.
for tissue in tqdm(adata.obs.Tissue.cat.categories):
    # Independent copy of this tissue's cells; celltype_de modifies it.
    ad = adata[adata.obs.Tissue == tissue].copy()
    # Only pass covariates that have more than one category in this subset
    # (presumably because batch correction needs at least two batches).
    correct_ex = [x for x in correct if len(ad.obs[x].cat.categories)>1]
    
    df_broad, df_granular, ad_ret = celltype_de(ad, correct=correct_ex, markers=markers, tissue=tissue)

    # celltype_de returns the object it was given, so `ad_ret` is `ad` here.
    # NOTE(review): these two assignments therefore look redundant -- confirm.
    ad.uns['broad_de'] = ad_ret.uns['broad_de']
    ad.uns['granular_de'] = ad_ret.uns['granular_de']

    # Keep the DE tables inside the AnnData object as well ...
    ad.uns['df_broad'] = df_broad.copy()
    ad.uns['df_granular'] = df_granular.copy()

    # ... and in the notebook-level dictionaries, keyed by tissue.
    de_dict_broad[tissue] = df_broad.copy()
    de_dict_granular[tissue] = df_granular.copy()
    
    # Store the tissue-level embedding under the 'X_umap' key.
    # NOTE(review): `.values` implies 'X_umap_tissue' is a pandas object -- confirm.
    ad.obsm['X_umap'] = ad.obsm['X_umap_tissue'].values
    # Drop any stored colours for the granular annotation.
    if 'Granular cell type_colors' in ad.uns_keys():
        del ad.uns['Granular cell type_colors']
        
    # Add DE entries whose gene order follows spring_score in the DE tables.
    move_spring_to_de(ad, 'df_broad', 'broad_zscore_de', 'broad_de', 'Broad cell type')
    move_spring_to_de(ad, 'df_granular', 'granular_zscore_de', 'granular_de', 'Granular cell type')        
    
    ad.write(f'DE-{tissue}.h5ad')

  0%|          | 0/8 [00:00<?, ?it/s]

Standardizing Data across genes.

Found 4 batches

Found 0 numerical variables:
	

Found 642 genes with zero variance.
Fitting L/S model and finding priors

Finding parametric adjustments





Adjusting data



  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

ranking genes
    finished (0:00:09)
ranking genes




    finished (0:00:15)
Standardizing Data across genes.

Found 4 batches

Found 0 numerical variables:
	

Found 275 genes with zero variance.
Fitting L/S model and finding priors

Finding parametric adjustments

Adjusting data



  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/26 [00:00<?, ?it/s]

ranking genes
    finished (0:00:50)
ranking genes
    finished (0:01:17)




Standardizing Data across genes.

Found 4 batches

Found 0 numerical variables:
	

Found 314 genes with zero variance.
Fitting L/S model and finding priors

Finding parametric adjustments

Adjusting data

Standardizing Data across genes.

Found 2 batches

Found 0 numerical variables:
	

Found 314 genes with zero variance.
Fitting L/S model and finding priors

Finding parametric adjustments

Adjusting data



  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/23 [00:00<?, ?it/s]

ranking genes
    finished (0:00:54)
ranking genes
    finished (0:01:29)
Standardizing Data across genes.

Found 4 batches

Found 0 numerical variables:
	

Found 312 genes with zero variance.
Fitting L/S model and finding priors



In [None]:
# Persist both per-tissue DE dictionaries as pickle files.
for pkl_path, de_tables in (
    ('de_dict_broad.pkl', de_dict_broad),
    ('de_dict_granular.pkl', de_dict_granular),
):
    with open(pkl_path, 'wb') as handle:
        pickle.dump(de_tables, handle)

In [None]:
# Reload the per-tissue DE dictionaries written by the dump cell.
# Context managers make sure the file handles are closed after reading.
# NOTE: unpickling can execute arbitrary code -- only load files that were
# produced by this notebook.
with open('de_dict_broad.pkl', 'rb') as f:
    de_dict_broad = pickle.load(f)

with open('de_dict_granular.pkl', 'rb') as f:
    de_dict_granular = pickle.load(f)

In [None]:
# Write one gzipped CSV per annotation level, stacking the tables of all
# tissues and setting the `tissue` column to each table's dictionary key.
for de_tables, csv_path in (
    (de_dict_broad, 'de_results_broad.csv.gz'),
    (de_dict_granular, 'de_results_granular.csv.gz'),
):
    labelled = [table.assign(tissue=name) for name, table in de_tables.items()]
    combined = pd.concat(labelled, axis=0)
    combined.to_csv(csv_path, index=None)