__Author:__ Bram Van de Sande
   
__Date:__ 1 FEB 2018

__Outline:__

In [9]:
import pandas as pd
import numpy as np
import pickle
import os

In [10]:
RESOURCES_FOLDER="/Users/bramvandesande/Projects/lcb/resources"
DATA_FOLDER="/Users/bramvandesande/Projects/lcb/tmp"

Load and rank expression profiles from single-cell experiment.

In [13]:
ex_mtx = pd.read_csv(os.path.join(RESOURCES_FOLDER, 'GSE60361_C1-3005-Expression.txt'), sep='\t', header=0, index_col=0)

In [48]:
rnk_mtx = ex_mtx.rank(axis=0, ascending=False, method='first').astype('int64')

Load regulomes discovered in previous phase.

In [50]:
with open(os.path.join(DATA_FOLDER, 'regulomes.pickle'), 'rb') as f:
    regulomes = pickle.load(f)

In [51]:
regulomes

[Regulome(name='Regulome for Regulome for Alx1 (target weight >= 0.001)', nomenclature='MGI', gene2weights=<frozendict {'Tpm1': 1.0, 'Rftn2': 1.0, 'Phlda2': 1.0, 'Ptgds': 1.0, 'Myocd': 1.0, 'Bahcc1': 1.0, 'Emilin1': 1.0, 'Icam1': 1.0, 'Obsl1': 1.0, 'Dnmt3b': 1.0, 'Vim': 1.0, 'Fnip2': 1.0, 'Bnc2': 1.0, 'Errfi1': 1.0, 'Serping1': 1.0, 'Sdk2': 1.0, 'Zic1': 1.0, 'Tpm2': 1.0, 'Fosb': 1.0, 'Prelp': 1.0, 'Gadd45b': 1.0, 'Actn1': 1.0, 'Pcdh18': 1.0, 'Gja4': 1.0, 'Myl9': 1.0, 'Frmd4b': 1.0, 'H3f3a': 1.0, 'Nfkbiz': 1.0, 'Rbpms2': 1.0, 'Tbx2': 1.0, 'Tbx18': 1.0, 'Nrarp': 1.0, 'Id3': 1.0, 'Col12a1': 1.0, 'Des': 1.0, 'Ccnd2': 1.0, 'Irf1': 1.0, 'Sema4b': 1.0, 'Gjb2': 1.0, 'Epb4.1': 1.0, 'Prrx1': 1.0, 'Nfatc4': 1.0, 'Rras': 1.0, 'Tmem200b': 1.0, 'Zic2': 1.0, 'Samd4': 1.0, 'Klf4': 1.0}>, transcription_factor='Alx1', context=('mm9-500bp-upstream-7species', 'target weight >= 0.00'), score=0.26503234510199464)]

Calculate enrichment as AUC (NES is not valid because AUC are not normally distributed) of regulomes in cells.

In [90]:
def enrichment(rnk_mtx, regulome, rank_threshold: int = 5000, auc_threshold: float = 0.05) -> pd.DataFrame:
    # Best to only select subset of regulome based on same database and regulome definition.
    
    # Load rank of genes from database.
    total_genes = len(rnk_mtx.index)
    rank_cutoff = int(round(auc_threshold * total_genes))
    df = rnk_mtx[rnk_mtx.index.isin(regulome.genes)]
    features, genes, rankings = df.columns.values, df.index.values, df.T.values
    weights = np.asarray([regulome[gene] for gene in genes] + [0.0])

    # Calculate recovery curves.
    def calc_rcc(ranking, weights, total_genes, rank_threshold):
        curranking = np.append(ranking, total_genes)
        return np.cumsum(np.bincount(curranking, weights=weights)[:rank_threshold])
    # Apply along axis does not improve performance, only more readable code.
    rccs = np.apply_along_axis(calc_rcc, 1, rankings, weights, total_genes, rank_threshold)

    # Calculate AUC and NES.
    maxauc = float(rank_cutoff * total_genes)
    aucs = rccs[:, :rank_cutoff].sum(axis=1) / maxauc

    return pd.DataFrame(index=pd.Index(features, name="Cell"), data={
        "AUC": aucs,
        "Regulome": regulome.transcription_factor})

In [91]:
df = enrichment(rnk_mtx, regulomes[0])

In [92]:
df.set_index(['Regulome'], append=True).unstack("Regulome")

Unnamed: 0_level_0,AUC
Regulome,Alx1
Cell,Unnamed: 1_level_2
1772058148_A01,0.000178
1772058148_A03,0.000099
1772058148_A04,0.000051
1772058148_A05,0.000054
1772058148_A06,0.000066
1772058148_A07,0.000118
1772058148_A09,0.000110
1772058148_A10,0.000276
1772058148_A11,0.000189
1772058148_A12,0.000085


Concatenate all enrichment and unstack!