In [1]:
import pandas as pd
import numpy as np

from horta_exp.introns.fetch_data import get_intron_events

from bokeh.io import push_notebook, output_notebook
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.plotting import figure, show, output_file
from bokeh.charts import BoxPlot, Bar, Histogram
from bokeh.layouts import gridplot
from bokeh.layouts import gridplot

# Path of the pickled results table that a later cell reads with pd.read_pickle.
FILE = 'chrom_real_all.pkl'
# Configure bokeh so that show() renders figures inline in the notebook.
output_notebook()

In [2]:
introns = get_intron_events()

# NOTE(review): unpickling can execute arbitrary code -- only point FILE at a
# trusted, locally produced pickle.
# Index the results by (gene, intron) and sort so label lookups are well-defined.
df = (
    pd.read_pickle(FILE)
    .set_index(['gene', 'intron'])
    .sort_index()
)

In [3]:
# Keep only the rows of `introns` whose index labels also occur in df.index,
# then sort them by index. Note that `df` itself is not filtered here.
ii = introns.index.intersection(df.index)
introns = introns.loc[ii,:].sort_index()

# Bonferroni correction

In [4]:
# Number of distinct index labels in df, i.e. distinct (gene, intron) pairs
# given the index set when df was loaded.
ntests = len(df.index.unique())
print("Number of tests: %d" % ntests)

# Bonferroni correction within each (gene, intron) pair
def pval_agg(x):
    """Bonferroni-combine a collection of p-values into a single one.

    The smallest value in ``x`` is multiplied by the number of values in
    ``x`` and the product is clipped to the interval [0, 1].
    """
    n_pvals = len(x)
    smallest = np.min(x)
    adjusted = smallest * n_pvals
    return np.clip(adjusted, 0, 1)

def method_agg(x):
    """Return the first element of ``x`` by position.

    ``x`` may be a pandas object or any plain sequence (list, tuple,
    ndarray).

    Plain ``x[0]`` on a pandas Series is a *label* lookup: it raises
    ``KeyError`` (or silently returns a different row) whenever the index
    does not start with the label 0, which is the normal case for groupby
    chunks indexed by something other than 0..n-1. Positional ``.iloc`` is
    therefore used when it is available.
    """
    positional = getattr(x, 'iloc', None)
    if positional is not None:
        return positional[0]
    return x[0]
    
def gene_intron_wise(df):
    """Collapse ``df`` to one row per pair of its first two index levels.

    Each of the three p-value columns is reduced with ``pval_agg`` within
    every group formed by index levels 0 and 1.
    """
    pval_columns = ('lmm-pval', 'lmm-rank-pval', 'qep-pval')
    reducers = {column: pval_agg for column in pval_columns}
    per_pair = df.groupby(level=[0, 1])
    return per_pair.agg(reducers)

# Replace df by its per-(gene, intron) aggregate: one Bonferroni-adjusted
# p-value per method column and pair.
df = gene_intron_wise(df)

Number of tests: 66489


In [5]:
def method_column(df):
    """Stack the per-method p-value columns of ``df`` into long format.

    For each method name in ('lmm', 'lmm-rank', 'qep') the column
    ``'<name>-pval'`` of ``df`` is read and appended, in that order, as rows
    with two columns: 'pval' (the value) and 'method' (the method name).

    Returns a new DataFrame with three times as many rows as ``df``; the
    input is not modified.
    """
    # The empty seed frame is kept on purpose. The caller in this notebook
    # (set_index) resets the index and reads a column called 'index' holding
    # (gene, intron) tuples -- presumably a consequence of concatenating onto
    # this seed's plain index. Confirm before replacing the seed with a single
    # pd.concat over the three frames.
    stacked = pd.DataFrame(columns=('pval', 'method'))
    for name in ('lmm', 'lmm-rank', 'qep'):
        block = pd.DataFrame({'pval': df['%s-pval' % name], 'method': name})
        stacked = pd.concat([stacked, block])
    return stacked

In [6]:
def set_index(data):
    """Return ``data`` in long (per-method) form, indexed by (gene, intron).

    ``method_column`` produces the long frame. After resetting its index,
    the resulting 'index' column is expected to hold pairs: element 0
    becomes the 'gene' column and element 1, cast to ``int``, the 'intron'
    column. The frame is then indexed by those two columns and sorted.
    """
    long_form = method_column(data).reset_index()

    pairs = long_form['index']
    long_form['gene'] = pairs.apply(lambda pair: pair[0])
    long_form['intron'] = pairs.apply(lambda pair: int(pair[1]))

    long_form = long_form.drop('index', axis=1)
    return long_form.set_index(['gene', 'intron']).sort_index()

In [7]:
# Replace df by its long form: 'pval' and 'method' columns, indexed by
# (gene, intron).
df = set_index(df)

In [8]:
def plot_curve(data, perc=False, exclude=False, lthr=-5, rthr=-0.30103):
    """Build a line plot of hits versus p-value threshold, one line per method.

    Thresholds are the points of ``np.logspace(lthr, rthr)``. At every
    threshold the rows of ``data`` are grouped by their 'method' column, and
    a 'pval' counts as a hit when ``pval * group_size <= threshold``.

    Parameters
    ----------
    data : DataFrame with 'pval' and 'method' columns.
    perc : when true, plot hits as a percentage of the group size instead of
        a raw count.
    exclude : when true, the 'lmm' method is left out of the plot.
    lthr, rthr : base-10 exponents of the smallest and largest threshold.

    Returns
    -------
    The bokeh figure; it is not shown here.
    """
    # Methods in the order their lines are drawn (and hence listed in the legend).
    drawn = ['lmm-rank', 'qep'] if exclude else ['lmm-rank', 'lmm', 'qep']
    line_color = {'lmm-rank': 'green', 'lmm': 'blue', 'qep': 'red'}
    nhits = {name: [] for name in drawn}

    def hits_at(threshold):
        """Return the per-group reducer for one threshold."""
        def reducer(pvals):
            n_hits = sum(pvals * len(pvals) <= threshold)
            if perc:
                return 100 * n_hits / float(len(pvals))
            return n_hits
        return reducer

    pts = np.logspace(lthr, rthr)
    for pt in pts:
        grouped = data.groupby(data['method']).agg({'pval': hits_at(pt)})
        for name in drawn:
            nhits[name].append(grouped.loc[name, 'pval'])

    p = figure(title="Power", tools=['save,reset'])
    p.grid.grid_line_alpha = 0.3
    p.xaxis.axis_label = 'P-value threshold'
    p.yaxis.axis_label = 'Percentage of hits' if perc else 'Number of hits'
    for name in drawn:
        p.line(pts, nhits[name], color=line_color[name], legend=name)
    p.legend.location = "bottom_right"
    return p

In [9]:
def plot_grid(introns, group, df, expression, STAT, q,):
    """Show a 2x2 grid of power curves and histograms for one stratum.

    Parameters
    ----------
    introns : frame with an 'ntri' column.
    group : per-pair statistic; restricted to the stratum with ``.loc``.
    df : frame passed to ``plot_curve`` after restriction to the stratum.
    expression : selector applied to both ``df`` and ``group`` through
        ``.loc`` (the calling cells pass a boolean mask).
    STAT, q : statistic name and quantile, used only in the printed banner.

    Top row: ``plot_curve`` with all methods, then with ``exclude=True``.
    Bottom row: histogram of 'ntri' values, then of the stratum's ``group``
    values. Each histogram uses at most 5000 values drawn by a
    ``RandomState(0)`` ``choice`` call (numpy's default for ``choice`` is
    sampling with replacement).
    """
    df = df.loc[expression]
    group = group.loc[expression]

    banner = '####### %s stratified (%.2f quantile) -- %d gene-introns #######' % (STAT, q, len(group))
    print('                   %s' % banner)

    thresholds = dict(lthr=-5, rthr=-1)
    p1 = plot_curve(df, False, **thresholds)
    p2 = plot_curve(df, False, True, **thresholds)

    def subsample(vals):
        # A freshly seeded generator per call keeps every draw reproducible.
        rng = np.random.RandomState(0)
        return rng.choice(vals, min(5000, len(vals)))

    hist_opts = dict(tools=['save'], xlabel='number of trials',
                     width=400, height=400)

    # NOTE(review): this histogram reads all of `introns`, not only the rows
    # selected by `expression`, so it is the same for every stratum -- confirm
    # that this is intended.
    h0 = Histogram(subsample(introns['ntri'].values),
                   title="Number of trials - across samples", **hist_opts)
    h1 = Histogram(subsample(group.values),
                   title="Number of trials - across gene-intron pairs", **hist_opts)

    show(gridplot([[p1, p2], [h0, h1]], plot_width=400, plot_height=400))
    for _ in range(3):
        print()

In [10]:
# Summary statistics of `ntri` for every pair of the first two index levels of
# `introns` (gene, intron).
#
# A list of function names is used instead of a nested {'ntri': {name: func}}
# dict: nested renaming was deprecated in pandas 0.20 and raises
# SpecificationError since pandas 1.0. The resulting columns are still a
# two-level index ('ntri', <stat>), so group['ntri'][STAT] works as before.
group = (
    introns
    .groupby(level=[0, 1])
    .agg({'ntri': ['max', 'min', 'median', 'var', 'mean']})
    .sort_index()
)

# Stratified by median

In [11]:
STAT = 'median'
# Per-pair statistic used both to define the strata and for the histogram.
stat_values = group['ntri'][STAT]
for v in (0.0, 0.25, 0.5, 0.75, 0.9):
    # Stratum: pairs whose statistic lies strictly above the v-quantile.
    expression = stat_values > stat_values.quantile(v)
    plot_grid(introns, stat_values, df, expression, STAT, v)

                   ####### median stratified (0.00 quantile) -- 66484 gene-introns #######





                   ####### median stratified (0.25 quantile) -- 49594 gene-introns #######





                   ####### median stratified (0.50 quantile) -- 33029 gene-introns #######





                   ####### median stratified (0.75 quantile) -- 16593 gene-introns #######





                   ####### median stratified (0.90 quantile) -- 6570 gene-introns #######







# Stratified by variance

In [12]:
STAT = 'var'
# Per-pair statistic used both to define the strata and for the histogram.
stat_values = group['ntri'][STAT]
for v in (0.0, 0.25, 0.5, 0.75, 0.9):
    # Stratum: pairs whose statistic lies strictly above the v-quantile.
    expression = stat_values > stat_values.quantile(v)
    plot_grid(introns, stat_values, df, expression, STAT, v)

                   ####### var stratified (0.00 quantile) -- 66488 gene-introns #######





                   ####### var stratified (0.25 quantile) -- 49866 gene-introns #######





                   ####### var stratified (0.50 quantile) -- 33244 gene-introns #######





                   ####### var stratified (0.75 quantile) -- 16622 gene-introns #######





                   ####### var stratified (0.90 quantile) -- 6649 gene-introns #######







# Stratified by mean

In [13]:
STAT = 'mean'
# Per-pair statistic used both to define the strata and for the histogram.
stat_values = group['ntri'][STAT]
for v in (0.0, 0.25, 0.5, 0.75, 0.9):
    # Stratum: pairs whose statistic lies strictly above the v-quantile.
    expression = stat_values > stat_values.quantile(v)
    plot_grid(introns, stat_values, df, expression, STAT, v)

                   ####### mean stratified (0.00 quantile) -- 66488 gene-introns #######





                   ####### mean stratified (0.25 quantile) -- 49865 gene-introns #######





                   ####### mean stratified (0.50 quantile) -- 33244 gene-introns #######





                   ####### mean stratified (0.75 quantile) -- 16622 gene-introns #######





                   ####### mean stratified (0.90 quantile) -- 6649 gene-introns #######





