In [1]:
import os

import numpy as np
import pandas as pd
from bokeh.charts import BoxPlot, Bar, Histogram
from bokeh.io import push_notebook, output_notebook
from bokeh.layouts import gridplot
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.plotting import figure, show, output_file
from limix_genetics import hitsplot

from horta_exp.introns.fetch_data import get_intron_events

# Input pickle file name for each data set, keyed by 'real' / 'null'.
FILE = dict(real='chrom_real_all.pkl', null='chrom_null_all.pkl')
# Render bokeh figures inline in the notebook.
output_notebook()

In [2]:
def convert_and_save(filename):
    """Index the pickled results in `filename` and cache them next to the
    matching intron events.

    The frame read from `filename` is indexed by ('gene', 'intron') and
    sorted. The intron events returned by `get_intron_events()` are then
    restricted to the distinct index entries of that frame. Both frames are
    written to 'introns_<filename>.pkl' and 'df_<filename>.pkl'.

    Returns
    -------
    tuple
        ``(introns, df)``: the selected intron events and the indexed frame.
    """
    events = get_intron_events()

    results = pd.read_pickle(filename)
    results = results.set_index(['gene', 'intron']).sort_index()

    # Keep only the events whose index entry appears in the results.
    wanted = results.index.drop_duplicates()
    selected = events.loc[wanted]

    selected.to_pickle('introns_%s.pkl' % filename)
    results.to_pickle('df_%s.pkl' % filename)

    return (selected, results)

In [3]:
# Load the cached frames written by convert_and_save() when both cache files
# exist for a data set; otherwise rebuild them from the raw pickle (which also
# writes the cache). This replaces a hard-coded `if True:` toggle that made the
# rebuild path unreachable and crashed on a checkout without cache files.
introns = dict()
df = dict()

for n in ['real', 'null']:
    introns_cache = 'introns_%s.pkl' % FILE[n]
    df_cache = 'df_%s.pkl' % FILE[n]
    if os.path.exists(introns_cache) and os.path.exists(df_cache):
        introns[n] = pd.read_pickle(introns_cache)
        df[n] = pd.read_pickle(df_cache)
    else:
        (introns[n], df[n]) = convert_and_save(FILE[n])

In [4]:
# Count the distinct index entries (gene-intron pairs) of the real data set.
npairs = df['real'].index.unique().shape[0]
print("Number of gene-intron pairs: %d" % npairs)

Number of gene-intron pairs: 66507


# Bonferroni correction within each gene-intron pair, then aggregation of the p-values

In [5]:
def bonferroni_and_aggregate(df):
    """Collapse `df` to one row per group of its first two index levels.

    For each of the columns 'lmm-pval', 'lmm-rank-pval' and 'qep-pval', the
    group's smallest p-value is multiplied by the number of rows in the group
    (Bonferroni correction) and clipped to the interval [0, 1].

    Returns a frame indexed by the grouping levels with those three columns.
    """
    def corrected_min(pvals):
        # Scale the best p-value by the number of tests in the group.
        scaled = np.min(pvals) * len(pvals)
        return np.clip(scaled, 0, 1)

    pval_columns = ('lmm-pval', 'lmm-rank-pval', 'qep-pval')
    per_pair = df.groupby(level=[0, 1])
    return per_pair.agg({column: corrected_min for column in pval_columns})

# Replace each data set with its Bonferroni-corrected aggregate.
for n in ('real', 'null'):
    aggregated = bonferroni_and_aggregate(df[n])
    df[n] = aggregated

In [6]:
from IPython.display import display
# Show the first rows of the real and null frames, in that order.
display(df['real'].head(), df['null'].head())

Unnamed: 0_level_0,Unnamed: 1_level_0,lmm-rank-pval,qep-pval,lmm-pval
gene,intron,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000000419,5,1.0,1.0,1.0
ENSG00000000419,6,0.405983,1.0,1.0
ENSG00000000419,7,1.0,1.0,0.820634
ENSG00000000419,8,1.0,1.0,1.0
ENSG00000000457,1,0.065261,0.057588,0.007733


Unnamed: 0_level_0,Unnamed: 1_level_0,lmm-rank-pval,qep-pval,lmm-pval
gene,intron,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000000419,5,1.0,1.0,1.0
ENSG00000000419,6,1.0,1.0,1.0
ENSG00000000419,7,1.0,1.0,1.0
ENSG00000000419,8,0.408914,0.38103,0.278223
ENSG00000000457,1,1.0,1.0,0.039725


In [7]:
def method_column(df):
    """Reshape the per-method p-value columns of `df` into long form.

    Reads the columns 'lmm-pval', 'lmm-rank-pval' and 'qep-pval' and stacks
    them into a frame with two columns: 'pval' (the p-value) and 'method'
    ('lmm', 'lmm-rank' or 'qep'). The rows of `df` are repeated once per
    method, in that order.

    The returned frame carries an unnamed, flat index holding the index
    entries of `df` (plain tuples when `df` has a MultiIndex). set_index()
    depends on this: it unpacks those tuples from the 'index' column that
    reset_index() produces.
    """
    methods = ('lmm', 'lmm-rank', 'qep')

    # Building the index from the raw values array keeps tuples as plain
    # objects instead of re-creating a named MultiIndex.
    flat_index = pd.Index(df.index.values)

    frames = [
        pd.DataFrame({'pval': df['%s-pval' % name].values, 'method': name},
                     index=flat_index, columns=['pval', 'method'])
        for name in methods
    ]
    # One concat over all methods; no empty seed frame, so 'pval' keeps its
    # numeric dtype.
    return pd.concat(frames)

def set_index(data):
    """Return the long-form frame for `data`, indexed by ('gene', 'intron').

    `data` is first reshaped with method_column(). The resulting index
    entries are unpacked: element 0 becomes 'gene' and element 1, cast to
    int, becomes 'intron'. The frame is then indexed by those two columns
    and sorted by index.
    """
    long_form = method_column(data).reset_index()

    # reset_index() puts the old index entries in an 'index' column.
    pairs = long_form.pop('index')
    long_form['gene'] = [pair[0] for pair in pairs]
    long_form['intron'] = [int(pair[1]) for pair in pairs]

    return long_form.set_index(['gene', 'intron']).sort_index()

# Long-form p-value tables per data set, with columns renamed to
# 'label' (method) and 'p-value'.
DF = {
    n: set_index(df[n]).rename(columns={'method': 'label', 'pval': 'p-value'})
    for n in ['real', 'null']
}

In [8]:
# Show the first rows of the long-form real and null tables, in that order.
display(DF['real'].head(), DF['null'].head())

Unnamed: 0_level_0,Unnamed: 1_level_0,label,p-value
gene,intron,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000000419,5,lmm,1.0
ENSG00000000419,5,lmm-rank,1.0
ENSG00000000419,5,qep,1.0
ENSG00000000419,6,lmm,1.0
ENSG00000000419,6,lmm-rank,0.405983


Unnamed: 0_level_0,Unnamed: 1_level_0,label,p-value
gene,intron,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000000419,5,lmm,1.0
ENSG00000000419,5,lmm-rank,1.0
ENSG00000000419,5,qep,1.0
ENSG00000000419,6,lmm,1.0
ENSG00000000419,6,lmm-rank,1.0


In [9]:
# Summary statistics of 'ntri' per group of the first two index levels
# (gene, intron). Selecting the column and passing a list of functions yields
# the same two-level ('ntri', <stat>) columns as the former nested-dict
# specification, which pandas deprecated in 0.20 and rejects from 1.0 on.
group = (
    introns['real']
    .groupby(level=[0, 1])[['ntri']]
    .agg(['max', 'min', 'median', 'var', 'mean'])
    .sort_index()
)

In [10]:
group.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,ntri,ntri,ntri,ntri,ntri
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,max,median,var,min
gene,intron,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
ENSG00000000419,5,355.765324,951,347,19941.57641,29
ENSG00000000419,6,221.404553,675,218,8380.139558,8
ENSG00000000419,7,294.374781,863,291,14244.062801,28
ENSG00000000419,8,447.775832,1243,449,33432.223345,104
ENSG00000000457,1,443.460595,1309,432,33727.862918,73


In [15]:
def _subsample(vals, max_size=5000, seed=0):
    """Return at most `max_size` entries of `vals`, drawn without replacement.

    Inputs no longer than `max_size` are returned unchanged, so small inputs
    are plotted in full. The draw uses a fixed seed, so it is reproducible.
    """
    if len(vals) <= max_size:
        return vals
    return np.random.RandomState(seed).choice(vals, max_size, replace=False)


def _corrected_hitsplot(pvals, pair_index, colors, size):
    """Build (without showing) a hits plot for the rows of `pvals` selected
    by `pair_index`, after scaling 'p-value' by the global `npairs` and
    clipping it to [0, 1].
    """
    selected = pvals.loc[pair_index].copy()
    # NOTE(review): the correction uses the total pair count `npairs`, not
    # the size of the plotted subset - confirm this is intended.
    selected['p-value'] = np.clip(selected['p-value'] * npairs, 0, 1)
    return hitsplot(selected, colors=colors, perc=True,
                    width=size, height=size,
                    min_threshold=1e-5, max_threshold=1e-2,
                    show=False)


def plot_grid(DF, ntri, introns, STAT):
    """Show a 2x2 grid: hits plots on top, trial-count histograms below.

    Parameters
    ----------
    DF : dict
        Maps 'real' and 'null' to frames with a 'p-value' column; each frame
        is restricted to `ntri.index` before plotting.
    ntri : DataFrame
        Per-pair statistics; its index selects the pairs to plot and its
        column `STAT` feeds the second histogram.
    introns : DataFrame
        Frame with an 'ntri' column, restricted to `ntri.index` for the first
        histogram.
    STAT : str
        Name of the `ntri` column to plot; it is also used in the histogram
        title.

    Reads the module-level `npairs` defined in an earlier cell. Each
    histogram uses at most 5000 values, drawn without replacement.
    """
    size = 400
    colors = {'lmm': 'blue', 'qep': 'red', 'lmm-rank': 'green'}

    f0 = _corrected_hitsplot(DF['real'], ntri.index, colors, size)
    f1 = _corrected_hitsplot(DF['null'], ntri.index, colors, size)

    ntris = _subsample(introns.loc[ntri.index, 'ntri'].values)
    h0 = Histogram(ntris, title="Number of trials - across samples",
                   tools=['save'], xlabel='number of trials',
                   width=size, height=size)

    values = _subsample(ntri[STAT].values)
    h1 = Histogram(values,
                   title="%s number of trials - across gene-intron pairs" % STAT,
                   tools=['save'], xlabel='number of trials',
                   width=size, height=size)

    show(gridplot([[f0, f1], [h0, h1]]))

In [17]:
# Keep pairs whose minimum 'ntri' exceeds 30, then those whose 'ntri' variance
# is below the 10% quantile of that remaining subset.
ntri = group['ntri'].loc[lambda stats: stats['min'] > 30]
ntri = ntri.loc[lambda stats: stats['var'] < stats['var'].quantile(0.10)]
plot_grid(DF, ntri, introns['real'], 'min')

In [None]:
# Keep pairs whose minimum 'ntri' exceeds 30, then those whose 'ntri' variance
# is below the 20% quantile of that remaining subset.
ntri = group['ntri'].loc[lambda stats: stats['min'] > 30]
ntri = ntri.loc[lambda stats: stats['var'] < stats['var'].quantile(0.20)]
plot_grid(DF, ntri, introns['real'], 'min')

In [None]:
# Keep pairs whose minimum 'ntri' exceeds 30, then those whose 'ntri' variance
# is below the 30% quantile of that remaining subset.
ntri = group['ntri'].loc[lambda stats: stats['min'] > 30]
ntri = ntri.loc[lambda stats: stats['var'] < stats['var'].quantile(0.30)]
plot_grid(DF, ntri, introns['real'], 'min')

In [None]:
# Keep pairs whose minimum 'ntri' exceeds 30, then those whose 'ntri' variance
# is below the 40% quantile of that remaining subset.
ntri = group['ntri'].loc[lambda stats: stats['min'] > 30]
ntri = ntri.loc[lambda stats: stats['var'] < stats['var'].quantile(0.40)]
plot_grid(DF, ntri, introns['real'], 'min')