In [2]:
import os
import pickle as pkl

import numpy as np
import pandas as pd
from bokeh.charts import Histogram
from bokeh.io import output_notebook
from bokeh.layouts import gridplot
from bokeh.models import Range1d
from bokeh.plotting import show
from IPython.display import display
from limix_genetics import hitsplot
from limix_genetics import qqplot

from horta_exp.introns.fetch_data import get_intron_events

# Maps each result set ('real' / 'null') to the pickle file that stores it.
FILE = {'real': 'chrom_real_all.pkl', 'null': 'chrom_null_all.pkl'}
# Configure Bokeh to render show() output inline in the notebook.
output_notebook()

In [3]:
# Reload `group`, `DF`, `DF_agg` and `introns` from pickle files in the working
# directory. Presumably these are snapshots of the objects that the cells
# further down build under the same names -- TODO confirm.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
group = pd.read_pickle('horta_group.pkl')


with open("horta_DF.pkl","rb") as f:
    DF = pkl.load(f)

with open("horta_DF_agg.pkl","rb") as f:
    DF_agg = pkl.load(f)

with open("horta_introns.pkl","rb") as f:
    introns = pkl.load(f)

# Number of distinct index entries in the aggregated 'real' results.
npairs = len(DF_agg['real'].index.unique())
print(npairs)

# Split `group` on the mean and variance of 'ntri':
#   A: mean < 100;  B: mean >= 100 and var < 10000;  C: mean >= 100 and var >= 10000.
# The same three assignments are repeated in a later cell of this notebook.
groupA = group[group['ntri']['mean'] < 100]
groupB = group[(group['ntri']['mean'] >= 100) & (group['ntri']['var'] < 10000)]
groupC = group[(group['ntri']['mean'] >= 100) & (group['ntri']['var'] >= 10000)]

66508


# Introduction


There are two files storing the results: `chrom_real_all.pkl` and `chrom_null_all.pkl`.

The first one refers to the results on the real phenotypes (i.e., gene-intron pairs associated
with the correct genetic locus), whereas the second one refers to results from gene-intron pairs
randomly associated with a genetic locus from a different chromosome.

The intron events are retrieved from the Blosc-compressed Python pickle file [intron_events_filter4.pkl.blp].

We read those three files and create the `df_FILENAME.pkl` and `introns_FILENAME.pkl` files purely for convenience.

[intron_events_filter4.pkl.blp]: https://github.com/glimix/alternative-splicing/blob/master/quant_splicing/transcript-qtls/intron_events_filter4.pkl.blp

In [4]:
def convert_and_save(filename):
    """Index the results in ``filename`` by (gene, intron) and cache them.

    The results pickle is re-indexed by its 'gene' and 'intron' columns and
    sorted; the intron events returned by ``get_intron_events()`` are then
    restricted to the distinct (gene, intron) pairs present in the results.
    Both objects are written next to the notebook as
    ``introns_<filename>.pkl`` and ``df_<filename>.pkl``.

    Parameters
    ----------
    filename : str
        Path of the pickled results table.

    Returns
    -------
    tuple
        ``(introns, df)``: the selected intron events and the indexed results.
    """
    all_events = get_intron_events()

    results = pd.read_pickle(filename)
    results = results.set_index(['gene', 'intron'])
    results = results.sort_index()

    # One entry per (gene, intron) pair, in the sorted order of the results.
    pair_index = results.index.copy().drop_duplicates()
    events = all_events.loc[pair_index]

    events.to_pickle('introns_%s.pkl' % filename)
    results.to_pickle('df_%s.pkl' % filename)

    return (events, results)

def load_or_convert(filename):
    """Return ``(introns, df)`` for ``filename``, reusing cached pickles if present.

    The cache files are the ones written by ``convert_and_save``:
    ``introns_<filename>.pkl`` and ``df_<filename>.pkl``. When either is
    missing, the conversion is run (which also writes the cache) instead of
    failing on a file that does not exist yet.
    """
    introns_cache = 'introns_%s.pkl' % filename
    df_cache = 'df_%s.pkl' % filename
    if os.path.exists(introns_cache) and os.path.exists(df_cache):
        return (pd.read_pickle(introns_cache), pd.read_pickle(df_cache))
    return convert_and_save(filename)


# Intron events and association results, keyed by 'real' / 'null'.
introns = dict()
df = dict()

for n in ['real', 'null']:
    (introns[n], df[n]) = load_or_convert(FILE[n])

We now have dictionaries **introns** and **df** regarding gene-intron events and results, respectively:

In [3]:
# Preview the first rows of the intron events and of the results, per result set.
display(introns['real'].head(), df['real'].head())
display(introns['null'].head(), df['null'].head())

Unnamed: 0_level_0,Unnamed: 1_level_0,assay,nsuc,ntri
gene,intron,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000000419,5,HG00096.1.M_111124_6,37,347
ENSG00000000419,5,HG00097.7.M_120219_2,68,451
ENSG00000000419,5,HG00099.1.M_120209_6,31,327
ENSG00000000419,5,HG00099.5.M_120131_3,34,289
ENSG00000000419,5,HG00100.2.M_111215_8,35,281


Unnamed: 0_level_0,Unnamed: 1_level_0,chrom,lmm-pval,lmm-rank-pval,pos,qep-pval,snp_id
gene,intron,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ENSG00000000419,5,20,0.757149,0.61579,49508683,0.599888,snp_20_49508683
ENSG00000000419,5,20,0.149643,0.489927,49509175,0.523147,snp_20_49509175
ENSG00000000419,5,20,0.466112,0.30247,49509184,0.240761,snp_20_49509184
ENSG00000000419,5,20,0.364057,0.273795,49511295,0.235351,snp_20_49511295
ENSG00000000419,5,20,0.466112,0.30247,49511352,0.240761,snp_20_49511352


Unnamed: 0_level_0,Unnamed: 1_level_0,assay,nsuc,ntri
gene,intron,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000000419,5,HG00096.1.M_111124_6,37,347
ENSG00000000419,5,HG00097.7.M_120219_2,68,451
ENSG00000000419,5,HG00099.1.M_120209_6,31,327
ENSG00000000419,5,HG00099.5.M_120131_3,34,289
ENSG00000000419,5,HG00100.2.M_111215_8,35,281


Unnamed: 0_level_0,Unnamed: 1_level_0,chrom,lmm-pval,lmm-rank-pval,pos,qep-pval,snp_id
gene,intron,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ENSG00000000419,5,20,0.417235,0.406138,42131442,0.386107,snp_19_42131442
ENSG00000000419,5,20,0.013192,0.014254,42131753,0.009565,indel:1I_19_42131753
ENSG00000000419,5,20,0.96978,0.590113,42132193,0.654554,snp_19_42132193
ENSG00000000419,5,20,0.89123,0.94535,42132273,0.963257,snp_19_42132273
ENSG00000000419,5,20,0.971855,0.857315,42132314,0.891363,snp_19_42132314


The number of evaluated gene-intron pairs is given by **npairs**:

In [5]:
# Count the distinct (gene, intron) index entries of the 'real' results.
# `npairs` is read as a global by plot_grid() further down.
npairs = len(df['real'].index.unique())
print("Number of gene-intron pairs: %d" % npairs)

Number of gene-intron pairs: 66508


We can also query the number of evaluated gene-intron pairs across
chromosomes:

In [5]:
# Keep the first row of every (gene, intron) pair, then count pairs per 'chrom'.
display(df['real'].groupby(level=[0, 1]).first()
        .groupby('chrom')['chrom'].count())

chrom
1     6760
2     4938
3     4057
4     2638
5     3082
6     3584
7     2948
8     2257
9     3046
10    2716
11    3810
12    3986
13    1486
14    2353
15    2655
16    3197
17    4296
18    1039
19    3554
20    1723
21     710
22    1673
Name: chrom, dtype: int64

We would also like to have a single p-value associated with each
gene-intron pair. In the following section we obtain it by taking the
minimum p-value of the pair and Bonferroni-correcting it.

# Bonferroni correction within gene-introns and aggregate the p-values

In [6]:
def bonferroni_and_aggregate(df):
    """Collapse each (gene, intron) group to one corrected p-value per method.

    For every group formed by the first two index levels, and for each of the
    columns 'lmm-pval', 'lmm-rank-pval' and 'qep-pval', the smallest p-value
    is multiplied by the number of rows in the group and clipped to [0, 1].

    Parameters
    ----------
    df : pandas.DataFrame
        Results indexed by (gene, intron) with the three p-value columns.

    Returns
    -------
    pandas.DataFrame
        One row per (gene, intron) pair holding the corrected p-values.
    """
    def corrected_min(pvals):
        # Bonferroni: minimum p-value times the number of tests in the group.
        n_tests = len(pvals)
        return np.clip(n_tests * np.min(pvals), 0, 1)

    pval_columns = ('lmm-pval', 'lmm-rank-pval', 'qep-pval')
    per_pair = df.groupby(level=[0, 1])
    return per_pair.agg({column: corrected_min for column in pval_columns})

# Aggregate each result set to one corrected p-value per (gene, intron) pair
# and preview the first rows. A copy of each frame is passed in.
df_agg = dict()
for n in ['real', 'null']:
    df_agg[n] = bonferroni_and_aggregate(df[n].copy())
    display(df_agg[n].head())

Unnamed: 0_level_0,Unnamed: 1_level_0,lmm-pval,qep-pval,lmm-rank-pval
gene,intron,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000000419,5,1.0,1.0,1.0
ENSG00000000419,6,1.0,1.0,0.405983
ENSG00000000419,7,0.820634,1.0,1.0
ENSG00000000419,8,1.0,1.0,1.0
ENSG00000000457,1,0.007733,0.057588,0.065261


Unnamed: 0_level_0,Unnamed: 1_level_0,lmm-pval,qep-pval,lmm-rank-pval
gene,intron,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000000419,5,1.0,1.0,1.0
ENSG00000000419,6,1.0,1.0,1.0
ENSG00000000419,7,1.0,1.0,1.0
ENSG00000000419,8,0.278223,0.38103,0.408914
ENSG00000000457,1,0.039725,1.0,1.0


In order to plot with the `limix_genetics` package, we need to reshape those dataframes into the format it expects:

In [7]:
def transpose_df(df):
    """Stack the per-method p-value columns into long ('label', 'p-value') form.

    For each of the 'real' and 'null' entries of ``df``, the columns
    'lmm-pval', 'lmm-rank-pval' and 'qep-pval' are converted to float and
    concatenated row-wise into a frame with a 'p-value' column and a 'label'
    column naming the method. The result keeps the original index and is
    sorted by it. The head of each result is displayed as a side effect.

    Parameters
    ----------
    df : dict
        Mapping with keys 'real' and 'null' to frames holding the p-value columns.

    Returns
    -------
    dict
        Mapping with the same two keys to the stacked frames.
    """
    methods = ['lmm', 'lmm-rank', 'qep']
    DF = dict()
    for n in ['real', 'null']:
        source = df[n]
        frames = []
        for method in methods:
            column = source['%s-pval' % method]
            frames.append(pd.DataFrame({'p-value': column.astype(float),
                                        'label': method},
                                       index=column.index))
        DF[n] = pd.concat(frames).sort_index()
        display(DF[n].head())
    return DF

In [8]:
# Long-format versions of the aggregated and of the full per-SNP results.
# NOTE(review): the saved output shows a MemoryError after the 'real' head of
# the second call was displayed, i.e. while processing the full results.
DF_agg = transpose_df(df_agg)
DF = transpose_df(df)

Unnamed: 0_level_0,Unnamed: 1_level_0,label,p-value
gene,intron,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000000419,5,lmm,1.0
ENSG00000000419,5,lmm-rank,1.0
ENSG00000000419,5,qep,1.0
ENSG00000000419,6,lmm,1.0
ENSG00000000419,6,lmm-rank,0.405983


Unnamed: 0_level_0,Unnamed: 1_level_0,label,p-value
gene,intron,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000000419,5,lmm,1.0
ENSG00000000419,5,lmm-rank,1.0
ENSG00000000419,5,qep,1.0
ENSG00000000419,6,lmm,1.0
ENSG00000000419,6,lmm-rank,1.0


Unnamed: 0_level_0,Unnamed: 1_level_0,label,p-value
gene,intron,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000000419,5,lmm,0.757149
ENSG00000000419,5,lmm,0.149643
ENSG00000000419,5,lmm,0.466112
ENSG00000000419,5,lmm,0.364057
ENSG00000000419,5,lmm,0.466112


MemoryError: 

We will subselect results based on number of trials statistics to better understand
what is going on.

In [None]:
# Summary statistics of the number of trials ('ntri') per (gene, intron) pair.
# A list of aggregation names is used instead of a nested renaming dict: the
# nested form was deprecated in pandas 0.20 and raises SpecificationError from
# pandas 1.0 on. The result still has two-level columns, so
# group['ntri']['mean'] and the like keep working.
group = introns['real'].groupby(level=[0, 1]).agg({
    'ntri': ['max', 'min', 'median', 'var', 'mean']
})
group = group.sort_index()
group.head()

In [None]:
# Histogram of the per-pair minimum of 'ntri'; x-axis limited to [0, 120].
h = Histogram(group['ntri']['min'].values,
               title="Min. number of trials - across gene-intron pairs",
               xlabel='number of trials')
h.x_range = Range1d(0, 120)
show(h)

In [None]:
# Histogram of the per-pair maximum of 'ntri'; x-axis limited to [0, 3000].
h = Histogram(group['ntri']['max'].values,
               title="Max. number of trials - across gene-intron pairs",
               xlabel='number of trials')
h.x_range = Range1d(0, 3000)
show(h)

In [None]:
# Histogram of the per-pair mean of 'ntri' (default x-range).
show(Histogram(group['ntri']['mean'].values,
               title="Mean number of trials - across gene-intron pairs",
               xlabel='number of trials mean'))

In [None]:
# Histogram of the per-pair median of 'ntri' (default x-range).
show(Histogram(group['ntri']['median'].values,
               title="Median number of trials - across gene-intron pairs",
               xlabel='number of trials median'))

In [None]:
# Histogram of the per-pair variance of 'ntri'; x-axis limited to [0, 60000].
h = Histogram(group['ntri']['var'].values,
               title="Var number of trials - across gene-intron pairs",
               xlabel='number of trials variance')
h.x_range = Range1d(0, 60000)
show(h)

In [4]:
# Split the gene-intron pairs on the mean and variance of 'ntri':
#   groupA: mean < 100
#   groupB: mean >= 100 and variance < 10000
#   groupC: mean >= 100 and variance >= 10000
groupA = group[group['ntri']['mean'] < 100]
groupB = group[(group['ntri']['mean'] >= 100) & (group['ntri']['var'] < 10000)]
groupC = group[(group['ntri']['mean'] >= 100) & (group['ntri']['var'] >= 10000)]

In [None]:
# Scatter plot of variance (x) against mean (y) of 'ntri', one colour per group.
# The figure is only built here; it is rendered by show(f) in a later cell.
from bokeh.plotting import figure
f = figure()

f.circle(groupA['ntri']['var'], groupA['ntri']['mean'], color='red')
f.circle(groupB['ntri']['var'], groupB['ntri']['mean'], color='green')
f.circle(groupC['ntri']['var'], groupC['ntri']['mean'], color='blue')

# Cut the x-axis at the 99.9% quantile of the variance.
f.x_range = Range1d(0, group['ntri']['var'].quantile(0.999))
f.xaxis.axis_label = 'Variance( number of trials )'
f.yaxis.axis_label = 'Mean( number of trials )'

# Gene-intron variance versus mean number of trials

- red group  : small mean                    (groupA)
- green group: high mean and small variance  (groupB)
- blue group : high mean and high variance   (groupC)

In [None]:
# Render the variance-versus-mean scatter figure `f`.
show(f)

In [4]:
def _subsample(vals, max_points=8000):
    """Return at most ``max_points`` entries of ``vals`` for plotting.

    Arrays that already fit are returned unchanged, so the histogram shows the
    actual values rather than a resample of them. Larger arrays are sampled
    with a fixed seed (0), with replacement, as before.
    """
    if len(vals) <= max_points:
        return vals
    return np.random.RandomState(0).choice(vals, max_points)


def _pvalue_panels(DF, DF_agg, kind, index, n_pairs, colors, size):
    """Build the (hits plot, QQ plot) pair for one result set ('real' or 'null')."""
    df_agg = DF_agg[kind].loc[index].copy()
    # Scale the aggregated p-values by the number of pairs and clip to [0, 1].
    df_agg['p-value'] = np.clip(df_agg['p-value'] * n_pairs, 0, 1)

    df = DF[kind].loc[index].copy()

    hits = hitsplot(df_agg, colors=colors, perc=True,
                    width=size, height=size,
                    min_threshold=1e-5, max_threshold=1e-2, show=False)
    qq = qqplot(df, colors=colors, atleast_points=0.005, show=False,
                width=size, height=size)
    return hits, qq


def plot_grid(DF, DF_agg, ntri, introns, STAT, n_pairs=None):
    """Show a 3x2 grid of diagnostic plots for a subset of gene-intron pairs.

    Rows: hits plots, QQ plots, histograms of the number of trials.
    Columns of the first two rows: 'real' results (left), 'null' results (right).

    Parameters
    ----------
    DF : dict
        Long-format p-values, keyed by 'real' / 'null'.
    DF_agg : dict
        Long-format aggregated p-values, keyed by 'real' / 'null'.
    ntri : pandas.DataFrame
        Per-pair statistics of the number of trials; its index selects the
        pairs to plot and ``ntri[STAT]`` feeds the second histogram.
    introns : pandas.DataFrame
        Intron events with an 'ntri' column, indexed like ``ntri``.
    STAT : str
        Name of the column of ``ntri`` to plot (e.g. 'min').
    n_pairs : int, optional
        Factor used to scale the aggregated p-values. Defaults to the
        notebook-level ``npairs`` so existing calls behave as before.
    """
    if n_pairs is None:
        # Backward-compatible default: the pair count defined at notebook level.
        n_pairs = npairs

    size = 400
    colors = {'lmm': 'blue', 'qep': 'red', 'lmm-rank': 'green'}

    hits0, qq0 = _pvalue_panels(DF, DF_agg, 'real', ntri.index,
                                n_pairs, colors, size)
    hits1, qq1 = _pvalue_panels(DF, DF_agg, 'null', ntri.index,
                                n_pairs, colors, size)

    ntris = _subsample(introns.loc[ntri.index]['ntri'].values)
    hist0 = Histogram(ntris, title="Number of trials - across samples",
                      tools=['save'], xlabel='number of trials',
                      width=size, height=size)
    # Cut the x-axis at the 99.9th percentile so outliers do not flatten the plot.
    hist0.x_range = Range1d(0, np.percentile(ntris, 99.9))

    values = _subsample(ntri[STAT].values)
    hist1 = Histogram(values,
                      title="%s number of trials - across gene-intron pairs" % STAT,
                      tools=['save'], xlabel='number of trials',
                      width=size, height=size)
    hist1.x_range = Range1d(0, np.percentile(values, 99.9))

    show(gridplot([[hits0, hits1], [qq0, qq1], [hist0, hist1]]))

## Group A

In [5]:
# Diagnostic grid for groupA, using the per-pair minimum of 'ntri'.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt.
plot_grid(DF, DF_agg, groupA['ntri'].copy(), introns['real'], 'min')

KeyboardInterrupt: 

## Group B

In [None]:
# Diagnostic grid for groupB, using the per-pair minimum of 'ntri'.
plot_grid(DF, DF_agg, groupB['ntri'].copy(), introns['real'], 'min')

## Group C

In [5]:
# Diagnostic grid for groupC, using the per-pair minimum of 'ntri'.
plot_grid(DF, DF_agg, groupC['ntri'].copy(), introns['real'], 'min')