In [1]:
# Numerical / tabular stack.
import pandas as pd
import numpy as np

# Project-local loader. Later cells group its result by the first two index
# levels and aggregate the 'ntri' and 'nsuc' columns.
from horta_exp.introns.fetch_data import get_intron_events
introns = get_intron_events()

# Plotting stack (bokeh).
from bokeh.io import push_notebook, output_notebook
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.plotting import figure, show, output_file
# NOTE(review): bokeh.charts only exists in old bokeh releases (it was later
# split out into the separate `bkcharts` package), and BoxPlot / Bar are not
# used in the visible cells -- confirm the pinned bokeh version before upgrading.
from bokeh.charts import BoxPlot, Bar
from bokeh.layouts import gridplot
# Send bokeh output to the notebook instead of a standalone HTML file.
output_notebook()

Unnamed: 0_level_0,Unnamed: 1_level_0,chrom,lmm-pval,lmm-rank-pval,pos,qep-pval,snp_id
gene,intron,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ENSG00000000460,6,1,0.190375,0.210347,169722490,0.313643,snp_1_169722490
ENSG00000000460,6,1,0.798096,0.577091,169722572,0.400403,snp_1_169722572
ENSG00000000460,6,1,0.052434,0.007609,169722634,0.000878,snp_1_169722634
ENSG00000000460,6,1,0.496457,0.388488,169722672,0.445755,snp_1_169722672
ENSG00000000460,6,1,0.238702,0.994849,169722751,0.933644,snp_1_169722751


In [None]:
# Load the per-SNP p-value table and key it by (gene, intron), sorted so that
# later group-bys and label lookups operate on an ordered index.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
data = (
    pd.read_pickle('chrom_real_chrom22.pkl')
    .set_index(['gene', 'intron'])
    .sort_index()
)

# Bonferroni correction

In [2]:
# Number of distinct index entries in `data`. This relies on the loading cell
# having run first, so that each index entry is a (gene, intron) pair.
ntests = len(data.index.unique())
print("Number of tests: %d" % ntests)

# Bonferroni within gene-intron
def pval_agg(x):
    """Bonferroni-adjust the smallest p-value of a group.

    The minimum of ``x`` is multiplied by the number of entries in ``x`` and
    the product is clipped to the interval [0, 1].
    """
    group_size = len(x)
    smallest = np.min(x)
    adjusted = smallest * group_size
    return np.clip(adjusted, 0, 1)

def method_agg(x):
    """Return the first element of a group.

    For pandas objects the lookup is positional (``.iloc[0]``): a plain
    ``x[0]`` on a Series is a *label* lookup, which raises ``KeyError`` (or
    returns the wrong row) whenever label 0 is not the first row. Plain
    sequences such as lists are still indexed with ``x[0]``.
    """
    if hasattr(x, 'iloc'):
        return x.iloc[0]
    return x[0]
    
def gene_intron_wise(df):
    """Collapse ``df`` to one row per (gene, intron) index pair.

    Rows are grouped on the first two index levels and each of the three
    p-value columns is reduced with ``pval_agg``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first two index levels identify the group and which has
        the columns 'lmm-pval', 'lmm-rank-pval' and 'qep-pval'.

    Returns
    -------
    pandas.DataFrame
        One row per group with the three aggregated p-value columns.
    """
    # Group the frame that was passed in, not the notebook-level `data`
    # global, so the function also works on subsets and other frames.
    return df.groupby(level=[0, 1]).agg({
        'lmm-pval': pval_agg,
        'lmm-rank-pval': pval_agg,
        'qep-pval': pval_agg
    })

# Rebinds `data` to the per-(gene, intron) aggregate; the per-SNP frame is no
# longer reachable under this name afterwards.
data = gene_intron_wise(data)

Number of tests: 1926


In [4]:
def method_column(df):
    """Stack the per-method p-value columns of ``df`` into long format.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns 'lmm-pval', 'lmm-rank-pval' and 'qep-pval'.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns 'pval' and 'method'. The rows of 'lmm' come
        first, then 'lmm-rank', then 'qep'; each block keeps the index
        labels of ``df``.
    """
    # The accumulation deliberately starts from an empty frame and
    # concatenates one method at a time.
    # NOTE(review): the downstream `set_index` helper reads an 'index' column
    # of (gene, intron) tuples after reset_index(), so it presumably relies on
    # this concat pattern flattening a MultiIndex into tuples -- confirm
    # before replacing it with a single concat or a melt.
    newdata = pd.DataFrame(columns=('pval', 'method'))
    for name in ('lmm', 'lmm-rank', 'qep'):
        block = pd.DataFrame({'pval': df['%s-pval' % name], 'method': name})
        newdata = pd.concat([newdata, block])
    return newdata

In [5]:
def set_index(data):
    """Reshape ``data`` to long format, indexed and sorted by (gene, intron).

    ``data`` is first passed through ``method_column``. The index of that
    result is then rebuilt as a two-level MultiIndex named
    ('gene', 'intron'), with the intron part converted to ``int``, and the
    frame is sorted on it.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame accepted by ``method_column`` whose index entries are
        (gene, intron) pairs.

    Returns
    -------
    pandas.DataFrame
        New frame with the columns produced by ``method_column`` and a
        sorted ('gene', 'intron') MultiIndex.
    """
    stacked = method_column(data)

    # Iterating either a flat Index of tuples or a MultiIndex yields
    # (gene, intron) tuples, so both index shapes are handled the same way
    # and no intermediate 'index' column from reset_index() is needed.
    keys = list(stacked.index)
    genes = [key[0] for key in keys]
    intron_ids = [int(key[1]) for key in keys]

    # Work on a copy so the frame returned by method_column is not mutated.
    result = stacked.copy()
    result.index = pd.MultiIndex.from_arrays(
        [genes, intron_ids], names=['gene', 'intron'])
    return result.sort_index()

In [6]:
# Rebinds `data` to the long-format frame ('pval' / 'method' columns).
# Re-running this cell on its own fails, because the '<method>-pval' columns
# that method_column reads no longer exist on the rebound frame.
data = set_index(data)

In [9]:
def plot_curve(data, perc=False, exclude=False, lthr=-5, rthr=-0.30103):
    """Plot, per method, how many tests pass a sweep of p-value thresholds.

    Within each method the p-values are multiplied by the number of rows that
    method has in ``data`` (Bonferroni adjustment) and compared against
    thresholds spaced logarithmically between ``10**lthr`` and ``10**rthr``.

    Parameters
    ----------
    data : pandas.DataFrame
        Long-format frame with a 'pval' column and a 'method' column whose
        values include 'lmm-rank' and 'qep' and, unless ``exclude`` is true,
        'lmm'.
    perc : bool
        If true, plot the percentage of rows that are hits instead of the
        raw count.
    exclude : bool
        If true, leave the 'lmm' curve out of the figure.
    lthr, rthr : float
        Base-10 exponents of the smallest and largest threshold. The default
        ``rthr`` is approximately log10(0.5).

    Returns
    -------
    The bokeh figure holding one line per plotted method.

    Raises
    ------
    KeyError
        If a method that should be plotted has no rows in ``data``.
    """
    # Drawing order is kept as: lmm-rank, lmm (optional), qep.
    methods = ['lmm-rank', 'qep'] if exclude else ['lmm-rank', 'lmm', 'qep']
    colors = {'lmm-rank': 'green', 'lmm': 'blue', 'qep': 'red'}

    pts = np.logspace(lthr, rthr)

    # The grouping and the Bonferroni scaling do not depend on the threshold,
    # so they are done once here instead of once per threshold.
    adjusted = {}
    for name, group in data.groupby(data['method']):
        pvals = np.asarray(group['pval'], dtype=float)
        adjusted[name] = pvals * len(pvals)

    nhits = {}
    for name in methods:
        adj = adjusted[name]
        counts = [int(np.sum(adj <= pt)) for pt in pts]
        if perc:
            counts = [100 * count / float(len(adj)) for count in counts]
        nhits[name] = counts

    p = figure(title="Power", tools=['save,reset'])
    p.grid.grid_line_alpha = 0.3
    p.xaxis.axis_label = 'P-value threshold'
    p.yaxis.axis_label = 'Percentage of hits' if perc else 'Number of hits'
    for name in methods:
        # NOTE(review): newer bokeh releases spell this keyword
        # `legend_label`; `legend` is kept to match the bokeh version this
        # notebook imports from -- confirm before upgrading.
        p.line(pts, nhits[name], color=colors[name], legend=name)
    p.legend.location = "bottom_right"
    return p

In [10]:
# Threshold sweep from 1e-5 to 1e-1 (the values are base-10 exponents).
lthr = -5
rthr = -1
# Raw hit counts: p1 draws all three methods, p2 leaves 'lmm' out.
p1 = plot_curve(data, False, lthr=lthr, rthr=rthr)
p2 = plot_curve(data, False, True, lthr=lthr, rthr=rthr)
# Show both figures side by side.
show(gridplot([[p1,p2]], plot_width=400, plot_height=400))

Unnamed: 0_level_0,Unnamed: 1_level_0,assay,nsuc,ntri
gene,intron,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000000419,5,HG00096.1.M_111124_6,37,347
ENSG00000000419,5,HG00097.7.M_120219_2,68,451
ENSG00000000419,5,HG00099.1.M_120209_6,31,327
ENSG00000000419,5,HG00099.5.M_120131_3,34,289
ENSG00000000419,5,HG00100.2.M_111215_8,35,281


In [12]:
def plot_data(data, title='title'):
    """Print ``title`` and show two percentage-of-hits figures side by side.

    The left figure draws every method; the right one leaves 'lmm' out. Both
    sweep thresholds from 1e-5 to 1e-1.

    Parameters
    ----------
    data : pandas.DataFrame
        Long-format frame accepted by ``plot_curve``.
    title : str
        Text printed above the figures and forwarded to ``gridplot``.
    """
    print(title)
    sweep = dict(lthr=-5, rthr=-1)
    with_lmm = plot_curve(data, True, **sweep)
    without_lmm = plot_curve(data, True, True, **sweep)
    # NOTE(review): whether gridplot honours (or even accepts) a `title`
    # keyword depends on the bokeh version -- confirm against the pinned one.
    layout = gridplot(
        [[with_lmm, without_lmm]], plot_width=400, plot_height=400, title=title)
    show(layout)

In [13]:
# Per-(gene, intron) minimum and maximum of the 'ntri' and 'nsuc' columns.
# Passing a list of function names per column yields two-level columns such
# as ('ntri', 'max'), so g['ntri']['max'] works. A nested {name: func} dict
# per column is rejected by pandas >= 1.0 (SpecificationError), so it is
# avoided here.
g = introns.groupby(level=[0, 1])
g = g.agg({
    'ntri': ['min', 'max'],
    'nsuc': ['min', 'max'],
})
g.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,ntri,ntri,nsuc,nsuc
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,min,max
gene,intron,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
ENSG00000000419,5,29,951,5,148
ENSG00000000419,6,8,675,0,9
ENSG00000000419,7,28,863,0,50
ENSG00000000419,8,104,1243,4,571
ENSG00000000457,1,73,1309,0,83


In [14]:
# Per-(gene, intron) median of the 'ntri' and 'nsuc' columns.
# A one-element list keeps the two-level column layout, so
# g2['ntri']['median'] works. A nested {name: func} dict per column is
# rejected by pandas >= 1.0 (SpecificationError), so it is avoided here.
g2 = introns.groupby(level=[0, 1])
g2 = g2.agg({
    'ntri': ['median'],
    'nsuc': ['median'],
})
g2.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,ntri,nsuc
Unnamed: 0_level_1,Unnamed: 1_level_1,median,median
gene,intron,Unnamed: 2_level_2,Unnamed: 3_level_2
ENSG00000000419,5,347,39
ENSG00000000419,6,218,1
ENSG00000000419,7,291,6
ENSG00000000419,8,449,95
ENSG00000000457,1,432,12


In [15]:
# Peek at the per-(gene, intron) maximum of 'ntri', the series that the
# quantile filter in the next cell thresholds on.
g['ntri']['max'].head()

gene             intron
ENSG00000000419  5          951
                 6          675
                 7          863
                 8         1243
ENSG00000000457  1         1309
Name: max, dtype: int32

In [16]:
# For each quantile of the per-(gene, intron) maximum 'ntri', keep the rows
# of `data` whose maximum is at or below that quantile and plot the
# percentage-of-hits curves for the subset.
ntri_max = g['ntri']['max']
for quant in [0.25, 0.5, 0.75, 1.]:
    cutoff = ntri_max.quantile(quant)
    d = data.loc[ntri_max <= cutoff]
    n_pairs = len(d.index.unique())
    plot_data(d, title='Quantile: %f; Number of gene-introns: %d' % (quant, n_pairs))

Quantile: 0.250000; Number of gene-introns: 515


Quantile: 0.500000; Number of gene-introns: 998


Quantile: 0.750000; Number of gene-introns: 1470


Quantile: 1.000000; Number of gene-introns: 1926


In [17]:
# For each quantile of the per-(gene, intron) median 'ntri', keep the rows of
# `data` whose median is at or below that quantile and plot the
# percentage-of-hits curves for the subset.
ntri_median = g2['ntri']['median']
for quant in [0.25, 0.5, 0.75, 1.]:
    cutoff = ntri_median.quantile(quant)
    d = data.loc[ntri_median <= cutoff]
    n_pairs = len(d.index.unique())
    plot_data(d, title='Quantile: %f; Number of gene-introns: %d' % (quant, n_pairs))

Quantile: 0.250000; Number of gene-introns: 529


Quantile: 0.500000; Number of gene-introns: 1004


Quantile: 0.750000; Number of gene-introns: 1470


Quantile: 1.000000; Number of gene-introns: 1926


In [18]:
# For each quantile of the per-(gene, intron) median 'ntri', keep the rows of
# `data` whose median is strictly above that quantile and plot the
# percentage-of-hits curves for the subset.
ntri_median = g2['ntri']['median']
for quant in [0.75, 0.5, 0.25, 0.]:
    cutoff = ntri_median.quantile(quant)
    d = data.loc[ntri_median > cutoff]
    n_pairs = len(d.index.unique())
    plot_data(d, title='Quantile: %f; Number of gene-introns: %d' % (quant, n_pairs))

Quantile: 0.750000; Number of gene-introns: 456


Quantile: 0.500000; Number of gene-introns: 922


Quantile: 0.250000; Number of gene-introns: 1397


Quantile: 0.000000; Number of gene-introns: 1926
