In [1]:
import pandas as pd
import numpy as np


chrom = 22

# Load the per-chromosome table, index it by (gene, intron) and sort on that
# index. read_pickle unpickles the file, so only point it at trusted data.
df = (
    pd.read_pickle('chrom_%d.pkl' % chrom)
    .set_index(['gene', 'intron'])
    .sort_index()
)
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,chrom,lmm-pval,lmm-rank-pval,pos,qep-pval,snp_id
gene,intron,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ENSG00000025770,1,22,0.613807,0.480221,50896901,0.389656,snp_22_50896901
ENSG00000025770,1,22,0.316533,0.574893,50897215,0.533459,snp_22_50897215
ENSG00000025770,1,22,0.043711,0.138918,50897314,0.099098,snp_22_50897314
ENSG00000025770,1,22,0.514677,0.750123,50897692,0.828003,snp_22_50897692
ENSG00000025770,1,22,0.000753,0.021789,50897868,0.007409,snp_22_50897868


# Bonferroni correction

In [2]:
# One test per distinct entry of the (gene, intron) index.
ntests = df.index.unique().size
print("Number of tests: %d" % ntests)

# Bonferroni within gene-intron
def pval_agg(x):
    """Bonferroni-correct the smallest p-value in ``x``.

    The minimum of ``x`` is multiplied by the number of entries in ``x`` and
    the product is clipped to the interval [0, 1].
    """
    n_entries = len(x)
    corrected = np.min(x) * n_entries
    return np.clip(corrected, 0, 1)

def method_agg(x):
    """Return the first element of ``x`` by position.

    ``x[0]`` on a pandas Series is a *label* lookup: it raises ``KeyError``
    when the index is integer-valued without a label ``0``, and on other
    indexes it only works through a deprecated positional fallback. Objects
    exposing ``.iloc`` are therefore indexed positionally through it; plain
    sequences (lists, tuples, arrays) keep using ``x[0]``.

    Raises ``IndexError`` when ``x`` is empty.
    """
    positional = getattr(x, 'iloc', None)
    if positional is not None:
        return positional[0]
    return x[0]
    
def gene_intron_wise(df):
    """Aggregate the p-value columns of ``df`` over its first two index levels.

    Rows are grouped on index levels 0 and 1, and each of the ``lmm-pval``,
    ``lmm-rank-pval`` and ``qep-pval`` columns is reduced with ``pval_agg``.
    The aggregated frame is returned.
    """
    pval_columns = ('lmm-pval', 'lmm-rank-pval', 'qep-pval')
    reducers = {column: pval_agg for column in pval_columns}
    return df.groupby(level=[0, 1]).agg(reducers)

# Collapse to one row per group of the first two index levels. This rebinds
# `df`, so the frame that was passed in is no longer reachable under this name.
df = gene_intron_wise(df)

Number of tests: 296


In [3]:
# Preview the first rows of the current `df`.
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,qep-pval,lmm-rank-pval,lmm-pval
gene,intron,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000025770,1,1.0,1.0,0.0006825442
ENSG00000025770,2,1.0,1.0,4.163024e-06
ENSG00000025770,4,0.192797,0.625935,1.0
ENSG00000025770,5,1.0,1.0,1.0
ENSG00000025770,6,0.035485,0.194227,8.597851e-08


In [6]:
# Start from an empty frame with one 'pval' and one 'method' column.
data = pd.DataFrame(columns=('pval', 'method'))

def append_df(data, df, name):
    """Return ``data`` with the ``<name>-pval`` column of ``df`` stacked below it.

    The appended rows carry the p-values in ``pval`` and the string ``name``
    in ``method``. ``data`` itself is not modified; the concatenated frame is
    returned.
    """
    source_column = '%s-pval' % name
    block = pd.DataFrame({'pval': df[source_column], 'method': name})
    return pd.concat([data, block])

# Stack the three methods' p-values into `data`, in this order.
for method_name in ('lmm', 'lmm-rank', 'qep'):
    data = append_df(data, df, method_name)

In [7]:
# Preview the first rows of the stacked `data` frame.
data.head()

Unnamed: 0,method,pval
"(ENSG00000025770, 1)",lmm,0.0006825442
"(ENSG00000025770, 2)",lmm,4.163024e-06
"(ENSG00000025770, 4)",lmm,1.0
"(ENSG00000025770, 5)",lmm,1.0
"(ENSG00000025770, 6)",lmm,8.597851e-08


In [8]:
# Bokeh plotting setup.
from bokeh.io import push_notebook, output_notebook
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.plotting import figure, show, output_file
# NOTE(review): BoxPlot and Bar are not referenced in the visible cells, and
# `bokeh.charts` is not shipped with newer Bokeh releases (it moved to the
# separate `bkcharts` package) -- confirm this import still resolves.
from bokeh.charts import BoxPlot, Bar
from bokeh.layouts import gridplot
# Route Bokeh output to the notebook so show() renders inline.
output_notebook()

In [9]:
def _hit_curve(pvals, thresholds, perc):
    """Count, for every threshold, the scaled p-values at or below it.

    Each p-value is multiplied by ``len(pvals)`` before the comparison. With
    ``perc`` true the counts are returned as percentages of ``len(pvals)``.
    """
    ntests = len(pvals)
    scaled = np.asarray(pvals, dtype=float) * ntests
    counts = [int(np.sum(scaled <= thr)) for thr in thresholds]
    if perc:
        return [100 * count / float(ntests) for count in counts]
    return counts


def plot_curve(data, perc=False, exclude=False, lthr=-5, rthr=-0.30103):
    """Build a Bokeh line plot of hits versus p-value threshold, per method.

    For each plotted method, a row of ``data`` is a hit at threshold ``t``
    when ``pval * n <= t``, where ``n`` is the number of rows of ``data``
    belonging to that method. Thresholds are ``np.logspace(lthr, rthr)``,
    i.e. they run from ``10**lthr`` to ``10**rthr`` (the default ``rthr``
    corresponds to roughly 0.5).

    Parameters
    ----------
    data : DataFrame with a ``method`` and a ``pval`` column.
    perc : plot the percentage of hits instead of the number of hits.
    exclude : leave the ``lmm`` curve out of the plot.
    lthr, rthr : base-10 exponents of the smallest and largest threshold.

    Returns
    -------
    The Bokeh figure holding one line per plotted method.

    Raises
    ------
    KeyError
        If a method that should be plotted has no rows in ``data``.
    """
    # Drawing order (and therefore legend order): lmm-rank, lmm, qep.
    curves = [('lmm-rank', 'green'), ('lmm', 'blue'), ('qep', 'red')]
    if exclude:
        curves = [(name, color) for name, color in curves if name != 'lmm']

    pts = np.logspace(lthr, rthr)

    # Select each method's p-values once instead of regrouping the whole
    # frame for every threshold.
    nhits = {}
    for name, _ in curves:
        pvals = data.loc[data['method'] == name, 'pval']
        if len(pvals) == 0:
            raise KeyError(name)
        nhits[name] = _hit_curve(pvals, pts, perc)

    p = figure(title="Power", tools=['save,reset'])
    p.grid.grid_line_alpha = 0.3
    p.xaxis.axis_label = 'P-value threshold'
    p.yaxis.axis_label = 'Percentage of hits' if perc else 'Number of hits'
    for name, color in curves:
        p.line(pts, nhits[name], color=color, legend=name)
    p.legend.location = "bottom_right"
    return p

In [12]:
# 2x2 grid over the default threshold range: percentages on the top row, raw
# counts on the bottom row; the right-hand column leaves out 'lmm'.
p1, p2, p3, p4 = (
    plot_curve(data, perc, exclude)
    for perc in (True, False)
    for exclude in (False, True)
)
show(gridplot([[p1, p2], [p3, p4]], plot_width=400, plot_height=400))

In [13]:
# Same 2x2 grid, restricted to thresholds between 10**-5 and 10**-1.
lthr, rthr = -5, -1
p1, p2, p3, p4 = (
    plot_curve(data, perc, exclude, lthr=lthr, rthr=rthr)
    for perc in (True, False)
    for exclude in (False, True)
)
show(gridplot([[p1, p2], [p3, p4]], plot_width=400, plot_height=400))

In [14]:
# Same 2x2 grid, restricted to thresholds between 10**-10 and 10**-2.
lthr, rthr = -10, -2
p1, p2, p3, p4 = (
    plot_curve(data, perc, exclude, lthr=lthr, rthr=rthr)
    for perc in (True, False)
    for exclude in (False, True)
)
show(gridplot([[p1, p2], [p3, p4]], plot_width=400, plot_height=400))