In [2]:
import pandas as pd
import numpy as np


# Chromosome under study; also used in the name of the results file below.
chrom = 22

# Load the per-SNP results and index them by (gene, intron), sorted so that
# label-based lookups on the MultiIndex work efficiently.
# NOTE: read_pickle can execute arbitrary code; only load trusted files.
df = (
    pd.read_pickle('chrom_%d_perm.pkl' % chrom)
    .set_index(['gene', 'intron'])
    .sort_index()
)
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,chrom,lmm-pval,pos,qep-pval,snp_id
gene,intron,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000015475,6,22,0.48816,18183494,0.442632,snp_22_18183494
ENSG00000015475,6,22,0.512587,18183623,0.703575,snp_22_18183623
ENSG00000015475,6,22,0.047945,18183724,0.059372,snp_22_18183724
ENSG00000015475,6,22,0.513477,18184011,0.493602,snp_22_18184011
ENSG00000015475,6,22,0.379372,18184169,0.429686,snp_22_18184169


In [27]:
from bokeh.io import push_notebook, output_notebook
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.plotting import figure, show, output_file
output_notebook()

import numpy as np
import colour
from scipy.special import betaincinv
from limix_plot import cycler_ as cycler
from collections import OrderedDict
from numpy import asarray as asa

def expected(n):
    """Return the expected -log10 p-values for ``n`` tests, in ascending order.

    The quantiles ``1/(n+1), 2/(n+1), ..., n/(n+1)`` are converted to the
    -log10 scale and reversed so that the smallest value comes first.

    Args:
        n (int): Number of p-values.

    Returns:
        numpy.ndarray: Array of length ``n`` with ascending -log10 quantiles.
    """
    # Force true division: under Python 2, ``1/(n+1)`` with an integer ``n``
    # evaluates to 0 and the log below would become infinite.
    denom = float(n + 1)
    quantiles = np.linspace(1 / denom, n / denom, n, endpoint=True)
    return np.flipud(-np.log10(quantiles))

def xy(pv):
    """Unimplemented placeholder: ignores ``pv`` and returns ``None``."""
    return None

def rank_confidence_band(nranks):
    """Return the mean quantile ``k / (nranks + 1)`` of every rank ``k``.

    Despite its name, this function only returns the means; no lower or
    upper confidence band is computed.

    Args:
        nranks (int): Number of ranks.

    Returns:
        numpy.ndarray: Array of length ``nranks`` holding
        ``1/(nranks+1), ..., nranks/(nranks+1)``.
    """
    ranks = np.arange(1, nranks + 1)
    # Divide by a float so integer arrays are not floor-divided on Python 2.
    return ranks / float(nranks + 1)

def qqplot(method, color, df0, thr=1e-1, fill_alpha=0.2):
    """Show an interactive QQ-plot of the p-values of one method.

    Observed -log10 p-values (y axis) are plotted against the expected ones
    (x axis). Only points with p-value ``<= thr`` are drawn; the expected
    quantiles are still computed from all p-values in ``df0``.

    The plot title uses the module-level variable ``chrom``.

    Args:
        method (str): Prefix of the p-value column, i.e. the column
            ``'<method>-pval'`` is read.
        color: Colour passed to the circle glyph.
        df0 (pandas.DataFrame): Table indexed by a two-level index
            (first level shown as "gene", second as "intron") with columns
            ``'<method>-pval'``, ``'snp_id'`` and ``'pos'``.
        thr (float): Largest p-value that is still drawn.
        fill_alpha (float): Fill transparency of the circles.
    """
    p = figure(title = "%s :: chromossome %d" % (method.upper(), chrom),
               tools=['hover,zoom_in,zoom_out,box_zoom,save,pan,reset'], width=900)

    pv = df0['%s-pval' % method].values
    lpv = -np.log10(pv)
    order = np.argsort(lpv)
    expected_lpv = expected(len(lpv))

    pv_sorted = pv[order]
    keep = pv_sorted <= thr
    # Positional row numbers of the displayed points, in plotting order.
    shown = order[keep]

    # Index with plain NumPy arrays: indexing a label-indexed Series with
    # integer positions relies on a deprecated positional fallback.
    source = ColumnDataSource(data=dict(
        xname=expected_lpv[keep],
        yname=lpv[order][keep],
        gene=np.asarray(df0.index.get_level_values(0))[shown],
        intron=np.asarray(df0.index.get_level_values(1))[shown],
        snp_id=df0['snp_id'].values[shown],
        pval=pv_sorted[keep],
        pos=df0['pos'].values[shown]
    ))

    p.circle('xname', 'yname', source=source, color=color,
             fill_alpha=fill_alpha, line_width=0, line_color=None)

    p.select_one(HoverTool).tooltips = [
        ('gene', '@gene'),
        ('intron', '@intron'),
        ('snp_id', '@snp_id'),
        ('pos', '@pos'),
        ('p-value', '@pval'),
    ]

    # Diagonal y = x spanning the expected range; skipped for empty input,
    # where there are no end points to connect.
    if len(expected_lpv) > 0:
        lo, hi = expected_lpv[0], expected_lpv[-1]
        p.line([lo, hi], [lo, hi], color='black')
    show(p)

In [4]:
# QQ-plot of the 'qep-pval' column over the whole table (default thr and fill_alpha).
qqplot('qep', 'red', df)

In [5]:
# Preview the first rows of the results table.
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,chrom,lmm-pval,pos,qep-pval,snp_id
gene,intron,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000015475,6,22,0.48816,18183494,0.442632,snp_22_18183494
ENSG00000015475,6,22,0.512587,18183623,0.703575,snp_22_18183623
ENSG00000015475,6,22,0.047945,18183724,0.059372,snp_22_18183724
ENSG00000015475,6,22,0.513477,18184011,0.493602,snp_22_18184011
ENSG00000015475,6,22,0.379372,18184169,0.429686,snp_22_18184169


In [11]:
# 'qep-pval' column, still indexed by (gene, intron).
pvals = df['qep-pval'][:]

# Get the gene-intron pair containing the SNP with the lowest p-value

In [18]:
# idxmin() returns the index *label* -- here the (gene, intron) pair -- of the
# smallest p-value. argmin() returns an integer position in current pandas,
# which is not a valid key for .loc.
gene = df.loc[pvals.idxmin()]
gene = gene.reset_index()
# Every selected row shares the same pair, so read it from the first row.
gene_name, intron = gene['gene'][0], gene['intron'][0]
# Re-index the selected rows by SNP identifier.
gene = gene.set_index('snp_id')

## Here is the SNP ID

In [21]:
# idxmin() yields the SNP id (index label) of the lowest p-value;
# argmin() would give an integer position in current pandas.
gene['qep-pval'].idxmin()

'snp_22_50743630'

# Look how low the p-value is: 1e-35

In [28]:
# QQ-plot restricted to the selected gene/intron: restore the (gene, intron)
# index that qqplot reads, show every p-value (thr=1.0) with opaque markers.
qqplot(
    'qep',
    'red',
    gene.reset_index().set_index(['gene', 'intron']),
    thr=1.0,
    fill_alpha=1.0,
)

In [29]:
# Preview the rows of the selected gene/intron, indexed by SNP id.
gene.head()

Unnamed: 0_level_0,gene,intron,chrom,lmm-pval,pos,qep-pval
snp_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
snp_22_50645650,ENSG00000188130,9,22,0.699322,50645650,0.595336
snp_22_50645994,ENSG00000188130,9,22,0.388784,50645994,0.340171
snp_22_50646202,ENSG00000188130,9,22,0.32356,50646202,0.358202
snp_22_50646318,ENSG00000188130,9,22,0.891083,50646318,0.771611
snp_22_50646675,ENSG00000188130,9,22,0.171735,50646675,0.154139


# I will look into the trait itself now

In [31]:
import six
try:
    import cPickle as pkl
except ImportError:
    import pickle as pkl
import blosc
from os.path import join

# Root directory of the datasets. This is a hard-coded absolute path, so the
# cells below only run where that directory exists.
_root = '/hps/nobackup/stegle/users/lab/dataset/'
# Directory holding the intron-event trait file read by get_intron_events().
_folder_traits = join(_root, 'alternative-splicing',
                      'quant_splicing',
                      'transcript-qtls')

# Directory holding the per-chromosome genotype files (bim/fam/bed).
_folder_genotype = join(_root, '1000G', 'plink', 'horta')

def get_intron_events():
    """Load the intron-event table from its blosc-compressed pickle.

    Reads ``intron_events_filter3.pkl.blp`` from ``_folder_traits``,
    decompresses it with blosc and unpickles the result. On Python 3 the
    pickle is decoded with ``encoding='latin-1'``.

    Returns:
        The unpickled object.
    """
    path = join(_folder_traits, 'intron_events_filter3.pkl.blp')
    with open(path, 'rb') as handle:
        raw = handle.read()

    payload = blosc.decompress(raw)
    # NOTE: unpickling executes arbitrary code; only load trusted files.
    if not six.PY3:
        return pkl.loads(payload)
    return pkl.loads(payload, encoding='latin-1')

In [32]:
# Load the intron-event table from disk.
ie = get_intron_events()

In [34]:
# Keep only the rows of the (gene, intron) pair selected above.
traits = ie.loc[(gene_name, intron)]

In [35]:
# Preview the trait rows for the selected pair.
traits.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,assay,nsuc,ntri
gene,intron,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000188130,9,HG00096.1.M_111124_6,45,122
ENSG00000188130,9,HG00097.7.M_120219_2,132,266
ENSG00000188130,9,HG00099.1.M_120209_6,14,62
ENSG00000188130,9,HG00099.5.M_120131_3,18,83
ENSG00000188130,9,HG00100.2.M_111215_8,88,178


# Let's see whether the number of trials looks abnormal

In [43]:
import numpy as np

# Density histogram (50 bins) of the 'ntri' column for the selected pair.
hist, edges = np.histogram(traits['ntri'], bins=50, density=True)

p = figure(width=900, tools=['save,reset'],
           title="(%s, %d)" % (gene_name, intron))
p.xaxis.axis_label = '# of trials'
# One rectangle per bin: consecutive edges give the left/right sides.
p.quad(left=edges[:-1], right=edges[1:], bottom=0, top=hist,
       line_color="#033649", fill_color="#036564")
show(p)

In [47]:
# Preview the rows of the selected gene/intron once more.
gene.head()

Unnamed: 0_level_0,gene,intron,chrom,lmm-pval,pos,qep-pval
snp_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
snp_22_50645650,ENSG00000188130,9,22,0.699322,50645650,0.595336
snp_22_50645994,ENSG00000188130,9,22,0.388784,50645994,0.340171
snp_22_50646202,ENSG00000188130,9,22,0.32356,50646202,0.358202
snp_22_50646318,ENSG00000188130,9,22,0.891083,50646318,0.771611
snp_22_50646675,ENSG00000188130,9,22,0.171735,50646675,0.154139


# Now number of trials versus number of successes

In [51]:
# Scatter plot of the 'nsuc' column against the 'ntri' column.
p = figure(width=900, tools=['save,reset'],
           title="(%s, %d)" % (gene_name, intron))
p.xaxis.axis_label = '# of trials'
p.yaxis.axis_label = '# of successes'
p.circle(x=traits['ntri'], y=traits['nsuc'])
show(p)

The conclusion so far is that the trait looks fine, so the problem might lie in the genotype.
Perhaps the minor allele frequency (MAF)? I will look at it now.

# Genotype

In [54]:
from sklearn.preprocessing import Imputer

def get_chromossome(cid):
    """Load the genotype data of one chromosome and impute missing calls.

    Reads ``maf1_chr<cid>.bim.df`` and ``maf1_chr<cid>.fam.df`` (pickles) and
    ``maf1_chr<cid>.bed.npz`` from ``_folder_genotype``. Entries equal to 3
    in the ``bed`` array are treated as missing and replaced by the most
    frequent value along axis 1.

    Args:
        cid (int): Chromosome number, formatted with ``%d`` into the file names.

    Returns:
        tuple: ``(bim, fam, X)`` -- the two unpickled objects and the imputed
        genotype matrix, transposed relative to the stored ``bed`` array.
    """
    filename = join(_folder_genotype, 'maf1_chr%d' % cid)
    # NOTE: read_pickle can execute arbitrary code; only load trusted files.
    bim = pd.read_pickle('%s.bim.df' % filename)
    fam = pd.read_pickle('%s.fam.df' % filename)

    # np.load returns an NpzFile that keeps the archive open; the context
    # manager closes it once the array has been read into memory.
    with np.load('%s.bed.npz' % filename) as npz:
        bed = npz['bed'].astype(float)
    bed[bed == 3] = np.nan

    # NOTE(review): sklearn.preprocessing.Imputer was removed in
    # scikit-learn 0.22; SimpleImputer has no ``axis`` argument, so a port
    # would need to impute on the transposed array -- confirm before upgrading.
    imp = Imputer(axis=1, strategy='most_frequent')
    # presumably bed is (variants x samples), making X (samples x variants)
    # -- TODO confirm against the data.
    X = imp.fit_transform(bed).T

    return (bim, fam, X)

In [55]:
# Load the genotype data of chromosome 22 (hard-coded here, not taken from `chrom`).
# NOTE(review): the second element is what get_chromossome returns as `fam`;
# the name `bam` looks like a typo -- confirm before renaming.
(bim, bam, X) = get_chromossome(22)

In [56]:
# idxmin() returns the SNP id (index label) of the lowest p-value. argmin()
# returns an integer position in current pandas, which would break the
# label lookups on bim0 below.
snp_id = gene['qep-pval'].idxmin()

In [57]:
# Preview the variant table as loaded.
bim.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,snp,cm,a0,a1,i
chrom,pos,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
22,16050408,snp_22_16050408,0.0,C,T,0
22,16050612,snp_22_16050612,0.0,G,C,1
22,16050678,snp_22_16050678,0.0,T,C,2
22,16051107,snp_22_16051107,0.0,A,C,3
22,16051249,snp_22_16051249,0.0,C,T,4


In [58]:
# Re-index the variant table by SNP id (column 'snp'), sorted, so that a SNP
# can be looked up by its identifier.
bim0 = bim.reset_index().set_index('snp').sort_index()

In [59]:
# Preview the variant table indexed by SNP id.
bim0.head()

Unnamed: 0_level_0,chrom,pos,cm,a0,a1,i
snp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
indel:10D_22_17298053,22,17298053,0.0,A,ATATAATTCTC,2119
indel:10D_22_17666306,22,17666306,0.0,AAAAATAAAAT,A,4128
indel:10D_22_17729247,22,17729247,0.0,G,GTTTTGTTTTC,4502
indel:10D_22_17729257,22,17729257,0.0,C,CTTTTGTTTTG,4503
indel:10D_22_18169269,22,18169269,0.0,A,ATTTCTTTTTT,7371


In [60]:
# Show the bim0 entry whose index label is snp_id.
bim0.loc[(snp_id, )]

chrom          22
pos      50743630
cm              0
a0              A
a1              C
i          168966
Name: snp_22_50743630, dtype: object

In [61]:
# 'i' holds the integer used to select this SNP's column in X.
# NOTE(review): assumes snp_id is unique in bim0, so `i` is a scalar -- confirm.
i = bim0.loc[(snp_id, )]['i']
# Column i of the genotype matrix.
snp_genotype = X[:, i]

In [62]:
# Display the genotype values of the selected SNP.
snp_genotype

array([ 2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,
        2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,
        2.,  2.,  2.,  1.,  2.,  2.,  2.,  2.,  1.,  2.,  2.,  2.,  2.,
        2.,  1.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,
        2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,
        2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,
        2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,
        2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,
        2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,
        2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,
        2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,
        2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,
        2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,
        2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2

In [65]:
from numpy import asarray, minimum

def _maf(X):
    r"""Compute minor allele frequencies.
    It assumes that `X` encodes 0, 1, and 2 representing the number
    of alleles.
    Args:
        X (array_like): Genotype matrix.
    Returns:
        array_like: minor allele frequencies.
    """
    X = asarray(X, float)
    s0 = X.sum(0)
    s0 /= float(2*X.shape[0])
    s1 = 1 - s0
    return minimum(s0, s1)

In [66]:
# _maf works column-wise, so turn the genotype vector into a one-column
# matrix and take the single resulting frequency.
maf = _maf(snp_genotype[:, np.newaxis])[0]

In [68]:
# Report the minor allele frequency of the selected SNP.
print('%s MAF: %.5f' % (snp_id, maf))

snp_22_50743630 MAF: 0.01505


Interesting. The MAF is pretty low.
I will re-run QEP on that here.