In [1]:
import sys
sys.path.append('../')
sys.path.append('/usr/users/fsimone/tejaas')

import numpy as np
import collections
import gzip
import random

from scipy import stats
from scipy.interpolate import interp1d
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
#plt.switch_backend('agg')
import matplotlib
from utils import readgtf

# from utils import mpl_stylesheet
# mpl_stylesheet.banskt_presentation(fontfamily = 'latex-clearsans', fontsize = 18, colors = 'banskt', dpi = 72)

In [5]:
import os, re
from functools import wraps
import time

# Field names (and positional order) of a SnpInfo record.
SNPINFO_FIELDS = ['chrom', 'varid', 'bp_pos', 'ref_allele', 'alt_allele', 'maf']
class SnpInfo(collections.namedtuple('_SnpInfo', SNPINFO_FIELDS)):
    """Immutable per-SNP record whose fields are listed in SNPINFO_FIELDS."""
    # Empty __slots__: instances carry no per-instance __dict__.
    __slots__ = ()

def timeit(f):
    """Decorator that prints the wall-clock duration of every call to `f`.

    The wrapped function keeps the name and docstring of `f` (via
    functools.wraps) and returns whatever `f` returns.
    """
    @wraps(f)
    def timed(*args, **kw):
        started = time.time()
        outcome = f(*args, **kw)
        elapsed = time.time() - started
        print('{:s} took: {:.6f} seconds'.format(f.__name__, elapsed))
        return outcome
    return timed
def normalize_expr(Y):
    """Standardise every row of `Y` to zero mean and unit (population) std.

    Parameters
    ----------
    Y : 2-D array or pandas.DataFrame
        Values to standardise along axis 1.

    Returns
    -------
    Same kind of object as the input: a DataFrame keeps its index, columns
    and index name; anything else comes back as a plain array.
    """
    is_frame = isinstance(Y, pd.DataFrame)
    values = Y.values if is_frame else Y

    row_mean = np.mean(values, axis = 1).reshape(-1, 1)
    row_std = np.std(values, axis = 1).reshape(-1, 1)
    scaled = (values - row_mean) / row_std

    if not is_frame:
        return scaled

    Y_cent = pd.DataFrame(scaled, index=Y.index, columns=Y.columns)
    Y_cent.index.name = Y.index.name
    return Y_cent

def normalize_and_center_dosage(dosage, snpinfo):
    """Return the frequency-normalised and the mean-centred dosage matrices.

    Parameters
    ----------
    dosage : 2-D array, one row per SNP
    snpinfo : sequence of records exposing a `.maf` attribute, one per row

    Returns
    -------
    gtnorm : (dosage - 2f) / sqrt(2f(1 - f)) with f the per-SNP `maf`
    gtcent : dosage minus its per-row mean (the ridge-regression code uses
             this one)
    """
    maf_col = np.array([snp.maf for snp in snpinfo]).reshape(-1, 1)

    expected_dosage = 2 * maf_col
    binomial_sd = np.sqrt(2 * maf_col * (1 - maf_col))
    gtnorm = (dosage - expected_dosage) / binomial_sd

    row_mean = np.mean(dosage, axis = 1).reshape(-1, 1)
    gtcent = dosage - row_mean
    return gtnorm, gtcent

def sample_gt(snpinfo, nsample):
    """Simulate a centred dosage matrix with one row per entry of `snpinfo`.

    Parameters
    ----------
    snpinfo : sequence of records exposing a `.maf` attribute
    nsample : int, number of samples (columns)

    Returns
    -------
    2-D float array of shape (len(snpinfo), nsample); row i is drawn by
    sample_from_maf(nsample, snpinfo[i].maf).
    """
    # The buffer must be 2-D: the previous flat allocation of size
    # nsnps * nsample made the row assignment below raise IndexError.
    dosages = np.zeros((len(snpinfo), nsample))
    for i, snp in enumerate(snpinfo):
        dosages[i, :] = sample_from_maf(nsample, snp.maf)
    return dosages

def sample_from_maf(nsample, maf):
    """Draw `nsample` mean-centred dosages for one SNP with allele frequency `maf`.

    Genotype counts for dosage 0/1/2 are drawn from a multinomial with
    probabilities (1-maf)^2, 2*maf*(1-maf), maf^2, then the dosages are
    randomly permuted and their mean is subtracted.

    Uses the global numpy random state (np.random), so results are only
    reproducible if the caller seeds it.
    """
    genotype_probs = np.array([(1 - maf)**2, 2 * maf * (1 - maf), maf**2])
    nfreq = np.random.multinomial(nsample, genotype_probs, size=1)[0]
    # nfreq[k] copies of dosage k, in the order 0, 1, 2.
    x = np.repeat([0, 1, 2], nfreq)
    dosage = np.random.permutation(x)
    # gtnorm = (dosage - (2 * maf2d)) / np.sqrt(2 * maf2d * (1 - maf2d))
    gtcent = dosage - np.mean(dosage)

    return gtcent

def simulate_gt(nsnps, nsample):
    """Simulate centred dosages and matching SnpInfo records for `nsnps` SNPs.

    Allele frequencies are spread evenly over [0.1, 0.9]; values above 0.5
    are therefore not "minor" frequencies in the strict sense, although they
    are stored in the `maf` field.

    Returns
    -------
    gtcent : array of shape (nsnps, nsample), rows from sample_from_maf
    snpinfo : list of SnpInfo (chrom 1, positions 0, 100, 200, ...,
              ids "rsid0", "rsid1", ..., alleles A/G)
    """
    mafs = np.linspace(0.1, 0.9, nsnps)
    gtcent = np.zeros((nsnps, nsample))
    snpinfo = []
    for idx, maf in enumerate(mafs):
        gtcent[idx, :] = sample_from_maf(nsample, maf)
        snpinfo.append(
            SnpInfo(chrom      = 1,
                    varid      = "rsid" + str(idx),
                    bp_pos     = idx * 100,
                    ref_allele = "A",
                    alt_allele = "G",
                    maf        = maf)
        )
    return gtcent, snpinfo
def knn_correction(expr, dosage, K, f=1):
    """Subtract from each sample the mean of its K nearest neighbours.

    Neighbours are found by Euclidean distance between samples in the PCA
    space of `expr`.

    Parameters
    ----------
    expr : 2-D array, samples along rows (N x G)
    dosage : 2-D array, samples along columns (I x N)
    K : int, number of neighbours
    f : fraction of min(expr.shape) kept as PCA components (default: all)

    Returns
    -------
    gx_knn : array shaped like `expr`, neighbour mean removed per sample row
    gt_knn : array shaped like `dosage`, neighbour mean removed per sample column
    """
    n_components = int(f * min(expr.shape[0], expr.shape[1]))
    pca = PCA(n_components=n_components)
    pca.fit(expr) # requires N x G
    expr_pca = pca.transform(expr)

    # Symmetric matrix of pairwise distances between samples in PCA space.
    nsample = expr.shape[0]
    distance_matrix = np.zeros((nsample, nsample))
    for a in range(nsample):
        for b in range(a + 1, nsample):
            pair_dist = np.linalg.norm(expr_pca[a, :] - expr_pca[b, :])
            distance_matrix[a, b] = pair_dist
            distance_matrix[b, a] = pair_dist

    gx_knn = np.zeros_like(expr)
    gt_knn = np.zeros_like(dosage)

    for i in range(nsample):
        # The closest entry is dropped; it is the sample itself (distance 0)
        # unless another sample ties with it.
        ranked = np.argsort(distance_matrix[i, :])
        neighbors = ranked[1:K + 1]
        gx_knn[i, :] = expr[i, :] - np.mean(expr[neighbors, :], axis = 0)
        gt_knn[:, i] = dosage[:, i] - np.mean(dosage[:, neighbors], axis = 1)

    return gx_knn, gt_knn

# PCA correction - won't work: it makes some singular values == 0
def PCA_correction(gx, nComp = 5):
    """Remove the first `nComp` principal components from `gx` and re-normalise.

    PCA is fitted on `gx.T`; the data are rebuilt from the components after
    the first `nComp` only, transposed back and passed through normalize_expr.
    """
    pca = PCA()
    pca.fit(gx.T)
    scores = pca.transform(gx.T)
    kept_scores = scores[:, nComp:]
    kept_axes = pca.components_[nComp:, :]
    gx_pca = np.dot(kept_scores, kept_axes).T
    return normalize_expr(gx_pca)

In [7]:
# Use artificial genotype
# Simulates centred dosages for 10000 SNPs x 350 samples plus their SnpInfo.
# NOTE(review): no random seed is set, so every run draws different data.
nsample = 350
gtfull, snp_info = simulate_gt(10000, nsample)

# # Simulate some expression
# i.i.d. standard-normal "expression", arranged as genes x samples.
ngene = 15000
gx_rand = np.random.normal(0, 1, size = nsample * ngene).reshape((ngene, nsample)) 
print(gx_rand.shape)

# Rank of the random matrix (the recorded output shows 350, i.e. nsample).
print(np.linalg.matrix_rank(gx_rand))


(15000, 350)
350


In [None]:
# NOTE(review): `gx`, `exprmask` and `dosage` are not assigned by any cell
# above this one in the visible notebook (they appear in later cells), so
# this cell relies on execution order - confirm before Restart & Run All.
gx_norm = normalize_expr( gx[:, exprmask] )
nsample = gx_norm.shape[1]

## --- Apply KNN correction
f=1
gx_knn, gt_knn = knn_correction(gx_norm.T, dosage, K=30, f=f)

## --- Normalize GX and GT
gt_norm, gt_cent = normalize_and_center_dosage(dosage, snp_info)
sigmax2     = np.var(gt_cent, axis = 1)

gx_knn_norm = normalize_expr(gx_knn.T)
gt_knn_norm, gt_knn_cent = normalize_and_center_dosage(gt_knn, snp_info)
sigmax2_knn = np.var(gt_knn_cent, axis = 1)

## --- Shuffled-sample controls: permute the sample columns of the expression
shuffle_mask = np.arange(nsample)
np.random.shuffle(shuffle_mask)
# Indexing with an integer array already returns a new array, so no explicit
# deep copy is needed (the `copy` module was never imported in this notebook).
gx_shuffled_norm = gx_norm[:, shuffle_mask]
gx_shuffled_knn_norm = gx_knn_norm[:, shuffle_mask]

In [None]:
# gx_knn60, gt_knn60 = knn_correction(gx_norm.T, dosage, K=60, f=f)
# gx_knn15, gt_knn15 = knn_correction(gx_norm.T, dosage, K=15, f=f)

# gx_knn_norm60 = normalize_expr(gx_knn60.T)
# gt_knn_norm60, gt_knn_cent60 = normalize_and_center_dosage(gt_knn60, snp_info)
# sigmax2_knn60 = np.var(gt_knn_cent60, axis = 1)

# gx_knn_norm15 = normalize_expr(gx_knn15.T)
# gt_knn_norm15, gt_knn_cent15 = normalize_and_center_dosage(gt_knn15, snp_info)
# sigmax2_knn15 = np.var(gt_knn_cent15, axis = 1)

# gx_shuffled_knn_norm60 = copy.deepcopy(gx_knn_norm60[:, shuffle_mask])
# gx_shuffled_knn_norm15 = copy.deepcopy(gx_knn_norm15[:, shuffle_mask])

In [None]:
## --- Obtain CisMasks
# NOTE(review): get_cismasklist, compress_cismasklist, `genes` and `chrm` are
# not defined or imported in the visible part of the notebook - confirm where
# they come from. The window argument is 1e6 (presumably base pairs).

cismasklist = get_cismasklist(snp_info, genes, chrm, window=1e6)
cismaskcomp = compress_cismasklist(cismasklist)

In [None]:
# Y = gx_knn_norm # / np.sqrt(nsample)
# Yt = Y.T
# U, S, Vt = np.linalg.svd(Yt, full_matrices=False)
# s2_median = np.median(np.square(S))
# s2_10     = np.percentile(np.square(S), 10)
# print(s2_median)
# print(s2_10)
# SIGMA_BETAS = np.repeat(0.05, gt_cent.shape[0])
# DY_SIGMA_BETAS = np.sqrt( sigmax2_knn / s2_median )
# DY_SIGMA_BETAS10 = np.sqrt( sigmax2_knn / s2_10 )
# plt.hist(DY_SIGMA_BETAS, alpha=0.4, label="median")
# plt.hist(DY_SIGMA_BETAS10, alpha=0.4, label="percentile")
# plt.legend()
# plt.show()

# # SIGMA_BETAS = DY_SIGMA_BETAS10
# SIGMA_BETAS = DY_SIGMA_BETAS

In [None]:
### Optimizes sb2 to a given Keff

# @timeit
# def optimize_sb2(S, sigmasx, target):
#     sbetas = list()
#     S2 = np.square(S)
#     S2_lim = np.percentile(S2, 50)
#     for sx2 in sigmasx:
#         sb2 =  sx2 / S2_lim       # start parameter at median
#         S2mod = S2 + (sx2 / sb2)
#         N = len(S2)
#         Keff = np.sum(S2/S2mod) / N

#         while np.abs(Keff - target) > 0.01:
#             diff = Keff - target
#             sb2 -= diff*(sb2)
#             S2mod = S2 + (sx2 / sb2)
#             Keff = np.sum(S2/S2mod) / N
#         #print("Keff",Keff)
#         #print("SB2=",sb2)
#         sbetas.append(sb2)
#     return np.array(sbetas)

# print(len(sigmax2_knn))
# sb2_opt = optimize_sb2(S, sigmax2_knn, 0.6)

# Keffs = []
# S2 = np.square(S)
# N = len(S2)
# for i in range(len(sigmax2_knn)):
#     S2mod = S2 + (sigmax2_knn[i] / sb2_opt[i])
#     Keffs.append(np.sum(S2/S2mod) / N)
# print(Keffs)

In [None]:
def pvals_perm(GT, R, W):
    """Upper-tail p-value of R under a normal approximation to the permutation null.

    The null is that of the quadratic form x' W x when the entries of the
    genotype vector x are randomly permuted; its mean and variance are
    obtained analytically from the second and fourth moments of `GT` and
    from sums over the entries of `W`.

    Parameters
    ----------
    GT : 2-D array of shape (1, N) (or i x N); moments are taken over all
         entries. The formulas assume the entries sum to zero (centred).
    R : observed statistic (scalar or array)
    W : (N, N) weight matrix of the quadratic form

    Returns
    -------
    p : P(Q >= R) under N(muQ, sigmaQ^2)
    muQ, sigmaQ : mean and standard deviation of the null
    """
    mu2, mu4 = moment_data(GT)
    N = GT.shape[1]

    diagW = np.diag(W)
    rowsumW = np.sum(W, axis = 1)

    q11 = np.sum(W)
    q2  = np.sum(diagW)
    muQ = mu2 * (N * q2 - q11) / (N - 1)

    # Expected products of permuted entries, by index-coincidence pattern.
    v31 = - mu4 / (N - 1)
    v22 = v31 + (N * mu2 * mu2 / (N - 1)) #(N*(mu2**2) - mu4)/(N-1)
    v211 = - (v31 + v22) / (N - 2)
    v1111 = - 3 * v211 / (N - 3)

    q31 = np.dot(diagW, rowsumW)
    q4 = np.sum(np.square(diagW))
    q22 = np.sum(np.square(W))
    q211 = np.sum(np.square(rowsumW))

    sigma2 = v1111*(q11**2 - 2*q2*q11 - 4*q211 + 8*q31 + 2*q22 + q2**2 - 6*q4) + 2*v211*(q2*q11 + 2*q211 - 6*q31 - 2*q22 - q2**2 + 6*q4) + v22*(q2**2 + 2*q22 - 3*q4) + 4*v31*(q31 - q4) + mu4*q4

    sigma2 = sigma2 - muQ**2
    sigmaQ = np.sqrt(sigma2)
    # Survival function instead of 1 - cdf: the subtraction rounds to exactly
    # 0 once the p-value drops below ~1e-16, losing the extreme tail.
    p = stats.norm.sf(R, loc=muQ, scale=sigmaQ)
    return p, muQ, sigmaQ

def moment_data(GT):   #GT ixN
    """Return the mean of the squared and of the fourth-power entries of `GT`."""
    GT2 = np.square(GT)
    GT4 = np.square(GT2)
    mu2 = np.mean(GT2)
    mu4 = np.mean(GT4)
    return mu2, mu4

In [None]:
from scipy.optimize import minimize

class SBoptimizer:
    """Per-SNP maximum-likelihood estimate of sigma_beta^2 (sb2).

    For every SNP (row of GT) the log marginal likelihood

        -0.5 * sum(log(S2 * sb2 / sx2 + 1)) + 0.5 * Rscore(sb2)

    is maximised over log(sb2) with L-BFGS-B, where S2 are the squared
    singular values of GX.T and sx2 is the SNP's genotype variance.

    Parameters
    ----------
    GT : array (nsnps, nsample), genotype rows
    GX : array (ngenes, nsample), expression
    sx2 : array (nsnps,), genotype variance per row of GT
    """

    def __init__(self, GT, GX, sx2):

        self._GT  = np.ascontiguousarray(GT)
        self._GX  = np.ascontiguousarray(GX)
        self._sx2 = np.ascontiguousarray(sx2)
        self._nsnps = GT.shape[0]
        self._nsample = GX.shape[1]

        # Thin SVD: the right singular vectors are never used, and the full
        # decomposition would build a (ngenes x ngenes) matrix. It also keeps
        # U's columns matched to S when there are fewer genes than samples.
        U, S, _ = np.linalg.svd(GX.T, full_matrices=False)
        self._S = S
        self._U = U
        self._S2 = np.square(S)
        self._opt_sb2 = np.zeros(self._nsnps)

    @property
    def sb2(self):
        """Optimised sb2 per SNP (all zeros until fit() has been called)."""
        return self._opt_sb2

    def get_ML(self, _sb2, i):
        """Negative log marginal likelihood and its gradient for SNP `i`.

        `_sb2` is log(sb2); the gradient is returned with respect to that
        log-parameter (chain rule: d/dlog(sb2) = sb2 * d/dsb2).
        """
        sb2 = np.exp(_sb2)
        sx2 = self._sx2[i]
        S2 = self._S2
        # Squared projection of the genotype onto the left singular vectors.
        proj2 = np.square(np.dot(self._U.T, self._GT[i,:]))

        S2mod = S2 + (sx2 / sb2)
        Rscore = np.sum(proj2 * (S2 / S2mod)) / sx2
        MLL = -0.5*np.sum(np.log( S2 * (sb2 / sx2) + 1 )) + 0.5*Rscore

        denom = (S2 * sb2 + sx2)
        der = 0.5* np.sum( ( S2 / denom ) * ( (proj2 / denom ) - 1 ) )
        return -MLL, sb2*np.array([-der])

    def fit(self):
        """Optimise sb2 independently for every SNP and print the total time."""
        st = time.time()

        # Starting point in log-space, i.e. initial sb2 = exp(exp(0.01)).
        sb_init = np.exp(0.01)
        for i in range(self._nsnps):
            # The solver's `disp` option is no longer passed: it is
            # deprecated in recent SciPy and only produced console noise.
            res = minimize(   self.get_ML,
                              sb_init, 
                              args = (i,),
                              method='L-BFGS-B',
                              jac = True,
                              #bounds = [[0,1]],
                              options={'maxiter': 200000,
                                       'maxfun': 2000000,
                                       #'ftol': 1e-9,
                                       #'gtol': 1e-9,
                                       })

            # print(res)
            self._opt_sb2[i] = np.exp(res.x[0])
        et = time.time()
        print("optimization took in total: ",et-st)

In [None]:
import random

def tejaas_rr(myGT, myGX, sigmax2, sb2):
    """Ridge-regression score and permutation-null p-value for every SNP.

    Parameters
    ----------
    myGT : array (nsnps, nsample), genotype rows
    myGX : array (ngenes, nsample), expression
    sigmax2 : per-SNP genotype variance, indexable by SNP row
    sb2 : per-SNP sigma_beta^2, indexable by SNP row

    Returns
    -------
    S2 : squared singular values of myGX.T
    pvals, Rscore, muQ, sigmaQ : per-SNP p-value, score, null mean, null sd
    Keff : per-SNP sum of the shrinkage factors S2 / (S2 + sigmax2 / sb2)
    """
    nsnps = myGT.shape[0]
    Rscore, pvals, muQ, sigmaQ, Keff = (np.zeros(nsnps) for _ in range(5))

    U, S, Vt = np.linalg.svd(myGX.T, full_matrices=False)
    S2 = np.square(S)

    for j in range(nsnps):
        sx2 = sigmax2[j]
        shrink = S2 / (S2 + (sx2 / sb2[j]))
        Keff[j] = np.sum(shrink)

        # Weight matrix of the quadratic form whose value is Rscore[j].
        W = np.dot(U, np.dot(np.diag(shrink), U.T)) / sx2
        projected = np.dot(U.T, myGT[j,:])
        Rscore[j] = np.sum(np.square(projected) * shrink) / sx2

        pvals[j], muQ[j], sigmaQ[j] = pvals_perm(myGT[j, :].reshape(1, -1), Rscore[j], W)

    return S2, pvals, Rscore, muQ, sigmaQ, Keff

def compare_basic_props(GX, GT, ax1, ax2, label, sigmax2, sb2 = None, Ucorr = False, cismasks = None):
    """Plot the singular-value spectrum of GX and score every SNP in GT.

    Parameters
    ----------
    GX : array (ngenes, nsample), expression
    GT : array (nsnps, nsample), genotype
    ax1, ax2 : matplotlib axes; ax1 gets a scatter of the singular values
        (last one omitted), ax2 a histogram of them
    label : str used for the plot legends and the printed header
    sigmax2 : array (nsnps,), genotype variance per SNP
    sb2 : array (nsnps,) of sigma_beta^2; only used when `cismasks` is None.
        With cismasks, sb2 is re-optimised per mask via SBoptimizer and this
        argument is ignored.
    Ucorr : unused; kept for call compatibility
    cismasks : optional iterable of masks exposing `rmv_id` (gene indices to
        drop), `apply2` (SNP rows the mask applies to) and `nsnp`

    Returns
    -------
    S2, pvals, Rscore, muQ, sigmaQ, opt_sb2, Keff
        S2 is the squared spectrum of the full GX; the others are per-SNP
        arrays. opt_sb2 holds the sb2 actually used for each SNP.

    Raises
    ------
    ValueError if neither `cismasks` nor `sb2` is given.
    """
    nsnps  = GT.shape[0]
    Rscore = np.zeros(nsnps)
    pvals  = np.zeros(nsnps)
    muQ    = np.zeros(nsnps)
    sigmaQ = np.zeros(nsnps)
    Keff   = np.zeros(nsnps)
    opt_sb2 = np.zeros(nsnps)

    print("====== {:s} ======".format(label))
    U, S, Vt = np.linalg.svd(GX.T, full_matrices=False)
    S2 = np.square(S, dtype=np.float64 )

    ax1.scatter(np.arange(len(S)-1), S[:-1], label = label, alpha=0.4)
    ax2.hist(S, bins=30, alpha=0.3, label = label)

    if cismasks is not None:
        nmasks = len(cismasks)
        print("Found {:d} cismasks.".format(nmasks))
        for i,cismask in enumerate(cismasks):
            print("Processing cismask {:d}: {:d} genes masked for {:d} snps".format(i, len(cismask.rmv_id), cismask.nsnp))
            # Drop the masked genes, keep only the SNPs this mask applies to.
            usegenes = np.ones(GX.shape[0], dtype=bool)
            if cismask.rmv_id.shape[0] > 0: usegenes[cismask.rmv_id] = False
            myGX = GX[usegenes]
            myGT = GT[cismask.apply2,:]
            mysx2 = sigmax2[cismask.apply2]

            SBOPT = SBoptimizer(myGT, myGX, mysx2)
            SBOPT.fit()
            mask_sb2 = SBOPT.sb2
            opt_sb2[cismask.apply2] = mask_sb2

            _s2, _pvals, _qscores, _muq, _sigmaq, _keff = tejaas_rr(myGT, myGX, mysx2, mask_sb2)
            pvals[cismask.apply2]  = _pvals
            Rscore[cismask.apply2] = _qscores
            muQ[cismask.apply2]    = _muq
            sigmaQ[cismask.apply2] = _sigmaq
            Keff[cismask.apply2]   = _keff

    else:
        if sb2 is None:
            raise ValueError("sb2 must be provided when cismasks is None")
        # tejaas_rr returns six values; Keff was previously dropped here,
        # which made the unpacking fail.
        S2, pvals, Rscore, muQ, sigmaQ, Keff = tejaas_rr(GT, GX, sigmax2, sb2)
        opt_sb2 = np.array(sb2, dtype=float)
    return S2, pvals, Rscore, muQ, sigmaQ, opt_sb2, Keff #, EmuQ, EsigmaQ

# Run compare_basic_props on four inputs (raw / shuffled, without and with
# KNN correction) and draw all singular-value spectra on the same two axes.
fig = plt.figure(figsize = (14, 6))
ax1 = fig.add_subplot(121)
ax2 = fig.add_subplot(122)
# sb2 = np.square(SIGMA_BETAS)
# sb2 is left as None: with cismasks given, compare_basic_props optimises it per mask.
sb2 = None # sb2_opt

# - No KNN
nS2, npvals, nQ, nmuQ, nscaledQ, nopt_sb2, nKeff = compare_basic_props(gx_norm, 
                                                                       gt_cent, 
                                                                       ax1, ax2, 
                                                                       'Norm - no KNN', 
                                                                       sigmax2, 
                                                                       sb2 = sb2, 
                                                                       cismasks=cismaskcomp)
shuf_nS2, shuf_npvals, shuf_nQ, shuf_nmuQ, shuf_nscaledQ, shuf_nopt_sb2, shuf_nKeff = compare_basic_props(gx_shuffled_norm, 
                                                                                                          gt_cent, 
                                                                                                          ax1, ax2, 
                                                                                                          'Norm Shuffled - no KNN', 
                                                                                                          sigmax2, 
                                                                                                          sb2 = sb2, 
                                                                                                          cismasks=cismaskcomp)
# nS2, npvals, nQ, nmuQ, nscaledQ, nopt_sb2, nKeff, nEmuQ, nEscaledQ = compare_basic_props(gx_norm, gt_cent, ax1, ax2, 'Norm - no KNN', sigmax2, Z, ZTZ_inv, sb2 = sb2, cismasks=cismaskcomp)
# shuf_nS2, shuf_npvals, shuf_nQ, shuf_nmuQ, shuf_nscaledQ, shuf_nopt_sb2, shuf_nKeff, shuf_nEmuQ, shuf_nEscaledQ = compare_basic_props(gx_shuffled_norm, gt_cent, ax1, ax2, 'Norm Shuffled - no KNN', sigmax2, Z, ZTZ_inv, sb2 = sb2, cismasks=cismaskcomp)

# - With KNN
# S2, pvals, Q, muQ, scaledQ, opt_sb2, Keff, EmuQ, EscaledQ = compare_basic_props(gx_knn_norm, gt_knn_cent, ax1, ax2, 'Norm KNN', sigmax2 = sigmax2_knn, sb2 = sb2, cismasks=cismaskcomp)
# shuf_S2, shuf_pvals, shuf_Q, shuf_muQ, shuf_scaledQ, shuf_opt_sb2, shuf_Keff, shuf_EmuQ, shuf_EscaledQ  = compare_basic_props(gx_shuffled_knn_norm, gt_knn_cent, ax1, ax2, 'Shuffled Norm KNN', sigmax2 = sigmax2_knn, sb2 = sb2, cismasks=cismaskcomp)
# NOTE: the names ending in "scaledQ" receive the null standard deviation
# (fifth return value, sigmaQ) of compare_basic_props.
S2, pvals, Q, muQ, scaledQ, opt_sb2, Keff = compare_basic_props(gx_knn_norm, 
                                                                gt_knn_cent, 
                                                                ax1, ax2, 
                                                                'Norm KNN', 
                                                                sigmax2_knn, 
                                                                sb2 = sb2, 
                                                                cismasks=cismaskcomp)
shuf_S2, shuf_pvals, shuf_Q, shuf_muQ, shuf_scaledQ, shuf_opt_sb2, shuf_Keff = compare_basic_props(gx_shuffled_knn_norm, 
                                                                                                   gt_knn_cent, 
                                                                                                   ax1, ax2, 
                                                                                                   'Shuffled Norm KNN', 
                                                                                                   sigmax2_knn, 
                                                                                                   sb2 = sb2, 
                                                                                                   cismasks=cismaskcomp)

# # # - With KNN
# S260, pvals60, Q60, muQ60, scaledQ60 = compare_basic_props(gx_knn_norm60, gt_knn_cent60, ax1, ax2, 'Norm KNN 60', sigmax2 = sigmax2_knn60, sb2 = sb2, cismasks=cismaskcomp)
# shuf_S260, shuf_pvals60, shuf_Q60, shuf_muQ60, shuf_scaledQ60 = compare_basic_props(gx_shuffled_knn_norm60, gt_knn_cent60, ax1, ax2, 'Shuffled Norm KNN 60', sigmax2 = sigmax2_knn60, sb2 = sb2, cismasks=cismaskcomp)

# # - With KNN
# S215, pvals15, Q15, muQ15, scaledQ15 = compare_basic_props(gx_knn_norm15, gt_knn_cent15, ax1, ax2, 'Norm KNN15', sigmax2 = sigmax2_knn15, sb2 = sb2, cismasks=cismaskcomp)
# shuf_S215, shuf_pvals15, shuf_Q15, shuf_muQ15, shuf_scaledQ15 = compare_basic_props(gx_shuffled_knn_norm15, gt_knn_cent15, ax1, ax2, 'Shuffled Norm KNN15', sigmax2 = sigmax2_knn15, sb2 = sb2, cismasks=cismaskcomp)

ax1.set_xlabel("Singular values (rank)")
ax1.set_ylabel("Singular values S (value)")
ax1.legend()

ax2.legend()
plt.tight_layout()
plt.show()


In [None]:
# P-value histograms, actual vs sample-shuffled expression:
# left panel without KNN correction, right panel with KNN correction.
fig = plt.figure(figsize=(10,10), dpi=90)
ax1 = fig.add_subplot(2,2,1)
ax1.hist(shuf_npvals, bins=20, alpha=0.3, label="norm shuf - no KNN")
ax1.hist(npvals, bins=20, alpha=0.3, label="norm - no KNN")
ax1.legend()

ax2 = fig.add_subplot(2,2,2)
ax2.hist(shuf_pvals, bins=20, alpha=0.3, label="KNN shuf")
ax2.hist(pvals, bins=20, alpha=0.3, label="KNN")
ax2.legend()

# The K=60 and K=15 panels below are disabled (their inputs are only
# produced by commented-out cells).
# ax3 = fig.add_subplot(2,2,3)
# ax3.hist(shuf_pvals60, bins=20, alpha=0.3, label="KNN shuf 60")
# ax3.hist(pvals60, bins=20, alpha=0.3, label="KNN 60")
# ax3.legend()

# ax4 = fig.add_subplot(2,2,4)
# ax4.hist(shuf_pvals15, bins=20, alpha=0.3, label="KNN shuf 15")
# ax4.hist(pvals15, bins=20, alpha=0.3, label="KNN 15")
# ax4.legend()

plt.tight_layout()
plt.show()


In [None]:
# No-KNN results only: p-value histogram (left) and p-value against the
# optimised log10(sb2) per SNP (right), actual vs shuffled expression.
fig = plt.figure(figsize=(14,7), dpi=90)
ax1 = fig.add_subplot(1,2,1)
ax1.hist(shuf_npvals, bins=30, alpha=0.3, label="norm shuf - no KNN")
ax1.hist(npvals, bins=30, alpha=0.3, label="norm - no KNN")
ax1.legend()

ax2 = fig.add_subplot(1,2,2)
ax2.scatter((shuf_npvals), np.log10(shuf_nopt_sb2), s=2, alpha=0.3, label="shuf pvals sb2")
ax2.scatter((npvals), np.log10(nopt_sb2), s=2, alpha=0.3, label="pvals sb2")
ax2.set_ylabel("log10_sb2")
ax2.set_xlabel("pvalues")
ax2.legend()

plt.show()

In [None]:
# Optimised log10(sb2) per SNP: shuffled expression (x) vs actual expression (y), no KNN.
plt.scatter(np.log10(shuf_nopt_sb2), np.log10(nopt_sb2))
plt.show()

In [None]:
# Standardise each score with its analytical null mean and sd, then compare
# the resulting distribution with the standard normal density.
# (The "*scaledQ" variables hold the null standard deviation, sigmaQ.)
# - no KNN
qscaled = (nQ - nmuQ)/ nscaledQ
shuf_qscaled = (shuf_nQ - shuf_nmuQ)/ shuf_nscaledQ

# - KNN 30
qscaled_knn = (Q - muQ)/ scaledQ
shuf_qscaled_knn = (shuf_Q - shuf_muQ)/ shuf_scaledQ

# - KNN 15
# qscaled = (Q15 - muQ15)/ scaledQ15
# shuf_qscaled = (shuf_Q15 - shuf_muQ15)/ shuf_scaledQ15

# - KNN 60
# qscaled2 = (Q60 - muQ60)/ scaledQ60
# shuf_qscaled2 = (shuf_Q60 - shuf_muQ60)/ shuf_scaledQ60

fig = plt.figure(figsize=(14,7))
ax1 = fig.add_subplot(121)
ax1.hist(qscaled, bins=30, alpha=0.3, density=True, label="Actual")
ax1.hist(shuf_qscaled, bins=30, alpha=0.3, density=True, label="Shuffled")
ax1.legend()

ax2 = fig.add_subplot(122)
ax2.hist(qscaled_knn, bins=30, alpha=0.3, density=True, label="Actual KNN")
ax2.hist(shuf_qscaled_knn, bins=30, alpha=0.3, density=True, label="Shuffled KNN")
ax2.legend()

# Standard normal reference curve; it is drawn after the legend() calls
# above, so its 'analytical' label is not part of those legends.
x = np.linspace(-4, 4, 100)
rv = stats.norm(loc = 0, scale = 1)
ax1.plot(x, rv.pdf(x), label = 'analytical')
ax2.plot(x, rv.pdf(x), label = 'analytical')

plt.show()

In [None]:
# Number of SNPs whose KNN-corrected p-value is below 1e-7.
len([snp_info[x] for x in np.where(pvals < 1e-7)[0]])

In [None]:
# SnpInfo records of the SNPs whose KNN-corrected p-value is below 1e-7.
[snp_info[x] for x in np.where(pvals < 1e-7)[0]]

# Sbeta optimization

In [None]:
# Load the expression table of one tissue, match donors/genes/SNPs to the
# genotype data, then build the raw and KNN-corrected normalised matrices.
# NOTE(review): `pd` is only imported in a later cell of the visible
# notebook, and select_donors, select_genes, filter_snps, gt_donors,
# geneinfo and snpinfos are not defined in the visible part - confirm.
tissue = "wb"
df = pd.read_csv("/cbscratch/franco/trans-eqtl/new_preprocess_aug2019/gtex_v8/expression/tpms/{:s}_tpms_qcfilter.txt.protein_coding_lncRNA_filtered".format(tissue), header=0, index_col=0, sep="\t")
# Rows are genes and columns are donors (see the unpacking below).
ngene, nsample = df.shape
gx_donors = list(df.columns)
gx = df.values
gene_names = list(df.index)

vcfmask, exprmask = select_donors(gt_donors, gx_donors)
genes, indices = select_genes(geneinfo, gene_names)
dosage_masked = gtfull[:, vcfmask]

snp_info, dosage = filter_snps(snpinfos, dosage_masked)

gx_norm = normalize_expr( gx[:, exprmask] )

## --- Apply KNN correction
f=1
gx_knn, gt_knn = knn_correction(gx_norm.T, dosage, K=30, f=f)

## --- Normalize GX and GT
gt_norm, gt_cent = normalize_and_center_dosage(dosage, snp_info)
sigmax2     = np.var(gt_cent, axis = 1)

gx_knn_norm = normalize_expr(gx_knn.T)
gt_knn_norm, gt_knn_cent = normalize_and_center_dosage(gt_knn, snp_info)
sigmax2_knn = np.var(gt_knn_cent, axis = 1)



In [None]:
# Evaluate a marginal-likelihood expression and its derivative terms for a
# single SNP over a grid of sigma_beta^2 values (first cismask only).
GT = gt_knn_cent
GX = gx_knn_norm
Nsample = GX.shape[1]

cismask = cismaskcomp[0]
print("Processing cismask {:d}: {:d} genes masked for {:d} snps".format(0, len(cismask.rmv_id), cismask.nsnp))
usegenes = np.ones(GX.shape[0], dtype=bool)
if cismask.rmv_id.shape[0] > 0: usegenes[cismask.rmv_id] = False
myGX = GX[usegenes] #/ np.sqrt(Nsample)
myGT = GT[cismask.apply2,:]
# Variances subset the same way as myGT, so that row j of myGT and entry j
# of mysx2 refer to the same SNP whatever cismask.apply2 selects.
mysx2 = sigmax2_knn[cismask.apply2]

Yt = myGX.T
U, S, Vt = np.linalg.svd(Yt, full_matrices=False)
S2  = np.square(S)
YTY = np.matmul(myGX, Yt)

# Gene number
G = myGX.shape[0]

j = 0
sx2_j = mysx2[j]
print("sx2=",sx2_j)
SIGMA_BETAS2 = np.linspace(0.02, 0.034, 10)
# SIGMA_BETAS2 = np.linspace(0.00001, 0.0001, 20)

# Squared projection of the SNP onto the left singular vectors; it does not
# depend on sb2, so it is computed once outside the loop.
proj2 = np.square(np.dot(U.T, myGT[j,:]))

gridsize   = SIGMA_BETAS2.shape[0]
betax_term = np.zeros(gridsize)
der        = np.zeros(gridsize)
Keff_term  = np.zeros(gridsize)
ML_sb2     = np.zeros(gridsize)
Rscore     = np.zeros(gridsize)
for i,sb2 in enumerate(SIGMA_BETAS2):
    S2mod = S2 + (sx2_j / sb2)
    shrink = S2 / S2mod

    Rscore[i] = np.sum(proj2 * shrink) / sx2_j

    Ig = np.identity(G) * (sx2_j / sb2)

    # log det(A) from the Cholesky factor A = L L^T is 2 * sum(log(diag(L))).
    # The previous 2 * log(trace(L)) took the log of the SUM of the diagonal,
    # which is not the log-determinant.
    L_YTY_Ig = np.linalg.cholesky(YTY + Ig)
    det_YTY_Ig = 2*np.sum(np.log(np.diag(L_YTY_Ig)))
    log_a = -(G/2)*np.log(2*np.pi*sb2) - 0.5*det_YTY_Ig
#     log_a = (-G)*np.log(sb2) + G*np.log(sigmax2_knn[j]) - 0.5*YTY_Ig

    # NOTE(review): Rscore is already divided by sx2 above and is divided by
    # it again here, whereas the closed-form cell further down uses
    # 0.5 * Rscore - confirm which expression is intended.
    log_b = Rscore[i] / (2*sx2_j)
    ML_sb2[i] = log_a + log_b
    print(ML_sb2[i])

    Keff_term[i] = np.sum(shrink)
    betax_term[i] = np.sum(proj2 * shrink) / sb2
    der[i]    = ( -Keff_term[i] + betax_term[i] ) / ( 2 * sb2 )

In [None]:
# Recompute the last Keff grid entry from the loop variables (i, sb2) left
# over by the previous cell.
# NOTE(review): sigmax2_knn is indexed with j directly although myGT was
# subset with cismask.apply2 - the two only agree if apply2 starts at SNP 0.
S2mod = S2 + (sigmax2_knn[j] / sb2)
Keff_term[i] = np.sum(S2/S2mod)

In [None]:
# Closed-form log marginal likelihood and its analytical derivative for a
# single SNP over a grid of sigma_beta^2 values (first cismask only).
GT = gt_knn_cent
GX = gx_knn_norm
Nsample = GX.shape[1]

cismask = cismaskcomp[0]
print("Processing cismask {:d}: {:d} genes masked for {:d} snps".format(0, len(cismask.rmv_id), cismask.nsnp))
usegenes = np.ones(GX.shape[0], dtype=bool)
if cismask.rmv_id.shape[0] > 0: usegenes[cismask.rmv_id] = False
myGX = GX[usegenes] #/ np.sqrt(Nsample)
myGT = GT[cismask.apply2,:]
# Variances subset the same way as myGT, so that row j of myGT and entry j
# of mysx2 refer to the same SNP (this is also what SBoptimizer is given).
mysx2 = sigmax2_knn[cismask.apply2]

Yt = myGX.T
U, S, Vt = np.linalg.svd(Yt, full_matrices=False)
S2  = np.square(S)
YTY = np.matmul(myGX, Yt)

# Gene number
G = myGX.shape[0]

j = 0
sx2_j = mysx2[j]
print("sx2=",sx2_j)
SIGMA_BETAS2 = np.linspace(1.47e-07, 1.48e-7, 100)
# SIGMA_BETAS2 = np.linspace(0.027, 0.036, 10)
# SIGMA_BETAS2 = np.linspace(1, 2*Nsample, 100)

# Squared projection of the SNP onto the left singular vectors; independent
# of sb2, so computed once.
proj2 = np.square(np.dot(U.T, myGT[j,:]))

gridsize   = SIGMA_BETAS2.shape[0]
# betax_term, der_old and Keff_term stay all-zero here: the code that filled
# them is commented out at the end of the loop.
betax_term = np.zeros(gridsize)
der        = np.zeros(gridsize)
der_old    = np.zeros(gridsize)
Keff_term  = np.zeros(gridsize)
ML_sb2     = np.zeros(gridsize)
Rscore     = np.zeros(gridsize)
I = np.identity(Nsample)
for i,sb2 in enumerate(SIGMA_BETAS2):
    S2mod = S2 + (sx2_j / sb2)
    Rscore[i] = np.sum(proj2 * (S2 / S2mod)) / sx2_j
    ML_sb2[i] = -0.5*np.sum(np.log( S2 * (sb2 / sx2_j) + 1 )) + 0.5*Rscore[i]

#     # Gradient check (finite difference)
#     delta = 0.000001
#     newsb2 = sb2 + delta
#     newS2mod = S2 + (sx2_j / newsb2)
#     newR = np.sum(proj2 * (S2 / newS2mod)) / sx2_j
#     newM = -0.5*np.sum(np.log( S2 * (newsb2 / sx2_j) + 1 )) + 0.5*newR
#     M_grad = (newM - ML_sb2[i]) / delta
#     print(M_grad)

    # d(ML)/d(sb2)
    denom = (S2 * sb2 + sx2_j)
    der[i] = 0.5* np.sum( ( S2 / denom ) * ( (proj2 / denom ) - 1 ) )

#     ## -- der_old below is equal to der above, good! it matches!
#     Keff_term[i]  = np.sum(S2/S2mod)
#     ## alternative check for beta term
#     innerLinv_ST = (S*I / (S2*I + (sx2_j/sb2))) * I
#     UT = np.transpose(U)
#     V = np.transpose(Vt)
#     innerLinv_STUT = np.matmul(innerLinv_ST, UT)
#     A2 = np.matmul(V, innerLinv_STUT)
#     Bi = np.matmul(A2, myGT[j,:][np.newaxis].T)
#     B2 = np.dot(Bi.T, Bi)

#     betax_term[i] = B2 / sb2
    # der_old[i]    = ( -Keff_term[i] + betax_term[i] ) / ( 2 * sb2 )

In [None]:
from scipy.optimize import minimize


# Inputs for the optimiser below: KNN-corrected data restricted to the first
# cismask (masked genes removed, only the SNPs the mask applies to).
GT = gt_knn_cent
GX = gx_knn_norm
Nsample = GX.shape[1]

cismask = cismaskcomp[0]
print("Processing cismask {:d}: {:d} genes masked for {:d} snps".format(0, len(cismask.rmv_id), cismask.nsnp))
usegenes = np.ones(GX.shape[0], dtype=bool)
if cismask.rmv_id.shape[0] > 0: usegenes[cismask.rmv_id] = False
myGX = GX[usegenes] #/ np.sqrt(Nsample)
myGT = GT[cismask.apply2,:]


class SBoptimizer:
    """Per-SNP maximum-likelihood estimate of sigma_beta^2 (sb2).

    NOTE(review): this class is defined twice in the notebook; this later
    definition shadows the earlier one and additionally prints every
    optimisation result.

    For every SNP (row of GT) the log marginal likelihood

        -0.5 * sum(log(S2 * sb2 / sx2 + 1)) + 0.5 * Rscore(sb2)

    is maximised over log(sb2) with L-BFGS-B, where S2 are the squared
    singular values of GX.T and sx2 is the SNP's genotype variance.

    Parameters
    ----------
    GT : array (nsnps, nsample), genotype rows
    GX : array (ngenes, nsample), expression
    sx2 : array (nsnps,), genotype variance per row of GT
    """

    def __init__(self, GT, GX, sx2):

        self._GT  = np.ascontiguousarray(GT)
        self._GX  = np.ascontiguousarray(GX)
        self._sx2 = np.ascontiguousarray(sx2)
        self._nsnps = GT.shape[0]
        self._nsample = GX.shape[1]

        # Thin SVD: the right singular vectors are never used, and the full
        # decomposition would build a (ngenes x ngenes) matrix. It also keeps
        # U's columns matched to S when there are fewer genes than samples.
        U, S, _ = np.linalg.svd(GX.T, full_matrices=False)
        self._S = S
        self._U = U
        self._S2 = np.square(S)
        self._opt_sb2 = np.zeros(self._nsnps)

    @property
    def sb2(self):
        """Optimised sb2 per SNP (all zeros until fit() has been called)."""
        return self._opt_sb2

    def get_ML(self, _sb2, i):
        """Negative log marginal likelihood and its gradient for SNP `i`.

        `_sb2` is log(sb2); the gradient is returned with respect to that
        log-parameter (chain rule: d/dlog(sb2) = sb2 * d/dsb2).
        """
        sb2 = np.exp(_sb2)
        sx2 = self._sx2[i]
        S2 = self._S2
        # Squared projection of the genotype onto the left singular vectors.
        proj2 = np.square(np.dot(self._U.T, self._GT[i,:]))

        S2mod = S2 + (sx2 / sb2)
        Rscore = np.sum(proj2 * (S2 / S2mod)) / sx2
        MLL = -0.5*np.sum(np.log( S2 * (sb2 / sx2) + 1 )) + 0.5*Rscore

        denom = (S2 * sb2 + sx2)
        der = 0.5* np.sum( ( S2 / denom ) * ( (proj2 / denom ) - 1 ) )
        return -MLL, sb2*np.array([-der])

    def fit(self):
        """Optimise sb2 independently for every SNP, printing each result."""
        st = time.time()

        # Starting point in log-space, i.e. initial sb2 = exp(exp(0.01)).
        sb_init = np.exp(0.01)
        for i in range(self._nsnps):
            # The solver's `disp` option is no longer passed: it is
            # deprecated in recent SciPy and print(res) below already
            # reports the outcome.
            res = minimize(   self.get_ML,
                              sb_init, 
                              args = (i,),
                              method='L-BFGS-B',
                              jac = True,
                              #bounds = [[0,1]],
                              options={'maxiter': 200000,
                                       'maxfun': 2000000,
                                       #'ftol': 1e-9,
                                       #'gtol': 1e-9,
                                       })

            print(res)
            self._opt_sb2[i] = np.exp(res.x[0])
        et = time.time()
        print("optimization took in total: ",et-st)
        
# GT was subsetted above
# The variances are subset with the same cismask.apply2 index so that they
# line up with the rows of myGT.
SBOPT = SBoptimizer(myGT, myGX, sigmax2_knn[cismask.apply2])
        
# Yt = myGX.T
# U, S, Vt = np.linalg.svd(Yt, full_matrices=False)
# S2  = np.square(S)

# def get_ML(sb2, S, S2, U, GT, sx2, i):
#     S2mod = S2 + (sx2[i] / sb2)
#     Rscore = np.sum(np.square(np.dot(U.T, GT[i,:])) * (S2 / S2mod)) / sx2[i]
#     MLL = -0.5*np.sum(np.log( S2 * (sb2 / sx2[i]) + 1 )) + 0.5*Rscore
            
#     denom = (S2 * sb2 + sx2[i])
#     der = 0.5* np.sum( ( S2 / denom ) * ( (np.square(np.dot(U.T, GT[i,:])) / denom ) - 1 ) )
#     return -MLL, -der

# lml_min = optimize.minimize(self._log_marginal_likelihood,
#                                         scaledparams,
#                                         method='L-BFGS-B',
#                                         jac=True,
#                                         bounds=bounds,
#                                         callback=self._callback_zstates,
#                                         options={'maxiter': 200000,
#                                                  'maxfun': 2000000,
#                                                  'ftol': 1e-9,
#                                                  'gtol': 1e-9,
#                                                  'disp': True})

In [None]:
# Run the per-SNP optimisation (one L-BFGS-B run per row of myGT).
SBOPT.fit()

In [None]:
# Optimised sigma_beta^2, one value per SNP.
SBOPT.sb2

In [None]:
# Distribution of the optimised sigma_beta (square root of sb2).
plt.hist(np.sqrt(SBOPT.sb2), bins=20)
plt.show()

In [None]:
# Plot the grid results of whichever grid cell ran last: log-likelihood and
# its derivative (left), Keff and beta terms (right).
# NOTE(review): the closed-form grid cell leaves Keff_term and betax_term at
# zero (the code that fills them is commented out there), so the right panel
# is only informative after the Cholesky-based grid cell - confirm order.
fig = plt.figure(figsize=(14,8))
ax1  = fig.add_subplot(121)
ax2  = fig.add_subplot(122)

# The grid holds sigma_beta^2 values despite the un-squared variable name.
SIGMA_BETAS = SIGMA_BETAS2
ax1.scatter(SIGMA_BETAS,  ML_sb2, label="log_ML", c="blue")
ax1.scatter(SIGMA_BETAS,  der, label="log_derivative", c='red')
# Zero line: where the derivative crosses it, the likelihood is stationary.
ax1.axhline(0, ls="--", c="red")
ax1.set_xlabel(r"$\sigma_{\beta}^2$")
ax1.set_xlim(min(SIGMA_BETAS), max(SIGMA_BETAS))
# ax1.set_xlim(0.025,0.03)
# ax1.set_ylim(0,10)

ax2.scatter(SIGMA_BETAS, Keff_term, label=r"$K_{eff}$")
ax2.scatter(SIGMA_BETAS, betax_term, label=r"$ \beta $")
# Horizontal reference at the number of samples.
ax2.axhline(GX.shape[1], c="red")
ax2.set_xlabel(r"$\sigma_{\beta}^2$")
ax2.set_xlim(min(SIGMA_BETAS), max(SIGMA_BETAS))
ax2.legend()
ax2.set_ylim(0,1000)

ax1.legend()
plt.show()

In [None]:
# Raw grid values: derivative and log marginal likelihood.
print(der)
print(ML_sb2)

# Singular values of some tissues

In [None]:
# Singular-value spectra of the KNN-corrected, normalised expression for a
# selection of tissues, all drawn on one axis.
# NOTE(review): `pd` is only imported in a later cell of the visible
# notebook, and select_donors, select_genes, filter_snps, gt_donors,
# geneinfo and snpinfos are not defined in the visible part - confirm.
select_tissues = ["as", "ms", "sse", "ag", "wb"]

fig = plt.figure(figsize=(14,10))
ax1 = fig.add_subplot(111)

for tissue in select_tissues:
    # df = pd.read_csv("/cbscratch/franco/trans-eqtl/new_preprocess_aug2019/gtex_v8/expression/tpms/{:s}_tpms_qcfilter.txt.protein_coding_lncRNA_filtered".format(tissue), header=0, index_col=0, sep="\t")
    df = pd.read_csv("/cbscratch/franco/trans-eqtl/new_preprocess_aug2019/gtex_v8/expression/tpms/{:s}_tpms_cclm.txt.protein_coding_lncRNA_filtered".format(tissue), header=0, index_col=0, sep="\t")
    ngene, nsample = df.shape
    gx_donors = list(df.columns)
    gx = df.values
    gene_names = list(df.index)
    
    vcfmask, exprmask = select_donors(gt_donors, gx_donors)
    genes, indices = select_genes(geneinfo, gene_names)
    dosage_masked = gtfull[:, vcfmask]

    snp_info, dosage = filter_snps(snpinfos, dosage_masked)

    gx_norm = normalize_expr( gx[:, exprmask] )

    ## --- Apply KNN correction
    f=1
    gx_knn, gt_knn = knn_correction(gx_norm.T, dosage, K=30, f=f)

    ## --- Normalize GX and GT
    gt_norm, gt_cent = normalize_and_center_dosage(dosage, snp_info)
    sigmax2     = np.var(gt_cent, axis = 1)

    gx_knn_norm = normalize_expr(gx_knn.T)
    gt_knn_norm, gt_knn_cent = normalize_and_center_dosage(gt_knn, snp_info)
    sigmax2_knn = np.var(gt_knn_cent, axis = 1)
    
    Y = gx_knn_norm # / np.sqrt(nsample)
    Yt = Y.T
    U, S, Vt = np.linalg.svd(Yt, full_matrices=False)
    # The last singular value is left out of the plot.
    ax1.scatter(np.arange(len(S)-1), S[:-1], label=tissue, alpha=0.4)
    
ax1.legend()
plt.show()

In [None]:
## Without KNN
# Same spectra as the previous cell, but for the normalised expression
# without KNN correction.
# NOTE(review): relies on the same names not defined in the visible notebook
# (pd imported later, select_donors, select_genes, filter_snps, gt_donors,
# geneinfo, snpinfos) - confirm.
select_tissues = ["as", "ms", "sse", "ag", "wb"]

fig = plt.figure(figsize=(14,10))
ax1 = fig.add_subplot(111)

for tissue in select_tissues:
    # df = pd.read_csv("/cbscratch/franco/trans-eqtl/new_preprocess_aug2019/gtex_v8/expression/tpms/{:s}_tpms_qcfilter.txt.protein_coding_lncRNA_filtered".format(tissue), header=0, index_col=0, sep="\t")
    df = pd.read_csv("/cbscratch/franco/trans-eqtl/new_preprocess_aug2019/gtex_v8/expression/tpms/{:s}_tpms_cclm.txt.protein_coding_lncRNA_filtered".format(tissue), header=0, index_col=0, sep="\t")
    ngene, nsample = df.shape
    gx_donors = list(df.columns)
    gx = df.values
    gene_names = list(df.index)
    
    vcfmask, exprmask = select_donors(gt_donors, gx_donors)
    genes, indices = select_genes(geneinfo, gene_names)
    dosage_masked = gtfull[:, vcfmask]

    snp_info, dosage = filter_snps(snpinfos, dosage_masked)

    gx_norm = normalize_expr( gx[:, exprmask] )

    ## --- Normalize GX and GT
    gt_norm, gt_cent = normalize_and_center_dosage(dosage, snp_info)
    sigmax2     = np.var(gt_cent, axis = 1)
   
    Y = gx_norm # / np.sqrt(nsample)
    Yt = Y.T
    U, S, Vt = np.linalg.svd(Yt, full_matrices=False)
    # The last singular value is left out of the plot.
    ax1.scatter(np.arange(len(S)-1), S[:-1], label=tissue, alpha=0.4)
    
ax1.legend()
plt.show()

In [None]:
import pandas as pd

## What's the usual Keff in other tissues?
# Full tissue panel would be ["as", "ms", "sse", "ag", "wb"]; only two are used here.
select_tissues = ["ms", "wb"]

# Two side-by-side panels sharing one figure.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14,10))

def get_contribs(tissue, gx_norm, dosage, snp_info, ax1, ax2, keff_target=0.5, K=30, f=1):
    """Plot and return the per-component contributions S^2 / (S^2 + sigma_x^2 / sigma_beta^2).

    The expression and genotype are first KNN-corrected, then re-normalized.
    The singular values ``S`` of the transposed corrected expression matrix are
    combined with the genotype variance and the optimized ``sb2`` of the FIRST
    SNP only (index 0) to obtain one contribution per singular component.

    Parameters
    ----------
    tissue : str
        Label used in the printed summary and in the legend entries.
    gx_norm : array-like
        Normalized expression; it is transposed before being handed to
        ``knn_correction``. Callers in this notebook pass genes x samples.
    dosage : array-like
        Genotype dosage passed to ``knn_correction``.
    snp_info : sequence
        SNP records (with a ``maf`` attribute) matching the rows of ``dosage``.
    ax1, ax2 : matplotlib axes
        ``ax1`` receives contribution vs. component index; ``ax2`` receives the
        cumulative contribution divided by N vs. component index divided by N.
    keff_target : float
        Target forwarded to ``optimize_sb2``.
    K : int
        Number of neighbours forwarded to ``knn_correction`` (formatted with
        ``{:d}`` in the legend, so it must be an integer).
    f : number
        ``f`` argument forwarded to ``knn_correction``.

    Returns
    -------
    numpy.ndarray
        Contribution of every singular component (same length as ``S``).
    """
    ## --- Apply KNN correction
    gx_knn, gt_knn = knn_correction(gx_norm.T, dosage, K=K, f=f)

    ## --- Normalize the KNN-corrected GX and GT
    # Only the corrected quantities are used below; the uncorrected genotype
    # normalization that used to be computed here was never read.
    gx_knn_norm = normalize_expr(gx_knn.T)
    _, gt_knn_cent = normalize_and_center_dosage(gt_knn, snp_info)
    sigmax2_knn = np.var(gt_knn_cent, axis = 1)

    # Only the singular values are needed.
    _, S, _ = np.linalg.svd(gx_knn_norm.T, full_matrices=False)

    # presumably one sb2 value per SNP — TODO confirm against optimize_sb2
    sb2 = optimize_sb2(S, sigmax2_knn, keff_target)
    S2 = np.square(S)
    # NOTE(review): only the first SNP (index 0) defines the shrinkage term.
    S2mod = S2 + (sigmax2_knn[0] / sb2[0])
    contribs = S2 / S2mod

    N = len(S2)
    L = int(1 * N)  # number of leading components summed; currently all of them
    total = np.sum(contribs[:L])
    print(tissue, total, total / N)

    label = "{:s}_{:d}_{:f}_{:f}".format(tissue, K, keff_target, f)
    component = np.arange(len(S))
    ax1.scatter(component, contribs, s=3, label=label, alpha=0.4)
    ax2.scatter(component / N, np.cumsum(contribs) / N, s=3, label=label, alpha=0.4)
    return contribs

# GTEx tissues: genotype globals (gt_donors, gtfull, snpinfos) come from earlier cells.
for tissue in select_tissues:
    df = pd.read_csv(
        "/cbscratch/franco/trans-eqtl/new_preprocess_aug2019/gtex_v8/expression/tpms/{:s}_tpms_qcfilter.txt.protein_coding_lncRNA_filtered".format(tissue),
        sep="\t",
        header=0,
        index_col=0,
    )
    gx = df.values
    ngene, nsample = df.shape
    gene_names = list(df.index)
    gx_donors = list(df.columns)

    vcfmask, exprmask = select_donors(gt_donors, gx_donors)
    genes, indices = select_genes(geneinfo, gene_names)
    dosage_masked = gtfull[:, vcfmask]
    snp_info, dosage = filter_snps(snpinfos, dosage_masked)

    gx_norm = normalize_expr(gx[:, exprmask])
    get_contribs(tissue, gx_norm, dosage, snp_info, ax1, ax2, K=30, f=1, keff_target=0.7)
    # Alternative setting kept for reference:
    # get_contribs(tissue, gx_norm, dosage, snp_info, ax1, ax2, keff_target=0.8,  K=30, f=0.1)

## Add Framingham
df = pd.read_csv(
    "/cbscratch/franco/datasets/FHS/expression/fhs.formatted.expr.one_pedno_rep.txt",
    sep="\t",
    header=0,
    index_col=0,
)
gx = df.values
ngene, nsample = df.shape
gene_names = list(df.index)
gx_donors = list(df.columns)
print("Loaded FHS ", gx.shape)

# Use real genotype
chrm = 12
f_vcf = "/cbscratch/franco/datasets/FHS/genotypes/vcfs/chr{:d}.fhs.vcf.gz".format(chrm)
Fgtfull, Fsnpinfos, Fgt_donors = read_vcf(f_vcf, 0, 2000)

# Same steps as in the GTEx loop, using the FHS genotype objects.
vcfmask, exprmask = select_donors(Fgt_donors, gx_donors)
genes, indices = select_genes(geneinfo, gene_names)
dosage_masked = Fgtfull[:, vcfmask]
snp_info, dosage = filter_snps(Fsnpinfos, dosage_masked)

gx_norm = normalize_expr(gx[:, exprmask])
get_contribs("fhs", gx_norm, dosage, snp_info, ax1, ax2, K=30, f=1, keff_target=0.7)

ax1.legend()
ax2.legend()
plt.show()

In [None]:
## What's the usual Keff in other tissues?
select_tissues = ["as", "ms", "sse", "ag", "wb"]

# Left panel: contributions; right panel: their log10.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14,10))

for tissue in select_tissues:
    df = pd.read_csv(
        "/cbscratch/franco/trans-eqtl/new_preprocess_aug2019/gtex_v8/expression/tpms/{:s}_tpms_qcfilter.txt.protein_coding_lncRNA_filtered".format(tissue),
        sep="\t",
        header=0,
        index_col=0,
    )
    gx = df.values
    ngene, nsample = df.shape
    gene_names = list(df.index)
    gx_donors = list(df.columns)

    # Genotype globals (gt_donors, gtfull, snpinfos, geneinfo) come from earlier cells.
    vcfmask, exprmask = select_donors(gt_donors, gx_donors)
    genes, indices = select_genes(geneinfo, gene_names)
    dosage_masked = gtfull[:, vcfmask]
    snp_info, dosage = filter_snps(snpinfos, dosage_masked)

    gx_norm = normalize_expr(gx[:, exprmask])

    ## --- Apply KNN correction
    f = 1
    gx_knn, gt_knn = knn_correction(gx_norm.T, dosage, K=30, f=f)

    ## --- Normalize GX and GT
    gt_norm, gt_cent = normalize_and_center_dosage(dosage, snp_info)
    sigmax2 = np.var(gt_cent, axis=1)

    gx_knn_norm = normalize_expr(gx_knn.T)
    gt_knn_norm, gt_knn_cent = normalize_and_center_dosage(gt_knn, snp_info)
    sigmax2_knn = np.var(gt_knn_cent, axis=1)

    Y = gx_knn_norm # / np.sqrt(nsample)
    Yt = Y.T
    U, S, Vt = np.linalg.svd(Yt, full_matrices=False)

    # Squared singular values, reused for the summary statistics and the contributions.
    S2 = np.square(S)
    s2_median = np.median(S2)
    s2_10 = np.percentile(S2, 60)  # note: this is the 60th percentile despite the name

    # Candidate sigma_beta vectors: a constant one and two derived from S2 statistics.
    SIGMA_BETAS = np.full(gt_cent.shape[0], 0.1)
    DY_SIGMA_BETAS = np.sqrt(sigmax2 / s2_median)
    DY_SIGMA_BETAS10 = np.sqrt(sigmax2 / s2_10)
    avg_sb_m = np.mean(DY_SIGMA_BETAS)
    avg_sb_10 = np.mean(DY_SIGMA_BETAS10)

    # Contributions use the first SNP only (index 0 of sigmax2_knn and sb2).
    sb2 = np.square(DY_SIGMA_BETAS10)
    S2mod = S2 + (sigmax2_knn[0] / sb2[0])
    contribs = S2 / S2mod

    # Summarize the leading 10% of components.
    L = int(0.1 * len(S2))
    print(tissue, np.sum(contribs[:L]), np.sum(contribs[:L]) / len(S2))
    # The last component is left out of both panels.
    ax1.scatter(np.arange(len(S) - 1), contribs[:-1], alpha=0.4, label=tissue)
    ax2.scatter(np.arange(len(S) - 1), np.log10(contribs[:-1]), alpha=0.4, label=tissue)

ax1.legend()
ax2.legend()
plt.show()

# Quick and dirty DHS enrichments

In [None]:
# Collect, per tissue, one DHS enrichment value per p-value cutoff.
# dhs_dict[tissue][k] is meant to correspond to pcutoffs[k].
dhs_dict = collections.defaultdict(list)
pcutoffs = [5e-08, 1e-10, 1e-15, 1e-20, 1e-25, 1e-30, 1e-35]
title = "multi_tissue"
for cutoff_idx, pcutoff in enumerate(pcutoffs):
    dhsfile = "/cbscratch/franco/trans-eqtl/dev-pipeline/gtex_v8_opt_sb2_EUR/raw/summary_5e-08/dhs_enrichments/dhs_enrichment_{:s}_{:g}.txt".format(title, pcutoff)
    with open(dhsfile) as instream:
        # Skip the first line (presumably a header); tolerate an empty file.
        next(instream, None)
        for line in instream:
            if not line.strip():
                continue  # ignore blank lines instead of failing on arr[5]
            arr = line.strip().split("\t")
            tissue = arr[0]
            enrichment = float(arr[5])  # sixth tab-separated column
            tissue_values = dhs_dict[tissue]
            # If this tissue was absent from earlier cutoff files, fill those
            # slots with 0.0 so this value lands at index cutoff_idx instead of
            # being shifted to a less stringent cutoff.
            tissue_values.extend([0.0] * (cutoff_idx - len(tissue_values)))
            tissue_values.append(enrichment)
            

In [None]:
# dhs_dict = collections.defaultdict(list)
# keffs = ["0.5", "0.6", "0.7", "0.8"]
# for keff in keffs:
#     dhsfile = "/cbscratch/franco/trans-eqtl/dev-pipeline/gtex_v8_dysb/summary_5e-08_k{:s}/dhs_enrichments/dhs_enrichment_multi_tissue_keff{:s}.txt".format(keff, keff)
#     with open(dhsfile) as instream:
#         next(instream)
#         for line in instream:
#             arr = line.strip().split("\t")
#             tissue = arr[0]
#             enrichment = float(arr[5])
#             dhs_dict[tissue].append(enrichment)
            

In [None]:
# DHS enrichment vs. log10 p-value cutoff, one line per tissue.
# NOTE(review): the title says "ALL SAMPLES", but the loader cell in this
# notebook reads from a "gtex_v8_opt_sb2_EUR" path — confirm which run
# dhs_dict holds when this cell is executed.
fig = plt.figure(figsize=(14,8))
ax  = fig.add_subplot(111)
log_pcutoffs = np.log10(pcutoffs)
for t in dhs_dict:
    n_missing = len(pcutoffs) - len(dhs_dict[t])
    if n_missing < 0:
        # The previous `while len(...) != len(pcutoffs)` loop never terminated here.
        raise ValueError(
            "tissue {!r} has {:d} enrichment values but only {:d} p-value cutoffs".format(
                t, len(dhs_dict[t]), len(pcutoffs)))
    if n_missing > 0:
        # Pad the missing trailing cutoffs with zero enrichment in one step;
        # the padded array is stored back, as before.
        dhs_dict[t] = np.append(dhs_dict[t], np.zeros(n_missing))
    ax.plot(log_pcutoffs, dhs_dict[t], label=t)
ax.set_title("ALL SAMPLES")
ax.set_ylabel("enrichment")
ax.set_xlabel("log p-val")
ax.legend()
plt.show()

In [None]:
# DHS enrichment vs. log10 p-value cutoff, one line per tissue (EUR-only title).
fig = plt.figure(figsize=(14,8))
ax  = fig.add_subplot(111)
log_pcutoffs = np.log10(pcutoffs)
for t in dhs_dict:
    n_missing = len(pcutoffs) - len(dhs_dict[t])
    if n_missing < 0:
        # The previous `while len(...) != len(pcutoffs)` loop never terminated here.
        raise ValueError(
            "tissue {!r} has {:d} enrichment values but only {:d} p-value cutoffs".format(
                t, len(dhs_dict[t]), len(pcutoffs)))
    if n_missing > 0:
        # Pad the missing trailing cutoffs with zero enrichment in one step;
        # the padded array is stored back, as before.
        dhs_dict[t] = np.append(dhs_dict[t], np.zeros(n_missing))
    ax.plot(log_pcutoffs, dhs_dict[t], label=t)
ax.set_title("ONLY EUR")
ax.set_ylabel("enrichment")
ax.set_xlabel("log p-val")
ax.legend()
plt.show()