In [None]:
import sys
from os import chdir, remove, mkdir
import multiprocessing
from os.path import basename, join, splitext
from glob import glob
import subprocess

import numpy as np
import pandas as pd
import pylab as pl

from matplotlib_venn import venn3, venn2, venn2_circles
from sklearn import linear_model
from scipy import stats
from progressbar import ProgressBar

from pygero.utils.mputils import multiproc_pbar

# chdir('')  # working-directory switch, left disabled
# Default number of worker processes: one per CPU reported by the OS.
n_jobs = multiprocessing.cpu_count()

In [None]:
import multiprocessing as MP
import itertools
import time
import progressbar

def multiproc_pbar(fun, args, mask, kwargs=None, n_jobs=None, progress=True):
    """Run ``fun`` over many argument tuples in a process pool, with a progress bar.

    NOTE: this definition shadows the ``multiproc_pbar`` imported from
    ``pygero.utils.mputils`` in the notebook's import cell.

    Parameters
    ----------
    fun : callable
        Function shipped to the workers as the first element of every task tuple.
    args : iterable
        Positional arguments. ``args[i]`` is expanded (one element per task)
        when ``mask[i]`` is truthy; otherwise the same object is repeated for
        every task.
    mask : sequence of bool
        One flag per entry of ``args`` (indexed as ``mask[i]``).
    kwargs : dict, optional
        Sent unchanged with every task. ``None`` (the default) means ``{}``.
    n_jobs : int, optional
        Forwarded to ``multiprocessing.Pool(processes=...)``.
    progress : bool
        When true, a ``progressbar.ProgressBar`` is updated while waiting.

    Returns
    -------
    list
        ``result.get()`` of the ``map_async`` call (one entry per task).

    Raises
    ------
    ValueError
        If no argument is marked expandable; the number of tasks would be
        unbounded in that case.
    TypeError
        If the expandable arguments do not all have the same length.
    """
    if kwargs is None:
        kwargs = {}

    # Materialise every expandable argument so lengths can be compared
    # directly. Checking for leftovers after zipping is unreliable: zip has
    # already consumed one extra element from earlier iterators by the time a
    # later one runs out.
    slots = []  # (is_expandable, payload) per positional argument
    for i, elem in enumerate(args):
        if mask[i]:
            slots.append((True, list(elem)))
        else:
            slots.append((False, elem))

    lengths = set(len(payload) for expandable, payload in slots if expandable)
    if not lengths:
        raise ValueError("at least one argument must be marked expandable in mask")
    if len(lengths) > 1:
        raise TypeError("expandable arguments must be of the same length")
    n_tasks = lengths.pop()

    # Each task is the tuple (fun, kwargs, arg_0, arg_1, ...).
    prms = [
        (fun, kwargs) + tuple(
            payload[k] if expandable else payload for expandable, payload in slots
        )
        for k in range(n_tasks)
    ]

    pool = MP.Pool(processes=n_jobs)
    try:
        # `_func_star_many` is not defined in this cell; it has to be a
        # module-level callable that accepts one task tuple.
        # NOTE(review): presumably it calls fun(*args, **kwargs) -- confirm
        # where it is defined.
        result = pool.map_async(_func_star_many, prms, chunksize=1)
        if progress:
            pbar = progressbar.ProgressBar(maxval=n_tasks).start()
        while not result.ready():
            if progress:
                # chunksize=1, so the number of chunks left equals tasks left.
                pbar.update(n_tasks - result._number_left)
            # Wake up at most once per second, but return as soon as the
            # work is finished instead of always sleeping the full second.
            result.wait(1)
    except BaseException:
        # Includes KeyboardInterrupt: stop the workers before propagating.
        pool.terminate()
        pool.join()
        raise
    pool.close()
    pool.join()
    if progress:
        pbar.finish()
    return result.get()

In [None]:
def metaanalysis(slope, seslopes):
    """Pool effect estimates row by row using inverse-variance weights.

    Parameters
    ----------
    slope : pandas.DataFrame
        Effect estimates; values are combined across columns (``sum(1)``).
        NOTE(review): presumably one column per study/cohort -- confirm.
    seslopes : pandas.DataFrame
        Standard errors matching ``slope``.

    Returns
    -------
    pandas.DataFrame
        Columns ``beta`` (weighted mean), ``sigma`` (its standard error) and
        ``p`` (``pval`` applied to ``beta / sigma``).
    """
    inv_var = seslopes**-2
    pooled_beta = (slope*inv_var).sum(1)/inv_var.sum(1)
    pooled_sigma = inv_var.sum(1)**-.5
    p_values = (pooled_beta/pooled_sigma).apply(pval)
    return pd.concat(
        [pooled_beta, pooled_sigma, p_values],
        axis=1,
        keys=['beta', 'sigma', 'p'],
    )

In [None]:
def LH_both(X, x, u, t1, e):
    """Negative log-likelihood used as the objective in ``optimize_cox_gomperz``.

    Parameters
    ----------
    X : sequence of float
        Parameter vector ``[M0, g, beta_0, beta_1, ...]``. ``M0`` enters the
        likelihood through ``exp(M0)``, ``g`` multiplies time, and the
        remaining entries are the covariate coefficients.
    x : array-like, 2-D
        Covariate matrix; ``x.dot(beta)`` gives the linear predictor ``z``.
    u : array-like
        Time increment added to ``t1`` (the caller passes ``t2 - t1``).
    t1 : array-like
        Start time of each observation.
    e : array-like
        Per-observation weight of the log-hazard term (the caller passes the
        ``event`` column).

    Returns
    -------
    float
        ``-logLikelihood``, suitable for a minimiser.
    """
    M0, g, beta = X[0], X[1], X[2:]

    gt1 = g * t1
    gt2 = g * (t1 + u)
    z = np.dot(x, np.array(beta).reshape((-1, 1))).flatten()

    # Minus the cumulative hazard accumulated between t1 and t1 + u ...
    logLikelihood = np.sum(np.exp(M0)/g * np.exp(z) * (np.exp(gt1) - np.exp(gt2)))
    # ... plus the log-hazard at t1 + u, weighted by `e`.
    logLikelihood += np.dot(gt2 + z + M0, e)
    # NOTE(review): a further term was disabled in the original and is kept
    # disabled here (it needs `gt3 = g * t3` and `psi`, which are not inputs):
    #     logLikelihood -= np.dot(np.log(1.0 - np.exp(-np.exp(M0) * np.exp(z) * (np.exp(gt3) - 1) / g)), psi)

    return -logLikelihood

In [None]:
def pval(t_stat):
    """Return the p-value of ``t_stat`` from a chi-square test with 1 degree of freedom.

    The statistic is squared and passed to the chi-square survival function.
    """
    chi_sq = t_stat**2
    return stats.chi2.sf(chi_sq, 1)

In [None]:
def parallel_p(dataslice):
    """Apply ``pval`` to ``dataslice`` via its ``apply`` method and return the result."""
    p_values = dataslice.apply(pval)
    return p_values

In [None]:
from scipy.optimize import minimize

def optimize_cox_gomperz(df_main):
    """Fit the ``LH_both`` likelihood to ``df_main`` with L-BFGS-B.

    Parameters
    ----------
    df_main : pandas.DataFrame
        Must contain the columns ``t1``, ``t2`` and ``event``; every other
        column is used as a covariate.

    Returns
    -------
    pandas.Series
        One fitted coefficient per covariate column (indexed by column name),
        plus ``gamma`` (second optimised parameter) and ``M0`` (``exp`` of the
        first optimised parameter).
    """
    # Select covariates by name and reuse the same labels for the result, so
    # coefficients stay correctly labelled whatever the column order is.
    covariates = df_main.drop(['t1', 't2', 'event'], axis=1)
    x = covariates.values
    u = (df_main['t2'] - df_main['t1']).values
    args = (x, u, df_main['t1'].values, df_main['event'].values)
    # Starting point: [log-baseline, rate] followed by zero coefficients.
    X0 = [np.log(1e-5), 0.085] + [0]*x.shape[1]

    res = minimize(LH_both, X0, args=args, method='L-BFGS-B')
    betas = pd.Series(res.x[2:], index=covariates.columns)
    betas['gamma'] = res.x[1]
    betas['M0'] = np.exp(res.x[0])
    return betas

In [None]:
def cox_gm_gwas(df_slice_input, betas, rawdata):
    """Compute per-column effect estimates and p-values for ``rawdata``.

    Parameters
    ----------
    df_slice_input : pandas.DataFrame
        Needs the columns ``event``, ``t1``, ``t2`` and every covariate named
        in ``betas``. It is copied, not modified.
    betas : pandas.Series
        Fitted parameters containing ``M0`` and ``gamma`` plus one coefficient
        per covariate (the shape returned by ``optimize_cox_gomperz``).
    rawdata : pandas.DataFrame
        Variables to test, one per column; only rows whose index also occurs
        in ``df_slice_input`` are used.
        NOTE(review): presumably SNP genotypes/dosages -- confirm.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``rawdata.columns`` with columns ``beta``, ``sigma``,
        ``Nd`` (sum of ``event`` over the rows used) and ``p``.
    """
    M0, gamma = betas[['M0', 'gamma']].values
    # Multiply only by the covariate coefficients; aligning against the full
    # Series would add all-NaN `M0`/`gamma` columns before the row sum.
    covariate_betas = betas.drop(['M0', 'gamma'])

    df_slice = df_slice_input.copy()
    df_slice['betax'] = (
        df_slice.loc[:, covariate_betas.index].mul(covariate_betas, axis=1).sum(1)
    )
    df_slice['z'] = np.exp(df_slice['betax'])
    df_slice['gt1'] = np.exp(df_slice['t1']*gamma)
    df_slice['gt2'] = np.exp(df_slice['t2']*gamma)

    # Restrict to samples present in both inputs.
    xs = df_slice.index.intersection(rawdata.index)
    delta, z, gt1, gt2 = df_slice.loc[xs, ['event', 'z', 'gt1', 'gt2']].values.T

    g = z*(gt2-gt1)/gamma
    Nd = np.sum(delta)
    phi = M0*g - delta
    # NOTE(review): Ls/Los/Lss/Loo look like first and second derivatives of
    # the log-likelihood w.r.t. the tested effect ("s") and M0 ("o") -- confirm.
    Loo = Nd/M0**2

    snpdata = rawdata.loc[xs, :].values.T  # one row per tested variable

    Ls = np.sum(snpdata*phi, axis=1)
    snpdatatheta = snpdata*g
    Los = np.sum(snpdatatheta, axis=1)
    Lss = M0*np.sum(snpdata*snpdatatheta, axis=1)

    commonpart = 1./(Lss-Los**2/Loo)
    beta = -Ls*commonpart
    sigma = commonpart**0.5
    res = pd.DataFrame(np.stack([beta, sigma], axis=-1),
                       columns=['beta', 'sigma'], index=rawdata.columns)
    res['Nd'] = Nd
    # `pval` works element-wise on arrays, so one vectorised call replaces a
    # row-by-row `apply`; the float cast guards against object-dtype columns.
    res['p'] = pval((res['beta']/res['sigma']).astype(float))
    return res

In [None]:
def replication_pipeline(df_main_, cojodata, clean_index):
    """Standardise covariates, fit the model and run the per-variable scan.

    Parameters
    ----------
    df_main_ : pandas.DataFrame
        Contains ``t1``, ``t2``, ``event`` and the covariate columns.
        It is not modified.
    cojodata : pandas.DataFrame
        Passed to ``cox_gm_gwas`` as the data to test.
    clean_index : index labels
        Rows of ``df_main_`` to keep (used with ``.loc``).

    Returns
    -------
    tuple
        ``(gwas_replication, betas_replication)``: the ``cox_gm_gwas`` result
        and the ``optimize_cox_gomperz`` parameters.
    """
    df_main = df_main_.loc[clean_index, :].dropna()

    # Pick covariates by name, matching `optimize_cox_gomperz`, rather than
    # assuming `t1`/`t2`/`event` are the last three columns.
    covariate_cols = df_main.columns.drop(['t1', 't2', 'event'])
    centered = df_main[covariate_cols] - df_main[covariate_cols].mean(0)
    # Replace whole columns so integer covariates can become float.
    df_main[covariate_cols] = centered / centered.std(0)

    # Constant covariates become all-NaN after dividing by a zero std; drop them.
    df_main = df_main.dropna(how='all', axis=1)
    betas_replication = optimize_cox_gomperz(df_main)
    gwas_replication = cox_gm_gwas(df_main, betas_replication, cojodata)
    return gwas_replication, betas_replication

In [None]:
# Disease definitions keyed by short name. Each entry lists the integer codes
# under 'sr' and the string codes under 'icd'.
# NOTE(review): 'sr' presumably holds self-reported illness codes and 'icd'
# ICD-10 categories -- confirm against the data dictionary.
descodes = {
    'MI': dict(
        sr=[1075],
        icd=['I21', 'I22', 'I23', 'I24', 'I25'],
    ),
    'stroke': dict(
        sr=[1081, 1086, 1491, 1583],
        icd=['I60', 'I61', 'I62', 'I63', 'I64'],
    ),
    'CHF': dict(
        sr=[1076],
        icd=['I50'],
    ),
    'dementia': dict(
        sr=[1263, 1258, 1259, 1260, 1261, 1262],
        icd=['F00', 'F01', 'F02', 'F03', 'F04', 'F05'],
    ),
    # Disabled: 'hypertension': sr=[1065,1072,1073], icd=['I10','I11','I12','I13','I14','I15']
    'COPD': dict(
        sr=[1112],
        icd=['J44'],
    ),
    # Disabled: 'renal failure': sr=[1192,1193,1194], icd=['N17','N18','N19']
    'diabetes': dict(
        sr=[1220, 1221, 1222, 1223, 1521],
        icd=['E10', 'E11', 'E12', 'E13', 'E14'],
    ),
}