In [1]:
import numpy as np
import collections
import os
import time
import matplotlib.pyplot as plt
#plt.switch_backend('agg')
import matplotlib
import sys
sys.path.append('../')
import pandas as pd
# from utils import mpl_stylesheet
# mpl_stylesheet.banskt_presentation(fontfamily = 'latex-clearsans', fontsize = 24, colors = 'banskt', dpi = 72)

In [2]:
def read_tejaas(filename):
    """Read one TEJAAS result file into parallel per-SNP lists.

    The first line is treated as a header and skipped. Every following
    non-blank line is split on whitespace; field 0 is stored as the SNP ID,
    field 2 (as int) as the base-pair position, field 7 (as float) as the
    p-value and field 3 (as float) as the MAF.

    Parameters
    ----------
    filename : str
        Path of the result file to read.

    Returns
    -------
    tuple of lists
        ``(rsidlist, pvallist, bplist, qlist, maflist)``. ``qlist`` is always
        empty because q-values are not parsed; it is returned only to keep
        the five-element return shape.

    Raises
    ------
    IndexError
        If a non-blank data line has fewer than 8 whitespace-separated fields.
    ValueError
        If a position, p-value or MAF field cannot be converted to a number.
    """
    rsidlist = list()
    bplist   = list()
    pvallist = list()
    qlist    = list()   # never filled, see docstring
    maflist  = list()
    with open(filename, 'r') as instream:
        # Skip the header; the default keeps an empty file from raising StopIteration.
        next(instream, None)
        for line in instream:
            linesplit = line.split()
            if not linesplit:
                # Blank line (e.g. trailing newline at end of file): nothing to parse.
                continue
            rsid = linesplit[0]
            bppos = int(linesplit[2])
            pval = float(linesplit[7])
            maf  = float(linesplit[3])
            rsidlist.append(rsid)
            bplist.append(bppos)
            pvallist.append(pval)
            maflist.append(maf)
    return rsidlist, pvallist, bplist, qlist, maflist

def read_tejaas_old(filename):
    """Read a tab-separated result file in the older column layout.

    The first line is skipped as a header. Each remaining line is stripped and
    split on tabs; field 0 is kept as the SNP ID, field 1 (int) as the
    base-pair position, field 5 (float) as the p-value and field 7 (float) as
    the MAF.

    Returns
    -------
    tuple of lists
        ``(rsidlist, pvallist, bplist, qlist, maflist)``; ``qlist`` is always
        empty because q-values are not parsed.
    """
    def parse(record):
        # Conversion order (position, p-value, MAF) mirrors the field access order.
        fields = record.strip().split("\t")
        return fields[0], int(fields[1]), float(fields[5]), float(fields[7])

    with open(filename, 'r') as instream:
        next(instream)  # header line
        rows = [parse(record) for record in instream]

    rsidlist = [row[0] for row in rows]
    bplist   = [row[1] for row in rows]
    pvallist = [row[2] for row in rows]
    maflist  = [row[3] for row in rows]
    qlist    = []
    return rsidlist, pvallist, bplist, qlist, maflist

def read_tejaas_all_chrom(filefmt, chrmlist, reader=None):
    """Read the per-chromosome result files and concatenate them.

    Parameters
    ----------
    filefmt : str
        Path template; ``filefmt.format(chrm)`` must give the file for one
        chromosome.
    chrmlist : iterable
        Chromosome identifiers, processed in the given order.
    reader : callable, optional
        Function taking a file path and returning
        ``(rsids, pvals, bplist, qlist, maflist)``. Defaults to
        ``read_tejaas``; pass ``read_tejaas_old`` for the older file layout.

    Returns
    -------
    gwrsids : list
        SNP IDs of all chromosomes, concatenated.
    gwnsnps : dict
        Chromosome -> number of SNPs read for it.
    gwbppos : dict
        Chromosome -> numpy array of base-pair positions.
    nonzero_pvals : numpy.ndarray
        All p-values, with exact zeros replaced by the smallest strictly
        positive p-value so that ``-log10`` stays finite. If no p-value is
        positive (or nothing was read) the array is returned unchanged.
    """
    if reader is None:
        reader = read_tejaas

    gwrsids = list()
    gwpvals = list()
    gwnsnps = dict()
    gwbppos = dict()

    for chrm in chrmlist:
        filepath = filefmt.format(chrm)
        # q-values and MAFs are returned by the reader but not used here.
        rsids, pvals, bplist, _qlist, _maflist = reader(filepath)
        gwrsids += rsids
        gwpvals += pvals
        gwnsnps[chrm] = len(rsids)
        gwbppos[chrm] = np.array(bplist)

    gwpvals = np.array(gwpvals, dtype=float)
    nonzero_pvals = gwpvals.copy()
    positive = gwpvals[gwpvals > 0]
    if positive.size > 0:
        # A p-value of exactly zero is below the smallest observed one, so it
        # is replaced by the minimum positive value (not the maximum, which
        # would rank it as the least significant SNP after -log10).
        nonzero_pvals[gwpvals == 0] = positive.min()

    return gwrsids, gwnsnps, gwbppos, nonzero_pvals


In [3]:
from utils import utils
tissue_file = "/usr/users/fsimone/trans-eqtl-pipeline/analysis/plots/tissue_table.txt"
tshorts, tdescs, tstrings = utils.read_tissues_str(tissue_file)
# Tissues for which the compile loops pick the second entry of
# tejaas_method_variant instead of the first.
# An earlier 10-tissue selection (ag, haa, liv, msg, pan, pit, si, spl, va, wb)
# used to be assigned here and was immediately overwritten, so only the
# 4-tissue list below ever took effect; the dead assignment was removed.
optim_tissues = ['haa', 'pan', 'spl', 'wb']

# Compile new SHAPEIT2 results

In [4]:
# Location of the per-tissue TEJAAS output and the run settings to compile.
resdir  = "/cbscratch/franco/from_saikat/gtex_v8_202003/"
tissuelist = tshorts

tejaas_preproc = ["raw_std"]
tejaas_method_variant  = ["permnull_sb0.1_knn30", "permnull_sb0.006_knn30"]

rrfile = "rr.txt"
chrmlist = np.arange(1,23)

# Build one table of -log10(p): rows indexed by variant ID, one column per tissue.
compilation = pd.DataFrame([])
for tissue in tissuelist:
    if not os.path.exists(f'{resdir}/{tissue}'):
        print(f"{tissue} folder does not exist")
        continue
    print(f"{tissue} .. ", end="")
    st = time.time()
    # Tissues listed in optim_tissues use the second method variant, all others the first.
    method_variant = tejaas_method_variant[1] if tissue in optim_tissues else tejaas_method_variant[0]
    # "{:d}" stays a literal placeholder; read_tejaas_all_chrom fills in the chromosome.
    filefmt = f'{resdir}/{tissue}/tejaas/{tejaas_preproc[0]}/{method_variant}/chr' + "{:d}" + f'/{rrfile}'
    gwrsids, gwnsnps, gwbppos, gwpvals = read_tejaas_all_chrom(filefmt, chrmlist)
    gwlog10pvals = -np.log10(gwpvals)
    df = pd.DataFrame(gwlog10pvals, index=gwrsids, columns=[tissue])
    # While the table is still empty the tissue's frame seeds it; afterwards
    # each tissue is outer-joined on the variant index.
    compilation = df if compilation.shape[0] == 0 else pd.merge(
        compilation, df, how='outer', left_index=True, right_index=True
    )
    endt = time.time()
    print(f"took {endt - st}")
compilation.index.name = "tissues"

as .. took 24.036720752716064
av .. took 46.764017820358276
ag .. took 44.11942458152771
aa .. took 45.384321212768555
ac .. took 45.558910608291626
at .. took 46.34559202194214
bam .. took 42.92615270614624
ban .. took 46.3984055519104
bca .. took 47.25814771652222
bceh .. took 45.4301917552948
bce .. took 45.327085971832275
bco .. took 45.89703583717346
bfr .. took 47.95907139778137
bhi .. took 51.444358348846436
bhy .. took 52.84697079658508
bnu .. took 62.20241165161133
bpu .. took 46.89466643333435
bsp .. took 46.27078032493591
bsu .. took 46.18451738357544
br .. took 48.01198697090149
ebv .. took 47.42896318435669
fib .. took 47.902137994766235
cols .. took 47.43076252937317
colt .. took 47.4527907371521
esog .. took 48.549033880233765
esom .. took 50.53219771385193
esomu .. took 50.76396322250366
haa .. took 55.72065615653992
hlv .. took 53.64565372467041
kc .. took 68.6923463344574
liv .. took 47.14991211891174
lu .. took 46.322105884552
msg .. took 50.33691382408142
ms .. took

In [5]:
# Save the compiled table as a tab-separated file inside resdir.
# NOTE(review): `compilation` and `resdir` are reassigned by later cells, so
# this assumes they still hold the values from the preceding compile cell.
compilation.to_csv(
    f"{resdir}/all_variants_pvalues_tejaas_4optims.txt",
    sep="\t",
    header=True,
    index=True,
)

# Compile crossmap results

In [8]:
# Location of the crossmap TEJAAS output and the run settings to compile.
resdir  = "/cbscratch/franco/trans-eqtl/dev-pipeline/gtex_v8_SHAPEIT2/"
tissuelist = tshorts

tejaas_preproc = ["raw"]
tejaas_method_variant  = ["permnull_sb0.1_knn30_crossmap", "permnull_sb0.006_knn30_crossmap"]

rrfile = "rr.txt"
chrmlist = np.arange(1,23)

# Build one table of -log10(p): rows indexed by variant ID, one column per tissue.
compilation_cm = pd.DataFrame([])
for tissue in tissuelist:
    if not os.path.exists(f'{resdir}/{tejaas_preproc[0]}/gtex_v8-{tissue}'):
        print(f"{tissue} folder does not exist")
        continue
    print(f"{tissue} .. ", end="")
    st = time.time()
    # Tissues listed in optim_tissues use the second method variant, all others the first.
    method_variant = tejaas_method_variant[1] if tissue in optim_tissues else tejaas_method_variant[0]
    # "{:d}" stays a literal placeholder; read_tejaas_all_chrom fills in the chromosome.
    filefmt = f'{resdir}/{tejaas_preproc[0]}/gtex_v8-{tissue}/tejaas/{method_variant}/chr' + "{:d}" + f'/{rrfile}'
    gwrsids, gwnsnps, gwbppos, gwpvals = read_tejaas_all_chrom(filefmt, chrmlist)
    gwlog10pvals = -np.log10(gwpvals)
    df = pd.DataFrame(gwlog10pvals, index=gwrsids, columns=[tissue])
    # While the table is still empty the tissue's frame seeds it; afterwards
    # each tissue is outer-joined on the variant index.
    compilation_cm = df if compilation_cm.shape[0] == 0 else pd.merge(
        compilation_cm, df, how='outer', left_index=True, right_index=True
    )
    endt = time.time()
    print(f"took {endt - st}")
compilation_cm.index.name = "tissues"

as .. took 65.84159469604492
av .. took 111.05250263214111
ag .. took 110.74026155471802
aa .. took 109.77238488197327
ac .. took 110.77158117294312
at .. took 111.30867528915405
bam folder does not exist
ban folder does not exist
bca folder does not exist
bceh folder does not exist
bce .. took 69.44670343399048
bco .. took 69.87239241600037
bfr folder does not exist
bhi folder does not exist
bhy folder does not exist
bnu folder does not exist
bpu folder does not exist
bsp folder does not exist
bsu folder does not exist
br .. took 75.83008527755737
ebv .. took 74.92946124076843
fib .. took 77.00100612640381
cols .. took 72.19249963760376
colt .. took 76.4660165309906
esog .. took 74.64367341995239
esom .. took 75.68238759040833
esomu .. took 72.38919067382812
haa .. took 73.70332646369934
hlv .. took 84.23066520690918
kc .. took 71.499685049057
liv .. took 71.0481595993042
lu .. took 71.47844815254211
msg .. took 72.70255994796753
ms .. took 73.38420486450195
nt .. took 77.557099342346

In [9]:
# Save the compiled crossmap table as a tab-separated file inside resdir.
# NOTE(review): `resdir` is reassigned by other cells, so this assumes it
# still holds the value set in the crossmap compile cell.
compilation_cm.to_csv(
    f"{resdir}/all_variants_pvalues_tejaas_4optims_crossmap.txt",
    sep="\t",
    header=True,
    index=True,
)

# Old results compilation

In [40]:
# Location of the older TEJAAS output and the run settings to compile.
resdir  = "/cbscratch/franco/trans-eqtl/dev-pipeline/gtex_v8_lncRNA_freeze/"
tissuelist = ["gtex_v8-"+t for t in tshorts]

tejaas_preproc = ["raw"]
tejaas_method_variant  = ["permnull_sb0.1_knn30"]

rrfile = "rr.txt"
chrmlist = np.arange(1,23)

# NOTE(review): this reuses the name `compilation` from the first compile cell
# and overwrites that table; re-running that cell's to_csv afterwards would
# write these results instead.
compilation = pd.DataFrame([])
for tissue in tissuelist:
    # Skip tissues without a result folder, as the other two compile cells do,
    # instead of aborting the whole run with FileNotFoundError.
    if not os.path.exists(f'{resdir}/{tejaas_preproc[0]}/{tissue}'):
        print(f"{tissue} folder does not exist")
        continue
    print(f"{tissue} .. ", end="")
    st = time.time()
    # "{:d}" stays a literal placeholder; read_tejaas_all_chrom fills in the chromosome.
    filefmt = f'{resdir}/{tejaas_preproc[0]}/{tissue}/tejaas/{tejaas_method_variant[0]}/chr' + "{:d}" + f'/{rrfile}'
    # NOTE(review): read_tejaas_all_chrom parses with read_tejaas; a separate
    # read_tejaas_old with a different column layout exists - confirm which
    # layout these older files actually have.
    gwrsids, gwnsnps, gwbppos, gwpvals = read_tejaas_all_chrom(filefmt, chrmlist)
    gwlog10pvals = -np.log10(gwpvals)
    df = pd.DataFrame(gwlog10pvals, index=gwrsids, columns=[tissue])
    if compilation.shape[0] == 0:
        # Table still empty: this tissue's frame seeds it.
        compilation = df
    else:
        # Outer join on the variant index keeps variants missing in some tissues.
        compilation = pd.merge(compilation, df, how='outer', left_index=True, right_index=True)
    endt = time.time()
    print(f"took {endt - st}")
compilation.index.name = "tissues"


gtex_v8-as .. took 8.610471487045288
gtex_v8-av .. took 19.227826356887817
gtex_v8-ag .. took 32.07252049446106
gtex_v8-aa .. took 32.85236406326294
gtex_v8-ac .. took 29.209378242492676
gtex_v8-at .. took 28.964735746383667
gtex_v8-bl .. took 22.996384859085083
gtex_v8-bam .. took 29.1875159740448
gtex_v8-ban .. took 26.892094612121582
gtex_v8-bca .. took 26.00601887702942
gtex_v8-bceh .. took 26.045687913894653
gtex_v8-bce .. took 25.931861877441406
gtex_v8-bco .. took 25.806033611297607
gtex_v8-bfr .. took 26.66654658317566
gtex_v8-bhi .. took 24.95831823348999
gtex_v8-bhy .. took 26.02777123451233
gtex_v8-bnu .. took 27.509639024734497
gtex_v8-bpu .. took 26.556803226470947
gtex_v8-bsp .. took 28.262579441070557
gtex_v8-bsu .. took 28.234033823013306
gtex_v8-br .. took 28.405027866363525
gtex_v8-ebv .. took 28.48702383041382
gtex_v8-fib .. took 27.016149520874023
gtex_v8-cols .. took 31.52826952934265
gtex_v8-colt .. took 26.844249963760376
gtex_v8-esog .. took 26.15397000312805
gt

In [46]:
# Save the compiled table as a tab-separated file.
# NOTE(review): `compilation` is assigned by more than one cell, so this
# assumes it still holds the table built in the old-results compile cell.
compilation.to_csv(
    "/cbscratch/franco/trans-eqtl/dev-pipeline/gtex_v8_lncRNA_freeze/raw/all_variants_pvalues_tejaas.txt",
    sep="\t",
    header=True,
    index=True,
)