In [32]:
%load_ext autoreload
%autoreload 2
import matplotlib.pyplot as plt
import sys
sys.path.append('../')
sys.path.append('/usr/users/fsimone/tejaas')
from iotools import readgtf
import numpy as np
import os
import re
import collections
import time 
import gzip 

# Field names, in positional order, of one parsed trans-eQTL record.
SNPRES_FIELDS = [
    'rsid',
    'chrom',
    'pos',
    'logp',
    'dbSNP',
    'maf',
]


class SNPRes(collections.namedtuple('_SNPRes', SNPRES_FIELDS)):
    """Immutable record for a single SNP result.

    Fields follow SNPRES_FIELDS and can be accessed either by name
    (``snp.chrom``) or by position (``snp[1]``).
    """

    # Empty __slots__ prevents a per-instance __dict__ from being created.
    __slots__ = ()

# Lookup table: value of column 2 -> value of column 6 of the GTEx v8 lookup
# file (whitespace-separated, first line is a header). Keys that are missing
# resolve to False because of the defaultdict factory.
dbsnp_dict = collections.defaultdict(lambda: False)
with gzip.open("/cbscratch/franco/datasets/gtex_v8/genotypes/GTEx_Analysis_2017-06-05_v8_WholeGenomeSeq_838Indiv_Analysis_Freeze.lookup_table.txt.gz") as instream:
    next(instream)  # skip the header line
    # The stream is opened in binary mode, so every line is decoded first.
    # No filter on the length of columns 3/4 is applied: every row is kept.
    rows = (raw.decode().rstrip().split() for raw in instream)
    dbsnp_dict.update((row[2], row[6]) for row in rows)
        
def tejaas(filepath, dbsnp_dict):
    """Parse a tab-separated trans-eQTL result file into SNPRes records.

    The first line is treated as a header and skipped. Each remaining
    non-blank line is split on tabs and read as: column 0 = variant id,
    column 1 = chromosome (integer), column 2 = position, column 3 = MAF,
    column 7 = p-value.

    Parameters
    ----------
    filepath : str
        Path of the result file to read.
    dbsnp_dict : mapping
        Maps a variant id to its dbSNP identifier. It is only read, never
        modified; ids that are absent are recorded as ``False``.

    Returns
    -------
    list of SNPRes
        One record per data line, in file order. ``logp`` holds -log10(p);
        a p-value of exactly 0 is replaced by 10e-30 before taking the log.
        An empty file (no header at all) yields an empty list.

    Raises
    ------
    IndexError
        If a data line has fewer than 8 tab-separated columns.
    ValueError
        If a numeric column cannot be converted.
    """
    res = list()
    with open(filepath, 'r') as mfile:
        # Default argument: a zero-byte file returns [] instead of raising
        # StopIteration.
        next(mfile, None)
        for line in mfile:
            stripped = line.strip()
            if not stripped:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            arr   = stripped.split("\t")
            rsid  = arr[0]
            chrom = int(arr[1])
            pos   = int(arr[2])
            p     = float(arr[7])
            # log10(0) is -inf, so zero p-values are floored at 10e-30.
            logp  = np.log10(p) if p != 0 else np.log10(10e-30)
            maf   = float(arr[3])
            # .get() rather than indexing: when dbsnp_dict is a defaultdict,
            # indexing would insert every unknown id into the caller's dict.
            dbsnp = dbsnp_dict.get(rsid, False)
            res.append(SNPRes(rsid=rsid, chrom=chrom, pos=pos, logp=-logp, dbSNP=dbsnp, maf=maf))
    return res

def tejaas_old(filepath, dbsnp_dict):
    """Parse a trans-eQTL result file in the older column layout.

    The first line is treated as a header and skipped. Each remaining
    non-blank line is split on tabs and read as: column 0 = variant id,
    column 1 = position, column 5 = p-value, column 7 = MAF. The chromosome
    is not a column of its own: it is taken from the variant id, as the
    integer that follows the first three characters of the text before the
    first underscore.

    Parameters
    ----------
    filepath : str
        Path of the result file to read.
    dbsnp_dict : mapping
        Maps a variant id to its dbSNP identifier. It is only read, never
        modified; ids that are absent are recorded as ``False``.

    Returns
    -------
    list of SNPRes
        One record per data line, in file order. ``logp`` holds -log10(p);
        a p-value of exactly 0 is replaced by 10e-30 before taking the log.
        An empty file (no header at all) yields an empty list.

    Raises
    ------
    IndexError
        If a data line has fewer than 8 tab-separated columns.
    ValueError
        If a numeric column or the chromosome part of the id cannot be
        converted.
    """
    res = list()
    with open(filepath, 'r') as mfile:
        # Default argument: a zero-byte file returns [] instead of raising
        # StopIteration.
        next(mfile, None)
        for line in mfile:
            stripped = line.strip()
            if not stripped:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            arr   = stripped.split("\t")
            rsid  = arr[0]
            # Drop the 3-character prefix of the first "_"-separated token.
            chrom = int(rsid.split("_")[0][3:])
            pos   = int(arr[1])
            p     = float(arr[5])
            # log10(0) is -inf, so zero p-values are floored at 10e-30.
            logp  = np.log10(p) if p != 0 else np.log10(10e-30)
            maf   = float(arr[7])
            # .get() rather than indexing: when dbsnp_dict is a defaultdict,
            # indexing would insert every unknown id into the caller's dict.
            dbsnp = dbsnp_dict.get(rsid, False)
            res.append(SNPRes(rsid=rsid, chrom=chrom, pos=pos, logp=-logp, dbSNP=dbsnp, maf=maf))
    return res

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [43]:
from utils import utils

# Tissue identifiers (and their descriptions) come from the tissue table.
tissue_file = "/usr/users/fsimone/trans-eqtl-pipeline/analysis/plots/tissues.txt"
tissuenames, descriptions = utils.read_tissues(tissue_file)

# trans_dict: tissue -> list of SNPRes; only tissues with at least one
# trans-eQTL get an entry.
basepath = "/cbscratch/franco/trans-eqtl/dev-pipeline/gtex_v8_dysb/"
trans_dict = {}
for tissue in tissuenames:
    tejaas_file = os.path.join(basepath, "summary_5e-08_k0.6", tissue, "tejaas", "trans_eqtls.txt")
    if os.path.exists(tejaas_file):
        print("Loading ", tissue, end="")
        transeqtls = tejaas_old(tejaas_file, dbsnp_dict)
        if transeqtls:
            trans_dict[tissue] = transeqtls
            print(" has {:d} trans-eqtls".format(len(transeqtls)))
        else:
            print(" has 0 trans-eqtls")
    else:
        # No result file for this tissue: report it and move on.
        print("{:s} has no trans-eqtl results".format(tissue))

Loading  aa has 11254 trans-eqtls
Loading  ms has 29079 trans-eqtls
Loading  wb has 1529 trans-eqtls


In [22]:
import gzip

eqtlgenfile = "/cbscratch/franco/datasets/EQTLgen/trans-eQTL_significant_20181017.txt.gz"

# eqtlgen_dict: value of column 1 -> True (missing keys resolve to False).
eqtlgen_dict = collections.defaultdict(lambda: False)
# eqtlgen_coord_dict: chromosome 1..22 -> {position: True}, again with False
# as the default for positions that were never stored.
eqtlgen_coord_dict = {
    chrm: collections.defaultdict(lambda: False) for chrm in np.arange(1, 23)
}

with gzip.open(eqtlgenfile) as instream:
    next(instream)  # skip the header line
    for raw in instream:
        # Binary stream: decode, then split the tab-separated columns.
        fields = raw.decode().strip().split("\t")
        eqtlgen_dict[fields[1]] = True
        # Columns 2 and 3 are used as integer chromosome and position.
        chrom_hits = eqtlgen_coord_dict[int(fields[2])]
        chrom_hits[int(fields[3])] = True
    

In [44]:
# For every tissue, print the dbSNP ids of its trans-eQTLs that are also keys
# of eqtlgen_dict, followed by the number of such hits.
for tissue in tissuenames:
    counter = 0
    # The loading cell only stores tissues that have a result file with at
    # least one trans-eQTL, so a tissue may be absent from trans_dict. Treat
    # that as an empty result set instead of failing with KeyError.
    for snp in trans_dict.get(tissue, ()):
        # .get() rather than indexing: eqtlgen_dict is a defaultdict, and
        # indexing it would insert every non-replicated id as a new key.
        if eqtlgen_dict.get(snp.dbSNP, False):
            print(snp.dbSNP)
            counter += 1
    print("{:s}: Replicated {:d} trans-eQTLs".format(tissue, counter))

rs10934853
rs4705952
rs4646450
rs10744777
rs1129038
aa: Replicated 5 trans-eQTLs
rs670523
rs2340727
rs6430538
rs1990760
rs13098914
rs4833095
rs17616434
rs10214237
rs4705952
rs4388254
rs917116
rs4646450
rs10885122
rs1000778
rs10744777
rs1129038
rs12593813
rs3810291
ms: Replicated 18 trans-eQTLs
rs4646450
wb: Replicated 1 trans-eQTLs


In [26]:
# fhs_trans_dict: keff (as string) -> list of SNPRes parsed from the FHS run
# with that keff; only runs with at least one trans-eQTL get an entry.
basepath = "/cbscratch/franco/trans-eqtl/dev-pipeline/gtex_v8_dysb/raw_oneref/fhs/tejaas/"
fhs_trans_dict = {}
keffs = ["0.4", "0.5", "0.6", "0.7", "0.8"]
for keff in keffs:
    tejaas_file = os.path.join(basepath, "permnull_sbDynamic{:s}_knn30".format(keff), "trans_eqtls_5e-08.txt")
    if os.path.exists(tejaas_file):
        print("Loading FHS Keff {:s}".format(keff), end="")
        transeqtls = tejaas(tejaas_file, dbsnp_dict)
        if transeqtls:
            fhs_trans_dict[keff] = transeqtls
            print(" has {:d} trans-eqtls".format(len(transeqtls)))
        else:
            print(" has 0 trans-eqtls")
    else:
        # Nothing to load for this keff.
        print("File does not exist")

Loading FHS Keff 0.4 has 30 trans-eqtls
Loading FHS Keff 0.5 has 169 trans-eqtls
Loading FHS Keff 0.6 has 248 trans-eqtls
Loading FHS Keff 0.7 has 401 trans-eqtls
Loading FHS Keff 0.8 has 407 trans-eqtls


In [29]:
# For every keff, print the ids of the FHS trans-eQTLs whose (chrom, pos) is
# flagged in eqtlgen_coord_dict, followed by the number of such hits.
for keff in keffs:
    counter = 0
    # The loading cell skips keff values with a missing file or zero hits, so
    # a keff may be absent from fhs_trans_dict. Treat that as an empty result
    # set instead of failing with KeyError.
    for snp in fhs_trans_dict.get(keff, ()):
        # .get() on both levels: an unknown chromosome no longer raises
        # KeyError, and unmatched positions are not inserted into the inner
        # defaultdicts as a side effect of the lookup.
        if eqtlgen_coord_dict.get(snp.chrom, {}).get(snp.pos, False):
            print(snp.rsid)
            counter += 1
    print("{:s}: Replicated {:d} trans-eQTLs".format(keff, counter))

3:56849749_T_C
6:31252396_C_A
6:31265490_C_T
6:31272261_T_C
7:50427982_G_A
0.4: Replicated 5 trans-eQTLs
3:56849749_T_C
6:31252396_C_A
6:31265490_C_T
6:31272261_T_C
7:50427982_G_A
7:50428445_C_T
0.5: Replicated 6 trans-eQTLs
3:56849749_T_C
6:31238318_A_G
6:31252396_C_A
6:31265490_C_T
6:31272261_T_C
7:50427982_G_A
7:50428445_C_T
0.6: Replicated 7 trans-eQTLs
3:56849749_T_C
6:31238318_A_G
6:31252396_C_A
6:31265490_C_T
6:31272261_T_C
6:32664458_C_T
7:50427982_G_A
7:50428445_C_T
0.7: Replicated 8 trans-eQTLs
3:56849749_T_C
6:31238318_A_G
6:31252396_C_A
6:31265490_C_T
6:31272261_T_C
6:32401079_G_A
6:32664458_C_T
7:50427982_G_A
7:50428445_C_T
0.8: Replicated 9 trans-eQTLs


In [36]:
# fhs_hg38_coord_dict: chromosome 1..22 -> {position: True}; positions that
# were never stored resolve to False through the defaultdict factory.
fhs_hg38_coord_dict = {
    chrom: collections.defaultdict(lambda: False) for chrom in np.arange(1, 23)
}

liftfile="/cbscratch/franco/trans-eqtl/dev-pipeline/gtex_v8_dysb/raw_oneref/fhs/tejaas/permnull_sbDynamic0.8_knn30/hglft_genome_2cb9c_a305b0.bed"
with open(liftfile) as instream:
    for line in instream:
        # Each line is parsed as "<3-char prefix><chrom>:<start>-<rest>".
        region = line.strip().split(":")
        chrom = int(region[0][3:])
        pos   = int(region[1].split("-")[0])
        fhs_hg38_coord_dict[chrom][pos] = True

In [45]:
# Print every record of trans_dict['wb'] whose (chrom, pos) is flagged in
# fhs_hg38_coord_dict, counting the matches in `counter`.
# NOTE(review): within the visible lines `counter` is tallied but never
# displayed -- confirm whether it is reported further down.
counter = 0
for t in trans_dict['wb']:
    # The inner dicts are defaultdicts (factory returns False), so this lookup
    # also inserts t.pos as a new key whenever the position is not present.
    if fhs_hg38_coord_dict[t.chrom][t.pos]:
        counter += 1
        print(t)