In [3]:
%load_ext autoreload
%autoreload 2
import matplotlib.pyplot as plt
import sys
sys.path.append('../../')
sys.path.append('/usr/users/fsimone/tejaas')
from iotools import readgtf
import numpy as np
import os
import re
import collections
import time 
import gzip 

SNPRES_FIELDS = ['rsid', 'chrom', 'pos', 'logp', 'dbSNP', 'maf']
class SNPRes(collections.namedtuple('_SNPRes', SNPRES_FIELDS)):
    __slots__ = ()

dbsnp_dict = collections.defaultdict(lambda: False)
with gzip.open("/cbscratch/franco/datasets/gtex_v8/genotypes/GTEx_Analysis_2017-06-05_v8_WholeGenomeSeq_838Indiv_Analysis_Freeze.lookup_table.txt.gz") as instream:
    next(instream)
    for line in instream:
        arr = line.decode().rstrip().split()
        #if len(arr[3]) == 1 and len(arr[4]) == 1:
        dbsnp_dict[arr[2]] = arr[6]
        
def tejaas(filepath, dbsnp_dict):
    res = list()
    with open(filepath, 'r') as mfile:
        next(mfile)
        for line in mfile:
            arr   = line.strip().split("\t")
            rsid  = arr[0]
            #chrom = rsid.split("_")[0][3:]
            chrom = int(arr[1])
            pos   = int(arr[2])
            p     = float(arr[7])
            logp  = np.log10(p) if p!=0 else np.log10(10e-30)
            maf   = float(arr[3])
            dbsnp = dbsnp_dict[rsid]
            res.append(SNPRes(rsid=rsid, chrom=chrom, pos=pos, logp=-logp, dbSNP=dbsnp, maf=maf))
    return res    

# special_tissues = ['ag', 'haa', 'liv', 'msg', 'pan', 'pit', 'si', 'spl', 'va', 'wb']
special_tissues = ['haa', 'pan', 'spl', 'wb']

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
from utils import utils
# tissue_file = "/usr/users/fsimone/trans-eqtl-pipeline/analysis/plots/tissues.txt"
tissue_file = "/usr/users/fsimone/trans-eqtl-pipeline/analysis/plots/tissue_table.txt"
tissuenames, descriptions = utils.read_tissues(tissue_file)
tissue_names = dict(zip(tissuenames, descriptions))

In [9]:
basepath = "/cbscratch/franco/trans-eqtl"
trans_dict = dict()
trans_dict0006 = dict()
for tissue in tissuenames:
    tejaas_file = os.path.join(basepath, "protein_coding_lncRNA_gamma01_knn30_cut5e-8", tissue, "trans_eqtls.txt")
    if not os.path.exists(tejaas_file):
        print("{:s} has no trans-eqtl results".format(tissue))
        continue
    print("Loading ", tissue, end="")
    transeqtls = tejaas(tejaas_file, dbsnp_dict)
    if len(transeqtls) > 0:
        trans_dict[tissue] = transeqtls
        print(" has {:d} trans-eqtls".format(len(transeqtls)))
    else:
        trans_dict[tissue] = []
        print(" has 0 trans-eqtls")
        
for tissue in special_tissues:
    tejaas_file = os.path.join(basepath, "protein_coding_lncRNA_gamma0006_knn30_cut5e-8", tissue, "trans_eqtls.txt")
    if not os.path.exists(tejaas_file):
        print("{:s} has no trans-eqtl results".format(tissue))
        continue
    print("Loading ", tissue, end="")
    transeqtls = tejaas(tejaas_file, dbsnp_dict)
    if len(transeqtls) > 0:
        trans_dict0006[tissue] = transeqtls
        print(" has {:d} trans-eqtls".format(len(transeqtls)))
    else:
        trans_dict0006[tissue] = []
        print(" has 0 trans-eqtls")

Loading  as has 2011 trans-eqtls
Loading  av has 1664 trans-eqtls
Loading  ag has 822 trans-eqtls
Loading  aa has 5760 trans-eqtls
Loading  ac has 7551 trans-eqtls
Loading  at has 1463 trans-eqtls
Loading  bam has 4780 trans-eqtls
Loading  ban has 3890 trans-eqtls
Loading  bca has 27 trans-eqtls
Loading  bceh has 270 trans-eqtls
Loading  bce has 96 trans-eqtls
Loading  bco has 113 trans-eqtls
Loading  bfr has 58 trans-eqtls
Loading  bhi has 170 trans-eqtls
Loading  bhy has 325 trans-eqtls
Loading  bnu has 1451 trans-eqtls
Loading  bpu has 14 trans-eqtls
Loading  bsp has 186 trans-eqtls
Loading  bsu has 202 trans-eqtls
Loading  br has 1764 trans-eqtls
Loading  ebv has 1241 trans-eqtls
Loading  fib has 767 trans-eqtls
Loading  cols has 320 trans-eqtls
Loading  colt has 5282 trans-eqtls
Loading  esog has 71 trans-eqtls
Loading  esom has 343 trans-eqtls
Loading  esomu has 68 trans-eqtls
Loading  haa has 0 trans-eqtls
Loading  hlv has 953 trans-eqtls
Loading  kc has 39 trans-eqtls
Loading  

In [10]:
import gzip
eqtlgenfile = "/cbscratch/franco/datasets/EQTLgen/trans-eQTL_significant_20181017.txt.gz"
eqtlgenfile = "/cbscratch/franco/datasets/EQTLgen/2018-09-04-trans-eQTLsFDR0.05-CohortInfoRemoved-BonferroniAdded.txt.gz"
eqtlgen_dict = collections.defaultdict(lambda: False)
eqtlgen_coord_dict = dict()
eqtlgen_list = list()
for chrm in np.arange(1,23):
    eqtlgen_coord_dict[chrm] = collections.defaultdict(lambda: False)
with gzip.open(eqtlgenfile) as instream:
    next(instream)
    for line in instream:
        arr = line.decode().strip().split("\t")
        eqtlgen_dict[arr[1]] = True   #SNP rsid
        eqtlgen_coord_dict[int(arr[2])][int(arr[3])] = True   # SNPchr and SNPpos
        eqtlgen_list.append(arr[1])
print(f"reported trans-eqtls: {len(eqtlgen_list)}")
print(f"unique trans-eqtls: {len(set(eqtlgen_list))}")
eqtlgen_list = list(set(eqtlgen_list))

reported trans-eqtls: 59786
unique trans-eqtls: 3853


In [16]:
snpdatafile = "/cbscratch/franco/datasets/gtex_v8/genotypes/vcfs_SHAPEIT2/0.01/gtex_v8_snpinfo_SHAPEIT2.txt"
variants_list = list()
rsids_list = list()
with open (snpdatafile) as infmt:
    for line in infmt:
        arr = line.strip().split()
        variants_list.append(arr[1])
        if arr[1] in dbsnp_dict:
            rsids_list.append(dbsnp_dict[arr[1]])
        else:
            print(f"fail {arr[1]}")

In [26]:
### Obtain random enrichment in EQTLgen

backgrounds = collections.defaultdict(dict)
all_transeqtls = list()
for tissue in tissuenames:
    if tissue in special_tissues:
        res_dict = trans_dict0006
    else:
        res_dict = trans_dict
    nteqtl = len(res_dict[tissue]) # nÂº of discoveries in that tissue, so sample accordingly
    nchoose = nteqtl
    ntimes  = 100
    counter_list = list()
    # sample randomly here <----
    for i in range(ntimes):
        counter = 0
        if i%10 == 0:
            print(f"{i} ", end="")
        chooseidx = np.sort(np.random.choice(len(rsids_list), nchoose, replace = False))
        for idx in chooseidx:
            if eqtlgen_dict[rsids_list[idx]]:
                counter += 1
        counter_list.append(counter)
    avg_hits = np.mean( np.array(counter_list) )
    bg_freq  = avg_hits / nchoose 
    backgrounds[tissue]["avg_hits"] = avg_hits
    backgrounds[tissue]["bg_freq"]  = bg_freq
    print(f"{tissue} | {avg_hits} | {bg_freq}")

0 10 20 30 40 50 60 70 80 90 as | 0.57 | 0.00028344107409249126
0 10 20 30 40 50 60 70 80 90 av | 0.69 | 0.0004146634615384615
0 10 20 30 40 50 60 70 80 90 ag | 0.36 | 0.00043795620437956203
0 10 20 30 40 50 60 70 80 90 aa | 1.94 | 0.0003368055555555555
0 10 20 30 40 50 60 70 80 90 ac | 2.78 | 0.00036816315719772214
0 10 20 30 40 50 60 70 80 90 at | 0.75 | 0.0005126452494873548
0 10 20 30 40 50 60 70 80 90 bam | 1.62 | 0.0003389121338912134
0 10 20 30 40 50 60 70 80 90 ban | 1.42 | 0.00036503856041131104
0 10 20 30 40 50 60 70 80 90 bca | 0.01 | 0.00037037037037037035
0 10 20 30 40 50 60 70 80 90 bceh | 0.11 | 0.00040740740740740744
0 10 20 30 40 50 60 70 80 90 bce | 0.03 | 0.0003125
0 10 20 30 40 50 60 70 80 90 bco | 0.07 | 0.0006194690265486726
0 10 20 30 40 50 60 70 80 90 bfr | 0.02 | 0.0003448275862068966
0 10 20 30 40 50 60 70 80 90 bhi | 0.04 | 0.00023529411764705883
0 10 20 30 40 50 60 70 80 90 bhy | 0.05 | 0.00015384615384615385
0 10 20 30 40 50 60 70 80 90 bnu | 0.56 | 0.00038



va | 0.0 | nan
0 10 20 30 40 50 60 70 80 90 wb | 11.72 | 0.0003477125734290631


In [27]:
with open("eqtlgen_backgrounds.txt", 'w') as outfmt:
    for tissue in tissuenames:
        outfmt.write(f"{tissue}\t{backgrounds[tissue]['avg_hits']}\t{backgrounds[tissue]['bg_freq']}\n")

In [41]:
import scipy.stats as ss

all_transeqtls = list()
for tissue in tissuenames:
    counter = 0
    if tissue in special_tissues:
        res_dict = trans_dict0006
    else:
        res_dict = trans_dict
    neqtls = len(res_dict[tissue])
    for snp in res_dict[tissue]:
        all_transeqtls.append(snp.dbSNP)
        if eqtlgen_dict[snp.dbSNP]:
            #print(snp.dbSNP)
            counter += 1
    # print(f"{tissue}: Replicated {counter} trans-eQTLs ({100 * counter/len(eqtlgen_list):g} %)")
    replication_frac = 100 * counter/len(eqtlgen_list)
    if backgrounds[tissue]['bg_freq'] > 0:
        enrich = (counter / neqtls ) / backgrounds[tissue]['bg_freq']
    else:
        enrich = 0
    pval = ss.binom_test(counter, neqtls, backgrounds[tissue]['bg_freq'], alternative='greater')
    if counter > 0:
        print(f"{tissue_names[tissue]}  &  {counter} ({replication_frac :.2f}\%) & {enrich :.2f} ($p={pval :g}$) \\\\")
    else:
        print(f"{tissue_names[tissue]}  &  {counter} ({replication_frac :.2f}\%) & - \\\\")
    
all_transeqtls = list(set(all_transeqtls))
counter = 0
for snp in all_transeqtls:
    if eqtlgen_dict[snp]:
        #print(snp.dbSNP)
        counter += 1
print(f"---> ALL: Replicated {counter} trans-eQTLs ({100 * counter/len(eqtlgen_list):g} %)")

Adipose Subcutaneous  &  0 (0.00\%) & - \\
Adipose Visceral Omentum  &  1 (0.03\%) & 1.45 ($p=0.498496$) \\
Adrenal Gland  &  3 (0.08\%) & 8.33 ($p=0.00593454$) \\
Artery Aorta  &  2 (0.05\%) & 1.03 ($p=0.577555$) \\
Artery Coronary  &  3 (0.08\%) & 1.08 ($p=0.5258$) \\
Artery Tibial  &  0 (0.00\%) & - \\
Brain Amygdala  &  0 (0.00\%) & - \\
Brain Anterior cingulate cortex BA24  &  2 (0.05\%) & 1.41 ($p=0.415078$) \\
Brain Caudate basal ganglia  &  0 (0.00\%) & - \\
Brain Cerebellar Hemisphere  &  0 (0.00\%) & - \\
Brain Cerebellum  &  0 (0.00\%) & - \\
Brain Cortex  &  0 (0.00\%) & - \\
Brain Frontal Cortex BA9  &  0 (0.00\%) & - \\
Brain Hippocampus  &  0 (0.00\%) & - \\
Brain Hypothalamus  &  0 (0.00\%) & - \\
Brain Nucleus accumbens basal ganglia  &  0 (0.00\%) & - \\
Brain Putamen basal ganglia  &  0 (0.00\%) & - \\
Brain Spinal cord cervical c-1  &  0 (0.00\%) & - \\
Brain Substantia nigra  &  0 (0.00\%) & - \\
Breast Mammary Tissue  &  3 (0.08\%) & 4.55 ($p=0.0294283$) \\
Cells 

# Framingham replication in eQTLGEN

# Framingham is broken, needs to be fixed

In [10]:
basepath = "/cbscratch/franco/trans-eqtl/dev-pipeline/gtex_v8_dysb/raw_oneref/fhs/tejaas/"
fhs_trans_dict = dict()
keffs = ["0.4", "0.5", "0.6", "0.7", "0.8"]
for keff in keffs:
    tejaas_file = os.path.join(basepath, "permnull_sbDynamic{:s}_knn30".format(keff), "trans_eqtls_5e-08.txt")
    if not os.path.exists(tejaas_file):
        print("File does not exist")
        continue
    print("Loading FHS Keff {:s}".format(keff), end="")
    transeqtls = tejaas(tejaas_file, dbsnp_dict)
    if len(transeqtls) > 0:
        fhs_trans_dict[keff] = transeqtls
        print(" has {:d} trans-eqtls".format(len(transeqtls)))
    else:
        print(" has 0 trans-eqtls")

Loading FHS Keff 0.4 has 30 trans-eqtls
Loading FHS Keff 0.5 has 169 trans-eqtls
Loading FHS Keff 0.6 has 248 trans-eqtls
Loading FHS Keff 0.7 has 401 trans-eqtls
Loading FHS Keff 0.8 has 407 trans-eqtls


In [11]:
for keff in keffs:
    counter = 0
    for snp in fhs_trans_dict[keff]:
        if eqtlgen_coord_dict[snp.chrom][snp.pos]:
            print(snp.rsid)
            counter += 1
    print("{:s}: Replicated {:d} trans-eQTLs".format(keff, counter))

3:56849749_T_C
6:31252396_C_A
6:31265490_C_T
6:31272261_T_C
7:50427982_G_A
0.4: Replicated 5 trans-eQTLs
3:56849749_T_C
6:31252396_C_A
6:31265490_C_T
6:31272261_T_C
7:50427982_G_A
7:50428445_C_T
0.5: Replicated 6 trans-eQTLs
3:56849749_T_C
6:31238318_A_G
6:31252396_C_A
6:31265490_C_T
6:31272261_T_C
7:50427982_G_A
7:50428445_C_T
0.6: Replicated 7 trans-eQTLs
3:56849749_T_C
6:31238318_A_G
6:31252396_C_A
6:31265490_C_T
6:31272261_T_C
6:32664458_C_T
7:50427982_G_A
7:50428445_C_T
0.7: Replicated 8 trans-eQTLs
3:56849749_T_C
6:31238318_A_G
6:31252396_C_A
6:31265490_C_T
6:31272261_T_C
6:32401079_G_A
6:32664458_C_T
7:50427982_G_A
7:50428445_C_T
0.8: Replicated 9 trans-eQTLs


In [13]:
1500* 400/4000000 

0.15

In [36]:
fhs_hg38_coord_dict = dict()
for chrom in np.arange(1,23):
    fhs_hg38_coord_dict[chrom] = collections.defaultdict(lambda: False)

liftfile="/cbscratch/franco/trans-eqtl/dev-pipeline/gtex_v8_dysb/raw_oneref/fhs/tejaas/permnull_sbDynamic0.8_knn30/hglft_genome_2cb9c_a305b0.bed"
with open(liftfile) as instream:
    for line in instream:
        chrom = int(line.strip().split(":")[0][3:])
        pos   = int(line.strip().split(":")[1].split("-")[0])
        fhs_hg38_coord_dict[chrom][pos] = True    

In [45]:
counter = 0
for t in trans_dict['wb']:
    if fhs_hg38_coord_dict[t.chrom][t.pos]:
        counter += 1
        print(t)