In [1]:
import os, sys

import numpy as np
import pandas as pd

# shapeit_file = "/cbscratch/franco/from_saikat/gtex_v8_202003/all_variants_pvalues_tejaas.txt"
# shapeit_df = pd.read_csv(shapeit_file, sep="\t", header=0, index_col=0)
# snps_list = list(shapeit_df.index)

# SuRE SNP table; only the column at position 1 is read from the file.
raqtl_snp_file = "/cbscratch/franco/datasets/raQTLs/SuRE_SNP_table_LP190708.txt.gz"
sure_df = pd.read_csv(
    raqtl_snp_file,
    sep="\t",
    header=0,
    usecols=[1],
)
# Flatten the single-column frame into a plain Python list of its values.
snps_list = list(sure_df.values.ravel())

In [2]:
import gzip 

# Build a lookup from the value in column 2 of the GTEx lookup table to the
# value in column 6 (whitespace-delimited, header line skipped).
# Presumably variant id -> dbSNP rsID; confirm against the table header.
dbsnp_dict = {}
with gzip.open("/cbscratch/franco/datasets/gtex_v8/genotypes/GTEx_Analysis_2017-06-05_v8_WholeGenomeSeq_838Indiv_Analysis_Freeze.lookup_table.txt.gz") as instream:
    next(instream)  # skip the header line
    for raw_line in instream:
        fields = raw_line.decode().rstrip().split()
        # No filtering on the length of columns 3/4 is applied
        # (previously tried: len(fields[3]) == 1 and len(fields[4]) == 1).
        rs_value = fields[6]
        dbsnp_dict[fields[2]] = rs_value

In [3]:
sys.path.append('../../')
sys.path.append('/usr/users/fsimone/tejaas')
import json
from utils import utils

#from utils import mpl_stylesheet
#mpl_stylesheet.banskt_presentation(fontfamily = 'latex-clearsans', fontsize = 22, colors = 'banskt', dpi = 300)
# Short tissue names, their full descriptions, and a third value returned by the project helper.
tissue_file = "/usr/users/fsimone/trans-eqtl-pipeline/analysis/plots/tissue_table.txt"
tissuenames, descriptions, tstring = utils.read_tissues_str(tissue_file)

# GTEx metadata, keyed by the full tissue description with spaces replaced by underscores.
json_file = "../../gtex_v8_metadata.json"
with open(json_file) as meta_stream:
    gtex_meta = json.load(meta_stream)

# Per-tissue lookup tables, all keyed by the short tissue name.
tissue_colors, tissue_names, tissue_samples = {}, {}, {}
for tshort, tfull in zip(tissuenames, descriptions):
    tissue_names[tshort] = tfull
    meta_entry = gtex_meta[tfull.replace(" ", "_")]
    tissue_colors[tshort] = "#" + meta_entry["colorHex"]
    tissue_samples[tshort] = meta_entry["rnaSeqAndGenotypeSampleCount"]
    
# special_tissues = ['ag', 'haa', 'liv', 'msg', 'pan', 'pit', 'si', 'spl', 'va', 'wb']
# special_tissues = ['haa', 'pan', 'spl', 'wb']

In [58]:
import collections
import re
# Field names of one trans-eQTL record, in positional order.
SNPRES_FIELDS = ['rsid', 'chrom', 'pos', 'logp', 'dbSNP', 'maf']

_SNPResBase = collections.namedtuple('_SNPRes', SNPRES_FIELDS)


class SNPRes(_SNPResBase):
    """Immutable record for a single SNP result; fields are listed in SNPRES_FIELDS."""

    # Empty __slots__ keeps instances free of a per-instance __dict__.
    __slots__ = ()

def tejaas(filepath, dbsnp_dict):
    """Parse a tab-separated result file into a list of SNPRes records.

    The first line of the file is skipped as a header. For every remaining line:
      * column 0 is the variant id (stored as ``rsid``); the chromosome is the
        first underscore-delimited token of that id with its first three
        characters removed (presumably a "chr" prefix; confirm),
      * column 2 is parsed as the integer position,
      * column 3 is parsed as a float and stored as ``maf``,
      * column 7 is parsed as the p-value and stored as -log10(p); a p-value of
        exactly 0 is replaced by 10e-30 before taking the logarithm.

    Parameters
    ----------
    filepath : str
        Path of the result file.
    dbsnp_dict : dict
        Mapping from variant id to the value stored in the ``dbSNP`` field.

    Returns
    -------
    list of SNPRes

    Raises
    ------
    KeyError
        If a variant id from the file is missing in ``dbsnp_dict``.
    """
    zero_p_replacement = 10e-30
    records = []
    with open(filepath, 'r') as handle:
        next(handle)  # header
        for row in handle:
            fields = row.strip().split("\t")
            variant_id = fields[0]
            chromosome = variant_id.split("_")[0][3:]
            position = int(fields[2])
            pvalue = float(fields[7])
            if pvalue != 0:
                log10_p = np.log10(pvalue)
            else:
                log10_p = np.log10(zero_p_replacement)
            allele_freq = float(fields[3])
            records.append(
                SNPRes(
                    rsid=variant_id,
                    chrom=chromosome,
                    pos=position,
                    logp=-log10_p,
                    dbSNP=dbsnp_dict[variant_id],
                    maf=allele_freq,
                )
            )
    return records

def smart_LD_filter(trans_eqtls, trans_eqtls_ld_regions_file):
    """Keep only the trans-eQTLs whose id is flagged in an LD-regions file.

    The regions file is tab-separated without a header. A line marks its
    column-0 id as passing when column 4 is present and non-empty. Lines with
    fewer than five columns (e.g. a trailing blank line) are ignored.

    Parameters
    ----------
    trans_eqtls : iterable
        Records exposing an ``rsid`` attribute.
    trans_eqtls_ld_regions_file : str
        Path of the LD-regions file.

    Returns
    -------
    list
        The records of ``trans_eqtls`` whose ``rsid`` passed, in input order.
    """
    # A set gives O(1) membership tests for the final filter.
    pass_snps = set()
    with open(trans_eqtls_ld_regions_file) as instream:
        for line in instream:
            arr = line.strip("\n").split("\t")
            if len(arr) > 4 and len(arr[4]) > 0:
                pass_snps.add(arr[0])
    return [x for x in trans_eqtls if x.rsid in pass_snps]

# Result directories of the runs that are combined below, keyed by gamma setting.
paths = dict()
paths["gamma01"] = "/cbscratch/franco/trans-eqtl/protein_coding_lncRNA_gamma01_knn30_cut5e-8/"
#paths["optim"] = "/cbscratch/franco/trans-eqtl/protein_coding_lncRNA_optim_gamma_knn30_cut5e-8/"
paths["gamma0006"] = "/cbscratch/franco/trans-eqtl/protein_coding_lncRNA_gamma0006_knn30_cut5e-8/"

# Tissues read from the "gamma0006" run; every other tissue is read from "gamma01".
# NOTE(review): this name was used below but only existed as commented-out code in an
# earlier cell, so this cell raised NameError on a fresh kernel. Two candidate lists were
# left in the notebook:
#   ['ag', 'haa', 'liv', 'msg', 'pan', 'pit', 'si', 'spl', 'va', 'wb']
#   ['haa', 'pan', 'spl', 'wb']
# The second one (the one that would win if both were uncommented) is used here.
# Confirm that it matches the list used for the saved results.
special_tissues = ['haa', 'pan', 'spl', 'wb']

set_bg   = "old"
set_path = "gamma0006"

# Label used in the output file name. Alternatives that were tried:
# PATH=f"{set_path}_LDregions" # with smartLD filter
# PATH=f"{set_path}_noregions" # without smartLD filter
PATH="combined_noregions" # without smartLD filter

trans_dict = dict()
trans_filename = "trans_eqtls_ldpruned.txt"
# The LD-region filter is applied only to LD-pruned files and only for "LDregions" labels.
apply_ld_filter = ("ldpruned" in trans_filename) and ("LDregions" in PATH)

for tissue in tissuenames:
    basepath = paths['gamma0006'] if tissue in special_tissues else paths['gamma01']
#     basepath = paths[set_path]
    tejaas_file = os.path.join(basepath, tissue, trans_filename)
    if not os.path.exists(tejaas_file):
        print("{:s} has no trans-eqtl results".format(tissue))
        trans_dict[tissue] = []
        continue
    print("Loading ", tissue)
    transeqtls = tejaas(tejaas_file, dbsnp_dict)
    if len(transeqtls) == 0:
        trans_dict[tissue] = []
        print("{:s} has 0 trans-eqtls".format(tissue))
        continue
    if apply_ld_filter:
        regions_file = os.path.join(basepath, tissue, "ld_regions.txt")
        transeqtls = smart_LD_filter(transeqtls, regions_file)
    trans_dict[tissue] = transeqtls

# Variant ids over all tissues (with repeats), then de-duplicated.
all_transeqtls = list()
for tissue in tissuenames:
    all_transeqtls += [x.rsid for x in trans_dict[tissue]]

# Sorted so that the order (and everything derived from it) is the same on every run.
all_uniq_teqtls = sorted(set(all_transeqtls))
print("uniq trans-eqtls:", len(all_uniq_teqtls))


Loading  as
Loading  av
Loading  ag
Loading  aa
Loading  ac
Loading  at
Loading  bam
Loading  ban
Loading  bca
Loading  bceh
Loading  bce
Loading  bco
Loading  bfr
Loading  bhi
Loading  bhy
Loading  bnu
Loading  bpu
Loading  bsp
Loading  bsu
Loading  br
Loading  ebv
Loading  fib
Loading  cols
Loading  colt
Loading  esog
Loading  esom
Loading  esomu
Loading  haa
Loading  hlv
Loading  kc
Loading  liv
Loading  lu
Loading  msg
Loading  ms
Loading  nt
Loading  ov
Loading  pan
Loading  pit
Loading  pro
Loading  snse
Loading  sse
Loading  si
Loading  spl
Loading  sto
Loading  tes
Loading  thy
Loading  ut
Loading  va
va has 0 trans-eqtls
Loading  wb
uniq trans-eqtls: 18851


In [59]:
# K562: significant raQTL ids and their matched controls.
k562_file = "/cbscratch/franco/datasets/raQTLs/k562.sign.id.LP190708.txt.gz"
k562_file_control = "/cbscratch/franco/datasets/raQTLs/k562.matched.control.LP190708.txt.gz"

# HepG2: significant raQTL ids and their matched controls.
hepg2_file = "/cbscratch/franco/datasets/raQTLs/hepg2.sign.id.LP190708.txt.gz"
hepg2_file_control = "/cbscratch/franco/datasets/raQTLs/hepg2.matched.control.LP190708.txt.gz"

def load_raqtls(filename):
    """Read a gzipped, tab-separated table and flag every id found in column 1.

    The first line is treated as a header and skipped; an empty file yields an
    empty table. Blank lines are ignored.

    Parameters
    ----------
    filename : str
        Path of the gzip-compressed table.

    Returns
    -------
    collections.defaultdict
        Maps each column-1 value to True. Looking up any other key returns
        False (and, as with every defaultdict, inserts that key).
    """
    # `bool` as factory yields False for missing keys and, unlike a lambda, can be pickled.
    raqtl_dict = collections.defaultdict(bool)
    with gzip.open(filename) as instream:
        next(instream, None)  # skip the header; tolerate an empty file
        for line in instream:
            text = line.decode().strip()
            if not text:
                continue  # e.g. a trailing blank line
            arr = text.split("\t")
            raqtl_dict[arr[1]] = True
    return raqtl_dict
        
# similar to Blood
raqtl_k562, raqtl_k562_control = (
    load_raqtls(path) for path in (k562_file, k562_file_control)
)

# similar to Liver
raqtl_hepg2, raqtl_hepg2_control = (
    load_raqtls(path) for path in (hepg2_file, hepg2_file_control)
)

In [60]:
def sample_rand_bg(snps_list, raqtl_dict, dbsnp_dict, nchoose = 20000, niter = 50):
    """Estimate the fraction of randomly drawn variants that are flagged in ``raqtl_dict``.

    In each of ``niter`` iterations, ``nchoose`` distinct positions of
    ``snps_list`` are drawn with ``np.random.choice``; each drawn variant id is
    translated through ``dbsnp_dict`` and the translated id is looked up in
    ``raqtl_dict``. Progress is printed on a single line.

    Returns
    -------
    float
        Mean number of flagged variants per iteration, divided by ``nchoose``.
    """
    hits_per_iter = []
    print('Iteration', end="")
    for iteration in range(niter):
        drawn = np.sort(np.random.choice(len(snps_list), nchoose, replace = False))
        print(f' {iteration}', end="")
        n_hits = sum(1 for pos in drawn if raqtl_dict[dbsnp_dict[snps_list[pos]]])
        hits_per_iter.append(n_hits)
    print("")
    mean_hits = np.mean(np.array(hits_per_iter))
    return mean_hits / nchoose

def sample_rand_bg_dbSNP(snps_list, raqtl_dict, dbsnp_dict, nchoose = 20000, niter = 50):
    """Estimate the fraction of randomly drawn ids that are flagged in ``raqtl_dict``.

    Same procedure as ``sample_rand_bg``, except that the entries of
    ``snps_list`` are looked up in ``raqtl_dict`` directly; ``dbsnp_dict`` is
    accepted but not used.

    Returns
    -------
    float
        Mean number of flagged ids per iteration, divided by ``nchoose``.
    """
    hits_per_iter = []
    print('Iteration', end="")
    for iteration in range(niter):
        drawn = np.sort(np.random.choice(len(snps_list), nchoose, replace = False))
        print(f' {iteration}', end="")
        n_hits = sum(1 for pos in drawn if raqtl_dict[snps_list[pos]])
        hits_per_iter.append(n_hits)
    print("")
    mean_hits = np.mean(np.array(hits_per_iter))
    return mean_hits / nchoose

# # K562 background
# bg_k562         = sample_rand_bg_dbSNP(snps_list, raqtl_k562, dbsnp_dict)
# bg_control_k562 = sample_rand_bg_dbSNP(snps_list, raqtl_k562_control, dbsnp_dict)

# # HEPG2 background
# bg_hepg2         = sample_rand_bg_dbSNP(snps_list, raqtl_hepg2, dbsnp_dict)
# bg_control_hepg2 = sample_rand_bg_dbSNP(snps_list, raqtl_hepg2_control, dbsnp_dict)

# import json
# bg_data = dict(zip(['k562', 'k562_control', 'hepg2', 'hepg2_control'], [bg_k562, bg_control_k562, bg_hepg2, bg_control_hepg2]))
# bg_json = json.dumps(bg_data)
# with open("/cbscratch/franco/datasets/raQTLs/raqtls_background_orig_paper.txt", 'w') as outstream:
#     outstream.write(bg_json)

In [61]:
import scipy.stats as ss

# Background fractions, one per dataset name used below, stored as JSON.
bg_files = {
    "old": "/cbscratch/franco/datasets/raQTLs/raqtls_background_shapeit2.txt",
    "orig": "/cbscratch/franco/datasets/raQTLs/raqtls_background_orig_paper.txt",
}
if set_bg not in bg_files:
    # Fail loudly instead of silently reusing (or missing) `bg_data`.
    raise ValueError(f"Unknown set_bg {set_bg!r}; expected one of {sorted(bg_files)}")
with open(bg_files[set_bg]) as ins:
    bg_data = json.load(ins)


def _binom_pval_greater(n_hits, n_total, bg_frac):
    """One-sided ('greater') binomial test p-value, working across SciPy versions."""
    if hasattr(ss, "binomtest"):
        # `binom_test` was deprecated and later removed from SciPy; `binomtest` replaces it.
        return float(ss.binomtest(n_hits, n_total, bg_frac, alternative='greater').pvalue)
    return float(ss.binom_test(n_hits, n_total, bg_frac, alternative='greater'))


def _enrichment_stats(n_hits, n_total, bg_frac):
    """Return count, fraction, enrichment over background and binomial p-value as a dict."""
    frac = n_hits / n_total
    return {
        'count': n_hits,
        'frac': frac,
        'enrichment': frac / bg_frac,
        'pval': _binom_pval_greater(n_hits, n_total, bg_frac),
    }


data = dict(zip(['k562', 'k562_control', 'hepg2', 'hepg2_control'],[raqtl_k562, raqtl_k562_control, raqtl_hepg2, raqtl_hepg2_control]))
results = collections.defaultdict(dict)

# dataset name -> unique trans-eQTL ids flagged in that dataset
variant_in_dataset = collections.defaultdict(list)
for dataset in data:
    raqtl_dict = data[dataset]
    bg_frac = bg_data[dataset]
    this_res = collections.defaultdict(dict)

    # Per-tissue enrichment; tissues without trans-eQTLs are skipped.
    for tissue in tissuenames:
        teqtls = trans_dict[tissue]
        if len(teqtls) == 0:
            continue
        # `.get` avoids inserting every looked-up id into the defaultdict tables.
        counter = sum(1 for teqtl in teqtls if raqtl_dict.get(teqtl.dbSNP, False))
        stats = _enrichment_stats(counter, len(teqtls), bg_frac)
        print(dataset, tissue, stats['count'], stats['frac'], stats['enrichment'], stats['pval'])
        this_res[tissue] = stats

    # Enrichment over the unique trans-eQTLs of all tissues together.
    counter = 0
    for teqtl in all_uniq_teqtls:
        if raqtl_dict.get(dbsnp_dict[teqtl], False):
            variant_in_dataset[dataset].append(teqtl)
            counter += 1
    if len(all_uniq_teqtls) > 0:
        stats = _enrichment_stats(counter, len(all_uniq_teqtls), bg_frac)
        this_res['all'] = stats
        print(dataset, 'all', stats['count'], stats['frac'], stats['enrichment'], stats['pval'])
    results[dataset] = this_res
        

k562 as 3 0.005119453924914676 3.470816220281136 0.05698388226276416
k562 av 0 0.0 0.0 1.0
k562 ag 1 0.005434782608695652 3.6845983787767134 0.23784120646472884
k562 aa 4 0.0030816640986132513 2.089263795670001 0.12765780702202212
k562 ac 4 0.002599090318388564 1.7620951311108908 0.19447842196869689
k562 at 0 0.0 0.0 1.0
k562 bam 8 0.00676818950930626 4.588603057156787 0.0004503264147058246
k562 ban 4 0.0041928721174004195 2.8426251643392675 0.05440121344573258
k562 bca 0 0.0 0.0 1.0
k562 bceh 0 0.0 0.0 1.0
k562 bce 0 0.0 0.0 1.0
k562 bco 0 0.0 0.0 1.0
k562 bfr 0 0.0 0.0 1.0
k562 bhi 0 0.0 0.0 1.0
k562 bhy 0 0.0 0.0 1.0
k562 bnu 1 0.002638522427440633 1.788828764366531 0.4284696474284174
k562 bpu 0 0.0 0.0 1.0
k562 bsp 0 0.0 0.0 1.0
k562 bsu 0 0.0 0.0 1.0
k562 br 3 0.005940594059405941 4.027521396207417 0.03970542416872813
k562 ebv 5 0.016835016835016835 11.413570735604633 9.093065307333595e-05
k562 fib 0 0.0 0.0 1.0
k562 cols 0 0.0 0.0 1.0
k562 colt 5 0.004258943781942078 2.8874195131

hepg2_control wb 15 0.002392726112617642 1.9774595972046631 0.011216047266037218
hepg2_control all 50 0.002652379184128163 2.192048912502614 5.764635462947197e-07


In [62]:
# Serialise first, so the output file is only opened once encoding has succeeded.
json_dict = json.dumps(results)
enrichment_outfile = f"raqtl_enrichment_{set_bg}bg_{PATH}_ldpruned.txt"
with open(enrichment_outfile, 'w') as outstream:
    outstream.write(json_dict)

In [63]:
# Number of unique trans-eQTLs flagged in each dataset.
for d in variant_in_dataset:
    print(d, len(variant_in_dataset[d]))

# `.get` keeps the defaultdict from gaining empty entries for datasets without hits.
k562_hits = set(variant_in_dataset.get('k562', ()))
hepg2_hits = set(variant_in_dataset.get('hepg2', ()))

int_n = len(k562_hits & hepg2_hits)
print("intersection:", int_n)
# Union of both sets (|K562| + |HepG2| - |intersection|), computed from the data
# rather than from counts copied out of an earlier output.
total_raqtl = len(k562_hits | hepg2_hits)
print(f"Total raQTLs: {total_raqtl}")
print(f"Total fraction of trans-eQTLs that are raQTLs: {total_raqtl/len(all_uniq_teqtls)}")

k562 64
k562_control 48
hepg2 45
hepg2_control 50
intersection: 8
Total raQTLs: 101
Total fraction of trans-eQTLs that are raQTLs: 0.005357805951938889
