In [1]:
import os, sys

import numpy as np
import pandas as pd

# Tab-separated table; the first row supplies the column names and the first
# column becomes the DataFrame index.
# NOTE(review): judging by the file name this holds per-variant TEJAAS p-values — confirm.
shapeit_file = "/cbscratch/franco/from_saikat/gtex_v8_202003/all_variants_pvalues_tejaas.txt"
shapeit_df = pd.read_csv(
    shapeit_file,
    sep="\t",
    header=0,
    index_col=0,
)

In [2]:
import gzip 

# Build a lookup from the third whitespace-separated column of the GTEx lookup
# table to its seventh column. The first line of the file is a header.
dbsnp_lookup_file = "/cbscratch/franco/datasets/gtex_v8/genotypes/GTEx_Analysis_2017-06-05_v8_WholeGenomeSeq_838Indiv_Analysis_Freeze.lookup_table.txt.gz"
dbsnp_dict = {}
with gzip.open(dbsnp_lookup_file) as instream:
    # Discard the header row before reading records.
    next(instream)
    for line in instream:
        # The file is opened in binary mode, so every line is bytes and has to
        # be decoded before it can be split into fields.
        arr = line.decode().rstrip().split()
        # A filter on single-character arr[3] / arr[4] used to sit here and is
        # deliberately disabled:
        #if len(arr[3]) == 1 and len(arr[4]) == 1:
        dbsnp_dict[arr[2]] = arr[6]

In [18]:
sys.path.append('../../')
sys.path.append('/usr/users/fsimone/tejaas')
import json
from utils import utils

#from utils import mpl_stylesheet
#mpl_stylesheet.banskt_presentation(fontfamily = 'latex-clearsans', fontsize = 22, colors = 'banskt', dpi = 300)
# Short tissue names and their full descriptions come from the tissue table.
tissue_file = "/usr/users/fsimone/trans-eqtl-pipeline/analysis/plots/tissue_table.txt"
tissuenames, descriptions, tstring = utils.read_tissues_str(tissue_file)

# Per-tissue metadata, keyed by the full description with spaces replaced by "_".
json_file = "../../gtex_v8_metadata.json"
with open(json_file) as instream:
    gtex_meta = json.load(instream)

# Three parallel lookups, all keyed by the short tissue name.
tissue_colors = {}
tissue_names = {}
tissue_samples = {}
for tshort, tfull in zip(tissuenames, descriptions):
    tissue_names[tshort] = tfull
    # Look the metadata entry up once and reuse it for both fields.
    tissue_meta = gtex_meta[tfull.replace(" ", "_")]
    tissue_colors[tshort] = "#" + tissue_meta["colorHex"]
    tissue_samples[tshort] = tissue_meta["rnaSeqAndGenotypeSampleCount"]

In [19]:
import collections
# Field names of one parsed result record, in positional order.
SNPRES_FIELDS = [
    'rsid',
    'chrom',
    'pos',
    'logp',
    'dbSNP',
    'maf',
]


class SNPRes(collections.namedtuple('_SNPRes', field_names=SNPRES_FIELDS)):
    """Immutable record with the fields listed in SNPRES_FIELDS.

    The empty ``__slots__`` keeps instances as lightweight as the underlying
    namedtuple (no per-instance ``__dict__``).
    """

    __slots__ = ()

def tejaas(filepath, dbsnp_dict):
    """Parse a tab-separated result file into a list of SNPRes records.

    The first line is treated as a header and skipped. From every remaining
    non-blank line the function reads column 0 (variant id), column 2
    (position), column 3 (MAF) and column 7 (p-value).

    Parameters
    ----------
    filepath : str
        Path of the result file to read.
    dbsnp_dict : dict
        Mapping from the variant id in column 0 to the value stored in the
        ``dbSNP`` field of each record.

    Returns
    -------
    list of SNPRes
        One record per data line, in file order; ``logp`` holds -log10(p).
        An empty list is returned when the file is empty or contains only a
        header and blank lines.

    Raises
    ------
    KeyError
        If a variant id is not present in ``dbsnp_dict``.
    """
    res = list()
    with open(filepath, 'r') as mfile:
        # Skip the header. The default keeps a completely empty file from
        # raising StopIteration.
        next(mfile, None)
        for line in mfile:
            stripped = line.strip()
            # Blank lines (e.g. a trailing newline at the end of the file)
            # carry no record and would otherwise fail on the column lookups.
            if not stripped:
                continue
            arr   = stripped.split("\t")
            rsid  = arr[0]
            # Drop the first three characters of the leading "_"-separated
            # token (presumably a "chr" prefix — confirm against the file format).
            chrom = rsid.split("_")[0][3:]
            pos   = int(arr[2])
            p     = float(arr[7])
            # log10(0) is undefined, so p == 0 is floored at 10e-30 (i.e. 1e-29).
            logp  = np.log10(p) if p!=0 else np.log10(10e-30)
            maf   = float(arr[3])
            res.append(SNPRes(rsid=rsid, chrom=chrom, pos=pos, logp=-logp, dbSNP=dbsnp_dict[rsid], maf=maf))
    return res

def smart_LD_filter(trans_eqtls, trans_eqtls_ld_regions_file):
    """Keep only the records whose ``rsid`` is marked as passing in a regions file.

    The regions file is tab-separated. A row marks its first column as
    passing when its fifth column is non-empty.

    Parameters
    ----------
    trans_eqtls : iterable
        Objects exposing an ``rsid`` attribute (e.g. SNPRes records).
    trans_eqtls_ld_regions_file : str
        Path of the tab-separated regions file.

    Returns
    -------
    list
        The elements of ``trans_eqtls`` whose ``rsid`` passed, in their
        original order.
    """
    # A set gives constant-time membership tests in the final filter.
    pass_snps = set()
    with open(trans_eqtls_ld_regions_file) as instream:
        for line in instream:
            # Strip only the newline so that trailing empty columns survive the split.
            arr = line.strip("\n").split("\t")
            # Rows with fewer than five columns (such as a trailing blank
            # line) are treated like rows with an empty fifth column.
            if len(arr) > 4 and len(arr[4]) > 0:
                pass_snps.add(arr[0])
    return [x for x in trans_eqtls if x.rsid in pass_snps]

# Load the trans-eQTL results of every tissue into trans_dict, keyed by the
# short tissue name. Tissues without a result file get no entry at all.
basepath = "/cbscratch/franco/trans-eqtl/protein_coding_lncRNA_gamma01_knn30_cut5e-8/"
trans_filename = "trans_eqtls.txt" # "trans_eqtls_ldpruned.txt"
trans_dict = {}
for tissue in tissuenames:
    tejaas_file = os.path.join(basepath, tissue, trans_filename)
    if os.path.exists(tejaas_file):
        print("Loading ", tissue)
        transeqtls = tejaas(tejaas_file, dbsnp_dict)
        if not transeqtls:
            # The file exists but holds no records: store an empty list and report it.
            trans_dict[tissue] = []
            print("{:s} has 0 trans-eqtls".format(tissue))
        else:
            trans_dict[tissue] = transeqtls
    else:
        print("{:s} has no trans-eqtl results".format(tissue))

Loading  as
Loading  av
Loading  ag
Loading  aa
Loading  ac
Loading  at
Loading  bam
Loading  ban
Loading  bca
Loading  bceh
Loading  bce
Loading  bco
Loading  bfr
Loading  bhi
Loading  bhy
Loading  bnu
Loading  bpu
Loading  bsp
Loading  bsu
Loading  br
Loading  ebv
Loading  fib
Loading  cols
Loading  colt
Loading  esog
Loading  esom
Loading  esomu
Loading  haa
haa has 0 trans-eqtls
Loading  hlv
Loading  kc
Loading  liv
Loading  lu
Loading  msg
Loading  ms
Loading  nt
Loading  ov
Loading  pan
pan has 0 trans-eqtls
Loading  pit
Loading  pro
Loading  snse
Loading  sse
Loading  si
Loading  spl
Loading  sto
Loading  tes
Loading  thy
Loading  ut
Loading  va
va has 0 trans-eqtls
Loading  wb
wb has 0 trans-eqtls


In [22]:
raqtl_file_control = "/cbscratch/franco/datasets/raQTLs/k562.matched.control.LP190708.txt.gz"
raqtl_file = "/cbscratch/franco/datasets/raQTLs/k562.sign.id.LP190708.txt.gz"


def _load_raqtl_flags(path):
    """Read a gzipped, tab-separated file and flag every id in its second column.

    The first line is a header and is skipped.

    Parameters
    ----------
    path : str
        Path of the gzipped file.

    Returns
    -------
    collections.defaultdict
        Maps each value found in the second column to True; any other key
        evaluates to False when looked up with ``[]``.
    """
    flags = collections.defaultdict(lambda: False)
    with gzip.open(path) as instream:
        # The header row carries no ids.
        next(instream)
        for line in instream:
            # The file is opened in binary mode, so decode before splitting.
            arr = line.decode().strip().split("\t")
            flags[arr[1]] = True
    return flags


# Both files share one layout, so a single loader serves the significant set
# and the matched control set.
raqtl_dict = _load_raqtl_flags(raqtl_file)
raqtl_control_dict = _load_raqtl_flags(raqtl_file_control)


In [24]:
# For every tissue with at least one trans-eQTL, print how many of its records
# have a dbSNP id present in the raQTL set and in the control set.
for tissue in tissuenames:
    # Tissues whose result file was missing have no entry in trans_dict;
    # treat them as having no trans-eQTLs instead of raising KeyError.
    tissue_teqtls = trans_dict.get(tissue, [])
    if len(tissue_teqtls) > 0:
        counter = 0
        counter_control = 0
        for teqtl in tissue_teqtls:
            # .get() does not trigger the defaultdict factory, so a miss no
            # longer inserts a new False entry into the lookup tables.
            if raqtl_dict.get(teqtl.dbSNP, False):
                counter += 1
            if raqtl_control_dict.get(teqtl.dbSNP, False):
                counter_control += 1
        print(tissue, counter, counter_control)
        

as 5 3
av 4 7
ag 1 5
aa 26 13
ac 29 23
at 5 4
bam 19 9
ban 18 12
bca 0 0
bceh 1 2
bce 0 0
bco 0 1
bfr 0 1
bhi 0 0
bhy 1 1
bnu 3 4
bpu 0 0
bsp 0 0
bsu 0 2
br 9 3
ebv 6 1
fib 1 1
cols 2 1
colt 19 18
esog 1 0
esom 0 0
esomu 0 0
hlv 4 1
kc 0 1
liv 1 0
lu 7 5
msg 0 0
ms 48 28
nt 7 4
ov 0 0
pit 0 0
pro 15 17
snse 11 11
sse 7 4
si 0 0
spl 0 0
sto 4 3
tes 20 14
thy 14 9
ut 27 21
