In [1]:
%load_ext autoreload
%autoreload 2
import numpy as np
import matplotlib.pyplot as plt
import os
import sys
sys.path.append('../')
sys.path.append('/usr/users/fsimone/tejaas')
from iotools import readgtf

base_dir = "/cbscratch/franco/datasets"

import collections

GENEINFO_FIELDS = ['name', 'ensembl_id', 'chrom', 'start', 'end']
class GeneInfo(collections.namedtuple('_GeneInfo', GENEINFO_FIELDS)):
    __slots__ = ()

def read_TFannot(infile):
    TF_list = list()
    with open(infile) as instream:
        next(instream)
        for line in instream:
            arr = line.rstrip().split()
            TF_list.append(GeneInfo(ensembl_id=arr[0], chrom=int(arr[1]), start=int(arr[2]), end=int(arr[3]), name=arr[4]))
    return TF_list

TF_annot = read_TFannot("../TF_annotation.txt")

import collections
TF_dict = collections.defaultdict(dict)
for g in TF_annot:
    TF_dict[g.chrom][g.ensembl_id] = g.start
    
def search_TF(TF_dict, chrom, pos):
    TSS_distance = 1e10
    cisTF = None
    found = False
    for TF in TF_dict[chrom].keys():
        dist = pos - TF_dict[chrom][TF]
        if np.abs(dist) < TSS_distance:
            TSS_distance = dist
            cisTF = TF
            found = True
    if not found:
        print(chrom, pos)
    return TSS_distance, cisTF

In [2]:
import mpmath
from operator import attrgetter

mpmath.mp.dps = 50
def pval(x): return float(mpmath.log10(1 - 0.5 * (1 + mpmath.erf(x/mpmath.sqrt(2)))))

SNPRES_FIELDS = ['rsid', 'chrom', 'pos', 'logp', 'TFdist', 'TFname']
class SNPRes(collections.namedtuple('_SNPRes', SNPRES_FIELDS)):
    __slots__ = ()

CISTF_FIELDS = ['rsid', 'chrom', 'logp', 'TFpos', 'TFname', 'fdr']
class CisTF(collections.namedtuple('_CisTF', CISTF_FIELDS)):
    __slots__ = ()
       
        
def tejaas(filepath, chrom):
    res = list()
    with open(filepath, 'r') as mfile:
        next(mfile)
        for line in mfile:
            arr   = line.strip().split("\t")
            rsid  = arr[0]
            pos   = int(arr[1])
            p     = float(arr[5])
            q     = float(arr[2])
            mu    = float(arr[3])
            sigma = float(arr[4])
            if sigma == 0:
                continue
            logp  = np.log10(p) if p != 0 else pval( (q - mu) / sigma)
            res.append(SNPRes(rsid=rsid, chrom=chrom, pos=pos, logp=-logp, TFdist=None, TFname=None))
    return res

def matrixeqtl(filepath, chrom):
    res = list()
    with open(filepath, 'r') as mfile:
        next(mfile)
        for line in mfile:
            arr  = line.strip().split("\t")
            rsid = arr[0]
            gene = arr[1].split(".")[0]
            logp = np.log10(float(arr[4]))
            fdr  = np.log10(float(arr[5]))
            res.append(CisTF(rsid=rsid, chrom=chrom, logp=-logp, TFpos=None, TFname=gene, fdr=-fdr))
    return res

def filter_meqtl_TFs(ciseqtls, TF_dict):
    res = list()
    for eqtl in ciseqtls:
        if eqtl.TFname in TF_dict[eqtl.chrom]:
            res.append(eqtl._replace(TFpos=TF_dict[eqtl.chrom][eqtl.TFname]))
    return res

def get_positive_set(snplist, TF_dict, cutoff = -np.log10(0.01), nsnps = 1000, window=2e6):
    sorted_list = sorted(snplist, key=attrgetter('logp'), reverse=True)
    
    counter = 0
    snp_objects = list()
    for snp in sorted_list:
        if snp.logp > cutoff:
            dist, tf = search_TF(TF_dict, snp.chrom, snp.pos)
            newsnp = snp._replace(TFdist = dist, TFname=tf)
            snp_objects.append(newsnp)
            counter += 1
            if counter >= nsnps:
                break
    snp_objects_dist = [x for x in snp_objects if np.abs(x.TFdist) <= window]
    print("selected {:d}/{:d} snps, cutoff: {:g}".format(len(snp_objects_dist), len(snp_objects),snp_objects_dist[-1].logp))
    return snp_objects_dist

In [152]:
from utils import utils
tissue_file = "/usr/users/fsimone/trans-eqtl-pipeline/main/tissues.txt"
tissuenames, descriptions = utils.read_tissues(tissue_file)
tissues = ["gtex-"+t for t in tissuenames]
# tissues = ["gtex-ms"]


datadir = "/cbscratch/franco/trans-eqtl/dev-pipeline/gtex_v6_norm/"

chrms = np.arange(1,23)
# chrms = [6, 7, 8]
snp_res_dict = collections.defaultdict(dict)
snp_res_dict_top = collections.defaultdict(dict)
expressions = ["norm"] #["lmcorrected_age"] #
methods = ["matrixeqtl_rand", "tejaas_rand"]
sbs = ["0.05"]#, "0.1"]
LD = False
isworst = False
for sb in sbs:
    for expr in expressions:
        snp_res_dict[sb][expr] = collections.defaultdict(dict)
        snp_res_dict_top[sb][expr] = collections.defaultdict(dict)
        for tissue in tissues:
            snp_res_dict[sb][expr][tissue] = collections.defaultdict(dict)
            snp_res_dict_top[sb][expr][tissue] = collections.defaultdict(dict)
            for method in methods:
                snp_res = list()
                snp_res_top = list()
                print(tissue, method, expr, end=" ")
                for chrom in chrms:
                    print(chrom, end=" ")
                    if method.startswith("tejaas"):
                        # inputfile = os.path.join(datadir, expr, tissue, method, "permnull_sb"+sb, "chr"+str(chrom), "rr.txt")
                        # snp_res += tejaas(inputfile, chrom)
                        inputfile = os.path.join(datadir, expr, tissue, method, "permnull_sb"+sb, "chr"+str(chrom), "rr.txt.top2000")
                        snp_res_top += tejaas(inputfile, chrom)
                    else:
                        inputfile = os.path.join(datadir, expr, tissue, method, "chr"+str(chrom), "cis_eqtl.txt")
                        snp_res += matrixeqtl(inputfile, chrom)
                print("")
                snp_res_dict[sb][expr][tissue][method] = snp_res
                snp_res_dict_top[sb][expr][tissue][method] = snp_res_top
            
            mytissue = tissue
            mysb = "0.05"
            myexpr = expr
            mycismethod = "matrixeqtl_rand"
            fdrcutoff = 0.05
            cis_list = snp_res_dict[mysb][myexpr][mytissue][mycismethod]
            topcis_list = sorted(cis_list, key=attrgetter('fdr'), reverse=True)
            cis_TFs = filter_meqtl_TFs(topcis_list, TF_dict)
            sigcis_TFs = [x for x in cis_TFs if x.fdr > -np.log(fdrcutoff)]
            print("{:s} {:d}/{:d} of cis-eQTLs target TFs".format(mytissue, len(sigcis_TFs), len(topcis_list)))
            cis_dict = collections.defaultdict(list)
            for e in cis_TFs:
                cis_dict[e.rsid].append(e)

            nsnps = 1000
            window = 1e6
            Numbins = 50
            mytransmethod = "tejaas_rand"
            outdir   = "/cbscratch/franco/trans-eqtl/analysis/new_multitissue_TFvalidation"
            paramline = "sb{:s}_n{:d}_{:s}Mb".format(mysb, nsnps, str(int(window/1e6)))
            print(mytissue, end=" ")

            GWSsnps = get_positive_set(snp_res_dict_top[mysb][myexpr][mytissue][mytransmethod], TF_dict, nsnps = nsnps, window = window)
            cis_trans_list = list()
            for transeqtl in sorted(GWSsnps, key=attrgetter('chrom')):
                if transeqtl.rsid in cis_dict:
                    for cis in cis_dict[transeqtl.rsid]:
                        if cis.fdr > -np.log(fdrcutoff):
                            cis_trans_list.append([transeqtl.pos, cis_dict[transeqtl.rsid]])
            print("{:s} {:d}/{:d} snps are also cis-eQTLs at {:g}% FDR".format(mytissue, len(cis_trans_list), len(GWSsnps), fdrcutoff*100))

gtex-aa matrixeqtl_rand norm 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 
gtex-aa tejaas_rand norm 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 
gtex-aa 0/45128 of cis-eQTLs target TFs
gtex-aa selected 477/1000 snps, cutoff: 41.0891
gtex-aa 0/477 snps are also cis-eQTLs at 5% FDR
gtex-as matrixeqtl_rand norm 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 
gtex-as tejaas_rand norm 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 
gtex-as 0/49202 of cis-eQTLs target TFs
gtex-as selected 501/1000 snps, cutoff: 5.11427
gtex-as 0/501 snps are also cis-eQTLs at 5% FDR
gtex-at matrixeqtl_rand norm 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 
gtex-at tejaas_rand norm 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 
gtex-at 0/44084 of cis-eQTLs target TFs
gtex-at selected 589/1000 snps, cutoff: 4.98818
gtex-at 0/589 snps are also cis-eQTLs at 5% FDR
gtex-esom matrixeqtl_rand norm 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 

In [149]:
cis_trans_list

[[144838590,
  [CisTF(rsid='rs6558387', chrom=8, logp=7.117058422874809, TFpos=144328990, TFname='ENSG00000181638', fdr=4.083665536964707),
   CisTF(rsid='rs6558387', chrom=8, logp=3.4380547807836725, TFpos=144766621, TFname='ENSG00000181135', fdr=1.2007069168730171),
   CisTF(rsid='rs6558387', chrom=8, logp=3.3662439858673583, TFpos=144371845, TFname='ENSG00000185730', fdr=1.1555869247913728)]],
 [144829407,
  [CisTF(rsid='rs11992439', chrom=8, logp=6.907179444082068, TFpos=144328990, TFname='ENSG00000181638', fdr=3.9081326070959217),
   CisTF(rsid='rs11992439', chrom=8, logp=3.522707370765762, TFpos=144766621, TFname='ENSG00000181135', fdr=1.255745845804176),
   CisTF(rsid='rs11992439', chrom=8, logp=3.158910563725538, TFpos=144371845, TFname='ENSG00000185730', fdr=1.0230535163909544),
   CisTF(rsid='rs11992439', chrom=8, logp=3.004707022038018, TFpos=144680073, TFname='ENSG00000179886', fdr=0.9313577114264779)]],
 [144840867,
  [CisTF(rsid='rs6997644', chrom=8, logp=7.11793793587941

In [134]:
mytissue = "gtex-ms"
mysb = "0.05"
myexpr = "norm"
mycismethod = "matrixeqtl"
cis_list = snp_res_dict[mysb][myexpr][mytissue][mycismethod]
topcis_list = sorted(cis_list, key=attrgetter('fdr'), reverse=True)
cis_TFs = filter_meqtl_TFs(topcis_list, TF_dict)
print("{:s} {:d}/{:d} of cis-eQTLs target TFs".format(mytissue, len(cis_TFs), len(topcis_list)))
cis_dict = collections.defaultdict(list)
for e in cis_TFs:
    cis_dict[e.rsid].append(e)



nsnps = 1000
window = 1e6
Numbins = 50
mytransmethod = "tejaas"
outdir   = "/cbscratch/franco/trans-eqtl/analysis/new_multitissue_TFvalidation"
paramline = "sb{:s}_n{:d}_{:s}Mb".format(mysb, nsnps, str(int(window/1e6)))
print(mytissue, end=" ")

fdrcutoff = 0.05
GWSsnps = get_positive_set(snp_res_dict_top[mysb][myexpr][mytissue][mytransmethod], TF_dict, nsnps = nsnps, window = window)
cis_trans_list = list()
for transeqtl in sorted(GWSsnps, key=attrgetter('chrom')):
    if transeqtl.rsid in cis_dict:
        for cis in cis_dict[transeqtl.rsid]:
            if cis.fdr > -np.log(fdrcutoff):
                cis_trans_list.append([transeqtl.pos, cis_dict[transeqtl.rsid]])
print("{:s} {:d}/{:d} snps are also cis-eQTLs at {:g}% FDR".format(mytissue, len(cis_trans_list), len(GWSsnps), fdrcutoff*100))

gtex-ms 34695/455594 of cis-eQTLs target TFs
gtex-ms selected 471/1000 snps, cutoff: 33.7095
gtex-ms 0/471 snps are also cis-eQTLs at 5% FDR
