In [1]:
%load_ext autoreload
%autoreload 2
import numpy as np
import matplotlib.pyplot as plt
import os
import sys
import time
from statsmodels.distributions.empirical_distribution import ECDF
sys.path.append('../')
sys.path.append('/usr/users/fsimone/tejaas')
from utils import readgtf
import collections
from utils import readgtf

# Load the gene annotation through the project-local `readgtf` helper.
# NOTE(review): the loader is called `gencode_v12` but the file is the v26
# annotation -- confirm the parser is meant to be used for this release.
gene_info = readgtf.gencode_v12("/cbscratch/franco/datasets/GENCODE/gencode.v26.annotation.gtf.gz", trim=True)
# Two-level lookup: chromosome -> Ensembl gene id -> gene type (`gene.typ`).
# The defaultdict creates the per-chromosome dict on first access.
gene_info_dict = collections.defaultdict(dict)
for gene in gene_info:
    gene_info_dict[gene.chrom][gene.ensembl_id] = gene.typ

In [2]:
import mpmath
from operator import attrgetter

# Work with 50 significant digits so small tail probabilities stay representable.
mpmath.mp.dps = 50


def pvalue(x):
    """Return log10 of the upper-tail standard-normal probability P(Z > x).

    Parameters
    ----------
    x : float
        Z-score.

    Returns
    -------
    float
        log10(1 - Phi(x)); callers negate it to obtain -log10(p).
    """
    # 1 - Phi(x) equals erfc(x / sqrt(2)) / 2.  Evaluating erfc directly avoids
    # the cancellation in `1 - 0.5 * (1 + erf(...))`, which rounds to exactly 0
    # once the tail is smaller than the working precision and then leaves
    # log10 without a finite value.
    return float(mpmath.log10(0.5 * mpmath.erfc(x / mpmath.sqrt(2))))

# Field names, in positional order, of the SNPRes record built by the result parsers below.
SNPRES_FIELDS = ['rsid', 'chrom', 'pos', 'logp', 'fdr', 'target']
class SNPRes(collections.namedtuple('_SNPRes', SNPRES_FIELDS)):
    """Immutable record describing one SNP association.

    Fields
    ------
    rsid   : SNP identifier as read from the result file.
    chrom  : chromosome (int from `tejaas`/`tejaas_saikat`, the value passed in
             by the caller for `matrixeqtl`, str for `matrixeqtl_signif`).
    pos    : position (int, except `matrixeqtl_signif` which keeps the raw str).
    logp   : -log10(p) when computed by a parser; `matrixeqtl_signif` stores
             the file value unchanged.
    fdr    : -log10(FDR) from `matrixeqtl`, the file value from
             `matrixeqtl_signif`, None for the `tejaas*` parsers.
    target : target gene id, or None for the `tejaas*` parsers.
    """
    # No per-instance __dict__: instances stay as compact as the base namedtuple.
    __slots__ = ()
    
def tejaas_saikat(filepath):
    """Parse a tab-separated result file with columns rsid, chrom, pos, p-value.

    The first line is treated as a header and skipped.  Blank lines are
    ignored and an empty file yields an empty list.

    Parameters
    ----------
    filepath : str
        Path to the result file.

    Returns
    -------
    list of SNPRes
        One record per data line with ``logp`` set to -log10(p); ``fdr`` and
        ``target`` are None.  A p-value of exactly 0 is floored (see below).
    """
    res = list()
    with open(filepath, 'r') as mfile:
        # The default keeps a completely empty file from raising StopIteration.
        next(mfile, None)
        for line in mfile:
            arr   = line.strip().split("\t")
            if arr == ['']:
                # Blank (e.g. trailing) line: nothing to parse.
                continue
            rsid  = arr[0]
            chrom = int(arr[1])
            pos   = int(arr[2])
            p     = float(arr[3])
            # log10(0) is undefined; exact zeros are floored at 10e-30 (== 1e-29),
            # i.e. reported as -log10(p) = 29.
            logp  = np.log10(p) if p != 0 else np.log10(10e-30)
            res.append(SNPRes(rsid=rsid, chrom=chrom, pos=pos, logp=-logp, fdr=None, target=None))
    return res
        
def tejaas(filepath):
    """Parse a TEJAAS tab-separated result file into SNPRes records.

    Columns read (0-based): 0 rsid, 1 pos, 2 q, 3 mu, 4 sigma, 5 p, 6 chrom.
    The first line is treated as a header and skipped.  Blank lines are
    ignored and an empty file yields an empty list.

    Parameters
    ----------
    filepath : str
        Path to the result file.

    Returns
    -------
    list of SNPRes
        One record per kept line with ``logp`` set to -log10(p); ``fdr`` and
        ``target`` are None.  Lines with sigma == 0 are dropped.
    """
    res = list()
    with open(filepath, 'r') as mfile:
        # The default keeps a completely empty file from raising StopIteration.
        next(mfile, None)
        for line in mfile:
            arr   = line.strip().split("\t")
            if arr == ['']:
                # Blank (e.g. trailing) line: nothing to parse.
                continue
            rsid  = arr[0]
            pos   = int(arr[1])
            p     = float(arr[5])
            chrom = int(arr[6])
            q     = float(arr[2])
            mu    = float(arr[3])
            sigma = float(arr[4])
            if sigma == 0:
                # The z-score (q - mu) / sigma would be undefined.
                continue
            # When the stored p-value is exactly 0, recompute log10(p) from the
            # z-score with the high-precision `pvalue` helper.
            logp  = np.log10(p) if p != 0 else pvalue((q - mu) / sigma)
            res.append(SNPRes(rsid=rsid, chrom=chrom, pos=pos, logp=-logp, fdr=None, target=None))
    return res

def matrixeqtl(filepath, chrom, fdrcutoff):
    """Collect the leading MatrixEQTL rows whose log10(FDR) is within a cutoff.

    Parameters
    ----------
    filepath : str
        Tab-separated MatrixEQTL output; the first line is skipped as a header.
        Columns used: 0 SNP id (position is its second "_"-separated token),
        1 gene id (text before the first "." is kept), 4 p-value, 5 FDR.
    chrom : any
        Stored unchanged in every returned record.
    fdrcutoff : float
        Threshold on log10(FDR); reading stops at the first row above it.

    Returns
    -------
    list of SNPRes
        Records with ``logp`` = -log10(p) and ``fdr`` = -log10(FDR).  Empty when
        the file is missing or has zero size (a message is printed).
    """
    hits = list()
    readable = os.path.exists(filepath) and os.stat(filepath).st_size != 0
    if not readable:
        print("File empty or does not exist")
        return hits
    with open(filepath, 'r') as handle:
        next(handle)
        for record in handle:
            fields = record.strip().split("\t")
            snp_id = fields[0]
            position = int(snp_id.split("_")[1])
            gene_id = fields[1].partition(".")[0]
            log_p = np.log10(float(fields[4]))
            log_fdr = np.log10(float(fields[5]))
            # Stop at the first row above the cutoff; rows after it are never read.
            if log_fdr > fdrcutoff:
                break
            hits.append(SNPRes(snp_id, chrom, position, -log_p, -log_fdr, gene_id))
    return hits

def matrixeqtl_signif(filepath, snp_maf_dict):
    """Read a significant-eQTL summary file, keeping only allowed SNPs.

    The expected layout is the one produced by `write_eqtls`: tab-separated
    chrom, rsid, pos, logp, fdr, target.  `write_eqtls` writes no header, so
    the first line is skipped only when its logp column is not numeric;
    unconditionally dropping it would lose the first eQTL of a header-less
    file.  Blank lines are ignored.

    Parameters
    ----------
    filepath : str
        Path to the summary file.
    snp_maf_dict : dict
        Maps rsid -> truthy for SNPs to keep.  It is only read: missing ids
        are looked up with ``.get`` so a defaultdict is not grown as a side
        effect of the lookup.

    Returns
    -------
    list of SNPRes
        ``chrom`` and ``pos`` are kept as the raw strings from the file;
        ``logp`` and ``fdr`` are floats taken from the file unchanged.
    """
    res = list()
    discard = 0
    with open(filepath, 'r') as mfile:
        for lineno, line in enumerate(mfile):
            arr  = line.strip().split("\t")
            if arr == ['']:
                continue
            if lineno == 0:
                # Header detection: a data line has a numeric logp column.
                try:
                    float(arr[3])
                except (IndexError, ValueError):
                    continue
            chrom = arr[0]
            rsid = arr[1]
            if not snp_maf_dict.get(rsid, False):
                discard += 1
                continue
            pos = arr[2]
            gene = arr[5]
            logp = float(arr[3])
            fdr  = float(arr[4])
            res.append(SNPRes(rsid=rsid, chrom=chrom, pos=pos, logp=logp, fdr=fdr, target=gene))
    # print("Discarded {:d} SNPs with low MAF".format(discard))
    return res

In [3]:
# Filter by allowed snps according to MAF
basepath = "/cbscratch/franco/trans-eqtl/dev-pipeline/gtex_v8_lncRNA/"
baseoutdir = os.path.join(basepath, "cis_eqtls_analysis")
# exist_ok=True replaces the check-then-create pattern and makes re-running the cell safe.
os.makedirs(baseoutdir, exist_ok=True)

# Active configuration: SNP list of the "vcfs_0.01" genotype set.
title = "maf1"
maffile = "/cbscratch/franco/datasets/gtex_v8/genotypes/vcfs_0.01/gtex_v8_snpinfo.txt"
randompath = "/usr/users/fsimone/vcfs_0.01/"

# Alternative configuration ("vcfs_0.05"); swap with the block above to use it.
# title = "maf5"
# maffile = "/cbscratch/franco/datasets/gtex_v8/genotypes/vcfs_0.05/gtex_v8_snpinfo.txt"
# randompath = "/usr/users/fsimone/vcfs_0.05/"

outdir = os.path.join(baseoutdir, title)
os.makedirs(outdir, exist_ok=True)

# NTOT_SNPS_MAF1 = 4522283
# NTOT_SNPS_MAF5 = 2135526

# snp_maf_dict[rsid] is True for every SNP listed in `maffile` (second
# whitespace-separated column) and falls back to False for anything else.
# NTOT_SNPS counts the data lines read, not the number of distinct ids.
NTOT_SNPS = 0
snp_maf_dict = collections.defaultdict(lambda:False)
with open(maffile) as instream:
    for line in instream:
        fields = line.split()
        if not fields:
            # Blank line: indexing the second column would raise IndexError.
            continue
        snp_maf_dict[fields[1]] = True
        NTOT_SNPS += 1
print(NTOT_SNPS)

4522283


In [4]:
import os 
from utils import utils
# Tissue identifiers and their descriptions are read by the project helper `utils.read_tissues`.
tissue_file = "/usr/users/fsimone/trans-eqtl-pipeline/main/tissues.txt"
tissues, descriptions = utils.read_tissues(tissue_file)

# Path components used to locate per-tissue result directories in later cells.
dataset = "gtex_v8"
expressions = ["tmm_cclm"]
methods = ["matrixeqtl"]
# Chromosome labels "1" .. "22" as strings (np.arange excludes the upper bound).
chroms = [str(x) for x in np.arange(1,23)]
# FDR threshold of 0.05 in log10 space; `matrixeqtl` compares log10(FDR) against it.
fdrcutoff = np.log10(0.05)

In [5]:
def write_eqtls(snp_res, outfile):
    """Write eQTL records to `outfile`, one tab-separated line per record.

    Column order: chrom, rsid, pos, logp, fdr, target.  No header is written
    and an existing file is overwritten.  The format codes require `chrom`,
    `rsid` and `target` to be str, `pos` to be int, and `logp`/`fdr` to be
    numeric (written with ``{:g}``).

    Parameters
    ----------
    snp_res : iterable
        Records exposing the attributes chrom, rsid, pos, logp, fdr, target.
    outfile : str
        Destination path.
    """
    row_template = "{:s}\t{:s}\t{:d}\t{:g}\t{:g}\t{:s}\n"
    with open(outfile, 'w') as sink:
        sink.writelines(
            row_template.format(rec.chrom, rec.rsid, rec.pos, rec.logp, rec.fdr, rec.target)
            for rec in snp_res
        )

# Build the per-tissue FDR-filtered cis/trans summary files from the
# per-chromosome MatrixEQTL outputs.  Each summary file is generated when it is
# missing; previously nothing was regenerated unless BOTH files were missing,
# so a tissue with only one of the two files was reported as done.
basepath_patch="/cbscratch/franco/trans-eqtl/dev-pipeline/gtex_v8_lncRNA/"
for tissue in tissues:
    gtex_t = "-".join([dataset, tissue])
    print("Processing: {:s}".format(tissue), end=" ")
    for expression in expressions:
        for method in methods:
            tissue_path = os.path.join(basepath_patch, expression, gtex_t)
            signif_cisfile = os.path.join(basepath_patch, expression, gtex_t, method, "cis_eqtl_signif_fdr0.05.txt")
            signif_transfile = os.path.join(basepath_patch, expression, gtex_t, method, "trans_eqtl_signif_fdr0.05.txt")
            if not os.path.exists(tissue_path):
                print(tissue_path, "does not exist")
                continue
            need_cis = not os.path.exists(signif_cisfile)
            need_trans = not os.path.exists(signif_transfile)
            if not (need_cis or need_trans):
                print(" - Files exists")
                continue
            print("")
            snp_res = list()
            trans_snp_res = list()
            for chrom in chroms:
                # Only parse the per-chromosome inputs of the file(s) that are missing.
                if need_cis:
                    cisfile = os.path.join(basepath_patch, expression, gtex_t, method, "chr"+chrom, "cis_eqtl.txt.pos")
                    snp_res += matrixeqtl(cisfile, chrom, fdrcutoff)
                if need_trans:
                    transfile = os.path.join(basepath_patch, expression, gtex_t, method, "chr"+chrom, "trans_eqtl.txt.pos")
                    trans_snp_res += matrixeqtl(transfile, chrom, fdrcutoff)
            if need_cis:
                write_eqtls(snp_res, signif_cisfile)
            if need_trans:
                write_eqtls(trans_snp_res, signif_transfile)

Processing: as  - Files exists
Processing: av  - Files exists
Processing: ag  - Files exists
Processing: aa  - Files exists
Processing: ac  - Files exists
Processing: at  - Files exists
Processing: bam  - Files exists
Processing: ban  - Files exists
Processing: bca  - Files exists
Processing: bceh  - Files exists
Processing: bce  - Files exists
Processing: bco  - Files exists
Processing: bfr  - Files exists
Processing: bhi  - Files exists
Processing: bhy  - Files exists
Processing: bnu  - Files exists
Processing: bpu  - Files exists
Processing: bsp  - Files exists
Processing: bsu  - Files exists
Processing: br  - Files exists
Processing: ebv  - Files exists
Processing: fib  - Files exists
Processing: cols  - Files exists
Processing: colt  - Files exists
Processing: esog  - Files exists
Processing: esom  - Files exists
Processing: esomu  - Files exists
Processing: haa  - Files exists
Processing: hlv  - Files exists
Processing: kc  - Files exists
Processing: liv  - Files exists
Processin

In [6]:
def cis_typespecific_eqtls(cistrans_target_eqtls, genetype_dict):
    """Keep the eQTLs whose target gene is flagged in `genetype_dict`.

    Parameters
    ----------
    cistrans_target_eqtls : iterable
        Records exposing `.rsid` and `.target`.
    genetype_dict : mapping
        Indexed with each record's target; a truthy value keeps the record.

    Returns
    -------
    tuple
        (kept records in input order, list of distinct rsids among them,
        list of distinct targets among them).  The two id lists come from
        sets, so their order is arbitrary.
    """
    selected = []
    for eqtl in cistrans_target_eqtls:
        if genetype_dict[eqtl.target]:
            selected.append(eqtl)

    snp_ids = {eqtl.rsid for eqtl in selected}
    gene_ids = {eqtl.target for eqtl in selected}
    return selected, list(snp_ids), list(gene_ids)

def cross_ref_cis_trans(trans_ids, cis_eqtls):
    """Find the SNP ids shared by a trans-eQTL id list and a cis-eQTL record list.

    Parameters
    ----------
    trans_ids : iterable of hashable
        SNP ids of the trans-eQTLs.
    cis_eqtls : iterable
        cis-eQTL records exposing `.rsid`.

    Returns
    -------
    tuple
        (list of ids present in both inputs, in arbitrary order;
         the cis-eQTL records whose rsid is shared, in input order).
    """
    cis_ids = list({eqtl.rsid for eqtl in cis_eqtls})

    # Ids that are both cis-eQTLs (MatrixEQTL) and trans-eQTLs (TEJAAS).
    shared = set(trans_ids) & set(cis_ids)
    shared_ids = list(shared)

    # cis-eQTL records, each with its target gene, for the shared SNPs.
    shared_cis = [eqtl for eqtl in cis_eqtls if eqtl.rsid in shared]

    return shared_ids, shared_cis

def crossref_trans_tejaas(transeqtls, cis_eqtls):
    """Cross-reference trans-eQTL records with cis-eQTL records by rsid.

    Extracts `.rsid` from every trans-eQTL record and delegates to
    `cross_ref_cis_trans`.

    Returns
    -------
    tuple
        The two values returned by `cross_ref_cis_trans`, in the same order.
    """
    trans_rsids = []
    for hit in transeqtls:
        trans_rsids.append(hit.rsid)
    shared_ids, shared_cis = cross_ref_cis_trans(trans_rsids, cis_eqtls)
    return shared_ids, shared_cis

import random

def get_cistype_fractions(ciseqtls, valid_types, alltypes_dict):
    """Fraction of distinct cis-eQTL SNPs that hit a gene of each type.

    Parameters
    ----------
    ciseqtls : sequence
        cis-eQTL records exposing `.rsid` and `.target`.
    valid_types : iterable
        Gene types to report.
    alltypes_dict : mapping
        ``alltypes_dict[gtype][target]`` is truthy when `target` is of `gtype`.

    Returns
    -------
    dict
        gtype -> (distinct rsids with at least one target of that type) /
        (distinct rsids overall).  With no cis-eQTLs every fraction is 0.0
        instead of raising ZeroDivisionError.
    """
    cistype_frac_dict = dict()
    NCIS = len({x.rsid for x in ciseqtls})
    for gtype in valid_types:
        # A SNP counts once per type even if it targets several genes of that type.
        cishits = {x.rsid for x in ciseqtls if alltypes_dict[gtype][x.target]}
        cistype_frac_dict[gtype] = len(cishits) / NCIS if NCIS > 0 else 0.0
    return cistype_frac_dict


In [33]:
###################################
########### MAF 0.01 ##############
###################################

# For every tissue: load the trans-eQTLs and the significant cis-eQTLs, report
# how many cis-eQTLs target TARGET, and collect trans-eQTL / cis-eQTL pairs on
# chromosome 7 that lie closer than `dist` base pairs to each other.

# Ensembl gene id matched (by prefix) against cis-eQTL targets; the progress
# messages below refer to it as FOXP2.
TARGET = "ENSG00000128573"   
    
meqtl_expr  = "tmm_cclm"
# NOTE(review): tejaas_expr is defined but the path below uses the literal "raw".
tejaas_expr = "raw"

# basepath = "/cbscratch/franco/trans-eqtl/dev-pipeline/gtex_v8_new"

# NOTE(review): res_dict is never filled in this cell.
res_dict = dict()
# tissue -> list of (trans-eQTL, cis-eQTL) pairs; only tissues with at least one pair get an entry.
pair_res_dict = dict()
for tissue in tissues:
    gtex_t = "-".join([dataset, tissue])
    print(tissue)
    
    # Trans-eQTLs: skip the tissue when the result file is missing.
    tejaas_file = os.path.join(basepath, "raw", "gtex_v8-"+tissue, "tejaas", "permnull_sb0.1_knn30", "trans_eqtls_0.0001.txt") #5e-08.txt")
    if not os.path.exists(tejaas_file):
        print("{:s} has no trans-eqtl results".format(tissue))
        continue
    transeqtls = tejaas(tejaas_file)
    # Keep only SNPs listed in the MAF file (snp_maf_dict defaults to False).
    transeqtls = [x for x in transeqtls if snp_maf_dict[x.rsid]]
    
    if len(transeqtls) == 0:
        print("{:s} has less no trans-eqtls".format(tissue))
        continue
    
    # Significant cis-eQTLs: skip the tissue when the summary file is missing or empty.
    signif_cisfile = os.path.join(basepath, meqtl_expr, gtex_t, "matrixeqtl", "cis_eqtl_signif_fdr0.05.txt")
    if not os.path.exists(signif_cisfile) or os.stat(signif_cisfile).st_size == 0:
        print("{:s} has no cis-file (probably no covariates)".format(tissue))
        continue
    ciseqtls = matrixeqtl_signif(signif_cisfile, snp_maf_dict)
    # NOTE(review): cis_ids is not used further in this cell.
    cis_ids = list(set([x.rsid for x in ciseqtls]))
    
    if len(ciseqtls) == 0:
        print("{:s} has less no cis-eqtls".format(tissue))
        continue
    
    # SNPs that are both trans-eQTLs and cis-eQTLs, plus the matching cis records.
    cis_trans_eqtls_ids, cistrans_target_eqtls = crossref_trans_tejaas(transeqtls, ciseqtls)
    
    # All cis-eQTLs of this tissue whose target gene id starts with TARGET.
    cis_targets = [x for x in ciseqtls if x.target.startswith(TARGET)]
    print("Found {:d} cis Snps targeting FOXP2".format(len(cis_targets)))
    
    # Same filter restricted to SNPs that are also trans-eQTLs.
    found_targets = [x for x in cistrans_target_eqtls if x.target.startswith(TARGET)]
    if len(found_targets) > 0:
        print("Found!")
        print(found_targets)  
    
    
    # Maximum distance between a trans-eQTL and a TARGET cis-eQTL to count as a close pair.
    dist = 10000
    pairs = list()
    for t in transeqtls:
        # `tejaas` stores chrom as int, so the comparison is against the integer 7.
        # NOTE(review): 7 is hard-coded -- presumably the chromosome of TARGET; confirm.
        if t.chrom == 7:
            for c in cis_targets:
                # `matrixeqtl_signif` keeps pos as a string, hence the int() casts.
                diff = np.abs(int(t.pos) - int(c.pos))
                if diff < dist:
                    pair = (t, c)
                    pairs.append(pair)
    if len(pairs) > 0:
        print("{:d} close SNPs!".format(len(pairs)))
        pair_res_dict[tissue] = pairs

as
Found 33 cis Snps targeting FOXP2
4 close SNPs!
av
Found 1 cis Snps targeting FOXP2
ag
Found 0 cis Snps targeting FOXP2
aa
Found 0 cis Snps targeting FOXP2
ac
Found 1 cis Snps targeting FOXP2
at
Found 7 cis Snps targeting FOXP2
bam
Found 0 cis Snps targeting FOXP2
ban
Found 0 cis Snps targeting FOXP2
bca
Found 0 cis Snps targeting FOXP2
bceh
Found 0 cis Snps targeting FOXP2
bce
Found 0 cis Snps targeting FOXP2
bco
Found 0 cis Snps targeting FOXP2
bfr
Found 1 cis Snps targeting FOXP2
bhi
Found 2 cis Snps targeting FOXP2
bhy
Found 1 cis Snps targeting FOXP2
bnu
Found 3 cis Snps targeting FOXP2
bpu
Found 0 cis Snps targeting FOXP2
bsp
Found 1 cis Snps targeting FOXP2
bsu
Found 0 cis Snps targeting FOXP2
br
Found 0 cis Snps targeting FOXP2
ebv
Found 0 cis Snps targeting FOXP2
fib
Found 22 cis Snps targeting FOXP2
cols
Found 1 cis Snps targeting FOXP2
colt
Found 11 cis Snps targeting FOXP2
2 close SNPs!
esog
Found 4 cis Snps targeting FOXP2
esom
Found 1 cis Snps targeting FOXP2
esomu
Fou

In [29]:
# Display the (trans-eQTL, cis-eQTL) pairs currently held in `pairs`.
# NOTE(review): the tissue loop rebuilds `pairs` on every iteration that reaches
# the pairing step, so this shows one tissue only; `pair_res_dict` keeps the
# per-tissue lists.
pairs

[(SNPRes(rsid='chr7_114689251_C_A_b38', chrom=7, pos=114689251, logp=5.56091847996197, fdr=None, target=None),
  SNPRes(rsid='chr7_114698760_T_C_b38', chrom='7', pos='114698760', logp=3.47237, fdr=2.03479, target='ENSG00000128573')),
 (SNPRes(rsid='chr7_114694515_A_G_b38', chrom=7, pos=114694515, logp=4.058319155997669, fdr=None, target=None),
  SNPRes(rsid='chr7_114698760_T_C_b38', chrom='7', pos='114698760', logp=3.47237, fdr=2.03479, target='ENSG00000128573')),
 (SNPRes(rsid='chr7_114695200_C_T_b38', chrom=7, pos=114695200, logp=5.56091847996197, fdr=None, target=None),
  SNPRes(rsid='chr7_114698760_T_C_b38', chrom='7', pos='114698760', logp=3.47237, fdr=2.03479, target='ENSG00000128573')),
 (SNPRes(rsid='chr7_114697041_A_G_b38', chrom=7, pos=114697041, logp=5.56091847996197, fdr=None, target=None),
  SNPRes(rsid='chr7_114698760_T_C_b38', chrom='7', pos='114698760', logp=3.47237, fdr=2.03479, target='ENSG00000128573'))]

In [32]:
# Convert a -log10(p) of 5.56 (the rounded logp of the trans-eQTLs displayed above) back to a p-value.
np.power(10,-5.56)

2.754228703338169e-06