In [1]:
%load_ext autoreload
%autoreload 2
import numpy as np
import matplotlib.pyplot as plt
import os
import sys
import time
from statsmodels.distributions.empirical_distribution import ECDF
sys.path.append('../')
sys.path.append('/usr/users/fsimone/tejaas')
from utils import readgtf

base_dir = "/cbscratch/franco/datasets"

import collections

# Field names of a GeneInfo record; the list order is also the tuple's positional order.
GENEINFO_FIELDS = ['name', 'ensembl_id', 'chrom', 'start', 'end', 'typ']
class GeneInfo(collections.namedtuple('_GeneInfo', GENEINFO_FIELDS)):
    """Immutable gene annotation record with the fields listed in GENEINFO_FIELDS."""
    # Empty __slots__ keeps instances from getting a per-instance __dict__.
    __slots__ = ()

def read_TFannot(infile):
    """Read a transcription-factor annotation table into GeneInfo records.

    The first line is treated as a header and skipped. Every following
    non-blank line is split on whitespace and read as
    ``ensembl_id chrom start end name``.

    Parameters
    ----------
    infile : str
        Path to the annotation file.

    Returns
    -------
    list of GeneInfo
        One record per data line with ``typ`` set to "TF"; empty when the
        file holds no data lines (including a completely empty file).

    Raises
    ------
    ValueError
        If chrom, start or end cannot be parsed as an integer.
    IndexError
        If a non-blank line has fewer than five columns.
    """
    TF_list = list()
    with open(infile) as instream:
        # Skip the header; the default keeps an empty file from raising StopIteration.
        next(instream, None)
        for line in instream:
            arr = line.split()
            if not arr:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            TF_list.append(GeneInfo(ensembl_id=arr[0], chrom=int(arr[1]), start=int(arr[2]), end=int(arr[3]), name=arr[4], typ="TF"))
    return TF_list

# Load the TF annotation table that lives one directory above the notebook.
TF_annot = read_TFannot("../TF_annotation.txt")

import collections
# Nested lookup TF_dict[chrom][ensembl_id] -> "TF"; chrom is the int parsed by read_TFannot.
TF_dict = collections.defaultdict(dict)
for g in TF_annot:
    TF_dict[g.chrom][g.ensembl_id] = "TF"

In [2]:
from utils import readgtf

# NOTE(review): the reader is called gencode_v12 but is pointed at a v26 annotation file —
# confirm it parses this release as intended.
gene_info = readgtf.gencode_v12("/cbscratch/franco/datasets/GENCODE/gencode.v26.annotation.gtf.gz", trim=True)
# Nested lookup gene_info_dict[chrom][ensembl_id] -> gene type (gene.typ).
gene_info_dict = collections.defaultdict(dict)
for gene in gene_info:
    gene_info_dict[gene.chrom][gene.ensembl_id] = gene.typ

In [18]:
import mpmath
from operator import attrgetter

mpmath.mp.dps = 50
def pvalue(x):
    """Return log10 of the upper-tail standard-normal probability P(Z > x).

    Parameters
    ----------
    x : float
        Standard-normal deviate (z-score).

    Returns
    -------
    float
        log10(P(Z > x)), evaluated with mpmath at the configured precision.
    """
    # 0.5 * erfc(x / sqrt(2)) equals 1 - 0.5 * (1 + erf(x / sqrt(2))) but avoids the
    # subtraction from 1, which cancels to exactly 0 (and hence log10 -> -inf) once the
    # tail probability drops below the working precision.
    return float(mpmath.log10(0.5 * mpmath.erfc(x / mpmath.sqrt(2))))

# Field names of one SNP association record; the loader functions below fill
# fields they do not have with None.
SNPRES_FIELDS = ['rsid', 'chrom', 'pos', 'logp', 'fdr', 'target', 'maf']
class SNPRes(collections.namedtuple('_SNPRes', SNPRES_FIELDS)):
    """Immutable SNP association record with the fields listed in SNPRES_FIELDS."""
    # Empty __slots__ keeps instances from getting a per-instance __dict__.
    __slots__ = ()
    
# Field names of the per-tissue cis/trans overlap summary record.
CT_FIELDS = ['tissue', 'ncis', 'ntrans', 'ncistrans', 'randtrans', 'enrichment', 'pval']
class CisTrans(collections.namedtuple('_CisTrans', CT_FIELDS)):
    """Immutable per-tissue summary record with the fields listed in CT_FIELDS."""
    # Empty __slots__ keeps instances from getting a per-instance __dict__.
    __slots__ = ()
    
# def tejaas_saikat(filepath):
#     res = list()
#     with open(filepath, 'r') as mfile:
#         next(mfile)
#         for line in mfile:
#             arr   = line.strip().split("\t")
#             rsid  = arr[0]
#             chrom = int(arr[1])
#             pos   = int(arr[2])
#             p     = float(arr[3])
#             logp  = np.log10(p) if p!=0 else np.log10(10e-30)
#             res.append(SNPRes(rsid=rsid, chrom=chrom, pos=pos, logp=-logp, fdr=None, target=None))
#     return res

def load_snp_maf(filepath, tissue):
    """Collect per-SNP values from the 22 per-chromosome files of one tissue.

    Parameters
    ----------
    filepath : str
        Format template taking two positional fields: the tissue and the
        chromosome number (1..22).
    tissue : str
        Value substituted for the first template field.

    Returns
    -------
    collections.defaultdict
        Maps the first tab-separated column of every line to the float in
        the second column; unknown keys evaluate to False.
    """
    maf_by_snp = collections.defaultdict(lambda: False)
    for chrom_number in range(1, 23):
        chrom_path = filepath.format(tissue, chrom_number)
        with open(chrom_path) as maf_stream:
            for record in maf_stream:
                fields = record.strip().split("\t")
                snp_id, value = fields[0], float(fields[1])
                maf_by_snp[snp_id] = value
    return maf_by_snp

def tejaas(filepath, mafcutoff=0.01):
    """Parse a tab-separated TEJAAS result table into SNPRes records.

    The first line is skipped as a header. Columns are read as
    rsid, chrom, pos, maf, q, mu, sigma, p. Rows whose maf lies outside
    [mafcutoff, 1 - mafcutoff] or whose sigma is 0 are dropped.

    Parameters
    ----------
    filepath : str
        Path to the result table.
    mafcutoff : float, optional
        Minor-allele-frequency threshold applied on both tails.

    Returns
    -------
    list of SNPRes
        Records with ``logp`` holding -log10(p); when p is exactly 0 the
        value is derived from the z-score (q - mu) / sigma via ``pvalue``.
        ``fdr`` and ``target`` are None.
    """
    hits = []
    with open(filepath, 'r') as handle:
        next(handle)  # header line
        for record in handle:
            fields = record.strip().split("\t")
            rsid = fields[0]
            chrom = int(fields[1])
            pos = int(fields[2])
            maf = float(fields[3])
            if maf < mafcutoff or maf > (1 - mafcutoff):
                continue
            q = float(fields[4])
            mu = float(fields[5])
            sigma = float(fields[6])
            p = float(fields[7])
            if sigma == 0:
                continue
            if p != 0:
                logp = np.log10(p)
            else:
                # p is reported as 0 in the file; recompute it from the z-score.
                logp = pvalue((q - mu) / sigma)
            hits.append(SNPRes(rsid=rsid, chrom=chrom, pos=pos, logp=-logp,
                               fdr=None, target=None, maf=maf))
    return hits

def matrixeqtl(filepath, chrom, fdrcutoff):
    """Parse one MatrixEQTL result table up to an FDR threshold.

    The first line is skipped as a header. Reading stops at the first row
    whose log10(FDR) exceeds ``fdrcutoff``, so later rows are never seen.

    Parameters
    ----------
    filepath : str
        Path to the tab-separated result table.
    chrom : object
        Stored unchanged in every returned record.
    fdrcutoff : float
        Threshold on log10(FDR).

    Returns
    -------
    list of SNPRes
        Records with ``logp`` = -log10(column 5), ``fdr`` = -log10(column 6),
        ``pos`` taken from the second "_"-separated token of the SNP id and
        ``target`` equal to the gene id up to its first ".". Empty (after
        printing a message) when the file is missing or has size 0.
    """
    hits = []
    missing_or_empty = not os.path.exists(filepath) or os.stat(filepath).st_size == 0
    if missing_or_empty:
        print("File empty or does not exist")
        return hits
    with open(filepath, 'r') as handle:
        next(handle)  # header line
        for record in handle:
            fields = record.strip().split("\t")
            rsid = fields[0]
            pos = int(rsid.split("_")[1])
            gene = fields[1].split(".")[0]
            logp = np.log10(float(fields[4]))
            fdr = np.log10(float(fields[5]))
            if fdr > fdrcutoff:
                break
            hits.append(SNPRes(rsid=rsid, chrom=chrom, pos=pos, logp=-logp,
                               fdr=-fdr, target=gene, maf=None))
    return hits

def matrixeqtl_signif(filepath, snp_maf_dict):
    """Load significant eQTLs from a tab-separated table, keeping known SNPs only.

    Columns are read as chrom, rsid, pos, logp, fdr, target, which is the
    layout ``write_eqtls`` in this notebook emits. That writer produces no
    header line, so the first line is skipped only when its logp/fdr columns
    are not numeric; a leading data row is therefore no longer lost. Blank
    lines are ignored.

    Parameters
    ----------
    filepath : str
        Path to the table.
    snp_maf_dict : dict
        SNP id -> truthy value for SNPs to keep. Looked up with ``.get`` so
        that a defaultdict is not grown by missing ids and a plain dict does
        not raise KeyError.

    Returns
    -------
    list of SNPRes
        Records with ``logp``/``fdr`` as floats read verbatim from the file,
        ``chrom`` and ``pos`` kept as the strings found in the file, and
        ``maf`` set to None.
    """
    def _is_header(fields):
        # A header is a first line whose logp/fdr columns are absent or non-numeric.
        try:
            float(fields[3])
            float(fields[4])
        except (IndexError, ValueError):
            return True
        return False

    res = list()
    with open(filepath, 'r') as mfile:
        for lineno, line in enumerate(mfile):
            if not line.strip():
                continue
            arr = line.strip().split("\t")
            if lineno == 0 and _is_header(arr):
                continue
            chrom = arr[0]
            rsid = arr[1]
            if not snp_maf_dict.get(rsid, False):
                # SNP is not in the allowed (MAF-filtered) set.
                continue
            pos = arr[2]
            gene = arr[5]
            logp = float(arr[3])
            fdr = float(arr[4])
            res.append(SNPRes(rsid=rsid, chrom=chrom, pos=pos, logp=logp, maf=None, fdr=fdr, target=gene))
    return res

In [16]:
# Filter by allowed snps according to MAF
# Root of the pipeline results; the analysis output directory is created beneath it.
basepath = "/cbscratch/franco/trans-eqtl/dev-pipeline/gtex_v8_lncRNA_freeze/"
baseoutdir = os.path.join(basepath, "cis_eqtls_analysis")
if not os.path.exists(baseoutdir): os.makedirs(baseoutdir)

# MAF threshold; it only parameterizes `title` and `randompath` in this cell.
maf = 0.01

# maf is expressed as a percentage in the title, e.g. 0.01 -> "maf1".
title = "maf{:g}".format(maf*100)
# maffile = "/cbscratch/franco/datasets/gtex_v8/genotypes/vcfs_{:g}/gtex_v8_snpinfo.txt".format(maf)
# Template with two fields (tissue, chromosome number) consumed by load_snp_maf.
maffile = "/cbscratch/franco/datasets/gtex_v8/genotypes/snpMAFs/{:s}_snp_CHR{:d}.maf"
# Directory prefix of the random SNP sets read by sample_background_50000;
# the trailing "/" matters because that function appends to it by string concatenation.
randompath = "/usr/users/fsimone/vcfs_{:g}/".format(maf)

# title = "maf5"
# maffile = "/cbscratch/franco/datasets/gtex_v8/genotypes/vcfs_0.05/gtex_v8_snpinfo.txt"
# randompath = "/usr/users/fsimone/vcfs_0.05/"

# Per-threshold output directory for tables and figures.
outdir = os.path.join(baseoutdir, title)
if not os.path.exists(outdir): os.makedirs(outdir)


# NTOT_SNPS_MAF1 = 4522283
# NTOT_SNPS_MAF5 = 2135526

# NTOT_SNPS = 0
# snp_maf_dict = collections.defaultdict(lambda:False)
# with open(maffile) as instream:
#     for line in instream:
#         snp_maf_dict[line.rstrip().split()[1]] = True
#         NTOT_SNPS += 1
# print(NTOT_SNPS)

In [5]:
import os 
from utils import utils
# Tissue list: short ids and their descriptions, as returned by utils.read_tissues.
tissue_file = "/usr/users/fsimone/trans-eqtl-pipeline/main/tissues.txt"
tissues, descriptions = utils.read_tissues(tissue_file)

# Components used to build result paths in the export loop.
dataset = "gtex_v8"
expressions = ["tmm_cclm"]
methods = ["matrixeqtl"]
# Autosomes as strings ("1".."22").
chroms = [str(x) for x in np.arange(1,23)]
# Kept in log10 units because matrixeqtl() compares it against log10(FDR).
fdrcutoff = np.log10(0.05)

In [6]:
def write_eqtls(snp_res, outfile):
    """Write eQTL records to a tab-separated file, one record per line.

    Columns are chrom, rsid, pos, logp, fdr, target; no header is written.
    The format requires ``chrom``, ``rsid`` and ``target`` to be strings,
    ``pos`` an integer and ``logp``/``fdr`` numeric.

    Parameters
    ----------
    snp_res : iterable
        Records exposing the six attributes above.
    outfile : str
        Destination path; an existing file is overwritten.
    """
    row_template = "{:s}\t{:s}\t{:d}\t{:g}\t{:g}\t{:s}\n"
    with open(outfile, 'w') as outstream:
        outstream.writelines(
            row_template.format(hit.chrom, hit.rsid, hit.pos, hit.logp, hit.fdr, hit.target)
            for hit in snp_res
        )

# Collect the FDR-significant cis and trans MatrixEQTL hits of every tissue across
# all chromosomes and write them as one cis file and one trans file per tissue.
# NOTE(review): results are written under basepath_patch (gtex_v8_pc_freeze), whereas the
# enrichment cell reads cis_eqtl_signif_fdr0.05.txt from basepath — confirm both are intended.
basepath_patch="/cbscratch/franco/trans-eqtl/dev-pipeline/gtex_v8_pc_freeze/"
for tissue in tissues:
    gtex_t = "-".join([dataset, tissue])
    print("Processing: {:s}".format(tissue), end=" ")
    for expression in expressions:
        for method in methods:
            tissue_path = os.path.join(basepath_patch, expression, gtex_t)
            signif_cisfile = os.path.join(basepath_patch, expression, gtex_t, method, "cis_eqtl_signif_fdr0.05.txt")
            signif_transfile = os.path.join(basepath_patch, expression, gtex_t, method, "trans_eqtl_signif_fdr0.05.txt")
            if os.path.exists(tissue_path):
                print("")
                # NOTE(review): with `and`, a tissue where only one of the two files exists is
                # reported as done and the missing file is never produced — confirm intent.
                if not os.path.exists(signif_cisfile) and not os.path.exists(signif_transfile):
                    snp_res = list()
                    trans_snp_res = list()
                    for chrom in chroms:
                        cisfile = os.path.join(basepath_patch, expression, gtex_t, method, "chr"+chrom, "cis_eqtl.txt.pos")
                        snp_res += matrixeqtl(cisfile, chrom, fdrcutoff)
                        transfile = os.path.join(basepath_patch, expression, gtex_t, method, "chr"+chrom, "trans_eqtl.txt.pos")
                        trans_snp_res += matrixeqtl(transfile, chrom, fdrcutoff)
                    write_eqtls(snp_res, signif_cisfile)
                    write_eqtls(trans_snp_res, signif_transfile)
                else:
                    print(" - Files exists")
            else:
                print(tissue_path, "does not exist")

Processing: as 
 - Files exists
Processing: av 
 - Files exists
Processing: ag 
 - Files exists
Processing: aa 
 - Files exists
Processing: ac 
 - Files exists
Processing: at 
 - Files exists
Processing: bam 
 - Files exists
Processing: ban 
 - Files exists
Processing: bca 
 - Files exists
Processing: bceh 
 - Files exists
Processing: bce 
 - Files exists
Processing: bco 
 - Files exists
Processing: bfr 
 - Files exists
Processing: bhi 
 - Files exists
Processing: bhy 
 - Files exists
Processing: bnu 
 - Files exists
Processing: bpu 
 - Files exists
Processing: bsp 
 - Files exists
Processing: bsu 
 - Files exists
Processing: br 
 - Files exists
Processing: ebv 
 - Files exists
Processing: fib 
 - Files exists
Processing: cols 
 - Files exists
Processing: colt 
 - Files exists
Processing: esog 
 - Files exists
Processing: esom 
 - Files exists
Processing: esomu 
 - Files exists
Processing: haa 
 - Files exists
Processing: hlv 
 - Files exists
Processing: kc 
 - Files exists
Processing:

In [7]:
def cis_typespecific_eqtls(cistrans_target_eqtls, genetype_dict):
    """Keep the eQTLs whose target gene is flagged in ``genetype_dict``.

    Parameters
    ----------
    cistrans_target_eqtls : iterable
        Records exposing ``rsid`` and ``target``.
    genetype_dict : mapping
        Indexed by target id; a truthy value keeps the record.

    Returns
    -------
    tuple
        (kept records in input order, list of distinct rsids,
        list of distinct targets). The two id lists have no defined order.
    """
    matching = [eqtl for eqtl in cistrans_target_eqtls if genetype_dict[eqtl.target]]
    snp_ids = set()
    target_ids = set()
    for eqtl in matching:
        snp_ids.add(eqtl.rsid)
        target_ids.add(eqtl.target)
    return matching, list(snp_ids), list(target_ids)

def cross_ref_cis_trans(trans_ids, cis_eqtls):
    """Intersect a list of SNP ids with the SNPs of a cis-eQTL list.

    Parameters
    ----------
    trans_ids : iterable of str
        SNP ids to look for (duplicates are ignored).
    cis_eqtls : iterable
        Records exposing ``rsid``.

    Returns
    -------
    tuple
        (list of distinct ids present in both inputs, in no defined order;
        the cis records whose rsid is in that intersection, in input order).
    """
    # Distinct SNP ids carried by the cis-eQTL records.
    cis_snp_ids = {eqtl.rsid for eqtl in cis_eqtls}

    # Ids shared between the two sources; the set doubles as the fast lookup.
    shared_ids = set(trans_ids) & cis_snp_ids

    # One record per (SNP, target gene) pair of every shared SNP.
    cis_hits = [eqtl for eqtl in cis_eqtls if eqtl.rsid in shared_ids]

    return list(shared_ids), cis_hits

def crossref_trans_tejaas(transeqtls, cis_eqtls):
    """Cross-reference trans-eQTL records against cis-eQTL records by SNP id.

    Extracts ``rsid`` from every trans record and delegates to
    ``cross_ref_cis_trans``.

    Returns
    -------
    tuple
        (shared SNP ids, cis records of the shared SNPs).
    """
    trans_snp_ids = [hit.rsid for hit in transeqtls]
    shared_ids, cis_hits = cross_ref_cis_trans(trans_snp_ids, cis_eqtls)
    return shared_ids, cis_hits

import random

def get_cistype_fractions(ciseqtls, valid_types, alltypes_dict):
    """Fraction of distinct cis-eQTL SNPs that target each gene type.

    Parameters
    ----------
    ciseqtls : sequence
        Records exposing ``rsid`` and ``target``; iterated once per type.
    valid_types : iterable of str
        Gene types to evaluate.
    alltypes_dict : mapping
        ``alltypes_dict[gtype][target]`` is truthy when the target gene
        belongs to ``gtype``.

    Returns
    -------
    dict
        gtype -> (distinct SNPs with a target of that type) / (distinct SNPs).

    Raises
    ------
    ZeroDivisionError
        If ``ciseqtls`` is empty and ``valid_types`` is not.
    """
    total_cis_snps = len({eqtl.rsid for eqtl in ciseqtls})
    fractions = {}
    for gtype in valid_types:
        snps_of_type = {
            eqtl.rsid for eqtl in ciseqtls if alltypes_dict[gtype][eqtl.target]
        }
        fractions[gtype] = len(snps_of_type) / total_cis_snps
    return fractions

def sample_background_50000(ciseqtls, randompath, valid_types = [], alltypes_dict = dict()):
    """Overlap ten pre-generated random SNP sets with the cis-eQTL SNPs.

    For replicate ids "001".."010" the SNP ids are read from
    ``<randompath>random_50000_<id>/chr<1..22>.txt`` (one id per line) and
    cross-referenced against ``ciseqtls``.

    Parameters
    ----------
    ciseqtls : sequence
        cis-eQTL records exposing ``rsid`` and ``target``.
    randompath : str
        Path prefix of the replicate directories; used by plain string
        concatenation, so it must end with a path separator.
    valid_types : list of str, optional
        Gene types for which per-type overlap counts are collected.
    alltypes_dict : mapping, optional
        gene type -> per-gene membership lookup.

    Returns
    -------
    tuple
        (mean number of random SNPs that are cis-eQTL SNPs over the
        replicates, defaultdict mapping gene type -> list with one count of
        distinct overlapping SNPs per replicate).
    """
    overlap_counts = []
    type_counts = collections.defaultdict(list)
    autosomes = [str(number) for number in np.arange(1, 23)]
    Nrand = "50000"
    for replicate in np.arange(1, 11):
        nid = "{:03d}".format(replicate)
        replicate_dir = randompath + "random_" + Nrand + "_" + nid

        random_ids = []
        for chrm in autosomes:
            chrom_file = os.path.join(replicate_dir, "chr{:s}.txt".format(chrm))
            with open(chrom_file) as ins:
                random_ids.extend(line.rstrip() for line in ins)

        shared_ids, cis_hits = cross_ref_cis_trans(random_ids, ciseqtls)
        overlap_counts.append(len(shared_ids))
        if len(valid_types) > 0:
            for gtype in valid_types:
                _, type_snps, _ = cis_typespecific_eqtls(cis_hits, alltypes_dict[gtype])
                # Only the number of distinct overlapping SNPs of this type is kept.
                type_counts[gtype].append(len(type_snps))
    return np.mean(overlap_counts), type_counts


def sample_binomial(n, p, NTIMES):
    """Draw ``NTIMES`` independent Binomial(n, p) samples.

    All samples are drawn in a single vectorised NumPy call instead of one
    Python-level call per sample, which matters because callers in this
    notebook request 10,000,000 draws at a time.

    Parameters
    ----------
    n : int
        Number of trials per draw.
    p : float
        Success probability in [0, 1].
    NTIMES : int
        Number of draws.

    Returns
    -------
    list of int
        The number of successes of each draw (empty when ``NTIMES`` is 0).
    """
    # Uses the global np.random state, so np.random.seed() controls reproducibility.
    return np.random.binomial(n, p, size=NTIMES).tolist()

In [8]:

# Reformat genetype dict, we can add as many gene annotations as we want here
# Result: alltypes_dict[genetype][ensembl_id] -> True for member genes; every inner
# dict is a defaultdict that answers False for unknown genes. `genetypes` records the
# types in order of first appearance.
alltypes_dict = collections.defaultdict(dict)
genetypes = []
for chrm in range(1,23):
    # Bare lookup: its only effect is to create an empty entry for chrm in the defaultdict.
    gene_info_dict[chrm]
    for k in gene_info_dict[chrm].keys():
        genetype = gene_info_dict[chrm][k]
        if genetype not in alltypes_dict:
            alltypes_dict[genetype] = collections.defaultdict(lambda:False)
            genetypes.append(genetype)
        alltypes_dict[genetype][k] = True
    # Add TF dictionary
    # TF genes are registered under the extra type "TF" in addition to their GENCODE type.
    for k in TF_dict[chrm].keys():
        genetype = "TF"
        if genetype not in alltypes_dict:
            alltypes_dict[genetype] = collections.defaultdict(lambda:False)
            genetypes.append(genetype)
        alltypes_dict[genetype][k] = True

In [19]:
###################################
########### MAF 0.01 ##############
###################################

# Field names of the per-tissue, per-gene-type enrichment record.
CT_TYPE_FIELDS = ['tissue', 'genetype', 'hits', 'enrichment', 'pval', 'frac']
class CisTrans_type(collections.namedtuple('_CisTransType', CT_TYPE_FIELDS)):
    """Immutable per-gene-type enrichment record with the fields listed in CT_TYPE_FIELDS."""
    # Empty __slots__ keeps instances from getting a per-instance __dict__.
    __slots__ = ()
    
    
# Per-tissue enrichment of cis-eQTL SNPs among trans-eQTL SNPs, overall and by the
# gene type of the cis target. Results are collected in three dictionaries:
#   res_dict[tissue]                  -> CisTrans (overall enrichment)
#   res_dict_cistype[tissue][type]    -> CisTrans_type against the cis-eQTL type fraction
#   res_dict_randomtype[tissue][type] -> CisTrans_type against the random-SNP background
# NOTE(review): np.random is never seeded in this cell, so the empirical p-values differ
# between runs.
meqtl_expr  = "tmm_cclm"
tejaas_expr = "raw"

res_dict = dict()
res_dict_cistype = collections.defaultdict(dict)
res_dict_randomtype = collections.defaultdict(dict)
for tissue in tissues:
    gtex_t = "-".join([dataset, tissue])

    # SNP -> value lookup for this tissue; SNPs absent from it are filtered out below.
    snp_maf_dict = load_snp_maf(maffile, tissue)
#     tejaas_file = os.path.join(basepath+"_tejaas_permnull_sb0.1_knn", tissue, "trans_eqtls.txt")
#     if not os.path.exists(tejaas_file):
#         print("{:s} has no trans-eqtl results".format(tissue))
#         continue
#     transeqtls = tejaas_saikat(tejaas_file)

    tejaas_file = os.path.join(basepath, "raw", "gtex_v8-"+tissue, "tejaas", "permnull_sb0.1_knn30", "trans_eqtls_5e-08.txt")
    if not os.path.exists(tejaas_file):
        print("{:s} has no trans-eqtl results".format(tissue))
        continue
    transeqtls = tejaas(tejaas_file)
    transeqtls = [x for x in transeqtls if snp_maf_dict[x.rsid]]


    # Tissues with too few trans- or cis-eQTLs are skipped.
    if len(transeqtls) < 100:
        print("{:s} has less than 100 trans-eqtls".format(tissue))
        continue

    # NOTE(review): read from basepath, while the export cell writes these files under
    # basepath_patch — confirm the file exists in both trees.
    signif_cisfile = os.path.join(basepath, meqtl_expr, gtex_t, "matrixeqtl", "cis_eqtl_signif_fdr0.05.txt")
    if not os.path.exists(signif_cisfile) or os.stat(signif_cisfile).st_size == 0:
        print("{:s} has no cis-file (probably no covariates)".format(tissue))
        continue
    ciseqtls = matrixeqtl_signif(signif_cisfile, snp_maf_dict)
    cis_ids = list(set([x.rsid for x in ciseqtls]))

    if len(ciseqtls) < 100:
        print("{:s} has less than 100 cis-eqtls".format(tissue))
        continue

    # SNPs that are both trans-eQTLs and cis-eQTLs, plus their cis (SNP, target) records.
    cis_trans_eqtls_ids, cistrans_target_eqtls = crossref_trans_tejaas(transeqtls, ciseqtls)

    # Keep only gene types that are targeted by at least one cis-trans SNP.
    valid_types = []
    genetarget_counts = dict()
    for genetype in genetypes:
        cis_types_eqtls, uniq_cis_snps, unique_targets = cis_typespecific_eqtls(cistrans_target_eqtls, alltypes_dict[genetype])
        if len(uniq_cis_snps) > 0:
            # print(genetype, len(uniq_cis_snps), len(unique_targets))
            valid_types.append(genetype)
            genetarget_counts[genetype] = [cis_types_eqtls, uniq_cis_snps, unique_targets]

    # FRAC_GWCIS = len(ciseqtls)/NTOT_SNPS
    # Background: mean number of cis-eQTL SNPs found in random sets of 50000 SNPs.
    randtrans, rand_res_dict = sample_background_50000(ciseqtls, randompath, valid_types, alltypes_dict)
    FRAC_CISTRANS = len(cis_trans_eqtls_ids) / len(transeqtls)
    FRAC_RANDOM_GWCISTRANS = randtrans / 50000

    # NOTE(review): raises ZeroDivisionError when no random SNP overlaps a cis-eQTL.
    enrichment = FRAC_CISTRANS / FRAC_RANDOM_GWCISTRANS

#     ### Calculate empirical p-value
#     ntrans = len(transeqtls)
#     randtrans1k = sample_1000_pval(ciseqtls, ntrans, randompath)
#     num_null = np.array(randtrans1k) /  ntrans
#     null_enrichments = num_null / FRAC_RANDOM_GWCISTRANS
#     ecdf = ECDF(null_enrichments)
#     pval = 1 - ecdf(actual_enrichment)
#     print("Empirical p-val:", pval)

    ncis = len(cis_ids)
    ntrans = len(transeqtls)
    ncistrans = len(cis_trans_eqtls_ids)

    # Null distribution: enrichment of Binomial(ntrans, background fraction) draws.
    randtrans1k_bin = sample_binomial(ntrans, FRAC_RANDOM_GWCISTRANS, 10000000)
    num_null = np.array(randtrans1k_bin) /  ntrans

    null_enrichments = num_null / FRAC_RANDOM_GWCISTRANS
    ecdf = ECDF(null_enrichments)
    # NOTE(review): 1 - ecdf(x) is P(null > x); null draws equal to the observed value
    # are not counted, which matters for this discrete null — confirm this is intended.
    pval = 1 - ecdf(enrichment)


    res_dict[tissue] = CisTrans(tissue=tissue, ncis=ncis, ntrans=ntrans, 
                                ncistrans=ncistrans, randtrans=FRAC_RANDOM_GWCISTRANS,
                                enrichment=enrichment, pval=pval)

    print(f"########## Tissue: {tissue} - {ntrans} trans-eqtls - {ncistrans} cis-trans-eqtls #########")
    print(f"{tissue:>20}        Enrichment: {enrichment:>g} - pval: {pval:>g}")

    # Fraction of all cis-eQTL SNPs that target each valid gene type.
    cistype_fracs_dict = get_cistype_fractions(ciseqtls, valid_types, alltypes_dict)

    for vt in valid_types:
        # from the 50000 random SNPs, randtrans is the nº that are cistrans
        FRAC_RANDOM_TYPE_NULL = np.mean(np.array(rand_res_dict[vt])) / 50000 #randtrans
        FRAC_CIS_TYPE_NULL    = cistype_fracs_dict[vt]
        uniq_snps_targettype = len(genetarget_counts[vt][1])
        # Observed: distinct cis-trans SNPs targeting this type, relative to all trans-eQTLs.
        FRAC_CISTYPE =  uniq_snps_targettype / ntrans

        # A type fraction of exactly 1 is reported as "no enrichment" without sampling.
        if FRAC_CIS_TYPE_NULL != 1.0:        
            cis_type_enrichment    = FRAC_CISTYPE / FRAC_CIS_TYPE_NULL
            # Calculate pvalue
            randtrans1k_bin = sample_binomial(ntrans, FRAC_CIS_TYPE_NULL, 10000000)
            num_null = np.array(randtrans1k_bin) /  ntrans
            null_enrichments = num_null / FRAC_CIS_TYPE_NULL
            ecdf = ECDF(null_enrichments)
            cis_pval = 1 - ecdf(cis_type_enrichment)
        else:
            cis_pval = 1.0
            cis_type_enrichment = 1.0

        # NOTE(review): divides by zero when no random SNP hit this gene type.
        random_type_enrichment = FRAC_CISTYPE / FRAC_RANDOM_TYPE_NULL
        # Calculate pvalue
        randtrans1k_bin = sample_binomial(ntrans, FRAC_RANDOM_TYPE_NULL, 10000000)
        num_null = np.array(randtrans1k_bin) /  ntrans
        null_enrichments = num_null / FRAC_RANDOM_TYPE_NULL
        ecdf = ECDF(null_enrichments)
        random_pval = 1 - ecdf(random_type_enrichment)

        print(f"{vt:>20}    CIS Enrichment: {cis_type_enrichment:>g} - pval: {cis_pval:>g}")
        print(f"{vt:>20} RANDOM Enrichment: {random_type_enrichment:>g} - pval: {random_pval:>g}")

        res_dict_cistype[tissue][vt]    = CisTrans_type(tissue=tissue, genetype=vt,
                                                        hits=uniq_snps_targettype,
                                                        frac=FRAC_CIS_TYPE_NULL,
                                                        enrichment=cis_type_enrichment, pval=cis_pval)
        res_dict_randomtype[tissue][vt] = CisTrans_type(tissue=tissue, genetype=vt, 
                                                        hits=uniq_snps_targettype,
                                                        frac=FRAC_RANDOM_TYPE_NULL,
                                                        enrichment=random_type_enrichment, pval=random_pval)

########## Tissue: as - 1280 trans-eqtls - 226 cis-trans-eqtls #########
                  as        Enrichment: 0.787656 - pval: 0.999984
             lincRNA    CIS Enrichment: 0.0350237 - pval: 1
             lincRNA RANDOM Enrichment: 0.1565 - pval: 1
      protein_coding    CIS Enrichment: 0.186786 - pval: 1
      protein_coding RANDOM Enrichment: 0.833482 - pval: 0.998949
           antisense    CIS Enrichment: 0.113785 - pval: 1
           antisense RANDOM Enrichment: 0.510899 - pval: 0.999864
processed_transcript    CIS Enrichment: 0.0751527 - pval: 1
processed_transcript RANDOM Enrichment: 0.338937 - pval: 0.933684


KeyboardInterrupt: 

In [29]:
cis_trans_eqtls_ids

['chr14_94121934_G_A_b38',
 'chr1_155687151_G_A_b38',
 'chr2_84213880_T_C_b38',
 'chr20_32574766_G_A_b38',
 'chr20_20052966_A_C_b38',
 'chr22_42480785_G_A_b38',
 'chr20_19998228_C_T_b38',
 'chr15_52289286_T_C_b38',
 'chr10_89164501_A_G_b38',
 'chr15_52242147_G_A_b38',
 'chr20_20045611_C_A_b38',
 'chr12_22356161_T_C_b38',
 'chr13_49746825_T_C_b38',
 'chr10_89494542_T_C_b38',
 'chr11_2917931_G_A_b38',
 'chr5_61259437_A_G_b38',
 'chr10_114281518_G_A_b38',
 'chr2_84278845_G_A_b38',
 'chr2_84232904_A_C_b38',
 'chr6_110932797_T_G_b38',
 'chr6_54016213_G_A_b38',
 'chr6_3714832_T_C_b38',
 'chr20_32580637_C_T_b38',
 'chr17_19338192_G_A_b38',
 'chr12_22551497_G_A_b38',
 'chr11_2307463_C_T_b38',
 'chr9_85530029_T_C_b38',
 'chr10_89195717_A_C_b38',
 'chr20_20054592_T_C_b38',
 'chr11_61954740_G_T_b38',
 'chr15_52275027_C_T_b38',
 'chr17_78035144_C_T_b38',
 'chr17_19301550_T_C_b38',
 'chr2_84263152_C_T_b38',
 'chr2_84317935_C_T_b38',
 'chr2_84225011_G_A_b38',
 'chr2_84315354_T_C_b38',
 'chr11_291847

In [28]:
cistrans_target_eqtls

[SNPRes(rsid='chr1_204415772_C_T_b38', chrom='1', pos='204415772', logp=34.0452, fdr=31.2018, target='ENSG00000226330', maf=None),
 SNPRes(rsid='chr1_204415772_C_T_b38', chrom='1', pos='204415772', logp=25.4566, fdr=22.9629, target='ENSG00000158615', maf=None),
 SNPRes(rsid='chr1_156905245_T_G_b38', chrom='1', pos='156905245', logp=18.2561, fdr=16.0195, target='ENSG00000187800', maf=None),
 SNPRes(rsid='chr1_156903671_T_C_b38', chrom='1', pos='156903671', logp=17.0857, fdr=14.8949, target='ENSG00000187800', maf=None),
 SNPRes(rsid='chr1_156905300_G_A_b38', chrom='1', pos='156905300', logp=16.023, fdr=13.8701, target='ENSG00000187800', maf=None),
 SNPRes(rsid='chr1_156905315_C_A_b38', chrom='1', pos='156905315', logp=15.827, fdr=13.6813, target='ENSG00000187800', maf=None),
 SNPRes(rsid='chr1_156902975_T_C_b38', chrom='1', pos='156902975', logp=15.6949, fdr=13.5544, target='ENSG00000187800', maf=None),
 SNPRes(rsid='chr1_156900823_G_A_b38', chrom='1', pos='156900823', logp=15.3358, fdr=

In [23]:
uniq_snps_targettype

3

In [None]:
# Write the per-tissue enrichment table: one tab-separated line per tissue with
# tissue, ncis, ntrans, ncistrans, randtrans, enrichment, pval (no header).
outcisfilename = os.path.join(outdir,"CisEQTL_enrichment_results_"+title+".txt")
if os.path.exists(outcisfilename):
    # Never overwrite earlier results; a bare `raise` has no active exception here,
    # so raise an explicit error that names the file.
    print("Warning! File exists")
    raise FileExistsError(outcisfilename)
with open(outcisfilename, 'w') as outstream:
    for tissue in tissues:
        if tissue in res_dict:
            tissue_res = res_dict[tissue]
            line = f"{tissue}\t{tissue_res.ncis}\t{tissue_res.ntrans}\t{tissue_res.ncistrans}\t{tissue_res.randtrans}\t{tissue_res.enrichment}\t{tissue_res.pval}\n"
            outstream.write(line)

# Write the per-gene-type table: for every tissue and gene type one "CIS" line and one
# "RANDOM" line with tissue, genetype, label, hits, frac, enrichment, pval (no header).
outcistypefilename = os.path.join(outdir,"CisEQTL_target_enrichment_results_"+title+".txt")
# Guard the file that is about to be written. Checking outcisfilename here would always
# abort, because that file has just been created above.
if os.path.exists(outcistypefilename):
    print("Warning! File exists")
    raise FileExistsError(outcistypefilename)
with open(outcistypefilename, 'w') as outstream:
    for tissue in tissues:
        if tissue in res_dict_cistype:
            for genetype in res_dict_cistype[tissue].keys():
                cis_hits = res_dict_cistype[tissue][genetype].hits
                cis_frac = res_dict_cistype[tissue][genetype].frac
                cis_e    = res_dict_cistype[tissue][genetype].enrichment
                cis_pval = res_dict_cistype[tissue][genetype].pval
                random_hits = res_dict_randomtype[tissue][genetype].hits
                random_frac = res_dict_randomtype[tissue][genetype].frac
                random_e    = res_dict_randomtype[tissue][genetype].enrichment
                random_pval = res_dict_randomtype[tissue][genetype].pval
                cisline    = f"{tissue}\t{genetype}\tCIS\t{cis_hits}\t{cis_frac}\t{cis_e}\t{cis_pval}\n"
                randomline = f"{tissue}\t{genetype}\tRANDOM\t{random_hits}\t{random_frac}\t{random_e}\t{random_pval}\n"
                outstream.write(cisline)
                outstream.write(randomline)

In [None]:
import json
import os 
from utils import utils

# Per-tissue display metadata keyed by the short tissue id.
json_file = "../gtex_v8_metadata.json"
with open(json_file) as instream:
    gtex_meta = json.load(instream)
tissue_colors = dict()
tissue_names = dict()
tissue_samples = dict()
for tshort, tfull in zip(tissues, descriptions):
    tissue_names[tshort] = tfull
    # The metadata is keyed by the tissue description with spaces replaced by underscores;
    # a description missing from the JSON raises KeyError here.
    tissue_colors[tshort] = "#" + gtex_meta[tfull.replace(" ", "_")]["colorHex"]
    tissue_samples[tshort] = gtex_meta[tfull.replace(" ", "_")]["rnaSeqAndGenotypeSampleCount"]

In [None]:
import operator
# Short tissue ids ordered by ascending sample count.
sorted_tissues = [x[0] for x in sorted(tissue_samples.items(), key=operator.itemgetter(1))]


In [None]:
# Load cis-eQTL enrichments (for our matrixeqtl cis-eqtls)
# Reload the per-tissue enrichment table from disk so the plots can be made without
# re-running the enrichment loop.
# NOTE(review): this replaces the in-memory res_dict; res_dict_randomtype, used by the
# per-gene-type plots, is not reloaded and must still come from the enrichment cell.
cisfilename = os.path.join(basepath, "cis_eqtls_analysis", title, "CisEQTL_enrichment_results_"+title+".txt")
# df_cis = pd.read_csv(cisfilename, sep="\t", header= None)
# df_cis.columns = ["tissue", "ncis", "ntrans", "ncistrans", "rand_frac", "cis_enrichment", "pval"]
# df_cis.index = df_cis.tissue

res_dict = dict()
with open(cisfilename) as instream:
    for line in instream:
        # Columns: tissue, ncis, ntrans, ncistrans, rand_frac, enrichment, pval (no header).
        arr = line.strip().split("\t")
        tissue = arr[0]
        ncis   = int(arr[1])
        ntrans = int(arr[2])
        ncistrans = int(arr[3])
        rand_frac = float(arr[4])
        enrichment = float(arr[5])
        pval = float(arr[6])
        res_dict[tissue] = CisTrans(tissue=tissue, ncis=ncis, ntrans=ntrans, 
                                        ncistrans=ncistrans, randtrans=rand_frac,
                                        enrichment=enrichment, pval=pval)

In [None]:
from utils import mpl_stylesheet
# Horizontal bar plot of log2(enrichment) per tissue, ordered by sample count, with
# significance stars next to the bars.
mpl_stylesheet.banskt_presentation(fontfamily = 'latex-clearsans', fontsize = 22, colors = 'banskt', dpi = 300)

# Parallel lists, one entry per plotted tissue.
objects = list()
performance = list()
significance = list()
bar_colors = list()
bar_width = list()

for tshort in sorted_tissues:
    if tshort in res_dict:
        ntrans = res_dict[tshort].ntrans
        if ntrans > 0:
#             performance.append(res_dict[tshort].enrichment - 1.0))
            # Tissues with zero enrichment (log2 = -inf) are left out of the plot.
            if np.log2(res_dict[tshort].enrichment) != -np.inf:
                objects.append(tissue_names[tshort] + f" ({tissue_samples[tshort]})")
                performance.append(np.log2(res_dict[tshort].enrichment))
                # Stars: *** p <= 0.0001, ** p <= 0.001, * p <= 0.05.
                if res_dict[tshort].pval <= 0.0001:
                    significance.append('***')
                elif res_dict[tshort].pval <= 0.001:
                    significance.append('**')
                elif res_dict[tshort].pval <= 0.05:
                    significance.append('*')
                else:
                    significance.append('')
                bar_colors.append(tissue_colors[tshort])
                bar_width.append(tissue_samples[tshort])

# Collected but not used for the bar height in this plot (fixed height of 0.5 below).
bar_width = np.array(bar_width)

fig = plt.figure(figsize = (20, 15))
ax1 = fig.add_subplot(111)

y_pos = np.arange(len(objects))
bar1 = ax1.barh(y_pos, performance, align='center', color = bar_colors, linewidth = 0, height = 0.5) #01 * bar_width)

ax1.set_yticks(y_pos)
ax1.set_yticklabels(objects)
ax1.set_xlabel('Cis-EQTL $log_2(Enrichment)$')
# NOTE(review): np.max / np.min raise ValueError when no tissue qualified (empty list).
xmax = np.around(np.max(performance) + 0.05, decimals = 1)
# The lower axis limit is clipped so that bars below -2 do not stretch the axis.
xmin = np.around(max(np.min(performance), -2) - 0.05, decimals = 1)
ax1.set_xlim([xmin, xmax])

ax1.tick_params(bottom = True, top = False, left = False, right = False, labelleft = True, labelbottom = True)
# Only the bottom spine stays visible.
for side, border in ax1.spines.items():
    if not side == 'bottom':
        border.set_visible(False)
        
# Add p-values beside the bars
for i, rect in enumerate(bar1):
    width = rect.get_width()
    # NOTE(review): for negative bars this places the left-aligned label over the bar end.
    xpos = rect.get_width() + 0.02
    ypos = rect.get_y() + rect.get_height() / 2.0
    ax1.text(xpos, ypos, f'{significance[i]}', ha='left', va='center')

fig.canvas.draw()
# xticklabels = [f'{x.get_position()[0] + 1 :3.1f}' for x in ax1.get_xticklabels()]
# ax1.set_xticklabels(xticklabels)

# NOTE(review): saved relative to the working directory rather than under outdir.
outfile = "cis_eqtl_enrichment_barplot.png"
plt.tight_layout()
plt.savefig(outfile, bbox_inches='tight')
plt.show()

In [None]:
from utils import mpl_stylesheet
# One horizontal bar plot per gene type: enrichment against the random-SNP background
# for every tissue that has a result for that type.
mpl_stylesheet.banskt_presentation(fontfamily = 'latex-clearsans', fontsize = 22, colors = 'banskt', dpi = 300)

# NOTE(review): this overwrites the global `genetypes` that the enrichment loop iterates
# over; re-running that loop after this cell only sees the types collected here.
genetypes = []
for tshort in tissues:
    genetypes += list(res_dict_randomtype[tshort].keys())
genetypes = list(set(genetypes))

for genetype in genetypes:
    # Parallel lists, one entry per plotted tissue.
    objects = list()
    performance = list()
    significance = list()
    bar_colors = list()
    bar_width = list()
    for tshort in sorted_tissues:
        if tshort in res_dict:
            ntrans = res_dict[tshort].ntrans
            if ntrans > 0 and res_dict_randomtype[tshort].get(genetype, False):
                # Bars show enrichment - 1, so a bar of length 0 means "no enrichment".
                performance.append(res_dict_randomtype[tshort][genetype].enrichment - 1.0)
                objects.append(tissue_names[tshort])
                bar_colors.append(tissue_colors[tshort])
                bar_width.append(tissue_samples[tshort])
                # Stars: *** p <= 0.0001, ** p <= 0.001, * p <= 0.05.
                if res_dict_randomtype[tshort][genetype].pval <= 0.0001:
                    significance.append('***')
                elif res_dict_randomtype[tshort][genetype].pval <= 0.001:
                    significance.append('**')
                elif res_dict_randomtype[tshort][genetype].pval <= 0.05:
                    significance.append('*')
                else:
                    significance.append('')
#                 objects.append(tissue_names[tshort])
#                 performance.append(res_dict[tshort].enrichment - 1.0)
#                 bar_colors.append(tissue_colors[tshort])
#                 bar_width.append(1)#tissue_samples[tshort])
#                 if res_dict[tshort].pval <= 0.0001:
#                     significance.append('***')
#                 elif res_dict[tshort].pval <= 0.001:
#                     significance.append('**')
#                 elif res_dict[tshort].pval <= 0.05:
#                     significance.append('*')
#                 else:
#                     significance.append('')


    bar_width = np.array(bar_width)

    fig = plt.figure(figsize = (15, 12))
    ax1 = fig.add_subplot(111)

    y_pos = np.arange(len(objects))
    # Bar height scales with the tissue's sample count (0.001 per sample).
    bar1 = ax1.barh(y_pos, performance, align='center', color = bar_colors, linewidth = 0, height = 0.001 * bar_width)

    ax1.set_yticks(y_pos)
    ax1.set_yticklabels(objects)
    ax1.set_xlabel('Cis-EQTL Target-gene Enrichment in {:s}'.format(genetype))
    ax1.set_title(genetype)
    xmax = np.around(np.max(performance) + 0.05, decimals = 1)
    xmin = np.around(np.min(performance) - 0.05, decimals = 1)
    ax1.set_xlim([xmin, xmax])

    ax1.tick_params(bottom = True, top = False, left = False, right = False, labelleft = True, labelbottom = True)
    # Only the bottom spine stays visible.
    for side, border in ax1.spines.items():
        if not side == 'bottom':
            border.set_visible(False)

    # Add p-values beside the bars
    for i, rect in enumerate(bar1):
        width = rect.get_width()
        xpos = rect.get_width() + 0.02
        ypos = rect.get_y() + rect.get_height() / 2.0
        ax1.text(xpos, ypos, f'{significance[i]}', ha='left', va='center')

    # Draw first so the tick labels exist, then relabel them as tick position + 1 to
    # show the enrichment itself instead of enrichment - 1.
    fig.canvas.draw()
    xticklabels = [f'{x.get_position()[0] + 1 :3.1f}' for x in ax1.get_xticklabels()]
    ax1.set_xticklabels(xticklabels)

    outfile_type = os.path.join(outdir, "cis_eqtl_enrichment_{:s}.png".format(genetype))
    plt.tight_layout()
#     plt.savefig(outfile_type, bbox_inches='tight')
    # NOTE(review): figures are shown but neither saved nor closed inside this loop.
    plt.show()