In [1]:
import time
import sys, os, collections
import numpy as np
sys.path.append('../')
from operator import attrgetter

def read_ldfile(ldfile):
    """Load pairwise LD values from a whitespace-delimited table.

    The first line is treated as a header and skipped.  Every following
    non-blank line must hold at least five columns: the second and third
    columns (the two SNP positions) are kept as *string* keys and the fifth
    column is parsed as the r2 value.  The first and fourth columns are not
    used.

    Parameters
    ----------
    ldfile : str
        Path of the LD table.

    Returns
    -------
    collections.defaultdict
        Nested mapping ``ldict[pos1][pos2] = r2``.  Only the (pos1, pos2)
        orientation found in the file is stored.  Because the container is a
        ``defaultdict(dict)``, indexing an unknown ``pos1`` with ``[]``
        inserts an empty dict; use ``.get`` to probe without inserting.
        An empty file yields an empty mapping.
    """
    ldict = collections.defaultdict(dict)
    with open(ldfile) as instream:
        # Skip the header; the default keeps an empty file from raising StopIteration.
        next(instream, None)
        for line in instream:
            arr = line.split()
            if not arr:
                # Blank line (e.g. a trailing newline at the end of the file).
                continue
            pos1 = arr[1]
            pos2 = arr[2]
            ldict[pos1][pos2] = float(arr[4])
    return ldict

import mpmath

# Work with 500 decimal digits in mpmath; presumably chosen so that the
# subtraction in pval() does not cancel to zero for large arguments.
mpmath.mp.dps = 500


def pval(x):
    """Return 1 - Phi(x), with Phi the standard normal CDF, as a Python float.

    Phi(x) is evaluated as 0.5 * (1 + erf(x / sqrt(2))) in mpmath arithmetic
    and only the final difference is converted to a float.
    """
    normal_cdf = 0.5 * (1 + mpmath.erf(x / mpmath.sqrt(2)))
    return float(1 - normal_cdf)

# Field order of one result record.
SNPRES_FIELDS = [
    'rsid',
    'chrom',
    'pos',
    'q',
    'mu',
    'sigma',
    'p',
]

_SNPResBase = collections.namedtuple('_SNPRes', SNPRES_FIELDS)


class SNPRes(_SNPResBase):
    """Immutable record holding the fields listed in SNPRES_FIELDS."""

    # Empty __slots__ so instances do not get a per-instance __dict__.
    __slots__ = ()
        
def tejaas(filepath, chrom):
    """Parse a tab-separated result table into a list of SNPRes records.

    The first line is a header and is skipped.  Columns read from every
    other row (0-based): 0 = SNP id, 1 = position (int), 2 = Q, 3 = mu,
    4 = sigma, 5 = p-value.  This is the column order written by
    ``tejaas_write``.

    Rows whose sigma is exactly 0 are dropped.  When the stored p-value is
    exactly 0 it is replaced by ``pval((q - mu) / sigma)``.

    Parameters
    ----------
    filepath : str
        Path of the result table.
    chrom :
        Value stored unchanged in the ``chrom`` field of every record.

    Returns
    -------
    list of SNPRes
        Records in file order; empty when the file has no data rows
        (including a completely empty file).
    """
    res = list()
    with open(filepath, 'r') as mfile:
        # Skip the header; the default keeps an empty file from raising StopIteration.
        next(mfile, None)
        for line in mfile:
            line = line.strip()
            if not line:
                # Blank line (e.g. trailing newline) -- nothing to parse.
                continue
            arr   = line.split("\t")
            rsid  = arr[0]
            pos   = int(arr[1])
            q     = float(arr[2])
            mu    = float(arr[3])
            sigma = float(arr[4])
            p     = float(arr[5])
            if sigma == 0:
                # (q - mu) / sigma would be undefined; skip the SNP.
                continue
            if p == 0:
                # Stored p-value is exactly zero: recompute it from the
                # standardised statistic.
                p = pval((q - mu) / sigma)
            res.append(SNPRes(rsid=rsid, chrom=chrom, pos=pos, q=q, mu=mu, sigma=sigma, p=p))
    return res

def tejaas_write(snplist, filepath):
    """Write SNP records to ``filepath`` as a tab-separated table.

    A header line is written first, then one row per element of ``snplist``
    with its ``rsid``, ``pos``, ``q``, ``mu``, ``sigma`` and ``p`` attributes.
    An existing file is overwritten.
    """
    row_template = "{:s}\t{:d}\t{:g}\t{:g}\t{:g}\t{:g}\n"
    with open(filepath, 'w') as out:
        out.write("ID\tPos\tQ\tMu\tSigma\tP\n")
        out.writelines(
            row_template.format(rec.rsid, rec.pos, rec.q, rec.mu, rec.sigma, rec.p)
            for rec in snplist
        )

In [2]:
def prune_rr(rrfile, chrm, ld_dict, isworst=False):
    """Greedily prune the SNPs of one result file using pairwise LD.

    The file is parsed with ``tejaas`` and the records are scanned in file
    order.  Every SNP that has not been discarded acts as a seed: the SNPs
    that follow it within 100 kb and are listed under the seed in ``ld_dict``
    are compared by p-value.  The current winner is replaced whenever a
    partner has a strictly better p-value; a partner that does not win
    (ties included) is marked as discarded.  After the scan the winner is
    kept unless it was already kept or discarded.

    NOTE(review): the forward scan stops at the first SNP more than 100 kb
    away, so this assumes the rows of ``rrfile`` are sorted by position --
    confirm against the producer of the file.

    Parameters
    ----------
    rrfile : str
        Path of the result table, read with ``tejaas``.
    chrm :
        Chromosome label forwarded to ``tejaas``.
    ld_dict : mapping
        ``ld_dict[str(pos1)][str(pos2)]`` as built by ``read_ldfile``.  Any
        pair that is present counts as "in LD", whatever value is stored.
        Only the (seed, later SNP) orientation is looked up.
    isworst : bool, optional
        If False (default) the smallest p-value wins; if True the largest.

    Returns
    -------
    list of SNPRes
        The kept records, sorted by position.

    Prints the elapsed wall-clock time as a side effect.
    """
    start = time.time()
    snplist = tejaas(rrfile, chrm)
    nsnps = len(snplist)
    max_dist = 100000  # window (bp) inside which LD partners are searched
    pruned_snps = list()
    added = set()      # positions already placed in pruned_snps
    switched = set()   # positions that lost a comparison and are discarded

    # Iterate over every SNP, including the last ones: the previous bounds
    # (len - 2 for the seed, len - 1 for the partner) silently dropped the
    # trailing SNPs of each file.
    for i in range(nsnps):
        seed = snplist[i]
        if seed.pos in switched:
            # Already discarded in favour of an earlier SNP.  The previous
            # code had a bare `next` here, which is a no-op in Python.
            continue
        best = seed
        partners = ld_dict.get("{:d}".format(seed.pos), None)
        if partners is not None:
            ## This snp is in LD with at least one other SNP
            j = i + 1
            # Check the index before touching snplist[j].
            while j < nsnps and abs(seed.pos - snplist[j].pos) <= max_dist:
                other = snplist[j]
                if partners.get("{:d}".format(other.pos), None) is not None:
                    if isworst:
                        other_wins = other.p > best.p
                    else:
                        other_wins = other.p < best.p
                    if other_wins:
                        best = other
                    else:
                        ## in LD with the seed and not better than the current winner
                        switched.add(other.pos)
                ## a pair that is not in LD is left alone; the later SNP
                ## gets its own turn as a seed
                j += 1
        # `best` may be a partner that an earlier seed already kept or discarded.
        if best.pos not in added and best.pos not in switched:
            pruned_snps.append(best)
            added.add(best.pos)
    took = time.time() - start
    print("LD prunning took", took)
    return sorted(pruned_snps, key=attrgetter('pos'), reverse=False)

In [3]:
from utils import utils
# Tissues to process.  To run over every tissue instead, read the names with
#   tissuenames, descriptions = utils.read_tissues("/usr/users/fsimone/trans-eqtl-pipeline/main/tissues.txt")
# and use ["gtex-"+t for t in tissuenames].
tissues = ["gtex-ms"]

# Expression variants (alternative: "lmcorrected_age_precorr").
expressions = ["norm"]

# "tejaas_rand_1" ... "tejaas_rand_44"; currently not appended to `methods`.
randmethods = ["tejaas_rand_{:d}".format(idx) for idx in range(1, 45)]
methods = ["tejaas", "tejaas_rand"]

# Root directory of the result files
# (alternative: "/cbscratch/franco/trans-eqtl/dev-pipeline/gtex_v6").
basedir = "/cbscratch/franco/trans-eqtl/dev-pipeline/precorr"

# Chromosomes 1..22.
chroms = np.arange(1, 23)

# Forwarded to prune_rr: keep the largest p-value of each LD group when True.
isworst = True

In [4]:
# Pruning

for chrm in chroms:
    # Load the LD table of this chromosome once; it is reused for every
    # tissue / expression / method combination below.
    start = time.time()
    print("Loading CHR ", chrm, end="")
    ldfile = "/cbscratch/franco/datasets/gtex/genotypes/vcfs_allsamples/LD/chr{:d}_ld_window_100000.geno.ld".format(chrm)
    myldict = read_ldfile(ldfile)
    took = time.time() - start
    print(" - {:g} seconds".format(took))

    # Prune every result file of this chromosome and write the result next to it.
    for tissue in tissues:
        for expr in expressions:
            for method in methods:
                inputfile = os.path.join(basedir,expr,tissue,method,"permnull_sb0.05/chr"+str(chrm)+"/rr.txt")
                print(inputfile)

                # prune_rr parses inputfile itself, so the file is not read
                # separately here (it used to be parsed twice).
                pruned_snps2 = prune_rr(inputfile, chrm, myldict, isworst)

                # Output name: <input>.ld, plus ".worst" when isworst is set.
                pruned_outfile = inputfile+".ld"
                if isworst:
                    pruned_outfile += ".worst"
                tejaas_write(pruned_snps2, pruned_outfile)

Loading CHR  1 - 14.5737 seconds
/cbscratch/franco/trans-eqtl/dev-pipeline/precorr/norm/gtex-ms/tejaas/permnull_sb0.05/chr1/rr.txt
LD prunning took 210.1786768436432
/cbscratch/franco/trans-eqtl/dev-pipeline/precorr/norm/gtex-ms/tejaas_rand/permnull_sb0.05/chr1/rr.txt
LD prunning took 206.39970469474792
Loading CHR  2 - 17.1821 seconds
/cbscratch/franco/trans-eqtl/dev-pipeline/precorr/norm/gtex-ms/tejaas/permnull_sb0.05/chr2/rr.txt
LD prunning took 226.17120265960693
/cbscratch/franco/trans-eqtl/dev-pipeline/precorr/norm/gtex-ms/tejaas_rand/permnull_sb0.05/chr2/rr.txt
LD prunning took 223.6771969795227
Loading CHR  3 - 16.4313 seconds
/cbscratch/franco/trans-eqtl/dev-pipeline/precorr/norm/gtex-ms/tejaas/permnull_sb0.05/chr3/rr.txt
LD prunning took 210.0557951927185
/cbscratch/franco/trans-eqtl/dev-pipeline/precorr/norm/gtex-ms/tejaas_rand/permnull_sb0.05/chr3/rr.txt
LD prunning took 205.20145964622498
Loading CHR  4 - 19.3543 seconds
/cbscratch/franco/trans-eqtl/dev-pipeline/precorr/no