In [2]:
import time
import sys, os, collections
import numpy as np
from operator import attrgetter
sys.path.append('../')
sys.path.append('/usr/users/fsimone/tejaas')
from utils import utils
# Path of the text file listing the tissues to process.
tissue_file = "/usr/users/fsimone/trans-eqtl-pipeline/main/tissues.txt"
# utils.read_tissues returns two values, unpacked here; `tissues` is later
# used to build per-tissue input paths (presumably short tissue identifiers
# plus matching human-readable descriptions -- TODO confirm against utils).
tissues, descriptions = utils.read_tissues(tissue_file)

def read_ldfile(ldfile):
    """Read a pairwise LD table into a symmetric nested lookup.

    The first line of the file is treated as a header and skipped. Every
    other non-blank line is split on whitespace; column 1 and column 2 are
    the two positions of a pair and column 4 is their r2 value. Column 0
    (chromosome) and column 3 are not needed to build the lookup and are
    ignored.

    Parameters
    ----------
    ldfile : str
        Path of the LD table.

    Returns
    -------
    collections.defaultdict
        ``ldict[pos1][pos2] == ldict[pos2][pos1] == r2`` with positions kept
        as strings. Both levels are ``defaultdict`` objects that yield
        ``False`` for a missing key, so callers can test
        ``if ldict[pos]:`` directly (note that such a lookup inserts the
        missing key).

    Raises
    ------
    IndexError
        If a non-blank line has fewer than five columns.
    ValueError
        If the r2 column cannot be converted to ``float``.
    """
    ldict = collections.defaultdict(lambda: False)
    with open(ldfile) as instream:
        # Skip the header; the default keeps an empty file from raising
        # StopIteration.
        next(instream, None)
        for line in instream:
            arr = line.split()
            if not arr:
                # Blank (e.g. trailing) line: nothing to record.
                continue
            pos1 = arr[1]
            pos2 = arr[2]
            r2 = float(arr[4])
            # Store the pair in both directions so either position can be
            # used as the lookup key.
            for first, second in ((pos1, pos2), (pos2, pos1)):
                if first not in ldict:
                    ldict[first] = collections.defaultdict(lambda: False)
                ldict[first][second] = r2
    return ldict

import mpmath
# Working precision (in decimal digits) used by mpmath for pval below.
mpmath.mp.dps = 500


def pval(x):
    """Return 1 - Phi(x) as a Python float.

    Phi(x) = 0.5 * (1 + erf(x / sqrt(2))) is evaluated with mpmath at the
    precision configured above, and only the final result is converted to
    ``float``.
    """
    cdf = 0.5 * (1 + mpmath.erf(x / mpmath.sqrt(2)))
    return float(1 - cdf)
   
# Field names of one SNP result record, in positional order.
SNPRES_FIELDS = [
    'rsid',
    'chrom',
    'pos',
    'q',
    'mu',
    'sigma',
    'p',
    'logp',
]

_SNPResBase = collections.namedtuple('_SNPRes', SNPRES_FIELDS)


class SNPRes(_SNPResBase):
    """Immutable per-SNP result record with the fields of SNPRES_FIELDS."""

    # Empty __slots__ keeps instances free of a per-instance __dict__.
    __slots__ = ()

def tejaas_saikat(filepath):
    """Parse a tab-separated SNP result file into a list of SNPRes records.

    The first line is skipped as a header. Each following line supplies the
    rsid (column 0), chromosome (column 1), position (column 2) and p-value
    (column 3). A p-value of exactly zero is replaced by 1e-30 so that its
    logarithm is finite. ``logp`` holds -log10(p); ``q``, ``mu`` and
    ``sigma`` are always ``None``.
    """
    records = []
    with open(filepath, 'r') as handle:
        next(handle)
        for raw_line in handle:
            fields = raw_line.strip().split("\t")
            rsid = fields[0]
            pos = int(fields[2])
            chrom = int(fields[1])
            # `or` substitutes the floor value only when the parsed p is zero.
            pvalue = float(fields[3]) or 1e-30
            neg_logp = -np.log10(pvalue)
            records.append(
                SNPRes(
                    rsid=rsid,
                    chrom=chrom,
                    pos=pos,
                    q=None,
                    mu=None,
                    sigma=None,
                    p=pvalue,
                    logp=neg_logp,
                )
            )
    return records

def tejaas_saikat_write(snplist, filepath):
    """Write SNP records to a tab-separated file with an ID/CHROM/Pos/P header.

    Each record must expose ``rsid`` (str), ``chrom`` (int), ``pos`` (int)
    and ``p`` (number); one line is written per record, in input order.
    An existing file at ``filepath`` is overwritten.
    """
    row_template = "{:s}\t{:d}\t{:d}\t{:g}\n"
    with open(filepath, 'w') as out_handle:
        out_handle.write("ID\tCHROM\tPos\tP\n")
        out_handle.writelines(
            row_template.format(rec.rsid, rec.chrom, rec.pos, rec.p)
            for rec in snplist
        )

def prune_region(region, myldict):
    """Greedily LD-prune the SNPs of a single region.

    SNPs are visited from the highest to the lowest ``logp``. A SNP is kept
    unless a SNP kept earlier lists its position among its LD partners; the
    partners of every kept SNP are marked as rejected.

    Parameters
    ----------
    region : iterable
        Records exposing ``pos`` and ``logp`` attributes (e.g. ``SNPRes``).
    myldict : mapping
        Maps ``str(position)`` to a mapping whose keys are the positions
        (as strings) in LD with it, such as the structure returned by
        ``read_ldfile``. It is only read: lookups go through ``.get`` so a
        ``defaultdict`` is not grown with an entry for every SNP queried.

    Returns
    -------
    list
        The kept records, sorted by ascending ``pos``.
    """
    rejected = set()
    accepted = []
    for snp in sorted(region, key=attrgetter('logp'), reverse=True):
        pos_key = str(snp.pos)
        if pos_key in rejected:
            continue
        accepted.append(snp)
        partners = myldict.get(pos_key)
        if partners:
            # Iterating a mapping yields its keys, i.e. the partner positions.
            rejected.update(partners)
    return sorted(accepted, key=attrgetter('pos'), reverse=False)
            
def prune_region_GW(region, myldict):
    """Greedily LD-prune SNPs genome-wide and record what each lead absorbed.

    SNPs are visited from the highest to the lowest ``logp``. A SNP whose
    position was already claimed by a kept (lead) SNP on the same chromosome
    is dropped and its position is appended to that lead's LD region;
    otherwise the SNP is kept and claims all of its LD partners.

    Parameters
    ----------
    region : iterable
        Records exposing ``rsid``, ``chrom``, ``pos`` and ``logp``.
    myldict : mapping
        Maps a chromosome to a per-chromosome LD lookup
        (``str(position)`` -> mapping keyed by partner positions as strings).
        Chromosomes or positions that are absent are treated as having no LD
        partners. The mapping is only read (``.get``), never grown.

    Returns
    -------
    tuple
        ``(accepted, ld_region)`` where ``accepted`` is the list of kept
        records sorted by ``(chrom, pos)`` and ``ld_region`` is a
        ``defaultdict(list)`` mapping a lead ``rsid`` to the positions of the
        SNPs pruned in its favour.
    """
    start = time.time()
    ld_region = collections.defaultdict(list)
    # chrom -> {position string -> rsid of the lead SNP that claimed it}.
    # Created lazily so any chromosome label works, not only 1..22.
    claimed = collections.defaultdict(dict)
    accepted = []
    for snp in sorted(region, key=attrgetter('logp'), reverse=True):
        pos_key = str(snp.pos)
        chrom_claims = claimed[snp.chrom]
        if pos_key in chrom_claims:
            ld_region[chrom_claims[pos_key]].append(snp.pos)
            continue
        accepted.append(snp)
        chrom_ld = myldict.get(snp.chrom)
        partners = chrom_ld.get(pos_key) if chrom_ld else None
        if partners:
            for partner in partners:
                # Leads are visited strongest first, so keep the first claim:
                # a partner shared by two leads stays with the stronger one.
                chrom_claims.setdefault(partner, snp.rsid)
    took = time.time() - start
    print("LD prunning took", took)
    # Sort by chromosome first so positions of different chromosomes are not
    # interleaved in the genome-wide result.
    return sorted(accepted, key=attrgetter('chrom', 'pos')), ld_region

In [3]:
# tissues = ["ms"]

# Load the LD map for the whole genome
# Root directory of the results; `snp_files` is a template that takes a tissue
# name and points at that tissue's trans-eQTL file.
basedir = "/cbscratch/franco/trans-eqtl/dev-pipeline/gtex_v8_tejaas_permnull_sb0.1_knn/"
snp_files = basedir+"{:s}/trans_eqtls_maf0.05.txt"
# Chromosomes 1..22; `ldfile` is a template that takes the chromosome number.
chroms = np.arange(1,23)
ldfile="/cbscratch/franco/datasets/gtex_v8/genotypes/ldmap/chr{:d}_gtex_v8.geno.ld"

# LD_gw_dict maps chromosome number -> lookup returned by read_ldfile.
LD_gw_dict = dict()
# `start` is set once before the loop, so the seconds printed for each
# chromosome are cumulative since loading began, not per-chromosome.
start = time.time()
for chrm in chroms:
    print("Loading CHR ", chrm, end="")
    myldict = read_ldfile(ldfile.format(chrm))
    LD_gw_dict[chrm] = myldict
    took = time.time() - start
    print(" - {:g} seconds".format(took))


Loading CHR  1 - 7.90346 seconds
Loading CHR  2 - 15.3817 seconds
Loading CHR  3 - 21.5581 seconds
Loading CHR  4 - 27.6596 seconds
Loading CHR  5 - 33.4142 seconds
Loading CHR  6 - 40.0881 seconds
Loading CHR  7 - 45.0487 seconds
Loading CHR  8 - 51.1188 seconds
Loading CHR  9 - 54.6531 seconds
Loading CHR  10 - 59.3958 seconds
Loading CHR  11 - 65.224 seconds
Loading CHR  12 - 69.1064 seconds
Loading CHR  13 - 71.7154 seconds
Loading CHR  14 - 74.615 seconds
Loading CHR  15 - 77.2611 seconds
Loading CHR  16 - 81.7828 seconds
Loading CHR  17 - 84.4609 seconds
Loading CHR  18 - 86.6212 seconds
Loading CHR  19 - 88.9144 seconds
Loading CHR  20 - 91.0783 seconds
Loading CHR  21 - 92.3597 seconds
Loading CHR  22 - 94.0222 seconds


In [4]:
# Do the actual pruning on all datasets and stuff
pruned_snps = list()
for tissue in tissues[:1]:
    infile = snp_files.format(tissue)
    if os.path.exists(infile):
        # prune snps
        snplist = tejaas_saikat(infile)
        pruned_snps, ld_regions = prune_region_GW(snplist, LD_gw_dict)

        # write pruned snps
        pruned_outfile = infile+".ld_prune"
        # tejaas_saikat_write(pruned_snps, pruned_outfile)

LD prunning took 0.018336772918701172


In [5]:
# Display the lead rsid -> pruned positions mapping returned by prune_region_GW.
ld_regions

defaultdict(list,
            {'chr1_1309988_G_A_b38': [1330125,
              1387698,
              1259424,
              1306420,
              1295039,
              1280044],
             'chr1_1411323_A_C_b38': [1422081,
              1426261,
              1430190,
              1430908,
              1431450,
              1433374,
              1436079,
              1425013,
              1400410],
             'chr1_1487887_A_G_b38': [1492850,
              1470034,
              1483898,
              1486151,
              1482624,
              1527386,
              1490032,
              1495204,
              1511945,
              1459004,
              1516000,
              1530331],
             'chr1_3220635_T_C_b38': [3223849,
              3213452,
              3214671,
              3230687,
              3230466,
              3215860,
              3232844,
              3232010,
              3205757,
              3230198,
              3230727,
         

In [18]:

# Print one tab-separated line per lead SNP, chromosome by chromosome and in
# ascending position: chrom, rsid, pos, p, comma-joined pruned positions.
for chrm in chroms:
    chrm_snps = [x for x in pruned_snps if x.chrom == chrm]
    chrm_snps_sorted = sorted(chrm_snps, key=attrgetter('pos'), reverse=False)
    for leadsnp in chrm_snps_sorted:
        # ld_regions is a defaultdict(list): leads that absorbed no SNP give
        # an empty list here (and the lookup adds that key to the mapping).
        ldr = [str(x) for x in ld_regions[leadsnp.rsid]]
        str_fmt = "{:d}\t{:s}\t{:d}\t{:g}\t{:s}\n".format(leadsnp.chrom, leadsnp.rsid, leadsnp.pos, leadsnp.p, ",".join(ldr))
        # The string already ends in "\n", so print emits a blank line after
        # every record.
        print(str_fmt)

1	chr1_873251_G_A_b38	873251	1.12455e-10	832873

1	chr1_927744_G_T_b38	927744	3.1148e-09	926428,926744,927003,927009

1	chr1_965350_G_A_b38	965350	7.78551e-10	959193

1	chr1_1170732_A_G_b38	1170732	1e-30	

1	chr1_1284756_A_G_b38	1284756	2.35545e-12	1285574

1	chr1_1309988_G_A_b38	1309988	1e-30	1330125,1387698,1259424,1306420,1295039,1280044

1	chr1_1411323_A_C_b38	1411323	1e-30	1422081,1426261,1430190,1430908,1431450,1433374,1436079,1425013,1400410

1	chr1_1419214_A_G_b38	1419214	1.33216e-12	1421170

1	chr1_1452909_A_C_b38	1452909	1.11022e-16	1455617,1519934,1519938

1	chr1_1487887_A_G_b38	1487887	1e-30	1492850,1470034,1483898,1486151,1482624,1527386,1490032,1495204,1511945,1459004,1516000,1530331

1	chr1_1516110_G_A_b38	1516110	4.98108e-08	

1	chr1_1534915_T_C_b38	1534915	5.37348e-14	1533095,1533018,1532105,1533253,1521518,1520875,1515567

1	chr1_3195105_G_A_b38	3195105	3.9746e-14	3199797,3195853

1	chr1_3220635_T_C_b38	3220635	1e-30	3223849,3213452,3214671,3230687,3230466,3215860,323

# Clump trans-eQTL regions across all tissues to speed up the 1KG LD calculation

In [68]:
# first load all snps from all tissues
# NOTE: the slice `tissues[:2]` restricts this to the first two tissues.
# Tissues whose input file does not exist are skipped without a message.
alltranseqtls = list()
for tissue in tissues[:2]:
    infile = snp_files.format(tissue)
    if os.path.exists(infile):
        alltranseqtls += tejaas_saikat(infile)

In [69]:
# chr_list[i] holds the ascending positions of all loaded trans-eQTLs on
# chromosome i + 1 (index 0 is chromosome 1, index 21 is chromosome 22).
# Positions are not de-duplicated.
chr_list = list()
for CHR in np.arange(1,23):
    pos_list = [x.pos for x in alltranseqtls if x.chrom == CHR]
    chr_list.append(sorted(pos_list))

In [73]:
# Merge the +/- `window` intervals around every trans-eQTL position into
# non-overlapping regions, chromosome by chromosome.
# GW_regions[i] is the list of [start, end] regions of chromosome i + 1.
GW_regions = list()
for CHR in np.arange(0,22):
    print(CHR+1)
    # The merge below needs ascending positions; sorting here keeps the cell
    # correct even if chr_list was not sorted upstream.
    pos_list = sorted(int(x) for x in chr_list[CHR])
    regions  = list()
    window   = 200000
    start    = None
    end      = None
    for curr_pos in pos_list:
        if start is None:
            # First position of the chromosome opens the first region.
            start = curr_pos - window
        elif curr_pos - window > end:
            # big jump, save window
            print("Jump!")
            regions.append([start, end])
            start = curr_pos - window
        # Positions are ascending, so this only ever moves `end` forward.
        # Extending unconditionally guarantees that every position ends up
        # inside a region, including one lying just beyond the previous end.
        end = curr_pos + window
    if start is not None:
        # The region still open after the last position must be saved too;
        # otherwise the last region of every chromosome would be lost.
        regions.append([start, end])
    GW_regions.append(regions)

1
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
2
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
3
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
4
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
5
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
6
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
7
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
8
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
9
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump!
Jump

In [86]:
# Write every clumped region as one tab-separated line: chromosome number,
# region start, region end. An existing file at this path is overwritten.
with open("/cbscratch/franco/datasets/1KG_genomes/LD_regions2calculate.txt", 'w') as outstream:
    for chrm, l in enumerate(GW_regions):
        # GW_regions is indexed from 0, chromosomes are numbered from 1.
        chrm_num = chrm + 1
        for r in l:
            line = "{:d}\t{:d}\t{:d}\n".format(chrm_num, r[0], r[1])
            outstream.write(line)
