In [15]:
import sys, os
import numpy as np
import pandas as pd
import collections

gtex_v8_variant_table = "/cbscratch/franco/datasets/gtex_v8/metadata/variant_b37_rs_id_dbSNP151_GRCh38p7_map.tsv"
fst_file = "/cbscratch/franco/from_saikat/EUR-AFR.weir.fst"
chroms = [str(x) for x in range(1,23)]

gtex_variant_dict = dict()
gtex_rsid_dict = dict()
for chrm in range(1,23):
    gtex_variant_dict[chrm] = collections.defaultdict(lambda: False)
    
print("Loading gtex variants")
with open(gtex_v8_variant_table) as instream:
    next(instream)
    for line in instream:
        varid, rsid = line.strip().split()
        chrom = varid.split("_")[0][3:]
        if chrom in chroms:
            pos   = int(varid.split("_")[1])
            gtex_variant_dict[int(chrom)][pos] = varid
            gtex_rsid_dict[varid] = rsid

Loading gtex variants


In [30]:
print("Loading fst values")
gtex_fst_dict = dict()
fst_dict = dict()
for chrm in range(1, 23):
    fst_dict[chrm] = collections.defaultdict(lambda: False)
with open(fst_file) as instream:    
    next(instream)
    for line in instream:
        arr = line.strip().split()
        if len(arr) > 0:
            chrom = int(arr[0])
            pos   = int(arr[1])
            fstval= float(arr[2])
            fst_dict[chrom][pos] = fstval
            if gtex_variant_dict[chrom][pos]:
                gtex_fst_dict[gtex_variant_dict[chrom][pos]] = fstval

Loading fst values


In [42]:
tissue = "sse"
basedir = "/cbscratch/franco/trans-eqtl/protein_coding_lncRNA_gamma01_knn30_cut5e-8"
outdir = "/cbscratch/franco/trans-eqtl/protein_coding_lncRNA_gamma01_knn30_cut5e-8_fst015/sse"
if not os.path.exists(outdir): os.makedirs(outdir)
trans_file = os.path.join(basedir, tissue, "trans_eqtls.txt")

not_fst = 0
high_fst = 0
teqtls_list = collections.defaultdict(lambda: False)
with open(os.path.join(outdir, "trans_eqtls.txt"), 'w') as outstream:
    with open(trans_file) as instream:
        header = instream.readline()
        outstream.write(header)
        for line in instream:
            arr = line.strip().split()
            varid = arr[0]
            chrom = int(varid.split("_")[0][3:])
            pos   = int(varid.split("_")[1])
            if pos in fst_dict[chrom]:
                if fst_dict[chrom][pos] <= 0.15:
                    outstream.write(line)
                    teqtls_list[varid] = True
                else:
                    high_fst += 1
            else:
                not_fst += 1
                # print(f"{varid} not in Fst list")
print(f"{not_fst} trans-eqtls not found in Fst list")
print(f"{high_fst} trans-eqtls with high Fst")

8 trans-eqtls not found in Fst list
1513 trans-eqtls with high Fst


In [43]:
for genefile in genefiles:
    targets = os.path.join(basedir, tissue, genefile)
    with open(targets) as instream:
        with open(os.path.join(outdir, genefile), 'w') as outstream:
            header = instream.readline()
            outstream.write(header)
            for line in instream:
                varid = line.strip().split()[0]
                if teqtls_list[varid]:
                    outstream.write(line)

In [47]:
tissue = "sse"
basedir = "/cbscratch/franco/trans-eqtl/protein_coding_lncRNA_gamma01_knn30_cut5e-8"
outdir = "/cbscratch/franco/trans-eqtl/protein_coding_lncRNA_gamma01_knn30_cut5e-8_fst015/sse"
trans_file = os.path.join(basedir, tissue, "trans_eqtls.txt")

In [50]:
print(os.path.basename(trans_file))
print(os.path.dirname(trans_file))

trans_eqtls.txt
/cbscratch/franco/trans-eqtl/protein_coding_lncRNA_gamma01_knn30_cut5e-8/sse


In [51]:
gtex_variant_dict[12]

defaultdict(<function __main__.<lambda>()>,
            {10623: 'chr12_10623_C_T_b38',
             10642: 'chr12_10642_C_T_b38',
             10652: 'chr12_10652_G_C_b38',
             10664: 'chr12_10664_T_C_b38',
             10669: 'chr12_10669_G_A_b38',
             10674: 'chr12_10674_G_A_b38',
             10700: 'chr12_10700_G_A_b38',
             10703: 'chr12_10703_C_T_b38',
             10975: 'chr12_10975_G_T_b38',
             10985: 'chr12_10985_C_T_b38',
             11012: 'chr12_11012_A_G_b38',
             11030: 'chr12_11030_G_C_b38',
             11032: 'chr12_11032_C_T_b38',
             11035: 'chr12_11035_G_T_b38',
             11089: 'chr12_11089_G_C_b38',
             11093: 'chr12_11093_G_A_b38',
             11117: 'chr12_11117_G_A_b38',
             11136: 'chr12_11136_C_T_b38',
             11170: 'chr12_11170_C_G_b38',
             11236: 'chr12_11236_C_T_b38',
             11289: 'chr12_11289_G_A_b38',
             11300: 'chr12_11300_C_T_b38',
          