In [33]:
import os, sys
sys.path.append('../../')
from utils import utils
import json
import numpy as np
import gzip
import collections

annot_file = "/cbscratch/franco/datasets/gtex_v8/expression/WGS_Feature_overlap_collapsed_VEP_short_4torus.MAF01.txt.gz"
snp_file   = "/cbscratch/franco/datasets/gtex_v8/genotypes/vcfs_SHAPEIT2/0.01/gtex_v8_snpinfo_SHAPEIT2.txt"
# Subset full set of annotations to our genotype
out_file = "/cbscratch/franco/datasets/gtex_v8/expression/GTEx_v8_Features_subset_SHAPEIT2_genotype.MAF01.txt.gz"

if not os.path.exists(out_file):
    print("Reading Feature annotation")
    # Collect the SNP ids (second whitespace-separated field of snp_file).
    # A plain set is used on purpose: membership tests on a defaultdict insert
    # every missing key, so memory would grow with the annotation file.
    genotype_snps = set()
    with open(snp_file) as instream:
        for line in instream:
            arr = line.split()
            if len(arr) > 1:  # ignore blank or truncated lines
                genotype_snps.add(arr[1])

    print("Writing custom genotype annotations")
    # Write to a temporary file and rename it only once complete; otherwise an
    # interrupted run leaves a truncated out_file that the existence check
    # above would accept as finished on the next run.
    tmp_file = out_file + ".tmp"
    try:
        with gzip.open(annot_file, 'rb') as instream, gzip.open(tmp_file, 'wb') as outstream:
            header = next(instream)
            outstream.write(header)
            for line in instream:
                # The first tab-separated field of the annotation file is the SNP id.
                snp_id = line.decode().strip().split("\t", 1)[0]
                if snp_id in genotype_snps:
                    outstream.write(line)
        os.replace(tmp_file, out_file)
    finally:
        # Only present here if something failed before the rename.
        if os.path.exists(tmp_file):
            os.remove(tmp_file)
else:
    print("File exists!")

File exists!


In [34]:
import pandas as pd
# Load the subset annotation table from out_file: tab-separated, the first
# line supplies the column names and the first column becomes the row index.
df = pd.read_csv(out_file, header=0, sep="\t", index_col=0)


In [35]:
# Preview the first rows of the annotation table (one row per SNP id).
df.head()

Unnamed: 0_level_0,enhancer_d,promoter_d,open_chromatin_region_d,promoter_flanking_region_d,CTCF_binding_site_d,TF_binding_site_d,3_prime_UTR_variant_d,5_prime_UTR_variant_d,frameshift_variant_d,intron_variant_d,missense_variant_d,non_coding_transcript_exon_variant_d,splice_acceptor_variant_d,splice_donor_variant_d,splice_region_variant_d,stop_gained_d,synonymous_variant_d
SNP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
chr1_13550_G_A_b38,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
chr1_14677_G_A_b38,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
chr1_16841_G_T_b38,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
chr1_16856_A_G_b38,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
chr1_17005_A_G_b38,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [36]:
# (number of rows, number of annotation columns) of the loaded table.
df.shape

(8048655, 17)

In [93]:
## Background frequencies
import numpy as np

# Fraction of all rows in df carrying each annotation column; these are the
# reference frequencies stored in bg, keyed by column name.
tot = df.shape[0]
annot_matrix = df.values
bg = {}
for typ, n in zip(df.columns, annot_matrix.sum(axis=0)):
    freq = n / tot
    bg[typ] = freq
    print("{:s}\t{:d}\t{:g}".format(typ, n, freq))

# Rows whose annotation columns sum to zero carry no annotation at all.
not_annot = (annot_matrix.sum(axis=1) == 0).sum()
bg['not_annot'] = not_annot / tot
print("{:s}\t{:d}\t{:g}".format('not_annot', not_annot, not_annot / tot))

enhancer_d	63300	0.00786467
promoter_d	62692	0.00778913
open_chromatin_region_d	190956	0.0237252
promoter_flanking_region_d	179666	0.0223225
CTCF_binding_site_d	217021	0.0269636
TF_binding_site_d	34386	0.00427227
3_prime_UTR_variant_d	83841	0.0104168
5_prime_UTR_variant_d	21382	0.00265659
frameshift_variant_d	0	0
intron_variant_d	4012743	0.498561
missense_variant_d	28796	0.00357774
non_coding_transcript_exon_variant_d	138288	0.0171815
splice_acceptor_variant_d	371	4.60947e-05
splice_donor_variant_d	696	8.64741e-05
splice_region_variant_d	8801	0.00109347
stop_gained_d	367	4.55977e-05
synonymous_variant_d	30129	0.00374336
not_annot	3430119	0.426173


In [127]:
tissue_file = "/usr/users/fsimone/trans-eqtl-pipeline/analysis/plots/tissue_table.txt"
tissues, descriptions, tstrings = utils.read_tissues_str(tissue_file)

json_file = "../../gtex_v8_metadata.json"
with open(json_file) as instream:
    gtex_meta = json.load(instream)

# Per-tissue lookups keyed by the short tissue id (the entries of `tissues`).
tissue_colors, tissue_names, tissue_samples = {}, {}, {}

# Maps each entry of `tstrings` to the short tissue id at the same position.
tshorst_dict = {tstring: tshort for tstring, tshort in zip(tstrings, tissues)}

for tshort, tfull in zip(tissues, descriptions):
    tissue_names[tshort] = tfull
    # Metadata keys are the tissue descriptions with spaces replaced by underscores.
    meta = gtex_meta[tfull.replace(" ", "_")]
    tissue_colors[tshort] = "#" + meta["colorHex"]
    tissue_samples[tshort] = meta["rnaSeqAndGenotypeSampleCount"]

# Tissues that are read from a different result directory when loading trans-eQTLs.
special_tissues = ['ag', 'haa', 'liv', 'msg', 'pan', 'pit', 'si', 'spl', 'va', 'wb']

In [128]:
import mpmath
import collections
mpmath.mp.dps = 50
def pvalue(x):
    """Return log10 of the standard-normal upper-tail probability at ``x``.

    Computes log10(P(Z > x)) = log10(0.5 * erfc(x / sqrt(2))) with mpmath and
    converts the result to a Python float.

    The complementary error function is used instead of ``1 - 0.5 * (1 + erf)``:
    for large ``x`` erf rounds to exactly 1 at the working precision, the
    subtraction then yields 0 and the tail probability is lost. Large ``x`` is
    the case this function is called for (p-values reported as 0).
    """
    return float(mpmath.log10(0.5 * mpmath.erfc(x / mpmath.sqrt(2))))

SNPRES_FIELDS = ['rsid', 'chrom', 'pos', 'logp', 'maf']


class SNPRes(collections.namedtuple('_SNPRes', SNPRES_FIELDS)):
    """Immutable record for one SNP result.

    Fields, in order: ``rsid``, ``chrom``, ``pos``, ``logp``, ``maf``.
    """

    # Empty slots: instances stay plain tuples and accept no extra attributes.
    __slots__ = ()
    
def tejaas(filepath, mafcutoff=0.01):
    """Parse a tab-separated result file into a list of SNPRes records.

    The first line is skipped as a header. From each remaining line the
    columns are read by position: 0 = SNP id, 1 = chromosome (int),
    2 = position (int), 3 = maf, 4 = q, 5 = mu, 6 = sigma, 7 = p.

    Lines are dropped when maf lies outside [mafcutoff, 1 - mafcutoff] or when
    sigma is exactly 0. The ``logp`` field of each record holds -log10(p);
    when p is exactly 0 it is derived from ``pvalue((q - mu) / sigma)``.

    Parameters
    ----------
    filepath : path of the file to read.
    mafcutoff : minor-allele-frequency threshold (default 0.01).

    Returns
    -------
    list of SNPRes, in file order.
    """
    records = []
    with open(filepath, 'r') as handle:
        next(handle)  # header line
        for row in handle:
            cols  = row.strip().split("\t")
            rsid  = cols[0]
            pos   = int(cols[2])
            p     = float(cols[7])
            chrom = int(cols[1])
            q     = float(cols[4])
            mu    = float(cols[5])
            sigma = float(cols[6])
            maf   = float(cols[3])
            if maf < mafcutoff or maf > (1 - mafcutoff) or sigma == 0:
                continue
            if p != 0:
                log10_p = np.log10(p)
            else:
                # p is reported as exactly 0: recompute from the standardised statistic.
                log10_p = pvalue((q - mu) / sigma)
            records.append(SNPRes(rsid, chrom, pos, -log10_p, maf))
    return records

# transeqtl_dict: tissue id -> list of SNPRes; alltranseqtls: all tissues concatenated.
alltranseqtls = list()
transeqtl_dict = dict()
# Initial template; it is replaced on every iteration of the loop below.
tejaas_file = "/cbscratch/franco/trans-eqtl/protein_coding_lncRNA_gamma01_knn30_cut5e-8/{:s}/trans_eqtls.txt"
for tissue in tissues:
    # Tissues listed in special_tissues are read from the "optim_gamma" directory.
    tejaas_file = (
        "/cbscratch/franco/trans-eqtl/protein_coding_lncRNA_optim_gamma_knn30_cut5e-8/{:s}/trans_eqtls_ldpruned.txt"
        if tissue in special_tissues
        else "/cbscratch/franco/trans-eqtl/protein_coding_lncRNA_gamma01_knn30_cut5e-8/{:s}/trans_eqtls_ldpruned.txt"
    )
    tissue_tejaas_file = tejaas_file.format(tissue)
    tissue_trans_eqtls = tejaas(tissue_tejaas_file)
    transeqtl_dict[tissue] = tissue_trans_eqtls
    alltranseqtls.extend(tissue_trans_eqtls)

In [143]:
import scipy.stats as ss

## Count occurrence of trans-eQTL in different tissues
teqtl_ntissues = collections.defaultdict(int)
for snp in alltranseqtls:
    teqtl_ntissues[snp.rsid] += 1

# teqtl_cats[tissue][category] -> dict(counts, frac, enrichment, pval)
teqtl_cats = dict()
for tissue in tissues:
    teqtl_cats[tissue] = dict()
    # De-duplicate (keeping order) so that the category counts and the
    # denominator ntot are computed over exactly the same set of SNPs.
    ids = list(dict.fromkeys(snp.rsid for snp in transeqtl_dict[tissue]))
    ntot = len(ids)
    df_slice = df.loc[ids]
    cat_counts = np.sum(df_slice, axis=0)
    for c in cat_counts.index:
        frac = cat_counts[c] / ntot if ntot else float('nan')
        teqtl_cats[tissue][c] = dict(
            counts = int(cat_counts[c]),
            frac = float(frac),
            # A category with zero background frequency has no defined
            # enrichment; store NaN explicitly instead of dividing by zero.
            enrichment = float(frac / bg[c]) if bg[c] > 0 else float('nan'),
            # One-sided ("greater") binomial p-value, P(X >= count). This is what
            # ss.binom_test(..., alternative='greater') computed; binom_test
            # itself is no longer available in recent SciPy releases.
            pval = float(ss.binom.sf(cat_counts[c] - 1, ntot, bg[c])))
    not_annot = np.sum(np.sum(df_slice, axis=1) == 0)
    frac = not_annot / ntot if ntot else float('nan')
    teqtl_cats[tissue]['not_annot'] = dict(
        counts = int(not_annot),
        frac = float(frac),
        enrichment = float(frac / bg['not_annot']) if bg['not_annot'] > 0 else float('nan'),
        pval = float(ss.binom.sf(not_annot - 1, ntot, bg['not_annot'])))



In [144]:
## Get category counts across all trans-eqtls in all tissues
teqtl_cats['all'] = dict()
all_rsids = list(teqtl_ntissues.keys())
ntot = len(set(all_rsids))
df_slice = df.loc[all_rsids]
cat_counts = np.sum(df_slice, axis=0)
for c in cat_counts.index:
    frac = cat_counts[c] / ntot if ntot else float('nan')
    teqtl_cats['all'][c] = dict(
        counts = int(cat_counts[c]),
        frac = float(frac),
        # A category with zero background frequency has no defined enrichment;
        # store NaN explicitly instead of dividing by zero.
        enrichment = float(frac / bg[c]) if bg[c] > 0 else float('nan'),
        # One-sided ("greater") binomial p-value, P(X >= count). This is what
        # ss.binom_test(..., alternative='greater') computed; binom_test itself
        # is no longer available in recent SciPy releases.
        pval = float(ss.binom.sf(cat_counts[c] - 1, ntot, bg[c])))
not_annot = np.sum(np.sum(df_slice, axis=1) == 0)
frac = not_annot / ntot if ntot else float('nan')
teqtl_cats['all']['not_annot'] = dict(
    counts = int(not_annot),
    frac = float(frac),
    enrichment = float(frac / bg['not_annot']) if bg['not_annot'] > 0 else float('nan'),
    pval = float(ss.binom.sf(not_annot - 1, ntot, bg['not_annot'])))

  if sys.path[0] == '':


In [145]:
## Get category counts across trans-eqtls in 1, 2, 3, 4, 5+ tissues
# Bin the SNP ids by the number of tissues in which they were found.
n1_rsids = [rsid for rsid, ntis in teqtl_ntissues.items() if ntis == 1]
n2_rsids = [rsid for rsid, ntis in teqtl_ntissues.items() if ntis == 2]
n3_rsids = [rsid for rsid, ntis in teqtl_ntissues.items() if ntis == 3]
n4_rsids = [rsid for rsid, ntis in teqtl_ntissues.items() if ntis == 4]
n5_rsids = [rsid for rsid, ntis in teqtl_ntissues.items() if ntis >= 5]

# Report the size of every bin, first as counts and then as fractions of all SNP ids.
ntot = len(all_rsids)
bin_sizes = [len(rsids) for rsids in (n1_rsids, n2_rsids, n3_rsids, n4_rsids, n5_rsids)]
print(*bin_sizes)
print(*[size / ntot for size in bin_sizes])

ncombinations = {
    'n1': n1_rsids,
    'n2': n2_rsids,
    'n3': n3_rsids,
    'n4': n4_rsids,
    'n5': n5_rsids,
}

20811 4031 1596 778 1211
0.73208569317902 0.14180180814014845 0.05614380694410244 0.027368346994054948 0.04260034474267422


In [146]:
# Category statistics for each bin of SNP ids held in ncombinations.
for k in ncombinations:
    ids = ncombinations[k]
    ntot = len(set(ids))
    print(k, ntot)
    teqtl_cats[k] = dict()
    df_slice = df.loc[ids]
    cat_counts = np.sum(df_slice, axis=0)
    for c in cat_counts.index:
        frac = cat_counts[c] / ntot if ntot else float('nan')
        teqtl_cats[k][c] = dict(
            counts = int(cat_counts[c]),
            frac = float(frac),
            # A category with zero background frequency has no defined
            # enrichment; store NaN explicitly instead of dividing by zero.
            enrichment = float(frac / bg[c]) if bg[c] > 0 else float('nan'),
            # One-sided ("greater") binomial p-value, P(X >= count). This is what
            # ss.binom_test(..., alternative='greater') computed; binom_test
            # itself is no longer available in recent SciPy releases.
            pval = float(ss.binom.sf(cat_counts[c] - 1, ntot, bg[c])))
    not_annot = np.sum(np.sum(df_slice, axis=1) == 0)
    frac = not_annot / ntot if ntot else float('nan')
    teqtl_cats[k]['not_annot'] = dict(
        counts = int(not_annot),
        frac = float(frac),
        enrichment = float(frac / bg['not_annot']) if bg['not_annot'] > 0 else float('nan'),
        pval = float(ss.binom.sf(not_annot - 1, ntot, bg['not_annot'])))

n1 20811
n2 4031
n3 1596
n4 778
n5 1211


  if sys.path[0] == '':


In [147]:
# Serialise the nested teqtl_cats dictionary to a JSON string and write it to
# a file in the current working directory.
# NOTE(review): json.dumps emits any NaN value as the bare token NaN, which
# strict JSON parsers reject; Python's json.load reads it back without issue.
json_dict = json.dumps(teqtl_cats)
with open("gtex_v8_feature_enrichments.txt", 'w') as outstream:
    outstream.write(json_dict)