In [1]:
import os
import numpy as np
import gzip
import collections
import sys
sys.path.append('/usr/users/fsimone/tejaas')
from iotools import readgtf

In [2]:
# Root directory that holds every reference dataset used below.
base_dir = "/cbscratch/franco/datasets"

# GENCODE v26 annotation (gzipped GTF).
gtffile = os.path.join(base_dir, "GENCODE", "gencode.v26.annotation.gtf.gz")

# hg38 per-gene mappability and gene-pair cross-mappability tables (gzipped text).
genemapfile = os.path.join(base_dir, "crossmappability", "hg38_gene_mappability.txt.gz")
crossmapfile = os.path.join(base_dir, "crossmappability", "hg38_cross_mappability_strength.txt.gz")

In [3]:
# Parse the GENCODE annotation. Later cells read `.ensembl_id`, `.chrom`,
# `.start` and `.end` from the items of `gene_info`.
# NOTE(review): the meaning of trim=False is defined in iotools.readgtf, which is
# not visible here — presumably it keeps the version suffix of the gene ids; confirm.
gene_info = readgtf.gencode(gtffile, trim=False)

In [4]:
def read_crossmap(crossmapfile):
    """Read a gzipped, whitespace-delimited table of gene pairs.

    For every line, the first field is used as the key and the second field is
    appended to that key's list; any further fields on the line are ignored.
    Lines with fewer than two fields (for example a blank trailing line) are
    skipped instead of raising IndexError.

    Parameters
    ----------
    crossmapfile : str
        Path to the gzip-compressed table.

    Returns
    -------
    collections.defaultdict(list)
        Maps each first-column id (str) to the list of second-column ids (str),
        in file order. Looking up an unknown id with ``[]`` returns (and stores)
        an empty list; use ``.get`` to avoid growing the mapping.
    """
    gene_pairs = collections.defaultdict(list)
    with gzip.open(crossmapfile, 'rb') as instream:
        for line in instream:
            # bytes.split() with no argument already ignores leading/trailing whitespace.
            arr = line.split()
            if len(arr) < 2:
                continue
            gene_pairs[arr[0].decode('utf-8')].append(arr[1].decode('utf-8'))
    return gene_pairs

# Gene id -> list of cross-mapped gene ids, loaded once and reused by later cells.
cross_gene_dict = read_crossmap(crossmapfile)            

In [5]:
def read_genemapp(genemapfile):
    """Load a gzipped two-column table of per-gene mappability values.

    Every line must contain exactly two whitespace-separated fields: an id and
    a value. Rows whose value is the literal ``NA`` are left out.

    Returns a ``collections.defaultdict`` mapping id (str) to value (float);
    looking up an id that was not stored yields ``False``.
    """
    mappability = collections.defaultdict(lambda: False)
    with gzip.open(genemapfile, 'rb') as stream:
        for raw in stream:
            gene_id, mapp = raw.split()
            if mapp == b'NA':
                continue
            mappability[gene_id.decode('utf-8')] = float(mapp)
    return mappability

# Gene id -> mappability value; ids that were 'NA' or absent look up as False.
gene_mapp = read_genemapp(genemapfile)

In [6]:
import re

SNPINFO_FIELDS = [
    'chrom',
    'varid',
    'bp_pos',
    'ref_allele',
    'alt_allele',
    'maf',
]


class SnpInfo(collections.namedtuple('_SnpInfo', SNPINFO_FIELDS)):
    """Immutable per-variant record with the fields listed in SNPINFO_FIELDS."""

    # Keep instances as light as plain tuples (no per-instance __dict__).
    __slots__ = ()
    
def _gt_to_dosage(gt):
    """Convert a three-character genotype call such as '0/1' or '1|1' to a dosage.

    Returns the sum of the two allele codes as a float, or "." when the call
    is not exactly three characters long or either allele is missing.
    """
    if len(gt) == 3 and gt[0] != "." and gt[2] != ".":
        return float(int(gt[0]) + int(gt[2]))
    return "."


def read_vcf(filename, startsnp, endsnp, mode="DS", samplefile=None):
    """Read dosages for a range of variant records from a gzipped VCF.

    Variant records (lines not starting with '#') are counted from 0; records
    with ``startsnp <= index < endsnp`` are parsed and reading stops as soon as
    record ``endsnp`` is reached.

    Parameters
    ----------
    filename : str
        Path to the gzip-compressed VCF.
    startsnp, endsnp : int
        Half-open range of variant record indices to read.
    mode : {"DS", "GT"}
        "DS" takes the DS FORMAT field and falls back to the GT call for
        samples whose DS is "."; "GT" derives the dosage from the GT call.
        If a record has no DS field while in "DS" mode, the reader switches to
        "GT" for that record and for all following records.
    samplefile : str or None
        Optional sample file. Its first two lines are skipped, lines starting
        with '#' are ignored, and the first whitespace-separated field of each
        remaining line is taken as a sample name. Only samples that also appear
        in the VCF header are returned, in sample-file order.

    Returns
    -------
    (dosage, snpinfo, ids)
        ``dosage`` is ``np.array`` with one row per parsed record and one
        column per returned sample; missing values are replaced by twice the
        record's mean allele frequency. ``snpinfo`` is a list of ``SnpInfo``.
        ``ids`` are the VCF donor ids, or the retained sample names when
        ``samplefile`` is given.

    Raises
    ------
    RuntimeError
        If a record has no GT field in "GT" mode, or ``samplefile`` is given
        but does not exist.
    ZeroDivisionError
        If every dosage of a record is missing.
    ValueError
        If the chromosome (after removing a leading "chr") or the position is
        not an integer.

    Notes
    -----
    The ``maf`` stored in ``SnpInfo`` is the mean dosage divided by two; it is
    not folded to the minor allele.
    """
    dosage = list()
    snpinfo = list()
    donor_ids = list()  # stays empty if the file has no '#CHROM' header line
    linenum = 0
    with gzip.open(filename, 'r') as vcf:
        for line in vcf:
            linestrip = line.decode().strip()
            if linestrip[:2] == '##':
                continue
            if linestrip[:6] == '#CHROM':
                donor_ids = linestrip.split("\t")[9:]
                continue

            # Stop before decompressing anything beyond the requested range.
            if linenum >= endsnp:
                break

            if linenum >= startsnp:
                linesplit = linestrip.split("\t")
                chrom_label = linesplit[0]
                if chrom_label.startswith("chr"):
                    chrom_label = chrom_label[3:]
                chrom = int(chrom_label)
                pos   = int(linesplit[1])
                varid = linesplit[2]
                ref   = linesplit[3]
                alt   = linesplit[4]
                fmt   = linesplit[8].split(':')
                calls = linesplit[9:]

                if mode == "DS":
                    if "DS" not in fmt:
                        # Sticky switch: later records are read in GT mode too.
                        mode = "GT"
                    else:
                        dsindx = fmt.index("DS")
                        ds = [x.split(':')[dsindx] for x in calls]
                        # GT is only needed to rescue missing dosages.
                        if "GT" in fmt:
                            gtindx = fmt.index("GT")
                            for i, x in enumerate(ds):
                                if x == ".":
                                    ds[i] = _gt_to_dosage(calls[i].split(':')[gtindx])

                if mode == "GT":
                    if "GT" not in fmt:
                        raise RuntimeError("ERROR: no GT field in VCF file")
                    gtindx = fmt.index("GT")
                    ds = [_gt_to_dosage(x.split(':')[gtindx]) for x in calls]

                ds_notna = [float(x) for x in ds if x != "."]
                freq = sum(ds_notna) / 2 / len(ds_notna)
                maf = freq
                # Impute missing dosages with the expected dosage 2 * freq.
                snpdosage = [float(x) if x != '.' else 2 * freq for x in ds]

                this_snp = SnpInfo(chrom      = chrom,
                                   bp_pos     = pos,
                                   varid      = varid,
                                   ref_allele = ref,
                                   alt_allele = alt,
                                   maf        = maf)

                dosage.append(snpdosage)
                snpinfo.append(this_snp)
            linenum += 1

    if samplefile is None:
        return np.array(dosage), snpinfo, donor_ids

    if not os.path.exists(samplefile):
        raise RuntimeError("samplefile does not exist: {}".format(samplefile))

    samplenames = list()
    with open(samplefile, 'r') as samfile:
        # Skip the two header lines; tolerate files shorter than that.
        next(samfile, None)
        next(samfile, None)
        for line in samfile:
            fields = line.split()
            if line.startswith('#') or len(fields) == 0:
                continue
            samplenames.append(fields[0])

    # First-occurrence column of every donor (same result as list.index).
    donor_index = dict()
    for i, donor in enumerate(donor_ids):
        donor_index.setdefault(donor, i)

    common_ids = [x for x in samplenames if x in donor_index]
    print("GT Sample selection {:d} samples were retained from a total of {:d} samples".format(len(common_ids), len(donor_ids)))
    ix = [donor_index[x] for x in common_ids]
    if len(dosage) == 0:
        # np.array([]) is 1-D and cannot be column-indexed.
        selected = np.empty((0, len(ix)))
    else:
        selected = np.array(dosage)[:, ix]
    return selected, snpinfo, common_ids


def normalize_expr(Y):
    """Standardise every row of ``Y`` to zero mean and unit standard deviation.

    The mean and the (population, ddof=0) standard deviation are taken along
    axis 1. A ``pandas.DataFrame`` input yields a DataFrame with the same
    index, columns and index name; any other input is handled as an array and
    the scaled array is returned.
    """
    is_frame = isinstance(Y, pd.DataFrame)
    values = Y.values if is_frame else Y

    row_mean = np.mean(values, axis = 1).reshape(-1, 1)
    row_std = np.std(values, axis = 1).reshape(-1, 1)
    scaled = (values - row_mean) / row_std

    if not is_frame:
        return scaled

    Y_cent = pd.DataFrame(scaled, index=Y.index, columns=Y.columns)
    Y_cent.index.name = Y.index.name
    return Y_cent

def select_donors(vcf_donors, expr_donors):
    ''' Make sure that donors are in the same order for both expression and genotype.

        Donors present in both inputs are kept in the order of `vcf_donors`.
        Returns (vcfmask, exprmask): integer arrays holding, for every common
        donor, its first position in `vcf_donors` and in `expr_donors`.
        Both arrays have an integer dtype even when no donor is shared, so
        they can always be used to index an array.
    '''
    # First-occurrence lookup tables replace repeated list scans.
    vcf_index = dict()
    for i, donor in enumerate(vcf_donors):
        vcf_index.setdefault(donor, i)
    expr_index = dict()
    for i, donor in enumerate(expr_donors):
        expr_index.setdefault(donor, i)

    common_donors = [x for x in vcf_donors if x in expr_index]
    vcfmask = np.array([vcf_index[x] for x in common_donors], dtype=int)
    exprmask = np.array([expr_index[x] for x in common_donors], dtype=int)
    return vcfmask, exprmask

def select_genes(info, names):
    ''' Select genes which would be analyzed.
        Make sure the indices are not mixed up.

        Keeps the items of `info` whose `ensembl_id` occurs in `names`, in the
        order of `info`. Returns (genes, indices) where indices[k] is the first
        position of genes[k].ensembl_id in `names`. `indices` always has an
        integer dtype, also when nothing matches.
    '''
    # First-occurrence position of every name; avoids scanning `names` per gene.
    name_index = dict()
    for i, name in enumerate(names):
        name_index.setdefault(name, i)

    genes = [x for x in info if x.ensembl_id in name_index]
    indices = [name_index[x.ensembl_id] for x in genes]
    return genes, np.array(indices, dtype=int)

CISMASK_FIELDS = [
    'rmv_id',
    'apply2',
]


class CisMask(collections.namedtuple('_CisMask', CISMASK_FIELDS)):
    """Pairs a collection of ids (`rmv_id`) with the SNP indices (`apply2`) it applies to."""

    __slots__ = ()

    @property
    def nsnp(self):
        """Number of entries in `apply2`."""
        return len(self.apply2)

    def __repr__(self):
        # Standard namedtuple repr followed by the SNP count.
        base = super().__repr__()
        return '{0:s}, nsnp = {1:d}'.format(base, self.nsnp)

def _chrom_gene_arrays(geneinfo, ichrom):
    """Collect positions in `geneinfo`, starts and ends of the genes on one chromosome."""
    gene_ix = list()
    gstart = list()
    gend = list()
    for i, g in enumerate(geneinfo):
        if g.chrom == ichrom:
            gene_ix.append(i)
            gstart.append(g.start)
            gend.append(g.end)
    return np.array(gene_ix, dtype=int), np.array(gstart), np.array(gend)


def get_cismasklist(snpinfo, geneinfo, chrom, window=1e6):
    """For every SNP, list the genes lying within `window` bp of it.

    A gene is selected when its [start, end] interval overlaps
    [bp_pos - window, bp_pos + window] on the SNP's chromosome. This includes
    genes that extend beyond both sides of the window, and it does not depend
    on the order of `geneinfo` or `snpinfo`.

    Parameters
    ----------
    snpinfo : sequence
        Items providing `bp_pos` (and `chrom`, used only when `chrom` is None).
    geneinfo : sequence
        Items providing `chrom`, `start` and `end`.
    chrom : int or None
        If given, every SNP is matched against the genes of this chromosome,
        regardless of its own `chrom`; if None, each SNP uses `snp.chrom`.
    window : number
        Half-width of the window around each SNP, in base pairs.

    Returns
    -------
    list of np.ndarray
        One integer array per SNP with the positions (indices into `geneinfo`)
        of the selected genes, in `geneinfo` order. The array is empty (integer
        dtype) when no gene is selected.
    """
    # Per-chromosome gene tables, built the first time a chromosome is needed.
    chrom_cache = dict()
    genemasks = list()
    for snp in snpinfo:
        ichrom = chrom if chrom is not None else snp.chrom
        if ichrom not in chrom_cache:
            chrom_cache[ichrom] = _chrom_gene_arrays(geneinfo, ichrom)
        gene_ix, gstart, gend = chrom_cache[ichrom]

        left = snp.bp_pos - window
        right = snp.bp_pos + window
        # Interval overlap test: gene starts before the window ends and ends
        # after the window starts.
        overlap = (gstart <= right) & (gend >= left)
        genemasks.append(gene_ix[overlap])
    return genemasks

def compress_cismasklist(genemasks):
    """Collapse consecutive identical gene masks into CisMask records.

    `genemasks` holds one mask per SNP. Neighbouring SNPs whose masks compare
    equal with ``np.array_equal`` form one run; each run becomes a
    ``CisMask`` whose `apply2` lists the SNP indices of the run. For every run
    but the last, `rmv_id` is the first mask of the run; for the last run it
    is the final mask of the input. An empty input gives an empty list.
    """
    cismasks = list()
    run_mask = None
    run_snps = list()
    last_index = len(genemasks) - 1

    for isnp, mask in enumerate(genemasks):
        if isnp == 0:
            run_mask = mask

        if np.array_equal(mask, run_mask):
            run_snps.append(isnp)
        else:
            # The mask changed: close the previous run and start a new one here.
            cismasks.append(CisMask(rmv_id = run_mask, apply2 = run_snps))
            run_mask = mask
            run_snps = [isnp]

        if isnp == last_index:
            # No more masks to process: flush the run that is still open.
            cismasks.append(CisMask(rmv_id = mask, apply2 = run_snps))

    return cismasks

In [9]:
# Load real expression
# The table is read with the first column as index and the first row as header;
# the index is used as gene names and the columns as donor ids below.
import pandas as pd
tissue="as"
# - No corrections
df = pd.read_csv("/cbscratch/franco/trans-eqtl/new_preprocess_feb2020_freeze/gtex_v8/expression/tpms/{:s}_tpms_qcfilter.txt.protein_coding_lncRNA_filtered".format(tissue), header=0, index_col=0, sep="\t")
ngene, nsample = df.shape
gx_donors = list(df.columns)   # donor ids, in column order
gx = df.values                 # expression matrix, shape (ngene, nsample)
gene_names = list(df.index)    # gene ids, in row order


In [10]:
# Use real genotype
chrm=12
f_vcf = "/cbscratch/franco/datasets/gtex_v8/genotypes/vcfs_0.01/GTEX_v8_2019-07-29_WGS_838Indiv_Freeze_NoMissingGT_SNPfilter_MAF0.01_chr{:d}.vcf.gz".format(chrm)
samplefile = "/cbscratch/franco/datasets/gtex_v8/genotypes/gtex_v8.sample"
# Read the variant records with index 0 <= i < 50000 of this chromosome's VCF,
# restricted to the samples listed in `samplefile`.
gtfull, snpinfos, gt_donors = read_vcf(f_vcf, 0, 50000, samplefile=samplefile)

# NOTE(review): `copy` is not used in the visible cells — confirm it is needed.
import copy
# Index arrays for the donors shared by genotype and expression.
vcfmask, exprmask = select_donors(gt_donors, gx_donors)
# Annotated genes that are present in the expression table, plus their row indices.
genes, indices = select_genes(gene_info, gene_names)
# Genotype columns restricted to the shared donors.
dosage_masked = gtfull[:, vcfmask]

GT Sample selection 838 samples were retained from a total of 838 samples


In [11]:
## --- Obtain CisMasks
# One gene mask per SNP (indices into `genes`), then consecutive SNPs with
# identical masks are merged into CisMask records.

cismasklist = get_cismasklist(snpinfos, genes, chrm, window=1e6)
cismaskcomp = compress_cismasklist(cismasklist)

In [12]:
# Show the first parsed variant as a sanity check.
snpinfos[0]

SnpInfo(chrom=12, varid='chr12_50255_C_T_b38', bp_pos=50255, ref_allele='C', alt_allele='T', maf=0.012529832935560859)

In [27]:
# Extend every cis mask with the genes that cross-map to any gene already in it.
# ensembl_id -> position in `genes`
genes_ix_dict = dict(zip([x.ensembl_id for x in genes], np.arange(len(genes))))
crossmap_cismaskcomp = list()
for cismask in cismaskcomp:
    # Ids of all genes cross-mapped to the genes of this mask.
    # `.get` is used because `cross_gene_dict` is a defaultdict: indexing it
    # with `[]` would insert an empty list for every gene without partners.
    crossmapped_ids = set()
    for i in cismask.rmv_id:
        crossmapped_ids.update(cross_gene_dict.get(genes[i].ensembl_id, ()))

    # Keep only cross-mapped genes that are part of the analysed gene set.
    cm_gene2rmv_ix = [genes_ix_dict[g] for g in crossmapped_ids if g in genes_ix_dict]

    if len(cm_gene2rmv_ix) > 0:
        # Union of the cis genes and their cross-mapped genes, sorted and unique.
        new_mask_genes = sorted(set(list(cismask.rmv_id) + cm_gene2rmv_ix))
        new_cismask = cismask._replace(rmv_id = np.array(new_mask_genes))
        crossmap_cismaskcomp.append(new_cismask)
    else:
        # Nothing to add: keep the original mask unchanged.
        crossmap_cismaskcomp.append(cismask)

In [29]:
# Disabled check: prints, for each extended mask, how many duplicate indices it
# contains (length of rmv_id minus the number of unique values).
# for i in range(len(crossmap_cismaskcomp)):
#     print(len(crossmap_cismaskcomp[i].rmv_id) - len(set(crossmap_cismaskcomp[i].rmv_id)))

# Gene indices removed for the first compressed mask after the cross-map extension.
crossmap_cismaskcomp[0].rmv_id

array([   11,    24,    29, ..., 15649, 15654, 15668])

# How many cis-eGenes does GTEx have?

In [139]:
import numpy as np
import matplotlib.pyplot as plt
import os
import sys
import time
from statsmodels.distributions.empirical_distribution import ECDF
sys.path.append('../')
sys.path.append('/usr/users/fsimone/tejaas')
#from utils import readgtf
from utils import utils
import mpmath
import collections
from operator import attrgetter
import gzip

SNPRES_FIELDS = [
    'rsid',
    'chrom',
    'pos',
    'logp',
    'target',
    'maf',
]


class SNPRes(collections.namedtuple('_SNPRes', SNPRES_FIELDS)):
    """Immutable record for one variant-gene association with the fields in SNPRES_FIELDS."""

    # No per-instance __dict__; instances stay as small as plain tuples.
    __slots__ = ()

def read_cis(filepath):
    """Parse a gzipped, tab-separated table of variant-gene pairs into SNPRes records.

    The first line is skipped as a header. For every other line:
      * column 0 is the variant id, expected as ``chr<N>_<pos>_...``; rows whose
        id starts with ``chrX`` are dropped;
      * column 1 gives the gene: the text after the last ':' and before the
        first '.' is used as `target`;
      * column 5 is stored as `maf`;
      * column 6 is converted to ``-log10`` and stored as `logp`.

    If the file does not exist or has size zero, a message is printed and an
    empty list is returned.
    """
    hits = list()
    file_missing = not os.path.exists(filepath)
    if file_missing or os.stat(filepath).st_size == 0:
        print("File empty or does not exist")
        return hits

    with gzip.open(filepath, 'r') as mfile:
        next(mfile)  # header line
        for raw in mfile:
            fields = raw.decode().strip().split("\t")
            variant = fields[0]
            if variant.startswith("chrX"):
                continue
            id_parts = variant.split("_")
            position = int(id_parts[1])
            chromosome = int(id_parts[0][3:])   # drop the leading "chr"
            target_gene = fields[1].split(":")[-1].split(".")[0]
            freq = float(fields[5])
            neg_log10_p = -np.log10(float(fields[6]))
            hits.append(SNPRes(rsid=variant,
                               chrom=chromosome,
                               pos=position,
                               logp=neg_log10_p,
                               target=target_gene,
                               maf=freq))
    return hits

In [142]:
# Load tissue metadata and the GTEx significant cis-eQTL pairs for one tissue.
import json
json_file = "../gtex_v8_metadata.json"
with open(json_file) as instream:
    gtex_meta = json.load(instream)
    
# Short tissue codes and their full descriptions, read in parallel.
tissue_file = "/usr/users/fsimone/trans-eqtl-pipeline/main/tissues.txt"
tissues, descriptions = utils.read_tissues(tissue_file)
tissue_names   = dict()   # short code -> full description
tissue_colors  = dict()   # short code -> "#RRGGBB"-style string built from "colorHex"
tissue_samples = dict()   # short code -> "rnaSeqAndGenotypeSampleCount" entry
for tshort, tfull in zip(tissues, descriptions):
    tissue_names[tshort] = tfull
    # The metadata is keyed by the full description with spaces replaced by underscores.
    tissue_colors[tshort] = "#" + gtex_meta[tfull.replace(" ", "_")]["colorHex"]
    tissue_samples[tshort] = gtex_meta[tfull.replace(" ", "_")]["rnaSeqAndGenotypeSampleCount"]

tissue="as"
gtexportal_dir = "/cbscratch/franco/datasets/gtex_v8/expression/gtex_portal/eQTLs/GTEx_Analysis_v8_eQTL/"
# NOTE(review): the settings below are not used in the visible cells — confirm
# they are consumed further down.
dataset = "gtex_v8"
tejaas_expr = "raw"
K = 30
pcutoff = 5e-8
MIN_TRANS = 1
MIN_CIS   = 1

signif_cisfile = os.path.join(gtexportal_dir, "{:s}.v8.signif_variant_gene_pairs.txt.gz".format(tissue_names[tissue].replace(" ", "_")))
# Only warns here; read_cis performs the same check and returns an empty list.
if not os.path.exists(signif_cisfile) or os.stat(signif_cisfile).st_size == 0:
    print("{:s} has no cis-file in GTEx!".format(tissue_names[tissue]))
ciseqtls = read_cis(signif_cisfile)

In [143]:
# Displays the complete list of cis-eQTL records.
# NOTE(review): consider showing only a slice (e.g. the first few records) to keep the output short.
ciseqtls

[SNPRes(rsid='chr1_64764_C_T_b38', chrom=1, pos=64764, logp=7.992845622625191, target='ENSG00000227232', maf=0.0611015),
 SNPRes(rsid='chr1_665098_G_A_b38', chrom=1, pos=665098, logp=6.064291252157337, target='ENSG00000227232', maf=0.111015),
 SNPRes(rsid='chr1_666028_G_A_b38', chrom=1, pos=666028, logp=5.206855951966113, target='ENSG00000227232', maf=0.0972461),
 SNPRes(rsid='chr1_108826_G_C_b38', chrom=1, pos=108826, logp=3.922541247632173, target='ENSG00000269981', maf=0.0344234),
 SNPRes(rsid='chr1_126108_G_A_b38', chrom=1, pos=126108, logp=3.8504852980993123, target='ENSG00000269981', maf=0.0628227),
 SNPRes(rsid='chr1_133160_G_A_b38', chrom=1, pos=133160, logp=3.7081784404175417, target='ENSG00000269981', maf=0.070568),
 SNPRes(rsid='chr1_134234_G_A_b38', chrom=1, pos=134234, logp=3.662331129895448, target='ENSG00000269981', maf=0.0688468),
 SNPRes(rsid='chr1_135032_G_A_b38', chrom=1, pos=135032, logp=3.6097309685349512, target='ENSG00000269981', maf=0.0671256),
 SNPRes(rsid='chr