In [1]:
%load_ext autoreload
%autoreload 2
import os
import numpy as np
import pandas as pd
import gzip
import collections
import sys
sys.path.append('/usr/users/fsimone/tejaas')
from iotools import readgtf
from utils import cismasking

sys.path.append('../../')
import operator
import json
from utils import utils
json_file = "../../gtex_v8_metadata.json"
tissue_file = "/usr/users/fsimone/trans-eqtl-pipeline/analysis/plots/tissue_table.txt"
tshorts, tfulls_plain = utils.read_tissues(tissue_file, plain=True)
_, tfulls = utils.read_tissues(tissue_file)
with open(json_file) as instream:
    gtex_meta = json.load(instream)
tissue_colors  = dict()
tissue_names   = dict()
tissue_samples = dict()
for tshort, tfull in zip(tshorts, tfulls_plain):
    tissue_names[tshort] = tfull
for tshort, tfull in zip(tshorts, tfulls):
    tissue_colors[tshort] = "#" + gtex_meta[tfull.replace(" ", "_")]["colorHex"]
    tissue_samples[tshort] = gtex_meta[tfull.replace(" ", "_")]["rnaSeqAndGenotypeSampleCount"]

brain_tissues = ['bam','ban','bca','bceh','bce','bco','bfr','bhi','bhy','bnu','bpu','bsp','bsu']   
optim_tissues = ['haa', 'pan', 'spl', 'wb']


In [2]:
base_dir = "/cbscratch/franco/datasets"
gtffile = os.path.join(base_dir, "GENCODE/gencode.v26.annotation.gtf.gz")
genemapfile  = os.path.join(base_dir, "crossmappability/hg38_gene_mappability.txt.gz")
crossmapfile = os.path.join(base_dir, "crossmappability/hg38_cross_mappability_strength.txt.gz")

In [3]:
SNPINFO_FIELDS = ['chrom', 'varid', 'bp_pos', 'ref_allele', 'alt_allele', 'maf']
class SnpInfo(collections.namedtuple('_SnpInfo', SNPINFO_FIELDS)):
    __slots__ = ()

    
def Snp2info(snps_list):
    snpinfo = list()
    for snpid in snps_list:
        arr = snpid.split("_")
        chrom = int(arr[0][3:])
        pos = int(arr[1])
        this_snp = SnpInfo(chrom      = chrom,
                           bp_pos     = pos,
                           varid      = snpid,
                           ref_allele = "X",
                           alt_allele = "Y",
                           maf        = None)
        snpinfo.append(this_snp)
    return snpinfo

def knn_correction(expr, dosage, K, f=1):
    assert (expr.shape[0] == dosage.shape[1])
    pca = PCA(n_components=int(f * min(expr.shape[0], expr.shape[1]) ))
#     print("Original dimension: {:d} x {:d}".format(expr.shape[0], expr.shape[1]))
    pca.fit(expr) # requires N x G
    expr_pca = pca.transform(expr)
#     print("Reduced dimension: {:d} x {:d}".format(expr_pca.shape[0], expr_pca.shape[1]))

    def gene_distance(a, b):
        return np.linalg.norm(a - b)

    nsample = expr.shape[0]
    distance_matrix = np.zeros((nsample, nsample))
    for i in range(nsample):
        for j in range(i+1, nsample):
            dist = gene_distance(expr_pca[i,:], expr_pca[j,:])
            distance_matrix[i, j] = dist
            distance_matrix[j, i] = dist

    kneighbor = K
    gx_knn = np.zeros_like(expr)
    gt_knn = np.zeros_like(dosage)
    neighbor_list = list()

    for i in range(nsample):
        neighbors = np.argsort(distance_matrix[i, :])[:kneighbor + 1][1:]
        gx_knn[i, :] = expr[i, :] - np.mean(expr[neighbors, :], axis = 0)
        # noisy_neighbors = np.random.choice(neighbors, size = int(2 * kneighbor / 3), replace = False)
        # noisy_neighbors = np.random.choice(neighbors, size = kneighbor, replace = True )
        noisy_neighbors = neighbors
        gt_knn[:, i] = dosage[:, i] - np.mean(dosage[:, noisy_neighbors], axis = 1)
        neighbor_list.append(neighbors)

    return gx_knn, gt_knn
    
def extend_cismask(genes, cismaskcomp, cross_gene_dict):
    # cross_gene_dict = read_crossmap(crossmapfile)
    genes_ix_dict = dict(zip([x.ensembl_id for x in genes], np.arange(len(genes))))
    crossmap_cismaskcomp = list()
    for cismask in cismaskcomp:
        gene_list = list()  # list of crossmapped genes for the cismask
        for gene in [genes[i] for i in cismask.rmv_id]:
            gene_list += cross_gene_dict[gene.ensembl_id]
        uniq_gene_list = list(set(gene_list))
        ugene_dict = collections.defaultdict(lambda: False) # significant speed-up by making a dict
        for g in uniq_gene_list:
            ugene_dict[g] = True
        cm_gene2rmv_ix = [genes_ix_dict[x.ensembl_id] for x in genes if ugene_dict[x.ensembl_id]]
        if len(cm_gene2rmv_ix) > 0:
            new_mask_genes = sorted(list(set(list(cismask.rmv_id) + cm_gene2rmv_ix)))
            new_cismask = cismask._replace(rmv_id = np.array(new_mask_genes))
            crossmap_cismaskcomp.append(new_cismask)
        else:
            crossmap_cismaskcomp.append(cismask)
    return crossmap_cismaskcomp

def read_tissue_genes(gx_file, geneinfo_dict):
    genes = list()
    with open(gx_file) as inst:
        next(inst)
        for line in inst:
            ensembl_id = line.split()[0]
            genes.append(geneinfo_dict[ensembl_id])
    return genes

def read_crossmap(crossmapfile):
    gene_pairs = collections.defaultdict(list)
    with gzip.open(crossmapfile, 'r') as instream:
        for line in instream:
            arr = line.rstrip().split()
            gene_pairs[arr[0].decode('utf-8')].append(arr[1].decode('utf-8'))
    return gene_pairs

cross_gene_dict = read_crossmap(crossmapfile) 

In [4]:
geneinfo = readgtf.gencode(gtffile, trim=False)

### reformat gene info
geneinfo_dict = dict()
for g in geneinfo:
    geneinfo_dict[g.ensembl_id] = g

In [5]:
import mpmath
mpmath.mp.dps = 50
def pvalue(x): return float(mpmath.log10(1 - 0.5 * (1 + mpmath.erf(x/mpmath.sqrt(2)))))

SNPRES_FIELDS = ['rsid', 'chrom', 'pos', 'logp', 'maf']
class SNPRes(collections.namedtuple('_SNPRes', SNPRES_FIELDS)):
    __slots__ = ()
    
def tejaas(filepath):
    res = list()
    with open(filepath, 'r') as mfile:
        next(mfile)
        for line in mfile:
            arr   = line.strip().split("\t")
            rsid  = arr[0]
            chrom = int(arr[1])
            pos   = int(arr[2])
            maf   = float(arr[3])
            q     = float(arr[4])
            mu    = float(arr[5])
            sigma = float(arr[6])
            p     = float(arr[7])
            if sigma == 0:
                continue
            logp  = np.log10(p) if p != 0 else pvalue( (q - mu) / sigma)
            res.append(SNPRes(rsid=rsid, chrom=chrom, pos=pos, logp=-logp, maf=maf))
    return res

base_resdir = "/cbscratch/franco/trans-eqtl/dev-pipeline/gtex_v8_SHAPEIT2/raw/gtex_v8-aa/tejaas/permnull_sb0.1_knn30_crossmap/chr1/rr.txt"

target_file    = "trans_eqtls.txt.ld_prune"

## SHAPEIT 2 data
path_sb01    = "/cbscratch/franco/trans-eqtl/protein_coding_lncRNA_gamma01_knn30_cut5e-8/"
path_sb0006  = "/cbscratch/franco/trans-eqtl/protein_coding_lncRNA_gamma0006_knn30_cut5e-8/"
path_raw     = "/cbscratch/franco/trans-eqtl/dev-pipeline/gtex_v8_SHAPEIT2/raw/summary_5e-08/"


teqtls        = os.path.join(path_sb01, "{:s}/trans_eqtls_ldpruned.txt") 
teqtls_cm     = os.path.join(path_raw, "{:s}/tejaas", "permnull_sb0.1_knn30_crossmap", target_file)
alt_teqtls    = os.path.join(path_sb0006, "{:s}/trans_eqtls_ldpruned.txt") 
alt_teqtls_cm = os.path.join(path_raw, "{:s}/tejaas", "permnull_sb0.006_knn30_crossmap", target_file)

teqtl_dict = collections.defaultdict(dict)
for ts in tshorts:
    if ts not in brain_tissues:
        if ts in optim_tissues:
            teqtl_dict["sb"][ts] = tejaas(alt_teqtls.format(ts))
            teqtl_dict["cm"][ts] = tejaas(alt_teqtls_cm.format(ts))
        else:
            teqtl_dict["sb"][ts] = tejaas(teqtls.format(ts))
            teqtl_dict["cm"][ts] = tejaas(teqtls_cm.format(ts))


In [6]:
import pandas as pd

cm_allpval_file = "/cbscratch/franco/trans-eqtl/dev-pipeline/gtex_v8_SHAPEIT2/all_variants_pvalues_tejaas_4optims_crossmap.txt"
pval_df_cm = pd.read_csv(cm_allpval_file, header=0, index_col=0, sep="\t")

allpval_file = "/cbscratch/franco/from_saikat/gtex_v8_202003/all_variants_pvalues_tejaas_4optims.txt"
pval_df = pd.read_csv(allpval_file, header=0, index_col=0, sep="\t")

In [7]:
## Subset tissue dataframes of pvalues
print(pval_df_cm.shape)
print(pval_df.shape)

tissues_set = [x for x in tshorts if x not in brain_tissues]
pval_df_cm_subset = pval_df_cm[tissues_set]
pval_df_subset = pval_df[tissues_set]

print(pval_df_cm_subset.shape)
print(pval_df_subset.shape)


(8048655, 38)
(8048655, 49)
(8048655, 36)
(8048655, 36)


In [8]:
global_cutoff = -np.log10(5e-8)

for ts in optim_tissues:
    print(f"Tissue: {ts}")

    # gx_file = f"/cbscratch/franco/trans-eqtl/new_preprocess_feb2020_freeze/gtex_v8/expression/tpms/{ts}_tpms_qcfilter.txt.protein_coding_lncRNA_filtered"
    gx_file = f"/cbscratch/franco/trans-eqtl/new_preprocess_feb2020_freeze/gtex_v8/expression/tpms/{ts}_knn30_norm.txt.protein_coding_lncRNA_filtered"
    df = pd.read_csv(gx_file, header=0, index_col=0, sep="\t")

    snpids    = [x.rsid for x in teqtl_dict["sb"][ts]]
    snpids_cm = [x.rsid for x in teqtl_dict["cm"][ts]]

    both_sets = list(set(snpids + snpids_cm))  

    logpvals    = pval_df_subset[ts][both_sets]
    logpvals_cm = pval_df_cm_subset[ts][both_sets]

    # Find snps that become significant after crossmap filter
    sig_after_cm_ix      = logpvals < global_cutoff
    snpinfo_tissue       = Snp2info(both_sets)
    snpinfo_sig_after_cm = [snp for i, snp in enumerate(snpinfo_tissue) if sig_after_cm_ix[i]]

    print(f"SNPs before cm: {len(snpids)}")
    print(f"SNPs after cm:  {len(snpids_cm)}")
    print(f"SNPs brought in {np.sum(sig_after_cm_ix)}")

    ts_genes = read_tissue_genes(gx_file, geneinfo_dict)

    #### Obtain cismasks and add crossmappable genes
    cismasklist = cismasking.get_cismasklist(snpinfo_sig_after_cm, ts_genes, None, window=1e6)
    cismaskcomp = cismasking.compress_cismasklist(cismasklist)
    crossmap_cismaskcomp = extend_cismask(ts_genes, cismaskcomp, cross_gene_dict)

    print(f"masked crossmap-genes: {len(crossmap_cismaskcomp[0].rmv_id)}")
    print(f"masked cis-genes: {len(cismaskcomp[0].rmv_id)}")

    usegenes_cm = np.ones(df.shape[0], dtype=bool)
    thismask_cm = crossmap_cismaskcomp[1]
    if thismask_cm.rmv_id.shape[0] > 0: usegenes_cm[thismask_cm.rmv_id] = False      
    df_cm = df[usegenes_cm]

    usegenes = np.ones(df.shape[0], dtype=bool)
    thismask = cismaskcomp[1]
    if thismask.rmv_id.shape[0] > 0: usegenes[thismask.rmv_id] = False   
    df_cis = df[usegenes]

    ## Write down these expressions to find out if there are differences in the sigma beta optimization

    # df_cm.to_csv(f"{ts}_tpms_crossmap_filtered.txt", sep="\t", header=True, index=True)
    # df_cis.to_csv(f"{ts}_tpms_cismask_filtered.txt", sep="\t", header=True, index=True)
    df_cm.to_csv(f"{ts}_knn30_crossmap_filtered.txt", sep="\t", header=True, index=True)
    df_cis.to_csv(f"{ts}_knn30_cismask_filtered.txt", sep="\t", header=True, index=True)

Tissue: haa
SNPs before cm: 64
SNPs after cm:  236
SNPs brought in 173
masked crossmap-genes: 4472
masked cis-genes: 8
Tissue: pan
SNPs before cm: 179
SNPs after cm:  356
SNPs brought in 178
masked crossmap-genes: 3842
masked cis-genes: 1
Tissue: spl
SNPs before cm: 1354
SNPs after cm:  1666
SNPs brought in 371
masked crossmap-genes: 5944
masked cis-genes: 4
Tissue: wb
SNPs before cm: 6269
SNPs after cm:  11482
SNPs brought in 5264
masked crossmap-genes: 5688
masked cis-genes: 13


In [9]:
## Copy original expressions to facilitate calculations
import shutil

for ts in optim_tissues:
    print(f"Tissue: {ts}")

    # gx_file = f"/cbscratch/franco/trans-eqtl/new_preprocess_feb2020_freeze/gtex_v8/expression/tpms/{ts}_tpms_qcfilter.txt.protein_coding_lncRNA_filtered"
    # dest_file = f"{ts}_tpms_qcfilter.txt"
    gx_file = f"/cbscratch/franco/trans-eqtl/new_preprocess_feb2020_freeze/gtex_v8/expression/tpms/{ts}_knn30_norm.txt.protein_coding_lncRNA_filtered"
    dest_file = f"{ts}_knn30_norm.txt"
    shutil.copy(gx_file, dest_file)

Tissue: haa
Tissue: pan
Tissue: spl
Tissue: wb
