In [11]:
%load_ext autoreload
%autoreload 2


import sys, os
sys.path.append("../")

import argparse
import numpy as np
import math
import pickle
from iotools.readOxford import ReadOxford
from iotools.readrpkm import ReadRPKM
from iotools.io_model import WriteModel
from inference.linreg_association import LinRegAssociation
from inference.empirical_bayes import EmpiricalBayes
from utils import hyperparameters
from inference import logmarglik
from iotools import readgtf
from utils import gtutils
from utils import mfunc
from utils.containers import ZstateInfo
from utils.printstamp import printStamp
from utils.helper_functions import write_params, load_target_genes
from sklearn.preprocessing import scale
from iotools import snp_annotator

import config_dev as config

In [12]:
# Annotation (use complete gene name in gtf without trimming the version)
# load annotation for whole genome
gene_info = readgtf.gencode_v12(config.gtfpath, trim=False)

In [13]:
# Load gene list
genelistfile = "genes4testing_high_and_low_r2_0.001"
# genelistfile = "genes4testing_highr2"
selected_gene_ids = load_target_genes(genelistfile, gene_info, config.chrom)
print(selected_gene_ids)


Read 641 genes with high r2 values

Found 57 genes in CHR 12
['ENSG00000151065.9', 'ENSG00000078237.4', 'ENSG00000139194.3', 'ENSG00000173262.7', 'ENSG00000171860.4', 'ENSG00000205846.3', 'ENSG00000166527.3', 'ENSG00000256660.1', 'ENSG00000172243.13', 'ENSG00000139112.6', 'ENSG00000013583.4', 'ENSG00000123104.7', 'ENSG00000064115.6', 'ENSG00000139117.9', 'ENSG00000139174.6', 'ENSG00000161800.8', 'ENSG00000139610.1', 'ENSG00000167612.8', 'ENSG00000123395.10', 'ENSG00000170486.6', 'ENSG00000185640.5', 'ENSG00000135476.7', 'ENSG00000161638.6', 'ENSG00000123338.8', 'ENSG00000170473.12', 'ENSG00000135452.5', 'ENSG00000135655.9', 'ENSG00000183735.5', 'ENSG00000111554.10', 'ENSG00000090382.2', 'ENSG00000127337.2', 'ENSG00000135643.4', 'ENSG00000111615.8', 'ENSG00000139323.9', 'ENSG00000184752.8', 'ENSG00000139343.6', 'ENSG00000111145.3', 'ENSG00000136048.9', 'ENSG00000120860.6', 'ENSG00000136052.5', 'ENSG00000151131.5', 'ENSG00000136051.9', 'ENSG00000166046.6', 'ENSG00000110851.7', 'ENSG00000

In [14]:

if not os.path.exists(config.learn_pickfile_dev):
    # read Genotype
    oxf = ReadOxford(config.gtex_gtpath, config.gtex_samplepath, config.chrom, config.learning_dataset)
    genotype = np.array(oxf.dosage)
    samplenames = oxf.samplenames
    snps = oxf.snps_info

    printStamp("Dumping CHR {:d} genotype".format(config.chrom))
    with open(config.learn_pickfile_dev, 'wb') as output:
        pickle.dump(oxf, output, pickle.HIGHEST_PROTOCOL)
else:
    printStamp("Reading pickled genotype")
    with open(config.learn_pickfile_dev, 'rb') as input:
        pickled_oxf = pickle.load(input)

    printStamp("Done reading")

    genotype = np.array(pickled_oxf.dosage)
    samplenames = pickled_oxf.samplenames
    snps = pickled_oxf.snps_info
    nsample = len(pickled_oxf.samplenames)

# Quality control
f_snps, f_genotype = gtutils.remove_low_maf(snps, genotype, 0.1)
gt = gtutils.normalize(f_snps, f_genotype)

# Gene Expression
rpkm = ReadRPKM(config.gtex_rpkmpath, "gtex")
expression = rpkm.expression
expr_donors = rpkm.donor_ids
gene_names = rpkm.gene_names

# Selection
printStamp("Selection of samples")
vcfmask, exprmask = mfunc.select_donors(samplenames, expr_donors)
genes, indices = mfunc.select_genes(gene_info, gene_names)

gene_training_list = []
for i, gene in enumerate(genes):
    k = indices[i]
    if gene.ensembl_id in selected_gene_ids and gene.chrom == config.chrom:
        gene_training_list.append((k,gene))
        # print(k,gene)

2018-05-16 01:28:09 - Reading pickled genotype
2018-05-16 01:28:31 - Done reading
2018-05-16 01:28:38 - Selection of samples


In [15]:

p = config.parameters[0]

prior = p[0]
params = p[1]
hyperpriors = []
hyperparams = p[3]
run_description = p[4]
cutoff = p[5]
usedist = p[6]
usefeat = p[7]

print(p)


['gxpred-bslmm', [0.9, 0.0, 0.1, 0.1, 0.005], [None, None, None, None, None], None, 'test_1KGannots', 'soft', 'nodist', '1kg']


In [21]:
from iotools import snp_annotator
from collections import defaultdict
import gzip 

model_dir = "{:s}_{:s}_{:s}_{:s}_{:.3f}_{:.3f}_{:.3f}_{:.3f}_{:.3f}".format(prior, cutoff, usedist, usefeat, params[0], params[1], params[2], params[3], params[4])
outdir = "/home/franco/soedinglab/dev_gxpred_models/"
modelpath = os.path.join(outdir, "z"+str(config.zmax), config.run_description, model_dir)

print(modelpath)

write_params(modelpath, p)

model = WriteModel(modelpath, config.chrom)

# Load rsid dictionary
annot_dict = defaultdict(list)
if usefeat == "1kg":
    annotfile = os.path.join(config.annot1kg_dir, "1KG."+str(config.chrom)+".annot.gz")
    print(annotfile)
    with gzip.open(annotfile, 'r') as instream:
        _ = instream.readline()
        for line in instream:
            arr = line.decode().strip().split(" ")
            rsid = arr[0]
            annots = list(map(int, arr[1:]))
            annot_dict[rsid] = annots


# for i in range(0,len(gene_training_list)):
for i in range(15,20):

    k, gene = gene_training_list[i]

    print(k, gene)

    printStamp("Learning for gene "+str(gene.ensembl_id))

    # select only the cis-SNPs
    cismask = mfunc.select_snps(gene, f_snps, config.window)
    if len(cismask) > 0:
        target = expression[k, exprmask]
        target = scale(target, with_mean=True, with_std=True)
        predictor = gt[cismask][:, vcfmask]
        snpmask = cismask

        # if number of cis SNPs > threshold, use p-value cut-off

        min_pos = f_snps[cismask[0]].bp_pos - 1000
        max_pos = f_snps[cismask[-1]].bp_pos + 1000
        
        if len(cismask) > config.min_snps:
            assoc_model = LinRegAssociation(predictor, target, config.min_snps, config.pval_cutoff, cutoff)
            pvalmask = cismask[assoc_model.selected_variables]
            if pvalmask.shape[0] == 0:
                print("No significant SNPs found for gene {:s}".format(gene.ensembl_id))
                continue
            print ("Found {:d} SNPs, reduced to {:d} SNPs (max p-value {:g}) for {:s}".format(len(cismask), len(pvalmask), assoc_model.ordered_pvals[len(pvalmask) - 1], gene.name))
            predictor = gt[pvalmask][:, vcfmask]
            snpmask = pvalmask
        else:
            print ("Found {:d} SNPs for {:s}".format(len(cismask), gene.name))

        if config.shuffle_geno:
            print("Shuffling Genotype!")
            np.random.shuffle(predictor.T)

        selected_snps = [f_snps[x] for x in snpmask]

        if config.prune_LD:
            ld_indices = snp_annotator.get_snps_LD(gene, selected_snps, min_pos, max_pos, config.genofile_plink, config.ldstorepath, config.ld_path)
            snpmask = np.delete(snpmask, np.reshape(ld_indices, -1))
            predictor = gt[snpmask][:, vcfmask]

            # replace with the pruned snsp in LD
            selected_snps = [f_snps[x] for x in snpmask]

            print ("Reduced to {:d} SNPs".format(len(snpmask)))
        
        # read the features
        # TODO: only returns the base feature (vect of 1's)
        feature0 = np.ones((len(selected_snps), 1))
        
        if usefeat == "1kg":
            current_annot = list()
            for snp in selected_snps:
                if len(annot_dict[snp.varid]) > 0:
                    current_annot.append(annot_dict[snp.varid])
                else:
                    current_annot.append([0,0,0,0,0])
                    print("not found {:s}!".format(snp.varid))
            feature1kg = np.array(current_annot)
        
        features = np.concatenate((feature0, feature1kg), axis=1)
        raise
                    
        # add UTR feature
#         utr_feature = snp_annotator.get_GENCODE_annotation(config.gtfpath, gene, selected_snps, "UTR")
#         exon_feature = snp_annotator.get_GENCODE_annotation(config.gtfpath, gene, selected_snps, "exon")
#         features = np.concatenate((feature0, utr_feature, exon_feature), axis=1)

        # Get DHS distance feature
        dist_feature = snp_annotator.get_distance_feature(selected_snps, gene, usedist)

        nfeat = features.shape[1]
        print("Loaded {:d} features".format(nfeat))

        init_params = np.zeros(nfeat + 4)
        init_params[0] = - np.log((1 / params[0]) - 1)
        if nfeat > 1:
            for i in range(1, nfeat):
                init_params[i] = - np.log((1 / params[0]) - 1)
        init_params[nfeat + 0] = params[1] # mu
        init_params[nfeat + 1] = params[2] # sigma
        init_params[nfeat + 2] = params[3] # sigmabg
        init_params[nfeat + 3] = 1 / params[4] / params[4] # tau

        # perform the analysis

        print ("Starting first optimization ==============")
        emp_bayes = EmpiricalBayes(predictor, target, features, dist_feature, 1, init_params, method="new")
        emp_bayes.fit()
        if config.zmax > 1:
            if emp_bayes.success:
                res = emp_bayes.params
                print ("Starting second optimization from previous results ================")
                # Python Error: C library could not compute z-components. Check C errors above.
            else:
                res = init_params
                print ("Starting second optimization from initial parameters ================")
            emp_bayes = EmpiricalBayes(predictor, target, features, dist_feature, config.zmax, res, method="new")
            emp_bayes.fit()

        if emp_bayes.success:
            res = emp_bayes.params
            res[4] = 1 / np.sqrt(res[4])

            print(res)
#             print("PI: \t",res[0])
#             print("mu: \t",res[1])
#             print("sigma: \t",res[2])
#             print("sigmabg: \t",res[3])
#             print("tau: \t",res[4])

            model_snps = [f_snps[x] for x in snpmask]
            model_zstates = list()
            scaledparams = hyperparameters.scale(emp_bayes.params)
            zprob, zexp = logmarglik.model_exp(scaledparams, predictor, target, features, dist_feature, emp_bayes.zstates)
            for j, z in enumerate(emp_bayes.zstates):
                this_zstate = ZstateInfo(state = z,
                                         prob  = zprob[j],
                                         exp   = list(zexp[j, :]) )
                model_zstates.append(this_zstate)
            # print(model_snps)
            # for i,m in enumerate(model_zstates):
            #     print("z-state: ",i," Prob:", m.prob)
            model.write_success_gene(gene, model_snps, model_zstates, res)
        else:
            model.write_failed_gene(gene, np.zeros_like(init_params))
            print ("Failed optimization")

/home/franco/soedinglab/dev_gxpred_models/z1/test_1KGannots/gxpred-bslmm_soft_nodist_1kg_0.900_0.000_0.100_0.100_0.005
/home/franco/cluster2/datasets/1KG_annots/1KG.12.annot.gz
14210 GeneInfo(name='RACGAP1', ensembl_id='ENSG00000161800.8', chrom=12, start=50370706, end=50426918)
2018-05-16 01:31:09 - Learning for gene ENSG00000161800.8
Found 1929 SNPs, reduced to 200 SNPs (max p-value 0.137009) for RACGAP1


RuntimeError: No active exception to reraise

In [22]:
np.sum(features, axis=0)

array([200.,   3.,  10.,  25.,  50., 103.])

In [20]:
features.shape

(200, 6)

# Prediction

In [4]:

import sys
sys.path.append("../")
import os
import pickle
from utils.printstamp import printStamp
from iotools.io_model import ReadModel
from iotools.readOxford import ReadOxford
from utils.containers import GeneExpressionArray
from utils import gtutils
from utils import mfunc
import numpy as np
import config_dev as config


In [2]:

if not os.path.exists(config.p_pickfile_dev):
# Read genotype (quite slow for testing) use pickle below
    p_oxf = ReadOxford(config.p_gtpath, config.p_samplepath, config.chrom, config.predicting_dataset)
    p_genotype = np.array(p_oxf.dosage)
    p_samplenames = p_oxf.samplenames
    p_snps = p_oxf.snps_info
    p_nsample = len(p_oxf.samplenames)

    printStamp("Dumping CHR {:d} genotype".format(chrom))
    with open(config.p_pickfile_dev, 'wb') as output:
        pickle.dump(p_oxf, output, pickle.HIGHEST_PROTOCOL)
else:
    printStamp("Reading pickled genotype")
    with open(config.p_pickfile_dev, 'rb') as input:
        pickled_oxf = pickle.load(input)

    printStamp("Done reading")

    p_genotype = np.array(pickled_oxf.dosage)
    p_samplenames = pickled_oxf.samplenames
    p_snps = pickled_oxf.snps_info
    p_nsample = len(pickled_oxf.samplenames)


2018-05-14 21:53:11 - Reading pickled genotype
2018-05-14 21:54:04 - Done reading


In [9]:

p = config.parameters[0]

prior = p[0]
params = p[1]
hyperpriors = []
hyperparams = p[3]
run_description = p[4]
cutoff = p[5]
usedist = p[6]
usefeat = p[7]

print(p)


model_dir = "{:s}_{:s}_{:s}_{:s}_{:.3f}_{:.3f}_{:.3f}_{:.3f}_{:.3f}".format(prior, cutoff, usedist, usefeat, params[0], params[1], params[2], params[3], params[4])
outdir = "/home/franco/soedinglab/dev_gxpred_models/"
modelpath = os.path.join(outdir, "z"+str(config.zmax), config.run_description, model_dir)


['gxpred-bslmm', [0.9, 0.0, 0.1, 0.1, 0.005], [None, None, None, None, None], None, 'test_1KGannots', 'soft', 'nodist', '1kg']


In [10]:


outfileprefix = os.path.join(modelpath,"pred_chr"+str(config.chrom))

printStamp("Predicting for "+modelpath)
# Write predictions for each model
p_model = ReadModel(modelpath, config.chrom)
p_genes = p_model.genes
gx = list()
for gene in p_genes:

    p_model.read_gene(gene)
    p_model_snps = p_model.snps
    p_model_zstates = p_model.zstates

    x = gtutils.prediction_variables(p_snps, p_model_snps, p_genotype)
    x = gtutils.normalize(p_model_snps, x)

    ypred = np.zeros(p_nsample)
    for z in p_model_zstates:
        ypred += z.prob * np.dot(x.T, z.exp)

    gx.append(GeneExpressionArray(geneid = gene.ensembl_id, expr_arr = ypred))


# Write output
printStamp("Done predicting for "+modelpath)
mfunc.write_gcta_phenotype(outfileprefix, p_samplenames, gx)

2018-05-14 21:56:06 - Predicting for /home/franco/soedinglab/dev_gxpred_models/z1/test_1KGannots/gxpred-bslmm_newsoft_nodist_1kg_0.900_0.000_0.100_0.100_0.005


Exception: File /home/franco/soedinglab/dev_gxpred_models/z1/test_1KGannots/gxpred-bslmm_newsoft_nodist_1kg_0.900_0.000_0.100_0.100_0.005/chr12/genes.txt does not exist

# Assessment 

In [None]:
import sys
sys.path.append("../")
import os
from iotools import readgtf
from iotools.readrpkm import ReadRPKM
from iotools.readPrediction import ReadPrediction
from scipy.stats import pearsonr
from utils.helper_functions import write_r2_dataframe, get_common_elements, pearson_corr_rowwise
import math
import pickle
from utils.printstamp import printStamp

import config_dev as config

In [None]:

# Load reference dataset Gene Expression
reference_rpkm = ReadRPKM(config.reference_expdatapath, config.predicting_dataset)
reference_expression = reference_rpkm.expression
reference_expr_donors = reference_rpkm.donor_ids
reference_gene_names = reference_rpkm.gene_names

In [None]:
# use the selected_gene_ids with high R² values as targets, only those in the selected chrom will appear
# genelistfile = "genes4testing_highr2"
genelistfile = "genes4testing_high_and_low_r2_0.001"
selected_gene_ids = load_target_genes(genelistfile, gene_info, config.chrom)
target_genelist = [g.split(".")[0] for g in selected_gene_ids]
target_donors = reference_expr_donors


In [None]:

### Predixcan assessment ###

if not os.path.exists(config.predixcan_pickfile_dev):
    predixcanpred = ReadPrediction(config.pxpred_predpath, config.reference_samplepath, "predixcan", trim=True)

    if len(predixcanpred.gene_names) > 0:
        printStamp("Dumping Predixcan prediction")
        with open(config.predixcan_pickfile_dev, 'wb') as output:
            pickle.dump(predixcanpred, output, pickle.HIGHEST_PROTOCOL)
    else:
        raise("No prediction data found")
else:
    printStamp("Reading pickled Predixcan prediction")
    with open(config.predixcan_pickfile_dev, 'rb') as input:
        predixcanpred = pickle.load(input)

# filter predixcan predictions with only those in gxpred
predixcanpred.sort_by_gene(target_genelist)
predixcanpred.sort_by_samples(target_donors, use_prev=True)

sorted_expr_donors, ix_samples = get_common_elements(reference_expr_donors, predixcanpred.sorted_samples)
sorted_gene_names, ix_genes = get_common_elements(reference_gene_names, predixcanpred.sorted_gene_names)
sorted_expression = reference_expression[ix_genes,:][:, ix_samples].T

predixcan_r = pearson_corr_rowwise(predixcanpred.sorted_expr_mat.T, sorted_expression.T)



In [None]:
%load_ext autoreload
%autoreload 2

### GXpred assessment ###

print(modelpath)

gxpred_predpath = os.path.join(modelpath)
gxpred = ReadPrediction(gxpred_predpath, config.reference_samplepath, "gxpred", trim=True)

# filter gxpred predicted values
gxpred.sort_by_gene(target_genelist)
gxpred.sort_by_samples(target_donors, use_prev=True)


# Filter and sort the reference expression values
# Cardiogenics variables
# expression
# expr_donors
# gene_names

sorted_expr_donors, ix_samples = get_common_elements(reference_expr_donors, gxpred.sorted_samples)
sorted_gene_names, ix_genes = get_common_elements(reference_gene_names, gxpred.sorted_gene_names)
sorted_expression = reference_expression[ix_genes,:][:, ix_samples].T

# Calculate Pearson correlation
gxpred_r = pearson_corr_rowwise(gxpred.sorted_expr_mat.T, sorted_expression.T)

print(gxpred.sorted_gene_names)
print(gxpred_r**2)
print(predixcan_r**2)


# Write to table with predictions for given genes
# predtabledir = os.path.join(home, "gxpred","devtools", "all_predictions.txt")
# new_write_predicted_r2(predtabledir, prior, params, gxpred_r, predixcan_r, gxpred.sorted_gene_names)

In [None]:

write_r2_dataframe(modelpath, config.chrom, "predixcan", predixcan_r, predixcanpred, overwrite=True)
write_r2_dataframe(modelpath, config.chrom, "gxpred-bslmm", gxpred_r, gxpred)