In [101]:
%load_ext autoreload
%autoreload 2

import sys, os
sys.path.append("../")

import numpy as np
import math
import pickle
from config import *
from iotools.readOxford import ReadOxford
from iotools.readrpkm import ReadRPKM
from iotools.io_model import WriteModel
from inference.linreg_association import LinRegAssociation
from inference.empirical_bayes import EmpiricalBayes
from utils import hyperparameters
from inference import logmarglik
from iotools import readgtf
from utils import gtutils
from utils import mfunc
from utils.containers import ZstateInfo
from utils.printstamp import printStamp
from helper_functions import load_target_genes, write_params
from sklearn.preprocessing import scale

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [102]:
# Annotation: load the gene annotation for the whole genome from the GTF at
# `gtfpath` (not defined in this notebook; presumably provided by `config`).
# trim=False keeps the complete gene name from the GTF, i.e. the version
# suffix is not trimmed.
gene_info = readgtf.gencode_v12(gtfpath, trim=False)

In [103]:

# Load gene list.
# `genelistfile` is kept as a global because the assessment cells reuse it.
genelistfile = "genes4testing_high_and_low_r2_0.001"
# Only genes located on `chrom` are returned (see the "Found N genes in CHR"
# message printed by the helper).
selected_gene_ids = load_target_genes(genelistfile, gene_info, chrom)


Read 641 genes with high r2 values

Found 57 genes in CHR 12


In [104]:


# Load the learning-set genotype. The pickle at `learn_pickfile` acts as a
# cache: the Oxford files are only parsed when no pickle exists yet.
cache_missing = not os.path.exists(learn_pickfile)
if cache_missing:
    # read Genotype
    oxf = ReadOxford(gtex_gtpath, gtex_samplepath, chrom, learning_dataset)
else:
    printStamp("Reading pickled genotype")
    # NOTE: pickle.load can execute arbitrary code; `learn_pickfile` must only
    # ever point at a cache written by this notebook.
    # The handle is deliberately not called `input`, which would rebind the
    # builtin for the rest of the kernel session.
    with open(learn_pickfile, 'rb') as pickle_in:
        oxf = pickle.load(pickle_in)

    printStamp("Done reading")

# Derive the working variables once, whichever branch produced `oxf`.
# `nsample` used to be set only on the cache-hit path, so a first run (no
# pickle yet) left it undefined for the later GTEx prediction cell.
genotype = np.array(oxf.dosage)
samplenames = oxf.samplenames
snps = oxf.snps_info
nsample = len(samplenames)

if cache_missing:
    # Written after the variables above are derived, matching the original
    # order of operations on the freshly read object.
    printStamp("Dumping CHR {:d} genotype".format(chrom))
    with open(learn_pickfile, 'wb') as pickle_out:
        pickle.dump(oxf, pickle_out, pickle.HIGHEST_PROTOCOL)

# Quality control: filter SNPs with `remove_low_maf` (threshold argument 0.1),
# then normalize the remaining genotype.
f_snps, f_genotype = gtutils.remove_low_maf(snps, genotype, 0.1)
gt = gtutils.normalize(f_snps, f_genotype)

# Gene Expression
rpkm = ReadRPKM(gtex_rpkmpath, "gtex")
expression = rpkm.expression
expr_donors = rpkm.donor_ids
gene_names = rpkm.gene_names

# Selection
printStamp("Selection of samples")
vcfmask, exprmask = mfunc.select_donors(samplenames, expr_donors)
genes, indices = mfunc.select_genes(gene_info, gene_names)

# Keep (expression row index, gene) pairs for the target genes on `chrom`.
gene_training_list = []
for i, gene in enumerate(genes):
    k = indices[i]
    if gene.ensembl_id in selected_gene_ids and gene.chrom == chrom:
        gene_training_list.append((k, gene))

2018-03-24 16:56:52 - Reading pickled genotype
2018-03-24 16:57:32 - Done reading
2018-03-24 16:57:39 - Selection of samples


In [85]:
# All (expression row index, gene) pairs for genes located on `chrom`,
# regardless of whether they are in the target gene list.
gene_training_12 = [
    (indices[pos], candidate)
    for pos, candidate in enumerate(genes)
    if candidate.chrom == chrom
]

In [87]:
# Number of genes on `chrom`, then the total number of selected genes.
print(len(gene_training_12), len(genes), sep="\n")

831
14810


In [60]:
# Inspect the first parameter set. The training cell reads its entries as
# [prior, initial params, hyperpriors, hyperparams, run description].
parameters[0]

['gxpred-bslmm',
 [0.005, 0.0, 0.1, 0.001, 0.005],
 [None, None, None, 'L1', None],
 {'lambda': 0.1},
 'unbound_L1_0.1']

In [63]:

# Train one model per gene in `gene_training_list` using the first parameter
# set, and write the result (success or failure) through `WriteModel`.
p = parameters[0]

# Unpack the parameter set: p = [prior, initial params, hyperpriors,
# hyperparams, run description].
prior = p[0]
params = p[1]
hyperpriors = p[2]
hyperparams = p[3]
run_description = p[4]
# Copy into an array so the conversion below does not modify `params`,
# which is still needed unmodified for the directory name.
init_params = np.array(params)
# account for Tau inverse value
# (entry 4 is replaced by 1 / tau**2; it is converted back with
# 1 / sqrt(...) after a successful fit, where it is printed as "tau")
init_params[4] = 1 / init_params[4] / init_params[4]

# Output directory: ./z<zmax>/<run_description>/<prior>_<five params, 3 decimals>
model_dir = prior+"_{:.3f}_{:.3f}_{:.3f}_{:.3f}_{:.3f}".format(params[0], params[1], params[2], params[3], params[4])
modelpath = os.path.join("./z"+str(zmax), run_description, model_dir)
write_params(modelpath, p)

model = WriteModel(modelpath, chrom)

for i in range(0,len(gene_training_list)):

    # k is the row of `expression` that belongs to this gene
    k, gene = gene_training_list[i]

    print(k, gene)

    # select only the cis-SNPs
    cismask = mfunc.select_snps(gene, f_snps, window)
    # genes without any selected SNP are skipped silently (nothing is written)
    if len(cismask) > 0:
        target = expression[k, exprmask]
        # standardize the expression values (zero mean, unit variance)
        target = scale(target, with_mean=True, with_std=True)
        predictor = gt[cismask][:, vcfmask]
        snpmask = cismask

        # if number of cis SNPs > threshold, use p-value cut-off
        if len(cismask) > min_snps:
            assoc_model = LinRegAssociation(predictor, target, min_snps, pval_cutoff)
            pvalmask = cismask[assoc_model.selected_variables]
            print ("Found {:d} SNPs, reduced to {:d} SNPs (max p-value {:g}) for {:s}".format(len(cismask), len(pvalmask), assoc_model.ordered_pvals[len(pvalmask) - 1], gene.name))
            predictor = gt[pvalmask][:, vcfmask]
            snpmask = pvalmask
        else:
            print ("Found {:d} SNPs for {:s}".format(len(cismask), gene.name))

        # perform the analysis

        # First pass is always run with the third argument set to 1; when
        # zmax > 1 a second pass with `zmax` follows, started from the first
        # result if that succeeded and from `init_params` otherwise.
        print ("Starting first optimization ==============")
        emp_bayes = EmpiricalBayes(predictor, target, 1, init_params, method="new", prior=prior, hyperpriors= hyperpriors, hyperparams= hyperparams)
        emp_bayes.fit()
        if zmax > 1:
            if emp_bayes.success:
                res = emp_bayes.params
                print ("Starting second optimization from previous results ================")
                # Python Error: C library could not compute z-components. Check C errors above.
            else:
                res = init_params
                print ("Starting second optimization from initial parameters ================")
            emp_bayes = EmpiricalBayes(predictor, target, zmax, res, method="new", prior=prior, hyperpriors= hyperpriors, hyperparams= hyperparams)
            emp_bayes.fit()

        # `emp_bayes` is the second-pass object when zmax > 1, else the first
        if emp_bayes.success:
            # NOTE(review): `res` is bound to the same object as
            # `emp_bayes.params` (unless `params` is a property returning a
            # fresh copy), so the in-place conversion on the next line also
            # changes the values passed to `hyperparameters.scale` below.
            # Confirm this is intended.
            res = emp_bayes.params
            # convert entry 4 back from 1 / tau**2 to tau
            res[4] = 1 / np.sqrt(res[4])
            print("PI: \t",res[0])
            print("mu: \t",res[1])
            print("sigma: \t",res[2])
            print("sigmabg: \t",res[3])
            print("tau: \t",res[4])

            # SNPs actually used by the model (after the optional p-value filter)
            model_snps = [f_snps[x] for x in snpmask]
            model_zstates = list()
            scaledparams = hyperparameters.scale(emp_bayes.params)
            zprob, zexp = logmarglik.model_exp(scaledparams, predictor, target, emp_bayes.zstates, prior)
            # one ZstateInfo per z-state: the state, its entry of `zprob`
            # and its row of `zexp`
            for j, z in enumerate(emp_bayes.zstates):
                this_zstate = ZstateInfo(state = z,
                                         prob  = zprob[j],
                                         exp   = list(zexp[j, :]) )
                model_zstates.append(this_zstate)
            # print(model_snps)
            # for i,m in enumerate(model_zstates):
            #     print("z-state: ",i," Prob:", m.prob)
            model.write_success_gene(gene, model_snps, model_zstates, res)
        else:
            # record the failure with an all-zero parameter vector
            model.write_failed_gene(gene, np.zeros_like(init_params))
            print ("Failed optimization")


13866 GeneInfo(name='C3AR1', ensembl_id='ENSG00000171860.4', chrom=12, start=8210898, end=8219066)
Found 3603 SNPs, reduced to 200 SNPs (max p-value 0.0297755) for C3AR1
Selected method: new, prior: gxpred-bslmm
Old probsum: 0.000000, target: 0.980000
Working with 2 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 2 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 2 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 2 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 2 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 2 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 2 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 2 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 2 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 3 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 3 leading zstat

Old probsum: 0.000000, target: 0.980000
Working with 93 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 92 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 92 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 92 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 92 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 92 leading zstates.
Old probsum: 0.769836, target: 0.980000
Working with 161 leading zstates.
Old probsum: 0.770524, target: 0.980000
Working with 161 leading zstates.
Old probsum: 0.896102, target: 0.980000
Working with 121 leading zstates.
Old probsum: 0.893446, target: 0.980000
Working with 125 leading zstates.
Old probsum: 0.896216, target: 0.980000
Working with 123 leading zstates.
Old probsum: 0.906538, target: 0.980000
Working with 117 leading zstates.
Old probsum: 0.921511, target: 0.980000
Working with 107 leading zstates.
Old probsum: 0.938816, target: 0.980000
Work

Found 4208 SNPs, reduced to 218 SNPs (max p-value 0.000950746) for CLEC12B
Selected method: new, prior: gxpred-bslmm
Old probsum: 0.000000, target: 0.980000
Working with 2 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 2 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 2 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 2 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 2 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 2 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 2 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 2 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 2 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 2 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 2 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 2 leading zstates.
Old probsum: 0.0000

Old probsum: 0.851052, target: 0.980000
Working with 148 leading zstates.
Old probsum: 0.846987, target: 0.980000
Working with 149 leading zstates.
Old probsum: 0.534438, target: 0.980000
Working with 193 leading zstates.
Old probsum: 0.529333, target: 0.980000
Working with 193 leading zstates.
Old probsum: 0.529412, target: 0.980000
Working with 193 leading zstates.
Old probsum: 0.540811, target: 0.980000
Working with 193 leading zstates.
Old probsum: 0.671188, target: 0.980000
Working with 189 leading zstates.
Old probsum: 0.705941, target: 0.980000
Working with 188 leading zstates.
Old probsum: 0.811463, target: 0.980000
Working with 180 leading zstates.
Old probsum: 0.851476, target: 0.980000
Working with 174 leading zstates.
Old probsum: 0.923587, target: 0.980000
Working with 148 leading zstates.
Old probsum: 0.923587, target: 0.980000
Working with 148 leading zstates.
      fun: 447.90841907693584
 hess_inv: <5x5 LbfgsInvHessProduct with dtype=float64>
      jac: array([ 2.45897

14448 GeneInfo(name='USP15', ensembl_id='ENSG00000135655.9', chrom=12, start=62654118, end=62811211)
Found 3472 SNPs, reduced to 200 SNPs (max p-value 0.0153321) for USP15
Selected method: new, prior: gxpred-bslmm
Old probsum: 0.000000, target: 0.980000
Working with 2 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 2 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 2 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 2 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 2 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 2 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 2 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 3 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 3 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 3 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 3 leading zst

Old probsum: 0.000000, target: 0.980000
Working with 2 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 14 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 18 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 19 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 20 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 24 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 28 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 28 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 28 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 26 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 22 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 21 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 20 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with

Old probsum: 0.000000, target: 0.980000
Working with 5 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 5 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 5 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 5 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 5 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 5 leading zstates.
Old probsum: 0.000040, target: 0.980000
Working with 5 leading zstates.
Old probsum: 0.000072, target: 0.980000
Working with 5 leading zstates.
Old probsum: 0.000107, target: 0.980000
Working with 5 leading zstates.
Old probsum: 0.000207, target: 0.980000
Working with 6 leading zstates.
Old probsum: 0.000335, target: 0.980000
Working with 8 leading zstates.
Old probsum: 0.000457, target: 0.980000
Working with 8 leading zstates.
Old probsum: 0.000723, target: 0.980000
Working with 9 leading zstates.
Old probsum: 0.000925, target: 0.980000
Working with 9 leading z

Old probsum: 0.000000, target: 0.980000
Working with 2 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 2 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 2 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 2 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 2 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 2 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 3 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 6 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 29 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 63 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 91 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 98 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 99 leading zstates.
Old probsum: 0.000000, target: 0.980000
Working with 100 le

# Prediction

In [141]:

import sys
sys.path.append("../")
import os
import pickle
from utils.printstamp import printStamp
from iotools.io_model import ReadModel
from utils.containers import GeneExpressionArray
from utils import gtutils
from utils import mfunc
import numpy as np
from config import *

In [142]:

# Load the genotype of the prediction dataset. Reading the Oxford files is
# quite slow for testing, so the parsed object is cached as a pickle at
# `p_pickfile` and reused on later runs.
p_cache_missing = not os.path.exists(p_pickfile)
if p_cache_missing:
    p_oxf = ReadOxford(p_gtpath, p_samplepath, chrom, predicting_dataset)
else:
    printStamp("Reading pickled genotype")
    # NOTE: pickle.load can execute arbitrary code; `p_pickfile` must only
    # ever point at a cache written by this notebook.
    # The handle is deliberately not called `input`, which would rebind the
    # builtin for the rest of the kernel session.
    with open(p_pickfile, 'rb') as pickle_in:
        p_oxf = pickle.load(pickle_in)

    printStamp("Done reading")

# Derive the working variables once, whichever branch produced `p_oxf`.
p_genotype = np.array(p_oxf.dosage)
p_samplenames = p_oxf.samplenames
p_snps = p_oxf.snps_info
p_nsample = len(p_samplenames)

if p_cache_missing:
    # Written after the variables above are derived, matching the original
    # order of operations on the freshly read object.
    printStamp("Dumping CHR {:d} genotype".format(chrom))
    with open(p_pickfile, 'wb') as pickle_out:
        pickle.dump(p_oxf, pickle_out, pickle.HIGHEST_PROTOCOL)


2018-03-26 21:12:33 - Reading pickled genotype
2018-03-26 21:12:41 - Done reading


In [143]:

# Use parameters from config.py: for every parameter set, read the learned
# model from its directory and write the predicted expression of each gene.
for p in parameters:
    prior, params, hyperpriors, hyperparams, run_description = p[0], p[1], p[2], p[3], p[4]
    # <prior>_<five initial params, 3 decimals each>
    model_dir = prior + ("_{:.3f}" * 5).format(*params[:5])
    modelpath = os.path.join("./z{}".format(zmax), run_description, model_dir)

    outfileprefix = os.path.join(modelpath, "pred_chr{}".format(chrom))

    printStamp("Predicting for " + model_dir)
    # Write predictions for each model
    p_model = ReadModel(modelpath, chrom)
    p_genes = p_model.genes
    gx = []
    for gene in p_genes:
        # read_gene populates p_model.snps / p_model.zstates for this gene
        p_model.read_gene(gene)
        p_model_snps = p_model.snps
        p_model_zstates = p_model.zstates

        x = gtutils.normalize(
            p_model_snps,
            gtutils.prediction_variables(p_snps, p_model_snps, p_genotype),
        )

        # prediction = sum over z-states of prob * (x^T . exp)
        ypred = np.zeros(p_nsample)
        for z in p_model_zstates:
            ypred += z.prob * np.dot(x.T, z.exp)

        gx.append(GeneExpressionArray(geneid=gene.ensembl_id, expr_arr=ypred))

    # Write output
    mfunc.write_gcta_phenotype(outfileprefix, p_samplenames, gx)

2018-03-26 21:12:43 - Predicting for gxpred-bslmm_0.100_0.000_0.100_0.100_0.005


KeyboardInterrupt: 

In [119]:
# Create my own parameters
# This prediction is on GTEx, to assess trans-eQTLs.
# It uses the learning-set genotype (`snps`, `genotype`, `samplenames`)
# loaded by the training section, not the prediction dataset.


# Hardcode parameters to read existing learned model
parameters = []
prior = "gxpred-bslmm"

set_init_params = [[0.1, 0.0, 0.1, 0.1, 0.005]]
run_description = "bound_mu_NoPriors"

hyperpriors = [None, None, None, None, None]
hyperparams = {"lambda":0.05, "Galpha":2, "Gbeta":0.5}
for init_params in set_init_params:
    parameters.append([prior, init_params, hyperpriors, hyperparams, run_description])

# Only the first (and here only) parameter set is used.
p = parameters[0]
prior, params, hyperpriors, hyperparams, run_description = p

# Model directory: ./z<zmax>/<run_description>/<prior>_<five params, 3 decimals>
model_dir = prior+"_{:.3f}_{:.3f}_{:.3f}_{:.3f}_{:.3f}".format(params[0], params[1], params[2], params[3], params[4])
modelpath = os.path.join("./z"+str(zmax), run_description, model_dir)

outfileprefix = os.path.join(modelpath,"gtex_pred_chr"+str(chrom))

printStamp("Predicting for "+model_dir)
# Write predictions for each model
p_model = ReadModel(modelpath, chrom)
p_genes = p_model.genes

# Take the number of samples from `samplenames` instead of the global
# `nsample`, which the loading cell only sets when the genotype is read
# from the pickle cache.
n_gtex_samples = len(samplenames)

gx = list()
for gene in p_genes:

    # read_gene populates p_model.snps / p_model.zstates for this gene
    p_model.read_gene(gene)
    p_model_snps = p_model.snps
    p_model_zstates = p_model.zstates

    x = gtutils.prediction_variables(snps, p_model_snps, genotype)
    x = gtutils.normalize(p_model_snps, x)

    # prediction = sum over z-states of prob * (x^T . exp)
    ypred = np.zeros(n_gtex_samples)
    for z in p_model_zstates:
        ypred += z.prob * np.dot(x.T, z.exp)

    gx.append(GeneExpressionArray(geneid = gene.ensembl_id, expr_arr = ypred))


# Write output
mfunc.write_gcta_phenotype(outfileprefix, samplenames, gx)

2018-03-24 17:08:13 - Reading pickled genotype
2018-03-24 17:08:17 - Done reading
2018-03-24 17:08:18 - Predicting for gxpred-bslmm_0.100_0.000_0.100_0.100_0.005


IndexError: index 450 is out of bounds for axis 0 with size 450

# Assessment 

In [144]:
import sys
sys.path.append("../")
import os
from iotools import readgtf
from iotools.readrpkm import ReadRPKM
from iotools.readPrediction import ReadPrediction
from scipy.stats import pearsonr
from helper_functions import load_target_genes, write_r2_dataframe, get_common_elements, new_write_predicted_r2, pearson_corr_rowwise
import math
import pickle
from utils.printstamp import printStamp

from config import *

In [145]:

# Load the gene expression of the reference dataset ("cardiogenics" format)
# and expose its matrix, donor ids and gene names under reference_* names.
reference_rpkm = ReadRPKM(reference_expdatapath, "cardiogenics")
reference_expression, reference_expr_donors, reference_gene_names = (
    reference_rpkm.expression,
    reference_rpkm.donor_ids,
    reference_rpkm.gene_names,
)

In [146]:
# Targets are the selected_gene_ids with high R² values; only those on the
# selected chrom are returned by the helper.
selected_gene_ids = load_target_genes(genelistfile, gene_info, chrom)
# Keep only the part of each id before the first "." (drops the version suffix).
target_genelist = [gene_id.partition(".")[0] for gene_id in selected_gene_ids]
target_donors = reference_expr_donors

Read 641 genes with high r2 values

Found 57 genes in CHR 12


In [74]:

### Predixcan assessment ###

# Load the PrediXcan predictions; the parsed object is cached as a pickle at
# `predixcan_pickfile` and reused on later runs.
if not os.path.exists(predixcan_pickfile):
    # pxpred_predpath = os.path.join(home, "predictions/cardiogenics/predixcan_predictions_klinikum")
    predixcanpred = ReadPrediction(pxpred_predpath, reference_samplepath, "predixcan", trim=True)

    printStamp("Dumping Predixcan prediction")
    with open(predixcan_pickfile, 'wb') as pickle_out:
        pickle.dump(predixcanpred, pickle_out, pickle.HIGHEST_PROTOCOL)
else:
    printStamp("Reading pickled Predixcan prediction")
    # NOTE: pickle.load can execute arbitrary code; `predixcan_pickfile` must
    # only ever point at a cache written by this notebook.
    # The handle is deliberately not called `input`, which would rebind the
    # builtin for the rest of the kernel session.
    with open(predixcan_pickfile, 'rb') as pickle_in:
        predixcanpred = pickle.load(pickle_in)

# filter predixcan predictions with only those in gxpred
predixcanpred.sort_by_gene(target_genelist)
predixcanpred.sort_by_samples(target_donors, use_prev=True)

# Filter and sort the reference expression to the genes and samples kept
# in the prediction object.
sorted_expr_donors, ix_samples = get_common_elements(reference_expr_donors, predixcanpred.sorted_samples)
sorted_gene_names, ix_genes = get_common_elements(reference_gene_names, predixcanpred.sorted_gene_names)
sorted_expression = reference_expression[ix_genes,:][:, ix_samples].T

# Calculate Pearson correlation between predicted and reference expression
predixcan_r = pearson_corr_rowwise(predixcanpred.sorted_expr_mat.T, sorted_expression.T)



2018-03-07 11:26:27 - Reading pickled Predixcan prediction
Genes found: 14 of 15
Samples found: 744 of 849


In [76]:
%load_ext autoreload
%autoreload 2

### GXpred assessment ###

# Only the first parameter set is assessed (the loop over all sets is disabled).
#for p in parameters:
p = parameters[0]
prior, params, hyperpriors, hyperparams, run_description = p[0], p[1], p[2], p[3], p[4]
# <prior>_<five initial params, 3 decimals each>
model_dir = prior + ("_{:.3f}" * 5).format(*params[:5])
print(prior, params)

modelpath = os.path.join("./z{}".format(zmax), run_description, model_dir)
print(modelpath)

# The predictions are read from the model directory itself.
gxpred_predpath = os.path.join(modelpath)
gxpred = ReadPrediction(gxpred_predpath, reference_samplepath, "gxpred", trim=True)

# Restrict the predicted values to the target genes and donors
gxpred.sort_by_gene(target_genelist)
gxpred.sort_by_samples(target_donors, use_prev=True)

# Bring the reference expression into the same gene / sample selection
sorted_expr_donors, ix_samples = get_common_elements(reference_expr_donors, gxpred.sorted_samples)
sorted_gene_names, ix_genes = get_common_elements(reference_gene_names, gxpred.sorted_gene_names)
reference_subset = reference_expression[ix_genes, :]
sorted_expression = reference_subset[:, ix_samples].T

# Pearson correlation between predicted and reference expression
gxpred_r = pearson_corr_rowwise(gxpred.sorted_expr_mat.T, sorted_expression.T)

print(gxpred.sorted_gene_names)
print(gxpred_r**2)
# `predixcan_r` comes from the Predixcan assessment cell, which must have run first
print(predixcan_r**2)

# Store the results of both methods next to the model
write_r2_dataframe(modelpath, chrom, "predixcan", predixcan_r, predixcanpred)
write_r2_dataframe(modelpath, chrom, prior, gxpred_r, gxpred)

# Write to table with predictions for given genes
# predtabledir = os.path.join(home, "gxpred","devtools", "all_predictions.txt")
# new_write_predicted_r2(predtabledir, prior, params, gxpred_r, predixcan_r, gxpred.sorted_gene_names)

gxpred-bslmm [0.005, 0.0, 0.1, 0.001, 0.005]
No prediction found for CHR 1
No prediction found for CHR 2
No prediction found for CHR 3
No prediction found for CHR 4
No prediction found for CHR 5
No prediction found for CHR 6
No prediction found for CHR 7
No prediction found for CHR 8
No prediction found for CHR 9
No prediction found for CHR 10
No prediction found for CHR 11
Loaded 13 genes in CHR 12
No prediction found for CHR 13
No prediction found for CHR 14
No prediction found for CHR 15
No prediction found for CHR 16
No prediction found for CHR 17
No prediction found for CHR 18
No prediction found for CHR 19
No prediction found for CHR 20
No prediction found for CHR 21
No prediction found for CHR 22
Genes found: 13 of 15
Samples found: 744 of 849


# Get R² on GTEx for trans eQTL

In [137]:
# Hardcode parameters to read existing learned model
prior = "gxpred-bslmm"
set_init_params = [[0.1, 0.0, 0.1, 0.1, 0.005]]
run_description = "bound_mu_NoPriors"
hyperpriors = [None] * 5
hyperparams = {"lambda": 0.05, "Galpha": 2, "Gbeta": 0.5}

parameters = []
for init_params in set_init_params:
    parameters.append([prior, init_params, hyperpriors, hyperparams, run_description])

# Gene Expression: here the GTEx expression itself is the reference
rpkm = ReadRPKM(gtex_rpkmpath, "gtex")
reference_expression, reference_expr_donors, reference_gene_names = (
    rpkm.expression,
    rpkm.donor_ids,
    rpkm.gene_names,
)

# Targets are the selected_gene_ids with high R² values; only those on the
# selected chrom are returned by the helper. Unlike the cardiogenics
# assessment, the gene ids are used as they are (version suffix kept).
selected_gene_ids = load_target_genes(genelistfile, gene_info, chrom)
# target_genelist = [g.split(".")[0] for g in selected_gene_ids]
target_genelist = selected_gene_ids
target_donors = reference_expr_donors


### GXpred assessment ###

# Only the first parameter set is assessed.
#for p in parameters:
p = parameters[0]
prior, params, hyperpriors, hyperparams, run_description = p[0], p[1], p[2], p[3], p[4]
# <prior>_<five initial params, 3 decimals each>
model_dir = prior + ("_{:.3f}" * 5).format(*params[:5])
print(prior, params)

modelpath = os.path.join("./z{}".format(zmax), run_description, model_dir)

# Read the GTEx predictions (files prefixed "gtex_pred_chr") from the model directory
gxpred_predpath = os.path.join(modelpath)
gxpred = ReadPrediction(gxpred_predpath, gtex_samplepath, "gxpred", trim=False, prefix="gtex_pred_chr")

# Restrict the predicted values to the target genes and donors
gxpred.sort_by_gene(target_genelist)
gxpred.sort_by_samples(target_donors, use_prev=True)

# Bring the reference expression into the same gene / sample selection
sorted_expr_donors, ix_samples = get_common_elements(reference_expr_donors, gxpred.sorted_samples)
sorted_gene_names, ix_genes = get_common_elements(reference_gene_names, gxpred.sorted_gene_names)
reference_subset = reference_expression[ix_genes, :]
sorted_expression = reference_subset[:, ix_samples].T

# Pearson correlation between predicted and reference expression
gxpred_r = pearson_corr_rowwise(gxpred.sorted_expr_mat.T, sorted_expression.T)

write_r2_dataframe(modelpath, chrom, prior, gxpred_r, gxpred)


Read 641 genes with high r2 values

Found 57 genes in CHR 12
gxpred-bslmm [0.1, 0.0, 0.1, 0.1, 0.005]
No prediction found for CHR 1
No prediction found for CHR 2
No prediction found for CHR 3
No prediction found for CHR 4
No prediction found for CHR 5
No prediction found for CHR 6
No prediction found for CHR 7
No prediction found for CHR 8
No prediction found for CHR 9
No prediction found for CHR 10
No prediction found for CHR 11
Loaded 57 genes in CHR 12
2
No prediction found for CHR 13
No prediction found for CHR 14
No prediction found for CHR 15
No prediction found for CHR 16
No prediction found for CHR 17
No prediction found for CHR 18
No prediction found for CHR 19
No prediction found for CHR 20
No prediction found for CHR 21
No prediction found for CHR 22
Genes found: 57 of 57
Samples found: 338 of 338
(450, 57)
