In [20]:
%load_ext autoreload
%autoreload 2



import sys, os
sys.path.append("../")

import numpy as np
import math
import pickle
from config_annots import *
from iotools.readOxford import ReadOxford
from iotools.readrpkm import ReadRPKM
from iotools.io_model import WriteModel
from inference.linreg_association import LinRegAssociation
from inference.empirical_bayes import EmpiricalBayes
from utils import hyperparameters
from inference import logmarglik
from iotools import readgtf
from utils import gtutils
from utils import mfunc
from utils.containers import ZstateInfo
from utils.printstamp import printStamp
from helper_functions import load_target_genes, write_params
from sklearn.preprocessing import scale

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Whole-genome gene annotation from the GTF file.
# trim=False: use the complete gene name from the GTF, i.e. do not trim the
# version suffix.
# NOTE(review): gtfpath is presumably supplied by the config_annots star import — confirm.
gene_info = readgtf.gencode_v12(
    gtfpath,
    trim=False,
)

In [3]:
# Target genes for this run, restricted to `chrom` by load_target_genes.
# Alternative list used during development:
#   genelistfile = "genes4testing_high_and_low_r2_0.001"
genelistfile = "genes4testing_highr2"
selected_gene_ids = load_target_genes(
    genelistfile,
    gene_info,
    chrom,
)
print(selected_gene_ids)


Read 70 genes with high r2 values

Found 7 genes in CHR 12
['ENSG00000171860.4', 'ENSG00000064115.6', 'ENSG00000139610.1', 'ENSG00000090382.2', 'ENSG00000127337.2', 'ENSG00000135643.4', 'ENSG00000184752.8']


In [4]:

# Load the learning-set genotype for `chrom`, using a pickle of the ReadOxford
# object as an on-disk cache so the Oxford files are only parsed once.
# NOTE(review): learn_pickfile, gtex_gtpath, gtex_samplepath, chrom and
# learning_dataset are presumably supplied by the config_annots star import — confirm.
if not os.path.exists(learn_pickfile):
    # Cache miss: read the genotype from the Oxford files and store it.
    oxf = ReadOxford(gtex_gtpath, gtex_samplepath, chrom, learning_dataset)

    printStamp("Dumping CHR {:d} genotype".format(chrom))
    with open(learn_pickfile, 'wb') as pickle_out:
        pickle.dump(oxf, pickle_out, pickle.HIGHEST_PROTOCOL)
else:
    # Cache hit. SECURITY: pickle.load executes arbitrary code from the file;
    # only point learn_pickfile at caches written by this notebook.
    printStamp("Reading pickled genotype")
    with open(learn_pickfile, 'rb') as pickle_in:  # do not name this `input`: it would shadow the builtin
        oxf = pickle.load(pickle_in)

    printStamp("Done reading")

# Extract the fields once, so both branches define exactly the same variables
# (previously `nsample` was only set on a cache hit).
genotype = np.array(oxf.dosage)
samplenames = oxf.samplenames
snps = oxf.snps_info
nsample = len(samplenames)

# --- Quality control -------------------------------------------------------
# Filter SNPs with gtutils.remove_low_maf (threshold 0.1), then normalize the
# remaining genotype matrix.
f_snps, f_genotype = gtutils.remove_low_maf(snps, genotype, 0.1)
gt = gtutils.normalize(f_snps, f_genotype)

# --- Gene expression -------------------------------------------------------
rpkm = ReadRPKM(gtex_rpkmpath, "gtex")
expression, expr_donors, gene_names = rpkm.expression, rpkm.donor_ids, rpkm.gene_names

# --- Selection -------------------------------------------------------------
printStamp("Selection of samples")
vcfmask, exprmask = mfunc.select_donors(samplenames, expr_donors)
genes, indices = mfunc.select_genes(gene_info, gene_names)

# Keep (expression row index, gene) pairs for the target genes on `chrom`.
gene_training_list = []
for i, gene in enumerate(genes):
    if gene.ensembl_id not in selected_gene_ids or gene.chrom != chrom:
        continue
    gene_training_list.append((indices[i], gene))

2018-03-14 20:31:07 - Reading pickled genotype
2018-03-14 20:31:10 - Done reading
2018-03-14 20:31:17 - Selection of samples


In [67]:
# Override the first parameter set. Fields, in order (as unpacked by the
# cells below): prior name, initial parameter values, hyperpriors,
# hyperparameters, run description.
parameters[0] = [
    'gxpred-bslmm',
    [0.1, 0.0, 0.1, 0.010, 0.005],
    [None] * 5,
    None,
    'bound_mu_NoPriors',
]

In [68]:

# Unpack the active parameter set into the names used by the training cell.
p = parameters[0]
prior, params = p[0], p[1]

print(params)

# p[2] is not read here; hyperpriors is set to an empty list instead.
hyperpriors = []
hyperparams, run_description = p[3], p[4]



[0.1, 0.0, 0.1, 0.01, 0.005]


In [7]:
%load_ext wurlitzer
from iotools import snp_annotator

model_dir = prior+"_{:.3f}_{:.3f}_{:.3f}_{:.3f}_{:.3f}".format(params[0], params[1], params[2], params[3], params[4])
modelpath = os.path.join("./z"+str(zmax), run_description, model_dir)

write_params(modelpath, p)

model = WriteModel(modelpath, chrom)

# for i in range(0,len(gene_training_list)):
for i in range(0,1):

    k, gene = gene_training_list[i]

    print(k, gene)

    # select only the cis-SNPs
    cismask = mfunc.select_snps(gene, f_snps, window)
    if len(cismask) > 0:
        target = expression[k, exprmask]
        target = scale(target, with_mean=True, with_std=True)
        predictor = gt[cismask][:, vcfmask]
        snpmask = cismask

        # if number of cis SNPs > threshold, use p-value cut-off
        min_snps = 20
        if len(cismask) > min_snps:
            assoc_model = LinRegAssociation(predictor, target, min_snps, pval_cutoff)
            pvalmask = cismask[assoc_model.selected_variables]
            print ("Found {:d} SNPs, reduced to {:d} SNPs (max p-value {:g}) for {:s}".format(len(cismask), len(pvalmask), assoc_model.ordered_pvals[len(pvalmask) - 1], gene.name))
            predictor = gt[pvalmask][:, vcfmask]
            snpmask = pvalmask
        else:
            print ("Found {:d} SNPs for {:s}".format(len(cismask), gene.name))

            
        # read the features
        # TO-DO: call another module for getting the features
        # for now, it contains only a list of 1's
        feature1 = np.ones((predictor.shape[0], 1))
        
        selected_snps = [f_snps[x] for x in snpmask]
        feature2 = snp_annotator.get_dummy_dist_feature(selected_snps, gene, window)
        # feature2 = (feature2 - np.mean(feature2))/np.std(feature2)
        
#         np.random.shuffle(feature2)
      
        
#         feature3 = np.random.random_integers(0,1, len(selected_snps))[np.newaxis].T
        feature3 = np.random.random(len(selected_snps))[np.newaxis].T
    
        feature4 = np.random.exponential(scale=1, size=len(selected_snps))[np.newaxis].T
        
        
        features = np.concatenate((feature1,feature2), axis=1)
        
        nfeat = features.shape[1]
        print("Loaded {:d} features".format(nfeat))

        init_params = np.zeros(nfeat + 4)
        init_params[0] = - np.log((1 / params[0]) - 1)
        for i in range(1, nfeat):
            init_params[i] = 0
            # init_params[i] = params[0]
        init_params[nfeat + 0] = params[1] # mu
        init_params[nfeat + 1] = params[2] # sigma
        init_params[nfeat + 2] = params[3] # sigmabg
        init_params[nfeat + 3] = 1 / params[4] / params[4] # tau

        print(init_params)          
            
        # perform the analysis

        print ("Starting first optimization ==============")
        emp_bayes = EmpiricalBayes(predictor, target, features, 1, init_params, method="new")
        emp_bayes.fit()
        if zmax > 1:
            if emp_bayes.success:
                res = emp_bayes.params
                print ("Starting second optimization from previous results ================")
                # Python Error: C library could not compute z-components. Check C errors above.
            else:
                res = init_params
                print ("Starting second optimization from initial parameters ================")
            emp_bayes = EmpiricalBayes(predictor, target, features, cmax, res, method="new")
            emp_bayes.fit()

        if emp_bayes.success:
            res = emp_bayes.params
#             res[4] = 1 / np.sqrt(res[4])
#             print("PI: \t",res[0])
#             print("mu: \t",res[1])
#             print("sigma: \t",res[2])
#             print("sigmabg: \t",res[3])
#             print("tau: \t",res[4])

            model_snps = [f_snps[x] for x in snpmask]
            model_zstates = list()
            scaledparams = hyperparameters.scale(emp_bayes.params)
            zprob, zexp = logmarglik.model_exp(scaledparams, predictor, target, features, emp_bayes.zstates)
            for j, z in enumerate(emp_bayes.zstates):
                this_zstate = ZstateInfo(state = z,
                                         prob  = zprob[j],
                                         exp   = list(zexp[j, :]) )
                model_zstates.append(this_zstate)
            # print(model_snps)
            # for i,m in enumerate(model_zstates):
            #     print("z-state: ",i," Prob:", m.prob)
            model.write_success_gene(gene, model_snps, model_zstates, res)
        else:
            model.write_failed_gene(gene, res) # np.zeros_like(init_params)
            print ("Failed optimization")

13866 GeneInfo(name='C3AR1', ensembl_id='ENSG00000171860.4', chrom=12, start=8210898, end=8219066)
Found 3603 SNPs, reduced to 74 SNPs (max p-value 0.000975978) for C3AR1
Loaded 2 features
[-2.19722458e+00  0.00000000e+00  0.00000000e+00  1.00000000e-01
  1.00000000e-02  4.00000000e+04]
called from prob_comps
Before czcomps: [-2.19722458  0.        ] 0.0 0.10000000000000002 0.010000000000000004 39999.99999999997
0.0 0.98
zstates.py: Working with 2 leading zstates.
called from func_grad
Before czcomps: [-2.19722458  0.        ] 0.0 0.10000000000000002 0.010000000000000004 39999.99999999997
called from func_grad
Before czcomps: [-2.19722598e+00 -1.33155983e-06] 0.0 0.10000192592358798 0.010076390132891976 5413.724845791061
called from prob_comps
Before czcomps: [-2.19722598e+00 -1.33155983e-06] 0.0 0.10000192592358798 0.010076390132891976 5413.724845791061
0.0 0.98
zstates.py: Working with 2 leading zstates.
called from func_grad
Before czcomps: [-2.19722782e+00 -3.08613747e-06] 0.0 0.10

6.307861064037395e-29 0.98
zstates.py: Working with 5 leading zstates.
called from func_grad
Before czcomps: [-3.23696704 -0.95721384] 0.0 0.8432638989017303 0.01589809950155548 2.3295419962996373
called from prob_comps
Before czcomps: [-3.23696704 -0.95721384] 0.0 0.8432638989017303 0.01589809950155548 2.3295419962996373
8.844827499152033e-29 0.98
zstates.py: Working with 5 leading zstates.
called from func_grad
Before czcomps: [-3.29227955 -1.00510641] 0.0 0.8002935318947753 0.01591233006672515 2.3340847277399863
called from prob_comps
Before czcomps: [-3.29227955 -1.00510641] 0.0 0.8002935318947753 0.01591233006672515 2.3340847277399863
9.802759653599849e-29 0.98
zstates.py: Working with 5 leading zstates.
called from func_grad
Before czcomps: [-3.31609013 -1.02246536] 0.0 0.7433711127768056 0.015885016607460507 2.328467344139442
called from prob_comps
Before czcomps: [-3.31609013 -1.02246536] 0.0 0.7433711127768056 0.015885016607460507 2.328467344139442
9.554548887772152e-29 0.98
z

5.149596213612865e-28 0.98
zstates.py: Working with 3 leading zstates.
      fun: 344.81271448497097
 hess_inv: <6x6 LbfgsInvHessProduct with dtype=float64>
      jac: array([ 0.09153588,  0.09026455, -0.01298836,  0.04184904,  0.05923147,
       -1.92146256])
  message: b'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'
     nfev: 59
      nit: 50
   status: 0
  success: True
        x: array([-6.00105237e+02,  5.98920596e+02,  0.00000000e+00, -2.86622362e-01,
       -4.07845092e+00, -4.27507641e-01])
called from model_exp
Before czcomps: [-600.10523735  598.92059625] 0.0 0.7507952043185241 0.016933676881760586 2.3514103151515933


# Prediction

In [8]:

import sys
sys.path.append("../")
import os
import pickle
from utils.printstamp import printStamp
from iotools.io_model import ReadModel
from utils.containers import GeneExpressionArray
from utils import gtutils
from utils import mfunc
import numpy as np
from config_annots import *

In [9]:

# Load the prediction-set genotype for `chrom`. Reading the Oxford files is
# quite slow, so the ReadOxford object is cached as a pickle and reused.
# NOTE(review): p_pickfile, p_gtpath, p_samplepath, chrom and predicting_dataset
# are presumably supplied by the config_annots star import — confirm.
if not os.path.exists(p_pickfile):
    # Cache miss: read the genotype from the Oxford files and store it.
    p_oxf = ReadOxford(p_gtpath, p_samplepath, chrom, predicting_dataset)

    printStamp("Dumping CHR {:d} genotype".format(chrom))
    with open(p_pickfile, 'wb') as pickle_out:
        pickle.dump(p_oxf, pickle_out, pickle.HIGHEST_PROTOCOL)
else:
    # Cache hit. SECURITY: pickle.load executes arbitrary code from the file;
    # only point p_pickfile at caches written by this notebook.
    printStamp("Reading pickled genotype")
    with open(p_pickfile, 'rb') as pickle_in:  # do not name this `input`: it would shadow the builtin
        p_oxf = pickle.load(pickle_in)

    printStamp("Done reading")

# Extract the fields once; both branches leave the object in p_oxf.
p_genotype = np.array(p_oxf.dosage)
p_samplenames = p_oxf.samplenames
p_snps = p_oxf.snps_info
p_nsample = len(p_samplenames)


2018-03-14 20:33:59 - Reading pickled genotype
2018-03-14 20:34:06 - Done reading


In [10]:

# Use parameters from config.py. Only the first set is used here; to run them
# all, loop with `for p in parameters:`.
p = parameters[0]
prior, params, hyperpriors, hyperparams, run_description = p[0], p[1], p[2], p[3], p[4]

# Directory name: "<prior>_<five params, 3 decimals each>".
model_dir = "_".join([prior] + ["{:.3f}".format(params[n]) for n in range(5)])
modelpath = os.path.join("./z{}_test".format(zmax), run_description, model_dir)
outfileprefix = os.path.join(modelpath, "pred_chr{}".format(chrom))

printStamp("Predicting for " + model_dir)

# Predict expression for every gene stored in the model.
p_model = ReadModel(modelpath, chrom)
p_genes = p_model.genes
gx = []
for gene in p_genes:

    # read_gene() populates p_model.snps / p_model.zstates for this gene
    p_model.read_gene(gene)
    p_model_snps, p_model_zstates = p_model.snps, p_model.zstates

    x = gtutils.normalize(
        p_model_snps,
        gtutils.prediction_variables(p_snps, p_model_snps, p_genotype),
    )

    # Prediction = sum over z-states of prob * (x^T . exp)
    ypred = np.zeros(p_nsample)
    for z in p_model_zstates:
        ypred += z.prob * np.dot(x.T, z.exp)

    gx.append(GeneExpressionArray(geneid = gene.ensembl_id, expr_arr = ypred))


# Write output
mfunc.write_gcta_phenotype(outfileprefix, p_samplenames, gx)

2018-03-14 20:34:22 - Predicting for gxpred-bslmm_0.100_0.000_0.100_0.010_0.005


# Assessment 

In [50]:
import sys
sys.path.append("../")
import os
from iotools import readgtf
from iotools.readrpkm import ReadRPKM
from iotools.readPrediction import ReadPrediction
from scipy.stats import pearsonr
from helper_functions import load_target_genes, write_r2_dataframe, get_common_elements, new_write_predicted_r2, pearson_corr_rowwise
import math
import pickle
from utils.printstamp import printStamp

from config_annots import *

In [51]:

# Gene expression of the reference dataset (read in "cardiogenics" mode).
reference_rpkm = ReadRPKM(reference_expdatapath, "cardiogenics")
reference_expression, reference_expr_donors, reference_gene_names = (
    reference_rpkm.expression,
    reference_rpkm.donor_ids,
    reference_rpkm.gene_names,
)

In [52]:
# Targets are the selected_gene_ids with high R² values; only those on the
# selected chrom will appear.
genelistfile = "genes4testing_highr2"
selected_gene_ids = load_target_genes(genelistfile, gene_info, chrom)
# Drop everything from the first "." onwards (the version suffix).
target_genelist = [gene_id.split(".", 1)[0] for gene_id in selected_gene_ids]
target_donors = reference_expr_donors

Read 70 genes with high r2 values

Found 7 genes in CHR 12


In [57]:

### Predixcan assessment ###

# Load the PrediXcan predictions, caching the ReadPrediction object as a pickle.
if not os.path.exists(predixcan_pickfile):
    # pxpred_predpath = os.path.join(home, "predictions/cardiogenics/predixcan_predictions_klinikum")
    predixcanpred = ReadPrediction(pxpred_predpath, reference_samplepath, "predixcan", trim=True)

    printStamp("Dumping Predixcan prediction")
    with open(predixcan_pickfile, 'wb') as pickle_out:
        pickle.dump(predixcanpred, pickle_out, pickle.HIGHEST_PROTOCOL)
else:
    # SECURITY: pickle.load executes arbitrary code from the file; only point
    # predixcan_pickfile at caches written by this notebook.
    printStamp("Reading pickled Predixcan prediction")
    with open(predixcan_pickfile, 'rb') as pickle_in:  # do not name this `input`: it would shadow the builtin
        predixcanpred = pickle.load(pickle_in)

# Restrict the PrediXcan predictions to the target genes and donors.
predixcanpred.sort_by_gene(target_genelist)
predixcanpred.sort_by_samples(target_donors, use_prev=True)

# Select the reference-expression entries shared with the predictions, then
# index rows by ix_genes and columns by ix_samples.
sorted_expr_donors, ix_samples = get_common_elements(reference_expr_donors, predixcanpred.sorted_samples)
sorted_gene_names, ix_genes = get_common_elements(reference_gene_names, predixcanpred.sorted_gene_names)
sorted_expression = reference_expression[ix_genes,:][:, ix_samples].T

# Row-wise Pearson correlation between predicted and reference expression.
predixcan_r = pearson_corr_rowwise(predixcanpred.sorted_expr_mat.T, sorted_expression.T)



2018-03-19 15:16:10 - Reading pickled Predixcan prediction
Genes found: 7 of 7
Samples found: 744 of 849


In [79]:
%load_ext autoreload
%autoreload 2

### GXpred assessment ###

#for p in parameters:
p = parameters[0]
prior = p[0]
params = p[1]
hyperpriors = p[2]
hyperparams = p[3]
run_description = p[4]
model_dir = prior+"_{:.3f}_{:.3f}_{:.3f}_{:.3f}_{:.3f}".format(params[0], params[1], params[2], params[3], params[4])
print(prior, params)

modelpath = os.path.join("./z"+str(zmax)+ "_test", run_description, model_dir)
print(modelpath)


gxpred_predpath = os.path.join(modelpath)
gxpred = ReadPrediction(gxpred_predpath, reference_samplepath, "gxpred", trim=True)

# filter gxpred predicted values
gxpred.sort_by_gene(target_genelist)
gxpred.sort_by_samples(target_donors, use_prev=True)


# Filter and sort the reference expression values
# Cardiogenics variables
# expression
# expr_donors
# gene_names

sorted_expr_donors, ix_samples = get_common_elements(reference_expr_donors, gxpred.sorted_samples)
sorted_gene_names, ix_genes = get_common_elements(reference_gene_names, gxpred.sorted_gene_names)
sorted_expression = reference_expression[ix_genes,:][:, ix_samples].T

# Calculate Pearson correlation
gxpred_r = pearson_corr_rowwise(gxpred.sorted_expr_mat.T, sorted_expression.T)

print(gxpred.sorted_gene_names)
print(gxpred_r**2)
print(predixcan_r**2)


# Write to table with predictions for given genes
# predtabledir = os.path.join(home, "gxpred","devtools", "all_predictions.txt")
# new_write_predicted_r2(predtabledir, prior, params, gxpred_r, predixcan_r, gxpred.sorted_gene_names)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
gxpred-bslmm [0.1, 0.0, 0.1, 0.01, 0.005]
./z1_test/bound_mu_NoPriors/gxpred-bslmm_0.100_0.000_0.100_0.010_0.005
No prediction found for CHR 1
No prediction found for CHR 2
No prediction found for CHR 3
No prediction found for CHR 4
No prediction found for CHR 5
No prediction found for CHR 6
No prediction found for CHR 7
No prediction found for CHR 8
No prediction found for CHR 9
No prediction found for CHR 10
No prediction found for CHR 11
Loaded 1 genes in CHR 12
1
No prediction found for CHR 13
No prediction found for CHR 14
No prediction found for CHR 15
No prediction found for CHR 16
No prediction found for CHR 17
No prediction found for CHR 18
No prediction found for CHR 19
No prediction found for CHR 20
No prediction found for CHR 21
No prediction found for CHR 22
Genes found: 1 of 7
Samples found: 744 of 849
(786, 1)
['ENSG00000171860']
[0.60532227]
[0.60396525 0.28503124 0.63198656 0.628724

In [75]:

# Call write_r2_dataframe for each method in turn: PrediXcan first, then the
# gxpred prior assessed above.
for method_label, method_r, method_pred in (
    ("predixcan", predixcan_r, predixcanpred),
    (prior, gxpred_r, gxpred),
):
    write_r2_dataframe(modelpath, chrom, method_label, method_r, method_pred)

genes_df:                       Ensembl_ID Gene_Name  Success     Gamma0    Gamma1   Mu  \
ENSG00000171860  ENSG00000171860     C3AR1     True -600.10524  598.9206  0.0   

                  Sigma  Sigma_bg  Sigma_tau  
ENSG00000171860  0.7508   0.01693    2.35141  
new_df:                  predixcan       Ensembl_ID
ENSG00000171860   0.603965  ENSG00000171860
ENSG00000064115   0.285031  ENSG00000064115
ENSG00000139610   0.631987  ENSG00000139610
ENSG00000090382   0.628725  ENSG00000090382
ENSG00000127337   0.509881  ENSG00000127337
ENSG00000135643   0.214941  ENSG00000135643
ENSG00000184752   0.196200  ENSG00000184752
(1, 9)
(7, 2)
