In [1]:
import os
import sys
# Make the gxpred code base importable by putting its normalized absolute
# path on sys.path, unless it is already listed there.
module_path = os.path.abspath('/mnt/storage/saikat/work/gwas-eQTL/codebase/gxpred/')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [2]:
import argparse
import numpy as np

from iotools.readvcf import ReadVCF
from iotools.readrpkm import ReadRPKM
from iotools import readgtf
from utils import gtutils
from utils import mfunc
from utils.containers import GeneExpressionArray

In [3]:
# Input/output locations and analysis parameters used by the cells below.
# Genotypes (VCF) used to train the per-gene models.
train_vcfpath = "data/input/chr21_correlated_genes.vcf.gz"
# Genotypes (VCF) of the cohort for which expression is predicted.
pred_vcfpath = "data/input/GTEx_450Indiv_chr21_genot_imput_info04_maf01_HWEp1E6_ConstrVarIDs_donorIDs_46_49Mb.vcf.gz"
#pred_vcfpath = "data/input/chr21_correlated_genes.vcf.gz"
# Expression table for the training donors, read with ReadRPKM.
rpkmpath = "data/input/correlated_genes.txt"
# Gene annotation, read with readgtf.gencode_v12.
gtfpath = "data/input/correlated_genes.gtf.gz"
# Output path handed to mfunc.write_gcta_phenotype.
outpath = "elastic_net_gxpred"
# Observed expression of the prediction cohort, used to score the predictions.
obsv_expr_path = "data/input/gtex_wholeblood_normalized_expression_correlated_genes.txt"
# Chromosome passed as include_chrom to the annotation reader.
chrom = 21
# Window passed to mfunc.select_snps; presumably base pairs around each gene — TODO confirm.
window = 1000000

In [4]:
# Genotype
# Training genotypes. NOTE(review): mode="GT" presumably makes the reader derive
# dosages from the GT field — confirm in ReadVCF.
vcf = ReadVCF(train_vcfpath, mode="GT")
genotype = vcf.dosage
vcf_donors = vcf.donor_ids
snps = vcf.snpinfo
# Alias of the SNP list as read; identical to `snps` while the filters below stay commented out.
snps_nofilter = snps

# Quality control
# MAF filtering and normalization are disabled, so `gt` is the dosage matrix exactly as read.
#snps, genotype = gtutils.remove_low_maf(snps, genotype, 0.1)
#gt = gtutils.normalize(snps, genotype)
gt = genotype

# Annotation
# Gene annotation restricted to the configured chromosome.
gene_info = readgtf.gencode_v12(gtfpath, include_chrom = chrom)

# Gene Expression
# Training expression together with its donor ids and gene names.
rpkm = ReadRPKM(rpkmpath)
expression = rpkm.expression
expr_donors = rpkm.donor_ids
gene_names = rpkm.gene_names

# Selection
# vcfmask / exprmask are later used to index the donor axis of the genotype and
# expression arrays; `indices` is later used to index the gene axis of `expression`.
vcfmask, exprmask = mfunc.select_donors(vcf_donors, expr_donors)
genes, indices = mfunc.select_genes(gene_info, gene_names)

# Prediction genotype
# NOTE(review): mode="DS" (vs. "GT" above) presumably reads dosages directly — confirm in ReadVCF.
pred_vcf = ReadVCF(pred_vcfpath, mode="DS")
pred_genotype = pred_vcf.dosage
pred_vcf_donors = pred_vcf.donor_ids
pred_snps = pred_vcf.snpinfo
pred_nsample = len(pred_vcf_donors)

In [5]:
# Load the observed expression of the prediction cohort and align it, donor by
# donor, with the prediction genotypes.
# The file is whitespace-separated: a header line of sample labels followed by
# one line per gene. The first field of every line is skipped.
pred_expr = list()
with open(obsv_expr_path, 'r') as mfile:
    header = mfile.readline().split()[1:]
    for mline in mfile:
        row = mline.split()
        pred_expr.append(np.array([float(x) for x in row[1:]]))

# Reduce each sample label to its first two dash-separated fields so that it
# can be matched against the donor ids of the prediction VCF.
pred_expr_donors = ['-'.join(x.split('-')[:2]) for x in header]
pred_expr = np.vstack(pred_expr)
pred_vcfmask, pred_exprmask = mfunc.select_donors(pred_vcf_donors, pred_expr_donors)
# Slice from the reader's full dosage matrix rather than from `pred_genotype`
# itself: the mask indexes the full donor list, so re-running this cell must
# not subset an already subset matrix.
pred_genotype = pred_vcf.dosage[:, pred_vcfmask]
pred_expr = pred_expr[:, pred_exprmask]

In [6]:
from sklearn.linear_model import ElasticNetCV
from sklearn.preprocessing import scale
from scipy import stats

In [None]:
# Display the genes returned by mfunc.select_genes.
genes

In [11]:
# Fit one elastic-net model per gene on the training cohort, report its fit,
# predict expression for the prediction cohort and collect the predictions.
# `regr`, `X_train` and `Y_train` keep the values of the last fitted gene and
# are read by later cells.
gxpred = list()
for i, gene in enumerate(genes):
    # Row of this gene in the training expression matrix.
    k = indices[i]
    # SNPs selected for this gene; used below as indices into `gt` and `snps`.
    cismask = mfunc.select_snps(gene, snps, window)
    if len(cismask) > 0:
        # Standardized training expression of the donors shared with the genotypes.
        Y_train = expression[k, exprmask]
        Y_train = scale(Y_train, with_mean=True, with_std=True)
        # Rows are the selected SNPs, columns the shared donors (transposed for sklearn below).
        X_train = gt[cismask][:, vcfmask]
        train_snps = [snps[j] for j in cismask]
        # NOTE(review): presumably builds the prediction-cohort genotypes for the same SNPs,
        # in the same order as train_snps — confirm in gtutils.prediction_variables.
        X_pred = gtutils.prediction_variables(pred_snps, train_snps, pred_genotype)
        # NOTE(review): training expression is indexed with indices[i] but observed
        # expression with i; this assumes the rows of obsv_expr_path follow the order
        # of `genes` — confirm.
        Y_gtex = pred_expr[i]
        #X_pred = gtutils.normalize(train_snps, X_pred)
        
        # Dump the four matrices of this gene as CSV files in the working directory.
        csvfilename = "{:s}_gt.csv".format(gene.name)
        np.savetxt(csvfilename, X_train, delimiter=',')
        csvfilename = "{:s}_gx.csv".format(gene.name)
        np.savetxt(csvfilename, Y_train, delimiter=',')
        csvfilename = "{:s}_gtex_gt.csv".format(gene.name)
        np.savetxt(csvfilename, X_pred, delimiter=',')
        csvfilename = "{:s}_gtex_gx.csv".format(gene.name)
        np.savetxt(csvfilename, Y_gtex, delimiter=',')
        
        # Elastic net with equal L1/L2 mixing; the penalty strength is chosen by 10-fold CV.
        regr = ElasticNetCV(l1_ratio=0.5, cv=10, random_state=0)
        regr.fit(X_train.T, Y_train)
        # In-sample fit: predictions on the data the model was trained on.
        Y_pred_samedata = regr.predict(X_train.T)
        slope, intercept, r_value, p_value, std_err = stats.linregress(Y_pred_samedata, Y_train)
        # mse_path_ is averaged over axis 1 — presumably the CV folds, giving one mean MSE per penalty.
        print ("Min. MSE: {:g}".format(np.min(np.mean(regr.mse_path_, axis = 1))))
        print ("Min. lambda: {:g}".format(regr.alpha_))
        # Number of SNPs with a non-zero coefficient.
        print ("Selected SNPs: {:d}".format(np.where(regr.coef_ != 0)[0].shape[0]))
        print ("R-squared: {:g}".format(r_value ** 2))
        print ("p-value: {:g}".format(p_value))                       
        print ("Prediction on GTEx ==== ")
        # Out-of-sample check: predicted vs. observed expression in the prediction cohort.
        Y_pred = regr.predict(X_pred.T)
        gxpred.append(GeneExpressionArray(geneid = gene.ensembl_id, expr_arr = Y_pred))
        slope, intercept, r_value, p_value, std_err = stats.linregress(Y_pred, Y_gtex)
        print ("R-squared: {:g}".format(r_value ** 2))
        print ("p-value: {:g}".format(p_value)) 
    else:
        print("No genotype for gene {:s}".format(gene.name))
        
# Write output
# Donor ids are listed in the order of the masked prediction genotypes.
mfunc.write_gcta_phenotype(outpath, [pred_vcf_donors[x] for x in pred_vcfmask], gxpred)



Min. MSE: 0.763822
Min. lambda: 0.0757114
Selected SNPs: 48
R-squared: 0.390611
p-value: 8.26572e-42
Prediction on GTEx ==== 
R-squared: 0.201704
p-value: 3.50531e-18




Min. MSE: 0.81693
Min. lambda: 0.118241
Selected SNPs: 24
R-squared: 0.260115
p-value: 4.32561e-26
Prediction on GTEx ==== 
R-squared: 0.291618
p-value: 5.59652e-27


In [7]:
import collections

# rsID -> base-pair position, read from the Geuvadis chr21 annotation
# (sixth field is the rsID, second field the position; the first line is a header).
chr21snps = collections.defaultdict(lambda:0)
with open('data/input/geuvadis.annot.chr21.txt') as mfile:
    mfile.readline()
    for mline in mfile:
        row = mline.split()
        chr21snps[row[5]] = int(row[1])

# NOTE(review): gene 0 is hard-coded and the weights come from 'C21orf56_snps.txt';
# this assumes genes[0] is C21orf56 — confirm.
i = 0
gene = genes[i]
bplist = [x.bp_pos for x in snps]
# Position -> index of its first occurrence in bplist (what list.index returns),
# built once so that every SNP lookup below is a dictionary access instead of a scan.
bpindex = dict()
for j, bp in enumerate(bplist):
    bpindex.setdefault(bp, j)

cismask = list()
betas = list()
with open('C21orf56_snps.txt') as mfile:
    mfile.readline()
    for mline in mfile:
        row = mline.split()
        rsid = row[1]
        # .get() avoids the defaultdict fallback of 0, which used to surface as a
        # misleading "0 is not in list" error for rsIDs missing from the annotation.
        bppos = chr21snps.get(rsid)
        if bppos is None or bppos not in bpindex:
            raise ValueError("SNP {} (position {}) not found in the training genotype".format(rsid, bppos))
        mindex = bpindex[bppos]
        cismask.append(mindex)
        betas.append(float(row[4]))
        # Sanity check per SNP: does the allele in the weights file match either
        # allele of the training SNP; also show its MAF and position.
        print (row[2] == snps[mindex].ref_allele or row[2] == snps[mindex].alt_allele, snps[mindex].maf, bppos)
cismask = np.array(cismask)
betas = np.array(betas)
train_snps = [snps[j] for j in cismask]
X_pred = gtutils.prediction_variables(pred_snps, train_snps, pred_genotype)
Y_gtex = pred_expr[i]

True 0.0425531914893617 46636145
True 0.0625 46646264
True 0.5 46775249
True 0.5 46775677
True 0.09973404255319149 46859163
True 0.006648936170212766 46905824
True 0.005319148936170213 46932589
True 0.1356382978723404 47032094
True 0.041223404255319146 47091238
True 0.25132978723404253 47253215
True 0.005319148936170213 47331964
True 0.32978723404255317 47347750
True 0.32978723404255317 47353775
True 0.39361702127659576 47355955
True 0.32978723404255317 47362647
True 0.839095744680851 47375268
True 0.839095744680851 47376757
True 0.15691489361702127 47457507
True 0.16356382978723405 47469124
True 0.538563829787234 47476701
True 0.4734042553191489 47501261
True 0.06648936170212766 47502874
True 0.08776595744680851 47503353
True 0.17553191489361702 47519026
True 0.7313829787234043 47553922
True 0.31382978723404253 47575372
True 0.28058510638297873 47582102
True 0.19148936170212766 47589099
True 0.18351063829787234 47599280
True 0.6276595744680851 47608580
True 0.6901595744680851 47610066

In [8]:
# Apply the external SNP weights to the prediction genotypes (one value per
# donor) and regress the result against the observed expression.
ypred = X_pred.T.dot(betas)
Y_gtex_scaled = scale(Y_gtex)
regression = stats.linregress(Y_gtex, ypred)
slope, intercept, r_value, p_value, std_err = regression
r_squared = r_value ** 2
print ("R-squared: {:g}".format(r_squared))
print ("p-value: {:g}".format(p_value))

R-squared: 0.243057
p-value: 4.21044e-22


In [9]:
# Display the prediction genotype matrix built for the external SNP weights.
X_pred

array([[ 0.   ,  0.   ,  0.   , ...,  0.   ,  0.   ,  0.   ],
       [ 0.   ,  0.   ,  1.   , ...,  0.   ,  1.   ,  0.   ],
       [ 2.   ,  2.   ,  2.   , ...,  2.   ,  0.201,  1.   ],
       ..., 
       [ 0.   ,  1.   ,  0.   , ...,  1.   ,  1.   ,  0.   ],
       [ 2.   ,  1.   ,  2.   , ...,  1.   ,  1.   ,  2.   ],
       [ 0.   ,  1.   ,  0.   , ...,  1.   ,  1.   ,  0.   ]])

In [10]:
# Read PrediXcan's own predictions: after a header line, each line holds a
# donor id followed by the predicted value.
with open('C21orf56_predixcan.txt', 'r') as mfile:
    mfile.readline()
    predixcan_rows = [mline.split() for mline in mfile]
predixcan_donors = [fields[0] for fields in predixcan_rows]
y_predixcan = [float(fields[1]) for fields in predixcan_rows]

# Reorder the PrediXcan values to the donor order of the masked prediction genotypes.
predixcan_mask = [predixcan_donors.index(pred_vcf_donors[x]) for x in pred_vcfmask]
y_predixcan = np.array([y_predixcan[x] for x in predixcan_mask])

# Regress the observed expression on the PrediXcan predictions.
regression = stats.linregress(y_predixcan, Y_gtex)
slope, intercept, r_value, p_value, std_err = regression
r_squared = r_value ** 2
print ("R-squared: {:g}".format(r_squared))
print ("p-value: {:g}".format(p_value))

R-squared: 0.243057
p-value: 4.21044e-22


In [None]:
# Spot check: the value calculated here vs. PrediXcan's value for the donor at index 3.
comparison = "Calculated: {:g}\nPrediXcan:  {:g}"
print (comparison.format(ypred[3], y_predixcan[3]))

In [None]:
import pandas as pd
# Space-separated dosage table without a header row; the second column becomes
# the row index (later cells look rows up by rsID).
dosagefile = '../predixcan/debug/GTEx_450Indiv_genot_imput_info04_maf01_HWEp1E6_dbSNP142IDs_donorIDs_dosage_chr21.dosage'
dosage_df = pd.read_csv(
    dosagefile,
    sep=' ',
    header=None,
    index_col=1,
)

In [None]:
# Donor ids of the dosage table: the first whitespace-separated field of every
# line of the .fam file, in file order.
with open('../predixcan/debug/donor_ids.fam') as mfile:
    dosage_donors = [mline.split()[0].strip() for mline in mfile]

In [None]:
# Name the columns of the dosage table: five metadata fields followed by one
# column per donor, in the order of the .fam file.
meta_cols = [ 'chrom', 'bp_pos', 'ref', 'alt', 'freq']
dosage_df.columns = meta_cols + dosage_donors
#dosage_df.drop(meta_cols, axis=1, inplace=True)
# Donor ids in the order of the masked prediction genotypes; used to select and
# order the donor columns of the dosage table.
ordered_dosage_donors = [pred_vcf_donors[x] for x in pred_vcfmask]
#dosage_df = dosage_df[ordered_dosage_donors]
#dosage_df

In [None]:
# Rebuild the prediction matrix straight from the PrediXcan dosage table and
# repeat the comparison against the observed expression.
train_rsids = list()
betas = list()
X_pred = list()
# Select (and order) the donor columns once instead of re-slicing the frame for every SNP.
donor_dosage = dosage_df[ordered_dosage_donors]
with open('C21orf56_snps.txt') as mfile:
    mfile.readline()
    for mline in mfile:
        row = mline.split()
        this_rsid = row[1]
        train_rsids.append(this_rsid)
        this_ref = dosage_df['ref'].loc[this_rsid]
        mindex = bplist.index(chr21snps[this_rsid])
        # `.values` replaces `.as_matrix()`, which was removed in pandas 1.0.
        if this_ref == snps[mindex].ref_allele:
            this_gt = donor_dosage.loc[this_rsid].values
        elif this_ref == snps[mindex].alt_allele:
            # Alleles are swapped relative to the training SNP: flip the dosage.
            this_gt = 2.0 - donor_dosage.loc[this_rsid].values
        else:
            # Neither allele matches: the SNP contributes nothing to the prediction.
            this_gt = np.zeros(len(ordered_dosage_donors))
        X_pred.append(this_gt)
        betas.append(float(row[4]))

betas = np.array(betas)
X_pred = np.vstack(X_pred)
# One predicted value per donor: weighted sum of the SNP dosages.
ypred = np.dot(X_pred.T, betas)
Y_gtex_scaled = scale(Y_gtex)
slope, intercept, r_value, p_value, std_err = stats.linregress(Y_gtex, ypred)
print ("R-squared: {:g}".format(r_value ** 2))
print ("p-value: {:g}".format(p_value))

In [None]:
# Column 3 of the prediction matrix: the values of all SNPs for the donor at index 3.
mx_trial = X_pred[:,3]
mx_trial

In [None]:
# Weighted sum of that donor's dosages, i.e. the prediction for a single donor.
mx_trial.dot(betas)

In [None]:
# Cross-validation curve of `regr`, i.e. of the model left over from the last
# gene fitted in the model loop: mean and spread of the MSE along axis 1 for
# every penalty value on the path.
cv_mse = regr.mse_path_
lambdas = regr.alphas_
cvm = cv_mse.mean(axis=1)
cvm_err = cv_mse.std(axis=1)

In [None]:
import matplotlib.pyplot as plt

# Mean cross-validated MSE against the log of the penalty strength.
# Uses the explicit Axes interface and labels both axes so the figure can be
# read on its own.
fig, ax = plt.subplots()
ax.plot(np.log(lambdas), cvm)
ax.set_xlabel("log(lambda)")
ax.set_ylabel("Mean cross-validated MSE")
plt.show()

In [None]:
from inference.linreg_association import LinRegAssociation
# X_train / Y_train are whatever the last iteration of the model-fitting loop left behind.
# NOTE(review): presumably selects the SNPs most associated with expression; the meaning
# of the positional arguments 100 and 0.05 is not visible here — confirm against LinRegAssociation.
assoc_model = LinRegAssociation(X_train, Y_train, 100, 0.05)
selecttop = assoc_model.selected_variables

In [None]:
# Add a random set of background variables to the top-associated ones.
# NOTE(review): 1040 is hard-coded; presumably the number of SNP rows in X_train — confirm.
bgpool = [i for i in range(1040) if i not in selecttop]
# At least 100 background variables (or as many as were selected), capped at the pool size.
nbg = min(max(100, len(selecttop)), len(bgpool))
# replace=False: np.random.choice samples with replacement by default, which
# would put duplicate indices into the background set; the cap on nbg above
# only makes sense when every index is drawn at most once.
selectbg = np.random.choice(bgpool, nbg, replace=False)
select = np.concatenate((selecttop, selectbg))

In [None]:
gt = ["1/1", "1|0", "0|1", "1|1", "0|0", ".|0"]

In [None]:
[ int(x[0]) + int(x[2]) if len(x) == 3 and x[0] != "." and x[2] != "." else "." for x in gt ]