In [4]:
import pandas as pd
import numpy as np
import xgboost as xgb

from utils import get_sequences
from itertools import islice

In [48]:
# Load the training table into `df`. The commented-out line reads a
# tab-separated file (presumably the original source data — confirm); the CSV
# read below is the same path that a later cell of this notebook writes to.
# df = pd.read_csv('data/cagi4_mpra/4-eQTL-causal_SNPs_sample_v2.txt', sep='\t')
df = pd.read_csv('data/cagi4_mpra/train_df.csv')
# Display the first rows as the cell output.
df.head()

Unnamed: 0,ID,chr,pos,RefAllele,AltAllele,C.A.ctrl.mean,C.A.exp.mean,C.A.log2FC,C.A.logP,C.A.logPadj,...,Regulatory_Hit,emVar_Hit,eQTL_lead,eQTL_gene,eQTL_beta,eQTL_tStat,eQTL_P,sequence,ref_sequence,alt_sequence
0,chr1:183160326:I,1,183160326,G,GAT,169.349792,142.237956,-0.190268,0.226744,0.0,...,0,0,chr1:183160326:I,ENSG00000058085,0.128792,6.503906,2.66e-10,,CAAACAATATTTTTACAATAAAGTAATTGAGAAAGTACCCTATAAT...,CAAACAATATTTTTACAATAAAGTAATTGAGAAAGTACCCTATAAT...
1,chr1:218193345:D,1,218193345,TC,T,546.915129,510.274157,-0.094848,0.215542,0.0,...,0,0,chr1:218193345:D,ENSG00000223375,0.181454,7.312115,1.76e-12,,GGCTTCACACTGTTTTTATTAGGGCTTATTGTTTGGAAAATTAAGT...,GGCTTCACACTGTTTTTATTAGGGCTTATTGTTTGGAAAATTAAGT...
2,chr1:226329182:I,1,226329182,T,TA,117.934774,96.640336,-0.216104,0.245357,0.0,...,0,0,chr1:226329182:I,ENSG00000182827,-0.081298,-6.925068,2.05e-11,,CCTTCAACTCCTGTGCTTAAGTGATCCTCTGGCCTCTGCCTCCCTA...,CCTTCAACTCCTGTGCTTAAGTGATCCTCTGGCCTCTGCCTCCCTA...
3,chr1:58999158:D,1,58999158,TA,T,178.517618,167.443939,-0.080045,0.062945,0.0,...,0,0,chr1:58999158:D,ENSG00000162600,0.099132,7.687867,1.48e-13,,GTATTAAAATTATGAATGGGAAGCCACATATACTAAAAAGCTCAGT...,GTATTAAAATTATGAATGGGAAGCCACATATACTAAAAAGCTCAGT...
4,chr12:104337222:D,12,104337222,GTAAT,G,86.668936,121.945143,0.349468,1.153695,0.0,...,0,0,chr12:104337222:D,ENSG00000204954,-0.112888,-7.578777,3.07e-13,,AAGTTATTTTGTGAACAAATTAAGCTGCAGCTGGTTACTTTGTAAC...,AAGTTATTTTGTGAACAAATTAAGCTGCAGCTGGTTACTTTGTAAC...


In [6]:
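# Expected layout of each 150-character sequence, written as
# left_flank[allele][right_flank]; the two trailing columns are
# len(RefAllele) and len(RefAllele) // 2.
# if the RefAllele is one character long, it should be 75[1][74]      1 0
# if the RefAllele is two characters long, it should be 74[2][74]     2 1
# if the RefAllele is three characters long, it should be 74[3][73]   3 1
# if the RefAllele is four characters long, it should be 73[4][73]    4 2

# start_ind is in fact 76 - ((len(RefAllele) + 1) // 2)
# NOTE(review): this formula gives 75, 75, 74, 74 for lengths 1-4, whereas the
# left flanks in the table above are 75, 74, 74, 73 — confirm the indexing
# convention (0- vs 1-based) and which of the two is actually intended.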

In [44]:
# Remove the leftover index columns that accumulate when a frame is saved with
# its index and read back. `errors='ignore'` keeps this cell idempotent: once
# the cleaned CSV has been written (without an index) and reloaded, these
# columns no longer exist, and a plain drop would raise KeyError on a fresh
# Restart & Run All.
df = df.drop(['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1'], axis=1, errors='ignore')

In [46]:
# Write the current frame back to disk without the index column.
# NOTE: this is the same path the loading cell reads from, so the notebook
# overwrites its own input file in place.
df.to_csv('data/cagi4_mpra/train_df.csv', index=False)

In [10]:
from utils import encode_sequences

# Encode the reference and alternate sequence columns with identical settings.
# The generator is consumed left to right, so the reference column is encoded
# first and the alternate column second.
ref_onehot, alt_onehot = (
    encode_sequences(df[seq_col], seqlen=1000)
    for seq_col in ('ref_sequence', 'alt_sequence')
)

In [5]:
from deepsea import DeepSea

# The commented-out lines below run the DeepSea model on the encoded reference
# and alternate sequences. They are disabled here; instead, prediction arrays
# are loaded from .npy files on disk.
# NOTE(review): presumably these files hold the saved output of the
# commented-out `ds.predict` calls — confirm they match the current `df` rows.
# ds = DeepSea(use_gpu=False)
# ref_preds = ds.predict(ref_onehot)
# alt_preds = ds.predict(alt_onehot)
ref_preds = np.load('data/ref4_preds_deepsea.npy')
alt_preds = np.load('data/alt4_preds_deepsea.npy')

EnsembleExpr also outperforms state-of-the-art methods in the
eQTL-prioritization literature. Zhou and Troyanskaya (2015) reported
that an L2-regularized logistic regression model trained on DeepSEA-derived
features and evolutionary conservation scores achieved predictive
performance that surpasses existing approaches for eQTLs.
We trained this DeepSEA plus conservation model on the DeepSEA-derived
features and conservation scores for the variants in the CAGI
training set, and predicted “emVar” labels of variants in the test
set. While ranking third among all the submissions, the DeepSEA
plus conservation model achieved a performance inferior to that of EnsembleExpr.

Test set score: 0.589 AUROC, 0.389 AUPRC

### TOPTODO: add conservation features, implement test-set evaluation, try using that other library (the R one that Grossman used)

In [38]:
from utils import snp_feats_from_preds
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import average_precision_score, roc_auc_score
from sklearn.utils.class_weight import compute_sample_weight


# Compare SNP feature encodings with 5-fold stratified cross-validation.
# For every entry of `feattypes`, features are built from the reference and
# alternate predictions, a classifier is fit on each training fold, and the
# mean ROC AUC / AUPRC over folds is printed twice: once scored on hard class
# predictions ("class") and once on positive-class probabilities ("prob").
feattypes = [['absodds', 'absdiff'], 'absdiff', 'diff', 'scaleddiff', 'absodds', 'odds']
folds = StratifiedKFold(n_splits=5)
# Plain NumPy labels: StratifiedKFold yields *positional* indices, so indexing
# a Series with them is only correct when `df` carries a default RangeIndex.
labels = df['emVar_Hit'].values
splits = list(folds.split(ref_preds, labels))
# 'xgbsk' -> xgboost sklearn wrapper, 'xgb' -> native xgboost booster,
# anything else -> L2-regularised logistic regression.
classifier_type = 'xgbsk'

for ft in feattypes:

  # A single feature type is passed as a one-element list; a list is passed as is.
  xdiff = snp_feats_from_preds(ref_preds, alt_preds,
                               feattypes=[ft] if isinstance(ft, str) else ft)
  class_rocs = []
  class_auprcs = []
  prob_rocs = []
  prob_auprcs = []
#   https://stackoverflow.com/questions/41006322/cross-validation-with-roc
  for train_inds, test_inds in splits:

    ytrain, ytest = labels[train_inds], labels[test_inds]
    xtrain, xtest = xdiff[train_inds], xdiff[test_inds]
    # Ratio of negatives to positives in the training fold, used to up-weight
    # the positive class in the xgboost variants.
    num_pos = np.sum(ytrain)
    num_neg = ytrain.shape[0] - num_pos
    scale_pos_weight = num_neg / num_pos

    if classifier_type == 'xgbsk':
      lr = xgb.XGBClassifier(scale_pos_weight=scale_pos_weight,
                             reg_lambda=10)
      # NOTE(review): the held-out fold is also the early-stopping eval set, so
      # the number of boosting rounds is tuned on the data being scored and the
      # reported metrics are optimistic. Consider carving a validation split
      # out of the training fold instead.
      # NOTE(review): newer xgboost releases expect `eval_metric` and
      # `early_stopping_rounds` on the constructor rather than in fit() —
      # confirm against the installed version.
      lr.fit(xtrain, ytrain, verbose=False, eval_set=[(xtrain, ytrain), (xtest, ytest)],
             eval_metric='auc', early_stopping_rounds=10)
      class_preds = lr.predict(xtest)
      pos_probs = lr.predict_proba(xtest)[:, 1]

    elif classifier_type == 'xgb':
      params = {'reg_lambda': 10, 'eta': 0.1,
                'scale_pos_weight': scale_pos_weight,
                'objective': 'binary:logistic'}
      dtrain = xgb.DMatrix(xtrain, label=ytrain)
      dval = xgb.DMatrix(xtest, label=ytest)
      evallist = [(dtrain, 'train'), (dval, 'valid')]
      bst = xgb.train(params, dtrain, num_boost_round=10, evals=evallist)
      # Score with the booster that was just trained (previously this branch
      # fell through to an unfitted LogisticRegression). With the
      # 'binary:logistic' objective the booster outputs positive-class
      # probabilities; threshold at 0.5 for hard labels.
      pos_probs = bst.predict(dval)
      class_preds = (pos_probs > 0.5).astype(int)

    else:
      lr = LogisticRegression(penalty='l2', C=0.1)
      sample_weight = compute_sample_weight('balanced', ytrain)
      lr.fit(xtrain, ytrain, sample_weight=sample_weight)
      class_preds = lr.predict(xtest)
      pos_probs = lr.predict_proba(xtest)[:, 1]

    class_rocs.append(roc_auc_score(ytest, class_preds))
    class_auprcs.append(average_precision_score(ytest, class_preds))
    prob_rocs.append(roc_auc_score(ytest, pos_probs))
    prob_auprcs.append(average_precision_score(ytest, pos_probs))

  print('\n',ft)
  print('ROC AUC (class): \t{}\t AUPRC (class): \t{}'.format(np.mean(class_rocs), np.mean(class_auprcs)))
  print('ROC AUC (prob): \t{}\t AUPRC (prob): \t{}'.format(np.mean(prob_rocs), np.mean(prob_auprcs)))


 ['absodds', 'absdiff']
ROC AUC (class): 	0.7172550699077834	 AUPRC (class): 	0.37235069370971796
ROC AUC (prob): 	0.8063219371688586	 AUPRC (prob): 	0.2374446942668024

 absdiff
ROC AUC (class): 	0.7071233666147271	 AUPRC (class): 	0.3507231057621621
ROC AUC (prob): 	0.803736452766022	 AUPRC (prob): 	0.25146576172527585

 diff
ROC AUC (class): 	0.7223559518278428	 AUPRC (class): 	0.3800046739574582
ROC AUC (prob): 	0.8118010933659509	 AUPRC (prob): 	0.23942390771453645

 scaleddiff
ROC AUC (class): 	0.7313497506792712	 AUPRC (class): 	0.38297892426154456
ROC AUC (prob): 	0.8302969329283517	 AUPRC (prob): 	0.24605883635494868

 absodds
ROC AUC (class): 	0.6331240735414501	 AUPRC (class): 	0.25703792613016496
ROC AUC (prob): 	0.7413016143425002	 AUPRC (prob): 	0.14744092355329358

 odds
ROC AUC (class): 	0.6393001478119511	 AUPRC (class): 	0.2641958290363351
ROC AUC (prob): 	0.7280000268653226	 AUPRC (prob): 	0.15791032997991128
