A notebook to make ensemble predictions to quantify the standard error in the Confidence.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from models import DeepSeaSNP, Conservation, SNPContext, MultiFeatures, EnhancerOneHot, \
    Stacked, SubstitutionOneHot, DNase, Classifier, Regression, RegressionClassifier
from utils import write_submission

## Load data

Read the training data with added conservation information.

In [2]:
training = pd.read_csv('data/cagi5_df.csv')
training.head()

Unnamed: 0,#Chrom,Pos,Ref,Alt,Value,Confidence,class,regulatory_element,phastCon,phyloP,GerpN,GerpRS
0,X,138612669,T,A,-0.17,0.07,0,release_F9,0.006,0.47,3.93,1.49
1,X,138612669,T,C,-0.26,0.24,-1,release_F9,0.006,0.47,3.93,1.49
2,X,138612669,T,G,0.34,0.05,0,release_F9,0.006,0.47,3.93,1.49
3,X,138612670,A,C,0.0,0.0,0,release_F9,0.008,0.47,3.93,0.337
4,X,138612670,A,G,0.22,0.2,1,release_F9,0.008,0.47,3.93,0.337


Read the submission data

In [3]:
submission = pd.read_csv(
    'data/submission-with-cons.tsv',
    sep='\t',
    na_values='*')
submission.sample(6)

Unnamed: 0,Chrom,Pos,Ref,Alt,Promoter_Enhancer,Direction,P_Direction,Confidence,SE,Comments,phastCon,phyloP,GerpN,GerpRS
13551,X,138612864,A,T,F9,,,,,,0.996,0.47,5.32,2.53
1847,1,155271413,T,A,PKLR,,,,,,0.068,0.375,3.09,-2.84
5423,6,396201,G,A,IRF4,,,,,,0.727,-0.419,4.86,2.01
8671,8,128413540,A,G,MYC,,,,,,0.994,-0.278,5.47,0.412
6404,6,37775320,A,C,ZFAND3,,,,,,0.014,0.329,4.25,1.65
8989,10,51549008,T,C,MSMB,,,,,,0.056,0.375,2.99,2.99


### Choose features

In [4]:
nn_filenames = [
    ('deepsea-test_{}_preds.npy', 'deepsea_{}_preds.npy'),
    ('ds500-dq-embed64-500-d20-3-5-11-test_{}_preds_v1.npy', 'ds500-dq-embed64-500-d20-3-5-11_{}_preds_v1.npy'),
    ('ds500-dq-embed64-500-d20-5-11-test_{}_preds_v1.npy', 'ds500-dq-embed64-500-d20-5-11_{}_preds_v1.npy'),
    ('crnn_500_200-9-16-test_{}_preds_v1.npy', 'crnn_500_200-9-16_{}_preds_v1.npy'),
    ('crnn_500_200-4-9-16-test_{}_preds_v1.npy', 'crnn_500_200-4-9-16_{}_preds_v1.npy'),
]
train_features = [
    None,
    DNase(test=False),
    Conservation(),
    Stacked('deep-dnase-cons'),
]
test_features = [
    None,
    DNase(test=True),
    Conservation(),
    Stacked('test-deep-dnase-cons'),
]

## Fit model

Fit the model for each neural network and store the predicted confidences.

In [8]:
n_networks = len(nn_filenames)
pred_confs = np.zeros((submission.shape[0], n_networks))
for i, (test_filename, train_filename) in enumerate(nn_filenames):
    # Create features for this network's predictions
    test_features[0] = DeepSeaSNP(filename_fmt=test_filename)
    train_features[0] = DeepSeaSNP(filename_fmt=train_filename)
    # Create multi features
    test_multi = MultiFeatures(test_features)
    train_multi = MultiFeatures(train_features)
    # Create the model
    model_kwargs = {'features': train_multi, 'model_name': 'xgb'}
    regression = Regression(**model_kwargs)
    classifier = Classifier(**model_kwargs)
    model = RegressionClassifier(regression, classifier)
    # Fit the model
    X_train = model.get_features(training)
    y_train = model.get_response(training)
    model.fit(X_train, y_train)
    # Make predictions
    X_test = test_multi.get_features(submission)
    predictions = model.predict(X_test, submission.index)
    predictions = model.make_submission(predictions)
    pred_confs[:, i] = predictions['PredConfidence']

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  if diff:
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  if diff:
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  if diff:
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  if diff:
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  if diff:


Calculate the standard errors of the confidence predictions.

In [12]:
std_error = np.std(pred_confs, axis=1)

array([0.09216255, 0.1313273 , 0.14456596, ..., 0.01869002, 0.01440216,
       0.00940996])

### Save the standard errors

In [13]:
np.save('data/conf-se.npy', std_error)