In [None]:
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import os
import sys
import collections

from tensor2tensor import models
from tensor2tensor import problems
from tensor2tensor.layers import common_layers
from tensor2tensor.utils import trainer_lib
from tensor2tensor.utils import t2t_model
from tensor2tensor.utils import registry
from tensor2tensor.utils import metrics

import numpy as np
from sklearn import metrics
from sklearn.metrics import roc_curve, auc

In [None]:
### USER CONFIGURES ###
# Every location below sits under one project root, so the paths are derived
# from it. Edit the root (or an individual path) to match the local machine.
_tfti_root = "/data/akmorrow/tfti"

data_dir = _tfti_root + "/t2t_data"            # tensor2tensor data directory
train_dir = _tfti_root + "/t2t_train"          # root of the training output directories
validation_file = _tfti_root + "/data/deepseavalidation.txt"  # tab-separated validation examples

In [None]:
# Make the local `tfti` package importable; the path must be appended before
# the import below.
# NOTE(review): presumably importing tfti also registers the model/problem
# names looked up through `registry` further down -- confirm.
sys.path.append("../tfti")
import tfti

# Reset graph for consistency (re-running this cell starts from an empty graph).
tf.reset_default_graph()

# Names used to look up the problem, model and hyper-parameter set.
problem_name = "genomics_binding_deepsea_gm12878"
model_name = "tfti_transformer"
hparams_set = "tfti_transformer_debug"
hparams_overrides_str = ""

data_dir = os.path.expanduser(data_dir)
# NOTE(review): the checkpoint directory is hard-coded. It is NOT equal to
# train_dir + f"/{problem_name}/{model_name}-{hparams_set}" because the last
# path component carries an extra "params_" prefix.
output_dir = os.path.expanduser("/data/akmorrow/tfti/t2t_train/genomics_binding_deepsea_gm12878/tfti_transformer-params_tfti_transformer_debug")

# Build hyper-parameters and fetch the problem definition.
hparams = trainer_lib.create_hparams(hparams_set, hparams_overrides_str, data_dir, problem_name)
problem = registry.problem(problem_name)
encoders = problem.get_feature_encoders(data_dir)

# Instantiate the model in EVAL mode; the graph is built when it is called on
# the features below.
model = registry.model(model_name)(hparams, tf.estimator.ModeKeys.EVAL)

# Placeholders for a single example: one scalar string input and a vector of
# `problem.num_binary_predictions` integer targets.
inputs_ph = tf.placeholder(dtype=tf.string, shape=[ ])
targets_ph = tf.placeholder(dtype=tf.int64, shape=[problem.num_binary_predictions])
features = {"inputs": inputs_ph, "targets": targets_ph}
features = problem.preprocess_dev_example(features, tf.estimator.ModeKeys.EVAL, hparams)

# Introduce a dummy batch dimension (axis 0, size 1) on every feature.
for key in features.keys():
    features[key] = tf.expand_dims(features[key], 0)

logits, losses = model(features)
predictions = tf.nn.sigmoid(logits)
labels = features["targets"]

# Evaluation metrics we want to use (only the first element of each returned
# pair is kept).
set_auroc, _ = tfti.set_auroc(logits, labels, features)
set_auprc, _ = tfti.set_auprc(logits, labels, features)
average_auroc, _ = tfti.average_auroc(logits, labels, features)
average_auprc, _ = tfti.average_auprc(logits, labels, features)

# The saver has to be created after the graph is complete so that it sees the
# model variables.
saver = tf.train.Saver()
sess = tf.InteractiveSession()

# Initialize global and local variables. The restore below then overwrites the
# model variables with the trained values.
init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
sess.run(init_op)

# Load weights from checkpoint. Without this step every prediction would come
# from freshly initialised (untrained) weights.
ckpts = tf.train.get_checkpoint_state(output_dir)
if ckpts is None or not ckpts.model_checkpoint_path:
    # get_checkpoint_state returns None when the directory holds no checkpoint
    # state; fail with the offending path instead of an AttributeError.
    raise FileNotFoundError("No checkpoint found in %s" % output_dir)
ckpt = ckpts.model_checkpoint_path
saver.restore(sess, ckpt)

In [None]:
# Read the validation examples: a tab-separated file without a header row,
# so the columns are addressed by integer position (0, 1, ...).
import pandas as pd

validation_data = pd.read_csv(
    validation_file,
    sep="\t",
    header=None,
)

In [None]:
# Run the graph once per validation row and collect (predictions, labels).
# Column 0 holds the model input, column 1 the comma-separated integer targets.
fetch = (predictions, labels)
predictions_and_labels = []

for sequence, target_csv in zip(validation_data[0], validation_data[1]):
    target_vector = np.array([int(flag) for flag in target_csv.split(',')])

    example_predictions, example_labels = sess.run(
        fetch,
        feed_dict={inputs_ph: sequence, targets_ph: target_vector},
    )

    # squeeze() removes every size-1 axis, e.g. the length-1 batch axis.
    predictions_and_labels.append(
        (example_predictions.squeeze(), example_labels.squeeze())
    )

In [None]:
# Stack all (prediction, label) pairs into a single array, then separate the
# two members of each pair along axis 1.
_stacked_pairs = np.array(predictions_and_labels)
predictions_numpy, labels_numpy = _stacked_pairs[:, 0, :], _stacked_pairs[:, 1, :]

In [None]:
# Names of the 34 TFs we are evaluating on, sorted with Python's default
# string ordering (upper-case names come before lower-case ones).
# NOTE(review): the plotting code pairs tfs[i] with column i of the label and
# prediction arrays -- confirm that this sorted order matches the column order.
tfs = sorted([
    'SP1', 'Pol2-4H8', 'USF2', 'NRSF', 'RFX5', 'c-Myc', 'RXRA', 'EZH2',
    'TBP', 'CHD1', 'Egr-1', 'SIN3A', 'GABP', 'CEBPB', 'Nrf1', 'p300',
    'CTCF', 'ATF3', 'ATF2', 'Pol2', 'BCL11A', 'BRCA1', 'TCF12', 'SIX5',
    'JunD', 'Rad21', 'YY1', 'USF-1', 'Max', 'TAF1', 'CHD2', 'Mxi1',
    'SRF', 'Znf143',
])

In [None]:
# Make ROC plots on all TFs: one curve per TF on a single shared figure.
fig, ax = plt.subplots()

lw = 2  # line width shared by the diagonal and every ROC curve
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver operating characteristic')
# Diagonal reference line from (0, 0) to (1, 1).
ax.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')

# Column i of the label/prediction arrays is paired with tfs[i].
# The loop variable must not be called `tf`: that would overwrite the
# TensorFlow module alias and break every earlier cell on re-run.
for i, tf_name in enumerate(tfs):
    # Per-TF ROC curve and the area under it.
    fpr, tpr, _ = roc_curve(labels_numpy[:, i], predictions_numpy[:, i])
    roc_auc = auc(fpr, tpr)

    # Pass the line width by keyword: a third positional argument to plot()
    # is not treated as a line width.
    ax.plot(fpr, tpr, lw=lw, label='%s (%0.2f)' % (tf_name, roc_auc))

ax.legend(loc="lower right")
plt.show()