## Dependency imports

In [15]:
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import os
import sys
import collections

from tensor2tensor import models
from tensor2tensor import problems
from tensor2tensor.layers import common_layers
from tensor2tensor.utils import trainer_lib
from tensor2tensor.utils import t2t_model
from tensor2tensor.utils import registry
from tensor2tensor.utils import metrics

sys.path.append("../tfti")
import tfti

## Pipeline utility functions

In [None]:
##############################
#         HELPER FNS         #
##############################

def get_init_op():
    """Builds one op that initializes all global and local variables."""
    # Global initializer is created first, then the local one, and both
    # are bundled so a single `session.run` triggers them together.
    initializers = (
        tf.global_variables_initializer(),
        tf.local_variables_initializer(),
    )
    return tf.group(*initializers)

def get_session(is_interactive=True, **kwargs):
    """Creates a TensorFlow session with this notebook's default config.

    Args:
        is_interactive: If true, a `tf.InteractiveSession` is created,
            otherwise a plain `tf.Session`.
        **kwargs: Extra keyword arguments forwarded to the session
            constructor.

    Returns:
        The newly constructed session.
    """
    session_config = tf.ConfigProto(
        log_device_placement=True,
        allow_soft_placement=True)
    session_config.gpu_options.allow_growth = True
    # Both session flavours take the same arguments, so only the class varies.
    session_cls = tf.InteractiveSession if is_interactive else tf.Session
    return session_cls(config=session_config, **kwargs)

def restore_from_checkpoint(session, ckpt_dir=None):
    """Restores the session's variables from the checkpoint in `ckpt_dir`.

    Args:
        session: Session handed to `saver.restore`.
        ckpt_dir: Directory looked up with `tf.train.get_checkpoint_state`.
            If `None`, a warning is logged and nothing is restored.

    Raises:
        ValueError: If `ckpt_dir` is given but no checkpoint state (or no
            checkpoint path) is found there.
    """
    if ckpt_dir is None:
        tf.logging.warn("Value for argument ckpt_dir is `None`. "
                        "Not restoring from checkpoint.")
        return

    # `get_checkpoint_state` yields `None` when the directory holds no
    # checkpoint; fail with a clear message instead of an AttributeError.
    ckpt_state = tf.train.get_checkpoint_state(ckpt_dir)
    if ckpt_state is None or not ckpt_state.model_checkpoint_path:
        raise ValueError(f"No checkpoint found in ckpt_dir: {ckpt_dir!r}")

    saver = tf.train.Saver()
    saver.restore(session, ckpt_state.model_checkpoint_path)
        
def initialize_variables(session, ckpt_dir=None):
    """Initializes global and local variables, then optionally restores them.

    Args:
        session: Session in which the init op is run and into which the
            checkpoint (if any) is restored.
        ckpt_dir: Forwarded to `restore_from_checkpoint`; `None` skips the
            restore.
    """
    session.run(get_init_op())
    # Restore into the session we were given rather than a global `sess`,
    # so the function works no matter what the caller named its session.
    restore_from_checkpoint(
        session=session,
        ckpt_dir=ckpt_dir,
    )
    
def prepare_pipeline(problem_name,
                     model_name,
                     hparams_set,
                     hparams_overrides_str,
                     data_dir="None",
                     mode=tf.estimator.ModeKeys.EVAL):
    """Looks up the problem and builds the hparams and model for it.

    Args:
        problem_name: Name passed to `registry.problem`.
        model_name: Name passed to `registry.model`.
        hparams_set: Hparams set name given to `trainer_lib.create_hparams`.
        hparams_overrides_str: Overrides string given to
            `trainer_lib.create_hparams`.
        data_dir: Data directory given to `trainer_lib.create_hparams`. Note
            the default is the string "None", not the `None` object.
        mode: Mode the model is constructed with.

    Returns:
        A tuple: (problem, model, hparams).
    """
    t2t_problem = registry.problem(problem_name)
    # Called for its side effect: creates the problem's feature encoders.
    t2t_problem.get_feature_encoders()

    model_hparams = trainer_lib.create_hparams(
        hparams_set, hparams_overrides_str, data_dir, problem_name)

    model_cls = registry.model(model_name)
    t2t_model_instance = model_cls(model_hparams, mode)
    return t2t_problem, t2t_model_instance, model_hparams

def get_latents_and_metrics_weights(problem, targets, keep_mask):
    """Builds the latents and the metrics weights from targets and a mask.

    Args:
        problem: Object providing `unk_id`, the value written wherever the
            mask is false.
        targets: Target values; cast to float for the blend.
        keep_mask: Boolean mask; true keeps the target in the latents.

    Returns:
        A tuple (latents, metrics_mask): int32 latents equal to `targets`
        where `keep_mask` is true and `problem.unk_id` elsewhere, and a
        float mask that is 1 where `keep_mask` is false and 0 where true.
    """
    metrics_weights = tf.to_float(tf.logical_not(keep_mask))
    kept = tf.to_float(keep_mask)
    # Blend: kept positions carry the target, the rest carry unk_id.
    observed_part = kept * tf.to_float(targets)
    unknown_part = (1.0 - kept) * problem.unk_id
    blended = tf.to_int32(observed_part + unknown_part)
    return blended, metrics_weights

## Main body call

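* Redefine `get_raw_data_generator` to yield your own raw `(inputs, targets)` pairs.
* Also define `keep_mask` for deterministic masking/imputation: targets where it is `True` are passed to the model as latents, and the rest are replaced with `problem.unk_id`.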

In [21]:
##############################
#         MAIN BODY          #
##############################

# Parameters
problem_name = "genomics_binding_deepsea_gm12878"
model_name = "tfti_transformer"
hparams_set = "tfti_transformer_debug"
hparams_overrides_str = ""
ckpt_dir = None  # No checkpoint.
batch_size = 2  # For batch parallelism.
keep_mask_seed = 0  # Seed for the latent keep mask, so reruns reproduce it.

def get_raw_data_generator():
    """Yields raw inputs and targets.

    Reads the module-level `problem` lazily (when iterated), so it may be
    defined before the pipeline is constructed.

    Yields:
        Tuples containing:
            inputs: NACTG string of length `problem.input_sequence_length`
                (1000 per the original note).
            targets: A binary label array of length
                `problem.num_binary_predictions` (919 per the original note).
    """
    # TODO: redefine this to read from a file.

    ##############################
    #      YOUR CODE HERE!       #
    ##############################

    for _ in range(100):
        raw_inputs = "".join(np.random.choice(list("NACTG"), problem.input_sequence_length))
        raw_targets = (np.random.random(problem.num_binary_predictions) < 0.5).astype(int)
        yield raw_inputs, raw_targets


# Reset graph for consistency.
tf.reset_default_graph()

# Construct pipeline.
problem, model, hparams = prepare_pipeline(
    problem_name, model_name, hparams_set, hparams_overrides_str)

# Shapes for preprocessed inputs/targets/latents.
preprocessed_input_sequence_length = int(np.ceil(problem.input_sequence_length / problem.chunk_size))
# Query the gather indices once and derive the count from them.
targets_gather_indices = problem.targets_gather_indices()
preprocessed_num_binary_predictions = len(targets_gather_indices)

# Fixed mask shared by every example. It must be built here, after the number
# of predictions is known; building it alongside the parameters above fails on
# a fresh kernel. A dedicated seeded RandomState keeps it reproducible.
keep_mask = np.random.RandomState(keep_mask_seed).choice(
    2, preprocessed_num_binary_predictions).astype(bool)

def get_processed_data_generator_fn(raw_data_generator, keep_mask):
    """Builds a generator function that yields preprocessed examples.

    Relies on the module-level `problem` (for its "inputs" encoder) and
    `targets_gather_indices`, both looked up when the generator runs.

    Args:
        raw_data_generator: Either an iterable of `(raw_inputs, raw_targets)`
            pairs or a zero-argument callable returning such an iterable.
            A callable is invoked on every call of the returned function, so
            the data can be traversed more than once; a one-shot generator
            object is exhausted after its first traversal.
        keep_mask: Boolean NumPy array; reshaped to `[-1, 1, 1]` and yielded
            with every example.

    Returns:
        A zero-argument generator function yielding
        `(preprocessed_inputs, preprocessed_targets, keep_mask)` tuples, each
        element reshaped to `[-1, 1, 1]`.
    """
    # Reshape to rank 3 arrays/tensors.
    keep_mask = keep_mask.reshape([-1, 1, 1])

    def get_processed_data_generator():
        # A factory gives a fresh iterable per call; anything else is used as-is.
        if callable(raw_data_generator):
            raw_examples = raw_data_generator()
        else:
            raw_examples = raw_data_generator
        for raw_inputs, raw_targets in raw_examples:
            preprocessed_inputs = np.array(problem._encoders["inputs"].encode(raw_inputs), dtype=np.int64)
            # `asarray` lets plain lists be gathered with an index list too.
            preprocessed_targets = np.asarray(raw_targets)[targets_gather_indices]
            # Reshape to rank 3 arrays/tensors.
            preprocessed_inputs = preprocessed_inputs.reshape([-1, 1, 1])
            preprocessed_targets = preprocessed_targets.reshape([-1, 1, 1])
            yield preprocessed_inputs, preprocessed_targets, keep_mask

    return get_processed_data_generator

# Create dataset from generator.
raw_data_generator = get_raw_data_generator()
processed_data_generator_fn = get_processed_data_generator_fn(raw_data_generator, keep_mask)
ds = tf.data.Dataset.from_generator(
    processed_data_generator_fn,
    output_types=(tf.int64, tf.int64, tf.bool),
    output_shapes=(
        [preprocessed_input_sequence_length, 1, 1],
        [preprocessed_num_binary_predictions, 1, 1],
        [preprocessed_num_binary_predictions, 1, 1],
    )
)

ds = ds.repeat(1)  # Single evaluation epoch.
ds = ds.batch(batch_size)

# Create one-shot-iterator.
next_item = ds.make_one_shot_iterator().get_next()
preprocessed_inputs, preprocessed_targets, latents_keep_mask = next_item

# Create the latents from the targets and mask.
latents, metrics_mask = get_latents_and_metrics_weights(problem, preprocessed_targets, latents_keep_mask)

# Pass preprocessed features through model.
preprocessed_features = {
    "inputs": preprocessed_inputs,
    "targets": preprocessed_targets,
    "latents": latents,
    "metrics_mask": metrics_mask,
}
logits, losses = model(preprocessed_features)
labels = preprocessed_features["targets"]

# Evaluation metrics we want to use.
set_auroc, _ = tfti.set_auroc(logits, labels, preprocessed_features)
set_auprc, _ = tfti.set_auprc(logits, labels, preprocessed_features)
average_auroc, _ = tfti.average_auroc(logits, labels, preprocessed_features)
average_auprc, _ = tfti.average_auprc(logits, labels, preprocessed_features)

# Initialize weights - randomly or from checkpoint
sess = get_session(is_interactive=False)
initialize_variables(sess, ckpt_dir)

# Stays `None` if the dataset yields no batches, so the report below can
# say so instead of hitting an unbound name.
final_average_auroc = None
try:
    while True:
        # average_auroc updates a numerator & denominator
        # and computes the average_auroc for the entire dataset.
        final_average_auroc = sess.run(average_auroc)
except tf.errors.OutOfRangeError:
    # Raised once the one-shot iterator is exhausted: evaluation is done.
    pass

if final_average_auroc is None:
    print("Dataset yielded no batches; average AUROC was not computed.")
else:
    print(f"Average AUROC for this latent mask is: {final_average_auroc}")

INFO:tensorflow:Unsetting shared_embedding_and_softmax_weights.


[2018-04-24 19:27:10,332] Unsetting shared_embedding_and_softmax_weights.


INFO:tensorflow:Setting T2TModel mode to 'eval'


[2018-04-24 19:27:10,335] Setting T2TModel mode to 'eval'


INFO:tensorflow:Setting hparams.dropout to 0.0


[2018-04-24 19:27:10,337] Setting hparams.dropout to 0.0


INFO:tensorflow:Setting hparams.layer_prepostprocess_dropout to 0.0


[2018-04-24 19:27:10,339] Setting hparams.layer_prepostprocess_dropout to 0.0


INFO:tensorflow:Setting hparams.symbol_dropout to 0.0


[2018-04-24 19:27:10,340] Setting hparams.symbol_dropout to 0.0


INFO:tensorflow:Setting hparams.attention_dropout to 0.0


[2018-04-24 19:27:10,343] Setting hparams.attention_dropout to 0.0


INFO:tensorflow:Setting hparams.relu_dropout to 0.0


[2018-04-24 19:27:10,345] Setting hparams.relu_dropout to 0.0


INFO:tensorflow:Marks for CellType GM12878: [(204, 'GM12878|ATF2|None'), (205, 'GM12878|ATF3|None'), (207, 'GM12878|BCL11A|None'), (410, 'GM12878|BRCA1|None'), (210, 'GM12878|CEBPB|None'), (412, 'GM12878|CHD1|None'), (413, 'GM12878|CHD2|None'), (127, 'GM12878|CTCF|None'), (53, 'GM12878|DNase|None'), (128, 'GM12878|EZH2|None'), (212, 'GM12878|Egr-1|None'), (216, 'GM12878|GABP|None'), (420, 'GM12878|JunD|None'), (421, 'GM12878|Max|None'), (423, 'GM12878|Mxi1|None'), (223, 'GM12878|NRSF|None'), (428, 'GM12878|Nrf1|None'), (229, 'GM12878|Pol2-4H8|None'), (230, 'GM12878|Pol2|None'), (436, 'GM12878|RFX5|None'), (235, 'GM12878|RXRA|None'), (233, 'GM12878|Rad21|None'), (437, 'GM12878|SIN3A|None'), (236, 'GM12878|SIX5|None'), (237, 'GM12878|SP1|None'), (238, 'GM12878|SRF|None'), (240, 'GM12878|TAF1|None'), (442, 'GM12878|TBP|None'), (241, 'GM12878|TCF12|None'), (243, 'GM12878|USF-1|None'), (444, 'GM12878|USF2|None'), (244, 'GM12878|YY1|None'), (447, 'GM12878|Znf143|None'), (725, 'GM12878|c-Myc|

[2018-04-24 19:27:10,352] Marks for CellType GM12878: [(204, 'GM12878|ATF2|None'), (205, 'GM12878|ATF3|None'), (207, 'GM12878|BCL11A|None'), (410, 'GM12878|BRCA1|None'), (210, 'GM12878|CEBPB|None'), (412, 'GM12878|CHD1|None'), (413, 'GM12878|CHD2|None'), (127, 'GM12878|CTCF|None'), (53, 'GM12878|DNase|None'), (128, 'GM12878|EZH2|None'), (212, 'GM12878|Egr-1|None'), (216, 'GM12878|GABP|None'), (420, 'GM12878|JunD|None'), (421, 'GM12878|Max|None'), (423, 'GM12878|Mxi1|None'), (223, 'GM12878|NRSF|None'), (428, 'GM12878|Nrf1|None'), (229, 'GM12878|Pol2-4H8|None'), (230, 'GM12878|Pol2|None'), (436, 'GM12878|RFX5|None'), (235, 'GM12878|RXRA|None'), (233, 'GM12878|Rad21|None'), (437, 'GM12878|SIN3A|None'), (236, 'GM12878|SIX5|None'), (237, 'GM12878|SP1|None'), (238, 'GM12878|SRF|None'), (240, 'GM12878|TAF1|None'), (442, 'GM12878|TBP|None'), (241, 'GM12878|TCF12|None'), (243, 'GM12878|USF-1|None'), (444, 'GM12878|USF2|None'), (244, 'GM12878|YY1|None'), (447, 'GM12878|Znf143|None'), (725, 'GM12

INFO:tensorflow:Marks for CellType GM12878: [(204, 'GM12878|ATF2|None'), (205, 'GM12878|ATF3|None'), (207, 'GM12878|BCL11A|None'), (410, 'GM12878|BRCA1|None'), (210, 'GM12878|CEBPB|None'), (412, 'GM12878|CHD1|None'), (413, 'GM12878|CHD2|None'), (127, 'GM12878|CTCF|None'), (53, 'GM12878|DNase|None'), (128, 'GM12878|EZH2|None'), (212, 'GM12878|Egr-1|None'), (216, 'GM12878|GABP|None'), (420, 'GM12878|JunD|None'), (421, 'GM12878|Max|None'), (423, 'GM12878|Mxi1|None'), (223, 'GM12878|NRSF|None'), (428, 'GM12878|Nrf1|None'), (229, 'GM12878|Pol2-4H8|None'), (230, 'GM12878|Pol2|None'), (436, 'GM12878|RFX5|None'), (235, 'GM12878|RXRA|None'), (233, 'GM12878|Rad21|None'), (437, 'GM12878|SIN3A|None'), (236, 'GM12878|SIX5|None'), (237, 'GM12878|SP1|None'), (238, 'GM12878|SRF|None'), (240, 'GM12878|TAF1|None'), (442, 'GM12878|TBP|None'), (241, 'GM12878|TCF12|None'), (243, 'GM12878|USF-1|None'), (444, 'GM12878|USF2|None'), (244, 'GM12878|YY1|None'), (447, 'GM12878|Znf143|None'), (725, 'GM12878|c-Myc|

[2018-04-24 19:27:10,359] Marks for CellType GM12878: [(204, 'GM12878|ATF2|None'), (205, 'GM12878|ATF3|None'), (207, 'GM12878|BCL11A|None'), (410, 'GM12878|BRCA1|None'), (210, 'GM12878|CEBPB|None'), (412, 'GM12878|CHD1|None'), (413, 'GM12878|CHD2|None'), (127, 'GM12878|CTCF|None'), (53, 'GM12878|DNase|None'), (128, 'GM12878|EZH2|None'), (212, 'GM12878|Egr-1|None'), (216, 'GM12878|GABP|None'), (420, 'GM12878|JunD|None'), (421, 'GM12878|Max|None'), (423, 'GM12878|Mxi1|None'), (223, 'GM12878|NRSF|None'), (428, 'GM12878|Nrf1|None'), (229, 'GM12878|Pol2-4H8|None'), (230, 'GM12878|Pol2|None'), (436, 'GM12878|RFX5|None'), (235, 'GM12878|RXRA|None'), (233, 'GM12878|Rad21|None'), (437, 'GM12878|SIN3A|None'), (236, 'GM12878|SIX5|None'), (237, 'GM12878|SP1|None'), (238, 'GM12878|SRF|None'), (240, 'GM12878|TAF1|None'), (442, 'GM12878|TBP|None'), (241, 'GM12878|TCF12|None'), (243, 'GM12878|USF-1|None'), (444, 'GM12878|USF2|None'), (244, 'GM12878|YY1|None'), (447, 'GM12878|Znf143|None'), (725, 'GM12

INFO:tensorflow:Using variable initializer: uniform_unit_scaling


[2018-04-24 19:27:10,442] Using variable initializer: uniform_unit_scaling


INFO:tensorflow:Transforming feature 'inputs' with symbol_modality_782_8.bottom


[2018-04-24 19:27:10,529] Transforming feature 'inputs' with symbol_modality_782_8.bottom


INFO:tensorflow:Transforming feature 'latents' with binary_imputation_class_label_modality_8.bottom


[2018-04-24 19:27:10,749] Transforming feature 'latents' with binary_imputation_class_label_modality_8.bottom


INFO:tensorflow:Transforming 'targets' with binary_class_label_modality_8.targets_bottom


[2018-04-24 19:27:11,390] Transforming 'targets' with binary_class_label_modality_8.targets_bottom


INFO:tensorflow:Building model body


[2018-04-24 19:27:11,457] Building model body


INFO:tensorflow:Transforming body output with binary_class_label_modality_8.top


[2018-04-24 19:27:14,771] Transforming body output with binary_class_label_modality_8.top




[2018-04-24 19:27:20,783] Value for argument ckpt_dir is `None`. Not restoring from checkpoint.


Average AUROC for this latent mask is: 0.5089658498764038
