# Tensorflow Attention Experimental
This notebook provides a rough TensorFlow implementation of the attention model for histone modification prediction.

In [1]:
import tensorflow as tf
import numpy as np

## Parameters
In this section, we list out the important hyperparameters used in our prediction task.

In [59]:
# Prediction parameters

# Batch and input sequence
N = 100  # batch size (training examples per batch)
T = 400  # length of each input sequence
V = 4    # vocabulary size ('a', 'c', 'g', 't')

# Annotation vectors
L = 196  # number of annotation vectors per training example
D = 500  # dimension of each annotation vector

# Model and output
H = 100  # number of hidden units
C = 3    # number of prediction classes

## Data
In this section, we create dummy data for one batch on which we run our prediction. This is primarily to test that the pipeline runs correctly.

In [64]:
# Utilities for one-hot encoding labels and batching training examples into tensors

def convert_label_to_one_hot(label, number_of_classes):
    """Convert a single label to a one-hot row vector.

    @param label:
        Index of the class to mark. It is used directly as a numpy index, so a length-1
        integer array or list is accepted as well as a plain integer.
    @param number_of_classes:
        Length of the encoding.
    @return:
        Float numpy array of shape (1, number_of_classes) holding 1 at position `label`
        and 0 elsewhere.
    @raise IndexError:
        If `label` is out of bounds for `number_of_classes`.
    """
    one_hot_encoding = np.zeros(number_of_classes)
    one_hot_encoding[label] = 1
    # Use the ndarray.reshape method instead of np.reshape(..., newshape=...): the `newshape`
    # keyword is deprecated in recent NumPy releases, while the method form works everywhere.
    return one_hot_encoding.reshape(1, number_of_classes)

def convert_to_one_hot(labels, number_of_classes):
    """Convert a list of labels to a matrix of one-hot rows.

    @param labels:
        List or numpy array of integer class indices. It is flattened before encoding,
        so row i of the result encodes the i-th label. May be empty.
    @param number_of_classes:
        Width of every one-hot row.
    @return:
        Float numpy array of shape (number of labels, number_of_classes) with a 1 in the
        column named by each label and 0 elsewhere. An empty `labels` yields an array of
        shape (0, number_of_classes).
    @raise IndexError:
        If any label is out of bounds for `number_of_classes`.
    """
    indices = np.asarray(labels).reshape(-1)
    # Allocate the whole matrix once instead of building one small array per label.
    one_hot_labels = np.zeros((indices.size, number_of_classes))
    # An empty list converts to a float array, which is not a valid index; there is
    # nothing to mark in that case anyway, so skip the assignment.
    if indices.size:
        one_hot_labels[np.arange(indices.size), indices] = 1
    return one_hot_labels

def convert_training_examples_to_tensor(training_examples):
    """Batch a list of training examples into one tensor per field.

    @param training_examples:
        List of training examples, each exposing `sequence`, `annotation_vectors` and `label`.
    @return:
        TrainingTensor whose fields are numpy tensors of dimension (N x (A1 x A2)), where N is
        the batch dimension and (A1 x A2) is the dimension of the matrix held by each
        training example for that field.
    """
    def batch_field(field_name):
        # Give every example's matrix a leading axis of length one, then join along it.
        expanded = [np.expand_dims(getattr(example, field_name), axis=0)
                    for example in training_examples]
        return np.concatenate(expanded, axis=0)

    sequences = batch_field('sequence')
    annotations = batch_field('annotation_vectors')
    labels = batch_field('label')

    return TrainingTensor(sequence_tensor=sequences,
                          annotation_tensor=annotations,
                          label_tensor=labels)


In [65]:
import collections

# One training example: its sequence, its annotation vectors and its label.
TrainingExample = collections.namedtuple(
    'TrainingExample', ('sequence', 'annotation_vectors', 'label'))

# A whole batch: one tensor per field of the training examples it was built from.
TrainingTensor = collections.namedtuple(
    "TrainingTensor", ('sequence_tensor', 'annotation_tensor', 'label_tensor'))

def create_dummy_training_example():
    """Create a single training example with dummy data.

    Sizes are taken from the module-level hyperparameters T, V, L, D and C.

    @return:
        TrainingExample whose
        `sequence` is the one-hot encoding (over V symbols) of T uniformly random symbols,
        `annotation_vectors` is an (L x D) matrix of standard-normal samples, and
        `label` is the one-hot encoding (over C classes) of one uniformly random class.
    """
    random_symbols = np.random.randint(low=0, high=V, size=T)
    dummy_sequence = convert_to_one_hot(labels=random_symbols, number_of_classes=V)

    # Draw the class from [0, C) rather than a hard-coded 3, so the sampled label always
    # fits the width of its one-hot encoding when C is changed.
    random_class = np.random.randint(low=0, high=C, size=1)
    dummy_label = convert_label_to_one_hot(label=random_class, number_of_classes=C)

    dummy_annotation_vectors = np.random.normal(loc=0.0, scale=1.0, size=(L, D))

    return TrainingExample(sequence=dummy_sequence,
                           annotation_vectors=dummy_annotation_vectors,
                           label=dummy_label)

def create_dummy_batch_data():
    """Create one batch of dummy training data.

    Generates N independent dummy training examples (N is the module-level batch size)
    and batches them with convert_training_examples_to_tensor.

    @return:
        The result of convert_training_examples_to_tensor for the N generated examples.
    """
    # range() exists on both Python 2 and Python 3; xrange() is Python 2 only.
    training_examples = [create_dummy_training_example() for _ in range(N)]
    return convert_training_examples_to_tensor(training_examples)

In [66]:
# Build one batch of dummy data to check that the pipeline runs end to end.
batch_data = create_dummy_batch_data()

In [67]:
# Sanity-check the shape of each batched tensor. Single-argument print(...) calls behave
# the same under Python 2 and Python 3, unlike the Python-2-only print statement.
print(batch_data.sequence_tensor.shape)
print(batch_data.annotation_tensor.shape)
print(batch_data.label_tensor.shape)

(100, 400, 4)
(100, 196, 500)
(100, 1, 3)
