# Tensorflow Attention Experimental
This notebook provides a rough implementation of an attention model for histone modification prediction in TensorFlow.

In [1]:
import tensorflow as tf
import numpy as np

## Parameters
This section lists out the important hyperparameters used in our prediction task.

In [2]:
# Size constants read as globals by the data utilities and the model below.
N = 100  # batch size (number of training examples per batch)
L = 196  # number of annotation vectors per training example
D = 500  # dimension of each annotation vector
H = 100  # number of hidden units (LSTM state size)
T = 400  # length of sequence (number of time steps fed to the LSTM)
V = 4    # vocabulary size ('a', 'c', 'g', 't')
C = 3    # number of prediction classes (width of the output logits)

## Utilities
This section provides utilities for converting labels and training examples into NumPy tensors.

In [3]:
def convert_label_to_one_hot(label, number_of_classes):
    """Encode a single discrete label as a one-hot row vector.

    @param label: integer class index to mark
    @param number_of_classes: total number of label classes (C)
    @return: 1xC float array that is 1 at position `label` and 0 elsewhere
    """
    row = np.zeros(number_of_classes)
    row[label] = 1
    # add the leading axis so rows can later be concatenated into a batch
    return row.reshape((1, number_of_classes))

def convert_to_one_hot(labels, number_of_classes):
    """Convert a sequence of labels to one-hot encoding.

    @param labels: numpy array (or list) of discrete integer labels
    @param number_of_classes: number of label classes
    @return: one-hot encoding of labels (N x C), where N is the number of
        labels and C is the number of classes. An empty `labels` yields a
        (0 x C) array instead of raising.
    """
    labels = np.asarray(labels)
    if labels.size == 0:
        # np.concatenate over an empty list used to raise; return an empty batch
        return np.zeros((0, number_of_classes))

    number_of_labels = labels.shape[0]
    one_hot_labels = np.zeros((number_of_labels, number_of_classes))
    # One fancy-indexed assignment replaces the per-label Python loop: row i
    # gets a 1 in the column(s) named by labels[i].
    row_indices = np.arange(number_of_labels)[:, np.newaxis]
    one_hot_labels[row_indices, labels.reshape(number_of_labels, -1)] = 1
    return one_hot_labels

def convert_training_examples_to_tensor(training_examples):
    """Stack a list of training examples into batch tensors.

    @param training_examples:
        List of training examples exposing `sequence`, `annotation_vectors`
        and `label` attributes.
    @return:
        TrainingTensor whose sequence and annotation tensors carry a new
        leading batch dimension (N x (A1 x A2)), and whose label tensor is
        flattened to one dimension.
    """
    def stack_field(field_name):
        # np.stack inserts the batch axis and joins the examples along it
        return np.stack([getattr(example, field_name) for example in training_examples], axis=0)

    sequences = stack_field('sequence')
    annotations = stack_field('annotation_vectors')
    labels = stack_field('label').ravel()

    return TrainingTensor(sequence_tensor=sequences,
                          annotation_tensor=annotations,
                          label_tensor=labels)


## Mock Data
This section provides utilities for mocking the data that will be used in our prediction task.

In [4]:
import collections

# A single example: its sequence, its annotation vectors and its label.
TrainingExample = collections.namedtuple(
    'TrainingExample', ['sequence', 'annotation_vectors', 'label'])

# A batch of examples, one tensor per TrainingExample field.
TrainingTensor = collections.namedtuple(
    'TrainingTensor', ['sequence_tensor', 'annotation_tensor', 'label_tensor'])

def create_dummy_training_example():
    """Create a single training example with dummy data.

    @return: TrainingExample with
        sequence: (T x V) one-hot encoding of a random sequence,
        annotation_vectors: (L x D) standard-normal samples,
        label: length-1 integer array drawn from [0, C)
    """
    dummy_sequence = convert_to_one_hot(labels=np.random.randint(low=0, high=V, size=T), number_of_classes=V)
    # draw from the configured number of classes rather than a hard-coded 3
    dummy_label = np.random.randint(low=0, high=C, size=1)
    dummy_annotation_vectors = np.random.normal(loc=0.0, scale=1.0, size=(L, D))

    return TrainingExample(sequence=dummy_sequence,
                           annotation_vectors=dummy_annotation_vectors,
                           label=dummy_label)

def create_dummy_batch_data():
    """Create a batch of N dummy training examples as a TrainingTensor."""
    # range (not xrange) so the cell runs under both Python 2 and Python 3
    training_examples = [create_dummy_training_example() for _ in range(N)]
    return convert_training_examples_to_tensor(training_examples)

In [5]:
batch_data = create_dummy_batch_data()

# print tensor dimensions for reference
# (print() with one argument produces the same output on Python 2 and 3)
print(batch_data.sequence_tensor.shape)
print(batch_data.annotation_tensor.shape)
print(batch_data.label_tensor.shape)

(100, 400, 4)
(100, 196, 500)
(100,)


# Attention Model
This section builds the computational graph (i.e. model) for the attention model. 
We will need to implement the following layers and operations.

1. Get initial LSTM state (compute initial LSTM state given features)
2. Process sequential input to get hidden state.
3. Attention layer (compute attention probabilities given state and features)
4. Get context vector
5. Perform prediction

### Initializers

In [6]:
# Xavier initializer, passed as `initializer=` for the weight matrices below
weight_initializer = tf.contrib.layers.xavier_initializer()
# constant 0.0 initializer, passed as `initializer=` for the bias vectors below
constant_initializer = tf.constant_initializer(0.0)

### Initial LSTM state

In [7]:
def get_initial_lstm(features, reuse):
    """Compute the initial LSTM hidden and memory state from the features.

    Input: features (N x L x D)
    Output: hidden_state (N x H), memory_state (N x H)

    The features are averaged over the L axis and the mean is passed through
    two separate tanh-activated affine layers, one per state. Every sequence
    gets its own initial state because sequences are assumed independent:
    state information from one sequence must not leak into another.

    :param features:
        Features extracted from CNN, (N x L x D).
    :param reuse:
        Forwarded to the 'initial_lstm' variable scope.
    :return:
        (h_init, c_init): initial hidden state and initial memory state.
    """
    mean_features = tf.reduce_mean(features, axis=1)  # (N x D)

    def tanh_projection(weight_name, bias_name):
        # affine map D -> H followed by tanh
        weights = tf.get_variable(weight_name, shape=[D, H], initializer=weight_initializer)
        biases = tf.get_variable(bias_name, shape=[H], initializer=constant_initializer)
        return tf.nn.tanh(tf.matmul(mean_features, weights) + biases)

    with tf.variable_scope('initial_lstm', reuse=reuse):
        h_init = tanh_projection('w_h', 'b_h')  # initial hidden state
        c_init = tanh_projection('w_c', 'b_c')  # initial memory state

    return h_init, c_init

### Attention Layer

In [8]:
def attention_project_features(features, reuse=False):
    """Apply weighted transformation to features.

    Input: (N x L x D) - all annotation vectors for each batch entry
    Output: (N x L x D) - projected annotation vectors for each batch entry

    The batch dimension is inferred at run time, so any batch size works.
    """
    with tf.variable_scope('attention_project_features', reuse=reuse):
        features_flat = tf.reshape(features, [-1, D]) # (NL x D)
        w_features = tf.get_variable('w_features', [D, D], initializer=weight_initializer)
        projected_features = tf.matmul(features_flat, w_features) # (NL x D)
        # -1 instead of the global N: the input placeholders leave the batch
        # dimension unspecified, so do not pin it here
        projected_features = tf.reshape(projected_features, [-1, L, D]) # (N x L x D)
        return projected_features
    
def attention_project_hidden_state(h, reuse=False):
    """Linearly project the hidden state to the annotation dimension.

    Input: (N x H) - hidden state for each batch entry
    Output: (N x D) - projected hidden state for each batch entry
    """
    with tf.variable_scope('attention_project_hidden_state', reuse=reuse):
        projection = tf.get_variable(name='w_hidden', shape=[H, D], initializer=weight_initializer)
        return tf.matmul(h, projection)  # (N x D)

def attention_bias(reuse=False):
    """Get attention bias.

    :param reuse:
        Forwarded to the 'attention_bias' variable scope.

    Output: (D,) - bias vector, zero-initialised
    """
    # honour the caller's reuse flag (it was previously ignored)
    with tf.variable_scope('attention_bias', reuse=reuse):
        b = tf.get_variable('b', [D], initializer=constant_initializer)
        return b
    
def get_attention_inputs(features, h):
    """Build the three terms that feed the attention mechanism.

    Input:
        1. (N x L x D) - features for each batch entry
        2. (N x H) - hidden state for each batch entry

    Output:
        1. (N x L x D) - projected features
        2. (N x 1 x D) - projected hidden state, ready to broadcast over L
        3. (D,) - attention bias
    """
    # transform features (N x L x D)
    projected_features = attention_project_features(features)

    # transform hidden state (N x D) -> (N x 1 x D)
    # Only insert the broadcast axis. Transposing to (D x N) before reshaping
    # to (N x 1 x D) would mix values from different batch entries.
    projected_h = tf.expand_dims(attention_project_hidden_state(h), axis=1)

    # get bias
    bias = attention_bias()

    return projected_features, projected_h, bias

def attention_layer(features, h, reuse=False):
    """Returns attention probabilities.

    Input:
        1. (N x L x D) - features for each batch entry
        2. (N x H) - hidden state for each batch entry

    Output:
        1. (N x L) matrix of probabilities for each annotation vector in each batch entry
    """
    with tf.variable_scope('attention_layer', reuse=reuse):
        # The bias variable is created by get_attention_inputs but is not
        # added to the attention input below.
        projected_features, projected_h, _ = get_attention_inputs(features, h)

        # create attention input
        # projected_h (N x 1 x D) is broadcast across the L annotation vectors
        # NOTE(review): there is no non-linearity between this sum and the
        # linear w_attention projection, so the projected_h term adds the same
        # value to all L logits of an example and the softmax cancels it.
        # Confirm whether an activation (e.g. relu/tanh) was intended here.
        attention_input = projected_features + projected_h # (N x L x D)
        attention_input = tf.reshape(attention_input, shape=[-1, D]) # (NL x D)

        # apply attention mechanism
        w_attention = tf.get_variable('w_attention', shape=[D, 1], initializer=weight_initializer)
        attention_logits = tf.matmul(attention_input, w_attention) # (NL x 1)
        # -1 instead of the global N so the batch size is inferred at run time
        attention_logits = tf.reshape(attention_logits, shape=(-1, L)) # (N x L)

        # compute attention probabilities
        attention_probabilities = tf.nn.softmax(attention_logits) # (N x L)
        return attention_probabilities

### Select Context

In [9]:
def convert_to_gather_indices(selected_indices):
    """Convert selected context indices to tensor to be used for gather_nd.

    :param selected_indices:
        Integer tensor with one selected annotation index per batch entry.
    :return:
        (N x 2) integer tensor whose i-th row is [i, selected_indices[i]].
    """
    selected_context_indices = tf.reshape(selected_indices, shape=(-1, 1))
    # Take the batch size from the tensor itself rather than the global N, and
    # cast the row numbers to the dtype of the selected indices so tf.concat
    # never sees mismatched integer types.
    batch_size = tf.shape(selected_context_indices)[0]
    indices = tf.cast(tf.range(batch_size), selected_context_indices.dtype)
    indices = tf.reshape(indices, shape=(-1, 1))
    return tf.concat((indices, selected_context_indices), axis=1)

def select_context(features, attention_probabilities):
    """Pick, for each example, the annotation vector with the highest attention probability.

    :param features:
        (N x L x D) tensor, where N is batch size, L is number of attention
        vectors and D is dimension of attention vector.
    :param attention_probabilities:
        (N x L) tensor of probabilities.
    :return:
        (N x D), where row i is the context vector chosen for the ith example.
    """
    # index of the most probable annotation vector in every row
    best_positions = tf.argmax(attention_probabilities, axis=1)
    return tf.gather_nd(params=features,
                        indices=convert_to_gather_indices(best_positions))

### Decoding Layer

In [10]:
def decode_lstm(hidden_state, context, reuse=False):
    """Compute class logits from the hidden state and the context vector.

    Input:
        1. (N x H) - hidden state
        2. (N x D) - context vector
    Output: (N x C) - unnormalised class scores
    """
    with tf.variable_scope('decode_lstm', reuse=reuse):
        hidden_weights = tf.get_variable('w_hidden', [H, C], initializer=weight_initializer)
        context_weights = tf.get_variable('w_context', [D, C], initializer=weight_initializer)
        output_bias = tf.get_variable('b_out', [C], initializer=constant_initializer)

        # sum of two linear maps plus a shared bias
        return (tf.matmul(hidden_state, hidden_weights)
                + tf.matmul(context, context_weights)
                + output_bias)

### Build Model (Computational Graph)

In [11]:
def model():
    """Build the attention model graph.

    :return:
        (loss, inputs) where `loss` is the mean softmax cross-entropy over the
        batch and `inputs` maps 'features', 'sequences' and 'labels' to the
        placeholders that must be fed.
    """
    # inputs (batch dimension left unspecified)
    features = tf.placeholder(tf.float32, (None, L, D))
    sequences = tf.placeholder(tf.float32, (None, T, V))
    labels = tf.placeholder(tf.int32, (None,))

    inputs = {'features': features,
              'sequences': sequences,
              'labels': labels}

    # initialization
    # get_initial_lstm returns (hidden, memory) in that order
    h, c = get_initial_lstm(features, reuse=False)
    state = tf.contrib.rnn.LSTMStateTuple(c, h)

    lstm_cell = tf.contrib.rnn.BasicLSTMCell(num_units=H)

    # process sequence to get updated hidden unit
    for t in range(T):
        # variables are created on the first step and shared by later steps
        with tf.variable_scope('update_lstm', reuse=(t != 0)):
            _, state = lstm_cell(inputs=sequences[:, t, :], state=state)
    h = state.h

    # get context
    attention_probabilities = attention_layer(features, h, reuse=None) # (N x L)
    context = select_context(features, attention_probabilities) # (N x D)

    # get logits
    logits = decode_lstm(h, context, reuse=None)

    # get loss: mean over the actual batch, equal to sum / N when the batch
    # holds N examples
    loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits))

    return loss, inputs

## Train the network
This section puts together the function needed to train the network.

In [12]:
# training hyperparameters
learning_rate = 1e-4  # step size actually used by the Adam optimizer below
number_epochs = 1
number_iterations = 10

# build model
loss, model_inputs = model()

# train op
with tf.name_scope('optimizer'):
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    train_op = optimizer.minimize(loss)

# add initialization operations
init_op = tf.global_variables_initializer()

# reuse variables: later tf.get_variable calls in the root scope look up
# existing variables instead of creating new ones
tf.get_variable_scope().reuse_variables()

with tf.Session() as sess:
    sess.run(init_op)
    for e in range(number_epochs):
        for i in range(number_iterations):
            # a fresh random batch is generated for every iteration
            batch_data = create_dummy_batch_data()

            feed_dict = {model_inputs['sequences']: batch_data.sequence_tensor,
                         model_inputs['features']: batch_data.annotation_tensor,
                         model_inputs['labels']: batch_data.label_tensor}

            _, loss_value = sess.run([train_op, loss], feed_dict)
            print("the loss for iteration {} = {}".format(i, loss_value))

the loss for iteration 0 = 1.5430419445
the loss for iteration 1 = 1.50749766827
the loss for iteration 2 = 1.5101596117
the loss for iteration 3 = 1.66599965096
the loss for iteration 4 = 1.75353240967
the loss for iteration 5 = 1.69154560566
the loss for iteration 6 = 1.72667145729
the loss for iteration 7 = 1.68650984764
the loss for iteration 8 = 1.4467086792
the loss for iteration 9 = 1.37894380093
