In [124]:
import tensorflow as tf
import numpy as np
import math

In [3]:
def reset_tf():
    """Discard the current graph and session and start over with fresh ones.

    Closes the module-level ``sess`` if one exists, resets the default graph,
    then rebinds ``sess`` to a new ``tf.InteractiveSession`` that logs device
    placement.  It is safe to call before ``sess`` has ever been created, so
    the function also works as the very first call on a fresh kernel.
    """
    global sess
    # Look the session up through globals() so a missing `sess` is not a
    # NameError; only an existing session needs closing.
    previous = globals().get('sess')
    if previous is not None:
        previous.close()
    tf.reset_default_graph()
    sess = tf.InteractiveSession(config=tf.ConfigProto(log_device_placement=True))

In [4]:
# Initial interactive session; device placement is logged for every op.
# reset_tf() closes and replaces this object whenever the graph is rebuilt.
sess = tf.InteractiveSession(
    config=tf.ConfigProto(log_device_placement=True),
)

In [176]:
class HyperParameters:
    """Model and batching settings, stored as plain class attributes."""

    # --- batching ---------------------------------------------------------
    # how many sequences are fed per batch
    batch_size = 64
    # upper bound on the number of symbols in one input sequence
    max_sequence_length = 50

    # --- vocabulary / embeddings -----------------------------------------
    # vocabulary size; every input symbol must lie in range(vocab_size)
    vocab_size = 10000
    # dimensionality of the input embeddings
    embedding_size = 128

    # --- output -----------------------------------------------------------
    # how many classes the model predicts
    num_target_classes = 2


# single shared instance used by the graph-building cells
hp = HyperParameters()

In [196]:
# Rebuild everything from scratch: reset_tf() clears the default graph and
# opens a new session, so this cell can be re-run as a whole.
reset_tf()

# Placeholders
# ------------

# sequences of input symbols (ids in range(hp.vocab_size))
input_sequences = tf.placeholder(tf.int32, 
                                 shape = (hp.batch_size, hp.max_sequence_length))
# length of each sequence
# (note the trailing comma: `(hp.batch_size)` is a plain int, not a 1-tuple)
input_lengths = tf.placeholder(tf.int32, 
                               shape = (hp.batch_size,))
# sequences of word ending markers
# (looked up in a 2-row embedding table further down, so values must be 0 or 1)
input_word_endings = tf.placeholder(tf.int32, 
                                    shape = (hp.batch_size, hp.max_sequence_length))
# sequences of target symbols (class ids in range(hp.num_target_classes))
target_sequences = tf.placeholder(tf.int32, 
                                  shape = (hp.batch_size, hp.max_sequence_length))

# sequences of input positions (not a placeholder):
# every row is [0, 1, ..., hp.max_sequence_length - 1]
input_positions = tf.range(hp.max_sequence_length, dtype=tf.int32)
# add a leading batch axis and repeat the row once per sequence in the batch
input_positions = tf.tile(tf.expand_dims(input_positions, 0),
                          [hp.batch_size, 1],
                          name = 'input_positions')

# Embeddings
# ----------
# Three lookup tables share the same width (hp.embedding_size) so that their
# results can be summed element-wise afterwards.

# sequences of input embeddings w/ shape:
#   (hp.batch_size, hp.max_sequence_length, hp.embedding_size)
input_sequence_embeddings = tf.get_variable('input_sequence_embeddings', 
                                            (hp.vocab_size, hp.embedding_size))
input_sequences_embedded = tf.nn.embedding_lookup(input_sequence_embeddings, 
                                                  input_sequences,
                                                  name = 'input_sequences_embedded')

# sequences of input position embeddings w/ shape:
#   (hp.batch_size, hp.max_sequence_length, hp.embedding_size)
input_position_embeddings = tf.get_variable('input_position_embeddings', 
                                            (hp.max_sequence_length, hp.embedding_size))
# named like the other two lookups so the op is easy to find in the graph
input_positions_embedded = tf.nn.embedding_lookup(input_position_embeddings, 
                                                  input_positions,
                                                  name = 'input_positions_embedded')

# sequences of word ending embeddings w/ shape:
#   (hp.batch_size, hp.max_sequence_length, hp.embedding_size)
# (two rows: one per possible word-ending marker value)
input_word_ending_embeddings = tf.get_variable('input_word_ending_embeddings',
                                               (2, hp.embedding_size))
input_word_endings_embedded = tf.nn.embedding_lookup(input_word_ending_embeddings, 
                                                     input_word_endings,
                                                     name = 'input_word_endings_embedded')

# Sequence mask
# -------------

# 1.0 at positions below the sequence's length, 0.0 at padding; shape:
#   (hp.batch_size, hp.max_sequence_length)
sequence_mask = tf.sequence_mask(input_lengths,
                                 hp.max_sequence_length,
                                 dtype = tf.float32)
# expand dimensions to support broadcasting against the embedding axis
# (op name matches the variable name, like the other named ops in this graph)
expanded_sequence_mask = tf.expand_dims(sequence_mask, 
                                        2, 
                                        name = 'expanded_sequence_mask')

# element-wise sum of symbol, position and word-ending embeddings
input_combined_embedded = tf.add_n([input_sequences_embedded, 
                                    input_positions_embedded, 
                                    input_word_endings_embedded])
# zero the embeddings at padded positions
# TODO: is this necessary?
input_combined_embedded = tf.multiply(input_combined_embedded,
                                      expanded_sequence_mask,
                                      name = 'input_combined_embedded')

# Attention
# ---------

def attention_layer(A, mask=None):
    """Single-head scaled dot-product self-attention.

    `A` serves as queries, keys and values at once: the attention weights are
    softmax(A . A^T / sqrt(depth)) and the result is those weights applied
    to `A`.

    Args:
        A: float tensor of shape (batch, seq_len, depth).
        mask: optional tensor of shape (batch, seq_len) that is 1 at real
            positions and 0 at padding.  When given, padded positions are
            excluded as attention *keys*.  When omitted, every position is
            attended to (the previous behaviour).

    Returns:
        Tensor with the same shape as `A`.
    """
    depth = tf.cast(tf.shape(A)[-1], tf.float32)
    scaled_logits = tf.matmul(A, A, transpose_b=True) / tf.sqrt(depth)
    if mask is not None:
        # Zeroed padding rows still produce logit 0 (weight exp(0) = 1), which
        # would soak up softmax mass in proportion to the amount of padding.
        # Push their logits far down instead; shape (batch, 1, seq_len)
        # broadcasts over the query axis.
        key_mask = tf.expand_dims(tf.cast(mask, scaled_logits.dtype), 1)
        scaled_logits += (1.0 - key_mask) * -1e9
    return tf.matmul(tf.nn.softmax(scaled_logits), A)

# residual connection around the attention layer; padded keys are masked out
# TODO: layer normalization (possibly with masking)
input_attention_layer_1 = input_combined_embedded + attention_layer(input_combined_embedded,
                                                                    mask=sequence_mask)

# Feed-forward
# ------------

def feed_forward_layer(A, num_units, scope='feed_forward', reuse=None):
    """Two stacked dense layers: ReLU first, then a linear projection.

    Both layers have `num_units` outputs and are created inside the variable
    scope `scope`; `reuse` is forwarded to that scope unchanged.
    """
    with tf.variable_scope(scope, reuse=reuse):
        hidden = tf.layers.dense(A, num_units, activation=tf.nn.relu)
        projected = tf.layers.dense(hidden, num_units)
    return projected

# Position-wise feed-forward block with a residual connection.
# TODO: layer normalization (possibly with masking)
input_feed_forward_layer_1 = feed_forward_layer(input_attention_layer_1, hp.embedding_size)
input_feed_forward_layer_1 = input_attention_layer_1 + input_feed_forward_layer_1

# Zero the activations at padded positions again.
# TODO: how important is this masking?
input_feed_forward_layer_1 = input_feed_forward_layer_1 * expanded_sequence_mask

# Softmax
# -------

# one unnormalized score per target class at every position
output_logits = tf.layers.dense(inputs=input_feed_forward_layer_1,
                                units=hp.num_target_classes)

# Loss
# ----

# cross entropy at every position of every sequence
losses = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target_sequences,
                                                        logits=output_logits)
# padded positions must not contribute to the loss
losses *= sequence_mask

total_loss = tf.reduce_sum(losses)
# Average over the positions that were actually counted.  Using the mask sum
# (rather than the raw lengths) keeps the denominator consistent with the
# numerator even if a length exceeds hp.max_sequence_length, and the lower
# bound of 1 avoids a NaN when every sequence in the batch is empty.
mean_loss  = total_loss / tf.maximum(tf.reduce_sum(sequence_mask), 1.0)

# Training
# --------

# Adam with a fixed learning rate
optimizer = tf.train.AdamOptimizer(learning_rate=1e-3)

# Split the (gradient, variable) pairs, rescale the gradients so that their
# global norm is at most 5.0, then pair them back up for the update step.
gradients, variables = zip(*optimizer.compute_gradients(loss=mean_loss))
gradients, _ = tf.clip_by_global_norm(gradients, clip_norm=5.0)
train_op = optimizer.apply_gradients(grads_and_vars=zip(gradients, variables))

In [193]:
# give every variable in the graph its initial value
tf.global_variables_initializer().run(session=sess)

# sanity check: total_loss is a scalar, so its shape vector is empty
tf.shape(total_loss).eval(session=sess)

array([], dtype=int32)