Deep Learning
=============

Assignment 3.2
------------

After training a skip-gram model in `5_word2vec.ipynb`, the goal of this notebook is to train a LSTM character model over [Text8](http://mattmahoney.net/dc/textdata) data.

In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import os
import numpy as np
import random
import string
import tensorflow as tf
import zipfile
from six.moves import range
from six.moves.urllib.request import urlretrieve

In [2]:
url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
    """Download a file if not present, and make sure it's the right size."""
    if not os.path.exists(filename):
        filename, _ = urlretrieve(url + filename, filename)
    statinfo = os.stat(filename)
    if statinfo.st_size == expected_bytes:
        print('Found and verified %s' % filename)
    else:
        print(statinfo.st_size)
        raise Exception(
            'Failed to verify ' + filename + '. Can you get to it with a browser?')
    return filename

filename = maybe_download('text8.zip', 31344016)

Found and verified text8.zip


In [3]:
def read_data(filename):
    with zipfile.ZipFile(filename) as f:
        name = f.namelist()[0]
        data = tf.compat.as_str(f.read(name))
    return data
  
text = read_data(filename)
print('Data size %d' % len(text))

Data size 100000000


Create a small validation set.

In [4]:
valid_size = 1000
valid_text = text[:valid_size]
train_text = text[valid_size:]
train_size = len(train_text)
print(train_size, train_text[:64])
print(valid_size, valid_text[:64])

99999000 ons anarchists advocate social relations based upon voluntary as
1000  anarchism originated as a term of abuse first used against earl


Utility functions to map characters to vocabulary IDs and back.

In [5]:
vocabulary_size = len(string.ascii_lowercase) + 1 # [a-z] + ' '
first_letter = ord(string.ascii_lowercase[0])

def char2id(char):
    if char in string.ascii_lowercase:
        return ord(char) - first_letter + 1
    elif char == ' ':
        return 0
    else:
        print('Unexpected character: %s' % char)
        return 0

def id2char(dictid):
    if dictid > 0:
        return chr(dictid + first_letter - 1)
    elif dictid == -1:
        return ''
    else:
        return ' '

print(char2id('a'), char2id('z'), char2id(' '), char2id('ï'))
print(id2char(1), id2char(26), id2char(0))

Unexpected character: ï
1 26 0 0
a z  


Function to generate a training batch for the LSTM model.

In [6]:
batch_size=64
num_unrollings=10

class BatchGenerator(object):
    def __init__(self, text, batch_size, num_unrollings):
        self._text = text
        self._text_size = len(text)
        self._batch_size = batch_size
        self._num_unrollings = num_unrollings
        segment = self._text_size // batch_size
        self._cursor = [ offset * segment for offset in range(batch_size)]
        self._last_batch = self._next_batch()
  
    def _next_batch(self):
        """Generate a single batch from the current cursor position in the data."""
        batch = np.zeros(shape=(self._batch_size, vocabulary_size), dtype=np.float)
        for b in range(self._batch_size):
            batch[b, char2id(self._text[self._cursor[b]])] = 1.0
            self._cursor[b] = (self._cursor[b] + 1) % self._text_size
        return batch
  
    def next(self):
        """Generate the next array of batches from the data. The array consists of
        the last batch of the previous array, followed by num_unrollings new ones.
        """
        batches = [self._last_batch]
        for step in range(self._num_unrollings):
            batches.append(self._next_batch())
        self._last_batch = batches[-1]
        return batches

def characters(probabilities):
    """Turn a 1-hot encoding or a probability distribution over the possible
    characters back into its (most likely) character representation."""
    return [id2char(c) for c in np.argmax(probabilities, 1)] # the max index in 1-hot encoding -> character

def batches2string(batches):
    """Convert a sequence of batches back into their (most likely) string
    representation."""
    s = [''] * batches[0].shape[0]
    for b in batches:
        s = [''.join(x) for x in zip(s, characters(b))]
    print(len(s))
    return s

train_batches = BatchGenerator(train_text, batch_size, num_unrollings)
valid_batches = BatchGenerator(valid_text, 1, 1)
s = train_batches.next()
print(batches2string(train_batches.next()))
print(batches2string(valid_batches.next()))
print(batches2string(valid_batches.next()))

64
['ists advoca', 'ary governm', 'hes nationa', 'd monasteri', 'raca prince', 'chard baer ', 'rgical lang', 'for passeng', 'the nationa', 'took place ', 'ther well k', 'seven six s', 'ith a gloss', 'robably bee', 'to recogniz', 'ceived the ', 'icant than ', 'ritic of th', 'ight in sig', 's uncaused ', ' lost as in', 'cellular ic', 'e size of t', ' him a stic', 'drugs confu', ' take to co', ' the priest', 'im to name ', 'd barred at', 'standard fo', ' such as es', 'ze on the g', 'e of the or', 'd hiver one', 'y eight mar', 'the lead ch', 'es classica', 'ce the non ', 'al analysis', 'mormons bel', 't or at lea', ' disagreed ', 'ing system ', 'btypes base', 'anguages th', 'r commissio', 'ess one nin', 'nux suse li', ' the first ', 'zi concentr', ' society ne', 'elatively s', 'etworks sha', 'or hirohito', 'litical ini', 'n most of t', 'iskerdoo ri', 'ic overview', 'air compone', 'om acnm acc', ' centerline', 'e than any ', 'devotional ', 'de such dev']
1
[' a']
1
['an']


In [7]:
def logprob(predictions, labels):
    """Log-probability of the true labels in a predicted batch."""
    predictions[predictions < 1e-10] = 1e-10
    return np.sum(np.multiply(labels, -np.log(predictions))) / labels.shape[0]

def sample_distribution(distribution):
    """Sample one element from a distribution assumed to be an array of normalized
    probabilities.
    """
    r = random.uniform(0, 1)
    s = 0
    for i in range(len(distribution)):
        s += distribution[i]
        if s >= r:
            return i
    return len(distribution) - 1

def sample(prediction):
    """Turn a (column) prediction into 1-hot encoded samples."""
    p = np.zeros(shape=[1, vocabulary_size], dtype=np.float)
    p[0, sample_distribution(prediction[0])] = 1.0
    return p

def random_distribution():
    """Generate a random column of probabilities."""
    b = np.random.uniform(0.0, 1.0, size=[1, vocabulary_size]) # 1x27
    return b/np.sum(b, 1)[:,None] # 1x27

Simple LSTM Model.

In [8]:
num_nodes = 64

graph = tf.Graph()
with graph.as_default():
  
    # Parameters:
    # Input gate: input, previous output, and bias.
    ix = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
    im = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
    ib = tf.Variable(tf.zeros([1, num_nodes]))
    # Forget gate: input, previous output, and bias.
    fx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
    fm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
    fb = tf.Variable(tf.zeros([1, num_nodes]))
    # Memory cell: input, state and bias.                             
    cx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
    cm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
    cb = tf.Variable(tf.zeros([1, num_nodes]))
    # Output gate: input, previous output, and bias.
    ox = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
    om = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
    ob = tf.Variable(tf.zeros([1, num_nodes]))
    # Variables saving state across unrollings.
    saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    # Classifier weights and biases.
    w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
    b = tf.Variable(tf.zeros([vocabulary_size]))
    
    # Definition of the cell computation.
    def lstm_cell(i, o, state):
        """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
        Note that in this formulation, we omit the various connections between the
        previous state and the gates."""
        input_gate = tf.sigmoid(tf.matmul(i, ix) + tf.matmul(o, im) + ib)
        forget_gate = tf.sigmoid(tf.matmul(i, fx) + tf.matmul(o, fm) + fb)
        update = tf.matmul(i, cx) + tf.matmul(o, cm) + cb
        state = forget_gate * state + input_gate * tf.tanh(update)
        output_gate = tf.sigmoid(tf.matmul(i, ox) + tf.matmul(o, om) + ob)
        return output_gate * tf.tanh(state), state

    # Input data.
    train_data = list()
    for _ in range(num_unrollings + 1):
        train_data.append(
            tf.placeholder(tf.float32, shape=[batch_size,vocabulary_size]))
    train_inputs = train_data[:num_unrollings]
    train_labels = train_data[1:]  # labels are inputs shifted by one time step.

    # Unrolled LSTM loop.
    outputs = list()
    output = saved_output
    state = saved_state
    
    for i in train_inputs:
        output, state = lstm_cell(i, output, state)
        outputs.append(output)

    # State saving across unrollings.
    with tf.control_dependencies([saved_output.assign(output),
                                  saved_state.assign(state)]):
      # Classifier.
        logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b) # 640x64 * 64x27 -> 640x27
        loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(
                labels=tf.concat(train_labels, 0), logits=logits))

    # Optimizer.
    global_step = tf.Variable(0)
    learning_rate = tf.train.exponential_decay(
        10.0, global_step, 5000, 0.1, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    gradients, v = zip(*optimizer.compute_gradients(loss))
    gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
    optimizer = optimizer.apply_gradients(
        zip(gradients, v), global_step=global_step)

    # Predictions.
    train_prediction = tf.nn.softmax(logits)
    
    # Sampling and validation eval: batch 1, no unrolling.
    sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size])
    saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
    saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
    reset_sample_state = tf.group(
        saved_sample_output.assign(tf.zeros([1, num_nodes])),
        saved_sample_state.assign(tf.zeros([1, num_nodes])))
    sample_output, sample_state = lstm_cell(
        sample_input, saved_sample_output, saved_sample_state)
    with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                  saved_sample_state.assign(sample_state)]):
        sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.



In [9]:
num_steps = 7001
summary_frequency = 100

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print('Initialized')
    mean_loss = 0
    for step in range(num_steps):
        batches = train_batches.next()
        feed_dict = dict()
        for i in range(num_unrollings + 1):
            feed_dict[train_data[i]] = batches[i]
        _, l, predictions, lr = session.run(
          [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
        mean_loss += l
        if step % summary_frequency == 0:
            if step > 0:
                mean_loss = mean_loss / summary_frequency
            # The mean loss is an estimate of the loss over the last few batches.
            print('Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
            mean_loss = 0
            labels = np.concatenate(list(batches)[1:]) # 640x27
            print('Minibatch perplexity: %.2f' % float(
                np.exp(logprob(predictions, labels))))
            if step % (summary_frequency * 10) == 0:
                # Generate some samples.
                print('=' * 80)
                for _ in range(5):
                    feed = sample(random_distribution())
                    sentence = characters(feed)[0]
                    reset_sample_state.run()
                    for _ in range(79):
                        prediction = sample_prediction.eval({sample_input: feed})
                        feed = sample(prediction)
                        sentence += characters(feed)[0]
                    print(sentence)
                print('=' * 80)
            # Measure validation set perplexity.
            reset_sample_state.run()
            valid_logprob = 0
            for _ in range(valid_size):
                b = valid_batches.next()
                predictions = sample_prediction.eval({sample_input: b[0]})
                valid_logprob = valid_logprob + logprob(predictions, b[1])
            print('Validation set perplexity: %.2f' % float(np.exp(
                valid_logprob / valid_size)))

Initialized
Average loss at step 0: 3.294889 learning rate: 10.000000
Minibatch perplexity: 26.97
ovararrav    txewx mnbr alis l  bpm yltbzi etoqlil vdhqitthirtxkeibjzktsos hiwpe
drumd rvtziyeghiwi nehrt fqpi  gefft ampwuoboezs  yppqcvr rdctcaxxi y tgrbldptgs
quo  sracnaoepsnftoet tqlygizrrliafyc tun laieyaoounjnqzlesu bzzeoitlzjeh nmcidr
f t  ekmalitxsfc i jcrmgpvce cataresgwsc lry eaorg kelvmn e brityoapaykxcuvblouf
csqdoyarrcanften v iot rc htlgnmlhih es scts ez nueseee piszaaan nrf  lh wnc vtc
Validation set perplexity: 20.16
Average loss at step 100: 2.593430 learning rate: 10.000000
Minibatch perplexity: 10.91
Validation set perplexity: 10.24
Average loss at step 200: 2.253966 learning rate: 10.000000
Minibatch perplexity: 8.58
Validation set perplexity: 8.46
Average loss at step 300: 2.100439 learning rate: 10.000000
Minibatch perplexity: 7.45
Validation set perplexity: 7.82
Average loss at step 400: 1.995835 learning rate: 10.000000
Minibatch perplexity: 7.49
Validation set per

Validation set perplexity: 4.38
Average loss at step 4500: 1.619640 learning rate: 10.000000
Minibatch perplexity: 5.21
Validation set perplexity: 4.54
Average loss at step 4600: 1.616109 learning rate: 10.000000
Minibatch perplexity: 5.14
Validation set perplexity: 4.64
Average loss at step 4700: 1.623173 learning rate: 10.000000
Minibatch perplexity: 5.26
Validation set perplexity: 4.49
Average loss at step 4800: 1.627443 learning rate: 10.000000
Minibatch perplexity: 4.51
Validation set perplexity: 4.44
Average loss at step 4900: 1.630576 learning rate: 10.000000
Minibatch perplexity: 5.25
Validation set perplexity: 4.53
Average loss at step 5000: 1.604835 learning rate: 1.000000
Minibatch perplexity: 4.42
zating the somethic e l stand mess and for gradized world overs tenest gravi by 
polity vila retempands to the reference obderenge to batrofigan welk or suncilie
vay the voster intashed gruch state s empire from the on there led s subjectling
fecttin instated on the veararge betwe

---
Problem 1
---------

You might have noticed that the definition of the LSTM cell involves 4 matrix multiplications with the input, and 4 matrix multiplications with the output. Simplify the expression by using a single matrix multiply for each, and variables that are 4 times larger.

---

In [10]:
num_nodes = 64

graph = tf.Graph()
with graph.as_default():
  
    # Parameters:
    # Variables saving state across unrollings.
    saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    # Classifier weights and biases.
    w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
    b = tf.Variable(tf.zeros([vocabulary_size]))
    
    ifcox = tf.Variable(tf.truncated_normal([vocabulary_size, 4 * num_nodes], -0.1, 0.1)) # 27x256
    ifcom = tf.Variable(tf.truncated_normal([num_nodes, 4 * num_nodes], -0.1, 0.1)) # 27x256
    ifcob = tf.Variable(tf.truncated_normal([1, 4 * num_nodes])) # 27x256
    
    # Definition of the cell computation.
    def lstm_cell(i, o, state):
        """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
        Note that in this formulation, we omit the various connections between the
        previous state and the gates."""
        
        gate = tf.matmul(i, ifcox) + tf.matmul(o, ifcom) + ifcob # 64x27 * 27x256 -> 64x256 -> split into 4 64x64
        gate_split = tf.split(gate, 4, 1) # split into 4 arrs across dimension 1
        
        input_gate = tf.sigmoid(gate_split[0])
        forget_gate = tf.sigmoid(gate_split[1])
        update = gate_split[2]
        state = forget_gate * state + input_gate * tf.tanh(update)
        output_gate = tf.sigmoid(gate_split[3])
        
        return output_gate * tf.tanh(state), state # output is 64x64

    # Input data.
    train_data = list()
    for _ in range(num_unrollings + 1):
        train_data.append(
            tf.placeholder(tf.float32, shape=[batch_size,vocabulary_size]))
    train_inputs = train_data[:num_unrollings]
    train_labels = train_data[1:]  # labels are inputs shifted by one time step.

    # Unrolled LSTM loop.
    outputs = list()
    output = saved_output
    state = saved_state
    
    for i in train_inputs:
        output, state = lstm_cell(i, output, state)
        outputs.append(output)

    # State saving across unrollings.
    with tf.control_dependencies([saved_output.assign(output),
                                  saved_state.assign(state)]):
      # Classifier.
        logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b) # 640x64 * 64x27 -> 640x27
        loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(
                labels=tf.concat(train_labels, 0), logits=logits))

    # Optimizer.
    global_step = tf.Variable(0)
    learning_rate = tf.train.exponential_decay(
        10.0, global_step, 5000, 0.1, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    gradients, v = zip(*optimizer.compute_gradients(loss))
    gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
    optimizer = optimizer.apply_gradients(
        zip(gradients, v), global_step=global_step)

    # Predictions.
    train_prediction = tf.nn.softmax(logits)
    
    # Sampling and validation eval: batch 1, no unrolling.
    sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size])
    saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
    saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
    reset_sample_state = tf.group(
        saved_sample_output.assign(tf.zeros([1, num_nodes])),
        saved_sample_state.assign(tf.zeros([1, num_nodes])))
    sample_output, sample_state = lstm_cell(
        sample_input, saved_sample_output, saved_sample_state)
    with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                  saved_sample_state.assign(sample_state)]):
        sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

In [11]:
num_steps = 7001
summary_frequency = 100

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print('Initialized')
    mean_loss = 0
    for step in range(num_steps):
        batches = train_batches.next()
        feed_dict = dict()
        for i in range(num_unrollings + 1):
            feed_dict[train_data[i]] = batches[i]
        _, l, predictions, lr = session.run(
          [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
        mean_loss += l
        if step % summary_frequency == 0:
            if step > 0:
                mean_loss = mean_loss / summary_frequency
            # The mean loss is an estimate of the loss over the last few batches.
            print('Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
            mean_loss = 0
            labels = np.concatenate(list(batches)[1:])
            print('Minibatch perplexity: %.2f' % float(
                np.exp(logprob(predictions, labels))))
            if step % (summary_frequency * 10) == 0:
                # Generate some samples.
                print('=' * 80)
                for _ in range(5):
                    feed = sample(random_distribution())
                    sentence = characters(feed)[0]
                    reset_sample_state.run()
                    for _ in range(79):
                        prediction = sample_prediction.eval({sample_input: feed})
                        feed = sample(prediction)
                        sentence += characters(feed)[0]
                    print(sentence)
                print('=' * 80)
            # Measure validation set perplexity.
            reset_sample_state.run()
            valid_logprob = 0
            for _ in range(valid_size):
                b = valid_batches.next()
                predictions = sample_prediction.eval({sample_input: b[0]})
                valid_logprob = valid_logprob + logprob(predictions, b[1])
            print('Validation set perplexity: %.2f' % float(np.exp(
                valid_logprob / valid_size)))

Initialized
Average loss at step 0: 3.314170 learning rate: 10.000000
Minibatch perplexity: 27.50
s  e    e   w   e          e                            g                       
f  e                           e        ee          r    e  e        e          
s                       r                     t e     er             s       e  
v t e  e         t  e                       e               e                   
x    t e              t              ee   r                                e    
Validation set perplexity: 250.26
Average loss at step 100: 2.705637 learning rate: 10.000000
Minibatch perplexity: 11.04
Validation set perplexity: 10.90
Average loss at step 200: 2.322724 learning rate: 10.000000
Minibatch perplexity: 9.12
Validation set perplexity: 9.59
Average loss at step 300: 2.185354 learning rate: 10.000000
Minibatch perplexity: 7.10
Validation set perplexity: 8.86
Average loss at step 400: 2.119677 learning rate: 10.000000
Minibatch perplexity: 8.39
Validation set pe

Validation set perplexity: 5.18
Average loss at step 4500: 1.689370 learning rate: 10.000000
Minibatch perplexity: 5.72
Validation set perplexity: 5.34
Average loss at step 4600: 1.673864 learning rate: 10.000000
Minibatch perplexity: 5.47
Validation set perplexity: 5.16
Average loss at step 4700: 1.670473 learning rate: 10.000000
Minibatch perplexity: 5.07
Validation set perplexity: 5.21
Average loss at step 4800: 1.656100 learning rate: 10.000000
Minibatch perplexity: 4.94
Validation set perplexity: 5.20
Average loss at step 4900: 1.662757 learning rate: 10.000000
Minibatch perplexity: 5.13
Validation set perplexity: 5.03
Average loss at step 5000: 1.656075 learning rate: 1.000000
Minibatch perplexity: 5.14
cide specited has eustern one nine six a desimsus seal thearce me with canide sa
hag mostlu adracran arkang one dephated lite which four tominaria at british rel
less area demip rajestate notamen at jew of hilked the limet and ordidenral has 
parts and officate th the achiclybelit

---
Problem 2
---------

We want to train a LSTM over bigrams, that is pairs of consecutive characters like 'ab' instead of single characters like 'a'. Since the number of possible bigrams is large, feeding them directly to the LSTM using 1-hot encodings will lead to a very sparse representation that is very wasteful computationally.

a- Introduce an embedding lookup on the inputs, and feed the embeddings to the LSTM cell instead of the inputs themselves.

b- Write a bigram-based LSTM, modeled on the character LSTM above.

c- Introduce Dropout. For best practices on how to use Dropout in LSTMs, refer to this [article](http://arxiv.org/abs/1409.2329).

---

In [12]:
# a) - introduced embedding lookup on the inputs + feeding the embeddings to the LSTM cell
# The simplified matrix multiplication from the above task is also used here.

embedding_size = 128

graph = tf.Graph()
with graph.as_default():
    # Parameters:
    vocabulary_embeddings = tf.Variable(
        tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0)) # 27x128
    
    num_nodes = 64
    
    # Variables saving state across unrollings.
    saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    # Classifier weights and biases.
    w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
    b = tf.Variable(tf.zeros([vocabulary_size]))
    
    ifcox = tf.Variable(tf.truncated_normal([embedding_size, 4 * num_nodes], -0.1, 0.1)) # 27x256
    ifcom = tf.Variable(tf.truncated_normal([num_nodes, 4 * num_nodes], -0.1, 0.1)) # 27x256
    ifcob = tf.Variable(tf.truncated_normal([1, 4 * num_nodes])) # 27x256
    
    # Definition of the cell computation.
    def lstm_cell(i, o, state):
        """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
        Note that in this formulation, we omit the various connections between the
        previous state and the gates."""
        gate = tf.matmul(i, ifcox) + tf.matmul(o, ifcom) + ifcob # 64x128 * 128x256 -> 64x256 -> split into 4 64x64
        gate_split = tf.split(gate, 4, 1) # split into 4 arrs across dimension 1
        
        input_gate = tf.sigmoid(gate_split[0]) # 64x64
        forget_gate = tf.sigmoid(gate_split[1]) # 64x64
        update = gate_split[2] # 64x64
        state = forget_gate * state + input_gate * tf.tanh(update) # 64x64
        output_gate = tf.sigmoid(gate_split[3]) # 64x64
        
        return output_gate * tf.tanh(state), state # 64x64

    # Input data.
    train_data = list()
    for _ in range(num_unrollings + 1):
        train_data.append(
            tf.placeholder(tf.float32, shape=[batch_size,vocabulary_size]))
    train_inputs = train_data[:num_unrollings]
    train_labels = train_data[1:]  # labels are inputs shifted by one time step.

    # Unrolled LSTM loop.
    outputs = list()
    output = saved_output
    state = saved_state
    
    for i in train_inputs: # 64x27 -> 64x128 (almost nothing changes the output size remains the same)
        
        # argmax returns 64, containing the indecies of the characters 0 - 26
        i_embedding = tf.nn.embedding_lookup(vocabulary_embeddings, tf.argmax(i, dimension=1)) 
        output, state = lstm_cell(i_embedding, output, state) # Passing the embedding 64x128
        outputs.append(output)

    # State saving across unrollings.
    with tf.control_dependencies([saved_output.assign(output),
                                  saved_state.assign(state)]):
      # Classifier.
        logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b) # 640x64 * 64x27 -> 640x27
        loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(
                labels=tf.concat(train_labels, 0), logits=logits))

    # Optimizer.
    global_step = tf.Variable(0)
    learning_rate = tf.train.exponential_decay(
        10.0, global_step, 5000, 0.1, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    gradients, v = zip(*optimizer.compute_gradients(loss))
    gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
    optimizer = optimizer.apply_gradients(
        zip(gradients, v), global_step=global_step)

    # Predictions.
    train_prediction = tf.nn.softmax(logits)
    
    # Sampling and validation eval: batch 1, no unrolling.
    sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size])
    sample_input_embedding = tf.nn.embedding_lookup(vocabulary_embeddings, tf.argmax(sample_input, dimension=1))
    saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
    saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
    reset_sample_state = tf.group(
        saved_sample_output.assign(tf.zeros([1, num_nodes])),
        saved_sample_state.assign(tf.zeros([1, num_nodes])))
    sample_output, sample_state = lstm_cell(
        sample_input_embedding, saved_sample_output, saved_sample_state)
    with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                  saved_sample_state.assign(sample_state)]):
        sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

Instructions for updating:
Use the `axis` argument instead
Instructions for updating:
Use tf.cast instead.


In [13]:
num_steps = 7001
summary_frequency = 100

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print('Initialized')
    mean_loss = 0
    for step in range(num_steps):
        batches = train_batches.next()
        feed_dict = dict()
        for i in range(num_unrollings + 1):
            feed_dict[train_data[i]] = batches[i]
        _, l, predictions, lr = session.run(
          [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
        mean_loss += l
        
        if step % summary_frequency == 0:
            if step > 0:
                mean_loss = mean_loss / summary_frequency
            # The mean loss is an estimate of the loss over the last few batches.
            print('Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
            mean_loss = 0
            labels = np.concatenate(list(batches)[1:])
            print('Minibatch perplexity: %.2f' % float(
                np.exp(logprob(predictions, labels))))
            if step % (summary_frequency * 10) == 0:
                # Generate some samples.
                print('=' * 80)
                for _ in range(5):
                    feed = sample(random_distribution())
                    sentence = characters(feed)[0]
                    reset_sample_state.run()
                    for _ in range(79):
                        prediction = sample_prediction.eval({sample_input: feed})
                        feed = sample(prediction)
                        sentence += characters(feed)[0]
                    print(sentence)
                print('=' * 80)
            # Measure validation set perplexity.
            reset_sample_state.run()
            valid_logprob = 0
            for _ in range(valid_size):
                b = valid_batches.next()
                predictions = sample_prediction.eval({sample_input: b[0]})
                valid_logprob = valid_logprob + logprob(predictions, b[1])
            print('Validation set perplexity: %.2f' % float(np.exp(
                valid_logprob / valid_size)))

Initialized
Average loss at step 0: 3.307521 learning rate: 10.000000
Minibatch perplexity: 27.32
cassp  amng sx nt  soae  en       r    r  n      zspde n   n  ai  i ekr c  e  ti
k            ef   eisw  l    se  dio    w ea nro i     ep  e  ti  a erg  ue e   
 a ean em   do    ah  arc  asm oe id    efcwm ee        c tok    seu en       f 
aic    oee    t  ie    t et     dgtit  e  akok  t  aen   sc     y     e erem  h 
ep tq  r sgnv    str     etr e  or     ae  r oen      h ie e im r a    n      ay
Validation set perplexity: 28.09
Average loss at step 100: 2.379922 learning rate: 10.000000
Minibatch perplexity: 10.55
Validation set perplexity: 9.36
Average loss at step 200: 2.084412 learning rate: 10.000000
Minibatch perplexity: 7.50
Validation set perplexity: 8.14
Average loss at step 300: 1.979916 learning rate: 10.000000
Minibatch perplexity: 6.51
Validation set perplexity: 7.54
Average loss at step 400: 1.924629 learning rate: 10.000000
Minibatch perplexity: 6.49
Validation set perp

Validation set perplexity: 5.25
Average loss at step 4500: 1.650034 learning rate: 10.000000
Minibatch perplexity: 5.06
Validation set perplexity: 5.18
Average loss at step 4600: 1.650389 learning rate: 10.000000
Minibatch perplexity: 5.20
Validation set perplexity: 5.18
Average loss at step 4700: 1.622083 learning rate: 10.000000
Minibatch perplexity: 5.51
Validation set perplexity: 5.19
Average loss at step 4800: 1.607349 learning rate: 10.000000
Minibatch perplexity: 5.24
Validation set perplexity: 5.33
Average loss at step 4900: 1.623801 learning rate: 10.000000
Minibatch perplexity: 5.19
Validation set perplexity: 5.10
Average loss at step 5000: 1.643666 learning rate: 1.000000
Minibatch perplexity: 5.47
de with a produles truss shor very viso then largery and the size considere tran
ly hubbisticle of the smaller thampor by gree one primed to commissian that trec
ms dichanged licearpan c work for the michle cas chros loging three zero six for
zbun off fuet indar drascus were weigh

In [14]:
# b) - Added a bigram-based LSTM, modeled on the character LSTM above. The model uses the same batches, except
# for the validation batch, where at least 3 characters are needed to get output
# The bigrams are made like this:
# if we have input xyzt this is changed to (xy), (yz), (zt). Therefore if the input is of size n it become
# of size n - 1
# The labels are shifted by 2 instead of 1 

# The embedding lookup is done as follows: 
# We get 2 indecies for the first and second characted in the bigram, then the index of the first 
# character is multiplied by the size of the vocabulary and the second index is added to get a unique
# index for every pair of characters.


embedding_size = 128

graph = tf.Graph()
with graph.as_default():
    # Parameters:
    vocabulary_embeddings = tf.Variable(
        tf.random_uniform([vocabulary_size*vocabulary_size, embedding_size], -1.0, 1.0)) # 27x128
    
    num_nodes = 64
    
    # Variables saving state across unrollings.
    saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    # Classifier weights and biases.
    w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
    b = tf.Variable(tf.zeros([vocabulary_size]))
    
    ifcox = tf.Variable(tf.truncated_normal([embedding_size, 4 * num_nodes], -0.1, 0.1)) # 27x256
    ifcom = tf.Variable(tf.truncated_normal([num_nodes, 4 * num_nodes], -0.1, 0.1)) # 27x256
    ifcob = tf.Variable(tf.truncated_normal([1, 4 * num_nodes])) # 27x256
    
    # Definition of the cell computation.
    def lstm_cell(i, o, state):
        """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
        Note that in this formulation, we omit the various connections between the
        previous state and the gates."""
        gate = tf.matmul(i, ifcox) + tf.matmul(o, ifcom) + ifcob # 64x128 * 128x256 -> 64x256 -> split into 4 64x64
        gate_split = tf.split(gate, 4, 1) # split into 4 arrs across dimension 1
        
        input_gate = tf.sigmoid(gate_split[0]) # 64x64
        forget_gate = tf.sigmoid(gate_split[1]) # 64x64
        update = gate_split[2] # 64x64
        state = forget_gate * state + input_gate * tf.tanh(update) # 64x64
        output_gate = tf.sigmoid(gate_split[3]) # 64x64
        
        return output_gate * tf.tanh(state), state # 64x64
  
    # Input data.
    train_data = list()
    for _ in range(num_unrollings + 1):
        train_data.append(
            tf.placeholder(tf.float32, shape=[batch_size,vocabulary_size]))
        
    # batch size is 64, train data contains (num_unrollings + 1)x64x27
    train_labels = train_data[2:]  # labels are inputs shifted by TWO time steps.
    
    train_inputs = train_data[:num_unrollings]
    train_inputs_zipped = zip(train_inputs[:-1], train_inputs[1:])
    
    
    # Unrolled LSTM loop.
    outputs = list()
    output = saved_output
    state = saved_state
    
    for i in train_inputs_zipped: # 64x27 -> 64x128 (almost nothing changes the output size remain)
        
        # Convert 0..26, 0..26 into 0...729
        # The way to do it usign the formula idx1*27 + idx2; idx1 is from 0 to 26; idx2 is from 0 to 26
        # This way for every bigram xy the index will be different if the bigrams are different
        
        idx1 = tf.argmax(i[0], dimension=1)
        idx2 = tf.argmax(i[1], dimension=1)
        
        # dim is 64,
        i_embedding = tf.nn.embedding_lookup(vocabulary_embeddings, idx1*vocabulary_size + idx2)
        
        output, state = lstm_cell(i_embedding, output, state) # Passing the embedding 64x128
        outputs.append(output)

    # State saving across unrollings.
    with tf.control_dependencies([saved_output.assign(output),
                                  saved_state.assign(state)]):
      # Classifier.
        logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b) # 640x64 * 64x27 -> 640x27
        loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(
                labels=tf.concat(train_labels, 0), logits=logits))

    # Optimizer.
    global_step = tf.Variable(0)
    learning_rate = tf.train.exponential_decay(
        10.0, global_step, 5000, 0.1, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    gradients, v = zip(*optimizer.compute_gradients(loss))
    gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
    optimizer = optimizer.apply_gradients(
        zip(gradients, v), global_step=global_step)

    # Predictions.
    train_prediction = tf.nn.softmax(logits)
    
    # Sampling and validation eval: batch 1, no unrolling.
    sample_input = list()
    
    sample_input.extend([tf.placeholder(tf.float32, shape=[1, vocabulary_size]), 
                         tf.placeholder(tf.float32, shape=[1, vocabulary_size])])
    
    sample_input_idx1 = tf.argmax(sample_input[0], dimension=1)
    sample_input_idx2 = tf.argmax(sample_input[1], dimension=1)
    
    # Only 1 bigram -> 1 output
    sample_input_embedding = tf.nn.embedding_lookup(vocabulary_embeddings, 
                                                    sample_input_idx1 * vocabulary_size + sample_input_idx2)
    saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
    saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
    reset_sample_state = tf.group(
        saved_sample_output.assign(tf.zeros([1, num_nodes])),
        saved_sample_state.assign(tf.zeros([1, num_nodes])))
    sample_output, sample_state = lstm_cell(
        sample_input_embedding, saved_sample_output, saved_sample_state)
    with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                  saved_sample_state.assign(sample_state)]):
        sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

In [15]:
import collections

num_steps = 7001
summary_frequency = 100

valid_batches = BatchGenerator(valid_text, 1, 2) # Need at least 3 letters for bigram model

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print('Initialized')
    mean_loss = 0
    for step in range(num_steps):
        batches = train_batches.next()
        feed_dict = dict()
        for i in range(num_unrollings + 1):
            feed_dict[train_data[i]] = batches[i]
        _, l, predictions, lr = session.run(
          [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
        mean_loss += l
        
        if step % summary_frequency == 0:
            if step > 0:
                mean_loss = mean_loss / summary_frequency
            # The mean loss is an estimate of the loss over the last few batches.
            print('Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
            mean_loss = 0
            labels = np.concatenate(list(batches)[2:])
            print('Minibatch perplexity: %.2f' % float(
                np.exp(logprob(predictions, labels))))
            if step % (summary_frequency * 10) == 0:
                # Generate some samples.
                print('=' * 80)
                for _ in range(5):
                    feed = collections.deque(maxlen=2) # Used as buffer
                    # Add 2 sample letters to the buffer for input for the bigram model
                    feed.extend([random_distribution(), random_distribution()])
                    sentence = characters(feed[0])[0] + characters(feed[1])[0]
                    
                    reset_sample_state.run()
                    for _ in range(79):
                        prediction = sample_prediction.eval({sample_input[0]: feed[0],
                                                             sample_input[1]: feed[1]})
                        feed.append(sample(prediction))
                        sentence += characters(feed[-1])[0]
                        
                    print(sentence)
                print('=' * 80)
            # Measure validation set perplexity.
            reset_sample_state.run()
            valid_logprob = 0
            for _ in range(valid_size):
                b = valid_batches.next()
                predictions = sample_prediction.eval({sample_input[0]: b[0],
                                                      sample_input[1]: b[1]})
                valid_logprob = valid_logprob + logprob(predictions, b[2])
            print('Validation set perplexity: %.2f' % float(np.exp(
                valid_logprob / valid_size)))

Initialized
Average loss at step 0: 3.287021 learning rate: 10.000000
Minibatch perplexity: 26.76
jy  ern oe laj rn d      ae e   i s oni     t   go o e i f r             ee   e l
vtw    eo          e aht e             e         i k                             
hlot    i ok e       e     e a w        e      a a i n         e sta      e      
erteen ue   i z                     r           k   si          htrgir   e i a w 
pq         i e     s          ce ne                 m                     r      
Validation set perplexity: 30.43
Average loss at step 100: 2.337829 learning rate: 10.000000
Minibatch perplexity: 8.27
Validation set perplexity: 9.13
Average loss at step 200: 2.006639 learning rate: 10.000000
Minibatch perplexity: 7.16
Validation set perplexity: 8.73
Average loss at step 300: 1.921440 learning rate: 10.000000
Minibatch perplexity: 6.29
Validation set perplexity: 8.09
Average loss at step 400: 1.859646 learning rate: 10.000000
Minibatch perplexity: 7.18
Validation set 

Validation set perplexity: 7.15
Average loss at step 4500: 1.585834 learning rate: 10.000000
Minibatch perplexity: 5.17
Validation set perplexity: 7.07
Average loss at step 4600: 1.594951 learning rate: 10.000000
Minibatch perplexity: 4.69
Validation set perplexity: 7.01
Average loss at step 4700: 1.603879 learning rate: 10.000000
Minibatch perplexity: 4.50
Validation set perplexity: 7.52
Average loss at step 4800: 1.599634 learning rate: 10.000000
Minibatch perplexity: 4.37
Validation set perplexity: 7.17
Average loss at step 4900: 1.621592 learning rate: 10.000000
Minibatch perplexity: 5.21
Validation set perplexity: 7.11
Average loss at step 5000: 1.629236 learning rate: 1.000000
Minibatch perplexity: 4.85
ojence of konization and essider in dogisach is insivisia base the great but allo
khalst leon is of lasted abstackentius nic may world it them played other or unde
cn eur s histon tens it groups warno is would directon by and have lights in the 
gno position cominary the brokend p

In [16]:
# c) Added dropout to the non-recurrent part of the network (15% chance to drop a neuron)

embedding_size = 128

graph = tf.Graph()
with graph.as_default():
    # Parameters:
    vocabulary_embeddings = tf.Variable(
        tf.random_uniform([vocabulary_size*vocabulary_size, embedding_size], -1.0, 1.0)) # 27x128
    
    num_nodes = 64
    
    # Variables saving state across unrollings.
    saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    # Classifier weights and biases.
    w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
    b = tf.Variable(tf.zeros([vocabulary_size]))
    
    ifcox = tf.Variable(tf.truncated_normal([embedding_size, 4 * num_nodes], -0.1, 0.1)) # 27x256
    ifcom = tf.Variable(tf.truncated_normal([num_nodes, 4 * num_nodes], -0.1, 0.1)) # 27x256
    ifcob = tf.Variable(tf.truncated_normal([1, 4 * num_nodes])) # 27x256
    
    drop_rate = tf.placeholder(tf.float32)
    
    # Definition of the cell computation.
    def lstm_cell(i, o, state):
        """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
        Note that in this formulation, we omit the various connections between the
        previous state and the gates."""
        gate = tf.matmul(i, ifcox) + tf.matmul(o, ifcom) + ifcob # 64x128 * 128x256 -> 64x256 -> split into 4 64x64
        gate_split = tf.split(gate, 4, 1) # split into 4 arrs across dimension 1
        
        input_gate = tf.sigmoid(gate_split[0]) # 64x64
        forget_gate = tf.sigmoid(gate_split[1]) # 64x64
        update = gate_split[2] # 64x64
        state = forget_gate * state + input_gate * tf.tanh(update) # 64x64
        output_gate = tf.sigmoid(gate_split[3]) # 64x64
        
        return output_gate * tf.tanh(state), state # 64x64
  
    # Input data.
    train_data = list()
    for _ in range(num_unrollings + 1):
        train_data.append(
            tf.placeholder(tf.float32, shape=[batch_size,vocabulary_size]))
    
    # batch size is 64, train data contains (num_unrollings + 1)x64x27
    train_labels = train_data[2:]  # labels are inputs shifted by TWO time steps.
    
    train_inputs = train_data[:num_unrollings]
    train_inputs_zipped = zip(train_inputs[:-1], train_inputs[1:])
    
    
    # Unrolled LSTM loop.
    outputs = list()
    output = saved_output
    state = saved_state
    
    for i in train_inputs_zipped: # 64x27 -> 64x128 (almost nothing changes the output size remain)
        
        # Convert 0..26, 0..26 into 0...729
        # The way to do it usign the formula idx1*27 + idx2; idx1 is from 0 to 26; idx2 is from 0 to 26
        # This way for every bigram xy the index will be different if the bigrams are different
        
        idx1 = tf.argmax(i[0], dimension=1)
        idx2 = tf.argmax(i[1], dimension=1)
        
        # dim is 64,
        i_embedding = tf.nn.embedding_lookup(vocabulary_embeddings, idx1*vocabulary_size + idx2)
        
        output, state = lstm_cell(i_embedding, output, state) # Passing the embedding 64x128
        outputs.append(output)

    # State saving across unrollings.
    with tf.control_dependencies([saved_output.assign(output),
                                  saved_state.assign(state)]):
      # Classifier.
        logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b) # 640x64 * 64x27 -> 640x27
        dropped = tf.nn.dropout(logits, rate=drop_rate)
        loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(
                labels=tf.concat(train_labels, 0), logits=dropped))

    # Optimizer.
    global_step = tf.Variable(0)
    learning_rate = tf.train.exponential_decay(
        10.0, global_step, 5000, 0.1, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    gradients, v = zip(*optimizer.compute_gradients(loss))
    gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
    optimizer = optimizer.apply_gradients(
        zip(gradients, v), global_step=global_step)

    # Predictions.
    train_prediction = tf.nn.softmax(logits)
    
    # Sampling and validation eval: batch 1, no unrolling.
    sample_input = list()
    
    sample_input.extend([tf.placeholder(tf.float32, shape=[1, vocabulary_size]), 
                         tf.placeholder(tf.float32, shape=[1, vocabulary_size])])
    
    sample_input_idx1 = tf.argmax(sample_input[0], dimension=1)
    sample_input_idx2 = tf.argmax(sample_input[1], dimension=1)
    
    # Only 1 bigram -> 1 output
    sample_input_embedding = tf.nn.embedding_lookup(vocabulary_embeddings, 
                                                    sample_input_idx1 * vocabulary_size + sample_input_idx2)
    saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
    saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
    reset_sample_state = tf.group(
        saved_sample_output.assign(tf.zeros([1, num_nodes])),
        saved_sample_state.assign(tf.zeros([1, num_nodes])))
    sample_output, sample_state = lstm_cell(
        sample_input_embedding, saved_sample_output, saved_sample_state)
    with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                  saved_sample_state.assign(sample_state)]):
        sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

In [17]:
import collections

num_steps = 15001
summary_frequency = 100

valid_batches = BatchGenerator(valid_text, 1, 2) # Need at least 3 letters for bigram model

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print('Initialized')
    mean_loss = 0
    for step in range(num_steps):
        batches = train_batches.next()
        feed_dict = dict()
        feed_dict[drop_rate] = 0.10
        for i in range(num_unrollings + 1):
            feed_dict[train_data[i]] = batches[i]
        _, l, predictions, lr = session.run(
          [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
        mean_loss += l
        
        if step % summary_frequency == 0:
            if step > 0:
                mean_loss = mean_loss / summary_frequency
            # The mean loss is an estimate of the loss over the last few batches.
            print('Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
            mean_loss = 0
            labels = np.concatenate(list(batches)[2:])
            print('Minibatch perplexity: %.2f' % float(
                np.exp(logprob(predictions, labels))))
            if step % (summary_frequency * 10) == 0:
                # Generate some samples.
                print('=' * 80)
                for _ in range(5):
                    feed = collections.deque(maxlen=2) # Used as buffer
                    # Add 2 sample letters to the buffer for input for the bigram model
                    feed.extend([random_distribution(), random_distribution()])
                    sentence = characters(feed[0])[0] + characters(feed[1])[0]
                    
                    reset_sample_state.run()
                    for _ in range(79):
                        prediction = sample_prediction.eval({sample_input[0]: feed[0],
                                                             sample_input[1]: feed[1]})
                        feed.append(sample(prediction))
                        sentence += characters(feed[-1])[0]
                        
                    print(sentence)
                print('=' * 80)
            # Measure validation set perplexity.
            reset_sample_state.run()
            valid_logprob = 0
            for _ in range(valid_size):
                b = valid_batches.next()
                predictions = sample_prediction.eval({sample_input[0]: b[0],
                                                      sample_input[1]: b[1],
                                                      drop_rate: 0.0})
                valid_logprob = valid_logprob + logprob(predictions, b[2])
            print('Validation set perplexity: %.2f' % float(np.exp(
                valid_logprob / valid_size)))

Initialized
Average loss at step 0: 3.296660 learning rate: 10.000000
Minibatch perplexity: 26.65
yf   qe  i  b  i   ea f  jv    e z    l dd    c b    z  a g d  d  hm e p  a  e   
hbfa  z  j  t   p  is   x t  le  wni   r r c  r e t    ee  g  i i d   o l  m     
fk  u  e  i g e we  c   l r he   a  e  g wht  c   e b ee  a   e  i w me  e  y  y 
mo s e    r  g wp po s  t  c   m   r lz  b    we t  p  t  d   d  t e e  m  t e  s
fnb  e   r cm a  e   y hn esoes  dd  t  k h     e  y e   we  a  t  g n  e  i  q  
Validation set perplexity: 56.66
Average loss at step 100: 2.480084 learning rate: 10.000000
Minibatch perplexity: 8.86
Validation set perplexity: 9.86
Average loss at step 200: 2.147658 learning rate: 10.000000
Minibatch perplexity: 8.16
Validation set perplexity: 8.67
Average loss at step 300: 2.070184 learning rate: 10.000000
Minibatch perplexity: 6.59
Validation set perplexity: 8.45
Average loss at step 400: 2.006303 learning rate: 10.000000
Minibatch perplexity: 6.79
Validation set 

Validation set perplexity: 7.10
Average loss at step 4500: 1.755394 learning rate: 10.000000
Minibatch perplexity: 5.38
Validation set perplexity: 7.38
Average loss at step 4600: 1.744840 learning rate: 10.000000
Minibatch perplexity: 4.77
Validation set perplexity: 7.07
Average loss at step 4700: 1.747263 learning rate: 10.000000
Minibatch perplexity: 5.03
Validation set perplexity: 7.04
Average loss at step 4800: 1.759026 learning rate: 10.000000
Minibatch perplexity: 5.46
Validation set perplexity: 6.99
Average loss at step 4900: 1.744407 learning rate: 10.000000
Minibatch perplexity: 5.08
Validation set perplexity: 6.80
Average loss at step 5000: 1.759398 learning rate: 1.000000
Minibatch perplexity: 4.94
fliring as car calist and teaheintephazod deb arriore change withht stitude to ca
unfathe of the classe find al in traintings reterrdin cereation ences frecloser d
ychablecutence bomepy leffected ans incressed ves prohibits xed to images bites m
yudniv kuwas aw harry all in prove 

Validation set perplexity: 6.48
Average loss at step 9100: 1.723326 learning rate: 1.000000
Minibatch perplexity: 4.70
Validation set perplexity: 6.50
Average loss at step 9200: 1.746263 learning rate: 1.000000
Minibatch perplexity: 5.01
Validation set perplexity: 6.46
Average loss at step 9300: 1.735722 learning rate: 1.000000
Minibatch perplexity: 5.48
Validation set perplexity: 6.58
Average loss at step 9400: 1.709109 learning rate: 1.000000
Minibatch perplexity: 5.08
Validation set perplexity: 6.54
Average loss at step 9500: 1.728671 learning rate: 1.000000
Minibatch perplexity: 4.24
Validation set perplexity: 6.56
Average loss at step 9600: 1.723109 learning rate: 1.000000
Minibatch perplexity: 4.46
Validation set perplexity: 6.61
Average loss at step 9700: 1.726572 learning rate: 1.000000
Minibatch perplexity: 4.75
Validation set perplexity: 6.60
Average loss at step 9800: 1.729513 learning rate: 1.000000
Minibatch perplexity: 5.03
Validation set perplexity: 6.60
Average loss at 

bqce pense to partly life resultiple down the define ensurrouxyus a mzle er needs
Validation set perplexity: 6.49
Average loss at step 14100: 1.719881 learning rate: 0.100000
Minibatch perplexity: 4.79
Validation set perplexity: 6.49
Average loss at step 14200: 1.715717 learning rate: 0.100000
Minibatch perplexity: 5.47
Validation set perplexity: 6.49
Average loss at step 14300: 1.706301 learning rate: 0.100000
Minibatch perplexity: 4.95
Validation set perplexity: 6.49
Average loss at step 14400: 1.725723 learning rate: 0.100000
Minibatch perplexity: 5.52
Validation set perplexity: 6.51
Average loss at step 14500: 1.741789 learning rate: 0.100000
Minibatch perplexity: 4.99
Validation set perplexity: 6.52
Average loss at step 14600: 1.725941 learning rate: 0.100000
Minibatch perplexity: 5.39
Validation set perplexity: 6.52
Average loss at step 14700: 1.734592 learning rate: 0.100000
Minibatch perplexity: 4.90
Validation set perplexity: 6.51
Average loss at step 14800: 1.725109 learning 

---
Problem 3
---------

(difficult!)

Write a sequence-to-sequence LSTM which mirrors all the words in a sentence. For example, if your input is:

    the quick brown fox
    
the model should attempt to output:

    eht kciuq nworb xof
    
Refer to the lecture on how to put together a sequence-to-sequence model, as well as [this article](http://arxiv.org/abs/1409.3215) for best practices.

---

In [18]:
from tensorflow.python.layers.core import Dense

batch_size = 128
rnn_size = 50
num_layers = 2
embedding_size = 64
learning_rate = 0.001
sequence_length = 7

end = batch_size*6*20000 # to get ~20k batches with the assumption that average word length should be around 6
train_text = text[:end]
train_size = len(train_text)
print(train_size, train_text[:128])

15360000  anarchism originated as a term of abuse first used against early working class radicals including the diggers of the english re


In [19]:
def create_dictionaries(data):
    special_words = ['<PAD>', '<UNK>', '<GO>',  '<END>']

    words = list(string.ascii_lowercase) + special_words
    dictionary = {word: word_i for word_i, word in enumerate(words)}

    return dictionary, dict(zip(dictionary.values(), dictionary.keys())) 

# build dictionary
dictionary, reverse_dictionary = create_dictionaries(train_text)

# Convert each word to dictionary representations
x_ids = [[dictionary.get(letter, dictionary['<UNK>']) for letter in word] for word in train_text.split(' ')]
y_ids = [x[::-1] for x in x_ids]

print("Example raw data:")
print(train_text.split(' ')[:5])
print("Example sequence")
print(x_ids[:5])
print("\n")
print("Example output")
print(y_ids[:5])

Example raw data:
['', 'anarchism', 'originated', 'as', 'a']
Example sequence
[[], [0, 13, 0, 17, 2, 7, 8, 18, 12], [14, 17, 8, 6, 8, 13, 0, 19, 4, 3], [0, 18], [0]]


Example output
[[], [12, 18, 8, 7, 2, 17, 0, 13, 0], [3, 4, 19, 0, 13, 8, 6, 8, 17, 14], [18, 0], [0]]


In [20]:
def create_cell(rnn_size):
    cell = tf.contrib.rnn.LSTMCell(rnn_size,initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
    
    return cell

In [21]:
def get_model_inputs():
    # Tensors with name are needed to be accessed later when the model is retrieved
    inputs = tf.placeholder(tf.int32, [batch_size, None], name = 'inputs')
    labels = tf.placeholder(tf.int32, [batch_size, None])
    lr = tf.placeholder(tf.float32)

    input_sequence_length = tf.placeholder(tf.int32, (batch_size,), name = 'input_sequence_length')
    label_sequence_length = tf.placeholder(tf.int32, (batch_size,), name = 'label_sequence_length')
    
    max_label_sequence_length = tf.reduce_max(label_sequence_length)
    
    return inputs, labels, lr, input_sequence_length, label_sequence_length, max_label_sequence_length

In [22]:
def encoding_layer(inputs, rnn_size, num_layers,
                   input_sequence_length, vocab_size, 
                   embedding_size):

    # embedd the inpiut
    embed_input = tf.contrib.layers.embed_sequence(inputs, vocab_size, embedding_size)
    encoder_cell = tf.contrib.rnn.MultiRNNCell([create_cell(rnn_size) for _ in range(num_layers)])
    encoder_output, encoder_state = tf.nn.dynamic_rnn(encoder_cell, embed_input, sequence_length=input_sequence_length, dtype=tf.float32)
    
    return encoder_output, encoder_state

In [23]:
# preprocessing to remove the last char and add the GO symbol for the decoder
def process_decoder_input(labels, dictionary, batch_size):
    ending = tf.strided_slice(labels, [0, 0], [batch_size, -1], [1, 1])
    decoder_input = tf.concat([tf.fill([batch_size, 1], dictionary['<GO>']), ending], 1)

    return decoder_input

In [24]:
def decoding_layer(dictionary, embedding_size, num_layers, rnn_size,
                   labels_sequence_length, max_label_sequence_length, encoder_state, decoder_input):

    vocab_size = len(dictionary)
    decoder_embeddings = tf.Variable(tf.random_uniform([vocab_size, embedding_size]))
    decoder_embed_input = tf.nn.embedding_lookup(decoder_embeddings, decoder_input)

    decoder_cell = tf.contrib.rnn.MultiRNNCell([create_cell(rnn_size) for _ in range(num_layers)])
     
    output_layer = Dense(vocab_size,
                         kernel_initializer=tf.truncated_normal_initializer(mean = 0.0, stddev=0.1))


    with tf.variable_scope("decode"):
        training_helper = tf.contrib.seq2seq.TrainingHelper(inputs=decoder_embed_input,
                                                            sequence_length=labels_sequence_length,
                                                            time_major=False)
        
        training_decoder = tf.contrib.seq2seq.BasicDecoder(decoder_cell,
                                                           training_helper,
                                                           encoder_state,
                                                           output_layer) 
        
        training_decoder_output = tf.contrib.seq2seq.dynamic_decode(training_decoder,impute_finished=True,
                                                                       maximum_iterations=max_label_sequence_length)[0]

    with tf.variable_scope("decode", reuse=True):
        start_tokens = tf.tile(tf.constant([dictionary['<GO>']], dtype=tf.int32), [batch_size], name='start_tokens')

        inference_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(decoder_embeddings,
                                                                start_tokens,
                                                                dictionary['<END>'])
        
        inference_decoder = tf.contrib.seq2seq.BasicDecoder(decoder_cell,
                                                        inference_helper,
                                                        encoder_state,
                                                        output_layer)
        
        inference_decoder_output = tf.contrib.seq2seq.dynamic_decode(inference_decoder,
                                                            impute_finished=True,
                                                            maximum_iterations=max_label_sequence_length)[0]
         
    
    return training_decoder_output, inference_decoder_output

In [25]:
def seq2seq_model(input_data, labels, lr, inputs_sequence_length, labels_sequence_length, 
                  max_label_sequence_length, vocab_size, embedding_size, rnn_size, num_layers):
    
    _, encoder_state = encoding_layer(input_data, 
                                  rnn_size, 
                                  num_layers, 
                                  inputs_sequence_length,
                                  vocab_size, 
                                  embedding_size)
    
    decoder_input = process_decoder_input(labels, dictionary, batch_size)
    
    training_decoder_output, inference_decoder_output = decoding_layer(dictionary, 
                                                                       embedding_size, 
                                                                       num_layers, 
                                                                       rnn_size,
                                                                       labels_sequence_length,
                                                                       max_label_sequence_length,
                                                                       encoder_state, 
                                                                       decoder_input) 
    
    return training_decoder_output, inference_decoder_output

In [26]:
train_graph = tf.Graph()

with train_graph.as_default():
    
    input_data, labels, lr, input_sequence_length, label_sequence_length, max_label_sequence_length = get_model_inputs()
    
    training_decoder_output, inference_decoder_output = seq2seq_model(input_data, 
                                                                      labels, 
                                                                      lr, 
                                                                      input_sequence_length,
                                                                      label_sequence_length, 
                                                                      max_label_sequence_length, 
                                                                      len(dictionary),
                                                                      embedding_size, 
                                                                      rnn_size, 
                                                                      num_layers)    

    training_logits = tf.identity(training_decoder_output.rnn_output, 'logits')
    inference_logits = tf.identity(inference_decoder_output.sample_id, name='predictions')
    
    masks = tf.sequence_mask(label_sequence_length, max_label_sequence_length, dtype=tf.float32, name='masks')

    with tf.name_scope("optimization"):

        cost = tf.contrib.seq2seq.sequence_loss(
            training_logits,
            labels,
            masks)

        optimizer = tf.train.AdamOptimizer(lr)

        gradients = optimizer.compute_gradients(cost)
        capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API


In [27]:
def pad_batch(batch, pad_int):
    # make sure each word has the same length
    max_length = max([len(word) for word in batch])
    return [word + [pad_int] * (max_length - len(word)) for word in batch]

In [28]:
# Modificiation of the batch generator above, this one iterates the text and returns padded
# word representations based on the longest word

class BatchGenerator(object):
    def __init__(self, labels, inputs, batch_size, input_pad_int, label_pad_int):
        self._labels = labels
        self._inputs = inputs
        self._batch_size = batch_size
        self._input_pad_int = input_pad_int
        self._label_pad_int = label_pad_int
        self._cursor = 0
  
    def next(self):
        # Generates a single batch
        start_i = self._cursor * self._batch_size
        input_batch = self._inputs[start_i:start_i + self._batch_size]
        label_batch = self._labels[start_i:start_i + self._batch_size]
        pad_input_batch = np.array(pad_batch(input_batch, self._input_pad_int))
        pad_label_batch = np.array(pad_batch(label_batch, self._label_pad_int))
        
        pad_label_lengths = []
        for i in pad_label_batch:
            pad_label_lengths.append(len(i))
        
        pad_input_lengths = []
        for i in pad_input_batch:
            pad_input_lengths.append(len(i))
            
        self._cursor = self._cursor + 1
        
        return pad_label_batch, pad_input_batch, pad_label_lengths, pad_input_lengths

In [29]:
# Train and validation data split

train_input = x_ids[batch_size:]
train_labels = y_ids[batch_size:]

valid_input = x_ids[:batch_size]
valid_labels = y_ids[:batch_size]

batch_generator_valid = \
        BatchGenerator(valid_labels, valid_input, batch_size, dictionary['<PAD>'], dictionary['<PAD>'])
batch_generator_train = \
        BatchGenerator(train_labels, train_input, batch_size, dictionary['<PAD>'], dictionary['<PAD>'])


(valid_labels_batch, valid_input_batch, valid_labels_lengths, valid_input_lengths) = batch_generator_valid.next()

display_step = 1000

checkpoint = "seq2seq.ckpt" 
with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())
        
    for batch_i in range(0, len(train_input)//batch_size):
        
        (labels_batch, input_batch, labels_lengths, input_lengths) = batch_generator_train.next()

        _, loss = sess.run(
            [train_op, cost],
            {input_data: input_batch,
             labels: labels_batch,
             lr: learning_rate,
             label_sequence_length: labels_lengths,
             input_sequence_length: input_lengths})

        if batch_i % display_step == 0 and batch_i > 0:

            validation_loss = sess.run(
            [cost],
            {input_data: valid_input_batch,
             labels: valid_labels_batch,
             lr: learning_rate,
             label_sequence_length: valid_labels_lengths,
             input_sequence_length: valid_input_lengths})

            print('Batch %d/%d - Loss: %.3f  - Validation loss: %.3f' % (batch_i, 
                          len(train_input) // batch_size, 
                          loss, 
                          validation_loss[0]))

    # save the model state
    saver = tf.train.Saver()
    saver.save(sess, checkpoint)
    print('Model Trained and Saved')

Batch 1000/20435 - Loss: 0.803  - Validation loss: 0.760
Batch 2000/20435 - Loss: 0.301  - Validation loss: 0.314
Batch 3000/20435 - Loss: 0.176  - Validation loss: 0.153
Batch 4000/20435 - Loss: 0.076  - Validation loss: 0.082
Batch 5000/20435 - Loss: 0.052  - Validation loss: 0.041
Batch 6000/20435 - Loss: 0.016  - Validation loss: 0.034
Batch 7000/20435 - Loss: 0.025  - Validation loss: 0.020
Batch 8000/20435 - Loss: 0.013  - Validation loss: 0.022
Batch 9000/20435 - Loss: 0.017  - Validation loss: 0.019
Batch 10000/20435 - Loss: 0.008  - Validation loss: 0.004
Batch 11000/20435 - Loss: 0.014  - Validation loss: 0.004
Batch 12000/20435 - Loss: 0.006  - Validation loss: 0.008
Batch 13000/20435 - Loss: 0.010  - Validation loss: 0.006
Batch 14000/20435 - Loss: 0.007  - Validation loss: 0.003
Batch 15000/20435 - Loss: 0.002  - Validation loss: 0.003
Batch 16000/20435 - Loss: 0.003  - Validation loss: 0.005
Batch 17000/20435 - Loss: 0.002  - Validation loss: 0.003
Batch 18000/20435 - Los

In [30]:
def to_seq(text):
    # preprocess text for the model
    return [dictionary.get(word, dictionary['<UNK>']) for word in text] + [dictionary['<PAD>']]*(sequence_length-len(text))

In [31]:
# Sample sentence + output
# The sentence is split into words and fed to the seq2seq model, then each word is outputed + the final sentence
# at the end.

pad = dictionary["<PAD>"] 

checkpoint = "./seq2seq.ckpt"

loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    
    # load the model
    loader = tf.train.import_meta_graph(checkpoint + '.meta')
    loader.restore(sess, checkpoint)

    input_data = loaded_graph.get_tensor_by_name('inputs:0')
    logits = loaded_graph.get_tensor_by_name('predictions:0')
    input_sequence_length = loaded_graph.get_tensor_by_name('input_sequence_length:0')
    label_sequence_length = loaded_graph.get_tensor_by_name('label_sequence_length:0')
    
    input_sentence = 'the quick brown fox'
    words = input_sentence.split(' ')
    
    print()
    print('Original Text: %s' % input_sentence)
    print('  Word Ids: %s' % ([[letter for letter in to_seq(word)] for word in words]))
    inputWords = [" ".join([reverse_dictionary[i] for i in to_seq(word)]) for word in words]
    print('Input words: %s' % inputWords)
    
    print()
    print("Output:")
    outputs = list()
    for word in input_sentence.split(' '):
        word_seq = to_seq(word)
        answer_logits = sess.run(logits, {input_data: [word_seq]*batch_size, 
                                      label_sequence_length: [len(word_seq)]*batch_size, 
                                      input_sequence_length: [len(word_seq)]*batch_size})[0]

        
        output_word = ''.join([reverse_dictionary[i] for i in answer_logits if i != pad])

        print('Word Ids: %s' % ([i for i in answer_logits if i != pad]))
        print('Response Word: %s' % (output_word))
        outputs.append(output_word)
    
    print()
    print("Sentence output: %s" % (' '.join(outputs)))
        

Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from ./seq2seq.ckpt

Original Text: the quick brown fox
  Word Ids: [[19, 7, 4, 26, 26, 26, 26], [16, 20, 8, 2, 10, 26, 26], [1, 17, 14, 22, 13, 26, 26], [5, 14, 23, 26, 26, 26, 26]]
Input words: ['t h e <PAD> <PAD> <PAD> <PAD>', 'q u i c k <PAD> <PAD>', 'b r o w n <PAD> <PAD>', 'f o x <PAD> <PAD> <PAD> <PAD>']

Output:
Word Ids: [4, 7, 19]
Response Word: eht
Word Ids: [10, 2, 8, 20, 16]
Response Word: kciuq
Word Ids: [13, 22, 14, 17, 1]
Response Word: nworb
Word Ids: [23, 14, 5]
Response Word: xof

Sentence output: eht kciuq nworb xof
