RNN based word-level language model for the Shakespeare plays and poems

In [1]:
import os
import os.path
import shutil
import numpy as np
import pandas as pd
import string
import collections
import itertools
from functools import reduce
import random
import json
import pickle

import warnings
warnings.filterwarnings('ignore')

import nltk
print('NTLK version', nltk.__version__)

import matplotlib.pyplot as plt
%matplotlib inline

import tensorflow as tf
print('TensorFlow version', tf.__version__)

NTLK version 3.2.3
TensorFlow version 1.3.0


In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\adubitskiy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
with open('shakespeare_plays.pickle', 'rb') as handle:
    speeches = pickle.load(handle)

In [4]:
with open('shakespeare_poetries.pickle', 'rb') as handle:
    poetries = pickle.load(handle)

In [5]:
shakespeare_texts = [s['speech_text'] for s in speeches] + [p['text'] for p in poetries]

Text sample (first ten speeches)

In [6]:
print(shakespeare_texts[:10])

['In delivering my son from me, I bury a second husband.', "And I in going, madam, weep o'er my father's death\nanew: but I must attend his majesty's command, to\nwhom I am now in ward, evermore in subjection.", 'You shall find of the king a husband, madam; you,\nsir, a father: he that so generally is at all times\ngood must of necessity hold his virtue to you; whose\nworthiness would stir it up where it wanted rather\nthan lack it where there is such abundance.', "What hope is there of his majesty's amendment?", 'He hath abandoned his physicians, madam; under whose\npractises he hath persecuted time with hope, and\nfinds no other advantage in the process but only the\nlosing of hope by time.', "This young gentlewoman had a father,--O, that\n'had'! how sad a passage 'tis!--whose skill was\nalmost as great as his honesty; had it stretched so\nfar, would have made nature immortal, and death\nshould have play for lack of work. Would, for the\nking's sake, he were living! I think it would 

In [7]:
class SentenceGenerator(object):
    def __init__(self, texts):
        self.texts = texts
        self.tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    def __iter__(self):
        for text in self.texts:
            for s in self.tokenizer.tokenize(text.lower()):
                for p in s.splitlines():
                    yield p

In [8]:
for s in itertools.islice(SentenceGenerator(shakespeare_texts), 5):
    print(s)
    print('----------------------------------------')

in delivering my son from me, i bury a second husband.
----------------------------------------
and i in going, madam, weep o'er my father's death
----------------------------------------
anew: but i must attend his majesty's command, to
----------------------------------------
whom i am now in ward, evermore in subjection.
----------------------------------------
you shall find of the king a husband, madam; you,
----------------------------------------


In [9]:
class WordGenerator(object):
    
    def __init__(self, texts):
        self.texts = texts
        self.trans = str.maketrans('','', string.punctuation)

    def __iter__(self):
        
        for s in SentenceGenerator(self.texts):
            for w in s.translate(self.trans).split():
                yield w

In [10]:
for w in itertools.islice(WordGenerator(shakespeare_texts), 20):
    print(w)

in
delivering
my
son
from
me
i
bury
a
second
husband
and
i
in
going
madam
weep
oer
my
fathers


In [11]:
def build_vocabulary(texts):
    wordGen = WordGenerator(texts)
    counter = collections.Counter(wordGen)
    # unique list of words with the frequencies
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
    # unique list of words
    words, x = list(zip(*count_pairs))
    # reserve 0 for padding, 1 for out of vocabulary
    start_index = 2
    return words, dict(zip(words, range(start_index, len(words) + start_index)))

In [12]:
shakespeare_words, shakespeare_vocabulary = build_vocabulary(shakespeare_texts)
print('Shakespeare vocabulary size:', len(shakespeare_vocabulary))

Shakespeare vocabulary size: 27042


Make sure we could translate back and forth

In [13]:
i = shakespeare_vocabulary['transport']
print(i, shakespeare_words[i- 2])

7114 transport


In [14]:
# generates sentences using the words Ids
class EmbeddedSentenceGenerator(object):

    def __init__(self, texts, vocabulary):
        self.texts = texts
        self.vocabulary = vocabulary
        
    def __iter__(self):
        trans = str.maketrans('','', string.punctuation)

        for s in SentenceGenerator(self.texts):
            yield [ self.vocabulary[w] if w in self.vocabulary else 1 for w in s.translate(trans).split()]            

In [15]:
for s in itertools.islice(EmbeddedSentenceGenerator(shakespeare_texts, shakespeare_vocabulary), 5):
    print(s)
    print('----------------------------------------')

[11, 8418, 9, 186, 48, 15, 4, 1908, 7, 716, 296]
----------------------------------------
[3, 4, 11, 793, 190, 524, 420, 9, 366, 128]
----------------------------------------
[5067, 23, 4, 88, 675, 18, 4610, 561, 5]
----------------------------------------
[226, 4, 64, 45, 11, 3994, 2499, 11, 6380]
----------------------------------------
[8, 39, 188, 6, 2, 107, 7, 296, 190, 8]
----------------------------------------


Make sure we could recover

In [16]:
for s in itertools.islice(EmbeddedSentenceGenerator(shakespeare_texts, shakespeare_vocabulary), 5):
    print([shakespeare_words[i-2] for i in s])
    print('----------------------------------------')

['in', 'delivering', 'my', 'son', 'from', 'me', 'i', 'bury', 'a', 'second', 'husband']
----------------------------------------
['and', 'i', 'in', 'going', 'madam', 'weep', 'oer', 'my', 'fathers', 'death']
----------------------------------------
['anew', 'but', 'i', 'must', 'attend', 'his', 'majestys', 'command', 'to']
----------------------------------------
['whom', 'i', 'am', 'now', 'in', 'ward', 'evermore', 'in', 'subjection']
----------------------------------------
['you', 'shall', 'find', 'of', 'the', 'king', 'a', 'husband', 'madam', 'you']
----------------------------------------


In [17]:
# factory for creating the epochs
class EpochFactory(object):
    def __init__(self, sentences):
        self.raw_data = [word for sentence in sentences for word in sentence]

    def epoch(self, batch_size, time_steps):
        return Epoch(self.raw_data, batch_size, time_steps)
    
# provides one epoch worth of data 
class Epoch(object):
    def __init__(self, words, batch_size, time_steps):
        self.raw_data = words
        self.batch_size = batch_size
        self.time_steps = time_steps
    
    def __iter__(self):

        data_len = np.size(self.raw_data)
        batch_len = data_len // self.batch_size
        chunk_len = (batch_len - 1) // self.time_steps

        assert (chunk_len > 0), "chunk_len == 0, decrease batch_size or num_steps"

        data = np.reshape(self.raw_data[0 : self.batch_size * batch_len], [self.batch_size, batch_len])
        
        for i in range(chunk_len):
            x = data[0 : self.batch_size , i * self.time_steps     : (i + 1) * self.time_steps] 
            y = data[0 : self.batch_size , i * self.time_steps + 1 : (i + 1) * self.time_steps + 1]
            yield x, y, float(i)/float(chunk_len)

In [18]:
shakespeare_sentences = [s for s in EmbeddedSentenceGenerator(shakespeare_texts, shakespeare_vocabulary)]
print('Total %d sentences'%len(shakespeare_sentences))

Total 116152 sentences


In [19]:
fact = EpochFactory(shakespeare_sentences)

Sample chunk

In [20]:
epoch = fact.epoch(batch_size = 8, time_steps = 12)
for x, y, progress in itertools.islice(epoch, 1):
    print('---------------Inputs----------------')
    for s in x:
        print([i for i in s])
    print('---------------Targets---------------')
    for s in y:
        print([i for i in s])

---------------Inputs----------------
[11, 8418, 9, 186, 48, 15, 4, 1908, 7, 716, 296, 3]
[938, 5, 21340, 53, 37, 14, 83, 19, 1280, 47, 2, 1456]
[8, 38, 75, 78, 731, 140, 37, 615, 2148, 19, 31, 140]
[12, 832, 237, 1294, 10, 4, 905, 3, 26418, 11, 7, 627]
[7, 3473, 63, 262, 8, 1469, 13753, 15853, 2357, 9970, 13396, 7688]
[6, 107, 1180, 14837, 1438, 5, 107, 664, 3119, 65, 4, 59]
[28, 23, 30, 3655, 334, 8, 5, 24, 16, 86, 29, 8]
[26, 12081, 7, 1424, 7, 393, 7, 509, 17, 173, 32, 134]
---------------Targets---------------
[8418, 9, 186, 48, 15, 4, 1908, 7, 716, 296, 3, 4]
[5, 21340, 53, 37, 14, 83, 19, 1280, 47, 2, 1456, 124]
[38, 75, 78, 731, 140, 37, 615, 2148, 19, 31, 140, 46]
[832, 237, 1294, 10, 4, 905, 3, 26418, 11, 7, 627, 25]
[3473, 63, 262, 8, 1469, 13753, 15853, 2357, 9970, 13396, 7688, 14624]
[107, 1180, 14837, 1438, 5, 107, 664, 3119, 65, 4, 59, 756]
[23, 30, 3655, 334, 8, 5, 24, 16, 86, 29, 8, 19]
[12081, 7, 1424, 7, 393, 7, 509, 17, 173, 32, 134, 50]


Sample chunk recovered

In [21]:
epoch = fact.epoch(batch_size = 8, time_steps = 12)
for x, y, progress in itertools.islice(epoch, 1):
    print('---------------Inputs----------------')
    for s in x:
        print([shakespeare_words[i-2] for i in s])
    print('---------------Targets---------------')
    for s in y:
        print([shakespeare_words[i-2] for i in s])

---------------Inputs----------------
['in', 'delivering', 'my', 'son', 'from', 'me', 'i', 'bury', 'a', 'second', 'husband', 'and']
['int', 'to', 'mete', 'at', 'if', 'it', 'may', 'be', 'wide', 'o', 'the', 'bow']
['you', 'are', 'like', 'an', 'honourable', 'father', 'if', 'signior', 'leonato', 'be', 'her', 'father']
['is', 'fine', 'full', 'perfect', 'that', 'i', 'taste', 'and', 'violenteth', 'in', 'a', 'sense']
['a', 'bachelor', 'how', 'answer', 'you', 'la', 'plus', 'belle', 'katharine', 'du', 'monde', 'mon']
['of', 'king', 'henry', 'vi', 'ghost', 'to', 'king', 'richard', 'iii', 'when', 'i', 'was']
['so', 'but', 'what', 'compact', 'mean', 'you', 'to', 'have', 'with', 'us', 'will', 'you']
['thou', 'counterfeitst', 'a', 'bark', 'a', 'sea', 'a', 'wind', 'for', 'still', 'thy', 'eyes']
---------------Targets---------------
['delivering', 'my', 'son', 'from', 'me', 'i', 'bury', 'a', 'second', 'husband', 'and', 'i']
['to', 'mete', 'at', 'if', 'it', 'may', 'be', 'wide', 'o', 'the', 'bow', 'hand'

Prepare train, validation and test data sets
We have ~ 51K speehes lets put 5% to test and 5% to validation. Let's make them continues chunks.

In [22]:
np.random.seed(30)
test_size = int(len(shakespeare_sentences)*0.05)
start_valid = np.random.randint(0, len(shakespeare_sentences)/2 - test_size)
end_valid = start_valid + test_size
start_test = np.random.randint(len(shakespeare_sentences)/2, len(shakespeare_sentences) - test_size)
end_test = start_test + test_size
print('Test / valid sample size: %d, valid start at %d, test start at %d'%(test_size, start_valid, start_test))

Test / valid sample size: 5807, valid start at 38693, test start at 62593


In [23]:
s = shakespeare_sentences
train_sentences = s[:start_valid] + s[end_valid:start_test] + s[end_test:]
valid_sentences = s[start_valid:end_valid]
test_sentences = s[start_test:end_test]
print(
    'Training has %d, validation has %d and test has %d speeches'%
    (len(train_sentences), len(valid_sentences), len(test_sentences)))

Training has 104538, validation has 5807 and test has 5807 speeches


In [24]:
train_fact = EpochFactory(train_sentences)
valid_fact = EpochFactory(valid_sentences)
test_fact = EpochFactory(test_sentences)

In [25]:
class RNN(object):
    
    @classmethod
    def restore(cls, session, model_directory):
        with open(cls._parameters_file(model_directory)) as f:
            parameters = json.load(f)
        model = cls(
            parameters["max_gradient"],
            parameters["batch_size"], 
            parameters["time_steps"], 
            parameters["vocabulary_size"],
            parameters["hidden_units"], 
            parameters["layers"]
        )
        tf.train.Saver().restore(session, cls._model_file(model_directory))
        return model

    @staticmethod
    def _parameters_file(model_directory):
        return os.path.join(model_directory, "parameters.json")

    @staticmethod
    def _model_file(model_directory):
        return os.path.join(model_directory, "model")

    def __init__(self, max_gradient, batch_size, time_steps, vocabulary_size, hidden_units, layers):

        self.max_gradient = max_gradient
        self.layers = layers

        with tf.name_scope("Parameters"):
            self.learning_rate = tf.placeholder(tf.float32, name="learning_rate")
            self.keep_probability = tf.placeholder(tf.float32, name="keep_probability")

        with tf.name_scope("Input"):
            self.input = tf.placeholder(tf.int32, shape=(batch_size, time_steps), name="input")
            self.targets = tf.placeholder(tf.int32, shape=(batch_size, time_steps), name="targets")

        with tf.name_scope("Embedding"):
            self.embedding = tf.Variable(
                tf.random_uniform((vocabulary_size, hidden_units), -1.0, 1.0),
                dtype=tf.float32,
                name="embedding"
            )
            self.embedded_input = tf.nn.embedding_lookup(self.embedding, self.input, name="embedded_input")

        with tf.name_scope("RNN"):
            # it is a bit harder to manage unconcatenated state
            # for our purposes it should be OK to use concatenated state
            cell = tf.nn.rnn_cell.LSTMCell(hidden_units, state_is_tuple = False)
            cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=self.keep_probability)
            rnn_layers = tf.nn.rnn_cell.MultiRNNCell([cell] * layers, state_is_tuple = False)
            
            self.reset_state = rnn_layers.zero_state(batch_size, dtype=tf.float32)
            self.state = tf.placeholder(tf.float32, self.reset_state.get_shape(), "state")
            
            self.outputs, self.next_state = tf.nn.dynamic_rnn(
                rnn_layers, self.embedded_input, initial_state=self.state, time_major=False)

        with tf.name_scope("Cost"):
            # Concatenate all the batches into a single row.
            self.flattened_outputs = tf.reshape(
                tf.concat( self.outputs, 1),
                (-1, hidden_units),
                name="flattened_outputs"
            )
            
            # Project the outputs onto the vocabulary.
            self.w = tf.get_variable(
                "w", (hidden_units, vocabulary_size), initializer = tf.truncated_normal_initializer)
            self.b = tf.get_variable(
                "b", vocabulary_size, initializer = tf.truncated_normal_initializer)
            # Compare predictions to labels.
            self.predicted = tf.matmul(self.flattened_outputs, self.w) + self.b

            # The log-perplexity for each sequence
            self.loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example(
                [self.predicted],
                [tf.concat(self.targets, -1)],
                [tf.ones(batch_size * time_steps)]
            )
            # average log-perplexity over the batch
            self.cost = tf.div(tf.reduce_sum(self.loss), batch_size, name="cost")

        with tf.name_scope("Train"):
            self.validation_perplexity = tf.Variable(
                dtype=tf.float32, initial_value=float("inf"), trainable=False, name="validation_perplexity")
            tf.summary.scalar(self.validation_perplexity.op.name, self.validation_perplexity)
            self.training_epoch_perplexity = tf.Variable(
                dtype=tf.float32, initial_value=float("inf"), trainable=False, name="training_epoch_perplexity")
            tf.summary.scalar(self.training_epoch_perplexity.op.name, self.training_epoch_perplexity)
            self.iteration = tf.Variable(0, dtype=tf.int64, name="iteration", trainable=False)
            # gradient clipping
            self.gradients, _ = tf.clip_by_global_norm(
                tf.gradients(self.cost, tf.trainable_variables()), max_gradient, name="clip_gradients")
            optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
            self.train_step = optimizer.apply_gradients(
                zip(self.gradients, tf.trainable_variables()), name="train_step", global_step=self.iteration)

        self.initialize = tf.global_variables_initializer()
        self.summary = tf.summary.merge_all()

    @property
    def batch_size(self):
        return self.input.get_shape()[0].value

    @property
    def time_steps(self):
        return self.input.get_shape()[1].value

    @property
    def vocabulary_size(self):
        return self.embedding.get_shape()[0].value

    @property
    def hidden_units(self):
        return self.embedding.get_shape()[1].value

    def train(
            self, 
            session, 
            training_factory,
            parameters,
            exit_criteria,
            validation,
            logging_interval,
            directories):

        epoch = 1
        iteration = 0
        state = None
        summary = self.summary_writer(directories.summary, session)
        validation_hist = []
        
        try:
            # Enumerate over the training set until exit criteria are met.
            while True:
                epoch_cost = 0.0
                epoch_iteration = 0
                
                # rest state for each epoch
                state = session.run(self.reset_state)
                
                # Enumerate over a single epoch of the training set
                for x, y, complete in training_factory.epoch(self.batch_size, self.time_steps):
                    _, cost, state, iteration = session.run(
                        [self.train_step, self.cost, self.next_state, self.iteration],
                        feed_dict={
                            self.input: x,
                            self.targets: y,
                            # pass previous epoch state
                            self.state: state,
                            self.learning_rate: parameters.learning_rate,
                            self.keep_probability: parameters.keep_probability
                        })
                    epoch_cost += cost
                    epoch_iteration += self.time_steps
                    if self._interval(iteration, logging_interval):
                        tf.logging.info(
                            "Epoch %d (%0.4f complete), Iteration %d: epoch training perplexity %0.4f" %
                            (epoch, complete, iteration, self.perplexity(epoch_cost, epoch_iteration)))
                    
                    if validation is not None and self._interval(iteration, validation.interval):
                        validation_perplexity = self.test(session, validation.epoch_factory)
                        self.store_validation_perplexity(session, summary, iteration, validation_perplexity)
                        tf.logging.info(
                            "Epoch %d, Iteration %d: validation perplexity %0.4f" %
                            (epoch, iteration, validation_perplexity))
                        # save model if improved
                        validation_hist.append(validation_perplexity)
                        if (directories.model is not None) and (validation_perplexity == min(validation_hist)):
                            model_filename = self._model_file(directories.model)
                            tf.train.Saver().save(session, model_filename)
                            self._write_model_parameters(directories.model)
                            tf.logging.info("Saved model in %s " % directories.model)
                        
                    if exit_criteria.max_iterations is not None and iteration > exit_criteria.max_iterations:
                        raise StopTrainingException()

                self.store_training_epoch_perplexity(
                    session, summary, iteration, self.perplexity(epoch_cost, epoch_iteration))
                epoch += 1
                if exit_criteria.max_epochs is not None and epoch > exit_criteria.max_epochs:
                    raise StopTrainingException()
        except (StopTrainingException, KeyboardInterrupt):
            pass
        
        tf.logging.info("Stop training at epoch %d, iteration %d" % (epoch, iteration))
        summary.close()

    def _write_model_parameters(self, model_directory):
        parameters = {
            "max_gradient": self.max_gradient,
            "batch_size": self.batch_size,
            "time_steps": self.time_steps,
            "vocabulary_size": self.vocabulary_size,
            "hidden_units": self.hidden_units,
            "layers": self.layers
        }
        with open(self._parameters_file(model_directory), "w") as f:
            json.dump(parameters, f, indent=4)

    def test(self, session, epoch_factory):
        state = session.run(self.reset_state)
        epoch_cost = 0.0
        epoch_iteration = 0
        epoch = epoch_factory.epoch(self.batch_size, self.time_steps)
        for x, y, _ in epoch:
            cost, state = session.run(
                [self.cost, self.next_state],
                feed_dict={
                    self.input: x, 
                    self.targets: y,
                    self.state: state,
                    self.keep_probability: 1.0
                }
            )
            epoch_cost += cost
            epoch_iteration += self.time_steps
        return self.perplexity(epoch_cost, epoch_iteration)

    @staticmethod
    def _interval(iteration, interval):
        return interval is not None and iteration > 1 and iteration % interval == 0

    @staticmethod
    def perplexity(cost, iterations):
        return np.exp(cost / iterations)

    def store_validation_perplexity(self, session, summary, iteration, validation_perplexity):
        session.run(self.validation_perplexity.assign(validation_perplexity))
        summary.add_summary(session.run(self.summary), global_step=iteration)

    def store_training_epoch_perplexity(self, session, summary, iteration, training_perplexity):
        session.run(self.training_epoch_perplexity.assign(training_perplexity))
        summary.add_summary(session.run(self.summary), global_step=iteration)

    @staticmethod
    def summary_writer(summary_directory, session):
        class NullSummaryWriter(object):
            def add_summary(self, *args, **kwargs):
                pass

            def flush(self):
                pass

            def close(self):
                pass

        if summary_directory is not None:
            return tf.summary.FileWriter(summary_directory, session.graph)
        else:
            return NullSummaryWriter()


class StopTrainingException(Exception):
    pass

class ExitCriteria(object):
    def __init__(self, max_iterations, max_epochs):
        self.max_iterations = max_iterations
        self.max_epochs = max_epochs

class Parameters(object):
    def __init__(self, learning_rate, keep_probability):
        self.learning_rate = learning_rate
        self.keep_probability = keep_probability

class Validation(object):
    def __init__(self, interval, epoch_factory):
        self.interval = interval
        self.epoch_factory = epoch_factory

class Directories(object):
    def __init__(self, model, summary):
        self.model = model
        self.summary = summary

In [26]:
tf.reset_default_graph()
        
graph = tf.Graph()
with graph.as_default():
    rnn  = RNN(
        max_gradient = 5, 
        batch_size = 64, 
        time_steps = 10, 
        # Add vocabulary slots of out of vocabulary (index 1) and padding (index 0).
        vocabulary_size = len(shakespeare_vocabulary) + 2, 
        hidden_units = 512, 
        layers = 2)



In [27]:
shutil.rmtree('model', ignore_errors = True)
os.makedirs('model')
shutil.rmtree('summary', ignore_errors = True)
os.makedirs('summary')

with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    rnn.train(
        session = sess, 
        training_factory = train_fact, 
        parameters = Parameters(learning_rate = 0.001, keep_probability = 0.7),
        exit_criteria = ExitCriteria(max_iterations = None, max_epochs = 5),
        validation = Validation(interval = 100, epoch_factory = valid_fact), 
        logging_interval = 50, 
        directories = Directories('model', 'summary')
    )

INFO:tensorflow:Epoch 1 (0.0459 complete), Iteration 50: epoch training perplexity 13751.1564
INFO:tensorflow:Epoch 1 (0.0928 complete), Iteration 100: epoch training perplexity 6169.3391
INFO:tensorflow:Epoch 1, Iteration 100: validation perplexity 1588.0681
INFO:tensorflow:Saved model in model 
INFO:tensorflow:Epoch 1 (0.1396 complete), Iteration 150: epoch training perplexity 5603.2166
INFO:tensorflow:Epoch 1 (0.1865 complete), Iteration 200: epoch training perplexity 3810.4872
INFO:tensorflow:Epoch 1, Iteration 200: validation perplexity 1051.8382
INFO:tensorflow:Saved model in model 
INFO:tensorflow:Epoch 1 (0.2334 complete), Iteration 250: epoch training perplexity 2913.1507
INFO:tensorflow:Epoch 1 (0.2802 complete), Iteration 300: epoch training perplexity 2412.2825
INFO:tensorflow:Epoch 1, Iteration 300: validation perplexity 900.0939
INFO:tensorflow:Saved model in model 
INFO:tensorflow:Epoch 1 (0.3271 complete), Iteration 350: epoch training perplexity 2089.6530
INFO:tensorfl

INFO:tensorflow:Saved model in model 
INFO:tensorflow:Epoch 3 (0.6701 complete), Iteration 2850: epoch training perplexity 388.9852
INFO:tensorflow:Epoch 3 (0.7170 complete), Iteration 2900: epoch training perplexity 389.8953
INFO:tensorflow:Epoch 3, Iteration 2900: validation perplexity 544.8230
INFO:tensorflow:Saved model in model 
INFO:tensorflow:Epoch 3 (0.7638 complete), Iteration 2950: epoch training perplexity 390.0539
INFO:tensorflow:Epoch 3 (0.8107 complete), Iteration 3000: epoch training perplexity 389.6283
INFO:tensorflow:Epoch 3, Iteration 3000: validation perplexity 545.5295
INFO:tensorflow:Epoch 3 (0.8575 complete), Iteration 3050: epoch training perplexity 388.7897
INFO:tensorflow:Epoch 3 (0.9044 complete), Iteration 3100: epoch training perplexity 387.9840
INFO:tensorflow:Epoch 3, Iteration 3100: validation perplexity 543.3475
INFO:tensorflow:Saved model in model 
INFO:tensorflow:Epoch 3 (0.9513 complete), Iteration 3150: epoch training perplexity 387.7841
INFO:tensorf

Calculate RNN model performance on the test data

In [28]:
tf.reset_default_graph()
with tf.Session() as sess:
    rnn = RNN.restore(sess, 'model')
    rnn_test_perplexity = rnn.test(sess, test_fact)

INFO:tensorflow:Restoring parameters from model\model


In order to evaluate our RNN language model we build baseline unigram model to compare the perplexity on the test set.

In [29]:
class UniGram(object):

    def __init__(self, texts):
        self.counter = collections.Counter(WordGenerator(texts))
        self.corpus_size = sum(self.counter.values())

    # word probability is the word freq divided by the corpus size
    def word_proba(self, word):
        freq = self.counter[word]
        return float(freq)/float(self.corpus_size)
    
    # assumes independence
    def sentence_proba(self, sentence):
        word_probs = [self.word_proba(w) for w in sentence]
        return reduce(lambda x, y: x*y, word_probs)
    
    def perplexity(self, sentence):
        p = self.sentence_proba(sentence)
        return pow(p, -1.0/float(len(sentence)))

In [30]:
uniGram = UniGram(shakespeare_texts)

Make sure it works

In [31]:
s = ['in', 'delivering', 'my', 'son', 'from', 'me']
for w in s:
    print("Probability of '%s' is %f" % (w, uniGram.word_proba(w)))
print('Sentence probability is ' , uniGram.sentence_proba(s))
print('Sentence perplexity is ' , uniGram.perplexity(s))

Probability of 'in' is 0.013105
Probability of 'delivering' is 0.000005
Probability of 'my' is 0.014527
Probability of 'son' is 0.000650
Probability of 'from' is 0.003156
Probability of 'me' is 0.009196
Sentence probability is  1.8897761650936493e-17
Sentence perplexity is  612.7240664727524


Calculate perplexity for each sentence and average

In [32]:
test_epoch = test_fact.epoch(batch_size = 1, time_steps = 10)
pp = []
for x, y, progress in test_epoch:
    for s in x:
        pp.append(uniGram.perplexity([shakespeare_words[i-2] for i in s]))
uni_test_perplexity = np.mean(pp)

In [33]:
print('RNN model test perplexity ', rnn_test_perplexity)
print('Unigram model test perplexity ', uni_test_perplexity)

RNN model test perplexity  477.119320215
Unigram model test perplexity  1345.33048967


Our RNN language model shows superior performance!