RNN based language model for the vShakespeare plays and poems

[Recurrent neural network based language model
](http://www.fit.vutbr.cz/research/groups/speech/publi/2010/mikolov_interspeech2010_IS100722.pdf)

In [1]:
import os
import os.path
import shutil
import numpy as np
import pandas as pd
import string
import collections
import itertools
import random
import json
import pickle

import nltk.data

import matplotlib.pyplot as plt
%matplotlib inline

import tensorflow as tf
print(tf.__version__)

1.3.0


In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\adubitskiy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
with open('shakespeare_plays.pickle', 'rb') as handle:
    speeches = pickle.load(handle)

In [4]:
with open('shakespeare_poetries.pickle', 'rb') as handle:
    poetries = pickle.load(handle)

In [5]:
shakespeare_texts = [s['speech_text'] for s in speeches] + [p['text'] for p in poetries]

In [6]:
print(shakespeare_texts[:10])

['In delivering my son from me, I bury a second husband.', "And I in going, madam, weep o'er my father's death\nanew: but I must attend his majesty's command, to\nwhom I am now in ward, evermore in subjection.", 'You shall find of the king a husband, madam; you,\nsir, a father: he that so generally is at all times\ngood must of necessity hold his virtue to you; whose\nworthiness would stir it up where it wanted rather\nthan lack it where there is such abundance.', "What hope is there of his majesty's amendment?", 'He hath abandoned his physicians, madam; under whose\npractises he hath persecuted time with hope, and\nfinds no other advantage in the process but only the\nlosing of hope by time.', "This young gentlewoman had a father,--O, that\n'had'! how sad a passage 'tis!--whose skill was\nalmost as great as his honesty; had it stretched so\nfar, would have made nature immortal, and death\nshould have play for lack of work. Would, for the\nking's sake, he were living! I think it would 

In [7]:
class SentenceGenerator(object):
    def __init__(self, texts):
        self.texts = texts

    def __iter__(self):
        tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        for text in self.texts:
            for s in tokenizer.tokenize(text):
                yield s

In [8]:
for s in itertools.islice(SentenceGenerator(shakespeare_texts), 5):
    print(s)
    print('----------------------------------------')

In delivering my son from me, I bury a second husband.
----------------------------------------
And I in going, madam, weep o'er my father's death
anew: but I must attend his majesty's command, to
whom I am now in ward, evermore in subjection.
----------------------------------------
You shall find of the king a husband, madam; you,
sir, a father: he that so generally is at all times
good must of necessity hold his virtue to you; whose
worthiness would stir it up where it wanted rather
than lack it where there is such abundance.
----------------------------------------
What hope is there of his majesty's amendment?
----------------------------------------
He hath abandoned his physicians, madam; under whose
practises he hath persecuted time with hope, and
finds no other advantage in the process but only the
losing of hope by time.
----------------------------------------


In [9]:
class WordGenerator(object):
    
    def __init__(self, texts):
        self.texts = texts

    def __iter__(self):
        trans = str.maketrans('','', string.punctuation)

        for s in SentenceGenerator(self.texts):
            for w in s.translate(trans).split():
                yield w

In [10]:
for w in itertools.islice(WordGenerator(shakespeare_texts), 20):
    print(w)

In
delivering
my
son
from
me
I
bury
a
second
husband
And
I
in
going
madam
weep
oer
my
fathers


In [11]:
def build_vocabulary(texts):
    wordGen = WordGenerator(texts)
    counter = collections.Counter(wordGen)
    # unique list of words with the frequencies
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
    # unique list of words
    words, x = list(zip(*count_pairs))
    # reserve 0 for padding, 1 for out of vocabulary
    start_index = 2
    return words, dict(zip(words, range(start_index, len(words) + start_index)))

In [12]:
shakespeare_words, shakespeare_vocabulary = build_vocabulary(shakespeare_texts)
print('Shakespeare vocabulary size:', len(shakespeare_vocabulary))

Shakespeare vocabulary size: 32006


In [13]:
i = shakespeare_vocabulary['transport']
print(i, shakespeare_words[i- 2])

8920 transport


In [14]:
class EmbeddedSentenceGenerator(object):

    def __init__(self, texts, vocabulary):
        self.texts = texts
        self.vocabulary = vocabulary
        
    def __iter__(self):
        trans = str.maketrans('','', string.punctuation)

        for s in SentenceGenerator(self.texts):
            yield [ self.vocabulary[w] if w in self.vocabulary else 1 for w in s.translate(trans).split()]            

In [15]:
for s in itertools.islice(EmbeddedSentenceGenerator(shakespeare_texts, shakespeare_vocabulary), 5):
    print(s)
    print('----------------------------------------')

[109, 11554, 9, 213, 50, 14, 3, 2259, 8, 877, 334]
----------------------------------------
[16, 3, 10, 902, 312, 599, 502, 9, 406, 152, 5601, 31, 3, 92, 823, 18, 4998, 646, 5, 303, 3, 57, 60, 10, 4277, 3047, 10, 7033]
----------------------------------------
[86, 39, 215, 6, 2, 134, 8, 334, 312, 7, 64, 8, 159, 24, 12, 30, 4968, 11, 52, 34, 496, 49, 92, 6, 2035, 285, 18, 512, 5, 7, 269, 3669, 55, 1130, 15, 120, 158, 15, 7899, 353, 90, 887, 15, 158, 93, 11, 99, 4877]
----------------------------------------
[56, 313, 11, 93, 6, 18, 4998, 11185]
----------------------------------------
[104, 72, 13996, 18, 5366, 312, 526, 269, 4738, 24, 72, 27863, 121, 17, 313, 4, 1735, 43, 179, 1239, 10, 2, 3479, 31, 392, 2, 3463, 6, 313, 40, 121]
----------------------------------------


Make sure we could recover

In [16]:
for s in itertools.islice(EmbeddedSentenceGenerator(shakespeare_texts, shakespeare_vocabulary), 5):
    print([shakespeare_words[i-2] for i in s])
    print('----------------------------------------')

['In', 'delivering', 'my', 'son', 'from', 'me', 'I', 'bury', 'a', 'second', 'husband']
----------------------------------------
['And', 'I', 'in', 'going', 'madam', 'weep', 'oer', 'my', 'fathers', 'death', 'anew', 'but', 'I', 'must', 'attend', 'his', 'majestys', 'command', 'to', 'whom', 'I', 'am', 'now', 'in', 'ward', 'evermore', 'in', 'subjection']
----------------------------------------
['You', 'shall', 'find', 'of', 'the', 'king', 'a', 'husband', 'madam', 'you', 'sir', 'a', 'father', 'he', 'that', 'so', 'generally', 'is', 'at', 'all', 'times', 'good', 'must', 'of', 'necessity', 'hold', 'his', 'virtue', 'to', 'you', 'whose', 'worthiness', 'would', 'stir', 'it', 'up', 'where', 'it', 'wanted', 'rather', 'than', 'lack', 'it', 'where', 'there', 'is', 'such', 'abundance']
----------------------------------------
['What', 'hope', 'is', 'there', 'of', 'his', 'majestys', 'amendment']
----------------------------------------
['He', 'hath', 'abandoned', 'his', 'physicians', 'madam', 'under', 

In [17]:
# factory for creating the epochs
class EpochFactory(object):
    def __init__(self, sentences):
        self.raw_data = [word for sentence in sentences for word in sentence]

    def epoch(self, batch_size, time_steps):
        return Epoch(self.raw_data, batch_size, time_steps)
    
# provides one epoch worth of data 
class Epoch(object):
    def __init__(self, words, batch_size, time_steps):
        self.raw_data = words
        self.batch_size = batch_size
        self.time_steps = time_steps
    
    def __iter__(self):

        data_len = np.size(self.raw_data)
        batch_len = data_len // self.batch_size
        chunk_len = (batch_len - 1) // self.time_steps

        assert (chunk_len > 0), "chunk_len == 0, decrease batch_size or num_steps"

        data = np.reshape(self.raw_data[0 : self.batch_size * batch_len], [self.batch_size, batch_len])
        
        for i in range(chunk_len):
            x = data[0 : self.batch_size , i * self.time_steps     : (i + 1) * self.time_steps] 
            y = data[0 : self.batch_size , i * self.time_steps + 1 : (i + 1) * self.time_steps + 1]
            yield x, y, float(i)/float(chunk_len)

In [18]:
shakespeare_sentences = [s for s in EmbeddedSentenceGenerator(shakespeare_texts, shakespeare_vocabulary)]
print('Total %d sentences'%len(shakespeare_sentences))

Total 51693 sentences


In [19]:
fact = EpochFactory(shakespeare_sentences)

Sample chunk

In [20]:
epoch = fact.epoch(batch_size = 8, time_steps = 12)
for x, y, progress in itertools.islice(epoch, 1):
    print('---------------Inputs----------------')
    for s in x:
        print([i for i in s])
    print('---------------Targets---------------')
    for s in y:
        print([i for i in s])

---------------Inputs----------------
[109, 11554, 9, 213, 50, 14, 3, 2259, 8, 877, 334, 16]
[1017, 5, 26765, 52, 61, 15, 89, 19, 20975, 274, 2, 1668]
[7, 37, 81, 83, 805, 159, 85, 776, 2810, 19, 28, 159]
[11, 939, 290, 1423, 12, 3, 1020, 16, 31430, 10, 8, 695]
[8, 3715, 110, 302, 7, 1773, 16117, 21783, 3400, 11620, 15794, 8610]
[6, 693, 2053, 13926, 2999, 35, 948, 2602, 3258, 141, 3, 58]
[30, 46, 47, 4137, 378, 7, 5, 23, 17, 77, 255, 7]
[136, 14615, 8, 1576, 8, 438, 8, 569, 69, 209, 33, 150]
---------------Targets---------------
[11554, 9, 213, 50, 14, 3, 2259, 8, 877, 334, 16, 3]
[5, 26765, 52, 61, 15, 89, 19, 20975, 274, 2, 1668, 140]
[37, 81, 83, 805, 159, 85, 776, 2810, 19, 28, 159, 51]
[939, 290, 1423, 12, 3, 1020, 16, 31430, 10, 8, 695, 29]
[3715, 110, 302, 7, 1773, 16117, 21783, 3400, 11620, 15794, 8610, 16920]
[693, 2053, 13926, 2999, 35, 948, 2602, 3258, 141, 3, 58, 843]
[46, 47, 4137, 378, 7, 5, 23, 17, 77, 255, 7, 19]
[14615, 8, 1576, 8, 438, 8, 569, 69, 209, 33, 150, 80]


Sample chunk recovered

In [21]:
epoch = fact.epoch(batch_size = 8, time_steps = 12)
for x, y, progress in itertools.islice(epoch, 1):
    print('---------------Inputs----------------')
    for s in x:
        print([shakespeare_words[i-2] for i in s])
    print('---------------Targets---------------')
    for s in y:
        print([shakespeare_words[i-2] for i in s])

---------------Inputs----------------
['In', 'delivering', 'my', 'son', 'from', 'me', 'I', 'bury', 'a', 'second', 'husband', 'And']
['int', 'to', 'mete', 'at', 'if', 'it', 'may', 'be', 'Wide', 'o', 'the', 'bow']
['you', 'are', 'like', 'an', 'honourable', 'father', 'If', 'Signior', 'Leonato', 'be', 'her', 'father']
['is', 'fine', 'full', 'perfect', 'that', 'I', 'taste', 'And', 'violenteth', 'in', 'a', 'sense']
['a', 'bachelor', 'How', 'answer', 'you', 'la', 'plus', 'belle', 'Katharine', 'du', 'monde', 'mon']
['of', 'King', 'Henry', 'VI', 'Ghost', 'To', 'KING', 'RICHARD', 'III', 'When', 'I', 'was']
['so', 'But', 'what', 'compact', 'mean', 'you', 'to', 'have', 'with', 'us', 'Will', 'you']
['Thou', 'counterfeitst', 'a', 'bark', 'a', 'sea', 'a', 'wind', 'For', 'still', 'thy', 'eyes']
---------------Targets---------------
['delivering', 'my', 'son', 'from', 'me', 'I', 'bury', 'a', 'second', 'husband', 'And', 'I']
['to', 'mete', 'at', 'if', 'it', 'may', 'be', 'Wide', 'o', 'the', 'bow', 'hand'

Prepare train, validation and test data sets
We have ~ 51K speehes lets put 5% to test and 5% to validation. Let's make them continues chunks.

In [22]:
np.random.seed(30)
test_size = int(len(shakespeare_sentences)*0.05)
start_valid = np.random.randint(0, len(shakespeare_sentences)/2 - test_size)
end_valid = start_valid + test_size
start_test = np.random.randint(len(shakespeare_sentences)/2, len(shakespeare_sentences) - test_size)
end_test = start_test + test_size
print('Test / valid sample size: %d, valid start at %d, test start at %d'%(test_size, start_valid, start_test))

Test / valid sample size: 2584, valid start at 5925, test start at 30363


In [23]:
s = shakespeare_sentences
train_sentences = s[:start_valid] + s[end_valid:start_test] + s[end_test:]
valid_sentences = s[start_valid:end_valid]
test_sentences = s[start_test:end_test]
print(
    'Training has %d, validation has %d and test has %d speeches'%
    (len(train_sentences), len(valid_sentences), len(test_sentences)))

Training has 46525, validation has 2584 and test has 2584 speeches


In [24]:
train_fact = EpochFactory(train_sentences)
valid_fact = EpochFactory(valid_sentences)
train_fact = EpochFactory(test_sentences)

In [25]:
class RNN(object):
    
    @classmethod
    def restore(cls, session, model_directory):
        with open(cls._parameters_file(model_directory)) as f:
            parameters = json.load(f)
        model = cls(
            parameters["max_gradient"],
            parameters["batch_size"], 
            parameters["time_steps"], 
            parameters["vocabulary_size"],
            parameters["hidden_units"], 
            parameters["layers"]
        )
        tf.train.Saver().restore(session, cls._model_file(model_directory))
        return model

    @staticmethod
    def _parameters_file(model_directory):
        return os.path.join(model_directory, "parameters.json")

    @staticmethod
    def _model_file(model_directory):
        return os.path.join(model_directory, "model")

    def __init__(self, max_gradient, batch_size, time_steps, vocabulary_size, hidden_units, layers):

        self.max_gradient = max_gradient
        self.layers = layers
        # Add vocabulary slots of out of vocabulary (index 1) and padding (index 0).
        vocabulary_size += 2

        with tf.name_scope("Parameters"):
            self.learning_rate = tf.placeholder(tf.float32, name="learning_rate")
            self.keep_probability = tf.placeholder(tf.float32, name="keep_probability")

        with tf.name_scope("Input"):
            self.input = tf.placeholder(tf.int32, shape=(batch_size, time_steps), name="input")
            self.targets = tf.placeholder(tf.int32, shape=(batch_size, time_steps), name="targets")

        with tf.name_scope("Embedding"):
            self.embedding = tf.Variable(
                tf.random_uniform((vocabulary_size, hidden_units), -1.0, 1.0),
                dtype=tf.float32,
                name="embedding"
            )
            self.embedded_input = tf.nn.embedding_lookup(self.embedding, self.input, name="embedded_input")

        with tf.name_scope("RNN"):
            # it is a bit harder to manage unconcatenated state
            # for our purposes it should be OK to use concatenated state
            cell = tf.nn.rnn_cell.LSTMCell(hidden_units, state_is_tuple = False)
            cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=self.keep_probability)
            rnn_layers = tf.nn.rnn_cell.MultiRNNCell([cell] * layers, state_is_tuple = False)
            
            self.reset_state = rnn_layers.zero_state(batch_size, dtype=tf.float32)
            self.state = tf.placeholder(tf.float32, self.reset_state.get_shape(), "state")
            
            self.outputs, self.next_state = tf.nn.dynamic_rnn(
                rnn_layers, self.embedded_input, initial_state=self.state, time_major=False)

        with tf.name_scope("Cost"):
            # Concatenate all the batches into a single row.
            self.flattened_outputs = tf.reshape(
                tf.concat( self.outputs, 1),
                (-1, hidden_units),
                name="flattened_outputs"
            )
            
            # Project the outputs onto the vocabulary.
            self.w = tf.get_variable(
                "w", (hidden_units, vocabulary_size), initializer = tf.truncated_normal_initializer)
            self.b = tf.get_variable(
                "b", vocabulary_size, initializer = tf.truncated_normal_initializer)
            self.predicted = tf.matmul(self.flattened_outputs, self.w) + self.b
            
            # Compare predictions to labels.
            self.loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example(
                [self.predicted],
                [tf.concat(self.targets, -1)],
                [tf.ones(batch_size * time_steps)]
            )
            self.cost = tf.div(tf.reduce_sum(self.loss), batch_size, name="cost")

        with tf.name_scope("Train"):
            self.validation_perplexity = tf.Variable(
                dtype=tf.float32, initial_value=float("inf"), trainable=False, name="validation_perplexity")
            tf.summary.scalar(self.validation_perplexity.op.name, self.validation_perplexity)
            self.training_epoch_perplexity = tf.Variable(
                dtype=tf.float32, initial_value=float("inf"), trainable=False, name="training_epoch_perplexity")
            tf.summary.scalar(self.training_epoch_perplexity.op.name, self.training_epoch_perplexity)
            self.iteration = tf.Variable(0, dtype=tf.int64, name="iteration", trainable=False)
            # gradient clipping
            self.gradients, _ = tf.clip_by_global_norm(
                tf.gradients(self.cost, tf.trainable_variables()), max_gradient, name="clip_gradients")
            optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
            self.train_step = optimizer.apply_gradients(
                zip(self.gradients, tf.trainable_variables()), name="train_step", global_step=self.iteration)

        self.initialize = tf.global_variables_initializer()
        self.summary = tf.summary.merge_all()

    @property
    def batch_size(self):
        return self.input.get_shape()[0].value

    @property
    def time_steps(self):
        return self.input.get_shape()[1].value

    @property
    def vocabulary_size(self):
        return self.embedding.get_shape()[0].value

    @property
    def hidden_units(self):
        return self.embedding.get_shape()[1].value

    def train(
            self, 
            session, 
            training_factory,
            parameters,
            exit_criteria,
            validation,
            logging_interval,
            directories):
        epoch = 1
        iteration = 0
        state = None
        summary = self.summary_writer(directories.summary, session)
        try:
            # Enumerate over the training set until exit criteria are met.
            while True:
                epoch_cost = 0.0
                epoch_iteration = 0
                
                # rest state for each epoch
                state = session.run(self.reset_state)
                
                # Enumerate over a single epoch of the training set
                for x, y, complete in training_factory.epoch(self.batch_size, self.time_steps):
                    _, cost, state, iteration = session.run(
                        [self.train_step, self.cost, self.next_state, self.iteration],
                        feed_dict={
                            self.input: x,
                            self.targets: y,
                            # pass previous epoch state
                            self.state: state,
                            self.learning_rate: parameters.learning_rate,
                            self.keep_probability: parameters.keep_probability
                        })
                    epoch_cost += cost
                    epoch_iteration += self.time_steps
                    if self._interval(iteration, logging_interval):
                        tf.logging.info(
                            "Epoch %d (%0.4f complete), Iteration %d: epoch training perplexity %0.4f" %
                            (epoch, complete, iteration, self.perplexity(epoch_cost, epoch_iteration))
                    )
                    if validation is not None and self._interval(iteration, validation.interval):
                        validation_perplexity = self.test(session, validation.epoch_factory)
                        self.store_validation_perplexity(session, summary, iteration, validation_perplexity)
                        tf.logging.info(
                            "Epoch %d, Iteration %d: validation perplexity %0.4f" %
                            (epoch, iteration, validation_perplexity)
                    )
                        
                    if exit_criteria.max_iterations is not None and iteration > exit_criteria.max_iterations:
                        raise StopTrainingException()

                self.store_training_epoch_perplexity(
                    session, summary, iteration, self.perplexity(epoch_cost, epoch_iteration))
                epoch += 1
                if exit_criteria.max_epochs is not None and epoch > exit_criteria.max_epochs:
                    raise StopTrainingException()
        except (StopTrainingException, KeyboardInterrupt):
            pass
        
        tf.logging.info("Stop training at epoch %d, iteration %d" % (epoch, iteration))
        summary.close()
        
        if directories.model is not None:
            model_filename = self._model_file(directories.model)
            tf.train.Saver().save(session, model_filename)
            self._write_model_parameters(directories.model)
            tf.logging.info("Saved model in %s " % directories.model)

    def _write_model_parameters(self, model_directory):
        parameters = {
            "max_gradient": self.max_gradient,
            "batch_size": self.batch_size,
            "time_steps": self.time_steps,
            "vocabulary_size": self.vocabulary_size,
            "hidden_units": self.hidden_units,
            "layers": self.layers
        }
        with open(self._parameters_file(model_directory), "w") as f:
            json.dump(parameters, f, indent=4)

    def test(self, session, epoch_factory):
        state = session.run(self.reset_state)
        epoch_cost = epoch_iteration = 0
        epoch = epoch_factory.epoch(self.batch_size, self.time_steps)
        for x, y, _ in epoch:
            cost, state = session.run(
                [self.cost, self.next_state],
                feed_dict={
                    self.input: x, 
                    self.targets: y,
                    self.state: state,
                    self.keep_probability: 1.0
                }
            )
            epoch_cost += cost
            epoch_iteration += self.time_steps
        return self.perplexity(epoch_cost, epoch_iteration)

    @staticmethod
    def _interval(iteration, interval):
        return interval is not None and iteration > 1 and iteration % interval == 0

    @staticmethod
    def perplexity(cost, iterations):
        return np.exp(cost / iterations)

    def store_validation_perplexity(self, session, summary, iteration, validation_perplexity):
        session.run(self.validation_perplexity.assign(validation_perplexity))
        summary.add_summary(session.run(self.summary), global_step=iteration)

    def store_training_epoch_perplexity(self, session, summary, iteration, training_perplexity):
        session.run(self.training_epoch_perplexity.assign(training_perplexity))
        summary.add_summary(session.run(self.summary), global_step=iteration)

    @staticmethod
    def summary_writer(summary_directory, session):
        class NullSummaryWriter(object):
            def add_summary(self, *args, **kwargs):
                pass

            def flush(self):
                pass

            def close(self):
                pass

        if summary_directory is not None:
            return tf.summary.FileWriter(summary_directory, session.graph)
        else:
            return NullSummaryWriter()


class StopTrainingException(Exception):
    pass

class ExitCriteria(object):
    def __init__(self, max_iterations, max_epochs):
        self.max_iterations = max_iterations
        self.max_epochs = max_epochs

class Parameters(object):
    def __init__(self, learning_rate, keep_probability):
        self.learning_rate = learning_rate
        self.keep_probability = keep_probability

class Validation(object):
    def __init__(self, interval, epoch_factory):
        self.interval = interval
        self.epoch_factory = epoch_factory

class Directories(object):
    def __init__(self, model, summary):
        self.model = model
        self.summary = summary

In [26]:
tf.reset_default_graph()
        
graph = tf.Graph()
with graph.as_default():
    rnn  = RNN(
        max_gradient = 5, 
        batch_size = 64, 
        time_steps = 20, 
        vocabulary_size = len(shakespeare_vocabulary), 
        hidden_units = 1024, 
        layers = 8)



In [27]:
shutil.rmtree('model', ignore_errors = True)
os.makedirs('model')
shutil.rmtree('summary', ignore_errors = True)
os.makedirs('summary')

with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    rnn.train(
        session = sess, 
        training_factory = train_fact, 
        parameters = Parameters(learning_rate = 0.01, keep_probability = 0.5),
        exit_criteria = ExitCriteria(max_iterations = None, max_epochs = 25), 
        validation = Validation(interval = 100, epoch_factory = valid_fact), 
        logging_interval = 10, 
        directories = Directories('model', 'summary')
    )

INFO:tensorflow:Epoch 1 (0.3103 complete), Iteration 10: epoch training perplexity 2450761830.5296
INFO:tensorflow:Epoch 1 (0.6552 complete), Iteration 20: epoch training perplexity 182315488.7319
INFO:tensorflow:Epoch 2 (0.0000 complete), Iteration 30: epoch training perplexity 80689.6654
INFO:tensorflow:Epoch 2 (0.3448 complete), Iteration 40: epoch training perplexity 22960.8340
INFO:tensorflow:Epoch 2 (0.6897 complete), Iteration 50: epoch training perplexity 6356.4918
INFO:tensorflow:Epoch 3 (0.0345 complete), Iteration 60: epoch training perplexity 1481.1359
INFO:tensorflow:Epoch 3 (0.3793 complete), Iteration 70: epoch training perplexity 1317.4303
INFO:tensorflow:Epoch 3 (0.7241 complete), Iteration 80: epoch training perplexity 1236.2928
INFO:tensorflow:Epoch 4 (0.0690 complete), Iteration 90: epoch training perplexity 1174.3185
INFO:tensorflow:Epoch 4 (0.4138 complete), Iteration 100: epoch training perplexity 1112.7382
INFO:tensorflow:Epoch 4, Iteration 100: validation perpl

In [28]:
import codecs
import os
import collections
from six.moves import cPickle
import numpy as np


class TextLoader():
    def __init__(self, data_dir, batch_size, seq_length, encoding='utf-8'):
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.encoding = encoding

        input_file = os.path.join(data_dir, "input.txt")
        vocab_file = os.path.join(data_dir, "vocab.pkl")
        tensor_file = os.path.join(data_dir, "data.npy")

        if not (os.path.exists(vocab_file) and os.path.exists(tensor_file)):
            print("reading text file")
            self.preprocess(input_file, vocab_file, tensor_file)
        else:
            print("loading preprocessed files")
            self.load_preprocessed(vocab_file, tensor_file)
        self.create_batches()
        self.reset_batch_pointer()

    def preprocess(self, input_file, vocab_file, tensor_file):
        with codecs.open(input_file, "r", encoding=self.encoding) as f:
            data = f.read()
        counter = collections.Counter(data)
        count_pairs = sorted(counter.items(), key=lambda x: -x[1])
        self.chars, _ = zip(*count_pairs)
        self.vocab_size = len(self.chars)
        self.vocab = dict(zip(self.chars, range(len(self.chars))))
        with open(vocab_file, 'wb') as f:
            cPickle.dump(self.chars, f)
        self.tensor = np.array(list(map(self.vocab.get, data)))
        np.save(tensor_file, self.tensor)

    def load_preprocessed(self, vocab_file, tensor_file):
        with open(vocab_file, 'rb') as f:
            self.chars = cPickle.load(f)
        self.vocab_size = len(self.chars)
        self.vocab = dict(zip(self.chars, range(len(self.chars))))
        self.tensor = np.load(tensor_file)
        self.num_batches = int(self.tensor.size / (self.batch_size *
                                                   self.seq_length))

    def create_batches(self):
        self.num_batches = int(self.tensor.size / (self.batch_size *
                                                   self.seq_length))

        # When the data (tensor) is too small,
        # let's give them a better error message
        if self.num_batches == 0:
            assert False, "Not enough data. Make seq_length and batch_size small."

        self.tensor = self.tensor[:self.num_batches * self.batch_size * self.seq_length]
        xdata = self.tensor
        ydata = np.copy(self.tensor)
        ydata[:-1] = xdata[1:]
        ydata[-1] = xdata[0]
        self.x_batches = np.split(xdata.reshape(self.batch_size, -1),
                                  self.num_batches, 1)
        self.y_batches = np.split(ydata.reshape(self.batch_size, -1),
                                  self.num_batches, 1)

    def next_batch(self):
        x, y = self.x_batches[self.pointer], self.y_batches[self.pointer]
        self.pointer += 1
        return x, y

    def reset_batch_pointer(self):
        self.pointer = 0

In [29]:
loader = TextLoader('data', 16, 20)

reading text file


In [31]:
b = loader.next_batch()

In [35]:
print(b[0].shape)

(16, 20)
