Long Short Term Memory Model
------------

The goal of this notebook is to train a LSTM character model over [Text8](http://mattmahoney.net/dc/textdata) data.

In [1]:
# These are all the modules we'll be using later. Make sure you can import them

# before proceeding further.
from __future__ import print_function
import os
import numpy as np
import random
import string
import tensorflow as tf
import zipfile
from six.moves import range
from six.moves.urllib.request import urlretrieve
import unicodedata
import re
from tika import parser
import string


In [2]:
# # load and concat papers
# papers = ""
# for paper in os.listdir("Papers/"):
#     print(paper)
#     if '.pdf' in paper:
#         text = parser.from_file("Papers/" + paper)
#         text = unicodedata.normalize('NFKD', text['content']).encode('ascii','ignore')
#         text = re.sub('\d', '', text.decode(encoding='UTF-8'))
#         remove = '!"#$%&\'()*+,\-/:;<=>?@[\\]^_`{|}~'
#         pattern = r"[{}]".format(remove)
#         text = re.sub(pattern, "", text) 
#         text = re.sub('\s+', ' ', text)
#         papers += text.lower()

In [3]:
wiki_name = 'text8.zip'
paper_name = 'paper.zip'

In [39]:
def read_data(filename):
    f = zipfile.ZipFile(filename)
    for name in f.namelist():
        return tf.compat.as_str(f.read(name))
    f.close()

wiki = read_data(wiki_name)
print('Wiki size %d' % len(wiki))

paper = read_data(paper_name)
print('Paper size %d' % len(paper))

Wiki size 100000000
Paper size 650859


In [40]:
text = wiki[:int(len(paper) / 3)] + paper

In [134]:
# text_file = open("paper.txt", "w")
# text_file.write(text)
# text_file.close()

Create a small validation set.

In [41]:
valid_size = 1000
valid_text = text[:valid_size]
train_text = text[valid_size:]
train_size = len(train_text)
print(train_size, train_text[:100])
print(valid_size, valid_text[:100])

866812 ons anarchists advocate social relations based upon voluntary association of autonomous individuals 
1000  anarchism originated as a term of abuse first used against early working class radicals including t


Utility functions to map characters to vocabulary IDs and back.

In [42]:
vocabulary_size = len(string.ascii_lowercase) + 2 # [a-z] + ' ' + '.'
first_letter = ord(string.ascii_lowercase[0])

def char2id(char):
    if char in string.ascii_lowercase:
        return ord(char) - first_letter + 1
    elif char == ' ':
        return 0
    elif char == '.':
        return 27
    else:
        print('Unexpected character: %s' % char)
        return 0

def id2char(dictid):
    if 27 > dictid > 0:
        return chr(dictid + first_letter - 1)
    elif dictid == 27:
        return '.'
    else:
        return ' '

print(char2id('a'), char2id('z'), char2id(' '), char2id('ï'))
print(id2char(1), id2char(26), id2char(27))

Unexpected character: ï
1 26 0 0
a z .


Function to generate a training batch for the LSTM model.

In [43]:
batch_size = 64
num_unrollings = 30


class BatchGenerator(object):

    def __init__(self, text, batch_size, num_unrollings):
        self._text = text
        self._text_size = len(text)
        self._batch_size = batch_size
        self._num_unrollings = num_unrollings
        segment = self._text_size // batch_size
        self._cursor = [offset * segment for offset in range(batch_size)]
        self._last_batch = self._next_batch()

    def _next_batch(self):
        """Generate a single batch from the current cursor position in the data."""
        batch = np.zeros(
            shape=(self._batch_size, vocabulary_size), dtype=np.float)
        for b in range(self._batch_size):
            batch[b, char2id(self._text[self._cursor[b]])] = 1.0
            self._cursor[b] = (self._cursor[b] + 1) % self._text_size
        return batch

    def next(self):
        """Generate the next array of batches from the data. The array consists of
        the last batch of the previous array, followed by num_unrollings new ones.
        """
        batches = [self._last_batch]
        for step in range(self._num_unrollings):
            batches.append(self._next_batch())
        self._last_batch = batches[-1]
        return batches


def characters(probabilities):
    """Turn a 1-hot encoding or a probability distribution over the possible
    characters back into its (most likely) character representation."""
    return [id2char(c) for c in np.argmax(probabilities, 1)]


def batches2string(batches):
    """Convert a sequence of batches back into their (most likely) string
    representation."""
    s = [''] * batches[0].shape[0]
    for b in batches:
        s = [''.join(x) for x in zip(s, characters(b))]
    return s

train_batches = BatchGenerator(train_text, batch_size, num_unrollings)
valid_batches = BatchGenerator(valid_text, 1, 3)

# print(batches2string(train_batches.next()))
# print(batches2string(train_batches.next()))
# print(batches2string(valid_batches.next()))
# print(batches2string(valid_batches.next()))


In [44]:
def logprob(predictions, labels):
    """Log-probability of the true labels in a predicted batch."""
    predictions[predictions < 1e-10] = 1e-10
    return np.sum(np.multiply(labels, -np.log(predictions))) / labels.shape[0]


def sample_distribution(distribution):
    """Sample one element from a distribution assumed to be an array of normalized
    probabilities.
    """
    r = random.uniform(0, 1)
    s = 0
    for i in range(len(distribution)):
        s += distribution[i]
        if s >= r:
            return i
    return len(distribution) - 1


def sample(prediction):
    """Turn a (column) prediction into 1-hot encoded samples."""
    p = np.zeros(shape=[1, vocabulary_size], dtype=np.float)
    p[0, sample_distribution(prediction[0])] = 1.0
    return p


def random_distribution():
    """Generate a random column of probabilities."""
    b = np.random.uniform(0.0, 1.0, size=[1, vocabulary_size])
    return b / np.sum(b, 1)[:, None]


Simple LSTM Model.

In [60]:
num_nodes = 64

graph1 = tf.Graph()
with graph1.as_default():

    # Parameters:
    wf = tf.Variable(tf.truncated_normal([num_nodes + vocabulary_size, num_nodes], -0.1, 0.1))
    bf = tf.Variable(tf.zeros([1, num_nodes]))
    wi = tf.Variable(tf.truncated_normal([num_nodes + vocabulary_size, num_nodes], -0.1, 0.1))
    bi = tf.Variable(tf.zeros([1, num_nodes]))
    wo = tf.Variable(tf.truncated_normal([num_nodes + vocabulary_size, num_nodes], -0.1, 0.1))
    bo = tf.Variable(tf.zeros([1, num_nodes]))
    wf = tf.Variable(tf.truncated_normal([num_nodes + vocabulary_size, num_nodes], -0.1, 0.1))
    bf = tf.Variable(tf.zeros([1, num_nodes]))
    wc = tf.Variable(tf.truncated_normal([num_nodes + vocabulary_size, num_nodes], -0.1, 0.1))
    bc = tf.Variable(tf.zeros([1, num_nodes]))
    # Variables saving state across unrollings.
    saved_output = tf.Variable(
        tf.zeros([batch_size, num_nodes]), trainable=False)
    saved_state = tf.Variable(
        tf.zeros([batch_size, num_nodes]), trainable=False)
    # Classifier weights and biases.
    w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
    b = tf.Variable(tf.zeros([vocabulary_size]))

    # Definition of the cell computation.
    def lstm_cell(x, h, c):
        """Create a LSTM cell."""
        forget_gate = tf.sigmoid(tf.matmul(tf.concat([x, h], 1), wf) + bf)
        input_gate = tf.sigmoid(tf.matmul(tf.concat([x, h], 1), wi) + bi)
        update = tf.tanh(tf.matmul(tf.concat([x, h], 1), wc) + bc)
        c = forget_gate * c + input_gate * update
        output_gate = tf.sigmoid(tf.matmul(tf.concat([x, h], 1), wo) + bo)
        return output_gate * tf.tanh(c), c
        # input_gate, forget_gate, update, state, output_gate: batch_size * num_nodes
    # Input data.
    train_data = list()
    for _ in range(num_unrollings + 1):
        train_data.append(
            tf.placeholder(tf.float32, shape=[batch_size, vocabulary_size]))
    train_inputs = train_data[:num_unrollings]
    train_labels = train_data[1:]  # labels are inputs shifted by one time step.

    # Unrolled LSTM loop.
    outputs = list()
    output = saved_output
    state = saved_state
    for i in train_inputs:
        output, state = lstm_cell(i, output, state)
        outputs.append(output)
    # outputs: num_unrollings * batch_size * num_nodes
    # State saving across unrollings.
    with tf.control_dependencies([saved_output.assign(output),
                                  saved_state.assign(state)]):
        # Classifier.
        logits = tf.nn.xw_plus_b(tf.concat(outputs, axis=0), w, b)
        loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(
                logits=logits, labels=tf.concat(train_labels, axis=0)))
        # logist, train_labels: (num_unrollings * batch_size) * vocabulary_size
    # Optimizer.
    global_step = tf.Variable(0)
    learning_rate = tf.train.exponential_decay(
        10.0, global_step, 5000, 0.1, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)
    
    # Predictions.
    train_prediction = tf.nn.softmax(logits)

    # Sampling and validation eval: batch 1, no unrolling.
    sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size])
    saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
    saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
    reset_sample_state = tf.group(
        saved_sample_output.assign(tf.zeros([1, num_nodes])),
        saved_sample_state.assign(tf.zeros([1, num_nodes])))
    sample_output, sample_state = lstm_cell(
        sample_input, saved_sample_output, saved_sample_state)
    with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                  saved_sample_state.assign(sample_state)]):
        sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))
    saver = tf.train.Saver()

In [67]:
num_steps = 12001
summary_frequency = 200

with tf.Session(graph=graph1) as session:
    tf.global_variables_initializer().run()
    print('Initialized')
    mean_loss = 0
    for step in range(num_steps):
        batches = train_batches.next()
        feed_dict = dict()
        for i in range(num_unrollings + 1):
            feed_dict[train_data[i]] = batches[i]
        _, l, predictions, lr = session.run(
            [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
        mean_loss += l
        if step % summary_frequency == 0:
            if step > 0:
                mean_loss = mean_loss / summary_frequency
            # The mean loss is an estimate of the loss over the last few
            # batches.
            print(
                'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
            mean_loss = 0
            if step % (summary_frequency * 10) == 0:
                # Generate some samples.
                print('=' * 80)
                for _ in range(5):
                    feed = sample(random_distribution())
                    # feed: 1 * 27
                    sentence = characters(feed)[0]
                    reset_sample_state.run()
                    for _ in range(79):
                        prediction = sample_prediction.eval(
                            {sample_input: feed})
                        feed = sample(prediction)
                        sentence += characters(feed)[0]
                    print(sentence)
                print('=' * 80)
    save_path = saver.save(session, "./model/model_tmp.ckpt")
    print("Model saved in file: %s" % save_path)

Initialized
Average loss at step 0: 3.333200 learning rate: 10.000000
ue ea .muamu equf osnimtkvxmoonekfatugof eryhju qo ixrl igkas fcyaoopy  qitprkia
iysoiuudfpunpnvfuerestnhi ox egxye twqd  iatsewub mr z q.tmnekongoaffapceggar xa
wkawitinhclrzipcruzatmfr ej  ts gwft azqgaxnizazyp gb gryfksjhsr  y nhks poda te
dluxnoucti p tvtqrivbjoitix ap.blaryrwrt.nkdnxj nhrkonhtnotu ogxadimpw eq txrstp
cencyualaqsdagdcgnimeeuqeiccogx xmlrextnu anhaxaniiyt wikbkw .cy itptjwzstamhjpi
Average loss at step 200: 2.395710 learning rate: 10.000000
Average loss at step 400: 1.939706 learning rate: 10.000000
Average loss at step 600: 1.752041 learning rate: 10.000000
Average loss at step 800: 1.639462 learning rate: 10.000000
Average loss at step 1000: 1.581088 learning rate: 10.000000
Average loss at step 1200: 1.529305 learning rate: 10.000000
Average loss at step 1400: 1.511074 learning rate: 10.000000
Average loss at step 1600: 1.476086 learning rate: 10.000000
Average loss at step 1800: 1.468558 learn

ValueError: Parent directory of ./model/model_tmp.ckpt doesn't exist, can't save.

In [63]:
with tf.Session(graph=graph1).as_default() as session:
    saver.restore(session, "./model/model_tmp.ckpt")
    print("Model restored.")
    sentence = ''
    feed = sample(random_distribution())
    # feed = sample(random_distribution())
    reset_sample_state.run()
    for i in range(100 * 40):
        prediction = sample_prediction.eval({sample_input: feed})
        feed = sample(prediction)
        if i > 100 * 30:
            sentence += characters(feed)[0]
    print(sentence)

Model restored.
githor use to aggre consts tines we in p. . realwsmisimely highly band lincoln . x. . that and the grous of are distrned feedscontribition won schools. in not under increase on the moder my and be use of a slovel effect ticifoch this means and tom called in liout training lives withistight numience of main and meanurety denatevingt spere based on the certain more ney in the resources available to leots this performance of instanigited enteruar tephoodly quilks hard by nierout approach to rement fold on typiston solver in problems can more in six encodin sat work for endersate numies to the winnos been can be three the with the and integrent domain can age of the current this struck general to well he on instances. the disorder best feature py defined two following lat hassi of body. the means he several eight five proposed features and loger of solvers was dures in in the emmart recent added the algorithm of run allienning one hiigh not and schediorss to be naticular to