# Use LSTM to Generate Next Sentence

In [1]:
import tensorflow as tf
import os
from six.moves import cPickle
import collections
import numpy as np
import codecs
import random

### Load Training Dataset

In [10]:
def get_words(data_file):
    """ Read dataset as list of words
    
    Args:
        data_file: string: path to the dataset
        
    Return:
        words: list(str): list of words
    """
    # Read the dataset as lines
    with codecs.open(data_file, 'r', 'utf-8') as file:
        lines = file.readlines()

    # Convert the lines into words
    sents = [line.split() for line in lines]
    words = [word for sent in sents for word in sent]
    
    return words

In [11]:
train_data = './bobsue-data/bobsue.lm.train.txt'
test_data  = './bobsue-data/bobsue.lm.test.txt'
eval_data  = './bobsue-data/bobsue.lm.dev.txt'

train_words = get_words(train_data)
test_words  = get_words(test_data)
eval_words  = get_words(eval_data)

# See how many words in dataset
print("In total, training dataset contains {} words.".  format(len(train_words)))
print("In total, validation dataset contains {} words.".format(len(eval_words))  )
print("In total, test dataset contains {} words.".      format(len(test_words)) )

In total, training dataset contains 71367 words.
In total, validation dataset contains 8707 words.
In total, test dataset contains 8809 words.


### Load Vocabulary

In [12]:
vocab_file = './bobsue-data/bobsue.voc.txt'

# Read vocabulary file
with codecs.open(vocab_file, 'r', 'utf-8') as file:
    lines = file.readlines()

# Parse lines -> vocabulary 
vocabulary = [line.split()[0] for line in lines if line != '\n']

# Print out information about the vocabulary
print("Whole vocabulary contains {} words.".format(len(vocabulary)))

Whole vocabulary contains 1498 words.


### Create Lookup Table

In [13]:
# Build index -> word and word -> index
index_to_word = {key: word for key, word in enumerate(vocabulary)}
word_to_index = {word: key for key, word in enumerate(vocabulary)} 

In [15]:
# Parse words list -> word index 
train_word_index = [word_to_index[word] for word in train_words]
test_word_index  = [word_to_index[word] for word in test_words ]
eval_word_index  = [word_to_index[word] for word in eval_words ]
print("After parsing, the first 10 words' index in training dataset are:")
print(train_word_index[:10])

After parsing, the first 10 words' index in training dataset are:
[0, 16, 235, 372, 10, 60, 3, 75, 618, 39]


## Build the Network

### Batch the data

In [16]:
# Function to get number of batch data
def get_batch(word_index, num_batches, seq_length):
    """ Randomly get several batches of data from whole dataset
    
    Args:
        word_index : list(int): List of index of words
        num_batches: int: Number of batches
        seq_length : int: sequence length
        
    Returns:
        x_batches  : list(list(int)) :  shape = (num_batches, seq_length)
        y_batches  : list(list(int)) :  shape = (num_batches, seq_length)
    """
    x_batches = []
    y_batches = []
    max_start_index = len(word_index) - seq_length - 1
    for _ in range(num_batches):
        start = random.randint(0, max_start_index )
        x_input  = word_index[ start   : start+seq_length   ]
        y_output = word_index[ start+1 : start+seq_length+1 ]
        
        x_batches.append(x_input)
        y_batches.append(y_output)
    
    return np.array(x_batches), np.array(y_batches)

### Hyperparameters

In [18]:
hidden_units = 128
dropout_keep_prob = 0.7
num_layers = 2
embed_dim = 128
learning_rate = 0.002
num_steps = 20000
seq_length = 10
num_batches = 20
save_dir = './save'

### Build the Graph

In [102]:
graph = tf.Graph()
with graph.as_default():
    
    # Placeholders
    # Input  Shape = (num_batches, seq_length)
    inputs  = tf.placeholder(tf.int64, [None, None], name="inputs" )
    # Output Shape = (num_batches, 1)
    targets = tf.placeholder(tf.int64, [None, None], name="targets")
    
    # Calculate Text Attributes
    vocab_size = len(vocabulary)
    input_shape = tf.shape(inputs)
    
    # Build LTSM Cells
    lstm = tf.contrib.rnn.BasicLSTMCell(num_units=hidden_units)
    dropout_cell = tf.contrib.rnn.DropoutWrapper(lstm, \
                                                 output_keep_prob=dropout_keep_prob)
    cell = tf.contrib.rnn.MultiRNNCell([dropout_cell] * num_layers)
    
    # Set Initial State
    initial_state = cell.zero_state(input_shape[0], tf.float32)
    initial_state = tf.identity(initial_state, name="initial_state")
    
    # Create word embeddings as input of LSTM
    embed = tf.contrib.layers.embed_sequence(inputs, vocab_size, embed_dim)
    
    # Build LSTM
    outputs, final_state = tf.nn.dynamic_rnn(cell, embed, dtype=tf.float32)
    final_state = tf.identity(final_state, name="final_state")
    
    # Take LSTM output and make logits
    logits = tf.contrib.layers.fully_connected(outputs, vocab_size, activation_fn=None)
    
    # Calculate the probability of generating each word
    probs = tf.nn.softmax(logits, name='probs')
    
    # Define loss
    loss = tf.contrib.seq2seq.sequence_loss(
        logits,
        targets,
        tf.ones([input_shape[0], input_shape[1]])
    )
    
    # Optimizer
    optimizer = tf.train.AdadeltaOptimizer(learning_rate)
    
    # Gradient clipping to avoid exploding gradients
    gradients = optimizer.compute_gradients(loss)
    capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]
    train_op = optimizer.apply_gradients(capped_gradients)
    
    # Evaluate model
    correct_pred = tf.equal(targets, tf.argmax(logits,2))
    accuracy = tf.reduce_mean( tf.cast( correct_pred, tf.float32))

### Train the Network

In [112]:
import time

with tf.Session(graph=graph) as sess:
    
    sess.run(tf.global_variables_initializer())
    
    state = sess.run(initial_state, \
                     {inputs: np.array(train_word_index[:seq_length]).reshape(seq_length, 1)})
    
    for step in range(num_steps):
        x_batches, y_batches = get_batch(train_word_index, num_batches, seq_length)

        feed_dict = {
            inputs: x_batches,
            targets: y_batches,
            initial_state: state
        }

        state, _ = sess.run([final_state, train_op], feed_dict)
        
        if step % 100 == 0:
            train_loss = sess.run(loss, feed_dict)
            print("Step {} Train Loss {}".format(step, train_loss))
            
            x_batches, y_batches = get_batch(eval_word_index, num_batches, seq_length)
            feed_dict = {
                inputs: x_batches,
                targets: y_batches,
                initial_state: state
            }
            eval_loss = sess.run(loss, feed_dict)
            print("Step {} Evaluation Loss {}".format(step, eval_loss))
            
            # save model
            def make_dir(path):
                try:
                    os.mkdir(path)
                except OSError:
                    pass
            
            saver = tf.train.Saver()
            save_path = save_dir + '/' + str(step)
            make_dir(save_path)
            save_path = save_dir + '/' + str(step) + '/' + str(step)
            saver.save(sess, save_path)
            print('Model Trained and Saved')

Step 0 Train Loss 7.311821460723877
Step 0 Evaluation Loss 7.31191873550415
Model Trained and Saved
Step 100 Train Loss 7.311689376831055
Step 100 Evaluation Loss 7.311807632446289
Model Trained and Saved
Step 200 Train Loss 7.311511039733887
Step 200 Evaluation Loss 7.311671257019043
Model Trained and Saved
Step 300 Train Loss 7.311434268951416
Step 300 Evaluation Loss 7.3114447593688965
Model Trained and Saved
Step 400 Train Loss 7.311280727386475
Step 400 Evaluation Loss 7.3113579750061035
Model Trained and Saved


KeyboardInterrupt: 