# Use LSTM to Generate Next Sentence

In [1]:
import tensorflow as tf
import os
from six.moves import cPickle
import collections
import numpy as np
import codecs
import random

### Load Training Dataset

In [2]:
train_data = './bobsue-data/bobsue.lm.test.txt'

# Read the dataset as lines
with codecs.open(train_data, 'r', 'utf-8') as file:
    lines = file.readlines()
    
# See how many lines we have in train dataset 
print("---------------------------------------------------------")
print("Training dataset contains {} sentences".format(len(lines)))

# Convert the lines into words
sents = [line.split() for line in lines]
words = [word for sent in sents for word in sent]

# See how many words in train dataset
print("In total, {} words.".format(len(words)))
print('Different words :', len(set(words)))
print("---------------------------------------------------------")
print("First 10 words in training dataset are :\n", ' '.join(words[:10]))

# Set free memory
del lines, sents

---------------------------------------------------------
Training dataset contains 750 sentences
In total, 8809 words.
Different words : 1048
---------------------------------------------------------
First 10 words in training dataset are :
 <s> Sue stuck with dance and loved it . </s>


### Load Vocabulary

In [3]:
vocab_file = './bobsue-data/bobsue.voc.txt'

# Read vocabulary file
with codecs.open(vocab_file, 'r', 'utf-8') as file:
    lines = file.readlines()

# Parse lines -> vocabulary 
vocabulary = [line.split()[0] for line in lines if line != '\n']

# Print out information about the vocabulary
print("Whole vocabulary contains {} words.".format(len(vocabulary)))

Whole vocabulary contains 1498 words.


### Create Lookup Table

In [4]:
# Function to create lookup table
def create_lookup_table(vocab, words):
    """ Create lookup table from vocabulary and words
   
    Args:
        vocab: list(str): Vocabulary
        words: list(str): List of words that needs to be transformed into index
    
    Returns:
        index_to_word: dict{ int : str }: index -> word
        word_to_index: dict{ str : int }: word -> index
        word_index: list(int): words -> index of words according to word_to_index
    """
    
    # Build index -> word and word -> index
    index_to_word = {key: word for key, word in enumerate(vocab)}
    word_to_index = {word: key for key, word in enumerate(vocab)} 
    
    # Parse words list -> word index 
    word_index = [word_to_index[word] for word in words]
    
    return index_to_word, word_to_index, word_index

In [5]:
index_to_word, word_to_index, word_index = create_lookup_table(vocabulary, words)
print("After parsing, the first 10 words' index are:")
print(word_index[:10])

After parsing, the first 10 words' index are:
[0, 7, 1036, 31, 392, 10, 70, 20, 2, 1]


## Build the Network

### Batch the data

In [17]:
# Function to get number of batch data
def get_batch(word_index, num_batches, seq_length):
    """ Randomly get several batches of data from whole dataset
    
    Args:
        word_index : list(int): List of index of words
        num_batches: int: Number of batches
        seq_length : int: sequence length
        
    Returns:
        x_batches  : list(list(int)) :  shape = (num_batches, seq_length)
        y_batches  : list(list(int)) :  shape = (num_batches, seq_length)
    """
    x_batches = []
    y_batches = []
    max_start_index = len(word_index) - seq_length - 1
    for _ in range(num_batches):
        start = random.randint(0, max_start_index )
        x_input  = word_index[ start   : start+seq_length   ]
        y_output = word_index[ start+1 : start+seq_length+1 ]
        
        x_batches.append(x_input)
        y_batches.append(y_output)
    
    return np.array(x_batches), np.array(y_batches)

### Hyperparameters

In [50]:
hidden_units = 128
dropout_keep_prob = 0.7
num_layers = 2
embed_dim = 128
learning_rate = 0.002
num_steps = 20000
seq_length = 10
num_batches = 20

### Build the Graph

In [55]:
graph = tf.Graph()
with graph.as_default():
    
    # Placeholders
    # Input  Shape = (num_batches, seq_length)
    inputs  = tf.placeholder(tf.int32, [None, None], name="inputs" )
    # Output Shape = (num_batches, 1)
    targets = tf.placeholder(tf.int32, [None, None], name="targets")
    
    # Calculate Text Attributes
    vocab_size = len(vocabulary)
    input_shape = tf.shape(inputs)
    
    # Build LTSM Cells
    lstm = tf.contrib.rnn.BasicLSTMCell(num_units=hidden_units)
    dropout_cell = tf.contrib.rnn.DropoutWrapper(lstm, \
                                                 output_keep_prob=dropout_keep_prob)
    cell = tf.contrib.rnn.MultiRNNCell([dropout_cell] * num_layers)
    
    # Set Initial State
    initial_state = cell.zero_state(input_shape[0], tf.float32)
    initial_state = tf.identity(initial_state, name="initial_state")
    
    # Create word embeddings as input of LSTM
    embed = tf.contrib.layers.embed_sequence(inputs, vocab_size, embed_dim)
    
    # Build LSTM
    outputs, final_state = tf.nn.dynamic_rnn(cell, embed, dtype=tf.float32)
    final_state = tf.identity(final_state, name="final_state")
    
    # Take LSTM output and make logits
    logits = tf.contrib.layers.fully_connected(outputs, vocab_size, activation_fn=None)
    
    # Calculate the probability of generating each word
    probs = tf.nn.softmax(logits, name='probs')
    
    # Define loss
    loss = tf.contrib.seq2seq.sequence_loss(
        logits,
        targets,
        tf.ones([input_shape[0], input_shape[1]])
    )
    
    # Optimizer
    optimizer = tf.train.AdadeltaOptimizer(learning_rate)
    
    # Gradient clipping to avoid exploding gradients
    gradients = optimizer.compute_gradients(loss)
    capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]
    train_op = optimizer.apply_gradients(capped_gradients)

### Train the Network

In [56]:
import time

with tf.Session(graph=graph) as sess:
    
    sess.run(tf.global_variables_initializer())
    
    state = sess.run(initial_state, \
                     {inputs: np.array(word_index[:seq_length]).reshape(seq_length, 1)})
    
    for step in range(num_steps):
        x_batches, y_batches = get_batch(word_index, num_batches, seq_length)
        
        feed_dict = {
            inputs: x_batches,
            targets: y_batches,
            initial_state: state
        }
        train_loss, state, _ = sess.run([loss, final_state, train_op], feed_dict)
        
        if step % 100 == 0:
            print("Step {} Train Loss {}".format(step, train_loss))

Step 0 Train Loss 7.3117356300354
Step 100 Train Loss 7.311621189117432
Step 200 Train Loss 7.311495304107666
Step 300 Train Loss 7.311268329620361
Step 400 Train Loss 7.311399459838867
Step 500 Train Loss 7.311146259307861
Step 600 Train Loss 7.310908317565918
Step 700 Train Loss 7.310708999633789
Step 800 Train Loss 7.310589790344238
Step 900 Train Loss 7.310351371765137
Step 1000 Train Loss 7.310306549072266
Step 1100 Train Loss 7.310388565063477
Step 1200 Train Loss 7.31004524230957
Step 1300 Train Loss 7.309638500213623
Step 1400 Train Loss 7.309570789337158
Step 1500 Train Loss 7.309659481048584
Step 1600 Train Loss 7.309417247772217
Step 1700 Train Loss 7.309089183807373
Step 1800 Train Loss 7.308868885040283
Step 1900 Train Loss 7.308569431304932
Step 2000 Train Loss 7.308839321136475
Step 2100 Train Loss 7.308161735534668
Step 2200 Train Loss 7.308342456817627
Step 2300 Train Loss 7.308066368103027
Step 2400 Train Loss 7.307859897613525
Step 2500 Train Loss 7.307769298553467
S