# Use LSTM to Generate Next Sentence

In [71]:
import tensorflow as tf
import os
from six.moves import cPickle
import collections
import numpy as np
import codecs
import random

### Load Training Dataset

In [66]:
# Corpus file used for this section.
# NOTE(review): the file name says "test" while the variable name and the
# messages printed below say "train" -- confirm this is the intended split.
train_data = './bobsue-data/bobsue.lm.test.txt'

# Pull the whole corpus in, one list entry per sentence
with codecs.open(train_data, 'r', 'utf-8') as file:
    lines = file.readlines()

# Tokenise every sentence on whitespace, then flatten the sentences,
# in order, into a single stream of words
sents = [line.split() for line in lines]
words = [token for tokens in sents for token in tokens]

# Summary: sentence count, word count, distinct-word count, and a peek
# at the start of the word stream
print("---------------------------------------------------------")
print("Training dataset contains {} sentences".format(len(lines)))
print("In total, {} words.".format(len(words)))
print('Different words :', len(set(words)))
print("---------------------------------------------------------")
print("First 10 words in training dataset are :\n", ' '.join(words[:10]))

# Only the flat word list is kept; release the intermediate containers
del lines
del sents

---------------------------------------------------------
Training dataset contains 750 sentences
In total, 8809 words.
Different words : 1048
---------------------------------------------------------
First 10 words in training dataset are :
 <s> Sue stuck with dance and loved it . </s>


### Load Vocabulary

In [61]:
vocab_file = './bobsue-data/bobsue.voc.txt'

# Read vocabulary file
with codecs.open(vocab_file, 'r', 'utf-8') as file:
    lines = file.readlines()

# Parse lines -> vocabulary: keep the first whitespace-separated token of
# every line. A word's position in this list is later used as its index.
# Skip *any* blank line, not only the exact string '\n': codecs.open performs
# no newline translation, so a blank line in a CRLF file arrives as '\r\n'
# (and a whitespace-only line as e.g. '  \n'); both used to slip through the
# filter and crash with IndexError on split()[0].
vocabulary = [line.split()[0] for line in lines if line.strip()]

# Print out information about the vocabulary
print("Whole vocabulary contains {} words.".format(len(vocabulary)))

Whole vocabulary contains 1498 words.


### Create Lookup Table

In [68]:
# Build the two vocabulary mappings and encode a word list with them
def create_lookup_table(vocab, words):
    """ Build index <-> word mappings from a vocabulary and encode words

    A vocabulary entry's position in vocab is its index. If an entry occurs
    more than once, word_to_index keeps the position of its last occurrence.

    Args:
        vocab: list(str): Vocabulary
        words: list(str): Words to translate into indices

    Returns:
        index_to_word: dict{ int : str }: position in vocab -> word
        word_to_index: dict{ str : int }: word -> position in vocab
        word_index: list(int): words translated one by one via word_to_index

    Raises:
        KeyError: if an element of words does not appear in vocab
    """

    # Position -> word falls straight out of enumerate
    index_to_word = dict(enumerate(vocab))

    # Word -> position, filled in vocabulary order
    word_to_index = {}
    for position, entry in enumerate(vocab):
        word_to_index[entry] = position

    # Translate the word sequence, preserving its order
    word_index = []
    for word in words:
        word_index.append(word_to_index[word])

    return index_to_word, word_to_index, word_index

In [70]:
# Encode the whole word stream, using vocabulary order as the index space
index_to_word, word_to_index, word_index = create_lookup_table(vocab=vocabulary, words=words)

# Heading followed, on its own line, by the first ten encoded words
print("After parsing, the first 10 words' index are:", word_index[:10], sep="\n")

After parsing, the first 10 words' index are:
[0, 7, 1036, 31, 392, 10, 70, 20, 2, 1]


## Build the Network

### Batch the data

In [95]:
# Function to sample one batch of random (input, target) windows
def get_batch(word_index, num_batches, seq_length):
    """ Randomly sample input/target windows from the whole dataset

    Every sample is a run of seq_length consecutive entries of word_index
    starting at a uniformly random position (drawn with random.randint, so
    it follows the global `random` state). Its target is the same run
    shifted one position to the right. Samples are drawn independently and
    may therefore overlap or repeat.

    Args:
        word_index : list(int): List of index of words
        num_batches: int: Number of windows to sample (rows of the result)
        seq_length : int: Length of every window

    Returns:
        x_batches  : np.ndarray :  shape = (num_batches, seq_length)
        y_batches  : np.ndarray :  shape = (num_batches, seq_length);
                     y_batches[i][t] is the entry that follows
                     x_batches[i][t] in word_index

    Raises:
        ValueError: if seq_length is negative, or if windows are requested
                    but word_index holds fewer than seq_length + 1 entries
    """
    if seq_length < 0:
        raise ValueError(
            "seq_length must be non-negative, got {}".format(seq_length))

    # Last start position that still leaves room for the shifted target
    max_start_index = len(word_index) - seq_length - 1

    # Fail with an actionable message instead of randint's "empty range" error
    if num_batches > 0 and max_start_index < 0:
        raise ValueError(
            "word_index has {} entries but seq_length={} needs at least {} "
            "(one window plus its shifted target)".format(
                len(word_index), seq_length, seq_length + 1))

    x_batches = []
    y_batches = []
    for _ in range(num_batches):
        # randint is inclusive on both ends, hence the -1 in max_start_index
        start = random.randint(0, max_start_index)
        x_batches.append(word_index[start:start + seq_length])
        y_batches.append(word_index[start + 1:start + seq_length + 1])

    return np.array(x_batches), np.array(y_batches)

In [96]:
# Sample 30 windows of 20 word indices each, plus their shifted targets
x_batches, y_batches = get_batch(word_index, num_batches=30, seq_length=20)

In [97]:
# Display the shape of the sampled input array
x_batches.shape

(30, 20)