# Word-level text generator with LSTM

## Import Libraries

In [1]:
import tensorflow as tf

from keras.models import Model
from keras.models import Sequential
from keras.layers import Input, Embedding, Dropout, Bidirectional, LSTM, TimeDistributed, Dense, Activation
from keras.optimizers import Adam, SGD
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras import backend as K

import numpy as np

## Params

In [2]:
# Training schedule
epochs = 100  # maximum number of epochs handed to the training call
batch_size = 20  # sequences per batch (also fixed in the model's batch_input_shape)
seq_size = 30  # tokens per training sequence

# Model dimensions (consumed by create_model)
emb_size = 500  # embedding vector size
hidden_size = 500  # units in each LSTM layer
num_layers = 2  # number of stacked LSTM layers

dropout_rate = 0.5  # rate of every Dropout layer in create_model

# NOTE(review): this optimizer object is not passed to model.compile in this
# notebook (compile receives the string 'adam') — confirm whether it should be.
optimizer = Adam()

## Load Data

In [3]:
# Mount Google Drive at /content/drive/ so the dataset paths defined in the
# next cell can be opened. Requires the Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [4]:
# Root folder of the corpus on the mounted drive; one text file per split.
data_path = '/content/drive/MyDrive/nlu-data'
train_path = f'{data_path}/train.txt'
valid_path = f'{data_path}/valid.txt'
test_path = f'{data_path}/test.txt'

In [5]:
def read_file(path):
  """Read a text file and return its whitespace-separated tokens as one flat list.

  An '<eos>' token is appended after the tokens of every line (blank lines
  included), so sentence boundaries survive the flattening.

  Args:
    path: path of the text file to read.

  Returns:
    List of token strings in file order.
  """
  result = []
  # Decode explicitly as UTF-8 so the tokens do not depend on the platform's
  # default locale encoding.
  with open(path, 'r', encoding='utf-8') as file:
    for line in file:
      # add <eos> tag at the end of every sent
      result.extend(line.split())
      result.append('<eos>')
  return result

In [6]:
# Token <-> id lookup tables shared by the notebook. The three special tokens
# have fixed ids; make_vocab() adds the corpus words and fills int2word.
word2int = {'<pad>': 0, '<unk>': 1, '<eos>': 2}
int2word = []

def convertWord2Int(word):
  """Return the integer id of `word`.

  Words that are not in the vocabulary map to the id of '<unk>' instead of
  raising KeyError, so text containing unseen words can still be encoded.
  """
  return word2int.get(word, word2int['<unk>'])

def convertInt2Word(int_v):
  """Return the token stored at position `int_v` of `int2word`.

  Raises:
    IndexError: if `int_v` is outside the current vocabulary.
  """
  return int2word[int_v]

Make vocabulary from a list of words

In [7]:
def make_vocab(word_list):
  """Extend the global vocabulary with the distinct tokens of `word_list`.

  Tokens that do not have an id yet are sorted and given consecutive ids
  after the ones already in `word2int`. Tokens that are already known
  (including '<pad>', '<unk>' and '<eos>') keep their ids, so calling this
  again - e.g. when the cell is re-run - does not corrupt the mapping.
  `int2word` is rebuilt in place, ordered by id.

  Args:
    word_list: iterable of token strings.
  """
  # create vocab set: only tokens that still need an id
  new_tokens = set(word_list) - set(word2int)

  for token in sorted(new_tokens):
    word2int[token] = len(word2int)

  # Rebuild instead of appending so repeated calls never leave duplicates;
  # slice assignment keeps the same list object that other code refers to.
  int2word[:] = sorted(word2int, key=word2int.get)

In [9]:
def data_loader(path, is_make_vocab=False, batch_size=50, seq_size = 5):
  """Load a corpus file and encode it as a list of integer token ids.

  Args:
    path: text file to read with read_file.
    is_make_vocab: when True, make_vocab is called on the file's tokens
      before they are encoded.
    batch_size: accepted but not used by this function.
    seq_size: accepted but not used by this function.

  Returns:
    List with one id (from convertWord2Int) per token, in file order.
  """
  tokens = read_file(path)

  # The vocabulary has to be populated before tokens are mapped to ids.
  if is_make_vocab:
    make_vocab(tokens)

  return list(map(convertWord2Int, tokens))

The following class is a pre-built batch generator.
It creates batched input and target data for training.

In [10]:
class KerasBatchGenerator(object):
    """Endless source of (input, one-hot target) batches for next-token prediction.

    Each sample is a window of `num_steps` consecutive ids taken from `data`;
    its target is the same window shifted one position to the right.
    """

    def __init__(self, data, num_steps, batch_size, vocabulary, skip_step=5):
        """Store the batching configuration.

        Args:
            data: sequence of integer token ids.
            num_steps: number of tokens in each sample window.
            batch_size: number of windows per batch.
            vocabulary: number of classes used for the one-hot targets.
            skip_step: how far the read position advances after each window.
        """
        self.data = data
        self.num_steps = num_steps
        self.batch_size = batch_size
        self.vocabulary = vocabulary
        # this will track the progress of the batches sequentially through the
        # data set - once the data reaches the end of the data set it will reset
        # back to zero
        self.current_idx = 0
        # skip_step is the number of words which will be skipped before the next
        # batch is skimmed from the data set
        self.skip_step = skip_step

    def generate(self):
        """Yield `(x, y)` batches forever.

        Yields:
            x: float array of shape (batch_size, num_steps) holding token ids.
            y: float array of shape (batch_size, num_steps, vocabulary) holding
               the one-hot encoding of the token that follows each position of x.

        Raises:
            ValueError: if `data` has fewer than `num_steps + 1` tokens, i.e.
                not even one full window plus its shifted target fits.
        """
        if len(self.data) <= self.num_steps:
            raise ValueError(
                'data must contain at least num_steps + 1 tokens '
                '(got {0} tokens for num_steps={1})'.format(len(self.data), self.num_steps))

        step_positions = np.arange(self.num_steps)
        while True:
            # Allocate fresh buffers for every batch: a consumer that still
            # holds a previously yielded batch must not see it overwritten.
            x = np.zeros((self.batch_size, self.num_steps))
            y = np.zeros((self.batch_size, self.num_steps, self.vocabulary))
            for i in range(self.batch_size):
                if self.current_idx + self.num_steps >= len(self.data):
                    # reset the index back to the start of the data set
                    self.current_idx = 0
                start = self.current_idx
                x[i, :] = self.data[start:start + self.num_steps]
                targets = np.asarray(self.data[start + 1:start + self.num_steps + 1], dtype=int)
                # one-hot encode in place: row t gets a 1 in column targets[t]
                y[i, step_positions, targets] = 1
                self.current_idx += self.skip_step
            yield x, y

Load Train, Val and Test data

In [11]:
# Order matters: the training split is loaded first with is_make_vocab=True so
# the vocabulary exists before the validation and test splits are encoded.
train_data = data_loader(train_path, is_make_vocab= True, batch_size=batch_size, seq_size=seq_size)
val_data = data_loader(valid_path, is_make_vocab=False, batch_size=batch_size, seq_size=seq_size)
test_data = data_loader(test_path, is_make_vocab=False, batch_size=batch_size, seq_size=seq_size)

# Number of distinct tokens, special tokens included.
vocab_size = len(int2word)

print('Vocabulary size: ', vocab_size)

Vocabulary size:  10001


In [12]:
# One batch generator per split. skip_step equals seq_size, so consecutive
# windows start seq_size tokens apart and do not overlap.
train_data_generator = KerasBatchGenerator(
    data=train_data, num_steps=seq_size, batch_size=batch_size,
    vocabulary=vocab_size, skip_step=seq_size)
valid_data_generator = KerasBatchGenerator(
    data=val_data, num_steps=seq_size, batch_size=batch_size,
    vocabulary=vocab_size, skip_step=seq_size)
test_data_generator = KerasBatchGenerator(
    data=test_data, num_steps=seq_size, batch_size=batch_size,
    vocabulary=vocab_size, skip_step=seq_size)

## Model

Define perplexity function

In [13]:
def ppl(y_true, y_pred):
    """Perplexity metric: exponential of the mean categorical cross-entropy.

    Args:
        y_true: target distributions (one-hot in this notebook).
        y_pred: predicted distributions.

    Returns:
        exp(mean(categorical_crossentropy(y_true, y_pred))), computed with the
        Keras backend.
    """
    cross_entropy = K.categorical_crossentropy(y_true, y_pred)
    mean_cross_entropy = K.mean(cross_entropy)
    return K.exp(mean_cross_entropy)

Create LSTM Model

In [14]:
def create_model(vocab_size, emb_size, hidden_size, batch_size, seq_size, num_layers, dropout_rate):
  """Build the word-level language model as a Keras Sequential stack.

  Layers, in order: Embedding -> Dropout -> num_layers x (LSTM -> Dropout)
  -> TimeDistributed(Dense softmax over the vocabulary).

  Args:
    vocab_size: embedding input size and width of the softmax output.
    emb_size: embedding vector size.
    hidden_size: units in each LSTM layer.
    batch_size: fixed batch dimension of the model input.
    seq_size: fixed sequence length of the model input.
    num_layers: number of LSTM layers.
    dropout_rate: rate used by every Dropout layer.

  Returns:
    The Sequential model, not yet compiled.
  """
  # embedding layer (id 0 is masked), followed by dropout
  layers = [
      Embedding(vocab_size, emb_size, batch_input_shape=(batch_size, seq_size), mask_zero=True),
      Dropout(dropout_rate),
  ]

  # lstm layers, each returning the full sequence and followed by dropout
  for _ in range(num_layers):
    layers.append(LSTM(hidden_size, return_sequences=True))
    layers.append(Dropout(dropout_rate))

  # dense layer applied at every time step
  layers.append(TimeDistributed(Dense(vocab_size, activation='softmax')))

  return Sequential(layers)

In [15]:
model = create_model(vocab_size, emb_size, hidden_size, batch_size, seq_size, num_layers, dropout_rate)
# summary() prints the layer table itself and returns None, so wrapping it in
# print() would only add a stray "None" line after the table.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (20, 30, 500)             5000500   
_________________________________________________________________
dropout (Dropout)            (20, 30, 500)             0         
_________________________________________________________________
lstm (LSTM)                  (20, 30, 500)             2002000   
_________________________________________________________________
dropout_1 (Dropout)          (20, 30, 500)             0         
_________________________________________________________________
lstm_1 (LSTM)                (20, 30, 500)             2002000   
_________________________________________________________________
dropout_2 (Dropout)          (20, 30, 500)             0         
_________________________________________________________________
time_distributed (TimeDistri (20, 30, 10001)           5

In [16]:
# Commented-out alternative optimizer:
#optimizer = SGD(learning_rate=1, momentum=0.9)
# Train on categorical cross-entropy and report perplexity (ppl) as a metric.
# NOTE(review): the string 'adam' is passed here, so the `optimizer` object
# created in the Params cell is not the one used for training — confirm intent.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=[ppl])

In [17]:
# Stop training when val_loss has not improved for 5 epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, verbose=1, mode='min')
# Halve the learning rate as soon as val_loss fails to improve (patience=0).
# NOTE(review): reduce_lr is defined here but is not in the callbacks list
# passed to the training call in this notebook — confirm whether that is intended.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=0, verbose=1)

## Train

In [18]:
# Each batch consumes batch_size * seq_size tokens (the generators advance by
# seq_size per sample), so this many steps is one pass over the split.
train_steps = len(train_data)//(batch_size*seq_size)
validation_steps = len(val_data)//(batch_size*seq_size)

# Model.fit accepts Python generators directly; fit_generator is deprecated.
# Keyword arguments make the meaning of each value explicit.
model.fit(
    train_data_generator.generate(),
    steps_per_epoch=train_steps,
    epochs=epochs,
    validation_data=valid_data_generator.generate(),
    validation_steps=validation_steps,
    callbacks=[early_stopping],
)

Epoch 1/100




Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 00021: early stopping


<keras.callbacks.History at 0x7f90f1c70390>

## Test

In [19]:
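# Evaluate on the test set after training (currently disabled).
# NOTE(review): before re-enabling, pass test_data_generator.generate() rather
# than the KerasBatchGenerator object itself, which defines no iteration methods.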
# test_results = model.evaluate_generator(test_data_generator, len(test_data)//(batch_size*seq_size))

# print('Test perplexity: {0}'.format(test_results))
# print('Test perplexity: {0}'.format(K.exp(test_results)))