# Word-level text generator with LSTM

## Import libraries

In [1]:
from keras.models import Sequential
from keras.layers import Embedding, Dropout, LSTM, TimeDistributed, Dense
from keras.optimizers import SGD
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from keras import backend as K

import numpy as np

## Params

In [2]:
# Training schedule: maximum number of epochs, sequences per batch,
# and tokens per sequence.
epochs, batch_size, seq_size = 100, 20, 30

# Network size: embedding width, LSTM hidden width, number of stacked LSTM layers.
emb_size = hidden_size = 500
num_layers = 2

# Dropout probability used by every Dropout layer of the model.
dropout_rate = 0.5

## Load Data

In [3]:
# Mount Google Drive so the dataset files are reachable under /content/drive/.
# NOTE(review): google.colab is presumably only importable on Colab, so this
# cell is expected to fail elsewhere — confirm before running locally.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [4]:
# Dataset folder on the mounted drive, and the three corpus splits inside it.
data_path = '/content/drive/MyDrive/nlu-data'
train_path = f'{data_path}/train.txt'
valid_path = f'{data_path}/valid.txt'
test_path = f'{data_path}/test.txt'

In [5]:
def read_file(path):
  """Read a whitespace-tokenised text file into a flat list of words.

  Every line of the file (including empty ones) contributes its words
  followed by one '<eos>' marker.

  Args:
    path: path of the text file to read.

  Returns:
    list[str]: all tokens of the file in reading order.
  """
  result = []
  # Explicit encoding: the default of open() depends on the platform locale.
  with open(path, 'r', encoding='utf-8') as file:
    for line in file:
      # add <eos> tag at the end of every sent
      result.extend(line.split())
      result.append('<eos>')
  return result

In [6]:
# Shared lookup tables. Ids 0 and 1 are reserved for the special tokens;
# make_vocab() adds the remaining words and fills int2word.
word2int = {'<unk>': 0, '<eos>': 1}
int2word = []

def convertWord2Int(word):
  """Return the integer id of `word`.

  Words that are not in the vocabulary map to the id of '<unk>' instead of
  raising KeyError, so held-out text with unseen words can still be encoded.
  """
  return word2int.get(word, word2int['<unk>'])

def convertInt2Word(int_v):
  """Return the word stored at index `int_v` of int2word.

  Raises:
    IndexError: if `int_v` is outside the current int2word list.
  """
  return int2word[int_v]

Build the vocabulary from a list of words.

In [7]:
def make_vocab(word_list):
  """Extend the global word2int / int2word tables with the words in `word_list`.

  New words receive consecutive ids in alphabetical order, continuing after
  the ids already present in word2int. The special tokens '<pad>', '<unk>'
  and '<eos>' are never added here.

  The function is safe to call more than once (e.g. when the cell is re-run):
  words that already have an id keep it, and int2word is rebuilt rather than
  appended to, so it never contains duplicates.

  Args:
    word_list: iterable of string tokens.
  """
  special_tokens = ('<pad>', '<unk>', '<eos>')

  # create vocab set
  vocab = {token for token in word_list if token not in special_tokens}

  for token in sorted(vocab):
    # setdefault leaves the id of an already-known word untouched
    word2int.setdefault(token, len(word2int))

  # Rebuild in place (keeps the module-level list object) ordered by id.
  int2word[:] = sorted(word2int, key=word2int.get)

In [8]:
def data_loader(path, is_make_vocab=False, batch_size=50, seq_size = 5):
  """Load a corpus file and build next-word-prediction inputs and targets.

  The file is tokenised with read_file(), optionally used to build the
  vocabulary, converted to integer ids, and truncated to the largest length
  that is a multiple of batch_size * seq_size.

  Args:
    path: path of the corpus file.
    is_make_vocab: when True, make_vocab() is called on the tokens first.
    batch_size: number of sequences per batch (used only for truncation).
    seq_size: number of tokens per sequence (used only for truncation).

  Returns:
    (input, target): `input` is a list of token ids; `target` is a numpy
    array of the same length holding the id of the token that follows each
    input position. The last target is the real next token of the corpus
    when one exists past the truncation point, and 0 otherwise.
  """
  # read file and tokenize
  words = read_file(path)

  if is_make_vocab:
    make_vocab(words)

  # transform word list to int list
  words_int = [convertWord2Int(w) for w in words]

  # shrink to a whole number of (batch_size x seq_size) blocks
  tot_in_batch = batch_size * seq_size
  cut_size = tot_in_batch * (len(words_int) // tot_in_batch)
  inputs = words_int[:cut_size]

  # target shifted by one pos; take the shifted window from the untruncated
  # stream so the final position also gets its true successor when available
  target = np.zeros_like(inputs)
  shifted = words_int[1:cut_size + 1]
  target[:len(shifted)] = shifted

  return inputs, target

In [9]:
def batchify(data, batch_size, seq_size):
  """Cut a flat id stream into rows of `seq_size` tokens.

  The stream is first laid out as `batch_size` rows, then sliced column-wise
  into chunks whose slices are stacked on top of each other, chunk after
  chunk.

  Args:
    data: flat sequence of token ids.
    batch_size: number of rows of the intermediate layout.
    seq_size: intended width of each column chunk.

  Returns:
    numpy array containing the stacked chunks.
  """
  rows = np.array(data).reshape((batch_size, -1))
  n_chunks = rows.shape[1] // seq_size
  return np.concatenate(np.split(rows, n_chunks, axis=1))

def SeqAndBatch(input, target, batch_size, seq_size):
  """Apply batchify() to the input stream and then to the target stream.

  Returns:
    tuple: (batchified input, batchified target).
  """
  return tuple(
      batchify(stream, batch_size, seq_size) for stream in (input, target)
  )

Load Train, Val and Test data

In [10]:
# The training split is loaded first because it is the one that builds the
# vocabulary; the other two splits are only encoded with it.
train_in, train_tr = data_loader(
    train_path,
    is_make_vocab=True,
    batch_size=batch_size,
    seq_size=seq_size,
)
val_in, val_tr = data_loader(
    valid_path,
    is_make_vocab=False,
    batch_size=batch_size,
    seq_size=seq_size,
)
test_in, test_tr = data_loader(
    test_path,
    is_make_vocab=False,
    batch_size=batch_size,
    seq_size=seq_size,
)

# Number of distinct token ids, special tokens included.
vocab_size = len(int2word)

print('Vocabulary size: ', vocab_size)

Vocabulary size:  10000


Split the token streams into fixed-length sequences and arrange them into batches.

In [11]:
# Turn each (input, target) id stream into a matrix of seq_size-long rows.
train_input, train_target = SeqAndBatch(
    input=train_in, target=train_tr, batch_size=batch_size, seq_size=seq_size)
val_input, val_target = SeqAndBatch(
    input=val_in, target=val_tr, batch_size=batch_size, seq_size=seq_size)
test_input, test_target = SeqAndBatch(
    input=test_in, target=test_tr, batch_size=batch_size, seq_size=seq_size)

# Show the training matrices and their shapes as a quick sanity check.
print(train_input, train_input.shape, sep='\n')
print()
print(train_target, train_target.shape, sep='\n')

[[ 237  807  950 ...   45 9965 6172]
 [ 983 4748 9119 ...   63 9119 1829]
 [4470  536 4144 ...    0 2498   11]
 ...
 [6142   45   45 ...   45    1 2926]
 [2768 8177 9779 ... 5339 6244 3772]
 [7607 3650 9245 ... 3884 5548 9000]]
(30980, 30)

[[ 807  950 1325 ... 9965 6172 9838]
 [4748 9119 7107 ... 9119 1829 7462]
 [ 536 4144  897 ... 2498   11   45]
 ...
 [  45   45    1 ...    1 2926 4555]
 [8177 9779 6410 ... 6244 3772   48]
 [3650 9245 2817 ... 5548 9000    0]]
(30980, 30)


## Model

Define perplexity function

In [12]:
def ppl(y_true, y_pred):
  """Perplexity metric: exponential of the mean sparse categorical cross-entropy.

  Args:
    y_true: integer class ids.
    y_pred: predicted probability distributions.

  Returns:
    Backend tensor holding exp(mean(cross-entropy)) over the given values.
  """
  return K.exp(K.mean(K.sparse_categorical_crossentropy(y_true, y_pred)))

Create LSTM Model

In [13]:
def create_model(vocab_size, emb_size, hidden_size, batch_size, seq_size, num_layers, dropout_rate):
  """Build the word-level LSTM language model.

  Architecture: Embedding -> Dropout -> num_layers x (LSTM -> Dropout)
  -> per-timestep Dense softmax over the vocabulary.

  Args:
    vocab_size: number of token ids (embedding rows and softmax width).
    emb_size: embedding dimension.
    hidden_size: number of units of every LSTM layer.
    batch_size: fixed batch dimension of the model input.
    seq_size: fixed number of timesteps of the model input.
    num_layers: number of stacked LSTM layers.
    dropout_rate: rate used by every Dropout layer.

  Returns:
    An uncompiled keras Sequential model.
  """
  model = Sequential()

  #embedding layer
  # Masking stays off: in this notebook id 0 is the real token '<unk>', not
  # padding, and the inputs are truncated rather than padded. With
  # mask_zero=True every '<unk>' timestep would be skipped by the LSTMs and
  # dropped from the loss.
  model.add(Embedding(vocab_size, emb_size, batch_input_shape=(batch_size, seq_size), mask_zero=False))
  model.add(Dropout(dropout_rate))

  # lstm layers
  for _ in range(num_layers):
    model.add(LSTM(hidden_size, return_sequences=True))
    model.add(Dropout(dropout_rate))

  # fc layer + softmax
  model.add(TimeDistributed(Dense(vocab_size, activation='softmax')))

  return model

In [14]:
# Build the network from the hyperparameters defined in the params cell.
model = create_model(vocab_size, emb_size, hidden_size, batch_size, seq_size, num_layers, dropout_rate)
# summary() prints the table itself and returns None, so wrapping it in
# print() would add a stray "None" line to the output.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (20, 30, 500)             5000000   
_________________________________________________________________
dropout (Dropout)            (20, 30, 500)             0         
_________________________________________________________________
lstm (LSTM)                  (20, 30, 500)             2002000   
_________________________________________________________________
dropout_1 (Dropout)          (20, 30, 500)             0         
_________________________________________________________________
lstm_1 (LSTM)                (20, 30, 500)             2002000   
_________________________________________________________________
dropout_2 (Dropout)          (20, 30, 500)             0         
_________________________________________________________________
time_distributed (TimeDistri (20, 30, 10000)           5

## Train

In [15]:
# Plain SGD with momentum.
optimizer = SGD(
    learning_rate=1,
    momentum=0.9,
)

# early stopping and reduce learning rate
# Both callbacks watch the validation loss: the learning rate is halved as
# soon as it stops improving (patience=0), and training is stopped after
# 5 epochs without improvement.
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=0,
    verbose=1,
)
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    verbose=1,
)

In [None]:
# Train with cross-entropy on integer targets, reporting perplexity as a metric.
model.compile(
    optimizer=optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=[ppl],
)
history = model.fit(
    x=train_input,
    y=train_target,
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[early_stopping, reduce_lr],
    validation_data=(val_input, val_target),
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100

## Test

In [None]:
# test after training
# The model is compiled with metrics=[ppl], so evaluate() returns a list
# [loss, ppl] rather than a single number; unpack it before reporting.
test_loss, _test_ppl_metric = model.evaluate(test_input, test_target, batch_size)

print('Test loss: {0}'.format(test_loss))
# Perplexity is the exponential of the mean cross-entropy; np.exp yields a
# plain float instead of a backend tensor.
print('Test perplexity: {0}'.format(np.exp(test_loss)))