# Word Prediction Using RNNs

Read texts, train an RNN, plot results, and generate sentences.

## Import

In [None]:
# import python modules

import sys
import os
import os.path
import random
import re
import heapq
from importlib import reload

print(sys.version)

# SEED has to exist before the first seeding call: this cell (and the library
# import cell, which seeds numpy) runs before the parameters cell, so using
# SEED here without defining it raises NameError on a fresh kernel.
# Keep this value in sync with SEED in the parameters cell.
SEED = 0 # random number seed
random.seed(SEED)

In [None]:
# import libraries ~10s

import numpy as np
np.random.seed(SEED)
import pandas as pd

from nltk import tokenize

#from keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Activation, Dropout
from keras.models import Model
from keras.models import Sequential
#from keras.models import load_model
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import SimpleRNN, LSTM, GRU
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint
from keras.metrics import top_k_categorical_accuracy

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# import local modules
import sys; sys.path.append('../src')
import data as datamodule
import util
reload(datamodule)
reload(util);

In [None]:
# define RNN class
#rnn_classes = {'SimpleRNN':SimpleRNN, 'LSTM':LSTM, 'GRU':GRU}
#RNN_CLASS = rnn_classes[RNN_CLASS_NAME]

## Set Parameters

In [None]:
# set parameters
#
# Hyperparameters, data-split sizes and file locations read by the cells below.

TRAIN_AMOUNT = 0.001 # fraction (not percent) of training data to use (for debugging), 0.0 to 1.0
NEPOCHS = 3 # number of training epochs (passed to model.fit)
LAYERS = 1 # number of RNN layers, 1 to 3
DROPOUT = 0 # amount of dropout to apply after each layer, 0.0 to 1.0
NVOCAB = 10000 # number of vocabulary words to use
EMBEDDING_DIM = 50 # dimension of embedding layer - 50, 100, 150, 200 (also selects the GloVe file below)
TRAINABLE = False # train word embedding matrix? if True will slow down training ~2x
NHIDDEN = EMBEDDING_DIM # size of each RNN layer; equal to the embedding size seemed to work best
N = 5 # amount to unfold recurrent network (the model input is N-1 tokens long)
RNN_CLASS = GRU # type of RNN to use - SimpleRNN, LSTM, or GRU
BATCH_SIZE = 32 # size of batch to use for training
INITIAL_EPOCH = 0 # to continue training (not passed to model.fit in the training cell below)
PATIENCE = 3 # stop after this many epochs of no improvement (only referenced by the commented-out EarlyStopping)
#LOSS_FN = 'categorical_crossentropy' # allows calculation of top_k_accuracy, but requires one-hot encoding y values
LOSS_FN = 'sparse_categorical_crossentropy' # takes integer class targets, so y needs no one-hot encoding
OPTIMIZER = 'adam'
NVALIDATE = 10000 # number of tokens to use for validation
NTEST = 10000 # number of tokens to use for testing

SEED = 0 # random number seed; the import cells at the top seed `random` and numpy with it
#TOP_PREDICTIONS = 3 # top number of predictions to be considered for relevance score

DATASET = 'gutenbergs' # dataset name, passed to datamodule.Data and used in MODEL_DIR
BASE_DIR = '..' # the paths below are relative to the working directory
GLOVE_DIR = BASE_DIR + '/_vectors/glove.6B'
GLOVE_FILE = GLOVE_DIR + '/glove.6B.%dd.txt' % EMBEDDING_DIM # pretrained vectors matching EMBEDDING_DIM
MODEL_DIR = BASE_DIR + '/models/' + DATASET
# checkpoint file; its name records the settings that determine the model
MODEL_FILE = MODEL_DIR + "/model-train_amount-%s-nvocab-%d-embedding_dim-%d-nhidden-%d-n-%d.h5" % \
                         (TRAIN_AMOUNT, NVOCAB, EMBEDDING_DIM, NHIDDEN, N)
print(MODEL_FILE)

## Get Data

In [None]:
# create the data object for the dataset named by DATASET (local data module)
data = datamodule.Data(DATASET)

# prepare the data, passing the vocabulary size NVOCAB
data.prepare(nvocab=NVOCAB) # ~15sec to tokenize

In [None]:
# split data into train, validate, test sets
# n=N is the unfold length; NVALIDATE / NTEST are the validation and test sizes in tokens;
# TRAIN_AMOUNT is the fraction of the training data to keep.
# debug=1 presumably turns on diagnostic output - TODO confirm in data.split
x_train, y_train, x_validate, y_validate, x_test, y_test = data.split(n=N, nvalidate=NVALIDATE, 
                                                                      ntest=NTEST, train_amount=TRAIN_AMOUNT, debug=1)

## Get Embedding Matrix

In [None]:
# read word vectors
#
# The dict is cached in the notebook namespace: reading the vector file is slow,
# so it is only parsed when `word_vectors` does not exist yet.
try:
    word_vectors
except NameError: # only "name not defined yet" means the vectors must be loaded
    print('Reading word vectors ~15sec...')
    word_vectors = {}
    with open(GLOVE_FILE, encoding='utf8') as f:
        for line in f:
            # each line is parsed as: word, then its coefficients, whitespace-separated
            values = line.split()
            if not values:
                continue # skip blank lines instead of failing on values[0]
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            word_vectors[word] = coefs

In [None]:
# print some info about the loaded word vectors

# lots of weird words/names in word vector list, since taken from wikipedia - 
# buttonquail, vaziri, balakirev, 41, foo.com, podicipedidae, morizet, cedel, formula_75

print('Found %s word vectors.' % len(word_vectors))
print('Will use a vocabulary of %d tokens' % NVOCAB)
# show one vector as a sanity check (requires the token 'a' to be present)
print('token "a":',word_vectors['a'])
# first 10 keys of the dict, in its iteration order
print('some words in word vector list:',list(word_vectors.keys())[:10]) 

In [None]:
# build embedding matrix of the top nvocab words ~30ms

def get_embedding_matrix(data, word_vectors, nvocab):
    """Build the initial weight matrix for the embedding layer.

    Row ``i`` receives the pretrained vector of the word whose index in
    ``data.word_to_iword`` is ``i``. Rows for words that have no pretrained
    vector, and rows no word maps to, stay all zeros.

    Args:
        data: object exposing ``word_to_iword``, a dict mapping word -> integer index.
        word_vectors: dict mapping word -> 1-D vector; all vectors are assumed
            to have the same length.
        nvocab: maximum number of vocabulary words to include.

    Returns:
        numpy array of shape
        ``(min(nvocab, len(data.word_to_iword)) + 1, embedding_dim)``.

    Raises:
        ValueError: if ``word_vectors`` is empty, since the embedding
            dimension cannot be inferred.
    """
    if not word_vectors:
        raise ValueError('word_vectors is empty; cannot infer the embedding dimension')
    nwords = min(nvocab, len(data.word_to_iword))
    # take the dimension from any vector instead of requiring the key 'a'
    embedding_dim = len(next(iter(word_vectors.values())))
    E = np.zeros((nwords + 1, embedding_dim))
    for word, iword in data.word_to_iword.items():
        # bound by the matrix's real row count (not nvocab) so that indices
        # outside the matrix are skipped rather than raising IndexError
        if not 0 <= iword <= nwords:
            continue
        word_vector = word_vectors.get(word)
        # words not found in embedding index will be all zeros
        if word_vector is not None:
            E[iword] = word_vector
    return E
        
# embedding matrix for at most NVOCAB words; passed below as the initial
# weights of the embedding layer
E = get_embedding_matrix(data, word_vectors, NVOCAB)

In [None]:
# sanity check: number of rows in E and the vector stored for word index 1
print('number of word vectors in matrix E',len(E))
print('example word vector:',E[1])

## Build Model

In [None]:
# define the RNN model

model = Sequential()

# embedding layer - maps each of the N-1 input token indices to an
# EMBEDDING_DIM vector, starting from the pretrained matrix E
embedding_layer = Embedding(input_dim=NVOCAB+1, output_dim=EMBEDDING_DIM,
                            input_length=N-1, weights=[E])
model.add(embedding_layer)
model.layers[0].trainable = TRAINABLE

# hidden RNN layer(s), each followed by dropout
# (any LAYERS >= 1 is supported; previously values outside 1..3 silently
# produced a model with no recurrent layer at all)
if LAYERS < 1:
    raise ValueError('LAYERS must be at least 1, got %r' % (LAYERS,))
for ilayer in range(LAYERS):
    if ilayer < LAYERS - 1:
        # a layer feeding another recurrent layer must emit its output at
        # every timestep, not just the last one
        model.add(RNN_CLASS(NHIDDEN, return_sequences=True))
    else:
        model.add(RNN_CLASS(NHIDDEN))
    model.add(Dropout(DROPOUT))

# output layer - convert nhidden to nvocab
# NOTE(review): the embedding has NVOCAB+1 rows but this layer has NVOCAB
# outputs - confirm the range of target indices produced by data.split
model.add(Dense(NVOCAB))
#model.add(TimeDistributedDense(NVOCAB)) # q. how different from Dense layer?

# convert nvocab to probabilities - expensive
model.add(Activation('softmax'))

In [None]:
# compile the model ~ 1 sec
# uses the loss function and optimizer chosen in the parameters cell

metrics = ['accuracy'] # loss is always the first metric returned from the fit method
model.compile(loss=LOSS_FN, optimizer=OPTIMIZER, metrics=metrics)

## Train Model

In [None]:
# define callbacks

class Print_Sentence(Callback):
    """Keras callback that prints a generated text sample at the start of every epoch."""

    def on_epoch_begin(self, epoch, logs=None):
        """Generate text from the model being trained and print it.

        Args:
            epoch: epoch index passed in by Keras; shown in the printed line.
            logs: dict passed in by Keras; not used here. Defaults to None
                instead of a mutable ``{}`` shared between calls.
        """
        sentence = util.generate_text(self.model, data, N)
        print('Epoch %d generated text:' % epoch, sentence)

#class BatchRecorder(Callback):
#    def on_train_begin(self, logs={}):
#        self.data = []
#    def on_batch_end(self, batch, logs={}):
#        row = [batch, logs.get('loss'), logs.get('acc')]
#        self.data.append(row)

# prints a generated sample at the start of each epoch
print_sentence = Print_Sentence()
# writes the model to MODEL_FILE, keeping only the best one as judged by
# the highest validation accuracy ('val_acc')
checkpoint = ModelCheckpoint(MODEL_FILE, monitor='val_acc', save_best_only=True, mode='max')
#early_stopping = EarlyStopping(monitor='val_acc', patience=PATIENCE)
#batch_recorder = BatchRecorder()

# callbacks handed to model.fit in the training cell
callbacks = [print_sentence, checkpoint]

In [None]:
try:
    history = model.fit(x_train, y_train, batch_size=BATCH_SIZE, nb_epoch=NEPOCHS, 
                        validation_data=(x_validate, y_validate),
                        callbacks=callbacks)
except KeyboardInterrupt:
    pass

print('Final epoch generated text:', util.generate_text(model, data, N))
print()

In [None]:
#. convert to pandas table
#print(batch_recorder.data)

## Plot Results

In [None]:
# metrics recorded by model.fit; the cells below read the keys
# 'loss', 'val_loss', 'acc' and 'val_acc'
h = history.history

In [None]:
# plot loss vs epoch - one curve for the training set, one for the validation set
for history_key, curve_label in (('loss', 'Training'), ('val_loss', 'Validation')):
    plt.plot(h[history_key], label=curve_label)
plt.title("Training and Validation Loss vs Epoch")
plt.xlabel('epoch-1') # the x axis is the zero-based position in the history list
plt.ylabel('loss')
plt.legend();

In [None]:
# plot accuracy vs epoch - one curve for the training set, one for the validation set
for history_key, curve_label in (('acc', 'Training'), ('val_acc', 'Validation')):
    plt.plot(h[history_key], label=curve_label)
plt.title("Training and Validation Accuracy vs Epoch")
plt.xlabel('epoch-1') # the x axis is the zero-based position in the history list
plt.ylabel('accuracy')
plt.legend();

## Evaluate Model

In [None]:
#model.evaluate(x_test)

In [None]:
#. calculate perplexity - use model.predict_proba()

# Approximation used for now: exp(validation loss), one value per recorded epoch.
# NOTE(review): this is per-token perplexity only if val_loss is the mean
# cross-entropy in nats - is this right? ask on stacko? do calcs for simple case?
np.exp(history.history['val_loss'])



## Generate Text

In [None]:
# print several generated samples from the trained model
nsentences = 10 # number of samples to print
nwords_to_generate = 10
k = 10 # forwarded to util.generate_text as its last argument
for _ in range(nsentences):
    generated = util.generate_text(model, data, N, nwords_to_generate, k)
    print(generated)

## Visualize Embeddings

In [None]:
from sklearn.decomposition import PCA

# look up the embedding-matrix rows of a few hand-picked words
words = 'alice rabbit mouse said was fell small white gray'.split()
print('words',words)
iwords = [data.word_to_iword[word] for word in words]
print('iwords',iwords)
vecs = [E[iword] for iword in iwords]
# index by the word itself: 'alice' is the first entry, so vecs[1] (printed
# before) was actually the vector for 'rabbit'
print('word embedding for alice',vecs[words.index('alice')])

# now want to reduce dims of these vectors
pca = PCA(n_components=2)
pca.fit(vecs)
vecnew = pca.transform(vecs)
print('some projections',vecnew[:3])

In [None]:
# scatter the 2D projections and label every point with its word
x, y = (list(column) for column in zip(*vecnew))
plt.scatter(x, y)

label_offset = 0.1 # shift each label slightly up and to the right of its point
for word, xi, yi in zip(words, x, y):
    plt.annotate(word, (xi+label_offset, yi+label_offset))

plt.xlabel("PCA 1")
plt.title("Word embeddings projected to 2D")
plt.ylabel("PCA 2");