# Word Prediction Using RNNs

Read texts, train an RNN, plot results, and generate sentences.

Starting point was https://github.com/fchollet/keras/blob/master/examples/pretrained_word_embeddings.py


## Set Parameters

In [None]:
# set parameters

# -- training setup --
TRAIN_AMOUNT = 1.0  # fraction of the available training tokens that is used
NEPOCHS = 3  # number of epochs passed to fit()
LAYERS = 1  # number of stacked RNN layers
DROPOUT = 0  # rate given to the Dropout layer after each RNN layer
NVOCAB = 10000  # number of most-frequent tokens kept in the vocabulary
EMBEDDING_DIM = 50  # word vector size; also selects which GloVe file is read
NHIDDEN = EMBEDDING_DIM  # units in each RNN layer
N = 5  # n-gram size: N-1 context words are used to predict the next word  #. goto 10
RNN_CLASS_NAME = 'GRU'  # looked up among SimpleRNN / LSTM / GRU
BATCH_SIZE = 32
INITIAL_EPOCH = 0  # to continue training
TRAINABLE = False  # train word embedding matrix? if True will slow down training ~2x
#SAMPLES_PER_EPOCH = 1000 # out of 1 million words... for use with fit_generator
#VALIDATION_SAMPLES = 1000
PATIENCE = 10  # stop after this many epochs of no improvement
#LOSS_FN = 'categorical_crossentropy' # allows calculation of top_k_accuracy, but requires one-hot encoding y values
LOSS_FN = 'sparse_categorical_crossentropy'
OPTIMIZER = 'adam'
NVALIDATE = 10000  # tokens reserved for the validation set
NTEST = 10000  # tokens reserved for the test set

# -- these are less likely to be changed --
#VALIDATION_SPLIT = 0.05
#TEST_SPLIT = 0.05
#TRAIN_SPLIT = (1 - VALIDATION_SPLIT - TEST_SPLIT)
#TOP_PREDICTIONS = 3 # top number of predictions to be considered for relevance score
SEED = 0  # seed for the python and numpy random generators

# -- file locations, all relative to BASE_DIR --
BASE_DIR = '..'
TEXT_DIR = '/'.join([BASE_DIR, 'data', 'gutenbergs'])
GLOVE_DIR = '/'.join([BASE_DIR, '_vectors', 'glove.6B'])
GLOVE_FILE = '/'.join([GLOVE_DIR, 'glove.6B.%dd.txt' % EMBEDDING_DIM])
MODEL_DIR = '/'.join([BASE_DIR, 'models', 'gutenbergs'])
# the checkpoint name encodes the settings that determine the model's shape and data
MODEL_FILE = '/'.join([
    MODEL_DIR,
    "model-train_amount-%s-nvocab-%d-embedding_dim-%d-nhidden-%d-n-%d.h5"
    % (TRAIN_AMOUNT, NVOCAB, EMBEDDING_DIM, NHIDDEN, N),
])
print(MODEL_FILE)

## Import

In [None]:
# import python modules

from __future__ import print_function, division
import sys
print(sys.version)
import os
import os.path
import random
random.seed(SEED)
#import codecs
import re
import heapq

In [None]:
# import libraries ~10s

import numpy as np
np.random.seed(SEED)
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from nltk import tokenize

from keras.preprocessing.text import Tokenizer
#from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Activation, Dropout
from keras.models import Model
from keras.models import Sequential
#from keras.models import load_model
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import SimpleRNN, LSTM, GRU
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint
from keras.metrics import top_k_categorical_accuracy

In [None]:
# map the configured class name onto the imported Keras recurrent layer class
rnn_classes = dict(SimpleRNN=SimpleRNN, LSTM=LSTM, GRU=GRU)
RNN_CLASS = rnn_classes[RNN_CLASS_NAME]

## Read Text

In [None]:
# read texts ~ 0.2sec

# Concatenate every regular *.txt file in TEXT_DIR (in sorted filename order)
# into one string, normalising line endings and curly double quotes.
print('Reading texts')
chunks = []
for filename in sorted(os.listdir(TEXT_DIR)):
    filepath = TEXT_DIR +'/' + filename
    if not (filename.endswith('.txt') and os.path.isfile(filepath)):
        continue
    print(filepath)
    #with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
    with open(filepath, 'r', encoding='utf-8', errors='surrogateescape') as f:
        s = f.read()
    replacements = (
        ('\r\n', '\n'),
        ('“', '"'),  # nltk tokenizer doesn't recognize these windows cp1252 characters
        ('”', '"'),
    )
    for old, new in replacements:
        s = s.replace(old, new)
    chunks.append(s)
text = ''.join(chunks)
print('done')

In [None]:
# split text into paragraphs, shuffle, and recombine ~0.2sec

# a paragraph boundary is a run of two or more newlines
paragraph_sep = re.compile(r"\n\n+")
paragraphs = paragraph_sep.split(text)
print('nparagraphs',len(paragraphs)) # 22989

# reseed so the shuffle order is reproducible regardless of earlier random calls
random.seed(SEED+6)
random.shuffle(paragraphs)

text = '\n\n'.join(paragraphs)
del paragraphs
print(text[:1000]) # show sample text

## Tokenize Text

In [None]:
# 15s
# lowercase the whole text, then split it into word/punctuation tokens with nltk
tokens = tokenize.word_tokenize(text.lower())

In [None]:
# show the first 100 tokens as a sanity check
print(tokens[:100])

In [None]:
# 1sec

# find the top NVOCAB words

# Only `tokenize` is imported from nltk in this notebook, so the bare name
# `nltk` is undefined here; import FreqDist explicitly instead.
from nltk import FreqDist

token_freqs = FreqDist(tokens)
token_counts = token_freqs.most_common(NVOCAB)  # [(token, count), ...], most frequent first

# index 0 is reserved for the oov/unknown token '~'; vocabulary words follow
# in frequency order starting at index 1
index_to_token = ['~'] + [token for token, _ in token_counts]
token_to_index = {token: i for i, token in enumerate(index_to_token)}

print(index_to_token[:10])


In [None]:
# convert words to iwords, ignoring oov (out of vocabulary) words ~1 sec

# .get() returns None for tokens outside the vocabulary; the truthiness test
# drops those, and also drops index 0 (the reserved '~' slot).
sequence = [itoken for itoken in map(token_to_index.get, tokens) if itoken]
nelements = len(sequence)
# np.int was only an alias of the builtin int and has been removed from NumPy,
# so use the builtin directly (same resulting dtype).
sequence = np.array(sequence, dtype=int)

In [None]:
# show the first 100 word indexes as a sanity check
print(sequence[:100])

In [None]:
# word <-> index lookups under the names used by the rest of the notebook
word_to_iword = token_to_index
iword_to_word = dict(enumerate(index_to_token))

In [None]:
#%%time
#
## tokenize text into word indexes ~ 5sec
#
#texts = [text] # just one giant text
#tokenizer = Tokenizer(nb_words=NVOCAB) # removes all punctuation but '
##tokenizer = Tokenizer(nb_words=NVOCAB, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n') # default
##tokenizer = Tokenizer(nb_words=NVOCAB, filters='#$%*+<=>@[\\]^_{|}~\t\n') # punctuation sticks to words
#tokenizer.fit_on_texts(texts)
#sequences = tokenizer.texts_to_sequences(texts)
#sequence = sequences[0] 
#nelements = len(sequence) 
#sequence = np.array(sequence, dtype=np.int)
#word_to_iword = tokenizer.word_index # dictionary
#iword_to_word = {v:k for k,v in word_to_iword.items()} # invert dictionary

In [None]:
# print some info

print('nelements',nelements) # number of in-vocabulary tokens in the sequence
print(sequence[:100]) # sample of tokens
print('unique tokens in tokenized text', len(word_to_iword)) # size of the word->index dictionary
print('word "the" =', word_to_iword['the'])
#iperiod = word_to_iword['.']
#print('token ".":',iperiod)
print('iword 99 =',iword_to_word[99])

# the nine lowest indexes after the reserved slot 0 ...
for i in range(1,10):
    print(i,iword_to_word[i])
# ... and the nine highest indexes, from the top down
nunique = len(word_to_iword)
for i in reversed(range(nunique-9, nunique)):
    print(i,iword_to_word[i])

# alphabetical view of the vocabulary plus a random sample of it
words = sorted(word_to_iword)
print('first words in dictionary',words[:100])
print('sample words in dictionary',random.sample(words,100))
del words

## Get Embedding Matrix

In [None]:
# read word vectors ~ 15sec

# Each line of the GloVe file is split on whitespace: the first field is the
# word, the remaining fields are its float coefficients.
print('Reading word vectors...')
word_vectors = {}
with open(GLOVE_FILE, encoding='utf8') as f:
    for line in f:
        fields = line.split()
        word_vectors[fields[0]] = np.asarray(fields[1:], dtype='float32')

In [None]:
# print some info

# lots of weird words/names in word vector list, since taken from wikipedia - 
# buttonquail, vaziri, balakirev, 41, foo.com, podicipedidae, morizet, cedel, formula_75

nvectors = len(word_vectors)
print('Found %s word vectors.' % nvectors)
print('Will use a vocabulary of %d tokens' % NVOCAB)
print('token "a":',word_vectors['a'])
# iterating a dict yields its keys, so this lists the first ten words
print('some words in word vector list:',list(word_vectors)[:10])

In [None]:
# build embedding matrix of the top nvocab words ~30ms

nwords = min(NVOCAB, len(word_to_iword))
# Always allocate NVOCAB+1 rows (indexes 0..NVOCAB): the Embedding layer is
# created with input_dim=NVOCAB+1, so E must have exactly that many rows even
# when the texts contain fewer than NVOCAB distinct tokens.
E = np.zeros((NVOCAB + 1, EMBEDDING_DIM))
for word, iword in word_to_iword.items():
    if iword > NVOCAB:
        continue  # outside the matrix
    word_vector = word_vectors.get(word)
    # words not found in embedding index will be all zeros
    if word_vector is not None:
        E[iword] = word_vector

In [None]:
# len(E) is the number of rows in the embedding matrix; row 1 belongs to the
# first (most frequent) vocabulary word
print('number of word vectors in matrix E',len(E))
print('example word vector:',E[1])

In [None]:
# clear some memory
#del text
#del texts
#del word_vectors

## Split Data

In [None]:
# initialize

# whatever is not reserved for validation and test is available for training;
# TRAIN_AMOUNT selects the fraction of it that is actually used
ntrain_total = nelements - NVALIDATE - NTEST
ntrain = int(ntrain_total * TRAIN_AMOUNT)

for label, count in (
    ('total training tokens available:', ntrain_total),
    ('training tokens that will be used:', ntrain),
    ('validation tokens:', NVALIDATE),
    ('test tokens:', NTEST),
):
    print(label, count)

In [None]:
def create_dataset(data, noffset, nelements, ncontext):
    """
    Slide a window over a sequence to build (context, next value) pairs.

    data - indexable/sliceable sequence of integers representing words.
    noffset - index of the first window's first element
    nelements - length of the stretch of data to process, starting at noffset
    ncontext - number of values in each context window

    Returns (x, y) as numpy arrays: x holds the windows, y the value that
    immediately follows each window. nelements - ncontext pairs are produced.

    e.g. create_dataset([0,1,2,3,4,5,6,7,8,9], 2, 6, 3) =>
         ([[2 3 4],[3 4 5],[4 5 6]], [5 6 7])
    """
    starts = range(noffset, noffset + nelements - ncontext)
    windows = [data[start:start + ncontext] for start in starts]
    targets = [data[start + ncontext] for start in starts]
    return np.array(windows), np.array(targets)

In [None]:
# create train, validate, test sets ~ 5sec

# Training windows start at the beginning of the sequence. Validation and test
# use negative offsets, i.e. they are counted back from the end of the
# sequence: the last NTEST tokens are the test stretch and the NVALIDATE
# tokens just before them are the validation stretch.
x_train, y_train = create_dataset(sequence, noffset=0, nelements=ntrain, ncontext=N-1)
x_validate, y_validate = create_dataset(sequence, noffset=-NTEST-NVALIDATE, nelements=NVALIDATE, ncontext=N-1)
x_test, y_test = create_dataset(sequence, noffset=-NTEST, nelements=NTEST, ncontext=N-1)

In [None]:
# print info

# each set has (tokens in its stretch) - (N-1) samples, because every sample
# needs N-1 context tokens before its target
print('train data size',len(x_train))
print('validation data size',len(x_validate)) # NVALIDATE - (N-1)
print('test data size',len(x_test)) # ditto
print('x_train sample',x_train[:5])
print('y_train sample',y_train[:5])

## Build Model

In [None]:
# define the RNN model

model = Sequential()

# embedding layer: maps each of the N-1 context word indexes (0..NVOCAB) to an
# EMBEDDING_DIM vector, initialised from the embedding matrix E
embedding_layer = Embedding(input_dim=NVOCAB+1, output_dim=EMBEDDING_DIM, 
                            input_length=N-1, weights=[E])
model.add(embedding_layer)
model.layers[-1].trainable = TRAINABLE

# hidden RNN layer(s), each followed by dropout.
# Every layer except the last must return the full sequence so the next RNN
# layer has a sequence to consume; the last returns only its final state.
# (A loop instead of an if/elif chain also supports more than 3 layers.)
for ilayer in range(LAYERS):
    is_last_layer = (ilayer == LAYERS - 1)
    model.add(RNN_CLASS(NHIDDEN, return_sequences=not is_last_layer))
    model.add(Dropout(DROPOUT))

# output layer - convert nhidden to one score per word index.
# Word indexes run from 0 to NVOCAB inclusive (index 0 is the reserved oov
# slot), so NVOCAB+1 outputs are needed; with only NVOCAB outputs the label
# NVOCAB would be out of range for the sparse cross-entropy loss.
model.add(Dense(NVOCAB + 1))
#model.add(TimeDistributedDense(NVOCAB)) # q. how different from Dense layer?

# convert scores to probabilities - expensive
model.add(Activation('softmax'))

In [None]:
# compile the model ~ 1 sec

# accuracy is tracked in addition to the loss configured by LOSS_FN
metrics = ['accuracy'] # loss is always the first metric returned from the fit method
model.compile(loss=LOSS_FN, optimizer=OPTIMIZER, metrics=metrics)

## Define Functions

In [None]:
def get_best_iword_probs(probs, k):
    """
    Pick the k most probable word indexes and rescale their probabilities to sum to 1.

    probs - 2d array-like; only the first row (probs[0]) is used
    k - how many candidates to keep

    Returns a list of (iword, probability) pairs, most probable first.
    e.g. get_best_iword_probs([[0.1,0.2,0.3,0.4]], 2) => [(3,0.57),(2,0.43)]
    """
    candidates = list(enumerate(probs[0]))
    # k highest-probability (index, prob) pairs, in descending order
    top = heapq.nlargest(k, candidates, key=lambda pair: pair[1])
    # renormalise over the kept candidates only
    total = sum(prob for _, prob in top)
    return [(iword, prob / total) for iword, prob in top]

In [None]:
# test
# expected: indexes 3 and 2 with probabilities of roughly 0.57 and 0.43
probs = np.array([[0.1,0.2,0.3,0.4]])
iword_probs = get_best_iword_probs(probs, 2)
iword_probs

In [None]:
def choose_iwords(iword_probs, k):
    """
    Choose k words at random (with replacement), weighted by their probabilities.

    iword_probs - list of (iword, probability) pairs
    k - number of words to draw

    Returns a numpy array of k word indexes.
    eg choose_iwords([(3,0.5),(2,0.3),(9,0.2)], 2) => [3,9] 
    """
    iwords_all = [iword for iword, _ in iword_probs]
    # Renormalise in float64: np.random.choice checks that p sums to 1 within
    # a tight tolerance, which probabilities computed in float32 can miss.
    probs = np.asarray([prob for _, prob in iword_probs], dtype=np.float64)
    probs = probs / probs.sum()
    #. choose without replacement?
    # The weights must be passed as the keyword p: the third positional
    # parameter of np.random.choice is `replace`, so passing them positionally
    # silently results in a uniform (unweighted) choice.
    iwords = np.random.choice(iwords_all, size=k, replace=True, p=probs)
    return iwords

In [None]:
# test
# draws two word indexes from {3, 2, 9}; the result varies from run to run
choose_iwords([(3,0.5),(2,0.3),(9,0.2)], 2)

In [None]:
#. make stochastic beam search
#. when have punctuation, start with period 
#. stop when reach a period or max words
#. ->generate_sentence
#. k->beam_width
def generate_text(model, nwords=10, k=5):
    """
    Generate text from the given model with semi stochastic search.

    Starting from an all-zero context of N-1 word indexes, repeatedly asks the
    model for next-word probabilities, keeps the k most probable words, draws
    one of them at random, and feeds it back in as the newest context word.

    model - model providing predict_proba()
    nwords - number of words to generate
    k - number of top candidates to draw from at each step

    Returns the generated words joined by single spaces.
    """
    context = np.zeros((1,N-1), dtype=int)
    next_iword = 0
    generated = []
    for _ in range(nwords):
        # shift the context one slot to the left and put the latest word last
        context = np.roll(context, -1)
        context[0,-1] = next_iword
        probs = model.predict_proba(context, verbose=0)
        candidates = get_best_iword_probs(probs, k)
        next_iword = choose_iwords(candidates, 1)[0] # choose randomly
        generated.append(iword_to_word[next_iword])
    return ' '.join(generated)

## Train Model

In [None]:
# define callbacks

class Print_Sentence(Callback):
    """Callback that prints a sample of generated text at the start of every epoch."""

    def on_epoch_begin(self, epoch, logs=None):
        """
        Generate text with the model being trained and print it.

        epoch - index of the epoch that is about to start
        logs - unused; defaults to None rather than a shared mutable dict
        """
        sentence = generate_text(self.model)
        print('Epoch %d generated text:' % epoch, sentence)

#class BatchRecorder(Callback):
#    def on_train_begin(self, logs={}):
#        self.data = []
#    def on_batch_end(self, batch, logs={}):
#        row = [batch, logs.get('loss'), logs.get('acc')]
#        self.data.append(row)

# print generated text every epoch, and save the model to MODEL_FILE whenever
# the monitored validation accuracy reaches a new maximum
print_sentence = Print_Sentence()
checkpoint = ModelCheckpoint(MODEL_FILE, monitor='val_acc', save_best_only=True, mode='max')
#early_stopping = EarlyStopping(monitor='val_acc', patience=PATIENCE)
#batch_recorder = BatchRecorder()

callbacks = [print_sentence, checkpoint]

In [None]:
# Reset first: if training is interrupted, fit() never returns, and without
# this a `history` left over from an earlier run of this cell would be
# silently reused by the cells below.
history = None
try:
    history = model.fit(x_train, y_train, batch_size=BATCH_SIZE, nb_epoch=NEPOCHS, 
                        validation_data=(x_validate, y_validate),
                        callbacks=callbacks)
except KeyboardInterrupt:
    # allow stopping training by hand and still use the partly trained model
    print('Training interrupted - no training history is available')

print('Final epoch generated text:', generate_text(model))
print()

In [None]:
#. convert to pandas table
#print(batch_recorder.data)

## Plot Results

In [None]:
# shorthand for the recorded metrics; indexed below by metric name
# ('loss', 'val_loss', 'acc', 'val_acc')
h = history.history

In [None]:
# plot loss vs epoch
# one curve per recorded series: training loss and validation loss
for key, label in (('loss', 'Training'), ('val_loss', 'Validation')):
    plt.plot(h[key], label=label)
plt.title("Training and Validation Loss vs Epoch")
plt.xlabel('epoch-1')
plt.ylabel('loss')
plt.legend();

In [None]:
# plot accuracy vs epoch
# one curve per recorded series: training accuracy and validation accuracy
for key, label in (('acc', 'Training'), ('val_acc', 'Validation')):
    plt.plot(h[key], label=label)
plt.title("Training and Validation Accuracy vs Epoch")
plt.xlabel('epoch-1')
plt.ylabel('accuracy')
plt.legend();

## Evaluate Model

In [None]:
#model.evaluate(x_test)

In [None]:
#. calculate perplexity - use model.predict_proba()

# is this right? ask on stacko? do calcs for simple case?
# NOTE(review): this exponentiates each epoch's recorded validation loss. That
# equals perplexity only if the loss is the mean natural-log cross-entropy per
# predicted word - TODO confirm for LOSS_FN.
np.exp(history.history['val_loss'])



## Generate Text

In [None]:
# print several independently generated samples, each drawn from the
# k most probable words at every step
nsentences = 10
nwords_to_generate = 10
k = 10
for _ in range(nsentences):
    sample = generate_text(model, nwords_to_generate, k)
    print(sample)

## Visualize Embeddings

In [None]:
from sklearn.decomposition import PCA

# look up the embedding vectors (rows of E) for a handful of words;
# every word listed here must be in the vocabulary
words = 'alice rabbit mouse said was fell small white gray'.split()
print('words',words)
iwords = [word_to_iword[word] for word in words]
print('iwords',iwords)
vecs = [E[iword] for iword in iwords]
# 'alice' is the first word in the list, so its vector is vecs[0]
# (vecs[1] is the vector for 'rabbit')
print('word embedding for alice',vecs[0])

# now want to reduce dims of these vectors
# fit a 2-component PCA on the selected vectors and project them onto it
pca = PCA(n_components=2)
pca.fit(vecs)
vecnew = pca.transform(vecs)
print('some projections',vecnew[:3])

In [None]:
# now plot the new vectors with labels
# first and second PCA coordinate of every projected word vector
x = [vec[0] for vec in vecnew]
y = [vec[1] for vec in vecnew]
plt.scatter(x, y)

# label each point with its word, nudged slightly up and to the right
for word, xpos, ypos in zip(words, x, y):
    plt.annotate(word, (xpos+0.1,ypos+0.1))

plt.title("Word embeddings projected to 2D")
plt.xlabel("PCA 1")
plt.ylabel("PCA 2");