# RNN

Read a collection of texts, train an RNN language model to predict the next word, plot the training results, and generate sample text.

Adapted from https://github.com/fchollet/keras/blob/master/examples/pretrained_word_embeddings.py


## Set Parameters

In [None]:
# set parameters

# --- model and training settings ---
TRAIN_AMOUNT = 1.0  # fraction of the available training tokens that is actually used
NVOCAB = 10000  # vocabulary limit handed to the tokenizer; also the size of the output layer
EMBEDDING_DIM = 50  # selects which glove.6B.<dim>d.txt file is loaded
NHIDDEN = EMBEDDING_DIM  # number of hidden units in the RNN layer
N = 5  # n-gram length: N-1 context tokens are used to predict the following token
RNN_CLASS_NAME = 'GRU'  # key into the rnn_classes lookup ('SimpleRNN', 'LSTM' or 'GRU')
BATCH_SIZE = 32
DROPOUT = 0  # rate given to the Dropout layer that follows the RNN
NEPOCHS = 1
INITIAL_EPOCH = 0 # to continue training
TRAINABLE = False # train word embedding matrix? if True will slow down training ~2x
PATIENCE = 10 # stop after this many epochs of no improvement
LOSS_FN = 'sparse_categorical_crossentropy'  # works with integer targets (no one-hot encoding)
OPTIMIZER = 'adam'
NVALIDATE = 10000  # number of tokens held out for validation
NTEST = 10000  # number of tokens held out for testing

# alternatives that are currently switched off
#SAMPLES_PER_EPOCH = 1000 # out of 1 million words... for use with fit_generator
#VALIDATION_SAMPLES = 1000
#LOSS_FN = 'categorical_crossentropy' # allows calculation of top_k_accuracy, but requires one-hot encoding y values
#VALIDATION_SPLIT = 0.05
#TEST_SPLIT = 0.05
#TRAIN_SPLIT = (1 - VALIDATION_SPLIT - TEST_SPLIT)
#TOP_PREDICTIONS = 3 # top number of predictions to be considered for relevance score

# --- these are less likely to be changed ---
SEED = 0
BASE_DIR = '..'
TEXT_DIR = '/'.join((BASE_DIR, 'data', 'gutenbergs'))
GLOVE_DIR = '/'.join((BASE_DIR, '_vectors', 'glove.6B'))
GLOVE_FILE = '/'.join((GLOVE_DIR, 'glove.6B.%dd.txt' % EMBEDDING_DIM))
MODEL_DIR = '/'.join((BASE_DIR, 'models', 'gutenbergs'))
# the model filename encodes the settings that determine the trained weights
MODEL_FILE = "%(dir)s/model-train_amount-%(amount)s-nvocab-%(nvocab)d-nhidden-%(nhidden)d-n-%(n)d.h5" % {
    'dir': MODEL_DIR,
    'amount': TRAIN_AMOUNT,
    'nvocab': NVOCAB,
    'nhidden': NHIDDEN,
    'n': N,
}
print(MODEL_FILE)

## Import

In [None]:
%%time
# import python modules

from __future__ import print_function, division
import sys
print(sys.version)
import os
import os.path
import random
import codecs
import re
import heapq

In [None]:
%%time
# import libraries

import numpy as np
np.random.seed(SEED)
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from nltk import tokenize
from keras.preprocessing.text import Tokenizer
#from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Activation, Dropout
from keras.models import Model
from keras.models import Sequential
#from keras.models import load_model
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import SimpleRNN, LSTM, GRU
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
from keras.metrics import top_k_categorical_accuracy

In [None]:
# look up the Keras recurrent layer class selected by RNN_CLASS_NAME
# (an unknown name raises KeyError here)
rnn_classes = dict(SimpleRNN=SimpleRNN, LSTM=LSTM, GRU=GRU)
RNN_CLASS = rnn_classes[RNN_CLASS_NAME]

## Process Texts

In [None]:
# read texts ~ 0.2sec
# Every regular file directly inside TEXT_DIR is read, in sorted filename order,
# and the contents are concatenated into the single string `text`.
print('Reading texts')
encoding = 'utf-8'
pieces = []  # collect the per-file strings and join once, instead of repeated `text += s`
for filename in sorted(os.listdir(TEXT_DIR)):
    filepath = os.path.join(TEXT_DIR, filename)
    if not os.path.isfile(filepath):
        continue  # skip anything that is not a regular file (e.g. subdirectories)
    print(filepath)
    # errors='ignore' drops undecodable bytes instead of aborting the whole read
    with codecs.open(filepath, 'r', encoding=encoding, errors='ignore') as f:
        # normalize Windows line endings so blank-line paragraph splitting works later
        pieces.append(f.read().replace('\r\n', '\n'))
text = ''.join(pieces)
del pieces
print('done')

In [None]:
# split text into paragraphs, shuffle, and recombine ~0.2sec
# A paragraph boundary is a run of two or more newlines.
paragraphs = re.split(r"\n\n+", text)
print('nparagraphs',len(paragraphs)) # 22989
# Shuffle with a dedicated generator seeded from SEED. np.random.seed(SEED) does
# not affect Python's `random` module, so a bare random.shuffle would produce a
# different token order (and therefore different train/validation/test sets) on
# every run.
random.Random(SEED).shuffle(paragraphs)
text = '\n\n'.join(paragraphs)
del paragraphs
print(text[:1000]) # show sample text

In [None]:
%%time
# tokenize text into word indexes ~ 5sec
texts = [text] # just one giant text
# The filter list below is shorter than the commented-out one: characters such as
# ! " & ( ) , - . / : ; ? are NOT filtered, so they stay in the token stream.
#tokenizer = Tokenizer(nb_words=NVOCAB, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')
tokenizer = Tokenizer(nb_words=NVOCAB, filters='#$%*+<=>@[\\]^_{|}~\t\n')
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
# Only one text was supplied, so the first (and only) sequence is the whole corpus.
# Builtin `int` replaces the `np.int` alias, which is deprecated since NumPy 1.20
# and removed in 1.24; the resulting dtype is identical.
sequence = np.array(sequences[0], dtype=int)
nelements = len(sequence)
print('ntokens',nelements) # 1099744
print(sequence[:100])

In [None]:
# word_index is the tokenizer's dictionary from token string to integer index;
# later cells iterate it as (word, index) pairs and invert it for text generation.
word_index = tokenizer.word_index
print('unique tokens', len(word_index))
# sanity check: index of a very common word (KeyError if 'a' never occurred)
print('a:', word_index['a'])

## Get Embedding Matrix

In [None]:
%%time
# get word vectors ~ 15sec
# Each line of GLOVE_FILE is parsed as whitespace-separated fields: the first
# field is the word, the remaining fields are its float32 coefficients.
print('Reading word vectors...')
word_vectors = {}
with open(GLOVE_FILE, encoding='utf8') as glove_file:
    for row in glove_file:
        fields = row.split()
        word_vectors[fields[0]] = np.asarray(fields[1:], dtype='float32')
print('Found %s word vectors.' % len(word_vectors))
print('Will use a vocabulary of %d tokens' % NVOCAB)

In [None]:
# sanity check: show the vector stored for 'a' and the first ten keys of the dictionary
print('a:',word_vectors['a'])
print(list(word_vectors.keys())[:10]) # lots of weird words/names etc - buttonquail, vaziri, balakirev, 41, foo.com

In [None]:
%%time
# build embedding matrix of the top nvocab words
# Row i of E receives the pretrained vector of the word whose index is i.
# Rows that are never assigned keep their initial zeros.
nwords = min(len(word_index), NVOCAB)
E = np.zeros((nwords + 1, EMBEDDING_DIM))
for token, row in word_index.items():
    if row <= NVOCAB:  # indexes beyond the vocabulary limit have no row in E
        vector = word_vectors.get(token)
        # words not found in embedding index will be all zeros
        if vector is not None:
            E[row] = vector

In [None]:
# sanity check: number of rows in E, then its first two rows
# (row 0 is expected to be all zeros, assuming word indexes start at 1 - TODO confirm)
print(len(E))
print(E[:2])

In [None]:
%%time
# clear some memory
# The raw text and the full GloVe dictionary are no longer needed once the
# token sequence and the embedding matrix E have been built.
del text, texts, word_vectors

## Split Data

In [None]:
# get train, validation, test sets
# Only the sizes are computed here: NVALIDATE + NTEST tokens are held out and
# a TRAIN_AMOUNT fraction of the remaining tokens is used for training.

nheldout = NVALIDATE + NTEST
ntrain_total = nelements - nheldout
ntrain = int(TRAIN_AMOUNT * ntrain_total)
# rough size estimate; presumably assumes ~6 characters per token
approx_kb = int(ntrain*6/1000)

print('total training tokens available:', ntrain_total)
print('training tokens that will be used:', ntrain, '(roughly a %dk textfile)' % approx_kb)
print('validation tokens:', NVALIDATE)
print('test tokens:', NTEST)

In [None]:
# x_train will be O(N*nelements) ~ 10 * 1mil * 8bytes = 80mb
# y_train one-hot will be O(nelements*NVOCAB) ~ 1mil * 10k * 8bytes = 80gb ! even 1k vocab -> 8gb
# so need to use generators
# unless use sparse_categorical_crossentropy, then y_train would just be O(nelements) ~ 1mil ~ 8mb

def create_dataset(data, noffset, nelements, ncontext):
    """Convert a sequence of values into an (x, y) next-value dataset.

    A window of `ncontext` consecutive values is slid, one position at a time,
    over the region data[noffset : noffset + nelements]. Each window becomes one
    row of x and the value immediately after the window becomes the matching
    entry of y.

    Args:
        data: 1-d sequence of values (list or numpy array).
        noffset: index of the first value of the region. A negative value counts
            from the end of `data`, as in normal Python indexing.
        nelements: number of consecutive values in the region.
        ncontext: number of values in each row of x.

    Returns:
        (x_batch, y_batch): numpy arrays of shape (nwindows, ncontext) and
        (nwindows,), where nwindows = max(nelements - ncontext, 0).

    Raises:
        ValueError: if a negative `noffset` is combined with an `nelements`
            that runs past the end of `data` (the windows would wrap around).
        IndexError: if the region lies outside `data`.
    """
    data = np.asarray(data)
    nwindows = max(nelements - ncontext, 0)
    if nwindows and noffset < 0 < noffset + nelements:
        raise ValueError(
            'region starting at negative offset %d with %d elements runs past '
            'the end of the data' % (noffset, nelements))
    # Build every window with one fancy-indexing operation instead of a
    # Python-level loop over (potentially) millions of positions.
    starts = np.arange(noffset, noffset + nwindows)
    x_batch = data[starts[:, None] + np.arange(ncontext)]
    y_batch = data[starts + ncontext]
    return x_batch, y_batch
#x,y = create_dataset([0,1,2,3,4,5,6,7,8,9],noffset=2,nelements=6,ncontext=3)
#print(x)
#print(y)

In [None]:
%%time
# Training windows come from the start of the token sequence; the validation
# and test windows come from its tail, addressed with negative offsets
# (validation region first, then the final NTEST tokens for testing).
ncontext = N - 1
x_train, y_train = create_dataset(sequence, 0, ntrain, ncontext)
x_validate, y_validate = create_dataset(sequence, -(NTEST + NVALIDATE), NVALIDATE, ncontext)
x_test, y_test = create_dataset(sequence, -NTEST, NTEST, ncontext)
# number of rows per set; validation and test each have NVALIDATE/NTEST - (N-1) rows
for part in (x_train, x_validate, x_test):
    print(len(part))
print(x_train[:5])
print(y_train[:5])

## Build Model

In [None]:
#def perplexity(y_true, y_pred):
#    np.exp()

In [None]:
%%time
# define the RNN model
# Pipeline: token indexes -> pretrained embedding -> RNN -> dropout -> dense -> softmax
model = Sequential()
# The embedding layer's dimensions are taken from the pretrained matrix itself.
# E has min(NVOCAB, len(word_index)) + 1 rows and EMBEDDING_DIM columns, so
# hard-coding NVOCAB+1 / NHIDDEN makes the weight assignment fail whenever the
# corpus vocabulary is smaller than NVOCAB or NHIDDEN differs from EMBEDDING_DIM.
embedding_layer = Embedding(input_dim=E.shape[0], output_dim=E.shape[1], input_length=N-1,
                            weights=[E], trainable=TRAINABLE)
model.add(embedding_layer)
model.add(RNN_CLASS(NHIDDEN))
#model.add(RNN_CLASS(NHIDDEN, return_sequences=True))
model.add(Dropout(DROPOUT))
#model.add(RNN_CLASS(NHIDDEN))
#model.add(Dropout(DROPOUT))
model.add(Dense(NVOCAB)) # convert nhidden to nvocab
#model.add(TimeDistributedDense(NVOCAB)) # q. how different from Dense layer?
model.add(Activation('softmax')) # convert nvocab to probabilities - expensive
metrics = ['accuracy'] # loss is always the first metric returned from the fit method

# compile the model
# The targets are integer token indexes (not one-hot vectors), so the sparse
# variant of the cross-entropy loss is set here regardless of the parameter cell.
LOSS_FN = 'sparse_categorical_crossentropy'
model.compile(loss=LOSS_FN, optimizer=OPTIMIZER, metrics=metrics)

## Train Model

In [None]:
%%time

# local import so this cell is self-contained
from keras.callbacks import History

# ModelCheckpoint writes MODEL_FILE whenever val_acc improves; make sure the
# target directory exists so a finished epoch is not lost to a failed save.
if not os.path.isdir(MODEL_DIR):
    os.makedirs(MODEL_DIR)

# stop when val_acc has not improved for PATIENCE epochs; keep only the best model on disk
early_stopping = EarlyStopping(monitor='val_acc', patience=PATIENCE)
checkpoint = ModelCheckpoint(MODEL_FILE, monitor='val_acc', save_best_only=True, mode='max')
# An explicit History callback is used instead of the return value of fit():
# when training is interrupted fit() never returns, but this object still holds
# the metrics of every completed epoch, so the plotting cells keep working.
history = History()
callbacks = [early_stopping, checkpoint, history]

try:
    model.fit(x_train, y_train, batch_size=BATCH_SIZE, nb_epoch=NEPOCHS,
              validation_data=(x_validate, y_validate),
              callbacks=callbacks)
except KeyboardInterrupt:
    # deliberate: interrupting the kernel is the way to stop a long training run early
    print('Training interrupted; keeping the metrics of the epochs completed so far')


In [None]:
# plot results
# Training and validation loss per epoch; the x axis is the 0-based position in
# the history list, i.e. the epoch number minus one.

for key, label in (('loss', 'Training'), ('val_loss', 'Validation')):
    plt.plot(history.history[key], label=label)
plt.xlabel('epoch-1')
plt.ylabel('loss')
plt.title("Training and Validation Loss vs Epoch")
plt.legend();

In [None]:
# Training and validation accuracy per epoch; the x axis is the 0-based position
# in the history list, i.e. the epoch number minus one.
for key, label in (('acc', 'Training'), ('val_acc', 'Validation')):
    plt.plot(history.history[key], label=label)
plt.xlabel('epoch-1')
plt.ylabel('accuracy')
plt.title("Training and Validation Accuracy vs Epoch")
plt.legend();

## Evaluate Model

In [None]:
#evaluate_generator(self, generator, val_samples, max_q_size=10, nb_worker=1, pickle_safe=False)

In [None]:
#model.evaluate(x_test)

# calculate perplexity

## Generate Text

In [None]:
%%time 
# invert the word_index dictionary to go from iword to word
# (pair every index with its word, then build the reverse lookup from the pairs)
d = dict(zip(word_index.values(), word_index.keys()))
print(len(d))

In [None]:
#words = sorted(list(word_index.keys()))
#print(words[:1000])

In [None]:
def get_best_token_probs(probs, k):
    """
    Pick the k most probable tokens and rescale their probabilities.

    Only the first row of probs (probs[0]) is examined. The result is a list of
    (token index, probability) pairs, most probable first, in which each
    probability has been divided by the sum of the k selected probabilities.
    """
    first_row = probs[0]
    # pair every probability with its position and keep the k largest by probability
    top = heapq.nlargest(k, enumerate(first_row), key=lambda item: item[1])
    # normalize over the selected tokens only
    norm = sum(p for _, p in top)
    return [(itoken, p / norm) for itoken, p in top]
    

In [None]:
# Demo: with k=2 the two most likely entries of probs are index 3 (0.4) and
# index 2 (0.3); after normalizing over just those two the result is
# [(3, 0.4/0.7), (2, 0.3/0.7)], most probable first.
probs = np.array([[0.1,0.2,0.3,0.4]])
#np.argmax(probs[0])
#[(i,prob) for i,prob in enumerate(probs[0])]
itoken_probs = get_best_token_probs(probs, 2)
itoken_probs

In [None]:
def choose_token(itoken_probs, k):
    """
    Randomly draw k token indexes, weighted by their probabilities.

    Args:
        itoken_probs: non-empty list of (token index, probability) pairs, e.g.
            the output of get_best_token_probs. The probabilities only need to
            be non-negative with a positive sum; they are renormalized here.
        k: number of token indexes to draw (with replacement).

    Returns:
        numpy array holding the k drawn token indexes.

    Raises:
        ValueError: if the probabilities do not have a positive sum.
    """
    itokens = [itoken for itoken, prob in itoken_probs]
    # Renormalize in float64: probabilities coming out of a float32 model can
    # miss the sum-to-one tolerance that np.random.choice enforces.
    weights = np.asarray([prob for itoken, prob in itoken_probs], dtype=np.float64)
    total = weights.sum()
    if not total > 0:
        raise ValueError('token probabilities must have a positive sum')
    # `p` must be passed by keyword: the third positional parameter of
    # np.random.choice is `replace`, so a positional probs list is silently
    # ignored and the draw becomes uniform.
    return np.random.choice(itokens, k, p=weights / total)
# Demo: draw a single token index from the pairs computed in the previous cell
choose_token(itoken_probs, 1)

In [None]:
# Demo: look up the word whose index is 5 in the inverted dictionary
d[5]

In [None]:
# Plan: start with '.', predict next words until another '.' is reached
# TODO: use beam search

# index of the '.' token, used to seed generation
# (raises KeyError if the tokenizer never produced '.' as a token of its own)
iperiod = word_index['.']
print(iperiod)

In [None]:
# Build the initial model input: one row of N-1 token indexes, all zeros
# except for the '.' index in the last position.
#x = np.array([[0,0,0,iperiod]]) # pad sequence

x = np.zeros((1,N-1), dtype=int)
x[0,-1] = iperiod
print(x)

In [None]:
# Predict the next-token probabilities for the context x, then sample one token
# from the k most probable candidates rather than always taking the argmax.
probs = model.predict_proba(x, verbose=0)
print(probs.shape)
print(probs)
#iword = np.argmax(probs[0])
k = 3
itoken_probs = get_best_token_probs(probs, k)
# choose_token returns an array of draws; [0] takes the single drawn index
iword = choose_token(itoken_probs, 1)[0]
print(iword)
print(d[iword])

In [None]:
# Slide the context window one step: roll left, then overwrite the last slot
# (which now holds the wrapped-around first element) with the new token.
x2 = np.roll(x,-1) # flattens array, rotates to left, and reshapes it
print(x2)
x2[0,-1] = iword
x2

In [None]:
# Repeat the prediction step with the shifted context x2 (k is reused from above).
#x = np.array([[0,0,iperiod,iword]])
probs = model.predict_proba(x2, verbose=0)
itoken_probs = get_best_token_probs(probs, k)
iword = choose_token(itoken_probs, 1)[0]
print(iword)
print(d[iword])

In [None]:
# Slide the context window once more and append the second generated token.
x3 = np.roll(x2,-1) # flattens array, rotates to left, and reshapes it
print(x3)
x3[0,-1] = iword
x3

In [None]:
# Generate a fixed number of words, starting from a context that contains only '.'.
# Each step slides the context left by one, appends the latest token, and samples
# the next token from the k most probable predictions.
nwords_to_generate = 10
k = 5
iword = iperiod
x = np.zeros((1,N-1), dtype=int)
words = []
for _ in range(nwords_to_generate):
    # flattens array, rotates to left, and reshapes it; the wrapped-around
    # element in the last slot is then replaced by the newest token
    x = np.roll(x, -1)
    x[0,-1] = iword
    probs = model.predict_proba(x, verbose=0)
    itoken_probs = get_best_token_probs(probs, k)
    iword = choose_token(itoken_probs, 1)[0]
    words.append(d[iword])

print(' '.join(words))