# RNN

Read texts, train an RNN to predict the next word from the preceding N-1 words, and plot the training results.

Adapted from https://github.com/fchollet/keras/blob/master/examples/pretrained_word_embeddings.py


## Set Parameters

In [None]:
# set parameters

NVOCAB = 20000 # vocabulary size: word limit given to the tokenizer and width of the softmax output
EMBEDDING_DIM = 50 # width of the word vectors; selects which glove.6B.<dim>d.txt file is read
NHIDDEN = EMBEDDING_DIM # units in the recurrent layer; also passed as the embedding output_dim
N = 5 # window length: N-1 context words are used to predict the Nth word
RNN_CLASS_NAME = 'GRU' # key into rnn_classes: 'SimpleRNN', 'LSTM' or 'GRU'
DROPOUT = 0 # rate passed to the Dropout layer that follows the recurrent layer
NEPOCHS = 50 # passed to fit_generator as nb_epoch
INITIAL_EPOCH = 0 # epoch index to start from, to continue training
SAMPLES_PER_EPOCH = 1000 # out of 1 million words...
TRAINABLE_EMBEDDING = False # True will slow down training ~2x
EARLY_STOPPING_PATIENCE = 10 # stop after this many epochs of no improvement
LOSS_FN = 'categorical_crossentropy' # expects one-hot encoded targets
OPTIMIZER = 'adam'

# these don't need to be changed

VALIDATION_SPLIT = 0.05 # fraction of the token sequence reserved for validation
TEST_SPLIT = 0.05 # fraction of the token sequence reserved for testing
TRAIN_SPLIT = (1 - VALIDATION_SPLIT - TEST_SPLIT) # whatever remains is used for training
TOP_PREDICTIONS = 3 # top number of predictions to be considered for relevance score

SEED = 0 # passed to np.random.seed
BASE_DIR = '..'
TEXT_DIR = BASE_DIR + '/data/gutenbergs/1-raw' # every file in this directory is read as text -- TODO: move to parentdir
GLOVE_DIR = BASE_DIR + '/_vectors/glove.6B'
GLOVE_FILE = GLOVE_DIR + '/glove.6B.%dd.txt' % EMBEDDING_DIM # e.g. glove.6B.50d.txt for EMBEDDING_DIM = 50


## Import

In [None]:
%%time
# import python modules

from __future__ import print_function, division
import sys
print(sys.version)
import os
import os.path
import random
import codecs
import re

In [None]:
%%time
# import libraries

import numpy as np
np.random.seed(SEED)
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from nltk import tokenize
from keras.preprocessing.text import Tokenizer
#from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Activation, Dropout
from keras.models import Model
from keras.models import Sequential
#from keras.models import load_model
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import SimpleRNN, LSTM, GRU
from keras.callbacks import EarlyStopping
from keras.metrics import top_k_categorical_accuracy

In [None]:
# map the configured name onto the Keras recurrent layer class to instantiate
rnn_classes = dict(SimpleRNN=SimpleRNN, LSTM=LSTM, GRU=GRU)
RNN_CLASS = rnn_classes[RNN_CLASS_NAME]

## Process Texts

In [None]:
# read texts ~ 0.2sec
# Every file in TEXT_DIR is read in sorted filename order. The contents are
# collected in a list and joined once at the end rather than growing a single
# string with += on every iteration.
print('Reading texts')
encoding = 'utf-8'
parts = []
for filename in sorted(os.listdir(TEXT_DIR)):
    filepath = TEXT_DIR +'/' + filename
    print(filepath)
    # errors='ignore' drops undecodable bytes instead of raising
    with codecs.open(filepath, 'r', encoding=encoding, errors='ignore') as f:
        # normalise Windows line endings so blank lines are always '\n\n'
        parts.append(f.read().replace('\r\n','\n'))
text = ''.join(parts)
del parts # the joined string is all that is needed from here on
print('done')

In [None]:
# split text into paragraphs, shuffle, and recombine ~0.2sec
# A paragraph boundary is a run of two or more newlines.
paragraphs = re.split(r"\n\n+", text)
print(len(paragraphs)) # 22989
# Seed Python's own RNG: np.random.seed(SEED) does not affect random.shuffle,
# so without this the shuffle (and every split derived from it) changes per run.
random.seed(SEED)
random.shuffle(paragraphs)
text = '\n\n'.join(paragraphs)
del paragraphs
text[:1000] # show sample text

In [None]:
%%time
# tokenize text into word indexes ~ 5sec
texts = [text] # just one giant text
tokenizer = Tokenizer(nb_words=NVOCAB)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
# One input text gives one output sequence. Use the builtin int as dtype:
# np.int was only an alias for it and is no longer available in newer NumPy.
sequence = np.array(sequences[0], dtype=int)
nelements = len(sequence) # total number of word indexes in the corpus
print(nelements) # 1099744
print(sequence[:100])

In [None]:
# word -> integer index mapping built by the tokenizer
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
print('a:', word_index['a']) # spot-check the index assigned to one common word

In [None]:
# clear some memory: only the integer `sequence` is needed from here on
del text, texts

## Get Embedding Matrix

In [None]:
%%time
# get word vectors ~ 15sec
# Each line of the GloVe file is a word followed by its whitespace-separated
# coefficients; build a dict mapping word -> float32 vector.
print('Reading word vectors...')
word_vectors = {}
with open(GLOVE_FILE, encoding='utf8') as f:
    for line in f:
        fields = line.split()
        word_vectors[fields[0]] = np.asarray(fields[1:], dtype='float32')
print('Found %s word vectors.' % len(word_vectors))

In [None]:
# spot-check one vector and a few of the words that were loaded
print('a:',word_vectors['a'])
print(list(word_vectors.keys())[:10])

In [None]:
%%time
# build embedding matrix of the top nvocab words
# Row i of E holds the pretrained vector of the word whose tokenizer index is i.
# Rows for words that have no pretrained vector stay all zeros.
nwords = min(NVOCAB, len(word_index))
E = np.zeros((nwords + 1, EMBEDDING_DIM))
for word, iword in word_index.items():
    if iword <= NVOCAB and word in word_vectors:
        E[iword] = word_vectors[word]

In [None]:
# sanity check: number of rows in E, then its first two rows
print(len(E))
print(E[:2])

In [None]:
%%time
# clear some memory: the vectors that are needed have already been copied into E
del word_vectors

## Split Data

In [None]:
# get train, validation, test sets
# Only the sizes are computed here (each fraction of nelements, truncated to an
# int); the data itself is sliced out of `sequence` lazily.
ntrain, nvalidate, ntest = [
    int(nelements * fraction)
    for fraction in (TRAIN_SPLIT, VALIDATION_SPLIT, TEST_SPLIT)
]
#. will want generators iterating over the sequences

In [None]:
# define generators
#.

def x_train():
    """
    Toy generator: yield five (context, target) pairs taken from `sequence`.

    Pair k comes from the window sequence[k:k+10]; the context is everything
    in the window except its last item, and the target is that last item.
    Deliberately finite - an endless `while True` version would never let a
    plain `for` loop over it terminate.
    """
    for start in range(5):
        window = sequence[start:start+10]
        yield (window[:-1], window[-1])

# print every pair; this loop only ends because x_train() is finite
for pair in x_train():
    print(pair)


In [None]:
# define generators to return subsequences of `sequence` 
 
def train_data_generator():
    """
    Endlessly yield (x, y) training samples, one sample per batch.

    x is a window of N-1 consecutive word indexes from the training part of
    `sequence`, reshaped to a single row; y is the one-hot encoding (NVOCAB
    classes) of the word index that follows the window. After the last window
    the generator starts over from the beginning.
    """
    while True:
        for start in range(ntrain - N):
            target = start + N - 1
            context = sequence[start:target].reshape(1, -1)
            # one-hot encode the word that follows the context
            label = to_categorical([sequence[target]], NVOCAB)
            yield (context, label)
            
def validation_data_generator():
    """
    Endlessly yield (x, y) validation samples, one sample per batch.

    x is a window of N-1 consecutive word indexes taken from the validation
    part of `sequence` (positions ntrain .. ntrain + nvalidate), reshaped to a
    single row; y is the one-hot encoding of the word index that follows the
    window. After the last window the generator starts over.
    """
    while True:
        for i in range(ntrain, ntrain + nvalidate - N):
            x = sequence[i:i+N-1].reshape(1,-1)
            # One-hot encode with NVOCAB classes (not NVOCAB+1) so that y has the
            # same width as the model's Dense(NVOCAB) softmax output.
            y = to_categorical([sequence[i+N-1]], NVOCAB)
            yield (x,y)

def test_data_generator():
    """
    Endlessly yield (x, y) test samples, one sample per batch.

    x is a window of N-1 consecutive word indexes taken from the test part of
    `sequence` (positions ntrain + nvalidate .. ntrain + nvalidate + ntest),
    reshaped to a single row; y is the one-hot encoding of the word index that
    follows the window. After the last window the generator starts over.
    """
    while True:
        for i in range(ntrain + nvalidate, ntrain + nvalidate + ntest - N):
            x = sequence[i:i+N-1].reshape(1,-1)
            # One-hot encode with NVOCAB classes (not NVOCAB+1) so that y has the
            # same width as the model's Dense(NVOCAB) softmax output.
            y = to_categorical([sequence[i+N-1]], NVOCAB)
            yield (x,y)


In [None]:
def create_dataset(data, nvocab, nlookback=1):
    """
    Turn a sequence of values into sliding-window inputs and one-hot labels.

    Every run of `nlookback` consecutive values becomes one input row, and the
    value right after it becomes that row's label, one-hot encoded with
    nvocab+1 classes.

    eg create_dataset([3,2,5,6,2,1,7,9], 10, 3) gives the inputs
    [3,2,5], [2,5,6], [5,6,2], [6,2,1], [2,1,7] and one-hot labels for
    6, 2, 1, 7, 9.

    Returns (X, Y) as numpy arrays; Y is an int array with one
    (1, nvocab+1) block per input row.

    adapted from http://machinelearningmastery.com/time-series-prediction-lstm-recurrent-neural-networks-python-keras/
    """
    starts = range(len(data) - nlookback)
    windows = [data[i:(i + nlookback)] for i in starts]
    labels = [to_categorical([data[i + nlookback]], nvocab+1) for i in starts]
    return np.array(windows), np.array(labels, dtype=int)

create_dataset([3,2,5,6,2,1,7,9],10,3)

In [None]:
# x_train will be O(N*nelements) ~ 10 * 1mil * 8bytes = 80mb
# y_train will be O(nelements*NVOCAB) ~ 1mil * 10k * 8bytes = 80gb ! even 1k vocab -> 8gb


def create_dataset(data, noffset, nelements, ncontext, nvocab):
    """
    Build sliding-window inputs and one-hot labels from one slice of `data`.

    Windows start at positions noffset .. noffset + nelements - ncontext - 1.
    Each window of `ncontext` values is wrapped in its own one-row batch, and
    the value right after it is one-hot encoded with `nvocab` classes.

    eg data=[0..9], noffset=2, nelements=6, ncontext=3 gives the inputs
    [[2,3,4]], [[3,4,5]], [[4,5,6]] and one-hot labels for 5, 6, 7.

    Returns (x_batch, y_batch) as numpy arrays; y_batch has an int dtype.

    adapted from http://machinelearningmastery.com/time-series-prediction-lstm-recurrent-neural-networks-python-keras/
    """
    starts = range(noffset, noffset + nelements - ncontext)
    # the extra brackets keep each window as a separate single-row batch
    windows = [[data[i:(i + ncontext)]] for i in starts]
    labels = [to_categorical([data[i + ncontext]], nvocab) for i in starts]
    x_batch = np.array(windows)
    y_batch = np.array(labels, dtype=int)
    return x_batch, y_batch

x,y = create_dataset([0,1,2,3,4,5,6,7,8,9],noffset=2,nelements=6,ncontext=3,nvocab=10)
print(x)
print(y)

In [None]:
# toy run: build one small dataset per epoch, each starting where the
# previous epoch's slice of seq ended
nepochs = 2
ncontext = 3
samples_per_epoch = 6
seq = [0,1,2,3,4,5,6,7,8,9] * 10
for nepoch in range(nepochs):
    noffset = nepoch * samples_per_epoch # start of this epoch's slice of seq
    x, y = create_dataset(seq, noffset=noffset, nelements=samples_per_epoch, ncontext=ncontext, nvocab=10)
    print(nepoch, x, y)

## Build Model

In [None]:
# note: this doesn't work with sparse_categorical_crossentropy
def top_k_accuracy(y_true, y_pred):
    """Return Keras' top-k categorical accuracy with k fixed to TOP_PREDICTIONS."""
    return top_k_categorical_accuracy(y_true, y_pred, k=TOP_PREDICTIONS)

In [None]:
%%time
# define the RNN model
# Layers: embedding (initialised from E) -> recurrent layer -> dropout ->
# dense projection to the vocabulary -> softmax.
model = Sequential()
# Take the embedding dimensions from E itself. The array passed as `weights`
# must have shape (input_dim, output_dim); E has min(NVOCAB, len(word_index)) + 1
# rows and EMBEDDING_DIM columns, which equals (NVOCAB+1, NHIDDEN) only when the
# corpus has at least NVOCAB distinct words and NHIDDEN == EMBEDDING_DIM.
embedding_layer = Embedding(input_dim=E.shape[0], output_dim=E.shape[1], input_length=N-1, weights=[E])
model.add(embedding_layer)
model.layers[-1].trainable = TRAINABLE_EMBEDDING
model.add(RNN_CLASS(NHIDDEN))
#model.add(RNN_CLASS(NHIDDEN, return_sequences=True))
model.add(Dropout(DROPOUT))
#model.add(RNN_CLASS(NHIDDEN))
#model.add(Dropout(DROPOUT))
model.add(Dense(NVOCAB)) # convert nhidden to nvocab
model.add(Activation('softmax')) # convert nvocab to probabilities - expensive
metrics = ['accuracy', top_k_accuracy] # loss is always the first metric returned from the fit method

# compile the model

model.compile(loss=LOSS_FN, optimizer=OPTIMIZER, metrics=metrics)

## Train Model

In [None]:
#%%time
#history = model.fit()

In [None]:
%%time

# stop if no improvement for n epochs
# monitor='loss' watches the training loss; no validation data is passed to
# fit_generator below.
early_stopping = EarlyStopping(monitor='loss', patience=EARLY_STOPPING_PATIENCE, verbose=1)
callbacks = [early_stopping]

# Training can be interrupted by hand without an error being raised.
# NOTE: `history` is only assigned when fit_generator returns, so after an
# interrupt this cell leaves it unset (or at its value from an earlier run).
try:
    history = model.fit_generator(train_data_generator(), 
                                  samples_per_epoch=SAMPLES_PER_EPOCH, 
                                  nb_epoch=NEPOCHS,
                                  callbacks=callbacks,
                                  initial_epoch=INITIAL_EPOCH) # needs keras 1.2.1 (2017-01-19)
except KeyboardInterrupt:
    pass

In [None]:
#%%time
# build our own batches and iterate over them - would need to implement lots of infrastructure
#for nepoch in range(nepochs):
#    noffset = nepoch * SAMPLES_PER_EPOCH
#    x_batch, y_batch = create_dataset(sequence, noffset=noffset, nelements=SAMPLES_PER_EPOCH, ncontext=N-1, nvocab=NVOCAB)
#    for x,y in zip(x_batch, y_batch):
#        model.train_on_batch(x, y)
#    x_validate, y_validate = x, y
#    loss, accuracy, relevance = model.test_on_batch(x_validate, y_validate)
#    print(nepoch, loss, accuracy, relevance)

In [None]:
# training loss per epoch; x is the 0-based position in the history list
plt.plot(history.history['loss'])
plt.xlabel('epoch-1')
plt.ylabel('loss');

In [None]:
# accuracy and top-k accuracy ("relevance") per epoch; x is the 0-based
# position in the history list. Each line gets a label so that the legend
# has entries to show.
plt.plot(history.history['acc'], label='accuracy')
plt.plot(history.history['top_k_accuracy'], label='relevance (top %d)' % TOP_PREDICTIONS)
plt.xlabel('epoch-1')
plt.ylabel('accuracy, relevance')
plt.legend();

## Evaluate Model

In [None]:
#evaluate_generator(self, generator, val_samples, max_q_size=10, nb_worker=1, pickle_safe=False)