# Load Libraries

In [None]:
# deep learning library
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout, Input, Flatten, GRU
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 

# set seeds for reproducibility
from numpy.random import seed
seed(4222)

# general libraries
import pandas as pd
import numpy as np
import string, os, io
import random

import warnings
warnings.filterwarnings("ignore")

# Preprocessing

The data is subset to 1/4 of its original length to ease memory constraints and speed up GRU training.

In [None]:
# Keep only the first quarter of the corpus (measured in characters)
text = text[:len(text) // 4]

In [None]:
# Split into sentences by cutting the text at every "." character
# (every period counts as a boundary, including ones inside abbreviations)
sents = text.split(".")

In [None]:
# Clean text data
def clean_sentence(sentence):
    '''
    Normalise one raw sentence for tokenization.

    Newlines become spaces, every character in string.punctuation is
    removed, and any character that cannot be represented in ASCII is dropped.
    Returns the cleaned string.
    '''
    strip_punctuation = str.maketrans("", "", string.punctuation)  # Deletion table for punctuation

    flattened = sentence.replace("\n", " ")
    unpunctuated = flattened.translate(strip_punctuation)

    # Round-trip through bytes so that non-ASCII characters are silently discarded
    return unpunctuated.encode("utf8").decode("ascii", "ignore")

corpus = list(map(clean_sentence, sents)) # One cleaned string per raw sentence, in the same order

In [None]:
# Encoding - convert from text to sequences (numbers)
# Module-level tokenizer: fitted inside get_sequence_of_tokens and reused by generate_text
tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus):
    '''
    Fit the module-level tokenizer on corpus and expand every sentence into
    its n-gram prefixes.

    A sentence encoded as [a, b, c] contributes [a, b] and [a, b, c];
    sentences with fewer than two tokens contribute nothing.

    Returns (input_sequences, total_words), where total_words is the number
    of entries in the tokenizer's word index plus one.
    '''
    # Learn the vocabulary from all sentences
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1

    input_sequences = []
    for line in corpus:
        encoded = tokenizer.texts_to_sequences([line])[0]  # Token ids for this sentence
        # Every prefix of length 2 .. len(encoded) becomes one training sequence
        input_sequences.extend(encoded[:end] for end in range(2, len(encoded) + 1))
    return input_sequences, total_words

# Build the n-gram prefix sequences for the whole corpus and get the vocabulary size
inp_sequences, total_words = get_sequence_of_tokens(corpus)
inp_sequences[:10] # The first 10 sequences

[[44, 82],
 [44, 82, 133],
 [44, 82, 133, 26],
 [44, 82, 133, 26, 598],
 [44, 82, 133, 26, 598, 178],
 [44, 82, 133, 26, 598, 178, 363],
 [44, 82, 133, 26, 598, 178, 363, 109],
 [44, 82, 133, 26, 598, 178, 363, 109, 20],
 [44, 82, 133, 26, 598, 178, 363, 109, 20, 105],
 [29, 105]]

In [None]:
def generate_padded_sequences(input_sequences, num_classes=None):
    '''
    Left-pad every sequence to the length of the longest one, then split each
    padded row into predictors (all but the last token) and a label (the last token).

    Args:
        input_sequences: non-empty list of token-id lists.
        num_classes: width of the one-hot label vectors. When omitted, the
            module-level total_words is used, so existing callers keep working.

    Returns:
        (predictors, label, max_sequence_len): the padded inputs without their
        last column, the one-hot encoded last column, and the padded length.

    Raises:
        ValueError: if input_sequences is empty.
    '''
    if len(input_sequences) == 0:
        raise ValueError("input_sequences is empty; there is nothing to pad")
    if num_classes is None:
        num_classes = total_words  # Fall back to the notebook-level vocabulary size

    max_sequence_len = max(len(seq) for seq in input_sequences) # Length of the longest sequence
    padded = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')) # Zeros are added in front

    predictors, label = padded[:, :-1], padded[:, -1] # Set the last word as the label
    label = ku.to_categorical(label, num_classes=num_classes) # One-hot encode the label ids
    return predictors, label, max_sequence_len

# Build the padded training inputs and one-hot labels from the n-gram sequences
predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)

In [None]:
# max_sequence_len is the token count of the longest n-gram sequence, i.e. of the longest cleaned sentence
print("The maximum sentence length is:", max_sequence_len)
print(predictors[0]) # Padded sequence: zeros on the left, token ids on the right

The maximum sentence length is: 247
[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0 44]


In [None]:
def create_model(max_sequence_len, total_words, embedding_dim=10, hidden_units=500,
                 activation='sigmoid'):
    '''
    Build and compile the baseline next-word model.

    Layers: Embedding -> Flatten -> Dense(hidden_units) -> Dense(total_words, softmax).
    This is a feed-forward network over the flattened embeddings; it contains
    no recurrent layer.

    Args:
        max_sequence_len: padded sequence length including the label token;
            the network input therefore has max_sequence_len - 1 positions.
        total_words: vocabulary size, used for both the embedding table and
            the output layer.
        embedding_dim: size of each word embedding (default 10).
        hidden_units: width of the hidden Dense layer (default 500).
        activation: activation of the hidden Dense layer (default 'sigmoid').

    Returns:
        The compiled Sequential model (categorical cross-entropy loss, Adam optimizer).
    '''
    # Initialise model
    model = Sequential()
    model.add(Embedding(total_words, embedding_dim, input_length=max_sequence_len - 1))

    # Hidden Layer - flatten the embedded sequence and feed it to one dense layer
    model.add(Flatten())
    model.add(Dense(hidden_units, activation=activation))

    # Output Layer - softmax over the vocabulary
    model.add(Dense(total_words, activation='softmax'))

    # Compile model - crossentropy loss (labels are one-hot encoded)
    model.compile(loss='categorical_crossentropy', optimizer='adam')

    return model

# Instantiate the baseline model and print its layer / parameter summary
model = create_model(max_sequence_len, total_words)
model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 246, 10)           58430     
_________________________________________________________________
flatten_3 (Flatten)          (None, 2460)              0         
_________________________________________________________________
dense_10 (Dense)             (None, 500)               1230500   
_________________________________________________________________
dense_11 (Dense)             (None, 5843)              2927343   
Total params: 4,216,273
Trainable params: 4,216,273
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train on all n-gram sequences for 20 epochs; no validation data or callbacks are passed,
# so the EarlyStopping import at the top of the notebook is not used here
history = model.fit(predictors, label, epochs=20, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# Save weights so we do not have to retrain it
# NOTE(review): `path` is not defined in this part of the notebook; because the file name is
# appended by plain string concatenation, it presumably has to end with a path separator - confirm
model.save_weights(path + "model1.h5")

# To restore the trained weights instead of retraining:
# model.load_weights(path + "model1.h5")

In [None]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    '''
    Greedily extend seed_text by next_words words predicted by model.

    At every step the text so far is tokenized with the module-level tokenizer,
    left-padded to max_sequence_len - 1 positions, and the most probable next
    word id is appended as a word.

    Args:
        seed_text: starting text.
        next_words: number of words to generate.
        model: trained Keras model whose output is a probability per vocabulary id.
        max_sequence_len: padded sequence length used during training
            (including the label position).

    Returns:
        The seed text followed by the generated words, title-cased.
    '''
    # Build the id -> word lookup once instead of scanning word_index on every step
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0] # Tokenize seed text
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre') # Pad seed text

        # Sequential.predict_classes is not available in newer Keras/TensorFlow releases;
        # taking the argmax of the predicted probabilities yields the same class id.
        probabilities = model.predict(token_list, verbose=0)
        predicted = int(np.argmax(probabilities, axis=-1)[0]) # Plain int for the single input row

        # An id with no entry in word_index (e.g. the padding id 0) maps to an empty word
        output_word = index_to_word.get(predicted, "")
        seed_text += " " + output_word
    return seed_text.title()

In [None]:
# Obtain cross entropy loss
# Despite its name, loss_history holds a single number: the loss recorded for the final epoch
loss_history = history.history["loss"][-1]
print("The crossentropy loss is:", loss_history)

The crossentropy loss is: 2.030122995376587


In [None]:
# Generate ten 15-word continuations with the baseline model.
# Re-seeding with the loop index makes every seed-word pick reproducible.
candidate_words = text.split() # Split once; the word list is the same for every iteration
for i in range(10):
    random.seed(i)
    seed_word = random.choice(candidate_words)
    print(generate_text(seed_word, 15, model, max_sequence_len))

Thee I Would Be Reconciled To The Marketplace I Warrant To The Field Stir Did The
Thus So I Am A Priest Of Mine Father The Duke Of Italy And Pray I
Far And Bless His Grace And Hum And Shame To Undercrest My Elbow Persuading You All
Cominius. I Would Be Reconciled To The Marketplace I Warrant You To Been Many To The
Which I Would Be Consul And To Be A Perfecter That Being And Lose Me Betwixt
Of You Have Been A Hundred Years And When You Please You To Be A Time
Fear You To Be Revenged To Be His Perfecter Giber Than The Table Which Of These
Their Hearts I Am Hushd Until You Know His General Purchasing Even Of Us For You
Vassals, The Gods Have Rome And Harrow A Lamb And The Volscian Body Of Rome Deserve
Him; So I Am Not I Would Be Sworn To Frame My Country I Be A


# GRU

In [None]:
def create_model2(max_sequence_len, total_words, embedding_dim=10, hidden_units=500,
                  activation='sigmoid'):
    '''
    Build and compile the GRU next-word model.

    Layers: Embedding -> GRU(hidden_units) -> Dense(total_words, softmax).

    Args:
        max_sequence_len: padded sequence length including the label token;
            the network input therefore has max_sequence_len - 1 positions.
        total_words: vocabulary size, used for both the embedding table and
            the output layer.
        embedding_dim: size of each word embedding (default 10).
        hidden_units: number of GRU units (default 500).
        activation: activation of the GRU layer (default 'sigmoid').

    Returns:
        The compiled Sequential model (categorical cross-entropy loss, Adam optimizer).
    '''
    # Initialise model
    model = Sequential()
    model.add(Embedding(total_words, embedding_dim, input_length=max_sequence_len - 1))

    # Hidden Layer - GRU
    # NOTE(review): the TensorFlow Keras GRU documentation lists activation='tanh' among the
    # conditions for its fast cuDNN kernel; the 'sigmoid' default kept here for backward
    # compatibility may be one reason GPU training is slow - confirm before changing it.
    model.add(GRU(hidden_units, activation=activation))

    # Output Layer - softmax over the vocabulary
    model.add(Dense(total_words, activation='softmax'))

    # Compile model - crossentropy loss (labels are one-hot encoded)
    model.compile(loss='categorical_crossentropy', optimizer='adam')

    return model

# Instantiate the GRU model and print its layer / parameter summary
model2 = create_model2(max_sequence_len, total_words)
model2.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 246, 10)           58430     
_________________________________________________________________
gru_5 (GRU)                  (None, 500)               768000    
_________________________________________________________________
dense_13 (Dense)             (None, 5843)              2927343   
Total params: 3,753,773
Trainable params: 3,753,773
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train the GRU model on the same predictors / labels and for the same 20 epochs as the first model
history2 = model2.fit(predictors, label, epochs=20, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# Save the GRU weights so we do not have to retrain it
# NOTE(review): `path` is not defined in this part of the notebook; because the file name is
# appended by plain string concatenation, it presumably has to end with a path separator - confirm
model2.save_weights(path + "model2.h5")

# To restore the GRU weights, load them into model2 (not model):
# model2.load_weights(path + "model2.h5")

In [None]:
# Obtain cross entropy loss
# loss_history2 holds a single number: the loss recorded for the GRU model's final epoch
loss_history2 = history2.history["loss"][-1]
print("The crossentropy loss is:", loss_history2)

The crossentropy loss is: 2.687119722366333


In [None]:
# Generate ten 15-word continuations with the GRU model.
# Seeds 0-9 are the same ones used for the first model, so both models get the same seed words.
candidate_words = text.split() # Split once; the word list is the same for every iteration
for i in range(10):
    random.seed(i)
    seed_word = random.choice(candidate_words)
    print(generate_text(seed_word, 15, model2, max_sequence_len))

Thee O My Lord Of York And I Have Protector In The Tower Of The People
Thus I Am Glad To Kill Him And A Death Of My Fortune And My Lord
Far I Would Not Be Sworn For You Are Supper But He Is Durst Break His
Cominius. I Am Glad To Strike A Ladyship That I Hope My Lord Young Edward But
Which I Am Glad To Strike A Ladyship Judgment To Draw Him Only Duty With Him
Of This Opinion Hath Been Outdares Thy Senseless Sword And His Brave Children Hath Owe His
Fear My Lord Of York And I Have A Man If You Are A Man If
Their Latest Refuge Keep My Part And Accept Me And I Shall Not See Your Life
Vassals, A Verdict Noble Lord Hastings Even To Make A World To Conduct Us Be Changed
Him; I Do Not Jest With Trial Rivers And I Will Not All All Names I


## Simple RNN
The crossentropy loss is 2.030122995376587

The sentences produced are:
1. Thee I Would Be Reconciled To The Marketplace I Warrant To The Field Stir Did The
2. Thus So I Am A Priest Of Mine Father The Duke Of Italy And Pray I
3. Far And Bless His Grace And Hum And Shame To Undercrest My Elbow Persuading You All
4. Cominius. I Would Be Reconciled To The Marketplace I Warrant You To Been Many To The
5. Which I Would Be Consul And To Be A Perfecter That Being And Lose Me Betwixt
6. Of You Have Been A Hundred Years And When You Please You To Be A Time
7. Fear You To Be Revenged To Be His Perfecter Giber Than The Table Which Of These
8. Their Hearts I Am Hushd Until You Know His General Purchasing Even Of Us For You
9. Vassals, The Gods Have Rome And Harrow A Lamb And The Volscian Body Of Rome Deserve
10. Him; So I Am Not I Would Be Sworn To Frame My Country I Be A

## GRU
The crossentropy loss of the GRU is 2.687119722366333

The sentences produced are:
1. Thee O My Lord Of York And I Have Protector In The Tower Of The People
2. Thus I Am Glad To Kill Him And A Death Of My Fortune And My Lord
3. Far I Would Not Be Sworn For You Are Supper But He Is Durst Break His
4. Cominius. I Am Glad To Strike A Ladyship That I Hope My Lord Young Edward But
5. Which I Am Glad To Strike A Ladyship Judgment To Draw Him Only Duty With Him
6. Of This Opinion Hath Been Outdares Thy Senseless Sword And His Brave Children Hath Owe His
7. Fear My Lord Of York And I Have A Man If You Are A Man If
8. Their Latest Refuge Keep My Part And Accept Me And I Shall Not See Your Life
9. Vassals, A Verdict Noble Lord Hastings Even To Make A World To Conduct Us Be Changed
10. Him; I Do Not Jest With Trial Rivers And I Will Not All All Names I


Both the simple RNN and the GRU produce readable sentences that sound like English literature, but they do not make sense when read as a whole. In my opinion, the GRU produced sentences that sound more correct than those of the simple RNN.

However, it must be noted that I only trained for 20 epochs due to time and memory constraints. The GRU took close to 4 hours to train, while the simple RNN took only a few minutes. Additionally, I only used 1/4 of the original corpus. Given more computational power, I would run it on the entire corpus for 200 iterations, which should produce more readable sentences.

