# Use NLTK to import and preprocess the play

In [1]:
import nltk
nltk.download('gutenberg')
from nltk.corpus import gutenberg
import numpy as np
import random
import sys
import re
import pickle
from __future__ import print_function

from keras.models import Sequential
from keras.layers import Dense, Bidirectional, Dropout
from keras.layers import SimpleRNN, GRU, BatchNormalization
from keras.callbacks import LambdaCallback, ModelCheckpoint
from keras.utils.data_utils import get_file

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


Using TensorFlow backend.


In [2]:
# Lower-case every Gutenberg token of Hamlet and glue the tokens together into
# one long string. Each token, including the last one, is followed by a single
# space, so `text` holds the entire play as a sequence of characters.
hamlet = gutenberg.words('shakespeare-hamlet.txt')
text = ''.join(str(token).lower() + ' ' for token in hamlet)
print('Corpus length, Hamlet only:', len(text))

Corpus length, Hamlet only: 166765


Each input sequence corresponds to 35 characters and one output character (y1) that corresponds to the next character in the sequence. With a stride of 1, the entire play is broken up into 166,730 sequences of characters.

# Create the vocabulary

In [3]:
# Build the vocabulary and two lookup tables:
#   char_indices : character -> integer index
#   indices_char : integer index -> character
characters = sorted(set(text))
print('Total characters:', len(characters))
char_indices = {symbol: position for position, symbol in enumerate(characters)}
indices_char = dict(enumerate(characters))

Total characters: 43


There are 43 unique characters that make up the sequences in Hamlet...

# Make corresponding output chars for each seq

In [0]:
# Break the text into:
#   Features - character-level sequences of fixed length
#   Labels   - the next character following each sequence
seq_len, stride = 35, 1

# Slide a window of `seq_len` characters over the text, advancing `stride`
# characters at a time. Every window becomes a training sequence and the
# character immediately after the window becomes its label.
window_starts = range(0, len(text) - seq_len, stride)
training_sequences = [text[start: start + seq_len] for start in window_starts]
next_chars = [text[start + seq_len] for start in window_starts]

In [5]:
# Sanity check: show how many sequences were built, then the first two
# sequences together with the label (next character) of each.
print('Number of sequences:', len(training_sequences))
for label, window in (('First sequences:', slice(0, 1)),
                      ('Second sequences:', slice(1, 2))):
  print(label, training_sequences[window])
  print('Next characters in sequence:', next_chars[window])

Number of sequences: 166730
First sequences: ['[ the tragedie of hamlet by william']
Next characters in sequence: [' ']
Second sequences: [' the tragedie of hamlet by william ']
Next characters in sequence: ['s']


# Vectorize the training data

In [6]:
# One-hot encode the training data into boolean arrays that start as zeros.
#   x : (number of sequences, length of each sequence, total unique characters)
#   y : (number of sequences, total unique characters)
# The builtin `bool` is used as the dtype: the `np.bool` alias was deprecated
# in NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
x = np.zeros((len(training_sequences), seq_len, len(characters)), dtype=bool)
y = np.zeros((len(training_sequences), len(characters)), dtype=bool)
for index, sequence in enumerate(training_sequences):
  # Mark the character found at every position of this sequence.
  for sub_index, chars in enumerate(sequence):
    x[index, sub_index, char_indices[chars]] = 1
  # The label depends only on the sequence, not on the position inside it,
  # so it is set once per sequence instead of once per character.
  y[index, char_indices[next_chars[index]]] = 1
print('Data vectorization is finished.')
print('Feature vectors shape', x.shape)
print('Label vectors shape', y.shape)

Data vectorization is finished.
Feature vectors shape (166730, 35, 43)
Label vectors shape (166730, 43)


The dimensions of the feature matrix: number of sequences x sequence length (time steps) x number of unique characters

# Build the RNN

Make several models, each with a different RNN architecture. Train them successively to see how each one performs. The goal is to generate character-level sequences.

In [0]:
def sample(softmax_predictions, sample_threshold=1.0):
  """Draw one class index from a rescaled probability distribution.

  The probabilities are moved to log space, divided by `sample_threshold`,
  exponentiated again and re-normalised so they sum to one. Thresholds
  below 1 therefore sharpen the distribution, thresholds above 1 flatten it.

  Args:
    softmax_predictions: array-like of class probabilities.
    sample_threshold: divisor applied to the log-probabilities.

  Returns:
    The index selected by a single multinomial draw from the rescaled
    distribution (not simply the most probable index).
  """
  probabilities = np.asarray(softmax_predictions).astype('float64')

  # Rescale in log space, then map back to (unnormalised) probabilities.
  rescaled = np.exp(np.log(probabilities) / sample_threshold)

  # Normalise so the values form a valid distribution again.
  distribution = rescaled / np.sum(rescaled)

  # One experiment with one trial yields a one-hot row; argmax finds the hit.
  draw = np.random.multinomial(1, distribution, 1)
  return np.argmax(draw)

In [0]:
# Custom callback function

def on_epoch_end(epoch, _):
  """Print text generated by the current model at the end of an epoch.

  A random seed sequence of `seq_len` characters is taken from `text`.
  For every sampling threshold, 400 further characters are generated one
  at a time: the current window is one-hot encoded, the module-level
  `model` predicts the next-character distribution, `sample` picks an
  index, and the window slides forward by one character. Everything is
  streamed to stdout as it is produced.

  Args:
    epoch: index of the epoch that just finished (only used in the log line).
    _: unused second callback argument.
  """
  global model
  print('----- Generating text after Epoch: %d' % epoch)

  # Random index position to start the sample input sequence; the end index
  # keeps the seed exactly as long as a training sequence.
  start_index = random.randint(0, len(text) - seq_len - 1)
  end_index = start_index + seq_len
  seed = text[start_index: end_index]

  # Sampling entropy thresholds; every one starts from the same seed so the
  # outputs can be compared against each other.
  sampling_range = [0.3, 0.5, 0.7, 1.0, 1.2]
  for threshold in sampling_range:
    print('----- *Sampling Threshold* :', threshold)
    sentence = seed
    print('Input sequence to generate from : "' + sentence + '"')
    # Write to the buffer directly instead of waiting until the end.
    sys.stdout.write(sentence)

    # Generate the next 400 characters of the sequence.
    for i in range(400):
      # One-hot encode the current window as a batch of one.
      x_pred = np.zeros((1, seq_len, len(characters)))
      for n, char in enumerate(sentence):
        x_pred[0, n, char_indices[char]] = 1.

      # Predict on the fully encoded window, then sample the next character.
      preds = model.predict(x_pred, verbose=0)[0]
      next_index = sample(preds, threshold)
      next_char = indices_char[next_index]

      # Slide the window: drop the oldest character, append the new one.
      sentence = sentence[1:] + next_char
      sys.stdout.write(next_char)
      sys.stdout.flush()
    # Finish the line so the next threshold header starts on its own line.
    print()

# Keras callback that runs the generation routine after every epoch.
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

In [0]:
from keras.models import Sequential
from keras.layers import Dense, Bidirectional, Dropout
from keras.layers import SimpleRNN, GRU, BatchNormalization
from keras.optimizers import RMSprop

# Fun part: a collection of builder functions returning different kinds of
# RNNs, from simple to more complex.
def SimpleRNN_stacked_model():
    """Return an uncompiled model of two stacked 128-unit SimpleRNN layers.

    The first layer returns the full sequence so the second one can consume
    it; a softmax Dense layer with one unit per character produces the output.
    """
    return Sequential([
        SimpleRNN(128, input_shape=(seq_len, len(characters)), return_sequences=True),
        SimpleRNN(128),
        Dense(len(characters), activation='softmax'),
    ])

In [0]:
# NOTE(review): this cell re-defines SimpleRNN_stacked_model with the same
# architecture as the definition in the previous cell; one of the two
# definitions can be removed.
def SimpleRNN_stacked_model():
    """Build (without compiling) a stack of two 128-unit SimpleRNN layers
    followed by a softmax over the character vocabulary."""
    vocabulary_size = len(characters)
    stack = [
        SimpleRNN(128, input_shape=(seq_len, vocabulary_size), return_sequences=True),
        SimpleRNN(128),
        Dense(vocabulary_size, activation='softmax'),
    ]
    return Sequential(stack)

In [0]:
def GRU_stacked_model():
    """Return an uncompiled model of two stacked 128-unit GRU layers.

    The first GRU returns the full sequence for the second GRU; a softmax
    Dense layer with one unit per character produces the output.
    """
    return Sequential([
        GRU(128, input_shape=(seq_len, len(characters)), return_sequences=True),
        GRU(128),
        Dense(len(characters), activation='softmax'),
    ])

In [0]:
def Bi_directional_GRU():
    """Return an uncompiled model of two stacked bidirectional 128-unit GRUs.

    The first wrapped GRU returns the full sequence for the second one; a
    softmax Dense layer with one unit per character produces the output.
    """
    window_shape = (seq_len, len(characters))
    return Sequential([
        Bidirectional(GRU(128, return_sequences=True), input_shape=window_shape),
        Bidirectional(GRU(128)),
        Dense(len(characters), activation='softmax'),
    ])

In [0]:
def larger_GRU():
    """Return an uncompiled model of three stacked 128-unit GRU layers.

    Every GRU uses dropout and recurrent dropout of 0.2. The first two return
    full sequences so the next GRU can consume them. A 128-unit ReLU Dense
    layer and a softmax Dense layer with one unit per character follow.
    """
    # Regularisation settings shared by all three recurrent layers.
    regularisation = dict(dropout=0.2, recurrent_dropout=0.2)
    return Sequential([
        GRU(128, input_shape=(seq_len, len(characters)),
            return_sequences=True, **regularisation),
        GRU(128, return_sequences=True, **regularisation),
        GRU(128, **regularisation),
        Dense(128, activation='relu'),
        Dense(len(characters), activation='softmax'),
    ])
# All defined model builders, ordered from simple to more complex.
# Each builder appears exactly once: outputs are saved under the builder's
# name, so listing one twice would train the same architecture a second time
# and overwrite the history file written by the first run.
all_models = [
              SimpleRNN_stacked_model,
              GRU_stacked_model,
              Bi_directional_GRU,
              larger_GRU]

In [0]:
def test_models(list, epochs=10):
    """Build, compile, train and save every model builder in turn.

    For each builder the model is compiled with categorical cross-entropy
    and Adam, trained on the module-level `x` / `y` arrays with the text
    generation callback and a best-loss checkpoint, summarised, and its
    training history is pickled to '<model_name>.pkl'.

    Args:
        list: iterable of zero-argument callables, each returning an
            uncompiled Keras model. The parameter name shadows the builtin;
            it is kept unchanged so existing callers keep working.
        epochs: number of training epochs per model.

    Side effects:
        Rebinds the module-level `model` (read by the on_epoch_end callback)
        and `model_name`; writes checkpoint (.h5) and history (.pkl) files
        to the current working directory.
    """
    global model, model_name

    for network in list:
        print('Initiating compilation...')

        # Initialize model
        model = network()

        # Get model name: prefer the callable's own name; fall back to
        # parsing its repr for callables that do not define __name__.
        model_name = getattr(network, '__name__', None) or re.split(' ', str(network))[1]

        # Filepath to save model with name, epoch and loss
        filepath = "%s_epoch-{epoch:02d}-loss-{loss:.4f}.h5" % model_name

        # Checkpoint callback object; only saves when the training loss improves
        checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=0, save_best_only=True, mode='min')

        # Compile model
        model.compile(loss='categorical_crossentropy', optimizer='adam')
        print('Compiled:', str(model_name))

        # Initiate training; the History object gets its own name instead of
        # rebinding the loop variable that holds the builder.
        history = model.fit(x, y,
              batch_size=100,
              epochs=epochs,
              callbacks=[print_callback, checkpoint])

        # Print model configuration
        model.summary()

        # Save model history dict for later analysis
        with open('%s.pkl' % model_name, 'wb') as file_pi:
            pickle.dump(history.history, file_pi)

test_models(all_models, epochs=10)

Initiating compilation...
Compiled: SimpleRNN_stacked_model
Epoch 1/10