In [1]:
import gc
import numpy as np

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.utils import np_utils

Using TensorFlow backend.


In [2]:
# Make sure automatic garbage collection is switched on for this kernel
gc.enable()

# Explore text

In [3]:
# Read the whole corpus and lower-case it. The context manager closes the
# file handle as soon as the text has been read instead of leaving it open.
with open('shakespeare.txt', encoding="utf-8-sig") as corpus_file:
    shakespeare_data = corpus_file.read().lower()

def replace_many(string_data, to_replace, new_string):
    """Return ``string_data`` with each substring in ``to_replace`` swapped for ``new_string``.

    The substitutions are applied one after another, in the order given, so a
    later entry only sees the text left behind by the earlier ones.
    """
    cleaned = string_data
    for token in to_replace:
        # str.replace hands back the text unchanged when the token is absent
        cleaned = cleaned.replace(token, new_string)
    return cleaned

# Delete layout characters, stray symbols and digits from the corpus by
# replacing each of them with the empty string.
# NOTE(review): '10' can never match, because '0' and '1' are removed before it.
# NOTE(review): deleting '\n' outright glues the last word of a line to the
# first word of the next one - confirm that this is intended.
shakespeare_data = replace_many(
    shakespeare_data,
    ['\t', '\n', '[', ']', '\\', '_', '|', '}', '%', '$']
    + [str(number) for number in range(11)]  # '0' ... '9', then '10'
    + ['@'],
    "",
)

In [4]:
# Every distinct character of the corpus, in sorted order
single_chars = sorted(set(shakespeare_data))
print(single_chars)

[' ', '!', '"', '&', "'", '(', ')', '*', ',', '-', '.', '/', ':', ';', '?', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'à', 'æ', 'è', 'œ', '—', '‘', '’', '“', '”']


In [5]:
# Total number of characters held in shakespeare_data
char_length = len(shakespeare_data)
print(char_length)

5469373


In [6]:
# Vocabulary size: how many distinct characters single_chars holds
unique_chars = len(single_chars)
print(unique_chars)

51


# Process to dictionary mapping

In [7]:
# character -> integer id, where the id is the character's position in single_chars
char_mapping = dict(zip(single_chars, range(len(single_chars))))
print(char_mapping)

{' ': 0, '!': 1, '"': 2, '&': 3, "'": 4, '(': 5, ')': 6, '*': 7, ',': 8, '-': 9, '.': 10, '/': 11, ':': 12, ';': 13, '?': 14, '`': 15, 'a': 16, 'b': 17, 'c': 18, 'd': 19, 'e': 20, 'f': 21, 'g': 22, 'h': 23, 'i': 24, 'j': 25, 'k': 26, 'l': 27, 'm': 28, 'n': 29, 'o': 30, 'p': 31, 'q': 32, 'r': 33, 's': 34, 't': 35, 'u': 36, 'v': 37, 'w': 38, 'x': 39, 'y': 40, 'z': 41, 'à': 42, 'æ': 43, 'è': 44, 'œ': 45, '—': 46, '‘': 47, '’': 48, '“': 49, '”': 50}


In [8]:
# integer id -> character, the inverse lookup used to turn predictions back into text
reverse_mapping = dict(enumerate(single_chars))
print(reverse_mapping)

{0: ' ', 1: '!', 2: '"', 3: '&', 4: "'", 5: '(', 6: ')', 7: '*', 8: ',', 9: '-', 10: '.', 11: '/', 12: ':', 13: ';', 14: '?', 15: '`', 16: 'a', 17: 'b', 18: 'c', 19: 'd', 20: 'e', 21: 'f', 22: 'g', 23: 'h', 24: 'i', 25: 'j', 26: 'k', 27: 'l', 28: 'm', 29: 'n', 30: 'o', 31: 'p', 32: 'q', 33: 'r', 34: 's', 35: 't', 36: 'u', 37: 'v', 38: 'w', 39: 'x', 40: 'y', 41: 'z', 42: 'à', 43: 'æ', 44: 'è', 45: 'œ', 46: '—', 47: '‘', 48: '’', 49: '“', 50: '”'}


In [9]:
# Number of characters in every input window fed to the network
char_threshold = 150
# Number of (window, next character) pairs that fit inside the corpus
char_counter = char_length - char_threshold

# Map the corpus to integer ids once. Consecutive windows overlap in all but
# one character, so re-mapping each window character by character would
# repeat almost every dictionary lookup char_threshold times.
encoded_text = [char_mapping[char] for char in shakespeare_data]

inputs = []
outputs = []
for i in range(char_counter):
    # ids of the char_threshold characters starting at position i
    inputs.append(encoded_text[i:i + char_threshold])
    # id of the single character that directly follows that window (the target)
    outputs.append(encoded_text[i + char_threshold])
    


In [10]:
print(inputs[1])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 35, 23, 20, 0, 34, 30, 29, 29, 20, 35, 34, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, 27, 27, 48, 34, 0, 38, 20, 27, 27, 0, 35, 23, 16, 35, 0, 20, 29, 19, 34, 0, 38, 20, 27, 27, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 35, 23, 20, 0, 35, 33, 16, 22, 20, 19, 40, 0, 30, 21, 0, 16, 29, 35, 30, 29, 40, 0, 16, 29, 19, 0, 18, 27, 20, 30, 31, 16, 35, 33, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, 34, 0, 40, 30, 36, 0, 27, 24, 26, 20, 0, 24, 35, 0, 0, 0, 0, 0, 0]


# Understanding one-hot encoding for char maps
    We convert the target characters (`outputs`) into one-hot vectors with `to_categorical` below.
    A one-hot vector is an array of 0s and 1s.
    The 1 occurs only at the position of the character id that we mapped earlier; every other position is 0.
    For example, say we have five unique character IDs, [12, 11, 6, 3, 1].
    If a single output is the character with ID 11, it is encoded as output = ([[0, 1, 0, 0, 0]]). Notice how the 1 occurs only at the position of 11.
    So we have a binary representation of that character relative to all the other characters.
    We get one such vector for every input window of `char_threshold` characters built earlier.

In [11]:

# The trailing 1 is the feature axis: every timestep carries a single value
# (one character id), which gives the (samples, timesteps, features) layout
# that the recurrent layer is fed with.
reshaped_data = np.reshape(inputs, (len(inputs), char_threshold, 1))

# Scale the integer ids by the vocabulary size so that every value lies in [0, 1)
reshaped_data = reshaped_data / float(unique_chars)

# One-hot encode the targets. The class count is pinned to the vocabulary size
# so that the width of the output layer does not depend on whether the highest
# character id happens to occur as a target.
cats = np_utils.to_categorical(outputs, num_classes=unique_chars)

# Building a model with Keras

In [36]:
import tensorflow as tf
from keras import regularizers  # NOTE(review): not referenced anywhere in this cell

# Build, train and checkpoint the network inside the '/gpu:0' device scope
with tf.device('/gpu:0'):

    model = Sequential([
        # Recurrent layer with 42 units. No input_shape is passed here, so the
        # (timesteps, features) shape is presumably inferred from the data
        # handed to fit() - confirm for the installed Keras version.
        LSTM(
            42,
            activation='tanh',
            recurrent_activation='hard_sigmoid',
            use_bias=True,
            kernel_initializer='he_uniform',
            recurrent_initializer='orthogonal',
            bias_initializer='zeros',
            unit_forget_bias=True,
            kernel_regularizer=None,
            recurrent_regularizer=None,
            bias_regularizer=None,
            activity_regularizer=None,
            kernel_constraint=None,
            recurrent_constraint=None,
            bias_constraint=None,
            dropout=0.0,
            recurrent_dropout=0.1,
            implementation=1,
            return_sequences=False,
            return_state=False,
            go_backwards=False,
            stateful=False,
            unroll=False,
        ),
        # Output layer: one softmax unit per column of the one-hot targets
        Dense(cats.shape[1], activation='softmax'),
    ])
    model.compile(optimizer='RMSprop', loss='categorical_crossentropy')
    model.fit(reshaped_data, cats, batch_size=4024, epochs=2)
    # Store the trained weights on disk
    model.save_weights("shakespeare.hdf5")

Epoch 1/2
Epoch 2/2


In [37]:
# Load the weights stored in "shakespeare.hdf5" (the file the training cell writes) into the model
model.load_weights("shakespeare.hdf5")

In [56]:
# Draw ten random seed windows and let the model predict the next character of each
for i in range(10):
    # randint's upper bound is exclusive, so len(inputs) makes every window eligible
    randomVal = np.random.randint(0, len(inputs))
    # Work on a copy: appending to inputs[randomVal] itself would lengthen the
    # stored training window and leave `inputs` ragged
    randomStart = list(inputs[randomVal])
    # Same layout and scaling as the training data: (1, timesteps, 1), ids / vocabulary size
    x = np.reshape(randomStart, (1, len(randomStart), 1))
    x = x/float(unique_chars)
    pred = model.predict(x)
    # Greedy decoding: take the id with the highest predicted probability
    index = int(np.argmax(pred))
    # Slide the window one step: drop the oldest character, append the predicted one
    randomStart = randomStart[1:] + [index]
    print("".join([reverse_mapping[value] for value in randomStart]))

n his soulo’er which his melancholy sits on brood,and i do doubt the hatch and the disclosewill be some danger, which for to prevent,i have in quick t
e laid most heavy hand.  soothsayer. the fingers of the pow'rs above do tune    the harmony of this peace. the vision    which i made known to lucius 
 to him.  agrippa. let us go.    good enobarbus, make yourself my guest    whilst you abide here.  enobarbus. humbly, sir, i thank you.               
 in health.  valentine. how does your lady, and how thrives your love?  proteus. my tales of love were wont to weary you;    i know you joy not in a s
rt and a smock.nurse.peter!peter.anon.nurse.my fan, peter.mercutio.good peter, to hide her face; for her fan’s the fairer face.nurse.god ye good morr 
fore prepare thyself;the bark is ready, and the wind at help,th’associates tend, and everything is bentfor england.hamlet.for england?king.ay, hamlet 
repose, to be asleep    with eyes wide open; standing, speaking, moving,    and yet so fast as