In [1]:
import numpy as np

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.utils import np_utils

Using TensorFlow backend.


# Explore text

In [2]:
# Read the whole corpus into memory and lower-case it so upper- and lower-case
# letters share one symbol. The context manager closes the file handle as soon
# as the read finishes; "utf-8-sig" drops a leading byte-order mark if present.
with open('shakespeare.txt', encoding="utf-8-sig") as corpus_file:
    shakespeare_data = corpus_file.read().lower()

def replace_many(string_data, to_replace, new_string):
    """Return ``string_data`` with every substring in ``to_replace`` swapped for ``new_string``.

    Substitutions are applied one after another, in the order given by
    ``to_replace``, so a later pattern also sees text produced by earlier
    replacements.

    Args:
        string_data: The text to clean.
        to_replace: Iterable of substrings to substitute.
        new_string: The replacement text used for every substring.

    Returns:
        The text after all substitutions have been applied.
    """
    cleaned = string_data
    for pattern in to_replace:
        # str.replace leaves the text unchanged when the pattern is absent,
        # so no membership test is needed beforehand.
        cleaned = cleaned.replace(pattern, new_string)
    return cleaned

# Layout and markup characters that are deleted from the corpus.
chars_to_strip = ['\t', '\n', '[', ']', '\\', '_', '|', '}']
shakespeare_data = replace_many(shakespeare_data, chars_to_strip, "")

In [3]:
# Vocabulary: every distinct character of the corpus, in sorted order.
# sorted() accepts any iterable and always returns a list.
single_chars = sorted(set(shakespeare_data))
print(single_chars)

[' ', '!', '"', '$', '%', '&', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', '@', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'à', 'æ', 'è', 'œ', '—', '‘', '’', '“', '”']


In [4]:
# Total number of characters in the corpus string.
char_length = len(shakespeare_data)
print(char_length)

5471300


In [5]:
# Vocabulary size: the number of distinct characters found in the corpus.
unique_chars = len(single_chars)
print(unique_chars)

64


# Process to dictionary mapping

In [6]:
# Forward lookup: character -> integer id, where the id is the character's
# position in the sorted vocabulary.
char_mapping = dict(zip(single_chars, range(len(single_chars))))
print(char_mapping)

{' ': 0, '!': 1, '"': 2, '$': 3, '%': 4, '&': 5, "'": 6, '(': 7, ')': 8, '*': 9, ',': 10, '-': 11, '.': 12, '/': 13, '0': 14, '1': 15, '2': 16, '3': 17, '4': 18, '5': 19, '6': 20, '7': 21, '8': 22, '9': 23, ':': 24, ';': 25, '?': 26, '@': 27, '`': 28, 'a': 29, 'b': 30, 'c': 31, 'd': 32, 'e': 33, 'f': 34, 'g': 35, 'h': 36, 'i': 37, 'j': 38, 'k': 39, 'l': 40, 'm': 41, 'n': 42, 'o': 43, 'p': 44, 'q': 45, 'r': 46, 's': 47, 't': 48, 'u': 49, 'v': 50, 'w': 51, 'x': 52, 'y': 53, 'z': 54, 'à': 55, 'æ': 56, 'è': 57, 'œ': 58, '—': 59, '‘': 60, '’': 61, '“': 62, '”': 63}


In [7]:
# Reverse lookup: integer id -> character, the inverse of the position-based
# ids assigned to the sorted vocabulary.
reverse_mapping = dict(enumerate(single_chars))
print(reverse_mapping)

{0: ' ', 1: '!', 2: '"', 3: '$', 4: '%', 5: '&', 6: "'", 7: '(', 8: ')', 9: '*', 10: ',', 11: '-', 12: '.', 13: '/', 14: '0', 15: '1', 16: '2', 17: '3', 18: '4', 19: '5', 20: '6', 21: '7', 22: '8', 23: '9', 24: ':', 25: ';', 26: '?', 27: '@', 28: '`', 29: 'a', 30: 'b', 31: 'c', 32: 'd', 33: 'e', 34: 'f', 35: 'g', 36: 'h', 37: 'i', 38: 'j', 39: 'k', 40: 'l', 41: 'm', 42: 'n', 43: 'o', 44: 'p', 45: 'q', 46: 'r', 47: 's', 48: 't', 49: 'u', 50: 'v', 51: 'w', 52: 'x', 53: 'y', 54: 'z', 55: 'à', 56: 'æ', 57: 'è', 58: 'œ', 59: '—', 60: '‘', 61: '’', 62: '“', 63: '”'}


In [8]:
# Build the supervised training pairs with a sliding window over the corpus:
# each input is `char_threshold` consecutive character ids, and its output is
# the id of the character that immediately follows that window.
char_threshold = 100
# Number of windows; the last start index keeps the target position
# i + char_threshold inside the corpus.
char_counter = char_length - char_threshold

# Encode the corpus once up front. Looking characters up inside the loop would
# repeat the dictionary lookup for every character in every window it falls in.
encoded_text = [char_mapping[char] for char in shakespeare_data]

inputs = []
outputs = []
for i in range(char_counter):
    # Ids of the `char_threshold` characters starting at position i
    inputs.append(encoded_text[i:i + char_threshold])
    # Id of the character right after the window: the value to predict
    outputs.append(encoded_text[i + char_threshold])
    


In [10]:
# Inspect the input window at index 1: a list of char_threshold character ids.
print(inputs[1])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 48, 36, 33, 0, 47, 43, 42, 42, 33, 48, 47, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 29, 40, 40, 61, 47, 0, 51, 33, 40, 40, 0, 48, 36, 29, 48, 0, 33, 42, 32, 47, 0, 51, 33, 40, 40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 48, 36, 33, 0, 48, 46, 29, 35, 33, 32, 53, 0, 43, 34, 0, 29, 42, 48, 43, 42]


In [11]:
# Inspect the target paired with inputs[1]: the id of the character that follows that window.
print(outputs[1])

53


# Understanding one-hot encoding for char maps
    We convert our outputs (the character to predict for each input window) into one-hot vectors, stored below in data_to_use.
    A one-hot vector is essentially an array of 0s and 1s.
    The 1 only occurs at the position that corresponds to the character_id that we mapped earlier.
    For example, say we have some unique character IDs, [12, 11, 6, 3, 1].
    Then say we have 1 single data output equal to 11, output = ([[0, 1, 0, 0, 0]]). Notice how the 1 only occurs at the position of 11. So it means we have a binary
    representation of that character relative to all the other chars. (With Keras's to_categorical, the position of the 1 is simply the character ID itself.)
    We get one such vector for every input window of char_threshold characters that we built earlier.

In [14]:

# Keras LSTM layers expect input shaped (samples, timesteps, features).
# The trailing 1 means every timestep carries a single feature: the character id.
reshaped_data = np.reshape(inputs, (len(inputs), char_threshold, 1))

# Normalize: ids run from 0 to unique_chars - 1, so dividing by the vocabulary
# size scales them into [0, 1).
reshaped_data = reshaped_data / float(unique_chars)

# One-hot encode the targets for the softmax output layer. num_classes is
# pinned to the vocabulary size so the encoding is always unique_chars wide;
# without it to_categorical infers max(outputs) + 1, which would be too narrow
# if the highest character id never occurs as a target.
data_to_use = np_utils.to_categorical(outputs, num_classes=unique_chars)

# Building a model with Keras

In [None]:
# Single-layer character model: LSTM -> dropout -> softmax over the classes.
# The input shape is (timesteps, features) taken from the prepared data; the
# number of samples is not part of the layer's input_shape.
n_timesteps, n_features = reshaped_data.shape[1:]
n_classes = data_to_use.shape[1]

model = Sequential([
    LSTM(256, input_shape=(n_timesteps, n_features)),
    Dropout(0.2),
    Dense(n_classes, activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Train, then persist the learned weights to disk.
model.fit(reshaped_data, data_to_use, epochs=5, batch_size=128)
model.save_weights("shakespeare.hdf5")
# To reuse previously saved weights instead of training, load them here:
#model.load_weights("Othello.hdf5")

Instructions for updating:
Use tf.cast instead.
Epoch 1/5