In [1]:
import numpy as np 
import tensorflow as tf 
import keras
from keras.models import Sequential
from keras.layers.recurrent import LSTM
from keras.layers.core import Dense, Activation, Flatten, Dropout
import matplotlib.pyplot as plt

Using TensorFlow backend.


Import raw data

In [2]:
# Read the corpus line by line, discarding every line that is nothing but
# digits once surrounding whitespace is stripped, and lower-casing the rest.
with open("data/shakespeare.txt") as f:
    kept_lines = [line.lower() for line in f if not line.strip().isdigit()]
sonnets = "".join(kept_lines)

Generate encoding of each character.

In [3]:
def generate_onehot_dict_and_reverse(word_list):
    """
    Build forward and reverse index lookups for the distinct items of a sequence.

    Iterates over `word_list` (the individual characters, when a string is
    passed) and numbers each distinct item 0, 1, 2, ... in order of first
    appearance.

    Returns a tuple `(word_to_index, index_to_word)`:
      - word_to_index maps each distinct item to its integer index
      - index_to_word maps each integer index back to its item
    """
    word_to_index, index_to_word = {}, {}
    for token in word_list:
        if token in word_to_index:
            continue  # already numbered on an earlier occurrence
        position = len(word_to_index)  # next unused index
        word_to_index[token] = position
        index_to_word[position] = token
    return word_to_index, index_to_word

Split the text into 40-character sequences, each paired with the character that follows it, to make the training data.

In [4]:
# Slide a 40-character window over the corpus: each window is one input
# sequence and the character immediately after it is the prediction target.
# Note: the range ends at len(sonnets) - 41, so the final possible window
# (whose target would be the very last character) is not generated.
window_starts = range(len(sonnets) - 41)
train_x = [sonnets[start:start + 40] for start in window_starts]
train_y = [sonnets[start + 40] for start in window_starts]

Encode the x vector.

In [5]:
# Build the character <-> index lookups from the whole corpus, then replace
# every character in every training window with its integer index.
dic, index_dic = generate_onehot_dict_and_reverse(sonnets)
train_x = [
    [dic[char] for char in window]
    for window in train_x
]

One hot encode the y.

In [6]:
# Map each target character to its integer index, then expand the indices
# into one-hot rows.
train_y = keras.utils.np_utils.to_categorical(
    [dic[target_char] for target_char in train_y]
)

Reshape the inputs and labels into arrays, then take a look at their shapes to confirm everything looks good.

In [7]:
# Arrange the inputs as (number of sequences, 40 timesteps, 1 feature per timestep)
num_sequences = len(train_x)
train_x = np.array(train_x).reshape(num_sequences, 40, 1)

In [8]:
# Arrange the one-hot labels as (number of labels, 38 classes)
num_labels = len(train_y)
train_y = np.array(train_y).reshape(num_labels, 38)

In [9]:
train_x.shape

(94554, 40, 1)

In [10]:
train_y.shape

(94554, 38)

In [16]:
# Two stacked 150-unit LSTM layers (dropout of 0.2 between them) reading
# 40 timesteps of 1 feature each, followed by a 38-way softmax output.
model = Sequential([
    LSTM(150, input_shape=(40, 1), return_sequences=True),
    Dropout(0.2),
    LSTM(150),
    Dense(38),
    Activation('softmax'),
])

## Print each layer's output shape and parameter count
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_11 (LSTM)               (None, 40, 150)           91200     
_________________________________________________________________
dropout_5 (Dropout)          (None, 40, 150)           0         
_________________________________________________________________
lstm_12 (LSTM)               (None, 150)               180600    
_________________________________________________________________
dense_1 (Dense)              (None, 38)                5738      
_________________________________________________________________
activation_1 (Activation)    (None, 38)                0         
Total params: 277,538
Trainable params: 277,538
Non-trainable params: 0
_________________________________________________________________


In [None]:
## The labels are one-hot encoded, so 'categorical_crossentropy' is the matching loss.
## Adam is used as the optimizer; accuracy is tracked as an additional metric.
model.compile(loss='categorical_crossentropy', optimizer='Adam', metrics=['accuracy'])

## `epochs` is the Keras 2 name for the deprecated `nb_epoch` argument.
fit = model.fit(train_x, train_y, batch_size=200, epochs=20,
    verbose=1)

## This notebook has no held-out split: the model is evaluated on the same arrays it was
## trained on, so these numbers describe training fit, not generalisation.
## score[0] is the loss and score[1] is the accuracy metric requested in model.compile above.
score = model.evaluate(train_x, train_y, verbose=0)
print('Train loss:', score[0])
print('Train accuracy:', score[1])



Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20

In [None]:
# Seed the generator with the first training window and decode that window
# back into text so the output starts with the seed.
seed = train_x[0]
s = "".join(index_dic[step[0]] for step in seed)

newline_index = dic["\n"]
break_counter = 0
# Sample one character at a time until 14 newline characters have been generated.
while break_counter < 14:
    # Probability of each class for the next character, given the current 40-step window.
    p = model.predict(np.array(seed).reshape(1, 40, 1), verbose=0)[0]
    # Draw the next class index from that distribution; the number of classes
    # is taken from the prediction itself rather than hard-coded.
    n = np.random.choice(len(p), p=p)
    c = index_dic[n]
    s += c
    if n == newline_index:
        break_counter += 1
    # Slide the window: drop the oldest index and append the one just sampled.
    seed = np.append(seed[1:], [n])

In [None]:
# Show the seed text followed by the generated characters
print(s)