In [1]:
import numpy as np 
import tensorflow as tf 
import keras
from keras.models import Sequential
from keras.layers.recurrent import LSTM
from keras.layers.core import Dense, Activation, Flatten, Dropout
import matplotlib.pyplot as plt

Using TensorFlow backend.


Import raw data

In [2]:
# Read the corpus into one lower-cased string. Lines that consist solely of
# digits (after stripping surrounding whitespace) are dropped; every other
# line, including blank ones, is kept with its trailing newline.
with open("data/shakespeare.txt") as f:
    sonnets = "".join(
        line.lower() for line in f if not line.strip().isdigit()
    )

Generate encoding of each character.

In [3]:
def generate_onehot_dict_and_reverse(word_list):
    """
    Build a forward and a reverse index lookup for the items of an iterable.

    Each distinct item in `word_list` (the characters, when a string is
    passed) is assigned the next free integer, starting at 0, in order of
    first appearance.

    Returns a tuple `(word_to_index, index_to_word)` where the first dict maps
    item -> index and the second maps index -> item.
    """
    word_to_index = {}
    for word in word_list:
        # setdefault only inserts on first sight, so the current dict size is
        # always the next unused index.
        word_to_index.setdefault(word, len(word_to_index))
    index_to_word = {index: word for word, index in word_to_index.items()}
    return word_to_index, index_to_word

Split the text into 40-character sequences, each paired with the character that follows it, to make the training data.

In [4]:
# Slide a 40-character window over the text: each window is one input and the
# character immediately after it is the matching target.
# NOTE: the range ends at len(sonnets) - 41, so the very last character of the
# text is never used as a target.
window_starts = range(len(sonnets) - 41)
train_x = [sonnets[start:start + 40] for start in window_starts]
train_y = [sonnets[start + 40] for start in window_starts]

Encode the x vector.

In [5]:
# Build the character <-> index lookup tables from the full text, then replace
# every character of every training window with its integer index.
# NOTE: train_x is overwritten in place, so this cell cannot be run twice in a
# row (the second run would look up integers in a character-keyed dict).
dic, index_dic = generate_onehot_dict_and_reverse(sonnets)
train_x = [[dic[char] for char in window] for window in train_x]

One-hot encode the y labels.

In [6]:
# Map each target character to its integer index, then expand those indices
# into one-hot rows.
train_y = keras.utils.np_utils.to_categorical([dic[char] for char in train_y])

Reshape the arrays into the layout the model expects, then take a look at their shapes to confirm everything looks good.

In [7]:
# Lay the inputs out as (samples, 40 time steps, 1 feature per step), which
# matches the input_shape given to the LSTM layer.
train_x = np.reshape(np.array(train_x), (len(train_x), 40, 1))

In [8]:
# Lay the one-hot labels out as (samples, 38); 38 is also the width of the
# model's output layer.
train_y = np.reshape(np.array(train_y), (len(train_y), 38))

In [9]:
# Display the input shape; after the reshape above it should be (samples, 40, 1).
np.shape(train_x)

(94554, 40, 1)

In [10]:
# Display the label shape; after the reshape above it should be (samples, 38).
np.shape(train_y)

(94554, 38)

In [11]:
## A single LSTM layer reads the 40-step sequence of character indices; a dense
## layer followed by softmax turns its output into a distribution over the 38
## possible next characters.
model = Sequential([
    LSTM(150, input_shape=(40, 1)),
    Dense(38),
    Activation('softmax'),
])

## Printing a summary of the layers and weights in the model
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 150)               91200     
_________________________________________________________________
dense_1 (Dense)              (None, 38)                5738      
_________________________________________________________________
activation_1 (Activation)    (None, 38)                0         
Total params: 96,938
Trainable params: 96,938
Non-trainable params: 0
_________________________________________________________________


In [12]:
## The labels are one-hot encoded, so the loss is 'categorical_crossentropy'.
## Adam is used as the optimizer, and accuracy is tracked alongside the loss.
model.compile(loss='categorical_crossentropy',optimizer='Adam', metrics=['accuracy'])

## `epochs` is the Keras 2 name of the argument formerly called `nb_epoch`;
## using it avoids the deprecation warning and keeps the call working on
## releases that dropped the old name.
fit = model.fit(train_x, train_y, batch_size=200, epochs=40,
    verbose=1)

## Printing the loss and accuracy of our model (score[0] is the loss, score[1]
## the accuracy metric requested in model.compile above).
## NOTE: the model is evaluated on the same data it was trained on, so despite
## the 'Test' labels these are training-set figures, not held-out performance.
score = model.evaluate(train_x, train_y, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])



Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
Test score: 1.68429250594
Test accuracy: 0.49663684244


In [15]:
# Seed the generator with the first training window and decode it back into
# text so the output begins with the seed characters.
seed = train_x[0]
s = "".join(index_dic[step[0]] for step in seed)

# Sample one character at a time until 14 newline characters have been drawn.
newline_index = dic["\n"]
break_counter = 0
while break_counter < 14:
    # The model expects a batch of shape (1, 40, 1); take the single row of
    # predicted probabilities.
    p = model.predict_proba(np.array(seed).reshape(1, 40, 1), verbose=0)[0]
    # Draw the next character index at random according to those probabilities.
    n = np.random.choice(np.arange(38), p=p)
    c = index_dic[n]
    s += c
    if n == newline_index:
        break_counter += 1
    # Slide the window: drop the oldest index and append the sampled one.
    seed = np.append(seed[1:], [n])

In [16]:
# Use print() so the newline characters in the generated text are rendered as
# line breaks.
print(s)

from fairest creatures we desire increasid,
fouraie jikdinatous seem thy it sor lros,
that is the pittel the touth uorws forsyt.


what sponsane she wisasp uilt me a bove, worn the wintered,
in lnilfua stay that wan bealen goitt.
a are to somw tefory wimeso hreet mmin.
m bow loeeds hut oake thous owagle sweet be,
   f in kowe d r chat fictesed fartt,
and do bou and meisew far i how as gous:
lesiexy woofoot, retous-yabe be flu tpey me now wase,
 nd whills iyes faarer zike eyet mivht


