In [9]:
import os
import re
import string
import numpy as np

from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

In [10]:
def parse_observations(text):
    """Encode ``text`` as integer word ids, one sequence per stanza.

    Lines are stripped of surrounding whitespace. Blank lines are ignored.
    A non-blank line of at most three characters is treated as a poem
    header (e.g. a sonnet number): it is not encoded and it restarts the
    per-poem line count. Every other line is split on whitespace; each
    token is lowercased and stripped of every non-word character.

    A stanza is closed after every fourth line of a poem and after the
    fourteenth line, so a 14-line sonnet yields sequences of 4, 4, 4 and
    2 lines.

    Args:
        text: The full corpus as a single string.

    Returns:
        A tuple ``(obs, obs_map)`` where ``obs`` is a list of stanzas, each
        a list of integer word ids, and ``obs_map`` maps each distinct
        cleaned word to its id. Ids are assigned in order of first
        appearance, starting at 0.
    """
    obs = []
    obs_map = {}
    obs_elem = []
    line_counter = 0

    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        if len(line) <= 3:
            # Poem header. Close any partially filled stanza first so the
            # tail of an irregular-length poem does not end up glued to the
            # first stanza of the next poem.
            if obs_elem:
                obs.append(obs_elem)
                obs_elem = []
            line_counter = 0
            continue

        for token in line.split():
            # make all words lowercase and remove punctuation
            word = re.sub(r'[^\w]', '', token).lower()
            if not word:
                # The token was pure punctuation (e.g. "--"); it must not
                # become a vocabulary entry.
                continue
            # Unseen words get the next free id, which equals the map size.
            obs_elem.append(obs_map.setdefault(word, len(obs_map)))

        line_counter += 1
        if (line_counter % 4 == 0 or line_counter == 14) and obs_elem:
            # Add the encoded sequence after the end of each stanza.
            obs.append(obs_elem)
            obs_elem = []

    # Keep a trailing partial stanza instead of silently dropping it.
    if obs_elem:
        obs.append(obs_elem)

    return obs, obs_map

def parse_lines(text):
    """Encode ``text`` as integer word ids, one sequence per line.

    Lines are stripped of surrounding whitespace. Lines of three characters
    or fewer (blank lines and short headers such as sonnet numbers) are
    skipped. Every other line is split on whitespace; each token is
    lowercased and stripped of every non-word character.

    Args:
        text: The full corpus as a single string.

    Returns:
        A tuple ``(obs, obs_map)`` where ``obs`` is a list of lines, each a
        list of integer word ids, and ``obs_map`` maps each distinct
        cleaned word to its id. Ids are assigned in order of first
        appearance, starting at 0.
    """
    obs = []
    obs_map = {}

    for raw_line in text.split('\n'):
        line = raw_line.strip()
        # don't include sonnet numbers or blank lines
        if len(line) <= 3:
            continue

        obs_elem = []
        for token in line.split():
            # make all words lowercase and remove punctuation
            word = re.sub(r'[^\w]', '', token).lower()
            if not word:
                # The token was pure punctuation; it must not become a
                # vocabulary entry.
                continue
            # Unseen words get the next free id, which equals the map size.
            obs_elem.append(obs_map.setdefault(word, len(obs_map)))

        # A line made only of punctuation contributes no sequence.
        if obs_elem:
            obs.append(obs_elem)

    return obs, obs_map

def parse_poems(text):
    """Encode ``text`` as integer word ids, one sequence per poem.

    Lines are stripped of surrounding whitespace. Blank lines are ignored.
    A non-blank line of at most three characters is treated as a poem
    header (e.g. a sonnet number): it is not encoded and it marks the
    boundary between two poems. Every other line is split on whitespace;
    each token is lowercased and stripped of every non-word character.

    A poem is everything between two headers, or between the last header
    and the end of the text. For 14-line poems separated by headers this
    matches counting 14 lines per poem. Poems of any other length are also
    kept as their own sequence rather than being merged into the next poem
    or dropped.

    Args:
        text: The full corpus as a single string.

    Returns:
        A tuple ``(obs, obs_map)`` where ``obs`` is a list of poems, each a
        list of integer word ids, and ``obs_map`` maps each distinct
        cleaned word to its id. Ids are assigned in order of first
        appearance, starting at 0.
    """
    obs = []
    obs_map = {}
    obs_elem = []

    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        if len(line) <= 3:
            # Poem header: close the poem collected so far, if any.
            if obs_elem:
                obs.append(obs_elem)
                obs_elem = []
            continue

        for token in line.split():
            # make all words lowercase and remove punctuation
            word = re.sub(r'[^\w]', '', token).lower()
            if not word:
                # The token was pure punctuation; it must not become a
                # vocabulary entry.
                continue
            # Unseen words get the next free id, which equals the map size.
            obs_elem.append(obs_map.setdefault(word, len(obs_map)))

    # The last poem has no header after it, so close it here.
    if obs_elem:
        obs.append(obs_elem)

    return obs, obs_map

In [11]:
# Read the raw corpus. The context manager closes the file handle as soon as
# the read finishes instead of leaving it open for the kernel's lifetime.
with open(os.path.join(os.getcwd(), 'data/shakespeare.txt')) as corpus_file:
    text = corpus_file.read()

# Encode each poem as a list of integer word ids and get the word -> id map.
poems, word_map = parse_poems(text)

# Flatten the per-poem sequences into one continuous stream of word ids;
# the sliding-window cell below works on this flat list.
all_words = [word_id for poem in poems for word_id in poem]
words = all_words

# Corpus size in tokens and number of distinct words.
n_words = len(words)
n_vocab = len(word_map)

In [12]:
# Build integer-encoded (input window, next word) training pairs.
seq_length = 7

# Slide a window of seq_length words over the corpus one word at a time:
# the window is the input and the word right after it is the target.
window_starts = range(n_words - seq_length)
dataX = [list(words[start:start + seq_length]) for start in window_starts]
dataY = [words[start + seq_length] for start in window_starts]

n_sequences = len(dataX)

In [13]:
# reshape X to be [samples, time steps, features]
X = np.reshape(dataX, (n_sequences, seq_length, 1))

# scale the integer word ids into the range [0, 1)
X = X / float(n_vocab)

# One-hot encode the output variable. Passing num_classes pins the width of
# y to the vocabulary size; without it the width is inferred from the largest
# id that happens to occur in dataY, which can be smaller than n_vocab.
y = np_utils.to_categorical(dataY, num_classes=n_vocab)
print(X)
print(y)

[[[0.00000000e+00]
  [3.14861461e-04]
  [6.29722922e-04]
  ...
  [1.25944584e-03]
  [1.57430730e-03]
  [1.88916877e-03]]

 [[3.14861461e-04]
  [6.29722922e-04]
  [9.44584383e-04]
  ...
  [1.57430730e-03]
  [1.88916877e-03]
  [2.20403023e-03]]

 [[6.29722922e-04]
  [9.44584383e-04]
  [1.25944584e-03]
  ...
  [1.88916877e-03]
  [2.20403023e-03]
  [2.51889169e-03]]

 ...

 [[1.53022670e-01]
  [1.33501259e-01]
  [2.51889169e-01]
  ...
  [9.99370277e-01]
  [4.54659950e-01]
  [4.54659950e-01]]

 [[1.33501259e-01]
  [2.51889169e-01]
  [4.56863980e-01]
  ...
  [4.54659950e-01]
  [4.54659950e-01]
  [9.99685139e-01]]

 [[2.51889169e-01]
  [4.56863980e-01]
  [9.99370277e-01]
  ...
  [4.54659950e-01]
  [9.99685139e-01]
  [4.94332494e-02]]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [14]:
# Network: a single 150-unit LSTM reading each input window, 20% dropout,
# then a softmax layer with one output per column of y.
lstm_input_shape = (X.shape[1], X.shape[2])
n_outputs = y.shape[1]

model = Sequential([
    LSTM(150, input_shape=lstm_input_shape),
    Dropout(0.2),
    Dense(n_outputs, activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [7]:
# Checkpointing: the file name embeds the epoch number and training loss,
# and a file is written only when the monitored loss beats the best so far.
filepath = "weights-improvement-word-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)
callbacks_list = [checkpoint]

In [8]:
# Fit the network for 20 epochs in mini-batches of 128 windows, running the
# checkpoint callback at the end of every epoch.
model.fit(
    X,
    y,
    epochs=20,
    batch_size=128,
    callbacks=callbacks_list,
)

Epoch 1/20

Epoch 00001: loss improved from inf to 6.90331, saving model to weights-improvement-word-01-6.9033.hdf5
Epoch 2/20

Epoch 00002: loss improved from 6.90331 to 6.46844, saving model to weights-improvement-word-02-6.4684.hdf5
Epoch 3/20

Epoch 00003: loss improved from 6.46844 to 6.44533, saving model to weights-improvement-word-03-6.4453.hdf5
Epoch 4/20

Epoch 00004: loss improved from 6.44533 to 6.43712, saving model to weights-improvement-word-04-6.4371.hdf5
Epoch 5/20

Epoch 00005: loss improved from 6.43712 to 6.43461, saving model to weights-improvement-word-05-6.4346.hdf5
Epoch 6/20

Epoch 00006: loss improved from 6.43461 to 6.43151, saving model to weights-improvement-word-06-6.4315.hdf5
Epoch 7/20

Epoch 00007: loss improved from 6.43151 to 6.43086, saving model to weights-improvement-word-07-6.4309.hdf5
Epoch 8/20

Epoch 00008: loss improved from 6.43086 to 6.43026, saving model to weights-improvement-word-08-6.4303.hdf5
Epoch 9/20

Epoch 00009: loss improved from 

<keras.callbacks.callbacks.History at 0x194f613a608>