In [1]:
import os
import re
import string
import numpy as np

from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

Using TensorFlow backend.


In [2]:
def parse_observations(text):
    # Convert text to dataset. Treat each stanza as a sequence.
    
    line_counter = 0 
    lines = [line.strip() for line in text.split('\n')]

    obs_counter = 0
    obs = []
    obs_map = {}
    obs_elem = []

    for line in lines:
        # don't include sonnet numbers or blank lines
        if len(line) in [1, 2, 3]:
            line_counter = 0
            continue
        elif len(line) == 0:
            continue
  
        words = [word.strip() for word in line.split(' ')]
        for word in words:
            # make all words lowercase and remove punctuation
            word = re.sub(r'[^\w]', '', word).lower()
            
            if word not in obs_map:
                # Add unique words to the observations map.
                obs_map[word] = obs_counter
                obs_counter += 1
            
            # Add the encoded word.
            obs_elem.append(obs_map[word])
        
        line_counter += 1
        if line_counter % 4 == 0 or line_counter == 14:
            # Add the encoded sequence after end of each stanza
            obs.append(obs_elem)
            obs_elem = []
        
    return obs, obs_map

def parse_lines(text):
    # Convert text to dataset. Treat each line as a sequence.
    
    lines = [line.strip() for line in text.split('\n')]

    obs_counter = 0
    obs = []
    obs_map = {}

    for line in lines:
        # don't include sonnet numbers or blank lines
        if len(line) in [0, 1, 2, 3]:
            continue
  
        obs_elem = []
        words = [word.strip() for word in line.split(' ')]
        for word in words:
            # make all words lowercase and remove punctuation
            word = re.sub(r'[^\w]', '', word).lower()
            
            if word not in obs_map:
                # Add unique words to the observations map.
                obs_map[word] = obs_counter
                obs_counter += 1
            
            # Add the encoded word.
            obs_elem.append(obs_map[word])
        
        obs.append(obs_elem)
               
    return obs, obs_map

def parse_poems(text):
    # Convert text to dataset. Treat each poem as a sequence.
    
    line_counter = 0 
    lines = [line.strip() for line in text.split('\n')]

    obs_counter = 0
    obs = []
    obs_map = {}
    obs_elem = []

    for line in lines:
        # don't include sonnet numbers or blank lines
        if len(line) in [1, 2, 3]:
            line_counter = 0
            continue
        elif len(line) == 0:
            continue
  
        words = [word.strip() for word in line.split(' ')]
        for word in words:
            # make all words lowercase and remove punctuation
            word = re.sub(r'[^\w]', '', word).lower()
            
            if word not in obs_map:
                # Add unique words to the observations map.
                obs_map[word] = obs_counter
                obs_counter += 1
            
            # Add the encoded word.
            obs_elem.append(obs_map[word])
        
        line_counter += 1
        if line_counter == 14:
            # Add the encoded sequence after end of each stanza
            obs.append(obs_elem)
            obs_elem = []
        
    return obs, obs_map

In [24]:
text = open(os.path.join(os.getcwd(), 'data/shakespeare.txt')).read()

# get words as numbers and get the word map
all_words = []
words, word_map = parse_poems(text)
for i in range(len(words)):
    for j in range(len(words[i])):
        all_words.append(words[i][j])
words = all_words
        

# remove sonnet numbers and convert to lowercase
text = re.sub(r'[0-9]+', '', tex
             ) 
text = text.lower()
n_words = len(words)
n_vocab = len(word_map)

NameError: name 'tex' is not defined

In [26]:
# prepare the dataset of input to output pairs encoded as integers
seq_length = 5
dataX = []
dataY = []

# use sliding window approach
for i in range(0, n_words - seq_length, 1):
    seq_in = words[i:i + seq_length]
    seq_out = words[i + seq_length]
    dataX.append([word for word in seq_in])
    dataY.append(seq_out)
    
n_sequences = len(dataX)
print(dataY)

IndexError: list index out of range

In [None]:
# reshape X to be [samples, time steps, features]
X = np.reshape(dataX, (n_sequences, seq_length, 1))

# normalize data to range (0, 1)
X = X / float(n_vocab)

# one hot encode the output variable
y = np_utils.to_categorical(dataY)
print(X)
print(y)

In [None]:
# define the LSTM model 
# (single layer with 150 units, followed by dense output layer)
model = Sequential()
model.add(LSTM(150, input_shape=(X.shape[1], X.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
# define the checkpoint
filepath="weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

In [None]:
# train the model
model.fit(X, y, epochs=20, batch_size=128, callbacks=callbacks_list)