In [26]:
import re
from numpy import array
from pickle import dump
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.callbacks import EarlyStopping, ModelCheckpoint
# load doc into memory
def load_doc(filename):
    """Read a whole text file and return its contents as a single string.

    Args:
        filename: Path of the file to read. It is opened in text mode ('r')
            with no explicit encoding, so open() uses its default.

    Returns:
        The complete text of the file.
    """
    # the context manager closes the handle even if read() raises
    with open(filename, 'r') as file:
        return file.read()
# load text
# Two newlines are prepended so that the very first line can also be matched
# by the pattern below, which needs a blank line on both sides.
spenser = "\n\n" + load_doc('data/spenser.txt')
# Collapse "blank line, one line of text, blank line" into a single blank
# line (presumably removes heading lines - TODO confirm against the data),
# then drop the two leading newlines again.
spenser = re.sub("\n\n(.*)\n\n", "\n\n", spenser)[2:]
# Shakespeare text with every digit character removed
shakes = ''.join(ch for ch in load_doc('data/shakespeare.txt') if not ch.isdigit())
raw_text = spenser + shakes
# clean: collapse every run of whitespace (newlines included) to one space
tokens = raw_text.split()
raw_text = ' '.join(tokens)

In [28]:
# organize into sequences of characters
length = 40
n = 1
# Slide a window over the text in steps of n; each window holds `length`
# characters followed by one more character (length + 1 in total).
sequences = [raw_text[end - length:end + 1]
             for end in range(length, len(raw_text), n)]

# save strings to file, one per line
def save_doc(lines, filename):
    """Write the given strings to a text file, separated by newlines.

    Args:
        lines: Iterable of strings; they are joined with '\\n', so no
            trailing newline is written after the last one.
        filename: Path of the file to create or overwrite.
    """
    data = '\n'.join(lines)
    # the context manager flushes and closes the handle even if write() raises
    with open(filename, 'w') as file:
        file.write(data)
# save sequences to file
# Persist the character windows, one per line, to a file in the current
# working directory.
out_filename = 'char_sequences.txt'
save_doc(sequences, out_filename)

In [30]:
# load the saved sequences back, one per line
in_filename = 'char_sequences.txt'
raw_text = load_doc(in_filename)
lines = raw_text.split('\n')
# Vocabulary: every distinct character of the file in sorted order.
# NOTE(review): raw_text still contains the '\n' line separators here, so
# when the file has more than one line the newline gets an index as well.
chars = sorted(set(raw_text))
# character -> integer index lookup
mapping = {ch: idx for idx, ch in enumerate(chars)}

In [31]:
# integer encode every line, character by character, through the mapping
sequences = [[mapping[char] for char in line] for line in lines]
# vocabulary size
vocab_size = len(mapping)
print('Vocabulary Size: %d' % vocab_size)

Vocabulary Size: 64


In [32]:
# Turn the integer-encoded sequences into model inputs and targets.
# NOTE(review): this cell rebinds `sequences`, `X` and `y`, so it is not
# idempotent - re-run the encoding cell before running it a second time.
sequences = array(sequences)
# every column but the last is the input window; the last column is the target
X, y = sequences[:,:-1], sequences[:,-1]
# one-hot encode each input row separately, then stack the results
sequences = [to_categorical(x, num_classes=vocab_size) for x in X]
X = array(sequences)
# one-hot encode the targets
y = to_categorical(y, num_classes=vocab_size)
# displayed as the cell output
X.shape

(147093, 40, 64)

In [33]:
# define the model
def define_model(X):
    """Build, compile and summarize the LSTM -> softmax model.

    Args:
        X: Array whose shape[1] and shape[2] give the input shape of the
            LSTM layer.

    Returns:
        The compiled Sequential model. The Dense output width comes from the
        notebook-level `vocab_size`, not from X.
    """
    timesteps, features = X.shape[1], X.shape[2]
    recurrent = LSTM(150, input_shape=(timesteps, features))
    classifier = Dense(vocab_size, activation='softmax')
    model = Sequential()
    for layer in (recurrent, classifier):
        model.add(layer)
    # compile model
    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    # print the layer summary
    model.summary()
    return model

In [34]:
# define model
model = define_model(X)
# checkpoint file name template; epoch number and loss are filled in per save
filepath="weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
# Both callbacks monitor 'loss'. fit() below is given no validation data, so
# this is the training loss.
# EarlyStopping: min_delta=0.005, patience=2.
# ModelCheckpoint: save_best_only=False, so a file is written after every epoch.
callbacks = [EarlyStopping(monitor='loss', min_delta=0.005, patience=2),
             ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=False, mode='min')]

# train for at most 100 epochs; verbose=2 prints one line per epoch
model.fit(X, y, epochs=100, verbose=2, callbacks=callbacks)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 150)               129000    
_________________________________________________________________
dense_1 (Dense)              (None, 64)                9664      
Total params: 138,664
Trainable params: 138,664
Non-trainable params: 0
_________________________________________________________________
Epoch 1/100
 - 261s - loss: 2.2602 - accuracy: 0.3566

Epoch 00001: saving model to weights-improvement-01-2.2602.hdf5
Epoch 2/100
 - 283s - loss: 1.9104 - accuracy: 0.4360

Epoch 00002: saving model to weights-improvement-02-1.9104.hdf5
Epoch 3/100
 - 247s - loss: 1.7758 - accuracy: 0.4676

Epoch 00003: saving model to weights-improvement-03-1.7758.hdf5
Epoch 4/100
 - 247s - loss: 1.6846 - accuracy: 0.4891

Epoch 00004: saving model to weights-improvement-04-1.6846.hdf5
Epoch 5/100
 - 835s - loss: 1.6191

<keras.callbacks.callbacks.History at 0x10de48150>

In [35]:
# save the model to file
model.save('model_spenser_shakes.h5')
# save the mapping; the context manager closes the pickle file so the data
# is flushed to disk instead of waiting for the handle to be garbage collected
with open('mapping_spenser_shakes.pkl', 'wb') as mapping_file:
    dump(mapping, mapping_file)