Drawn from https://machinelearningmastery.com/text-generation-lstm-recurrent-neural-networks-python-keras/

In [1]:
# Small LSTM Network to Generate Text for Alice in Wonderland
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
from datetime import datetime
import os
import re

Using TensorFlow backend.


In [2]:
#"http://www.gutenberg.org/cache/epub/11/pg11.txt"
# Example scp command for copying the downloaded text file to a remote host:
# scp -i /c/blah/.ssh/blah.pem /d/blah/alice_in_wonderland_11-0.txt ubuntu@blah:~/blah/

In [3]:
# Load the ASCII text and convert it to lowercase.
filename = "alice_in_wonderland_11-0.txt"
# Read through a context manager so the file handle is closed as soon as the
# read finishes instead of staying open until it is garbage collected.
with open(filename) as text_file:
    raw_text = text_file.read()
raw_text = raw_text.lower()

In [4]:
# Sanity check: report the type and the length of the loaded text.
# Written as a single-argument print(...) call so it emits the same line under
# the Python 2 print statement and the Python 3 print function.
print("{} {}".format(type(raw_text), len(raw_text)))

<type 'str'> 173595


In [5]:
# Strip every character that is not an ASCII letter from the source text.
# Because the pattern keeps only a-z / A-Z, spaces, digits, punctuation and
# newlines are removed as well.
regex = re.compile('[^a-zA-Z]')

# Time only the substitution itself; the pattern is compiled before the clock starts.
startTime= datetime.now()
raw_text = regex.sub('', raw_text)
timeElapsed=datetime.now()-startTime
print('Time elapsed (hh:mm:ss.ms) {}'.format(timeElapsed))


Time elapsed (hh:mm:ss.ms) 0:00:00.011232


In [6]:
# Create the mapping of unique characters to integers.
# sorted() accepts any iterable, so the set does not need to be turned into a
# list first; sorting keeps the character -> index assignment deterministic.
chars = sorted(set(raw_text))
char_to_int = {c: i for i, c in enumerate(chars)}
# Summarize the loaded data.
n_chars = len(raw_text)
n_vocab = len(chars)
# Single-argument print(...) calls give the same output on Python 2 and 3.
print("Total Characters:  {}".format(n_chars))
print("Total Vocab:  {}".format(n_vocab))
print("chars:\n{}".format(chars))

Total Characters:  123011
Total Vocab:  26
chars:
['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [7]:
# Create the mapping of unique characters to integers.
# NOTE(review): this cell recomputes exactly what the preceding cell already
# computed (chars, char_to_int, n_chars, n_vocab); it is a candidate for removal.
chars = sorted(set(raw_text))
char_to_int = {c: i for i, c in enumerate(chars)}
# Summarize the loaded data.
n_chars = len(raw_text)
n_vocab = len(chars)
# Single-argument print(...) calls give the same output on Python 2 and 3.
print("Total Characters:  {}".format(n_chars))
print("Total Vocab:  {}".format(n_vocab))


Total Characters:  123011
Total Vocab:  26


In [8]:
# Prepare the dataset of input -> output pairs encoded as integers.
# Each input is a window of seq_length consecutive characters; the matching
# output is the single character that immediately follows that window.
seq_length = 100
dataX = []
dataY = []
# The last valid window starts at n_chars - seq_length - 1, so that the
# target index i + seq_length stays inside raw_text.
for i in range(n_chars - seq_length):
    seq_in = raw_text[i:i + seq_length]
    seq_out = raw_text[i + seq_length]
    dataX.append([char_to_int[char] for char in seq_in])
    dataY.append(char_to_int[seq_out])
n_patterns = len(dataX)
# Single-argument print(...) gives the same output on Python 2 and 3.
print("Total Patterns:  {}".format(n_patterns))


Total Patterns:  122911


In [9]:
# Reshape X to be [samples, time steps, features].
X = numpy.reshape(dataX, (n_patterns, seq_length, 1))
# Normalize: divide every integer character code by the vocabulary size.
# float(...) forces true division even under Python 2 integer semantics.
X = X / float(n_vocab)
# print(...) with one argument behaves the same on Python 2 and 3.
print(X.shape)

(122911, 100, 1)


In [10]:
# One-hot encode the output variable.
# The number of classes is passed explicitly so that y always has n_vocab
# columns; when it is omitted, to_categorical infers the width from the
# largest label present in dataY, which would be too narrow if the
# highest-indexed character never occurs as a target.
y = np_utils.to_categorical(dataY, n_vocab)
# Single-argument print(...) calls give the same output on Python 2 and 3.
print("{} {}".format(type(y), y.shape))
print(y[0])

<type 'numpy.ndarray'> (122911, 26)
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.]


In [11]:
# Define the LSTM model: two stacked 256-unit LSTM layers, each followed by
# 20% dropout, and a softmax layer with one output per character class.
model = Sequential()
# return_sequences=True makes the first LSTM emit its output at every time
# step, which is what the second LSTM layer consumes.
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(256))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')
# summary() prints the table itself and returns None, so wrapping it in a
# print would only append a stray "None" line to the output.
model.summary()


____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
lstm_1 (LSTM)                    (None, 100, 256)      264192      lstm_input_1[0][0]               
____________________________________________________________________________________________________
dropout_1 (Dropout)              (None, 100, 256)      0           lstm_1[0][0]                     
____________________________________________________________________________________________________
lstm_2 (LSTM)                    (None, 256)           525312      dropout_1[0][0]                  
____________________________________________________________________________________________________
dropout_2 (Dropout)              (None, 256)           0           lstm_2[0][0]                     
___________________________________________________________________________________________

In [12]:
# Define the checkpoint callback that saves the model during training.
# Reference: https://keras.io/callbacks/
#
# ModelCheckpoint arguments, as described in the Keras documentation:
#   filepath:          string, path to save the model file.
#   monitor:           quantity to monitor.
#   verbose:           verbosity mode, 0 or 1.
#   save_best_only:    if True, the latest best model according to the
#                      monitored quantity will not be overwritten.
#   mode:              one of {auto, min, max}. With save_best_only=True, the
#                      decision to overwrite the current save file is based on
#                      either the maximization or the minimization of the
#                      monitored quantity (max for val_acc, min for val_loss,
#                      etc.). In auto mode the direction is inferred from the
#                      name of the monitored quantity.
#   save_weights_only: if True, only the model's weights are saved
#                      (model.save_weights(filepath)); otherwise the full
#                      model is saved (model.save(filepath)).
#   period:            interval (number of epochs) between checkpoints.
#
# NB: this appears to write a new file for every saved epoch, which is a huge
# disk space hog. The file name template below embeds {epoch} and {loss}, so
# each save presumably lands under a different name rather than replacing the
# previous one.
filepath="weights-improvement-{epoch:02d}-{loss:.4f}-improvedA.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)

# Callbacks handed to model.fit().
callbacks_list = [checkpoint]


In [None]:
# Fit the model, reporting the wall-clock start time and the total duration.
startTime = datetime.now()
# Build the message as one string: under the Python 2 print statement,
# print ("started at ", startTime) prints the repr of a 2-tuple instead of a
# readable line.
print('started at {}'.format(startTime))
# NOTE(review): newer Keras releases name the nb_epoch argument `epochs`;
# confirm against the installed Keras version before renaming it.
model.fit(X, y, nb_epoch=50, batch_size=64, callbacks=callbacks_list)
timeElapsed = datetime.now() - startTime
print('Time elapsed (hh:mm:ss.ms) {}'.format(timeElapsed))


('started at ', datetime.datetime(2017, 11, 22, 0, 25, 33, 447631))
Epoch 1/50