In [None]:
# Numerics, plus sys for streaming generated characters to stdout.
import numpy
import sys
# NLTK pieces used by tokenize_words: regex tokenizer and stop-word lists.
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
# Keras building blocks for the stacked-LSTM character model.
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint
import nltk
# Fetch the stop-word corpus that stopwords.words('english') reads from.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [1]:
# Mount Google Drive inside the Colab VM; the corpus is read from a path
# under /content/drive further down.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the whole corpus into memory. The context manager closes the file
# handle as soon as the text is loaded (the bare open(...).read() left it
# open), and the explicit encoding keeps the result independent of the
# runtime's default locale.
with open('/content/drive/My Drive/Datasets/FamousFiveCompleteVolume.txt', encoding='utf-8') as corpus_file:
    file = corpus_file.read()

In [None]:
def tokenize_words(input):
    """Lowercase text, split it into word tokens and drop English stop words.

    Args:
        input: Raw text to clean (str).

    Returns:
        str: The surviving tokens joined by single spaces.
    """
    # lowercase everything to standardize it
    lowered = input.lower()

    # r'\w+' keeps runs of word characters, so punctuation is discarded
    tokens = RegexpTokenizer(r'\w+').tokenize(lowered)

    # Load the stop-word list once and keep it in a set: calling
    # stopwords.words('english') inside the per-token predicate rebuilt the
    # list and scanned it linearly for every single token.
    stop_words = set(stopwords.words('english'))

    # keep only the tokens that are not stop words
    return " ".join(token for token in tokens if token not in stop_words)

In [None]:
# Run the full corpus through tokenize_words: the result is one lowercase,
# space-separated string with punctuation and English stop words removed.
processed_inputs = tokenize_words(file)

In [None]:
# Vocabulary: every distinct character of the cleaned text, in sorted order,
# plus a lookup from character to its integer id (its position in `chars`).
chars = sorted(set(processed_inputs))
char_to_num = {char: idx for idx, char in enumerate(chars)}

In [None]:
# Corpus size in characters and number of distinct characters.
input_len, vocab_len = len(processed_inputs), len(chars)
print("Total number of characters:", input_len)
print("Total vocab:", vocab_len)

Total number of characters: 2676877
Total vocab: 37


In [None]:
# Number of characters the model sees before predicting the next one.
seq_length = 100
# Encoded input windows and their next-character targets (filled later).
x_data, y_data = [], []

In [None]:
# Build the training pairs: every window of `seq_length` consecutive
# characters is an input, and the character right after it is the target.
#
# Both lists are rebuilt from scratch here rather than appended to, so
# re-running this cell cannot silently duplicate the patterns.

# Encode the text once; each window is then a cheap slice of integer ids
# instead of `seq_length` dictionary lookups per window.
encoded = [char_to_num[char] for char in processed_inputs]

# One window per start index, stopping at the last index that still has a
# following character to serve as the target.
x_data = [encoded[i:i + seq_length] for i in range(input_len - seq_length)]

# The target of window i is the character at i + seq_length, so the targets
# are simply the encoded text shifted by seq_length.
y_data = encoded[seq_length:]

In [None]:
# Number of (input window, target character) training pairs.
n_patterns = len(x_data)
print ("Total Patterns:", n_patterns)

Total Patterns: 2676777


In [None]:
# Shape the windows as (samples, time steps, features) for the LSTM input and
# scale the integer ids by the vocabulary size so every value is below 1.
X = numpy.array(x_data).reshape(n_patterns, seq_length, 1) / float(vocab_len)

In [None]:
# One-hot encode the targets. The class count is pinned to the vocabulary
# size instead of being inferred from max(y_data), so the width of the
# softmax layer always matches the ids in char_to_num / num_to_char even if
# the highest-id character never occurs as a target.
y = np_utils.to_categorical(y_data, num_classes=vocab_len)

In [None]:
# Three stacked LSTM layers, each followed by 20% dropout, ending in a
# softmax over the one-hot target classes. The first two LSTMs return the
# full sequence so the next LSTM receives one vector per time step.
model = Sequential([
    LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True),
    Dropout(0.2),
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])

In [None]:
# Categorical cross-entropy pairs with the one-hot targets and softmax output.
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
# Save the weights whenever the training loss reaches a new minimum.
filepath = "model_weights_saved.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    mode='min',
    monitor='loss',
    save_best_only=True,
    verbose=1,
)
desired_callbacks = [checkpoint]

In [None]:
# Train on all patterns. No validation data is passed, so the checkpoint
# callback judges "best" by training loss only.
model.fit(X, y, epochs=25, batch_size=256, callbacks=desired_callbacks)

Epoch 1/25

Epoch 00001: loss improved from inf to 2.05642, saving model to model_weights_saved.hdf5
Epoch 2/25

Epoch 00002: loss did not improve from 2.05642
Epoch 3/25

Epoch 00003: loss improved from 2.05642 to 1.95817, saving model to model_weights_saved.hdf5
Epoch 4/25

Epoch 00004: loss improved from 1.95817 to 1.69227, saving model to model_weights_saved.hdf5
Epoch 5/25

Epoch 00005: loss improved from 1.69227 to 1.57652, saving model to model_weights_saved.hdf5
Epoch 6/25

Epoch 00006: loss improved from 1.57652 to 1.51530, saving model to model_weights_saved.hdf5
Epoch 7/25

Epoch 00007: loss improved from 1.51530 to 1.47834, saving model to model_weights_saved.hdf5
Epoch 8/25

Epoch 00008: loss improved from 1.47834 to 1.45116, saving model to model_weights_saved.hdf5
Epoch 9/25

Epoch 00009: loss improved from 1.45116 to 1.43232, saving model to model_weights_saved.hdf5
Epoch 10/25

Epoch 00010: loss improved from 1.43232 to 1.41604, saving model to model_weights_saved.hdf5

<tensorflow.python.keras.callbacks.History at 0x7f98327c78d0>

In [None]:
# Restore the weights written by the checkpoint callback (same file name as
# `filepath` in the checkpoint cell) and compile again with the same loss
# and optimizer as before.
filename = "model_weights_saved.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
# Inverse of char_to_num: integer id -> character, for decoding predictions.
num_to_char = dict(enumerate(chars))

In [None]:
# Pick a random training window to seed generation. randint's upper bound is
# exclusive, so len(x_data) (not len(x_data) - 1) makes every window eligible.
start = numpy.random.randint(0, len(x_data))

# Copy the window: the generation loop extends `pattern`, and working on the
# list stored in x_data would alter the training data itself.
pattern = list(x_data[start])

print("Random Seed:")
print("\"", ''.join(num_to_char[value] for value in pattern), "\"")

Random Seed:
" hisper excited dog want give hiding place away idiot edgar panting puffing arrived cliff top complet "


In [None]:
# Generate 1000 characters, feeding each prediction back in as input.
# NOTE(review): argmax always takes the single most likely character, so the
# output is deterministic for a given seed and can lock into a repeating
# phrase; sampling from `prediction` instead would add variety.
scale = float(vocab_len)  # same normalisation as the training inputs
for step in range(1000):
    # Shape the current window as (1 sample, time steps, 1 feature).
    x = numpy.reshape(pattern, (1, len(pattern), 1)) / scale
    prediction = model.predict(x, verbose=0)
    index = int(numpy.argmax(prediction))
    result = num_to_char[index]

    sys.stdout.write(result)

    # Slide the window: drop the oldest id and add the new one. Building a
    # new list (rather than append + slice) never mutates the seed list that
    # `pattern` initially refers to.
    pattern = pattern[1:] + [index]

ely said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian said julian 