In [1]:
import nltk
nltk.download('stopwords')
import pandas
import numpy
import sys
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Read the whole text into memory. The context manager closes the file handle
# as soon as the read finishes instead of leaving it open for the kernel's lifetime.
# The platform default encoding is kept, exactly as before.
with open('frankenstein.txt') as source_file:
    file = source_file.read()

In [3]:
def tokenize_words(input):
    """Lower-case ``input``, drop English stop words and re-join the remaining words.

    Args:
        input: Raw text to normalise.

    Returns:
        A single string holding the surviving word tokens (runs of word
        characters) separated by single spaces.
    """
    input = input.lower()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(input)
    # Fetch the stop-word list once and keep it as a set. The original called
    # stopwords.words('english') again for every single token and then scanned
    # the resulting list linearly.
    english_stopwords = set(stopwords.words('english'))
    filtered = (token for token in tokens if token not in english_stopwords)
    return " ".join(filtered)
    
# Normalise the full text once; the vocabulary and the training windows are
# both derived from this cleaned string.
processed_inputs = tokenize_words(file)

In [4]:
# Character vocabulary: every distinct character of the cleaned text, sorted.
chars = sorted(set(processed_inputs))
# Map each character to its position in the sorted vocabulary.
char_to_num = {char: position for position, char in enumerate(chars)}

In [5]:
# Size of the cleaned text in characters, and number of distinct characters.
input_len, vocab_len = len(processed_inputs), len(chars)
print("total number of characters =", input_len)
print("total vocab=", vocab_len)

total number of characters = 269079
total vocab= 38


In [6]:
# Length of each input window, in characters.
seq_len = 100
# Containers for the encoded input windows and their target characters.
x_data, y_data = [], []

In [7]:
# Slide a window of seq_len characters over the text: each window is one input
# sample and the character immediately after it is that sample's target.
#
# x_data / y_data are rebuilt from scratch here instead of being appended to,
# so re-running this cell can no longer duplicate the training patterns.
# The text is also encoded to integers once rather than once per window.
encoded_text = [char_to_num[char] for char in processed_inputs]
window_starts = range(input_len - seq_len)
x_data = [encoded_text[i:i + seq_len] for i in window_starts]
# Each target stays wrapped in a one-element list, as in the original layout.
y_data = [[encoded_text[i + seq_len]] for i in window_starts]

n_patterns = len(x_data)
print('total patterns=', n_patterns)

total patterns= 268979


In [8]:
# Arrange the samples as (n_patterns, seq_len, 1) and scale the integer
# character codes by the vocabulary size.
X = numpy.array(x_data).reshape(n_patterns, seq_len, 1) / float(vocab_len)

In [9]:
# One-hot encode the target characters. num_classes is pinned to the vocabulary
# size so the encoding width always matches the character set; without it the
# width is inferred from the largest target value and would shrink if the
# highest-numbered character never occurred as a target.
y = np_utils.to_categorical(y_data, num_classes=vocab_len)

In [10]:
# Stacked LSTM: two sequence-returning 256-unit layers followed by a 128-unit
# layer, each with 20% dropout, ending in a softmax over y.shape[1] classes.
model = Sequential([
    LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True),
    Dropout(0.2),
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])

In [11]:
# Configure training: Adam optimizer, categorical cross-entropy loss.
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [12]:
# Save a checkpoint only when the monitored training loss reaches a new minimum.
filepath = 'model_weights_saved_new.hdf5'
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
desired_callbacks = [checkpoint]

In [13]:
# Train for 4 epochs in mini-batches of 256, checkpointing through the callbacks.
model.fit(X, y, batch_size=256, epochs=4, callbacks=desired_callbacks)

Epoch 1/4
Epoch 1: loss improved from inf to 2.89628, saving model to model_weights_saved_new.hdf5
Epoch 2/4
Epoch 2: loss improved from 2.89628 to 2.61820, saving model to model_weights_saved_new.hdf5
Epoch 3/4
Epoch 3: loss improved from 2.61820 to 2.45684, saving model to model_weights_saved_new.hdf5
Epoch 4/4
Epoch 4: loss improved from 2.45684 to 2.34437, saving model to model_weights_saved_new.hdf5


<keras.callbacks.History at 0x1bf4e2f68b0>

In [14]:
# Load the checkpointed weights back into the model, then compile it again
# with the same loss and optimizer as before.
filename = 'model_weights_saved_new.hdf5'
model.load_weights(filename)
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [15]:
# Reverse lookup: vocabulary position -> character.
num_to_char = dict(enumerate(chars))

In [16]:
# Pick a random training pattern to seed the generator.
# randint's upper bound is exclusive, so passing len(x_data) makes every
# pattern eligible; the previous `len(x_data) - 1` could never select the last one.
start = numpy.random.randint(0, len(x_data))
# Take a copy so that later changes to the seed cannot alter the entry in x_data.
pattern = list(x_data[start])
print('Ramdim seed =')
print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")

Ramdim seed =
" serable apparatus dungeon morning remember thus awoke understanding forgotten particulars happened f "


In [17]:
# Generate 1000 characters: predict the next character from the current window,
# write it out, then slide the window forward by one position.
for i in range(1000):
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(vocab_len)  # scale the codes by the vocabulary size
    prediction = model.predict(x, verbose=0)
    index = int(numpy.argmax(prediction))  # greedy choice: highest-scoring class
    result = num_to_char[index]
    sys.stdout.write(result)
    # Build a new list instead of appending in place: `pattern` may be the very
    # list object stored in x_data, and an in-place append would lengthen that
    # training sample.
    pattern = pattern[1:] + [index]

arher seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare sear