In [4]:
import numpy
import sys
import nltk
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense,Dropout,LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
# Read the whole corpus into memory. The context manager closes the handle as
# soon as the read finishes instead of leaving it open for the garbage collector.
# The explicit encoding keeps the result independent of the platform default
# (assumes the corpus file is UTF-8 -- adjust if it is not).
with open("frankenstein.txt", encoding="utf-8") as corpus_file:
    file = corpus_file.read()

In [6]:
def tokenize_words(input, separator=""):
    r"""Lowercase ``input``, split it into ``\w+`` tokens and drop English stop words.

    Parameters
    ----------
    input : str
        Raw text to clean.
    separator : str, optional
        String placed between the surviving tokens. The default ``""`` keeps the
        original behaviour of concatenating the tokens with nothing between
        them, which means word boundaries are not preserved in the result;
        pass ``" "`` to keep them.

    Returns
    -------
    str
        The remaining tokens joined by ``separator``.
    """
    tokens = RegexpTokenizer(r'\w+').tokenize(input.lower())
    # Fetch the stop words once and keep them in a set: the previous version
    # evaluated stopwords.words('english') for every single token and tested
    # membership against the returned sequence each time.
    english_stopwords = set(stopwords.words('english'))
    return separator.join(token for token in tokens if token not in english_stopwords)

# Clean the raw corpus text; the resulting string is what the following cells
# index and slice character by character.
processed_inputs = tokenize_words(file)

In [7]:
# Distinct characters of the cleaned text in sorted order; a character's
# position in that ordering is its integer code.
chars = sorted(set(processed_inputs))
char_to_num = {symbol: code for code, symbol in enumerate(chars)}

In [9]:
# Length of the cleaned text in characters and number of distinct characters.
input_len, vocab_len = len(processed_inputs), len(chars)
print("Total number of characters:",input_len)
print("Total vacab",vocab_len)

Total number of characters: 232972
Total vacab 37


In [10]:
# Window size: how many consecutive characters make up one input sample.
seq_length = 100
# Containers for the encoded input windows and their next-character targets.
x_data, y_data = [], []

In [11]:
# Encode the text once, then slide a window of seq_length characters over it:
# each window is one input sample and the character right after it is its target.
# Both lists are rebuilt from scratch here, so re-running this cell no longer
# appends a second copy of every pattern to x_data / y_data.
encoded_inputs = [char_to_num[char] for char in processed_inputs]
window_starts = range(0, input_len - seq_length, 1)
x_data = [encoded_inputs[i:i + seq_length] for i in window_starts]
y_data = [encoded_inputs[i + seq_length] for i in window_starts]
n_patterns = len(x_data)
print('total patterns:',n_patterns)

total patterns: 232872


In [13]:
# Shape the samples as (n_patterns, seq_length, 1) and scale the integer
# character codes by the vocabulary size.
X = numpy.reshape(x_data, (n_patterns, seq_length, 1)) / float(vocab_len)

In [14]:
# One-hot encode the targets. num_classes is pinned to the vocabulary size so the
# encoding always has one column per character code; without it the width is
# inferred from max(y_data) and would shrink if the highest-numbered character
# never occurred as a target.
y = np_utils.to_categorical(y_data, num_classes=vocab_len)

In [15]:
# Build the network: three stacked LSTM layers, each followed by 20% dropout,
# ending in a softmax layer with one unit per column of y.
model = Sequential([
    LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True),
    Dropout(0.2),
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])


In [16]:
# Configure training: Adam optimizer with categorical cross-entropy loss.
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [17]:
# Write the weights to disk whenever the monitored training loss reaches a new minimum.
filepath = 'model_weights_saved.hdf5'
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
desired_callbacks = [checkpoint]

In [19]:
# Train for 4 epochs in batches of 256, checkpointing through desired_callbacks.
model.fit(
    X,
    y,
    batch_size=256,
    epochs=4,
    callbacks=desired_callbacks,
)

Epoch 1/4

Epoch 00001: loss improved from inf to 2.93038, saving model to model_weights_saved.hdf5
Epoch 2/4

Epoch 00002: loss improved from 2.93038 to 2.89647, saving model to model_weights_saved.hdf5
Epoch 3/4

Epoch 00003: loss improved from 2.89647 to 2.84392, saving model to model_weights_saved.hdf5
Epoch 4/4

Epoch 00004: loss improved from 2.84392 to 2.75620, saving model to model_weights_saved.hdf5


<keras.callbacks.History at 0x7fa9e4ca0ef0>

In [21]:
# Restore the checkpointed weights from disk and compile the model again.
filename = "model_weights_saved.hdf5"
model.load_weights(filename)
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [22]:
# Reverse lookup: integer code -> character.
num_to_char = dict(enumerate(chars))


In [27]:
# Pick a random training window to seed generation.
# numpy.random.randint excludes its upper bound, so len(x_data) (not
# len(x_data)-1) is required for the last pattern to be selectable.
start = numpy.random.randint(0, len(x_data))
# Copy the window so that extending `pattern` during generation cannot
# modify the sample stored in x_data.
pattern = list(x_data[start])
print("Random seed:")
print("\"", ''.join([num_to_char[value] for value in pattern]),"\"")

Random seed:
" immaculatebeingsmiserableabandonedabortionspurnedkickedtrampledevenbloodboilsrecollectioninjusticetr "


In [29]:
# Generate 1000 characters greedily: predict the next character from the current
# window, print it, then slide the window forward by one position.
for i in range(1000):
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(vocab_len)  # same scaling that was applied to the training inputs
    prediction = model.predict(x, verbose=0)
    index = int(numpy.argmax(prediction))
    result = num_to_char[index]
    sys.stdout.write(result)
    # Build a new list instead of append() + slice: `pattern` may still be the very
    # list object stored in x_data, and appending in place would lengthen that
    # training sample.
    pattern = pattern[1:] + [index]

edeereatedreatedreatedsertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedtertedterted