In [1]:
import re
import sys
import string
import numpy as np
from keras.utils import np_utils
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Dropout, LSTM
from keras.layers.embeddings import Embedding

Using TensorFlow backend.


In [13]:
rawtext = open('./dat/wonderland.txt','r').read().split('\n')
rawtext = ' '.join(rawtext)
rawtext = [word.strip(string.punctuation) for word in rawtext.split()]
rawtext = ' '.join(rawtext)
rawtext = rawtext.replace('-', ' ')
rawtext = ' '.join(rawtext.split())

In [14]:
all_words = rawtext.split()
unique_words = sorted(list(set(all_words)))
n_vocab = len(unique_words) + 1
print("Total Vocab:", n_vocab)
word_to_int = dict((w, i) for i, w in enumerate(unique_words))
int_to_word = dict((i, w) for i, w in enumerate(unique_words))

Total Vocab: 3064


In [15]:
raw_text = rawtext.split()
n_words = len(raw_text)
print(n_words)

26694


In [16]:
seq_length = 2
dataX = []
dataY = []
for i in range(0, n_words - seq_length):
    seq_in  = raw_text[i: i+seq_length]
    seq_out = raw_text[i+seq_length]
    dataX.append([word_to_int[word] for word in seq_in])
    dataY.append(word_to_int[seq_out])
n_patterns = len(dataX)
print('Total patterns:', n_patterns)

Total patterns: 26692


In [26]:
# Reshape dataX to size of [samples, time steps, features] and scale it to 0-1
# Represent dataY as one hot encoding
X_train = np.reshape(dataX, (n_patterns, seq_length))#/float(n_vocab)
Y_train = np_utils.to_categorical(dataY, num_classes=n_vocab)

In [27]:
print(X_train.shape)
print(Y_train.shape)

(26692, 2)
(26692, 3064)


In [28]:

# define model
model = Sequential()
model.add(Embedding(n_vocab, 50))# Use pre-trained GloVe
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100, activation='relu'))
model.add(Dense(n_vocab, activation='softmax'))
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, None, 50)          153200    
_________________________________________________________________
lstm_5 (LSTM)                (None, None, 100)         60400     
_________________________________________________________________
lstm_6 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_5 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_6 (Dense)              (None, 3064)              309464    
Total params: 613,564
Trainable params: 613,564
Non-trainable params: 0
_________________________________________________________________
None


In [29]:
# compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])



In [30]:
# define the checkpoint
filepath="word-weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_categorical_accuracy', verbose=1, save_best_only=True, mode='max')
#checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

In [31]:
model.fit(X_train, Y_train, nb_epoch=10, batch_size=128, callbacks=callbacks_list)

  """Entry point for launching an IPython kernel.


Epoch 1/10
Epoch 2/10
 1792/26692 [=>............................] - ETA: 2s - loss: 6.2474 - acc: 0.0586



Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f2f6dd931d0>

In [34]:
start = np.random.randint(0, len(X_train)-1)
pattern = dataX[start]
result = []
print("Seed:")
print("\"", ' '.join([int_to_word[value] for value in pattern]), "\"")
for i in range(2000):
    x = np.reshape(pattern, (1, len(pattern)))
    #x = x/float(n_vocab)
    prediction = model.predict(x)
    index = np.argmax(prediction)
    result.append(int_to_word[index])
    pattern.append(index)
    pattern = pattern[1:len(pattern)]
print("\nGenerated Sequence:")
print(' '.join(result))
print("\nDone.")  

Seed:
" on again "

Generated Sequence:
the Queen and was the Queen and was the Queen and was the Queen and was the Queen and was the Queen and was the Queen and was the Queen and was the Queen and was the Queen and was the Queen and was the Queen and was the Queen and was the Queen and was the Queen and was the Queen and was the Queen and was the Queen and was the Queen and was the Queen and was the Queen and was the Queen and was the Queen and was the Queen and was the Queen and was the Queen and was the Queen and was the Queen and was the Queen and was the Queen and was the Queen and was the Queen and was the Queen and was the Queen and was the Queen and was the Queen and was the Queen and was the Queen and was the Queen and was the Queen and was the Queen and was the Queen and was the Queen and was the Queen and was the Queen and was the Queen and was the Queen and was the Queen and was the Queen and was the Queen and was the Queen and was the Queen and was the Queen and was the Qu