In [1]:
import re
import sys
import string
import numpy as np
from keras.utils import np_utils
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Dropout, LSTM
from keras.layers.embeddings import Embedding

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Load the corpus and normalise it into one space-separated string of words.
# The context manager closes the file handle as soon as the text is read.
with open('wonderland.txt', 'r') as corpus_file:
    corpus_text = corpus_file.read()
# Split on any whitespace (newlines included) and strip leading/trailing
# punctuation from every token.
stripped_tokens = [token.strip(string.punctuation) for token in corpus_text.split()]
# Treat hyphens as word separators, then collapse runs of whitespace left
# behind by tokens that were pure punctuation.
rawtext = ' '.join(' '.join(stripped_tokens).replace('-', ' ').split())

In [3]:
# Build the vocabulary: every distinct token, in sorted order, gets an integer id.
all_words = rawtext.split()
unique_words = sorted(set(all_words))
n_vocab = len(unique_words)
print("Total Vocab:", n_vocab)
# Forward (word -> id) and reverse (id -> word) lookup tables.
word_to_int = {word: idx for idx, word in enumerate(unique_words)}
int_to_word = dict(enumerate(unique_words))

Total Vocab: 3063


In [4]:
# Whitespace-split the cleaned corpus into the word stream used to build the
# training windows, and report how many words it contains.
raw_text = rawtext.split()
n_words = len(raw_text)
print(n_words)

26694


In [5]:
# Slide a fixed-size window over the word stream: each run of `seq_length`
# consecutive word ids is one input, and the id of the word that immediately
# follows it is the matching target.
seq_length = 2
window_starts = range(n_words - seq_length)
dataX = [
    [word_to_int[token] for token in raw_text[begin:begin + seq_length]]
    for begin in window_starts
]
dataY = [word_to_int[raw_text[begin + seq_length]] for begin in window_starts]
n_patterns = len(dataX)
print('Total patterns:', n_patterns)

Total patterns: 26692


In [6]:
# Reshape dataX to [samples, time steps, features] and scale the word ids to 0-1.
X_train = np.reshape(dataX, (n_patterns, seq_length, 1)) / float(n_vocab)
# One-hot encode the targets. num_classes is pinned to the vocabulary size so
# the label width always equals n_vocab, even if the highest word id never
# happens to occur as a target (to_categorical would otherwise infer a
# smaller width from max(dataY) + 1).
Y_train = np_utils.to_categorical(dataY, num_classes=n_vocab)

In [7]:
# Single-layer LSTM language model: a window of scaled word ids goes in, a
# softmax over the output classes (one per column of Y_train) comes out.
model = Sequential()
model.add(LSTM(256, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(Y_train.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')
# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that emitted a stray "None" after the table).
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 256)               264192    
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 3063)              787191    
Total params: 1,051,383
Trainable params: 1,051,383
Non-trainable params: 0
_________________________________________________________________
None


# TODO: Add Embedding
In that case, keep the inputs as raw integer word indices (do not scale by `n_vocab`), i.e. use:
X_train = np.reshape(dataX, (n_patterns, seq_length, 1))

instead of:

X_train = np.reshape(dataX, (n_patterns, seq_length, 1))/float(n_vocab)

In [8]:
# Define the checkpoint callback: after each epoch, save the weights whenever
# the training loss reaches a new minimum.
# The model is compiled without metrics and fit() is called without validation
# data, so 'val_categorical_accuracy' is never produced; monitoring it would
# mean no checkpoint is ever written. Training loss is the only quantity
# available, and it is also what the filename template records.
filepath = "word-weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

In [9]:
# Train for 10 epochs. `epochs` replaces the deprecated Keras 1 keyword
# `nb_epoch`, which triggers a warning in Keras 2.
model.fit(X_train, Y_train, epochs=10, batch_size=128, callbacks=callbacks_list)

  """Entry point for launching an IPython kernel.


Epoch 1/10

Epoch 00001: loss improved from inf to 6.70846, saving model to word-weights-improvement-01-6.7085.hdf5
Epoch 2/10

Epoch 00002: loss improved from 6.70846 to 6.26488, saving model to word-weights-improvement-02-6.2649.hdf5
Epoch 3/10

Epoch 00003: loss improved from 6.26488 to 6.25089, saving model to word-weights-improvement-03-6.2509.hdf5
Epoch 4/10

Epoch 00004: loss improved from 6.25089 to 6.24623, saving model to word-weights-improvement-04-6.2462.hdf5
Epoch 5/10

Epoch 00005: loss improved from 6.24623 to 6.24237, saving model to word-weights-improvement-05-6.2424.hdf5
Epoch 6/10

Epoch 00006: loss improved from 6.24237 to 6.23627, saving model to word-weights-improvement-06-6.2363.hdf5
Epoch 7/10

Epoch 00007: loss improved from 6.23627 to 6.22753, saving model to word-weights-improvement-07-6.2275.hdf5
Epoch 8/10

Epoch 00008: loss improved from 6.22753 to 6.21728, saving model to word-weights-improvement-08-6.2173.hdf5
Epoch 9/10

Epoch 00009: loss improved from 

<keras.callbacks.History at 0x7fa0c4371a58>

In [11]:
# Compile the model again with the same loss and optimizer.
# To continue from a saved checkpoint rather than the in-memory weights,
# load it first, e.g.:
#   filename = "word-weights-improvement-10-6.1935.hdf5"
#   model.load_weights(filename)
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [12]:
# Continue training for up to 20 more epochs with a smaller batch size.
# `epochs` replaces the deprecated Keras 1 keyword `nb_epoch`.
model.fit(X_train, Y_train, epochs=20, batch_size=32, callbacks=callbacks_list)

  """Entry point for launching an IPython kernel.


Epoch 1/20

Epoch 00001: loss did not improve from 6.19351
Epoch 2/20

Epoch 00002: loss did not improve from 6.19351
Epoch 3/20

KeyboardInterrupt: 

In [13]:
# Compile the model with the same loss and optimizer as before.
# Optionally restore previously saved weights first, e.g.:
#   filename = "word-weights-improvement-10-6.1935.hdf5"
#   model.load_weights(filename)
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [16]:
# Generate 200 words, starting from a randomly chosen training pattern.
# randint's upper bound is exclusive, so len(X_train) (not len(X_train) - 1)
# is needed for the last pattern to be selectable.
start = np.random.randint(0, len(X_train))
# Work on a copy of the seed: the window is extended below, and extending
# dataX[start] itself would leave a longer-than-seq_length row in the
# training data.
pattern = list(dataX[start])
result = []
print("Seed:")
print("\"", ' '.join([int_to_word[value] for value in pattern]), "\"")
for i in range(200):
    # Same preprocessing as training: shape [1, time steps, 1], ids scaled to 0-1.
    x = np.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_vocab)
    prediction = model.predict(x)
    # Greedy decoding: always take the most probable next word.
    index = int(np.argmax(prediction))
    result.append(int_to_word[index])
    # Slide the window: drop the oldest word id, append the predicted one.
    pattern = pattern[1:] + [index]
print("\nGenerated Sequence:")
print(' '.join(result))
print("\nDone.")

Seed:
" the arm "

Generated Sequence:
the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the

Done.
