In [1]:
import numpy
from keras.datasets import imdb
from keras.models import Sequential, Model, model_from_json
from keras.layers import Dense, Input, LSTM, RepeatVector, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

# fix random seed for reproducibility
numpy.random.seed(7)

Using TensorFlow backend.


In [2]:
# load the dataset but only keep the top n words, zero the rest
top_words = 50
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

In [3]:
# truncate and pad input sequences
max_review_length = 50
# X_train = X_train[:1000]
# y_train = y_train[:1000]
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)
print(X_train)
print(y_train)

[[ 2  2 26 ..., 19  2 32]
 [ 2  5  2 ..., 16  2  2]
 [ 2 28  2 ...,  7  2  2]
 ..., 
 [ 4  2  2 ...,  4  2  2]
 [13 18 31 ..., 12  9 23]
 [ 2  8  2 ...,  2  2  9]]
[1 0 0 ..., 0 1 0]


In [4]:
encoder_input = numpy.zeros((len(X_train), max_review_length, top_words))
for i, sentence in enumerate(X_train):
    for j, word in enumerate(sentence):
        encoder_input[i, j, word] = 1

In [5]:
encoder_input_test = numpy.zeros((len(X_test), max_review_length, top_words))
for i, sentence in enumerate(X_test):
    for j, word in enumerate(sentence):
        encoder_input_test[i, j, word] = 1

In [6]:
latent_dim=top_words
inputs = Input(shape=(max_review_length, top_words))
print(inputs.shape)
encoded = LSTM(latent_dim,return_sequences=False,dropout=0.1)(inputs)
print(encoded.shape)
encoded = Dense(latent_dim,activation='sigmoid')(encoded)
print(encoded.shape)
decoded = RepeatVector(max_review_length)(encoded)
print(decoded.shape)
decoded = LSTM(latent_dim, return_sequences=True)(decoded)
print(decoded.shape)
decoded = Activation('softmax')(decoded)
print(decoded.shape)

sequence_autoencoder = Model(inputs, decoded)
encoder = Model(inputs, encoded)

(?, 50, 50)
(?, 50)
(?, 50)
(?, 50, 50)
(?, ?, 50)
(?, ?, 50)


In [7]:
sequence_autoencoder.compile(optimizer='rmsprop', loss='categorical_crossentropy',metrics=['accuracy'])

In [8]:
sequence_autoencoder.fit(encoder_input,encoder_input,batch_size = 64,epochs = 10, validation_data=(encoder_input_test, encoder_input_test)) # validation_split=0.3)


Train on 25000 samples, validate on 25000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fba404b2390>

In [9]:
# # save model as json
# model_json = sequence_autoencoder.to_json()
# with open("models/encoder.json", "w") as json_file:
#     json_file.write(model_json)
# # serialize weights to HDF5
# sequence_autoencoder.save_weights("models/encoder.h5")
# print("Saved model to disk")


In [10]:
# # load json and create model
# json_file = open('models/encoder.json', 'r')
# loaded_model_json = json_file.read()
# json_file.close()
# loaded_model = model_from_json(loaded_model_json)
# # load weights into new model
# loaded_model.load_weights("models/encoder.h5")
# print("Loaded model from disk")


In [11]:
# # Only for debugging purpose
# z = 1
# decoded_train = sequence_autoencoder.predict(encoder_input[z:z+1])
# print(decoded_train)
# print(decoded_train.shape)
# print(encoder_input[z:z+1].shape)
# print(encoder_input[z])

In [12]:
encoded_train = encoder.predict(encoder_input)
print(encoded_train)
embedded_train = numpy.reshape(encoded_train, (len(encoder_input),top_words))
print(encoded_train.shape)


[[ 0.55768627  0.65891981  0.48071006 ...,  0.55686694  0.59972262
   0.48916361]
 [ 0.56575203  0.78526115  0.66632491 ...,  0.40819427  0.40477139
   0.12812956]
 [ 0.45523545  0.35163945  0.50617063 ...,  0.64417171  0.84766912
   0.6587466 ]
 ..., 
 [ 0.60720003  0.69888031  0.8338114  ...,  0.36654621  0.66989577
   0.48242465]
 [ 0.61997652  0.81262159  0.43435851 ...,  0.53611898  0.38918602
   0.2448179 ]
 [ 0.11388204  0.12168794  0.40964434 ...,  0.45711258  0.10535987
   0.05810964]]
(25000, 50)


In [13]:
encoded_test = encoder.predict(encoder_input_test)
print(encoded_test)
embedded_test = numpy.reshape(encoded_test, (len(encoder_input),top_words))
print(encoded_test.shape)


[[ 0.20486037  0.36848336  0.50598913 ...,  0.58979666  0.21977293
   0.68930674]
 [ 0.22914685  0.460949    0.64076537 ...,  0.30663884  0.04640674
   0.107353  ]
 [ 0.58832365  0.44723576  0.55821377 ...,  0.56808007  0.82458603
   0.64477652]
 ..., 
 [ 0.67775989  0.68313521  0.55804759 ...,  0.52440917  0.48409593
   0.59898919]
 [ 0.6369406   0.87636393  0.6485061  ...,  0.43285292  0.38804454
   0.20396268]
 [ 0.62084961  0.53712624  0.72566396 ...,  0.4456012   0.66934609
   0.5872432 ]]
(25000, 50)


In [14]:
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(top_words,)))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='hard_sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(embedded_train, y_train, validation_data=(embedded_test, y_test), epochs=20, batch_size=64)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_2 (Dense)              (None, 64)                3264      
_________________________________________________________________
dense_3 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 65        
Total params: 7,489
Trainable params: 7,489
Non-trainable params: 0
_________________________________________________________________
None
Train on 25000 samples, validate on 25000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fb9beed5b90>

In [15]:
# Final evaluation of the model
scores = model.evaluate(embedded_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 55.58%
