# Tutorial to analyze text sentiment of IMDB reviews. 
* Purely based on https://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/
* Other resources: https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/

In [22]:
import numpy as np
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.layers.embeddings import Embedding 
from keras.preprocessing import sequence


In [23]:
# Dataset-wide settings, reused by the model cells further down.
top_words = 5000          # passed to imdb.load_data as num_words
max_review_length = 500   # passed to pad_sequences as maxlen

(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

# Bring every review in both splits to the same length.
X_train, X_test = (
    sequence.pad_sequences(reviews, maxlen=max_review_length)
    for reviews in (X_train, X_test)
)

# Report the shape of each array as a quick sanity check.
for _label, _data in (
    ("x_train.shape: ", X_train),
    ("x_test.shape: ", X_test),
    ("y_train.shape: ", y_train),
    ("y_test.shape: ", y_test),
):
    print(_label, _data.shape)

x_train.shape:  (25000, 500)
x_test.shape:  (25000, 500)
y_train.shape:  (25000,)
y_test.shape:  (25000,)


## LSTM for sentiment prediction

In [None]:
# Build, train and evaluate an Embedding -> LSTM -> sigmoid classifier on the
# padded IMDB reviews prepared above.
embedding_vector_length = 32

model = Sequential()
model.add(Embedding(top_words, embedding_vector_length, input_length=max_review_length))
model.add(LSTM(100))
# Single sigmoid unit paired with binary cross-entropy below.
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the layer table itself and returns None, so it must not be
# wrapped in print() (that produced a stray "Model:  None" line).
model.summary()

epochs = 3
batch_size = 64
# NOTE: the test split is also used as validation data here, so the
# per-epoch validation metrics and the final score come from the same samples.
model.fit(X_train, y_train,
          validation_data=(X_test, y_test),
          batch_size=batch_size,
          epochs=epochs,
          verbose=1)

# With metrics=['accuracy'], evaluate() returns [loss, accuracy].
score = model.evaluate(X_test, y_test, verbose=0)
print('Score: ', score)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
lstm_5 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 101       
Total params: 213,301
Trainable params: 213,301
Non-trainable params: 0
_________________________________________________________________
Model:  None
Train on 25000 samples, validate on 25000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Score:  [0.31270698459625246, 0.86743999999999999]


## Using Convolution layer before LSTM
Similar accuracy, with slightly improved training speed.

In [None]:
from keras.layers import Conv1D, MaxPooling1D

# Build, train and evaluate an Embedding -> Conv1D -> MaxPooling1D -> LSTM ->
# sigmoid classifier on the padded IMDB reviews prepared above.
embedding_vector_length = 32

model = Sequential()
model.add(Embedding(top_words, embedding_vector_length, input_length=max_review_length))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
# Pooling with pool_size=2 halves the sequence length fed to the LSTM.
model.add(MaxPooling1D(pool_size=2))
model.add(LSTM(100))
# Single sigmoid unit paired with binary cross-entropy below.
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the layer table itself and returns None, so it must not be
# wrapped in print() (that produced a stray "Model:  None" line).
model.summary()

epochs = 3
batch_size = 64
# NOTE: the test split is also used as validation data here, so the
# per-epoch validation metrics and the final score come from the same samples.
model.fit(X_train, y_train,
          validation_data=(X_test, y_test),
          batch_size=batch_size,
          epochs=epochs,
          verbose=1)

# With metrics=['accuracy'], evaluate() returns [loss, accuracy].
score = model.evaluate(X_test, y_test, verbose=0)
print('Score: ', score)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 500, 32)           3104      
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 250, 32)           0         
_________________________________________________________________
lstm_6 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 101       
Total params: 216,405
Trainable params: 216,405
Non-trainable params: 0
_________________________________________________________________
Model:  None
Train on 25000 samples, validate on 25000 samples
Epoch 1/3