## Chapter 22: Project: Predict Sentiment From Movie Reviews

## 1. Preparation

In [26]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers.convolutional import Convolution1D, MaxPooling1D
from keras.layers import Embedding

# Fix NumPy's global random seed so that repeated runs draw the same random numbers.
seed = 7
np.random.seed(seed)

## 2. IMDB Dataset

### What is IMDB Dataset

The Large Movie Review Dataset (often referred to as the IMDB dataset) contains 25,000 highly polar movie reviews (good or bad) for training and the same amount again for testing. The problem is to determine whether a given movie review has a positive or negative sentiment.

### Download Dataset

Keras offers an API to load the IMDB dataset. Once downloaded, the dataset is stored at ~/.keras/datasets/imdb.pkl as a 32 MB file.  

In [17]:
from keras.datasets import imdb

# nb_words=5000 restricts the vocabulary to the 5,000 most frequent words;
# this must match the vocabulary size given to the Embedding layers below.
(X_train, Y_train), (X_val, Y_val) = imdb.load_data(nb_words=5000)
# Explicit formatting keeps the output identical on Python 2 and Python 3.
print("%s %s" % (X_train.shape, X_train.dtype))

(25000,) object


In [18]:
# Each review is a plain Python list of word indices (see the output below).
print(type(X_train[0]))

<type 'list'>


### Preprocess

Bound the length of every word sequence to 500 words, truncating longer reviews and zero-padding shorter ones.

In [19]:
from keras.preprocessing import sequence

# Every review is forced to exactly `max_words` tokens so the model sees
# fixed-length input. With its default arguments pad_sequences zero-pads short
# sequences and truncates long ones at the front of the sequence.
max_words = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_words)
X_val = sequence.pad_sequences(X_val, maxlen=max_words)

# Sanity check: the first review now has exactly max_words entries.
print(len(X_train[0]))

500


## 3. Build a Simple Model

We will use an Embedding layer as the input layer, setting the vocabulary to 5,000, the word vector size to 32 dimensions and the input length to 500. The output of this first layer will be a 500x32 matrix for each review: one 32-dimensional vector per word position.

In [20]:
# define a simple model
def create_simple_model(vocab_size=5000, embedding_dim=32, input_length=None):
    """Build and compile the baseline embedding + dense sentiment classifier.

    Architecture: Embedding -> Flatten -> Dense(250, relu) -> Dense(1, sigmoid),
    compiled with binary cross-entropy loss, the Adam optimizer and an
    accuracy metric. The layer summary is printed as a side effect.

    Args:
        vocab_size: Number of distinct word indices the Embedding layer accepts.
            Must match the ``nb_words`` used when loading the dataset.
        embedding_dim: Size of each learned word vector.
        input_length: Number of tokens per review. Defaults to the notebook-level
            ``max_words`` when left as ``None``.

    Returns:
        The compiled ``Sequential`` model.
    """
    if input_length is None:
        # Resolved at call time so the model follows the padding length in use.
        input_length = max_words

    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, input_length=input_length))
    model.add(Flatten())
    model.add(Dense(250, init='normal', activation='relu'))
    model.add(Dense(1, init='normal', activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summary() prints the table itself and returns None, so wrapping it in
    # print() would emit a stray "None" line after the table.
    model.summary()
    return model

# Build and compile the baseline model; create_simple_model() also prints its layer summary.
simple_model = create_simple_model()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_3 (Embedding)          (None, 500, 32)       160000      embedding_input_3[0][0]          
____________________________________________________________________________________________________
flatten_3 (Flatten)              (None, 16000)         0           embedding_3[0][0]                
____________________________________________________________________________________________________
dense_5 (Dense)                  (None, 250)           4000250     flatten_3[0][0]                  
____________________________________________________________________________________________________
dense_6 (Dense)                  (None, 1)             251         dense_5[0][0]                    
Total params: 4160501
_____________________________________________________________________

In [21]:
# Train for 10 epochs with mini-batches of 30, tracking metrics on (X_val, Y_val).
simple_model.fit(
    X_train,
    Y_train,
    validation_data=(X_val, Y_val),
    nb_epoch=10,
    batch_size=30,
    verbose=1,
)

Train on 25000 samples, validate on 25000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1230fcd10>

## 4. Build 1-D Conv Layer Model

In [27]:
# define conv model
def create_conv_model(vocab_size=5000, embedding_dim=32, input_length=None):
    """Build and compile the 1-D convolutional sentiment classifier.

    Architecture: Embedding -> Convolution1D(32 filters, length 3, 'same'
    border, relu) -> MaxPooling1D(2) -> Flatten -> Dense(250, relu) ->
    Dense(1, sigmoid), compiled with binary cross-entropy loss, the Adam
    optimizer and an accuracy metric. The layer summary is printed as a side
    effect.

    Args:
        vocab_size: Number of distinct word indices the Embedding layer accepts.
            Must match the ``nb_words`` used when loading the dataset.
        embedding_dim: Size of each learned word vector.
        input_length: Number of tokens per review. Defaults to the notebook-level
            ``max_words`` when left as ``None``.

    Returns:
        The compiled ``Sequential`` model.
    """
    if input_length is None:
        # Resolved at call time so the model follows the padding length in use.
        input_length = max_words

    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, input_length=input_length))
    # 'same' border mode keeps the sequence length unchanged through the convolution.
    model.add(Convolution1D(nb_filter=32, filter_length=3, border_mode='same', activation='relu'))
    # Pooling over pairs of positions halves the sequence length before flattening.
    model.add(MaxPooling1D(pool_length=2))
    model.add(Flatten())
    model.add(Dense(250, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summary() prints the table itself and returns None, so wrapping it in
    # print() would emit a stray "None" line after the table.
    model.summary()
    return model

# Build and compile the convolutional model; create_conv_model() also prints its layer summary.
conv_model = create_conv_model()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_7 (Embedding)          (None, 500, 32)       160000      embedding_input_7[0][0]          
____________________________________________________________________________________________________
convolution1d_1 (Convolution1D)  (None, 500, 32)       3104        embedding_7[0][0]                
____________________________________________________________________________________________________
maxpooling1d_1 (MaxPooling1D)    (None, 250, 32)       0           convolution1d_1[0][0]            
____________________________________________________________________________________________________
flatten_4 (Flatten)              (None, 8000)          0           maxpooling1d_1[0][0]             
___________________________________________________________________________________________

In [28]:
# Same training setup as the baseline (10 epochs, batches of 30) so the two runs are comparable.
conv_model.fit(
    X_train,
    Y_train,
    validation_data=(X_val, Y_val),
    nb_epoch=10,
    batch_size=30,
    verbose=1,
)

Train on 25000 samples, validate on 25000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x10e6cab10>