# Sentiment Prediction from Movie Review (IMDB)

Goal: Predict positive or negative sentiments from movie reviews

Dataset: [IMDB Movie reviews sentiment classification](https://keras.io/datasets/)

In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import keras
from keras.datasets import imdb
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation

from keras.layers.embeddings import Embedding
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Flatten

np.random.seed(9)

Using TensorFlow backend.


## Load the data

In [2]:
max_words = 20000  # only the least common words won't be used
max_length = 500 # Maximun number of words per review

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=20000,
                                                      skip_top=10,
                                                      maxlen=500)

print("Training reviews: {}".format(len(x_train)))
print("Testing reviews: {}".format(len(x_test)))

Training reviews: 25000
Testing reviews: 20947


## Process the data

In [3]:
# Pad input: limited to max_length
x_train = pad_sequences(x_train, max_length)
x_test = pad_sequences(x_test, max_length)

# One-hot output
num_classes = 2
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

print("Training set: {}, {} ".format(x_train.shape, x_test.shape))
print("Testing set: {}, {}".format(y_train.shape, y_test.shape))

Training set: (25000, 500), (20947, 500) 
Testing set: (25000, 2), (20947, 2)


## Deep Neural Network

In [4]:
model = Sequential()
model.add(Embedding(20000, 20, input_length=500))
model.add(Flatten())
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))
model.summary()

model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])


hist = model.fit(x_train, y_train,
          batch_size=128,
          epochs=3,
          validation_data=(x_test, y_test), 
          verbose=2)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 20)           400000    
_________________________________________________________________
flatten_1 (Flatten)          (None, 10000)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                160016    
_________________________________________________________________
dropout_1 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 34        
Total params: 560,050.0
Trainable params: 560,050.0
Non-trainable params: 0.0
_________________________________________________________________
Train on 25000 samples, validate on 20947 samples
Epoch 1/3
2s - loss: 0.6337 - acc: 0.6154 - val_loss: 0.4368 - val_acc: 0.8110
E

In [5]:
score = model.evaluate(x_test, y_test, verbose=0)
print("accuracy: ", score[1])

accuracy:  0.881414999767
