In [None]:
'''
This program aims to classify the sentiment for the movie reviews from IMDB dataset.

Data preparation:
1) The IMDB movie review dataset:
It can be downloaded from:
http://ai.stanford.edu/%7Eamaas/data/sentiment/
Unpack the downloaded IMDB package to the folder: ./data/aclImdb

2) The GLOVE (Global Vectors for Word Representation) pretrained word vectors 
It can be downloaded from:
http://nlp.stanford.edu/data/glove.6B.zip
Unpack the downloaded glove package to the folder: ./models/glove.6B/

This program uses Keras deep learning library.

This program achieved an average accuracy of 0.90 over the 10000 held-out validation samples.
'''

### Required Imports

In [6]:
import pyprind
import pandas as pd
import numpy as np
import os
import sys

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Input, Flatten, Conv1D, MaxPooling1D, Embedding
from keras.models import Model

Using TensorFlow backend.


In [4]:
#!pip install PyPrind

### Defining Parameters and paths

In [13]:
# Root folder of the review texts; the loader below reads the
# <split>/<pos|neg> sub-folders underneath it.
data_path = 'data/aclImdb/'
# Folder that holds the GloVe text file (glove.6B.100d.txt is read from here).
glove_path = 'models/glove.6B/'
# Vocabulary cap handed to the tokenizer (only the most frequent words are kept).
max_nb_words = 20000
# Fixed length of every review sequence (each review is padded or truncated to this length).
num_words_per_review = 1000
# Dimension of the GloVe vectors; also the width of the embedding matrix.
glove_dim = 100
# Fraction of the shuffled samples held out for validation.
validation_ratio = 0.2

# Fix the NumPy random seed so the shuffle before the split is repeatable.
# NOTE(review): this seeds NumPy only; no TensorFlow/Keras seed is set in the visible cells.
np.random.seed(123)

### Loading Review Texts and Sentiment Labels

In [12]:
# Load every movie review text together with its sentiment label
# (folder 'pos' -> 1, folder 'neg' -> 0).
labels = {'pos': 1, 'neg': 0}
# There are 50,000 review texts in total; the count sizes the progress bar.
print ('\n')
print ('Loading review texts and sentiment labels ...')
pbar = pyprind.ProgBar(50000)

# Collect the rows in a plain list and build the DataFrame once at the end.
# Growing a DataFrame row by row copies the whole frame on every iteration
# (quadratic time), and DataFrame.append no longer exists in pandas >= 2.0.
records = []
for split_name in ('test', 'train'):
    for label_name in ('pos', 'neg'):
        path = os.path.join(data_path, split_name, label_name)
        # os.listdir returns entries in arbitrary order; sorting keeps the row
        # order (and therefore the seeded shuffle later on) reproducible.
        for file_name in sorted(os.listdir(path)):
            with open(os.path.join(path, file_name), 'r', encoding="utf8") as infile:
                records.append([infile.read(), labels[label_name]])
            pbar.update()

# Columns are left as the default 0/1 here; they are named in a later cell.
df = pd.DataFrame(records)



Loading review texts and sentiment labels ...


0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:13:21


### Defining the values

In [14]:
# Give the two positional columns meaningful names.
df.columns = ['review', 'sentiment']
# Pull both columns out as plain Python lists.
# Note: `labels` is rebound here from the folder->label dict to the list of labels.
texts, labels = (df[column].values.tolist() for column in ('review', 'sentiment'))

In [34]:
# Preview the first rows to confirm the review/sentiment columns look as expected.
df.head()

Unnamed: 0,review,sentiment
0,I went and saw this movie last night after bei...,1
1,Actor turned director Bill Paxton follows up h...,1
2,As a recreational golfer with some knowledge o...,1
3,"I saw this film in a sneak preview, and it is ...",1
4,Bill Paxton has taken the true story of the 19...,1


### Loading GLOVE Vectors

In [17]:
# Parse the pretrained GloVe text file into a dictionary that maps each word
# to its 100-dimensional float32 vector.
print ('Loading GLOVE word vectors ...')
glove_embedding = {}
# `with` guarantees the file handle is released even if a line fails to parse.
with open(os.path.join(glove_path, 'glove.6B.100d.txt'), encoding="utf8") as f:
    for line in f:
        fields = line.split()
        if not fields:
            # tolerate blank lines instead of failing on fields[0]
            continue
        word = fields[0]  # the first element is the word
        # the remaining elements are the vector components
        word_vector = np.asarray(fields[1:], dtype='float32')
        glove_embedding[word] = word_vector

Loading GLOVE word vectors ...


In [39]:
#import pickle
#f=open('dictionaries/glove_embedding_v1.pickle','wb')
#pickle.dump(glove_embedding,f)
#f.close()

In [37]:
import pickle

# Optional shortcut: reload the GloVe dictionary from a pickle cache instead of
# re-parsing the text file. The cell that writes this cache is commented out,
# so the file may be missing on a fresh run; in that case the dictionary that
# is already in memory is kept instead of raising FileNotFoundError.
# SECURITY NOTE: pickle.load can execute arbitrary code. Only load a cache file
# that you created yourself.
glove_cache_path = 'dictionaries/glove_embedding_v1.pickle'
if os.path.exists(glove_cache_path):
    # `with` closes the handle, which the original code left open
    with open(glove_cache_path, 'rb') as f1:
        glove_embedding = pickle.load(f1)
else:
    print ('GLOVE pickle cache not found at %s; keeping the vectors already in memory.' % glove_cache_path)

### Tokenizing

In [19]:
# tokenize the words in the texts
tokenizer = Tokenizer(nb_words = max_nb_words) 
tokenizer.fit_on_texts(texts) 
# convert each review text into a sequence of word-indices
matrix_word_indices = tokenizer.texts_to_sequences(texts)
# the dictionary for mapping a word to an index
dictionary_word_index = tokenizer.word_index



### Pad each review text to a fixed length of word sequence

In [20]:
# Bring every review to exactly num_words_per_review indices.
matrix_word_indices_fixed_length = pad_sequences(
    matrix_word_indices,
    maxlen=num_words_per_review,
)
# Hold both the inputs and the labels as numpy arrays from here on.
data, labels = np.array(matrix_word_indices_fixed_length), np.array(labels)

### Shuffle the data 

In [21]:
# Shuffle reviews and labels with one shared permutation so they stay aligned.
num_samples = data.shape[0]
indices = np.arange(num_samples)
np.random.shuffle(indices)
data, labels = data[indices], labels[indices]
# number of samples held out for validation (validation_ratio of the whole set)
nb_validation_samples = int(validation_ratio * num_samples)

### Allocation of training data and validation data

In [22]:
# Split the shuffled data: the last nb_validation_samples rows form the
# validation set, everything before them is the training set.
# An explicit split index is used instead of negative slicing because
# data[:-0] is empty and data[-0:] is the whole array, which would swap the
# two sets whenever nb_validation_samples is 0. Clamping at 0 keeps the
# original result (empty training set) when the count exceeds the data size.
split_at = max(data.shape[0] - nb_validation_samples, 0)
x_train, x_validation = data[:split_at], data[split_at:]
y_train, y_validation = labels[:split_at], labels[split_at:]

### Prepare embedding matrix

In [24]:
# Build the weight matrix for the embedding layer: one row per word index.
num_words = min(max_nb_words, len(dictionary_word_index))
# Row 0 stays all-zero and stands for "no word"; rows of words that have no
# GloVe vector stay all-zero as well.
embedding_matrix = np.zeros((num_words+1, glove_dim))
print ('Vectorizing the words ...')
for token, row in dictionary_word_index.items():
    # only indices up to the vocabulary cap get a row
    if row <= max_nb_words:
        vector = glove_embedding.get(token)
        if vector is not None:
            embedding_matrix[row] = vector

Vectorizing the words ...


### Define the model

In [25]:
# layer 0: the input layer
sequence_input = Input(shape=(num_words_per_review,), dtype='int32')
# layer-1: the embedding layer
embedding_layer = Embedding(num_words+1, glove_dim, weights=[embedding_matrix], input_length=num_words_per_review, trainable=True)
embedded_output = embedding_layer(sequence_input)
# layer-2: the first convolution layer
x = Conv1D(nb_filter=128, filter_length=5, activation='relu')(embedded_output)
# layer-3: the first pooling layer
x = MaxPooling1D(pool_length=5)(x)
# layer-4: the second convolution layer
x = Conv1D(128, 5, activation='relu')(x)
# layer-5: the second pooling layer
x = MaxPooling1D(pool_length = 5)(x)
# flatten layer
x = Flatten()(x)
# layer-6: the first dense layer
x = Dense(output_dim = 128, activation='relu')(x)
# layer-7: the second dense layer
x = Dense(output_dim = 128, activation='relu')(x)
# layer-8: the output layer
final_output = Dense(1, activation='sigmoid')(x)

  
  # Remove the CWD from sys.path while we load stuff.
  


### Compile the model

In [26]:
# define the model
model = Model(input=sequence_input, output=final_output)
# compile the model
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['acc'])

  


In [42]:
# Display the training inputs and labels as a quick sanity check of the split.
x_train, y_train

(array([[    0,     0,     0, ...,  5200,   266,   268],
        [    0,     0,     0, ...,     7,   230,   157],
        [    0,     0,     0, ...,  3440,     2, 17886],
        ...,
        [    0,     0,     0, ...,     7,     7,  1017],
        [    0,     0,     0, ...,   532,    42,     9],
        [    0,     0,     0, ...,    43,    21,    89]]),
 array([1, 0, 1, ..., 0, 0, 0]))

### Training and validation

In [28]:
# training and validation
print ('Training the model ...')
model.fit(x=x_train, y=y_train, validation_data=(x_validation, y_validation), nb_epoch=5, batch_size=128, verbose=1)

Training the model ...


  This is separate from the ipykernel package so we can avoid doing imports until


Train on 40000 samples, validate on 10000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x11b692b3d68>

### Evaluate the model

In [29]:
# evaluate the model
print ('Evaluating the model ...')
test_accuracy = model.evaluate(x_validation, y_validation, verbose=1)
print ('\nThe average accuracy on the evaluation data set is %.3f.' % test_accuracy[1])

Evaluating the model ...

The average accuracy on the evaluation data set is 0.901.


In [46]:
# Save the trained model to disk.
# NOTE(review): assumes the 'models/' directory already exists — confirm.
model.save('models/keras_custom_90_perc.h5')