# Sentiment Analysis using CNN with features extracted from word2vec (IMDb on CBOW algo)

### Data Generator?
In previous attempts, RAM constraints meant we couldn't fit the entire reviews and all the word vector dimensions into memory, so we could only use the first 100 vector dimensions and roughly the first 200 words of each movie review.

With a data generator and Keras' `model.fit_generator()` function, we can pass a Python generator that yields an endless stream of `X_train` and `Y_train` batches, so the full dataset never has to be materialized as one array.

In [1]:
# Notebook
%matplotlib inline
import matplotlib.pyplot as plt
import sys
import numpy as np
from text_tokenizer import tokenize

# ==== CONFIGS ====

# Path the trained Keras model is saved to at the end of the notebook
h5_save_as = "models/cnn-cbow.h5"

# Word vectors in word2vec binary format; can be swapped for another word2vec
# binary file. Only the first `word_vector_dims` components of each vector are
# used, so this must not exceed the dimensionality stored in the file.
word_vector_bin_file = "word2vec/w2v-imdb-cbow-100d.bin"
word_vector_dims = 100

# Number of word slots per review fed to the network: longer reviews are
# truncated and shorter ones are padded by the batch generator.
# The longest training review counted below is 2606 words, so 3000 keeps every
# training review whole.
max_review_length = 3000

# Glob patterns for the review files; can easily be swapped for other datasets.
# `*_txts` are the training files, `*_vals` the validation files.
positive_review_txts = "aclImdb/train/pos/*.txt"
negative_review_txts = "aclImdb/train/neg/*.txt"
positive_review_vals = "aclImdb/test/pos/*.txt"
negative_review_vals = "aclImdb/test/neg/*.txt"

# Token whose vector fills the unused word slots of a short review
pad_token = '<PAD/>'
# Two-element label vectors: index 0 = positive, index 1 = negative
positive_y = [1, 0]
negative_y = [0, 1]

## Step 1: Make X_train

`X_train` is a 3D array indexed by review, word, and vector component:
```json
[
  // a review
  [
    // a word, represented by its 100-dimensional vector
    [0.75, 0.64 ...],
    ...
  ], 
  ...
]
```

In [2]:
import glob

def _read_reviews(pattern, label, bucket, files_seen):
    """Tokenize every file matching `pattern` and append it to `bucket`.

    Each file becomes one `[tokens, label]` entry. A running file count is
    written to stdout every 1000 files (notebook progress display only).

    Returns a tuple `(files_seen, longest)`: the updated running file count
    and the largest token count among the files read by this call.
    """
    longest = 0
    for path in glob.glob(pattern):
        with open(path, 'r') as handle:
            tokens = tokenize(handle.read())
            longest = max(longest, len(tokens))
            bucket.append([tokens, label])
            files_seen += 1
            if files_seen % 1000 == 0:
                sys.stdout.write("\r{0}".format(files_seen))
                sys.stdout.flush()
    return files_seen, longest


training_reviews = []
validating_reviews = []

# Running total over all four directories; just for the notebook display
file_read_count = 0

# Largest token count seen among the training reviews
highest_review_word_count = 0
for _pattern, _label in [(positive_review_txts, positive_y),
                         (negative_review_txts, negative_y)]:
    file_read_count, _longest = _read_reviews(
        _pattern, _label, training_reviews, file_read_count)
    highest_review_word_count = max(highest_review_word_count, _longest)

# For validation purposes; these do not affect highest_review_word_count
for _pattern, _label in [(positive_review_vals, positive_y),
                         (negative_review_vals, negative_y)]:
    file_read_count, _longest = _read_reviews(
        _pattern, _label, validating_reviews, file_read_count)

print('highest word count: ', highest_review_word_count)

50000('highest word count: ', 2606)


## Step 2: Assign vector to vocabs

In [3]:
import sys
import gensim
from gensim.models import Word2Vec
# Load the word vectors from the configured word2vec binary file; `word_vecs`
# is then indexed by word in `word_vector_for`.
word_vecs = Word2Vec.load_word2vec_format(word_vector_bin_file, binary=True)

Using gpu device 0: GeForce GTX 1060 6GB (CNMeM is disabled, cuDNN 5105)


In [4]:
# Random vectors already assigned to out-of-vocabulary words, keyed by word,
# so that a given unknown word (or the pad token) always maps to one vector.
_unknown_word_vectors = {}


def word_vector_for(word):
    """Return the vector for `word`, truncated to `word_vector_dims` components.

    Words present in `word_vecs` return the first `word_vector_dims`
    components of their stored vector. Words that are missing get a random
    vector drawn uniformly from [-0.25, 0.25); that vector is remembered, so
    repeated lookups of the same unknown word return the same values instead
    of fresh noise on every call.

    The returned array may be shared (a slice of the stored vector or the
    cached random vector); callers should copy it before modifying it.
    """
    try:
        return word_vecs[word][:word_vector_dims]
    except KeyError:
        pass

    if word not in _unknown_word_vectors:
        _unknown_word_vectors[word] = np.random.uniform(
            -0.25, 0.25, word_vector_dims)
    return _unknown_word_vectors[word]

In [5]:
import random
# A generator builds the large numpy batches on the fly, so the whole dataset
# never has to be held in memory as one array.
# Considerations:
# - This generator must be an infinite loop
# - Every pass over the data must be freshly shuffled
def reviews_generator(reviews, batch_size=50):
    """Yield shuffled `(x_train, y_train)` batches forever.

    Args:
        reviews: sequence of `[words, label]` entries, where `words` is the
            tokenized review and `label` is a two-element label vector.
        batch_size: number of reviews per yielded batch.

    Yields:
        A tuple `(x_train, y_train)` of float32 arrays:
        `x_train` has shape `(batch_size, max_review_length, word_vector_dims)`
        and `y_train` has shape `(batch_size, 2)`. Reviews longer than
        `max_review_length` are truncated; shorter ones are padded with the
        vector of `pad_token` up to and including the last row.

    Raises:
        ValueError: if there are fewer reviews than `batch_size`, since no
            full batch could ever be produced.

    Reviews left over after the last full batch of a pass are dropped for
    that pass; they are reshuffled into the next one.
    """
    if len(reviews) < batch_size:
        raise ValueError(
            "need at least batch_size={0} reviews, got {1}".format(
                batch_size, len(reviews)))

    # Shuffle a private copy so the caller's list keeps its order
    shuffled = list(reviews)
    x_shape = (batch_size, max_review_length, word_vector_dims)
    y_shape = (batch_size, 2)

    while True:
        random.shuffle(shuffled)

        batch_counter = 0
        x_train = np.zeros(x_shape, dtype='float32')
        y_train = np.zeros(y_shape, dtype='float32')
        for review in shuffled:
            words = review[0][:max_review_length]
            for j, word in enumerate(words):
                x_train[batch_counter][j] = word_vector_for(word)
            # Pad every remaining row, including the final one
            x_train[batch_counter][len(words):] = word_vector_for(pad_token)
            y_train[batch_counter] = np.array(review[1])

            batch_counter += 1
            if batch_counter == batch_size:
                yield x_train, y_train
                # Fresh arrays, so a batch already handed out is never overwritten
                batch_counter = 0
                x_train = np.zeros(x_shape, dtype='float32')
                y_train = np.zeros(y_shape, dtype='float32')

## Step 3: Keras

![YoonKim CNN Architecture](yoonkim-cnn-architecture.png)

In [6]:
from keras.models import Model, Sequential
from keras.layers.convolutional import Convolution1D, Convolution2D
from keras.layers.pooling import MaxPooling1D, MaxPooling2D
from keras.layers import Merge, Dense, Dropout, Activation, Input, Flatten
from keras.optimizers import SGD

Using Theano backend.


In [7]:
# Based on the paper, there are filters of various sizes
# `filters` is passed as the first argument of every Convolution1D below.
filters = 1
# NOTE(review): `epochs` is not referenced by the fit_generator call in this
# notebook, which passes 25 directly.
epochs = 10

# One convolution branch is built per filter size
layer1_filter_sizes = [3,4,5]
layer1_convs = []

# Input is one review: max_review_length word slots x word_vector_dims values
graph_in = Input(shape=(max_review_length, word_vector_dims))

# Each branch: 1D convolution -> max pooling (pool length 2) -> flatten
for filter_size in layer1_filter_sizes:
    conv = Convolution1D(filters,
                         filter_size,
                         border_mode = 'valid',
                         activation='relu',
                         subsample_length=1)(graph_in)
    pool = MaxPooling1D(pool_length=2)(conv)
    flatten = Flatten()(pool)
    layer1_convs.append(flatten)

# Merge the conv branches by concatenating their flattened outputs
merged = Merge(mode='concat')(layer1_convs)
graph = Model(input=graph_in, output=merged)

# Classifier head stacked on the merged conv features:
# Dense(16) + ReLU -> Dropout(0.5) -> Dense(2) + softmax
final_model = Sequential()
final_model.add(graph)
final_model.add(Dense(16))
final_model.add(Activation('relu'))
final_model.add(Dropout(0.5))
final_model.add(Dense(2))
final_model.add(Activation('softmax'))

final_model.compile(loss='binary_crossentropy',
                    optimizer='adam',
                    metrics=['accuracy'])


## Step 4: TRAIN THIS

In [8]:
# Train for 25 epochs; each epoch draws as many samples as there are training
# reviews, and validation draws as many samples as there are validating reviews.
final_model.fit_generator(
    reviews_generator(training_reviews),
    samples_per_epoch=len(training_reviews),
    nb_epoch=25,
    validation_data=reviews_generator(validating_reviews),
    nb_val_samples=len(validating_reviews))

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x7f1d3423dc90>

In [9]:
# Save the trained model to the path configured in `h5_save_as`
final_model.save(h5_save_as)