In [1]:
import numpy as np

from tensorflow.keras.models import Sequential, model_from_json
from tensorflow.keras.layers import Conv1D, Dense, Dropout, Activation, GlobalMaxPooling1D
from tensorflow.keras.preprocessing import sequence

# Seed NumPy's global random number generator for repeatability.
# Note: this does not seed the standard-library `random` module or TensorFlow's own RNGs.
np.random.seed(1337)

In [2]:
# Download dataset here:
# https://ai.stanford.edu/~amaas/data/sentiment/

In [3]:
import glob
import os

from random import shuffle

def preprocess_data(filepath, seed=None):
    """
    Load labelled text samples from ``filepath`` and return them in shuffled order.

    ``filepath`` must contain a ``pos`` and a ``neg`` sub-directory. Every ``*.txt``
    file in ``pos`` becomes a sample labelled 1 and every ``*.txt`` file in ``neg``
    a sample labelled 0. This layout depends on your training source, so other
    corpora may need a different loader.

    Args:
        filepath: Directory holding the ``pos`` and ``neg`` sub-directories.
        seed: Optional seed for the shuffle. With ``None`` (the default) the global
            state of the ``random`` module is used, as before. Pass a value to get
            the same ordering on every run.

    Returns:
        A list of ``(label, text)`` tuples in shuffled order.
    """
    labelled_dirs = (
        (1, os.path.join(filepath, 'pos')),
        (0, os.path.join(filepath, 'neg')),
    )

    dataset = []
    for label, directory in labelled_dirs:
        # glob returns files in a filesystem-dependent order; sorting makes a seeded
        # shuffle produce the same result on every machine.
        for filename in sorted(glob.glob(os.path.join(directory, '*.txt'))):
            # Decode explicitly as UTF-8 instead of relying on the platform's default
            # locale encoding, which can fail on non-ASCII review text.
            with open(filename, 'r', encoding='utf-8') as f:
                dataset.append((label, f.read()))

    if seed is None:
        shuffle(dataset)
    else:
        # Use a private generator so the global `random` state is left untouched.
        from random import Random
        Random(seed).shuffle(dataset)

    return dataset

In [4]:
# Load the labelled training reviews as a shuffled list of (label, text) tuples.
dataset = preprocess_data('../data/aclImdb/train')
# Peek at the first sample to sanity-check the (label, text) layout.
dataset[0]

(0,
 'So there\'s an old security guard and a guy who dies and then there\'s KEVIN, the world\'s biggest wuss. Kevin wants to impress his incredibly insensitive, bratty, and virginal girlfriend AMY. As he returns from work to... a random house... he finds his "friends," the sexually confusing red-shorted KYLE and the truly revolting sluttish DAPHNE. They are soon joined by Daphne\'s boyfriend, the trigger-happy sex-crazed macho lunkhead NICK. And there\'s the title creatures, horrid little dogeared puppets who kill people by giving them their heart\'s desire. Kyle\'s heart\'s desire is to mate with a creepy, yucky woman in spandex. Nick\'s heart\'s desire is to throw grenades in a grade school cafeteria-- I mean nightclub. Kevin\'s heart\'s desire is to beat up a skinny thug with nunchucks. Amy\'s heart\'s desire is to be a disgusting slut. Daphne\'s already a disgusting slut, so she doesn\'t have a heart\'s desire. Along the way a truly hideous band sings a truly odd song. The hobgobl

In [5]:
from nltk.tokenize import TreebankWordTokenizer
from gensim.models.keyedvectors import KeyedVectors

In [6]:
# Pretrained Google News word2vec vectors. Download "GoogleNews-vectors-negative300.bin.gz" from:
# https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
# NOTE: the path below points at an uncompressed .bin, so presumably the archive must be
# gunzipped into ../data first.

# word_vectors = get_data('w2v', limit=200_000)
# `limit=200_000` caps how many word vectors are read from the file.
word_vectors = KeyedVectors.load_word2vec_format("../data/GoogleNews-vectors-negative300.bin", binary=True, limit=200_000)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [7]:
def tokenize_and_vectorize(dataset):
    """
    Convert each (label, text) sample into a list of word vectors.

    The text (``sample[1]``) is split with NLTK's ``TreebankWordTokenizer`` and each
    token is looked up in the module-level ``word_vectors``. Tokens missing from that
    vocabulary are dropped, so a sample may yield fewer vectors than it has tokens,
    possibly none. Labels are not used here.

    Returns:
        A list parallel to ``dataset``; each entry is the list of word vectors found
        for that sample's text.
    """
    tokenizer = TreebankWordTokenizer()
    vectorized_data = []

    for sample in dataset:
        tokens = tokenizer.tokenize(sample[1])
        sample_vecs = []
        for token in tokens:
            try:
                sample_vecs.append(word_vectors[token])
            except KeyError:
                pass # No matching token in the Google w2v vocab.
        vectorized_data.append(sample_vecs)

    return vectorized_data

In [8]:
def collect_expected(dataset):
    """
    Pull the target label out of every sample.

    Each sample stores its label at index 0: 0 marks a negative review and
    1 a positive one. The labels are returned as a list in dataset order.
    """
    return [labelled_sample[0] for labelled_sample in dataset]

In [9]:
# Turn every review into a list of word vectors, and collect the matching labels
# (both lists follow the order of `dataset`).
vectorized_data = tokenize_and_vectorize(dataset)
expected = collect_expected(dataset)

In [10]:
# Use the first 80% of the samples for training and hold out the remaining 20%.
split_point = int(len(vectorized_data) * .8)

x_train, x_test = vectorized_data[:split_point], vectorized_data[split_point:]
y_train, y_test = expected[:split_point], expected[split_point:]

In [11]:
# CNN Parameters.

maxlen = 400         # Number of token vectors per sample after padding/truncating.
batch_size = 32      # How many samples to show the net before backpropagating the error and updating the weights.
embedding_dims = 300 # Length of token vectors you will create for passing into the convnet.
filters = 250        # Number of filters you will train.
kernel_size = 3      # The width of the filters; each filter is a matrix of weights of size embedding_dims x kernel_size (300 x 3 here).
hidden_dims = 250    # Number of neurons in the plain feedforward net at the end of the chain.
epochs = 2           # Number of times you will pass the entire training dataset through the network.

In [12]:
def pad_trunc(data, maxlen):
    """
    Pad with zero vectors, or truncate, every sample in ``data`` to ``maxlen`` vectors.

    Args:
        data: Sequence of samples, each a list of equal-length word vectors.
        maxlen: Number of vectors every returned sample should contain.

    Returns:
        A new list of new sample lists. The input samples are never modified.
        An empty ``data`` yields an empty list.

    Raises:
        ValueError: If padding is required but every sample is empty, so the
            length of a word vector cannot be determined.
    """
    # Take the vector length from the first non-empty sample, so that an empty
    # leading sample (no token found in the vocabulary) does not break the lookup.
    zero_vector = None
    for sample in data:
        if len(sample) > 0:
            zero_vector = [0.0] * len(sample[0])
            break

    new_data = []
    for sample in data:
        # Slicing copies, so padding below never appends to the caller's list.
        temp = list(sample[:maxlen])
        missing = maxlen - len(temp)
        if missing > 0:
            if zero_vector is None:
                raise ValueError(
                    'cannot pad: no sample contains a vector to infer the vector length from'
                )
            # Append the appropriate number of zero vectors to the list.
            temp.extend([zero_vector] * missing)
        new_data.append(temp)
    return new_data

In [13]:
# Bring every sample to exactly `maxlen` token vectors.
x_train = pad_trunc(x_train, maxlen)
x_test = pad_trunc(x_test, maxlen)

# Convert the nested lists into arrays shaped (samples, maxlen, embedding_dims), and the
# labels into 1-D arrays, which is what the model below is fed with.
x_train = np.reshape(x_train, (len(x_train), maxlen, embedding_dims))
y_train = np.array(y_train)
x_test = np.reshape(x_test, (len(x_test), maxlen, embedding_dims))
y_test = np.array(y_test)

In [14]:
# Start a linear stack of layers with a single 1-D convolution that slides along the
# token axis of each (maxlen, embedding_dims) sample.
model = Sequential()
model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid', # 'same' or 'valid' are options.
                 activation='relu',
                 strides=1,
                 input_shape=(maxlen, embedding_dims)))

# Pooling

Pooling is the convolutional neural network's path to dimensionality reduction.

Average pooling takes the average of the subset of values, which in theory retains the most data.
 
Max pooling has an interesting property, in that by taking the largest activation value for the given region, the network sees that subsection's most prominent feature.

In [15]:
# Keep only the largest activation of each filter across the whole sequence,
# leaving one value per filter.
model.add(GlobalMaxPooling1D())

# Dropout

A special technique developed to prevent overfitting in neural networks.

In [16]:
# Fully connected hidden layer; dropout randomly zeroes 20% of its outputs during training.
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))

# Single sigmoid output unit: a value in (0, 1) for the binary positive/negative label.
model.add(Dense(1))
model.add(Activation('sigmoid'))
# Binary cross-entropy matches the single sigmoid output and the 0/1 labels.
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [17]:
# Sanity check: the training array shape should be (samples, maxlen, embedding_dims),
# and the summary lists each layer's output shape and parameter count.
print(x_train.shape)
model.summary()

(20000, 400, 300)
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d (Conv1D)              (None, 398, 250)          225250    
_________________________________________________________________
global_max_pooling1d (Global (None, 250)               0         
_________________________________________________________________
dense (Dense)                (None, 250)               62750     
_________________________________________________________________
dropout (Dropout)            (None, 250)               0         
_________________________________________________________________
activation (Activation)      (None, 250)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 251       
_________________________________________________________________
activation_1 (Activation)    (None, 1)

In [18]:
# Train the network. The held-out 20% split is passed as validation data, so the reported
# validation metrics come from the same samples named x_test / y_test.
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))

W1222 08:15:23.986097 4683374016 deprecation.py:323] From /usr/local/lib/python3.7/site-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 20000 samples, validate on 5000 samples
Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x16b61d9d0>

In [19]:
# Persist the model in two parts: the architecture as JSON and the trained weights as HDF5.
model_structure = model.to_json()
with open('cnn_model.json', 'w') as json_file:
    json_file.write(model_structure)
model.save_weights('cnn_weights.h5')

In [20]:
# Rebuild the model from the saved architecture, then restore the trained weights.
# Note: this rebinds `model`; the reloaded model is not compiled here, and only
# prediction is run on it below.
with open('cnn_model.json', 'r') as json_file:
    json_string = json_file.read()
model = model_from_json(json_string)
model.load_weights('cnn_weights.h5')

In [21]:
# An ad-hoc piece of text to classify with the reloaded model.
sample_1 = 'I hate that the dismal weather had me down for so long, when will it break! Ugh, when does happiness return? The sun is blinding and the puffy clouds are too thin. I can\'t wait for the weekend.'

In [22]:
# The label (1) in the tuple is only a placeholder: tokenize_and_vectorize reads the text
# at index 1 and ignores the label.
vec_list = tokenize_and_vectorize([(1, sample_1)])
# Apply the same padding and reshaping as for the training data.
test_vec_list = pad_trunc(vec_list, maxlen)
test_vec = np.reshape(test_vec_list, (len(test_vec_list), maxlen, embedding_dims))
# Raw sigmoid output: values near 0 lean negative, values near 1 lean positive.
model.predict(test_vec)

array([[0.0158824]], dtype=float32)

In [23]:
# `Sequential.predict_classes` was deprecated and later removed from tf.keras. For a
# single sigmoid output, thresholding the prediction at 0.5 yields the same 0/1 class
# labels (int32) and works on both old and new versions.
(model.predict(test_vec) > 0.5).astype("int32")

array([[0]], dtype=int32)