In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import numpy as np
import pandas as pd
import os
import time
from tqdm import tqdm
import math

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D, MaxPooling1D, Flatten, GlobalAveragePooling1D
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras.models import load_model
from keras.callbacks import ModelCheckpoint

from keras import initializers, regularizers, constraints, optimizers, layers \

from fastai.vision import *
    
import matplotlib.pyplot as plt
# NOTE(review): this selects the non-interactive 'agg' backend *after*
# `%matplotlib inline` was requested at the top of the cell, which presumably
# replaces the inline backend. If so, the plt.show() calls in the plotting
# cells render nothing in the notebook (savefig still writes the PNG files).
# Confirm this is intended; otherwise drop this line.
plt.switch_backend('agg')


Using TensorFlow backend.


In [0]:
# Fetch the IMDB sample archive (untar_data returns the extracted folder),
# then load its review table into a DataFrame.
path = untar_data(URLs.IMDB_SAMPLE)
texts_csv = path / 'texts.csv'
df = pd.read_csv(texts_csv)

In [3]:
# Shuffle the rows with a fixed seed.
# The original used the unseeded global NumPy RNG, so every kernel restart
# produced a different row order -- and therefore a different train/validation
# split and different metrics. A dedicated, seeded RandomState makes the
# shuffle reproducible without touching the global RNG state.
SHUFFLE_SEED = 42
shuffle_rng = np.random.RandomState(SHUFFLE_SEED)
df = df.iloc[shuffle_rng.permutation(len(df))]
print(df.head(10))

        label                                               text  is_valid
177  positive  This was the best documentary I've ever seen!!...     False
614  negative  A below average looking video game is turned i...     False
390  positive  Sure, it's a 50's drive-in special, but don't ...     False
298  positive  This is a very dramatic and suspenseful movie....     False
282  positive  If in the 90's you're adapting a book written ...     False
916  negative  I had suspicions the movie was going to be bad...      True
543  negative  I saw this film in its premier week in 1975. I...     False
770  negative  This was the first Ewan McGregor movie I ever ...     False
223  positive  A true dark noir movie and a very graphic film...     False
499  positive  A very realistic portrait of a broken family a...     False


In [4]:
# Encode the sentiment strings as integers: negative -> 0, positive -> 1
label_to_id = {'negative': 0, 'positive': 1}
encoded_labels = df['label'].map(label_to_id)
df['label'] = encoded_labels
print(df.head())

     label                                               text  is_valid
177      1  This was the best documentary I've ever seen!!...     False
614      0  A below average looking video game is turned i...     False
390      1  Sure, it's a 50's drive-in special, but don't ...     False
298      1  This is a very dramatic and suspenseful movie....     False
282      1  If in the 90's you're adapting a book written ...     False


In [0]:
# Hyperparameters shared by the cells below.
VALIDATION_PERCENT = 0.2 # fraction (0.2 = 20%, not a percentage) of rows held out for validation
EMBED_SIZE = 50 # size of word vector
MAX_FEATURES = 15000 # vocabulary cap given to the tokenizer; also the number of rows in the embedding matrix
MAX_LEN = 200 # number of tokens per review after padding/truncation
NUMBER_OF_CLASSES = 1 # units in the model's output layer (a single sigmoid unit for the binary label)

In [21]:
# Hold out the first `cut` rows of df for validation and train on the rest
cut = 1 + int(len(df) * VALIDATION_PERCENT)
valid_df = df.iloc[:cut]
train_df = df.iloc[cut:]

for subset in (train_df, valid_df):
    print(subset.head())

     label                                               text  is_valid
544      1  Hmmmm, want a little romance with your mystery...     False
716      1  I adored this movie. Not only because I am a b...     False
765      0  Felt mine was while watching this...but it see...     False
903      1  After high-school graduation, best friends Ali...      True
496      1  this independent film was one of the best film...     False
     label                                               text  is_valid
177      1  This was the best documentary I've ever seen!!...     False
614      0  A below average looking video game is turned i...     False
390      1  Sure, it's a 50's drive-in special, but don't ...     False
298      1  This is a very dramatic and suspenseful movie....     False
282      1  If in the 90's you're adapting a book written ...     False


In [0]:
# Substitute the "_na_" placeholder for missing reviews and keep the raw arrays
train_X, valid_X = (
    frame["text"].fillna("_na_").values
    for frame in (train_df, valid_df)
)

In [0]:
# Build the vocabulary from the training reviews only, then turn both
# splits into lists of integer token ids.
tokenizer = Tokenizer(num_words=MAX_FEATURES)
training_texts = list(train_X)
tokenizer.fit_on_texts(training_texts)
train_X, valid_X = (
    tokenizer.texts_to_sequences(texts)
    for texts in (train_X, valid_X)
)

In [24]:
# Peek at the first few encoded training reviews
print('Train_X: ')
print(train_X[:5])

# Vocabulary size plus the first and last ten (index, word) pairs
print('Tokenizer: ')
word_index = tokenizer.word_index
print('# of unique tokens: ', len(word_index))
index_word_pairs = list(tokenizer.index_word.items())
print(index_word_pairs[:10])
print(index_word_pairs[-10:])

Train_X: 
[[4689, 162, 3, 119, 665, 17, 130, 945, 10, 46, 9, 11, 103, 43, 1, 665, 13, 4690, 10, 64, 25, 93, 14, 3, 114, 15, 18, 88, 101, 1, 665, 26, 4690, 53, 1, 3812, 2749, 35, 135, 429, 3, 9012, 665, 518, 822, 1, 2195, 12, 6, 66, 1, 665, 83, 4690, 1, 60, 206, 25, 83, 3, 119, 95, 642, 1, 945, 120, 297, 95, 73, 161, 235, 18, 163, 20, 1, 625, 4, 1479, 1019, 3178, 6137, 438, 72, 11, 198, 3813, 1827, 13, 165, 51, 40, 439, 3814, 11, 103, 1480, 45, 302, 1303, 18, 10, 3179, 3180, 254, 207, 101, 25, 83, 114, 204, 294, 17, 3, 49, 72, 588, 255, 15, 278, 1, 981, 13, 414, 2, 1, 415, 101, 25, 83, 2750, 20, 178, 101, 25, 83, 3, 119, 9013, 3, 119, 4691, 49, 4692, 27, 151, 12, 13, 89, 38, 10, 13, 1, 394, 4, 148, 2196, 1481, 12, 101, 742, 25, 83, 1482, 36, 10, 101, 25, 83, 666, 8, 6138, 18, 31, 62, 59, 8, 9014, 319, 1384, 34, 9015, 782, 3, 9016, 13, 1, 195, 2, 39, 9017, 1692, 5, 2197, 879, 589, 395, 3815, 495, 22, 51, 66, 10, 83, 3, 49, 847, 1483, 395, 287, 14, 1, 193, 263, 395, 3815, 495, 124, 62, 33

In [25]:
# Bring every sequence to exactly MAX_LEN token ids so the batches are rectangular
pad_options = {'maxlen': MAX_LEN}
train_X = pad_sequences(train_X, **pad_options)
valid_X = pad_sequences(valid_X, **pad_options)

for padded in (train_X, valid_X):
    print(padded)

[[ 101   25   83  114 ...    3  982    4 1063]
 [  10  209 6144 6141 ...   15  138   65    9]
 [   4  248  477   16 ...    1 9045 9046   95]
 [2759   17   96   31 ...    9    3  745  147]
 ...
 [   4   24  107    2 ...    2  452    6  573]
 [   0    0    0    0 ...    2  726  411  711]
 [  66   83  643    8 ...   39   24 5576 4320]
 [   0    0    0    0 ...    7   67  982 1291]]
[[    0     0     0     0 ...     6     3  6633  5036]
 [    0     0     0     0 ...  4292     9   779  1291]
 [11724   236    58    22 ...    74   129  6401   839]
 [    0     0     0     0 ...    70   552   819   147]
 ...
 [   79     4     9    18 ...   107   500    10  4205]
 [    1   393    63    46 ...   127    32     1   118]
 [    8    63     1   532 ...     9   745     4   147]
 [    0     0     0     0 ...    32     1  6428  1272]]


In [26]:
# Pull the label column of each split out as a plain array
train_Y = train_df.loc[:, 'label'].values
valid_Y = valid_df.loc[:, 'label'].values

print(train_Y, valid_Y, sep='\n')

[1 1 0 1 ... 0 0 1 0]
[1 0 1 1 ... 0 0 1 0]


In [0]:
# Create input
# Symbolic model input: MAX_LEN int32 values per sample (the padded token ids).
# NOTE(review): the name `input` shadows Python's built-in input(). It is kept
# because the embedding and Model(...) cells reference it; rename all uses
# together if this is cleaned up.
input = Input(shape=(MAX_LEN,), dtype='int32')

In [0]:
# Either use pretrained embeddings or create own embeddings from data

# PRETRAINED (disabled alternative, kept for reference)
# -----------------------------------
# NOTE(review): the file name suggests 100-dimensional GloVe vectors, while
# EMBED_SIZE is 50 in this notebook. With that mismatch the row assignment
# into embedding_matrix below would presumably fail on shape; set EMBED_SIZE
# to the vector size before enabling this path.
# embeddings_index = {}
# f = open('glove.6B.100d.txt',encoding='utf8')
# for line in f:
#     values = line.split()
#     word = values[0]
#     coefs = np.asarray(values[1:], dtype='float32')
#     embeddings_index[word] = coefs
# f.close()
# 
# print('Total %s word vectors in Glove 6B 100d.' % len(embeddings_index))
# 
# embedding_matrix = np.random.random((len(word_index) + 1, EMBED_SIZE))
# for word, i in word_index.items():
#     embedding_vector = embeddings_index.get(word)
#     if embedding_vector is not None:
#         # words not found in the embedding index keep their random initial
#         # values (the matrix is created with np.random.random, not zeros).
#         embedding_matrix[i] = embedding_vector
# 
# embedding_layer = Embedding(len(word_index) + 1, EMBED_SIZE, weights=[embedding_matrix], input_length=MAX_LEN, trainable=True)
# embedded_sequences = embedding_layer(input)

# Create own embeddings
# -----------------------------------
# Embedding table with MAX_FEATURES rows of EMBED_SIZE values each, applied to
# the token ids in `input`; no pretrained weights are passed, so it is learned
# during training.
embedded_sequences = Embedding(MAX_FEATURES, EMBED_SIZE)(input)

In [29]:
# Two Conv1D + max-pooling stages over the embedded tokens, averaged over the
# remaining time steps and passed through a small dense layer. The layers are
# instantiated in the same order as they are applied.
hidden_stack = [
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=4),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=5),
    GlobalAveragePooling1D(),
    Dense(units=32, activation='relu'),
]
features = embedded_sequences
for layer in hidden_stack:
    features = layer(features)

# Output layer: NUMBER_OF_CLASSES sigmoid unit(s)
preds = Dense(units=NUMBER_OF_CLASSES, activation='sigmoid')(features)

model = Model(inputs=input, outputs=preds)
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc'],
)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 200)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 200, 100)          1200000   
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 196, 128)          64128     
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 49, 128)           0         
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 45, 128)           82048     
_________________________________________________________________
max_pooling1d_6 (MaxPooling1 (None, 9, 128)            0         
_________________________________________________________________
global_average_pooling1d_3 ( (None, 128)               0         
__________

In [30]:
from keras.callbacks import ReduceLROnPlateau

# Write the model to disk only on epochs where validation accuracy improves
checkpoint = ModelCheckpoint(
    'model_cnn.hdf5',
    monitor='val_acc',
    save_best_only=True,
    verbose=1,
)

# Halve the learning rate when val_acc stalls (patience 3), never going below 1e-5
learning_rate_reduction = ReduceLROnPlateau(
    monitor='val_acc',
    factor=0.5,
    patience=3,
    min_lr=0.00001,
    verbose=1,
)

fit_callbacks = [checkpoint, learning_rate_reduction]
history = model.fit(
    train_X,
    train_Y,
    validation_data=(valid_X, valid_Y),
    epochs=10,
    batch_size=2,
    callbacks=fit_callbacks,
)

Train on 799 samples, validate on 201 samples
Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.55721, saving model to model_cnn.hdf5
Epoch 2/10

Epoch 00002: val_acc improved from 0.55721 to 0.79602, saving model to model_cnn.hdf5
Epoch 3/10

Epoch 00003: val_acc improved from 0.79602 to 0.81095, saving model to model_cnn.hdf5
Epoch 4/10

Epoch 00004: val_acc did not improve from 0.81095
Epoch 5/10

Epoch 00005: val_acc did not improve from 0.81095
Epoch 6/10

Epoch 00006: val_acc did not improve from 0.81095

Epoch 00006: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 7/10

Epoch 00007: val_acc did not improve from 0.81095
Epoch 8/10

Epoch 00008: val_acc did not improve from 0.81095
Epoch 9/10

Epoch 00009: val_acc did not improve from 0.81095

Epoch 00009: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 10/10

Epoch 00010: val_acc did not improve from 0.81095


In [0]:
# Training vs. validation loss per epoch, drawn on an explicit Axes and saved to PNG
fig1, loss_ax = plt.subplots()
for history_key, line_style in (('loss', 'r'), ('val_loss', 'b')):
    loss_ax.plot(history.history[history_key], line_style, linewidth=3.0)
loss_ax.legend(['Training loss', 'Validation Loss'], fontsize=18)
loss_ax.set_xlabel('Epochs ', fontsize=16)
loss_ax.set_ylabel('Loss', fontsize=16)
loss_ax.set_title('Loss Curves :CNN', fontsize=16)
fig1.savefig('loss_cnn.png')
plt.show()

In [0]:
# Training vs. validation accuracy per epoch, drawn on an explicit Axes and saved to PNG
fig2, acc_ax = plt.subplots()
for history_key, line_style in (('acc', 'r'), ('val_acc', 'b')):
    acc_ax.plot(history.history[history_key], line_style, linewidth=3.0)
acc_ax.legend(['Training Accuracy', 'Validation Accuracy'], fontsize=18)
acc_ax.set_xlabel('Epochs ', fontsize=16)
acc_ax.set_ylabel('Accuracy', fontsize=16)
acc_ax.set_title('Accuracy Curves : CNN', fontsize=16)
fig2.savefig('accuracy_cnn.png')
plt.show()