In [172]:
%matplotlib inline
import numpy as np
import utils
from keras.models import Model, Sequential
from keras.layers import Dense, Input, BatchNormalization, Embedding, merge, Flatten, Dropout, Convolution1D, MaxPooling1D
from keras.regularizers import l2
from keras.optimizers import Adam
from keras.metrics import mean_absolute_percentage_error
from keras.utils.data_utils import get_file
from keras.preprocessing.sequence import pad_sequences 
from keras.metrics import binary_accuracy
import matplotlib.pyplot as plt
import pickle
import pandas as pd

In [2]:
# Alternative data location on another machine:
#   /home/ajith/dl/deeplearning1/nbs/data/imdb/
# The trailing slash matters: paths are built from ddir by plain string concatenation.
ddir = "/home/ubuntu/nbs/courses/deeplearning1/nbs/data/imdb/"
rdir = ddir + "results"

### Data Cleanup

In [3]:
from keras.datasets import imdb

# Cache the word index under its own filename. It used to be pointed at
# "imdb_full.pkl", the same file the full dataset is stored in below, so the two
# downloads overwrote each other and the dataset's md5 check could not pass
# (hence the "incomplete or outdated" warning followed by a fresh download).
word2id = imdb.get_word_index(path=ddir + "imdb_word_index.pkl")

# Download (or reuse the cached copy of) the full, un-truncated IMDB dataset.
path = get_file(ddir + 'imdb_full.pkl',
                origin='https://s3.amazonaws.com/text-datasets/imdb_full.pkl',
                md5_hash='d091312047c43cf9e4e38fef92437263')

# NOTE(security): pickle.load can execute arbitrary code; the md5 check above is
# the only integrity guard on this downloaded file.
# The context manager closes the file handle once the data is loaded.
with open(path, 'rb') as f:
    (x_train, y_train), (x_test, y_test) = pickle.load(f)

A local file was found, but it seems to be incomplete or outdated.
Downloading data from https://s3.amazonaws.com/text-datasets/imdb_word_index.pkl
Downloading data from https://s3.amazonaws.com/text-datasets/imdb_full.pkl

In [4]:
# Inverse of word2id: maps a word id back to its word.
# .items() (rather than the Python-2-only .iteritems()) builds the same dict on Python 2 and 3.
id2word = {idx: word for (word, idx) in word2id.items()}

In [5]:
def to_text(code):
    """Decode a sequence of word ids into text via ``id2word``.

    Every word is followed by a single space, so a non-empty result ends with
    a trailing space. An id missing from ``id2word`` raises ``KeyError``.
    """
    pieces = []
    for word_id in code:
        pieces.append(id2word[word_id])
        pieces.append(" ")
    return ''.join(pieces)

In [113]:
# Shorten reviews by removing the most common words (like 'the', 'and') that do not
# necessarily add to sentiment, and remove the least frequently used words too.
# (Assumes word ids are ordered by frequency, lowest id = most common -- TODO confirm.)

FROM = 5            # lowest word id kept (inclusive)
TO = 5000           # highest word id kept (exclusive)
VOCAB_SIZE = TO     # embedding tables need one row per id in [0, TO)


def reduce_vocab(code, lo=None, hi=None):
    """Drop every word id outside ``[lo, hi)`` from each review.

    Out-of-range ids are removed (not replaced by a placeholder), so the
    returned reviews can be shorter than the inputs, or empty.

    Args:
        code: iterable of reviews, each an iterable of integer word ids.
        lo: smallest id to keep (inclusive). Defaults to ``FROM``.
        hi: upper bound on kept ids (exclusive). Defaults to ``TO``. Values
            above ``VOCAB_SIZE`` would yield ids the embedding layers built
            from ``VOCAB_SIZE`` cannot index.

    Returns:
        A list with one integer numpy array per review, in input order.
    """
    # Resolve defaults at call time so re-running the constants cell takes effect.
    lo = FROM if lo is None else lo
    hi = TO if hi is None else hi
    # An explicit dtype keeps a review with no surviving ids an integer array;
    # np.array([]) alone would be float64.
    return [np.array([val for val in s if lo <= val < hi], dtype=int) for s in code]

In [114]:
# Filter both splits with the same vocabulary bounds.
x_trainc, x_testc = reduce_vocab(x_train), reduce_vocab(x_test)

In [115]:
# Turn the ragged lists of reviews into fixed-width (n_reviews, SEQ_LEN) matrices:
# longer reviews are truncated, shorter ones are filled with 0.
# Neither `padding` nor `truncating` is overridden, so pad_sequences' own defaults
# decide which end of a review is padded or cut.
SEQ_LEN = 500
x_trainc, x_testc = (
    pad_sequences(x_trainc, maxlen=SEQ_LEN, value=0),
    pad_sequences(x_testc, maxlen=SEQ_LEN, value=0),
)
x_trainc.shape

(25000, 500)

### Simple fully connected model (single hidden layer)

In [36]:
nfactors = 32  # size of each learned word vector

# Embedding -> flatten -> 100-unit ReLU hidden layer -> sigmoid output,
# with dropout and batch normalisation ahead of each dense layer.
model = Sequential()
model.add(Embedding(VOCAB_SIZE, nfactors, input_length=SEQ_LEN, W_regularizer=l2(1e-4)))
model.add(Flatten())
model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(Dense(1, activation='sigmoid'))
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_5 (Embedding)          (None, 500, 32)       160000      embedding_input_5[0][0]          
____________________________________________________________________________________________________
flatten_5 (Flatten)              (None, 16000)         0           embedding_5[0][0]                
____________________________________________________________________________________________________
dropout_9 (Dropout)              (None, 16000)         0           flatten_5[0][0]                  
____________________________________________________________________________________________________
batchnormalization_9 (BatchNormal(None, 16000)         32000       dropout_9[0][0]                  
___________________________________________________________________________________________

In [97]:
# Binary sentiment target, so use binary cross-entropy; Adam runs with its default settings.
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(),
    metrics=['accuracy'],
)

In [98]:
# Note: the test split is passed as validation data, so no separate held-out set remains.
model.fit(
    x_trainc,
    y_train,
    batch_size=64,
    nb_epoch=5,
    validation_data=(x_testc, y_test),
)

Train on 25000 samples, validate on 25000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f0a1affccd0>

### CNN model

In [145]:
nfactors = 32  # size of each learned word vector

# Embedding (with its own dropout) -> dropout -> width-5 1D convolution ->
# flatten -> 100-unit ReLU hidden layer -> sigmoid output.
cm = Sequential()
cm.add(Embedding(VOCAB_SIZE, nfactors, input_length=SEQ_LEN,
                 W_regularizer=l2(1e-4), dropout=0.2))
cm.add(Dropout(0.2))
cm.add(Convolution1D(64, 5, border_mode='same', activation='relu'))
cm.add(Flatten())
cm.add(BatchNormalization())
cm.add(Dropout(0.3))
cm.add(Dense(100, activation='relu'))
cm.add(BatchNormalization())
cm.add(Dropout(0.5))
cm.add(Dense(1, activation='sigmoid'))
cm.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_29 (Embedding)         (None, 500, 32)       160000      embedding_input_29[0][0]         
____________________________________________________________________________________________________
dropout_61 (Dropout)             (None, 500, 32)       0           embedding_29[0][0]               
____________________________________________________________________________________________________
convolution1d_29 (Convolution1D) (None, 500, 64)       10304       dropout_61[0][0]                 
____________________________________________________________________________________________________
flatten_29 (Flatten)             (None, 32000)         0           convolution1d_29[0][0]           
___________________________________________________________________________________________

In [149]:
# Same training setup as the dense model: binary cross-entropy with a default Adam optimizer.
cm.compile(
    loss='binary_crossentropy',
    optimizer=Adam(),
    metrics=['accuracy'],
)

In [150]:
# Note: the test split is passed as validation data, so no separate held-out set remains.
cm.fit(
    x_trainc,
    y_train,
    batch_size=64,
    nb_epoch=5,
    validation_data=(x_testc, y_test),
)

Train on 25000 samples, validate on 25000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f09ccf8cd50>

### Pre-trained word vectors

In [173]:
def load_vectors(loc):
    """Load pre-trained word vectors stored under the path prefix ``loc``.

    Three files are read: ``loc + '.dat'`` (via ``utils.load_array``),
    ``loc + '_words.pkl'`` and ``loc + '_idx.pkl'`` (both pickles).

    Args:
        loc: path prefix shared by the three files (not a file itself).

    Returns:
        A ``(vectors, words, index)`` tuple in that order: the array from the
        ``.dat`` file, the unpickled ``_words.pkl`` object and the unpickled
        ``_idx.pkl`` object. Callers in this notebook use ``index`` as a
        word -> row-of-``vectors`` mapping; ``words`` is presumably the
        vocabulary list -- TODO confirm.
    """
    vectors = utils.load_array(loc + '.dat')
    # NOTE(security): pickle.load can execute arbitrary code; only use trusted files.
    # Context managers close both handles, which were previously leaked.
    with open(loc + '_words.pkl', 'rb') as words_file:
        words = pickle.load(words_file)
    with open(loc + '_idx.pkl', 'rb') as idx_file:
        index = pickle.load(idx_file)
    return (vectors, words, index)

In [174]:
# GPATH is a path *prefix*, not a file: load_vectors appends '.dat',
# '_words.pkl' and '_idx.pkl' to it.
GPATH = "/home/ubuntu/nbs/courses/deeplearning1/nbs/data/glove/6B.100d"
(vecs, words, wordidx) = load_vectors(GPATH)

In [186]:
#Create embedding matching word ids in IMDB data
def create_embedding():
    """Build an embedding matrix indexed by IMDB word id from the loaded vectors.

    Reads the notebook globals ``vecs``, ``wordidx``, ``id2word`` and
    ``VOCAB_SIZE``. For each id in ``1 .. VOCAB_SIZE - 1`` the row is the
    pre-trained vector of the corresponding word when ``wordidx`` knows it,
    otherwise a random normal vector (scale 0.6). Row 0 is left as zeros; id 0
    is the value used to pad sequences.

    Prints how many words had no pre-trained vector.

    Returns:
        A float array of shape ``(VOCAB_SIZE, vecs.shape[1])``.

    Raises:
        KeyError: if an id below ``VOCAB_SIZE`` is missing from ``id2word``.
    """
    no_word_cnt = 0
    nfactors = vecs.shape[1]
    emb = np.zeros((VOCAB_SIZE, nfactors))
    for imdb_id in range(1, VOCAB_SIZE):
        imdb_word = id2word[imdb_id]
        # `in` replaces dict.has_key, which no longer exists in Python 3.
        if imdb_word in wordidx:
            emb[imdb_id] = vecs[wordidx[imdb_word]]
        else:
            # No pre-trained vector: start from random noise. The draw is
            # unseeded, so these rows differ from run to run.
            emb[imdb_id] = np.random.normal(scale=0.6, size=(nfactors,))
            no_word_cnt += 1
    print("Created embedding with no_word_cnt=%d" % no_word_cnt)
    return emb

In [187]:
# Embedding matrix with one row per IMDB word id, later passed as the initial
# weights of the Embedding layer (weights=[glovemb]).
glovemb = create_embedding()

Created embedding with no_word_cnt=79


In [201]:
# The embedding width must match the pre-trained vectors it is initialised from.
nfactors = vecs.shape[1]

# Same CNN layout as before, but the Embedding layer starts from glovemb and is
# followed by batch normalisation; the convolution uses 32 filters.
cm = Sequential()
cm.add(Embedding(VOCAB_SIZE, nfactors, input_length=SEQ_LEN,
                 W_regularizer=l2(1e-4), weights=[glovemb]))
cm.add(BatchNormalization())
cm.add(Dropout(0.2))
cm.add(Convolution1D(32, 5, border_mode='same', activation='relu'))
cm.add(Flatten())
cm.add(BatchNormalization())
cm.add(Dropout(0.3))
cm.add(Dense(100, activation='relu'))
cm.add(BatchNormalization())
cm.add(Dropout(0.5))
cm.add(Dense(1, activation='sigmoid'))
cm.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_36 (Embedding)         (None, 500, 100)      500000      embedding_input_36[0][0]         
____________________________________________________________________________________________________
batchnormalization_59 (BatchNorma(None, 500, 100)      200         embedding_36[0][0]               
____________________________________________________________________________________________________
dropout_82 (Dropout)             (None, 500, 100)      0           batchnormalization_59[0][0]      
____________________________________________________________________________________________________
convolution1d_36 (Convolution1D) (None, 500, 32)       16032       dropout_82[0][0]                 
___________________________________________________________________________________________

In [204]:
# Binary cross-entropy with a default Adam optimizer, as for the earlier models.
cm.compile(
    loss='binary_crossentropy',
    optimizer=Adam(),
    metrics=['accuracy'],
)

In [205]:
# Note: the test split is passed as validation data, so no separate held-out set remains.
cm.fit(
    x_trainc,
    y_train,
    batch_size=64,
    nb_epoch=5,
    validation_data=(x_testc, y_test),
)

Train on 25000 samples, validate on 25000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f09a043a410>