In [1]:
%matplotlib inline
from __future__ import division, print_function
import numpy as np
import cPickle as pickle


In [2]:
import keras
from keras.utils.data_utils import get_file
from keras.layers import Dense, Flatten, Input, Embedding, merge, Dropout
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.optimizers import Adam
from keras.models import Sequential
from keras.preprocessing import sequence

from keras.datasets import imdb

Using Theano backend.


# Setup Data

The IMDB dataset, which I will use, contains movie reviews from IMDB along with their sentiment labels. Let's have a look!

### Getting the data...

In [4]:
# Vocabulary of the IMDB dataset as shipped with keras: maps each word to its integer id.
idx = imdb.get_word_index()

In [5]:
# To get the original dataset, download the reviews using this code copied from keras.datasets.
path = get_file('imdb_full.pkl',
                origin='https://s3.amazonaws.com/text-datasets/imdb_full.pkl',
                md5_hash='d091312047c43cf9e4e38fef92437263')
# NOTE: unpickling can execute arbitrary code, so only load files you trust;
# the md5_hash passed above is what ties this file to the expected download.
# The `with` block closes the file handle as soon as the pickle has been read.
with open(path, 'rb') as f:
    (x_train, labels_train), (x_test, labels_test) = pickle.load(f)

### Looking at  the data...

In [6]:
# Vocabulary words ordered by ascending id; peek at the five smallest ids.
idx_array = sorted(idx.keys(), key=lambda word: idx[word])
idx_array[:5]

[u'the', u'and', u'a', u'of', u'to']

In [7]:
# Inverse vocabulary: integer id -> word.
idx2word = dict((word_id, word) for word, word_id in idx.iteritems())

In [8]:
# Number of reviews in the training split and in the test split.
len(x_train), len(x_test)

(25000, 25000)

In [9]:
# The first review exactly as it is stored in the training set: a list of word indices.
', '.join(str(word_id) for word_id in x_train[0])

'23022, 309, 6, 3, 1069, 209, 9, 2175, 30, 1, 169, 55, 14, 46, 82, 5869, 41, 393, 110, 138, 14, 5359, 58, 4477, 150, 8, 1, 5032, 5948, 482, 69, 5, 261, 12, 23022, 73935, 2003, 6, 73, 2436, 5, 632, 71, 6, 5359, 1, 25279, 5, 2004, 10471, 1, 5941, 1534, 34, 67, 64, 205, 140, 65, 1232, 63526, 21145, 1, 49265, 4, 1, 223, 901, 29, 3024, 69, 4, 1, 5863, 10, 694, 2, 65, 1534, 51, 10, 216, 1, 387, 8, 60, 3, 1472, 3724, 802, 5, 3521, 177, 1, 393, 10, 1238, 14030, 30, 309, 3, 353, 344, 2989, 143, 130, 5, 7804, 28, 4, 126, 5359, 1472, 2375, 5, 23022, 309, 10, 532, 12, 108, 1470, 4, 58, 556, 101, 12, 23022, 309, 6, 227, 4187, 48, 3, 2237, 12, 9, 215'

In [10]:
idx2word[23022] # The first id in the review above is 23022 -> decode it with the idx2word mapping

u'bromwell'

In [11]:
# Decode the whole first review back into readable words.
' '.join([idx2word[word_id] for word_id in x_train[0]])

u"bromwell high is a cartoon comedy it ran at the same time as some other programs about school life such as teachers my 35 years in the teaching profession lead me to believe that bromwell high's satire is much closer to reality than is teachers the scramble to survive financially the insightful students who can see right through their pathetic teachers' pomp the pettiness of the whole situation all remind me of the schools i knew and their students when i saw the episode in which a student repeatedly tried to burn down the school i immediately recalled at high a classic line inspector i'm here to sack one of your teachers student welcome to bromwell high i expect that many adults of my age think that bromwell high is far fetched what a pity that it isn't"

In [12]:
# Peek at the first ten training labels: 0 = negative review, 1 = positive review.
labels_train[:10]

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [13]:
num_vocab = 5000
# Cap every word id at num_vocab-1: ids below num_vocab-1 are kept as they are,
# and every other id collapses onto the single id num_vocab-1. Words are ordered
# by frequency, so this folds all the rare words into one shared id.
trn = [np.minimum(np.array(s), num_vocab - 1) for s in x_train]
test = [np.minimum(np.array(s), num_vocab - 1) for s in x_test]

In [14]:
# trn is a list of movie reviews; each review is a 1-D numpy array of word ids,
# so its shape gives the number of words in that review.
trn[2].shape 

(149,)

In [15]:
# Longest, shortest and mean training-review length, counted in word ids.
reviews_length = np.array([len(review) for review in trn])
reviews_length.max(), reviews_length.min(), reviews_length.mean()

(2493, 10, 237.71364)

In [16]:
seq_len = 500  # fixed review length, roughly double the average review length

# Bring both splits to seq_len ids per review, using 0 as the padding value.
trn, test = [sequence.pad_sequences(reviews, maxlen=seq_len, value=0)
             for reviews in (trn, test)]

trn.shape

(25000, 500)

### A simple fully connected model (one hidden layer)

In [138]:
lf_num = 32  # embedding dimensions learned per word
model = Sequential([
    # Each review is seq_len word ids drawn from a dictionary of num_vocab words.
    # input_length follows seq_len so the model cannot drift from the padded data.
    Embedding(num_vocab, lf_num, input_length=seq_len),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.7),
    Dense(1, activation='sigmoid')  # single score in (0, 1) for the binary label
])
model.compile(Adam(0.001), loss='binary_crossentropy', metrics=['accuracy'])

In [55]:
# Print the layer-by-layer output shapes and parameter counts of the model.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
flatten_2 (Flatten)          (None, 16000)             0         
_________________________________________________________________
dense_3 (Dense)              (None, 100)               1600100   
_________________________________________________________________
dropout_2 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 101       
Total params: 1,760,201
Trainable params: 1,760,201
Non-trainable params: 0
_________________________________________________________________


In [30]:
def train(model, lr=None, nb_epoch=2):
    """Fit `model` on the padded training reviews, validating on the test reviews.

    model    -- a compiled Keras model.
    lr       -- optional learning rate. When given, it is written into the
                optimizer's learning-rate variable before fitting; None (the
                default) leaves the current learning rate untouched.
    nb_epoch -- number of passes over the training data.
    """
    if lr is not None:
        # Update the existing variable in place. Rebinding `model.optimizer.lr`
        # to a plain float is not picked up by a training function that Keras
        # has already built during an earlier fit() call.
        keras.backend.set_value(model.optimizer.lr, lr)
    # `epochs` is the Keras 2 name of the legacy `nb_epoch` fit argument.
    model.fit(trn, labels_train, epochs=nb_epoch, validation_data=(test, labels_test))

In [59]:
# Pass lr explicitly; None keeps the learning rate chosen when the model was compiled.
train(model, None)

Train on 25000 samples, validate on 25000 samples
Epoch 1/2
Epoch 2/2


### Let's have a CNN

In [76]:
# 1-D convolution over the embedded word sequence, followed by a dense classifier.
model = Sequential([
    # input_length follows seq_len so the model always matches the padded data.
    Embedding(num_vocab, lf_num, input_length=seq_len),
    # `padding` is the Keras 2 name of the legacy `border_mode` argument.
    Conv1D(64, 5, padding='same', activation='relu'),
    MaxPooling1D(),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])
model.compile(Adam(1e-4), 'binary_crossentropy', metrics=['accuracy'])

  This is separate from the ipykernel package so we can avoid doing imports until


In [77]:
# Pass lr explicitly; None keeps the learning rate chosen when the model was compiled.
train(model, None)

Train on 25000 samples, validate on 25000 samples
Epoch 1/2
Epoch 2/2


### Pretrained vectors

In [18]:
import os, re #re is regular expression operator
import bcolz
from numpy.random import normal

# Embedding width for the pretrained-vector model; it has to match the
# dimensionality of the pretrained vectors that are loaded.
lf_num = 50 #100 or 200,..etc based on the used pretrained embeddings
# Path prefix of the preprocessed GloVe files; load_vectors appends
# '.dat', '_words.pkl' and '_idx.pkl' to it.
glove_dataset_path = '/data/glove/results/6B.50d'

In [19]:
def get_glove_dataset(dataset):
    """Download the requested glove dataset from files.fast.ai
    and return a location that can be passed to load_vectors.

    dataset -- dataset name such as '6B.50d'. Names listed in `md5sums` are
               passed to get_file with their hash; any other name is passed
               with md5_hash=None.
    Returns the value returned by keras' get_file for the untarred download.
    """
    # see wordvectors.ipynb for info on how these files were
    # generated from the original glove data.
    md5sums = {'6B.50d': '8e1557d1228decbda7db6dfd81cd9909',
               '6B.100d': 'c92dbbeacde2b0384a43014885a60b2c',
               '6B.200d': 'af271b46c04b0b2e41a84d8cd806178d',
               '6B.300d': '30290210376887dcc6d0a5a6374d8255'}
    glove_path = os.path.abspath('/data/glove/results')
    # Pure-Python equivalent of `mkdir -p`: create the directory tree and
    # tolerate it already existing, so the function also runs outside IPython.
    try:
        os.makedirs(glove_path)
    except OSError:
        if not os.path.isdir(glove_path):
            raise
    return get_file(dataset,
                    'http://files.fast.ai/models/glove/' + dataset + '.tgz',
                    cache_subdir=glove_path,
                    md5_hash=md5sums.get(dataset, None),
                    untar=True)

In [20]:
def load_vectors(loc):
    """Load the three artefacts stored under the path prefix `loc`.

    Returns a tuple (vectors, words, word_index):
      vectors    -- full contents of the bcolz array at loc + '.dat'
      words      -- object unpickled from loc + '_words.pkl'
      word_index -- object unpickled from loc + '_idx.pkl'
    """
    vectors = bcolz.open(loc + '.dat')[:]
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    # The `with` blocks close each file handle once it has been read.
    with open(loc + '_words.pkl', 'rb') as words_file:
        words = pickle.load(words_file)
    with open(loc + '_idx.pkl', 'rb') as idx_file:
        word_index = pickle.load(idx_file)
    return (vectors, words, word_index)
def load_array(fname):
    """Open the bcolz array stored at `fname` and return its full contents."""
    stored = bcolz.open(fname)
    return stored[:]

In [21]:
# vecs: the pretrained vectors; words / wordidx: the two pickled lookups stored
# next to them (wordidx is used below to map a word to its row in vecs).
vecs, words, wordidx = load_vectors(glove_dataset_path)

The glove word ids and imdb word ids use different indexes. So we create a simple function that creates an embedding matrix using the indexes from imdb, and the embeddings from glove (where they exist).

In [22]:
def create_emb():
    """Build the embedding matrix indexed by IMDB word id.

    Reads the module-level `vecs`, `wordidx`, `idx2word` and `num_vocab`.
    Returns a float array of shape (num_vocab, vecs.shape[1]) where:
      * row 0 stays all zeros (the loop starts at id 1),
      * a row whose IMDB word consists only of letters, digits and '-' and
        is present in `wordidx` gets that word's pretrained vector,
      * every other row, and always the last row (the shared rare-word id),
        is drawn from a normal distribution with scale 0.6,
    and the whole matrix is finally divided by 3.
    """
    nb_factors = vecs.shape[1]
    embeddings = np.zeros((num_vocab, nb_factors))
    # The character class needs its opening '['; without it the pattern only
    # matches the literal text "a-zA-Z0-9-" and no real word ever qualifies.
    plain_word = re.compile(r"^[a-zA-Z0-9\-]*$")
    for i in range(1, len(embeddings)):
        imdb_word = idx2word[i]
        # Only use a pretrained vector where one exists; a word missing from
        # wordidx falls through to the random initialisation.
        if imdb_word and plain_word.match(imdb_word) and imdb_word in wordidx:
            glove_idx_for_this_word = wordidx[imdb_word]
            embeddings[i] = vecs[glove_idx_for_this_word]
        else:
            embeddings[i] = normal(scale=0.6, size=(nb_factors, ))

    # The last id stands for all rare words, so it always gets a random vector.
    embeddings[-1] = normal(scale=0.6, size=(nb_factors,))
    # Scale every vector down by 3 -- presumably to start from smaller
    # activations; TODO confirm the rationale for this constant.
    embeddings /= 3
    return embeddings

In [23]:
# Embedding matrix with one row per IMDB word id, used as the initial Embedding weights below.
embeddings = create_emb()

In [27]:
# Same CNN as before, but the embedding layer starts from the pretrained
# vectors and is frozen (trainable=False) for the first round of training.
model = Sequential([
    # Keras 2's Embedding no longer takes a `dropout` argument (it is discarded
    # with a warning), so it is left out; the Dropout layer that follows remains.
    # input_length follows seq_len so the model always matches the padded data.
    Embedding(num_vocab, lf_num, input_length=seq_len,
           weights=[embeddings], trainable=False),
    Dropout(0.2),
    # `padding` is the Keras 2 name of the legacy `border_mode` argument.
    Conv1D(64, 5, padding='same', activation='relu'),
    MaxPooling1D(),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])
model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

  This is separate from the ipykernel package so we can avoid doing imports until
  """


In [32]:
# Train with the embedding layer frozen; lr=None keeps the optimizer's current learning rate.
train(model, None)

Train on 25000 samples, validate on 25000 samples
Epoch 1/2
Epoch 2/2


In [33]:
# Unfreeze the embedding layer so the pretrained vectors can be fine-tuned.
model.layers[0].trainable = True
# A change to `trainable` only takes effect once the model is compiled again,
# so recompile with the same settings as before (this also resets the optimizer).
model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

In [34]:
# Continue training with a learning rate of 1e-4.
train(model=model, lr=1e-4)

Train on 25000 samples, validate on 25000 samples
Epoch 1/2
Epoch 2/2


In [None]:
# Try a much larger learning rate (100x the 1e-4 used in the previous cell).
train(model=model, lr=1e-2)        

Train on 25000 samples, validate on 25000 samples
Epoch 1/2
Epoch 2/2

  % delta_t_median)


