CNN for NLP
------

Create CNN for text classification.

In [1]:
reset -fs

In [2]:
from keras.datasets import imdb

Using TensorFlow backend.


In [3]:
print('Loading data...')
max_features = 5000
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

Loading data...
25000 train sequences
25000 test sequences


In [4]:
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((25000,), (25000,), (25000,), (25000,))

In [5]:
def find_longest_row(data):
    long = idx = 0
    for i in range(data.shape[0]):
        if len(data[i]) > long:
            long = len(data[i])
            idx =  i
    return long, idx

find_longest_row(x_train), find_longest_row(x_test)

((2494, 17934), (2315, 24573))

In [6]:
x_train[:3]

array([ [1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 2, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 2, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 2, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 2, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 2, 19, 178, 32],
       [1, 194, 1153, 194, 2, 78, 228, 5, 6, 1463, 4369, 2, 134, 2

In [7]:
from keras.preprocessing import sequence

In [8]:
print('Pad sequences (samples x time)')
maxlen = 400

# TODO: Pad sequences
x_train = sequence.pad_sequences(x_train, 
                                 maxlen=maxlen)
x_test = sequence.pad_sequences(x_test,
                                maxlen=maxlen)

print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

Pad sequences (samples x time)
x_train shape: (25000, 400)
x_test shape: (25000, 400)


In [9]:
find_longest_row(x_train), find_longest_row(x_test)

((400, 0), (400, 0))

In [10]:
x_train[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [11]:
x_train[:3]

array([[  0,   0,   0, ...,  19, 178,  32],
       [  0,   0,   0, ...,  16, 145,  95],
       [  0,   0,   0, ...,   7, 129, 113]], dtype=int32)

In [12]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D

In [13]:
print('Build model...')
# TODO: Create a model
model = Sequential()

# Start off with an efficient embedding layer which maps our vocab indices into embedding_dims dimensions
embedding_dims = 50
model.add(Embedding(max_features,embedding_dims,input_length=maxlen))

# Add a Convolution1D, which will learn filters word group filters of size filter_length:
filters = 250
kernel_size = 3
model.add(Conv1D(filters, kernel_size=1, activation='softmax'))

# Add max pooling:
model.add(GlobalMaxPooling1D())

# Add a vanilla hidden layer:
hidden_dims = 250
model.add(Dense(hidden_dims, activation='relu'))

# Project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(1, activation='sigmoid'))
model.summary()

Build model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 400, 50)           250000    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 400, 250)          12750     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 250)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 250)               62750     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 251       
Total params: 325,751.0
Trainable params: 325,751.0
Non-trainable params: 0.0
_________________________________________________________________


TODO: How many parameters does this model have compared to LSTM for same dataset?

**Solution:** LSTM had 2.6M params compared to 325K

In [14]:
print("Training model...")
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
batch_size = 32
epochs = 2
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          verbose=True,
          validation_split=0.1)

Training model...
Train on 22500 samples, validate on 2500 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x11decb390>

In [15]:
score, accuracy = model.evaluate(x_test, y_test,
                                batch_size=batch_size, 
                                verbose=True)



In [16]:
print('Test score: {:.3}'.format(score))
print('Test accuracy: {:.3}'.format(accuracy))

Test score: 0.366
Test accuracy: 0.838


__TODO__: How does accuracy compare to LSTM for same dataset?

**Solution:**: Baseline was approx 50% accuracy (which is no learning). But during the second epoch accuracy went up to 84%

__TODO__: How does training speed compare to LSTM for same datset?

**Solution**: Training is much faster

<br>
<br> 
<br>

----