### Text CNN
Chaining multiple convolutional outputs. <br>
Yoon Kim, Convolutional Neural Networks for Sentence Classification,  EMNLP 2014

In [1]:
import numpy as np
import theano 
import theano.tensor as T
import lasagne

Using gpu device 0: Tesla K40c (CNMeM is disabled, cuDNN 4007)


In [21]:
# Model hyper-parameters
embedding_dim = 128       # length of each embedding vector
filter_sizes = [3, 4, 5]  # one convolution branch is built per entry
num_filters = 2           # feature maps per filter size
dropout_keep_prob = 0.5   # dropout keep probability

# Training hyper-parameters
batch_size = 64
num_epochs = 10

### Data details
IMDB dataset for binary sentiment classification. <br>
The data has been parsed so that every distinct word is mapped to an integer. <br>
Labels have been converted to one-hot vectors.

In [2]:
# Dataset constants
vocab_len = 18758  # vocabulary size handed to the embedding layer
seq_len = 56       # fixed sequence length; shorter sequences are padded with 0's

In [14]:
x = np.load("./data/text_x.npy").astype(np.int32)
y = np.load("./data/text_y.npy").astype(np.int32)

print "sample x:", x[0]
print "................................."
print "sample y:", y[0]

sample x: [ 1  2  3  4  5  6  1  7  8  9 10 11 12 13 14  9 15  5 16 17 18 19 20 21 22
 23 24 25 26 27 28 29 30  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0]
.................................
sample y: [0 1]


In [15]:
# Shuffle inputs and labels with one shared permutation so that every row of
# x stays paired with its label in y.
assert len(x) == len(y), "x and y must contain the same number of samples"

# A dedicated, seeded generator makes the shuffle (and therefore the
# train/validation split built from it) reproducible under
# Restart Kernel -> Run All, without touching NumPy's global random state.
shuffle_rng = np.random.RandomState(1234)
shuffle_indices = shuffle_rng.permutation(len(y))
x = x[shuffle_indices]
y = y[shuffle_indices]

In [16]:
# Hold out the last 1000 samples for validation; everything before them is
# the training set.
x_train, y_train = x[:-1000], y[:-1000]
x_val, y_val = x[-1000:], y[-1000:]

# The un-split names are no longer needed; remove them.
del x, y

In [26]:
def build_network(input_var, drop_prob):
    """Build the multi-branch text CNN and return its output layer.

    Layer order: input -> embedding -> reshape (adds a channel axis) ->
    one conv + max-pool branch per entry of ``filter_sizes`` -> concat ->
    flatten -> dropout -> 2-unit softmax.

    Reads the module-level settings ``seq_len``, ``vocab_len``,
    ``embedding_dim``, ``filter_sizes`` and ``num_filters``.

    Parameters
    ----------
    input_var
        Variable bound to the input layer, which is declared with shape
        ``(None, seq_len)``.
    drop_prob
        Value forwarded as ``p`` to the dropout layer.

    Returns
    -------
    The final ``DenseLayer`` (2 units, softmax non-linearity).
    """
    network = lasagne.layers.InputLayer(shape=(None, seq_len),
                                        input_var=input_var,
                                        name="input")

    network = lasagne.layers.embedding.EmbeddingLayer(network,
                                                      input_size=vocab_len,
                                                      output_size=embedding_dim,
                                                      name="embedding")

    # Insert a singleton channel axis so the 2D convolution layers accept it.
    network = lasagne.layers.reshape(network, (-1, 1, seq_len, embedding_dim),
                                     name="reshape")

    # Layers are ordinary Python objects: collect one pooled branch per
    # filter size in a list and concatenate them afterwards.
    # NOTE(review): the filters are (f, f); Kim (2014) describes filters that
    # span the full embedding width, i.e. (f, embedding_dim) -- confirm
    # whether the square filter is intentional.
    pooled_outputs = []
    for f in filter_sizes:
        conv = lasagne.layers.conv.Conv2DLayer(network, num_filters, (f, f),
                                               nonlinearity=lasagne.nonlinearities.rectify,
                                               name="conv-" + str(f))

        # Pool over the whole remaining sequence axis (height seq_len - f + 1).
        pool = lasagne.layers.pool.MaxPool2DLayer(conv, [seq_len - f + 1, 1],
                                                  stride=[1, 1],
                                                  name="pool-" + str(f))
        pooled_outputs.append(pool)

    concat = lasagne.layers.ConcatLayer(pooled_outputs, axis=3, name="concat")

    # Derive the flattened feature count from the concat layer's static shape
    # instead of hard-coding it, so the network still builds when
    # embedding_dim, filter_sizes or num_filters change.
    flat_dim = int(np.prod(concat.output_shape[1:]))
    network = lasagne.layers.reshape(concat, (-1, flat_dim), name="reshape")

    network = lasagne.layers.dropout(network, p=drop_prob, name="dropout")

    network = lasagne.layers.DenseLayer(
            network,
            num_units=2,
            nonlinearity=lasagne.nonlinearities.softmax,
            name="dense")

    return network

In [27]:
# Symbolic placeholders for a mini-batch of inputs and its targets.
ip, op = T.imatrix("inputs"), T.imatrix("outputs")
# The dropout probability is symbolic as well, so it is supplied on each call
# instead of being fixed when the graph is built.
drop_prob = T.scalar("drop_prob")

In [28]:
network = build_network(ip,drop_prob)

for layer in lasagne.layers.helper.get_all_layers(network):
    print layer.name, layer.output_shape

input (None, 56)
embedding (None, 56, 128)
reshape (None, 1, 56, 128)
conv-3 (None, 2, 54, 126)
pool-3 (None, 2, 1, 126)
conv-4 (None, 2, 53, 125)
pool-4 (None, 2, 1, 125)
conv-5 (None, 2, 52, 124)
pool-5 (None, 2, 1, 124)
concat (None, 2, 1, 375)
reshape (None, 750)
dropout (None, 750)
dense (None, 2)


In [29]:
# Symbolic output of the network.
prediction = lasagne.layers.get_output(network)

# Mean categorical cross-entropy between the predictions and the targets.
loss = lasagne.objectives.categorical_crossentropy(prediction, op).mean()

# Adam updates over every trainable parameter.
params = lasagne.layers.get_all_params(network, trainable=True)
updates = lasagne.updates.adam(loss, params, learning_rate=1e-3)

# drop_prob is an explicit function input, so the caller chooses it on every
# training call.
train_fn = theano.function(inputs=[ip, op, drop_prob],
                           outputs=loss,
                           updates=updates)


In [30]:
for epoch in range(num_epochs):
    train_loss = []
    for i in range(0,x_train.shape[0]- batch_size, batch_size):
        train_loss.append(
            train_fn(
                x_train[i:i+batch_size],
                y_train[i:i+batch_size], 
                0.5
            )
        )
    print "after epoch", epoch+1, "training loss: " , np.mean(train_loss)

after epoch 1 training loss:  0.67266450727
after epoch 2 training loss:  0.476986026891
after epoch 3 training loss:  0.237351990477
after epoch 4 training loss:  0.093088998301
after epoch 5 training loss:  0.0360921119588
after epoch 6 training loss:  0.0179557619787
after epoch 7 training loss:  0.00885628417755
after epoch 8 training loss:  0.00545781437618
after epoch 9 training loss:  0.00398699526985
after epoch 10 training loss:  0.0029546106793
