In [1]:
import os
import time
import datetime

import tensorflow as tf
import numpy as np
import data_utils as utils

from tensorflow.contrib import learn
#from text_cnn import TextCNN
from data_utils import IMDBDataset

In [2]:
sequence_length = 128
num_classes = 2
vocab_size = 75099
embedding_dim = 300

print ("Loading Dataset ...")
dataset = IMDBDataset('/home/aayush/robust-large-margin-cnn-develop/data/aclImdb/train', '/home/aayush/robust-large-margin-cnn-develop/data/vocab.pckl')
X, Y = dataset.load()
print ("Dataset loaded. Preparing data and loading embeddings ...")

np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(Y)))

X_train = X[shuffle_indices]
Y_train = Y[shuffle_indices]

embedding_path = '/home/aayush/robust-large-margin-cnn-develop/data/embeddings.npy'
embedding = utils.load_embeddings(embedding_path, vocab_size, embedding_dim)
print ("Embeddings loaded. Initialising model hyperparameters ...")

Loading Dataset ...
Dataset loaded. Preparing data and loading embeddings ...
Embeddings loaded. Initialising model hyperparameters ...


In [3]:
embedding.shape

(75099, 300)

In [10]:
class TextCNN(object):
    """
    A CNN for text classification.
    """
    def __init__(
      self, sequence_length, num_classes, vocab_size,
      embedding_size, filter_sizes, num_filters, 
      l2_reg_lambda=0.0, jac_reg=0.0):

        # Placeholders for input, output and dropout
        self.input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x")
        self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y")
        self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
        # Keeping track of l2 regularization loss (optional)
        l2_loss = tf.constant(0.0)
       
        # Embedding layer
        self.word_embedding = tf.Variable(tf.constant(0.0, shape=[vocab_size, embedding_dim]),
                trainable=False, name="W")
        self.embedding_placeholder = tf.placeholder(tf.float32, [vocab_size, embedding_dim])
        self.embedding_init = self.word_embedding.assign(self.embedding_placeholder)
        
        with tf.device('/cpu:0'), tf.name_scope("embedding"):            
            self.embedded_chars = tf.nn.embedding_lookup(self.word_embedding, self.input_x)
            self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1)

        print('Embedding: {}'.format(self.embedded_chars_expanded.get_shape()))
        
        # Create a convolution + maxpool layer for each filter size
        layer_outputs = []
        pooled_outputs = []
        for i, filter_size in enumerate(filter_sizes):
            with tf.name_scope("conv%s-maxpool-1" % filter_size):
                # Convolution Layer
                filter_shape = [filter_size, embedding_size, 1, num_filters]
                with tf.variable_scope(("conv%s-maxpool-1" % filter_size),  reuse=None):
                    W = self.init_weight(filter_shape)
                b = self.init_bias([num_filters])
                conv = self.convolution(self.embedded_chars_expanded, W)
                print('Conv1-{}: {}'.format(filter_size, conv.get_shape()))
                
                # Apply nonlinearity
                h = self.non_linearity(conv, b)
                
                # Maxpooling over the outputs
                ksize = [1, sequence_length // 2 - filter_size, 1, 1]
                pooled = self.maxpool(h, ksize)
                print('Maxpool1-{}: {}'.format(filter_size, pooled.get_shape()))
                pooled_outputs.append(pooled)

        layer_outputs = pooled_outputs

        # Combine all the pooled features
        num_filters_total = num_filters * len(filter_sizes)
        self.h_pool = tf.concat(pooled_outputs, 3)
        print('Concatenated: {}'.format(self.h_pool.get_shape()))
        
        # Second convolution + maxpool layer
        with tf.name_scope("conv-maxpool-2"):
            filter_shape = [4, 1, self.h_pool.get_shape()[3].value, num_filters // 2]
            with tf.variable_scope("conv-maxpool-2",  reuse=None):
                W = self.init_weight(filter_shape)
            b = self.init_bias([num_filters // 2])
            conv = self.convolution(self.h_pool, W)
            print('Conv2: {}'.format(conv.get_shape()))
            
            h = self.non_linearity(conv, b)
            
            ksize = [1, 128 // 8 - 1, 1, 1]
            self.pooled_2 = self.maxpool(h, ksize)

            layer_outputs.append(self.pooled_2)
            print('Maxpool2: {}'.format(self.pooled_2.get_shape()))
        
        # Third convolution + maxpool layer            
        with tf.name_scope("conv-maxpool-3"):
            filter_shape = [6, 1, self.pooled_2.get_shape()[3].value, num_filters // 2]
            with tf.variable_scope("conv-maxpool-3",  reuse=None):
                W = self.init_weight(filter_shape)
            b = self.init_bias([num_filters // 2])
            conv = self.convolution(self.pooled_2, W)
            print('Conv3: {}'.format(conv.get_shape()))
            
            h = self.non_linearity(conv, b)
            
            ksize=[1, conv.get_shape()[1].value, 1, 1]
            self.pooled_3 = self.maxpool(h, ksize)
            
            layer_outputs.append(self.pooled_3)
            print('Maxpool3: {}'.format(self.pooled_3.get_shape()))
        
        # Flatten into a long feature vector
        self.h_pool_flat = tf.reshape(self.pooled_3, [-1, num_filters // 2])
        print('Flattened: {}'.format(self.h_pool_flat.get_shape()))
        
        # Add dropout
        with tf.name_scope("dropout"):
            self.h_drop = self.add_dropout(self.h_pool_flat, self.dropout_keep_prob)
        
        # Final (unnormalized) scores and predictions
        with tf.name_scope("output"):
            with tf.variable_scope("output",  reuse=None):
                W = tf.get_variable(
                    "W",
                    shape=[num_filters // 2, num_classes],
                    initializer=tf.contrib.layers.xavier_initializer())
            b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
            l2_loss += tf.nn.l2_loss(W)
            l2_loss += tf.nn.l2_loss(b)
            self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores")
            
            self.predictions = tf.argmax(self.scores, 1, name="predictions")
        
        # Calculate Mean cross-entropy loss
        with tf.name_scope("loss"):
            softmax_pred = tf.nn.softmax_cross_entropy_with_logits(logits=self.scores, labels=self.input_y)
            self.loss = tf.reduce_mean(softmax_pred) + l2_reg_lambda * l2_loss
        
        # Jacobian Regularizer            
        with tf.name_scope("jacobian_reg"):
            if jac_reg > 0.0:
                layer_names = ["conv4-maxpool-1", "conv5-maxpool-1", "conv-maxpool-2", "conv-maxpool-3"]
                for idx, scope in enumerate(layer_names):
                    with tf.variable_scope(scope, reuse=True):
                        W = tf.get_variable("W")
                        # jacobian matrix of network output w.r.t. the outputs of layer L
                        # dimension: (batch_size, width, height, number of filters)
                        
                        #g_x = tf.gradients(tf.reduce_sum(tf.multiply(self.input_y, self.scores)), layer_outputs[idx])
                        g_x = tf.gradients(tf.multiply(self.input_y, self.scores), layer_outputs[idx])
                        
                        # reshape (batch_size, height=1, width, number of filters) to (batch_size*width, number of filters)
                        g_x = tf.reshape(g_x, shape=[-1, tf.shape(W)[3]])
                                                
                        # covariance matrix of jacobian vectors
                        reg = tf.matmul(tf.transpose(g_x), g_x)
                                                
                        # parameter update
                        W -= 1e-3 * jac_reg * tf.tensordot(reg, W, axes=[[1], [0]])

        # Accuracy
        with tf.name_scope("accuracy"):
            correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")

    def init_weight(self, shape):
        return tf.get_variable("W", shape, dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=0.1))

    def init_bias(self, shape):
        return tf.Variable(tf.constant(0.1, shape=shape), name="b")

    def non_linearity(self, conv, bias):
        return tf.nn.relu(tf.nn.bias_add(conv, bias), name="relu")

    def add_dropout(self, drop_input, keep_prob):
        return tf.nn.dropout(drop_input, keep_prob)

    def convolution(self, conv_input, weights):
        conv = tf.nn.conv2d(
            conv_input,
            weights,
            strides=[1, 1, 1, 1],
            padding="VALID",
            name="convolution")
        return conv

    def maxpool(self, pool_input, ksize):
        pooled = tf.nn.max_pool(
                    pool_input,
                    ksize=ksize,
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name="pool")
        return pooled

    def fully_conected(self, fc_in, in_shape, out_shape):
        W =  tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
        b = self.init_bias(out_shape)
        return tf.nn.xw_plus_b(fc_in, W, b, name="fully_connected")
    
    def jacobian():
        pass

##To do
- Add Fully Conected layer

In [11]:
# Model Hyperparameters
filter_sizes = [4, 5]
num_filters = 64
dropout_keep_prob = 0.5
l2_reg_lambda = 0.0
jac_reg = 0.1

# Training parameters
batch_size = 50
num_epochs = 10
checkpoint_every = 100
num_checkpoints = 6

In [15]:
print("Starting training ...")

with tf.Graph().as_default():
    session_conf = tf.ConfigProto(
      allow_soft_placement=True,
      log_device_placement=False)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        cnn = TextCNN(
            sequence_length=sequence_length,
            num_classes=num_classes,
            vocab_size=vocab_size,
            embedding_size=embedding_dim,
            filter_sizes=filter_sizes,
            num_filters=num_filters,
            l2_reg_lambda=l2_reg_lambda,
            jac_reg=jac_reg)
        
        # Define Training procedure
        global_step = tf.Variable(0, name="global_step", trainable=False)
        optimizer = tf.train.AdamOptimizer(1e-3)
        grads_and_vars = optimizer.compute_gradients(cnn.loss)             
                
        train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
        
        # Keep track of gradient values and sparsity (optional)
        grad_summaries = []
        for g, v in grads_and_vars:
            if g is not None:
                grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g)
                sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
                grad_summaries.append(grad_hist_summary)
                grad_summaries.append(sparsity_summary)
        grad_summaries_merged = tf.summary.merge(grad_summaries)

        # Output directory for models and summaries
        timestamp = str(int(time.time()))
        out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
        print("Writing to {}\n".format(out_dir))

        # Summaries for loss and accuracy
        loss_summary = tf.summary.scalar("loss", cnn.loss)
        acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)

        # Train Summaries
        train_summary_op = tf.summary.merge([loss_summary, acc_summary, grad_summaries_merged])
        train_summary_dir = os.path.join(out_dir, "summaries", "train")
        train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

        # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
        checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
        checkpoint_prefix = os.path.join(checkpoint_dir, "model")
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=num_checkpoints)

        # Initialize all variables
        sess.run(tf.global_variables_initializer())
        
        sess.run(cnn.embedding_init, feed_dict={cnn.embedding_placeholder: embedding})
        
        def train_step(x_batch, y_batch):
            feed_dict = {
                cnn.input_x: x_batch,
                cnn.input_y: y_batch,
                cnn.dropout_keep_prob: dropout_keep_prob
            }
            _, step, summaries, loss, accuracy = sess.run(
                [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy],
                feed_dict)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
            train_summary_writer.add_summary(summaries, step)
        
        batches = utils.batch_iter(
        list(zip(X_train[:1000], Y_train[:1000])), batch_size, num_epochs)
        
        # Training loop. For each batch...
        for batch in batches:
            x_batch, y_batch = zip(*batch)
            train_step(x_batch, y_batch)
            current_step = tf.train.global_step(sess, global_step)
            if current_step % checkpoint_every == 0:
                path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                print("Saved model checkpoint to {}\n".format(path))

Starting training ...
Embedding: (?, 128, 300, 1)
Conv1-4: (?, 125, 1, 64)
Maxpool1-4: (?, 66, 1, 64)
Conv1-5: (?, 124, 1, 64)
Maxpool1-5: (?, 66, 1, 64)
Concatenated: (?, 66, 1, 128)
Conv2: (?, 63, 1, 32)
Maxpool2: (?, 49, 1, 32)
Conv3: (?, 44, 1, 32)
Maxpool3: (?, 1, 1, 32)
Flattened: (?, 32)
INFO:tensorflow:Summary name conv4-maxpool-1/W:0/grad/hist is illegal; using conv4-maxpool-1/W_0/grad/hist instead.


INFO:tensorflow:Summary name conv4-maxpool-1/W:0/grad/hist is illegal; using conv4-maxpool-1/W_0/grad/hist instead.


INFO:tensorflow:Summary name conv4-maxpool-1/W:0/grad/sparsity is illegal; using conv4-maxpool-1/W_0/grad/sparsity instead.


INFO:tensorflow:Summary name conv4-maxpool-1/W:0/grad/sparsity is illegal; using conv4-maxpool-1/W_0/grad/sparsity instead.


INFO:tensorflow:Summary name conv4-maxpool-1/b:0/grad/hist is illegal; using conv4-maxpool-1/b_0/grad/hist instead.


INFO:tensorflow:Summary name conv4-maxpool-1/b:0/grad/hist is illegal; using conv4-maxpool-1/b_0/grad/hist instead.


INFO:tensorflow:Summary name conv4-maxpool-1/b:0/grad/sparsity is illegal; using conv4-maxpool-1/b_0/grad/sparsity instead.


INFO:tensorflow:Summary name conv4-maxpool-1/b:0/grad/sparsity is illegal; using conv4-maxpool-1/b_0/grad/sparsity instead.


INFO:tensorflow:Summary name conv5-maxpool-1/W:0/grad/hist is illegal; using conv5-maxpool-1/W_0/grad/hist instead.


INFO:tensorflow:Summary name conv5-maxpool-1/W:0/grad/hist is illegal; using conv5-maxpool-1/W_0/grad/hist instead.


INFO:tensorflow:Summary name conv5-maxpool-1/W:0/grad/sparsity is illegal; using conv5-maxpool-1/W_0/grad/sparsity instead.


INFO:tensorflow:Summary name conv5-maxpool-1/W:0/grad/sparsity is illegal; using conv5-maxpool-1/W_0/grad/sparsity instead.


INFO:tensorflow:Summary name conv5-maxpool-1/b:0/grad/hist is illegal; using conv5-maxpool-1/b_0/grad/hist instead.


INFO:tensorflow:Summary name conv5-maxpool-1/b:0/grad/hist is illegal; using conv5-maxpool-1/b_0/grad/hist instead.


INFO:tensorflow:Summary name conv5-maxpool-1/b:0/grad/sparsity is illegal; using conv5-maxpool-1/b_0/grad/sparsity instead.


INFO:tensorflow:Summary name conv5-maxpool-1/b:0/grad/sparsity is illegal; using conv5-maxpool-1/b_0/grad/sparsity instead.


INFO:tensorflow:Summary name conv-maxpool-2/W:0/grad/hist is illegal; using conv-maxpool-2/W_0/grad/hist instead.


INFO:tensorflow:Summary name conv-maxpool-2/W:0/grad/hist is illegal; using conv-maxpool-2/W_0/grad/hist instead.


INFO:tensorflow:Summary name conv-maxpool-2/W:0/grad/sparsity is illegal; using conv-maxpool-2/W_0/grad/sparsity instead.


INFO:tensorflow:Summary name conv-maxpool-2/W:0/grad/sparsity is illegal; using conv-maxpool-2/W_0/grad/sparsity instead.


INFO:tensorflow:Summary name conv-maxpool-2/b:0/grad/hist is illegal; using conv-maxpool-2/b_0/grad/hist instead.


INFO:tensorflow:Summary name conv-maxpool-2/b:0/grad/hist is illegal; using conv-maxpool-2/b_0/grad/hist instead.


INFO:tensorflow:Summary name conv-maxpool-2/b:0/grad/sparsity is illegal; using conv-maxpool-2/b_0/grad/sparsity instead.


INFO:tensorflow:Summary name conv-maxpool-2/b:0/grad/sparsity is illegal; using conv-maxpool-2/b_0/grad/sparsity instead.


INFO:tensorflow:Summary name conv-maxpool-3/W:0/grad/hist is illegal; using conv-maxpool-3/W_0/grad/hist instead.


INFO:tensorflow:Summary name conv-maxpool-3/W:0/grad/hist is illegal; using conv-maxpool-3/W_0/grad/hist instead.


INFO:tensorflow:Summary name conv-maxpool-3/W:0/grad/sparsity is illegal; using conv-maxpool-3/W_0/grad/sparsity instead.


INFO:tensorflow:Summary name conv-maxpool-3/W:0/grad/sparsity is illegal; using conv-maxpool-3/W_0/grad/sparsity instead.


INFO:tensorflow:Summary name conv-maxpool-3/b:0/grad/hist is illegal; using conv-maxpool-3/b_0/grad/hist instead.


INFO:tensorflow:Summary name conv-maxpool-3/b:0/grad/hist is illegal; using conv-maxpool-3/b_0/grad/hist instead.


INFO:tensorflow:Summary name conv-maxpool-3/b:0/grad/sparsity is illegal; using conv-maxpool-3/b_0/grad/sparsity instead.


INFO:tensorflow:Summary name conv-maxpool-3/b:0/grad/sparsity is illegal; using conv-maxpool-3/b_0/grad/sparsity instead.


INFO:tensorflow:Summary name output/W:0/grad/hist is illegal; using output/W_0/grad/hist instead.


INFO:tensorflow:Summary name output/W:0/grad/hist is illegal; using output/W_0/grad/hist instead.


INFO:tensorflow:Summary name output/W:0/grad/sparsity is illegal; using output/W_0/grad/sparsity instead.


INFO:tensorflow:Summary name output/W:0/grad/sparsity is illegal; using output/W_0/grad/sparsity instead.


INFO:tensorflow:Summary name output/b:0/grad/hist is illegal; using output/b_0/grad/hist instead.


INFO:tensorflow:Summary name output/b:0/grad/hist is illegal; using output/b_0/grad/hist instead.


INFO:tensorflow:Summary name output/b:0/grad/sparsity is illegal; using output/b_0/grad/sparsity instead.


INFO:tensorflow:Summary name output/b:0/grad/sparsity is illegal; using output/b_0/grad/sparsity instead.


Writing to /home/aayush/robust-large-margin-cnn-develop/runs/1508254256

Generating batch iterator ...
2017-10-17T21:00:58.416222: step 1, loss 3.0666, acc 0.58
2017-10-17T21:00:58.835691: step 2, loss 1.57365, acc 0.56
2017-10-17T21:00:59.221162: step 3, loss 0.90876, acc 0.54
2017-10-17T21:00:59.617773: step 4, loss 0.640487, acc 0.64
2017-10-17T21:01:00.000913: step 5, loss 0.741047, acc 0.64
2017-10-17T21:01:00.396367: step 6, loss 0.705256, acc 0.58
2017-10-17T21:01:00.782994: step 7, loss 0.769182, acc 0.58
2017-10-17T21:01:01.166538: step 8, loss 0.880539, acc 0.38
2017-10-17T21:01:01.558171: step 9, loss 0.699748, acc 0.62
2017-10-17T21:01:01.981362: step 10, loss 0.706003, acc 0.5
2017-10-17T21:01:02.379863: step 11, loss 0.769595, acc 0.52
2017-10-17T21:01:02.766435: step 12, loss 0.715894, acc 0.58
2017-10-17T21:01:03.157492: step 13, loss 0.657667, acc 0.56
2017-10-17T21:01:03.597730: step 14, loss 0.768155, acc 0.42
2017-10-17T21:01:04.003660: step 15, loss 0.711104, acc 0

KeyboardInterrupt: 