In [1]:
import pickle
import numpy as np
import tensorflow as tf

In [2]:
# Load the preprocessed features and labels from their pickle files.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only open pickle files that come from a trusted source.
with open('preprocess_x_1.pickle', 'rb') as handle:
    x_shuffled = pickle.load(handle)

with open('preprocess_y_1.pickle', 'rb') as handle:
    y_shuffled = pickle.load(handle)

# print(...) with a single argument behaves the same on Python 2 and 3 and
# matches the call style used by the rest of the notebook.
print("Files loaded.")
print("Size: {:d}".format(len(x_shuffled)))

Files loaded.
Size: 10662


In [3]:
# Hold out 10% of the samples for evaluation; the fixed random_state makes
# the split repeatable across runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    x_shuffled,
    y_shuffled,
    test_size=0.1,
    random_state=42,
)

# Report the (rows, columns) shape of each of the four resulting arrays.
for template, split in (
    ("Train features dimensions: {:d}, {:d}", X_train),
    ("Train labels dimensions: {:d}, {:d}", y_train),
    ("Test features dimensions: {:d}, {:d}", X_test),
    ("Test labels dimensions: {:d}, {:d}", y_test),
):
    print(template.format(*split.shape))

Train features dimensions: 9595, 56
Train labels dimensions: 9595, 2
Test features dimensions: 1067, 56
Test labels dimensions: 1067, 2


In [4]:
# Build TensorFlow model
#
# The graph is: embedding lookup -> one convolution + max-pool branch per
# filter size -> concatenation -> dropout -> linear output layer, trained
# with softmax cross-entropy and Adam.
#
# Hyperparameters. sequence_length and num_classes are taken from the
# training arrays (56 and 2 according to the shapes printed above).
sequence_length = X_train.shape[1]
num_classes = y_train.shape[1]
# NOTE(review): hard-coded; presumably the vocabulary size produced by the
# preprocessing step -- TODO confirm it still matches the pickled data.
vocab_size = 18758
embedding_size = 128
filter_sizes = [3, 4, 5]
num_filters = 128
# With a value of 0.0 the L2 term added to the loss below contributes nothing.
l2_reg_lambda = 0.0

graph = tf.Graph()
with graph.as_default():
    # Placeholders for input, output and dropout
    # input_x holds integer ids used as row indices into the embedding
    # matrix; input_y holds one float vector of length num_classes per row.
    input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x")
    input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y")
    dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")

    # Keep track of L2 regularization loss
    l2_loss = tf.constant(0.0)

    # Build model
    # Embedding layer
    # The lookup is pinned to the CPU. W is a [vocab_size, embedding_size]
    # table initialised uniformly in [-1, 1).
    with tf.device('/cpu:0'), tf.name_scope("embedding"):
        W = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0), name="W")
        # [batch, sequence_length, embedding_size]
        embedded_chars = tf.nn.embedding_lookup(W, input_x)
        # A trailing channel axis of size 1 is added because conv2d takes a
        # 4-D input: [batch, sequence_length, embedding_size, 1].
        embedded_chars_expanded = tf.expand_dims(embedded_chars, -1)

    # Create a convolution + maxpool layer for each filter size
    pooled_outputs = []
    # The enumerate index i is not used inside the loop.
    for i, filter_size in enumerate(filter_sizes):
        with tf.name_scope("conv-maxpool-%s" % filter_size):
            # Convolution layer
            # Each filter spans filter_size positions and the full embedding
            # width. Note that the Python name W is rebound here; it no
            # longer refers to the embedding matrix.
            filter_shape = [filter_size, embedding_size, 1, num_filters]
            W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W-%s" % filter_size)
            b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b-%s" % filter_size)

            # VALID padding with unit strides gives an output of shape
            # [batch, sequence_length - filter_size + 1, 1, num_filters].
            conv = tf.nn.conv2d(
                embedded_chars_expanded,
                W,
                strides=[1, 1, 1, 1],
                padding="VALID",
                name="conv")

            # Apply nonlinearity
            h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")

            # Maxpooling over the outputs
            # The pooling window covers every position the convolution
            # produced, leaving one value per filter: [batch, 1, 1, num_filters].
            pooled = tf.nn.max_pool(
                h,
                ksize=[1, sequence_length - filter_size + 1, 1, 1],
                strides=[1, 1, 1, 1],
                padding="VALID",
                name="pool")
            pooled_outputs.append(pooled)

    # Combine all pooled features
    num_filters_total = num_filters * len(filter_sizes)
    # Concatenate the branches along axis 3 (the filter axis).
    # NOTE(review): the axis is passed as the first argument, which is the
    # pre-1.0 TensorFlow signature of tf.concat -- confirm against the
    # installed TensorFlow version before upgrading.
    h_pool = tf.concat(3, pooled_outputs)
    # Flatten to [batch, num_filters_total].
    h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total])

    # Add dropout
    # The keep probability is fed at run time through dropout_keep_prob.
    with tf.name_scope("dropout"):
        h_drop = tf.nn.dropout(h_pool_flat, dropout_keep_prob)

    # Final (unnormalized) scores and predictions
    with tf.name_scope("output"):
        # W is rebound once more, now to the output-layer weight matrix.
        W = tf.get_variable(
            "W",
            shape=[num_filters_total, num_classes],
            initializer=tf.contrib.layers.xavier_initializer())

        b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
        # Only the output layer's weights and bias enter the L2 penalty.
        l2_loss += tf.nn.l2_loss(W)
        l2_loss += tf.nn.l2_loss(b)
        scores = tf.nn.xw_plus_b(h_drop, W, b, name="scores")
        # Index of the highest-scoring class for each row.
        predictions = tf.argmax(scores, 1, name="predictions")

    # Calculate mean cross-entropy loss
    with tf.name_scope("loss"):
        # NOTE(review): logits and labels are passed positionally (logits
        # first); later TensorFlow releases require keyword arguments here --
        # confirm against the installed version.
        losses = tf.nn.softmax_cross_entropy_with_logits(scores, input_y)
        loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss

    # Accuracy
    # Fraction of rows whose predicted class equals the argmax of the label row.
    with tf.name_scope("accuracy"):
        correct_predictions = tf.equal(predictions, tf.argmax(input_y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32), name="accuracy")

    # Optimizer
    # global_step is passed to minimize() below, which increments it on each
    # update; it is excluded from the trainable variables.
    global_step = tf.Variable(0, name="global_step", trainable=False)
    starter_learning_rate = 0.0005
    # Exponential decay with rate 0.95 and a decay period of 10000 steps.
    # NOTE(review): staircase is not passed, so per the TensorFlow docs the
    # rate should decay continuously as 0.95 ** (global_step / 10000) rather
    # than dropping once every 10000 steps -- confirm this is intended.
    with tf.name_scope('learning_rate'):
        learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step, 10000, 0.95)
    with tf.name_scope('optimizer'):
        optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss, global_step=global_step)

In [29]:
# ======== Training =========
with tf.Session(graph=graph) as sess:
    # Initialise every variable in the graph, then create a saver that
    # covers all of them so checkpoints can be written during training.
    sess.run(tf.initialize_all_variables())
    saver = tf.train.Saver(tf.all_variables())
    print('Initialized')

    def train_step(x_batch, y_batch):
        """Run one optimizer update on a minibatch.

        Dropout is active with a keep probability of 0.5.

        Returns:
            A (step, loss, accuracy) tuple fetched in the same run call as
            the optimizer update.
        """
        fetches = [optimizer, global_step, loss, accuracy]
        feeds = {input_x: x_batch, input_y: y_batch, dropout_keep_prob: 0.5}
        # The first fetched value belongs to the optimizer op and is discarded.
        _, current_step, batch_loss, batch_accuracy = sess.run(fetches, feed_dict=feeds)
        return current_step, batch_loss, batch_accuracy
        
    def val_step(X_test, Y_test):
        """Return the model's accuracy on the given evaluation set.

        Args:
            X_test: features fed to the `input_x` placeholder.
            Y_test: labels fed to the `input_y` placeholder.

        Returns:
            The value of the `accuracy` tensor for the supplied data.

        Dropout is disabled (keep probability 1.0) and the optimizer op is
        not run, so evaluating does not update the model.
        """
        feed_dict = {
            input_x: X_test,
            # Feed the labels that were passed in. The previous code read the
            # notebook-level `y_test` here, silently ignoring the argument.
            input_y: Y_test,
            dropout_keep_prob: 1.0,
        }
        # Only accuracy is returned, so it is the only tensor fetched.
        return sess.run(accuracy, feed_dict=feed_dict)
    
    def batch_iter(data, batch_size, num_epochs, shuffle=False):
        '''
        Generates a batch iterator for a dataset.

        Args:
            data: sequence of samples; it is converted with np.array().
            batch_size: maximum number of samples per yielded batch.
            num_epochs: number of full passes to make over the data.
            shuffle: when True, draw a fresh random order for every epoch.

        Yields:
            Consecutive slices of the (optionally shuffled) array. The last
            slice of an epoch is shorter when the data size is not a
            multiple of batch_size. Empty data yields nothing.
        '''
        data = np.array(data)
        data_size = len(data)
        # Integer ceiling division. The former int((n - 1) / batch_size) + 1
        # relied on Python 2 floor division; under true division it produced
        # one spurious empty batch per epoch for empty data.
        num_batches_per_epoch = (data_size + batch_size - 1) // batch_size
        for epoch in range(num_epochs):
            # Shuffle the data at each epoch
            if shuffle:
                shuffle_indices = np.random.permutation(np.arange(data_size))
                shuffled_data = data[shuffle_indices]
            else:
                shuffled_data = data
            for batch_num in range(num_batches_per_epoch):
                start_index = batch_num * batch_size
                end_index = min(start_index + batch_size, data_size)
                yield shuffled_data[start_index:end_index]

    # Training schedule and checkpoint location.
    batch_size, num_epochs = 64, 20
    evaluate_every = checkpoint_every = 100
    checkpoint = '/home/ubuntu/pynb/rt-movie-reviews/cp'

    # Pair every training row with its label, then stream minibatches over
    # all epochs.
    batches = batch_iter(list(zip(X_train, y_train)), batch_size, num_epochs)

    for batch in batches:
        # Split the (features, label) pairs back into two parallel sequences.
        x_batch, y_batch = zip(*batch)
        step, l, accuracy_train = train_step(x_batch, y_batch)

        # Every `evaluate_every` steps, report the minibatch metrics together
        # with the accuracy on the held-out set.
        if not step % evaluate_every:
            accuracy_val = val_step(X_test, y_test)
            print('Minibatch loss at step %d: %f' % (step, l))
            print('Minibatch accuracy: {:.4f}'.format(accuracy_train))
            print('Validation accuracy: {:.4f}'.format(accuracy_val))

        # Every `checkpoint_every` steps, save the session's variables.
        if not step % checkpoint_every:
            path = saver.save(sess, checkpoint, global_step=step)
            print("Saved model checkpoint to {}\n".format(path))

Initialized
Minibatch loss at step 100: 1.337996
Minibatch accuracy: 0.6250
Validation accuracy: 0.5192
Saved model checkpoint to /home/ubuntu/pynb/rt-movie-reviews/cp-100

Minibatch loss at step 200: 0.893610
Minibatch accuracy: 0.6406
Validation accuracy: 0.5764
Saved model checkpoint to /home/ubuntu/pynb/rt-movie-reviews/cp-200

Minibatch loss at step 300: 1.225151
Minibatch accuracy: 0.5085
Validation accuracy: 0.5933
Saved model checkpoint to /home/ubuntu/pynb/rt-movie-reviews/cp-300

Minibatch loss at step 400: 0.904263
Minibatch accuracy: 0.6719
Validation accuracy: 0.6064
Saved model checkpoint to /home/ubuntu/pynb/rt-movie-reviews/cp-400

Minibatch loss at step 500: 0.785592
Minibatch accuracy: 0.6406
Validation accuracy: 0.6289
Saved model checkpoint to /home/ubuntu/pynb/rt-movie-reviews/cp-500

Minibatch loss at step 600: 0.797718
Minibatch accuracy: 0.6780
Validation accuracy: 0.6120
Saved model checkpoint to /home/ubuntu/pynb/rt-movie-reviews/cp-600

Minibatch loss at step