In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.pylab
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [8]:
class DateSets():
    """Container for the feature/label arrays used to train, validate and test a model.

    Every array attribute starts as ``None`` and is filled in by the matching
    ``load_*`` method.
    """

    def __init__(self, n_features=None, n_labels=None):
        """Create an empty container.

        Args:
            n_features: the number of features; when None it is inferred from
                the training features in ``load_training``
            n_labels: the number of labels; when None it is inferred from the
                distinct training labels in ``load_training``
        """
        self.training_x = None
        self.training_y = None
        self.test_x = None
        self.test_y = None
        self.validation_x = None
        self.validation_y = None
        self.all_x = None
        self.all_y = None
        self.n_features = n_features
        self.n_labels = n_labels
        # Defined here so the attribute exists (as None) before load_training runs.
        self.training_sample_size = None

    @staticmethod
    def _check_same_length(x, y):
        """Assert that features ``x`` and labels ``y`` hold the same number of samples."""
        assert len(x) == len(y), 'features and labels must have the same number of samples'

    def load_training(self, training_x, training_y):
        """Load training data

        Also records the training sample count and, when they were not given to
        the constructor, infers ``n_features`` and ``n_labels`` from the data.

        Args:
            training_x: features of training data
            training_y: labels of training data

        Raises:
            AssertionError: if features and labels differ in length
        """
        self._check_same_length(training_x, training_y)
        self.training_x = training_x
        self.training_y = training_y
        if self.n_features is None:
            self.n_features = training_x.shape[1]
        if self.n_labels is None:
            # Number of distinct label values seen in the training set.
            self.n_labels = len(set(training_y))
        self.training_sample_size = len(training_x)

    def load_test(self, test_x, test_y):
        """Load test data

        Args:
            test_x: features of test data
            test_y: labels of test data

        Raises:
            AssertionError: if features and labels differ in length
        """
        self._check_same_length(test_x, test_y)
        self.test_x = test_x
        self.test_y = test_y

    def load_validation(self, validation_x, validation_y):
        """Load validation data

        Args:
            validation_x: features of validation data
            validation_y: labels of validation data

        Raises:
            AssertionError: if features and labels differ in length
        """
        self._check_same_length(validation_x, validation_y)
        self.validation_x = validation_x
        self.validation_y = validation_y

    def load_all(self, all_x, all_y):
        """Load all the data, used to train the final model

        Args:
            all_x: features of all data
            all_y: labels of all data

        Raises:
            AssertionError: if features and labels differ in length
        """
        self._check_same_length(all_x, all_y)
        self.all_x = all_x
        self.all_y = all_y

In [9]:
class FullyConnectedHiddenLayer:
    """Stub for a fully connected hidden layer; nothing is implemented yet."""

    def __init__(self):
        # Intentionally empty: the layer carries no state so far.
        return None

In [10]:
# Read the training CSV, then shuffle its rows by reindexing on a random
# permutation of the original index. The shuffle is unseeded, so the row
# order (and every split derived from it) differs between runs.
raw_data = pd.read_csv('/Users/yinan/kaggle/digit_recognizer/data/train.csv')
shuffled_index = np.random.permutation(raw_data.index)
raw_data = raw_data.reindex(shuffled_index)

In [11]:
# Split the shuffled rows 80% / 10% / 10% into training, validation and test.
sample_num = raw_data.shape[0]
train_end = int(sample_num * 0.8)
valid_end = int(sample_num * 0.9)
training_data = raw_data.iloc[:train_end, :]
validation_data = raw_data.iloc[train_end:valid_end, :]
test_data = raw_data.iloc[valid_end:, :]
# Show the (rows, columns) of each split as a sanity check.
for split in (training_data, validation_data, test_data):
    print(split.shape)

(33600, 785)
(4200, 785)
(4200, 785)


In [12]:
def _features_and_labels(frame):
    """Return (columns 1.. divided by 255, column 0) of ``frame`` as numpy arrays."""
    return frame.iloc[:, 1:].values / 255, frame.iloc[:, 0].values


# Populate the container with every split plus the full (unsplit) data set.
mnist = DateSets(n_features=784, n_labels=10)
mnist.load_training(*_features_and_labels(training_data))
mnist.load_validation(*_features_and_labels(validation_data))
mnist.load_test(*_features_and_labels(test_data))
mnist.load_all(*_features_and_labels(raw_data))

## MLP (97.05% accuracy)

In [9]:
batch_size = 100
# Graph inputs: a batch of feature vectors and one integer class id per sample.
data_x = tf.placeholder(tf.float32, shape=(None, mnist.n_features))
# `(None)` is just `None` (a completely unknown shape); the 1-tuple `(None,)`
# declares the intended rank-1 tensor of arbitrary length.
data_y = tf.placeholder(tf.int32, shape=(None,))

In [10]:
# First hidden layer: n_features -> 128 units with ReLU activation.
n_hidden1_nodes = 128
with tf.name_scope('hidden1'):
    # Initial weights are drawn with stddev 1/sqrt(fan_in); biases start at zero.
    hidden1_init = tf.truncated_normal(
        [mnist.n_features, n_hidden1_nodes],
        stddev=1.0 / np.sqrt(mnist.n_features))
    weights = tf.Variable(hidden1_init, name='weights')
    biases = tf.Variable(tf.zeros([n_hidden1_nodes]), name='biases')
    hidden1 = tf.nn.relu(tf.matmul(data_x, weights) + biases)

In [11]:
# Second hidden layer: 128 -> 32 units with ReLU activation.
n_hidden2_nodes = 32
with tf.name_scope('hidden2'):
    # Initial weights are drawn with stddev 1/sqrt(fan_in); biases start at zero.
    hidden2_init = tf.truncated_normal(
        [n_hidden1_nodes, n_hidden2_nodes],
        stddev=1.0 / np.sqrt(n_hidden1_nodes))
    weights = tf.Variable(hidden2_init, name='weights')
    biases = tf.Variable(tf.zeros([n_hidden2_nodes]), name='biases')
    hidden2 = tf.nn.relu(tf.matmul(hidden1, weights) + biases)

In [12]:
# Output layer: a linear map from the second hidden layer to one logit per label.
with tf.name_scope('softmax_linear'):
    output_init = tf.truncated_normal(
        [n_hidden2_nodes, mnist.n_labels],
        stddev=1.0 / np.sqrt(n_hidden2_nodes))
    weights = tf.Variable(output_init, name='weights')
    biases = tf.Variable(tf.zeros([mnist.n_labels]), name='biases')
    logits = tf.matmul(hidden2, weights) + biases

In [13]:
# Softmax over the logits.
pred = tf.nn.softmax(logits)
# Despite its name, `accuracy` holds one value per sample (whether the label is
# the top-1 logit); the evaluation code averages it into a rate.
accuracy = tf.nn.in_top_k(predictions=logits, targets=data_y, k=1)

In [14]:
# Per-sample cross entropy between the logits and the integer labels.
# Keyword arguments are used because the positional order of `logits` and
# `labels` is not stable across TensorFlow releases; keywords work on both.
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
    logits=logits, labels=data_y, name='cross_entropy')
# Scalar training loss: the mean cross entropy over the batch.
loss = tf.reduce_mean(cross_entropy, name='cross_entropy_mean')

In [15]:
learning_rate = 0.05
# Non-trainable counter that `minimize` increments on every training step.
global_step = tf.Variable(0, trainable=False, name='global_step')
sgd = tf.train.GradientDescentOptimizer(learning_rate)
# Note: `optimizer` is the training op returned by minimize(), not the optimizer object.
optimizer = sgd.minimize(loss, global_step=global_step)

In [16]:
# Build the variable-initialisation op, then open a session and run it.
init = tf.initialize_all_variables()
sess = tf.Session()
sess.run(init)

In [17]:
# Path prefix for checkpoints; the training loop appends the step number.
checkpoint_file = './checkpoint'
saver = tf.train.Saver()

In [19]:
# Record the loss as a scalar summary tagged with the loss op's name, merge all
# summaries into one op, and write events (plus the graph) to the current directory.
loss_tag = loss.op.name
tf.scalar_summary(loss_tag, loss)
summary_op = tf.merge_all_summaries()
summary_writer = tf.train.SummaryWriter('./', sess.graph)

In [21]:
# Ceiling division: a trailing partial batch is still trained on, and a data set
# smaller than `batch_size` yields one batch instead of zero.
n_batches = (mnist.training_x.shape[0] + batch_size - 1) // batch_size
max_epoches = 32
steps = 0
for epoch in range(max_epoches):
    # Reshuffle the training set at the start of every epoch.
    permutation_index = np.random.permutation(range(mnist.training_sample_size))
    training_x = mnist.training_x[permutation_index]
    training_y = mnist.training_y[permutation_index]
    for i in range(n_batches):
        steps += 1
        # Slicing past the end is safe; the last batch is simply shorter.
        batch = slice(i * batch_size, (i + 1) * batch_size)
        feed_dict = {data_x: training_x[batch],
                     data_y: training_y[batch]}
        _, loss_value = sess.run([optimizer, loss], feed_dict=feed_dict)

        # Every 100 steps: write the summaries for the current batch.
        if steps % 100 == 0:
            summary_str = sess.run(summary_op, feed_dict=feed_dict)
            summary_writer.add_summary(summary_str, steps)
            summary_writer.flush()

        # Every 1000 steps: report the loss, evaluate on the validation set
        # and save a checkpoint.
        if steps % 1000 == 0:
            print('In step {}, the loss is {}.'.format(steps, loss_value))
            valid_feed_dict = {data_x: mnist.validation_x,
                               data_y: mnist.validation_y}
            valid_accuracy = sess.run(accuracy, feed_dict=valid_feed_dict)
            # `accuracy` is evaluated per sample; its mean is the accuracy rate.
            accuracy_rate = np.mean(valid_accuracy)
            print('The accuracy on validation set is {}.'.format(accuracy_rate))
            saver.save(sess, checkpoint_file, global_step=steps)
# Final checkpoint after the last step.
saver.save(sess, checkpoint_file, global_step=steps)

In step 1000, the loss is 0.1622794270515442.
The accuracy on validation set is 0.9621428571428572.
In step 2000, the loss is 0.009996295906603336.
The accuracy on validation set is 0.9633333333333334.
In step 3000, the loss is 0.10839683562517166.
The accuracy on validation set is 0.9638095238095238.
In step 4000, the loss is 0.057765811681747437.
The accuracy on validation set is 0.9664285714285714.
In step 5000, the loss is 0.06623770296573639.
The accuracy on validation set is 0.9673809523809523.
In step 6000, the loss is 0.021750975400209427.
The accuracy on validation set is 0.9688095238095238.
In step 7000, the loss is 0.021516375243663788.
The accuracy on validation set is 0.9697619047619047.
In step 8000, the loss is 0.006869563367217779.
The accuracy on validation set is 0.9688095238095238.
In step 9000, the loss is 0.004652712494134903.
The accuracy on validation set is 0.97.
In step 10000, the loss is 0.014274745248258114.
The accuracy on validation set is 0.969047619047619

'./checkpoint-10752'

In [22]:
# Evaluate the trained MLP on the held-out test split.
test_feed_dict = {
    data_x: mnist.test_x,
    data_y: mnist.test_y,
}
test_accuracy = sess.run(accuracy, feed_dict=test_feed_dict)
# Fraction of test samples for which the per-sample check is true.
n_correct = sum(test_accuracy)
accuracy_rate = n_correct / len(test_accuracy)
print('The accuracy on test set is {}.'.format(accuracy_rate))

The accuracy on test set is 0.9704761904761905.


## CNN (98.79% accuracy)

In [13]:
batch_size = 100
data_x = tf.placeholder(tf.float32, shape=(None, np.sqrt(mnist.n_features), np.sqrt(mnist.n_features), 1))
data_y = tf.placeholder(tf.int32, shape=(None))

In [14]:
# First convolutional stage: 5x5 kernels, 1 -> 32 channels, ReLU, then 2x2 max pooling.
conv1_kernel_shape = [5, 5, 1, 32]
with tf.variable_scope('conv1') as scope:
    kernel = tf.Variable(tf.truncated_normal(conv1_kernel_shape, stddev=0.1))
    biases = tf.Variable(tf.zeros([32]))
    conv = tf.nn.conv2d(data_x, kernel, strides=[1, 1, 1, 1], padding='SAME')
    h_conv = tf.nn.relu(conv + biases)
# Pooling window and stride are both 2 along the two middle dimensions.
pooling1 = tf.nn.max_pool(
    h_conv,
    ksize=[1, 2, 2, 1],
    strides=[1, 2, 2, 1],
    padding='SAME')

In [15]:
# Second convolutional stage: 5x5 kernels, 32 -> 64 channels, ReLU, then 2x2 max pooling.
conv2_kernel_shape = [5, 5, 32, 64]
with tf.variable_scope('conv2') as scope:
    kernel = tf.Variable(tf.truncated_normal(conv2_kernel_shape, stddev=0.1))
    biases = tf.Variable(tf.zeros([64]))
    conv = tf.nn.conv2d(pooling1, kernel, strides=[1, 1, 1, 1], padding='SAME')
    h_conv = tf.nn.relu(conv + biases)
# Pooling window and stride are both 2 along the two middle dimensions.
pooling2 = tf.nn.max_pool(
    h_conv,
    ksize=[1, 2, 2, 1],
    strides=[1, 2, 2, 1],
    padding='SAME')

In [16]:
# Flatten the pooled feature maps: the fully connected layer takes one vector
# per sample whose length is the product of the three non-batch dimensions.
pooling2_shape = list(pooling2.get_shape())
_, pooled_dim1, pooled_dim2, pooled_dim3 = pooling2_shape
fc1_input_shape = int(pooled_dim1 * pooled_dim2 * pooled_dim3)
pooling2_flat = tf.reshape(pooling2, [-1, fc1_input_shape])
# Fully connected layer with 1024 ReLU units.
with tf.variable_scope('fc1') as scope:
    fc1_init = tf.truncated_normal([fc1_input_shape, 1024], stddev=0.1)
    weights = tf.Variable(fc1_init)
    biases = tf.Variable(tf.zeros([1024]))
    fc1 = tf.nn.relu(tf.matmul(pooling2_flat, weights) + biases)

In [17]:
# Dropout on the fully connected layer. The keep probability is fed at run
# time: 0.5 in the training loops and 1 for evaluation and prediction.
keep_prob = tf.placeholder(dtype=tf.float32)
fc1_drop = tf.nn.dropout(x=fc1, keep_prob=keep_prob)

In [18]:
# Output layer: a linear map from the 1024 dropout-regularised units to one logit per label.
with tf.variable_scope('softmax_out') as scope:
    output_init = tf.truncated_normal([1024, mnist.n_labels], stddev=0.1)
    weights = tf.Variable(output_init)
    biases = tf.Variable(tf.zeros([mnist.n_labels]))
    logits = tf.matmul(fc1_drop, weights) + biases

In [50]:
# Scalar training loss: mean per-sample cross entropy between logits and labels.
# Keyword arguments are used because the positional order of `logits` and
# `labels` is not stable across TensorFlow releases; keywords work on both.
loss = tf.reduce_mean(
    tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=data_y),
    name='loss')
# Softmax over the logits.
pred = tf.nn.softmax(logits)
# One value per sample (whether the label is the top-1 logit); the evaluation
# code averages it into an accuracy rate.
accuracy = tf.nn.in_top_k(logits, data_y, k=1)

In [20]:
# Adam with a fixed learning rate; `train_step` is the op run once per batch.
learning_rate = 1e-4
optimizer = tf.train.AdamOptimizer(learning_rate)
train_step = optimizer.minimize(loss=loss)

In [21]:
# Build the variable-initialisation op, then open a session and run it.
init = tf.initialize_all_variables()
sess = tf.Session()
sess.run(init)

In [22]:
# Reshape the validation features to (samples, 28, 28, 1) for the CNN input.
# Despite the `_flat` suffix, this is the 4-D image tensor, not a flattened one.
n_validation = mnist.validation_x.shape[0]
validation_x_flat = np.reshape(mnist.validation_x, [n_validation, 28, 28, 1])

In [23]:
# Ceiling division: a trailing partial batch is still trained on, and a data set
# smaller than `batch_size` yields one batch instead of zero.
n_batches = (mnist.training_x.shape[0] + batch_size - 1) // batch_size
max_epoches = 32
steps = 0
for epoch in range(max_epoches):
    # Reshuffle the training set at the start of every epoch.
    permutation_index = np.random.permutation(range(mnist.training_sample_size))
    training_x = mnist.training_x[permutation_index]
    training_y = mnist.training_y[permutation_index]
    # The CNN input placeholder expects (samples, 28, 28, 1).
    training_x = np.reshape(training_x, [training_x.shape[0], 28, 28, 1])
    for i in range(n_batches):
        steps += 1
        # Slicing past the end is safe; the last batch is simply shorter.
        batch = slice(i * batch_size, (i + 1) * batch_size)
        feed_dict = {data_x: training_x[batch],
                     data_y: training_y[batch],
                     keep_prob: 0.5  # dropout active while training
                    }
        _, loss_value = sess.run([train_step, loss], feed_dict=feed_dict)

        # Every 1000 steps: report the loss and evaluate on the validation set.
        if steps % 1000 == 0:
            print('In step {}, the loss is {}.'.format(steps, loss_value))
            valid_feed_dict = {data_x: validation_x_flat,
                               data_y: mnist.validation_y,
                               keep_prob: 1  # no dropout during evaluation
                              }
            valid_accuracy = sess.run(accuracy, feed_dict=valid_feed_dict)
            # `accuracy` is evaluated per sample; its mean is the accuracy rate.
            accuracy_rate = np.mean(valid_accuracy)
            print('The accuracy on validation set is {}.'.format(accuracy_rate))
print('Finish training.')

In step 1000, the loss is 0.1103094294667244.
The accuracy on validation set is 0.9730952380952381.
In step 2000, the loss is 0.05188584327697754.
The accuracy on validation set is 0.9835714285714285.
In step 3000, the loss is 0.02617611177265644.
The accuracy on validation set is 0.985.
In step 4000, the loss is 0.007313924841582775.
The accuracy on validation set is 0.9859523809523809.
In step 5000, the loss is 0.011613022536039352.
The accuracy on validation set is 0.9873809523809524.
In step 6000, the loss is 0.03831010311841965.
The accuracy on validation set is 0.9876190476190476.
In step 7000, the loss is 0.0010470832930877805.
The accuracy on validation set is 0.9883333333333333.
In step 8000, the loss is 0.004456762224435806.
The accuracy on validation set is 0.9885714285714285.
In step 9000, the loss is 0.030779356136918068.
The accuracy on validation set is 0.9888095238095238.
In step 10000, the loss is 0.00938940979540348.
The accuracy on validation set is 0.987619047619047

In [25]:
# Evaluate the trained CNN on the held-out test split, with dropout disabled.
test_images = np.reshape(mnist.test_x, [mnist.test_x.shape[0], 28, 28, 1])
test_feed_dict = {
    data_x: test_images,
    data_y: mnist.test_y,
    keep_prob: 1,
}
test_accuracy = sess.run(accuracy, feed_dict=test_feed_dict)
# Fraction of test samples for which the per-sample check is true.
n_correct = sum(test_accuracy)
accuracy_rate = n_correct / len(test_accuracy)
print('The accuracy on test set is {}.'.format(accuracy_rate))

The accuracy on test set is 0.9878571428571429.


## Predict kaggle test data (99.13% accuracy)

In [26]:
# Close the previously created session before rebinding `sess`; otherwise the
# resources it holds stay allocated for as long as the kernel is alive.
sess.close()
# A fresh session with freshly initialised variables.
sess = tf.Session()
init = tf.initialize_all_variables()
sess.run(init)

In [28]:
# Ceiling division: a trailing partial batch is still trained on, and a data set
# smaller than `batch_size` yields one batch instead of zero.
n_batches = (mnist.all_x.shape[0] + batch_size - 1) // batch_size
max_epoches = 32
steps = 0
for epoch in range(max_epoches):
    # Reshuffle the full data set at the start of every epoch.
    permutation_index = np.random.permutation(range(mnist.all_x.shape[0]))
    training_x = mnist.all_x[permutation_index]
    training_y = mnist.all_y[permutation_index]
    # The CNN input placeholder expects (samples, 28, 28, 1).
    training_x = np.reshape(training_x, [training_x.shape[0], 28, 28, 1])
    for i in range(n_batches):
        steps += 1
        # Slicing past the end is safe; the last batch is simply shorter.
        batch = slice(i * batch_size, (i + 1) * batch_size)
        batch_x = training_x[batch]
        batch_y = training_y[batch]
        feed_dict = {data_x: batch_x,
                     data_y: batch_y,
                     keep_prob: 0.5  # dropout active while training
                    }
        _, loss_value = sess.run([train_step, loss], feed_dict=feed_dict)

        # Every 1000 steps: report the loss and the accuracy on the current
        # training batch (this loop has no held-out data to evaluate on).
        if steps % 1000 == 0:
            print('In step {}, the loss is {}.'.format(steps, loss_value))
            valid_feed_dict = {data_x: batch_x,
                               data_y: batch_y,
                               keep_prob: 1  # no dropout during evaluation
                              }
            valid_accuracy = sess.run(accuracy, feed_dict=valid_feed_dict)
            # `accuracy` is evaluated per sample; its mean is the accuracy rate.
            accuracy_rate = np.mean(valid_accuracy)
            print('The accuracy on this batch is {}.'.format(accuracy_rate))
print('Finish training.')

In step 1000, the loss is 0.08695012331008911.
The accuracy on this batch is 0.96.
In step 2000, the loss is 0.03808596730232239.
The accuracy on this batch is 1.0.
In step 3000, the loss is 0.02897307090461254.
The accuracy on this batch is 0.99.
In step 4000, the loss is 0.02167578972876072.
The accuracy on this batch is 1.0.
In step 5000, the loss is 0.003554876195266843.
The accuracy on this batch is 1.0.
In step 6000, the loss is 0.007254057098180056.
The accuracy on this batch is 1.0.
In step 7000, the loss is 0.015092527493834496.
The accuracy on this batch is 1.0.
In step 8000, the loss is 0.022795602679252625.
The accuracy on this batch is 1.0.
In step 9000, the loss is 0.0023500279057770967.
The accuracy on this batch is 1.0.
In step 10000, the loss is 0.0030890898779034615.
The accuracy on this batch is 1.0.
In step 11000, the loss is 0.0005180800217203796.
The accuracy on this batch is 1.0.
In step 12000, the loss is 0.006005151197314262.
The accuracy on this batch is 1.0.


In [34]:
# Load the Kaggle test CSV, scale it by 1/255 like the training features and
# reshape it to the (samples, 28, 28, 1) layout the CNN input expects.
kaggle_test = np.reshape(
    pd.read_csv('/Users/yinan/kaggle/digit_recognizer/data/test.csv').values / 255,
    [-1, 28, 28, 1])

# Predicted label = index of the largest softmax output; dropout is disabled.
predicted_label = tf.arg_max(pred, dimension=1)
test_pred = sess.run(predicted_label, feed_dict={data_x: kaggle_test, keep_prob: 1})

# Two-column submission: 1-based image id and predicted label.
# NOTE(review): Kaggle's sample submission presumably spells the header
# 'ImageId' -- confirm that 'ImageID' is accepted.
image_ids = list(range(1, kaggle_test.shape[0] + 1))
submission = pd.DataFrame([image_ids, list(test_pred)]).transpose()
submission.columns = ['ImageID', 'Label']

submission.to_csv('/Users/yinan/kaggle/digit_recognizer/data/submission_20160824.csv', index=False)