Deep Learning
=============

Assignment 3
------------

Previously in `2_fullyconnected.ipynb`, you trained a logistic regression and a neural network model.

The goal of this assignment is to explore regularization techniques.

In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
import cPickle as pickle
import numpy as np
import tensorflow as tf
import time
import math

First reload the data we generated in _1_notmnist.ipynb_.

Reformat into a shape that's more adapted to the models we're going to train:
- data as a flat matrix,
- labels kept as integer class ids (the float 1-hot encoding is built inside the loss function).

In [6]:
class BaseTensorFlow(object):
    """Shared training / evaluation harness for the notMNIST models.

    Subclasses override `model` to build the network graph. Data loading,
    the loss, the optimizer, mini-batching, evaluation and the training
    loop are implemented here.
    """

    def __init__(self):
        self.batch_size = 256               # rows per step; also fixes the placeholder shapes
        self.NUM_CLASSES = 10               # width of the 1-hot encoding built in loss_function
        self.starting_learning_rate = 0.01  # initial value fed to the exponential decay
        self.train_dir = './'               # where summaries and checkpoints are written
        self.num_steps = 50001              # number of training steps run by run_training
        self.IMAGE_PIXELS = 784             # length of one flattened image row

    def model(self, images, input_size, output_size, isEval=None):
        """Build the network graph; must be overridden by subclasses.

        `process` expects `(logits, regularizer)` when `isEval` is falsy and
        `logits` alone when `isEval` is true.

        Raises:
            NotImplementedError: always, in this base class.
        """
        # NotImplementedError is still an Exception, so existing handlers keep working.
        raise NotImplementedError('Error', 'Not implemented')

    def loadData(self):
        """Load the train/valid/test splits from 'notMNIST.pickle' and flatten the images.

        Sets `self.{train,valid,test}_dataset` (reshaped to rows of
        IMAGE_PIXELS float32 values) and `self.{train,valid,test}_labels`
        (left exactly as stored in the pickle), then prints their shapes.
        """
        pickle_file = 'notMNIST.pickle'

        # NOTE: unpickling can execute arbitrary code; only load a pickle you generated yourself.
        with open(pickle_file, 'rb') as f:
            save = pickle.load(f)

        # The file is closed at this point; everything needed is already in memory.
        self.train_dataset = save['train_dataset']
        self.train_labels = save['train_labels']
        self.valid_dataset = save['valid_dataset']
        self.valid_labels = save['valid_labels']
        self.test_dataset = save['test_dataset']
        self.test_labels = save['test_labels']
        del save  # hint to help gc free up memory

        self.train_dataset = self.train_dataset.reshape((-1, self.IMAGE_PIXELS)).astype(np.float32)
        self.valid_dataset = self.valid_dataset.reshape((-1, self.IMAGE_PIXELS)).astype(np.float32)
        self.test_dataset = self.test_dataset.reshape((-1, self.IMAGE_PIXELS)).astype(np.float32)

        # Single-argument print() emits the same text on Python 2 and Python 3.
        print('Training set %s %s' % (self.train_dataset.shape, self.train_labels.shape))
        print('Validation set %s %s' % (self.valid_dataset.shape, self.valid_labels.shape))
        print('Test set %s %s' % (self.test_dataset.shape, self.test_labels.shape))

    def loss_function(self, logits, labels):
        """Return the mean softmax cross-entropy of `logits` against integer `labels`.

        The integer labels are expanded to a dense (batch_size, NUM_CLASSES)
        1-hot matrix before being handed to the cross-entropy op.
        """
        labels = tf.expand_dims(labels, 1)
        indices = tf.expand_dims(tf.range(0, self.batch_size), 1)
        # Each row of `concated` is a (row index, class id) pair: the position of a 1.0.
        concated = tf.concat(1, [indices, labels])
        onehot_labels = tf.sparse_to_dense(
            concated, tf.pack([self.batch_size, self.NUM_CLASSES]), 1.0, 0.0)
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits,
                                                                onehot_labels,
                                                                name='xentropy')
        loss = tf.reduce_mean(cross_entropy, name='xentropy_mean')
        return loss

    def training(self, loss):
        """Return the op that takes one gradient-descent step on `loss`.

        The learning rate starts at `starting_learning_rate` and is multiplied
        by 0.95 in a staircase, with `global_step * batch_size` as the progress
        counter and the training-set size as the decay period. Requires
        `loadData` to have run (reads `self.train_dataset`).
        """
        train_size = self.train_dataset.shape[0]
        tf.scalar_summary(loss.op.name, loss)
        global_step = tf.Variable(0)
        learning_rate = tf.train.exponential_decay(
            self.starting_learning_rate,
            global_step * self.batch_size,
            train_size,
            0.95,
            staircase=True)
        optimizer = tf.train.GradientDescentOptimizer(learning_rate)
        train_op = optimizer.minimize(loss, global_step=global_step)
        return train_op

    def evaluation(self, logits, labels):
        """Return an op counting how many rows of `logits` rank the true label first."""
        correct = tf.nn.in_top_k(logits, labels, 1)
        return tf.reduce_sum(tf.cast(correct, tf.int32))

    def preapare_placeholder_inputs(self):
        """Create the fixed-size image and label placeholders used by every feed dict."""
        self.images_placeholder = tf.placeholder(tf.float32, shape=(self.batch_size, self.IMAGE_PIXELS))
        self.labels_placeholder = tf.placeholder(tf.int32, shape=(self.batch_size))

    def fill_feed_dict(self, dataset, labels, step):
        """Return the feed dict holding mini-batch number `step` of `dataset` / `labels`.

        The start offset is `step * batch_size`, wrapped so that a full batch
        always fits. The wrap modulus is `N - batch_size + 1`, which makes the
        final valid offset `N - batch_size` reachable; wrapping at
        `N - batch_size` would send the last batch of an exactly divisible
        dataset back to offset 0.

        If fewer than `batch_size` rows are available, the offset is 0 and the
        returned slices are simply shorter than a full batch.
        """
        valid_offsets = labels.shape[0] - self.batch_size + 1
        if valid_offsets > 0:
            offset = (step * self.batch_size) % valid_offsets
        else:
            offset = 0
        images_feed = dataset[offset:(offset + self.batch_size), :]
        labels_feed = labels[offset:(offset + self.batch_size)]
        feed_dict = {
            self.images_placeholder: images_feed,
            self.labels_placeholder: labels_feed,
        }
        return feed_dict

    def do_eval(self, sess,
                eval_correct,
                dataset,
                labels):
        """Evaluate `eval_correct` over every full batch of `dataset` and print the result.

        Rows beyond the last full batch are ignored. Returns the fraction of
        evaluated examples counted correct (0.0 when the dataset does not
        contain a single full batch).
        """
        true_count = 0
        steps_per_epoch = labels.shape[0] // self.batch_size
        num_examples = steps_per_epoch * self.batch_size
        for step in range(steps_per_epoch):
            feed_dict = self.fill_feed_dict(dataset, labels, step)
            true_count += sess.run(eval_correct, feed_dict=feed_dict)

        # Guard against a dataset smaller than one batch (nothing was evaluated).
        if num_examples > 0:
            precision = 1.0 * true_count / num_examples
        else:
            precision = 0.0
        print('  Num examples: %d  Num correct: %d  Precision @ 1: %0.04f' %
              (num_examples, true_count, precision))
        return precision

    def run_training(self, sess, eval_correct, train_op, loss):
        """Run `num_steps` optimisation steps, logging, checkpointing and evaluating on the way.

        Every 100 steps the loss is printed and a summary written; every 1000
        steps (and on the last step) a checkpoint is saved and the model is
        evaluated on the upcoming training batch and on the validation set.
        """
        summary_op = tf.merge_all_summaries()
        summary_writer = tf.train.SummaryWriter(self.train_dir, graph_def=sess.graph_def)
        saver = tf.train.Saver()

        feed_dict = self.fill_feed_dict(self.train_dataset, self.train_labels, 0)

        for step in range(self.num_steps):
            start_time = time.time()
            _, loss_value = sess.run([train_op, loss],
                                     feed_dict=feed_dict)

            # From here on `feed_dict` holds the NEXT batch; the summary and the
            # "training" evaluation below are therefore computed on that batch.
            feed_dict = self.fill_feed_dict(self.train_dataset, self.train_labels, step + 1)

            duration = time.time() - start_time
            if step % 100 == 0:
                print('Step %d: loss = %.2f (%.3f sec)' % (step, loss_value, duration))
                summary_str = sess.run(summary_op, feed_dict=feed_dict)
                summary_writer.add_summary(summary_str, step)

            if (step + 1) % 1000 == 0 or (step + 1) == self.num_steps:
                # NOTE(review): `train_dir` is passed as the checkpoint path itself,
                # not joined with a file name - confirm this is the intended prefix.
                saver.save(sess, self.train_dir, global_step=step)
                print('Training Data Eval:')
                self.do_eval(sess, eval_correct,
                             feed_dict[self.images_placeholder], feed_dict[self.labels_placeholder])
                print('Validation Data Eval:')
                self.do_eval(sess, eval_correct, self.valid_dataset, self.valid_labels)

    def process(self):
        """Build the graph in a fresh tf.Graph, train the model, then evaluate on the test set.

        `model` is called twice: once to build the training path (logits plus
        regularizer, weighted by 5e-4 in the loss) and once with `isEval=True`
        to build the evaluation path.
        """
        with tf.Graph().as_default():
            self.preapare_placeholder_inputs()

            logits_train, regularizer = self.model(self.images_placeholder,
                                                   self.IMAGE_PIXELS, self.NUM_CLASSES)
            loss = self.loss_function(logits_train, self.labels_placeholder)
            loss += 5e-4 * regularizer
            train_op = self.training(loss)

            logits_eval = self.model(self.images_placeholder,
                                     self.IMAGE_PIXELS, self.NUM_CLASSES, isEval=True)
            eval_correct = self.evaluation(logits_eval, self.labels_placeholder)

            with tf.Session() as sess:
                init = tf.initialize_all_variables()
                sess.run(init)

                self.run_training(sess, eval_correct, train_op, loss)
                print('Test Data Eval:')
                self.do_eval(sess,
                             eval_correct,
                             self.test_dataset, self.test_labels)

---
Problem 1
---------

Introduce and tune L2 regularization for both logistic and neural network models. Remember that L2 amounts to adding a penalty on the norm of the weights to the loss. In TensorFlow, you can compute the L2 loss for a tensor `t` using `nn.l2_loss(t)`. The right amount of regularization should improve your validation / test accuracy.

---

In [8]:
class inference_hidden0(BaseTensorFlow):
    """Logistic regression: a single linear layer feeding the softmax."""

    def __init__(self):
        BaseTensorFlow.__init__(self)
        self.SEED = 66478  # seed for the weight initializer

    def model(self, images, input_size, output_size, isEval=None):
        """Build the linear layer.

        Returns `logits` when `isEval` is truthy, otherwise the pair
        `(logits, l2_loss_of_weights)`. `isEval` is also passed as the
        `reuse` flag of the variable scope, so the evaluation call looks up
        the variables created by the training call.
        """
        init_stddev = 1.0 / math.sqrt(float(input_size))

        with tf.variable_scope('softmax_linear', reuse=isEval):
            w_init = tf.random_normal_initializer(0.0, init_stddev, seed=self.SEED)
            w = tf.get_variable("weights", [input_size, output_size],
                                initializer=w_init)

            b_init = tf.constant_initializer(0.0)
            b = tf.get_variable("biases", [output_size], initializer=b_init)

            logits = tf.matmul(images, w) + b
            # The penalty op is built in both modes; only training returns it.
            weight_penalty = tf.nn.l2_loss(w)

            if not isEval:
                return (logits, weight_penalty)
            return logits
        
class inference_hidden1(BaseTensorFlow):
    """One ReLU hidden layer followed by a linear softmax layer."""

    def __init__(self):
        BaseTensorFlow.__init__(self)
        self.hidden1_units = 256  # width of the hidden layer
        self.SEED = 66478         # seed shared by both weight initializers

    def model(self, images, input_size, output_size, isEval=None):
        """Build the two-layer network.

        Returns `logits` when `isEval` is truthy, otherwise the pair
        `(logits, regularizer)` where the regularizer is the sum of the L2
        losses of both weight matrices. `isEval` doubles as the `reuse` flag
        of both variable scopes.
        """
        n_hidden = self.hidden1_units

        with tf.variable_scope('hidden1', reuse=isEval):
            w_hidden = tf.get_variable(
                "weights", [input_size, n_hidden],
                initializer=tf.random_normal_initializer(
                    0.0, 1.0 / math.sqrt(float(input_size)), seed=self.SEED))
            b_hidden = tf.get_variable(
                "biases", [n_hidden],
                initializer=tf.constant_initializer(0.0))

        # These two ops are created after the 'hidden1' scope has been left.
        penalty_hidden = tf.nn.l2_loss(w_hidden)
        activations = tf.nn.relu(tf.matmul(images, w_hidden) + b_hidden)

        with tf.variable_scope('softmax_linear', reuse=isEval):
            w_out = tf.get_variable(
                "weights", [n_hidden, output_size],
                initializer=tf.random_normal_initializer(
                    0.0, 1.0 / math.sqrt(float(n_hidden)), seed=self.SEED))
            b_out = tf.get_variable(
                "biases", [output_size],
                initializer=tf.constant_initializer(0.0))

            logits = tf.matmul(activations, w_out) + b_out
            penalty_out = tf.nn.l2_loss(w_out)

            if not isEval:
                # Summed inside the scope, as in the training path of the graph.
                total_penalty = (penalty_hidden + penalty_out)
                return (logits, total_penalty)
            return logits
        
# Driver for Problem 1: despite its name, `model0` is the one-hidden-layer
# network (`inference_hidden1`). Load the pickled splits, then build, train
# and evaluate the model in one go.
model0 = inference_hidden1()
model0.loadData()
model0.process()
    

Training set (190000, 784) (190000,)
Validation set (10000, 784) (10000,)
Test set (18724, 784) (18724,)
Step 0: loss = 2.37 (0.057 sec)
Step 100: loss = 1.51 (0.010 sec)
Step 200: loss = 1.20 (0.022 sec)
Step 300: loss = 1.04 (0.011 sec)
Step 400: loss = 0.82 (0.023 sec)
Step 500: loss = 0.81 (0.010 sec)
Step 600: loss = 0.82 (0.010 sec)
Step 700: loss = 0.76 (0.013 sec)
Step 800: loss = 0.70 (0.021 sec)
Step 900: loss = 0.91 (0.010 sec)
Training Data Eval:
  Num examples: 256  Num correct: 203  Precision @ 1: 0.7930
Validation Data Eval:
  Num examples: 9984  Num correct: 8071  Precision @ 1: 0.8084
Step 1000: loss = 0.81 (0.012 sec)
Step 1100: loss = 0.73 (0.011 sec)
Step 1200: loss = 0.59 (0.010 sec)
Step 1300: loss = 0.73 (0.011 sec)
Step 1400: loss = 0.74 (0.010 sec)
Step 1500: loss = 0.66 (0.010 sec)
Step 1600: loss = 0.64 (0.011 sec)
Step 1700: loss = 0.65 (0.011 sec)
Step 1800: loss = 0.72 (0.010 sec)
Step 1900: loss = 0.60 (0.017 sec)
Training Data Eval:
  Num examples: 256  

---
Problem 2
---------
Let's demonstrate an extreme case of overfitting. Restrict your training data to just a few batches. What happens?

---

In [None]:
# Restrict the training data to the first five mini-batches. The splits and
# the batch size live on the model instance created for Problem 1 (`model0`);
# there are no module-level `train_dataset` / `batch_size` names.
train_dataset1 = model0.train_dataset[:5 * model0.batch_size, :]
train_labels1 = model0.train_labels[:5 * model0.batch_size]

# Single-argument print() emits the same text on Python 2 and Python 3.
print('Training set %s %s' % (train_dataset1.shape, train_labels1.shape))

In [None]:
# `process` only exists as a zero-argument method of the model classes, so
# train a fresh one-hidden-layer model whose training split is swapped for the
# restricted one. loadData() is still needed for the validation and test splits.
model1 = inference_hidden1()
model1.loadData()
model1.train_dataset = train_dataset1
model1.train_labels = train_labels1
model1.process()

---
Problem 3
---------
Introduce Dropout on the hidden layer of the neural network. Remember: Dropout should only be introduced during training, not evaluation, otherwise your evaluation results would be stochastic as well. TensorFlow provides `nn.dropout()` for that, but you have to make sure it's only inserted during training.

What happens to our extreme overfitting case?

---

In [None]:
hidden1_units = 256
def inference_hidden2(images, isEval=None, input_size=784, output_size=10, seed=66478):
    """Build a one-hidden-layer network with dropout on the hidden layer.

    Args:
        images: input tensor whose rows have `input_size` values.
        isEval: when truthy, variables are reused, dropout is skipped and only
            `logits` is returned.
        input_size: values per input row; defaults to the 784 pixels used by
            the model classes of this notebook.
        output_size: number of classes; defaults to 10.
        seed: seed for the weight initializers and the dropout op.

    Returns:
        `logits` when `isEval` is truthy, otherwise `(logits, regularizer)`
        where the regularizer is the sum of the L2 losses of both weight
        matrices.

    The sizes and the seed are parameters because the notebook defines no
    module-level `IMAGE_PIXELS`, `NUM_CLASSES` or `SEED` for this function to
    read.
    """
    # Hidden 1
    with tf.variable_scope('hidden1', reuse=isEval):
        weights = tf.get_variable("weights", [input_size, hidden1_units],
            initializer=tf.random_normal_initializer(0.0, 1.0 / math.sqrt(float(input_size)),
                          seed=seed))
        biases = tf.get_variable("biases", [hidden1_units],
            initializer=tf.constant_initializer(0.0))

        reg_hidden1 = tf.nn.l2_loss(weights)
        hidden1 = tf.nn.relu(tf.matmul(images, weights) + biases)
        if not isEval:
            # Dropout is inserted on the training path only, so evaluation stays deterministic.
            print('drop out added')
            hidden1 = tf.nn.dropout(hidden1, 0.5, seed=seed)
    # Linear
    with tf.variable_scope('softmax_linear', reuse=isEval):
        weights = tf.get_variable("weights", [hidden1_units, output_size],
            initializer=tf.random_normal_initializer(0.0, 1.0 / math.sqrt(float(hidden1_units)),
                          seed=seed))

        biases = tf.get_variable("biases", [output_size],
            initializer=tf.constant_initializer(0.0))

        logits = tf.matmul(hidden1, weights) + biases
        reg_linear = tf.nn.l2_loss(weights)

        if isEval:
            return logits
        else:
            regularizers = (reg_hidden1 + reg_linear)
            return (logits, regularizers)

In [None]:
# NOTE(review): no free function `process` is defined in the visible notebook;
# only the zero-argument method `BaseTensorFlow.process` exists. This call
# presumably targets an earlier function-based version of the harness and will
# raise NameError on a fresh kernel - confirm and port it to the class API.
process(train_dataset1, train_labels1, inference_hidden2)

---
Problem 4
---------

Try to get the best performance you can using a multi-layer model! The best reported test accuracy using a deep network is [97.1%](http://yaroslavvb.blogspot.com/2011/09/notmnist-dataset.html?showComment=1391023266211#c8758720086795711595).

One avenue you can explore is to add multiple layers.

Another one is to use learning rate decay:

    global_step = tf.Variable(0)  # count the number of steps taken.
    learning_rate = tf.train.exponential_decay(0.5, global_step, ...)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
 
 ---


In [None]:
#Hi, myself and Zhen Zhou from the LISA lab at Université de Montréal trained a couple of 4 
#layer MLPs with 1024-300-50 hidden neurons respectively. We divided the noisy set into 5/6 
#train 1/6 valid and kept the clean set for testing. We 97.1% accuracy on the test set at 412 
#epoch with early stopping, linear decay of the learning rate, a hard constraint on the norm 
#of the weights and tanh activation units. We get approximately 93 on valid and 98 on train. 
#The train set is easy to overfit (you can get 100% accuracy on train if you continue training). 
#One could probably do better if they pursue hyper-optimization further. We used Torch 7.

hidden1_units = 256
hidden2_units = 128
def inference_hidden3(images, isEval=None, input_size=784, output_size=10, seed=66478):
    """Build a two-hidden-layer network with dropout after each hidden layer.

    Args:
        images: input tensor whose rows have `input_size` values.
        isEval: when truthy, variables are reused, dropout is skipped and only
            `logits` is returned.
        input_size: values per input row; defaults to the 784 pixels used by
            the model classes of this notebook.
        output_size: number of classes; defaults to 10.
        seed: seed for the weight initializers and both dropout ops.

    Returns:
        `logits` when `isEval` is truthy, otherwise `(logits, regularizer)`
        where the regularizer is the sum of the L2 losses of the three weight
        matrices.

    The sizes and the seed are parameters because the notebook defines no
    module-level `IMAGE_PIXELS`, `NUM_CLASSES` or `SEED` for this function to
    read.
    """
    # Hidden 1
    with tf.variable_scope('hidden1', reuse=isEval):
        weights = tf.get_variable("weights", [input_size, hidden1_units],
            initializer=tf.random_normal_initializer(0.0, 1.0 / math.sqrt(float(input_size)),
                          seed=seed))
        biases = tf.get_variable("biases", [hidden1_units],
            initializer=tf.constant_initializer(0.0))

        reg_hidden1 = tf.nn.l2_loss(weights)
        hidden1 = tf.nn.relu(tf.matmul(images, weights) + biases)
        if not isEval:
            # Dropout is inserted on the training path only.
            hidden1 = tf.nn.dropout(hidden1, 0.5, seed=seed)
    # Hidden 2
    with tf.variable_scope('hidden2', reuse=isEval):
        weights = tf.get_variable("weights", [hidden1_units, hidden2_units],
            initializer=tf.random_normal_initializer(0.0, 1.0 / math.sqrt(float(hidden1_units)),
                          seed=seed))
        biases = tf.get_variable("biases", [hidden2_units],
            initializer=tf.constant_initializer(0.0))

        reg_hidden2 = tf.nn.l2_loss(weights)
        hidden2 = tf.nn.relu(tf.matmul(hidden1, weights) + biases)
        if not isEval:
            # NOTE(review): both dropout ops share one seed - confirm that
            # correlated masks across the two layers are acceptable.
            hidden2 = tf.nn.dropout(hidden2, 0.5, seed=seed)
    # Linear
    with tf.variable_scope('softmax_linear', reuse=isEval):
        weights = tf.get_variable("weights", [hidden2_units, output_size],
            initializer=tf.random_normal_initializer(0.0, 1.0 / math.sqrt(float(hidden2_units)),
                          seed=seed))
        biases = tf.get_variable("biases", [output_size],
            initializer=tf.constant_initializer(0.0))

        logits = tf.matmul(hidden2, weights) + biases
        reg_linear = tf.nn.l2_loss(weights)

    if isEval:
        return logits
    else:
        regularizers = (reg_hidden1 +reg_hidden2+ reg_linear)
        return (logits, regularizers)

In [None]:
# NOTE(review): neither a free function `process` nor module-level
# `train_dataset` / `train_labels` are defined in the visible notebook (the
# data lives on the model instances and `process` is a zero-argument method of
# `BaseTensorFlow`). This call presumably targets an earlier function-based
# harness and will raise NameError on a fresh kernel - confirm and port it.
process(train_dataset, train_labels, inference_hidden3)