# Practicing Batch Normalization

### By building a 20-layer convolutional neural network with a fully connected layer to classify handwritten digits.

This architecture is NOT a good choice for classifying MNIST digits, but it is good for practice,
since Batch Normalization works well with deeper neural nets.

In [68]:
#Loading Tensorflow and downloading Mnist
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

# one_hot=True requests one-hot encoded labels and reshape=False keeps each image
# as a 28x28x1 array instead of flattening it, matching the [None, 10] and
# [None, 28, 28, 1] placeholders the training functions feed these arrays into.
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True, reshape=False)


Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz


In [23]:
#Function to create fully connected layers without batch normalization

def fully_connected(prev_layer, num_units):
    """Return a dense layer with ReLU activation stacked on `prev_layer`.

    Args:
        prev_layer: tensor used as the input of the new layer.
        num_units: number of units in the new layer.

    No batch normalization is applied.
    """
    return tf.layers.dense(prev_layer, num_units, activation=tf.nn.relu)

In [26]:
#Function to create convolutional layers (without pooling) without Batch Normalization

def conv_layer(prev_layer, layer_depth):
    """Return a 3x3 "same"-padded convolution with ReLU stacked on `prev_layer`.

    Args:
        prev_layer: tensor used as the input of the new layer.
        layer_depth: position of the layer in the network; the layer gets
            `layer_depth * 4` filters, and a stride of 2 when the depth is a
            multiple of 3 (stride 1 otherwise).

    No pooling and no batch normalization are applied.
    """
    if layer_depth % 3 == 0:
        stride = 2
    else:
        stride = 1

    return tf.layers.conv2d(
        prev_layer,
        filters=layer_depth * 4,
        kernel_size=3,
        strides=stride,
        padding="same",
        activation=tf.nn.relu,
    )

In [32]:
### Building and training the network on the Mnist dataset, while displaying loss and accuracy
def train(num_batches, batch_size, learning_rate):
    """Build the network without batch normalization, train it on MNIST and print scores.

    Args:
        num_batches: number of training batches to run.
        batch_size: number of examples drawn for each training batch.
        learning_rate: learning rate given to the Adam optimizer.

    Prints validation loss/accuracy every 100 batches and the current batch's
    loss/accuracy on the other multiples of 25. After training it prints the
    final validation and test accuracy, plus the accuracy over the first 100
    test images scored one at a time. Returns nothing.
    """
    x = tf.placeholder(tf.float32, [None, 28, 28, 1])
    y_true = tf.placeholder(tf.float32, [None, 10])

    # 19 convolutional layers, one per depth in 1..19.
    net = x
    for depth in range(1, 20):
        net = conv_layer(net, depth)

    # Collapse every non-batch dimension so the dense layers get a 2-D input.
    _, height, width, channels = net.get_shape().as_list()
    net = tf.reshape(net, shape=[-1, height * width * channels])

    # Hidden dense layer with ReLU, then a linear dense layer giving the 10 logits.
    hidden = fully_connected(net, 100)
    logits = tf.layers.dense(hidden, 10)

    # NOTE(review): sigmoid cross-entropy scores each of the 10 outputs
    # independently; kept as-is so the reported loss values stay comparable.
    elementwise_loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=y_true)
    model_loss = tf.reduce_mean(elementwise_loss)

    train_op = tf.train.AdamOptimizer(learning_rate).minimize(model_loss)

    hits = tf.equal(tf.argmax(logits, 1), tf.argmax(y_true, 1))
    accuracy = tf.reduce_mean(tf.cast(hits, tf.float32))

    validation_feed = {x: mnist.validation.images, y_true: mnist.validation.labels}
    test_feed = {x: mnist.test.images, y_true: mnist.test.labels}

    validation_line = 'Batch: {:>2}: Validation loss: {:>3.5f}, Validation accuracy: {:>3.5f}'
    training_line = 'Batch: {:>2}: Training loss: {:>3.5f}, Training accuracy: {:>3.5f}'

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        for batch_i in range(num_batches):
            batch_xs, batch_ys = mnist.train.next_batch(batch_size)
            batch_feed = {x: batch_xs, y_true: batch_ys}

            # One optimization step on the current batch.
            sess.run(train_op, batch_feed)

            if batch_i % 100 == 0:
                loss, acc = sess.run([model_loss, accuracy], validation_feed)
                print(validation_line.format(batch_i, loss, acc))
            elif batch_i % 25 == 0:
                loss, acc = sess.run([model_loss, accuracy], batch_feed)
                print(training_line.format(batch_i, loss, acc))

        # Final accuracy on the full validation and test sets.
        acc = sess.run(accuracy, validation_feed)
        print('Final validation accuracy: {:>3.5f}'.format(acc))
        acc = sess.run(accuracy, test_feed)
        print('Final test accuracy: {:>3.5f}'.format(acc))

        # Score the first 100 test images individually (batches of size 1).
        correct = sum(
            sess.run(accuracy, feed_dict={x: [mnist.test.images[i]],
                                          y_true: [mnist.test.labels[i]]})
            for i in range(100)
        )

        print("Accuracy on 100 samples:", correct/100)


# Settings for this run: how many training batches to process, how many
# examples each batch holds, and the learning rate passed to train().
num_batches = 800
batch_size = 64
learning_rate = 0.002

# Clear the current default graph before a new model is built.
tf.reset_default_graph()

# Build and train the network inside its own fresh graph.
with tf.Graph().as_default():
    train(num_batches, batch_size, learning_rate)
    
            

Batch:  0: Validation loss: 0.69067, Validation accuracy: 0.09900
Batch: 25: Training loss: 0.36572, Training accuracy: 0.04688
Batch: 50: Training loss: 0.32663, Training accuracy: 0.14062
Batch: 75: Training loss: 0.32599, Training accuracy: 0.07812
Batch: 100: Validation loss: 0.32638, Validation accuracy: 0.11260
Batch: 125: Training loss: 0.32680, Training accuracy: 0.07812
Batch: 150: Training loss: 0.32655, Training accuracy: 0.06250
Batch: 175: Training loss: 0.32512, Training accuracy: 0.12500
Batch: 200: Validation loss: 0.32640, Validation accuracy: 0.09900
Batch: 225: Training loss: 0.32114, Training accuracy: 0.20312
Batch: 250: Training loss: 0.32470, Training accuracy: 0.12500
Batch: 275: Training loss: 0.32804, Training accuracy: 0.06250
Batch: 300: Validation loss: 0.32597, Validation accuracy: 0.09900
Batch: 325: Training loss: 0.32647, Training accuracy: 0.06250
Batch: 350: Training loss: 0.32461, Training accuracy: 0.15625
Batch: 375: Training loss: 0.32900, Trainin

### Without Batch Normalization, accuracy is lower than 15%.
Now I'm going to train the same network using Batch Normalization to over 90% accuracy in the same number of batches.

### BATCH NORMALIZATION: Higher Level Implementation

In [40]:
#Fully connected layer with batch normalization

def fully_connected(prev_layer, num_units, is_training):
    """Return a dense layer followed by batch normalization and ReLU.

    Args:
        prev_layer: tensor used as the input of the new layer.
        num_units: number of units in the new layer.
        is_training: boolean (tensor) forwarded as `training` to the batch
            normalization layer, which uses it to choose between batch and
            population statistics.
    """
    # The dense layer has no bias and no activation: batch normalization
    # computes gamma * x + beta, so beta takes over the role of the bias,
    # and the non-linearity is applied after normalizing.
    linear_out = tf.layers.dense(prev_layer, num_units, use_bias=False, activation=None)

    normalized = tf.layers.batch_normalization(
        linear_out,
        epsilon=0.001,
        beta_initializer=tf.zeros_initializer(),
        gamma_initializer=tf.ones_initializer(),
        training=is_training,
    )

    return tf.nn.relu(normalized)

In [45]:
#Convolutional Layer with BN
def conv_layer(prev_layer, layer_depth, is_training):
    """Return a 3x3 "same"-padded convolution followed by batch normalization and ReLU.

    Args:
        prev_layer: tensor used as the input of the new layer.
        layer_depth: position of the layer in the network; the layer gets
            `layer_depth * 4` filters, and a stride of 2 when the depth is a
            multiple of 3 (stride 1 otherwise).
        is_training: boolean (tensor) forwarded as `training` to the batch
            normalization layer.
    """
    if layer_depth % 3 == 0:
        stride = 2
    else:
        stride = 1

    # No bias and no activation here: both are handled after normalization.
    conv_out = tf.layers.conv2d(
        prev_layer,
        filters=layer_depth * 4,
        kernel_size=3,
        strides=stride,
        padding='same',
        activation=None,
        use_bias=False,
    )
    normalized = tf.layers.batch_normalization(conv_out, training=is_training)

    return tf.nn.relu(normalized)
    

### Training with Batch Normalization
**1** A placeholder is created for is_training, since it is set to True during training and to False during inference.
**2** is_training is passed to both conv_layer and fully_connected.
**3** The optimizer is created inside tf.control_dependencies, since training has to update the population statistics of the normalization layers.

In [47]:
#When training, the moving_mean and moving_variance need to be updated. 
#By default the update ops are placed in tf.GraphKeys.UPDATE_OPS, 
#so they need to be added as a dependency to the train_op

def train(num_batches, batch_size, learning_rate):
    """Build the batch-normalized network, train it on MNIST and print scores.

    Args:
        num_batches: number of training batches to run.
        batch_size: number of examples drawn for each training batch.
        learning_rate: learning rate given to the Adam optimizer.

    The `is_training` placeholder is fed True for optimization steps and
    False for every evaluation. Prints validation loss/accuracy every 100
    batches and the current batch's loss/accuracy on the other multiples of
    25. After training it prints the final validation and test accuracy,
    plus the accuracy over the first 100 test images scored one at a time.
    Returns nothing.
    """
    x = tf.placeholder(tf.float32, [None, 28, 28, 1])
    y_true = tf.placeholder(tf.float32, [None, 10])
    is_training = tf.placeholder(tf.bool)

    # 19 convolutional layers, one per depth in 1..19.
    net = x
    for depth in range(1, 20):
        net = conv_layer(net, depth, is_training)

    # Collapse every non-batch dimension so the dense layers get a 2-D input.
    _, height, width, channels = net.get_shape().as_list()
    net = tf.reshape(net, shape=[-1, height * width * channels])

    # Batch-normalized hidden layer, then a linear dense layer giving the 10 logits.
    hidden = fully_connected(net, 100, is_training)
    logits = tf.layers.dense(hidden, 10)

    elementwise_loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=y_true)
    model_loss = tf.reduce_mean(elementwise_loss)

    # The ops collected under UPDATE_OPS must run with each training step,
    # so the optimizer is created with them as control dependencies.
    with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
        train_op = tf.train.AdamOptimizer(learning_rate).minimize(model_loss)

    hits = tf.equal(tf.argmax(logits, 1), tf.argmax(y_true, 1))
    accuracy = tf.reduce_mean(tf.cast(hits, tf.float32))

    def eval_feed(images, targets):
        # Feed dict for evaluation: is_training is always False here.
        return {x: images, y_true: targets, is_training: False}

    validation_line = 'Batch: {:>2}: Validation loss: {:>3.5f}, Validation accuracy: {:>3.5f}'
    training_line = 'Batch: {:>2}: Training loss: {:>3.5f}, Training accuracy: {:>3.5f}'

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        for batch_i in range(num_batches):
            batch_xs, batch_ys = mnist.train.next_batch(batch_size)

            # One optimization step on the current batch, in training mode.
            sess.run(train_op, {x: batch_xs, y_true: batch_ys, is_training: True})

            if batch_i % 100 == 0:
                loss, acc = sess.run(
                    [model_loss, accuracy],
                    eval_feed(mnist.validation.images, mnist.validation.labels))
                print(validation_line.format(batch_i, loss, acc))
            elif batch_i % 25 == 0:
                loss, acc = sess.run([model_loss, accuracy], eval_feed(batch_xs, batch_ys))
                print(training_line.format(batch_i, loss, acc))

        # Final accuracy on the full validation and test sets.
        acc = sess.run(accuracy, eval_feed(mnist.validation.images, mnist.validation.labels))
        print('Final validation accuracy: {:>3.5f}'.format(acc))
        acc = sess.run(accuracy, eval_feed(mnist.test.images, mnist.test.labels))
        print('Final test accuracy: {:>3.5f}'.format(acc))

        # Score the first 100 test images individually (batches of size 1).
        correct = sum(
            sess.run(accuracy, feed_dict=eval_feed([mnist.test.images[i]],
                                                   [mnist.test.labels[i]]))
            for i in range(100)
        )

        print("Accuracy on 100 samples:", correct/100)


# Settings for this run: how many training batches to process, how many
# examples each batch holds, and the learning rate passed to train().
num_batches = 800
batch_size = 64
learning_rate = 0.002

# Clear the current default graph before a new model is built.
tf.reset_default_graph()

# Build and train the batch-normalized network inside its own fresh graph.
with tf.Graph().as_default():
    train(num_batches, batch_size, learning_rate)
    
            



Batch:  0: Validation loss: 0.69178, Validation accuracy: 0.09860
Batch: 25: Training loss: 0.60032, Training accuracy: 0.07812
Batch: 50: Training loss: 0.49489, Training accuracy: 0.12500
Batch: 75: Training loss: 0.41710, Training accuracy: 0.12500
Batch: 100: Validation loss: 0.37155, Validation accuracy: 0.11260
Batch: 125: Training loss: 0.34586, Training accuracy: 0.12500
Batch: 150: Training loss: 0.33138, Training accuracy: 0.15625
Batch: 175: Training loss: 0.35367, Training accuracy: 0.12500
Batch: 200: Validation loss: 0.35802, Validation accuracy: 0.14740
Batch: 225: Training loss: 0.25998, Training accuracy: 0.46875
Batch: 250: Training loss: 0.23626, Training accuracy: 0.51562
Batch: 275: Training loss: 0.18696, Training accuracy: 0.60938
Batch: 300: Validation loss: 0.15098, Validation accuracy: 0.75120
Batch: 325: Training loss: 0.07846, Training accuracy: 0.87500
Batch: 350: Training loss: 0.09178, Training accuracy: 0.82812
Batch: 375: Training loss: 0.09913, Trainin

#### With Batch Normalization I got excellent performance without any changes to the network and in just one epoch of 800 batches! Accuracy is 99%!

### BATCH NORMALIZATION: Lower Level Implementation

tf.nn.batch_normalization : normalizes a tensor by mean and variance, and applies (optionally) a scale γ to it, as well as an offset β.

In [61]:
#Function to create fully_connected layer. 
def fully_connected(prev_layer, num_units, is_training):
    """Return a dense layer with hand-written batch normalization and ReLU.

    Args:
        prev_layer: tensor used as the input of the new layer.
        num_units: number of units in the new layer.
        is_training: boolean tensor used as the `tf.cond` predicate; when
            true the batch statistics are used and the population statistics
            are updated, otherwise the population statistics are used.
    """
    # Linear part only: no bias and no activation, both come after normalization.
    linear_out = tf.layers.dense(prev_layer, num_units, use_bias=False, activation=None)

    # Learnable offset and scale of y = gamma * x + beta.
    beta = tf.Variable(tf.zeros([num_units]))
    gamma = tf.Variable(tf.ones([num_units]))

    # Running estimates of the population mean and variance (not trained by
    # the optimizer; updated by the assign ops in the training branch).
    pop_mean = tf.Variable(tf.zeros([num_units]), trainable=False)
    pop_variance = tf.Variable(tf.ones([num_units]), trainable=False)

    epsilon = 1e-3
    decay = 0.99

    def normalize(mean, variance):
        # Normalize the linear output with the given statistics, then scale and shift.
        return tf.nn.batch_normalization(linear_out, mean=mean, variance=variance,
                                         offset=beta, scale=gamma, variance_epsilon=epsilon)

    def batch_norm_training():
        # Statistics of the current batch, taken over axis 0.
        batch_mean, batch_variance = tf.nn.moments(linear_out, [0])

        # Moving-average update of the population statistics used at inference.
        update_mean = tf.assign(pop_mean, (pop_mean * decay) + batch_mean*(1-decay))
        update_variance = tf.assign(pop_variance, (pop_variance * decay) + batch_variance*(1-decay))

        # Creating the output under these dependencies forces both updates to run with it.
        with tf.control_dependencies([update_mean, update_variance]):
            return normalize(batch_mean, batch_variance)

    def batch_norm_inference():
        return normalize(pop_mean, pop_variance)

    normalized = tf.cond(is_training, batch_norm_training, batch_norm_inference)

    # ReLU is applied after normalization.
    return tf.nn.relu(normalized)
    

In [62]:
#Function to create convolutional layers with batch normalization.
#Deriving both the number of filters and the stride from the layer depth is not a good CNN design,
#but it is fine for the purpose of practicing with BN.

def conv_layer(prev_layer, layer_depth, is_training):
    """Return a 3x3 'SAME'-padded convolution with hand-written batch normalization and ReLU.

    Args:
        prev_layer: 4-D tensor used as the input of the new layer; its last
            dimension is read as the number of input channels.
        layer_depth: position of the layer in the network; the layer gets
            `layer_depth * 4` output channels, and a stride of 2 when the
            depth is a multiple of 3 (stride 1 otherwise).
        is_training: boolean tensor used as the `tf.cond` predicate; when
            true the batch statistics are used and the population statistics
            are updated, otherwise the population statistics are used.
    """
    if layer_depth % 3 == 0:
        stride = 2
    else:
        stride = 1

    # 3x3 kernel mapping the input channels to the output channels.
    in_channels = prev_layer.get_shape().as_list()[3]
    out_channels = layer_depth * 4
    kernel = tf.Variable(tf.truncated_normal([3, 3, in_channels, out_channels], stddev=0.05))

    # Convolution without bias or activation; both come after normalization.
    conv_out = tf.nn.conv2d(prev_layer, kernel, strides=[1, stride, stride, 1], padding='SAME')

    # Learnable scale and offset of y = gamma * x + beta, one per output channel.
    gamma = tf.Variable(tf.ones([out_channels]))
    beta = tf.Variable(tf.zeros([out_channels]))

    # Running estimates of the population mean and variance (not trained by
    # the optimizer; updated by the assign ops in the training branch).
    pop_mean = tf.Variable(tf.zeros([out_channels]), trainable=False)
    pop_variance = tf.Variable(tf.ones([out_channels]), trainable=False)

    epsilon = 1e-3
    decay = 0.99

    def normalize(mean, variance):
        # Normalize the convolution output with the given statistics, then scale and shift.
        return tf.nn.batch_normalization(conv_out, mean=mean, variance=variance,
                                         offset=beta, scale=gamma, variance_epsilon=epsilon)

    def batch_norm_training():
        # Statistics of the current batch, reduced over axes 0, 1 and 2 so
        # that only the last (channel) axis remains.
        batch_mean, batch_variance = tf.nn.moments(conv_out, [0, 1, 2], keep_dims=False)

        # Moving-average update of the population statistics used at inference.
        update_mean = tf.assign(pop_mean, (pop_mean * decay) + batch_mean*(1-decay))
        update_variance = tf.assign(pop_variance, (pop_variance * decay) + batch_variance*(1-decay))

        # Creating the output under these dependencies forces both updates to run with it.
        with tf.control_dependencies([update_mean, update_variance]):
            return normalize(batch_mean, batch_variance)

    def batch_norm_inference():
        return normalize(pop_mean, pop_variance)

    normalized = tf.cond(is_training, batch_norm_training, batch_norm_inference)

    # ReLU is applied after normalization.
    return tf.nn.relu(normalized)


### Training with BN

In [63]:
#No tf.control_dependencies around the optimizer here: the layer functions above already use it to update the population statistics
def train(num_batches, batch_size, learning_rate):
    """Build the network with hand-written batch normalization, train it on MNIST and print scores.

    Args:
        num_batches: number of training batches to run.
        batch_size: number of examples drawn for each training batch.
        learning_rate: learning rate given to the Adam optimizer.

    The `is_training` placeholder is fed True for optimization steps and
    False for every evaluation. Prints validation loss/accuracy every 100
    batches and the current batch's loss/accuracy on the other multiples of
    25. After training it prints the final validation and test accuracy,
    plus the accuracy over the first 100 test images scored one at a time.
    Returns nothing.
    """
    x = tf.placeholder(tf.float32, [None, 28, 28, 1])
    y_true = tf.placeholder(tf.float32, [None, 10])
    is_training = tf.placeholder(tf.bool)

    # 19 convolutional layers, one per depth in 1..19.
    net = x
    for depth in range(1, 20):
        net = conv_layer(net, depth, is_training)

    # Collapse every non-batch dimension so the dense layers get a 2-D input.
    _, height, width, channels = net.get_shape().as_list()
    net = tf.reshape(net, shape=[-1, height * width * channels])

    # Batch-normalized hidden layer, then a linear dense layer giving the 10 logits.
    hidden = fully_connected(net, 100, is_training)
    logits = tf.layers.dense(hidden, 10)

    elementwise_loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=y_true)
    model_loss = tf.reduce_mean(elementwise_loss)

    # The optimizer is created without any extra control dependencies.
    train_op = tf.train.AdamOptimizer(learning_rate).minimize(model_loss)

    hits = tf.equal(tf.argmax(logits, 1), tf.argmax(y_true, 1))
    accuracy = tf.reduce_mean(tf.cast(hits, tf.float32))

    def eval_feed(images, targets):
        # Feed dict for evaluation: is_training is always False here.
        return {x: images, y_true: targets, is_training: False}

    validation_line = 'Batch: {:>2}: Validation loss: {:>3.5f}, Validation accuracy: {:>3.5f}'
    training_line = 'Batch: {:>2}: Training loss: {:>3.5f}, Training accuracy: {:>3.5f}'

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        for batch_i in range(num_batches):
            batch_xs, batch_ys = mnist.train.next_batch(batch_size)

            # One optimization step on the current batch, in training mode.
            sess.run(train_op, {x: batch_xs, y_true: batch_ys, is_training: True})

            if batch_i % 100 == 0:
                loss, acc = sess.run(
                    [model_loss, accuracy],
                    eval_feed(mnist.validation.images, mnist.validation.labels))
                print(validation_line.format(batch_i, loss, acc))
            elif batch_i % 25 == 0:
                loss, acc = sess.run([model_loss, accuracy], eval_feed(batch_xs, batch_ys))
                print(training_line.format(batch_i, loss, acc))

        # Final accuracy on the full validation and test sets.
        acc = sess.run(accuracy, eval_feed(mnist.validation.images, mnist.validation.labels))
        print('Final validation accuracy: {:>3.5f}'.format(acc))
        acc = sess.run(accuracy, eval_feed(mnist.test.images, mnist.test.labels))
        print('Final test accuracy: {:>3.5f}'.format(acc))

        # Score the first 100 test images individually (batches of size 1).
        correct = sum(
            sess.run(accuracy, feed_dict=eval_feed([mnist.test.images[i]],
                                                   [mnist.test.labels[i]]))
            for i in range(100)
        )

        print("Accuracy on 100 samples:", correct/100)


# Settings for this run: how many training batches to process, how many
# examples each batch holds, and the learning rate passed to train().
num_batches = 800
batch_size = 64
learning_rate = 0.002

# Clear the current default graph before a new model is built.
tf.reset_default_graph()

# Build and train the batch-normalized network inside its own fresh graph.
with tf.Graph().as_default():
    train(num_batches, batch_size, learning_rate)
    

Batch:  0: Validation loss: 0.69078, Validation accuracy: 0.10700
Batch: 25: Training loss: 0.58244, Training accuracy: 0.10938
Batch: 50: Training loss: 0.47270, Training accuracy: 0.04688
Batch: 75: Training loss: 0.39788, Training accuracy: 0.07812
Batch: 100: Validation loss: 0.36073, Validation accuracy: 0.10700
Batch: 125: Training loss: 0.35411, Training accuracy: 0.03125
Batch: 150: Training loss: 0.32505, Training accuracy: 0.14062
Batch: 175: Training loss: 0.35393, Training accuracy: 0.10938
Batch: 200: Validation loss: 0.40405, Validation accuracy: 0.11260
Batch: 225: Training loss: 0.47686, Training accuracy: 0.20312
Batch: 250: Training loss: 0.68870, Training accuracy: 0.10938
Batch: 275: Training loss: 0.41925, Training accuracy: 0.15625
Batch: 300: Validation loss: 0.82895, Validation accuracy: 0.11440
Batch: 325: Training loss: 0.76750, Training accuracy: 0.07812
Batch: 350: Training loss: 0.46322, Training accuracy: 0.28125
Batch: 375: Training loss: 0.39270, Trainin

### As before, the model trained with Batch Normalization reaches better performance and higher accuracy!