In [10]:
import tensorflow as tf
import time

from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)

Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz


In [11]:
from tensorflow.keras.callbacks import EarlyStopping

In [12]:
# 1024 neurons in each hidden layers
n_hidden_1 = 1024
n_hidden_2 = 1024
n_hidden_3 = 1024

# input size is the size of a picture: 28*28
# output size
input_size = 784
output_size = 10

# Parameters
learning_rate = 0.0005
training_epochs = 100
batch_size = 200
display_step = 1

In [13]:
def layer(x, weight_shape, bias_shape):
    """
    Defines the network layers
    input:
        - x: input vector of the layer
        - weight_shape: shape the the weight maxtrix
        - bias_shape: shape of the bias vector
    output:
        - output vector of the layer after the matrix multiplication and transformation
    """
    
    weight_init = tf.random_normal_initializer(stddev=(2.0/weight_shape[0])**0.5)
    W = tf.get_variable("W", weight_shape, initializer=weight_init,constraint = tf.keras.constraints.MaxNorm())
    
    bias_init = tf.constant_initializer(value=0)
    b = tf.get_variable("b", bias_shape, initializer=bias_init)
    
    return tf.nn.relu(tf.matmul(x, W) + b)

In [14]:
def inference(x, keep_prob):
    """
    define the structure of the whole network
    input:
        - x: a batch of pictures 
        (input shape = (batch_size*image_size))
        - keep_prob: The keep_prob of dropout layer
    output:
        - a batch vector corresponding to the logits predicted by the network
        (output shape = (batch_size*output_size)) 
    """
    if keep_prob == 1:
        x = tf.nn.dropout(x,keep_prob)
    else:
        x = tf.nn.dropout(x, 0.8)
    x = tf.reshape(x,[-1,28,28,1])

    with tf.variable_scope("fully_connected1"):
        
        # pass the output of max-pooling into a Fully_Connected layer
        x = tf.reshape(x,[-1,28*28])
        # after reshaping, use fully-connected layer to compress
        fc_1 = layer(x, [28*28, n_hidden_1], [n_hidden_1])
        
        # apply dropout. You may try to add drop out after every pooling layer.
        # outputs the input element scaled up by 1/keep_prob
        # The scaling is so that the expected sum is unchanged
        fc_1_drop = tf.nn.dropout(fc_1, keep_prob)
    
    with tf.variable_scope("fully_connected2"):
        
        # pass the output of max-pooling into a Fully_Connected layer

        # after reshaping, use fully-connected layer to compress
        fc_2 = layer(fc_1_drop, [n_hidden_1, n_hidden_2], [n_hidden_2])
        
        # apply dropout. You may try to add drop out after every pooling layer.
        # outputs the input element scaled up by 1/keep_prob
        # The scaling is so that the expected sum is unchanged
        fc_2_drop = tf.nn.dropout(fc_2, keep_prob)
        
    with tf.variable_scope("fully_connected3"):
        
        # pass the output of max-pooling into a Fully_Connected layer

        # after reshaping, use fully-connected layer to compress
        fc_3 = layer(fc_2_drop, [n_hidden_2, n_hidden_3], [n_hidden_3])
        
        # apply dropout. You may try to add drop out after every pooling layer.
        # outputs the input element scaled up by 1/keep_prob
        # The scaling is so that the expected sum is unchanged
        fc_3_drop = tf.nn.dropout(fc_3, keep_prob)

    with tf.variable_scope("output"):
        output = layer(fc_3_drop, [n_hidden_3, 10], [10])

    return output


In [15]:
def loss(output, y):
    """
    Computes softmax cross entropy between logits and labels and then the loss 
    
    intput:
        - output: the output of the inference function 
        - y: true value of the sample batch
        
        the two have the same shape (batch_size * num_of_classes)
    output:
        - loss: loss of the corresponding batch (scalar tensor)
    
    """
    xentropy = tf.nn.softmax_cross_entropy_with_logits(logits=output, labels=y)    
    loss = tf.reduce_mean(xentropy)
    return loss

In [16]:
def training(cost, global_step):
    """
    defines the necessary elements to train the network
    
    intput:
        - cost: the cost is the loss of the corresponding batch
        - global_step: number of batch seen so far, it is incremented by one each time the .minimize() function is called
    """
    tf.summary.scalar("cost", cost)
    # using Adam Optimizer 
    optimizer = tf.train.AdamOptimizer(learning_rate)
    #grads = optimizer.compute_gradients(cost)
    #for i, (g,v) in enumerate(grads):
    #    grads[i] = (tf.clip_by_norm(g,3.5),v)
    #train_op = optimizer.apply_gradients(grads, global_step=global_step)
    monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5)
    train_op = optimizer.minimize(cost, global_step=global_step)
    return train_op

In [17]:
def evaluate(output, y):
    """
    evaluates the accuracy on the validation set 
    input:
        -output: prediction vector of the network for the validation set
        -y: true value for the validation set
    output:
        - accuracy: accuracy on the validation set (scalar between 0 and 1)
    """
    #correct prediction is a binary vector which equals one when the output and y match
    #otherwise the vector equals 0
    #tf.cast: change the type of a tensor into another one
    #then, by taking the mean of the tensor, we directly have the average score, so the accuracy
    
    correct_prediction = tf.equal(tf.argmax(output, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    tf.summary.scalar("validation_error", (1.0 - accuracy))
    return accuracy

In [18]:
earlystop_cnt = 0
earlystop_threshold = 16
if __name__ == '__main__':
    
    #please, make sure you changed for your own path 
    log_files_path = 'C:/Users/WeiLiu/logs/CNNs/'
    start_time = time.time()
    
    with tf.Graph().as_default():

        with tf.variable_scope("MNIST_DropoutNNRelu_model"):
            #neural network definition
            
            #the input variables are first define as placeholder 
            # a placeholder is a variable/data which will be assigned later 
            # MNIST data image of shape 28*28=784
            x = tf.placeholder("float", [None, 784]) 
            # 0-9 digits recognition
            y = tf.placeholder("float", [None, 10])  
            
            # dropout probability
            keep_prob = tf.placeholder(tf.float32) 
            #the network is defined using the inference function defined above in the code
            output = inference(x, keep_prob)
            cost = loss(output, y)
            #initialize the value of the global_step variable 
            # recall: it is incremented by one each time the .minimise() is called
            global_step = tf.Variable(0, name='global_step', trainable=False)
            train_op = training(cost, global_step)
            #evaluate the accuracy of the network (done on a validation set)
            eval_op = evaluate(output, y)
            summary_op = tf.summary.merge_all()
            saver = tf.train.Saver()
            sess = tf.Session()
            
            summary_writer = tf.summary.FileWriter(log_files_path, sess.graph)
            init_op = tf.global_variables_initializer()
            sess.run(init_op)
            
            # Training cycle
            for epoch in range(training_epochs):

                avg_cost = 0.0
                total_batch = int(mnist.train.num_examples/batch_size)
                max_val_acc = 0.0
                prev_tr_acc = 0.0
                
                # Loop over all batches
                for i in range(total_batch):
                    
                    minibatch_x, minibatch_y = mnist.train.next_batch(batch_size)
                    
                    # Fit training using batch data
                    sess.run(train_op, feed_dict={x: minibatch_x, y: minibatch_y, keep_prob: 0.5})
                    
                    # Compute average loss
                    avg_cost += sess.run(cost, feed_dict={x: minibatch_x, y: minibatch_y, keep_prob: 0.5})/total_batch
                
                
                # Display logs per epoch step
                if epoch % display_step == 0:
                    
                    print("Epoch:", '%04d' % (epoch+1), "cost =", "{:0.9f}".format(avg_cost))
                    
                    #probability dropout of 1 during validation
                    accuracy_tr = sess.run(eval_op, feed_dict={x: mnist.train.images, y: mnist.train.labels, keep_prob: 0.5})
                    accuracy_val = sess.run(eval_op, feed_dict={x: mnist.validation.images, y: mnist.validation.labels, keep_prob: 1})
                    print("Validation Error:", (1 - accuracy_val))
                    
                    if accuracy_val < max_val_acc:
                        if accuracy_tr > prev_tr_acc or accuracy_tr > 0.99:
                            if earlystop_cnt == earlystop_threshold:
                                print("early stopped on" + str(epoch))
                                break
                            else:
                                print("overfitting warning:" + str(earlystop_cnt))
                                earlystop_cnt += 1
                        else:
                            earlystop_cnt = 0
                    else:
                        earlystop_cnt = 0
                        max_val_acc = accuracy_val
                        
                    prev_tr_acc = accuracy_tr
                    
                    # probability dropout of 0.25 during training
                    summary_str = sess.run(summary_op, feed_dict={x: minibatch_x, y: minibatch_y, keep_prob: 0.5})
                    summary_writer.add_summary(summary_str, sess.run(global_step))
                    
                    saver.save(sess, log_files_path+'model-checkpoint', global_step=global_step)
                    
            print("Optimization Done")
                    
            accuracy = sess.run(eval_op, feed_dict={x: mnist.test.images, y: mnist.test.labels, keep_prob: 1})
            print("Test Accuracy:", accuracy)
                    
        elapsed_time = time.time() - start_time
        print('Execution time was %0.3f' % elapsed_time)

Epoch: 0001 cost = 0.899320444
Validation Error: 0.06279999017715454
Epoch: 0002 cost = 0.282273567
Validation Error: 0.040400028228759766
Epoch: 0003 cost = 0.210046640
Validation Error: 0.03780001401901245
Epoch: 0004 cost = 0.178982695
Validation Error: 0.031199991703033447
Epoch: 0005 cost = 0.154254755
Validation Error: 0.029600024223327637
Epoch: 0006 cost = 0.136103214
Validation Error: 0.02679997682571411
Epoch: 0007 cost = 0.125222788
Validation Error: 0.025399982929229736
Epoch: 0008 cost = 0.113933550
Validation Error: 0.024200022220611572
Epoch: 0009 cost = 0.105201631
Validation Error: 0.025799989700317383
Epoch: 0010 cost = 0.099740214
Validation Error: 0.024200022220611572
Epoch: 0011 cost = 0.093429579
Validation Error: 0.02319997549057007
Epoch: 0012 cost = 0.089445092
Validation Error: 0.02120000123977661
Epoch: 0013 cost = 0.085177386
Validation Error: 0.021399974822998047
Epoch: 0014 cost = 0.078672063
Validation Error: 0.016799986362457275
Epoch: 0015 cost = 0.0742