In [None]:
'''
Author: Sameer Kumar
Date: 24/10/2018
'''

'''
This file trains Neural Network for Digit Classifications of MNIST Using BackPropagation
Neural Architecture:
1 - Three Layers - 1: Input Layer 2: Hidden Layer 3: Output Layer
Input Layer - 784 Inputs
Hidden Layer - 40 Neurons Activation Function tanh
Output Layer - 10 Neurons Activation Function softmax
2 - Weight: w(i, j) indicates weight fed to ith neuron from jth input
Weights in Layer 2 (Hidden Layer) W_Layer_2 [ w(1,0).....w(1,784)
                                              ...................
                                              w(40,0).....w(40,784)] 40x785
Weights in Layer 3 (Output Layer) W_Layer_3 [ w(1,0).....w(1,40)
                                              ...................
                                              w(10,0).....w(10,40)] 10x41
3 - Energy or Loss function: Cross Entropy 
4 - Weights have bias
Note: This network is designed to have only two weight layers (hidden and output), but the number of neurons in each layer can be user defined.
Weights in Layer 2: n x (785), 784 input neurons, +1 to that to incorporate bias
Weights in Layer3: 10 x (n+1), 10 output neurons works well with Softmax, +1 to incorporate bias.
'''

'''
This code was designed to use following Heuristics to make it efficient:
1 - Scaled/Normalized the Training Set to have mean 0 and variance 1. The parameters used to normalize the Training Set
were also used to normalize the Test Set.
Note: To report accuracy on different Test Set that data must be normalized in the same way as above
2 - Momentum Method was incorporated in update step.
Note: This code has regularization incorporated in it, but a hyperparameter search is needed to choose its strength. Until that is done, set it to zero.
3 - In last layer Softmax activation function coupled with Cross Entropy loss function was used.
4 - Before every epoch of training Shuffling of Training Set Elements was done.
5 - Weights were initialized by taking inspiration from Xavier initialisation. I didn't follow it as it is, but modified to get:
    Layer 1 weights: Gaussian Random Variable N(0, sqrt(2/784))
    Layer 2 weights: Gaussian Random Variable N(0, sqrt(2/10))
    Imp: Even if you change the number of neurons in the hidden layer, don't change how the weights are initialised.
Note: Code requires optimization to decrease execution time. 
Note: Set alpha = 0, beta = 0.9, eta = 5
'''

In [None]:
# Import Required Files
import numpy
from sklearn import preprocessing
import matplotlib.pyplot as plt
from LoadData import load_training_labels, load_training_images
from LoadData import load_test_images, load_test_labels
from LoadData import InitialWeights

In [None]:
# Load Data
# Read the raw training images once and reuse the array both for fitting the
# scaler and for transforming, instead of decoding the gzip file twice.
train_images_raw = load_training_images('train-images-idx3-ubyte.gz')
scaler = preprocessing.StandardScaler().fit(train_images_raw)
Images = scaler.transform(train_images_raw)
Labels_Train = load_training_labels('train-labels-idx1-ubyte.gz')
# The test images are standardised with the statistics fitted on the training set.
Test = scaler.transform(load_test_images('t10k-images-idx3-ubyte.gz'))
Labels_Test = load_test_labels('t10k-labels-idx1-ubyte.gz')
# Reuse previously saved initial weights when both files are present, so that
# repeated runs start from the same point; otherwise generate and save them.
if numpy.DataSource().exists('InitialWeightsLayer2.txt') and numpy.DataSource().exists('InitialWeightsLayer3.txt'):
    W_Layer_2_Guess = numpy.loadtxt('InitialWeightsLayer2.txt')  # Load the Initial Weights
    W_Layer_3_Guess = numpy.loadtxt('InitialWeightsLayer3.txt')  # Load the Initial Weights
else:
    W_Layer_2_Guess, W_Layer_3_Guess = InitialWeights()
    numpy.savetxt('InitialWeightsLayer2.txt', W_Layer_2_Guess)  # Generate the Weights and save them
    numpy.savetxt('InitialWeightsLayer3.txt', W_Layer_3_Guess)  # Generate the Weights and save them

In [None]:
# Hyperparameters
alpha = 0  # Regularization strength
beta = 0.9  # Momentum coefficient
eta = 5  # Learning rate

# Training control
iterations = 0  # Index of the current training epoch
max_iter = 500  # Last epoch index the training loop is allowed to run
epsilon = 0.047  # Stop once (test misclassifications / 10000) drops below this ratio

# Momentum accumulators, all starting at zero:
#   M21 / M22 - layer 2 bias / layer 2 weights
#   M31 / M32 - layer 3 bias / layer 3 weights
M21 = M22 = M31 = M32 = 0

# Per-epoch history, extended after every epoch
Epoch = numpy.array([])  # Epoch indices
CE_Train = numpy.array([])  # Cross entropy on the training set
CE_Test = numpy.array([])  # Cross entropy on the test set
Error_Train = numpy.array([])  # Misclassification count on the training set
Error_Test = numpy.array([])  # Misclassification count on the test set

# Data set dimensions
row, col = Images.shape  # Training inputs (60000x784)
row1, col1 = Test.shape  # Test inputs (10000x784)

# One-hot desired outputs: a 1 in the column given by each sample's label
D_Train = numpy.zeros((row, 10))
D_Train[numpy.arange(row), Labels_Train] = 1
D_Test = numpy.zeros((row1, 10))
D_Test[numpy.arange(row1), Labels_Test] = 1

In [None]:
# Required Functions
# SoftMax
def softmax(v):
    """Return the softmax of the vector ``v``.

    The largest entry of ``v`` is subtracted before exponentiating. This does
    not change the result mathematically but keeps ``numpy.exp`` from
    overflowing for large inputs.
    """
    shifted = v - numpy.max(v)
    exponentials = numpy.exp(shifted)
    normaliser = numpy.sum(exponentials)
    return exponentials / normaliser


# Forward Pass
def forward_pass(image, w_layer_2, w_layer_3):
    """Propagate one input vector through the hidden and output layers.

    A constant 1 is prepended to the input of each layer, so column 0 of each
    weight matrix multiplies that constant and acts as the bias.

    Returns:
        (v2, v3, y2, y3) where v2 and v3 are the induced local fields
        (pre-activation values) of layers 2 and 3, y2 is the tanh output of
        layer 2 and y3 is the softmax output of layer 3.
    """
    biased_input = numpy.concatenate(([1], image), axis=0)
    v2 = numpy.dot(w_layer_2, biased_input)
    hidden_output = numpy.tanh(v2)
    biased_hidden = numpy.concatenate(([1], hidden_output), axis=0)
    v3 = numpy.dot(w_layer_3, biased_hidden)
    return v2, v3, hidden_output, softmax(v3)


# Backward Pass
def backward_pass(image, d, w_layer_3, v2, y2, y3, n_samples=None):
    """Compute the gradients of both weight matrices for a single sample.

    Args:
        image: input vector of the sample (without the bias entry).
        d: desired output vector for the sample.
        w_layer_3: output-layer weights; column 0 is the bias column and is
            excluded when the error is propagated back to the hidden layer.
        v2: induced local field of the hidden layer.
        y2: hidden-layer output (without the bias entry).
        y3: output of the network for the sample.
        n_samples: divisor applied to the output error. When omitted, the
            module-level ``row`` is used.

    Returns:
        (gradient_layer_2, gradient_layer_3), each with the same layout as the
        corresponding weight matrix (bias gradient in column 0).
    """
    if n_samples is None:
        n_samples = row
    delta3 = (1 / n_samples) * (d - y3)
    # Derivative of tanh, evaluated for the whole hidden layer in one call.
    tanh_derivative = 1 - numpy.square(numpy.tanh(v2))
    delta2 = numpy.multiply(numpy.dot(w_layer_3[:, 1:].T, delta3), tanh_derivative)
    # Each gradient is the outer product of the negated local error with the
    # bias-augmented input of that layer.
    gradient_layer_2 = numpy.outer(-delta2, numpy.concatenate(([1], image), axis=0))
    gradient_layer_3 = numpy.outer(-delta3, numpy.concatenate(([1], y2), axis=0))
    return gradient_layer_2, gradient_layer_3


# Update Weights
def update_weights(m21, m22, m31, m32, gradient_layer_2, gradient_layer_3, w_layer_2, w_layer_3):
    """Apply one momentum update to both weight matrices.

    Column 0 of each matrix (the bias) is updated from the gradient alone; the
    remaining columns additionally receive the regularization term
    ``alpha/60000 * weight``. ``alpha``, ``beta`` and ``eta`` are read from
    module level.

    Note that ``w_layer_2`` and ``w_layer_3`` are modified in place; the same
    arrays are also returned.

    Returns:
        (w_layer_2, w_layer_3, m21, m22, m31, m32) with the updated weights and
        momentum terms.
    """
    decay = alpha / 60000

    # Layer 2: new momentum terms first (they use the not-yet-updated weights),
    # then the weights themselves.
    bias_gradient_2 = gradient_layer_2[:, 0]
    weight_gradient_2 = gradient_layer_2[:, 1:]
    m21 = (beta * m21) - (eta * bias_gradient_2)
    m22 = (beta * m22) - (eta * (weight_gradient_2 + decay * w_layer_2[:, 1:]))
    w_layer_2[:, 0] = w_layer_2[:, 0] + m21
    w_layer_2[:, 1:] = w_layer_2[:, 1:] + m22

    # Layer 3: same scheme.
    bias_gradient_3 = gradient_layer_3[:, 0]
    weight_gradient_3 = gradient_layer_3[:, 1:]
    m31 = (beta * m31) - (eta * bias_gradient_3)
    m32 = (beta * m32) - (eta * (weight_gradient_3 + decay * w_layer_3[:, 1:]))
    w_layer_3[:, 0] = w_layer_3[:, 0] + m31
    w_layer_3[:, 1:] = w_layer_3[:, 1:] + m32

    return w_layer_2, w_layer_3, m21, m22, m31, m32


# Calculate Cross Entropy Error and Misclassifications
def calculate_ce(image, label, w_layer_2, w_layer_3):
    """Evaluate the network on a data set.

    Args:
        image: 2-D array with one sample per row.
        label: integer class label for each row of ``image``.
        w_layer_2: hidden-layer weights.
        w_layer_3: output-layer weights.

    Returns:
        (ce, error): ``ce`` is the mean cross entropy over the rows plus
        ``alpha`` times the sum of squared layer-3 weights (bias column
        excluded); ``error`` is the number of rows whose arg-max output
        differs from the label. For an empty data set both are 0.
    """
    rows = image.shape[0]
    if rows == 0:
        return 0, 0
    total_ce = 0.0
    error = 0
    for i in range(rows):
        v2, v3, y2, y3 = forward_pass(image[i], w_layer_2, w_layer_3)
        target = label[i]
        # Cross entropy of a one-hot target is -log(softmax(v3)[target]).
        # Computing it from the logits (log-sum-exp minus the target logit)
        # avoids the NaN produced by 0 * log(0) when a non-target softmax
        # output underflows to exactly zero.
        shifted = v3 - numpy.max(v3)
        total_ce += numpy.log(numpy.sum(numpy.exp(shifted))) - shifted[target]
        if target != numpy.argmax(y3):
            error += 1
    # The penalty does not depend on the sample, so it is added once.
    # NOTE(review): only layer-3 weights are penalised here, while
    # update_weights applies the regularization term to both layers - confirm
    # whether layer 2 should be included.
    penalty = alpha * numpy.sum(numpy.square(w_layer_3[:, 1:]))
    ce = total_ce / rows + penalty
    return ce, error

# Learning Rate Decay
def check_learning_rate(eta_prime, ce, decay=0.4):
    """Shrink the learning rate when the loss did not decrease.

    Args:
        eta_prime: current learning rate.
        ce: sequence of loss values, most recent last.
        decay: factor the learning rate is multiplied by when the latest loss
            is greater than or equal to the previous one.

    Returns:
        ``decay * eta_prime`` if ``ce[-1] >= ce[-2]``, otherwise ``eta_prime``
        unchanged. With fewer than two recorded values there is nothing to
        compare, so the rate is also returned unchanged.
    """
    if len(ce) < 2:
        return eta_prime
    if ce[-1] >= ce[-2]:
        eta_prime = decay * eta_prime
    return eta_prime

In [None]:
# Main Loop
# Train on private copies of the initial weights: update_weights modifies its
# weight arguments in place, so without the copies the stored initial guesses
# would be overwritten and re-running this cell would not restart from them.
temp_w2 = W_Layer_2_Guess.copy()
temp_w3 = W_Layer_3_Guess.copy()
first_epoch = True
# The first epoch always runs; later epochs run while iterations <= max_iter.
while first_epoch or iterations <= max_iter:
    # Backpropagation: one pass over the training set in a freshly shuffled order
    sample_order = numpy.arange(row)
    numpy.random.shuffle(sample_order)  # Shuffling
    for i in sample_order:
        V2, V3, Y2, Y3 = forward_pass(Images[i], temp_w2, temp_w3)
        Gradient_Layer_2, Gradient_Layer_3 = backward_pass(Images[i], D_Train[i], temp_w3, V2, Y2, Y3)
        temp_w2, temp_w3, M21, M22, M31, M32 = update_weights(M21, M22, M31, M32, Gradient_Layer_2, Gradient_Layer_3, temp_w2, temp_w3)
    # Book Keeping
    Epoch = numpy.concatenate((Epoch, [iterations]), axis=0)
    ce_train, e_train = calculate_ce(Images, Labels_Train, temp_w2, temp_w3)
    CE_Train = numpy.concatenate((CE_Train, [ce_train]), axis=0)
    Error_Train = numpy.concatenate((Error_Train, [e_train]), axis=0)
    ce_test, e_test = calculate_ce(Test, Labels_Test, temp_w2, temp_w3)
    CE_Test = numpy.concatenate((CE_Test, [ce_test]), axis=0)
    Error_Test = numpy.concatenate((Error_Test, [e_test]), axis=0)
    # Print
    print('Epoch: ', iterations, ' CE on Training Set: ', ce_train, ' Error on Training Set: ', e_train, ' Error on Test Set: ', e_test, '\n')
    if first_epoch:
        # The first epoch only records metrics: it is not used for the
        # termination test, and learning-rate decay needs two CE values.
        first_epoch = False
        iterations += 1
        continue
    # Check Termination: stop once the test error ratio drops below epsilon
    if (Error_Test[-1] / row1) < epsilon:
        # Save Final Weights
        numpy.savetxt('FinalOptimalWeights2.txt', temp_w2)
        numpy.savetxt('FinalOptimalWeights3.txt', temp_w3)
        print('Optimal Weights Reached!!!!!')
        break
    # Check Learning Rate
    eta = check_learning_rate(eta, CE_Train)
    # Next...
    iterations += 1

In [None]:
# Plot
# Figure 1: misclassification counts per epoch, saved as 1.pdf
fig1, ax1 = plt.subplots()
ax1.plot(Epoch, Error_Train, label='Training Set')
ax1.plot(Epoch, Error_Test, 'g--', label='Test Set')
ax1.set_title(r'No. of Training Iterations VS No. of Misclassifications')
ax1.set_xlabel(r'Epoch $\rightarrow$')
ax1.set_ylabel(r'Misclassifications $\rightarrow$')
ax1.legend()
fig1.savefig('1.pdf')
# Figure 2: cross entropy per epoch, saved as 2.pdf
fig2, ax2 = plt.subplots()
ax2.plot(Epoch, CE_Train, label='Training Set')
ax2.plot(Epoch, CE_Test, 'g--', label='Test Set')
ax2.set_title('No of Training Iterations VS Cross Entropy (CE)')
ax2.set_xlabel(r'Epoch $\rightarrow$')
ax2.set_ylabel(r'CE $\rightarrow$')
ax2.legend()
fig2.savefig('2.pdf')
plt.show()