In [None]:
# Name: Akshay Reddy Akkati
# ID: YX50097

from sklearn.datasets import fetch_openml

# Import the MNIST dataset through scikit-learn's OpenML fetcher.
# as_frame=False forces plain NumPy arrays: newer scikit-learn releases default
# to as_frame='auto', which hands back pandas objects for this dataset, and the
# cells below rely on ndarray operations (y.reshape, column indexing with [:, idx]).
mnist = fetch_openml('mnist_784', version=1, cache=True, as_frame=False)
X, y = mnist["data"], mnist["target"]

# Scale the features by 1/255 (MNIST pixel values are 0-255, so this maps
# them into the [0, 1] range).
X = X / 255

# For testing on XOR, I used the following data:
# inputs = np.array([[0,0],[0,1],[1,0],[1,1]])
# expected_output = np.array([[0],[1],[1],[0]])

In [None]:
import numpy as np
import pandas as pd

# One-hot encode the digit labels.
# The labels are converted into a fresh 1-D integer array instead of reshaping
# `y` in place, so this cell can be re-run without corrupting its own input.
digits = 10
y_int = np.asarray(y).astype('int32').reshape(-1)
examples = y_int.shape[0]
# np.eye(digits)[y_int] has one row per example; transpose to (digits, examples).
Y_new = np.eye(digits)[y_int].T

# Plain ndarray view of the features (also covers the case where X arrived as
# a pandas DataFrame).
X_all = np.asarray(X)
assert X_all.shape[0] == examples, (
    "X and y disagree on the number of examples; re-run the data loading cell")

# split the data into train, dev and test in the ratio of 70, 20 and 10
# number of train records
n = 49000
# total number of train and dev records
m = 63000

m_test = X_all.shape[0] - m
# Samples are moved to the columns: every X_* is (features, examples).
X_train, X_dev, X_test = X_all[:n].T, X_all[n:m].T, X_all[m:].T
Y_train, Y_dev, Y_test = Y_new[:, :n], Y_new[:, n:m], Y_new[:, m:]

# Shuffle the training columns with a dedicated, seeded generator so the split
# is reproducible and the global NumPy random state is left untouched.
shuffle_index = np.random.RandomState(138).permutation(n)
X_train, Y_train = X_train[:, shuffle_index], Y_train[:, shuffle_index]

In [None]:
def sigmoid(z):
    """Logistic function 1 / (1 + exp(-z)), evaluated element-wise by NumPy."""
    decay = np.exp(-z)
    denominator = 1. + decay
    return 1. / denominator

def relu(z):
    """Rectified linear unit: the element-wise maximum of 0 and z."""
    floor = 0
    rectified = np.maximum(floor, z)
    return rectified

def tan_h(z):
    """Hyperbolic tangent of z, evaluated element-wise.

    Delegates to np.tanh rather than the explicit
    (e^z - e^-z) / (e^z + e^-z) quotient: for large |z| the exponentials
    overflow to inf and the quotient becomes inf / inf = nan, whereas
    np.tanh saturates cleanly at +/-1.
    """
    return np.tanh(z)

def compute_loss(Y, Y_hat):
    """Mean categorical cross-entropy between targets Y and predictions Y_hat.

    Args:
        Y: target array; the number of examples is read from Y.shape[1].
        Y_hat: predicted probabilities with the same shape as Y.

    Returns:
        -(1 / m) * sum(Y * log(Y_hat)), where m = Y.shape[1].
    """
    # Floor the probabilities before the log: a probability that has
    # underflowed to exactly 0 would give log(0) = -inf, and 0 * -inf = nan
    # would poison the whole sum.
    eps = 1e-12
    safe_Y_hat = np.maximum(Y_hat, eps)

    L_sum = np.sum(np.multiply(Y, np.log(safe_Y_hat)))
    m = Y.shape[1]
    L = -(1. / m) * L_sum
    return L

def forward_propagate(X, params):
    """Run the forward pass of the one-hidden-layer network.

    Args:
        X: input array with one example per column.
        params: dict holding the weights "W1", "W2" and biases "b1", "b2".

    Returns:
        dict with the pre-activations "Z1", "Z2", the sigmoid hidden
        activation "A1" and the softmax output "A2" (each column of "A2"
        sums to 1).
    """
    cache = {}
    cache["Z1"] = np.matmul(params["W1"], X) + params["b1"]
    cache["A1"] = sigmoid(cache["Z1"])
#     cache["A1"] = tan_h(cache["Z1"])
    cache["Z2"] = np.matmul(params["W2"], cache["A1"]) + params["b2"]

    # Softmax over axis 0. Subtracting the per-column maximum leaves the result
    # mathematically unchanged but keeps np.exp from overflowing to inf (which
    # would turn the column into inf / inf = nan).
    shifted = cache["Z2"] - np.max(cache["Z2"], axis=0, keepdims=True)
    exp_scores = np.exp(shifted)
    cache["A2"] = exp_scores / np.sum(exp_scores, axis=0, keepdims=True)
    return cache


def back_propagate(X, Y, params, cache):
    """Compute the gradients of the mean cross-entropy loss for one batch.

    Args:
        X: input batch with one example per column.
        Y: targets for the batch, same shape as cache["A2"].
        params: dict of parameters; only "W2" is read here.
        cache: forward-pass results; "A1" (sigmoid hidden activation) and
            "A2" (softmax output) are read.

    Returns:
        dict with "dW1", "db1", "dW2" and "db2", each averaged over the batch.
    """
    # Take the batch size from the data that was actually passed in instead of
    # a notebook-level variable, so the function is self-contained. The floor
    # of 1 keeps an empty batch from dividing by zero; its gradients are all
    # zeros either way.
    batch = max(X.shape[1], 1)

    # Softmax combined with cross-entropy gives this simple output error term.
    dZ2 = cache["A2"] - Y
    dW2 = (1./batch) * np.matmul(dZ2, cache["A1"].T)
    db2 = (1./batch) * np.sum(dZ2, axis=1, keepdims=True)

    dA1 = np.matmul(params["W2"].T, dZ2)
    # Sigmoid derivative is s * (1 - s); the forward pass already stored
    # s = sigmoid(Z1) as A1, so reuse it instead of recomputing it twice.
    # If the hidden activation is switched to tan_h, the derivative becomes
    # dZ1 = dA1 * (1 - A1 ** 2).
    A1 = cache["A1"]
    dZ1 = dA1 * A1 * (1 - A1)
    dW1 = (1./batch) * np.matmul(dZ1, X.T)
    db1 = (1./batch) * np.sum(dZ1, axis=1, keepdims=True)

    gradients = {"dW1": dW1, "db1": db1, "dW2": dW2, "db2": db2}

    return gradients

In [None]:
# Seed the global NumPy generator so initialisation and per-epoch shuffling
# are repeatable.
np.random.seed(138)

# hyperparameters
# number of input features (rows of X_train)
n_x = X_train.shape[0]
# number of training examples (columns of X_train)
n_train = X_train.shape[1]
# number of neurons in the hidden layer
number_of_neurons = 56
# learning_rate = step size of each parameter update
learning_rate = 0.3
# beta = momentum coefficient for the running average of the gradients
beta = .9
# batch_size = number of examples per mini-batch
batch_size = 200
# batches = number of mini-batches per epoch (ceiling division). This must be
# derived from the training set size; using the train+dev total would append
# empty mini-batches to every epoch.
batches = -(-n_train // batch_size)
# epochs = number of passes over the training set
epochs = 20

# initialization: weights are scaled by 1/sqrt(fan-in), biases start at zero
params = { "W1": np.random.randn(number_of_neurons, n_x) * np.sqrt(1. / n_x),
           "b1": np.zeros((number_of_neurons, 1)),
           "W2": np.random.randn(digits, number_of_neurons) * np.sqrt(1. / number_of_neurons),
           "b2": np.zeros((digits, 1)) }

# momentum accumulators, one per parameter
V_dW1 = np.zeros(params["W1"].shape)
V_db1 = np.zeros(params["b1"].shape)
V_dW2 = np.zeros(params["W2"].shape)
V_db2 = np.zeros(params["b2"].shape)

# train
for i in range(epochs):

    # reshuffle the training columns at the start of every epoch
    permutation = np.random.permutation(n_train)
    X_train_shuffled = X_train[:, permutation]
    Y_train_shuffled = Y_train[:, permutation]

    for j in range(batches):

        begin = j * batch_size
        # Slice ends are exclusive, so clamp to n_train (not n_train - 1) to
        # keep the last training example in the final mini-batch.
        end = min(begin + batch_size, n_train)
        # Batch-specific names: the notebook-level X (the full dataset) must
        # not be overwritten by a mini-batch.
        X_batch = X_train_shuffled[:, begin:end]
        Y_batch = Y_train_shuffled[:, begin:end]
        # mini-batch size; left at module level for any helper that reads it
        m_batch = end - begin

        cache = forward_propagate(X_batch, params)
        grads = back_propagate(X_batch, Y_batch, params, cache)

        # exponentially weighted average of the gradients (momentum)
        V_dW1 = (beta * V_dW1 + (1. - beta) * grads["dW1"])
        V_db1 = (beta * V_db1 + (1. - beta) * grads["db1"])
        V_dW2 = (beta * V_dW2 + (1. - beta) * grads["dW2"])
        V_db2 = (beta * V_db2 + (1. - beta) * grads["db2"])

        # parameter update along the smoothed gradient
        params["W1"] = params["W1"] - learning_rate * V_dW1
        params["b1"] = params["b1"] - learning_rate * V_db1
        params["W2"] = params["W2"] - learning_rate * V_dW2
        params["b2"] = params["b2"] - learning_rate * V_db2

    # end-of-epoch report on the full train and dev splits
    cache = forward_propagate(X_train, params)
    train_loss = compute_loss(Y_train, cache["A2"])
    cache = forward_propagate(X_dev, params)
    dev_loss = compute_loss(Y_dev, cache["A2"])
    print("At Epoch:" + str(i+1) + " train-loss = " + str(train_loss) + " & dev-loss = " + str(dev_loss))

print("Training finished")

In [None]:
from sklearn.metrics import classification_report
# Test the model on the held-out test data and print the report.
cache = forward_propagate(X_test, params)
# The predicted / true class is the row index of the largest entry per column.
predictions = np.argmax(cache["A2"], axis=0)
labels = np.argmax(Y_test, axis=0)

# classification_report takes (y_true, y_pred) in that order; passing the
# predictions first would swap precision and recall in the printed report.
print(classification_report(labels, predictions))