In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# load data
dataframe = pd.read_csv("C:\\Abhishek_Data\\My_Data\\Datasets\\Classification\\digit-recognizer\\train.csv")
print(dataframe.shape)
dataframe.head()

In [4]:
data = np.array(dataframe)
m,n = data.shape
np.random.shuffle(data)  # why random,shuffle-> role while training

data_dev = data[0:1000].T
Y_dev = data_dev[0]
X_dev = data_dev[1:n]
X_dev = X_dev / 255. # normalize b/w [0,1] -> why ?

train_data = data[1000:m].T
Y_train = train_data[0]
X_train = train_data[1:n]
X_train = X_train / 255.

In [190]:
def params_init():  

    #random initialization -> can we initialize to '0' or constant, if not why ?
    # why different initialization for different layers 

    # he initialization for ReLu
    W1 = np.random.randn(10, 784)*np.sqrt(2/784)   
    b1 = np.random.randn(10, 1)*np.sqrt(2/784)

    # Xavier (normal) iniitalization for SoftMax
    W2 = np.random.randn(10, 10)*np.sqrt(2/10)
    b2 = np.random.randn(10, 1)*np.sqrt(2/10)

    return W1, b1, W2, b2

In [17]:
def relu(Z):
    return np.maximum(0, Z) # problems in activation functions -> vanishing/exploding gradient and how to solve them

def softmax(Z):
    exp = np.exp(Z - np.max(Z)) 
    return exp / exp.sum(axis=0)

def deriv_ReLU(Z):
    return Z>0

In [18]:
def one_hot(Y): # different encoding techniques
    one_hot_Y = np.zeros((Y.size, Y.max() + 1))
    one_hot_Y[np.arange(Y.size), Y] = 1
    one_hot_Y = one_hot_Y.T
    return one_hot_Y

In [19]:
def forward_prop(W1, b1, W2, b2, X):
    Z1 = W1.dot(X) + b1
    A1 = relu(Z1)
    Z2 = W2.dot(A1) + b2
    A2 = softmax(Z2)

    return Z1, A1, Z2, A2

In [181]:
def backward_prop(Y, A2, Z2, W2, A1, Z1, W1, X):
    m = Y.size
    one_hot_Y = one_hot(Y)
    dZ2 = 2*(A2 - one_hot_Y)   # loss functions and types, where to use which one
    dW2 = 1/m * dZ2.dot(A1.T)
    dB2 = 1/m * np.sum(dZ2, 1)

    dZ1 = W2.T.dot(dZ2) * deriv_ReLU(Z1) 
    dW1 = 1/m * dZ1.dot(X.T)
    dB1 = 1/m * np.sum(dZ1, 1)

    return dW1, dB1, dW2, dB2 

In [21]:
def get_predictions(A2):
    return np.argmax(A2, 0)

def get_accuracy(predictions, Y):
    print(predictions, Y)
    return np.sum(predictions == Y) / Y.size

In [22]:
def update_params(W1, b1, W2, b2, dW1, dB1, dW2, dB2, alpha):   # gradient based update, any queries ??
    W1 = W1 - alpha * dW1
    b1 = b1 - alpha * np.reshape(dB1, (10,1))
    W2 = W2 - alpha * dW2
    b2 = b2 - alpha * np.reshape(dB2, (10,1))

    return W1, b1, W2, b2

In [23]:
def gradient_descent(X, Y, iterations, alpha):
    W1, b1, W2, b2 = params_init()

    for i in range(iterations):
        Z1, A1, Z2, A2 = forward_prop(W1, b1, W2, b2, X)
        dW1, dB1, dW2, dB2 = backward_prop(Y, A2, Z2, W2, A1, Z1, W1, X)
        W1, b1, W2, b2 = update_params(W1, b1, W2, b2, dW1, dB1, dW2, dB2, alpha)

        if i%50 == 0:
            print("Iteration: ", i)
            print("Accuracy: ", get_accuracy(get_predictions(A2), Y))

    return W1, b1, W2, b2

In [194]:
W1, b1, W2, b2 = gradient_descent(X_train, Y_train, 1001, 0.1)

Iteration:  0
[5 5 5 ... 5 6 5] [6 9 7 ... 9 7 8]
Accuracy:  0.1031951219512195
Iteration:  50
[6 9 7 ... 9 7 8] [6 9 7 ... 9 7 8]
Accuracy:  0.6429512195121951
Iteration:  100
[6 9 7 ... 9 7 8] [6 9 7 ... 9 7 8]
Accuracy:  0.7470243902439024
Iteration:  150
[6 9 7 ... 9 7 8] [6 9 7 ... 9 7 8]
Accuracy:  0.7952439024390244
Iteration:  200
[6 9 7 ... 9 7 8] [6 9 7 ... 9 7 8]
Accuracy:  0.8230243902439024
Iteration:  250
[6 9 7 ... 9 7 8] [6 9 7 ... 9 7 8]
Accuracy:  0.8428292682926829
Iteration:  300
[6 9 7 ... 9 7 8] [6 9 7 ... 9 7 8]
Accuracy:  0.8521219512195122
Iteration:  350
[6 9 7 ... 9 7 8] [6 9 7 ... 9 7 8]
Accuracy:  0.8641463414634146
Iteration:  400
[6 9 7 ... 9 7 8] [6 9 7 ... 9 7 8]
Accuracy:  0.8715853658536585
Iteration:  450
[6 9 7 ... 9 7 8] [6 9 7 ... 9 7 8]
Accuracy:  0.8782439024390244
Iteration:  500
[6 9 7 ... 9 7 8] [6 9 7 ... 9 7 8]
Accuracy:  0.8836585365853659
Iteration:  550
[6 9 7 ... 9 7 8] [6 9 7 ... 9 7 8]
Accuracy:  0.8870243902439024
Iteration:  600
[6 