In [1]:
import struct
import numpy as np
import matplotlib.pyplot as plt
import os
from datetime import datetime

**Define Data loader & Preprocessing Steps**

In [2]:
# Local data directory for the course material.
# NOTE(review): no visible loader call joins file names onto `path`.
path = os.path.join(os.path.expanduser('~'), 'Documents', 'OR 610')
def read_idx(filename):
    """Read an IDX-format file (the MNIST container format) into an array.

    The 4-byte header holds two zero bytes, a data-type code and the number
    of dimensions; it is followed by one big-endian uint32 per dimension and
    then the raw values.

    Parameters
    ----------
    filename : str or path-like
        Path of the IDX file to read.

    Returns
    -------
    numpy.ndarray
        Writable array with the shape declared in the header. Files with
        type code 0x08 (all MNIST files) yield ``uint8``.

    Raises
    ------
    ValueError
        If the header is not a valid IDX header, or the payload size does
        not match the declared shape.
    """
    # IDX type codes -> NumPy dtypes (multi-byte values are stored big-endian).
    idx_dtypes = {
        0x08: np.uint8,
        0x09: np.int8,
        0x0B: '>i2',
        0x0C: '>i4',
        0x0D: '>f4',
        0x0E: '>f8',
    }
    with open(filename, 'rb') as f:
        zero, data_type, dims = struct.unpack('>HBB', f.read(4))
        if zero != 0 or data_type not in idx_dtypes:
            raise ValueError('not a valid IDX file: %r' % (filename,))
        shape = struct.unpack('>' + 'I' * dims, f.read(4 * dims))
        payload = f.read()
    # np.fromstring is deprecated for binary input; frombuffer is the
    # supported replacement. It returns a read-only view of `payload`, so
    # copy to hand back a writable array as before.
    return np.frombuffer(payload, dtype=idx_dtypes[data_type]).reshape(shape).copy()
    
def oneHotEncoding(label):
    """One-hot encode integer class labels.

    The number of classes is taken as ``max(label) + 1``. The result has one
    row per class and one column per sample.
    """
    num_classes = np.max(label) + 1
    identity = np.eye(num_classes)
    # Picking rows of the identity gives (samples, classes); transpose it.
    return identity[label].T


def imageProcess(data):
    """Scale a 3-D image stack by 1/255 and flatten each image to a column.

    Input is indexed as (image, row, column); the result has one row per
    pixel and one column per image.
    """
    scaled = data / 255
    n_images, height, width = scaled.shape[0], scaled.shape[1], scaled.shape[2]
    flattened = scaled.reshape(n_images, height * width)
    return flattened.T

**Define activation functions for forward pass**

In [None]:
def softMax(X):
    """Column-wise softmax.

    Each column of ``X`` is treated as the class scores of one sample
    (classes along axis 0, matching the ``argmax(..., axis=0)`` used for
    prediction), and is mapped to probabilities that sum to 1.
    """
    X = np.asarray(X, dtype=float)
    # Subtracting the per-column max leaves the result unchanged but keeps
    # np.exp from overflowing on large scores.
    shifted = X - np.max(X, axis=0, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=0, keepdims=True)

def ReLU(z):
    """Rectified linear unit: element-wise ``max(z, 0)``."""
    return np.maximum(z, 0)


def sigmoid(z):
    """Logistic sigmoid ``1 / (1 + exp(-z))``, applied element-wise.

    Computed through the identity ``sigmoid(z) = (1 + tanh(z / 2)) / 2`` so
    that large-magnitude inputs do not overflow ``exp``.
    """
    z = np.asarray(z, dtype=float)
    return 0.5 * (1.0 + np.tanh(0.5 * z))


def tanh(z):
    """Hyperbolic tangent, applied element-wise."""
    return np.tanh(z)



**Define Activation functions for backward pass i.e. first derivative of the forward pass activation function**

In [None]:
def dReLU(z):
    """Derivative of ReLU with respect to its input.

    Returns 1.0 where ``z > 0`` and 0.0 elsewhere (the subgradient at 0 is
    taken to be 0).
    """
    return (np.asarray(z) > 0).astype(float)

def dSigmoid(z):
    """Derivative of the logistic sigmoid with respect to its input.

    Takes the pre-activation ``z`` and returns ``s * (1 - s)`` with
    ``s = sigmoid(z)``.
    """
    z = np.asarray(z, dtype=float)
    # Overflow-free sigmoid via tanh, computed inline so this function does
    # not depend on any other definition.
    s = 0.5 * (1.0 + np.tanh(0.5 * z))
    return s * (1.0 - s)

def dTanh(z):
    """Derivative of tanh with respect to its input: ``1 - tanh(z) ** 2``."""
    return 1.0 - np.tanh(z) ** 2

Multi-class cross-entropy loss with L2 regularization

**Model Procedures**

*Forward Pass:*

\\(Z_i = W_i \bullet x^T + b_i \\)

\\(A_i = \sigma(Z_i)\\)

\\(\hat{y} = A_i\\)

where \\(\sigma\\) is a nonlinear transformation

*Loss Function* with regularization

\\(L(y,\hat{y}) = -\frac{1}{m} \sum_j \sum_i y_i \log(\hat{y}_i) + \frac{\lambda}{2m} \sum_w w^2\\)

*Back propagation: we differentiate the cost function and apply the chain rule, starting at the cost and working backwards until we reach the weights, since we want to learn the weights that give a better fit.*

\\(\frac{\delta L}{\delta w_i} = \frac{\delta L}{\delta \hat{y}} * \frac{\delta \hat{y}}{\delta z} * \frac{\delta z}{\delta w_i}\\)

*Update weights*

\\(w_i = w_i - \eta * \delta w_i - \frac {(w_i * \lambda * \eta)}{m}\\)

where \\(\eta\\) is the learning rate

In [None]:
def crossEntropyR2(y, y_hat, lamda, params):
    """Mean cross-entropy over a batch plus an L2 penalty on W1 and W2.

    Parameters
    ----------
    y : ndarray
        One-hot targets; ``y.shape[1]`` is used as the batch size ``m``.
    y_hat : ndarray
        Predicted probabilities, same shape as ``y``.
    lamda : float
        L2 regularization strength.
    params : dict
        Must contain ``'W1'`` and ``'W2'``; biases are not penalized.

    Returns
    -------
    float
        ``-(1/m) * sum(y * log(y_hat)) + lamda/(2m) * (||W1||^2 + ||W2||^2)``.
    """
    m = y.shape[1]
    # A probability that underflows to exactly 0 would give log(0) = -inf,
    # and 0 * -inf = nan for the non-target classes. Clipping from below
    # keeps the cost finite and leaves ordinary probabilities untouched.
    safe_y_hat = np.clip(y_hat, 1e-12, None)
    data_term = -(1/m) * np.sum(y * np.log(safe_y_hat))
    l2_term = lamda/(2*m) * (np.sum(params['W1']**2) + np.sum(params['W2']**2))
    return data_term + l2_term

def forward(X,params,activation):
    """Forward pass through the two-layer network.

    Parameters
    ----------
    X : ndarray
        Input batch with one column per sample.
    params : dict
        Weights and biases under ``'W1'``, ``'b1'``, ``'W2'``, ``'b2'``.
    activation : callable
        Hidden-layer non-linearity applied element-wise to ``Z1``.

    Returns
    -------
    dict
        ``'Z1'``/``'A1'`` (hidden pre-activation and activation) and
        ``'Z2'``/``'A2'`` (output scores and softmax probabilities).
    """
    forwardPass = {}
    forwardPass['Z1'] = np.matmul(params['W1'], X) + params['b1']
    forwardPass['A1'] = activation(forwardPass['Z1'])
    forwardPass['Z2'] = np.matmul(params['W2'], forwardPass['A1']) + params['b2']
    forwardPass['A2'] = softMax(forwardPass['Z2'])
    return forwardPass


def back(X, y,forwardPass, params,dActivation):
    """Back-propagate the softmax cross-entropy loss through both layers.

    Parameters
    ----------
    X : ndarray
        Input batch, one column per sample; ``X.shape[1]`` is the batch size.
    y : ndarray
        One-hot targets with the same shape as ``forwardPass['A2']``.
    forwardPass : dict
        Must contain ``'Z1'``, ``'A1'`` and ``'A2'`` from the forward pass.
    params : dict
        Must contain ``'W2'``.
    dActivation : callable
        Derivative of the hidden activation, evaluated at ``Z1``.

    Returns
    -------
    dict
        Gradients ``'dZ2'``, ``'dW2'``, ``'db2'``, ``'dA1'``, ``'dZ1'``,
        ``'dW1'``, ``'db1'``. Weight and bias gradients are averaged over
        the batch; the L2 term is not included here.
    """
    m = X.shape[1]
    gradient = {}
    # For softmax + cross-entropy the gradient w.r.t. the scores is A2 - y.
    gradient['dZ2'] = forwardPass['A2'] - y
    gradient['dW2'] = (1. / m) * np.matmul(gradient['dZ2'], forwardPass['A1'].T)
    gradient['db2'] = (1. / m) * np.sum(gradient['dZ2'], axis=1, keepdims=True)
    gradient['dA1'] = np.matmul(params['W2'].T, gradient['dZ2'])
    gradient['dZ1'] = gradient['dA1'] * dActivation(forwardPass['Z1'])
    gradient['dW1'] = (1. / m) * np.matmul(gradient['dZ1'], X.T)
    gradient['db1'] = (1. / m) * np.sum(gradient['dZ1'], axis=1, keepdims=True)
    return gradient

def updater(params,grad,eta,lamda,m):
    """Apply one gradient-descent step with L2 weight decay.

    Weights follow ``w <- w - eta * dw - (w * lamda * eta) / m``; biases are
    not regularized and follow ``b <- b - eta * db``.

    Parameters
    ----------
    params : dict
        Current ``'W1'``, ``'b1'``, ``'W2'``, ``'b2'``; not modified.
    grad : dict
        Gradients ``'dW1'``, ``'db1'``, ``'dW2'``, ``'db2'``.
    eta : float
        Learning rate.
    lamda : float
        L2 regularization strength.
    m : int
        Batch size used to scale the weight-decay term.

    Returns
    -------
    dict
        New parameter dictionary with the same four keys.
    """
    decay = lamda * eta / m
    updatedParams = {}
    updatedParams['W2'] = params['W2'] - eta * grad['dW2'] - decay * params['W2']
    updatedParams['b2'] = params['b2'] - eta * grad['db2']
    updatedParams['W1'] = params['W1'] - eta * grad['dW1'] - decay * params['W1']
    updatedParams['b1'] = params['b1'] - eta * grad['db1']
    return updatedParams

def classifer(X, params,activation):
    """Predict a class index for every column of ``X``.

    Runs the two-layer network (hidden ``activation``, softmax output) and
    returns the row index of the largest output for each column.
    """
    hidden = activation(np.matmul(params['W1'], X) + params['b1'])
    scores = np.matmul(params['W2'], hidden) + params['b2']
    probabilities = softMax(scores)
    return np.argmax(probabilities, axis=0)


Load Data to memory and define hyper params

In [4]:

# Load the IDX files. Images are scaled and flattened by imageProcess into
# (pixels, samples); training labels are one-hot encoded into (classes, samples).
# NOTE(review): file names are resolved relative to the working directory; the
# `path` variable defined next to read_idx is not used here.
# NOTE(review): the first name uses '.idx3' while the other three use '-idx3' /
# '-idx1' -- confirm it matches the file on disk.
X_train = imageProcess(read_idx('train-images.idx3-ubyte'))
y_train = oneHotEncoding(read_idx('train-labels-idx1-ubyte'))
X_test = imageProcess(read_idx('t10k-images-idx3-ubyte'))
# Test labels stay as raw class indices; they are compared directly with the
# argmax predictions returned by classifer.
y_test = read_idx('t10k-labels-idx1-ubyte')

#### General Hyperparameters
m=100 # batch size: number of training columns sampled per iteration
n_x = X_train.shape[0]  # input size (rows of X_train)
n_h = 100  # hidden-layer width (rows of W1)
eta = 1  # learning rate
lamda = 2  # L2 regularization strength
np.random.seed(7)  # seed NumPy's global RNG before weights and batches are drawn
# Number of training-loop iterations. Each iteration draws one random batch of
# m samples, so this counts gradient steps rather than passes over the data.
epoch = 10


  


Sigmoid - Activation function

In [None]:
# Sigmoid network: Gaussian weights scaled by sqrt(1 / fan_in), zero biases.
sigmoidParams = dict(
    W1=np.random.randn(n_h, n_x) * np.sqrt(1. / n_x),
    b1=np.zeros((n_h, 1)),
    W2=np.random.randn(10, n_h) * np.sqrt(1. / n_h),
    b2=np.zeros((10, 1)),
)

start = datetime.now()
for i in range(epoch):
    # Draw a fresh random mini-batch of m training columns.
    idx = np.random.permutation(X_train.shape[1])[:m]
    X, y = X_train[:, idx], y_train[:, idx]
    # Forward pass, then the regularized cost on this batch.
    forwardPass = forward(X, sigmoidParams, sigmoid)
    cost = crossEntropyR2(y, forwardPass['A2'], lamda, sigmoidParams)
    # Backward pass and parameter update.
    gradient = back(X, y, forwardPass, sigmoidParams, dSigmoid)
    sigmoidParams = updater(sigmoidParams, gradient, eta, lamda, m)
difference = datetime.now() - start

# `cost` is the value from the last batch, computed before the final update.
print("Final cost:", cost)
print('time to train:', difference)

# Share of test samples whose predicted class equals the label.
y_hat = classifer(X_test, sigmoidParams, sigmoid)
print('Accuracy:', np.mean(y_hat == y_test))


Final cost: 1.7302034675508282
time to train: 0:00:02.809689
Accuracy: 0.6735


ReLU Activation Function

In [None]:
# ReLU network: Gaussian weights scaled by sqrt(2 / fan_in), zero biases.
reluParams = dict(
    W1=np.random.randn(n_h, n_x) * np.sqrt(2. / n_x),
    b1=np.zeros((n_h, 1)),
    W2=np.random.randn(10, n_h) * np.sqrt(2. / n_h),
    b2=np.zeros((10, 1)),
)

start = datetime.now()
for i in range(epoch):
    # Draw a fresh random mini-batch of m training columns.
    idx = np.random.permutation(X_train.shape[1])[:m]
    X, y = X_train[:, idx], y_train[:, idx]
    # Forward pass, then the regularized cost on this batch.
    forwardPass = forward(X, reluParams, ReLU)
    cost = crossEntropyR2(y, forwardPass['A2'], lamda, reluParams)
    # Backward pass and parameter update.
    gradient = back(X, y, forwardPass, reluParams, dReLU)
    reluParams = updater(reluParams, gradient, eta, lamda, m)
difference = datetime.now() - start

# `cost` is the value from the last batch, computed before the final update.
print("Final cost:", cost)
print('time to train:', difference)

# Share of test samples whose predicted class equals the label.
y_hat = classifer(X_test, reluParams, ReLU)
print('Accuracy:', np.mean(y_hat == y_test))



Final cost: 2.1930798741698205
time to train: 0:00:01.821118
Accuracy: 0.2032


Tanh Activation Function

In [None]:
# Tanh network: Gaussian weights scaled by sqrt(1 / fan_in), zero biases.
tanhParams = dict(
    W1=np.random.randn(n_h, n_x) * np.sqrt(1. / n_x),
    b1=np.zeros((n_h, 1)),
    W2=np.random.randn(10, n_h) * np.sqrt(1. / n_h),
    b2=np.zeros((10, 1)),
)

start = datetime.now()
for i in range(epoch):
    # Draw a fresh random mini-batch of m training columns.
    idx = np.random.permutation(X_train.shape[1])[:m]
    X, y = X_train[:, idx], y_train[:, idx]
    # Forward pass, then the regularized cost on this batch.
    forwardPass = forward(X, tanhParams, tanh)
    cost = crossEntropyR2(y, forwardPass['A2'], lamda, tanhParams)
    # Backward pass and parameter update.
    gradient = back(X, y, forwardPass, tanhParams, dTanh)
    tanhParams = updater(tanhParams, gradient, eta, lamda, m)
difference = datetime.now() - start

# `cost` is the value from the last batch, computed before the final update.
print("Final cost:", cost)
print('time to train:', difference)

# Share of test samples whose predicted class equals the label.
y_hat = classifer(X_test, tanhParams, tanh)
print('Accuracy:', np.mean(y_hat == y_test))

Final cost: 3.921825705663649
time to train: 0:00:02.544482
Accuracy: 0.1703
