In [2]:
import pandas as pd
import numpy as np
from  matplotlib import pyplot as plt


df = pd.read_csv("mnist_test.csv")

In [3]:
data = np.array(df)
data.shape

(10000, 785)

In [15]:
m,n = data.shape          # get the shape of the data 
np.random.shuffle(data)  #because data samples werent random 

datatest = data[0:1000].T # transposed the data so that each column represents an example


datatrain = data[1000 : m].T
ytrain = datatrain[0]
xtrain = datatrain[1:n]
xtrain= xtrain/255
xtrain[:,0].shape                                                                  
         

(784,)

In [16]:
def init_parameter():
    w1 = np.random.rand(10,784)-0.5
    b1 = np.random.rand(10 ,1)-0.5
    w2 = np.random.rand(10,10)-0.5
    b2 = np.random.rand(10,1)-0.5
    return w1,b1,w2,b2

def relu(z):
    return np.maximum(0,z)

def softmax(z):
   return np.exp(z)/sum(np.exp(z))

def deriv_relu(z):
    return z>0

def onehot(y):
    onehot_y = np.zeros((y.size, y.max() +1 ))
    onehot_y[np.arange(y.size), y] = 1
    onehot_y = onehot_y.T
    return onehot_y


def forward_propagation(w1 ,b1 , w2 ,b2 , x):
    z1= w1.dot(x) + b1
    a1 = relu(z1)
    z2 = w2.dot(a1) +b2
    a2 = softmax(z2)
    return z1 , a1 , z2 , a2 

def backprop(z1 ,a1 , z2 , a2 , w2,x , y):
    onehot_y = onehot(y)
    dz2 = a2 -onehot_y #calculating the deravative by subtracting activation values from truth labels
    dw2 = 1/m * dz2.dot(a1.T) #This line calculates the gradient of the weights (W2) connecting the hidden layer to the output layer.
    db2 = 1/m * np.sum(dz2) #gradient of bias
    dz1 = w2.T.dot(dz2) * deriv_relu(z1)# the derivative of the loss function with respect to the pre-activation values of the hidden layer
    dw1 = 1/m * dz1.dot(x.T)#the gradient of the weights (W1) connecting the input layer to the hidden layer
    db1 = 1/m * np.sum(dz1)#This line computes the gradient of the biases (b1) in the hidden layer. It calculates the average of the derivative of the loss.
    return dw1 , db1 , dw2 , db2 

def updatepara(w1,b1,w2,b2,dw1,db1,dw2,db2,alpha):
    w1 = w1 - alpha*dw1
    b1 = b1 - alpha* db1
    w2 = w2 - alpha*dw2
    b2 = b2 - alpha*db2
    return w1,b1,w2,b2

def get_predicts(a2):
    return np.argmax(a2,0)

def get_accuracy(predicts , y):
    print(predicts,y)
    return np.sum(predicts==y)/y.size


def gradientdescent(x,y,ite,alpha):
    w1,b1,w2,b2 = init_parameter()
    for i in range(ite):
        z1 ,a1, z2,a2 = forward_propagation(w1,b1,w2,b2,x)
        dw1,db1,dw2,db2 = backprop(z1,a1,z2,a2,w2,x,y)
        w1,b1,w2,b2 = updatepara(w1,b1,w2,b2,dw1,db1,dw2,db2,alpha)
        if i% 10 ==0:
            print('iteration',i)
            print('accuracy', get_accuracy(get_predicts(a2),y))
    return w1 , b1 , w2 ,b2




In [17]:
w1,b1,w2,b2 = gradientdescent(xtrain,ytrain,1000,0.47) #ther value of learning rate was decided by hit and trial and constant tweaking which resulted in this value providing upto 94% accuracy

iteration 0
[7 4 4 ... 0 7 8] [8 8 3 ... 1 8 7]
accuracy 0.082
iteration 10
[7 3 3 ... 3 7 0] [8 8 3 ... 1 8 7]
accuracy 0.30377777777777776
iteration 20
[7 5 3 ... 8 8 0] [8 8 3 ... 1 8 7]
accuracy 0.4677777777777778
iteration 30
[1 8 3 ... 1 8 0] [8 8 3 ... 1 8 7]
accuracy 0.6434444444444445
iteration 40
[1 8 3 ... 1 8 0] [8 8 3 ... 1 8 7]
accuracy 0.677
iteration 50
[1 8 3 ... 1 8 0] [8 8 3 ... 1 8 7]
accuracy 0.6825555555555556
iteration 60
[1 8 3 ... 1 8 0] [8 8 3 ... 1 8 7]
accuracy 0.7505555555555555
iteration 70
[1 8 3 ... 1 8 0] [8 8 3 ... 1 8 7]
accuracy 0.7807777777777778
iteration 80
[1 8 3 ... 1 8 0] [8 8 3 ... 1 8 7]
accuracy 0.7976666666666666
iteration 90
[1 8 3 ... 1 8 0] [8 8 3 ... 1 8 7]
accuracy 0.8174444444444444
iteration 100
[1 8 3 ... 1 8 0] [8 8 3 ... 1 8 7]
accuracy 0.8284444444444444
iteration 110
[1 8 3 ... 1 8 0] [8 8 3 ... 1 8 7]
accuracy 0.8378888888888889
iteration 120
[1 8 3 ... 1 8 0] [8 8 3 ... 1 8 7]
accuracy 0.8464444444444444
iteration 130
[1 8 3 .