In [11]:
import pandas as pd
import numpy as np
from  matplotlib import pyplot as plt


df = pd.read_csv("mnist_train.csv")
df

Unnamed: 0,label,1x1,1x2,1x3,1x4,1x5,1x6,1x7,1x8,1x9,...,28x19,28x20,28x21,28x22,28x23,28x24,28x25,28x26,28x27,28x28
0,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59995,8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
59996,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
59997,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
59998,6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
data = np.array(df)

data.shape

(60000, 785)

In [13]:
m,n = data.shape          # get the shape of the data 
np.random.shuffle(data)  #because data samples werent random 

datatest = data[0:10000].T # transposed the data so that each column represents an example


datatrain = data[10000 : m].T
ytrain = datatrain[0]
xtrain = datatrain[1:n]
xtrain= xtrain/255
xtrain[:,0].shape                                                                  
         

(784,)

In [14]:
def init_parameter():
    w1 = np.random.rand(20, 784) - 0.5
    b1 = np.random.rand(20, 1) - 0.5
    w2 = np.random.rand(10, 20) - 0.5
    b2 = np.random.rand(10, 1) - 0.5
    w3 = np.random.rand(10, 10) - 0.5
    b3 = np.random.rand(10, 1) - 0.5
    return w1, b1, w2, b2, w3, b3

def relu(z):
    return np.maximum(0,z)

def softmax(z):
   return np.exp(z)/sum(np.exp(z))

def deriv_relu(z):
    return z>0

def onehot(y, num_classes):
    onehot_y = np.zeros((num_classes, y.size))
    onehot_y[y, np.arange(y.size)] = 1
    return onehot_y

def forward_propagation(w1, b1, w2, b2, w3, b3, x):
    z1 = w1.dot(x) + b1
    a1 = relu(z1)
    z2 = w2.dot(a1) + b2
    a2 = relu(z2)
    z3 = w3.dot(a2) + b3
    a3 = softmax(z3)
    return z1, a1, z2, a2, z3, a3

def backprop(z1, a1, z2, a2, z3, a3, w2, w3, x, y):
    onehot_y = onehot(y, num_classes=10)
    dz3 = a3 - onehot_y
    dw3 = 1/m * dz3.dot(a2.T)
    db3 = 1/m * np.sum(dz3)
    dz2 = w3.T.dot(dz3) * deriv_relu(z2)
    dw2 = 1/m * dz2.dot(a1.T)
    db2 = 1/m * np.sum(dz2)
    dz1 = w2.T.dot(dz2) * deriv_relu(z1)
    dw1 = 1/m * dz1.dot(x.T)
    db1 = 1/m * np.sum(dz1)
    return dw1, db1, dw2, db2, dw3, db3

def updatepara(w1, b1, w2, b2, w3, b3, dw1, db1, dw2, db2, dw3, db3, alpha):
    w1 = w1 - alpha * dw1
    b1 = b1 - alpha * db1
    w2 = w2 - alpha * dw2
    b2 = b2 - alpha * db2
    w3 = w3 - alpha * dw3
    b3 = b3 - alpha * db3
    return w1, b1, w2, b2, w3, b3

def get_predicts(a3):
    return np.argmax(a3,0)

def get_accuracy(predicts , y):
    print(predicts,y)
    return np.sum(predicts==y)/y.size

def gradientdescent(x, y, ite, alpha):
    w1, b1, w2, b2, w3, b3 = init_parameter()
    for i in range(ite):
        z1, a1, z2, a2, z3, a3 = forward_propagation(w1, b1, w2, b2, w3, b3, x)

        dw1, db1, dw2, db2, dw3, db3 = backprop(z1, a1, z2, a2, z3, a3, w2, w3, x, y)

        w1, b1, w2, b2, w3, b3 = updatepara(w1, b1, w2, b2, w3, b3, dw1, db1, dw2, db2, dw3, db3, alpha)

        if i % 10 == 0:
            print('iteration', i)
            print('accuracy', get_accuracy(get_predicts(a3), y))

    return w1, b1, w2, b2, w3, b3

def stochastic_gradient_descent(x, y, epochs, alpha, batch_size):
    w1, b1, w2, b2, w3, b3 = init_parameter()
    m = x.shape[1]  # Number of training examples

    for epoch in range(epochs):
        for batch_start in range(0, m, batch_size):
            batch_end = min(batch_start + batch_size, m)
            batch_x = x[:, batch_start:batch_end]
            batch_y = y[batch_start:batch_end]

            z1, a1, z2, a2, z3, a3 = forward_propagation(w1, b1, w2, b2, w3, b3, batch_x)
            dw1, db1, dw2, db2, dw3, db3 = backprop(z1, a1, z2, a2, z3, a3, w2, w3, batch_x, batch_y)
            w1, b1, w2, b2, w3, b3 = updatepara(w1, b1, w2, b2, w3, b3, dw1, db1, dw2, db2, dw3, db3, alpha)

        if epoch % 10 == 0:
            print('Epoch', epoch)
            _, _, _, _, _, a3 = forward_propagation(w1, b1, w2, b2, w3, b3, x)
            predicts = get_predicts(a3)
            accuracy = get_accuracy(predicts, y)
            print('Accuracy', accuracy)

    return w1, b1, w2, b2, w3, b3


def mini_batch_gradient_descent(x, y, epochs, alpha, batch_size):
    w1, b1, w2, b2, w3, b3 = init_parameter()
    m = x.shape[1]  # Number of training examples

    for epoch in range(epochs):
        shuffled_indices = np.random.permutation(m)
        x_shuffled = x[:, shuffled_indices]
        y_shuffled = y[shuffled_indices]

        for batch_start in range(0, m, batch_size):
            batch_end = min(batch_start + batch_size, m)
            batch_x = x_shuffled[:, batch_start:batch_end]
            batch_y = y_shuffled[batch_start:batch_end]

            z1, a1, z2, a2, z3, a3 = forward_propagation(w1, b1, w2, b2, w3, b3, batch_x)
            dw1, db1, dw2, db2, dw3, db3 = backprop(z1, a1, z2, a2, z3, a3, w2, w3, batch_x, batch_y)
            w1, b1, w2, b2, w3, b3 = updatepara(w1, b1, w2, b2, w3, b3, dw1, db1, dw2, db2, dw3, db3, alpha)

        if epoch % 10 == 0:
            print('Epoch', epoch)
            _, _, _, _, _, a3 = forward_propagation(w1, b1, w2, b2, w3, b3, x)
            predicts = get_predicts(a3)
            accuracy = get_accuracy(predicts, y)
            print('Accuracy', accuracy)

    return w1, b1, w2, b2, w3, b3




In [15]:
w1,b1,w2,b2,w3,b3 = gradientdescent(xtrain,ytrain,1000,0.35) #ther value of learning rate was decided by hit and trial and constant tweaking which resulted in this value providing upto 94% accuracy

iteration 0
[3 3 3 ... 3 3 3] [5 7 9 ... 6 7 0]
accuracy 0.10326
iteration 10
[2 3 3 ... 0 3 0] [5 7 9 ... 6 7 0]
accuracy 0.1924
iteration 20
[3 9 9 ... 0 4 0] [5 7 9 ... 6 7 0]
accuracy 0.29014
iteration 30
[6 9 9 ... 3 8 0] [5 7 9 ... 6 7 0]
accuracy 0.40024
iteration 40
[6 9 9 ... 6 8 0] [5 7 9 ... 6 7 0]
accuracy 0.55526
iteration 50
[6 9 4 ... 2 4 2] [5 7 9 ... 6 7 0]
accuracy 0.51258
iteration 60
[6 4 4 ... 6 8 0] [5 7 9 ... 6 7 0]
accuracy 0.57778
iteration 70
[6 7 4 ... 6 8 0] [5 7 9 ... 6 7 0]
accuracy 0.61654
iteration 80
[6 7 4 ... 6 8 0] [5 7 9 ... 6 7 0]
accuracy 0.64374
iteration 90
[6 7 4 ... 6 8 0] [5 7 9 ... 6 7 0]
accuracy 0.68118
iteration 100
[6 7 4 ... 6 8 0] [5 7 9 ... 6 7 0]
accuracy 0.70624
iteration 110
[8 7 9 ... 6 9 0] [5 7 9 ... 6 7 0]
accuracy 0.72698
iteration 120
[8 7 9 ... 6 9 0] [5 7 9 ... 6 7 0]
accuracy 0.7461
iteration 130
[8 7 9 ... 6 9 0] [5 7 9 ... 6 7 0]
accuracy 0.76312
iteration 140
[8 7 9 ... 6 9 0] [5 7 9 ... 6 7 0]
accuracy 0.7765
iteration

In [17]:
w1 , b1 ,w2 ,b2,w3,b3 =mini_batch_gradient_descent(xtrain,ytrain,10,0.3,10)

Epoch 0
[6 0 9 ... 0 9 2] [5 7 9 ... 6 7 0]
Accuracy 0.14976


In [18]:
w1 , b1 ,w2 ,b2,w3,b3 =stochastic_gradient_descent(xtrain,ytrain,100,0.3,10)

Epoch 0
[8 8 8 ... 2 8 2] [5 7 9 ... 6 7 0]
Accuracy 0.15584
Epoch 10
[1 7 7 ... 2 7 0] [5 7 9 ... 6 7 0]
Accuracy 0.4243
Epoch 20
[2 7 7 ... 2 7 0] [5 7 9 ... 6 7 0]
Accuracy 0.54148
Epoch 30
[2 7 9 ... 2 7 0] [5 7 9 ... 6 7 0]
Accuracy 0.629
Epoch 40
[2 7 9 ... 2 7 0] [5 7 9 ... 6 7 0]
Accuracy 0.69348
Epoch 50
[4 7 9 ... 2 7 0] [5 7 9 ... 6 7 0]
Accuracy 0.73544
Epoch 60
[4 7 9 ... 2 7 0] [5 7 9 ... 6 7 0]
Accuracy 0.76404
Epoch 70
[4 7 9 ... 2 7 0] [5 7 9 ... 6 7 0]
Accuracy 0.78456
Epoch 80
[4 7 9 ... 2 7 0] [5 7 9 ... 6 7 0]
Accuracy 0.80046
Epoch 90
[4 7 9 ... 2 7 0] [5 7 9 ... 6 7 0]
Accuracy 0.81296
