In [1]:
import numpy as np
import tensorflow as tf

# Fetch the MNIST train/test splits through the Keras dataset helper
# (downloads the archive on first use).
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

# Scale the raw pixel intensities by 1/255.
x_train = x_train / 255.0
x_test = x_test / 255.0

# Flatten every image into a single vector, then transpose so that each
# COLUMN holds one sample (features x samples layout).
x_train = x_train.reshape(len(x_train), -1).T
x_test = x_test.reshape(len(x_test), -1).T

# Labels are later used as integer indices, so store them as int32.
y_train = y_train.astype(np.int32)
y_test = y_test.astype(np.int32)

# Train on a subset only: keep the first 10000 samples (columns) and labels.
x_train = x_train[:, :10000]
y_train = y_train[:10000]


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
11490434/11490434 ━━━━━━━━━━━━━━━━━━━━ 4s 0us/step


In [2]:
# Sanity check of the layout produced above: (features, samples), i.e. one column per image.
x_train.shape

(784, 10000)

In [3]:
def init_params(n_inputs=28*28, n_hidden=10, n_outputs=10, seed=None):
    """Create randomly initialised weights and biases for a two-layer network.

    Weights are drawn from a standard normal distribution and scaled by
    ``sqrt(1 / fan_in)``; biases are drawn from a standard normal distribution
    and scaled by 0.01.

    Parameters
    ----------
    n_inputs : int, optional
        Number of input features (default ``28*28``).
    n_hidden : int, optional
        Number of hidden units (default 10).
    n_outputs : int, optional
        Number of output units / classes (default 10).
    seed : int or None, optional
        When given, a private ``np.random.RandomState(seed)`` is used so the
        result is reproducible. When ``None`` (default) the global NumPy
        random state is used, exactly as before, so ``np.random.seed`` still
        controls the outcome.

    Returns
    -------
    tuple of np.ndarray
        ``(w1, b1, w2, b2)`` with shapes ``(n_hidden, n_inputs)``,
        ``(n_hidden, 1)``, ``(n_outputs, n_hidden)`` and ``(n_outputs, 1)``.
    """
    # Both the ``np.random`` module and ``RandomState`` expose ``randn``.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Draw order (w1, b1, w2, b2) is kept so globally seeded runs are unchanged.
    w1 = rng.randn(n_hidden, n_inputs) * np.sqrt(1. / n_inputs)
    b1 = rng.randn(n_hidden, 1) * 0.01
    w2 = rng.randn(n_outputs, n_hidden) * np.sqrt(1. / n_hidden)
    b2 = rng.randn(n_outputs, 1) * 0.01
    return w1, b1, w2, b2

def ReLu(z):
    """Rectified linear unit: element-wise maximum of ``z`` and zero."""
    rectified = np.maximum(z, 0)
    return rectified

def ReLu_deriv(z):
    """Element-wise ReLU slope: 1.0 where ``z`` is strictly positive, else 0.0."""
    is_positive = z > 0
    return is_positive.astype(float)

def Softmax(z):
    """Column-wise softmax of ``z`` (normalised along axis 0).

    The per-column maximum is subtracted before exponentiating so that large
    inputs do not overflow; this does not change the resulting ratios.
    """
    column_max = np.max(z, axis=0, keepdims=True)
    exponentials = np.exp(z - column_max)
    normaliser = np.sum(exponentials, axis=0, keepdims=True)
    return exponentials / normaliser

def forward_prop(w1, b1, w2, b2, X):
    """Run one forward pass through the two-layer network.

    ``X`` is multiplied from the left by ``w1``, so its first dimension must
    match ``w1``'s second dimension (samples are stored as columns in this
    notebook).

    Returns
    -------
    tuple
        ``(z1, a1, z2, a2)``: hidden pre-activation, hidden ReLU activation,
        output pre-activation and output softmax activation.
    """
    hidden_pre = w1.dot(X) + b1          # affine map into the hidden layer
    hidden_act = ReLu(hidden_pre)
    output_pre = w2.dot(hidden_act) + b2  # affine map into the output layer
    output_act = Softmax(output_pre)
    return hidden_pre, hidden_act, output_pre, output_act

def one_hot(y, num_classes=10):
    """One-hot encode integer class labels, one column per sample.

    Parameters
    ----------
    y : array-like of int
        Class labels; any shape (it is flattened). Lists are accepted.
    num_classes : int, optional
        Number of rows in the encoding (default 10).

    Returns
    -------
    np.ndarray
        Float array of shape ``(num_classes, y.size)`` with a single 1 in each
        column at the row given by the corresponding label.

    Raises
    ------
    IndexError
        If any label lies outside ``[0, num_classes)``. Negative labels are
        rejected explicitly because NumPy would otherwise treat them as
        indices counted from the end and silently encode the wrong class.
    """
    labels = np.asarray(y).reshape(-1)
    encoded = np.zeros((num_classes, labels.size))

    # Nothing to fill in (and min()/max() are undefined on empty input).
    if labels.size == 0:
        return encoded

    if labels.min() < 0 or labels.max() >= num_classes:
        raise IndexError(
            f"labels must be in [0, {num_classes}); "
            f"got range [{labels.min()}, {labels.max()}]"
        )

    # Row = class label, column = sample position.
    encoded[labels, np.arange(labels.size)] = 1
    return encoded

def back_prop(z1, a1, z2, a2, w2, x, y):
    """Compute parameter gradients for the two-layer network.

    Parameters
    ----------
    z1, a1 : np.ndarray
        Hidden-layer pre-activation and activation from the forward pass.
    z2 : np.ndarray
        Output pre-activation. Accepted for symmetry with ``forward_prop``'s
        return value; it is not used in the computation.
    a2 : np.ndarray
        Output activations; its first dimension is taken as the class count.
    w2 : np.ndarray
        Output-layer weights.
    x : np.ndarray
        Inputs used in the forward pass (samples as columns).
    y : np.ndarray
        Integer class labels, one per sample.

    Returns
    -------
    tuple of np.ndarray
        ``(dw1, db1, dw2, db2)``, each averaged over the ``y.size`` samples.
    """
    m = y.size
    # Take the class count from the network output rather than hard-coding 10,
    # so the encoding always matches ``a2``'s shape.
    one_hot_y = one_hot(y, num_classes=a2.shape[0])

    # Output-layer error: predicted probabilities minus one-hot targets
    # (the form that arises for softmax with cross-entropy; the loss itself
    # is not defined in this notebook).
    dz2 = a2 - one_hot_y
    dw2 = (1 / m) * np.dot(dz2, a1.T)
    db2 = (1 / m) * np.sum(dz2, axis=1, keepdims=True)

    # Propagate the error back through w2 and gate it by the ReLU slope.
    dz1 = np.dot(w2.T, dz2) * ReLu_deriv(z1)
    dw1 = (1 / m) * np.dot(dz1, x.T)
    db1 = (1 / m) * np.sum(dz1, axis=1, keepdims=True)

    return dw1, db1, dw2, db2

def update_params(w1, b1, w2, b2, dw1, db1, dw2, db2, alpha):
    """Apply one gradient-descent step and return the updated parameters.

    Each parameter is moved against its gradient by the learning rate
    ``alpha``: ``p_new = p - alpha * dp``.

    The inputs are NOT modified: new arrays are returned instead of updating
    in place, so callers that keep a reference to the old parameters still
    see the old values. This also avoids the casting error an in-place ``-=``
    raises for integer-dtype parameters.

    Returns
    -------
    tuple
        ``(w1, b1, w2, b2)`` after the update.
    """
    w1 = w1 - alpha * dw1
    b1 = b1 - alpha * db1
    w2 = w2 - alpha * dw2
    b2 = b2 - alpha * db2
    return w1, b1, w2, b2

def get_predictions(A2):
    """Return, for every column of ``A2``, the row index of its largest entry."""
    predicted_classes = np.argmax(A2, axis=0)
    return predicted_classes

def get_accuracy(predictions, y):
    """Return the fraction of ``predictions`` that equal the labels ``y``.

    When both inputs hold the same number of elements they are flattened
    before comparison. Without this, a ``(N,)`` prediction vector compared
    against ``(N, 1)`` labels would broadcast to an ``N x N`` matrix and give
    a meaningless accuracy. Inputs of differing sizes (e.g. a scalar label)
    keep NumPy's ordinary broadcasting behaviour.

    Parameters
    ----------
    predictions : array-like
        Predicted class labels.
    y : array-like
        Reference class labels.

    Returns
    -------
    float
        Mean of the element-wise equality.
    """
    predictions = np.asarray(predictions)
    y = np.asarray(y)

    # Same element count: compare position by position regardless of shape.
    if predictions.size == y.size:
        predictions = predictions.reshape(-1)
        y = y.reshape(-1)

    return np.mean(predictions == y)



In [6]:
def gradient_descent(X, Y, iterations, alpha):
    """Train the two-layer network with full-batch gradient descent.

    Freshly initialised parameters are updated ``iterations`` times using
    every sample in ``X``/``Y`` at each step, with learning rate ``alpha``.
    Every 50th iteration the training accuracy is printed; it is computed
    from the forward pass made BEFORE that iteration's update.

    Returns
    -------
    tuple
        The trained parameters ``(w1, b1, w2, b2)``.
    """
    params = init_params()

    for i in range(iterations):
        w1, b1, w2, b2 = params
        z1, a1, z2, a2 = forward_prop(w1, b1, w2, b2, X)
        grads = back_prop(z1, a1, z2, a2, w2, X, Y)
        params = update_params(w1, b1, w2, b2, *grads, alpha)

        if i % 50 != 0:
            continue
        accuracy = get_accuracy(get_predictions(a2), Y)
        print(f"Iteration {i}: Accuracy = {accuracy:.4f}")

    w1, b1, w2, b2 = params
    return w1, b1, w2, b2

# Train on the 10000-sample subset: 1000 full-batch iterations, learning rate 0.1.
# NOTE: no random seed is set in this notebook, so the printed accuracies vary between runs.
w1, b1, w2, b2 = gradient_descent(x_train, y_train, iterations=1000, alpha=0.1)


Iteration 0: Accuracy = 0.1222
Iteration 50: Accuracy = 0.7546
Iteration 100: Accuracy = 0.8372
Iteration 150: Accuracy = 0.8697
Iteration 200: Accuracy = 0.8873
Iteration 250: Accuracy = 0.8980
Iteration 300: Accuracy = 0.9053
Iteration 350: Accuracy = 0.9110
Iteration 400: Accuracy = 0.9165
Iteration 450: Accuracy = 0.9209
Iteration 500: Accuracy = 0.9228
Iteration 550: Accuracy = 0.9252
Iteration 600: Accuracy = 0.9273
Iteration 650: Accuracy = 0.9293
Iteration 700: Accuracy = 0.9299
Iteration 750: Accuracy = 0.9313
Iteration 800: Accuracy = 0.9318
Iteration 850: Accuracy = 0.9327
Iteration 900: Accuracy = 0.9334
Iteration 950: Accuracy = 0.9345


In [8]:
#test data
def test_predictions(w1, b1, w2, b2,x,y):
    """Print the accuracy of the given parameters on inputs ``x`` with labels ``y``.

    Runs a forward pass, takes the most probable class per sample and prints
    the fraction that matches ``y``. Nothing is returned.
    """
    probabilities = forward_prop(w1, b1, w2, b2, x)[-1]
    predicted = get_predictions(probabilities)
    accuracy = get_accuracy(predicted, y)
    print(f"Accuracy = {accuracy:.4f}")

# Evaluate the trained parameters on the MNIST test split, which was not used for training.
test_predictions(w1, b1, w2, b2,x_test,y_test)

Accuracy = 0.9129
