In [1]:
def one_hot_encode(y):
    values = np.unique(y).size
    encoded_outputs = []

    for i in range(len(y)):
        new_output = [0] * values
        new_output[y[i]] = 1
        encoded_outputs.append(new_output)

    return np.array(encoded_outputs)

In [2]:
import numpy as np
import math
import tensorflow

(x_train, y_train), (x_test, y_test) = tensorflow.keras.datasets.mnist.load_data()

x_train = x_train.reshape((60000, 784))
x_test = x_test.reshape((10000, 784))

x_train = x_train.astype('float32') / 255.0
x_test = x_test.astype('float32') / 255.0

x_train = x_train.T
x_test = x_test.T # Transposed to match the standard dimensions... The training/testing examples should be column vectors.

y_train_original = y_train
y_test_original = y_test

y_train = one_hot_encode(y_train)
y_test = one_hot_encode(y_test)

y_train = y_train.T
y_test = y_test.T

2025-10-08 12:58:35.246172: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-10-08 12:58:35.264162: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-10-08 12:58:35.869945: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-10-08 12:58:39.310691: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To tur

In [6]:
x_train.shape # Here, we will be implementing mini batch GD. We need to slice the x train array to get the first n columns where n is the batch size.

(784, 60000)

In [10]:
# slicing to get the first 10 training examples in x_train:
x_train[:, :10].shape

(784, 10)

In [11]:
# We shall be using something like this for the forward and backward prop now instead of the single column vectors being used in SGD. Everything else stays the same for now.

In [38]:
class DFN:
    def __init__(self):
        
        self.weights = []
        self.weights.append(0)
        self.weights.append(np.random.randn(128, 784) * np.sqrt(2. / 784)) 
        self.weights.append(np.random.randn(16, 128) * np.sqrt(2. / 128)) 
        self.weights.append(np.random.randn(10, 16) * np.sqrt(2. / 16))


        self.biases = []
        self.biases.append(0)
        self.biases.append(np.zeros((128, 1)))
        self.biases.append(np.zeros((16, 1)))
        self.biases.append(np.zeros((10, 1)))
        
        # Here, I have removed the sigmoid and it's derivative methods altogether. 

    def RELU(self, x):
        return np.maximum(x, 0)

    def RELU_derivative(self, x):
        return (x > 0).astype(int)

    def softmax(self, x):
        shifted_x = x - np.max(x, axis=0, keepdims=True)
        exps = np.exp(shifted_x)
        return exps / np.sum(exps, axis=0, keepdims=True)

    def softmax_derivative(self):
        pass # implemented in the code.

    def forward_prop(self, input): # This method can actually process mini batches already as it uses matrix multiplication.
        a = []
        z = []
        
        a.append(input)
        z.append([0])
        z_temp = (self.weights[1] @ a[0]) + self.biases[1]
        a_temp = self.RELU(z_temp)

        z.append(z_temp)
        a.append(a_temp)

        z_temp = (self.weights[2] @ a[1]) + self.biases[2]
        a_temp = self.RELU(z_temp)

        z.append(z_temp)
        a.append(a_temp)

        z_temp = (self.weights[3] @ a[2]) + self.biases[3]
        a_temp = self.softmax(z_temp)

        z.append(z_temp)
        a.append(a_temp)

        return z, a

    def back_prop(self, z, a, y, alpha):
        m = a[0].shape[1]
        
        dZ3 = a[3] - y 
        dW3 = (1 / m) * (dZ3 @ a[2].T)
        dB3 = (1 / m) * np.sum(dZ3, axis = 1, keepdims = True)
        
        dA2 = self.weights[3].T @ dZ3
        dZ2 = dA2 * self.RELU_derivative(z[2])
        dW2 = (1 / m) * (dZ2 @ a[1].T)
        dB2 = (1 / m) * np.sum(dZ2, axis = 1, keepdims = True)

        dA1 = self.weights[2].T @ dZ2 
        dZ1 = dA1 * self.RELU_derivative(z[1])
        dW1 = (1 / m) * (dZ1 @ a[0].T)
        dB1 = (1 / m) * np.sum(dZ1, axis = 1, keepdims = True)

        self.weights[3] -= alpha * dW3
        self.biases[3] -= alpha * dB3

        self.weights[2] -= alpha * dW2
        self.biases[2] -= alpha * dB2

        self.weights[1] -= alpha * dW1
        self.biases[1] -= alpha * dB1

    def train(self, x_train, y_train, y_train_original, learning_rate = 0.1, epochs = 20, batch_size = 64):
        samples = x_train.shape[1]

        for epoch in range(epochs):
            print(f"Epoch - {epoch}")

            # Here, we shuffle the indices at every epoch.
            shuffled_index = np.random.permutation(samples)
            x_train_shuffled = x_train[:, shuffled_index]
            y_train_shuffled = y_train[:, shuffled_index]
            
            epoch_loss = 0
            
            for i in range(0, samples, batch_size):
                # Add a variable to hold the index of the end of the current batch, and use it in the indexing 
                # to extract a mini batch of columns instead of a single column.
                end = i + batch_size
                x_batch = x_train_shuffled[:, i:end]
                y_batch = y_train_shuffled[:, i:end]
    
                z, a = self.forward_prop(x_batch)
                self.back_prop(z, a, y_batch, alpha = learning_rate)
                
                y_class_indices = np.argmax(y_batch, axis = 0)
                epsilon = 1e-9 # or else log(0) gives inf.
                log_probs = - np.log(a[3][y_class_indices, np.arange(y_batch.shape[1])] + epsilon)
                loss = np.sum(log_probs)
                self.back_prop(z, a, y_batch, alpha = learning_rate)
                epoch_loss += loss
                
            # calculate epoch loss averaged over epoch
            epoch_loss /= samples
            print(f"Epoch loss = {epoch_loss}\n")

        print("Training complete.")

In [39]:
model = DFN()

model.train(x_train, y_train, y_train_original)

Epoch - 0
Epoch loss = 0.3388507579899295

Epoch - 1
Epoch loss = 0.1274720884221863

Epoch - 2
Epoch loss = 0.09281831736342164

Epoch - 3
Epoch loss = 0.07060219329596319

Epoch - 4
Epoch loss = 0.058258079561518426

Epoch - 5
Epoch loss = 0.048306090979272714

Epoch - 6
Epoch loss = 0.039006205167396564

Epoch - 7
Epoch loss = 0.0328506230055295

Epoch - 8
Epoch loss = 0.02824125850103934

Epoch - 9
Epoch loss = 0.0260483820433257

Epoch - 10
Epoch loss = 0.020112576851689285

Epoch - 11
Epoch loss = 0.017485025199045635

Epoch - 12
Epoch loss = 0.01423745758475049

Epoch - 13
Epoch loss = 0.011188672755925482

Epoch - 14
Epoch loss = 0.00749872002048807

Epoch - 15
Epoch loss = 0.006553118305421625

Epoch - 16
Epoch loss = 0.005117532939057155

Epoch - 17
Epoch loss = 0.0031538335169055986

Epoch - 18
Epoch loss = 0.001492761088867562

Epoch - 19
Epoch loss = 0.0008482850605995106

Training complete.


In [40]:
# Model testing

def evaluate_model(model, x_test, y_test_original):
    print("\nStarting model evaluation on the test set...")
    
    correct_predictions = 0
    num_samples = x_test.shape[1]

    for i in range(num_samples):
        image = x_test[:, i].reshape(-1, 1)
        true_label = y_test_original[i]

        _, a = model.forward_prop(image)
        predicted_label = np.argmax(a[3])

        if predicted_label == true_label:
            correct_predictions += 1
            
    accuracy = (correct_predictions / num_samples) * 100
    print(f"Test Accuracy: {accuracy:.2f}%")
    print(f"Correctly classified {correct_predictions} out of {num_samples} samples.")

In [41]:
evaluate_model(model, x_test, y_test_original) # It generally performs better than the previous model. Let's see if we can make it better...


Starting model evaluation on the test set...
Test Accuracy: 97.97%
Correctly classified 9797 out of 10000 samples.
