In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
class NeuralNetwork:
    """Fully connected feed-forward classifier trained with mini-batch gradient descent.

    Hidden layers use the sigmoid activation and the output layer uses softmax.
    Samples are stored column-wise: inputs are ``(n_features, n_samples)`` and
    one-hot targets are ``(n_classes, n_samples)``.
    """

    @staticmethod
    def sigmoid(x):
        """Return the element-wise logistic function ``1 / (1 + exp(-x))``.

        Evaluated as ``exp(-log(1 + exp(-x)))`` through ``np.logaddexp`` so that
        large negative inputs do not overflow ``exp`` (which would emit a
        RuntimeWarning in the naive form).
        """
        return np.exp(-np.logaddexp(0, -x))

    @staticmethod
    def sigmoid_prime(x):
        """Return the derivative of the sigmoid evaluated at pre-activation ``x``."""
        a = NeuralNetwork.sigmoid(x)
        return a * (1 - a)

    @staticmethod
    def softmax(x):
        """Return the softmax of ``x`` taken along axis 0 (one distribution per column)."""
        # Subtracting the per-column maximum leaves the result unchanged but
        # keeps the exponentials from overflowing.
        exp_shifted = np.exp(x - np.max(x, axis=0, keepdims=True))
        return exp_shifted / np.sum(exp_shifted, axis=0, keepdims=True)

    @staticmethod
    def cross_entropy_loss(y_true, y_pred):
        """Return the cross-entropy between ``y_true`` and ``y_pred`` as a scalar.

        ``np.mean`` averages over *every* entry of the array, so for
        ``(n_classes, n_samples)`` one-hot targets the value is the usual
        per-sample cross-entropy divided by ``n_classes``.
        """
        # Small epsilon guards against log(0) when a predicted probability is 0.
        eps = 1e-12
        return -np.mean(y_true * np.log(y_pred + eps))

    @staticmethod
    def cross_entropy_derivative(y_true, y_pred):
        """Return ``y_pred - y_true``, used by ``backward`` as the output-layer error."""
        y_true = y_true.astype(np.float32)
        return y_pred - y_true

    def __init__(self, layer_sizes):
        """Create weights and biases for the given layer widths.

        Args:
            layer_sizes: Sequence of layer widths, input layer first and output
                layer last, e.g. ``[784, 32, 10]``.
        """
        self.costs = []   # average cost recorded at each logged epoch
        self.iters = []   # epoch index matching each entry of ``costs``
        self.weights = []
        self.biases = []
        self.Layers = len(layer_sizes)

        for k in range(self.Layers - 1):
            # Xavier-style initialization: scale by sqrt(1 / fan_in).
            self.weights.append(np.random.randn(layer_sizes[k + 1], layer_sizes[k]) * np.sqrt(1. / layer_sizes[k]))
            self.biases.append(np.zeros((layer_sizes[k + 1], 1)))

    def forward(self, X):
        """Run a forward pass and return the output-layer activations.

        Stores every layer's activations in ``self.activations`` (starting with
        ``X`` itself) and every pre-activation in ``self.Z`` for use by
        ``backward``.
        """
        self.activations = [X]
        self.Z = []

        for i, (w, b) in enumerate(zip(self.weights, self.biases)):
            z = np.dot(w, self.activations[-1]) + b
            self.Z.append(z)

            # The final weight layer feeds softmax; all earlier ones feed sigmoid.
            if i == self.Layers - 2:
                a = NeuralNetwork.softmax(z)
            else:
                a = NeuralNetwork.sigmoid(z)

            self.activations.append(a)

        return self.activations[-1]

    def backward(self, X, y):
        """Back-propagate through the activations cached by the last ``forward`` call.

        Args:
            X: Input batch; only ``X.shape[1]`` (the batch size) is used here.
            y: One-hot targets matching the cached output activations.

        Returns:
            Tuple ``(cost, dw, db)`` where ``dw[l]`` / ``db[l]`` are the
            batch-averaged gradients for ``self.weights[l]`` / ``self.biases[l]``.
        """
        dw = [None] * (self.Layers - 1)
        db = [None] * (self.Layers - 1)
        # dz[k] is the error at the pre-activation that produced activations[k];
        # dz[0] (the input layer) is never filled in.
        dz = [None] * (self.Layers)

        y_pred = self.activations[-1]
        cost = NeuralNetwork.cross_entropy_loss(y, y_pred)

        # Output-layer error.
        dz[-1] = NeuralNetwork.cross_entropy_derivative(y, y_pred)

        for l in reversed(range(self.Layers - 1)):
            a_prev = self.activations[l]
            dz_current = dz[l + 1]

            dw[l] = np.dot(dz_current, a_prev.T) / X.shape[1]
            db[l] = np.sum(dz_current, axis=1, keepdims=True) / X.shape[1]

            if l != 0:
                # Propagate the error through the weights, then through the
                # sigmoid that produced activations[l] (pre-activation Z[l - 1]).
                da_prev = np.dot(self.weights[l].T, dz_current)
                dz[l] = da_prev * NeuralNetwork.sigmoid_prime(self.Z[l - 1])

        return cost, dw, db

    def train(self, X, y, alpha1=0.01, epochs=1000, batch_size=32, decay_rate=0.5, decay_interval_percentage=1):
        """Train with shuffled mini-batch gradient descent at a constant learning rate.

        Roughly ten times per run (and always on the last epoch) the average
        batch cost is printed and appended to ``self.costs`` / ``self.iters``.

        Args:
            X: Inputs with one sample per column.
            y: One-hot targets with one sample per column.
            alpha1: Learning rate, held constant for the whole run.
            epochs: Number of passes over the data.
            batch_size: Number of samples per mini-batch; the final batch of an
                epoch may be smaller.
            decay_rate: Currently unused; kept for call compatibility.
            decay_interval_percentage: Currently unused; kept for call compatibility.
        """
        m = X.shape[1]
        # Clamp to 1 so runs shorter than ten epochs do not evaluate ``epoch % 0``.
        log_every = max(epochs // 10, 1)

        for epoch in range(epochs):
            perm = np.random.permutation(m)
            X_shuffled = X[:, perm]
            y_shuffled = y[:, perm]

            alpha = alpha1

            epoch_cost = 0
            num_batches = 0
            for i in range(0, m, batch_size):
                X_batch = X_shuffled[:, i:i + batch_size]
                y_batch = y_shuffled[:, i:i + batch_size]

                self.forward(X_batch)
                cost, dw, db = self.backward(X_batch, y_batch)
                epoch_cost += cost
                num_batches += 1

                for j in range(self.Layers - 1):
                    self.weights[j] -= alpha * dw[j]
                    self.biases[j] -= alpha * db[j]

            if epoch % log_every == 0 or epoch == epochs - 1:
                # Divide by the batches actually processed: ``m // batch_size``
                # ignores a trailing partial batch and is 0 when m < batch_size.
                avg_cost = epoch_cost / max(num_batches, 1)
                print(f"Epoch {epoch}, Cost: {avg_cost:.8f}")
                self.costs.append(avg_cost)
                self.iters.append(epoch)

    def predict(self, X):
        """Return the softmax class probabilities for ``X`` (one column per sample)."""
        return self.forward(X)

    def evaluate(self, X, y_true):
        """Return the fraction of samples whose arg-max prediction matches ``y_true``."""
        y_pred = self.forward(X)
        pred_labels = np.argmax(y_pred, axis=0)
        true_labels = np.argmax(y_true, axis=0)
        accuracy = np.mean(pred_labels == true_labels)
        return accuracy

In [3]:
import struct

def load_images(filename):
    """Read an IDX image file and return its pixels as a uint8 array.

    The first 16 bytes are parsed as four big-endian unsigned 32-bit integers
    (magic number, image count, rows, columns); everything after that is
    treated as raw pixel bytes and reshaped to ``(count, rows, cols)``.
    The magic number is read but not checked.
    """
    with open(filename, 'rb') as fh:
        _magic, count, height, width = struct.unpack(">IIII", fh.read(16))
        payload = fh.read()
        pixels = np.frombuffer(payload, dtype=np.uint8)
        return pixels.reshape(count, height, width)

def load_labels(filename):
    """Read an IDX label file and return its labels as a 1-D uint8 array.

    The first 8 bytes are parsed as two big-endian unsigned 32-bit integers
    (magic number, label count); the remaining bytes are returned as-is, one
    label per byte. Neither header field is checked against the payload.
    """
    with open(filename, 'rb') as fh:
        _magic, _count = struct.unpack(">II", fh.read(8))
        payload = fh.read()
        return np.frombuffer(payload, dtype=np.uint8)

# Images are loaded as uint8 pixels and scaled by 1/255 to floats in [0, 1];
# labels are kept as the raw uint8 values returned by load_labels.
train_images = load_images('input/train-images.idx3-ubyte') / 255
train_labels = load_labels('input/train-labels.idx1-ubyte')

test_images = load_images('input/t10k-images.idx3-ubyte') / 255
test_labels = load_labels('input/t10k-labels.idx1-ubyte')


In [7]:
# Flatten every image into one column so each array is (n_pixels, n_samples),
# the column-per-sample layout NeuralNetwork works with. The sample count is
# read from the array and the pixel count is inferred with -1, instead of
# hard-coding 60000 / 10000 / 28*28, so a subset or a differently sized
# dataset reshapes without edits.
flat_train_images = train_images.reshape(train_images.shape[0], -1).T
flat_test_images = test_images.reshape(test_images.shape[0], -1).T


In [8]:
def one_hot(y, num_classes=10):
    """Return the one-hot encoding of integer labels ``y``, one column per label.

    Each label selects the matching row of a ``num_classes`` identity matrix;
    the result is transposed so its shape is ``(num_classes, len(y))``.
    """
    identity = np.eye(num_classes)
    rows_per_label = identity[y]
    return rows_per_label.T

# One-hot encode the digit labels; one_hot returns one column per sample,
# so each target array has shape (10, n_samples).
y_train = one_hot(train_labels)
y_test = one_hot(test_labels)

In [None]:
# 784 input pixels -> 32 sigmoid hidden units -> 10-way softmax output.
model = NeuralNetwork([784, 32, 10])
model.train(
    flat_train_images,
    y_train,
    alpha1=5,
    epochs=90,
    batch_size=32,
    decay_rate=0.9,  # NOTE: train() does not use decay_rate; the learning rate stays at alpha1
)

# Report held-out accuracy first, then accuracy on the data the model was fit to.
acc = model.evaluate(flat_test_images, y_test)
print(f"Test Accuracy: {acc * 100:.2f}%")
acc = model.evaluate(flat_train_images, y_train)
print(f"Train Accuracy: {acc * 100:.2f}%")

Epoch 0, Cost: 0.02999496
Epoch 9, Cost: 0.00690853
Epoch 18, Cost: 0.00420727
Epoch 27, Cost: 0.00259467
Epoch 36, Cost: 0.00160698
Epoch 45, Cost: 0.00101234
Epoch 54, Cost: 0.00075981
Epoch 63, Cost: 0.00061390
Epoch 72, Cost: 0.00053639
Epoch 81, Cost: 0.00049071
Epoch 89, Cost: 0.00046966
Test Accuracy: 96.47%
Train Accuracy: 99.96%


In [39]:
# Fresh 784-32-10 network trained at a constant learning rate of 6 for 200 epochs.
model = NeuralNetwork([784, 32, 10])
model.train(
    flat_train_images,
    y_train,
    alpha1=6,
    epochs=200,
    batch_size=32,
)

# Report held-out accuracy first, then accuracy on the data the model was fit to.
acc = model.evaluate(flat_test_images, y_test)
print(f"Test Accuracy: {acc * 100:.2f}%")
acc = model.evaluate(flat_train_images, y_train)
print(f"Train Accuracy: {acc * 100:.2f}%")

Epoch 0, Cost: 0.06812535
Epoch 20, Cost: 0.02612143
Epoch 40, Cost: 0.01974093
Epoch 60, Cost: 0.01654301
Epoch 80, Cost: 0.01452527
Epoch 100, Cost: 0.01334013
Epoch 120, Cost: 0.01193555
Epoch 140, Cost: 0.01119794
Epoch 160, Cost: 0.01045328
Epoch 180, Cost: 0.01019849
Epoch 199, Cost: 0.00933423
Test Accuracy: 94.37%
Train Accuracy: 97.58%
