In [1]:
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
device = torch.device('cuda')
torch.cuda.is_available()

True

In [4]:
x = torch.ones(5, device=device)
y = torch.tensor([1, 2, 3, 4, 5], device=device)

print(torch.mul(x,y))


tensor([1., 2., 3., 4., 5.], device='cuda:0')


In [None]:
class NeuralNetwork:

    @staticmethod
    def sigmoid(x):
        
        return 1 / (1 + torch.exp(-x))

    @staticmethod
    def sigmoid_prime(x):
        a = NeuralNetwork.sigmoid(x)
        return a * (1 - a)

    @staticmethod
    def softmax(x):
        # Stability fix: subtract max for numerical stability
        exp_shifted = torch.exp(x - torch.max(x, dim=0, keepdim=True))
        return exp_shifted / torch.sum(exp_shifted, dim=0, keepdim=True)

    @staticmethod
    def cross_entropy_loss(y_true, y_pred):
        # Add epsilon to avoid log(0)
        eps = 1e-12
        return -torch.mean(y_true * torch.log(y_pred + eps))

    @staticmethod
    def cross_entropy_derivative(y_true, y_pred):
        y_true = y_true.astype(torch.float32)
        return y_pred - y_true

    def __init__(self, layer_sizes):
        self.costs = []
        self.iters = []
        self.weights = []
        self.biases = []
        self.Layers = len(layer_sizes)

        if torch.cuda.is_available():
            device = torch.device('cuda')
        else:
            device = torch.device('cpu')

        for k in range(self.Layers - 1):
            # Xavier initialization for sigmoid
            self.weights.append((torch.from_numpy(np.random.randn(layer_sizes[k + 1], layer_sizes[k]) * np.sqrt(1. / layer_sizes[k]))).to(device=device))
            self.biases.append((torch.from_numpy(np.zeros((layer_sizes[k + 1], 1)))).to(device=device))

    def forward(self, X):
        self.activations = [X]
        self.Z = []

        for i, (w, b) in enumerate(zip(self.weights, self.biases)):
            z = torch.matmul(w, self.activations[-1]) + b
            self.Z.append(z)

            if i == self.Layers - 2:
                a = NeuralNetwork.softmax(z)
            else:
                a = NeuralNetwork.sigmoid(z)

            self.activations.append(a)

        return self.activations[-1]

    def backward(self, X, y):
        dw = [None] * (self.Layers - 1)
        db = [None] * (self.Layers - 1)
        dz = [None] * (self.Layers)

        y_pred = self.activations[-1]
        cost = NeuralNetwork.cross_entropy_loss(y, y_pred)

        # Last layer derivative
        dz[-1] = NeuralNetwork.cross_entropy_derivative(y, y_pred)

        for l in reversed(range(self.Layers - 1)):
            a_prev = self.activations[l]
            dz_current = dz[l + 1]

            dw[l] = torch.dot(dz_current, a_prev.T) / X.shape[1]
            db[l] = torch.sum(dz_current, dim=1, keepdim=True) / X.shape[1]

            if l != 0:
                da_prev = torch.dot(self.weights[l].T, dz_current)
                dz[l] = da_prev * NeuralNetwork.sigmoid_prime(self.Z[l - 1])

        return cost, dw, db

    def train(self, X, y, alpha=0.1, epochs=1000):
        for i in range(epochs):
            self.forward(X)
            cost, dw, db = self.backward(X, y)

            if i % (epochs // 10) == 0 or i == epochs - 1:
                self.costs.append(cost)
                self.iters.append(i)
                print(f"Epoch {i}, Cost: {cost:.4f}")

            for j in range(self.Layers - 1):
                self.weights[j] -= alpha * dw[j]
                self.biases[j] -= alpha * db[j]

    def predict(self, X):
        return self.forward(X)

    def evaluate(self, X, y_true):
        y_pred = self.forward(X)
        pred_labels = torch.argmax(y_pred, dim=0)
        true_labels = torch.argmax(y_true, dim=0)
        accuracy = torch.mean(pred_labels == true_labels)
        return accuracy

In [None]:
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class NeuralNetwork:
    def __init__(self, layer_sizes: list):
        self.Layers = len(layer_sizes)
        self.weights = []
        self.biases = []
        self.costs = []
        self.iters = []

        for i in range(self.Layers - 1):
            w = torch.zeros((layer_sizes[i + 1], layer_sizes[i]), dtype=torch.double, device=device, requires_grad=True)
            b = torch.zeros((layer_sizes[i + 1], 1), dtype=torch.double, device=device, requires_grad=True)
            self.weights.append(w)
            self.biases.append(b)

    def sigmoid(self, x):
        return torch.sigmoid(x)

    def sigmoid_prime(self, x):
        a = self.sigmoid(x)
        return a * (1 - a)

    def softmax(self, x):
        return F.softmax(x, dim=0)

    def forward(self, X):
        self.activations = [X]
        self.Z = []

        for i, (w, b) in enumerate(zip(self.weights, self.biases)):
            z = torch.matmul(w, self.activations[i]) + b
            self.Z.append(z)
            if i == len(self.weights) - 1:
                a = self.softmax(z)
            else:
                a = self.sigmoid(z)
            self.activations.append(a)

        return self.activations[-1]

    def calculate_loss(self, y_true, y_pred):
        return -torch.sum(y_true * torch.log(y_pred + 1e-9), dim=0).mean()

    def backward(self, X, y):
        y_pred = self.activations[-1]
        cost = self.calculate_loss(y, y_pred)

        cost.backward()  # Use autograd!

        grads_w = [w.grad.clone() for w in self.weights]
        grads_b = [b.grad.clone() for b in self.biases]

        # Reset gradients manually
        for w, b in zip(self.weights, self.biases):
            w.grad.zero_()
            b.grad.zero_()

        return cost.item(), grads_w, grads_b

    def train(self, X, y, alpha=0.01, epochs=1000):
        for i in range(epochs):
            y_pred = self.forward(X)
            cost, grads_w, grads_b = self.backward(X, y)

            if i % (epochs // 10) == 0:
                self.costs.append(cost)
                self.iters.append(i)
                print(f"Epoch {i}, Cost: {cost:.4f}")

            # Manual gradient descent
            with torch.no_grad():
                for j in range(self.Layers - 1):
                    self.weights[j] -= alpha * grads_w[j]
                    self.biases[j] -= alpha * grads_b[j]

        return

    def predict(self, X):
        output = self.forward(X)
        return output

    def evaluate(self, X, y_true):
        y_pred = self.predict(X)
        pred_labels = torch.argmax(y_pred, dim=0)
        true_labels = torch.argmax(y_true, dim=0)
        return (pred_labels == true_labels).float().mean().item()


In [20]:
import struct

def load_images(filename):
    with open(filename, 'rb') as f:
        magic, num, rows, cols = struct.unpack(">IIII", f.read(16))
        images = np.frombuffer(f.read(), dtype=np.uint8).reshape(num, rows, cols)
        return images

def load_labels(filename):
    with open(filename, 'rb') as f:
        magic, num = struct.unpack(">II", f.read(8))
        labels = np.frombuffer(f.read(), dtype=np.uint8)
        return labels

train_images = load_images('input/train-images.idx3-ubyte')/255
train_labels = load_labels('input/train-labels.idx1-ubyte')
test_images = load_images('input/t10k-images.idx3-ubyte')/255
test_labels = load_labels('input/t10k-labels.idx1-ubyte')


In [21]:
flat_train_images = (torch.from_numpy(train_images.reshape(60000, 28*28).T)).to('cuda')
flat_test_images = (torch.from_numpy(test_images.reshape(10000, 28*28).T)).to('cuda')

In [22]:
def one_hot(y, num_classes=10):
    return np.eye(num_classes)[y].T  # shape: (10, batch_size)

y_train = (torch.from_numpy(one_hot(train_labels))).to('cuda')
y_test = (torch.from_numpy(one_hot(test_labels))).to('cuda')

In [23]:
model = NeuralNetwork([784, 64, 16, 10])
model.train(flat_train_images, y_train, alpha=3.5, epochs=3000)
acc = model.evaluate(flat_test_images, y_test)
print(f"Test Accuracy: {acc * 100:.2f}%")

Test Accuracy: 28.58%


In [18]:
print(flat_train_images)

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0', dtype=torch.float64)
