# Backprop from scratch — Placeholder (Student TODO)

Implement the `TODO` sections in order:
1. Losses (MSE, CrossEntropy)
2. Layer forward/backward
3. step() with **L2 regularization on weights only**
4. train() mini-batch SGD + validation logging


In [None]:
import numpy as np

# Tiny positive constant; presumably intended as a numerical-stability floor
# (e.g. keeping log() arguments away from zero) -- confirm at its use sites.
EPS = 1e-12

def accuracy_from_logits(probs, y_onehot):
    """Return the fraction of rows whose arg-max class matches the one-hot target.

    Both arguments are indexed as (sample, class): the arg-max is taken along
    axis 1 of each and the element-wise agreement is averaged.
    """
    hits = np.argmax(probs, axis=1) == np.argmax(y_onehot, axis=1)
    return np.mean(hits)


# =========================
# 1) Losses (TODO)
# =========================
class Loss:
    """Interface for loss functions: a scalar value plus its gradient.

    Subclasses override both methods; the base versions only raise
    ``NotImplementedError``.
    """

    def loss(self, x, y):
        """Return the loss for predictions ``x`` against targets ``y``."""
        raise NotImplementedError

    def gradient(self, x, y):
        """Return the derivative of the loss with respect to ``x``."""
        raise NotImplementedError


class MSE(Loss):
    """Mean squared error (mean over all elements)."""

    def loss(self, x, y):
        """Return ``mean((x - y) ** 2)`` taken over every element.

        ``x`` (predictions) and ``y`` (targets) are expected to have the same
        shape; mismatched shapes would be silently broadcast by NumPy.
        """
        diff = np.asarray(x) - np.asarray(y)
        return np.mean(diff ** 2)

    def gradient(self, x, y):
        """Return d/dx of ``mean((x - y) ** 2)``, i.e. ``2 * (x - y) / x.size``.

        The division by the total element count matches the normalisation
        used in :meth:`loss`, so the two stay consistent under a numerical
        gradient check.
        """
        diff = np.asarray(x) - np.asarray(y)
        return 2.0 * diff / diff.size


class CrossEntropy(Loss):
    """Cross-entropy for one-hot y and probability predictions x (softmax output)."""

    def loss(self, x, y):
        """Return the batch-averaged cross-entropy ``-sum(y * log(x)) / N``.

        ``x`` holds one row of class probabilities per sample and ``y`` the
        matching one-hot targets; ``N`` is the number of rows.
        """
        # Clip so that a probability of exactly 0 cannot produce log(0) = -inf.
        probs = np.clip(x, EPS, 1.0)
        return -np.sum(y * np.log(probs)) / probs.shape[0]

    def gradient(self, x, y):
        """Return dL/dx with respect to the *probabilities* ``x``.

        This is ``-y / x / N``. It is deliberately not the fused
        ``(x - y) / N`` shortcut: the softmax Jacobian is applied separately
        by the layer that produced ``x``.
        """
        # Same clipping as in loss(); it also guards the division below.
        probs = np.clip(x, EPS, 1.0)
        return -(y / probs) / probs.shape[0]


# =========================
# 2) Dense Layer (TODO)
# =========================
class Layer:
    """Fully connected layer computing ``activation(x @ W + b)``.

    Supported ``non_linearity`` values are ``None`` (identity), ``"relu"``,
    ``"tanh"`` and ``"soft_max"`` (softmax over axis 1). The layer caches its
    last input and output so that :meth:`backward` can compute gradients.
    """

    def __init__(self, input_dim, output_dim, non_linearity=None) -> None:
        self.in_dim = input_dim
        self.out_dim = output_dim
        self.non_linearity = non_linearity

        # Filled in by forward() / backward().
        self.output = None
        self.input = None
        self.grad_weight = None
        self.grad_bias = None

        # He init is fine for ReLU-ish nets
        self.weights = np.random.randn(self.in_dim, self.out_dim) * np.sqrt(2 / self.in_dim)
        self.bias = np.zeros(self.out_dim)

    def forward(self, x):
        """Return the layer output for a 2-D batch ``x`` of shape (batch, in_dim).

        The input and the activated output are stored on the instance for
        use by :meth:`backward`.

        Raises:
            ValueError: if ``non_linearity`` is not one of the supported values.
        """
        self.input = x
        z = x @ self.weights + self.bias

        if self.non_linearity is None:
            out = z
        elif self.non_linearity == "relu":
            out = np.maximum(0, z)
        elif self.non_linearity == "tanh":
            out = np.tanh(z)
        elif self.non_linearity == "soft_max":
            # Subtracting the row maximum leaves the result unchanged but
            # keeps exp() from overflowing on large logits.
            shifted = z - np.max(z, axis=1, keepdims=True)
            exp_z = np.exp(shifted)
            out = exp_z / np.sum(exp_z, axis=1, keepdims=True)
        else:
            raise ValueError(f"unknown non_linearity: {self.non_linearity!r}")

        self.output = out
        return out

    def backward(self, gradients):
        """
        gradients is dL/d(output_of_this_layer).
        Return gradient to previous layer: dL/d(input_of_this_layer).

        Also stores dL/dW in ``grad_weight`` and dL/db in ``grad_bias``
        (summed over the batch axis).

        Raises:
            RuntimeError: if called before :meth:`forward`.
            ValueError: if ``non_linearity`` is not one of the supported values.
        """
        if self.input is None or self.output is None:
            raise RuntimeError("backward() called before forward()")

        # Convert dL/d(output) into dL/dz for the configured activation.
        if self.non_linearity is None:
            grad_z = gradients
        elif self.non_linearity == "relu":
            grad_z = gradients * (self.output > 0)
        elif self.non_linearity == "tanh":
            grad_z = gradients * (1.0 - self.output ** 2)
        elif self.non_linearity == "soft_max":
            # Jacobian-vector product of softmax, row by row:
            # s * (g - sum(g * s)); avoids building the full Jacobian.
            s = self.output
            grad_z = s * (gradients - np.sum(gradients * s, axis=1, keepdims=True))
        else:
            raise ValueError(f"unknown non_linearity: {self.non_linearity!r}")

        self.grad_weight = self.input.T @ grad_z
        self.grad_bias = np.sum(grad_z, axis=0)
        return grad_z @ self.weights.T


# =========================
# 3) Network + Training (TODO)
# =========================
class NeuralNetwork:
    """Feed-forward network of dense layers trained with mini-batch SGD.

    Every hidden layer uses ReLU. The output layer and the loss are chosen
    together: ``"mse"`` gives a linear output with mean squared error,
    ``"cross_entropy"`` gives a softmax output with cross-entropy.
    """

    def __init__(self, in_dim, layers, out_dim, loss) -> None:
        """Build the layer stack.

        Args:
            in_dim: number of input features.
            layers: iterable of hidden-layer widths, in order.
            out_dim: number of outputs.
            loss: ``"mse"`` or ``"cross_entropy"``.

        Raises:
            ValueError: if ``loss`` is neither of the two supported names.
        """
        self.layers_size = layers
        self.in_dim = in_dim
        self.out_dim = out_dim
        self.loss_name = loss

        self.layers = []
        prev = self.in_dim
        for h in self.layers_size:
            self.layers.append(Layer(prev, h, non_linearity='relu'))
            prev = h

        if loss == "mse":
            self.layers.append(Layer(prev, self.out_dim, non_linearity=None))
            self.loss = MSE()
        elif loss == "cross_entropy":
            self.layers.append(Layer(prev, self.out_dim, non_linearity="soft_max"))
            self.loss = CrossEntropy()
        else:
            raise ValueError("loss must be 'mse' or 'cross_entropy'")

    def forward(self, x):
        """Feed ``x`` through every layer in order and return the final output."""
        for layer in self.layers:
            x = layer.forward(x)
        return x

    def backward(self, gradients):
        """Back-propagate ``gradients`` (dL/d(network output)) through the layers.

        Layers are visited last to first; each layer's ``backward`` receives
        the gradient returned by the layer after it. Returns the gradient
        with respect to the network input.
        """
        for layer in reversed(self.layers):
            gradients = layer.backward(gradients)
        return gradients

    def step(self, lr, l2_lambda=0.0):
        """
        Update parameters with one gradient-descent step.

        L2 regularization (weight decay) is applied ONLY to weights (not biases):

            W <- W - lr * (dW + l2_lambda * W)
            b <- b - lr * db

        Raises:
            RuntimeError: if a layer has no gradients yet (backward() not run).
        """
        for layer in self.layers:
            if layer.grad_weight is None or layer.grad_bias is None:
                raise RuntimeError("step() called before backward()")
            layer.weights -= lr * (layer.grad_weight + l2_lambda * layer.weights)
            layer.bias -= lr * layer.grad_bias

    def train(self, x, y, epochs, lr, batch_size=64, verbose_every=1, x_val=None, y_val=None, l2_lambda=0.0):
        """
        Mini-batch SGD training with optional validation logging.

        Each epoch reshuffles the samples, then for every mini-batch runs
        forward -> loss -> backward -> step (with L2 on weights).

        Args:
            x, y: training inputs and targets, indexed by sample on axis 0.
            epochs: number of passes over the training data.
            lr: learning rate.
            batch_size: samples per mini-batch (the last batch may be smaller).
            verbose_every: print a log line every this many epochs; a falsy
                value disables printing.
            x_val, y_val: optional validation set; used only when both are given.
            l2_lambda: L2 coefficient forwarded to :meth:`step`.

        Returns:
            dict with per-epoch lists ``"train_loss"``, ``"val_loss"`` and
            ``"val_acc"`` (the validation lists stay empty without validation
            data; ``"val_acc"`` is filled only for cross-entropy).

        Raises:
            ValueError: on empty data, mismatched sample counts, or a
                non-positive ``batch_size``.
        """
        x = np.asarray(x)
        y = np.asarray(y)
        n_samples = x.shape[0]
        if n_samples == 0:
            raise ValueError("training data is empty")
        if y.shape[0] != n_samples:
            raise ValueError("x and y must contain the same number of samples")
        if batch_size <= 0:
            raise ValueError("batch_size must be positive")

        has_val = x_val is not None and y_val is not None
        history = {"train_loss": [], "val_loss": [], "val_acc": []}

        for epoch in range(1, epochs + 1):
            order = np.random.permutation(n_samples)
            running = 0.0
            for start in range(0, n_samples, batch_size):
                idx = order[start:start + batch_size]
                xb, yb = x[idx], y[idx]

                pred = self.forward(xb)
                # Weight by batch size so a short final batch does not skew
                # the epoch average.
                running += float(self.loss.loss(pred, yb)) * len(idx)
                self.backward(self.loss.gradient(pred, yb))
                self.step(lr, l2_lambda)

            train_loss = running / n_samples
            history["train_loss"].append(train_loss)
            message = f"Epoch {epoch}/{epochs} - train loss: {train_loss:.6f}"

            if has_val:
                val_pred = self.forward(x_val)
                val_loss = float(self.loss.loss(val_pred, y_val))
                history["val_loss"].append(val_loss)
                message += f" - val loss: {val_loss:.6f}"
                if self.loss_name == "cross_entropy":
                    val_acc = float(accuracy_from_logits(val_pred, y_val))
                    history["val_acc"].append(val_acc)
                    message += f" - val acc: {val_acc:.4f}"

            if verbose_every and epoch % verbose_every == 0:
                print(message)

        return history

    def predict(self, x):
        """Return the network output for ``x`` (same as :meth:`forward`)."""
        return self.forward(x)


In [None]:
# Sanity check: run one random batch through an MSE network, back-propagate
# once, and print each layer's parameter and gradient shapes side by side.
X = np.random.randn(5, 4)              # batch=5, in_dim=4
y = np.random.randn(5, 3)              # regression target for mse (batch=5, out_dim=3)

net = NeuralNetwork(in_dim=4, layers=[8, 8], out_dim=3, loss="mse")
pred = net.forward(X)
print("pred shape:", pred.shape)

# Loss value and its gradient w.r.t. the predictions; the gradient seeds backward().
loss_val = net.loss.loss(pred, y)
grads = net.loss.gradient(pred, y)
net.backward(grads)

# Reads grad_weight / grad_bias from every layer, so this loop fails if
# backward() left either of them unset.
for i, layer in enumerate(net.layers):
    print(f"Layer {i}: W {layer.weights.shape}, dW {layer.grad_weight.shape}, b {layer.bias.shape}, db {layer.grad_bias.shape}")
print("loss:", loss_val)


In [None]:
def one_hot(y, num_classes):
    """Encode integer class labels as rows of a (len(y), num_classes) 0/1 matrix.

    Labels are cast to ``int`` first; row ``i`` gets a 1.0 in column ``y[i]``
    and 0.0 everywhere else.
    """
    labels = np.asarray(y).astype(int)
    count = labels.shape[0]
    encoded = np.zeros((count, num_classes))
    row_ids = np.arange(count)
    encoded[row_ids, labels] = 1.0
    return encoded



def numerical_grad_weight(net, X, Y, layer_index=0, eps=1e-5):
    """Estimate dLoss/dW for one layer by central finite differences.

    Each entry of ``net.layers[layer_index].weights`` is nudged by ``+eps``
    and ``-eps`` in place, the loss is re-evaluated with a full forward pass
    each time, and the entry is put back to its saved value.

    Returns a tuple ``(numgrad, base_loss)``: the estimated gradient (same
    shape as the weights) and the loss measured before any perturbation.
    """
    weights = net.layers[layer_index].weights
    estimate = np.zeros_like(weights)

    def current_loss():
        # Full forward pass with whatever values the weights hold right now.
        return net.loss.loss(net.forward(X), Y)

    # Unperturbed reference loss.
    base_loss = current_loss()

    cells = [(r, c) for r in range(weights.shape[0]) for c in range(weights.shape[1])]
    for r, c in cells:
        saved = weights[r, c]

        weights[r, c] = saved + eps
        loss_plus = current_loss()

        weights[r, c] = saved - eps
        loss_minus = current_loss()

        weights[r, c] = saved
        estimate[r, c] = (loss_plus - loss_minus) / (2 * eps)

    return estimate, base_loss

# small classification net for the check
# (6 random samples with 3 features, random labels from {0, 1} one-hot encoded)
Xc = np.random.randn(6, 3)
yc = one_hot(np.random.randint(0, 2, size=6), 2)

netc = NeuralNetwork(in_dim=3, layers=[5], out_dim=2, loss="cross_entropy")

# analytic grad
# One forward/backward pass; copy the first layer's dW so that later forward
# passes made by the numerical check cannot alias it.
pred = netc.forward(Xc)
loss_val = netc.loss.loss(pred, yc)
grads = netc.loss.gradient(pred, yc)
netc.backward(grads)
analytic = netc.layers[0].grad_weight.copy()

# numerical grad
# Central finite differences on the same layer's weights.
num, _ = numerical_grad_weight(netc, Xc, yc, layer_index=0, eps=1e-5)

# Absolute (not relative) L2 distance between the two gradient estimates.
err = np.linalg.norm(analytic - num)
print("Loss:", loss_val)
print("error:", err)
print("(Rule of thumb: <1e-4 is usually good for this tiny check.)")


In [None]:
import pandas as pd

# Regression demo: every column but the last is a feature, the last column is
# the target (kept 2-D via the -1: slice and divided by 10000).
df = pd.read_csv("/content/sample_data/california_housing_train.csv")

X, y = df.iloc[:, :-1].values, df.iloc[:, -1:].values / 10000

# Shuffle once, then hold out the last 20% of rows for validation.
perm = np.random.permutation(X.shape[0])
X_shuf, y_shuf = X[perm], y[perm]

split = int(X.shape[0] * 0.8)
X_train, y_train = X_shuf[:split], y_shuf[:split]
X_val, y_val = X_shuf[split:], y_shuf[split:]

# Standardize with statistics from the training rows only, so nothing about
# the validation rows leaks into preprocessing. Constant columns get a unit
# divisor instead of 0 to avoid division by zero.
feat_mean = np.mean(X_train, axis=0)
feat_std = np.std(X_train, axis=0)
feat_std[feat_std == 0] = 1.0
X_train = (X_train - feat_mean) / feat_std
X_val = (X_val - feat_mean) / feat_std

model = NeuralNetwork(X.shape[1], [16, 32, 32, 16], y.shape[1], "mse")

model.train(X_train, y_train, 1000, 0.001, 64, verbose_every=1, x_val=X_val, y_val=y_val)



In [None]:

# NOTE(review): read_csv treats the first row as a header by default; if this
# CSV has no header row, the first sample is silently consumed as column
# names -- confirm, and pass header=None if so.
df = pd.read_csv("/content/sample_data/mnist_train_small.csv")

# Column 0 is the label, every remaining column is a feature.
X, y = df.iloc[:, 1:].values, df.iloc[:, 0].values

# Rescale the features into [-1, 1] by the largest magnitude present
# (presumably 255 for 8-bit pixel intensities -- confirm). Raw values of that
# size make the first layer's pre-activations, and hence the softmax logits,
# far too large for stable training.
X = X.astype(np.float64)
peak = np.abs(X).max()
if peak > 0:
    X /= peak

# Derive the class count from the largest label rather than the number of
# distinct labels, so a class missing from this sample cannot push a label
# index past the one-hot width.
n_class = int(np.max(y)) + 1
y = one_hot(y, n_class)

# Shuffle once, then hold out the last 20% of rows for validation.
perm = np.random.permutation(X.shape[0])
X_shuf, y_shuf = X[perm], y[perm]

split = int(X.shape[0] * 0.8)
X_train, y_train = X_shuf[:split], y_shuf[:split]
X_val, y_val = X_shuf[split:], y_shuf[split:]

In [None]:
model = NeuralNetwork(X.shape[1], [16, 32, 32, 16], y.shape[1], "cross_entropy")

# Fit on the training split only: X / y still contain the validation rows, so
# training on them would make the reported validation metrics meaningless.
model.train(X_train, y_train, 1000, 0.001, 64, verbose_every=1, x_val=X_val, y_val=y_val)
