# Backprop from scratch â€” Placeholder (Student TODO)

Implement the `TODO` sections in order:
1. Losses (MSE, CrossEntropy)
2. Layer forward/backward
3. step() with **L2 regularization on weights only**
4. train() mini-batch SGD + validation logging


In [5]:
import numpy as np

EPS = 1e-12

def accuracy_from_logits(probs, y_onehot):
    pred = np.argmax(probs, axis=1)
    true = np.argmax(y_onehot, axis=1)
    return np.mean(pred == true)
class Loss:
    def loss(self, x, y):
        raise NotImplementedError()

    def gradient(self, x, y):
        raise NotImplementedError()

class MSE(Loss):
    # Mean over ALL elements (batch * out_dim)
    def loss(self, x, y):
        return np.mean(np.power(x - y, 2))

    def gradient(self, x, y):
        # derivative of mean((x-y)^2) is 2*(x-y)/x.size
        return 2 * (x - y) / x.size

class CrossEntropy(Loss):
    # Mean over samples, sum over classes: mean( -sum(y*log(p)) )
    def loss(self, x, y):
        x = np.clip(x, EPS, 1.0)
        return -np.mean(np.sum(y * np.log(x), axis=1))

    def gradient(self, x, y):
        x = np.clip(x, EPS, 1.0)
        n = y.shape[0]
        return -(y / x) / n

class Layer:
    def __init__(self, input_dim, output_dim, non_linearity=None) -> None:
        self.in_dim = input_dim
        self.out_dim = output_dim
        self.non_linearity = non_linearity

        self.output = None
        self.input = None
        self.grad_weight = None
        self.grad_bias = None

        # He init is fine for ReLU-ish nets
        self.weights = np.random.randn(self.in_dim, self.out_dim) * np.sqrt(2 / self.in_dim)
        self.bias = np.zeros(self.out_dim)

    def forward(self, x):
        self.input = x
        z = x @ self.weights + self.bias

        if self.non_linearity is None:
            self.output = z
        elif self.non_linearity == "relu":
            self.output = np.maximum(0, z)
        elif self.non_linearity == "tanh":
            self.output = np.tanh(z)
        elif self.non_linearity == "soft_max":
            # stable softmax
            z = z - np.max(z, axis=1, keepdims=True)
            exp_z = np.exp(z)
            self.output = exp_z / np.sum(exp_z, axis=1, keepdims=True)
        else:
            raise ValueError(f"Unknown non_linearity: {self.non_linearity}")

        return self.output

    def backward(self, gradients):
        # gradients is dL/d(output_of_this_layer)
        if self.non_linearity is None:
            grad = gradients
        elif self.non_linearity == "relu":
            grad = (self.output > 0) * gradients
        elif self.non_linearity == "tanh":
            grad = (1 - self.output ** 2) * gradients
        elif self.non_linearity == "soft_max":
            # Jacobian-vector product for softmax
            # dL/dz = s * (g - sum(g*s))
            s = self.output
            grad = s * (gradients - np.sum(gradients * s, axis=1, keepdims=True))
        else:
            raise ValueError(f"Unknown non_linearity: {self.non_linearity}")

        # parameter gradients
        self.grad_weight = self.input.T @ grad
        self.grad_bias = np.sum(grad, axis=0)

        # propagate to previous layer
        grad_to_input = grad @ self.weights.T
        return grad_to_input

class NeuralNetwork:
    def __init__(self, in_dim, layers, out_dim, loss) -> None:
        self.layers_size = layers
        self.in_dim = in_dim
        self.out_dim = out_dim
        self.loss = loss
        self.layers = []
        prev = self.in_dim
        for h in self.layers_size:
            self.layers.append(Layer(prev, h, non_linearity='relu'))
            prev = h

        if loss == "mse":
            self.layers.append(Layer(prev, self.out_dim, non_linearity=None))
            self.loss = MSE()
        elif loss == "cross_entropy":
            self.layers.append(Layer(prev, self.out_dim, non_linearity="soft_max"))
            self.loss = CrossEntropy()
        else:
            raise ValueError("loss must be 'mse' or 'cross_entropy'")

    def forward(self, x):
        for layer in self.layers:
            x = layer.forward(x)
        return x

    def backward(self, gradients):
        for layer in reversed(self.layers):
            gradients = layer.backward(gradients)

    def step(self, lr):
        for layer in self.layers:
            layer.weights -= lr * layer.grad_weight
            layer.bias -= lr * layer.grad_bias

    def train(self, x, y, epochs, lr, batch_size=64, verbose_every=1 , x_val = None , y_val = None):
        n = x.shape[0]

        for epoch in range(epochs):

            #suffhle
            perm = np.random.permutation(n)
            x_shuf , y_shuf = x[perm] , y[perm]
            total_loss = 0
            n_batches = 0
            for start in range(0, n, batch_size):
                end = min(start + batch_size, n)
                xb = x_shuf[start:end]
                yb = y_shuf[start:end]

               # implement training section
                y_pred = self.forward(xb)
                loss = self.loss.loss(y_pred , yb)
                grad_to_loss = self.loss.gradient(y_pred , yb)
                self.backward(grad_to_loss)
                self.step(lr)
                total_loss += loss
                n_batches +=1


            if (epoch + 1) % verbose_every == 0:

                out_string = f"Epoch {epoch+1}/{epochs} - Train_Loss: {total_loss / n_batches:.4f} "

                if x_val is not None  :
                  y_pred = self.forward(x_val)
                  loss = self.loss.loss(y_pred, y_val)
                  out_string += f"Validation_Loss: {loss:.4f} "
                  if isinstance(self.loss , CrossEntropy) :
                    train_accurcy = accuracy_from_logits(y_pred ,y_val )
                    out_string += f"Validation_acc: {train_accurcy:.4f}"

                print(out_string)



    def predict(self, x):
        return self.forward(x)


In [6]:
# Sanity check
X = np.random.randn(5, 4)              # batch=5, in_dim=4
y = np.random.randn(5, 3)              # regression target for mse (batch=5, out_dim=3)

net = NeuralNetwork(in_dim=4, layers=[8, 8], out_dim=3, loss="mse")
pred = net.forward(X)
print("pred shape:", pred.shape)

loss_val = net.loss.loss(pred, y)
grads = net.loss.gradient(pred, y)
net.backward(grads)

for i, layer in enumerate(net.layers):
    print(f"Layer {i}: W {layer.weights.shape}, dW {layer.grad_weight.shape}, b {layer.bias.shape}, db {layer.grad_bias.shape}")
print("loss:", loss_val)


pred shape: (5, 3)
Layer 0: W (4, 8), dW (4, 8), b (8,), db (8,)
Layer 1: W (8, 8), dW (8, 8), b (8,), db (8,)
Layer 2: W (8, 3), dW (8, 3), b (3,), db (3,)
loss: 1.102111510650848


In [7]:
def one_hot(y, num_classes):
    y = np.asarray(y).astype(int)
    oh = np.zeros((y.shape[0], num_classes))
    oh[np.arange(y.shape[0]), y] = 1.0
    return oh



def numerical_grad_weight(net, X, Y, layer_index=0, eps=1e-5):
    # compute numerical gradient for W in net.layers[layer_index]
    layer = net.layers[layer_index]
    W = layer.weights
    numgrad = np.zeros_like(W)

    # baseline
    base_pred = net.forward(X)
    base_loss = net.loss.loss(base_pred, Y)

    for i in range(W.shape[0]):
        for j in range(W.shape[1]):
            old = W[i, j]

            W[i, j] = old + eps
            lp = net.loss.loss(net.forward(X), Y)

            W[i, j] = old - eps
            lm = net.loss.loss(net.forward(X), Y)

            numgrad[i, j] = (lp - lm) / (2 * eps)
            W[i, j] = old

    return numgrad, base_loss

# small classification net for the check
Xc = np.random.randn(6, 3)
yc = one_hot(np.random.randint(0, 2, size=6), 2)

netc = NeuralNetwork(in_dim=3, layers=[5], out_dim=2, loss="cross_entropy")

# analytic grad
pred = netc.forward(Xc)
loss_val = netc.loss.loss(pred, yc)
grads = netc.loss.gradient(pred, yc)
netc.backward(grads)
analytic = netc.layers[0].grad_weight.copy()

# numerical grad
num, _ = numerical_grad_weight(netc, Xc, yc, layer_index=0, eps=1e-5)

err = np.linalg.norm(analytic - num)
print("Loss:", loss_val)
print("error:", err)



Loss: 0.6298078461391744
error: 1.6280489075489293e-11


In [10]:
import pandas as pd

df = pd.read_csv("/content/sample_data/california_housing_train.csv")

X , y = df.iloc[: , :-1].values , df.iloc[: , -1 : ].values /10000

#X = (X  - np.mean(X , axis= 0 )) / np.std(X , axis= 0 )
perm = np.random.permutation(X.shape[0])
X_shuf, y_shuf = X[perm], y[perm]

split = int(X.shape[0] * 0.8)
X_train , y_train = X_shuf[:split] , y_shuf[:split]
X_val , y_val = X_shuf[split:] , y_shuf[split:]

mean =np.mean(X_train , axis = 0 )
std = np.std(X_train , axis = 0 )

model = NeuralNetwork(X.shape[1] , [16  , 32 , 32 , 16] , y.shape[1] , "mse")

model.train(X_train , y_train , 1000 , 0.001 , 64 , verbose_every=1 , x_val = X_val , y_val = y_val)



  z = x @ self.weights + self.bias
  return np.mean(np.power(x - y, 2))
  self.grad_weight = self.input.T @ grad
  grad_to_input = grad @ self.weights.T
  grad = (self.output > 0) * gradients


Epoch 1/1000 - Train_Loss: nan Validation_Loss: nan 
Epoch 2/1000 - Train_Loss: nan Validation_Loss: nan 
Epoch 3/1000 - Train_Loss: nan Validation_Loss: nan 
Epoch 4/1000 - Train_Loss: nan Validation_Loss: nan 
Epoch 5/1000 - Train_Loss: nan Validation_Loss: nan 
Epoch 6/1000 - Train_Loss: nan Validation_Loss: nan 
Epoch 7/1000 - Train_Loss: nan Validation_Loss: nan 
Epoch 8/1000 - Train_Loss: nan Validation_Loss: nan 
Epoch 9/1000 - Train_Loss: nan Validation_Loss: nan 
Epoch 10/1000 - Train_Loss: nan Validation_Loss: nan 
Epoch 11/1000 - Train_Loss: nan Validation_Loss: nan 
Epoch 12/1000 - Train_Loss: nan Validation_Loss: nan 
Epoch 13/1000 - Train_Loss: nan Validation_Loss: nan 
Epoch 14/1000 - Train_Loss: nan Validation_Loss: nan 
Epoch 15/1000 - Train_Loss: nan Validation_Loss: nan 
Epoch 16/1000 - Train_Loss: nan Validation_Loss: nan 
Epoch 17/1000 - Train_Loss: nan Validation_Loss: nan 
Epoch 18/1000 - Train_Loss: nan Validation_Loss: nan 
Epoch 19/1000 - Train_Loss: nan Valid

KeyboardInterrupt: 

In [11]:

df = pd.read_csv("/content/sample_data/mnist_train_small.csv")

X , y = df.iloc[: ,1 : ].values , df.iloc[: , 0 ].values

n_class = len(set(y))
y = one_hot(y , n_class)

perm = np.random.permutation(X.shape[0])
X_shuf, y_shuf = X[perm], y[perm]

split = int(X.shape[0] * 0.8)
X_train , y_train = X_shuf[:split] , y_shuf[:split]
X_val , y_val = X_shuf[split:] , y_shuf[split:]

In [13]:
model = NeuralNetwork(X.shape[1] , [16  , 32 , 32 , 16] , y.shape[1] , "cross_entropy")

model.train(X , y , 1000 , 0.001, 64 , verbose_every=1 , x_val = X_val , y_val = y_val)


Epoch 1/1000 - Train_Loss: 4.8979 Validation_Loss: 1.9264 Validation_acc: 0.2875
Epoch 2/1000 - Train_Loss: 1.9137 Validation_Loss: 1.8489 Validation_acc: 0.3310
Epoch 3/1000 - Train_Loss: 1.8410 Validation_Loss: 1.7700 Validation_acc: 0.3548
Epoch 4/1000 - Train_Loss: 1.8012 Validation_Loss: 1.7784 Validation_acc: 0.3362
Epoch 5/1000 - Train_Loss: 1.7655 Validation_Loss: 1.7043 Validation_acc: 0.3680
Epoch 6/1000 - Train_Loss: 1.7368 Validation_Loss: 1.7762 Validation_acc: 0.3688
Epoch 7/1000 - Train_Loss: 1.7188 Validation_Loss: 1.6644 Validation_acc: 0.3787
Epoch 8/1000 - Train_Loss: 1.7003 Validation_Loss: 1.6528 Validation_acc: 0.3830
Epoch 9/1000 - Train_Loss: 1.6893 Validation_Loss: 1.6551 Validation_acc: 0.3812
Epoch 10/1000 - Train_Loss: 1.6777 Validation_Loss: 1.6403 Validation_acc: 0.3877
Epoch 11/1000 - Train_Loss: 1.6661 Validation_Loss: 1.6256 Validation_acc: 0.3827
Epoch 12/1000 - Train_Loss: 1.6511 Validation_Loss: 1.6265 Validation_acc: 0.3857
Epoch 13/1000 - Train_Los

KeyboardInterrupt: 