In [24]:
import numpy as np
from typing import Tuple

# --- Activations and Loss ---
def relu(x: np.ndarray) -> np.ndarray:
    """Rectified linear unit: element-wise ``max(0, x)``.

    Args:
        x: Array of pre-activation values.

    Returns:
        Array of the same shape with every negative entry replaced by 0.
    """
    floor = 0
    return np.maximum(floor, x)

def relu_deriv(x: np.ndarray) -> np.ndarray:
    """Derivative of ReLU: 1.0 where ``x`` is strictly positive, 0.0 elsewhere.

    Args:
        x: Array of values at which the derivative is evaluated.

    Returns:
        Float array of the same shape containing only 0.0 and 1.0.
    """
    positive_mask = np.greater(x, 0)
    return positive_mask.astype(float)

def softmax(x: np.ndarray) -> np.ndarray:
    """Softmax over *all* elements of ``x`` (treated as one distribution).

    The global maximum is subtracted before exponentiating so that large
    logits do not overflow; this shift does not change the result.

    Args:
        x: Array of logits.

    Returns:
        Array of the same shape whose entries are positive and sum to 1.
    """
    peak = np.max(x)
    exps = np.exp(x - peak)
    total = np.sum(exps)
    return exps / total

# single data point
def cross_entropy_loss(pred: np.ndarray, label: int) -> float:
    """Cross-entropy loss for one sample: ``-log(pred[label] + 1e-7)``.

    Args:
        pred: Predicted class probabilities; ``pred[label]`` must hold exactly
            one element (e.g. a 1-D vector or an ``(n_classes, 1)`` column).
        label: Index of the true class.

    Returns:
        The loss as a Python float. The small constant keeps the logarithm
        finite when the predicted probability is 0.
    """
    true_class_prob = pred[label].item()
    return -float(np.log(true_class_prob + 1e-7))

def cross_entropy_deriv(pred: np.ndarray, label: int) -> np.ndarray:
    """Return ``pred - y_true`` where ``y_true`` is the one-hot target.

    Args:
        pred: Predicted class probabilities (not modified).
        label: Index of the true class.

    Returns:
        New array of the same shape as ``pred`` with 1 subtracted at ``label``.
    """
    # Build the one-hot target explicitly, then subtract it from the prediction.
    one_hot = np.zeros_like(pred)
    one_hot[label] = 1
    return pred - one_hot

def flatten(x: np.ndarray) -> np.ndarray:
    """Reshape ``x`` into a single column vector of shape ``(x.size, 1)``.

    Args:
        x: Array of any shape.

    Returns:
        Column vector holding the elements of ``x`` in row-major order.
    """
    column_shape = (x.size, 1)
    return x.reshape(column_shape)

def unflatten(x: np.ndarray, shape: Tuple[int, int, int]) -> np.ndarray:
    """Inverse of ``flatten``: reshape ``x`` back to ``shape``.

    Args:
        x: Array whose number of elements matches ``shape``.
        shape: Target shape to restore.

    Returns:
        ``x`` reshaped to ``shape``.
    """
    restored = x.reshape(shape)
    return restored


In [25]:
import numpy as np
from typing import Optional, Tuple

class Conv2D:
    """Naive single-sample 2D convolution layer trained with plain SGD.

    Each of the ``out_channels`` filters has shape
    ``(in_channels, kernel_size, kernel_size)`` and is slid over an input of
    shape ``(in_channels, H, W)`` without flipping the kernel. Every filter
    combines information from all input channels and has one scalar bias.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int = 1,
        padding: int = 0
    ):
        """Create the layer and randomly initialise its learnable parameters.

        Args:
            in_channels: Number of channels in the input.
            out_channels: Number of filters (= number of output feature maps).
            kernel_size: Height and width of each square filter.
            stride: Step between neighbouring filter positions.
            padding: Number of zero pixels added on every spatial border.
        """
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding

        # He initialisation: std = sqrt(2 / fan_in), fan_in = in_channels * K * K.
        # For y = Wx with n inputs, var(y) = n * var(W) * var(x), so the weight
        # variance is scaled by 1/n (times 2 for ReLU) to stop it from exploding.
        scale = np.sqrt(2. / (in_channels * kernel_size * kernel_size))
        # randn draws from N(0, 1); multiplying by `scale` only changes the std.
        self.filters = np.random.randn(out_channels, in_channels, kernel_size, kernel_size) * scale
        # One scalar bias per filter.
        self.biases = np.zeros((out_channels, 1))

    def forward(self, input: np.ndarray) -> np.ndarray:
        """Compute the layer output for one sample.

        Args:
            input: Array of shape ``(in_channels, H, W)``.

        Returns:
            Array of shape ``(out_channels, H_out, W_out)`` where
            ``H_out = (H + 2 * padding - kernel_size) // stride + 1`` (same for W).

        Raises:
            ValueError: If the number of input channels differs from
                ``in_channels``. Without this check NumPy broadcasting would
                silently accept a mismatch whenever one of the two is 1.
        """
        C, H, W = input.shape
        if C != self.in_channels:
            raise ValueError(
                f"Conv2D expected {self.in_channels} input channel(s), got {C}"
            )

        # Cached for the backward pass.
        self.last_input = input

        K = self.kernel_size
        S = self.stride
        P = self.padding

        # Zero-pad the two spatial axes only; the channel axis is left alone.
        if P > 0:
            padded_input = np.pad(input, ((0, 0), (P, P), (P, P)), mode='constant')
        else:
            padded_input = input

        self.padded_input = padded_input

        H_out = (H + 2 * P - K) // S + 1
        W_out = (W + 2 * P - K) // S + 1

        output = np.zeros((self.out_channels, H_out, W_out))

        for oc in range(self.out_channels):
            for i in range(H_out):
                for j in range(W_out):
                    # Patch of shape (in_channels, K, K) under the filter.
                    region = padded_input[:, i*S:i*S+K, j*S:j*S+K]
                    # Element-wise product summed over channels and the window.
                    output[oc, i, j] = np.sum(region * self.filters[oc]) + self.biases[oc].item()
        self.last_output = output
        return output

    def backward(self, dL_dout: np.ndarray, lr: float) -> np.ndarray:
        """Back-propagate through the layer and apply one SGD step.

        Must be called after ``forward``; it uses the cached padded input.

        Args:
            dL_dout: Gradient of the loss w.r.t. the layer output, shape
                ``(out_channels, H_out, W_out)``.
            lr: Learning rate used to update ``filters`` and ``biases`` in place.

        Returns:
            Gradient of the loss w.r.t. the (unpadded) input, with the same
            shape as the input given to ``forward``.
        """
        K = self.kernel_size
        S = self.stride
        P = self.padding
        H_out, W_out = dL_dout.shape[1:]

        dL_dfilters = np.zeros_like(self.filters)
        dL_dbiases = np.zeros_like(self.biases)

        # The input gradient is real-valued even when the input is an integer
        # image; allocating it with the input's integer dtype would make the
        # in-place accumulation below fail. Floating inputs keep their dtype.
        input_dtype = self.padded_input.dtype
        if np.issubdtype(input_dtype, np.floating):
            grad_dtype = input_dtype
        else:
            grad_dtype = self.filters.dtype
        dL_dinput_padded = np.zeros(self.padded_input.shape, dtype=grad_dtype)

        for oc in range(self.out_channels):
            for i in range(H_out):
                for j in range(W_out):
                    upstream = dL_dout[oc, i, j]
                    region = self.padded_input[:, i*S:i*S+K, j*S:j*S+K]
                    # dL/dfilter = dL/dout * dout/dfilter = upstream * input patch.
                    dL_dfilters[oc] += upstream * region
                    dL_dbiases[oc] += upstream
                    # Each input pixel in the window contributed through the
                    # matching filter weight, so it receives upstream * filter.
                    dL_dinput_padded[:, i*S:i*S+K, j*S:j*S+K] += upstream * self.filters[oc]

        # Gradient descent update (after the loop, so the input gradient above
        # was computed with the pre-update filters).
        self.filters -= lr * dL_dfilters
        self.biases -= lr * dL_dbiases

        # Strip the P border pixels that forward() added on each side.
        if P > 0:
            return dL_dinput_padded[:, P:-P, P:-P]
        return dL_dinput_padded


In [26]:
class MaxPool2x2:
    """2x2 max pooling with stride 2 for a single ``(channels, H, W)`` sample.

    Only complete 2x2 windows are pooled: when H or W is odd, the trailing
    row/column is ignored in the forward pass and receives zero gradient.
    """

    def forward(self, input: np.ndarray) -> np.ndarray:
        """Pool every complete 2x2 window down to its maximum.

        Args:
            input: Array of shape ``(channels, H, W)``.

        Returns:
            Array of shape ``(channels, H // 2, W // 2)``.
        """
        self.last_input = input
        n_filters, h, w = input.shape
        h_out, w_out = h // 2, w // 2
        out = np.zeros((n_filters, h_out, w_out))
        # 1 at the position selected as the maximum of each window, 0 elsewhere.
        self.max_pos = np.zeros_like(input)

        for f in range(n_filters):
            # Iterate over output cells so an odd trailing row/column never
            # produces an incomplete window or an out-of-range output index.
            for i in range(h_out):
                for j in range(w_out):
                    top, left = 2 * i, 2 * j
                    region = input[f, top:top+2, left:left+2]
                    # argmax returns the FIRST maximum in row-major order, so
                    # exactly one position per window is marked even when
                    # several entries tie (e.g. an all-zero patch after ReLU).
                    ii, jj = divmod(int(np.argmax(region)), 2)
                    out[f, i, j] = region[ii, jj]
                    self.max_pos[f, top + ii, left + jj] = 1
        return out

    def backward(self, dL_dout: np.ndarray) -> np.ndarray:
        """Route each upstream gradient to the position that produced the max.

        Must be called after ``forward``.

        Args:
            dL_dout: Gradient of the loss w.r.t. the pooled output, shape
                ``(channels, H // 2, W // 2)``.

        Returns:
            Gradient of the loss w.r.t. the pooling input, same shape as the
            input given to ``forward``; non-maximum positions are zero.
        """
        # Gradients are real-valued even if the pooled input was an integer
        # array; floating inputs keep their own dtype.
        input_dtype = self.last_input.dtype
        grad_dtype = input_dtype if np.issubdtype(input_dtype, np.floating) else float
        dL_dinput = np.zeros(self.last_input.shape, dtype=grad_dtype)

        for f in range(dL_dout.shape[0]):
            for i in range(dL_dout.shape[1]):
                for j in range(dL_dout.shape[2]):
                    rows = slice(2 * i, 2 * i + 2)
                    cols = slice(2 * j, 2 * j + 2)
                    # The mask holds a single 1 per window, so the scalar
                    # gradient lands on exactly one input element.
                    dL_dinput[f, rows, cols] += self.max_pos[f, rows, cols] * dL_dout[f, i, j]

        return dL_dinput


In [27]:
class Dense:
    """Fully connected layer ``y = W x + b`` for a single column-vector sample."""

    def __init__(self, input_size: int, output_size: int):
        """Initialise weights from N(0, 0.1^2) and biases to zero.

        Args:
            input_size: Number of input features.
            output_size: Number of output units.
        """
        self.weights = 0.1 * np.random.randn(output_size, input_size)
        self.bias = np.zeros((output_size, 1))

    def forward(self, x: np.ndarray) -> np.ndarray:
        """Return ``W x + b`` and cache ``x`` for the backward pass.

        Args:
            x: Input column vector of shape ``(input_size, 1)``.

        Returns:
            Output column vector of shape ``(output_size, 1)``.
        """
        self.last_input = x
        projected = self.weights.dot(x)
        return projected + self.bias

    def backward(self, dL_dout: np.ndarray, lr: float) -> np.ndarray:
        """Back-propagate through the layer and apply one SGD step in place.

        Args:
            dL_dout: Gradient of the loss w.r.t. the layer output.
            lr: Learning rate.

        Returns:
            Gradient of the loss w.r.t. the layer input, computed with the
            weights as they were before this update.
        """
        # Input gradient first: it must use the pre-update weights.
        grad_input = np.dot(self.weights.T, dL_dout)
        grad_weights = np.dot(dL_dout, self.last_input.T)
        grad_bias = dL_dout

        self.weights -= lr * grad_weights
        self.bias -= lr * grad_bias
        return grad_input


In [28]:
import tensorflow
from tensorflow.keras.datasets import mnist
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Fetch the MNIST train/test split
(train_X, train_y), (test_X, test_y) = mnist.load_data()

# Divide by 255 to normalize pixel values to [0, 1], then add a channel axis
# so the arrays have shape (num_samples, 1, 28, 28)
train_X = (train_X / 255.0).reshape(-1, 1, 28, 28)
test_X = (test_X / 255.0).reshape(-1, 1, 28, 28)

# Labels are left as integer class indices; one-hot encoding is only needed if the loss functions are changed to expect it.


In [29]:
class SimpleCNN:
    """Conv2D(3x3, 1 filter) -> ReLU -> MaxPool2x2 -> Dense(169 -> 10) -> softmax."""

    def __init__(self):
        """Build the three layers for a 1x28x28 grayscale input."""
        # 1 input channel, 1 filter, 3x3 kernel, stride 1, no padding: 28x28 -> 26x26
        self.conv = Conv2D(in_channels=1, out_channels=1, kernel_size=3, stride=1, padding=0)

        # 2x2 pooling halves each spatial dimension: 26x26 -> 13x13
        self.pool = MaxPool2x2()

        # Flattened feature count: 1 channel * 13 * 13 = 169
        self.fc = Dense(input_size=169, output_size=10)

    def forward(self, x: np.ndarray) -> np.ndarray:
        """Run one sample through the network and return class probabilities.

        Caches ``relu_out``, ``shape_before_flat`` and ``logits`` on the
        instance; ``train_step`` reads the first two during back-propagation.
        """
        conv_out = self.conv.forward(x)
        self.relu_out = relu(conv_out)
        pooled = self.pool.forward(self.relu_out)
        self.shape_before_flat = pooled.shape
        self.logits = self.fc.forward(flatten(pooled))
        return softmax(self.logits)

    def train_step(self, x: np.ndarray, y_true: int, lr: float = 0.01) -> Tuple[float, np.ndarray]:
        """Do one forward/backward pass on a single sample and update all layers.

        Args:
            x: Input sample passed to ``forward``.
            y_true: Index of the true class.
            lr: Learning rate handed to the Dense and Conv2D updates.

        Returns:
            ``(loss, probabilities)`` computed before the parameter update.
        """
        probs = self.forward(x)
        loss = cross_entropy_loss(probs, y_true)
        # pred - y_true, used directly as the gradient fed into the Dense layer.
        grad_logits = cross_entropy_deriv(probs, y_true).reshape(-1, 1)

        # Walk the layers in reverse order of the forward pass.
        grad_flat = self.fc.backward(grad_logits, lr)
        grad_pooled = unflatten(grad_flat, self.shape_before_flat)
        grad_relu_out = self.pool.backward(grad_pooled)
        grad_conv_out = grad_relu_out * relu_deriv(self.relu_out)
        self.conv.backward(grad_conv_out, lr)

        return loss, probs
# During backpropagation, the gradient returned by each layer has the same shape as that layer's input, because it measures how the loss changes with respect to every element of that input.

In [31]:
# Train with per-sample SGD (one parameter update per training image).
model = SimpleCNN()
epochs = 1
lr = 0.01

for epoch in range(epochs):
    # Running totals for the epoch summary printed below.
    loss_sum = 0
    correct = 0
    # Pair every image with its label; `i` is the sample index within the epoch.
    for i, (x, y) in enumerate(zip(train_X, train_y)):
        loss, out = model.train_step(x, y, lr)
        loss_sum += loss
        pred = np.argmax(out)
        correct += (pred == y)

        # Log the current sample's loss every 1000 steps.
        if i % 1000 == 0:
            print(f"Step {i}, Loss: {loss:.4f}")

    print(f"Epoch {epoch+1}: Avg Loss = {loss_sum / len(train_X):.4f}, Accuracy = {correct / len(train_X):.4f}")


Step 0, Loss: 1.8340
Step 1000, Loss: 0.0241
Step 2000, Loss: 1.9090
Step 3000, Loss: 1.7064
Step 4000, Loss: 0.0604
Step 5000, Loss: 0.0772
Step 6000, Loss: 0.0001
Step 7000, Loss: 0.5587
Step 8000, Loss: 0.0015
Step 9000, Loss: 0.1391
Step 10000, Loss: 0.0001
Step 11000, Loss: 0.0111
Step 12000, Loss: 0.0310
Step 13000, Loss: 0.4963
Step 14000, Loss: 0.0243
Step 15000, Loss: 0.0666
Step 16000, Loss: 0.0136
Step 17000, Loss: 0.0152
Step 18000, Loss: 0.2762
Step 19000, Loss: 0.0117
Step 20000, Loss: 0.0002
Step 21000, Loss: 0.7302
Step 22000, Loss: 0.1260
Step 23000, Loss: 0.0000
Step 24000, Loss: 0.0724
Step 25000, Loss: 0.4257
Step 26000, Loss: 0.0065
Step 27000, Loss: 0.5249
Step 28000, Loss: 0.0034
Step 29000, Loss: 0.0191
Step 30000, Loss: 0.0020
Step 31000, Loss: 1.8763
Step 32000, Loss: 0.0147
Step 33000, Loss: 1.0887
Step 34000, Loss: 0.0297
Step 35000, Loss: 0.0103
Step 36000, Loss: 0.1104
Step 37000, Loss: 0.0014
Step 38000, Loss: 0.0996
Step 39000, Loss: 0.7441
Step 40000, L