In [10]:
import numpy as np
import struct
import gzip
import urllib.request
import os
import matplotlib.pyplot as plt 

In [11]:
# -------------------------
# MNIST Data Loader (using Google Cloud Storage)
# -------------------------
def download_mnist(path='mnist'):
    """Download the four gzipped MNIST IDX files into ``path`` if missing.

    Files that already exist in ``path`` are left untouched. Every missing
    file is first fetched to ``<name>.part`` and only renamed to its final
    name after the transfer has finished, so an interrupted download is never
    mistaken for a complete file on the next run.

    Args:
        path: Directory that holds (or will hold) the dataset files. It is
            created when it does not exist.

    Raises:
        Whatever ``urllib.request.urlretrieve`` or the file-system calls
        raise; any partially written ``.part`` file is removed first.
    """
    os.makedirs(path, exist_ok=True)
    base_url = "https://storage.googleapis.com/cvdf-datasets/mnist/"
    filenames = (
        "train-images-idx3-ubyte.gz",
        "train-labels-idx1-ubyte.gz",
        "t10k-images-idx3-ubyte.gz",
        "t10k-labels-idx1-ubyte.gz",
    )
    for filename in filenames:
        filepath = os.path.join(path, filename)
        if os.path.exists(filepath):
            continue
        print(f"Downloading {filename}...")
        partial_path = filepath + ".part"
        try:
            urllib.request.urlretrieve(base_url + filename, partial_path)
            # Atomic rename: the final name only ever refers to a full file.
            os.replace(partial_path, filepath)
        finally:
            # Still present only when the download or the rename failed.
            if os.path.exists(partial_path):
                os.remove(partial_path)
    print("MNIST dataset downloaded.")

def load_mnist_images(filename):
    """Read a gzipped IDX3 (unsigned byte) image file.

    Args:
        filename: Path to a ``*-images-idx3-ubyte.gz`` file.

    Returns:
        ``float32`` array of shape ``(num, rows, cols)`` with pixel values
        scaled from ``0..255`` to ``[0, 1]``.

    Raises:
        ValueError: if the header's magic number is not 2051 (the IDX3
            unsigned-byte marker) or the payload size does not match the
            dimensions declared in the header.
    """
    with gzip.open(filename, 'rb') as f:
        # Header: four big-endian uint32 values.
        magic, num, rows, cols = struct.unpack(">IIII", f.read(16))
        if magic != 2051:
            raise ValueError(
                f"{filename}: not an IDX3 image file (magic number {magic}, expected 2051)"
            )
        pixels = np.frombuffer(f.read(), dtype=np.uint8)

    expected_size = num * rows * cols
    if pixels.size != expected_size:
        raise ValueError(
            f"{filename}: header declares {expected_size} pixels but file contains {pixels.size}"
        )
    # astype() makes a writable float32 copy of the read-only buffer view.
    return pixels.reshape(num, rows, cols).astype(np.float32) / 255.0  # Normalize to [0,1]

def load_mnist_labels(filename):
    """Read a gzipped IDX1 (unsigned byte) label file.

    Args:
        filename: Path to a ``*-labels-idx1-ubyte.gz`` file.

    Returns:
        1-D ``uint8`` array holding one label per item.

    Raises:
        ValueError: if the header's magic number is not 2049 (the IDX1
            unsigned-byte marker) or the number of labels in the payload
            differs from the count declared in the header.
    """
    with gzip.open(filename, 'rb') as f:
        # Header: two big-endian uint32 values.
        magic, num = struct.unpack(">II", f.read(8))
        if magic != 2049:
            raise ValueError(
                f"{filename}: not an IDX1 label file (magic number {magic}, expected 2049)"
            )
        labels = np.frombuffer(f.read(), dtype=np.uint8)

    if labels.size != num:
        raise ValueError(
            f"{filename}: header declares {num} labels but file contains {labels.size}"
        )
    return labels

def load_mnist(path='mnist'):
    """Ensure the MNIST files exist in ``path`` and decode all four of them.

    Args:
        path: Directory passed to ``download_mnist`` and searched for the
            gzipped IDX files.

    Returns:
        Tuple ``(train_images, train_labels, test_images, test_labels)`` as
        produced by ``load_mnist_images`` / ``load_mnist_labels``.
    """
    download_mnist(path)

    def in_dataset_dir(name):
        # Resolve a dataset file name relative to the dataset directory.
        return os.path.join(path, name)

    training_x = load_mnist_images(in_dataset_dir("train-images-idx3-ubyte.gz"))
    training_y = load_mnist_labels(in_dataset_dir("train-labels-idx1-ubyte.gz"))
    testing_x = load_mnist_images(in_dataset_dir("t10k-images-idx3-ubyte.gz"))
    testing_y = load_mnist_labels(in_dataset_dir("t10k-labels-idx1-ubyte.gz"))
    return training_x, training_y, testing_x, testing_y

# Fetch (if necessary) and decode the MNIST train and test splits.
train_images, train_labels, test_images, test_labels = load_mnist()

# Add a trailing channel axis: (num_samples, 28, 28) -> (num_samples, 28, 28, 1).
train_images = train_images.reshape((train_images.shape[0], 28, 28, 1))
test_images = test_images.reshape((test_images.shape[0], 28, 28, 1))

print(f"Train images shape: {train_images.shape}")
print(f"Test images shape: {test_images.shape}")


# --- Helper for weight initialization ---
def he_initialize_weights(input_dim, output_dim):
    """Draw an ``(input_dim, output_dim)`` weight matrix with He scaling.

    Entries are standard-normal samples from NumPy's global RNG multiplied by
    ``sqrt(2 / input_dim)``.
    """
    scale = np.sqrt(2.0 / input_dim)
    gaussian = np.random.randn(input_dim, output_dim)
    return gaussian * scale

def he_initialize_conv_filters(filter_height, filter_width, input_channels, output_channels):
    """Draw a convolution filter bank with He scaling.

    Returns an array of shape
    ``(filter_height, filter_width, input_channels, output_channels)`` whose
    entries are standard-normal samples from NumPy's global RNG multiplied by
    ``sqrt(2 / fan_in)``, where ``fan_in`` is the number of input values seen
    by one filter.
    """
    fan_in = filter_height * filter_width * input_channels
    gaussian = np.random.randn(filter_height, filter_width, input_channels, output_channels)
    return gaussian * np.sqrt(2.0 / fan_in)

def xavier_initialize_weights(input_dim, output_dim):
    """Draw an ``(input_dim, output_dim)`` weight matrix scaled by fan-in.

    Entries are standard-normal samples from NumPy's global RNG multiplied by
    ``sqrt(1 / input_dim)``. Note that only the input dimension enters the
    scale; ``output_dim`` affects the shape alone.
    """
    scale = np.sqrt(1.0 / input_dim)
    gaussian = np.random.randn(input_dim, output_dim)
    return gaussian * scale

MNIST dataset downloaded.
Train images shape: (60000, 28, 28, 1)
Test images shape: (10000, 28, 28, 1)


In [12]:
class Sigmoid:
    """Element-wise logistic activation ``1 / (1 + exp(-x))``.

    The forward result is cached because the derivative can be expressed
    purely in terms of it: ``s * (1 - s)``.
    """

    def __init__(self):
        # Activation from the most recent forward() call; None until then.
        self.output = None

    def forward(self, x):
        """Apply the sigmoid to ``x`` and remember the result for backward()."""
        # Limit the exponent argument to [-700, 700] before calling exp.
        bounded = np.clip(x, -700, 700)
        self.output = 1 / (1 + np.exp(-bounded))
        return self.output

    def backward(self, dout):
        """Scale the upstream gradient ``dout`` by the sigmoid derivative."""
        activation = self.output
        return dout * activation * (1 - activation)

    def get_params(self):
        """Return the (empty) list of trainable parameters."""
        return []

    def get_grads(self):
        """Return the (empty) list of parameter gradients."""
        return []

    def zero_grads(self):
        """Nothing to reset: the layer has no parameters."""
        pass


class ReLU:
    """Rectified linear activation ``max(0, x)``."""

    def __init__(self):
        # Input of the most recent forward() call; None until then.
        self.input = None

    def forward(self, x):
        """Return ``x`` with negative entries replaced by zero; caches ``x``."""
        self.input = x
        return np.maximum(0, x)

    def backward(self, dout):
        """Pass ``dout`` through where the cached input was positive.

        A copy of ``dout`` is returned, so the caller's array is not modified.
        If forward() has not been called yet the copy is returned unchanged.
        """
        grad = dout.copy()
        if self.input is None:
            return grad
        inactive = self.input <= 0
        grad[inactive] = 0
        return grad

    def get_params(self):
        """Return the (empty) list of trainable parameters."""
        return []

    def get_grads(self):
        """Return the (empty) list of parameter gradients."""
        return []

    def zero_grads(self):
        """Nothing to reset: the layer has no parameters."""
        pass
        

class LeakyReLU:
    """Leaky rectifier: ``x`` for ``x > 0``, ``alpha * x`` otherwise."""

    def __init__(self, alpha=0.01):
        # Input of the most recent forward() call; None until then.
        self.input = None
        # Slope applied to non-positive inputs.
        self.alpha = alpha

    def forward(self, x):
        """Apply the activation element-wise and cache ``x`` for backward()."""
        self.input = x
        leaked = self.alpha * x
        return np.where(x > 0, x, leaked)

    def backward(self, dout):
        """Scale ``dout`` by ``alpha`` wherever the cached input was <= 0.

        A copy of ``dout`` is returned, so the caller's array is not modified.
        If forward() has not been called yet the copy is returned unchanged.
        """
        grad = dout.copy()
        if self.input is None:
            return grad
        non_positive = self.input <= 0
        grad[non_positive] *= self.alpha
        return grad

    def get_params(self):
        """Return the (empty) list of trainable parameters."""
        return []

    def get_grads(self):
        """Return the (empty) list of parameter gradients."""
        return []

    def zero_grads(self):
        """Nothing to reset: the layer has no parameters."""
        pass

class Tanh:
    """Element-wise hyperbolic tangent activation."""

    def __init__(self):
        # Activation from the most recent forward() call; None until then.
        self.output = None

    def forward(self, x):
        """Return ``tanh(x)`` and cache it for backward()."""
        self.output = np.tanh(x)
        return self.output

    def backward(self, dout):
        """Scale ``dout`` by the derivative ``1 - tanh(x)^2``."""
        activation = self.output
        return dout * (1 - activation * activation)

    def get_params(self):
        """Return the (empty) list of trainable parameters."""
        return []

    def get_grads(self):
        """Return the (empty) list of parameter gradients."""
        return []

    def zero_grads(self):
        """Nothing to reset: the layer has no parameters."""
        pass


class BinaryCrossEntropyLoss:
    """Mean binary cross-entropy between predicted probabilities and targets.

    ``forward`` returns the loss averaged over *every* element of the
    prediction array; ``backward`` returns the gradient of exactly that mean
    with respect to the predictions.
    """

    def __init__(self, epsilon=1e-12):
        # Predictions are clipped to [epsilon, 1 - epsilon] so that log() and
        # the divisions in backward() never see 0 or 1.
        self.epsilon = epsilon
        # Clipped predictions / targets cached by the latest forward() call.
        self.y_pred = None
        self.y_true = None

    def forward(self, y_pred, y_true):
        """Return the mean cross-entropy and cache the (clipped) inputs.

        Args:
            y_pred: Predicted probabilities.
            y_true: Targets, broadcast-compatible with ``y_pred``.
        """
        self.y_pred = np.clip(y_pred, self.epsilon, 1. - self.epsilon)
        # asarray lets plain Python lists be used as targets.
        self.y_true = np.asarray(y_true)
        loss = - (self.y_true * np.log(self.y_pred) + \
                  (1 - self.y_true) * np.log(1 - self.y_pred))
        return np.mean(loss)

    def backward(self, y_pred=None, y_true=None):
        """Return d(mean loss)/d(y_pred).

        Args:
            y_pred: Predictions to differentiate at. When either argument is
                omitted, the values cached by the last forward() are used for
                both.
            y_true: Targets matching ``y_pred``.

        Raises:
            ValueError: if no arguments are given and forward() has not been
                called yet.
        """
        if y_pred is None or y_true is None:
            y_pred = self.y_pred
            y_true = self.y_true
            if y_pred is None or y_true is None:
                raise ValueError("backward() needs y_pred and y_true, or a prior forward() call")
        else:  # Ensure clipping if new values are passed
            y_pred = np.clip(y_pred, self.epsilon, 1. - self.epsilon)
            y_true = np.asarray(y_true)

        grad = - (y_true / y_pred - (1 - y_true) / (1 - y_pred))
        # forward() averages over all elements, so the matching normaliser is
        # the element count. For (N,) and (N, 1) predictions this equals the
        # batch size; for (N, K) it is N * K rather than N.
        num_elements = np.size(y_pred)
        if num_elements > 0:
            grad = grad / num_elements
        return grad


In [13]:
class DenseLayer:
    """Fully connected layer computing ``input . weights + biases``."""

    def __init__(self, input_size, output_size, weight_initializer=he_initialize_weights):
        # Parameters: weights come from the initializer, biases start at zero
        # with shape (1, output_size) so they broadcast over the batch.
        self.weights = weight_initializer(input_size, output_size)
        self.biases = np.zeros((1, output_size))
        # Gradient buffers shaped like their parameters.
        self.grad_weights = np.zeros_like(self.weights)
        self.grad_biases = np.zeros_like(self.biases)
        # Input of the most recent forward pass; needed for the weight gradient.
        self.input_data = None

    def forward(self, input_data):
        """Return the affine transform of ``input_data`` and cache the input."""
        self.input_data = input_data
        projected = np.dot(input_data, self.weights)
        return projected + self.biases

    def backward(self, dout):
        """Store the parameter gradients and return the input gradient.

        Args:
            dout: Gradient of the loss with respect to this layer's output.
        """
        cached_input = self.input_data
        self.grad_weights = np.dot(cached_input.T, dout)
        # Biases are shared across the batch, so their gradient sums over it.
        self.grad_biases = np.sum(dout, axis=0, keepdims=True)
        return np.dot(dout, self.weights.T)

    def get_params(self):
        """Return ``[weights, biases]``."""
        return [self.weights, self.biases]

    def get_grads(self):
        """Return ``[grad_weights, grad_biases]`` in the order of get_params()."""
        return [self.grad_weights, self.grad_biases]

    def zero_grads(self):
        """Replace both gradient buffers with fresh zero arrays."""
        self.grad_weights = np.zeros_like(self.weights)
        self.grad_biases = np.zeros_like(self.biases)

class FlattenLayer:
    """Collapse every non-batch axis into a single feature axis."""

    def __init__(self):
        # Shape of the most recent forward() input; None until then.
        self.original_shape = None

    def forward(self, input_data):
        """Reshape ``(batch, ...)`` to ``(batch, features)``; remembers the shape."""
        self.original_shape = input_data.shape
        batch = self.original_shape[0]
        return input_data.reshape((batch, -1))

    def backward(self, dout):
        """Reshape ``dout`` back to the shape seen by forward()."""
        return dout.reshape(self.original_shape)

    def get_params(self):
        """Return the (empty) list of trainable parameters."""
        return []

    def get_grads(self):
        """Return the (empty) list of parameter gradients."""
        return []

    def zero_grads(self):
        """Nothing to reset: the layer has no parameters."""
        pass

class ReshapeLayer:
    """Reshape each sample to a fixed target shape, keeping the batch axis."""

    def __init__(self, output_shape_tuple_excluding_batch):
        # Shape of the most recent forward() input; None until then.
        self.input_shape = None
        # Per-sample target shape (the batch dimension is prepended in forward()).
        self.output_shape_tuple = output_shape_tuple_excluding_batch

    def forward(self, input_data):
        """Return ``input_data`` viewed as ``(batch, *output_shape_tuple)``."""
        self.input_shape = input_data.shape
        target_shape = (self.input_shape[0],) + tuple(self.output_shape_tuple)
        return input_data.reshape(target_shape)

    def backward(self, dout):
        """Reshape ``dout`` back to the shape seen by forward()."""
        return dout.reshape(self.input_shape)

    def get_params(self):
        """Return the (empty) list of trainable parameters."""
        return []

    def get_grads(self):
        """Return the (empty) list of parameter gradients."""
        return []

    def zero_grads(self):
        """Nothing to reset: the layer has no parameters."""
        pass

class Conv2DLayer:
    """2-D convolution layer for channels-last input ``(batch, H, W, C)``.

    The filters are applied without flipping (cross-correlation). Filters are
    stored as ``(kernel_height, kernel_width, input_channels, num_filters)``
    and biases as ``(1, 1, 1, num_filters)``.

    Both passes loop over output positions only; the batch and filter axes
    are handled by ``np.tensordot`` instead of Python loops.
    """

    def __init__(self, input_channels, num_filters, kernel_size, stride=1, padding=0,
                 filter_initializer=he_initialize_conv_filters):
        """
        Args:
            input_channels: Number of channels expected in the input.
            num_filters: Number of output channels.
            kernel_size: Int for a square kernel, or ``(height, width)``.
            stride: Step between neighbouring windows (same for both axes).
            padding: Zero padding added on every side of both spatial axes.
            filter_initializer: Callable
                ``(height, width, in_channels, out_channels) -> ndarray``.
        """
        if isinstance(kernel_size, int):
            self.kernel_height, self.kernel_width = kernel_size, kernel_size
        else:
            self.kernel_height, self.kernel_width = kernel_size

        self.input_channels = input_channels
        self.num_filters = num_filters
        self.stride = stride
        self.padding = padding

        self.filters = filter_initializer(self.kernel_height, self.kernel_width,
                                          self.input_channels, self.num_filters)
        self.biases = np.zeros((1, 1, 1, self.num_filters))

        self.input_data_shape = None  # shape of the last forward() input
        self.input_padded = None      # padded input cached for backward()
        self.grad_filters = np.zeros_like(self.filters)
        self.grad_biases = np.zeros_like(self.biases)

    def _windows(self, H_out, W_out):
        """Yield ``(h_out, w_out, row_slice, col_slice)`` for each output position."""
        for h_out in range(H_out):
            h_start = h_out * self.stride
            rows = slice(h_start, h_start + self.kernel_height)
            for w_out in range(W_out):
                w_start = w_out * self.stride
                yield h_out, w_out, rows, slice(w_start, w_start + self.kernel_width)

    def forward(self, input_data):
        """Convolve ``input_data`` with the filters and add the biases.

        Args:
            input_data: Array of shape ``(batch, H_in, W_in, input_channels)``.

        Returns:
            Array of shape ``(batch, H_out, W_out, num_filters)`` where
            ``H_out = (H_in - kernel_height + 2 * padding) // stride + 1``
            (and likewise for the width).
        """
        self.input_data_shape = input_data.shape
        batch_size, H_in, W_in, C_in = input_data.shape
        assert C_in == self.input_channels, f"Input channels mismatch. Expected {self.input_channels}, got {C_in}"

        H_out = (H_in - self.kernel_height + 2 * self.padding) // self.stride + 1
        W_out = (W_in - self.kernel_width + 2 * self.padding) // self.stride + 1
        output = np.zeros((batch_size, H_out, W_out, self.num_filters))

        if self.padding > 0:
            self.input_padded = np.pad(input_data,
                                       ((0, 0), (self.padding, self.padding),
                                        (self.padding, self.padding), (0, 0)),
                                       mode='constant', constant_values=0)
        else:
            self.input_padded = input_data

        for h_out, w_out, rows, cols in self._windows(H_out, W_out):
            # (batch, kh, kw, C) window for every sample at this position.
            patches = self.input_padded[:, rows, cols, :]
            # Contract kh, kw and C against the filters -> (batch, num_filters).
            output[:, h_out, w_out, :] = np.tensordot(
                patches, self.filters, axes=([1, 2, 3], [0, 1, 2]))

        output += self.biases  # (1, 1, 1, F) broadcasts over batch and space
        return output

    def backward(self, dout):
        """Accumulate parameter gradients and return the input gradient.

        Args:
            dout: Gradient with respect to the forward output, shape
                ``(batch, H_out, W_out, num_filters)``.

        Returns:
            Gradient with respect to the (unpadded) forward input, with the
            same shape as that input.
        """
        batch_size, H_out_dout, W_out_dout, num_filters_dout = dout.shape

        assert num_filters_dout == self.num_filters

        # Allocate in the promoted dtype so accumulation neither loses
        # precision (float32 input) nor fails (integer input).
        dinput_padded = np.zeros(
            self.input_padded.shape,
            dtype=np.result_type(self.input_padded.dtype, self.filters.dtype, dout.dtype))
        # Reset in place so arrays previously returned by get_grads() stay valid.
        self.grad_filters.fill(0)

        self.grad_biases = np.sum(dout, axis=(0, 1, 2), keepdims=True).reshape(self.biases.shape)

        for h_out, w_out, rows, cols in self._windows(H_out_dout, W_out_dout):
            patches = self.input_padded[:, rows, cols, :]   # (batch, kh, kw, C)
            grad_here = dout[:, h_out, w_out, :]            # (batch, F)
            # Sum over the batch of patch * grad -> (kh, kw, C, F).
            self.grad_filters += np.tensordot(patches, grad_here, axes=([0], [0]))
            # Sum over filters of filter * grad -> (batch, kh, kw, C).
            dinput_padded[:, rows, cols, :] += np.tensordot(
                grad_here, self.filters, axes=([1], [3]))

        if self.padding > 0:
            # Drop the gradient that fell on the zero padding.
            dinput = dinput_padded[:, self.padding:-self.padding, self.padding:-self.padding, :]
        else:
            dinput = dinput_padded

        return dinput

    def get_params(self):
        """Return ``[filters, biases]``."""
        return [self.filters, self.biases]

    def get_grads(self):
        """Return ``[grad_filters, grad_biases]`` in the order of get_params()."""
        return [self.grad_filters, self.grad_biases]

    def zero_grads(self):
        """Zero both gradient buffers in place."""
        self.grad_filters.fill(0)
        self.grad_biases.fill(0)

class MaxPooling2DLayer:
    """Max pooling over the spatial axes of channels-last input ``(batch, H, W, C)``.

    Both passes loop over output positions only; the batch and channel axes
    are processed with vectorised NumPy operations.
    """

    def __init__(self, pool_size, stride=None):
        """
        Args:
            pool_size: Int for a square window, or ``(height, width)``.
            stride: Step between windows; defaults to the pool height.
        """
        if isinstance(pool_size, int):
            self.pool_height, self.pool_width = pool_size, pool_size
        else:
            self.pool_height, self.pool_width = pool_size
        self.stride = stride if stride is not None else self.pool_height
        self.input_data_shape = None
        # (batch, H_out, W_out, C, 2) array with the (row, col) input
        # coordinates of every window's maximum; filled in by forward().
        self.max_indices = None

    def forward(self, input_data):
        """Return the per-window maxima and record where each one came from.

        When a window contains several equal maxima, the first one in
        row-major order is recorded.
        """
        self.input_data_shape = input_data.shape
        batch_size, H_in, W_in, C = input_data.shape
        H_out = (H_in - self.pool_height) // self.stride + 1
        W_out = (W_in - self.pool_width) // self.stride + 1
        output = np.zeros((batch_size, H_out, W_out, C))
        self.max_indices = np.zeros((batch_size, H_out, W_out, C, 2), dtype=int)
        window_area = self.pool_height * self.pool_width

        for h_out in range(H_out):
            h_start = h_out * self.stride
            h_end = h_start + self.pool_height
            for w_out in range(W_out):
                w_start = w_out * self.stride
                w_end = w_start + self.pool_width
                # Flatten each window row-major: (batch, ph * pw, C).
                windows = input_data[:, h_start:h_end, w_start:w_end, :].reshape(
                    batch_size, window_area, C)
                flat_argmax = np.argmax(windows, axis=1)  # (batch, C)
                output[:, h_out, w_out, :] = np.max(windows, axis=1)
                # Convert the flat window index back to (row, col) offsets.
                row_offset, col_offset = np.divmod(flat_argmax, self.pool_width)
                self.max_indices[:, h_out, w_out, :, 0] = h_start + row_offset
                self.max_indices[:, h_out, w_out, :, 1] = w_start + col_offset
        return output

    def backward(self, dout):
        """Route each output gradient to the input position that produced the max.

        Gradients from overlapping windows that selected the same input
        position are summed.
        """
        dinput = np.zeros(self.input_data_shape)
        batch_size_dout, H_out, W_out, C_dout = dout.shape
        batch_index = np.arange(batch_size_dout)[:, None]   # (batch, 1)
        channel_index = np.arange(C_dout)[None, :]          # (1, C)

        for h_out in range(H_out):
            for w_out in range(W_out):
                max_rows = self.max_indices[:batch_size_dout, h_out, w_out, :C_dout, 0]
                max_cols = self.max_indices[:batch_size_dout, h_out, w_out, :C_dout, 1]
                # Every (batch, channel) pair addresses a distinct element, so
                # a fancy-indexed += is safe within a single output position.
                dinput[batch_index, max_rows, max_cols, channel_index] += dout[:, h_out, w_out, :]
        return dinput

    def get_params(self):
        """Return the (empty) list of trainable parameters."""
        return []

    def get_grads(self):
        """Return the (empty) list of parameter gradients."""
        return []

    def zero_grads(self):
        """Nothing to reset: the layer has no parameters."""
        pass


class SequentialNetwork:
    """Run a list of layers one after another (forward) and in reverse (backward)."""

    def __init__(self, layers):
        self.layers = layers
        # Layers that expose a non-empty parameter list.
        trainable = []
        for candidate in layers:
            if hasattr(candidate, 'get_params') and candidate.get_params():
                trainable.append(candidate)
        self.trainable_layers = trainable

    def forward(self, x, training=True):
        """Feed ``x`` through every layer in order and return the final output.

        ``training`` is accepted for interface compatibility; it is not
        forwarded to the layers.
        """
        activation = x
        for layer in self.layers:
            activation = layer.forward(activation)
        return activation

    def backward(self, dout):
        """Back-propagate ``dout`` from the last layer to the first.

        Returns the gradient with respect to the network input.
        """
        grad = dout
        for layer in self.layers[::-1]:
            grad = layer.backward(grad)
        return grad

    def get_params_and_grads(self):
        """Return ``(parameter, gradient)`` pairs for all trainable layers."""
        pairs = []
        for layer in self.trainable_layers:
            pairs.extend(zip(layer.get_params(), layer.get_grads()))
        return pairs

    def zero_all_grads(self):
        """Call ``zero_grads()`` on every layer that provides it."""
        for layer in self.layers:
            if not hasattr(layer, 'zero_grads'):
                continue
            layer.zero_grads()

class SGD:
    """Plain stochastic gradient descent over a network's parameters."""

    def __init__(self, network, learning_rate=0.001):
        # ``network`` must provide get_params_and_grads().
        self.learning_rate = learning_rate
        self.network = network

    def step(self):
        """Update every parameter in place: ``param -= learning_rate * grad``.

        Pairs in which either the parameter or the gradient is ``None`` are
        skipped silently.
        """
        for param, grad in self.network.get_params_and_grads():
            if param is None or grad is None:
                continue
            update = self.learning_rate * grad
            param -= update

In [None]:

# Image geometry and size of the generator's noise input.
IMG_WIDTH = 28
IMG_HEIGHT = 28
CHANNELS = 1
LATENT_DIM = 100

# Generator: fully connected network that maps a LATENT_DIM noise vector to
# IMG_HEIGHT * IMG_WIDTH * CHANNELS sigmoid outputs, reshaped to an image.
G_layers = [
    DenseLayer(LATENT_DIM, 256, weight_initializer=xavier_initialize_weights),
    LeakyReLU(alpha=0.2),
    DenseLayer(256, 512, weight_initializer=xavier_initialize_weights),
    LeakyReLU(alpha=0.2),
    DenseLayer(512, 1024, weight_initializer=xavier_initialize_weights),
    LeakyReLU(alpha=0.2),
    DenseLayer(1024, IMG_HEIGHT * IMG_WIDTH * CHANNELS, weight_initializer=xavier_initialize_weights),
    Sigmoid(),  # outputs in (0, 1), the same range as the normalized training images
    ReshapeLayer((IMG_HEIGHT, IMG_WIDTH, CHANNELS))
]
generator = SequentialNetwork(G_layers)

# Discriminator: two conv / LeakyReLU / max-pool stages followed by two dense
# layers; the final sigmoid yields one value per image.
D_layers = [
    Conv2DLayer(input_channels=CHANNELS, num_filters=32, kernel_size=5, stride=1, padding=2), # (28,28,1) -> (28,28,32)
    LeakyReLU(alpha=0.2),
    MaxPooling2DLayer(pool_size=2, stride=2), # (28,28,32) -> (14,14,32)
    Conv2DLayer(input_channels=32, num_filters=64, kernel_size=5, stride=1, padding=2), # (14,14,32) -> (14,14,64)
    LeakyReLU(alpha=0.2),
    MaxPooling2DLayer(pool_size=2, stride=2), # (14,14,64) -> (7,7,64)
    FlattenLayer(), # (7,7,64) -> (3136,)
    DenseLayer(7*7*64, 512, weight_initializer=he_initialize_weights),
    LeakyReLU(alpha=0.2),
    DenseLayer(512, 1, weight_initializer=xavier_initialize_weights),
    Sigmoid()
]
discriminator = SequentialNetwork(D_layers)

# --- Loss and Optimizers ---
# One shared BCE loss object; one SGD optimizer per network so that each
# step() call touches only that network's parameters.
bce_loss = BinaryCrossEntropyLoss()
lr_g = 0.001 # Generator learning rate
lr_d = 0.001 # Discriminator learning rate
optimizer_G = SGD(generator, learning_rate=lr_g)
optimizer_D = SGD(discriminator, learning_rate=lr_d)

# Training parameters
epochs = 5 # Kept low for a quick test
batch_size = 64 # Can adjust based on memory/speed
sample_interval = 1 # Save a grid of generated images every `sample_interval` epochs
real_label_val = 1.0 # Target the discriminator should output for real images
fake_label_val = 0.0 # Target the discriminator should output for generated images
# Optional: uncomment the next three lines (and comment out the two after
# them) to train on a 1024-image subset for faster iteration.
# train_images_subset = train_images[:1024]
# num_batches = train_images_subset.shape[0] // batch_size
# current_train_images = train_images_subset
current_train_images = train_images # using full dataset
# Floor division: images beyond the last full batch are not used.
num_batches = current_train_images.shape[0] // batch_size


print(f"Starting GAN training for {epochs} epochs, batch size {batch_size}, on {current_train_images.shape[0]} images.")
# Output directory for the image grids written by save_generated_images().
os.makedirs("gan_images_scratch", exist_ok=True)

def save_generated_images(epoch, generator_model, examples=16, dim=(4,4), figsize=(6,6)):
    """Sample images from the generator and save them as one PNG grid.

    Args:
        epoch: Epoch number used in the output file name (zero-padded to 4 digits).
        generator_model: Object whose ``forward(noise, training=False)`` returns
            ``examples`` images that can be reshaped to (IMG_HEIGHT, IMG_WIDTH).
        examples: Number of images to generate.
        dim: ``(rows, cols)`` of the subplot grid.
        figsize: Figure size passed to matplotlib.

    The figure is written to ``gan_images_scratch/mnist_epoch_<epoch>.png`` and
    then closed.
    """
    latent = np.random.randn(examples, LATENT_DIM)
    samples = generator_model.forward(latent, training=False)
    samples = samples.reshape(examples, IMG_HEIGHT, IMG_WIDTH)

    grid_rows, grid_cols = dim[0], dim[1]
    fig = plt.figure(figsize=figsize)
    for position, image in enumerate(samples, start=1):
        ax = fig.add_subplot(grid_rows, grid_cols, position)
        ax.imshow(image, interpolation='nearest', cmap='gray_r')
        ax.axis('off')
    fig.tight_layout()
    fig.savefig(f"gan_images_scratch/mnist_epoch_{epoch:04d}.png")
    plt.close(fig)

# Training loop: for every batch, update the discriminator on real images,
# then on generated images, then update the generator.
for epoch in range(epochs):
    print(f"\nEpoch {epoch+1}/{epochs}")
    d_losses_epoch = []
    g_losses_epoch = []

    for batch_idx in range(num_batches):

        # ---- Train Discriminator ----

        # Zero grads for D before any D computation
        discriminator.zero_all_grads()

        # --- Train with real images ---
        real_imgs_batch = current_train_images[batch_idx * batch_size : (batch_idx + 1) * batch_size]
        # Taken from the slice itself rather than assumed equal to batch_size.
        current_batch_size = real_imgs_batch.shape[0]
        real_labels = np.full((current_batch_size, 1), real_label_val)

        d_output_real = discriminator.forward(real_imgs_batch)
        d_loss_real_val = bce_loss.forward(d_output_real, real_labels)
        d_grad_out_real = bce_loss.backward(d_output_real, real_labels)
        discriminator.backward(d_grad_out_real)
        optimizer_D.step() # update D based on real images

        # --- Train with fake images ---
        discriminator.zero_all_grads() # Zero grads again before fake pass for D
        noise = np.random.randn(current_batch_size, LATENT_DIM)
        # G is only run forward here; generator.backward() is not called in
        # this phase, so G's parameters are not affected by the D update.
        # .copy() gives D its own array, independent of anything G caches.
        fake_imgs_batch_d_train = generator.forward(noise, training=False).copy()

        fake_labels = np.full((current_batch_size, 1), fake_label_val)
        d_output_fake = discriminator.forward(fake_imgs_batch_d_train)
        d_loss_fake_val = bce_loss.forward(d_output_fake, fake_labels)
        d_grad_out_fake = bce_loss.backward(d_output_fake, fake_labels)
        discriminator.backward(d_grad_out_fake)
        optimizer_D.step() # Update D based on fake images

        # Reported D loss is the average of the real and fake losses.
        d_total_loss_batch = (d_loss_real_val + d_loss_fake_val) / 2.0
        d_losses_epoch.append(d_total_loss_batch)


        # ---- Train Generator ----
        generator.zero_all_grads() # zero G's grads
        # The discriminator's weights are effectively frozen in this phase
        # because optimizer_G only updates G's parameters.

        noise_g_train = np.random.randn(current_batch_size, LATENT_DIM)
        fake_images_for_g = generator.forward(noise_g_train)
        # Forward pass of D on G's output; D itself is not updated here.
        d_output_for_g = discriminator.forward(fake_images_for_g)

        g_loss_val = bce_loss.forward(d_output_for_g, real_labels) # G wants D to predict these as real
        g_losses_epoch.append(g_loss_val)

        grad_loss_wrt_d_output = bce_loss.backward(d_output_for_g, real_labels)
        # Back-propagate through D down to its input (G's output).
        # D's parameter gradients are computed along the way but are not
        # applied: optimizer_D.step() is not called in this phase, and D's
        # gradients are zeroed at the start of the next batch.
        grad_d_wrt_g_output = discriminator.backward(grad_loss_wrt_d_output)
        generator.backward(grad_d_wrt_g_output) # backprop through G

        optimizer_G.step() 

        # Print progress roughly ten times per epoch (every batch when there
        # are ten or fewer batches).
        if (batch_idx + 1) % (num_batches // 10 if num_batches > 10 else 1) == 0:
            print(f"  Batch {batch_idx+1}/{num_batches} -- D Loss: {d_total_loss_batch:.4f}, G Loss: {g_loss_val:.4f}")

    # Epoch summary; falls back to 0 when no batches were processed.
    avg_d_loss = np.mean(d_losses_epoch) if d_losses_epoch else 0
    avg_g_loss = np.mean(g_losses_epoch) if g_losses_epoch else 0
    print(f"Epoch {epoch+1} finished. Avg D Loss: {avg_d_loss:.4f}, Avg G Loss: {avg_g_loss:.4f}")

    # Save a grid of generated samples every `sample_interval` epochs.
    if (epoch + 1) % sample_interval == 0:
        save_generated_images(epoch + 1, generator)

print("Training finished")

Starting GAN training for 5 epochs, batch size 64, on 60000 images.

Epoch 1/5


# Discriminator’s CNN Architecture

The Discriminator in this GAN is a Convolutional Neural Network (CNN) that classifies images as real or fake. Its layers are, in order: a Conv2DLayer that maps the input from (28, 28, 1) to (28, 28, 32) using 32 filters with a 5x5 kernel, stride 1, and padding 2, followed by a LeakyReLU activation; a MaxPooling2DLayer that reduces the size to (14, 14, 32) with a 2x2 pool and stride 2; a second Conv2DLayer that maps (14, 14, 32) to (14, 14, 64) with 64 filters, a 5x5 kernel, stride 1, and padding 2, followed by another LeakyReLU; a second MaxPooling2DLayer that downsamples to (7, 7, 64) with a 2x2 pool and stride 2; a FlattenLayer that reshapes each (7, 7, 64) feature map into a vector of length 3136; a DenseLayer that maps the 3136 features to 512, followed by a LeakyReLU activation; and a final DenseLayer that maps the 512 features to 1, followed by a Sigmoid activation, producing a single output value between 0 and 1 that indicates whether the input image is fake (0) or real (1).

## How Backpropagation Works in the Discriminator

Backpropagation computes the gradients of the loss with respect to the weights and biases by propagating errors backward through the layers. The loss, defined by BinaryCrossEntropyLoss as $L = -\frac{1}{N} \sum \left[ y \log(D(x)) + (1-y) \log(1-D(x)) \right]$, yields the initial gradient $\frac{\partial L}{\partial D(x)} = -\frac{1}{N} \left( \frac{y}{D(x)} - \frac{1-y}{1-D(x)} \right)$, which is computed in `BinaryCrossEntropyLoss.backward()` and passed to the Sigmoid layer. The Sigmoid layer scales the gradient by $\sigma(z)(1-\sigma(z))$. Each DenseLayer, with batch input $x$ (one sample per row) and output $z = xW + b$, computes $\frac{\partial L}{\partial W} = x^T \cdot \frac{\partial L}{\partial z}$, $\frac{\partial L}{\partial b} = \sum_{\text{batch}} \frac{\partial L}{\partial z}$, and $\frac{\partial L}{\partial x} = \frac{\partial L}{\partial z} \cdot W^T$, and passes $\frac{\partial L}{\partial x}$ to LeakyReLU, which scales it by 1 (if $x > 0$) or 0.2 (if $x \leq 0$).

The FlattenLayer reshapes the gradient from (3136,) to (7, 7, 64) for the MaxPooling2DLayer. In the MaxPooling2DLayer, the forward pass stores max indices per 2x2 patch, and the backward pass routes the incoming gradient (ex: (7, 7, 64)) to the max position in the input patch (ex: (14, 14, 64)), setting others to 0, facing challenges like non-differentiability at ties (approximated by choosing the first max), gradient sparsity slowing learning for non-max neurons, and memory overhead from storing indices. 

The LeakyReLU then scales the gradient by 1 or 0.2 and passes it to the Conv2DLayer, whose forward pass computes each output value as $\sum_{h,w,c} \text{input}[i,\, h_{start}+h,\, w_{start}+w,\, c] \cdot \text{filters}[h, w, c, f_{idx}] + \text{biases}[f_{idx}]$, where $(h, w)$ ranges over the kernel window that starts at $(h_{start}, w_{start})$. The backward pass calculates $\frac{\partial L}{\partial \text{filters}}$ by multiplying the gradient with the input patch, $\frac{\partial L}{\partial \text{biases}}$ by summing gradients, and $\frac{\partial L}{\partial \text{input}}$ by convolving the gradient with the filters and then adjusting for padding. Challenges include padding handling (slicing to match input dimensions), numerical stability (mitigated by normalizing inputs to [0, 1]), and computational complexity from iterating over positions and channels.

This process repeats for earlier layers, stopping at the input image. To visualize the first block (Conv2DLayer → LeakyReLU → MaxPooling2DLayer) with an input of shape (1, 28, 28, 1): the Conv2DLayer outputs (1, 28, 28, 32) via 5x5 patch sums, LeakyReLU applies $f(x) = x$ or $f(x) = 0.2x$, and MaxPooling2DLayer outputs (1, 14, 14, 32) by taking maxima. In the backward pass, a gradient (e.g., 0.5 at output position (0, 0, 0)) is routed by MaxPooling2DLayer to the max position (e.g., (1, 1)), scaled by LeakyReLU, and the Conv2DLayer then computes its filter, bias, and input gradients accordingly.

## Challenges in the backward pass through non-linear layers:

**Max Pooling:**

- Non-Differentiability: As mentioned, max pooling is non-differentiable at points of ambiguity (e.g., ties). The code uses the first maximum encountered, which is a practical approximation.
- Gradient Sparsity: Only the maximum position gets a gradient, which can lead to underutilization of some neurons. This is a known limitation of max pooling compared to average pooling.
- Memory Usage: Storing `self.max_indices` increases memory overhead, especially for large inputs or many channels.

**LeakyReLU:**

- Vanishing Gradients (Partially Mitigated): Unlike ReLU, which sets gradients to 0 for negative inputs, LeakyReLU helps avoid vanishing gradients by allowing a small gradient (alpha = 0.2) for negative values. However, the small gradient can still slow learning for neurons with consistently negative inputs.
- Numerical Stability: The code clips values in Sigmoid to [-700, 700], but LeakyReLU doesn’t need such clipping since it’s linear in both regions.

## Image generation quality (visual comparison of generated anime faces):

Without PyTorch: Generated images are likely to have lower resolution (28x28) and may exhibit blocky or blurry features due to the aggressive downsampling and limited filter sizes. The lack of batch normalization or advanced regularization could lead to mode collapse or inconsistent digit generation (the custom model is trained on MNIST digits rather than faces).

With Pytorch: The plot generated in lab7 shows that the discriminator effectively distinguishes real (higher scores) from fake (lower scores) over epochs, suggesting stable training and potentially higher-quality outputs. Larger resolutions (e.g., 64x64) and advanced features like batch normalization likely produce sharper, more detailed anime faces with better color and texture fidelity compared to the custom 28x28 grayscale images.

## Training stability and convergence:

Without Pytorch: This implementation uses manually defined backward methods for layers (ex Conv2DLayer, MaxPooling2DLayer), with gradients computed via explicit rules (ex: routing to max positions in pooling, convolution with filters). The code lacks evidence of advanced stabilization techniques like label smoothing, gradient penalty, or Adam optimizer with tuned hyperparameters, which are critical for GAN stability.
Convergence: Without these features, the training may suffer from mode collapse or oscillatory behavior, where the Generator produces limited variety or the Discriminator overpowers the Generator. The absence of a score plot in bonus2.ipynb makes it hard to assess convergence, but the manual gradient flow could introduce numerical instability.

With Pytorch: The presence of a real vs. fake score plot (generated via plt.plot(real_scores, fake_scores)) suggests the use of an optimizer and loss function (binary cross-entropy), with the Discriminator and Generator trained alternately. PyTorch’s automatic differentiation and GPU acceleration (noted in metadata) enable stable gradient updates. The upward trend in real scores and downward trend in fake scores indicate balanced training and convergence toward distinguishing real anime faces.
Convergence: The plot implies the GAN converges well, with the Discriminator improving over epochs, likely aided by batch normalization and a robust optimizer, reducing the risk of mode collapse compared to the custom implementation.


## The intricacies of training stability in adversarial models like GANs:

Without PyTorch: The custom implementation relies on NumPy operations for the forward and backward passes, executed on the CPU. The lack of vectorization or GPU support means slower training, especially with two convolution layers and dense layers processed for each batch. Processing 28x28 images with 32 and 64 filters is computationally lightweight in principle but inefficient without optimization.
Memory usage is moderate due to the small image size and layer counts, but storing max indices in MaxPooling2DLayer (ex: batch_size × 7 × 7 × 64 × 2) adds overhead. No batch normalization or large batch sizes are evident, keeping resource demands low but limiting scalability.
PyTorch-based GAN Implementation:

With PyTorch: The use of PyTorch with GPU acceleration significantly speeds up training by leveraging CUDA for matrix operations across Conv2d and ConvTranspose2d layers. The plot generation over epochs suggests efficient batch processing, likely with larger batch sizes and higher-resolution images (e.g., 64x64), increasing computational demand but optimized by PyTorch’s framework.
Higher memory usage is expected due to larger images, deeper networks, and batch normalization, but GPU offloading mitigates this. The Colab environment with GPU support ensures efficient resource utilization compared to the CPU-bound custom implementation.
