# CNN without Xavier/He (naive init)
This notebook trains a deep convolutional stack on CIFAR-10 (80/20 train/validation split) using a deliberately bad uniform initialization, with bounds that are either too large or too small. It includes weight-decay regularization, dropout, early stopping, and CSV logging (detailed and summary files) for plotting.

In [None]:
import random
from dataclasses import dataclass
from typing import List, Tuple

import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader, random_split
from torchvision import datasets, transforms

: 

In [None]:
def set_seed(seed: int) -> None:
    """Seed every random number generator used in this notebook.

    Applies ``seed`` to Python's ``random`` module, NumPy's global generator,
    PyTorch's default generator, and the generators of all CUDA devices.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)


def get_activation(name: str) -> Tuple[nn.Module, str]:
    """Build the activation module identified by ``name``.

    Args:
        name: Either ``"relu"`` or ``"tanh"``.

    Returns:
        A newly constructed activation module paired with its canonical name.

    Raises:
        ValueError: If ``name`` is not one of the supported activations.
    """
    supported = (
        ("relu", nn.ReLU),
        ("tanh", nn.Tanh),
    )
    for label, factory in supported:
        if name == label:
            return factory(), label
    raise ValueError(f"Unsupported activation {name}")


class ConvNet(nn.Module):
    """Nine-layer convolutional feature extractor with a linear classifier.

    The feature stack is three blocks of widths 32, 64 and 128. Each block has
    three 3x3 convolutions (padding 1), each followed by the activation, and
    ends with a 2x2 max-pool and dropout. The classifier is a single linear
    layer from ``128 * 4 * 4`` features to 10 outputs, which matches 32x32
    inputs after the three pooling steps.

    Args:
        activation: Activation name passed to ``get_activation``.
        dropout_p: Dropout probability used after each block's pooling layer.
    """

    def __init__(self, activation: str, dropout_p: float) -> None:
        super().__init__()
        # One activation module instance is reused at every position.
        nonlinearity, _ = get_activation(activation)
        stack: List[nn.Module] = []
        in_channels = 3
        for width in (32, 64, 128):
            for _conv_idx in range(3):
                stack.append(nn.Conv2d(in_channels, width, kernel_size=3, padding=1))
                stack.append(nonlinearity)
                in_channels = width
            stack.append(nn.MaxPool2d(2))
            stack.append(nn.Dropout(dropout_p))
        self.features = nn.Sequential(*stack)
        self.classifier = nn.Linear(128 * 4 * 4, 10)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the feature stack, flatten per sample, and return the logits."""
        flat = torch.flatten(self.features(x), 1)
        return self.classifier(flat)

In [None]:
def naive_uniform_init(module: nn.Module, bound: float) -> None:
    """Fill conv/linear weights from U(-bound, bound) and zero their biases.

    Modules that are neither ``nn.Conv2d`` nor ``nn.Linear`` are left
    untouched, so the function can be applied to every submodule of a model.

    Args:
        module: The module to (possibly) re-initialize in place.
        bound: Half-width of the symmetric uniform range for the weights.
    """
    if not isinstance(module, (nn.Conv2d, nn.Linear)):
        return
    nn.init.uniform_(module.weight, a=-bound, b=bound)
    bias = module.bias
    if bias is not None:
        nn.init.zeros_(bias)


def get_data(batch_size: int, seed: int) -> Tuple[DataLoader, DataLoader]:
    """Create train and validation loaders from the CIFAR-10 training set.

    The dataset is read from (and downloaded into, if needed) the ``data``
    directory and converted to tensors. It is divided 80/20 with a split that
    is reproducible for a given ``seed``.

    Args:
        batch_size: Batch size used by both loaders.
        seed: Seed for the generator that drives the train/validation split.

    Returns:
        ``(train_loader, val_loader)``; only the training loader shuffles.
    """
    dataset = datasets.CIFAR10(
        root="data",
        train=True,
        download=True,
        transform=transforms.ToTensor(),
    )
    total = len(dataset)
    n_train = int(0.8 * total)
    split_rng = torch.Generator().manual_seed(seed)
    train_ds, val_ds = random_split(
        dataset, [n_train, total - n_train], generator=split_rng
    )
    # Settings shared by both loaders.
    loader_kwargs = dict(batch_size=batch_size, num_workers=2, pin_memory=True)
    train_loader = DataLoader(train_ds, shuffle=True, **loader_kwargs)
    val_loader = DataLoader(val_ds, shuffle=False, **loader_kwargs)
    return train_loader, val_loader

In [None]:
def activation_stats(model: nn.Module, x: torch.Tensor) -> Tuple[float, float, float]:
    """Summarize the activations produced inside ``model.features``.

    ``x`` is passed through the layers of ``model.features`` one at a time
    with gradients disabled. After every ``nn.ReLU`` or ``nn.Tanh`` layer the
    output's mean, standard deviation, and fraction of exactly-zero entries
    are recorded. The model's train/eval mode is not changed here.

    Args:
        model: A module exposing an iterable ``features`` attribute.
        x: Input batch fed to the first layer.

    Returns:
        ``(mean, std, zero_fraction)``, each averaged over the activation
        layers; all three are ``0.0`` when there are no such layers.
    """
    per_layer = []
    feature_layers = model.features
    with torch.no_grad():
        out = x
        for layer in feature_layers:
            out = layer(out)
            if not isinstance(layer, (nn.ReLU, nn.Tanh)):
                continue
            per_layer.append(
                (
                    out.mean().item(),
                    out.std().item(),
                    (out == 0).float().mean().item(),
                )
            )
    if not per_layer:
        return 0.0, 0.0, 0.0
    # Transpose the per-layer triples into one column per statistic.
    mean_avg, std_avg, zero_avg = (float(np.mean(col)) for col in zip(*per_layer))
    return mean_avg, std_avg, zero_avg


def gradient_norm(model: nn.Module) -> float:
    """Return the mean of the per-parameter gradient norms of ``model``.

    Parameters whose ``grad`` is ``None`` are skipped. If no parameter has a
    gradient, the result is ``0.0``.
    """
    grads = (param.grad for param in model.parameters())
    per_param = [g.norm().item() for g in grads if g is not None]
    if not per_param:
        return 0.0
    return float(np.mean(per_param))