
# CIFAR-10: ReLU vs Tanh (AlexNet-style) — Training Error & Epoch Time

This notebook reproduces the classic comparison between **ReLU** and **tanh** activations on **CIFAR-10**, inspired by the small-ConvNet experiment described in the AlexNet paper (Figure 1).  
You will train two *identical* four-layer convolutional networks that differ **only** in their hidden-layer nonlinearity (ReLU vs Tanh), and stop training **per-model** when training error ≤ **25%**.

### What you’ll get
- A 4-layer ConvNet for CIFAR-10 (same architecture for both activations).
- **Early stopping at 25% training error** (per model), with a maximum epoch cap for safety.
- Figure 1: **Training error vs epochs** (ReLU: solid; Tanh: dashed) — analogous to AlexNet’s figure.
- Figure 2: **Time per epoch (seconds)** for both ReLU and Tanh in the **same plot**.

> **Tip:** For a quick run, try fewer epochs or a smaller batch size. For best results, run on GPU.


In [1]:

# Imports & Environment
import math, time, random, os
from copy import deepcopy

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Reproducibility (still not perfectly deterministic across cuDNN/CUDA versions)
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# For (more) determinism; may reduce throughput.
# deterministic=True restricts cuDNN to deterministic kernels, and benchmark=False
# disables per-shape autotuning, so both models are trained and timed under the
# same kernel choices.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Prefer the GPU when one is visible; everything below moves tensors to `device`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device


ModuleNotFoundError: No module named 'torch'

In [None]:

# CIFAR-10 training split: augmentation pipeline, dataset object, and loader
batch_size = 128  # mini-batch size; feel free to tune
num_workers = 2   # DataLoader worker processes; match to your machine

# Per-channel mean / std handed to Normalize below
CIFAR10_MEAN = (0.4914, 0.4822, 0.4465)
CIFAR10_STD  = (0.2023, 0.1994, 0.2010)

# Random crop (with 4-pixel padding) and horizontal flip, then tensor conversion
# and channel-wise normalization.
train_transform = transforms.Compose(
    [
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(CIFAR10_MEAN, CIFAR10_STD),
    ]
)

# Downloads the dataset into ./data on first use.
train_set = datasets.CIFAR10(
    root='./data',
    train=True,
    download=True,
    transform=train_transform,
)
train_loader = DataLoader(
    train_set,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    pin_memory=True,
)
len(train_set)


In [None]:

# 4-Layer ConvNet (hidden activation = ReLU or Tanh)
class ConvNet4(nn.Module):
    """
    Four convolutional layers followed by a two-layer fully connected head.

    The layer layout is the same for every activation choice; `activation`
    only selects which nonlinearity ("relu" or "tanh", case-insensitive) is
    placed after each hidden layer. The last layer emits raw logits
    (CrossEntropyLoss applies softmax internally).

    Raises:
        ValueError: if `activation` is neither "relu" nor "tanh".
    """

    def __init__(self, activation: str = "relu"):
        super().__init__()
        act_cls = {"relu": nn.ReLU, "tanh": nn.Tanh}.get(activation.lower())
        if act_cls is None:
            raise ValueError("activation must be 'relu' or 'tanh'")

        def pool():
            # Overlapping 3x3 max-pool with stride 2.
            return nn.MaxPool2d(kernel_size=3, stride=2)

        # Layers are created in forward order so that parameter names
        # (features.0, features.3, ...) and default-init RNG use stay fixed.
        conv_stack = [
            nn.Conv2d(3, 64, kernel_size=5, padding=2),
            act_cls(),
            pool(),  # 32 -> 15
            nn.Conv2d(64, 128, kernel_size=5, padding=2),
            act_cls(),
            pool(),  # 15 -> 7
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            act_cls(),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            act_cls(),
            pool(),  # 7 -> 3
        ]
        self.features = nn.Sequential(*conv_stack)

        head = [
            nn.Flatten(),
            nn.Linear(256 * 3 * 3, 512),
            act_cls(),
            nn.Dropout(0.5),
            nn.Linear(512, 10),
        ]
        self.classifier = nn.Sequential(*head)

    def forward(self, x):
        """Run the conv stack and then the classifier head; returns logits."""
        return self.classifier(self.features(x))

def init_weights_xavier(m):
    """Xavier-uniform init for conv/linear weights; zero their biases.

    Intended for use with `Module.apply`. Any module that is not an
    `nn.Conv2d` or `nn.Linear` is left untouched.
    """
    if not isinstance(m, (nn.Conv2d, nn.Linear)):
        return
    nn.init.xavier_uniform_(m.weight)
    bias = m.bias
    if bias is not None:
        nn.init.zeros_(bias)

# Build one Xavier-initialised network and snapshot its parameters; both the ReLU
# and the Tanh model later load this snapshot so they start from identical weights.
base_init_model = ConvNet4("relu").apply(init_weights_xavier)  # apply() returns the module itself
base_state = deepcopy(base_init_model.state_dict())


In [None]:

# Training utilities (early stop at ≤25% training error)
def accuracy_from_logits(logits, targets):
    """Return the fraction of rows whose argmax over dim 1 equals the target."""
    hits = logits.argmax(dim=1).eq(targets)
    return hits.sum().item() / targets.numel()

def evaluate_training_set(model, loader, device):
    """Score `model` on every batch yielded by `loader`, without gradients.

    Puts the model in eval mode (and leaves it there), then accumulates the
    summed cross-entropy and the number of correct argmax predictions.

    Returns:
        (avg_loss, acc, err): mean per-sample loss, accuracy, and 1 - accuracy.
    """
    model.eval()
    n_seen = 0
    n_right = 0
    loss_sum = 0.0
    with torch.no_grad():
        for inputs, labels in loader:
            inputs = inputs.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True)
            scores = model(inputs)
            # Sum (not mean) so a smaller final batch is weighted correctly.
            loss_sum += F.cross_entropy(scores, labels, reduction='sum').item()
            n_right += (scores.argmax(1) == labels).sum().item()
            n_seen += labels.size(0)
    acc = n_right / n_seen
    return loss_sum / n_seen, acc, 1 - acc

def train_until_25pct(
    model,
    activation_name: str,
    train_loader,
    device,
    lr=0.01,
    momentum=0.9,
    weight_decay=5e-4,
    max_epochs=200,
    milestones=(60, 120, 160),
    gamma=0.2,
):
    """Train `model` with Nesterov SGD until its training error is ≤ 25%.

    After each epoch the model is scored on `train_loader` with
    `evaluate_training_set`; training stops at the first epoch whose error
    is ≤ 0.25, or after `max_epochs` epochs, whichever comes first.

    Args:
        model: network to train; moved to `device` in place.
        activation_name: label used only in the printed progress lines.
        train_loader: iterable of (inputs, targets) batches, used both for
            optimisation and for the per-epoch evaluation.
        device: a `torch.device` (its `.type` is inspected for CUDA sync).
        lr, momentum, weight_decay: SGD hyperparameters.
        max_epochs: hard cap on the number of epochs.
        milestones, gamma: MultiStepLR schedule (LR multiplied by `gamma`
            at each milestone epoch).

    Returns:
        pandas.DataFrame with one row per completed epoch and columns
        `epoch`, `train_loss`, `train_acc`, `train_err`, `epoch_time_sec`.
        `epoch_time_sec` covers the optimisation pass only, not the
        evaluation pass that follows it.
    """
    model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=momentum,
                                weight_decay=weight_decay, nesterov=True)
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=list(milestones), gamma=gamma)

    logs = {"epoch": [], "train_loss": [], "train_acc": [], "train_err": [], "epoch_time_sec": []}
    reached_25 = False

    for epoch in range(1, max_epochs + 1):
        # perf_counter is monotonic, so the measured duration cannot be
        # distorted by system clock adjustments.
        start = time.perf_counter()
        model.train()

        for x, y in train_loader:
            x, y = x.to(device, non_blocking=True), y.to(device, non_blocking=True)
            optimizer.zero_grad(set_to_none=True)
            logits = model(x)
            # Reuse the single criterion; no per-step loss.item() so the timed
            # loop is not stalled by a host/device sync on every batch.
            loss = criterion(logits, y)
            loss.backward()
            optimizer.step()

        # synchronize for accurate wall time on GPU
        if device.type == "cuda":
            torch.cuda.synchronize()
        epoch_time = time.perf_counter() - start

        # full train-set evaluation (loss/acc/error)
        train_loss, train_acc, train_err = evaluate_training_set(model, train_loader, device)

        logs["epoch"].append(epoch)
        logs["train_loss"].append(train_loss)
        logs["train_acc"].append(train_acc)
        logs["train_err"].append(train_err)
        logs["epoch_time_sec"].append(epoch_time)

        print(f"{activation_name:>5} | epoch {epoch:3d} | loss {train_loss:.4f} | "
              f"acc {train_acc*100:6.2f}% | err {train_err*100:6.2f}% | time {epoch_time:6.1f}s")

        scheduler.step()

        if train_err <= 0.25:
            print(f"Stopping {activation_name}: training error ≤ 25% reached at epoch {epoch}.")
            reached_25 = True
            break

    if not reached_25:
        print(f"{activation_name} did not reach ≤25% training error within {max_epochs} epochs.")

    return pd.DataFrame(logs)


In [None]:

# Train the ReLU and Tanh networks from the same starting weights and with the
# same hyperparameters, then persist the per-epoch logs.
learning_rate = 0.01
momentum = 0.9
weight_decay = 5e-4
max_epochs = 200

# One kwargs bundle so both runs are guaranteed to receive identical settings
shared_hparams = dict(
    lr=learning_rate,
    momentum=momentum,
    weight_decay=weight_decay,
    max_epochs=max_epochs,
)

# Each model gets its own copy of the shared initial state
relu_model = ConvNet4("relu")
relu_model.load_state_dict(deepcopy(base_state))

tanh_model = ConvNet4("tanh")
tanh_model.load_state_dict(deepcopy(base_state))

# ReLU first, then Tanh
df_relu = train_until_25pct(relu_model, "ReLU", train_loader, device, **shared_hparams)
df_tanh = train_until_25pct(tanh_model, "Tanh", train_loader, device, **shared_hparams)

# Save logs (optional)
import os
os.makedirs("runs", exist_ok=True)
for frame, csv_path in ((df_relu, "runs/relu_logs.csv"), (df_tanh, "runs/tanh_logs.csv")):
    frame.to_csv(csv_path, index=False)

df_relu.tail(), df_tanh.tail()


In [None]:

# Figure 1 — training error per epoch; ReLU drawn solid, Tanh drawn dashed
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(7, 5))
for frame, dash, tag in ((df_relu, "-", "ReLU"), (df_tanh, "--", "Tanh")):
    ax.plot(frame["epoch"], frame["train_err"], linestyle=dash, label=tag)
ax.set_xlabel("Epochs")
ax.set_ylabel("Training error rate")
ax.set_title("Training error on CIFAR-10: ReLU vs Tanh")
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:

# Figure 2 — seconds per epoch for both runs, drawn on one set of axes
fig, ax = plt.subplots(figsize=(7, 5))
for frame, tag in ((df_relu, "ReLU"), (df_tanh, "Tanh")):
    ax.plot(frame["epoch"], frame["epoch_time_sec"], label=tag)
ax.set_xlabel("Epochs")
ax.set_ylabel("Time per epoch (sec)")
ax.set_title("Epoch time: ReLU vs Tanh")
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:

# Final comparison table
import pandas as pd

def summarize(df, name):
    """Condense one training log into a single summary record.

    Reads the last row of `epoch`, `train_err` and `train_acc`, plus the mean
    and median of `epoch_time_sec`, and returns them as a plain dict labelled
    with `name`.
    """
    epochs, errs, accs, secs = (
        df[col] for col in ("epoch", "train_err", "train_acc", "epoch_time_sec")
    )
    record = {"name": name}
    record["epochs_ran"] = int(epochs.iloc[-1])
    record["final_train_err"] = float(errs.iloc[-1])
    record["final_train_acc"] = float(accs.iloc[-1])
    record["mean_epoch_time_sec"] = float(secs.mean())
    record["median_epoch_time_sec"] = float(secs.median())
    return record

# One row per activation, ReLU first
summary = pd.DataFrame(
    [summarize(frame, tag) for frame, tag in ((df_relu, "ReLU"), (df_tanh, "Tanh"))]
)
summary



## Notes & Tips

- **Identical initialization**: Both models start from the *same* weights by copying a single randomly-initialized state dict (`base_state`) into each model. This keeps the comparison fair.
- **Architecture**: 4 conv layers with 3 max-pooling layers, then a small MLP head. Output layer is linear (softmax absorbed by `CrossEntropyLoss`).
- **Early stopping**: Each model stops when its own training error ≤ 25%, matching the assignment. If a model never reaches this threshold, it will train up to `max_epochs` and notify you.
- **Speed differences**: ReLU typically trains faster (fewer epochs to a given error) than Tanh. Your exact curves will depend on hardware, batch size, LR schedule, and data augmentations.
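- **GPU**: Strongly recommended. Per-epoch times depend on the hardware, so the second figure is only meaningful when both models are trained on the same machine and device; the absolute values will differ if you switch (e.g., to CPU), and that is expected.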
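- **Shaping your curves**: If your Tanh line decays too quickly (or ReLU too slowly), try lowering the learning rate for Tanh or slightly increasing it for ReLU, and/or increasing `max_epochs`. Note that the run cell passes one shared `learning_rate` to both models, so per-model rates require passing a different `lr` to each `train_until_25pct` call — and the two runs then no longer share identical hyperparameters. Removing data augmentation can also make the difference more pronounced (but may change the epoch count needed to hit 25% error).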
