In [1]:
import argparse
import math
import random
import os
from collections import OrderedDict
from typing import Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.backends.cudnn as cudnn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

# -----------------------
# Reproducibility helpers
# -----------------------
def set_seed(seed: int = 42):
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    cudnn.deterministic = False
    cudnn.benchmark = True  # faster on GPUs for conv nets

# -----------------------
# Data
# -----------------------
def get_cifar10_loaders(batch_size: int = 128, num_workers: int = 4) -> Tuple[DataLoader, DataLoader]:
    mean = (0.4914, 0.4822, 0.4465)
    std  = (0.2470, 0.2435, 0.2616)

    train_tf = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean, std),
    ])
    test_tf = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean, std),
    ])

    train_ds = datasets.CIFAR10(root="./data", train=True, download=True, transform=train_tf)
    test_ds  = datasets.CIFAR10(root="./data", train=False, download=True, transform=test_tf)

    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True,  num_workers=num_workers, pin_memory=True)
    test_loader  = DataLoader(test_ds,  batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True)
    return train_loader, test_loader

In [2]:
# -----------------------
# Train / Eval
# -----------------------
@torch.no_grad()
def evaluate(model, loader, device):
    model.eval()
    total, correct, running_loss = 0, 0, 0.0
    criterion = nn.CrossEntropyLoss()
    for images, labels in loader:
        images, labels = images.to(device, non_blocking=True), labels.to(device, non_blocking=True)
        outputs = model(images)
        loss = criterion(outputs, labels)
        running_loss += loss.item() * images.size(0)
        preds = outputs.argmax(dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)
    return running_loss / total, correct / total

def train_one_epoch(model, loader, optimizer, device):
    model.train()
    criterion = nn.CrossEntropyLoss()
    running_loss, correct, total = 0.0, 0, 0
    for images, labels in loader:
        images, labels = images.to(device, non_blocking=True), labels.to(device, non_blocking=True)

        optimizer.zero_grad(set_to_none=True)
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * images.size(0)
        preds = outputs.argmax(dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

    return running_loss / total, correct / total

In [3]:
def train_and_evaluate(model_name, num_classes=10):
  if model_name == 'alexnet':
    model = AlexNetCIFAR()
  elif model_name == 'vggnet':
    model = VGGNetCIFAR()
  elif model_name == 'vggnet_batch_norm':
    model = VGGNetBNCIFAR()
  elif model_name == 'deep_vggnet':
    model = DeepVGGNetCIFAR()
  elif model_name == 'resnet':
    model = ResNetCIFAR()
  else:
    model = ResNetV2CIFAR()

  model.to(device)
  optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

  best_acc = 0.0
  for epoch in range(1, epochs + 1):
      train_loss, train_acc = train_one_epoch(model, train_loader, optimizer, device)
      val_loss,  val_acc  = evaluate(model, test_loader, device)

      if val_acc > best_acc:
          best_acc = val_acc
          os.makedirs("checkpoints", exist_ok=True)
          torch.save({"model": model.state_dict(),
                      "epoch": epoch,
                      "acc": best_acc
                      },
                      f"checkpoints/{model_name}_best.pt")

      print(f"Epoch {epoch:02d}/{epochs} | "
            f"Train Loss {train_loss:.4f} Acc {train_acc*100:.2f}% | "
            f"Val Loss {val_loss:.4f} Acc {val_acc*100:.2f}% | "
            f"Best Val Acc {best_acc*100:.2f}%")

  # Final evaluation
  test_loss, test_acc = evaluate(model, test_loader, device)
  print(f"\nFinal {model_name.upper()} Test Accuracy: {test_acc*100:.2f}% (loss {test_loss:.4f})")

In [4]:
class ActStats:
    "Testing for the dying relu problem"
    def __init__(self, model):
        self.handles = []
        self.stats = OrderedDict()
        idx = 0
        for name, m in model.named_modules():
            # focus on conv/linear/activation outputs (skip containers)
            if isinstance(m, (torch.nn.Conv2d, torch.nn.Linear, torch.nn.ReLU)):
                key = f"{name or 'model'}.{idx}"
                idx += 1
                def _mk_hook(k):
                    def hook(_, __, out):
                        with torch.no_grad():
                            t = out
                            if isinstance(t, tuple): t = t[0]
                            if t is None: return
                            t = t.detach()
                            s = dict(
                                mean=t.mean().item(),
                                std=t.std().item(),
                                min=t.min().item(),
                                max=t.max().item(),
                            )
                            # % zeros (esp. meaningful after ReLU)
                            s["pct_zero"] = (t == 0).float().mean().item()
                            self.stats.setdefault(k, s)
                    return hook
                self.handles.append(m.register_forward_hook(_mk_hook(key)))

    def close(self):
        for h in self.handles: h.remove()

def run_activation_probe(model, batch, device):
    model.eval()
    x, y = batch
    x, y = x.to(device, non_blocking=True), y.to(device, non_blocking=True)
    probe = ActStats(model)
    with torch.no_grad():
        _ = model(x)
    probe.close()
    # Pretty print a few key stats
    print("=== Activation stats (mean/std and % zeros) ===")
    for k, s in list(probe.stats.items()):
        print(f"{k:30s} mean={s['mean']:+.3f} std={s['std']:.3f} "
              f"min={s['min']:+.3f} max={s['max']:+.3f} %zero={100*s['pct_zero']:.1f}%")

In [5]:
def grad_flow_probe(model, batch, device, criterion=torch.nn.CrossEntropyLoss()):
    "Testing for vanishing and exploding gradients"
    model.train()
    (x, y) = batch
    x, y = x.to(device, non_blocking=True), y.to(device, non_blocking=True)

    # Zero grads and forward/backward once
    for p in model.parameters():
        if p.grad is not None: p.grad.zero_()
    out = model(x)
    loss = criterion(out, y)
    loss.backward()

    # Collect grad norms and grad-to-weight ratios
    rows = []
    for name, p in model.named_parameters():
        if p.grad is None or not p.requires_grad or not p.is_floating_point():
            continue
        g = p.grad.detach()
        w = p.detach()
        grad_norm = g.norm().item()
        weight_norm = w.norm().item() + 1e-12
        ratio = grad_norm / weight_norm
        rows.append((name, grad_norm, weight_norm, ratio))

    print("=== Gradient flow (per-parameter tensor) ===")
    for (name, g, w, r) in rows:
        print(f"{name:40s} |grad|={g:.3e} |w|={w:.3e} |grad|/|w|={r:.3e}")

    # Heuristic: if early-layer grad norms/ratios are orders of magnitude smaller
    # than late layers → vanishing through depth.


# Model 1: AlexNet

With advances in increased data collection with the rise of the internet and availability of GPUs, AlexNet became the state of the art for image recognition in 2012 paving the way for neural networks in many facets of computer vision

In [4]:
class AlexNetCIFAR(nn.Module):
    """
    AlexNet adapted for 32x32 inputs:
    - Use 3x3 convs (stride 1) instead of 11x11/5x5
    - Slightly reduced channels to fit CIFAR-10 scale
    """
    def __init__(self, num_classes: int = 10, dropout: float = 0.5):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=7, stride=1, padding=1),  # 32x32 -> 32x32
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),                  # 32 -> 16

            nn.Conv2d(64, 192, kernel_size=5, padding=1),           # 16 -> 16
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),                  # 16 -> 8

            nn.Conv2d(192, 384, kernel_size=5, padding=1),          # 8 -> 8
            nn.ReLU(inplace=True),

            nn.Conv2d(384, 256, kernel_size=3, padding=1),          # 8 -> 8
            nn.ReLU(inplace=True),

            nn.Conv2d(256, 256, kernel_size=3, padding=1),          # 8 -> 8
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),                  # 8 -> 4
        )

        self.head = nn.Sequential(
            nn.Conv2d(256, num_classes, kernel_size=1),
            nn.AdaptiveAvgPool2d(1),
            nn.Flatten(),
        )

    def forward(self, x):
        x = self.features(x)
        return self.head(x)

In [5]:
model = AlexNetCIFAR()
total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters in AlexNet: {total_params}")

Total number of parameters in AlexNet: 3638090


In [6]:
epochs = 10
batch_size = 128
lr = 0.001
weight_decay = 5e-4
num_workers = 2
seed = 42

In [7]:
set_seed(seed)
device = "cuda" if torch.cuda.is_available() else "cpu"
train_loader, test_loader = get_cifar10_loaders(batch_size=batch_size, num_workers=num_workers)

100%|██████████| 170M/170M [00:02<00:00, 70.4MB/s]


In [8]:
train_and_evaluate(model_name='alexnet')

Epoch 01/10 | Train Loss 1.8971 Acc 27.16% | Val Loss 1.5593 Acc 41.99% | Best Val Acc 41.99%
Epoch 02/10 | Train Loss 1.5592 Acc 42.25% | Val Loss 1.4078 Acc 48.03% | Best Val Acc 48.03%
Epoch 03/10 | Train Loss 1.4273 Acc 48.07% | Val Loss 1.2888 Acc 53.55% | Best Val Acc 53.55%
Epoch 04/10 | Train Loss 1.3284 Acc 52.38% | Val Loss 1.2618 Acc 54.31% | Best Val Acc 54.31%
Epoch 05/10 | Train Loss 1.2729 Acc 54.29% | Val Loss 1.1868 Acc 57.33% | Best Val Acc 57.33%
Epoch 06/10 | Train Loss 1.2088 Acc 56.90% | Val Loss 1.1726 Acc 58.25% | Best Val Acc 58.25%
Epoch 07/10 | Train Loss 1.1645 Acc 58.86% | Val Loss 1.0700 Acc 61.78% | Best Val Acc 61.78%
Epoch 08/10 | Train Loss 1.1129 Acc 60.63% | Val Loss 1.1176 Acc 61.14% | Best Val Acc 61.78%
Epoch 09/10 | Train Loss 1.0771 Acc 62.18% | Val Loss 1.0427 Acc 63.53% | Best Val Acc 63.53%
Epoch 10/10 | Train Loss 1.0430 Acc 63.50% | Val Loss 0.9926 Acc 64.89% | Best Val Acc 64.89%

Final ALEXNET Test Accuracy: 64.89% (loss 0.9926)


# Model 2: VGGNet (without Batch Normalization)

Can we improve accuracy by going deeper? Let's replace the large convolutions of alexnet with multiple 3x3 convolutions. Their receptive fields are equivalent.

In [14]:
class VGGNetCIFAR(nn.Module):
    """
    Replace larger convolutions in alexnet with 3x3 convolutions.
    """
    def __init__(self, num_classes: int = 10, dropout: float = 0.5):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 8, kernel_size=3, stride=1, padding=1),  # 32x32 -> 32x32
            nn.ReLU(inplace=True),
            nn.Conv2d(8, 16, kernel_size=3, stride=1, padding=1),  # 32x32 -> 32x32
            nn.ReLU(inplace=True),
            nn.Conv2d(16, 64, kernel_size=3, stride=1, padding=1),  # 32x32 -> 32x32
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),                  # 32 -> 16

            nn.Conv2d(64, 32, kernel_size=3, padding=1),           # 16 -> 16
            nn.ReLU(inplace=True),
            nn.Conv2d(32, 192, kernel_size=3, padding=1),           # 16 -> 16
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),                  # 16 -> 8

            nn.Conv2d(192, 64, kernel_size=3, padding=1),          # 8 -> 8
            nn.ReLU(inplace=True),
            nn.Conv2d(64, 384, kernel_size=3, padding=1),          # 8 -> 8
            nn.ReLU(inplace=True),

            nn.Conv2d(384, 256, kernel_size=3, padding=1),          # 8 -> 8
            nn.ReLU(inplace=True),

            nn.Conv2d(256, 256, kernel_size=3, padding=1),          # 8 -> 8
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),                  # 8 -> 4
        )

        self.head = nn.Sequential(
            nn.Conv2d(256, num_classes, kernel_size=1),
            nn.AdaptiveAvgPool2d(1),
            nn.Flatten(),
        )

    def forward(self, x):
        x = self.features(x)
        return self.head(x)

In [15]:
model = VGGNetCIFAR()
total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters in VGGNet: {total_params}")

Total number of parameters in VGGNet: 1894490


In [16]:
epochs = 10
batch_size = 128
lr = 0.001
weight_decay = 5e-4
num_workers = 2
seed = 42

In [14]:
set_seed(seed)
device = "cuda" if torch.cuda.is_available() else "cpu"
train_loader, test_loader = get_cifar10_loaders(batch_size=batch_size, num_workers=num_workers)

In [10]:
train_and_evaluate(model_name='vggnet')

Epoch 01/10 | Train Loss 2.3030 Acc 9.75% | Val Loss 2.3026 Acc 10.00% | Best Val Acc 10.00%
Epoch 02/10 | Train Loss 2.3028 Acc 9.97% | Val Loss 2.3026 Acc 10.00% | Best Val Acc 10.00%
Epoch 03/10 | Train Loss 2.3028 Acc 9.82% | Val Loss 2.3026 Acc 10.00% | Best Val Acc 10.00%
Epoch 04/10 | Train Loss 2.3027 Acc 9.91% | Val Loss 2.3026 Acc 10.00% | Best Val Acc 10.00%
Epoch 05/10 | Train Loss 2.3027 Acc 9.75% | Val Loss 2.3026 Acc 10.00% | Best Val Acc 10.00%
Epoch 06/10 | Train Loss 2.3027 Acc 9.87% | Val Loss 2.3026 Acc 10.00% | Best Val Acc 10.00%
Epoch 07/10 | Train Loss 2.3027 Acc 9.95% | Val Loss 2.3026 Acc 10.00% | Best Val Acc 10.00%
Epoch 08/10 | Train Loss 2.3027 Acc 9.72% | Val Loss 2.3026 Acc 10.00% | Best Val Acc 10.00%
Epoch 09/10 | Train Loss 2.3027 Acc 9.87% | Val Loss 2.3026 Acc 10.00% | Best Val Acc 10.00%
Epoch 10/10 | Train Loss 2.3027 Acc 9.76% | Val Loss 2.3026 Acc 10.00% | Best Val Acc 10.00%

Final VGGNET Test Accuracy: 10.00% (loss 2.3026)


Looks like the network is not learning. We can try diagnosing the issue

In [17]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model = VGGNetCIFAR().to(device)
batch = next(iter(train_loader))

In [18]:
# activation probe (look for %zero ~100% or tiny stds in early layers)
run_activation_probe(model, batch, device)

=== Activation stats (mean/std and % zeros) ===
features.0.0                   mean=-0.048 std=0.546 min=-2.904 max=+2.635 %zero=0.0%
features.1.1                   mean=+0.182 std=0.282 min=+0.000 max=+2.635 %zero=49.6%
features.2.2                   mean=+0.007 std=0.200 min=-1.052 max=+1.347 %zero=0.0%
features.3.3                   mean=+0.075 std=0.132 min=+0.000 max=+1.347 %zero=49.8%
features.4.4                   mean=+0.019 std=0.096 min=-0.692 max=+0.585 %zero=0.0%
features.5.5                   mean=+0.047 std=0.062 min=+0.000 max=+0.585 %zero=40.5%
features.7.6                   mean=+0.007 std=0.057 min=-0.303 max=+0.324 %zero=0.0%
features.8.7                   mean=+0.025 std=0.037 min=+0.000 max=+0.324 %zero=47.4%
features.9.8                   mean=+0.003 std=0.042 min=-0.173 max=+0.188 %zero=0.0%
features.10.9                  mean=+0.019 std=0.025 min=+0.000 max=+0.188 %zero=48.4%
features.12.10                 mean=-0.001 std=0.022 min=-0.088 max=+0.085 %zero=0.0%
f

In [19]:
# gradient flow probe (look for tiny |grad|/|w| early vs late)
grad_flow_probe(model, batch, device)

=== Gradient flow (per-parameter tensor) ===
features.0.weight                        |grad|=4.891e-05 |w|=1.593e+00 |grad|/|w|=3.071e-05
features.0.bias                          |grad|=1.117e-05 |w|=3.777e-01 |grad|/|w|=2.957e-05
features.2.weight                        |grad|=7.280e-05 |w|=2.314e+00 |grad|/|w|=3.146e-05
features.2.bias                          |grad|=2.601e-05 |w|=2.399e-01 |grad|/|w|=1.084e-04
features.4.weight                        |grad|=1.725e-04 |w|=4.651e+00 |grad|/|w|=3.709e-05
features.4.bias                          |grad|=8.614e-05 |w|=3.962e-01 |grad|/|w|=2.174e-04
features.7.weight                        |grad|=4.305e-04 |w|=3.258e+00 |grad|/|w|=1.321e-04
features.7.bias                          |grad|=1.827e-04 |w|=1.272e-01 |grad|/|w|=1.437e-03
features.9.weight                        |grad|=3.993e-04 |w|=8.013e+00 |grad|/|w|=4.983e-05
features.9.bias                          |grad|=4.856e-04 |w|=4.828e-01 |grad|/|w|=1.006e-03
features.12.weight       

Very clearly, the gradients in the 1st part of the network are much smaller than the later parts of the network. This indicates a **vanishing gradient problem**

# Model 3: VGGNet + Batch Normalization (to reduce vanishing gradients)

We can address this classic case of vanishing gradients with Batch normalization after every convolution and before the activations. This will normalize the activations of each layer to have a 0 mean and unit variance, improving gradient flow.

In [20]:
class VGGNetBNCIFAR(nn.Module):
    """
    Add Batch Normalization
    """
    def __init__(self, num_classes: int = 10, dropout: float = 0.5):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 8, kernel_size=3, stride=1, padding=1),  # 32x32 -> 32x32
            nn.BatchNorm2d(8),
            nn.ReLU(inplace=True),
            nn.Conv2d(8, 16, kernel_size=3, stride=1, padding=1),  # 32x32 -> 32x32
            nn.BatchNorm2d(16),
            nn.ReLU(inplace=True),
            nn.Conv2d(16, 64, kernel_size=3, stride=1, padding=1),  # 32x32 -> 32x32
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),                  # 32 -> 16

            nn.Conv2d(64, 32, kernel_size=3, padding=1),           # 16 -> 16
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True),
            nn.Conv2d(32, 192, kernel_size=3, padding=1),           # 16 -> 16
            nn.BatchNorm2d(192),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),                  # 16 -> 8

            nn.Conv2d(192, 64, kernel_size=3, padding=1),          # 8 -> 8
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.Conv2d(64, 384, kernel_size=3, padding=1),          # 8 -> 8
            nn.BatchNorm2d(384),
            nn.ReLU(inplace=True),

            nn.Conv2d(384, 256, kernel_size=3, padding=1),          # 8 -> 8
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),

            nn.Conv2d(256, 256, kernel_size=3, padding=1),          # 8 -> 8
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),                  # 8 -> 4
        )

        self.head = nn.Sequential(
            nn.Conv2d(256, num_classes, kernel_size=1),
            nn.AdaptiveAvgPool2d(1),
            nn.Flatten(),
        )

    def forward(self, x):
        x = self.features(x)
        return self.head(x)

In [17]:
model = VGGNetBNCIFAR()
total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters in VGGNet with BN: {total_params}")

Total number of parameters in VGGNet with BN: 1896522


In [18]:
epochs = 10
batch_size = 128
lr = 0.001
weight_decay = 5e-4
num_workers = 2
seed = 42

In [19]:
set_seed(seed)
device = "cuda" if torch.cuda.is_available() else "cpu"
train_loader, test_loader = get_cifar10_loaders(batch_size=batch_size, num_workers=num_workers)

In [20]:
train_and_evaluate(model_name='vggnet_batch_norm')

Epoch 01/10 | Train Loss 1.4771 Acc 44.95% | Val Loss 1.4091 Acc 50.13% | Best Val Acc 50.13%
Epoch 02/10 | Train Loss 1.0121 Acc 63.93% | Val Loss 1.1309 Acc 61.61% | Best Val Acc 61.61%
Epoch 03/10 | Train Loss 0.8328 Acc 70.62% | Val Loss 0.9261 Acc 68.76% | Best Val Acc 68.76%
Epoch 04/10 | Train Loss 0.7151 Acc 75.21% | Val Loss 0.8410 Acc 71.72% | Best Val Acc 71.72%
Epoch 05/10 | Train Loss 0.6426 Acc 77.78% | Val Loss 0.8647 Acc 71.61% | Best Val Acc 71.72%
Epoch 06/10 | Train Loss 0.5923 Acc 79.68% | Val Loss 0.7191 Acc 75.88% | Best Val Acc 75.88%
Epoch 07/10 | Train Loss 0.5524 Acc 81.05% | Val Loss 0.6249 Acc 78.82% | Best Val Acc 78.82%
Epoch 08/10 | Train Loss 0.5196 Acc 81.95% | Val Loss 0.6250 Acc 79.32% | Best Val Acc 79.32%
Epoch 09/10 | Train Loss 0.4972 Acc 83.06% | Val Loss 0.9024 Acc 71.75% | Best Val Acc 79.32%
Epoch 10/10 | Train Loss 0.4745 Acc 83.67% | Val Loss 0.5931 Acc 80.32% | Best Val Acc 80.32%

Final VGGNET_BATCH_NORM Test Accuracy: 80.32% (loss 0.5931)

In [21]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model = VGGNetBNCIFAR().to(device)
batch = next(iter(train_loader))

In [23]:
# activation probe (look for %zero ~100% or tiny stds in early layers)
run_activation_probe(model, batch, device)

=== Activation stats (mean/std and % zeros) ===
features.0.0                   mean=+0.016 std=0.731 min=-3.320 max=+3.006 %zero=0.0%
features.2.1                   mean=+0.282 std=0.446 min=+0.000 max=+3.096 %zero=49.7%
features.3.2                   mean=+0.019 std=0.331 min=-1.727 max=+2.115 %zero=0.0%
features.5.3                   mean=+0.127 std=0.195 min=+0.000 max=+2.148 %zero=45.9%
features.6.4                   mean=-0.008 std=0.144 min=-0.874 max=+0.903 %zero=0.0%
features.8.5                   mean=+0.049 std=0.080 min=+0.000 max=+0.919 %zero=52.9%
features.10.6                  mean=-0.001 std=0.061 min=-0.294 max=+0.291 %zero=0.0%
features.12.7                  mean=+0.025 std=0.035 min=+0.000 max=+0.322 %zero=46.1%
features.13.8                  mean=+0.003 std=0.039 min=-0.185 max=+0.177 %zero=0.0%
features.15.9                  mean=+0.016 std=0.022 min=+0.000 max=+0.154 %zero=47.7%
features.17.10                 mean=-0.001 std=0.022 min=-0.085 max=+0.100 %zero=0.0%
f

In [22]:
# gradient flow probe (look for tiny |grad|/|w| early vs late)
grad_flow_probe(model, batch, device)

=== Gradient flow (per-parameter tensor) ===
features.0.weight                        |grad|=1.104e-01 |w|=1.607e+00 |grad|/|w|=6.872e-02
features.0.bias                          |grad|=6.346e-08 |w|=3.963e-01 |grad|/|w|=1.601e-07
features.1.weight                        |grad|=1.124e-02 |w|=2.828e+00 |grad|/|w|=3.974e-03
features.1.bias                          |grad|=8.962e-03 |w|=1.000e-12 |grad|/|w|=8.962e+09
features.3.weight                        |grad|=2.356e-01 |w|=2.301e+00 |grad|/|w|=1.024e-01
features.3.bias                          |grad|=9.564e-08 |w|=2.729e-01 |grad|/|w|=3.505e-07
features.4.weight                        |grad|=1.491e-02 |w|=4.000e+00 |grad|/|w|=3.727e-03
features.4.bias                          |grad|=1.032e-02 |w|=1.000e-12 |grad|/|w|=1.032e+10
features.6.weight                        |grad|=2.555e-01 |w|=4.602e+00 |grad|/|w|=5.552e-02
features.6.bias                          |grad|=3.757e-08 |w|=4.095e-01 |grad|/|w|=9.174e-08
features.7.weight        

If you notice just the weights, it looks like graidents are of the same magnitude throughout the network. The network is learning.

# Model 4: Adding more layers to VGGNet

Can we go deeper by adding even more layers to the network?

In [24]:
class DeepVGGNetCIFAR(nn.Module):
    """
    Add more layers to create a deeper network
    """
    def __init__(self, num_classes: int = 10, dropout: float = 0.5):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 8, kernel_size=3, stride=1, padding=1),  # 32x32 -> 32x32
            nn.BatchNorm2d(8),
            nn.ReLU(inplace=True),
            nn.Conv2d(8, 16, kernel_size=3, stride=1, padding=1),  # 32x32 -> 32x32
            nn.BatchNorm2d(16),
            nn.ReLU(inplace=True),
            nn.Conv2d(16, 64, kernel_size=3, stride=1, padding=1),  # 32x32 -> 32x32
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),                  # 32 -> 16

            nn.Conv2d(64, 32, kernel_size=3, padding=1),           # 16 -> 16
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True),
            nn.Conv2d(32, 192, kernel_size=3, padding=1),           # 16 -> 16
            nn.BatchNorm2d(192),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),                  # 16 -> 8

            nn.Conv2d(192, 64, kernel_size=3, padding=1),          # 8 -> 8
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.Conv2d(64, 384, kernel_size=3, padding=1),          # 8 -> 8
            nn.BatchNorm2d(384),
            nn.ReLU(inplace=True),


            ###### Add these layers #######
            nn.Conv2d(384, 64, kernel_size=3, padding=1),          # 8 -> 8
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.Conv2d(64, 192, kernel_size=3, padding=1),          # 8 -> 8
            nn.BatchNorm2d(192),
            nn.ReLU(inplace=True),

            nn.Conv2d(192, 64, kernel_size=3, padding=1),          # 8 -> 8
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.Conv2d(64, 384, kernel_size=3, padding=1),          # 8 -> 8
            nn.BatchNorm2d(384),
            nn.ReLU(inplace=True),

            nn.Conv2d(384, 64, kernel_size=3, padding=1),          # 8 -> 8
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.Conv2d(64, 192, kernel_size=3, padding=1),          # 8 -> 8
            nn.BatchNorm2d(192),
            nn.ReLU(inplace=True),

            nn.Conv2d(192, 64, kernel_size=3, padding=1),          # 8 -> 8
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.Conv2d(64, 384, kernel_size=3, padding=1),          # 8 -> 8
            nn.BatchNorm2d(384),
            nn.ReLU(inplace=True),

            ###### End layers added ######

            nn.Conv2d(384, 256, kernel_size=3, padding=1),          # 8 -> 8
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),

            nn.Conv2d(256, 256, kernel_size=3, padding=1),          # 8 -> 8
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),                  # 8 -> 4
        )

        self.head = nn.Sequential(
            nn.Conv2d(256, num_classes, kernel_size=1),
            nn.AdaptiveAvgPool2d(1),
            nn.Flatten(),
        )

    def forward(self, x):
        x = self.features(x)
        return self.head(x)

In [53]:
model = DeepVGGNetCIFAR()
total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters in Deep VGGNet with BN: {total_params}")

Total number of parameters in Deep VGGNet with BN: 3227850


In [54]:
epochs = 10
batch_size = 128
lr = 0.001
weight_decay = 5e-4
num_workers = 2
seed = 42

In [55]:
set_seed(seed)
device = "cuda" if torch.cuda.is_available() else "cpu"
train_loader, test_loader = get_cifar10_loaders(batch_size=batch_size, num_workers=num_workers)

In [56]:
train_and_evaluate(model_name='deep_vggnet')

Epoch 01/10 | Train Loss 1.8102 Acc 29.33% | Val Loss 2.1262 Acc 27.52% | Best Val Acc 27.52%
Epoch 02/10 | Train Loss 1.3589 Acc 49.14% | Val Loss 1.4756 Acc 49.22% | Best Val Acc 49.22%
Epoch 03/10 | Train Loss 1.1065 Acc 60.38% | Val Loss 1.2567 Acc 54.08% | Best Val Acc 54.08%
Epoch 04/10 | Train Loss 0.9466 Acc 66.67% | Val Loss 0.9152 Acc 67.96% | Best Val Acc 67.96%
Epoch 05/10 | Train Loss 0.8455 Acc 70.87% | Val Loss 1.0992 Acc 61.62% | Best Val Acc 67.96%
Epoch 06/10 | Train Loss 0.7703 Acc 73.34% | Val Loss 0.8577 Acc 70.79% | Best Val Acc 70.79%
Epoch 07/10 | Train Loss 0.7051 Acc 75.91% | Val Loss 0.7908 Acc 74.24% | Best Val Acc 74.24%
Epoch 08/10 | Train Loss 0.6508 Acc 77.83% | Val Loss 0.6803 Acc 77.39% | Best Val Acc 77.39%
Epoch 09/10 | Train Loss 0.6088 Acc 79.40% | Val Loss 0.7643 Acc 74.92% | Best Val Acc 77.39%
Epoch 10/10 | Train Loss 0.5745 Acc 80.62% | Val Loss 0.6681 Acc 77.96% | Best Val Acc 77.96%

Final DEEP_VGGNET Test Accuracy: 77.96% (loss 0.6681)


Show training error increase along with test error increase compared to the VGGNetBN (this is not classical overfitting). This is **performance degradation**. You can try increasing layers like this and will notice performance will be worse than theser layers

In [25]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model = DeepVGGNetCIFAR().to(device)
batch = next(iter(train_loader))

In [26]:
# activation probe (look for %zero ~100% or tiny stds in early layers)
run_activation_probe(model, batch, device)

=== Activation stats (mean/std and % zeros) ===
features.0.0                   mean=-0.110 std=0.681 min=-2.968 max=+2.684 %zero=0.0%
features.2.1                   mean=+0.222 std=0.344 min=+0.000 max=+2.684 %zero=55.2%
features.3.2                   mean=-0.031 std=0.241 min=-1.499 max=+0.870 %zero=0.0%
features.5.3                   mean=+0.074 std=0.105 min=+0.000 max=+0.870 %zero=49.3%
features.6.4                   mean=-0.005 std=0.088 min=-0.532 max=+0.457 %zero=0.0%
features.8.5                   mean=+0.033 std=0.050 min=+0.000 max=+0.457 %zero=52.5%
features.10.6                  mean=+0.012 std=0.050 min=-0.238 max=+0.228 %zero=0.0%
features.12.7                  mean=+0.026 std=0.034 min=+0.000 max=+0.228 %zero=38.8%
features.13.8                  mean=+0.001 std=0.040 min=-0.156 max=+0.160 %zero=0.0%
features.15.9                  mean=+0.017 std=0.023 min=+0.000 max=+0.160 %zero=48.1%
features.17.10                 mean=-0.000 std=0.023 min=-0.073 max=+0.060 %zero=0.0%
f

In [27]:
# gradient flow probe (look for tiny |grad|/|w| early vs late)
grad_flow_probe(model, batch, device)

=== Gradient flow (per-parameter tensor) ===
features.0.weight                        |grad|=4.680e-01 |w|=1.614e+00 |grad|/|w|=2.899e-01
features.0.bias                          |grad|=2.945e-07 |w|=3.580e-01 |grad|/|w|=8.228e-07
features.1.weight                        |grad|=4.419e-02 |w|=2.828e+00 |grad|/|w|=1.562e-02
features.1.bias                          |grad|=8.214e-02 |w|=1.000e-12 |grad|/|w|=8.214e+10
features.3.weight                        |grad|=7.819e-01 |w|=2.269e+00 |grad|/|w|=3.445e-01
features.3.bias                          |grad|=4.360e-07 |w|=2.771e-01 |grad|/|w|=1.573e-06
features.4.weight                        |grad|=4.208e-02 |w|=4.000e+00 |grad|/|w|=1.052e-02
features.4.bias                          |grad|=3.909e-02 |w|=1.000e-12 |grad|/|w|=3.909e+10
features.6.weight                        |grad|=7.989e-01 |w|=4.616e+00 |grad|/|w|=1.730e-01
features.6.bias                          |grad|=1.466e-07 |w|=3.364e-01 |grad|/|w|=4.358e-07
features.7.weight        

There doesn't seem to be issues of dying reLU or vanishing gradients.

# Model 5: ResNet: Add residual connections to the same network

In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Minimal residual block; needed so we can do x + F(x) inside a Sequential
class Residual(nn.Module):
    def __init__(self, in_ch, mid_ch, out_ch):
        super().__init__()
        self.conv1 = nn.Conv2d(in_ch, mid_ch, kernel_size=3, padding=1, bias=False)
        self.bn1   = nn.BatchNorm2d(mid_ch)
        self.conv2 = nn.Conv2d(mid_ch, out_ch, kernel_size=3, padding=1, bias=False)
        self.bn2   = nn.BatchNorm2d(out_ch)
        self.proj  = None

        if in_ch != out_ch:
            self.proj = nn.Sequential(
                nn.Conv2d(in_ch, out_ch, kernel_size=1, bias=False),
                nn.BatchNorm2d(out_ch),
            )
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        identity = x
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        if self.proj is not None:
            identity = self.proj(identity)
        out = self.relu(out + identity)
        return out

class ResNetCIFAR(nn.Module):
    def __init__(self, num_classes: int = 10, dropout: float = 0.5):
        super().__init__()

        # Everything (stem + residual stacks + pools) is inside ONE Sequential
        self.features = nn.Sequential(
            # Stem (kept simple like your original)
            nn.Conv2d(3, 8, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(8),
            nn.ReLU(inplace=True),
            nn.Conv2d(8, 16, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(16),
            nn.ReLU(inplace=True),
            nn.Conv2d(16, 64, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),

            # Residual stacks (your channel plan, now with skips)
            Residual(64, 32, 192),
            nn.MaxPool2d(kernel_size=2, stride=2),   # 16 -> 8

            Residual(192, 64, 384),

            # Your “added” layers as residual pairs
            Residual(384, 64, 192),
            Residual(192, 64, 384),
            Residual(384, 64, 192),
            Residual(192, 64, 384),

            # Tail before final stage
            Residual(384, 256, 256),
            nn.MaxPool2d(kernel_size=2, stride=2),   # 8 -> 4
        )

        # Head kept identical to your style
        self.head = nn.Sequential(
            nn.Conv2d(256, num_classes, kernel_size=1, bias=True),
            nn.AdaptiveAvgPool2d(1),
            nn.Flatten(),
        )

        # Kaiming init + BN defaults
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_in', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.zeros_(m.bias)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.ones_(m.weight)
                nn.init.zeros_(m.bias)

    def forward(self, x):
        x = self.features(x)
        x = self.head(x)
        return x


In [42]:
model = ResNetCIFAR()
total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters in Deep ResNet: {total_params}")

Total number of parameters in Deep ResNet: 3727474


In [43]:
epochs = 10
batch_size = 128
lr = 0.001
weight_decay = 5e-4
num_workers = 2
seed = 42

In [44]:
set_seed(seed)
device = "cuda" if torch.cuda.is_available() else "cpu"
train_loader, test_loader = get_cifar10_loaders(batch_size=batch_size, num_workers=num_workers)

In [47]:
train_and_evaluate(model_name='resnet')

Epoch 01/10 | Train Loss 1.3642 Acc 50.67% | Val Loss 1.2370 Acc 56.70% | Best Val Acc 56.70%
Epoch 02/10 | Train Loss 0.9287 Acc 66.86% | Val Loss 0.9092 Acc 68.70% | Best Val Acc 68.70%
Epoch 03/10 | Train Loss 0.7638 Acc 73.24% | Val Loss 1.1951 Acc 62.42% | Best Val Acc 68.70%
Epoch 04/10 | Train Loss 0.6807 Acc 76.34% | Val Loss 0.7045 Acc 76.10% | Best Val Acc 76.10%
Epoch 05/10 | Train Loss 0.6215 Acc 78.54% | Val Loss 0.6758 Acc 77.56% | Best Val Acc 77.56%
Epoch 06/10 | Train Loss 0.5808 Acc 80.12% | Val Loss 0.7190 Acc 76.13% | Best Val Acc 77.56%
Epoch 07/10 | Train Loss 0.5456 Acc 81.40% | Val Loss 0.7783 Acc 74.31% | Best Val Acc 77.56%
Epoch 08/10 | Train Loss 0.5182 Acc 82.49% | Val Loss 0.7440 Acc 75.97% | Best Val Acc 77.56%
Epoch 09/10 | Train Loss 0.4967 Acc 82.91% | Val Loss 0.6441 Acc 78.62% | Best Val Acc 78.62%
Epoch 10/10 | Train Loss 0.4654 Acc 84.12% | Val Loss 0.5578 Acc 81.10% | Best Val Acc 81.10%

Final RESNET Test Accuracy: 81.10% (loss 0.5578)


In [39]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model = ResNetCIFAR().to(device)
batch = next(iter(train_loader))

In [40]:
# activation probe (look for %zero ~100% or tiny stds in early layers)
run_activation_probe(model, batch, device)

=== Activation stats (mean/std and % zeros) ===
features.0.0                   mean=+0.097 std=1.260 min=-9.333 max=+9.075 %zero=0.0%
features.2.1                   mean=+0.496 std=0.846 min=+0.000 max=+9.075 %zero=48.6%
features.3.2                   mean=-0.210 std=1.283 min=-12.405 max=+7.836 %zero=0.0%
features.5.3                   mean=+0.323 std=0.613 min=+0.000 max=+7.836 %zero=54.7%
features.6.4                   mean=+0.076 std=0.933 min=-7.237 max=+8.217 %zero=0.0%
features.8.5                   mean=+0.376 std=0.593 min=+0.000 max=+8.217 %zero=45.7%
features.9.conv1.6             mean=-0.073 std=0.947 min=-6.405 max=+6.142 %zero=0.0%
features.9.relu.8              mean=+0.311 std=0.527 min=+0.000 max=+6.142 %zero=52.9%
features.9.conv2.7             mean=-0.058 std=0.794 min=-6.522 max=+6.761 %zero=0.0%
features.11.conv1.9            mean=+0.287 std=1.490 min=-8.358 max=+12.045 %zero=0.0%
features.11.relu.12            mean=+0.689 std=1.088 min=+0.000 max=+12.045 %zero=44.3

In [41]:
# gradient flow probe (look for tiny |grad|/|w| early vs late)
grad_flow_probe(model, batch, device)

=== Gradient flow (per-parameter tensor) ===
features.0.weight                        |grad|=3.167e+00 |w|=4.164e+00 |grad|/|w|=7.604e-01
features.1.weight                        |grad|=3.899e-01 |w|=2.828e+00 |grad|/|w|=1.378e-01
features.1.bias                          |grad|=3.612e-01 |w|=1.000e-12 |grad|/|w|=3.612e+11
features.3.weight                        |grad|=2.824e+00 |w|=5.721e+00 |grad|/|w|=4.936e-01
features.4.weight                        |grad|=4.471e-01 |w|=4.000e+00 |grad|/|w|=1.118e-01
features.4.bias                          |grad|=4.376e-01 |w|=1.000e-12 |grad|/|w|=4.376e+11
features.6.weight                        |grad|=3.050e+00 |w|=1.150e+01 |grad|/|w|=2.651e-01
features.7.weight                        |grad|=2.642e-01 |w|=8.000e+00 |grad|/|w|=3.302e-02
features.7.bias                          |grad|=2.204e-01 |w|=1.000e-12 |grad|/|w|=2.204e+11
features.9.conv1.weight                  |grad|=4.437e+00 |w|=5.589e+00 |grad|/|w|=7.938e-01
features.9.bn1.weight    

# Model 6: Deeper than deep

In [6]:
class ResNetV2CIFAR(nn.Module):
    def __init__(self, num_classes: int = 10, dropout: float = 0.5):
        super().__init__()

        # Everything (stem + residual stacks + pools) is inside ONE Sequential
        self.features = nn.Sequential(
            # Stem (kept simple like your original)
            nn.Conv2d(3, 8, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(8),
            nn.ReLU(inplace=True),
            nn.Conv2d(8, 16, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(16),
            nn.ReLU(inplace=True),
            nn.Conv2d(16, 64, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),

            # Residual stacks (your channel plan, now with skips)
            Residual(64, 16, 64),          # (was 8->16->64 earlier; now residual at 64)
            nn.MaxPool2d(kernel_size=2, stride=2),   # 32 -> 16

            Residual(64, 32, 192),
            nn.MaxPool2d(kernel_size=2, stride=2),   # 16 -> 8

            Residual(192, 64, 384),

            # “added” layers as residual pairs
            Residual(384, 64, 192),
            Residual(192, 64, 384),
            Residual(384, 64, 192),
            Residual(192, 64, 384),


            # Added layers
            Residual(384, 64, 384),
            Residual(384, 64, 384),
            Residual(384, 64, 384),
            Residual(384, 64, 384),
            Residual(384, 64, 384),

            # Tail before final stage
            Residual(384, 256, 256),
            nn.MaxPool2d(kernel_size=2, stride=2),   # 8 -> 4
        )

        # Head kept identical to your style
        self.head = nn.Sequential(
            nn.Conv2d(256, num_classes, kernel_size=1, bias=True),
            nn.AdaptiveAvgPool2d(1),
            nn.Flatten(),
        )

        # Kaiming init + BN defaults
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_in', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.zeros_(m.bias)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.ones_(m.weight)
                nn.init.zeros_(m.bias)

    def forward(self, x):
        x = self.features(x)
        x = self.head(x)
        return x

In [9]:
model = ResNetV2CIFAR()
total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters in Deep ResNet: {total_params}")

Total number of parameters in Deep ResNet: 5943794


In [10]:
epochs = 10
batch_size = 128
lr = 0.001
weight_decay = 5e-4
num_workers = 2
seed = 42

In [11]:
set_seed(seed)
device = "cuda" if torch.cuda.is_available() else "cpu"
train_loader, test_loader = get_cifar10_loaders(batch_size=batch_size, num_workers=num_workers)

100%|██████████| 170M/170M [00:04<00:00, 38.5MB/s]


In [13]:
train_and_evaluate(model_name='resnet_v2')

Epoch 01/10 | Train Loss 1.4600 Acc 47.30% | Val Loss 1.3920 Acc 52.20% | Best Val Acc 52.20%
Epoch 02/10 | Train Loss 0.9947 Acc 64.98% | Val Loss 1.0860 Acc 61.74% | Best Val Acc 61.74%
Epoch 03/10 | Train Loss 0.8097 Acc 71.61% | Val Loss 0.9128 Acc 69.12% | Best Val Acc 69.12%
Epoch 04/10 | Train Loss 0.7061 Acc 75.67% | Val Loss 0.9715 Acc 67.66% | Best Val Acc 69.12%
Epoch 05/10 | Train Loss 0.6398 Acc 77.95% | Val Loss 0.6740 Acc 77.13% | Best Val Acc 77.13%
Epoch 06/10 | Train Loss 0.5836 Acc 80.00% | Val Loss 0.6718 Acc 77.28% | Best Val Acc 77.28%
Epoch 07/10 | Train Loss 0.5486 Acc 81.44% | Val Loss 0.6752 Acc 77.96% | Best Val Acc 77.96%
Epoch 08/10 | Train Loss 0.5161 Acc 82.36% | Val Loss 0.6665 Acc 77.77% | Best Val Acc 77.96%
Epoch 09/10 | Train Loss 0.4832 Acc 83.56% | Val Loss 0.5690 Acc 81.13% | Best Val Acc 81.13%
Epoch 10/10 | Train Loss 0.4620 Acc 84.06% | Val Loss 0.5663 Acc 81.27% | Best Val Acc 81.27%

Final RESNET_V2 Test Accuracy: 81.27% (loss 0.5663)


# Conclusion

- Making a neural network deep can lead to issues like the vanishing gradient problem where the network does not learn
- This can be solved with Batch Normalization.
- But as we continue to add layers to the network, there is another problem that arises: performance degradation.
- This is strange as in theory, deeper networks should be able to mimic shallow networks with some layers acting as a passthrough / identity. This however does not happen in practice as it is difficult for Conv + BN + ReLU to emulate a pass through.
- To solve this, we change the structure of the network by adding residual connections. So the resnet block is H(x) = ReLU(x + F(x)). And now, H(x) can more easily mimic a passthrough. This means F(x) which is the Conv + BN + Pooling now has to be 0. This can be done in multiple ways (like having all convolution parameters be 0, batch norm parameters learn to be 0)
- This should ensure in theory, a deeper network is at least as performance as a shallow network. We saw this true in practice with Model 6.