# MNIST CNN Improvement Workshop - Starter Notebook
## From Lesson 3.7 Neural Network & Deep Learning

This notebook provides the foundation code for implementing CNN improvements on MNIST digit classification.

**Baseline Goal:** Start with MLP achieving ~97% accuracy  
**Workshop Goal:** Achieve 99%+ accuracy using CNN techniques

Please use the Conda DL environment with PyTorch installed.

## 1. Import Required Libraries

In [1]:
# Import libraries for deep learning and data handling
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import time
import numpy as np

# Reproducibility: seed every RNG from a single constant so runs are comparable
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Check for CUDA availability and use GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')
if torch.cuda.is_available():
    print(f'CUDA device name: {torch.cuda.get_device_name()}')
    # Query the selected device rather than hard-coding index 0, so the memory
    # figure describes the same GPU as the name printed above.
    print(f'CUDA memory: {torch.cuda.get_device_properties(device).total_memory / 1024**3:.1f} GB')

Using device: cuda
CUDA device name: Tesla T4
CUDA memory: 14.7 GB


## 2. Load MNIST Dataset

In [2]:
# Preprocessing pipelines: the training pipeline augments, the test pipeline does not
train_steps = [
    transforms.RandomRotation(10),  # Mild rotation for digits (no horizontal flip for 6/9)
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),  # MNIST mean and std
]
test_steps = [
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
]
transform_train = transforms.Compose(train_steps)
transform_test = transforms.Compose(test_steps)

# Training split (60,000 images), shuffled and augmented
trainset = datasets.MNIST(
    '~/.pytorch/MNIST_data/',
    train=True,
    download=True,
    transform=transform_train,
)
trainloader = DataLoader(trainset, shuffle=True, batch_size=64)

# Test split (10,000 images), fixed order and no augmentation
testset = datasets.MNIST(
    '~/.pytorch/MNIST_data/',
    train=False,
    download=True,
    transform=transform_test,
)
testloader = DataLoader(testset, shuffle=False, batch_size=64)

print(f'Training samples: {len(trainset)}')
print(f'Test samples: {len(testset)}')

100%|██████████| 9.91M/9.91M [00:00<00:00, 16.1MB/s]
100%|██████████| 28.9k/28.9k [00:00<00:00, 483kB/s]
100%|██████████| 1.65M/1.65M [00:00<00:00, 4.45MB/s]
100%|██████████| 4.54k/4.54k [00:00<00:00, 9.52MB/s]

Training samples: 60000
Test samples: 10000





## 3. Helper Functions for Training and Evaluation

In [3]:
def train_model(model, trainloader, criterion, optimizer, scheduler=None, epochs=5):
    """Train ``model`` in place and return the mean training loss of each epoch.

    Args:
        model: ``nn.Module`` to optimise; it is switched to training mode.
        trainloader: Iterable yielding ``(images, labels)`` tensor batches.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: Optimizer that updates ``model``'s parameters.
        scheduler: Optional LR scheduler, stepped once at the end of every epoch.
        epochs: Number of passes over ``trainloader``.

    Returns:
        list[float]: Average per-batch loss for each epoch, in order.

    Raises:
        ValueError: If an epoch sees no batches, so no average loss exists.
    """
    # Send batches to wherever the model lives instead of relying on the
    # notebook-level ``device``; that global is only a fallback for a model
    # without parameters.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else device

    model.train()
    train_losses = []

    for epoch in range(epochs):
        running_loss = 0.0
        num_batches = 0  # counted here so loaders without __len__ also work
        for images, labels in trainloader:
            images, labels = images.to(target_device), labels.to(target_device)

            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            num_batches += 1

        if num_batches == 0:
            raise ValueError("trainloader yielded no batches; cannot compute an average loss")

        epoch_loss = running_loss / num_batches
        train_losses.append(epoch_loss)
        print(f'Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.4f}')

        # Step after the optimizer updates of this epoch
        if scheduler is not None:
            scheduler.step()

    return train_losses

In [4]:
def evaluate_model(model, testloader):
    """Compute top-1 classification accuracy of ``model`` over ``testloader``.

    The model is switched to evaluation mode and is left in that mode.

    Args:
        model: ``nn.Module`` mapping an image batch to per-class scores.
        testloader: Iterable yielding ``(images, labels)`` tensor batches.

    Returns:
        float: Accuracy as a percentage in the range 0-100.

    Raises:
        ValueError: If ``testloader`` yields no samples.
    """
    # Evaluate on the model's own device; the notebook-level ``device`` is only
    # a fallback for a model without parameters.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else device

    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for images, labels in testloader:
            images, labels = images.to(target_device), labels.to(target_device)
            outputs = model(images)
            # Inside no_grad the outputs carry no graph, so ``.data`` is unnecessary
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    if total == 0:
        raise ValueError("testloader yielded no samples; accuracy is undefined")

    return 100 * correct / total

In [5]:
def count_parameters(model):
    """Return how many scalar weights in ``model`` are trainable.

    Only parameters with ``requires_grad`` set are counted; frozen ones are skipped.
    """
    trainable_total = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_total += param.numel()
    return trainable_total

def validate_model_architecture(model):
    """Sanity-check a model before training it.

    Prints the trainable-parameter count and a summary of leaf layer types,
    asserts a minimum parameter count, then probes the model with one random
    ``(1, 1, 28, 28)`` input and checks that the output has shape ``(1, 10)``.

    The probe runs in evaluation mode with gradients disabled, so it does not
    alter BatchNorm running statistics. Each module's train/eval flag is
    restored afterwards.

    Args:
        model: ``nn.Module`` to check.

    Returns:
        bool: ``True`` if the probe forward pass produced a ``(1, 10)`` output,
        ``False`` if it raised or produced a different shape.

    Raises:
        AssertionError: If the model has fewer than 1,000 trainable parameters.
    """
    param_count = count_parameters(model)
    print(f"Model has {param_count:,} trainable parameters")

    # Optional: layer type summary to catch obvious mistakes
    layer_types = [type(m).__name__ for m in model.modules() if len(list(m.children())) == 0 and type(m).__name__ != "Sequential"]
    print(f"Layer types: {', '.join(layer_types[:12])}{'...' if len(layer_types) > 12 else ''}")

    # Sanity check: model should have a reasonable number of parameters (e.g. at least 1k)
    min_params = 1000
    assert param_count >= min_params, f"Model has too few parameters ({param_count:,}); expected at least {min_params:,}"

    # Probe on the model's own device; the notebook-level ``device`` is only a
    # fallback for a model without parameters.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else device
    test_input = torch.randn(1, 1, 28, 28).to(target_device)

    # Remember every module's mode so the probe leaves the model as it found it
    saved_modes = [(module, module.training) for module in model.modules()]
    model.eval()
    try:
        with torch.no_grad():
            output = model(test_input)
        assert output.shape == (1, 10), f"Expected output shape (1, 10), got {output.shape}"
        print("✓ Model architecture validation passed")
        return True
    except Exception as e:
        # Deliberately broad: any failure of the probe is reported, not raised
        print(f"✗ Model architecture validation failed: {e}")
        return False
    finally:
        for module, was_training in saved_modes:
            module.training = was_training

## 4. Baseline Model - From Lesson 3.7 Neural Network & Deep Learning

This is our starting point: a Multi-Layer Perceptron (MLP) that achieves approximately 97% accuracy.

In [6]:
class BaselineMLP(nn.Module):
    """Fully connected baseline classifier: 784 -> 128 -> 64 -> 10.

    ``forward`` flattens each input sample, applies two ReLU hidden layers and
    returns log-probabilities over the 10 classes.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(28 * 28, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 10)

    def forward(self, x):
        batch_size = x.size(0)
        hidden = x.view(batch_size, -1)  # one flat vector per sample
        for layer in (self.fc1, self.fc2):
            hidden = F.relu(layer(hidden))
        logits = self.fc3(hidden)
        return F.log_softmax(logits, dim=1)

## 5. Train and Evaluate Baseline Model

In [7]:
print("Training Baseline MLP (from Lesson 3.7 Neural Network & Deep Learning)...")

# Build the baseline; NLLLoss pairs with the log-probabilities the model returns
baseline_model = BaselineMLP().to(device)
baseline_criterion = nn.NLLLoss()
baseline_optimizer = optim.Adam(params=baseline_model.parameters(), lr=0.003)

# Sanity-check the architecture before spending time on training
validate_model_architecture(baseline_model)

# Fit with the helper's default number of epochs
baseline_losses = train_model(
    model=baseline_model,
    trainloader=trainloader,
    criterion=baseline_criterion,
    optimizer=baseline_optimizer,
)

# Measure accuracy on the held-out test loader
baseline_accuracy = evaluate_model(model=baseline_model, testloader=testloader)
print(f'\nBaseline MLP Accuracy: {baseline_accuracy:.2f}%')

Training Baseline MLP (from Lesson 3.7 Neural Network & Deep Learning)...
Model has 109,386 trainable parameters
✓ Model architecture validation passed
Epoch 1/5, Loss: 0.2331
Epoch 2/5, Loss: 0.1128
Epoch 3/5, Loss: 0.0893
Epoch 4/5, Loss: 0.0768
Epoch 5/5, Loss: 0.0668

Baseline MLP Accuracy: 97.03%


## 6. Improvement Exploration

Now that you have a working baseline achieving ~97% accuracy, explore ways to improve performance.

**Possible improvement paths:**

Architecture Changes, Regularization Approaches, Training Enhancements, Data Augmentation, Others

## 7. Your Implementation Area

Use the cells below to implement your chosen improvement techniques:

In [8]:
# Implement your improved model here
class ImprovedModel(nn.Module):
    """Two-stage CNN for 1-channel 28x28 digit images.

    Each stage is Conv(3x3, padding=1) -> BatchNorm -> ReLU -> MaxPool(2), so the
    spatial size goes 28 -> 14 -> 7 while channels go 1 -> 32 -> 64. The
    flattened 64*7*7 features feed a 512-unit hidden layer with dropout and a
    10-way output. ``forward`` returns log-probabilities.
    """

    def __init__(self):
        super(ImprovedModel, self).__init__()
        # Convolutional layer with batch normalization (MNIST is 1 channel grayscale)
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(num_features=32)
        # Max pooling layer (shared by both stages; it has no parameters)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        # Second convolutional layer with batch normalization
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(num_features=64)
        # Fully connected layers with dropout for regularization
        self.fc1 = nn.Linear(in_features=64 * 7 * 7, out_features=512)
        self.dropout = nn.Dropout(0.25)
        self.fc2 = nn.Linear(in_features=512, out_features=10)

    def forward(self, x):
        """Map a ``(batch, 1, 28, 28)`` tensor to ``(batch, 10)`` log-probabilities."""
        # Apply convolutional layers, activation function, and pooling
        x = self.pool(F.relu(self.bn1(self.conv1(x))))
        x = self.pool(F.relu(self.bn2(self.conv2(x))))
        # Flatten per sample, keeping the batch dimension fixed. With
        # ``view(-1, 64 * 7 * 7)`` a wrongly sized input would be silently
        # reinterpreted as a different batch size instead of failing in fc1.
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)  # NLLLoss expects log probabilities

In [None]:
# Build the improved CNN and sanity-check it before training
model = ImprovedModel().to(device)
validate_model_architecture(model)

# NLLLoss pairs with the log-probabilities the model returns
criterion = nn.NLLLoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.001, weight_decay=1e-4)
# Halve the learning rate every 5 epochs
scheduler = optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=5, gamma=0.5)

print("Training your improved model...")
losses = train_model(
    model=model,
    trainloader=trainloader,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    epochs=10,
)
accuracy = evaluate_model(model=model, testloader=testloader)
print(f'Your Model Accuracy: {accuracy:.2f}%')

## 8. Results Comparison

In [None]:
# Compare your results with the baseline
print("\n" + "="*50)
print("ACCURACY COMPARISON")
print("="*50)
print(f"Baseline MLP:        {baseline_accuracy:.2f}%")
print(f"Your Model:          {accuracy:.2f}%")
# The '+' format flag prints the real sign, so a regression shows as "-0.50%"
# rather than the contradictory "+-0.50%" a hard-coded plus would produce.
print(f"Improvement:         {accuracy - baseline_accuracy:+.2f}%")
print("="*50)

## 9. Next Steps

**Experiment and iterate:**
- Try different combinations of techniques
- Analyze what works and what doesn't
- Compare training time vs. accuracy trade-offs
- Document your findings and insights

**Remember:** The goal is to understand how different techniques contribute to improved performance!

In [None]:
# Experiment 3: Data Augmentation - Target: ~99%
# Training-time pipeline: random rotation plus random translation, then the
# same tensor conversion and normalization used elsewhere in the notebook
augmentation_steps = [
    transforms.RandomRotation(10),
    transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
]
transform_augmented = transforms.Compose(augmentation_steps)

# Training split wrapped with the augmented pipeline
trainset_aug = datasets.MNIST(
    '~/.pytorch/MNIST_data/',
    train=True,
    download=True,
    transform=transform_augmented,
)
trainloader_aug = DataLoader(trainset_aug, shuffle=True, batch_size=64)

# Same architecture as Exercise 2
class AugmentedCNN(nn.Module):
    """Two conv/BatchNorm/ReLU/pool stages followed by a 128-unit classifier head.

    Channels go 1 -> 32 -> 64 and each 2x2 max-pool halves the spatial size, so
    a 28x28 input reaches the head as 64*7*7 features. ``forward`` returns
    log-probabilities over 10 classes.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(64)
        self.pool = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(64 * 7 * 7, 128)
        self.dropout = nn.Dropout(0.25)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        # Both feature stages share the same conv -> bn -> relu -> pool shape
        for conv, bn in ((self.conv1, self.bn1), (self.conv2, self.bn2)):
            x = self.pool(F.relu(bn(conv(x))))
        features = x.view(x.size(0), -1)
        hidden = self.dropout(F.relu(self.fc1(features)))
        return F.log_softmax(self.fc2(hidden), dim=1)

In [None]:
# Train the Experiment 3 CNN on the augmented training data
# Results use cnn3_* names (matching cnn4_*/cnn5_* in the later experiments) so
# they do not overwrite `model`/`accuracy` from the improved-model run that the
# comparison cell reads.
cnn3_model = AugmentedCNN().to(device)
validate_model_architecture(cnn3_model)

cnn3_criterion = nn.NLLLoss()
cnn3_optimizer = optim.Adam(cnn3_model.parameters(), lr=0.003)

print("Training your improved model...")
# Use trainloader_aug: it is the loader built for this experiment; the plain
# `trainloader` would leave the added augmentation unused.
cnn3_losses = train_model(cnn3_model, trainloader_aug, cnn3_criterion, cnn3_optimizer)
cnn3_accuracy = evaluate_model(cnn3_model, testloader)
print(f'Your Model Accuracy: {cnn3_accuracy:.2f}%')

In [None]:
# Experiment 4: Learning Rate Scheduling - Target: ~99.2%
class ScheduledCNN(nn.Module):
    """CNN used for the learning-rate scheduling experiment.

    Two Conv(3x3, padding=1) -> BatchNorm -> ReLU -> MaxPool(2) stages
    (1 -> 32 -> 64 channels), then Linear(64*7*7, 128) -> ReLU -> Dropout(0.25)
    -> Linear(128, 10). ``forward`` returns log-probabilities.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(64)
        self.pool = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(64 * 7 * 7, 128)
        self.dropout = nn.Dropout(0.25)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        # Stage 1
        stage1 = F.relu(self.bn1(self.conv1(x)))
        stage1 = self.pool(stage1)
        # Stage 2
        stage2 = F.relu(self.bn2(self.conv2(stage1)))
        stage2 = self.pool(stage2)
        # Classifier head on the per-sample flattened features
        flat = stage2.view(stage2.size(0), -1)
        hidden = F.relu(self.fc1(flat))
        hidden = self.dropout(hidden)
        logits = self.fc2(hidden)
        return F.log_softmax(logits, dim=1)

In [None]:
# Experiment 4 run: same CNN, but with a step-decayed learning rate
print("Training CNN with Learning Rate Scheduling...")
cnn4_model = ScheduledCNN().to(device)
cnn4_criterion = nn.NLLLoss()
# Higher initial LR; the scheduler multiplies it by 0.7 every 2 epochs
cnn4_optimizer = optim.Adam(params=cnn4_model.parameters(), lr=0.01)
cnn4_scheduler = optim.lr_scheduler.StepLR(optimizer=cnn4_optimizer, step_size=2, gamma=0.7)

cnn4_losses = train_model(
    model=cnn4_model,
    trainloader=trainloader_aug,
    criterion=cnn4_criterion,
    optimizer=cnn4_optimizer,
    scheduler=cnn4_scheduler,
    epochs=5,
)
cnn4_accuracy = evaluate_model(model=cnn4_model, testloader=testloader)
print(f'Scheduled CNN Accuracy: {cnn4_accuracy:.2f}%\n')

In [None]:
# Experiment 5: Advanced ResNet-style Architecture - Target: ~99.3%+
class ResidualBlock(nn.Module):
    """Two 3x3 conv/BatchNorm layers with a skip connection added before the final ReLU.

    Args:
        in_channels: Channels of the input feature map.
        out_channels: Channels produced by both convolutions.
        stride: Stride of the first convolution (and of the projection, if any).

    When ``stride != 1`` or the channel counts differ, the skip path is a 1x1
    convolution followed by BatchNorm so its output matches the main path;
    otherwise it is an empty ``nn.Sequential`` that passes the input through.
    """

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)

        needs_projection = stride != 1 or in_channels != out_channels
        if needs_projection:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels),
            )
        else:
            self.shortcut = nn.Sequential()

    def forward(self, x):
        main = self.bn1(self.conv1(x))
        main = F.relu(main)
        main = self.bn2(self.conv2(main))
        skip = self.shortcut(x)
        return F.relu(main + skip)


class AdvancedCNN(nn.Module):
    """Small ResNet-style classifier.

    A 3x3 stem convolution (1 -> 32 channels) is followed by two stride-2
    residual blocks (32 -> 64 -> 128 channels), global average pooling and a
    single linear layer to 10 classes. ``forward`` returns log-probabilities.
    """

    def __init__(self):
        super().__init__()
        # Stem
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(32)

        # Residual stages, each halving the spatial size
        self.layer1 = ResidualBlock(32, 64, stride=2)  # 28x28 -> 14x14
        self.layer2 = ResidualBlock(64, 128, stride=2)  # 14x14 -> 7x7

        # Global average pooling to one value per channel
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))

        # Classifier
        self.fc = nn.Linear(128, 10)

    def forward(self, x):
        stem = F.relu(self.bn1(self.conv1(x)))
        features = self.layer2(self.layer1(stem))
        pooled = self.avgpool(features)
        flat = pooled.view(pooled.size(0), -1)
        return F.log_softmax(self.fc(flat), dim=1)


# Experiment 5 run: ResNet-style CNN on the augmented training data
print("Training Advanced ResNet-style CNN...")
cnn5_model = AdvancedCNN().to(device)
cnn5_criterion = nn.NLLLoss()
cnn5_optimizer = optim.Adam(params=cnn5_model.parameters(), lr=0.001)
# Multiply the learning rate by 0.8 every 2 epochs
cnn5_scheduler = optim.lr_scheduler.StepLR(optimizer=cnn5_optimizer, step_size=2, gamma=0.8)

cnn5_losses = train_model(
    model=cnn5_model,
    trainloader=trainloader_aug,
    criterion=cnn5_criterion,
    optimizer=cnn5_optimizer,
    scheduler=cnn5_scheduler,
)
cnn5_accuracy = evaluate_model(model=cnn5_model, testloader=testloader)
print(f'Advanced CNN Accuracy: {cnn5_accuracy:.2f}%\n')
