# Testing different models on unseen data

This notebook builds simple DNN, CNN and Transformer-based models, trains them on the standard MNIST training dataset, and evaluates them on a modified version of the test dataset.

In [None]:
# The normal imports for this task
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

In [None]:
# Training hyperparameters shared by all three models
num_epochs = 10        # full passes over the training set
learning_rate = 0.001  # step size given to each optimizer
batch_size = 64        # samples per mini-batch

# Device configuration: everything is pinned to the CPU here.
# (To use a GPU when present, pass
#  "cuda" if torch.cuda.is_available() else "cpu" instead.)
device = torch.device("cpu")

## Load our MNIST data

Unlike before, this time we will normalize our input data more 'correctly', using the mean and standard deviation of the training dataset.

We want the 'normal' MNIST digit training data.

We also want to load the test dataset using a complex set of transforms. This helps evaluate the model performance on unseen data.

In [None]:
# Load the raw MNIST training set (pixels scaled to [0, 1], not yet normalized)
mnist_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transforms.ToTensor())
# Load the entire dataset in one batch so the statistics cover every pixel
data_loader = DataLoader(mnist_dataset, batch_size=len(mnist_dataset), shuffle=False)
# Get all images in a single batch
images, _ = next(iter(data_loader))  # Shape: (60000, 1, 28, 28)

# Calculate mean and standard deviation
mean = images.mean().item()  # Mean over all input pixels
std = images.std().item()  # Stddev from all input pixels

# Train transform: convert images to tensors and normalize to the dataset mean and stddev
transform = transforms.Compose([
    transforms.ToTensor(),  # Converts to [0, 1]
    transforms.Normalize((mean,), (std,))
])

# Load MNIST training dataset with the normalizing transform
train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)

# Test transform: randomly perturb each image first, then convert and normalize
# with the SAME training-set statistics so the inputs stay on the scale the
# models were trained on.
# NOTE(review): RandomResizedCrop's `scale` bounds the crop *area* relative to
# the original image; the (0.85, 1.15) range follows the exercise comment --
# confirm an upper bound above 1.0 is intended.
transform_test = transforms.Compose([
    transforms.RandomRotation(degrees=20, expand=False),  # Random rotation of up to 20deg
    transforms.RandomAffine(degrees=0, translate=(0.15, 0.15)),  # Random shifts (up to 15% of image size)
    transforms.RandomResizedCrop(size=28, scale=(0.85, 1.15)),  # Random scaling (85% to 115% of original size)
    transforms.ColorJitter(brightness=0.2, contrast=0.2),  # Adjust brightness & contrast slightly (factor 0.2)
    transforms.ToTensor(),  # Converts to [0, 1]
    transforms.Normalize((mean,), (std,))
])

test_dataset = datasets.MNIST(root='./data', train=False, download=True, transform=transform_test)

# Mini-batch loaders; the random test transforms are re-drawn on every pass
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=True)

# Build Our models

Let's build a DNN, a CNN and a Transformer-based model for training.

In [None]:
# Build a DNN network which goes from 784 -> 512 -> 128 -> 64 -> 10
# Input: (n, 1, 28, 28) image batch; output: (n, 10) class logits.
DNN_Model = nn.Sequential(
    nn.Flatten(),          # (n, 1, 28, 28) -> (n, 784)
    nn.Linear(784, 512),
    nn.ReLU(),
    nn.Linear(512, 128),
    nn.ReLU(),
    nn.Linear(128, 64),
    nn.ReLU(),
    nn.Linear(64, 10),     # one logit per digit class
)

In [None]:
# Build a simple CNN which goes from (n,1,28,28)->(n,32,14,14)->(n,64,7,7) --> 64*7*7 -> 128 -> 64 -> 10
# Each stride-2, padding-1, 3x3 convolution halves the spatial size (28 -> 14 -> 7).
CNN_Model = nn.Sequential(
    nn.Conv2d(1, 32, kernel_size=3, stride=2, padding=1),   # Output: 32 x 14 x 14
    nn.ReLU(),
    nn.Conv2d(32, 64, kernel_size=3, stride=2, padding=1),  # Output: 64 x 7 x 7
    nn.ReLU(),
    nn.Flatten(),                                           # (n, 64, 7, 7) -> (n, 64*7*7)
    nn.Linear(64 * 7 * 7, 128),
    nn.ReLU(),
    nn.Linear(128, 64),
    nn.ReLU(),
    nn.Linear(64, 10),                                      # one logit per digit class
)

In [None]:
# Define Transformer Encoder Layer
# This defines our core transformer as 128dim, 4heads, 128dim-ff, dropout0.1
encoder_layer = nn.TransformerEncoderLayer(d_model=128, nhead=4, dim_feedforward=128, dropout=0.1)
# Stack five identical encoder layers
transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=5)

# Transformer-based classifier: (n, 1, 28, 28) -> 784 -> 128 -> encoder -> 128 -> 64 -> 10
# NOTE(review): the encoder is fed a 2-D (n, 128) tensor. PyTorch interprets a
# 2-D input as a single *unbatched* sequence of n tokens, so the images within a
# batch attend to each other and a prediction can depend on the rest of the
# batch -- confirm this is the intended behavior for the exercise.
TF_Model = nn.Sequential(
    nn.Flatten(),            # (n, 1, 28, 28) -> (n, 784)
    nn.Linear(784, 128),     # Pseudo-Embedding layer
    transformer_encoder,     # MHA Transformer defined above
    nn.Flatten(),
    nn.Linear(128, 128),     # Taking output from MHA layer for projecting to decision
    nn.ReLU(),
    nn.Linear(128, 64),
    nn.ReLU(),
    nn.Linear(64, 10),       # one logit per digit class
)

In [None]:
# Move every model onto the configured device
dnn_model = DNN_Model.to(device)
cnn_model = CNN_Model.to(device)
tf_model = TF_Model.to(device)

# Smoke-test each model with a random MNIST-shaped batch, then print its
# layer structure for understanding.
for title, model in (
    ('DNN Model:', dnn_model),
    ('CNN Model:', cnn_model),
    ('Transformer Model:', tf_model),
):
    # Dummy batch: (batch_size=16, channels=1, height=28, width=28)
    dummy_images = torch.randn(16, 1, 28, 28).to(device)
    logits = model(dummy_images)  # forward pass only checks that the shapes line up
    print(title)
    print(model)



## Train our models

As before, train each model on the MNIST training dataset. There is no need to track accuracy or validation metrics, as they are not important this time.

In [None]:
# Now we train our DNN

# Cross-entropy loss on the raw logits, optimized with Adam
dnn_criterion = nn.CrossEntropyLoss()
dnn_optimizer = optim.Adam(dnn_model.parameters(), lr=learning_rate)
# Training loop
for epoch in range(num_epochs):
    dnn_model.train()
    total_loss = 0  # sum of per-batch losses for this epoch

    # Per-batch
    for images, labels in train_loader:
        images = images.to(device)
        labels = labels.to(device)

        # Evaluate, Loss, Backwards, Step
        dnn_optimizer.zero_grad()  # clear gradients left over from the previous batch
        outputs = dnn_model(images)
        loss = dnn_criterion(outputs, labels)
        loss.backward()
        dnn_optimizer.step()

        # Track losses
        total_loss += loss.item()
    # Report the mean per-batch loss for the epoch
    print(f'DNN Classifier Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(train_loader):.4f}')

In [None]:
# Now we train our CNN

# Cross-entropy loss on the raw logits, optimized with Adam
cnn_criterion = nn.CrossEntropyLoss()
cnn_optimizer = optim.Adam(cnn_model.parameters(), lr=learning_rate)
# Training loop
for epoch in range(num_epochs):
    cnn_model.train()
    total_loss = 0  # sum of per-batch losses for this epoch

    # Per-batch
    for images, labels in train_loader:
        images = images.to(device)
        labels = labels.to(device)

        # Evaluate, Loss, Backwards, Step
        cnn_optimizer.zero_grad()  # clear gradients left over from the previous batch
        outputs = cnn_model(images)
        loss = cnn_criterion(outputs, labels)
        loss.backward()
        cnn_optimizer.step()

        # Track losses
        total_loss += loss.item()
    # Report the mean per-batch loss for the epoch
    print(f'CNN Classifier Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(train_loader):.4f}')

In [None]:
# Now we train our Transformer

# Cross-entropy loss on the raw logits, optimized with Adam
tf_criterion = nn.CrossEntropyLoss()
tf_optimizer = optim.Adam(tf_model.parameters(), lr=learning_rate)
# Training loop
for epoch in range(num_epochs):
    tf_model.train()  # training mode, so the encoder's dropout is active
    total_loss = 0  # sum of per-batch losses for this epoch

    # Per-batch
    for images, labels in train_loader:
        images = images.to(device)
        labels = labels.to(device)

        # Evaluate, Loss, Backwards, Step
        tf_optimizer.zero_grad()  # clear gradients left over from the previous batch
        outputs = tf_model(images)
        loss = tf_criterion(outputs, labels)
        loss.backward()
        tf_optimizer.step()

        # Track losses
        total_loss += loss.item()
    # Report the mean per-batch loss for the epoch
    print(f'Transformer Classifier Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(train_loader):.4f}')

## Evaluate our models on unseen test data

Our test dataset has been defined differently this time.

We have introduced random rotation, shifts, rescaling and brightness/contrast jitter.

This helps show how well our model copes with completely unseen, non-perfect inputs.

We will run through this randomized test dataset 5 times, so that statistical fluctuations from the random transforms do not give us a misleading idea of how accurate each model is.

In [None]:
# Evaluation on the test set

dnn_model.eval()
correct = 0  # number of correctly classified samples
total = 0    # number of samples seen
with torch.no_grad():  # no gradients are needed for inference
    # Five passes over the loader; accuracy is accumulated across all of them
    for _ in range(5):
        for images, labels in test_loader:
            images = images.to(device)
            labels = labels.to(device)

            outputs = dnn_model(images)
            # The predicted class is the index of the largest logit
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

accuracy = 100 * correct / total
print(f'DNN Accuracy on adjusted MNIST test set: {accuracy:.2f}%')

In [None]:
# Evaluation on the test set
cnn_model.eval()
correct = 0  # number of correctly classified samples
total = 0    # number of samples seen
with torch.no_grad():  # no gradients are needed for inference
    # Five passes over the loader; accuracy is accumulated across all of them
    for _ in range(5):
        for images, labels in test_loader:
            images = images.to(device)
            labels = labels.to(device)

            outputs = cnn_model(images)
            # The predicted class is the index of the largest logit
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

accuracy = 100 * correct / total
print(f'CNN Accuracy on adjusted MNIST test set: {accuracy:.2f}%')

In [None]:
# Evaluation on the test set
tf_model.eval()  # evaluation mode, so the encoder's dropout is disabled
correct = 0  # number of correctly classified samples
total = 0    # number of samples seen
with torch.no_grad():  # no gradients are needed for inference
    # Five passes over the loader; accuracy is accumulated across all of them
    for _ in range(5):
        for images, labels in test_loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = tf_model(images)
            # The predicted class is the index of the largest logit
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

accuracy = 100 * correct / total
print(f'Transformer Accuracy on adjusted MNIST test set: {accuracy:.2f}%')

## Output

You should probably see something like the following:

| Model | Accuracy |
| --- | --- |
| DNN | 50% |
| CNN | 60% |
| Transformer | 50% |

The DNN and Transformer models are entirely reliant on 'where' the information sits in the input.
The CNN, however, is able to pick out patterns more effectively and so performs better.