# Dataset Sufficiency Analysis for Classification Tutorial

In [None]:
try:
  import google.colab
  !pip install -q daml[torch] torchmetrics torchvision
  !export LC_ALL="en_US.UTF-8"
  !export LD_LIBRARY_PATH="/usr/lib64-nvidia"
  !export LIBRARY_PATH="/usr/local/cuda/lib64/stubs"
  !ldconfig /usr/lib64-nvidia
except:
  pass

!pip install -q tabulate

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

In [None]:
from typing import Dict, cast

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchmetrics
import torchvision.datasets as datasets
import torchvision.transforms.v2 as v2
from torch.utils.data import DataLoader, Subset

from daml.metrics.sufficiency import Sufficiency

# Seed the PyTorch and NumPy random number generators with a fixed value
torch.manual_seed(0)
np.random.seed(0)

# Select the 'high' float32 matmul precision setting
torch.set_float32_matmul_precision('high')

# Use the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
# Fetch both MNIST splits (train and test) into ./data; download=True asks
# torchvision to download the files. No transform is attached here.
datasets.MNIST('./data', train=True, download=True)
datasets.MNIST('./data', train=False, download=True)

## Load data and define functions

Load the MNIST data and create the training and test datasets.


In [None]:
# Build the MNIST training and test datasets (downloading if necessary).
# Each sample goes through ToImage and is then converted to float32 with
# value scaling enabled (scale=True).
to_tensor = v2.Compose(
    [
        v2.ToImage(),
        v2.ToDtype(torch.float32, scale=True),
    ]
)
train_ds = datasets.MNIST(
    root='./data',
    train=True,
    download=True,
    transform=to_tensor,
)
test_ds = datasets.MNIST(
    root='./data',
    train=False,
    download=True,
    transform=to_tensor,
)

In [None]:
# Preview the first training image of each label (0-9) in a 2x5 grid
fig = plt.figure(figsize=(8, 3))

for digit in range(10):
    # Index of the first training sample whose target equals this label
    first_idx = torch.nonzero(train_ds.targets == digit)[0, 0]
    ax = fig.add_subplot(2, 5, digit + 1)
    # Hide both axes so only the image itself is drawn
    for axis in (ax.xaxis, ax.yaxis):
        axis.set_visible(False)
    ax.imshow(train_ds.data[first_idx], cmap='gray_r')

For the purposes of this example, we will use subsets of the training (2000) and test (500) data.

In [None]:
# Keep only the first 2000 training samples and the first 500 test samples
train_ds = Subset(dataset=train_ds, indices=range(2000))
test_ds = Subset(dataset=test_ds, indices=range(500))

Next, we define the network architecture we will be using.

In [None]:
# Define our network architecture
class Net(nn.Module):
    """Small CNN classifier: two 5x5 convolutions followed by three linear layers.

    Produces 10 output scores per sample. The first linear layer expects
    16 * 20 * 20 = 6400 flattened features, which is what the two unpadded
    5x5 convolutions yield for 1x28x28 inputs.
    """

    def __init__(self):
        super().__init__()
        # Convolutional layers: 1 -> 6 -> 16 channels, 5x5 kernels
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5)
        # Fully connected layers: 6400 -> 120 -> 84 -> 10
        self.fc1 = nn.Linear(in_features=16 * 20 * 20, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=84)
        self.fc3 = nn.Linear(in_features=84, out_features=10)

    def forward(self, x):
        """Return the output of the final linear layer for the batch ``x``."""
        features = F.relu(self.conv2(F.relu(self.conv1(x))))
        # Collapse every dimension except the batch dimension
        hidden = features.flatten(start_dim=1)
        hidden = F.relu(self.fc1(hidden))
        hidden = F.relu(self.fc2(hidden))
        # No activation after the last layer
        return self.fc3(hidden)

# Move a fresh Net to the selected device and compile it.
# The result of torch.compile is not typed as Net, so it is cast back to Net;
# the cast changes nothing at runtime and only signals the type.
model = cast(Net, torch.compile(Net().to(device)))

Finally, we define our custom training and evaluation functions. Sufficiency requires that the evaluation function returns a dictionary of the results.

In [None]:
def custom_train(model: nn.Module, dataloader: DataLoader):
    """Train ``model`` in place on the batches produced by ``dataloader``.

    Runs 10 epochs of SGD (lr=0.01, momentum=0.9) with a cross-entropy loss.
    Batches are moved to the module-level ``device`` before use.

    Args:
        model: Network to train; its parameters are updated in place.
        dataloader: Iterable of batches where ``batch[0]`` holds the inputs
            and ``batch[1]`` holds the targets.
    """
    # Defined only for this testing scenario
    criterion = torch.nn.CrossEntropyLoss().to(device)
    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
    epochs = 10

    # custom_eval switches the model to eval mode and never switches it back,
    # so explicitly (re-)enable training mode before optimising.
    model.train()

    for _ in range(epochs):
        for batch in dataloader:
            # Load data/images to device. as_tensor keeps the dtype produced by
            # the dataloader instead of going through the legacy float-biased
            # torch.Tensor constructor.
            X = torch.as_tensor(batch[0]).to(device)
            # Load targets/labels to device
            y = torch.as_tensor(batch[1]).to(device)
            # Zero out gradients
            optimizer.zero_grad()
            # Forward propagation
            outputs = model(X)
            # Compute loss
            loss = criterion(outputs, y)
            # Back prop
            loss.backward()
            # Update weights/parameters
            optimizer.step()


def custom_eval(model: nn.Module, dataloader: DataLoader) -> Dict[str, float]:
    """Evaluate ``model`` on ``dataloader`` and report multiclass accuracy.

    The model is switched to evaluation mode (and left in that mode), and
    gradients are not tracked while the batches are processed. Batches are
    moved to the module-level ``device`` before use.

    Args:
        model: Network to evaluate.
        dataloader: Iterable of batches where ``batch[0]`` holds the inputs
            and ``batch[1]`` holds the targets.

    Returns:
        A dictionary with the single key ``"Accuracy"`` mapped to the accuracy
        accumulated over all batches, as a plain Python float.
    """
    metric = torchmetrics.Accuracy(task="multiclass", num_classes=10).to(device)

    # Set model layers into evaluation mode
    model.eval()
    # Tell PyTorch to not track gradients, greatly speeds up processing
    with torch.no_grad():
        for batch in dataloader:
            # Load data/images to device
            X = torch.as_tensor(batch[0]).to(device)
            # Load targets/labels to device
            y = torch.as_tensor(batch[1]).to(device)
            metric.update(model(X), y)
        accuracy = metric.compute()
    # Convert the metric tensor (which lives on `device`) to a Python float so
    # the returned value matches the annotated Dict[str, float].
    return {"Accuracy": float(accuracy)}

## Initialize metric

Attach the custom training and evaluation functions to the Sufficiency metric.

In [None]:
# Instantiate the sufficiency metric with the model, the two datasets and the
# custom training / evaluation callbacks defined earlier in this notebook
suff = Sufficiency(
    model=model,
    train_ds=train_ds,
    test_ds=test_ds,
    train_fn=custom_train,
    eval_fn=custom_eval,
    runs=10,
    substeps=10,
    batch_size=16,
)

## Define training parameters

The `runs` and `substeps` arguments passed to `Sufficiency` define the number of models to train in parallel (stability) and the number of steps along the learning curve to evaluate, respectively.

## Evaluate Sufficiency

Now we can evaluate the metric to train the models and produce the learning curve.

In [None]:
# Train & test model
# Run the sufficiency evaluation using the metric instance configured above
output = suff.evaluate()

# Print the result with NumPy values shown at 3 decimal places and without
# scientific notation (suppress=True)
with np.printoptions(precision=3, suppress=True):
    print(output)

In [None]:
# Render the sufficiency output as a table, one column per key
from tabulate import tabulate

column_names = list(output.keys())
print(tabulate(output, headers=column_names, tablefmt="pretty"))

# Print the projected "Accuracy" values for the requested steps
projection_steps = [100, 1000, 10000]
print(output.project("Accuracy", projection_steps))

# Or plot the output using the convenience function
_ = output.plot()

## Results

Using this learning curve, we can project the performance we would expect on much larger datasets (assuming the same model).