# Quickstart

With code from https://pytorch.org/tutorials/beginner/basics/quickstart_tutorial.html.

In [1]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor

import sys

import time

In [2]:
# Record the interpreter and PyTorch versions this notebook was run with.
print(sys.version, torch.__version__, sep='\n')

3.11.11 (main, Feb 12 2025, 15:06:01) [Clang 19.1.6 ]
2.6.0


In [3]:
# Build the FashionMNIST training and test splits under ./data (download=True).
# The two constructor calls differ only in the `train` flag.
training_data, test_data = (
    datasets.FashionMNIST(
        root='data',
        train=is_train_split,
        download=True,
        transform=ToTensor(),
    )
    for is_train_split in (True, False)
)

In [4]:
batch_size = 64

# Wrap each split in a DataLoader that yields batches of `batch_size` samples.
train_dataloader, test_dataloader = (
    DataLoader(split, batch_size=batch_size)
    for split in (training_data, test_data)
)

# Peek at the first test batch only, to show the tensor shapes and label dtype.
first_batch = next(iter(test_dataloader), None)
if first_batch is not None:
    X, y = first_batch
    print(f'Shape of X [N, C, H, W]: {X.shape}')
    print(f'Shape of y: {y.shape} {y.dtype}')

Shape of X [N, C, H, W]: torch.Size([64, 1, 28, 28])
Shape of y: torch.Size([64]) torch.int64


Cool... it looks like the new `torch.accelerator.is_available()` call returns `True` for both CUDA and MPS.

In [6]:
device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else 'cpu'
device

'mps'

In [7]:
# device = 'cpu' # to run on the CPU, uncomment this line and then re-run the cells below (this could be reorganized so that only the steps that depend on the device, like the model creation, come after it)

In [8]:
class NeuralNetwork(nn.Module):
    """Fully connected classifier: flatten, two hidden ReLU layers, then logits.

    The defaults reproduce the original fixed architecture
    (28 * 28 inputs -> 512 -> 512 -> 10 outputs).

    Args:
        in_features: Number of values per sample once everything after the
            batch dimension has been flattened.
        hidden_features: Width of both hidden layers.
        num_classes: Number of output logits.
    """

    def __init__(self, in_features=28 * 28, hidden_features=512, num_classes=10):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, num_classes),
        )

    def forward(self, x):
        """Flatten each sample in `x` and return its raw class scores (logits)."""
        return self.linear_relu_stack(self.flatten(x))

# Instantiate the network, move it to the selected device, and display its
# layer structure as the cell output.
model = NeuralNetwork()
model = model.to(device)
model

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)

In [9]:
# Plain SGD over the model's parameters, with a cross-entropy loss applied to
# the model's output logits.
optimizer = torch.optim.SGD(params=model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

In [10]:
def train(dataloader, model, loss_fn, optimizer, device=None):
    """Run one pass over `dataloader`, updating `model` after every batch.

    Prints the current loss and the number of samples processed so far on
    every 100th batch (starting with the first).

    Args:
        dataloader: Iterable of `(inputs, targets)` batches that also exposes
            a sized `dataset` attribute.
        model: The `nn.Module` being optimised; it is put into training mode.
        loss_fn: Callable mapping `(predictions, targets)` to a scalar loss.
        optimizer: Optimizer holding `model`'s parameters.
        device: Device each batch is moved to. When omitted, the device of
            the model's first parameter is used, so the batches always land
            where the model lives; for a model without parameters the batches
            are left where they are.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = None if first_param is None else first_param.device

    size = len(dataloader.dataset)
    seen = 0  # running sample count; stays correct when a batch is short
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        if device is not None:
            X, y = X.to(device), y.to(device)

        # compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # backprop, then clear the gradients ready for the next batch
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        seen += len(X)
        if batch % 100 == 0:
            print(f'loss: {loss.item():>7f} [{seen:>5d}/{size:>5d}]')

def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f'Test error: \n Accuracy: {(100 * correct):>0.1f}%, avg loss: {test_loss:>8f} \n')

In [11]:
# Use 5 for a quicker run; the timings discussed below were measured with 25.
epochs = 25

start_time = time.perf_counter()

# One training pass followed by one evaluation pass per epoch.
for epoch_number in range(1, epochs + 1):
    print(f'Epoch {epoch_number}\n------------------')
    train(train_dataloader, model, loss_fn, optimizer)
    test(test_dataloader, model, loss_fn)
print('Done!')

end_time = time.perf_counter()

elapsed_seconds = end_time - start_time
print(f'Execution time: {elapsed_seconds:.3f} seconds')

Epoch 1
------------------
loss: 2.305091 [   64/60000]
loss: 2.298038 [ 6464/60000]
loss: 2.275595 [12864/60000]
loss: 2.266529 [19264/60000]
loss: 2.256486 [25664/60000]
loss: 2.223460 [32064/60000]
loss: 2.219633 [38464/60000]
loss: 2.190681 [44864/60000]
loss: 2.184609 [51264/60000]
loss: 2.157253 [57664/60000]
Test error: 
 Accuracy: 53.7%, avg loss: 2.147785 

Epoch 2
------------------
loss: 2.153378 [   64/60000]
loss: 2.146364 [ 6464/60000]
loss: 2.089663 [12864/60000]
loss: 2.109696 [19264/60000]
loss: 2.054691 [25664/60000]
loss: 1.993927 [32064/60000]
loss: 2.012046 [38464/60000]
loss: 1.935872 [44864/60000]
loss: 1.942367 [51264/60000]
loss: 1.872168 [57664/60000]
Test error: 
 Accuracy: 61.0%, avg loss: 1.866343 

Epoch 3
------------------
loss: 1.896574 [   64/60000]
loss: 1.865302 [ 6464/60000]
loss: 1.753935 [12864/60000]
loss: 1.798329 [19264/60000]
loss: 1.679188 [25664/60000]
loss: 1.634563 [32064/60000]
loss: 1.646075 [38464/60000]
loss: 1.557871 [44864/60000]
los

Running 25 epochs on the 5080 gives a final test accuracy of 79.9% and an average loss of 0.577765. It took 53.7 seconds. Running the same number of epochs on the Ryzen 7 9800X3D CPU took 72.9 seconds (19.2s, or about 36%, longer than the 5080 run), for the same 79.9% test accuracy and an effectively identical average test loss of 0.575010. And - kudos to Apple's silicon, at least for this relatively small model - running the same 25 epochs on a MacBook Pro M3 w/ 18GB of RAM, using the CPU build of PyTorch 2.6 via MPS, took 53.2 seconds - about the same as the 5080 (actually 0.5s faster, though with a sample size of 1); the accuracy was 80.1% and the test loss was 0.578824. (A second MPS run took 54.2s.)

**TODO** There's a bit more on the quickstart page for saving and loading model weights.

**TODO** From a quick chat w/ ChatGPT, it sounds like a 30-40% GPU-over-CPU speedup is in the normal range for small models (without a ton of parameters) and/or simpler models (with fewer layers), both of which I think apply here. Some top-of-ChatGPT-mind things I can do to try to increase performance further include:

- Try larger batch sizes, which can better leverage GPU parallelism, weighing against 'model convergence and generalization performance' considerations.
- Consider more complex model architectures which if done right will give better model performance while showing a bigger difference between GPU and CPU performance (because there's more that can be parallelized).
- Sometimes 'preloading datasets into GPU memory' can 'minimize CPU/GPU data transfer'.