# Quickstart

With code from https://pytorch.org/tutorials/beginner/basics/quickstart_tutorial.html.

In [1]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor

import sys

import time

In [2]:
# Record the interpreter and PyTorch versions used for this run, one per line
print(sys.version, torch.__version__, sep='\n')

3.11.11 (main, Mar 11 2025, 17:28:32) [Clang 20.1.0 ]
2.7.0.dev20250218+cu128


In [3]:
# Build the train and test splits of FashionMNIST with identical settings:
# files live under ./data (download=True), and every image is converted
# with ToTensor(). Only the `train` flag differs between the two splits.
training_data, test_data = (
    datasets.FashionMNIST(
        root='data',
        train=is_train_split,
        download=True,
        transform=ToTensor(),
    )
    for is_train_split in (True, False)
)

In [4]:
batch_size = 64

# Wrap each split in a DataLoader that yields mini-batches of `batch_size` samples
train_dataloader, test_dataloader = (
    DataLoader(split, batch_size=batch_size)
    for split in (training_data, test_data)
)

# Peek at a single batch to confirm the tensor shapes and the label dtype
for X, y in test_dataloader:
    print(
        f'Shape of X [N, C, H, W]: {X.shape}',
        f'Shape of y: {y.shape} {y.dtype}',
        sep='\n',
    )
    break  # one batch is enough to see the shapes

Shape of X [N, C, H, W]: torch.Size([64, 1, 28, 28])
Shape of y: torch.Size([64]) torch.int64


Cool... looks like the new `torch.accelerator.is_available()` call returns `True` for both CUDA and MPS.

In [5]:
device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else 'cpu'
device

'cuda'

In [6]:
# device = 'cpu'  # to run on the CPU, uncomment this line and then re-run the cells below (this could be reorganized so that only the steps that must run after the device change, like the model creation, come after it)

In [11]:
class NeuralNetwork(nn.Module):
    """Fully connected classifier: flatten, two hidden ReLU layers, linear output.

    With the default arguments the layer sizes are 784 -> 512 -> 512 -> 10,
    i.e. exactly the previously hard-coded network, so the module structure
    and the ``state_dict`` keys and shapes are unchanged.

    Args:
        in_features: Number of values per sample after flattening
            (default ``28 * 28``).
        hidden_features: Width of each of the two hidden layers (default 512).
        num_classes: Number of output logits (default 10).
    """

    def __init__(self, in_features=28 * 28, hidden_features=512, num_classes=10):
        super().__init__()
        self.flatten = nn.Flatten()
        # Layers are created in the same order as before so that random
        # initialisation consumes the RNG identically for the default sizes.
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, num_classes),
        )

    def forward(self, x):
        """Return the raw class scores (logits) for a batch ``x``.

        The input is flattened first, so each sample must hold
        ``in_features`` values in total.
        """
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

# Instantiate the network, then move its parameters to the selected device
model = NeuralNetwork()
model = model.to(device)
model

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)

In [12]:
# Cross-entropy loss computed on the model's output logits
loss_fn = nn.CrossEntropyLoss()
# Stochastic gradient descent over all of the model's parameters
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.001)

In [13]:
def train(dataloader, model, loss_fn, optimizer, device=None):
    """Run one training pass of ``model`` over ``dataloader``.

    Args:
        dataloader: Iterable of ``(X, y)`` batches; ``dataloader.dataset``
            must support ``len()`` (used for the progress printout).
        model: Module to optimise. It is switched to training mode.
        loss_fn: Callable mapping ``(predictions, targets)`` to a scalar
            loss tensor.
        optimizer: Optimizer that updates the parameters of ``model``.
        device: Device each batch is moved to. When omitted, the device of
            the model's first parameter is used (CPU if the model has no
            parameters), so the batches always land where the weights are
            instead of depending on a notebook-level ``device`` variable.

    Returns:
        None. The loss of every 100th batch is printed.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    size = len(dataloader.dataset)
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        # compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # backprop; gradients are cleared after the step so they do not
        # accumulate into the next batch
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 100 == 0:
            # `current` estimates the samples seen so far from the size of this batch
            loss_value, current = loss.item(), (batch + 1) * len(X)
            print(f'loss: {loss_value:>7f} [{current:>5d}/{size:>5d}]')

def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f'Test error: \n Accuracy: {(100 * correct):>0.1f}%, avg loss: {test_loss:>8f} \n')

In [14]:
# number of full passes over the training data (e.g. set to 5 for a shorter run)
epochs = 100

start_time = time.perf_counter()

# each epoch: one training pass, then an evaluation on the test split
for epoch_number in range(1, epochs + 1):
    print(f'Epoch {epoch_number}\n------------------')
    train(train_dataloader, model, loss_fn, optimizer)
    test(test_dataloader, model, loss_fn)
print('Done!')

end_time = time.perf_counter()

# wall-clock duration of the whole loop, including the per-epoch evaluation
elapsed_seconds = end_time - start_time
print(f'Execution time: {elapsed_seconds:.3f} seconds')

Epoch 1
------------------
loss: 2.323086 [   64/60000]
loss: 2.294036 [ 6464/60000]
loss: 2.285366 [12864/60000]
loss: 2.268813 [19264/60000]
loss: 2.243323 [25664/60000]
loss: 2.244003 [32064/60000]
loss: 2.236463 [38464/60000]
loss: 2.215095 [44864/60000]
loss: 2.207709 [51264/60000]
loss: 2.175890 [57664/60000]
Test error: 
 Accuracy: 43.7%, avg loss: 2.172597 

Epoch 2
------------------
loss: 2.188961 [   64/60000]
loss: 2.165818 [ 6464/60000]
loss: 2.122739 [12864/60000]
loss: 2.129645 [19264/60000]
loss: 2.074983 [25664/60000]
loss: 2.035289 [32064/60000]
loss: 2.052623 [38464/60000]
loss: 1.982457 [44864/60000]
loss: 1.986841 [51264/60000]
loss: 1.912437 [57664/60000]
Test error: 
 Accuracy: 55.6%, avg loss: 1.914276 

Epoch 3
------------------
loss: 1.947489 [   64/60000]
loss: 1.908700 [ 6464/60000]
loss: 1.806847 [12864/60000]
loss: 1.841117 [19264/60000]
loss: 1.733965 [25664/60000]
loss: 1.688758 [32064/60000]
loss: 1.705179 [38464/60000]
loss: 1.608982 [44864/60000]
los

Running 25 epochs on the 5080 (GPU) gives a final test accuracy of 79.9% and an average test loss of 0.577765; it took 53.7 seconds. Running the same number of epochs on the CPU (Ryzen 7 9800X3D) took 72.9 seconds - so the GPU was about 36% faster (19.2s less) - for the same 79.9% test accuracy and an effectively identical average test loss of 0.575010. And - kudos to Apple's silicon, at least for this relatively small model - running the same 25 epochs on a MacBook Pro M3 w/ 18GB of RAM, using the CPU build of PyTorch 2.6 via MPS, took 53.2 seconds - about the same as the 5080 (actually 0.5s faster, with a sample size of 1); the accuracy was 80.1% with a test loss of 0.578824. (A second MPS run took 54.2s.) As a likely final update: I moved back to WSL to check that the uv config updates I made created an environment that works transparently on both macOS and WSL/Linux - they did, by the way (I just git pulled and then ran jupyter, and it brought down the right CUDA-enabled version of torch for WSL; I think I previously had a nightly of 2.8 and it brought down a nightly of 2.7). I then ran the same code w/ the GPU and it took effectively the same time - here 52.3 seconds for 79.8% accuracy. Ok, one more: just to see how it'd go w/ more epochs, I ran 100 (compared to 25 before) and after 214s got an accuracy of 84.6%, so the time per epoch stayed the same, as I'd expect. The accuracy was still increasing but clearly plateauing: by the end I was only gaining 0.1% per epoch, if that.

In [17]:
# how many params?
def count_parameters(model):
    """Return the total number of scalar parameters in ``model``.

    The sizes of the individual parameter tensors are summed, which avoids
    building a flattened copy of every weight just to count them and also
    works for modules without any parameters (the result is then 0).
    """
    return sum(param.numel() for param in model.parameters())

# display the parameter count of the current `model`
count_parameters(model)

669706

In [20]:
# write only the model's state dict (its learned tensors) to model.pth
torch.save(obj=model.state_dict(), f='model.pth')

In [25]:
# Reload the saved weights into a fresh model. The cell that defines the
# NeuralNetwork class must have been run first, but no training is needed.
model = NeuralNetwork().to(device)
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on one machine (e.g. with CUDA) can still be loaded
# where that device does not exist (e.g. CPU-only or MPS).
model.load_state_dict(torch.load('model.pth', map_location=device, weights_only=True))

<All keys matched successfully>

For future reference, just noting that from a quick chat w/ ChatGPT, it sounds like a 30-40% speedup from the GPU is in the normal range for small models (without a ton of parameters) and/or simpler models (with fewer layers), both of which I think apply here. Some top-of-ChatGPT-mind things I can try in order to increase performance further include:

- Try larger batch sizes, which can better leverage GPU parallelism, weighed against 'model convergence and generalization performance' considerations.
- Consider more complex model architectures, which, if done right, will give better model performance while also showing a bigger difference between GPU and CPU performance (because there's more that can be parallelized).
- Sometimes 'preloading datasets into GPU memory' can 'minimize CPU/GPU data transfer'.