In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms

# ----------------------------
# Hyperparameters & device
# ----------------------------
BATCH_SIZE = 64   # samples per mini-batch
EPOCHS = 10       # number of passes over the training loader
LR = 1e-3         # learning rate handed to the optimizer

# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
print("Using device:", DEVICE)

# ----------------------------
# Data: MNIST train / test loaders
# ----------------------------
# ToTensor turns each image into a float tensor, rescaling [0, 255] to [0, 1].
to_tensor = transforms.ToTensor()

# Both splits are stored in (and downloaded to, if missing) the current directory.
train_set = datasets.MNIST(root=".", train=True, download=True,
                           transform=to_tensor)
train_loader = torch.utils.data.DataLoader(
    train_set,
    batch_size=BATCH_SIZE,
    shuffle=True,   # reshuffle the training samples every epoch
)

test_set = datasets.MNIST(root=".", train=False, download=True,
                          transform=to_tensor)
# No shuffle argument: the test loader keeps the DataLoader default ordering.
test_loader = torch.utils.data.DataLoader(
    test_set,
    batch_size=BATCH_SIZE,
)

# ----------------------------
# MLP Model
# ----------------------------
class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(28 * 28, 128)
        self.act1 = nn.ReLU()
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        # Flatten (batch_size, 1, 28, 28) -> (batch_size, 784)
        x = x.view(-1, 28 * 28)
        x = self.fc1(x)
        x = self.act1(x)
        x = self.fc2(x)   # logits; do NOT apply softmax
        return x

# Build the network, then move its parameters to the selected device.
model = MLP()
model = model.to(DEVICE)

# ----------------------------
# Objective & Optimizer
# ----------------------------
# CrossEntropyLoss consumes raw logits: it applies log_softmax followed by
# NLLLoss internally.
criterion = nn.CrossEntropyLoss()
# Adam over every parameter of the model, using the configured learning rate.
optimizer = optim.Adam(params=model.parameters(), lr=LR)

# ----------------------------
# Training Loop
# ----------------------------
# Each epoch runs one optimization pass over train_loader, then one
# gradient-free evaluation pass over test_loader, and prints a summary line.
for epoch in range(EPOCHS):
    model.train()   # training mode (matters for dropout/bn in deeper models)
    running_loss = 0.0   # sum of per-sample losses over the epoch
    train_seen = 0       # number of training samples processed

    for data, target in train_loader:
        data, target = data.to(DEVICE), target.to(DEVICE)
        batch_n = target.size(0)

        optimizer.zero_grad()             # clear gradients from the previous step
        output = model(data)              # forward pass (autograd records the graph)
        loss = criterion(output, target)  # mean loss over this batch
        loss.backward()                   # backpropagate: compute parameter gradients
        optimizer.step()                  # update params

        # criterion returns a per-batch mean; weight it by the batch size so
        # a smaller final batch is not over-represented in the epoch average.
        running_loss += loss.item() * batch_n
        train_seen += batch_n

    train_loss = running_loss / train_seen

    # ----------------------------
    # Validation Loop
    # ----------------------------
    model.eval()  # evaluation mode (disables dropout, freezes bn statistics)
    val_loss = 0.0
    correct = 0
    val_seen = 0

    # Disable autograd for speed + memory
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(DEVICE), target.to(DEVICE)
            batch_n = target.size(0)

            output = model(data)
            # Same sample-weighted accumulation as in training.
            val_loss += criterion(output, target).item() * batch_n
            val_seen += batch_n

            # Predictions = index of max logit
            pred = output.argmax(dim=1)
            correct += (pred == target).sum().item()

    val_loss /= val_seen
    accuracy = 100. * correct / val_seen

    print(f"Epoch {epoch+1}/{EPOCHS} | "
          f"Train Loss: {train_loss:.4f} | "
          f"Val Loss: {val_loss:.4f} | "
          f"Accuracy: {accuracy:.2f}%")


Using device: cuda


100%|██████████| 9.91M/9.91M [00:00<00:00, 17.0MB/s]
100%|██████████| 28.9k/28.9k [00:00<00:00, 447kB/s]
100%|██████████| 1.65M/1.65M [00:00<00:00, 4.20MB/s]
100%|██████████| 4.54k/4.54k [00:00<00:00, 13.1MB/s]


Epoch 1/10 | Train Loss: 0.3470 | Val Loss: 0.1867 | Accuracy: 94.57%
Epoch 2/10 | Train Loss: 0.1578 | Val Loss: 0.1348 | Accuracy: 96.12%
Epoch 3/10 | Train Loss: 0.1107 | Val Loss: 0.1092 | Accuracy: 96.56%
Epoch 4/10 | Train Loss: 0.0831 | Val Loss: 0.0922 | Accuracy: 97.20%
Epoch 5/10 | Train Loss: 0.0650 | Val Loss: 0.0808 | Accuracy: 97.40%
Epoch 6/10 | Train Loss: 0.0537 | Val Loss: 0.0838 | Accuracy: 97.32%
Epoch 7/10 | Train Loss: 0.0436 | Val Loss: 0.0780 | Accuracy: 97.43%
Epoch 8/10 | Train Loss: 0.0359 | Val Loss: 0.0739 | Accuracy: 97.72%
Epoch 9/10 | Train Loss: 0.0298 | Val Loss: 0.0765 | Accuracy: 97.51%
Epoch 10/10 | Train Loss: 0.0239 | Val Loss: 0.0740 | Accuracy: 97.72%
