# Pytorch, Pytorch Lightning, Model monitoring, Hyper-parameter optimization


Authors:

- Andrei Kartavik - 153925


### Import dependencies


In [None]:
from typing import Any
from pathlib import Path
from dataclasses import dataclass

import pytorch_lightning as pl
from pytorch_lightning.callbacks import (
    ModelCheckpoint,
    EarlyStopping,
    LearningRateMonitor,
)
from pytorch_lightning.loggers import WandbLogger

import optuna
import wandb

import torch
from torch.utils.data import DataLoader, random_split
import torch.nn as nn
import torch.nn.functional as F
from torchmetrics.classification import MulticlassAccuracy
from torchvision import transforms
from torchvision.datasets import MNIST

### Lightning Data Module


In [2]:
class MNISTDataModule(pl.LightningDataModule):
    def __init__(
        self,
        data_dir: Path | str = Path("./data"),
        batch_size: int = 64,
        num_workers: int = 4,
        val_split: int = 5000,
    ):
        super().__init__()
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.val_split = val_split

        self.transform = transforms.Compose(
            [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
        )

    def prepare_data(self) -> None:
        MNIST(root=self.data_dir, train=True, download=True)
        MNIST(root=self.data_dir, train=False, download=True)

    def setup(self, stage: str) -> None:
        if stage == "fit":
            full_train = MNIST(
                root=self.data_dir, train=True, transform=self.transform, download=False
            )

            train_len = len(full_train) - self.val_split
            val_len = self.val_split

            self.train_ds, self.val_ds = random_split(
                full_train,
                [train_len, val_len],
                generator=torch.Generator().manual_seed(42),
            )
        elif stage == "test":
            self.test_ds = MNIST(
                root=self.data_dir,
                train=False,
                transform=self.transform,
                download=False,
            )
        elif stage == "predict":
            self.predict_ds = MNIST(
                root=self.data_dir,
                train=False,
                transform=self.transform,
                download=False,
            )

    def train_dataloader(self) -> DataLoader:
        return DataLoader(
            self.train_ds,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=self.num_workers,
            persistent_workers=True,
        )

    def val_dataloader(self) -> DataLoader:
        return DataLoader(
            self.val_ds,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.num_workers,
            persistent_workers=True,
        )

    def test_dataloader(self) -> DataLoader:
        return DataLoader(
            self.test_ds,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.num_workers,
            persistent_workers=True,
        )

    def predict_dataloader(self) -> DataLoader:
        return DataLoader(
            self.predict_ds,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.num_workers,
            pin_memory=True,
            persistent_workers=True,
        )


### LightningModule


In [None]:
class ClassifierMNIST(pl.LightningModule):
    def __init__(
        self, lr: float = 1e-3, dropout: float = 0.1, weight_decay: float = 1e-4
    ):
        super().__init__()

        self.lr = lr
        self.dropout_p = dropout
        self.weight_decay = weight_decay
        self.save_hyperparameters()

        self.features = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
        )

        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(64 * 7 * 7, 128),
            nn.ReLU(),
            nn.Dropout(self.dropout_p),
            nn.Linear(128, 10),
        )

        self.train_acc = MulticlassAccuracy(num_classes=10)
        self.val_acc = MulticlassAccuracy(num_classes=10)
        self.test_acc = MulticlassAccuracy(num_classes=10)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.classifier(self.features(x))

    def _shared_step(
        self,
        batch,
        stage: str,
        acc_metric: MulticlassAccuracy,
    ) -> torch.Tensor:
        x, y = batch
        logits = self(x)
        loss = F.cross_entropy(logits, y)
        preds = torch.argmax(logits, dim=1)

        acc = acc_metric(preds, y)

        self.log(f"{stage}_loss", loss, on_step=True, on_epoch=True, prog_bar=True)
        self.log(f"{stage}_acc", acc, on_step=True, on_epoch=True, prog_bar=True)

        return loss

    def training_step(
        self, batch: tuple[torch.Tensor, torch.Tensor], batch_idx: int
    ) -> torch.Tensor:
        return self._shared_step(batch, "train", self.train_acc)

    def validation_step(
        self, batch: tuple[torch.Tensor, torch.Tensor], batch_idx: int
    ) -> None:
        self._shared_step(batch, "val", self.val_acc)

    def test_step(
        self, batch: tuple[torch.Tensor, torch.Tensor], batch_idx: int
    ) -> None:
        self._shared_step(batch, "test", self.test_acc)

    def configure_optimizers(self) -> dict[str, Any]:
        optimizer = torch.optim.AdamW(
            self.parameters(),
            lr=self.lr,
            weight_decay=self.weight_decay,
        )
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode="min", factor=0.5, patience=2
        )
        return {
            "optimizer": optimizer,
            "lr_scheduler": {"scheduler": scheduler, "monitor": "val_loss"},
        }

### Training


In [4]:
@dataclass
class TrainConfig:
    batch_size: int = 64
    lr: float = 1e-3
    dropout: float = 0.1
    weight_decay: float = 1e-4
    epochs: int = 10
    num_workers: int = 4
    project: str = "put-lightning-hw"
    run_name: str = "baseline"


def build_callbacks():
    return [
        ModelCheckpoint(
            monitor="val_loss",
            mode="min",
            save_top_k=1,
            filename="mnist-{epoch:02d}-{val_loss:.4f}",
        ),
        EarlyStopping(
            monitor="val_loss",
            mode="min",
            patience=4,
        ),
        LearningRateMonitor(logging_interval="epoch"),
    ]


cfg = TrainConfig()

pl.seed_everything(42)

dm = MNISTDataModule(batch_size=cfg.batch_size, num_workers=cfg.num_workers)


model = ClassifierMNIST(lr=cfg.lr, dropout=cfg.dropout, weight_decay=cfg.weight_decay)

wandb_logger = WandbLogger(project=cfg.project, name=cfg.run_name, log_model=True)


trainer = pl.Trainer(
    max_epochs=cfg.epochs,
    accelerator="auto",
    devices="auto",
    logger=wandb_logger,
    callbacks=build_callbacks(),
    log_every_n_steps=50,
)

Seed set to 42
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores


In [5]:
trainer.fit(model, dm)

[34m[1mwandb[0m: Currently logged in as: [33mkarlsonav[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin



  | Name       | Type               | Params | Mode 
----------------------------------------------------------
0 | features   | Sequential         | 18.8 K | train
1 | classifier | Sequential         | 402 K  | train
2 | train_acc  | MulticlassAccuracy | 0      | train
3 | val_acc    | MulticlassAccuracy | 0      | train
4 | test_acc   | MulticlassAccuracy | 0      | train
----------------------------------------------------------
421 K     Trainable params
0         Non-trainable params
421 K     Total params
1.687     Total estimated model params size (MB)
16        Modules in train mode
0         Modules in eval mode


Epoch 9: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 860/860 [00:10<00:00, 78.27it/s, v_num=j1jv, train_loss_step=0.0168, train_acc_step=1.000, val_loss_step=0.0267, val_acc_step=1.000, val_loss_epoch=0.0368, val_acc_epoch=0.991, train_loss_epoch=0.0078, train_acc_epoch=0.997]    

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 860/860 [00:10<00:00, 78.24it/s, v_num=j1jv, train_loss_step=0.0168, train_acc_step=1.000, val_loss_step=0.0267, val_acc_step=1.000, val_loss_epoch=0.0368, val_acc_epoch=0.991, train_loss_epoch=0.0078, train_acc_epoch=0.997]


In [6]:
trainer.test(model, dm, ckpt_path="best")

Restoring states from the checkpoint path at ./put-lightning-hw/fiw8j1jv/checkpoints/mnist-epoch=08-val_loss=0.0349.ckpt
Loaded model weights from the checkpoint at ./put-lightning-hw/fiw8j1jv/checkpoints/mnist-epoch=08-val_loss=0.0349.ckpt


Testing DataLoader 0: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 157/157 [00:01<00:00, 121.68it/s]
â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
       Test metric             DataLoader 0
â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
     test_acc_epoch         0.9923626184463501
     test_loss_epoch        0.02674693986773491
â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â

[{'test_loss_epoch': 0.02674693986773491,
  'test_acc_epoch': 0.9923626184463501}]

### HPO


In [7]:
def hpo_objective(trial: optuna.Trial) -> float:
    pl.seed_everything(42)

    lr = trial.suggest_float("lr", 1e-5, 1e-3, log=True)
    dropout = trial.suggest_float("dropout", 0.1, 0.5)
    weight_decay = trial.suggest_float("weight_decay", 1e-5, 1e-3, log=True)
    batch_size = trial.suggest_categorical("batch_size", [32, 64, 128])

    dm = MNISTDataModule(batch_size=batch_size)
    model = ClassifierMNIST(lr=lr, dropout=dropout, weight_decay=weight_decay)

    run = wandb.init(
        project=cfg.project,
        name=f"optuna-trial-{trial.number}",
        reinit="create_new",
    )
    wandb_logger = WandbLogger(experiment=run, log_model=False)

    trainer = pl.Trainer(
        max_epochs=cfg.epochs,
        accelerator="auto",
        devices="auto",
        logger=wandb_logger,
        callbacks=build_callbacks(),
        enable_progress_bar=False,
    )

    trainer.fit(model, dm)

    val_loss = trainer.callback_metrics["val_loss"].item()
    val_acc = trainer.callback_metrics["val_acc"].item()
    trial.set_user_attr("val_acc", val_acc)
    trial.set_user_attr("best_model_path", trainer.checkpoint_callback.best_model_path)

    return val_loss


In [8]:
study = optuna.create_study(direction="minimize")
study.optimize(hpo_objective, n_trials=5)

[I 2025-11-27 22:45:24,687] A new study created in memory with name: no-name-f3f0c49e-a4b7-4b08-8e51-6730b718dd71
Seed set to 42


GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores

  | Name       | Type               | Params | Mode 
----------------------------------------------------------
0 | features   | Sequential         | 18.8 K | train
1 | classifier | Sequential         | 402 K  | train
2 | train_acc  | MulticlassAccuracy | 0      | train
3 | val_acc    | MulticlassAccuracy | 0      | train
4 | test_acc   | MulticlassAccuracy | 0      | train
----------------------------------------------------------
421 K     Trainable params
0         Non-trainable params
421 K     Total params
1.687     Total estimated model params size (MB)
16        Modules in train mode
0         Modules in eval mode
`Trainer.fit` stopped: `max_epochs=10` reached.
[I 2025-11-27 22:46:59,232] Trial 0 finished with value: 0.047170329838991165 and parameters: {'lr': 7.025063562656615e-05, 'dropout': 0.31406987209232373, 'weight_decay': 2.825818195272384e-05, 'batch_size': 64}. Best is trial 0 with value: 0

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores

  | Name       | Type               | Params | Mode 
----------------------------------------------------------
0 | features   | Sequential         | 18.8 K | train
1 | classifier | Sequential         | 402 K  | train
2 | train_acc  | MulticlassAccuracy | 0      | train
3 | val_acc    | MulticlassAccuracy | 0      | train
4 | test_acc   | MulticlassAccuracy | 0      | train
----------------------------------------------------------
421 K     Trainable params
0         Non-trainable params
421 K     Total params
1.687     Total estimated model params size (MB)
16        Modules in train mode
0         Modules in eval mode
`Trainer.fit` stopped: `max_epochs=10` reached.
[I 2025-11-27 22:47:55,231] Trial 1 finished with value: 0.03551575168967247 and parameters: {'lr': 0.0009027058173154408, 'dropout': 0.2286517095997713, 'weight_decay': 2.6764546413633446e-05, 'batch_size': 128}. Best is trial 1 with value: 0

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores

  | Name       | Type               | Params | Mode 
----------------------------------------------------------
0 | features   | Sequential         | 18.8 K | train
1 | classifier | Sequential         | 402 K  | train
2 | train_acc  | MulticlassAccuracy | 0      | train
3 | val_acc    | MulticlassAccuracy | 0      | train
4 | test_acc   | MulticlassAccuracy | 0      | train
----------------------------------------------------------
421 K     Trainable params
0         Non-trainable params
421 K     Total params
1.687     Total estimated model params size (MB)
16        Modules in train mode
0         Modules in eval mode
[I 2025-11-27 22:50:40,742] Trial 2 finished with value: 0.03653668984770775 and parameters: {'lr': 0.0008321937568831512, 'dropout': 0.41928998983819565, 'weight_decay': 0.0002198331923627932, 'batch_size': 32}. Best is trial 1 with value: 0.03551575168967247.
Seed set to 42


GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores

  | Name       | Type               | Params | Mode 
----------------------------------------------------------
0 | features   | Sequential         | 18.8 K | train
1 | classifier | Sequential         | 402 K  | train
2 | train_acc  | MulticlassAccuracy | 0      | train
3 | val_acc    | MulticlassAccuracy | 0      | train
4 | test_acc   | MulticlassAccuracy | 0      | train
----------------------------------------------------------
421 K     Trainable params
0         Non-trainable params
421 K     Total params
1.687     Total estimated model params size (MB)
16        Modules in train mode
0         Modules in eval mode
`Trainer.fit` stopped: `max_epochs=10` reached.
[I 2025-11-27 22:53:49,421] Trial 3 finished with value: 0.05262177065014839 and parameters: {'lr': 3.2523881187812815e-05, 'dropout': 0.10713012928854138, 'weight_decay': 0.00020324819410084616, 'batch_size': 32}. Best is trial 1 with value: 

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores

  | Name       | Type               | Params | Mode 
----------------------------------------------------------
0 | features   | Sequential         | 18.8 K | train
1 | classifier | Sequential         | 402 K  | train
2 | train_acc  | MulticlassAccuracy | 0      | train
3 | val_acc    | MulticlassAccuracy | 0      | train
4 | test_acc   | MulticlassAccuracy | 0      | train
----------------------------------------------------------
421 K     Trainable params
0         Non-trainable params
421 K     Total params
1.687     Total estimated model params size (MB)
16        Modules in train mode
0         Modules in eval mode
`Trainer.fit` stopped: `max_epochs=10` reached.
[I 2025-11-27 22:54:43,706] Trial 4 finished with value: 0.03759666532278061 and parameters: {'lr': 0.0006707415243196819, 'dropout': 0.34112442166192264, 'weight_decay': 0.00021399383651116293, 'batch_size': 128}. Best is trial 1 with value: 

In [13]:
print("Best val_loss:", study.best_trial.value)
print("Best params:", study.best_trial.params)
print("Best val_acc:", study.best_trial.user_attrs["val_acc"])

Best val_loss: 0.03551575168967247
Best params: {'lr': 0.0009027058173154408, 'dropout': 0.2286517095997713, 'weight_decay': 2.6764546413633446e-05, 'batch_size': 128}
Best val_acc: 0.9901406168937683


### Comparing Baseline vs HPO

In [None]:
baseline_ckpt = trainer.checkpoint_callback.best_model_path
baseline_model = ClassifierMNIST.load_from_checkpoint(baseline_ckpt)

hpo_best_trial = study.best_trial
hpo_ckpt = hpo_best_trial.user_attrs["best_model_path"]
hpo_model: ClassifierMNIST = ClassifierMNIST.load_from_checkpoint(hpo_ckpt)

eval_trainer = pl.Trainer(accelerator="auto", devices="auto", logger=False)

print("===Baseline model metrics===")
baseline_metrics = eval_trainer.test(baseline_model, dm, verbose=False)
print(baseline_metrics)

print("\n===HPO model metrics===")
hpo_metrics = eval_trainer.test(hpo_model, dm, verbose=False)
print(hpo_metrics)

ðŸ’¡ Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores


===Baseline model metrics===
Testing DataLoader 0: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 157/157 [00:01<00:00, 127.90it/s]
[{'test_loss_epoch': 0.02674693986773491, 'test_acc_epoch': 0.9923626184463501}]

===HPO model metrics===
Testing DataLoader 0: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 157/157 [00:01<00:00, 113.06it/s]
[{'test_loss_epoch': 0.023135846480727196, 'test_acc_epoch': 0.992534875869751}]
