# Hyperparameter Tuning using Ray package
* Hyperparameter tuning is done using the Asynchronous Successive Halving Algorithm (ASHA) scheduler
* **Model** : a classification model built with PyTorch Lightning
* **Dataset used for training the model** : MNIST


In [None]:
!pip install "ray[tune]" torch torchvision pytorch-lightning==1.9.5

In [2]:
import sys
# Raise the interpreter's recursion limit to 30000 for the rest of the session.
# NOTE(review): the reason is not stated in this notebook — presumably a
# workaround for a RecursionError hit while running the cells below; confirm
# it is still needed before keeping it.
sys.setrecursionlimit(30000)

In [3]:
import math

import torch
import pytorch_lightning as pl
from filelock import FileLock
from torch.utils.data import DataLoader, random_split
from torch.nn import functional as F
from torchvision.datasets import MNIST
from torchvision import transforms
import os


In [4]:
from pytorch_lightning.loggers import TensorBoardLogger
from ray import train, tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler
from ray.tune.integration.pytorch_lightning import (
    TuneReportCallback,
    TuneReportCheckpointCallback,
)


## Classification Model With Lightning
Build a classification model class by subclassing `LightningModule`.

In [5]:
class LightningMNISTClassifier(pl.LightningModule):
    """Fully connected MNIST classifier with two tunable hidden layers.

    Args:
        config: Mapping that provides ``layer_1_size``, ``layer_2_size``,
            ``lr`` and ``batch_size``.
        data_dir: Directory MNIST is downloaded to and read from. Falls back
            to the current working directory when not given.
    """

    def __init__(self, config, data_dir=None):
        super().__init__()

        self.data_dir = data_dir or os.getcwd()

        self.layer_1_size = config["layer_1_size"]
        self.layer_2_size = config["layer_2_size"]
        self.lr = config["lr"]
        self.batch_size = config["batch_size"]

        # MNIST images are (1, 28, 28): (channels, height, width).
        self.layer_1 = torch.nn.Linear(28 * 28, self.layer_1_size)
        self.layer_2 = torch.nn.Linear(self.layer_1_size, self.layer_2_size)
        self.layer_3 = torch.nn.Linear(self.layer_2_size, 10)

    def forward(self, x):
        """Return per-class log-probabilities for a batch of images.

        Everything after the batch dimension is flattened, so each sample
        must hold 28 * 28 values to match ``layer_1``.
        """
        x = x.view(x.size(0), -1)

        x = torch.relu(self.layer_1(x))
        x = torch.relu(self.layer_2(x))

        return torch.log_softmax(self.layer_3(x), dim=1)

    def cross_entropy_loss(self, logits, labels):
        """Negative log-likelihood of ``labels`` under the model output.

        ``logits`` is the output of ``forward`` (already log-softmaxed), so
        NLL on it is the cross-entropy loss.
        """
        return F.nll_loss(logits, labels)

    def accuracy(self, logits, labels):
        """Return the fraction of correct predictions as a scalar tensor.

        Computed entirely with tensor ops so no ``.item()`` round-trip to the
        host is needed on every step.
        """
        predicted = logits.argmax(dim=1)
        return (predicted == labels).float().mean()

    def training_step(self, train_batch, batch_idx):
        """Compute and log the training loss/accuracy for one batch."""
        x, y = train_batch
        logits = self(x)
        loss = self.cross_entropy_loss(logits, y)
        accuracy = self.accuracy(logits, y)

        self.log("ptl/train_loss", loss)
        self.log("ptl/train_accuracy", accuracy)
        return loss

    def validation_step(self, val_batch, batch_idx):
        """Return the loss and accuracy of one validation batch."""
        x, y = val_batch
        logits = self(x)
        loss = self.cross_entropy_loss(logits, y)
        accuracy = self.accuracy(logits, y)
        return {"val_loss": loss, "val_accuracy": accuracy}

    def validation_epoch_end(self, outputs):
        """Average the per-batch validation results and log them.

        The logged names ``ptl/val_loss`` and ``ptl/val_accuracy`` are the
        ones the Tune report callback reads.
        """
        avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
        avg_acc = torch.stack([x["val_accuracy"] for x in outputs]).mean()
        self.log("ptl/val_loss", avg_loss)
        self.log("ptl/val_accuracy", avg_acc)

    # NOTE(review): ``validation_epoch_end`` is a Lightning 1.x hook (this
    # notebook installs pytorch-lightning==1.9.5). Porting to 2.x needs
    # ``on_validation_epoch_end`` plus a list attribute that
    # ``validation_step`` appends to — confirm against the migration guide.

    @staticmethod
    def download_data(data_dir):
        """Return the normalized MNIST training set, downloading if needed.

        A file lock serializes the download across concurrent processes.
        """
        transform = transforms.Compose(
            [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
        )
        with FileLock(os.path.expanduser("~/.data.lock")):
            return MNIST(data_dir, train=True, download=True, transform=transform)

    def prepare_data(self):
        """Make sure MNIST is on disk.

        No attributes are assigned here: Lightning runs this hook on a single
        process only, so state set here would be missing on other processes.
        """
        self.download_data(self.data_dir)

    def setup(self, stage=None):
        """Load MNIST and split it into 55000 training / 5000 validation samples."""
        full_train = self.download_data(self.data_dir)
        self.mnist_train, self.mnist_val = random_split(full_train, [55000, 5000])

    def train_dataloader(self):
        """Return the loader over the training split."""
        return DataLoader(self.mnist_train, batch_size=int(self.batch_size))

    def val_dataloader(self):
        """Return the loader over the validation split."""
        return DataLoader(self.mnist_val, batch_size=int(self.batch_size))

    def configure_optimizers(self):
        """Return an Adam optimizer using the configured learning rate."""
        return torch.optim.Adam(self.parameters(), lr=self.lr)


# def train_mnist(config):
#     model = LightningMNISTClassifier(config)
#     trainer = pl.Trainer(max_epochs=10, enable_progress_bar=True)

#     trainer.fit(model)


## Tuning the model parameters

In [6]:
# Training Function
def train_mnist_tune(config, num_epochs=10, num_gpus=0, data_dir="~/data"):
    """Train one MNIST classifier and report validation metrics to Ray Tune.

    Args:
        config: Hyperparameters forwarded to ``LightningMNISTClassifier``.
        num_epochs: Maximum number of epochs to train for.
        num_gpus: GPUs assigned to this trial; may be fractional.
        data_dir: MNIST directory; a leading ``~`` is expanded.
    """
    data_dir = os.path.expanduser(data_dir)
    model = LightningMNISTClassifier(config, data_dir)

    # A fractional GPU share is rounded up because Lightning needs a whole
    # device count. ``accelerator``/``devices`` replace the deprecated
    # ``gpus`` Trainer argument.
    gpu_count = math.ceil(num_gpus)
    if gpu_count > 0:
        device_kwargs = {"accelerator": "gpu", "devices": gpu_count}
    else:
        device_kwargs = {"accelerator": "cpu"}

    trainer = pl.Trainer(
        max_epochs=num_epochs,
        logger=TensorBoardLogger(save_dir=os.getcwd(), name="", version="."),
        enable_progress_bar=True,
        callbacks=[
            # Forward the logged validation metrics to Tune under the names
            # "loss" and "mean_accuracy".
            TuneReportCallback(
                {"loss": "ptl/val_loss", "mean_accuracy": "ptl/val_accuracy"},
                on="validation_end",
            )
        ],
        **device_kwargs,
    )
    trainer.fit(model)



In [9]:
def tune_mnist_asha(num_samples=5, num_epochs=3, gpus_per_trial=1, data_dir="~/data"):
    """Run an ASHA-scheduled hyperparameter search over the MNIST classifier.

    Args:
        num_samples: Number of configurations sampled from the search space.
        num_epochs: Epoch budget per trial; also used as the scheduler's ``max_t``.
        gpus_per_trial: GPUs reserved for each trial.
        data_dir: MNIST directory passed through to the training function.

    Returns:
        The object returned by ``Tuner.fit()``, so callers can inspect every
        trial rather than only the printed best configuration.
    """
    # Configuring the search space
    config = {
        "layer_1_size": tune.choice([32, 64, 128]),
        "layer_2_size": tune.choice([64, 128, 256]),
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([32, 64, 128]),
    }
    # Selecting a scheduler: ASHA
    # NOTE(review): with max_t as small as the default 3, several of the 5
    # brackets probably end up without any halving rung — confirm whether
    # brackets=1 was intended.
    scheduler = ASHAScheduler(max_t=num_epochs, grace_period=1, reduction_factor=2, brackets=5)
    # Report
    reporter = CLIReporter(
        parameter_columns=["layer_1_size", "layer_2_size", "lr", "batch_size"],
        metric_columns=["loss", "mean_accuracy", "training_iteration"],
    )
    # Passing constants to the train function
    train_fn_with_parameters = tune.with_parameters(
        train_mnist_tune,
        num_epochs=num_epochs,
        num_gpus=gpus_per_trial,
        data_dir=data_dir,
    )
    resources_per_trial = {"cpu": 1, "gpu": gpus_per_trial}
    # Tuning hyperparameters
    tuner = tune.Tuner(
        tune.with_resources(train_fn_with_parameters, resources=resources_per_trial),
        tune_config=tune.TuneConfig(
            metric="loss",
            mode="min",
            scheduler=scheduler,
            num_samples=num_samples,
        ),
        run_config=train.RunConfig(
            name="tune_mnist_asha",
            progress_reporter=reporter,
        ),
        param_space=config,
    )
    results = tuner.fit()

    print("Best hyperparameters found were: ", results.get_best_result().config)
    return results


In [8]:
# Launch the search with the function's defaults: 5 samples, 3 epochs per
# trial, 1 GPU per trial, data under ~/data.
tune_mnist_asha()

2024-06-21 14:28:08,901	INFO worker.py:1770 -- Started a local Ray instance.
2024-06-21 14:28:11,562	INFO tune.py:253 -- Initializing Ray automatically. For cluster usage or custom Ray initialization, call `ray.init(...)` before `Tuner(...)`.


+----------------------------------------------------------+
| Configuration for experiment     tune_mnist_asha         |
+----------------------------------------------------------+
| Search algorithm                 BasicVariantGenerator   |
| Scheduler                        AsyncHyperBandScheduler |
| Number of trials                 5                       |
+----------------------------------------------------------+

View detailed results here: /root/ray_results/tune_mnist_asha
To visualize your results with TensorBoard, run: `tensorboard --logdir /tmp/ray/session_2024-06-21_14-28-05_437677_314/artifacts/2024-06-21_14-28-11/tune_mnist_asha/driver_artifacts`

Trial status: 5 PENDING
Current time: 2024-06-21 14:28:12. Total running time: 0s
Logical resource usage: 0/2 CPUs, 0/1 GPUs (0.0/1.0 accelerator_type:T4)
+-------------------------------------------------------------------------------------------------------+
| Trial name                     status       layer_1_size     la

[36m(train_mnist_tune pid=1228)[0m   rank_zero_deprecation(
[36m(train_mnist_tune pid=1228)[0m GPU available: True (cuda), used: True
[36m(train_mnist_tune pid=1228)[0m TPU available: False, using: 0 TPU cores
[36m(train_mnist_tune pid=1228)[0m IPU available: False, using: 0 IPUs
[36m(train_mnist_tune pid=1228)[0m HPU available: False, using: 0 HPUs


[36m(train_mnist_tune pid=1228)[0m Failed to download (trying next):
[36m(train_mnist_tune pid=1228)[0m HTTP Error 403: Forbidden
[36m(train_mnist_tune pid=1228)[0m 
[36m(train_mnist_tune pid=1228)[0m Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz
[36m(train_mnist_tune pid=1228)[0m Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz to /root/data/MNIST/raw/train-images-idx3-ubyte.gz


[36m(train_mnist_tune pid=1228)[0m   0%|          | 0/9912422 [00:00<?, ?it/s]
  1%|          | 65536/9912422 [00:00<00:18, 545950.54it/s]
  2%|▏         | 163840/9912422 [00:00<00:13, 701520.53it/s]
  6%|▋         | 622592/9912422 [00:00<00:04, 2109913.46it/s]
 25%|██▍       | 2457600/9912422 [00:00<00:01, 7244788.08it/s]
100%|██████████| 9912422/9912422 [00:00<00:00, 16224579.16it/s]


[36m(train_mnist_tune pid=1228)[0m Extracting /root/data/MNIST/raw/train-images-idx3-ubyte.gz to /root/data/MNIST/raw
[36m(train_mnist_tune pid=1228)[0m 
[36m(train_mnist_tune pid=1228)[0m Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
[36m(train_mnist_tune pid=1228)[0m Failed to download (trying next):
[36m(train_mnist_tune pid=1228)[0m HTTP Error 403: Forbidden
[36m(train_mnist_tune pid=1228)[0m 
[36m(train_mnist_tune pid=1228)[0m Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz
[36m(train_mnist_tune pid=1228)[0m Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz to /root/data/MNIST/raw/train-labels-idx1-ubyte.gz


[36m(train_mnist_tune pid=1228)[0m   0%|          | 0/28881 [00:00<?, ?it/s]
[36m(train_mnist_tune pid=1228)[0m 100%|██████████| 28881/28881 [00:00<00:00, 473504.44it/s]


[36m(train_mnist_tune pid=1228)[0m Extracting /root/data/MNIST/raw/train-labels-idx1-ubyte.gz to /root/data/MNIST/raw
[36m(train_mnist_tune pid=1228)[0m 
[36m(train_mnist_tune pid=1228)[0m Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
[36m(train_mnist_tune pid=1228)[0m Failed to download (trying next):
[36m(train_mnist_tune pid=1228)[0m HTTP Error 403: Forbidden
[36m(train_mnist_tune pid=1228)[0m 
[36m(train_mnist_tune pid=1228)[0m Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz
[36m(train_mnist_tune pid=1228)[0m Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz to /root/data/MNIST/raw/t10k-images-idx3-ubyte.gz


[36m(train_mnist_tune pid=1228)[0m   0%|          | 0/1648877 [00:00<?, ?it/s]
  4%|▍         | 65536/1648877 [00:00<00:02, 532550.18it/s]
 12%|█▏        | 196608/1648877 [00:00<00:01, 845040.70it/s]
100%|██████████| 1648877/1648877 [00:00<00:00, 3812548.97it/s]


[36m(train_mnist_tune pid=1228)[0m Extracting /root/data/MNIST/raw/t10k-images-idx3-ubyte.gz to /root/data/MNIST/raw
[36m(train_mnist_tune pid=1228)[0m 
[36m(train_mnist_tune pid=1228)[0m Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
[36m(train_mnist_tune pid=1228)[0m Failed to download (trying next):
[36m(train_mnist_tune pid=1228)[0m HTTP Error 403: Forbidden
[36m(train_mnist_tune pid=1228)[0m 
[36m(train_mnist_tune pid=1228)[0m Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz
[36m(train_mnist_tune pid=1228)[0m Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz to /root/data/MNIST/raw/t10k-labels-idx1-ubyte.gz


[36m(train_mnist_tune pid=1228)[0m   0%|          | 0/4542 [00:00<?, ?it/s]100%|██████████| 4542/4542 [00:00<00:00, 11088782.75it/s]


[36m(train_mnist_tune pid=1228)[0m Extracting /root/data/MNIST/raw/t10k-labels-idx1-ubyte.gz to /root/data/MNIST/raw
[36m(train_mnist_tune pid=1228)[0m 


[36m(train_mnist_tune pid=1228)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(train_mnist_tune pid=1228)[0m 
[36m(train_mnist_tune pid=1228)[0m   | Name    | Type   | Params
[36m(train_mnist_tune pid=1228)[0m -----------------------------------
[36m(train_mnist_tune pid=1228)[0m 0 | layer_1 | Linear | 100 K 
[36m(train_mnist_tune pid=1228)[0m 1 | layer_2 | Linear | 8.3 K 
[36m(train_mnist_tune pid=1228)[0m 2 | layer_3 | Linear | 650   
[36m(train_mnist_tune pid=1228)[0m -----------------------------------
[36m(train_mnist_tune pid=1228)[0m 109 K     Trainable params
[36m(train_mnist_tune pid=1228)[0m 0         Non-trainable params
[36m(train_mnist_tune pid=1228)[0m 109 K     Total params
[36m(train_mnist_tune pid=1228)[0m 0.438     Total estimated model params size (MB)
[36m(train_mnist_tune pid=1228)[0m 2024-06-21 14:28:25.246561: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register facto

Sanity Checking: 0it [00:00, ?it/s]
Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]
Sanity Checking DataLoader 0: 100%|██████████| 2/2 [00:00<00:00,  6.34it/s]
Epoch 0:   0%|          | 0/1876 [00:00<?, ?it/s] 
Epoch 0:   1%|          | 20/1876 [00:00<00:48, 38.60it/s, loss=1.66, v_num=.]
Epoch 0:   2%|▏         | 40/1876 [00:00<00:35, 51.82it/s, loss=0.695, v_num=.]
Epoch 0:   3%|▎         | 60/1876 [00:01<00:30, 58.78it/s, loss=0.591, v_num=.]
Epoch 0:   4%|▍         | 80/1876 [00:01<00:28, 62.48it/s, loss=0.505, v_num=.]
Epoch 0:   5%|▌         | 100/1876 [00:01<00:27, 64.76it/s, loss=0.529, v_num=.]
Epoch 0:   6%|▋         | 120/1876 [00:01<00:26, 67.33it/s, loss=0.52, v_num=.] 
Epoch 0:   7%|▋         | 140/1876 [00:02<00:25, 69.13it/s, loss=0.416, v_num=.]
Epoch 0:   9%|▊         | 160/1876 [00:02<00:24, 70.14it/s, loss=0.517, v_num=.]
Epoch 0:  10%|▉         | 180/1876 [00:02<00:23, 71.09it/s, loss=0.431, v_num=.]
Epoch 0:  11%|█         | 200/1876 [00:02<00:

2024-06-21 14:29:43,960	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/root/ray_results/tune_mnist_asha' in 0.0062s.
Resume experiment with: Tuner.restore(path="/root/ray_results/tune_mnist_asha", trainable=...)
- train_mnist_tune_7ce32_00001: FileNotFoundError('Could not fetch metrics for train_mnist_tune_7ce32_00001: both result.json and progress.csv were not found at /root/ray_results/tune_mnist_asha/train_mnist_tune_7ce32_00001_1_batch_size=32,layer_1_size=64,layer_2_size=128,lr=0.0586_2024-06-21_14-28-12')
- train_mnist_tune_7ce32_00002: FileNotFoundError('Could not fetch metrics for train_mnist_tune_7ce32_00002: both result.json and progress.csv were not found at /root/ray_results/tune_mnist_asha/train_mnist_tune_7ce32_00002_2_batch_size=64,layer_1_size=64,layer_2_size=128,lr=0.0014_2024-06-21_14-28-12')
- train_mnist_tune_7ce32_00003: FileNotFoundError('Could not fetch metrics for train_mnist_tune_7ce32_00003: both result.json and pro


Trial status: 1 TERMINATED | 4 PENDING
Current time: 2024-06-21 14:29:43. Total running time: 1min 31s
Logical resource usage: 0/2 CPUs, 0/1 GPUs (0.0/1.0 accelerator_type:T4)
Current best trial: 7ce32_00000 with loss=0.32802343368530273 and params={'layer_1_size': 128, 'layer_2_size': 64, 'lr': 0.011934566962302634, 'batch_size': 32}
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                     status         layer_1_size     layer_2_size           lr     batch_size        acc     iter     total time (s)       loss |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| train_mnist_tune_7ce32_00000   TERMINATED              128               64   0.0119346              32   0.928344        3             84.849   0.328023 |
| train_mnist_tune_7ce32_00001