# Hyper-parameter tuning with RAY tutorial

220831, by wygo

- [ref](https://pytorch.org/tutorials/beginner/hyperparameter_tuning_tutorial.html)

In [1]:
## install ray
# https://docs.ray.io/en/latest/index.html
!pip install ray

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ray
  Downloading ray-2.0.0-cp37-cp37m-manylinux2014_x86_64.whl (59.4 MB)
[K     |████████████████████████████████| 59.4 MB 1.3 MB/s 
Collecting virtualenv
  Downloading virtualenv-20.16.3-py2.py3-none-any.whl (8.8 MB)
[K     |████████████████████████████████| 8.8 MB 33.2 MB/s 
[?25hCollecting grpcio<=1.43.0,>=1.28.1
  Downloading grpcio-1.43.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.1 MB)
[K     |████████████████████████████████| 4.1 MB 44.2 MB/s 
Collecting distlib<1,>=0.3.5
  Downloading distlib-0.3.6-py2.py3-none-any.whl (468 kB)
[K     |████████████████████████████████| 468 kB 69.8 MB/s 
[?25hCollecting platformdirs<3,>=2.4
  Downloading platformdirs-2.5.2-py3-none-any.whl (14 kB)
Installing collected packages: platformdirs, distlib, virtualenv, grpcio, ray
  Attempting uninstall: grpcio
    Found existing installation: grpcio 1.47.0
    Uninstal

- Hyperparameters: learning rate, batch size, network layer size
- They have a dramatic impact on DL model performance
- Ray Tune helps to find the best combination of parameters
- Industry standard tool
    - distributed hyperparameter tuning
    - latest hyperparameter search algorithms
- Process
    - 1) wrap data loading and training in functions
    - 2) make some network parameters configurable
    - 3) add checkpointing (optional),
    - 4) define the search space for the model tuning

In [2]:
# import
%matplotlib inline
from functools import partial
import numpy as np
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import random_split
import torchvision
import torchvision.transforms as transforms
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler

In [3]:
# functions
def load_data(data_dir="./data"):
    """Return the CIFAR-10 ``(trainset, testset)`` pair rooted at ``data_dir``.

    Both splits are created with ``download=True`` and share one transform:
    conversion to a tensor followed by per-channel normalisation with mean 0.5
    and std 0.5.
    """
    normalize = transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    pipeline = transforms.Compose([transforms.ToTensor(), normalize])

    # Everything except the train/test flag is identical for the two splits.
    shared_kwargs = dict(root=data_dir, download=True, transform=pipeline)
    train_split = torchvision.datasets.CIFAR10(train=True, **shared_kwargs)
    test_split = torchvision.datasets.CIFAR10(train=False, **shared_kwargs)

    return train_split, test_split

# RAY를 이용한 모델 구조 변경

모델의 hidden vector size를 configure로 정의하여 RAY에서 최적화 한다

In [4]:
# define model
class Net(nn.Module):
    """Small CNN: two conv+pool stages followed by three fully connected layers.

    Args:
        l1: Output width of the first fully connected layer.
        l2: Output width of the second fully connected layer.

    The flatten size ``16 * 5 * 5`` is what the two 5x5 convolutions and 2x2
    poolings produce for 32x32 inputs with 3 channels; the final layer emits
    10 scores per sample.
    """

    def __init__(self, l1=120, l2=84):
        super().__init__()
        # Layers are created in this fixed order so that parameter names and
        # random initialisation match across instances built from one seed.
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, l1)
        self.fc2 = nn.Linear(l1, l2)
        self.fc3 = nn.Linear(l2, 10)

    def forward(self, x):
        """Map a batch of images to a ``(batch, 10)`` tensor of class scores."""
        # Convolutional feature extractor: conv -> ReLU -> max-pool, twice.
        for conv in (self.conv1, self.conv2):
            x = self.pool(F.relu(conv(x)))

        # Collapse channel and spatial dimensions into one feature vector.
        x = x.view(-1, 16 * 5 * 5)

        # Two hidden layers with ReLU, then the linear output layer.
        for hidden in (self.fc1, self.fc2):
            x = F.relu(hidden(x))
        return self.fc3(x)

## Train function
------------------

- RAY 튜닝을 위하여 [pytorch 공식 튜토리얼](https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html)을 조금만 수정하여 사용할 수 있음

- ``train_cifar(config, checkpoint_dir=None, data_dir=None)`` 함수에 hyperparameters 정보가 담긴  ``config`` parameter를 지정해서 입력

.. code-block:: python

    net = Net(config["l1"], config["l2"])

- Learning rate 역시 아래와 같이 optimizer에 config를 지정 가능

.. code-block:: python

    optimizer = optim.SGD(net.parameters(), lr=config["lr"], momentum=0.9)

- Multi-GPU 사용 (multi) GPU support with DataParallel
    - Pytorch의 multi gpu를 위한 ``nn.DataParallel`` 함수 사용 가능
    - device='cpu'로 cpu 사용 가능

.. code-block:: python

    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if torch.cuda.device_count() > 1:
            net = nn.DataParallel(net)
    net.to(device)

.. code-block:: python

    for i, data in enumerate(trainloader, 0):
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)

- PyTorch와 Ray Tune 간의 통신을 위한 tune.report
    - loss, accuracy 등을 기준으로 hyperparameter config 수정
    - bad performing trials를 조기에 멈추는 데도 쓰임
    
.. code-block:: python

    with tune.checkpoint_dir(epoch) as checkpoint_dir:
        path = os.path.join(checkpoint_dir, "checkpoint")
        torch.save((net.state_dict(), optimizer.state_dict()), path)

    tune.report(loss=(val_loss / val_steps), accuracy=correct / total)

In [5]:
# define train function
def train_cifar(config, checkpoint_dir=None, data_dir=None):
    """Train ``Net`` on CIFAR-10 for one Ray Tune trial and report validation metrics.

    Args:
        config: Hyperparameter dict with the keys ``"l1"`` and ``"l2"`` (hidden
            layer widths passed to ``Net``), ``"lr"`` (SGD learning rate) and
            ``"batch_size"`` (used for both the training and validation loader).
        checkpoint_dir: Optional directory containing a file named
            ``"checkpoint"`` that holds a ``(model_state, optimizer_state)``
            tuple to resume from.
        data_dir: CIFAR-10 root directory forwarded to ``load_data``. When it is
            ``None``, ``load_data`` is called with its own default root.

    The training set is split 80/20 into training and validation subsets. Each
    of the 10 epochs ends by writing a checkpoint through
    ``tune.checkpoint_dir`` and calling ``tune.report`` with the mean
    validation loss and the validation accuracy.
    """
    net = Net(config["l1"], config["l2"])

    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if torch.cuda.device_count() > 1:
            net = nn.DataParallel(net)
    net.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=config["lr"], momentum=0.9)

    if checkpoint_dir:
        model_state, optimizer_state = torch.load(
            os.path.join(checkpoint_dir, "checkpoint"))
        net.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)

    # Passing None on to load_data would override its default root, so only
    # forward data_dir when the caller actually supplied one.
    if data_dir is not None:
        trainset, _ = load_data(data_dir)
    else:
        trainset, _ = load_data()

    # 80% of the training set is used for fitting, the rest for validation.
    train_size = int(len(trainset) * 0.8)
    train_subset, val_subset = random_split(
        trainset, [train_size, len(trainset) - train_size])

    # Never start more loader workers than the machine has CPUs; PyTorch warns
    # that oversubscribing them can slow the loader down or even freeze it.
    loader_workers = min(8, os.cpu_count() or 1)

    trainloader = torch.utils.data.DataLoader(
        train_subset,
        batch_size=int(config["batch_size"]),
        shuffle=True,
        num_workers=loader_workers)
    valloader = torch.utils.data.DataLoader(
        val_subset,
        batch_size=int(config["batch_size"]),
        shuffle=True,
        num_workers=loader_workers)

    for epoch in range(10):  # loop over the dataset multiple times
        running_loss = 0.0
        epoch_steps = 0
        for i, data in enumerate(trainloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            epoch_steps += 1
            if i % 2000 == 1999:  # print every 2000 mini-batches
                print("[%d, %5d] loss: %.3f" % (epoch + 1, i + 1,
                                                running_loss / epoch_steps))
                # Reset both accumulators together; resetting only the sum
                # would divide the next window by a cumulative step count.
                running_loss = 0.0
                epoch_steps = 0

        # Validation loss
        val_loss = 0.0
        val_steps = 0
        total = 0
        correct = 0
        with torch.no_grad():
            for inputs, labels in valloader:
                inputs, labels = inputs.to(device), labels.to(device)

                outputs = net(inputs)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

                # .item() keeps the accumulated metric a plain Python float.
                val_loss += criterion(outputs, labels).item()
                val_steps += 1

        with tune.checkpoint_dir(epoch) as checkpoint_dir:
            path = os.path.join(checkpoint_dir, "checkpoint")
            torch.save((net.state_dict(), optimizer.state_dict()), path)

        tune.report(loss=(val_loss / val_steps), accuracy=correct / total)
    print("Finished Training")

In [6]:
# define test function
def test_accuracy(net, device="cpu"):
    """Return the fraction of CIFAR-10 test images that ``net`` classifies correctly.

    Args:
        net: Model called on each image batch; the index of its largest output
            per sample is taken as the predicted class.
        device: Device the image and label batches are moved to before the
            forward pass.

    The test split comes from ``load_data()`` with its default root and is read
    in order, in batches of 4.
    """
    _, held_out = load_data()

    loader = torch.utils.data.DataLoader(
        held_out, batch_size=4, shuffle=False, num_workers=2)

    hits, seen = 0, 0
    with torch.no_grad():
        for images, labels in loader:
            images = images.to(device)
            labels = labels.to(device)
            # torch.max over dim 1 returns (values, indices); keep the indices.
            predicted = torch.max(net(images).data, 1)[1]
            seen += labels.size(0)
            hits += (predicted == labels).sum().item()

    return hits / seen

## Configuring the search space

    - Define Ray Tune's search space
        - ``tune.sample_from()``: powers of two from 2^2 to 2^8 (4 to 256); ``np.random.randint(2, 9)`` excludes the upper bound 9
        - ``tune.loguniform(1e-4, 1e-1)``: sampled log-uniformly between 0.0001 and 0.1
        - ``tune.choice``: choice between 2, 4, 8, and 16.    
.. code-block:: python

    config = {
        "l1": tune.sample_from(lambda _: 2**np.random.randint(2, 9)),
        "l2": tune.sample_from(lambda _: 2**np.random.randint(2, 9)),
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([2, 4, 8, 16])
    }


    - Ray Tune은 이 search space에서 hyperparameter들의 조합을 랜덤으로 생성한다
    - 여러 조합의 모델들을 병렬로 학습하고 best performing하는 조합을 찾는다
    - ``ASHAScheduler`` 스케쥴러로 bad performing trials인 경우 조기에 멈춘다

.. code-block:: python

    gpus_per_trial = 2
    # ...
    result = tune.run(
        partial(train_cifar, data_dir=data_dir),
        resources_per_trial={"cpu": 8, "gpu": gpus_per_trial},
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter,
        checkpoint_at_end=True)

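        - CPU: the CPUs requested per trial are available to the trial, e.g. for the ``num_workers`` of the PyTorch ``DataLoader`` instances
        - GPU: the requested number of GPUs is made visible to PyTorch in each trial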
    - 학습이 완료되면 best performing 조합과 학습된 모델을 저장
        - test set accuracy and report everything by printing 가능

In [7]:
# define main function
def main(num_samples=10, max_num_epochs=10, gpus_per_trial=2):
    """Run the Ray Tune search over ``train_cifar`` and evaluate the best trial.

    Args:
        num_samples: Number of hyperparameter configurations Ray Tune samples.
        max_num_epochs: Passed to ``ASHAScheduler`` as ``max_t``.
        gpus_per_trial: GPU resource request per trial. The restored best model
            is wrapped in ``nn.DataParallel`` when this is greater than 1 and
            CUDA is available.

    Returns:
        The trial with the lowest last-reported validation loss.
    """
    data_dir = os.path.abspath("./data")
    # Called once here, before the trials, which receive the same absolute path.
    load_data(data_dir)
    config = {
        "l1": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),
        "l2": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([2, 4, 8, 16])
    }
    scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=max_num_epochs,
        grace_period=1,
        reduction_factor=2)
    reporter = CLIReporter(
        # parameter_columns=["l1", "l2", "lr", "batch_size"],
        metric_columns=["loss", "accuracy", "training_iteration"])
    result = tune.run(
        partial(train_cifar, data_dir=data_dir),
        resources_per_trial={"cpu": 2, "gpu": gpus_per_trial},
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter)

    best_trial = result.get_best_trial("loss", "min", "last")
    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final validation loss: {}".format(
        best_trial.last_result["loss"]))
    print("Best trial final validation accuracy: {}".format(
        best_trial.last_result["accuracy"]))

    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    # Older Ray releases exposed this path as best_trial.checkpoint.value;
    # the release used here exposes it as dir_or_data.
    best_checkpoint_dir = best_trial.checkpoint.dir_or_data

    # map_location lets a checkpoint written on a GPU load on a CPU-only driver.
    model_state, _ = torch.load(
        os.path.join(best_checkpoint_dir, "checkpoint"), map_location=device)

    # A trial that wrapped its model in nn.DataParallel saves keys prefixed with
    # "module."; strip the prefix so the weights load into the bare model no
    # matter how the trial and this process differ in visible GPUs.
    prefix = "module."
    model_state = {
        (key[len(prefix):] if key.startswith(prefix) else key): value
        for key, value in model_state.items()
    }

    best_trained_model = Net(best_trial.config["l1"], best_trial.config["l2"])
    best_trained_model.load_state_dict(model_state)
    if device != "cpu" and gpus_per_trial > 1:
        best_trained_model = nn.DataParallel(best_trained_model)
    best_trained_model.to(device)

    test_acc = test_accuracy(best_trained_model, device)
    print("Best trial test set accuracy: {}".format(test_acc))
    print('best model path: %s'%best_trial.checkpoint.dir_or_data)
    return best_trial

if __name__ == "__main__":
    # gpus_per_trial is forwarded to Ray Tune as the per-trial GPU resource
    # request; change it here (0 requests no GPU).
    # Larger search, noted by the author as taking about 40 minutes:
    # best_trial = main(num_samples=10, max_num_epochs=10, gpus_per_trial=0)  # 40min
    # Smaller search, noted by the author as taking about 5 minutes:
    best_trial = main(num_samples=4, max_num_epochs=2, gpus_per_trial=0)  # 5min

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to /content/data/cifar-10-python.tar.gz


  0%|          | 0/170498071 [00:00<?, ?it/s]

Extracting /content/data/cifar-10-python.tar.gz to /content/data
Files already downloaded and verified


2022-08-30 07:26:41,795	INFO worker.py:1518 -- Started a local Ray instance.

from ray.air import session

def train(config):
    # ...
    session.report({"metric": metric}, checkpoint=checkpoint)

For more information please see https://docs.ray.io/en/master/ray-air/key-concepts.html#session

2022-08-30 07:26:43,639	INFO tensorboardx.py:170 -- pip install "ray[tune]" to see TensorBoard files.


== Status ==
Current time: 2022-08-30 07:26:44 (running for 00:00:00.45)
Memory usage on this node: 1.5/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 2.0/2 CPUs, 0/1 GPUs, 0.0/7.29 GiB heap, 0.0/3.65 GiB objects (0.0/1.0 accelerator_type:T4)
Result logdir: /root/ray_results/train_cifar_2022-08-30_07-26-43
Number of trials: 10/10 (9 PENDING, 1 RUNNING)
+-------------------------+----------+----------------+--------------+------+------+-------------+
| Trial name              | status   | loc            |   batch_size |   l1 |   l2 |          lr |
|-------------------------+----------+----------------+--------------+------+------+-------------|
| train_cifar_19036_00000 | RUNNING  | 172.28.0.2:349 |            2 |    8 |    8 | 0.000726443 |
| train_cifar_19036_00001 | PENDING  |                |           16 |    4 |   32 | 0.0100875   |
| train_cifar_19036_00002 | PENDING  |          

[2m[36m(func pid=349)[0m   cpuset_checked))


[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
Bracket: Iter 8.000: -1.3435589964509942 | Iter 4.000: -1.348611863257829 | Iter 2.000: -1.505167769608798 | Iter 1.000: -1.7919447158694268
Resources requested: 2.0/2 CPUs, 0/1 GPUs, 0.0/7.29 GiB heap, 0.0/3.65 GiB objects (0.0/1.0 accelerator_type:T4)
Result logdir: /root/ray_results/train_cifar_2022-08-30_07-26-43
Number of trials: 10/10 (4 PENDING, 1 RUNNING, 5 TERMINATED)
+-------------------------+------------+----------------+--------------+------+------+-------------+---------+------------+----------------------+
| Trial name              | status     | loc            |   batch_size |   l1 |   l2 |          lr |    loss |   accuracy |   training_iteration |
|-------------------------+------------+----------------+--------------+------+------+-------------+---------+------------+----------------------|
| train_cifar_19036_00005 | RUNNING    | 172.28.0.2:349 |            2 |   32 |  128 | 0.000721057 | 1.242   |     0.5687 |      

2022-08-30 08:05:56,167	INFO tune.py:759 -- Total run time: 2352.63 seconds (2352.39 seconds for the tuning loop).


Result for train_cifar_19036_00009:
  accuracy: 0.5136
  date: 2022-08-30_08-05-56
  done: true
  experiment_id: 9a21177c0a3d4c429360b4ce74de8887
  hostname: 0a3dd5b68053
  iterations_since_restore: 4
  loss: 1.3448131079673766
  node_ip: 172.28.0.2
  pid: 349
  should_checkpoint: true
  time_since_restore: 121.16454768180847
  time_this_iter_s: 30.85304570198059
  time_total_s: 121.16454768180847
  timestamp: 1661846756
  timesteps_since_restore: 0
  training_iteration: 4
  trial_id: '19036_00009'
  warmup_time: 0.003358125686645508
  
== Status ==
Current time: 2022-08-30 08:05:56 (running for 00:39:12.40)
Memory usage on this node: 1.6/12.7 GiB
Using AsyncHyperBand: num_stopped=10
Bracket: Iter 8.000: -1.2759399535892066 | Iter 4.000: -1.3278944023549557 | Iter 2.000: -1.4347771763324737 | Iter 1.000: -1.6243453384011983
Resources requested: 0/2 CPUs, 0/1 GPUs, 0.0/7.29 GiB heap, 0.0/3.65 GiB objects (0.0/1.0 accelerator_type:T4)
Result logdir: /root/ray_results/train_cifar_2022-08-

In [12]:
'''
- 대부분의 시도는 리소스 낭비를 피하기 위해 조기에 종료된다
- Best performing trail의 valid/test 성능이 표시되고 저장된다

+-------------------------+------------+----------------+--------------+------+------+-------------+---------+------------+------+
| ... | status     | loc            |   batch_size |   l1 |   l2 |          lr |    loss |   accuracy |   training_iteration |
|-------------------------+------------+----------------+--------------+------+------+-------------+---------+------------+------|
| ... | TERMINATED | 172.28.0.2:349 |            2 |    8 |    8 | 0.000726443 | 1.31513 |     0.5279 |                   10 |
| ... | TERMINATED | 172.28.0.2:349 |           16 |    4 |   32 | 0.0100875   | 1.63021 |     0.3875 |                    1 |
| ... | TERMINATED | 172.28.0.2:349 |           16 |    8 |    4 | 0.0248801   | 1.98025 |     0.2294 |                    1 |
| ... | TERMINATED | 172.28.0.2:349 |            4 |   16 |   16 | 0.00740085  | 1.95368 |     0.2558 |                    1 |
| ... | TERMINATED | 172.28.0.2:349 |           16 |   64 |   16 | 0.000126019 | 2.30178 |     0.1043 |                    1 |
| ... | TERMINATED | 172.28.0.2:349 |            2 |   32 |  128 | 0.000721057 | 1.21369 |     0.5835 |                   10 |
| ... | TERMINATED | 172.28.0.2:349 |            8 |   32 |   16 | 0.00202316  | 1.15951 |     0.6044 |                   10 |
| ... | TERMINATED | 172.28.0.2:349 |            4 |    8 |   64 | 0.0010649   | 1.32789 |     0.5276 |                    4 |
| ... | TERMINATED | 172.28.0.2:349 |           16 |    4 |   32 | 0.0115535   | 1.77092 |     0.3247 |                    1 |
| ... | TERMINATED | 172.28.0.2:349 |            8 |    8 |   32 | 0.00349549  | 1.34481 |     0.5136 |                    4 |
+-------------------------+------------+----------------+--------------+------+------+-------------+---------+------------+------+


Best trial config: {'l1': 32, 'l2': 16, 'lr': 0.002023158258409915, 'batch_size': 8}
Best trial final validation loss: 1.1595149888277054
Best trial final validation accuracy: 0.6044
Files already downloaded and verified
Files already downloaded and verified
Best trial test set accuracy: 0.6022
best model path: /root/ray_results/train_cifar_2022-08-30_07-26-43/train_cifar_19036_00006_6_batch_size=8,l1=32,l2=16,lr=0.0020_2022-08-30_07-55-25/checkpoint_000009
'''

"\n- 대부분의 시도는 리소스 낭비를 피하기 위해 조기에 종료된다\n- Best performing trail의 valid/test 성능이 표시되고 저장된다\n\n+-------------------------+------------+----------------+--------------+------+------+-------------+---------+------------+------+\n| ... | status     | loc            |   batch_size |   l1 |   l2 |          lr |    loss |   accuracy |   training_iteration |\n|-------------------------+------------+----------------+--------------+------+------+-------------+---------+------------+------|\n| ... | TERMINATED | 172.28.0.2:349 |            2 |    8 |    8 | 0.000726443 | 1.31513 |     0.5279 |                   10 |\n| ... | TERMINATED | 172.28.0.2:349 |           16 |    4 |   32 | 0.0100875   | 1.63021 |     0.3875 |                    1 |\n| ... | TERMINATED | 172.28.0.2:349 |           16 |    8 |    4 | 0.0248801   | 1.98025 |     0.2294 |                    1 |\n| ... | TERMINATED | 172.28.0.2:349 |            4 |   16 |   16 | 0.00740085  | 1.95368 |     0.2558 |                    1 |\n| ..