# Ray hyperparameter optimization Notebook

----

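**TL;DR**: Launch a Ray cluster through Slurm, connect to it from this notebook, and run distributed work on it (Ray Tune + Ray Train).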


### Step 1: Start the Ray cluster in the background

In [1]:
# Sbatch configuration: individual options are joined with single spaces into one string
sbatch_cmd_options = " ".join(
    [
        "--time=00:30:00",  # wall-clock limit
        "-q debug",  # QOS
        "-A dasrepo",  # account
        "--image=nersc/pytorch:ngc-22.09-v0",  # container image
        "--nodes=2",  # node count
    ]
)

In [2]:
# Bash script to execute sbatch.
# IPython substitutes the Python variable `sbatch_cmd_options` (defined in the previous cell)
# for "$sbatch_cmd_options" before handing the line to the shell.
!source scripts/submit_ray_cluster.sh "$sbatch_cmd_options"

Submitted batch job 4973461


In [1]:
# Check the job is running: the submitted JOBID should be listed with state (ST column) "R"
!sqs

JOBID            ST USER      NAME          NODES TIME_LIMIT       TIME  SUBMIT_TIME          QOS             START_TIME           FEATURES       NODELIST(REASON
4973461          R  asnaylor  sbatch_submi  2          30:00      10:25  2023-01-24T20:58:05  gpu_debug       2023-01-24T20:59:01  gpu&a100&hbm40 nid[002465,0024


In [2]:
# Check job log; the glob prints every slurm-*.out file found in the working directory
!cat slurm-*.out

[slurm] - IP Head: nid002465:6379 | 128.55.64.85
[slurm] - Starting ray HEAD
2023-01-25 04:59:20,982	INFO usage_lib.py:452 -- Usage stats collection is disabled.
2023-01-25 04:59:20,982	INFO scripts.py:719 -- Local node IP: nid002465
2023-01-25 04:59:23,145	SUCC scripts.py:756 -- --------------------
2023-01-25 04:59:23,145	SUCC scripts.py:757 -- Ray runtime started.
2023-01-25 04:59:23,145	SUCC scripts.py:758 -- --------------------
2023-01-25 04:59:23,145	INFO scripts.py:760 -- Next steps
2023-01-25 04:59:23,145	INFO scripts.py:761 -- To connect to this Ray runtime from another node, run
2023-01-25 04:59:23,145	INFO scripts.py:764 --   ray start --address='nid002465:6379'
2023-01-25 04:59:23,145	INFO scripts.py:780 -- Alternatively, use the following Python code:
2023-01-25 04:59:23,145	INFO scripts.py:782 -- import ray
2023-01-25 04:59:23,145	INFO scripts.py:786 -- ray.init(address='auto', _node_ip_address='nid002465')
2023-01-25 04:59:23,145	INFO scripts.py:798 -- To connect to thi

### Step 2: Connect to the Ray cluster

In [3]:
import ray

from utility import get_ray_cluster_address, cluster_summary

# Helper imported from the local `utility` module; presumably returns the address of the
# Ray head node started by the batch job — TODO confirm against utility.py
cluster_address = get_ray_cluster_address()

In [4]:
# Connect this kernel to the cluster at `cluster_address` rather than starting a local Ray instance
ray.init(cluster_address)

0,1
Python version:,3.8.13
Ray version:,2.0.0


In [5]:
# Helper from the local `utility` module; prints the node, CPU, GPU and RAM totals shown below
cluster_summary()

Cluster Summary
---------------
Nodes: 2
CPU:   256
GPU:   8
RAM:   313.51 GB


### Step 3: Set up the PyTorch model

### Step 4: Train Model (Ray Train)

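### Step 5: Tune Model (Ray Tune)

> **Note:** the cells below do not run a Ray Tune search yet; they run a stand-in workload — a Monte Carlo estimate of π distributed across the cluster with Ray remote tasks.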

In [6]:
# Workload size — change this to match your cluster scale.
NUM_SAMPLING_TASKS = 256  # how many Ray tasks are launched
NUM_SAMPLES_PER_TASK = 10 ** 7  # random points drawn by each task
TOTAL_NUM_SAMPLES = NUM_SAMPLES_PER_TASK * NUM_SAMPLING_TASKS  # points drawn overall

In [7]:
import random
import math
from typing import Tuple, List

def sampling_task(num_samples: int, task_id: int, verbose=True) -> int:
    """Count random points of the square [-1, 1] x [-1, 1] that land in the unit circle.

    Args:
        num_samples: Number of random points to draw.
        task_id: Label used only in the progress message.
        verbose: When true, print the per-task count before returning.

    Returns:
        How many of the drawn points lie inside or on the unit circle.
    """
    hits = sum(
        1
        for _ in range(num_samples)
        # x is drawn before y; a point counts when its distance from the origin is at most 1
        if math.hypot(random.uniform(-1, 1), random.uniform(-1, 1)) <= 1
    )
    if verbose:
        print(f"Task id: {task_id} | Samples in the circle: {hits}")
    return hits

@ray.remote
def sample_task_distribute(sample_size, i) -> object:
    """Ray remote task: run ``sampling_task`` with ``sample_size`` samples as task ``i``.

    Returns whatever ``sampling_task`` returns (its in-circle count).
    """
    inside_count = sampling_task(sample_size, i)
    return inside_count

def run_disributed(sample_size) -> List[int]:
    """Launch ``NUM_SAMPLING_TASKS`` remote sampling tasks and collect their results.

    Args:
        sample_size: Number of samples passed to each task.

    Returns:
        The per-task results fetched with ``ray.get``, one entry per launched task.
    """
    # Each .remote() call returns immediately with a future (ObjectRef); task ids start at 1.
    pending = []
    for task_index in range(NUM_SAMPLING_TASKS):
        pending.append(sample_task_distribute.remote(sample_size, task_index + 1))
    # ray.get blocks until every ObjectRef is resolved and its value is materialized.
    return ray.get(pending)

def calculate_pi(results: List[int], total_num_samples=None) -> float:
    """Estimate π from per-task counts of points that fell inside the unit circle.

    The fraction inside/total approximates (circle area) / (square area) = π / 4,
    so the estimate is 4 * inside / total.

    Args:
        results: In-circle counts, one per sampling task.
        total_num_samples: Total number of points drawn across all tasks. When
            omitted, the module-level ``TOTAL_NUM_SAMPLES`` is used, so existing
            calls behave as before. Pass it explicitly when the tasks were run
            with a sample size other than ``NUM_SAMPLES_PER_TASK``; otherwise
            the denominator would not match the counts.

    Returns:
        The Monte Carlo estimate of π.
    """
    if total_num_samples is None:
        # Backward-compatible default: the notebook-wide workload size.
        total_num_samples = TOTAL_NUM_SAMPLES
    total_num_inside = sum(results)
    return (total_num_inside * 4) / total_num_samples

In [8]:
%%time
# Run every sampling task on the cluster, then combine the per-task counts into the π estimate.
# The %%time cell magic must stay on the first line of the cell.
results = run_disributed(NUM_SAMPLES_PER_TASK)
pi = calculate_pi(results)

[2m[36m(sample_task_distribute pid=19487)[0m Task id: 1 | Samples in the circle: 7854304
[2m[36m(sample_task_distribute pid=45563)[0m Task id: 4 | Samples in the circle: 7853305
[2m[36m(sample_task_distribute pid=45574)[0m Task id: 3 | Samples in the circle: 7856010
[2m[36m(sample_task_distribute pid=19486)[0m Task id: 2 | Samples in the circle: 7853484
[2m[36m(sample_task_distribute pid=45562)[0m Task id: 7 | Samples in the circle: 7855152
[2m[36m(sample_task_distribute pid=45598)[0m Task id: 12 | Samples in the circle: 7854451
[2m[36m(sample_task_distribute pid=45600)[0m Task id: 8 | Samples in the circle: 7853267
[2m[36m(sample_task_distribute pid=45567)[0m Task id: 5 | Samples in the circle: 7854218
[2m[36m(sample_task_distribute pid=45571)[0m Task id: 6 | Samples in the circle: 7854859
[2m[36m(sample_task_distribute pid=45596)[0m Task id: 9 | Samples in the circle: 7854571
[2m[36m(sample_task_distribute pid=45599)[0m Task id: 13 | Samples in the ci

In [9]:
# NOTE: the format spec `5f` sets a minimum field width of 5, not a precision; the default
# precision of six decimal places applies (hence 3.141604 in the saved output).
print(f"Estimated value of π is: {pi:5f}")

Estimated value of π is: 3.141604


#### Close the Ray connection and end the job

In [10]:
# Disconnect this kernel from the Ray cluster
ray.shutdown()

In [11]:
# WARNING: `-u $USER` cancels every Slurm job owned by this user, not only the Ray cluster job
# submitted in Step 1; to end just that job, pass its job ID to scancel instead.
!scancel -u $USER