# Ray hyperparameter optimization Notebook

In [30]:
# Config: sbatch options handed to scripts/submit_ray_cluster.sh when the
# Ray cluster job is submitted. Adjacent literals are concatenated, so every
# piece except the last carries its own trailing space.
sbatch_cmd_options = (
    "--time=00:30:00 "
    "-q debug "
    "-A dasrepo "
    "--image=nersc/pytorch:ngc-22.05-v1 "
    "--nodes=2"
)

# Alternative without a container image:
# sbatch_cmd_options = (
#     "--time=00:30:00 "
#     "-q debug "
#     "-A dasrepo "
#     "--nodes=2"
# )

In [31]:
# Run this cell to set up the Ray cluster: the helper script is sourced with the
# sbatch options from the config cell (IPython substitutes the Python variable
# `sbatch_cmd_options` into the shell command).
!source scripts/submit_ray_cluster.sh "$sbatch_cmd_options"

<> Submiting Ray cluster job
Submitted batch job 66230704


In [48]:
# Build the path of the file under $SCRATCH that holds the Ray head-node address.
import os

scratch_dir = os.getenv('SCRATCH')
ray_scratch_file = os.path.join(scratch_dir, 'ray_cluster', 'head_node_address')
print(ray_scratch_file)

/global/cscratch1/sd/asnaylor/ray_cluster/head_node_address


In [49]:
!ls /global/cscratch1/sd/asnaylor/ray_cluster
# !ls /pscratch/sd/a/asnaylor/ray_cluster/

head_node_address


In [50]:
# Print the contents of the head-node address file
# (IPython substitutes the Python variable `ray_scratch_file` into the command).
!cat $ray_scratch_file

10.128.4.83


In [6]:
# !ssh 10.128.4.224 echo 'hello'

In [67]:
# Show the Slurm job table to check the state of the submitted cluster job.
!sqs

JOBID            ST USER      NAME          NODES TIME_LIMIT       TIME  SUBMIT_TIME          QOS             START_TIME           FEATURES       NODELIST(REASON
66230704         CG asnaylor  sbatch_submi  2          30:00       4:09  2023-01-24T14:04:46  debug_hsw       2023-01-24T14:04:47  haswell        nid0[1098-1099]


In [66]:
# WARNING: `-u $USER` cancels every Slurm job owned by the current user,
# not only the Ray cluster job submitted from this notebook.
!scancel -u $USER

In [51]:
# Print the Slurm log of the Ray cluster job.
# NOTE: the job ID in the file name is hard-coded; change it to the ID reported
# when the job was submitted.
# !cat slurm-*.out
!cat slurm-66230704.out
# !ls *.out

[slurm] - IP Head: nid01098:6379
[slurm] - Starting ray HEAD
2023-01-24 14:04:59,416	INFO usage_lib.py:452 -- Usage stats collection is disabled.
2023-01-24 14:04:59,416	INFO scripts.py:719 -- [37mLocal node IP[39m: [1mnid01098[22m
2023-01-24 14:05:07,718	SUCC scripts.py:756 -- [32m--------------------[39m
2023-01-24 14:05:07,718	SUCC scripts.py:757 -- [32mRay runtime started.[39m
2023-01-24 14:05:07,718	SUCC scripts.py:758 -- [32m--------------------[39m
2023-01-24 14:05:07,719	INFO scripts.py:760 -- [36mNext steps[39m
2023-01-24 14:05:07,719	INFO scripts.py:761 -- To connect to this Ray runtime from another node, run
2023-01-24 14:05:07,719	INFO scripts.py:764 -- [1m  ray start --address='nid01098:6379'[22m
2023-01-24 14:05:07,719	INFO scripts.py:780 -- Alternatively, use the following Python code:
2023-01-24 14:05:07,719	INFO scripts.py:782 -- [35mimport[39m[26m ray
2023-01-24 14:05:07,719	INFO scripts.py:786 -- ray[35m.[39m[26minit(address[35m=[39m[26m[33m'a

-----

In [52]:
import ray
import math

def convert_size(size_bytes):
    """Format a byte count as a human-readable string using 1024-based units.

    Args:
        size_bytes: Non-negative number of bytes (int or float).

    Returns:
        "0B" for zero, otherwise "<value> <unit>" with the value rounded to
        two decimals, e.g. "1.5 KB". Values below one byte stay in "B", and
        values beyond the largest known unit are expressed in "YB".

    Raises:
        ValueError: If ``size_bytes`` is negative.
    """
    if size_bytes == 0:
        return "0B"
    if size_bytes < 0:
        raise ValueError("size_bytes must be non-negative, got %r" % (size_bytes,))
    size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
    value = float(size_bytes)
    i = 0
    # Repeated division by 1024 is exact in binary floating point, which avoids
    # the rounding pitfalls of math.log near unit boundaries. Stopping at the
    # last unit keeps the index in range for very large inputs, and never going
    # below index 0 keeps sub-byte values from wrapping around to "YB".
    while value >= 1024 and i < len(size_name) - 1:
        value /= 1024
        i += 1
    return "%s %s" % (round(value, 2), size_name[i])


# Read the head-node address from the scratch file and build the Ray address
# (ray:// scheme on port 10001) that is passed to ray.init.
with open(ray_scratch_file) as f:
    # strip() rather than strip('\n'): stray spaces or '\r' must not end up in the URL.
    head_node_ip = f.read().strip()

if not head_node_ip:
    # Fail here with a clear message instead of producing 'ray://:10001'.
    raise RuntimeError(
        "Head node address file is empty: {}".format(ray_scratch_file)
    )

RAY_NODE_ADDRESS = 'ray://{}:10001'.format(head_node_ip)

print(RAY_NODE_ADDRESS)

ray://10.128.4.83:10001


In [53]:
# Connect to the cluster at RAY_NODE_ADDRESS. The call is the cell's last
# expression, so its return value is displayed as the cell output.
ray.init(address=RAY_NODE_ADDRESS)
# Earlier variants, kept for reference:
# ray.init("ray://10.128.4.224:10001")
# ray.init(RAY_NODE_ADDRESS, dashboard_host="0.0.0.0")
# ray.init(RAY_NODE_ADDRESS, webui_host="0.0.0.0")

0,1
Python version:,3.8.13
Ray version:,2.0.0
Dashboard:,http://127.0.0.1:8265


In [54]:
# Fetch and print the node list reported by Ray (one dict per node, including
# its address, liveness flag and resources).
nodes = ray.nodes()
print(nodes)

[{'NodeID': '2044111d92ccc49d8b064f7f3ed2f7a7c9b080786e25a31f3dac2884', 'Alive': True, 'NodeManagerAddress': 'nid01098', 'NodeManagerHostname': 'nid01098', 'NodeManagerPort': 46509, 'ObjectManagerPort': 38797, 'ObjectStoreSocketName': '/tmp/ray/session_2023-01-24_14-04-59_419096_63838/sockets/plasma_store', 'RayletSocketName': '/tmp/ray/session_2023-01-24_14-04-59_419096_63838/sockets/raylet', 'MetricsExportPort': 59127, 'NodeName': 'nid01098', 'alive': True, 'Resources': {'CPU': 64.0, 'node:nid01098': 1.0, 'memory': 82824243815.0, 'object_store_memory': 39781818777.0}}, {'NodeID': 'a4fa0e1f495ceffe4b01f803f8897b09611dc565840ba756c0b0e1c5', 'Alive': True, 'NodeManagerAddress': '10.128.4.84', 'NodeManagerHostname': 'nid01099', 'NodeManagerPort': 33983, 'ObjectManagerPort': 36107, 'ObjectStoreSocketName': '/tmp/ray/session_2023-01-24_14-04-59_419096_63838/sockets/plasma_store', 'RayletSocketName': '/tmp/ray/session_2023-01-24_14-04-59_419096_63838/sockets/raylet', 'MetricsExportPort': 50

In [55]:
# Summarise the cluster-wide totals: CPU count and memory in human-readable form.
node_resources = ray.cluster_resources()
total_cpus = node_resources['CPU']
total_memory = convert_size(node_resources['memory'])
print("total of {} CPUs and {} RAM".format(total_cpus, total_memory))


total of 128.0 CPUs and 163.41 GB RAM


In [23]:
# Print the name of the host on which this shell command (and the notebook kernel) runs.
! hostname

cori13


In [60]:
# Workload size -- adjust these to match the scale of your cluster.
NUM_SAMPLING_TASKS = 128             # number of remote sampling tasks to launch
NUM_SAMPLES_PER_TASK = 10 ** 7       # random points drawn by each task
TOTAL_NUM_SAMPLES = NUM_SAMPLES_PER_TASK * NUM_SAMPLING_TASKS  # points over all tasks

In [61]:
import math
import random
from typing import List, Optional, Tuple

def sampling_task(num_samples: int, task_id: int, verbose=True) -> int:
    """Count how many of ``num_samples`` random points land inside the unit circle.

    Each point is drawn uniformly from the square [-1, 1] x [-1, 1] and counts
    as inside when its distance from the origin is at most 1.

    Args:
        num_samples: Number of random points to draw.
        task_id: Label used only in the printed progress message.
        verbose: When true, print the task id together with the count.

    Returns:
        The number of sampled points that fell inside the circle.
    """
    # Call arguments are evaluated left to right, so x is drawn before y for
    # every point, exactly as in a two-step draw.
    hits = sum(
        1
        for _ in range(num_samples)
        if math.hypot(random.uniform(-1, 1), random.uniform(-1, 1)) <= 1
    )
    if verbose:
        print(f"Task id: {task_id} | Samples in the circle: {hits}")
    return hits

@ray.remote
def sample_task_distribute(sample_size, i) -> object:
    """Ray remote wrapper around ``sampling_task``.

    Args:
        sample_size: Number of random points the task should draw.
        i: Task id forwarded to ``sampling_task`` for its progress message.

    Returns:
        Whatever ``sampling_task`` returns for these arguments.
    """
    inside_count = sampling_task(sample_size, i)
    return inside_count

def run_disributed(sample_size) -> List[int]:
    """Run ``NUM_SAMPLING_TASKS`` remote sampling tasks and collect their results.

    Args:
        sample_size: Number of random points each task should draw.

    Returns:
        The per-task results, in task order.
    """
    # Each .remote() call returns immediately with a future ObjectRef.
    pending_refs = []
    for task_index in range(NUM_SAMPLING_TASKS):
        # Task ids are 1-based.
        pending_refs.append(sample_task_distribute.remote(sample_size, task_index + 1))
    # ray.get blocks until every ObjectRef has been resolved.
    return ray.get(pending_refs)

def calculate_pi(results: List[int], total_samples: Optional[int] = None) -> float:
    """Estimate pi from per-task counts of points that fell inside the circle.

    Args:
        results: Number of in-circle points reported by each sampling task.
        total_samples: Total number of points drawn across all tasks. Defaults
            to the module-level ``TOTAL_NUM_SAMPLES``; pass it explicitly when
            the tasks were run with a different sample size or task count.

    Returns:
        ``4 * (points inside the circle) / (points drawn)``.

    Raises:
        ZeroDivisionError: If the total number of samples is zero.
    """
    if total_samples is None:
        # Looked up at call time so that re-running the config cell takes effect.
        total_samples = TOTAL_NUM_SAMPLES
    total_num_inside = sum(results)
    return (total_num_inside * 4) / total_samples

In [62]:
%%time
# Launch the sampling tasks, wait for all of them, then turn the per-task counts
# into the estimate. The %%time cell magic must stay on the first line of the cell.
results = run_disributed(NUM_SAMPLES_PER_TASK)
pi = calculate_pi(results)

[2m[36m(sample_task_distribute pid=22546, ip=10.128.4.84)[0m Task id: 39 | Samples in the circle: 7853324
[2m[36m(sample_task_distribute pid=22593, ip=10.128.4.84)[0m Task id: 56 | Samples in the circle: 7854255
[2m[36m(sample_task_distribute pid=22583, ip=10.128.4.84)[0m Task id: 66 | Samples in the circle: 7852943
[2m[36m(sample_task_distribute pid=23136, ip=10.128.4.84)[0m Task id: 48 | Samples in the circle: 7855919
[2m[36m(sample_task_distribute pid=22575, ip=10.128.4.84)[0m Task id: 72 | Samples in the circle: 7853361
[2m[36m(sample_task_distribute pid=22572, ip=10.128.4.84)[0m Task id: 64 | Samples in the circle: 7853546
[2m[36m(sample_task_distribute pid=22542, ip=10.128.4.84)[0m Task id: 40 | Samples in the circle: 7853527
[2m[36m(sample_task_distribute pid=22568, ip=10.128.4.84)[0m Task id: 53 | Samples in the circle: 7853396
[2m[36m(sample_task_distribute pid=64254)[0m Task id: 13 | Samples in the circle: 7853629
[2m[36m(sample_task_distribute pi

In [64]:
# NOTE(review): the format spec `5f` sets a minimum width of 5 and keeps the
# default six decimal places; use `.5f` if five decimals were intended.
print(f"Estimated value of π is: {pi:5f}")

Estimated value of π is: 3.141518


In [65]:
# Disconnect this notebook from the Ray cluster.
# NOTE(review): presumably this does not end the Slurm job itself — confirm, and
# cancel the job separately if it should not keep running.
ray.shutdown()