# Ray hyperparameter optimization Notebook

In [1]:
# Config: command-line options handed to scripts/submit_ray_cluster.sh
sbatch_cmd_options = " ".join([
    "--time=00:30:00",
    "-q debug",
    "-A dasrepo",
    "--image=nersc/pytorch:ngc-22.09-v0",
    "--nodes=2",
])

# Alternative without a container image:
# sbatch_cmd_options = " ".join([
#     "--time=00:30:00",
#     "-q debug",
#     "-A dasrepo",
#     "--nodes=2",
# ])

In [2]:
# Run this cell to set up the Ray cluster: it sources the submit script and
# passes the options string from the config cell as one quoted argument.
!source scripts/submit_ray_cluster.sh "$sbatch_cmd_options"

<> Submiting Ray cluster job
Submitted batch job 66232268


In [3]:
# Build the path of the file from which the Ray head node address is read later
import os

scratch_dir = os.getenv('SCRATCH')
ray_scratch_file = os.path.join(scratch_dir, 'ray_cluster', 'head_node_address')

print(ray_scratch_file)

/global/cscratch1/sd/asnaylor/ray_cluster/head_node_address


In [5]:
# List the Ray cluster directory. $SCRATCH is the same environment variable
# that ray_scratch_file is built from, so no per-user or per-system absolute
# path has to be hard-coded here.
!ls $SCRATCH/ray_cluster

head_node_address


In [6]:
# Print the contents of the head node address file
!cat $ray_scratch_file

10.128.5.162


In [7]:
# !ssh 10.128.4.224 echo 'hello'

In [9]:
# List Slurm jobs to check that the submitted cluster job is running
!sqs

JOBID            ST USER      NAME          NODES TIME_LIMIT       TIME  SUBMIT_TIME          QOS             START_TIME           FEATURES       NODELIST(REASON
66232268         R  asnaylor  sbatch_submi  2          30:00       0:21  2023-01-24T15:32:04  debug_hsw       2023-01-24T15:32:05  haswell        nid0[1431-1432]


In [28]:
# WARNING: run this only once you are finished with the cluster. `-u $USER`
# selects jobs by owner, so it is not limited to the Ray cluster job.
# NOTE(review): the execution count (In [28]) shows this cell was run last,
# after ray.shutdown(); at this position a top-to-bottom "Run All" would
# cancel the freshly submitted job before it is used.
!scancel -u $USER

In [13]:
# Show the Slurm log of the cluster job. The job id is hard-coded from the
# "Submitted batch job" output and must be updated for each new submission.
# !cat slurm-*.out
!cat slurm-66232268.out
# !ls *.out

[slurm] - IP Head: nid01431:6379 | 10.128.5.162
[slurm] - Starting ray HEAD
2023-01-24 23:32:17,431	INFO usage_lib.py:452 -- Usage stats collection is disabled.
2023-01-24 23:32:17,431	INFO scripts.py:719 -- Local node IP: nid01431
2023-01-24 23:32:19,654	SUCC scripts.py:756 -- --------------------
2023-01-24 23:32:19,654	SUCC scripts.py:757 -- Ray runtime started.
2023-01-24 23:32:19,654	SUCC scripts.py:758 -- --------------------
2023-01-24 23:32:19,654	INFO scripts.py:760 -- Next steps
2023-01-24 23:32:19,654	INFO scripts.py:761 -- To connect to this Ray runtime from another node, run
2023-01-24 23:32:19,654	INFO scripts.py:764 --   ray start --address='nid01431:6379'
2023-01-24 23:32:19,654	INFO scripts.py:780 -- Alternatively, use the following Python code:
2023-01-24 23:32:19,654	INFO scripts.py:782 -- import ray
2023-01-24 23:32:19,654	INFO scripts.py:786 -- ray.init(address='auto', _node_ip_address='nid01431')
2023-01-24 23:32:19,654	INFO scripts.py:798 -- To connect to this Ra

-----

In [14]:
import ray
import math

def convert_size(size_bytes):
    """Format a byte count as a human-readable string using 1024-based units.

    Args:
        size_bytes: Non-negative number of bytes (int or float).

    Returns:
        ``"0B"`` for zero; otherwise ``"<value> <unit>"`` where the value is
        rounded to two decimals and the unit is one of B, KB, ..., YB.
        Values below one byte are reported in B, and values beyond the
        largest unit are reported in YB.

    Raises:
        ValueError: If ``size_bytes`` is negative.
    """
    if size_bytes == 0:
        return "0B"
    if size_bytes < 0:
        raise ValueError("size_bytes must be non-negative, got %r" % (size_bytes,))
    size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
    # Pick the unit by exact comparison against integer powers of 1024.
    # floor(log(x, 1024)) can be off by one through float rounding, goes
    # negative for 0 < x < 1, and runs past the end of size_name for huge x.
    i = 0
    while i < len(size_name) - 1 and size_bytes >= 1024 ** (i + 1):
        i += 1
    p = math.pow(1024, i)
    s = round(size_bytes / p, 2)
    return "%s %s" % (s, size_name[i])


# Read the head node address from the file and build the ray:// URL from it
with open(ray_scratch_file) as f:
    # strip() rather than strip('\n') so stray spaces or a '\r\n' line ending
    # cannot end up inside the URL
    head_node_ip = f.read().strip()

# Fail here with a clear message instead of building a URL with no host
if not head_node_ip:
    raise RuntimeError("No head node address found in {}".format(ray_scratch_file))

RAY_NODE_ADDRESS = 'ray://{}:10001'.format(head_node_ip)

print(RAY_NODE_ADDRESS)

ray://10.128.5.162:10001


In [15]:
# Connect this kernel to the running cluster using the URL built from the
# head node address file. Earlier variants, kept for reference:
# ray.init("ray://10.128.4.224:10001")
# ray.init(RAY_NODE_ADDRESS, dashboard_host="0.0.0.0")
# ray.init(RAY_NODE_ADDRESS, webui_host="0.0.0.0")
ray.init(address=RAY_NODE_ADDRESS)

0,1
Python version:,3.8.13
Ray version:,2.0.0


In [16]:
# Fetch the cluster's node list and print it unformatted
nodes = ray.nodes()
print(nodes)

[{'NodeID': 'f6f33592b8b894fa4792f945f64c63a7d5e059250a4a37459eefc063', 'Alive': True, 'NodeManagerAddress': 'nid01431', 'NodeManagerHostname': 'nid01431', 'NodeManagerPort': 38983, 'ObjectManagerPort': 40811, 'ObjectStoreSocketName': '/tmp/ray/session_2023-01-24_23-32-17_483326_23385/sockets/plasma_store', 'RayletSocketName': '/tmp/ray/session_2023-01-24_23-32-17_483326_23385/sockets/raylet', 'MetricsExportPort': 44037, 'NodeName': 'nid01431', 'alive': True, 'Resources': {'node:nid01431': 1.0, 'memory': 81094708634.0, 'CPU': 64.0, 'object_store_memory': 39040589414.0}}, {'NodeID': 'efa8d87b519450ccfa35bb9eec1f37db64ec2b1105315272d44eaa8e', 'Alive': True, 'NodeManagerAddress': '10.128.5.163', 'NodeManagerHostname': 'nid01432', 'NodeManagerPort': 37477, 'ObjectManagerPort': 40317, 'ObjectStoreSocketName': '/tmp/ray/session_2023-01-24_23-32-17_483326_23385/sockets/plasma_store', 'RayletSocketName': '/tmp/ray/session_2023-01-24_23-32-17_483326_23385/sockets/raylet', 'MetricsExportPort': 4

In [17]:
# Summarise the cluster-wide CPU count and memory (memory shown human-readable)
node_resources = ray.cluster_resources()
cpu_total = node_resources['CPU']
ram_total = convert_size(node_resources['memory'])
print("total of {} CPUs and {} RAM".format(cpu_total, ram_total))


total of 128.0 CPUs and 160.29 GB RAM


In [18]:
# Print the name of the host this notebook's shell commands run on
! hostname

cori19


In [23]:
# Change these to match your cluster scale.
# One sampling task per CPU reported in node_resources.
NUM_SAMPLING_TASKS = int(node_resources['CPU'])
# Number of random points drawn by each task.
NUM_SAMPLES_PER_TASK = 10_000_000
# Total number of points drawn across all tasks; used as the denominator in calculate_pi.
TOTAL_NUM_SAMPLES = NUM_SAMPLING_TASKS * NUM_SAMPLES_PER_TASK

In [24]:
import random
import math
from typing import Tuple, List

def sampling_task(num_samples: int, task_id: int, verbose=True) -> int:
    """Count random points of the square [-1, 1] x [-1, 1] that land in the unit circle.

    Args:
        num_samples: How many (x, y) points to draw.
        task_id: Identifier used only in the progress message.
        verbose: When true, print the task id and the resulting count.

    Returns:
        The number of drawn points whose distance from the origin is <= 1.
    """
    hits = sum(
        1
        for _ in range(num_samples)
        # x is drawn first, then y; a point counts when it lies inside the circle
        if math.hypot(random.uniform(-1, 1), random.uniform(-1, 1)) <= 1
    )
    if verbose:
        print(f"Task id: {task_id} | Samples in the circle: {hits}")
    return hits

@ray.remote
def sample_task_distribute(sample_size, i) -> object:
    """Ray remote wrapper around ``sampling_task``.

    ``sample_size`` is the number of points to draw and ``i`` is the task id;
    both are forwarded to ``sampling_task`` and its count is returned.
    """
    inside_count = sampling_task(num_samples=sample_size, task_id=i)
    return inside_count

def run_disributed(sample_size) -> List[int]:
    """Launch ``NUM_SAMPLING_TASKS`` remote sampling tasks and collect their counts.

    Args:
        sample_size: Number of points each task draws.

    Returns:
        The per-task results, in task-id order (ids start at 1).
    """
    # Each .remote() call returns immediately with a future ObjectRef
    pending = []
    for task_id in range(1, NUM_SAMPLING_TASKS + 1):
        pending.append(sample_task_distribute.remote(sample_size, task_id))
    # ray.get blocks until every ObjectRef has been resolved
    return ray.get(pending)

def calculate_pi(results: List[int]) -> float:
    """Estimate pi from the per-task in-circle counts.

    The counts are summed, multiplied by four and divided by the
    module-level ``TOTAL_NUM_SAMPLES``.
    """
    return 4 * sum(results) / TOTAL_NUM_SAMPLES

In [25]:
%%time
# Run the distributed sampling, then turn the per-task counts into the pi
# estimate; the %%time magic above times the whole cell.
results = run_disributed(NUM_SAMPLES_PER_TASK)
pi = calculate_pi(results)

[2m[36m(sample_task_distribute pid=23691)[0m Task id: 1 | Samples in the circle: 7854177
[2m[36m(sample_task_distribute pid=23690)[0m Task id: 2 | Samples in the circle: 7853680
[2m[36m(sample_task_distribute pid=23774)[0m Task id: 3 | Samples in the circle: 7854499
[2m[36m(sample_task_distribute pid=23772)[0m Task id: 4 | Samples in the circle: 7853547
[2m[36m(sample_task_distribute pid=23769)[0m Task id: 7 | Samples in the circle: 7851641
[2m[36m(sample_task_distribute pid=23775)[0m Task id: 6 | Samples in the circle: 7853648
[2m[36m(sample_task_distribute pid=23770)[0m Task id: 5 | Samples in the circle: 7854066
[2m[36m(sample_task_distribute pid=23771)[0m Task id: 10 | Samples in the circle: 7856117
[2m[36m(sample_task_distribute pid=23776)[0m Task id: 11 | Samples in the circle: 7852118
[2m[36m(sample_task_distribute pid=23768)[0m Task id: 9 | Samples in the circle: 7855405
[2m[36m(sample_task_distribute pid=23779)[0m Task id: 13 | Samples in the c

In [26]:
# NOTE(review): `5f` is a minimum field width of 5 with the default six
# decimals, not five decimals; use `.5f` if five decimal places were intended.
print(f"Estimated value of π is: {pi:5f}")

Estimated value of π is: 3.141644


In [27]:
# Disconnect this notebook from Ray.
# NOTE(review): presumably the Slurm job itself keeps running until it is
# cancelled or reaches its time limit — confirm.
ray.shutdown()

[2m[36m(sample_task_distribute pid=32729, ip=10.128.5.163)[0m Task id: 97 | Samples in the circle: 7852667
[2m[36m(sample_task_distribute pid=32768, ip=10.128.5.163)[0m Task id: 99 | Samples in the circle: 7855225
