# Ray hyperparameter optimization Notebook

In [73]:
import uuid
import os

# Folder under $SCRATCH that holds the Ray cluster address files.
# Fail early with a clear message: concatenating onto an unset SCRATCH would
# otherwise raise an opaque "NoneType + str" TypeError.
scratch_dir = os.getenv('SCRATCH')
if not scratch_dir:
    raise RuntimeError("SCRATCH environment variable is not set; "
                       "cannot locate the ray_cluster folder")
tmp_ray_folder = os.path.join(scratch_dir, 'ray_cluster')
os.makedirs(tmp_ray_folder, exist_ok=True)

# Unique file name for this session, passed to the cluster submit script below.
tmp_ray_address_file = os.path.join(tmp_ray_folder, str(uuid.uuid4()))

# Config: options forwarded to the batch submission script, one per entry so
# that individual settings are easy to edit.
sbatch_cmd_options = " ".join([
    "--time=00:30:00",
    "-q debug",
    "-A dasrepo",
    "--image=nersc/pytorch:ngc-22.05-v1",
    "--nodes=2",
])

In [74]:
# Run this cell to set up the Ray cluster: the submit script is sourced with the
# sbatch options string and the path of this session's address file as arguments.
!source scripts/submit_ray_cluster.sh "$sbatch_cmd_options" $tmp_ray_address_file

<> Submiting Ray cluster job
/global/u2/a/asnaylor/projects/ray_ml_jupyter/nersc_ray_notebook/scripts/sbatch_submit_script.sbatch --time=00:30:00 -q debug -A dasrepo --image=nersc/pytorch:ngc-22.05-v1 --nodes=1 -C haswell --cpus-per-task=64 --ntasks-per-node=1
Submitted batch job 66130677


In [88]:
!ls /global/cscratch1/sd/asnaylor/ray_cluster/

595ba054-ff3d-4924-b8d6-db70c4a9d25d  eeaf023b-ccec-44b2-bf06-4fb80c8cde9b
7e52ba4f-45fc-4fba-a5fc-7b5fb9347ab7  f4f1ea59-803d-446a-99ce-f0c988f6b402


In [79]:
# Print the contents of this session's Ray address file.
!cat $tmp_ray_address_file

nid00610:6379


In [85]:
# Show the job queue status (job id, state, node count, time limit, node list).
!sqs

JOBID            ST USER      NAME          NODES TIME_LIMIT       TIME  SUBMIT_TIME          QOS             START_TIME           FEATURES       NODELIST(REASON
66130677         R  asnaylor  sbatch_submi  1          30:00       4:46  2023-01-19T14:09:48  debug_hsw       2023-01-19T14:09:49  haswell        nid00610       


In [86]:
# WARNING: `-u $USER` cancels every Slurm job owned by the current user, not only
# the Ray cluster job submitted from this notebook.
!scancel -u $USER

In [87]:
# Print the Slurm log of the cluster job. The job id is hardcoded; replace it with
# the id reported by "Submitted batch job ..." in the submit cell's output.
!cat slurm-66130677.out

[slurm] - IP Head: nid00610:6379
Writting to file: /global/cscratch1/sd/asnaylor/ray_cluster/f4f1ea59-803d-446a-99ce-f0c988f6b402
[slurm] - Starting ray HEAD
srun: fatal: Can not execute ray
[slurm] - Starting 0 ray worker


-----

In [None]:
# Setup cell
import ray

In [None]:

import argparse
import ray
import time
import math
import os

## Variables
# Address of the Ray cluster to connect to; 'auto' is used when the
# RAY_NODE_ADDRESS environment variable is not set.
RAY_NODE_ADDRESS = os.environ.get('RAY_NODE_ADDRESS', 'auto')


## Function
def convert_size(size_bytes):
    """Format a byte count as a human-readable string using 1024-based units.

    Args:
        size_bytes: Non-negative number of bytes (int or float).

    Returns:
        ``"0B"`` for zero, otherwise ``"<value> <unit>"`` with the value rounded
        to two decimals (e.g. ``"1.5 KB"``). Values below one byte are reported
        in ``B``; values beyond the largest unit are reported in ``YB``.

    Raises:
        ValueError: If ``size_bytes`` is negative.
    """
    if size_bytes == 0:
        return "0B"
    if size_bytes < 0:
        raise ValueError("size_bytes must be non-negative, got %r" % (size_bytes,))
    size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
    # Repeated division by 1024 is exact in floating point, unlike
    # math.log(x, 1024), which can land just below an integer for exact powers.
    # Clamping the index keeps sub-byte and beyond-YB values inside the table.
    scaled = float(size_bytes)
    unit_index = 0
    while scaled >= 1024 and unit_index < len(size_name) - 1:
        scaled /= 1024
        unit_index += 1
    return "%s %s" % (round(scaled, 2), size_name[unit_index])


def main(argv=None):
    """Wait until the expected number of Ray workers have joined the cluster.

    Connects to the cluster at ``RAY_NODE_ADDRESS`` and polls ``ray.nodes()``
    until ``n_workers + 1`` nodes are alive (the workers plus one extra node,
    presumably the head), then prints the cluster's total CPUs and memory.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the default)
            lets argparse read ``sys.argv[1:]``, matching the previous behavior.

    Raises:
        TimeoutError: If the expected nodes are not all alive within
            ``--timeout`` minutes.
    """
    ## Check args
    parser = argparse.ArgumentParser()
    parser.add_argument("n_workers", type=int,
                        help="Number of ray workers")
    parser.add_argument("-t", "--timeout", type=float, default=5.0,
                        help="Timeout [default %(default)s min]")
    args = parser.parse_args(argv)

    def _alive_nodes():
        # ray.nodes() also lists nodes that have died; only live ones count.
        return [node for node in ray.nodes() if node.get("Alive", True)]

    ## Connect to ray
    print("<> Connecting to ray cluster at: {}".format(RAY_NODE_ADDRESS))
    ray.init(address=RAY_NODE_ADDRESS)

    ## Checking ray workers
    expected_nodes = args.n_workers + 1
    nodes = _alive_nodes()
    print("<> Found {}/{} ray workers".format(len(nodes)-1,args.n_workers))

    poll_interval = 30  # seconds between membership checks
    timeout_seconds = args.timeout * 60
    waited = 0
    while len(nodes) < expected_nodes:
        if waited >= timeout_seconds:
            raise TimeoutError("Couldn't find all of ray workers")
        print("...waiting to detect all workers")
        # Never sleep past the deadline, so short timeouts are honored.
        pause = min(poll_interval, timeout_seconds - waited)
        time.sleep(pause)
        waited += pause
        nodes = _alive_nodes()

    print("<> Found all {} ray nodes".format(len(nodes)))
    node_resources = ray.cluster_resources()
    # .get avoids a KeyError if the cluster reports no such resource.
    print("         with total of {} CPUs and {} RAM".format(
        node_resources.get('CPU', 0),
        convert_size(node_resources.get('memory', 0))))

# Entry point: run main() when this code is executed as the top-level module.
# NOTE(review): main() reads its options through argparse; when this is run as a
# notebook cell rather than a script, confirm the process arguments are as intended.
if __name__ == "__main__":
    main()