# Ray cluster dashboard at NERSC 

-----

### Load libraries


**TODO:** Put the Grafana and Prometheus setup in a utility library and move to ray init in browser, but add snippets in markdown.
That way the notebook has a very clean look and isn't overly complicated.

In [1]:
import os
import sys
from pathlib import Path
from shlex import split
from subprocess import PIPE, Popen, TimeoutExpired

In [2]:
#Metrics features are new in Ray 2.3.0
!{sys.executable} -m pip install "ray[default]==2.3.0"

Defaulting to user installation because normal site-packages is not writeable


### Start ray head-node

In [3]:
# Fixed pieces of the address: the NERSC JupyterHub host and the port used in
# the Grafana proxy URL.
jupyterhub = 'https://jupyter.nersc.gov'
grafana_port = '3000'

# Per-user URL prefix taken from the environment. The lookup yields None when
# JUPYTERHUB_SERVICE_PREFIX is not set, and None is then formatted into the
# URL below as the text "None".
user_jupyter = os.environ.get('JUPYTERHUB_SERVICE_PREFIX')

# Browser-facing Grafana address: <hub><user prefix>proxy/<port>, without a
# trailing slash.
grafana_root_url = "{}{}proxy/{}".format(jupyterhub, user_jupyter, grafana_port)

In [4]:
# Export the browser-facing Grafana address first: the Ray process launched
# below inherits this notebook's environment, so the variable must be set
# before Popen is called.
os.environ["RAY_GRAFANA_IFRAME_HOST"] = grafana_root_url

# Start the head node as a child process and keep the handle so the process
# can be stopped from the last cell of the notebook.
ray_head_process = Popen(["ray", "start", "--head", "--block"])

2023-03-12 20:04:42,799	INFO usage_lib.py:435 -- Usage stats collection is disabled.
2023-03-12 20:04:42,799	INFO scripts.py:710 -- [37mLocal node IP[39m: [1m128.55.64.30[22m
2023-03-12 20:04:44,990	SUCC scripts.py:747 -- [32m--------------------[39m
2023-03-12 20:04:44,990	SUCC scripts.py:748 -- [32mRay runtime started.[39m
2023-03-12 20:04:44,990	SUCC scripts.py:749 -- [32m--------------------[39m
2023-03-12 20:04:44,990	INFO scripts.py:751 -- [36mNext steps[39m
2023-03-12 20:04:44,990	INFO scripts.py:752 -- To connect to this Ray runtime from another node, run
2023-03-12 20:04:44,990	INFO scripts.py:755 -- [1m  ray start --address='128.55.64.30:6379'[22m
2023-03-12 20:04:44,990	INFO scripts.py:771 -- Alternatively, use the following Python code:
2023-03-12 20:04:44,990	INFO scripts.py:773 -- [35mimport[39m[26m ray
2023-03-12 20:04:44,990	INFO scripts.py:777 -- ray[35m.[39m[26minit(address[35m=[39m[26m[33m'auto'[39m[26m)
2023-03-12 20:04:44,991	INFO scripts.

If the notebook is not running on the same machine as the Ray head node, you can forward the dashboard port over SSH and then follow the same commands:


```python
# Node running the Ray head, and the dashboard port to tunnel from it.
compute_node = 'nid02010'
port = '8265'

# -N: run no remote command; -L: make <compute_node>'s localhost:<port>
# reachable as localhost:<port> on this machine.
ray_dashboard_port_foward = Popen(
    ['ssh', '-N', '-L', f'localhost:{port}:localhost:{port}', compute_node, '-o', 'LOGLEVEL=ERROR']
)
```

### Start Prometheus

In [5]:
# Prometheus stores its time-series database under $SCRATCH. Fail fast when
# the variable is missing: formatting None into the path would otherwise
# create a stray "None/ray_cluster/prometheus" directory in the current
# working directory.
scratch_dir = os.getenv('SCRATCH')
if not scratch_dir:
    raise RuntimeError(
        "The SCRATCH environment variable is not set; "
        "cannot choose a directory for the Prometheus database."
    )

prometheus_image = "prom/prometheus:v2.42.0"
# Configuration file located under Ray's session_latest directory.
prometheus_config = "/tmp/ray/session_latest/metrics/prometheus/prometheus.yml"

prometheus_db_dir = os.path.join(scratch_dir, "ray_cluster/prometheus")
Path(prometheus_db_dir).mkdir(parents=True, exist_ok=True)

# Pass the arguments as a list so that a path containing whitespace is not
# split into several arguments. The database directory is mounted at
# /prometheus inside the container and used as the TSDB storage path.
prometheus_process = Popen(
    [
        "shifter",
        f"--image={prometheus_image}",
        f"--volume={prometheus_db_dir}:/prometheus",
        "/bin/prometheus",
        f"--config.file={prometheus_config}",
        "--storage.tsdb.path=/prometheus",
    ]
)

ts=2023-03-13T03:05:00.766Z caller=main.go:512 level=info msg="No time or size retention was set so using the default time retention" duration=15d
ts=2023-03-13T03:05:00.766Z caller=main.go:556 level=info msg="Starting Prometheus Server" mode=server version="(version=2.42.0, branch=HEAD, revision=225c61122d88b01d1f0eaaee0e05b6f3e0567ac0)"
ts=2023-03-13T03:05:00.766Z caller=main.go:561 level=info build_context="(go=go1.19.5, platform=linux/amd64, user=root@c67d48967507, date=20230201-07:53:32)"
ts=2023-03-13T03:05:00.766Z caller=main.go:562 level=info host_details="(Linux 5.14.21-150400.24.46_12.0.63-cray_shasta_c #1 SMP Fri Mar 3 22:39:37 UTC 2023 (6e164f9) x86_64 login21 )"
ts=2023-03-13T03:05:00.766Z caller=main.go:563 level=info fd_limits="(soft=500000, hard=500000)"
ts=2023-03-13T03:05:00.766Z caller=main.go:564 level=info vm_limits="(soft=unlimited, hard=unlimited)"
ts=2023-03-13T03:05:00.770Z caller=web.go:561 level=info component=web msg="Start listening for connections" address

### Start Grafana

In [6]:
grafana_image = "grafana/grafana-oss:9.4.3"
# Grafana configuration and provisioning located under Ray's session_latest
# directory.
grafana_config = "/tmp/ray/session_latest/metrics/grafana/grafana.ini"
grafana_provisioning = "/tmp/ray/session_latest/metrics/grafana/provisioning"

# scratch_dir comes from the Prometheus cell. Joining it directly (instead of
# formatting it into a string first) raises a TypeError if it is None rather
# than silently creating a "None/ray_cluster/grafana" directory.
grafana_db_dir = os.path.join(scratch_dir, "ray_cluster/grafana")
Path(grafana_db_dir).mkdir(parents=True, exist_ok=True)

# Reuse the address built in the "Start ray head-node" section instead of
# redefining it here. GF_SERVER_ROOT_URL gets a trailing slash; it is stored
# under its own name so that grafana_root_url keeps the value that was
# exported as RAY_GRAFANA_IFRAME_HOST.
grafana_server_root_url = grafana_root_url.rstrip("/") + "/"

# Pass the arguments as a list so that a path containing whitespace is not
# split into several arguments. The database directory is mounted at /grafana
# inside the container and used for Grafana's data and plugins.
grafana_process = Popen(
    [
        "shifter",
        f"--image={grafana_image}",
        f"--volume={grafana_db_dir}:/grafana",
        "--env", "GF_PATHS_DATA=/grafana",
        "--env", "GF_PATHS_PLUGINS=/grafana/plugins",
        "--env", f"GF_SERVER_ROOT_URL={grafana_server_root_url}",
        "--env", f"GF_PATHS_CONFIG={grafana_config}",
        "--env", f"GF_PATHS_PROVISIONING={grafana_provisioning}",
        "--entrypoint",
    ]
)

logger=settings t=2023-03-12T20:05:08.601534072-07:00 level=info msg="Starting Grafana" version=9.4.3 commit=cf0a135595 branch=HEAD compiled=2023-03-02T12:28:42-08:00
logger=settings t=2023-03-12T20:05:08.601853768-07:00 level=warn msg="\"sentry\" frontend logging provider is deprecated and will be removed in the next major version. Use \"grafana\" provider instead."
logger=settings t=2023-03-12T20:05:08.601866712-07:00 level=info msg="Config loaded from" file=/usr/share/grafana/conf/defaults.ini
logger=settings t=2023-03-12T20:05:08.601871581-07:00 level=info msg="Config loaded from" file=/tmp/ray/session_latest/metrics/grafana/grafana.ini
logger=settings t=2023-03-12T20:05:08.6018763-07:00 level=info msg="Config overridden from command line" arg="default.paths.data=/grafana"
logger=settings t=2023-03-12T20:05:08.601879927-07:00 level=info msg="Config overridden from command line" arg="default.paths.logs=/var/log/grafana"
logger=settings t=2023-03-12T20:05:08.601882983-07:00 level=inf

### Access ray dashboard

In [7]:
# Port used for the Ray dashboard in the proxy URL.
ray_dashboard_port = '8265'

# Displayed as the cell output: the dashboard's node view, reached through the
# JupyterHub proxy path for localhost:<port>.
'{}{}proxy/localhost:{}/#/node'.format(jupyterhub, user_jupyter, ray_dashboard_port)

'https://jupyter.nersc.gov/user/asnaylor/perlmutter-shared-node-cpu/proxy/localhost:8265/#/node'

### Access grafana dashboard

Log in with Grafana's default credentials, and change the password when Grafana prompts you to:

```
username: admin
password: admin
```

In [8]:
# Displayed as the cell output: Grafana's login page, reached through the
# JupyterHub proxy path for the Grafana port.
'{}{}proxy/{}/login'.format(jupyterhub, user_jupyter, grafana_port)

'https://jupyter.nersc.gov/user/asnaylor/perlmutter-shared-node-cpu/proxy/3000/login'

logger=context userId=1 orgId=1 uname=admin t=2023-03-12T20:05:17.97512492-07:00 level=info msg="Request Completed" method=GET path=/api/live/ws status=-1 remote_addr=192.184.142.228 time_ms=32 duration=32.449119ms size=0 referer= handler=/api/live/ws
logger=context userId=1 orgId=1 uname=admin t=2023-03-12T20:05:18.015486729-07:00 level=info msg="Request Completed" method=GET path=/api/live/ws status=-1 remote_addr=192.184.142.228 time_ms=34 duration=34.180288ms size=0 referer= handler=/api/live/ws
logger=live t=2023-03-12T20:05:18.156461678-07:00 level=info msg="Initialized channel handler" channel=grafana/dashboard/uid/rayDefaultDashboard address=grafana/dashboard/uid/rayDefaultDashboard
logger=context userId=1 orgId=1 uname=admin t=2023-03-12T20:05:23.917985382-07:00 level=info msg="Request Completed" method=GET path=/api/live/ws status=-1 remote_addr=192.184.142.228 time_ms=33 duration=33.974104ms size=0 referer= handler=/api/live/ws
logger=context userId=1 orgId=1 uname=admin t=2

### Close down everything

In [9]:
# Seconds each service gets to exit after SIGTERM before it is killed.
shutdown_timeout_s = 30

service_processes = (ray_head_process, prometheus_process, grafana_process)

# Ask every service to stop with SIGTERM first. Unlike SIGKILL, it can be
# handled by the process, which gives it a chance to shut down cleanly.
for service_process in service_processes:
    service_process.terminate()

# Wait for each one so the exited children are reaped, and fall back to
# SIGKILL only for a process that ignores the request. This cell is safe to
# re-run: terminate() and wait() are no-ops on a process that already exited.
for service_process in service_processes:
    try:
        service_process.wait(timeout=shutdown_timeout_s)
    except TimeoutExpired:
        service_process.kill()
        service_process.wait()