In [32]:
# Standard preamble to use the Slurm cluster
import random
from dask_jobqueue import SLURMCluster
from distributed import Client

# Start a Dask cluster whose workers are submitted as Slurm jobs (Andes).
# Cluster defaults come from ./etc/dask/dask.yml; see the
# "dask.jobqueue.slurm" settings for the values in effect.
# The dashboard port is drawn at random from 10000..60000 inclusive
# (presumably to avoid clashing with other users' dashboards -- TODO confirm).
dashboard_port = random.randrange(10000, 60001)
cluster = SLURMCluster(
    scheduler_options=dict(dashboard_address=f":{dashboard_port}"),
)

# Emit the proxy path that gets pasted into the dask-labextension
print(
    "Dashboard address for the dask-labextension",
    f"/proxy/{dashboard_port}",
    sep="\n",
)

# Connect a client to the cluster; the bare name on the last line makes the
# notebook render the client summary.
client = Client(cluster)
client

Dashboard address for the dask-labextension
/proxy/49736


0,1
Client  Scheduler: tcp://10.43.202.83:40761  Dashboard: http://10.43.202.83:49736/status,Cluster  Workers: 0  Cores: 0  Memory: 0 B


In [33]:
%%time
import pandas as pd
import fastparquet as fp
import dask.dataframe as dd

# Ask for four Slurm jobs' worth of workers just before the heavy work begins
cluster.scale(jobs=4)

# Glob for the input parquet files (the "202004" in the name suggests
# April 2020 data -- TODO confirm)
FN = '/gpfs/alpine/stf218/proj-shared/stf008stc/openbmc.summit.raw/openbmc-202004*-*.parquet'

# Step 1: read only the two columns needed, without a parquet index.
df = dd.read_parquet(
    FN,
    columns=['timestamp', 'total_power'],
    index=False,
    engine='fastparquet',
    gather_statistics=False,
)
# Step 2: rebalance into partitions of roughly 100MB each.
# NOTE(review): per the dask docs, partition_size makes repartition measure
# partition memory eagerly, which can be slow on a large dataset.
df = df.repartition(partition_size="100MB")
# Step 3: index (and therefore sort) the rows by timestamp.
df = df.set_index('timestamp')

# Step 4: persist the indexed dataset on the cluster so later cells reuse it.
# The data is held by the dask workers, spilling to their scratch space
# (eventually on gpfs: /gpfs/alpine/scratch/<userid>/.gears/dask/dask-worker-space)
df = client.persist(df)


KeyboardInterrupt



In [27]:
# Standard deviation of total_power over the persisted dataset; working from
# the persisted data should be quicker than re-reading the parquet files.
try:
    value = df['total_power'].std().compute()
finally:
    # Always release the Slurm jobs, even when the computation fails
    # (e.g. KilledWorker), so no allocation is left running.
    cluster.scale(jobs=0)

# Keep the result as the final expression so the notebook actually displays it
# (a bare name followed by another statement is not rendered).
value

KilledWorker: ("('read-parquet-439db2776847e1b067144e3f0c98bb11', 26327)", <Worker 'tcp://10.43.17.58:44245', name: 1-1, memory: 0, processing: 1620>)

In [24]:
# Scale the cluster down to zero workers
cluster.scale(n=0)

In [31]:
# Tear down in order: close the client connection first, then the cluster object.
client.close()
cluster.close()