# Dataset E: 100 hosts sample (among 4,626 nodes) for all dates

# Dask Setup

In [1]:
#
# workers x memory_per_worker <= available memory
# threads per worker == 1 if workload is CPU intensive
# dashboard port might need to change if running multiple dask instances within lab
#
# Sizing below is based on the basic jupyterlab environment provided by https://jupyter.olcf.ornl.gov
#
WORKERS = 16
MEMORY_PER_WORKER = "2GB"
THREADS_PER_WORKER = 1
DASHBOARD_PORT = ":8787"

## Local Dask cluster setup

* Install bokeh, spawn cluster, provide access point to dashboards
* Access jupyter hub at the address - https://jupyter.olcf.ornl.gov/hub/user-redirect/proxy/8787/status")
* Or access point for the Dask jupyter extension - /proxy/8787

In [2]:
# General prerequisites we want to have loaded from the get go
!pip install bokeh loguru



In [3]:
# Cleanup
try:
    client.shutdown()
    client.close()
except Exception as e:
    pass

In [4]:
# Setup block
import os
import pwd
import glob
import pandas as pd
from distributed import LocalCluster, Client
import dask
import dask.dataframe as dd

#LOCALDIR = "/gpfs/alpine/stf218/scratch/shinw/.tmp/dask-interactive"
LOCALDIR = "/tmp/dask"

In [5]:
dask.config.set({'worker.memory': {'target': False, 'spill': False, 'pause': 0.8, 'terminate': 0.95}})
#dask.config.config

<dask.config.set at 0x7f7487f278e0>

In [6]:
# Cluster creation
cluster = LocalCluster(processes=True, n_workers=WORKERS, threads_per_worker=THREADS_PER_WORKER,
                       dashboard_address=DASHBOARD_PORT, local_directory=LOCALDIR,
                       memory_limit=MEMORY_PER_WORKER)

client = Client(cluster)
cluster
print("Access jupyter hub at the address - https://jupyter.olcf.ornl.gov/hub/user-redirect/proxy/8787/status")
print("Dask jupyter extension - /proxy/8787")
client

Perhaps you already have a cluster running?
Hosting the HTTP server on port 34953 instead


Access jupyter hub at the address - https://jupyter.olcf.ornl.gov/hub/user-redirect/proxy/8787/status
Dask jupyter extension - /proxy/8787


0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:34953/status,

0,1
Dashboard: http://127.0.0.1:34953/status,Workers: 16
Total threads: 16,Total memory: 29.80 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:37885,Workers: 16
Dashboard: http://127.0.0.1:34953/status,Total threads: 16
Started: Just now,Total memory: 29.80 GiB

0,1
Comm: tcp://127.0.0.1:45149,Total threads: 1
Dashboard: http://127.0.0.1:34767/status,Memory: 1.86 GiB
Nanny: tcp://127.0.0.1:36161,
Local directory: /tmp/dask/dask-worker-space/worker-ukhrfmxn,Local directory: /tmp/dask/dask-worker-space/worker-ukhrfmxn

0,1
Comm: tcp://127.0.0.1:33379,Total threads: 1
Dashboard: http://127.0.0.1:39375/status,Memory: 1.86 GiB
Nanny: tcp://127.0.0.1:33087,
Local directory: /tmp/dask/dask-worker-space/worker-80yhvr3b,Local directory: /tmp/dask/dask-worker-space/worker-80yhvr3b

0,1
Comm: tcp://127.0.0.1:38301,Total threads: 1
Dashboard: http://127.0.0.1:33889/status,Memory: 1.86 GiB
Nanny: tcp://127.0.0.1:41591,
Local directory: /tmp/dask/dask-worker-space/worker-9f72jdi5,Local directory: /tmp/dask/dask-worker-space/worker-9f72jdi5

0,1
Comm: tcp://127.0.0.1:40871,Total threads: 1
Dashboard: http://127.0.0.1:34805/status,Memory: 1.86 GiB
Nanny: tcp://127.0.0.1:45563,
Local directory: /tmp/dask/dask-worker-space/worker-_a10im7k,Local directory: /tmp/dask/dask-worker-space/worker-_a10im7k

0,1
Comm: tcp://127.0.0.1:32973,Total threads: 1
Dashboard: http://127.0.0.1:45879/status,Memory: 1.86 GiB
Nanny: tcp://127.0.0.1:40611,
Local directory: /tmp/dask/dask-worker-space/worker-tb4apvcl,Local directory: /tmp/dask/dask-worker-space/worker-tb4apvcl

0,1
Comm: tcp://127.0.0.1:43609,Total threads: 1
Dashboard: http://127.0.0.1:36501/status,Memory: 1.86 GiB
Nanny: tcp://127.0.0.1:36825,
Local directory: /tmp/dask/dask-worker-space/worker-xpxib7_g,Local directory: /tmp/dask/dask-worker-space/worker-xpxib7_g

0,1
Comm: tcp://127.0.0.1:38529,Total threads: 1
Dashboard: http://127.0.0.1:42135/status,Memory: 1.86 GiB
Nanny: tcp://127.0.0.1:33089,
Local directory: /tmp/dask/dask-worker-space/worker-2kyekhy1,Local directory: /tmp/dask/dask-worker-space/worker-2kyekhy1

0,1
Comm: tcp://127.0.0.1:42795,Total threads: 1
Dashboard: http://127.0.0.1:42991/status,Memory: 1.86 GiB
Nanny: tcp://127.0.0.1:43503,
Local directory: /tmp/dask/dask-worker-space/worker-73d7pdix,Local directory: /tmp/dask/dask-worker-space/worker-73d7pdix

0,1
Comm: tcp://127.0.0.1:41855,Total threads: 1
Dashboard: http://127.0.0.1:34149/status,Memory: 1.86 GiB
Nanny: tcp://127.0.0.1:37209,
Local directory: /tmp/dask/dask-worker-space/worker-cyxbyxl5,Local directory: /tmp/dask/dask-worker-space/worker-cyxbyxl5

0,1
Comm: tcp://127.0.0.1:33145,Total threads: 1
Dashboard: http://127.0.0.1:36945/status,Memory: 1.86 GiB
Nanny: tcp://127.0.0.1:38227,
Local directory: /tmp/dask/dask-worker-space/worker-6xk2k5hf,Local directory: /tmp/dask/dask-worker-space/worker-6xk2k5hf

0,1
Comm: tcp://127.0.0.1:40161,Total threads: 1
Dashboard: http://127.0.0.1:38273/status,Memory: 1.86 GiB
Nanny: tcp://127.0.0.1:41153,
Local directory: /tmp/dask/dask-worker-space/worker-p7mnfk_c,Local directory: /tmp/dask/dask-worker-space/worker-p7mnfk_c

0,1
Comm: tcp://127.0.0.1:34415,Total threads: 1
Dashboard: http://127.0.0.1:38681/status,Memory: 1.86 GiB
Nanny: tcp://127.0.0.1:37041,
Local directory: /tmp/dask/dask-worker-space/worker-hhr68zx0,Local directory: /tmp/dask/dask-worker-space/worker-hhr68zx0

0,1
Comm: tcp://127.0.0.1:32853,Total threads: 1
Dashboard: http://127.0.0.1:38869/status,Memory: 1.86 GiB
Nanny: tcp://127.0.0.1:34363,
Local directory: /tmp/dask/dask-worker-space/worker-d9lqxb0h,Local directory: /tmp/dask/dask-worker-space/worker-d9lqxb0h

0,1
Comm: tcp://127.0.0.1:42259,Total threads: 1
Dashboard: http://127.0.0.1:45275/status,Memory: 1.86 GiB
Nanny: tcp://127.0.0.1:34575,
Local directory: /tmp/dask/dask-worker-space/worker-nvbp0gvo,Local directory: /tmp/dask/dask-worker-space/worker-nvbp0gvo

0,1
Comm: tcp://127.0.0.1:44757,Total threads: 1
Dashboard: http://127.0.0.1:38815/status,Memory: 1.86 GiB
Nanny: tcp://127.0.0.1:36177,
Local directory: /tmp/dask/dask-worker-space/worker-cndw55gv,Local directory: /tmp/dask/dask-worker-space/worker-cndw55gv

0,1
Comm: tcp://127.0.0.1:39173,Total threads: 1
Dashboard: http://127.0.0.1:45973/status,Memory: 1.86 GiB
Nanny: tcp://127.0.0.1:34903,
Local directory: /tmp/dask/dask-worker-space/worker-ovucakej,Local directory: /tmp/dask/dask-worker-space/worker-ovucakej


# Preloading tools & libraries

In [7]:
import sys
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
print("seaborn version: {}".format(sns.__version__))
print("Python version:\n{}\n".format(sys.version))
print("matplotlib version: {}".format(matplotlib.__version__))
print("pandas version: {}".format(pd.__version__))
print("numpy version: {}".format(np.__version__))

seaborn version: 0.11.2
Python version:
3.8.10 | packaged by conda-forge | (default, May 11 2021, 07:01:05) 
[GCC 9.3.0]

matplotlib version: 3.4.2
pandas version: 1.3.1
numpy version: 1.19.5


# File locations

In [8]:
DATA_BASE_PATH = "../data"
INPUT_FILES = f"{DATA_BASE_PATH}/powtemp_10sec_mean/**/*.parquet"
INPUT_PATH = f"{DATA_BASE_PATH}/powtemp_10sec_mean"
OUTPUT_PATH = f"{DATA_BASE_PATH}/e_full_10sec_100hosts"

In [9]:
!ls {INPUT_FILES}

../data/powtemp_10sec_mean/202001/20200101.parquet
../data/powtemp_10sec_mean/202001/20200102.parquet
../data/powtemp_10sec_mean/202001/20200103.parquet
../data/powtemp_10sec_mean/202001/20200106.parquet
../data/powtemp_10sec_mean/202001/20200107.parquet
../data/powtemp_10sec_mean/202001/20200108.parquet
../data/powtemp_10sec_mean/202001/20200109.parquet
../data/powtemp_10sec_mean/202001/20200110.parquet
../data/powtemp_10sec_mean/202001/20200111.parquet
../data/powtemp_10sec_mean/202001/20200112.parquet
../data/powtemp_10sec_mean/202001/20200113.parquet
../data/powtemp_10sec_mean/202001/20200114.parquet
../data/powtemp_10sec_mean/202001/20200115.parquet
../data/powtemp_10sec_mean/202001/20200116.parquet
../data/powtemp_10sec_mean/202001/20200117.parquet
../data/powtemp_10sec_mean/202001/20200118.parquet
../data/powtemp_10sec_mean/202001/20200119.parquet
../data/powtemp_10sec_mean/202001/20200120.parquet
../data/powtemp_10sec_mean/202001/20200121.parquet
../data/powtemp_10sec_mean/2020

# Schema Globals

Schema related global variables

In [10]:
# Developing a COLUMN filter we can use to process the data
RAW_COLUMN_FILTER = [
    # Meta information
    'timestamp',
    'node_state',
    'hostname',
    # Node input power (power supply)
    'ps0_input_power',
    'ps1_input_power',
    # Power consumption (Watts)
    # - GPU power
    'p0_gpu0_power',
    'p0_gpu1_power',
    'p0_gpu2_power',
    'p1_gpu0_power',
    'p1_gpu1_power',
    'p1_gpu2_power',
    # - CPU power
    'p0_power',
    'p1_power',
    # Thermal (Celcius)
    # - V100 core temperature
    'gpu0_core_temp',
    'gpu1_core_temp',
    'gpu2_core_temp',
    'gpu3_core_temp',
    'gpu4_core_temp',
    'gpu5_core_temp',
    # - V100 mem temperature (HBM memory)
    'gpu0_mem_temp',
    'gpu1_mem_temp',
    'gpu2_mem_temp',
    'gpu3_mem_temp',
    'gpu4_mem_temp',
    'gpu5_mem_temp',
    # - CPU core temperatures
    'p0_core0_temp',
    'p0_core1_temp',
    'p0_core2_temp',
    'p0_core3_temp',
    'p0_core4_temp',
    'p0_core5_temp',
    'p0_core6_temp',
    'p0_core7_temp',
    'p0_core8_temp',
    'p0_core9_temp',
    'p0_core10_temp',
    'p0_core11_temp',
    'p0_core12_temp',
    'p0_core14_temp',
    'p0_core15_temp',
    'p0_core16_temp',
    'p0_core17_temp',
    'p0_core18_temp',
    'p0_core19_temp',
    'p0_core20_temp',
    'p0_core21_temp',
    'p0_core22_temp',
    'p0_core23_temp',
    'p1_core0_temp',
    'p1_core1_temp',
    'p1_core2_temp',
    'p1_core3_temp',
    'p1_core4_temp',
    'p1_core5_temp',
    'p1_core6_temp',
    'p1_core7_temp',
    'p1_core8_temp',
    'p1_core9_temp',
    'p1_core10_temp',
    'p1_core11_temp',
    'p1_core12_temp',
    'p1_core14_temp',
    'p1_core15_temp',
    'p1_core16_temp',
    'p1_core17_temp',
    'p1_core18_temp',
    'p1_core19_temp',
    'p1_core20_temp',
    'p1_core21_temp',
    'p1_core22_temp',
    'p1_core23_temp',
]

In [11]:
# Column lists we actually end up using
COLS = [
        # Meta information
    'timestamp',
    'node_state',
    'hostname',
    # Node input power (power supply)
    'ps0_input_power',
    'ps1_input_power',
    # Power consumption (Watts)
    # - GPU power
    'p0_gpu0_power',
    'p0_gpu1_power',
    'p0_gpu2_power',
    'p1_gpu0_power',
    'p1_gpu1_power',
    'p1_gpu2_power',
    # - CPU power
    'p0_power',
    'p1_power',
    # Thermal (Celcius)
    # - V100 core temperature
    'gpu0_core_temp',
    'gpu1_core_temp',
    'gpu2_core_temp',
    'gpu3_core_temp',
    'gpu4_core_temp',
    'gpu5_core_temp',
    # - V100 mem temperature (HBM memory)
    'gpu0_mem_temp',
    'gpu1_mem_temp',
    'gpu2_mem_temp',
    'gpu3_mem_temp',
    'gpu4_mem_temp',
    'gpu5_mem_temp',
]

In [12]:
# Columns in order to calculate the row-wise min,max,mean
P0_CORES = ["p0_core0_temp",
        "p0_core1_temp",
        "p0_core2_temp",
        "p0_core3_temp",
        "p0_core4_temp",
        "p0_core5_temp",
        "p0_core6_temp",
        "p0_core7_temp",
        "p0_core8_temp",
        "p0_core9_temp",
        "p0_core10_temp",
        "p0_core11_temp",
        "p0_core12_temp",
        #"p0_core13_temp",
        "p0_core14_temp",
        "p0_core15_temp",
        "p0_core16_temp",
        "p0_core17_temp",
        "p0_core18_temp",
        "p0_core19_temp",
        "p0_core20_temp",
        "p0_core21_temp",
        "p0_core22_temp",
        "p0_core23_temp"]

P1_CORES = ["p1_core0_temp",
        "p1_core1_temp",
        "p1_core2_temp",
        "p1_core3_temp",
        "p1_core4_temp",
        "p1_core5_temp",
        "p1_core6_temp",
        "p1_core7_temp",
        "p1_core8_temp",
        "p1_core9_temp",
        "p1_core10_temp",
        "p1_core11_temp",
        "p1_core12_temp",
        #"p1_core13_temp",
        "p1_core14_temp",
        "p1_core15_temp",
        "p1_core16_temp",
        "p1_core17_temp",
        "p1_core18_temp",
        "p1_core19_temp",
        "p1_core20_temp",
        "p1_core21_temp",
        "p1_core22_temp",
        "p1_core23_temp"]

# Sampling & coarsening the data and creating a sampled dataset

Utilize map partitions feature and create a few samples from 4,626 nodes in 1 minute increments.
Trying to see if we can randomize from the partitions as well to reduce the I/O happening.

In [13]:
# Definition of the whole pipeline
import os
import shutil
import random
import glob


def find_work_to_do(output_path, input_path):
    return [
        os.path.basename(file).split(".")[0]
        for file in sorted(glob.glob(f"{input_path}/**/*.parquet"))
        if not os.access(
            os.path.join(
                output_path, os.path.basename(file)
            ), os.F_OK
        )
    ]


def handle_part(df):
    # Aggregate core temp
    df['p0_temp_max'] = df.loc[:,tuple(P0_CORES)].max(axis=1)
    df['p0_temp_min'] = df.loc[:,tuple(P0_CORES)].min(axis=1)
    df['p0_temp_mean'] = df.loc[:,tuple(P0_CORES)].mean(axis=1)
    df['p1_temp_max'] = df.loc[:,tuple(P1_CORES)].max(axis=1)
    df['p1_temp_min'] = df.loc[:,tuple(P1_CORES)].min(axis=1)
    df['p1_temp_mean'] = df.loc[:,tuple(P1_CORES)].mean(axis=1)
    COL_LIST = COLS + ['p0_temp_max', 'p0_temp_mean', 'p0_temp_min', 'p1_temp_max', 'p1_temp_mean', 'p1_temp_min']
    
    return df.loc[:, tuple(COL_LIST)]


def sample_hosts(output_path, input_path, hostnames=[], nhosts=1):
    # Limiting the # of files
    work_to_do = find_work_to_do(output_path, input_path)
    print(work_to_do)
    
    # Get random hostnames
    if hostnames == []:
        files = sorted(glob.glob(f"{input_path}/**/*.parquet"))
        ddf = dd.read_parquet(
                    files[0],
                    index=False,
                    columns=RAW_COLUMN_FILTER,
                    engine="pyarrow",
                    split_row_groups=True,
                    gather_statistics=True)
        df = ddf.get_partition(0).compute().set_index('hostname')
        hostnames = random.sample(df.index.unique().to_list(), nhosts)
        del ddf
        del df
    with open(f"{output_path}/hosts.txt", "w") as f:
        for host in sorted(hostnames):
            f.write(f"{host}\r\n")
    
    for date_key in work_to_do:
        print(f"  - sample day working on {date_key}")
        month_key = date_key[0:6]
        day_input_path = f"{input_path}/{month_key}/{date_key}.parquet"
        day_output_path = f"{output_path}/{date_key}.parquet"
        
        print(f"Day output path {day_output_path}")
        os.makedirs(os.path.dirname(day_output_path), exist_ok=True)
        ddf = dd.read_parquet(
                [day_input_path],
                index=False,
                columns=RAW_COLUMN_FILTER,
                engine="pyarrow",
                split_row_groups=True,
                gather_statistics=True)
        
        # Get only the hosts we are interested
        hostname_mask = ddf['hostname'].isin(hostnames)
        
        # Calculate the aggregates and dump the result
        df = ddf[hostname_mask].map_partitions(handle_part).compute()
        
        # Sort the day before sending it out
        df = df.sort_values(['hostname', 'timestamp'])
        
        # Write to the final file
        df.to_parquet(day_output_path, engine="pyarrow")

In [14]:
sample_hosts(OUTPUT_PATH, INPUT_PATH, nhosts=100)

[]


# Testing the output data

In [15]:
import glob
import pandas as pd

def get_host_dataframe(
    input_path = OUTPUT_PATH,
    hostnames = [],
    months = ["202001", "202008", "202102", "202108", "202201"],
    sort_values=["hostname", "timestamp"],
    set_index=["hostname"],
    columns=None,
):
    print(f"[reading time series for {hostnames} during {months}]")
    if columns != None:
        if "hostname" not in columns:
            columns.push("hostname")
        if "timestamp" not in columns:
            columns.push("timestamp")
    
    # Iterate all the files and fetch data for only the hostnames we're interested
    df_list = []
    for month in months:
        print(f"- reading {month}")
        files = sorted(glob.glob(f"{input_path}/{month}*.parquet"))
        for file in files:
            df = pd.read_parquet(file, engine="pyarrow", columns=columns)
            if hostnames != []:
                mask = df['hostname'].isin(hostnames)
                df_list.append(df[mask])
            else:
                df_list.append(df)
        
    print("- merging dataframe")
    df = pd.concat(df_list).reset_index(drop=True)
    
    print(f"- sorting based on {sort_values}")
    if sort_values != []:
        df = df.sort_values(sort_values)
    if set_index != []:
        df = df.set_index(set_index)
    print("- read success")
    return df    

In [16]:
df = get_host_dataframe(hostnames = ['f04n08', 'e34n12'], columns=["timestamp", "hostname", "ps0_input_power", "ps1_input_power"])

[reading time series for ['f04n08', 'e34n12'] during ['202001', '202008', '202102', '202108', '202201']]
- reading 202001
- reading 202008
- reading 202102
- reading 202108
- reading 202201
- merging dataframe
- sorting based on ['hostname', 'timestamp']
- read success


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2495783 entries, e34n12 to f04n08
Data columns (total 3 columns):
 #   Column           Dtype              
---  ------           -----              
 0   timestamp        datetime64[ns, UTC]
 1   ps0_input_power  float32            
 2   ps1_input_power  float32            
dtypes: datetime64[ns, UTC](1), float32(2)
memory usage: 57.1+ MB


In [18]:
df

Unnamed: 0_level_0,timestamp,ps0_input_power,ps1_input_power
hostname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
e34n12,2020-01-01 00:00:00+00:00,320.125000,344.750000
e34n12,2020-01-01 00:00:10+00:00,320.500000,344.250000
e34n12,2020-01-01 00:00:20+00:00,321.000000,343.500000
e34n12,2020-01-01 00:00:30+00:00,321.000000,344.250000
e34n12,2020-01-01 00:00:40+00:00,321.250000,344.375000
...,...,...,...
f04n08,2022-01-31 23:59:10+00:00,301.500000,370.799988
f04n08,2022-01-31 23:59:20+00:00,301.799988,370.899994
f04n08,2022-01-31 23:59:30+00:00,301.100006,371.600006
f04n08,2022-01-31 23:59:40+00:00,301.600006,370.500000
