In [1]:
import os
import sys
from time import time
import pandas as pd
import dask.dataframe as dd
from dask import compute
from itertools import product
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import patches, dates, lines
from matplotlib.transforms import blended_transform_factory

In [2]:
# Global plot styling. The statements are order-dependent: each call mutates
# matplotlib's rcParams, and later calls win where settings overlap.
plt.rcParams["axes.labelweight"] = "bold"

sns.set_palette("deep")
sns.set_style("white")
# NOTE(review): 'fontweight' does not look like a standard rcParams key -- confirm
# that it has any effect here (it may be silently ignored).
sns.set_context("paper", font_scale = 2.0, rc={"grid.linewidth": 2.5, 'fontweight':'bold'})

# Applied last, so anything this style sheet sets takes precedence over the
# settings above. The path is relative to the notebook's working directory.
plt.style.use("acm_ieee_latex_pubstyle.txt")

# Figure widths for one- and two-column layouts (used as matplotlib figsize units).
SINGLE_COLUMN = SINGLE_WIDTH = 8.0
DOUBLE_COLUMN = DOUBLE_WIDTH = 16.0


def width_height(width=SINGLE_COLUMN, height=None, columns=1, rows=1):
    """Translate an overall figure size into per-facet ``height``/``aspect``.

    Parameters
    ----------
    width : total figure width.
    height : total figure height; when ``None`` a 4:3 figure is assumed
        (``height = width * 3/4``).
    columns, rows : number of facet columns / rows sharing that figure.

    Returns
    -------
    dict with keys ``"height"`` (height of one facet) and ``"aspect"``
    (width/height ratio of one facet), ready to be splatted into a call that
    accepts those two keywords.
    """
    total_width = float(width)
    total_height = float(width * 3/4 if height is None else height)

    # Width of the whole figure relative to the height of a single row.
    ratio = total_width / (total_height / float(rows))

    return {"height": total_width / ratio, "aspect": ratio / float(columns)}


In [3]:
# Column names used throughout the notebook.
TIME = 'timestamp'
NODE = 'hostname'
# ROW and RACK are not read from the dataset; they are derived from the hostname later on.
ROW = 'row'
RACK = 'rack'
GPU_MEM = 'gpu_mem.mean'
DIMM = 'dimm.mean'
CPU = 'cpu_core.mean'
INPUT = 'input_power.mean'
TOTAL = 'total_power.mean'
# This name used to be a first `GPU_POWER = 'gpu_total_power.mean'` assignment that
# was silently overwritten by the `GPU_POWER = 'gpu_power'` further down. It gets its
# own name so the source column is still available and GPU_POWER is defined once.
GPU_TOTAL_POWER = 'gpu_total_power.mean'
CPU_POWER = 'cpu_total_power.mean'
MEM_POWER = 'mem_power.mean'

# Per-GPU source columns (six GPUs per node; power columns are keyed by socket p0/p1
# with three GPUs each).
GPU_CORES = [f'gpu{gpu}_core_temp.mean' for gpu in range(6)]
GPU_MEMS = [f'gpu{gpu}_mem_temp.mean' for gpu in range(6)]
GPU_MAXS = [f'gpu{gpu}_core_temp.max' for gpu in range(6)]
GPU_POWERS = [f'p{p}_gpu{gpu}_power.mean' for p, gpu in product(range(2), range(3))]

# Shared labels the per-GPU columns are renamed to after loading.
GPU_CORE = 'gpu_core'
GPU_MAX = 'gpu_core.max'
GPU_POWER = 'gpu_power'

DIMMS = [f'dimm{dimm}_temp.mean' for dimm in range(16)]
# Core index 13 is skipped (presumably there is no such sensor -- confirm). A list is
# used instead of a set difference so the column order is deterministic.
CPU_CORES = [f'p{p}_core{core}_temp.mean'
             for p, core in product(range(2), [core for core in range(24) if core != 13])]


In [41]:
def plot_cluster_power(ax, df, time_from=None, time_to=None, power_col=None):
    """Draw one power column of ``df`` as a black line and hide the x axis.

    Parameters
    ----------
    ax : axes object to draw on.
    df : DataFrame indexed by time that contains ``power_col``.
    time_from, time_to : optional inclusive bounds of the plotted window;
        a falsy value falls back to the first / last index value.
    power_col : name of the column to plot. Defaults to ``INPUT``.
        The previous body read an undefined global ``POWER`` and therefore
        raised ``NameError`` on every call.
        NOTE(review): confirm that ``INPUT`` (rather than ``TOTAL``) is the
        series this plot was meant to show.

    Returns
    -------
    The ``ax`` that was passed in, matching the other plot helpers here.
    """
    if power_col is None:
        power_col = INPUT
    time_from = time_from or df.index.min()
    time_to = time_to or df.index.max()
    in_window = (df.index >= time_from) & (df.index <= time_to)
    ax.plot(df.loc[in_window, power_col], color='black')
    ax.xaxis.set_visible(False)
    return ax
    

def plot_node_variation(ax, df, time_from=None, time_to=None):
    """Draw per-timestamp box plots of GPU power (on ``ax``) and GPU core
    temperature (on a twin y axis), plus min/max markers for each timestamp.

    Parameters
    ----------
    ax : axes for the power box plots; a twin axes is created from it for
        the temperature box plots.
    df : DataFrame indexed by time with at least the ``GPU_POWER`` and
        ``GPU_CORE`` columns; several rows per timestamp are expected.
    time_from, time_to : optional inclusive bounds; a falsy value falls back
        to the first / last index value.

    Returns
    -------
    ``(ax, ax2)`` -- the power axes and the temperature twin axes.

    Notes
    -----
    Reads the module-level colours ``blue`` and ``orange``, which are only
    assigned in the plotting cell, so that cell must run before this is called.
    """
    time_from = time_from or df.index.min()
    time_to = time_to or df.index.max()
    df = df[(df.index >= time_from) & (df.index <= time_to)]
    # Per-timestamp extrema, kept in wide form for the marker overlays below.
    df_min = df.groupby(df.index).min().reset_index()
    df_max = df.groupby(df.index).max().reset_index()
    # Long form (timestamp, field, value) for seaborn.
    df = df.reset_index().melt(id_vars=TIME, var_name='field', value_name='value')

    # hue_order has two entries, so each timestamp gets two side-by-side slots.
    # np.nan presumably acts as an empty placeholder so that power boxes use the
    # left slot here and temperature boxes the right slot on the twin axes.
    sns.boxplot(x=TIME, y='value', data=df, hue='field', hue_order=[GPU_POWER, np.nan], showfliers=False,
                ax=ax, linewidth=.5)
    # Marker x positions: category index shifted by -/+ .2, presumably matching
    # seaborn's offsets for two hue levels -- TODO confirm.
    power_extrema_pos = np.arange(len(df_min)) - .2
    thermal_extrema_pos = np.arange(len(df_min)) + .2
    ms = 2.5
    # Fliers are hidden above; the true min/max are shown as small triangles instead.
    ax.plot(power_extrema_pos, df_min[GPU_POWER], marker='v', linestyle='None', color=blue, markersize=ms)
    ax.plot(power_extrema_pos, df_max[GPU_POWER], marker='^', linestyle='None', color=blue, markersize=ms)
    ax.set_ylim(bottom=0)
    ax2 = ax.twinx()
    sns.boxplot(x=TIME, y='value', data=df, hue='field', hue_order=[np.nan, GPU_CORE], showfliers=False,
                ax=ax2, linewidth=.5)
    ax2.plot(thermal_extrema_pos, df_min[GPU_CORE], marker='v', linestyle='None', color=orange, markersize=ms)
    ax2.plot(thermal_extrema_pos, df_max[GPU_CORE], marker='^', linestyle='None', color=orange, markersize=ms)

    ax.set_xlabel(' ')
    ax.tick_params(labelsize=14)
    # Label every 6th timestamp as HH:MM (one label per minute if samples are 10 s apart).
    freq = 6
    ticklabels = df[TIME].unique().strftime("%H:%M")
    ax.set_xticks(range(len(ticklabels))[::freq])
    ax.set_xticklabels(ticklabels[::freq])

    # Drop the legends seaborn adds for the hue variable.
    ax.get_legend().remove()
    ax2.get_legend().remove()

    return ax, ax2


def plot_time_slice(ax, df, time):
    """Draw a filled 2-D density of GPU power against GPU core temperature
    for the rows of ``df`` at index label ``time``.

    Uses the module-level colour ``green`` (assigned in the plotting cell).
    Ticks, the upper y limit and the axis labels are fixed so that several
    slices can be compared side by side. Returns ``ax``.
    """
    snapshot = df.loc[time]
    sns.kdeplot(data=snapshot, x=GPU_POWER, y=GPU_CORE, ax=ax,
                fill=True, alpha=.5, color=green)

    # Fixed tick positions and upper temperature limit for every slice.
    ax.set_xticks([0, 100, 200, 300])
    ax.set_yticks([30, 40, 50, 60])
    ax.set_ylim(top=65)

    ax.set_xlabel('Power (W)')
    ax.set_ylabel('')

    return ax


def plot_heatmap(ax, df, time, col, cbar_ax):
    """Draw a RACK x ROW heatmap of ``col`` for the rows of ``df`` at ``time``.

    Parameters
    ----------
    ax : axes to draw the heatmap on.
    df : long-form frame with ``ROW``, ``RACK``, ``GPU_CORE`` and ``GPU_MAX``.
    time : index label selecting the time slice.
    col : which aggregated column to show (``GPU_CORE`` -> per-cell mean,
        ``GPU_MAX`` -> per-cell max).
    cbar_ax : axes that receives the horizontal colour bar.

    Returns ``ax``.
    """
    # Colour limits come from the whole frame, not just this slice, so every
    # heatmap drawn from the same ``df`` shares one colour scale.
    vmin, vmax = df[GPU_CORE].min(), df[GPU_MAX].max()

    per_cell = (
        df.loc[time]
        .groupby([ROW, RACK])
        .agg({GPU_CORE: 'mean', GPU_MAX: 'max'})
        .reset_index()
    )
    per_cell[GPU_MAX] = per_cell[GPU_MAX].astype('float32')
    grid = per_cell.pivot(index=RACK, columns=ROW, values=col)

    sns.heatmap(grid, vmin=vmin, vmax=vmax, linewidths=0, ax=ax,
                cbar_ax=cbar_ax, cbar_kws={'orientation': 'horizontal'})

    # Second pass: force cell (rack 6, row 5) to 0 and paint only cells that are
    # not > 0 in lime. NOTE(review): presumably highlights one specific location
    # in the machine room -- confirm what (6, 5) stands for.
    grid.loc[6, 5] = 0
    lime_only = matplotlib.colors.ListedColormap(['lime'])
    overlay = sns.heatmap(grid, mask=(grid > 0), cmap=lime_only,
                          linewidths=0, ax=ax, cbar=False)
    overlay.set_facecolor('lightgray')

    for axis in (ax.xaxis, ax.yaxis):
        axis.set_visible(False)
    ax.set_xlabel('')
    ax.set_ylabel('')

    return ax


**Get the node list for flagship job 1286388 (4608 nodes)**

In [5]:
%%time
PERNODE_JOBS = '/gpfs/alpine/stf218/proj-shared/data/lake.dev/perhost_jobs_full/pernode_jobs_full.csv'
JOB_ID = 'allocation_id'
NODE_OLD = 'node_name'

pernode_jobs = pd.read_csv(PERNODE_JOBS, usecols=[JOB_ID, NODE_OLD])
nodes = pernode_jobs[pernode_jobs[JOB_ID] == 1286388][NODE_OLD].values

CPU times: user 52.6 s, sys: 2.15 s, total: 54.8 s
Wall time: 55.3 s


**Get per-node GPU power and temperature measurements from July 28, 2020**

In [6]:
# NOTE(review): these imports live in the middle of the notebook; consider moving
# them to the import cell at the top so all dependencies are visible in one place.
import random
from dask_jobqueue import SLURMCluster
from distributed import Client


# Set up Slurm cluster.
# The dashboard port is drawn at random on every run (unseeded), so the proxy
# address printed below changes between runs.
dashboard_port = random.randint(10000,60000)
# No cores/memory/queue arguments are given here; presumably they come from the
# dask-jobqueue configuration on this system -- confirm.
cluster = SLURMCluster(scheduler_options={"dashboard_address": f":{dashboard_port}"})

# We print out the address you copy into the dask-labextension
print("Dashboard address for the dask-labextension")
print(f"/proxy/{dashboard_port}")

# Create the client object
client = Client(cluster)
# Last expression of the cell: shows the client/cluster summary as cell output.
client

Dashboard address for the dask-labextension
/proxy/28396


0,1
Client  Scheduler: tcp://10.43.202.81:39517  Dashboard: http://10.43.202.81:28396/status,Cluster  Workers: 0  Cores: 0  Memory: 0 B


In [7]:
%%time
date_key = '20200728'
columns = [NODE] + GPU_CORES + GPU_MAXS + GPU_POWERS
columns_renamer = {**{col: GPU_CORE for col in GPU_CORES}, **{col: GPU_MAX for col in GPU_MAXS},
                   **{col: GPU_POWER for col in GPU_POWERS}}
ddf = dd.read_parquet(
    f'/gpfs/alpine/stf218/proj-shared/data/lake/summit_power_temp_openbmc/10s_agg/{date_key}.parquet',
    index=TIME, columns=columns, engine='pyarrow').rename(columns=columns_renamer)

CPU times: user 962 ms, sys: 125 ms, total: 1.09 s
Wall time: 1.69 s


In [8]:
# Ask the cluster for four worker jobs; the compute below runs on these workers.
cluster.scale(jobs=4)

In [9]:
%%time
def t2p(time):
    """Return the minute of the day (0..1439) for ``time``.

    ``time`` is anything ``pd.to_datetime`` accepts. The result is used below
    as a partition index into ``ddf``.
    """
    parsed = pd.to_datetime(time)
    hours, minutes = parsed.hour, parsed.minute
    return 60 * hours + minutes

# 1440 partitions = minutes in a day; presumably one partition per minute, so a
# different count means the day's data is incomplete.
assert ddf.npartitions == 1440 # ensure data is complete
# Materialise only the partitions from minute 15:09 up to (not including) 15:17
# and stitch them into a single in-memory pandas frame.
df = pd.concat(
    compute([
        ddf.get_partition(minute)
        for minute in range(ddf.npartitions)[t2p('15:09'):t2p('15:17')]
    ])[0]
)

CPU times: user 203 ms, sys: 81.7 ms, total: 285 ms
Wall time: 26 s


In [10]:
# Release the Dask resources; everything needed from here on is already in the
# local pandas frame `df`.
# NOTE(review): the cluster is closed before the client that is attached to it.
# The more common order is client first -- confirm this does not cause
# shutdown warnings.
cluster.close()
client.close()

In [11]:
# Letter -> 1-based number for the first hostname character.
# NOTE(review): this map is called `rows` but feeds the RACK column, while ROW
# takes the two digits that follow -- confirm the ROW/RACK names are not swapped.
rows = {letter: number for number, letter in enumerate('abcdefgh', start=1)}

# Keep the job's nodes, derive ROW/RACK from the hostname, drop ROW == 50 and
# finally drop the hostname itself. NOTE: this rebinds `df`, so the cell is not
# re-runnable without reloading the data first.
df = (
    df.loc[df[NODE].isin(nodes)]
    .assign(**{
        ROW: lambda frame: frame[NODE].str[1:3].astype(int),
        RACK: lambda frame: frame[NODE].str[:1].apply(rows.get),
    })
    .loc[lambda frame: frame[ROW] != 50]
    .drop(columns=NODE)
)

In [12]:
# Reshape from one row per (timestamp, node) to one row per (timestamp, node, GPU).
# The selection is positional and assumes the column order built by the cells
# above: six gpu_core columns (0-5), six gpu_core.max (6-11), six gpu_power
# (12-17), then row (18) and rack (19). For GPU g this picks its core mean,
# core max and power columns plus row/rack, and the six pieces are stacked.
df_long = pd.concat([df.iloc[:, [gpu, gpu+6, gpu+12, 18, 19]] for gpu in range(6)])

In [13]:
# Window shown on the timeline (inclusive) and the six instants that get their
# own density plot and heatmaps.
time_from = '20200728 15:10:00'
time_to = '20200728 15:16:30'
slice_times = [
    '20200728 ' + clock
    for clock in ('15:10:10', '15:10:50', '15:11:40', '15:14:40', '15:15:30', '15:16:20')
]
# Position of each slice on the timeline, counted in 10-second steps from time_from.
slice_times_idxs = [
    int((pd.to_datetime(moment) - pd.to_datetime(time_from)).total_seconds() / 10)
    for moment in slice_times
]

**Plot per-GPU power and GPU core temperature**

In [None]:
%%time
blue, orange, green = sns.color_palette('deep', n_colors=3)
fig, axes = plt.subplots(nrows=7, ncols=len(slice_times), gridspec_kw={'height_ratios': [1, 0.3, 1, .42, .5, .5, .1]},
                         figsize=(DOUBLE_WIDTH, 6))
wspace = hspace = .2
gs = axes[0][0].get_gridspec()

dummy_rows = [1, 3] # for increased vertical space
for i in dummy_rows:
    dummies = axes[i]
    [ax.axis('off') for ax in dummies]
[ax.axis('off') for ax in axes[0]]
timeline = fig.add_subplot(gs[0, :])
slices = axes[2]
heatmaps_mean = axes[4]
heatmaps_max = axes[5]
[ax.axis('off') for ax in axes[6]]
colorbar = fig.add_subplot(gs[6, :])

for i in range(1, len(slices)):
    slices[i].sharex(slices[i - 1])
    slices[i].sharey(slices[i - 1])

# Plot node variation.
# `left` is the power axes (same object as `timeline`), `right` its temperature twin.
left, right = plot_node_variation(timeline, df_long, time_from, time_to)
left.set_ylabel('Power (W)')
right.set_ylabel('Temp. (˚C)')

# Colour-code each y axis like its data: blue = power, orange = temperature.
left.tick_params(axis='y', colors=blue)
right.tick_params(axis='y', colors=orange)
for twin in (left, right):
    twin.spines['left'].set_color(blue)
    twin.spines['right'].set_color(orange)
right.spines['right'].set_visible(True)
timeline.xaxis.tick_bottom()

# Make boxplots hollow.
for ax in [left, right]:
    # Colour of the boxes on this axes. It is reset for every axes so a missing
    # box can never leave `col` undefined or carry over the previous axes' colour.
    col = None
    for artist in ax.artists:
        col = artist.get_facecolor()
        artist.set_edgecolor(col)
        artist.set_facecolor('None')

    if col is None:
        # No box artists were found in ax.artists, so there is no colour to apply.
        # NOTE(review): newer matplotlib/seaborn versions may expose the boxes via
        # ax.patches instead -- confirm against the versions pinned for this notebook.
        continue

    # Recolour whiskers, caps, medians and the min/max markers to match the boxes.
    for line in ax.lines:
        line.set_color(col)
        line.set_mfc(col)
        line.set_mec(col)

# Make legend.
# Hand-made entries (hollow boxes) replace the seaborn legends removed earlier.
blue_patch = patches.Patch(label='Power', facecolor='white', edgecolor=blue)
orange_patch = patches.Patch(label='Temp.', facecolor='white', edgecolor=orange)
left.legend(handles=[blue_patch, orange_patch], frameon=False, fontsize=12, loc=(0.01, .63))

# Plot time slices.
# One power-vs-temperature density per selected instant, left to right.
for slice_time, slice_ax in zip(slice_times, slices):
    plot_time_slice(slice_ax, df_long, slice_time)
    slice_ax.xaxis.tick_bottom()
    slice_ax.yaxis.tick_left()
# Only the first panel carries the y label.
slices[0].set_ylabel('Temp. (˚C)')

# Annotate time slices.
# Connect each shaded instant on the timeline to its density panel with a dashed
# line, and draw a bracket above each panel. The numeric offsets below look
# hand-tuned for this figure size -- re-check them if the layout changes.
y_line = -125  # y (timeline data units) where the dashed connector ends, below the axes
slice_notch_length = 2  # length of the bracket's end notches (slice-panel y units)
y_max_offset = 8  # how far above the panel's upper y limit the bracket sits
lw = plt.rcParams['axes.linewidth']
# Number of 10-second steps from time_from to time_to, both ends included.
timeline_len = (pd.to_datetime(time_to) - pd.to_datetime(time_from)).total_seconds() / 10 + 1
# `plot` is the timeline axes on every iteration; x_slice is the slice's position on it.
for i, (plot, slice_plot, x_slice) in enumerate(zip([timeline]*len(slices), slices, slice_times_idxs)):
#     plot.get_xticklabels()[x_slice // 6 + x_slice % 6].set_weight('bold')
    # Shade the one-step-wide band of this instant on the timeline.
    plot.axvspan(x_slice - .5, x_slice + .5, color='gray', alpha=0.15)    
    xmin, xmax = slice_plot.get_xlim()
    _, ymax = slice_plot.get_ylim()

    # Width of one slice panel expressed in timeline x units, assuming the panels
    # and the wspace-sized gaps between them together span the timeline's width.
    slice_plot_width = timeline_len / (len(slices) + wspace * (len(slices) - 1))
    slice_plot_width_with_gap = slice_plot_width * (1 + wspace)
    # Dashed connector from (roughly) the middle of panel i at y_line up to the
    # slice's position on the timeline baseline; drawn outside the axes, hence clip_on=False.
    plot.add_line(lines.Line2D((i * slice_plot_width_with_gap + .5 * slice_plot_width / (1 + wspace), x_slice),
                               (y_line, 0), clip_on=False, color='k', linestyle='--', lw=lw))
    # Bracket above the panel: one horizontal bar plus a short notch at each end.
    slice_plot.add_line(lines.Line2D((xmin, xmax), (ymax + y_max_offset, ymax + y_max_offset), clip_on=False, color='k', lw=lw))
    slice_plot.add_line(lines.Line2D((xmin, xmin), (ymax + y_max_offset - slice_notch_length, ymax + y_max_offset), clip_on=False, color='k', lw=lw))
    slice_plot.add_line(lines.Line2D((xmax, xmax), (ymax + y_max_offset - slice_notch_length, ymax + y_max_offset), clip_on=False, color='k', lw=lw))

# Plot heatmaps.
# One row of per-slice heatmaps for the mean core temperature, then one for the
# max. Every call draws its colour bar into the same shared `colorbar` axes.
for heatmap_row, column in ((heatmaps_mean, GPU_CORE), (heatmaps_max, GPU_MAX)):
    for t, heatmap in zip(slice_times, heatmap_row):
        plot_heatmap(heatmap, df_long, t, column, colorbar)

# Label each row on its first panel only: re-enable the y axis (plot_heatmap hides
# it), blank the tick labels and use the y label as a horizontal row title.
# Each panel now blanks its *own* tick labels; the max row previously took its
# label count from the mean row's axes (copy-paste slip).
for first_heatmap, row_title in ((heatmaps_mean[0], 'Mean'), (heatmaps_max[0], 'Max')):
    first_heatmap.yaxis.set_visible(True)
    first_heatmap.set_yticklabels(['' for _ in first_heatmap.get_yticklabels()])
    first_heatmap.set_ylabel(row_title, rotation=0, labelpad=18, y=0.5, ha='center', va='center')

# Append the unit to the colour bar's tick labels.
# NOTE(review): get_text() can return empty strings before the figure has been
# drawn on some matplotlib versions -- confirm the labels show the numbers.
cbar = heatmaps_max[-1].collections[0].colorbar.ax
cbar.set_xticklabels([x.get_text() + '˚C' for x in cbar.get_xticklabels()])

# Let matplotlib fit the figure first, then force the subplot spacing: the
# annotation geometry computed earlier in this cell uses the same wspace value.
plt.tight_layout()
plt.subplots_adjust(wspace=wspace, hspace=hspace)

In [None]:
# Write the composed figure to a PDF. The path is relative to the notebook's
# working directory; the ../plots directory presumably has to exist already.
fig.savefig('../plots/component_variation.pdf')

In [38]:
# Get non-outlier temperature and power spread at time slice 2.
for col in [GPU_CORE, GPU_POWER]:
    x = df_long.loc[slice_times[2]][col].values
    p25 = np.nanquantile(x, .25)
    p75 = np.nanquantile(x, .75)
    iqr = p75 - p25
    print(f'spread of {col} is {p75 + 1.5*iqr - (p25 - 1.5*iqr)}')

spread of gpu_core is 15.822235107421875
spread of gpu_power is 62.2222900390625
