## Getting Started

In [None]:
import json
import os
import random
import re
import sys

# make the project modules imported below (utils, train) resolvable;
# presumably they live in ../bin -- confirm against the repository layout
sys.path.append('../bin')

# TF_CPP_MIN_LOG_LEVEL is set before tensorflow is imported so that it is
# already in the environment when the library loads and starts logging
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

from IPython.display import display
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tensorflow import keras

from sklearn.metrics import \
    mean_absolute_error

from utils import \
    UNITS, \
    anomaly_score, \
    check_std, \
    predict_intervals

from train import \
    load_dataset, \
    load_datasets, \
    is_categorical, \
    create_dataset, \
    create_mlp, \
    create_rf, \
    create_pipeline, \
    mean_absolute_percentage_error, \
    prediction_interval_coverage, \
    coverage_error, \
    evaluate_cv

## Load Minibench Data

In [None]:
# columns kept from each minibench trace: the node label first,
# followed by the cpu, disk and gpu metrics
columns = ['node_type']
columns += ['cpu_iops', 'cpu_flops', 'cpu_mem_bw']
columns += ['disk_read', 'disk_write']
columns += ['gpu_flops', 'gpu_mem_bw']



def plot_minibench(df, outfile, w=16, h=4):
    """
    Plot each minibench metric against node type and save the figure.

    One bar chart per metric is drawn in a single column of subplots that
    share the x axis. The figure is written to `outfile` and then shown.

    :param df: dataframe with a 'node_type' column and the metric columns
        listed in `mapper` below
    :param outfile: path handed to plt.savefig
    :param w: figure width
    :param h: height of one subplot (the figure height is h times the
        number of plots)
    """
    # sort data by node type
    df = df.sort_values('node_type')

    # rename metrics to include units
    mapper = {
        'cpu_flops':  'cpu_flops (GFLOP/s)',
        'cpu_mem_bw': 'cpu_mem_bw (GB/s)',
        'disk_read':  'disk_read (GB/s)',
        'disk_write': 'disk_write (GB/s)',
        'gpu_flops':  'gpu_flops (GFLOP/s)',
        'gpu_mem_bw': 'gpu_mem_bw (GB/s)',
    }

    df = df.rename(columns=mapper)

    # plot minibench metrics
    plots = [
        ('node_type', mapper['cpu_flops']),
        ('node_type', mapper['cpu_mem_bw']),
        ('node_type', mapper['disk_read']),
        ('node_type', mapper['disk_write']),
        ('node_type', mapper['gpu_flops']),
        ('node_type', mapper['gpu_mem_bw']),
    ]

    # squeeze=False keeps `axes` two-dimensional for any number of plots
    fig, axes = plt.subplots(
        len(plots), 1,
        figsize=(w, h * len(plots)),
        sharex=True,
        squeeze=False)

    # draw directly on the axes created above instead of re-selecting them
    # through plt.subplot, so the shared x axis is guaranteed to be kept
    for ax, (x, y) in zip(axes.flatten(), plots):
        if x is not None:
            sns.barplot(x=x, y=y, data=df, palette='rocket', ci=68, ax=ax)
        else:
            # a plot entry without an x column is drawn as a histogram
            sns.histplot(y=y, data=df, ax=ax)

        # plt.xticks(rotation=90)

    plt.tight_layout()
    plt.savefig(outfile)
    plt.show()

### Palmetto

In [None]:
# load palmetto data
df = load_dataset('../../minibench/trace-palmetto/minibench.minibench.trace.txt')

# keep only the plotted columns; the explicit copy makes the in-place fixes
# below act on an independent frame (avoids SettingWithCopyWarning)
df = df[columns].copy()

# fix issues with gpu metrics: values whose magnitude exceeds the
# thresholds are reset to zero
df.loc[df['gpu_flops'].abs() > 1e7, 'gpu_flops'] = 0
df.loc[df['gpu_mem_bw'].abs() > 1e4, 'gpu_mem_bw'] = 0

display(df)

In [None]:
def version_sort(x):
    """
    Build a sortable key from a node type label.

    The key is the first run of digits in `x`, zero-padded to two digits,
    followed by the last character of `x` when that character is a letter.

    :param x: node type label containing at least one digit
    :return: the key as a string, e.g. '07' or '18a'
    """
    digits = re.search(r'\d+', x)
    number = int(digits[0])

    # only a trailing letter is carried over as a suffix
    last_char = x[-1]
    suffix = last_char if last_char.isalpha() else ''

    return '%02d%s' % (number, suffix)

# replace each node_type label with its sortable key so that the
# node types come out in order
df['node_type'] = df['node_type'].apply(version_sort)

In [None]:
# draw the palmetto metrics with the default figure size
plot_minibench(df, outfile='03-minibench-palmetto.pdf')

### Nautilus

In [None]:
# load nautilus data
df = load_dataset('../../minibench/trace-nautilus/minibench.minibench.trace.txt')

# keep only the plotted columns; the explicit copy makes the in-place fixes
# below act on an independent frame (avoids SettingWithCopyWarning)
df = df[columns].copy()

# fix issues with gpu metrics: cast to float, then reset values whose
# magnitude exceeds the thresholds to zero
df['gpu_flops'] = df['gpu_flops'].astype(np.float64)
df['gpu_mem_bw'] = df['gpu_mem_bw'].astype(np.float64)

df.loc[df['gpu_flops'].abs() > 1e7, 'gpu_flops'] = 0
df.loc[df['gpu_mem_bw'].abs() > 1e4, 'gpu_mem_bw'] = 0

# rename node types for plot
mapping = {
    '1060': 'GTX 1060',
    '1070': 'GTX 1070',
    '1080': 'GTX 1080',
    '1080Ti': 'GTX 1080Ti',
    '2080Ti': 'RTX 2080Ti',
    '3090': 'RTX 3090',
    'A100': 'Tesla A100',
    'A40': 'Tesla A40',
    'M4000': 'Quadro M4000',
    'T4': 'Tesla T4',
    'TITANRTX': 'TITAN RTX',
    'V100': 'Tesla V100',
    'cpu': 'CPU',
    'titan-xp': 'TITAN Xp',
}

# whole-value lookup: labels that exactly match a key are renamed in a
# single pass, all other labels are left unchanged
df['node_type'] = df['node_type'].replace(mapping)

display(df)

In [None]:
# column names in the node-type listing -> names used in this notebook
mapper = {
    'NAME': 'hostname',
    'PROCESSOR': 'processor',
    'GPU-TYPE': 'gpu_type'
}

# load nautilus node types (fixed-width text), keep only the renamed
# columns and turn missing entries into empty strings
df_nodes = pd.read_fwf('../../minibench/trace-nautilus/nautilus-node-types.txt').rename(columns=mapper)
df_nodes = df_nodes[mapper.values()].fillna('')

# an empty gpu_type is relabelled as 'cpu'
gpu_type = df_nodes['gpu_type']
df_nodes['gpu_type'] = gpu_type.str.replace('^$', 'cpu', regex=True)

display(df_nodes)

In [None]:
# draw the nautilus metrics on a narrower figure
plot_minibench(df, outfile='03-minibench-nautilus.pdf', w=12, h=4)

### Google Cloud

In [None]:
# load google cloud data
df = load_dataset('../../minibench/trace-google/minibench.minibench.trace.txt')

# keep only the plotted columns; the explicit copy makes the in-place fixes
# below act on an independent frame (avoids SettingWithCopyWarning)
df = df[columns].copy()

# fix issues with gpu metrics: cast to float, then reset values whose
# magnitude exceeds the thresholds to zero
df['gpu_flops'] = df['gpu_flops'].astype(np.float64)
df['gpu_mem_bw'] = df['gpu_mem_bw'].astype(np.float64)

df.loc[df['gpu_flops'].abs() > 1e7, 'gpu_flops'] = 0
df.loc[df['gpu_mem_bw'].abs() > 1e4, 'gpu_mem_bw'] = 0

display(df)

In [None]:
# draw the google cloud metrics on a small figure
plot_minibench(df, outfile='03-minibench-google.pdf', w=4, h=4)

## Load Trace Data

In [None]:
# load config file (the context manager closes the file handle)
with open('../workflows/kinc/params.json') as f:
    config = json.load(f)

# process names and merge arguments come straight from the config and are
# the same for every dataset, so they are built once
process_names = config['train_inputs'].keys()
merge_files = [arg.split(' ') for arg in config['train_merge_args']]

# load trace data
dataset_names = [
    'breast-palmetto',
    'breast-nautilus',
    'unified-nautilus'
]
data_map = {}

for dataset_name in dataset_names:
    base_dir = '../_datasets/kinc-%s' % (dataset_name)
    minibench_path = '%s/minibench.minibench.trace.txt' % (base_dir)

    # aggregate minibench data before merging
    df = load_dataset(minibench_path)
    df = df.drop(columns=['hash', 'cpus', 'time', 'disk', 'memory'], errors='ignore')
    df['gpu_flops'] = df['gpu_flops'].astype(np.float64)
    df['gpu_mem_bw'] = df['gpu_mem_bw'].astype(np.float64)
    df.loc[df['gpu_flops'].abs() > 1e7, 'gpu_flops'] = 0
    df.loc[df['gpu_mem_bw'].abs() > 1e4, 'gpu_mem_bw'] = 0
    df = df.groupby('node_type').mean()

    # NOTE: the per-node-type averages are written back over the trace file
    # that was just read, so the file on disk holds the aggregated table
    # from here on
    df.to_csv(minibench_path, sep='\t')

    data_map[dataset_name] = load_datasets('kinc', process_names, base_dir=base_dir, merge_files=merge_files)

# tag every frame with its platform and remove additional unused columns
unused_columns = ['hash', 'cpus', 'time', 'disk', 'memory', 'workdir', 'hostname']

for dataset_name, process_map in data_map.items():
    # the platform is the second dash-separated field of the dataset name
    platform = dataset_name.split('-')[1]

    for process_name, df in process_map.items():
        # the platform column is added to the loaded frame in place,
        # then a trimmed frame replaces it in the map
        df['platform'] = platform
        df = df.drop(columns=unused_columns, errors='ignore')
        process_map[process_name] = df

In [None]:
# select one similarity_chunk run for each unique workflow run
# NOTE(review): `resample` is not among the names imported at the top of this
# notebook -- confirm where it is defined before running this cell
key_columns = [
    'dataset',
    'n_rows',
    'n_cols',
    'hardware_type',
    'chunks',
    'threads',
    'platform',
    'node_type',
]

for dataset_name in ['breast-palmetto']:
    process_map = data_map[dataset_name]

    df = resample(process_map['similarity_chunk'], key_columns)

    display(df)

    # the resampled frame replaces the original one for later cells
    process_map['similarity_chunk'] = df

## Cross-Platform Runtime Prediction

### Palmetto P100/V100

In [None]:
process_name, target = 'similarity_chunk', 'runtime_hr'

# performance data and candidate input features for this process
df = data_map['breast-palmetto'][process_name]
inputs = config['train_inputs'][process_name]

# discard samples with missing data
df = df.dropna()

# keep only the inputs that take more than one value in the data
# (this builds a new list, so the config entry itself is not modified)
inputs = [c for c in inputs if df[c].nunique() > 1]

# swap the hardware label for the system metrics if appropriate
if 'hardware_type' in inputs:
    inputs.remove('hardware_type')
    inputs.extend([
        'cpu_flops',
        'cpu_mem_bw',
        'disk_read',
        'disk_write',
        'gpu_flops',
        'gpu_mem_bw',
    ])

# containers filled by the parameter sweep
df_scores = []
y_preds = {}

hw_a = 'p100'
hw_b = 'v100'
n_rows_max = df['n_rows'].max()
n_rows_values = sorted(df['n_rows'].unique())

# every (cutoff on A, cutoff on B) pair; the largest value is left out for B
ab_values = [(a, b) for a in n_rows_values for b in n_rows_values[:-1]]
names = ['%0.2f, %0.2f' % (a / n_rows_max, b / n_rows_max) for a, b in ab_values]

# row selectors that do not depend on the cutoffs
on_a = df['hardware_type'] == hw_a
on_b = df['hardware_type'] == hw_b

for name, (n_rows_a, n_rows_b) in zip(names, ab_values):
    # training rows: system A up to its cutoff plus system B up to its cutoff
    train_mask = (on_a & (df['n_rows'] <= n_rows_a)) | (on_b & (df['n_rows'] <= n_rows_b))
    # test rows: system B above its cutoff
    test_mask = on_b & (df['n_rows'] > n_rows_b)

    df_a = df[train_mask]
    df_b = df[test_mask]

    # create train/test sets for df_a, df_b
    X_train, y_train, columns, _ = create_dataset(df_a, inputs, target)
    X_test, y_test, _, _ = create_dataset(df_b, inputs, target)

    # define and train the model
    model = create_pipeline(create_mlp(X_train.shape[1], intervals=True))
    model.fit(X_train, y_train)

    # predict on the test rows and derive the intervals
    y_bar, y_std = check_std(model.predict(X_test))
    y_lower, y_upper = predict_intervals(y_bar, y_std)

    # evaluate model
    mae = mean_absolute_error(y_test, y_bar)
    mpe = mean_absolute_percentage_error(y_test, y_bar)
    cov = coverage_error(y_test, y_lower, y_upper)

    # save metrics and predictions for plots
    df_scores.append(dict(name=name, mae=mae, mpe=mpe, cov=cov))
    y_preds[name] = (y_bar, y_std)

# save results
df_scores = pd.DataFrame(df_scores)

# plot evaluation scores for each model
fig, axes = plt.subplots(2, 1, figsize=(12, 8))

# (score column, y-axis label, height of the dashed reference line)
panels = [
    ('mpe', 'MAPE (%)', 20),
    ('cov', 'Coverage Error (%)', 5),
]

for ax, (metric, ylabel, level) in zip(axes, panels):
    sns.barplot(x='name', y=metric, data=df_scores, ci=68, color='tab:blue', ax=ax)
    ax.set_xlabel('Model')
    ax.set_ylabel(ylabel)

    # draw the reference line from edge to edge, then put the x-limits
    # back to what they were before the line was added
    xmin, xmax = ax.get_xlim()
    ax.plot(ax.get_xlim(), [level, level], 'r--')
    ax.set_xlim(xmin, xmax)

plt.tight_layout()

# write the figure in both formats before showing it
for summary_file in ['03-kinc-palmetto-palmetto-summary.pdf', '03-kinc-palmetto-palmetto-summary.png']:
    plt.savefig(summary_file)

plt.show()

# plot expected vs predicted target values for each model
# the grid is sized from the number of models (three per row) so that no
# model is silently dropped; twelve models give the 4 x 3 layout
grid_cols = 3
grid_rows = max(1, -(-len(names) // grid_cols))

fig, axes = plt.subplots(
    grid_rows, grid_cols,
    figsize=(4 * grid_cols, 4 * grid_rows),
    squeeze=False)

for name, (n_rows_a, n_rows_b), ax in zip(names, ab_values, axes.flatten()):
    # test rows for this model: system B above its cutoff
    # (copied because prediction columns are added to it below; the training
    # rows are not rebuilt here since nothing in this loop uses them)
    df_b = df[(df['hardware_type'] == hw_b) & (df['n_rows'] > n_rows_b)].copy()

    # expected target values for the test rows
    X_test, y_test, _, _ = create_dataset(df_b, inputs, target)

    # get model predictions saved by the sweep
    y_bar, y_std = y_preds[name]
    y_lower, y_upper = predict_intervals(y_bar, y_std)

    # save model predictions
    target_pred = '%s | %s' % (target, name)
    df_b[target_pred] = y_bar

    # save anomaly mask
    anomaly_pred = 'anomaly | %s' % (name)
    y_anomaly = anomaly_score(y_test, y_bar, y_std)
    df_b[anomaly_pred] = (np.abs(y_anomaly) > 0.997)

    # compute error bars (distance below / above the prediction)
    yerr = np.stack([
        y_bar - y_lower,
        y_upper - y_bar
    ])

    # create scatterplot: unflagged samples in blue, flagged samples in red
    is_anomaly = df_b[anomaly_pred]

    for mask, color in [(~is_anomaly, 'tab:blue'), (is_anomaly, 'tab:red')]:
        ax.errorbar(
            x=target,
            y=target_pred,
            yerr=yerr[:, mask],
            data=df_b[mask],
            ecolor=color, c=color, ls='', marker='o', mec='w')

    # diagonal marking where predicted equals expected
    vmax = max(df_b[target].max(), df_b[target_pred].max())
    ax.plot([0, vmax], [0, vmax], 'k--', zorder=0)
    ax.set_xlabel(target)
    ax.set_ylabel(target_pred)

    # take the score as a scalar; formatting a Series with %f relies on an
    # implicit single-element conversion
    cov = df_scores.loc[df_scores['name'] == name, 'cov'].iloc[0]
    ax.set_title('Coverage = %0.0f %%' % (100 - cov))

plt.tight_layout()
plt.savefig('03-kinc-palmetto-palmetto-scatter.pdf')
plt.savefig('03-kinc-palmetto-palmetto-scatter.png')
plt.show()

### Palmetto/Nautilus

In [None]:
process_name, target = 'similarity_chunk', 'runtime_hr'

# performance data for this process on each platform
# (palmetto rows with hardware_type 'cpu' are left out)
df_palmetto = data_map['breast-palmetto'][process_name]
not_cpu = df_palmetto['hardware_type'] != 'cpu'
df_palmetto = df_palmetto[not_cpu]
df_nautilus_a = data_map['breast-nautilus'][process_name]
df_nautilus_b = data_map['unified-nautilus'][process_name]

df = pd.concat([df_palmetto, df_nautilus_a, df_nautilus_b])
inputs = config['train_inputs'][process_name]

# discard samples with missing data
df = df.dropna()

# keep only the inputs that take more than one value in the data
# (this builds a new list, so the config entry itself is not modified)
inputs = [c for c in inputs if df[c].nunique() > 1]

# swap the hardware label for the system metrics if appropriate
if 'hardware_type' in inputs:
    inputs.remove('hardware_type')
    inputs.extend([
        'cpu_flops',
        'cpu_mem_bw',
        'disk_read',
        'disk_write',
        'gpu_flops',
        'gpu_mem_bw',
    ])

# containers filled by the parameter sweep
df_scores = []
y_preds = {}

# one model per palmetto n_rows cutoff, named by the cutoff's share of the maximum
n_rows_max = df_palmetto['n_rows'].max()
n_rows_values = sorted(df_palmetto['n_rows'].unique())
names = ['%0.2f' % (n_rows / n_rows_max) for n_rows in n_rows_values]

# row selectors that do not depend on the cutoff
on_palmetto = df['platform'] == 'palmetto'
on_nautilus = df['platform'] == 'nautilus'
is_breast_001 = df['dataset'].str.startswith('breast.001')

for name, n_rows in zip(names, n_rows_values):
    # training rows: palmetto up to the cutoff plus the nautilus breast.001 runs
    df_a = df[(on_palmetto & (df['n_rows'] <= n_rows)) | (on_nautilus & is_breast_001)]
    # test rows: every other nautilus run
    df_b = df[on_nautilus & (~is_breast_001)]

    # create train/test sets for df_a, df_b
    X_train, y_train, columns, _ = create_dataset(df_a, inputs, target)
    X_test, y_test, _, _ = create_dataset(df_b, inputs, target)

    # define and train the model
    model = create_pipeline(create_mlp(X_train.shape[1], intervals=True))
    model.fit(X_train, y_train)

    # predict on the test rows and derive the intervals
    y_bar, y_std = check_std(model.predict(X_test))
    y_lower, y_upper = predict_intervals(y_bar, y_std)

    # evaluate model
    mae = mean_absolute_error(y_test, y_bar)
    mpe = mean_absolute_percentage_error(y_test, y_bar)
    cov = coverage_error(y_test, y_lower, y_upper)

    # save metrics and predictions for plots
    df_scores.append(dict(name=name, mae=mae, mpe=mpe, cov=cov))
    y_preds[name] = (y_bar, y_std)

# save results
df_scores = pd.DataFrame(df_scores)

# plot evaluation scores for each model
fig, axes = plt.subplots(1, 3, figsize=(12, 4))

# the MAE panel uses the median target value of df_b (as left by the
# sweep above) as its reference line
target_median = df_b[target].median()

# (score column, y-axis label, height of the dashed reference line)
panels = [
    ('mae', 'MAE (%s)' % (UNITS[target]), target_median),
    ('mpe', 'MAPE (%)', 20),
    ('cov', 'Coverage Error (%)', 5),
]

for ax, (metric, ylabel, level) in zip(axes, panels):
    sns.barplot(x='name', y=metric, data=df_scores, ci=68, color='tab:blue', ax=ax)
    ax.set_xlabel('Model')
    ax.set_ylabel(ylabel)
    ax.plot(ax.get_xlim(), [level, level], 'r--')

plt.tight_layout()
plt.show()

# plot coverage profile for each model
plt.figure(figsize=(4, 4))

for name, n_rows in zip(names, n_rows_values):
    # rebuild the same split as in the sweep
    on_nautilus = df['platform'] == 'nautilus'
    is_breast_001 = df['dataset'].str.startswith('breast.001')
    palmetto_below_cutoff = (df['platform'] == 'palmetto') & (df['n_rows'] <= n_rows)

    df_a = df[palmetto_below_cutoff | (on_nautilus & is_breast_001)]
    df_b = df[on_nautilus & (~is_breast_001)]

    # create train/test sets for df_a, df_b
    # (only y_test is read below)
    X_train, y_train, columns, _ = create_dataset(df_a, inputs, target)
    X_test, y_test, _, _ = create_dataset(df_b, inputs, target)

    # predictions saved by the sweep
    y_bar, y_std = y_preds[name]

    # coverage measured at each interval level from 0.00 up to (not including) 1.00
    ci_values = np.arange(0.00, 1.00, 0.01)
    cov_values = np.zeros(len(ci_values))

    for i, ci in enumerate(ci_values):
        y_lower, y_upper = predict_intervals(y_bar, y_std, ci=ci)
        cov_values[i] = prediction_interval_coverage(y_test, y_lower, y_upper)

    # one curve per model, both axes in percent
    plt.plot(100 * ci_values, 100 * cov_values, label=name)

# diagonal marking where coverage equals the interval level
plt.plot([0, 100], [0, 100], 'k--', zorder=0)
plt.legend(title='model')
plt.xlabel('Confidence Interval (%)')
plt.ylabel('Coverage (%)')
plt.tight_layout()
plt.show()

# plot expected vs predicted target values for each model
fig, axes = plt.subplots(1, len(names), figsize=(4 * len(names), 4), squeeze=False)

for name, n_rows, ax in zip(names, n_rows_values, axes.flatten()):
    # test rows: nautilus runs on datasets other than breast.001
    # (copied because prediction columns are added to it below; the training
    # rows are not rebuilt here since nothing in this loop uses them)
    df_b = df[(df['platform'] == 'nautilus') & (~df['dataset'].str.startswith('breast.001'))].copy()

    # expected target values for the test rows
    X_test, y_test, _, _ = create_dataset(df_b, inputs, target)

    # get model predictions saved by the sweep
    y_bar, y_std = y_preds[name]
    y_lower, y_upper = predict_intervals(y_bar, y_std)

    # save model predictions
    target_pred = '%s | %s' % (target, name)
    df_b[target_pred] = y_bar

    # save anomaly mask
    anomaly_pred = 'anomaly | %s' % (name)
    y_anomaly = anomaly_score(y_test, y_bar, y_std)
    df_b[anomaly_pred] = (np.abs(y_anomaly) > 0.997)

    # compute error bars (distance below / above the prediction)
    yerr = np.stack([
        y_bar - y_lower,
        y_upper - y_bar
    ])

    # create scatterplot: unflagged samples in blue, flagged samples in red
    is_anomaly = df_b[anomaly_pred]

    for mask, color in [(~is_anomaly, 'tab:blue'), (is_anomaly, 'tab:red')]:
        ax.errorbar(
            x=target,
            y=target_pred,
            yerr=yerr[:, mask],
            data=df_b[mask],
            ecolor=color, c=color, ls='', marker='o', mec='w')

    # diagonal marking where predicted equals expected
    vmax = max(df_b[target].max(), df_b[target_pred].max())
    ax.plot([0, vmax], [0, vmax], 'k--', zorder=0)
    ax.set_xlabel(target)
    ax.set_ylabel(target_pred)

    # take the score as a scalar; formatting a Series with %f relies on an
    # implicit single-element conversion
    cov = df_scores.loc[df_scores['name'] == name, 'cov'].iloc[0]
    ax.set_title('Coverage = %0.0f %%' % (100 - cov))

plt.tight_layout()
plt.savefig('03-kinc-palmetto-nautilus-scatter.pdf')
plt.savefig('03-kinc-palmetto-nautilus-scatter.png')
plt.show()