## Getting Started

In [None]:
import os
import sys

# Silence TensorFlow's native log output. The variable is set here, ahead of
# every import that can load tensorflow, because setting it only after the
# library has already been loaded may be too late for it to take effect.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# make ../bin importable (presumably where the utils and train modules live)
sys.path.append('../bin')

from IPython.display import display
import json
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import seaborn as sns
from tensorflow import keras

from sklearn.metrics import \
    mean_absolute_error

from utils import \
    UNITS, \
    anomaly_score, \
    check_std, \
    predict_intervals, \
    resample

from train import \
    load_dataset, \
    load_datasets, \
    is_categorical, \
    create_dataset, \
    create_gb, \
    create_lr, \
    asym_loss, \
    create_mlp, \
    create_rf, \
    create_pipeline, \
    mean_absolute_percentage_error, \
    prediction_interval_coverage, \
    coverage_error, \
    evaluate_trials, \
    evaluate_cv

## Load Trace Data

In [None]:
# load config file (the context manager closes the handle once parsing is done)
with open('../workflows/kinc/params.json') as config_file:
    config = json.load(config_file)

# these settings do not depend on the dataset, so read them once up front;
# a malformed config then fails before any trace file is rewritten below
process_names = config['train_inputs'].keys()
merge_files = [arg.split(' ') for arg in config['train_merge_args']]

# load trace data
dataset_names = [
    'breast-palmetto',
    'breast-google',
    'unified-google'
]
data_map = {}

for dataset_name in dataset_names:
    base_dir = '../_datasets/kinc-%s' % (dataset_name)
    minibench_file = '%s/minibench.minibench.trace.txt' % (base_dir)

    # aggregate minibench data before merging
    df = load_dataset(minibench_file)
    df = df.drop(columns=['hash', 'cpus', 'time', 'disk', 'memory'], errors='ignore')
    df['gpu_flops'] = df['gpu_flops'].astype(np.float64)
    df['gpu_mem_bw'] = df['gpu_mem_bw'].astype(np.float64)

    # zero out gpu readings whose magnitude exceeds the thresholds below
    df.loc[df['gpu_flops'].abs() > 1e7, 'gpu_flops'] = 0
    df.loc[df['gpu_mem_bw'].abs() > 1e4, 'gpu_mem_bw'] = 0

    # one row per node type, holding the mean of each remaining column
    df = df.groupby('node_type').mean()

    # NOTE: this overwrites the trace file that was just read with the
    # aggregated table (presumably so load_datasets merges the aggregated
    # version -- confirm against train.load_datasets)
    df.to_csv(minibench_file, sep='\t')

    data_map[dataset_name] = load_datasets('kinc', process_names, base_dir=base_dir, merge_files=merge_files)

# tag every process table with its platform and remove additional unused columns
unused_columns = ['hash', 'cpus', 'time', 'disk', 'memory', 'workdir', 'hostname']

for dataset_name, process_map in data_map.items():
    # iterate over a snapshot of the keys, since the values are replaced below
    for process_name in list(process_map):
        df = process_map[process_name]

        # the platform is the second dash-separated field of the dataset name
        df['platform'] = dataset_name.split('-')[1]

        df = df.drop(columns=unused_columns, errors='ignore')
        process_map[process_name] = df

In [None]:
# select one similarity_chunk run for each unique workflow run
# (the columns below are the keys handed to resample)
run_keys = [
    'dataset',
    'n_rows',
    'n_cols',
    'hardware_type',
    'chunks',
    'threads',
    'platform',
    'node_type'
]

for dataset_name in ['breast-palmetto']:
    process_map = data_map[dataset_name]

    df = process_map['similarity_chunk']
    df = resample(df, run_keys)

    display(df)

    # store the resampled table back so that later cells use it
    process_map['similarity_chunk'] = df

## Cost Prediction

### Define Cost Model

In [None]:
# define hourly rates for gcp resources (us-central1)

# disk and storage prices below are divided by this figure to obtain a
# per-hour rate (presumably they are monthly list prices)
_HOURS_PER_MONTH = 730

gcp_rates = {
    # machine families: cpu is per vCPU per hour, mem is per GB per hour
    'e2_standard': {
        'cpu': dict(ondemand=0.021811, preempt=0.006543),
        'mem': dict(ondemand=0.002923, preempt=0.000877),
    },
    'e2_custom': {
        'cpu': dict(ondemand=0.022890, preempt=0.006867),
        'mem': dict(ondemand=0.003067, preempt=0.000920),
    },
    'n1_standard': {
        'cpu': dict(ondemand=0.031611, preempt=0.006655),
        'mem': dict(ondemand=0.004237, preempt=0.000892),
    },
    'n1_custom': {
        'cpu': dict(ondemand=0.033174, preempt=0.00698),
        'mem': dict(ondemand=0.004446, preempt=0.00094),
    },
    'n1_extended': {
        'mem': dict(ondemand=0.009550, preempt=0.002014),
    },

    # per GPU per hour
    'gpu': {
        'p100': dict(ondemand=1.46, preempt=0.43),
        'v100': dict(ondemand=2.48, preempt=0.74),
    },

    # per GB per hour
    'disk': {
        disk_type: price / _HOURS_PER_MONTH
        for disk_type, price in [
            ('standard', 0.040),
            ('ssd', 0.170),
            ('balanced', 0.100),
            ('extreme', 0.125),
        ]
    },

    # per GB per hour
    'storage': {
        storage_class: price / _HOURS_PER_MONTH
        for storage_class, price in [
            ('standard', 0.0200),
            ('nearline', 0.0100),
            ('coldline', 0.0040),
            ('archive', 0.0012),
        ]
    },

    # per GB
    'storage_egress': {
        '0.TB': 0.12,
        '1.TB': 0.11,
        '10.TB': 0.08,
    },

    # per 10,000 ops
    'storage_ops': {
        'standard': dict(class_a=0.050, class_b=0.004),
        'nearline': dict(class_a=0.100, class_b=0.010),
        'coldline': dict(class_a=0.100, class_b=0.050),
        'archive': dict(class_a=0.500, class_b=0.500),
    },
}



def gcp_cost_per_hour(x):
    """
    Estimate the hourly cost of running the machine described by `x`,
    using the preemptible n1_custom rates in `gcp_rates`.

    :param x: labelled record (e.g. a pandas Series / dataframe row). The
        entries `cpus`, `memory`, `disk` and `hardware_type` are read when
        present in `x.index`; missing ones default to 2 cpus, 8 GB of memory,
        520 GB of disk and no GPU.
    :return: sum of the cpu, memory, extended memory, gpu and disk rates
    """
    n1_custom = gcp_rates['n1_custom']

    # resource sizes, falling back to defaults for absent fields
    cpus = x['cpus'] if 'cpus' in x.index else 2
    memory_GB = x['memory'] if 'memory' in x.index else 8
    disk_GB = x['disk'] if 'disk' in x.index else 520

    # memory up to 6.5 GB per cpu is charged at the regular memory rate,
    # anything beyond that at the extended memory rate
    regular_memory_GB = cpus * 6.5

    cpu_cost = n1_custom['cpu']['preempt'] * cpus
    memory_cost = n1_custom['mem']['preempt'] * min(memory_GB, regular_memory_GB)

    extended_memory_cost = 0.0
    if memory_GB > regular_memory_GB:
        extended_memory_cost = gcp_rates['n1_extended']['mem']['preempt'] * (memory_GB - regular_memory_GB)

    # any hardware type other than 'cpu' is charged as a single gpu;
    # types without a listed rate are priced as a v100
    gpu_cost = 0.0
    if 'hardware_type' in x.index and x['hardware_type'] != 'cpu':
        gpus = 1
        hardware_type = x['hardware_type']
        gpu_type = hardware_type if hardware_type in {'p100', 'v100'} else 'v100'
        gpu_cost = gcp_rates['gpu'][gpu_type]['preempt'] * gpus

    disk_cost = gcp_rates['disk']['standard'] * disk_GB

    # accumulate in the same order as the individual components above
    return 0.0 + cpu_cost + memory_cost + extended_memory_cost + gpu_cost + disk_cost



def gcp_storage_cost(storage_GB, storage_hr=0.0, egress=True, n_ops_a=0, n_ops_b=0):
    """
    Estimate the cost of keeping and downloading data in standard-class
    storage, using the rates in `gcp_rates`.

    :param storage_GB: amount of data in GB
    :param storage_hr: number of hours the data is stored
    :param egress: whether to include the cost of downloading the data once
    :param n_ops_a: number of class A operations
    :param n_ops_b: number of class B operations
    :return: sum of the storage, egress and operation costs
    """
    # tier boundaries of the egress price list, in GB
    tier_1_start_GB = 1024
    tier_10_start_GB = 10240

    storage_cost = 0.0

    # compute storage cost
    storage_cost += gcp_rates['storage']['standard'] * storage_GB * storage_hr

    # compute egress cost: every GB is charged in exactly one tier, so the
    # middle tier is capped at its own width (1 TB up to 10 TB = 9216 GB)
    if egress:
        egress_rates = gcp_rates['storage_egress']

        tier_0_GB = min(storage_GB, tier_1_start_GB)
        tier_1_GB = min(max(0, storage_GB - tier_1_start_GB), tier_10_start_GB - tier_1_start_GB)
        tier_10_GB = max(0, storage_GB - tier_10_start_GB)

        storage_cost += egress_rates['0.TB'] * tier_0_GB
        storage_cost += egress_rates['1.TB'] * tier_1_GB
        storage_cost += egress_rates['10.TB'] * tier_10_GB

    # compute ops cost (rates are quoted per 10,000 operations)
    storage_cost += gcp_rates['storage_ops']['standard']['class_a'] * n_ops_a / 10000
    storage_cost += gcp_rates['storage_ops']['standard']['class_b'] * n_ops_b / 10000

    return storage_cost

In [None]:
# compute gcp costs for a few example operations
x = pd.Series(
    [2, 8, 'v100', 520],
    index=['cpus', 'memory', 'hardware_type', 'disk']
)
storage_GB = 1000

# hourly price of the example machine
vm_cost_per_hour = gcp_cost_per_hour(x)
print('custom VM w/ V100: $%0.2f / hour' % (vm_cost_per_hour))

# storage only, for one hour and for 730 hours (labelled as a month)
storage_cost_per_hour = gcp_storage_cost(storage_GB, storage_hr=1, egress=False)
print('store 1 TB:        $%0.3f / hour' % (storage_cost_per_hour))

storage_cost_per_month = gcp_storage_cost(storage_GB, storage_hr=730, egress=False)
print('store 1 TB:        $%0.0f / month' % (storage_cost_per_month))

# egress only
download_cost = gcp_storage_cost(storage_GB, storage_hr=0, egress=True)
print('download 1 TB:     $%0.0f' % (download_cost))

### Estimate Cost (Palmetto)

In [None]:
process_name = 'similarity_chunk'
target = 'runtime_hr'

# get performance data for pipeline / process
df = data_map['breast-palmetto'][process_name]

# HACK: minor adjustments
df = df.sort_values('hardware_type')

# estimate cost: each row is one chunk, so scale its runtime by the number of
# chunks before multiplying by the hourly rate of the machine
cost_per_hour = df.apply(gcp_cost_per_hour, axis=1)

df['total_runtime_hr'] = df['runtime_hr'] * df['chunks']
df['gcp_cost_usd'] = df['total_runtime_hr'] * cost_per_hour

# shared plot settings: one facet per n_rows value, one line per hardware type
col = 'n_rows'
x = 'n_cols'
hue = 'hardware_type'

colors = plt.get_cmap('tab10').colors
palette = dict(zip(['cpu', 'p100', 'v100'], colors))

# plot runtime, then estimated cost, for various dataset sizes
plots = [
    ('total_runtime_hr', ['04-kinc-palmetto-runtime.pdf', '04-kinc-palmetto-runtime.png']),
    ('gcp_cost_usd', ['04-kinc-palmetto-cost.pdf', '04-kinc-palmetto-cost.png']),
]

for y, output_files in plots:
    g = sns.FacetGrid(df, col=col, sharey=False)
    g.map_dataframe(
        sns.pointplot,
        x=x,
        y=y,
        hue=hue,
        markers='x',
        linestyles='--',
        palette=palette
    )
    g.add_legend(title=hue)

    for output_file in output_files:
        plt.savefig(output_file)

    plt.show()

### Estimate Cost (Google)

In [None]:
# processes whose outputs count towards the stored / downloaded data
output_processes = {'similarity_merge'}
compute_cost = 0.0
storage_GB = 0.0

for process_name, df in data_map['unified-google'].items():
    print()
    print(process_name)

    # get performance data for pipeline / process
    if not len(df):
        print('no data, skipping')
        continue

    # estimate runtime cost
    # (df is the table held in data_map, so the new column is stored there too)
    runtime_hr = df['runtime_hr']
    df['runtime_cost'] = runtime_hr * df.apply(gcp_cost_per_hour, axis=1)

    # print results
    process_cost = df['runtime_cost'].sum()
    print('compute cost: $%0.2f' % (process_cost))

    process_GB = df['disk_GB'].sum()
    print('output data: %0.0f GB' % (process_GB))

    # update totals
    compute_cost += process_cost

    if process_name not in output_processes:
        print('note: intermediate outputs excluded from total storage/egress cost')
    else:
        storage_GB += process_GB

# estimate storage cost (730 hours of storage, and a single download)
storage_cost = gcp_storage_cost(storage_GB, storage_hr=730, egress=False)
egress_cost = gcp_storage_cost(storage_GB, storage_hr=0, egress=True)

# print results
summary_lines = [
    '',
    'total output data: %0.0f GB' % (storage_GB),
    '',
    'total compute cost: $%0.2f' % (compute_cost),
    'total storage cost: $%0.2f / month' % (storage_cost),
    'total egress cost:  $%0.2f' % (egress_cost),
]

for line in summary_lines:
    print(line)

### Estimate Cost (Palmetto/Google)

In [None]:
process_name = 'similarity_chunk'
target = 'runtime_hr'

# get performance data for pipeline / process
# (palmetto runs with hardware_type 'cpu' are left out)
df_palmetto = data_map['breast-palmetto'][process_name]
not_cpu = df_palmetto['hardware_type'] != 'cpu'
df_palmetto = df_palmetto[not_cpu]

df_google_a = data_map['breast-google'][process_name]
df_google_b = data_map['unified-google'][process_name]

df = pd.concat([df_palmetto, df_google_a, df_google_b])
inputs = config['train_inputs'][process_name]

# remove samples with missing data; the cost columns are dropped first
# (runtime_cost is added to the unified-google tables by the Google cost cell)
df = df.drop(columns=['gcp_cost', 'runtime_cost'], errors='ignore')
df = df.dropna()

# compute ground truth cost
runtime = df[target]
df['gcp_cost_usd'] = runtime * df.apply(gcp_cost_per_hour, axis=1)

# remove inputs that have constant value
# (this builds a new list, so the list stored in config is left alone)
inputs = list(filter(lambda c: df[c].nunique() > 1, inputs))

# append system metrics if appropriate: they take the place of hardware_type
system_metrics = [
    'cpu_flops',
    'cpu_mem_bw',
    'disk_read',
    'disk_write',
    'gpu_flops',
    'gpu_mem_bw',
]

if 'hardware_type' in inputs:
    inputs.remove('hardware_type')
    inputs.extend(system_metrics)

# perform parameter sweep: one model per palmetto n_rows value, each named
# after that value as a fraction of the largest one
df_scores = []
y_preds = {}
c_preds = {}

n_rows_max = df_palmetto['n_rows'].max()
n_rows_values = sorted(df_palmetto['n_rows'].unique())
names = ['%0.2f' % (n_rows / n_rows_max) for n_rows in n_rows_values]

for name, n_rows in zip(names, n_rows_values):
    # extract datasets for system A, system B
    #   A: palmetto runs up to n_rows, plus google runs of 'breast.001*' datasets
    #   B: all other google runs
    on_palmetto = df['platform'] == 'palmetto'
    on_google = df['platform'] == 'google'
    is_breast_001 = df['dataset'].str.startswith('breast.001')

    df_a = df[(on_palmetto & (df['n_rows'] <= n_rows)) | (on_google & is_breast_001)]
    df_b = df[on_google & (~is_breast_001)]

    # create train/test sets for df_a, df_b
    X_train, y_train, columns, _ = create_dataset(df_a, inputs, target)
    X_test, y_test, _, _ = create_dataset(df_b, inputs, target)

    # define model
    n_features = X_train.shape[1]
    model = create_pipeline(create_mlp(n_features, intervals=True))

    # train model
    model.fit(X_train, y_train)

    # evaluate model
    y_bar, y_std = check_std(model.predict(X_test))
    y_lower, y_upper = predict_intervals(y_bar, y_std)

    mae = mean_absolute_error(y_test, y_bar)
    mpe = mean_absolute_percentage_error(y_test, y_bar)
    cov = coverage_error(y_test, y_lower, y_upper)

    # save metrics for plots
    df_scores.append(dict(name=name, mae=mae, mpe=mpe, cov=cov))

    # save predictions for plots
    y_preds[name] = (y_bar, y_std)

    # save cost predictions: predicted runtime (and its std) scaled by the
    # hourly rate of each test sample
    cost_per_hour = df_b.apply(gcp_cost_per_hour, axis=1)

    c_bar = y_bar * cost_per_hour
    c_std = y_std * cost_per_hour
    c_preds[name] = (c_bar, c_std)

# save results
df_scores = pd.DataFrame(df_scores)

# plot evaluation scores for each model
fig, axes = plt.subplots(1, 3, figsize=(12, 4))

# one panel per metric: (score column, y-axis label, height of the dashed
# red reference line); df_b is the test set left over from the sweep above
score_panels = [
    ('mae', 'MAE (%s)' % (UNITS[target]), df_b[target].median()),
    ('mpe', 'MAPE (%)', 20),
    ('cov', 'Coverage Error (%)', 5),
]

for ax, (score, ylabel, reference) in zip(axes, score_panels):
    sns.barplot(x='name', y=score, data=df_scores, ci=68, color='tab:blue', ax=ax)
    ax.set_xlabel('Model')
    ax.set_ylabel(ylabel)
    ax.plot(ax.get_xlim(), [reference, reference], 'r--')

plt.tight_layout()
plt.show()

# plot coverage profile for each model
plt.figure(figsize=(4, 4))

for name, n_rows in zip(names, n_rows_values):
    # extract datasets for system A, system B
    on_palmetto = df['platform'] == 'palmetto'
    on_google = df['platform'] == 'google'
    is_breast_001 = df['dataset'].str.startswith('breast.001')

    df_a = df[(on_palmetto & (df['n_rows'] <= n_rows)) | (on_google & is_breast_001)]
    df_b = df[on_google & (~is_breast_001)]

    # create train/test sets for df_a, df_b
    # (only y_test is used by the rest of this loop)
    X_train, y_train, columns, _ = create_dataset(df_a, inputs, target)
    X_test, y_test, _, _ = create_dataset(df_b, inputs, target)

    # get model predictions
    y_bar, y_std = y_preds[name]

    # compute coverage profile: for every confidence level from 0.00 to 0.99,
    # measure the coverage of the corresponding prediction intervals
    ci_values = np.arange(0.00, 1.00, 0.01)
    cov_values = np.zeros_like(ci_values)

    for i, ci in enumerate(ci_values):
        y_lower, y_upper = predict_intervals(y_bar, y_std, ci=ci)
        coverage = prediction_interval_coverage(y_test, y_lower, y_upper)
        cov_values[i] = coverage

    # plot coverage profile (both axes in percent)
    plt.plot(100 * ci_values, 100 * cov_values, label=name)

# dashed diagonal: coverage equal to the confidence level
plt.plot([0, 100], [0, 100], 'k--', zorder=0)
plt.legend(title='model')
plt.xlabel('Confidence Interval (%)')
plt.ylabel('Coverage (%)')
plt.tight_layout()
plt.show()

# plot expected vs predicted target values for each model
fig, axes = plt.subplots(1, len(names), figsize=(4 * len(names), 4), squeeze=False)

for name, n_rows, ax in zip(names, n_rows_values, axes.flatten()):
    # extract datasets for system A, system B
    df_a = df[((df['platform'] == 'palmetto') & (df['n_rows'] <= n_rows)) | ((df['platform'] == 'google') & df['dataset'].str.startswith('breast.001'))]

    # take an explicit copy: columns are added to df_b below, and writing them
    # onto a filtered selection of df triggers pandas' SettingWithCopyWarning
    df_b = df[(df['platform'] == 'google') & (~df['dataset'].str.startswith('breast.001'))].copy()

    # create train/test sets for df_a, df_b
    # (only y_test is used by the rest of this loop)
    X_train, y_train, columns, _ = create_dataset(df_a, inputs, target)
    X_test, y_test, _, _ = create_dataset(df_b, inputs, target)

    # get model predictions (saved under the model name by the parameter sweep)
    y_bar, y_std = y_preds[name]
    y_lower, y_upper = predict_intervals(y_bar, y_std)

    # save model predictions
    target_pred = '%s | %s' % (target, name)
    df_b[target_pred] = y_bar

    # save anomaly mask
    anomaly_pred = 'anomaly | %s' % (name)
    y_anomaly = anomaly_score(y_test, y_bar, y_std)
    df_b[anomaly_pred] = (np.abs(y_anomaly) > 0.997)

    # compute error bars: first row is the distance down to the lower bound,
    # second row the distance up to the upper bound
    yerr = np.stack([
        y_bar - y_lower,
        y_upper - y_bar
    ])

    # create scatterplot: regular samples in blue ...
    mask = ~df_b[anomaly_pred]
    ax.errorbar(
        x=target,
        y=target_pred,
        yerr=yerr[:, mask],
        data=df_b[mask],
        ecolor='tab:blue', c='tab:blue', ls='', marker='o', mec='w')

    # ... and samples flagged as anomalies in red
    mask = df_b[anomaly_pred]
    ax.errorbar(
        x=target,
        y=target_pred,
        yerr=yerr[:, mask],
        data=df_b[mask],
        ecolor='tab:red', c='tab:red', ls='', marker='o', mec='w')

    # dashed diagonal: predicted value equal to the expected value
    vmax = max(df_b[target].max(), df_b[target_pred].max())
    ax.plot([0, vmax], [0, vmax], 'k--', zorder=0)
    ax.set_xlabel(target)
    ax.set_ylabel(target_pred)

    # pull the score out as a scalar; formatting a one-element Series with
    # '%f' relies on an implicit Series-to-float conversion
    cov = df_scores.loc[df_scores['name'] == name, 'cov'].iloc[0]
    ax.set_title('Coverage = %0.0f %%' % (100 - cov))

plt.tight_layout()
plt.savefig('04-kinc-palmetto-google-runtime.pdf')
plt.savefig('04-kinc-palmetto-google-runtime.png')
plt.show()

# plot expected vs predicted cost for each model
fig, axes = plt.subplots(1, len(names), figsize=(4 * len(names), 4), squeeze=False)

for name, n_rows, ax in zip(names, n_rows_values, axes.flatten()):
    # extract datasets for system A, system B
    df_a = df[((df['platform'] == 'palmetto') & (df['n_rows'] <= n_rows)) | ((df['platform'] == 'google') & df['dataset'].str.startswith('breast.001'))]

    # take an explicit copy: columns are added to df_b below, and writing them
    # onto a filtered selection of df triggers pandas' SettingWithCopyWarning
    df_b = df[(df['platform'] == 'google') & (~df['dataset'].str.startswith('breast.001'))].copy()

    # HACK: change target from runtime to cost
    # (target stays 'gcp_cost_usd' for any code that runs after this cell)
    target = 'gcp_cost_usd'

    # create train/test sets for df_a, df_b
    # (only y_test, the ground truth cost, is used by the rest of this loop)
    X_train, y_train, columns, _ = create_dataset(df_a, inputs, target)
    X_test, y_test, _, _ = create_dataset(df_b, inputs, target)

    # get model predictions: the cost predictions saved by the parameter sweep,
    # i.e. predicted runtime and its std scaled by the hourly rate
    y_bar, y_std = c_preds[name]
    y_lower, y_upper = predict_intervals(y_bar, y_std)

    # save model predictions
    target_pred = 'gcp_cost_usd | %s' % (name)
    df_b[target_pred] = y_bar

    # save anomaly mask
    anomaly_pred = 'anomaly | %s' % (name)
    y_anomaly = anomaly_score(y_test, y_bar, y_std)
    df_b[anomaly_pred] = (np.abs(y_anomaly) > 0.997)

    # compute error bars: first row is the distance down to the lower bound,
    # second row the distance up to the upper bound
    yerr = np.stack([
        y_bar - y_lower,
        y_upper - y_bar
    ])

    # create scatterplot: regular samples in blue ...
    mask = ~df_b[anomaly_pred]
    ax.errorbar(
        x=target,
        y=target_pred,
        yerr=yerr[:, mask],
        data=df_b[mask],
        ecolor='tab:blue', c='tab:blue', ls='', marker='o', mec='w')

    # ... and samples flagged as anomalies in red
    mask = df_b[anomaly_pred]
    ax.errorbar(
        x=target,
        y=target_pred,
        yerr=yerr[:, mask],
        data=df_b[mask],
        ecolor='tab:red', c='tab:red', ls='', marker='o', mec='w')

    # dashed diagonal: predicted cost equal to the expected cost
    vmax = max(df_b[target].max(), df_b[target_pred].max())
    ax.plot([0, vmax], [0, vmax], 'k--', zorder=0)
    ax.set_xlabel(target)
    ax.set_ylabel(target_pred)

plt.tight_layout()
plt.savefig('04-kinc-palmetto-google-cost.pdf')
plt.savefig('04-kinc-palmetto-google-cost.png')
plt.show()