## Getting Started

In [None]:
import os
import sys

# Silence TensorFlow's native (C++) logging. This has to be in the environment
# before TensorFlow is imported, otherwise the messages emitted during the
# import itself are still printed.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# make the project's scripts (utils, train) importable from the notebook
sys.path.append('../bin')

from IPython.display import display
import json
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import seaborn as sns
import sklearn.metrics
import sklearn.model_selection
import sklearn.preprocessing
from tensorflow import keras

from utils import (
    UNITS,
    anomaly_score,
    check_std,
    predict_intervals,
)

from train import (
    load_datasets,
    is_categorical,
    create_dataset,
    create_gb,
    create_lr,
    asym_loss,
    create_mlp,
    create_rf,
    create_pipeline,
    mean_absolute_percentage_error,
    prediction_interval_coverage,
    coverage_error,
    evaluate_trials,
    evaluate_cv,
)

## Load Datasets

In [None]:
# pipelines whose performance datasets are analyzed in this notebook
pipelines = [
    'gemmaker',
    'gene-oracle',
    'hemelb',
    'kinc',
    'tspg'
]

# load the params file of each pipeline; the context manager closes every
# file as soon as it has been parsed instead of leaking the handle
config_map = {}

for p in pipelines:
    with open('../workflows/%s/params.json' % (p)) as params_file:
        config_map[p] = json.load(params_file)

# load the datasets of each pipeline, keyed by pipeline name
data_map = {}

for p, c in config_map.items():
    # processes are the keys of 'train_inputs'; each merge argument is a
    # space-separated string that is split into its parts
    process_names = c['train_inputs'].keys()
    merge_files = [arg.split(' ') for arg in c['train_merge_args']]

    data_map[p] = load_datasets(p, process_names, base_dir='../_datasets', merge_files=merge_files)

In [None]:
# print one line per dataset: pipeline, process and number of rows
for pipeline, dfs in data_map.items():
    for process_name in dfs:
        n_rows = len(dfs[process_name].index)
        print('%16s %24s %8d' % (pipeline, process_name, n_rows))

In [None]:
# show every dataset in full, preceded by its pipeline / process name
for pipeline, dfs in data_map.items():
    for process_name in dfs:
        print(pipeline, process_name)
        print()
        display(dfs[process_name])
        print()

## Select Prediction Targets

In [None]:
# collect summary statistics of every candidate target, one row per
# (pipeline, process, target)
df_stats = []

for pipeline, dfs in data_map.items():
    for process_name, df in dfs.items():
        for target in ['runtime_hr', 'memory_GB', 'disk_GB']:
            values = df[target]

            df_stats.append({
                'pipeline': pipeline,
                'process_name': process_name,
                'target': target,
                'min': values.min(),
                'median': values.median(),
                'max': values.max(),
                'mean': values.mean(),
                'std': values.std()
            })

df_stats = pd.DataFrame(df_stats)

display(df_stats)

# A target is selected when its standard deviation exceeds the threshold.
# Every other target is excluded, including a std of exactly the threshold
# and a NaN std, so that each target lands in exactly one of the two lists.
std_threshold = 0.1
is_selected = df_stats['std'] > std_threshold
key_columns = ['pipeline', 'process_name', 'target']

targets_incl = list(df_stats.loc[is_selected, key_columns].itertuples(index=False, name=None))
targets_excl = list(df_stats.loc[~is_selected, key_columns].itertuples(index=False, name=None))

print('Selected %d prediction targets' % (len(targets_incl)))

## Resource Prediction

In [None]:
def make_plots(x_axes, y_axes, data, titles=None, outfile=None, **kwargs):
    """
    Draw a grid of plots with one row per x variable and one column per
    y variable, then show the figure.

    For each cell the plot type depends on the x variable: a histogram of
    the y column when x is None, a strip plot when is_categorical(data, x)
    is true, and a scatter plot otherwise.

    :param x_axes: sequence of column names (or None) for the grid rows
    :param y_axes: sequence of column names for the grid columns
    :param data: dataframe that holds the columns
    :param titles: optional titles for the top row, one per entry of y_axes
    :param outfile: optional path; the figure is saved there if given
    :param kwargs: extra keyword arguments passed on to plt.subplots()
    """
    # nothing to draw; plt.subplots() rejects a grid with zero rows/columns
    if len(x_axes) == 0 or len(y_axes) == 0:
        return

    # squeeze=False keeps axes two-dimensional even for a single row/column
    fig, axes = plt.subplots(
        len(x_axes), len(y_axes),
        figsize=(4 * len(y_axes), 4 * len(x_axes)),
        squeeze=False,
        **kwargs)

    for i, x in enumerate(x_axes):
        for j, y in enumerate(y_axes):
            ax = axes[i][j]

            if x is None:
                sns.histplot(data[y], ax=ax)
            elif is_categorical(data, x):
                sns.stripplot(x=x, y=y, data=data, ax=ax)
            else:
                sns.scatterplot(x=x, y=y, data=data, ax=ax)

            # only the top row carries column titles
            if i == 0 and titles is not None:
                ax.set_title(titles[j])

    plt.tight_layout()

    if outfile is not None:
        plt.savefig(outfile)

    plt.show()

### Evaluate Excluded Targets

In [None]:
# initialize subplots
n_cols = 3
n_rows = 4
fig, axes = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 4 * n_rows))

# flatten grid of axes
axes = axes.flatten()

# sample from excluded prediction targets; never request more targets than
# exist, because random.sample() raises ValueError in that case
n_sampled = min(len(targets_excl), n_rows * n_cols)
targets_sampled = random.sample(targets_excl, n_sampled)

for (pipeline, process_name, target), ax in zip(targets_sampled, axes):
    # get target data
    df = data_map[pipeline][process_name]
    y_true = df[target]

    # compute mean and std
    y_bar = np.mean(y_true)
    y_std = np.std(y_true)
    y_err = 2.0 * y_std

    # get recommended resource request
    y_recc = np.ceil(np.max(y_true))

    # plot target data with mean and a +/- 2 std band
    x_span = [0, len(y_true) - 1]

    ax.plot(np.arange(len(y_true)), y_true, ls='', marker='.')
    ax.plot(x_span, [y_bar, y_bar], 'r--')
    ax.plot(x_span, [y_bar + y_err, y_bar + y_err], 'r:')
    ax.plot(x_span, [y_bar - y_err, y_bar - y_err], 'r:')
    ax.set_title('ceil(max) = %0.0f %s' % (y_recc, UNITS[target]))
    ax.set_ylabel('%s / %s / %s' % (pipeline, process_name, target))

# hide the panels that received no target
for ax in axes[n_sampled:]:
    ax.set_visible(False)

plt.tight_layout()
plt.savefig('01-excluded-targets.pdf')
plt.savefig('01-excluded-targets.png')
plt.show()

### Evaluate Selected Targets

In [None]:
# For every selected target: cross-validate each model, plot the evaluation
# scores, the expected vs predicted values and the per-input marginals.
for pipeline, process_name, target in targets_incl:
    print()
    print(pipeline, process_name, target)

    # get performance data for pipeline / process
    df = data_map[pipeline][process_name]
    inputs = config_map[pipeline]['train_inputs'][process_name]

    # remove inputs that have constant value
    inputs = [c for c in inputs if df[c].nunique() > 1]

    # skip if there are no input features
    if len(inputs) == 0:
        print('no input features, skipping')
        continue

    # extract performance dataset
    X, y, columns, _ = create_dataset(df, inputs, target)

    # define models
    models = [
        ('mlp', create_mlp(X.shape[1])),
        ('rf', create_rf()),
    ]

    # prepend scaler to each model
    models = [(name, create_pipeline(model)) for name, model in models]

    # evaluate each model on dataset
    df_scores = []
    y_preds = {}

    for name, model in models:
        # evaluate model
        scores, y_bar, y_std = evaluate_cv(model, X, y)

        # save metrics for plots
        df_scores.append({
            'name': name,
            'mae': scores['mae'],
            'mpe': scores['mpe'],
            'cov': scores['cov']
        })

        # save predictions for plots
        y_preds[name] = y_bar, y_std

    # save results
    df_scores = pd.DataFrame(df_scores)

    # plot evaluation scores for each model, each with a dashed reference
    # line: the median of the target for MAE, fixed thresholds for the rest.
    # The median must be taken from the target column only; df.median()
    # would return one value per column of the whole dataframe.
    score_panels = [
        ('mae', 'MAE (%s)' % (UNITS[target]), df[target].median()),
        ('mpe', 'MAPE (%)', 20),
        ('cov', 'Coverage Error (%)', 5),
    ]

    fig, axes = plt.subplots(1, 3, figsize=(12, 4))

    for ax, (metric, ylabel, reference) in zip(axes, score_panels):
        sns.barplot(x='name', y=metric, data=df_scores, ci=68, color='tab:blue', ax=ax)
        ax.set_xlabel('Model')
        ax.set_ylabel(ylabel)
        ax.plot(ax.get_xlim(), [reference, reference], 'r--')

    plt.tight_layout()
    plt.show()

    # plot expected vs predicted target values for each model
    # (squeeze=False keeps axes indexable even with a single model)
    fig, axes = plt.subplots(1, len(models), figsize=(4 * len(models), 4), squeeze=False)

    for (name, model), ax in zip(models, axes[0]):
        # get model predictions
        y_bar, y_std = y_preds[name]

        # save model predictions as a new column of the dataset
        target_pred = '%s | %s' % (target, name)
        df[target_pred] = y_bar

        # create scatterplot
        ax.errorbar(
            x=target,
            y=target_pred,
            data=df,
            ecolor='tab:blue', c='tab:blue', ls='', marker='o', mec='w')

        # draw the identity line (black) and two lines of slope 1.2 and
        # 1 / 1.2 around it (red)
        vmax = max(df[target].max(), df[target_pred].max())
        ax.plot([0, vmax], [0, vmax], 'k--', zorder=0)
        ax.plot([0, vmax / 1.2], [0, vmax], 'r--', zorder=0)
        ax.plot([0, vmax], [0, vmax / 1.2], 'r--', zorder=0)
        ax.set_xlabel(target)
        ax.set_ylabel(target_pred)

    plt.tight_layout()
    plt.show()

    # plot side-by-side of each input feature
    y_axes = [target] + ['%s | %s' % (target, name) for name, model in models]
    titles = ['Expected'] + ['Predicted (%s)' % (name) for name, model in models]

    make_plots(
        inputs,
        y_axes,
        df,
        titles=titles,
        sharey='row')

### Evaluate Selected Targets (with Intervals)

In [None]:
# For every selected target: cross-validate each interval-producing model,
# plot scores, coverage profile, expected vs predicted values (with
# anomalies highlighted) and per-input marginals, and collect the scores
# into a results table that is written to 01-resource-prediction.csv.
df_results = []

# hue settings for the marginal plots, one entry per column of the grid:
# expected values, mlp predictions, rf predictions
marginal_styles = [
    {
        'hue': 'anomaly_type',
        'hue_order': ['none', 'mlp', 'rf', 'mlp+rf'],
        'sizes': {'none': 30, 'mlp': 120, 'rf': 120, 'mlp+rf': 120},
        'markers': {'none': 'o', 'mlp': 'X', 'rf': 'X', 'mlp+rf': 'X'},
        'palette': {'none': 'tab:blue', 'mlp': 'tab:orange', 'rf': 'tab:red', 'mlp+rf': 'tab:pink'},
    },
    {
        'hue': 'anomaly | mlp',
        'hue_order': None,
        'sizes': {False: 30, True: 120},
        'markers': {False: 'o', True: 'X'},
        'palette': {False: 'tab:blue', True: 'tab:orange'},
    },
    {
        'hue': 'anomaly | rf',
        'hue_order': None,
        'sizes': {False: 30, True: 120},
        'markers': {False: 'o', True: 'X'},
        'palette': {False: 'tab:blue', True: 'tab:red'},
    },
]

for pipeline, process_name, target in targets_incl:
    print()
    print(pipeline, process_name, target)

    # get performance data for pipeline / process
    df = data_map[pipeline][process_name]
    inputs = config_map[pipeline]['train_inputs'][process_name]

    # remove inputs that have constant value
    inputs = [c for c in inputs if df[c].nunique() > 1]

    # skip if there are no input features
    if len(inputs) == 0:
        print('no input features, skipping')
        continue

    # compute summary statistics
    row = {
        'pipeline': pipeline,
        'process': process_name,
        'target': target,
        'median': df[target].median()
    }

    # extract performance dataset
    X, y, columns, _ = create_dataset(df, inputs, target)

    # define models
    models = [
        ('mlp', create_mlp(X.shape[1], intervals=True)),
        ('rf', create_rf(intervals=True)),
    ]

    # prepend scaler to each model
    models = [(name, create_pipeline(model)) for name, model in models]

    # evaluate each model on dataset
    df_scores = []
    y_preds = {}

    for name, model in models:
        # evaluate model
        scores, y_bar, y_std = evaluate_cv(model, X, y)

        # save metrics for results dataframe
        for metric in ('mae', 'mpe', 'cov'):
            row['%s | %s' % (name, metric)] = scores[metric]

        # save metrics for plots
        df_scores.append({
            'name': name,
            'mae': scores['mae'],
            'mpe': scores['mpe'],
            'cov': scores['cov']
        })

        # save predictions for plots
        y_preds[name] = y_bar, y_std

    # save results
    df_results.append(row)
    df_scores = pd.DataFrame(df_scores)

    # plot evaluation scores for each model, each with a dashed reference
    # line: the median of the target for MAE, fixed thresholds for the rest
    score_panels = [
        ('mae', 'MAE (%s)' % (UNITS[target]), row['median']),
        ('mpe', 'MAPE (%)', 20),
        ('cov', 'Coverage Error (%)', 5),
    ]

    fig, axes = plt.subplots(1, 3, figsize=(12, 4))

    for ax, (metric, ylabel, reference) in zip(axes, score_panels):
        sns.barplot(x='name', y=metric, data=df_scores, ci=68, color='tab:blue', ax=ax)
        ax.set_xlabel('Model')
        ax.set_ylabel(ylabel)
        ax.plot(ax.get_xlim(), [reference, reference], 'r--')

    plt.tight_layout()
    plt.show()

    # plot coverage profile for each model: the coverage obtained at every
    # confidence level from 0% to 99%
    plt.figure(figsize=(4, 4))

    ci_values = np.arange(0.00, 1.00, 0.01)

    for name, model in models:
        # get model predictions
        y_bar, y_std = y_preds[name]

        # compute coverage profile
        cov_values = np.zeros_like(ci_values)

        for i, ci in enumerate(ci_values):
            y_lower, y_upper = predict_intervals(y_bar, y_std, ci=ci)
            cov_values[i] = prediction_interval_coverage(y, y_lower, y_upper)

        # plot coverage profile
        plt.plot(100 * ci_values, 100 * cov_values, label=name)

    plt.plot([0, 100], [0, 100], 'k--', zorder=0)
    plt.legend(title='model')
    plt.xlabel('Confidence Interval (%)')
    plt.ylabel('Coverage (%)')
    plt.tight_layout()
    plt.show()

    # plot expected vs predicted target values for each model
    fig, axes = plt.subplots(1, len(models), figsize=(4 * len(models), 4))

    for (name, model), ax in zip(models, axes):
        # get model predictions (intervals at predict_intervals' default level)
        y_bar, y_std = y_preds[name]
        y_lower, y_upper = predict_intervals(y_bar, y_std)

        # save model predictions
        target_pred = '%s | %s' % (target, name)
        df[target_pred] = y_bar

        # save anomaly mask
        # NOTE(review): the meaning of the 0.997 cutoff depends on what
        # utils.anomaly_score returns -- confirm there
        anomaly_pred = 'anomaly | %s' % (name)
        y_anomaly = anomaly_score(y, y_bar, y_std)
        df[anomaly_pred] = (np.abs(y_anomaly) > 0.997)

        # compute error bars (row 0: lower lengths, row 1: upper lengths)
        yerr = np.stack([
            y_bar - y_lower,
            y_upper - y_bar
        ])

        # create scatterplot: regular samples in blue, anomalies in red
        is_anomaly = df[anomaly_pred]

        for mask, color in [(~is_anomaly, 'tab:blue'), (is_anomaly, 'tab:red')]:
            ax.errorbar(
                x=target,
                y=target_pred,
                yerr=yerr[:, mask],
                data=df[mask],
                ecolor=color, c=color, ls='', marker='o', mec='w')

        vmax = max(df[target].max(), df[target_pred].max())
        ax.plot([0, vmax], [0, vmax], 'k--', zorder=0)
        ax.set_xlabel(target)
        ax.set_ylabel(target_pred)

    plt.tight_layout()
    plt.savefig('%s-%s-%s-scatter.pdf' % (pipeline, process_name, target))
    plt.savefig('%s-%s-%s-scatter.png' % (pipeline, process_name, target))
    plt.show()

    # categorize anomalies by which model(s) flagged the sample
    df['anomaly'] = df['anomaly | mlp'] | df['anomaly | rf']
    df['anomaly_type'] = 'none'
    df.loc[ df['anomaly | mlp'] & ~df['anomaly | rf'], 'anomaly_type'] = 'mlp'
    df.loc[~df['anomaly | mlp'] &  df['anomaly | rf'], 'anomaly_type'] = 'rf'
    df.loc[ df['anomaly | mlp'] &  df['anomaly | rf'], 'anomaly_type'] = 'mlp+rf'

    # plot side-by-side of each input feature
    x_axes = inputs
    y_axes = [target] + ['%s | %s' % (target, name) for name, model in models]
    titles = ['Expected'] + ['Predicted (%s)' % (name) for name, model in models]
    data = df

    fig, axes = plt.subplots(
        len(x_axes), len(y_axes),
        figsize=(4 * len(y_axes), 4 * len(x_axes)),
        squeeze=False,
        sharey='row')

    for i, x_col in enumerate(x_axes):
        # the loop variables are deliberately not named x / y so that the
        # target array y extracted above is not overwritten
        for j, y_col in enumerate(y_axes):
            ax = axes[i][j]
            style = marginal_styles[j]

            if is_categorical(data, x_col):
                sns.stripplot(
                    x=x_col, y=y_col,
                    hue=style['hue'], hue_order=style['hue_order'],
                    data=data, dodge=True, palette=style['palette'], ax=ax)
            else:
                sns.scatterplot(
                    x=x_col, y=y_col,
                    hue=style['hue'], hue_order=style['hue_order'],
                    size=style['hue'], sizes=style['sizes'],
                    style=style['hue'], markers=style['markers'],
                    data=data, palette=style['palette'], ax=ax)

            # only the top row carries column titles
            if i == 0:
                ax.set_title(titles[j])

    plt.tight_layout()
    plt.savefig('%s-%s-%s-marginals.pdf' % (pipeline, process_name, target))
    plt.savefig('%s-%s-%s-marginals.png' % (pipeline, process_name, target))
    plt.show()

# save results to dataframe
df_results = pd.DataFrame(df_results)
df_results.set_index(['pipeline', 'process', 'target'], inplace=True)
df_results.to_csv('01-resource-prediction.csv', sep='\t')

### Evaluate Train/Test Splits

In [None]:
# For every selected target: evaluate the MLP on a range of training set
# sizes, record the smallest size that meets each score threshold, plot the
# scores per size, and write the summary to 01-train-size.csv.
df_results = []

for pipeline, process_name, target in targets_incl:
    print()
    print(pipeline, process_name, target)

    # performance data for this pipeline / process, restricted to the
    # input features that actually vary
    df = data_map[pipeline][process_name]
    inputs = [
        c for c in config_map[pipeline]['train_inputs'][process_name]
        if df[c].nunique() > 1
    ]

    # nothing to learn from without input features
    if not inputs:
        print('no input features, skipping')
        continue

    # build the dataset and the model
    X, y, columns, _ = create_dataset(df, inputs, target)
    n_total = X.shape[0]

    model = create_pipeline(create_mlp(X.shape[1], intervals=True))

    # run the trials for every training fraction
    train_sizes = np.arange(0.1, 1.0, 0.1)
    scores_map = evaluate_trials(model, X, y, train_sizes=train_sizes, n_trials=3)

    # gather scores and track the smallest fraction that meets each threshold
    score_frames = []
    min_train_size_mpe = 1.0
    min_train_size_cov = 1.0

    for train_size in train_sizes:
        split_scores = scores_map[train_size]

        if np.mean(split_scores['mpe']) <= 20:
            min_train_size_mpe = min(train_size, min_train_size_mpe)

        if np.mean(split_scores['cov']) <= 5:
            min_train_size_cov = min(train_size, min_train_size_cov)

        score_frames.append(pd.DataFrame({
            'name': '%d / %d' % (n_total * train_size, n_total),
            'mpe': split_scores['mpe'],
            'cov': split_scores['cov']
        }))

    df_scores = pd.concat(score_frames)

    # record the minimum number of samples for this target
    df_results.append({
        'name': '%s / %s / %s' % (pipeline, process_name, target),
        'model': 'mlp',
        'min_samples_mpe': min_train_size_mpe * n_total,
        'min_samples_cov': min_train_size_cov * n_total,
        'n_samples': n_total,
    })

    # one bar chart per metric, each with its threshold as a dashed line
    plt.subplots(2, 1, figsize=(12, 8))

    metric_panels = (
        ('mpe', 20, 'MAPE (%)'),
        ('cov', 5, 'Coverage Error (%)'),
    )

    for position, (metric, threshold, ylabel) in enumerate(metric_panels, start=1):
        plt.subplot(2, 1, position)
        sns.barplot(x='name', y=metric, data=df_scores, color='tab:blue')

        # remember the limits so the threshold line does not change them
        xmin, xmax = plt.xlim()
        plt.plot([xmin, xmax], [threshold, threshold], 'r--')
        plt.xlabel('Training Samples / Total Samples')
        plt.ylabel(ylabel)
        plt.xlim(xmin, xmax)

    plt.tight_layout()
    plt.savefig('%s-%s-%s-trainsize.pdf' % (pipeline, process_name, target))
    plt.savefig('%s-%s-%s-trainsize.png' % (pipeline, process_name, target))
    plt.show()

# write the per-target summary to disk
df_results = pd.DataFrame(df_results)
df_results.set_index('name', inplace=True)
df_results.to_csv('01-train-size.csv', sep='\t')

### Visualize Evaluation Summary

In [None]:
# load resource prediction results
df = pd.read_csv('01-resource-prediction.csv', sep='\t')

for idx, row in df.iterrows():
    df.loc[idx, 'name'] = '%s / %s / %s' % (row.pipeline, row.process, row.target)

fig, axes = plt.subplots(1, 2, figsize=(12, 12), sharey=True)

# plot mean relative error for each prediction target
ax = axes[0]
data = df.copy()
data['20'] = 20
sns.pointplot(y='name', x='20', data=data, color='tab:red', markers='', linestyles='--', ax=ax)

data['mlp'] = data['mlp | mpe']
data['rf'] = data['rf | mpe']
data = data.melt(id_vars=['name'], value_vars=['mlp', 'rf'], var_name='model', value_name='mpe')
sns.barplot(y='name', x='mpe', hue='model', data=data, ax=ax)
ax.set_xlim(0, 100)
ax.set_xlabel('MAPE (%)')
ax.set_ylabel('Name')

# plot coverage error for aech prediction target
ax = axes[1]
data = df.copy()
data['5'] = 5
sns.pointplot(y='name', x='5', data=data, color='tab:red', markers='', linestyles='--', ax=ax)

data['mlp'] = data['mlp | cov']
data['rf'] = data['rf | cov']
data = data.melt(id_vars=['name'], value_vars=['mlp', 'rf'], var_name='model', value_name='cov')
sns.barplot(y='name', x='cov', hue='model', data=data, ax=ax)
ax.set_xlim(0, 100)
ax.set_xlabel('Coverage Error (%)')
ax.set_ylabel('Name')

plt.tight_layout()
plt.savefig('01-resource-prediction.pdf')
plt.savefig('01-resource-prediction.png')
plt.show()

In [None]:
# load training size results
df = pd.read_csv('01-train-size.csv', sep='\t')

fig, axes = plt.subplots(1, 2, figsize=(12, 12), sharey=True)

# plot minimum samples required (mpe) for each target
ax = axes[0]
sns.pointplot(y='name', x='n_samples', data=df, color='tab:red', markers='x', linestyles='', ax=ax)
sns.barplot(y='name', x='min_samples_mpe', hue='model', data=df, zorder=-1, ax=ax)
ax.set_xlabel('Minimum Samples (MAPE)')
ax.set_ylabel('Name')

# plot minimum samples required (cov) for each target
ax = axes[1]
sns.pointplot(y='name', x='n_samples', data=df, color='tab:red', markers='x', linestyles='', ax=ax)
sns.barplot(y='name', x='min_samples_cov', hue='model', data=df, zorder=-1, ax=ax)
ax.set_xlabel('Minimum Samples (Coverage Error)')
ax.set_ylabel('Name')

plt.tight_layout()
plt.savefig('01-train-size.pdf')
plt.savefig('01-train-size.png')
plt.show()

### Evaluate MLP (with Training History)

In [None]:
# For every selected target: train the MLP on one train/test split, print
# its test scores, then plot the training history next to the expected vs
# predicted values obtained from cross-validation on the whole dataset.
for pipeline, process_name, target in targets_incl:
    print()
    print(pipeline, process_name, target)

    # performance data for this pipeline / process, restricted to the
    # input features that actually vary
    df = data_map[pipeline][process_name]
    inputs = [
        c for c in config_map[pipeline]['train_inputs'][process_name]
        if df[c].nunique() > 1
    ]

    # nothing to learn from without input features
    if not inputs:
        print('no input features, skipping')
        continue

    # build the dataset
    X, y, columns, _ = create_dataset(df, inputs, target)

    # hold out 20% of the samples for testing
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.2)

    # scale the features, fitting the scaler on the training part only
    scaler = sklearn.preprocessing.MaxAbsScaler().fit(X_train)
    X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

    # fit the network and keep the returned history for plotting
    model = create_mlp(X.shape[1], intervals=True)
    history = model.fit(X_train, y_train)

    # score the held-out predictions
    y_bar, y_std = check_std(model.predict(X_test))
    y_lower, y_upper = predict_intervals(y_bar, y_std)

    mae = sklearn.metrics.mean_absolute_error(y_test, y_bar)
    mpe = mean_absolute_percentage_error(y_test, y_bar)
    cov = coverage_error(y_test, y_lower, y_upper)

    print()
    print('mae: %0.3f %s' % (mae, UNITS[target]))
    print('mpe: %0.3f %%' % (mpe))
    print('cov: %0.3f %%' % (cov))

    # cross-validated predictions for every sample of the dataset
    model = create_pipeline(create_mlp(X.shape[1], intervals=True))
    _, y_bar, y_std = evaluate_cv(model, X, y)
    y_lower, y_upper = predict_intervals(y_bar, y_std)

    # store the predictions as a new column of the dataset
    target_pred = '%s | mlp' % (target)
    df[target_pred] = y_bar

    # coverage error and error bar lengths (lower, upper) for the scatterplot
    cov = coverage_error(y, y_lower, y_upper)
    yerr = [y_bar - y_lower, y_upper - y_bar]

    plt.subplots(1, 2, figsize=(8, 4))

    # left panel: training and validation loss per epoch
    plt.subplot(1, 2, 1)
    for loss_key in ('loss', 'val_loss'):
        plt.plot(history.history[loss_key])
    plt.title('Training History')
    plt.ylabel('MAE (%s)' % (UNITS[target]))
    plt.xlabel('Epoch')
    plt.legend(['train', 'val'], loc='upper left')

    # right panel: expected vs predicted values with the identity line
    plt.subplot(1, 2, 2)
    plt.errorbar(
        x=target,
        y=target_pred,
        yerr=yerr,
        data=df,
        ecolor='tab:blue', c='tab:blue', ls='', marker='o', mec='w')
    vmax = max(df[target].max(), df[target_pred].max())
    plt.plot([0, vmax], [0, vmax], 'k--', zorder=0)
    plt.title('cov = %0.3f %%' % (cov))
    plt.xlabel(target)
    plt.ylabel(target_pred)

    plt.tight_layout()
    plt.show()

### Create Specialized Visualizations

In [None]:
# per-process plot settings: the column on the x axis, the column used for
# color (hue) and the column that splits the figure into rows
plot_configs = {
    'download_runs': {
        'x': 'n_spots',
        'hue': 'n_remote_run_ids',
        'row': None
    },
    'fastq_dump': {
        'x': 'sra_bytes',
        'hue': None,
        'row': None
    },
    'hemelb': {
        'x': 'n_sites',
        'hue': 'np',
        'row': 'hardware_type'
    },
    'similarity_chunk': {
        'x': 'n_rows',
        'hue': 'chunks',
        'row': 'hardware_type'
    },
    'similarity_merge': {
        'x': 'n_rows',
        'hue': 'n_cols',
        'row': None
    },
    'similarity_mpi': {
        'x': 'n_rows',
        'hue': 'np',
        'row': 'hardware_type'
    },
    'corrpower': {
        'x': 'n_rows',
        'hue': 'n_cols',
        'row': None
    }
}

# The true-vs-predicted comparison grid is disabled by default (it used to
# sit behind an unconditional `continue`); set to True to draw it as well.
plot_model_comparison = False

for pipeline, process_name, target in targets_incl:
    print()
    print(pipeline, process_name, target)

    # get performance data for pipeline / process
    df = data_map[pipeline][process_name]
    inputs = config_map[pipeline]['train_inputs'][process_name]

    # remove inputs that have constant value
    inputs = [c for c in inputs if df[c].nunique() > 1]

    # skip if there are no input features
    if len(inputs) == 0:
        print('no input features, skipping')
        continue

    # skip if there is no plots config
    if process_name not in plot_configs:
        print('no plots config, skipping')
        continue

    # plot target data by itself
    config = plot_configs[process_name]
    x   = config['x']
    hue = config['hue']
    row = config['row']

    # order the facet rows; sort a copy (stable sort) so that the shared
    # dataframe in data_map keeps its original row order
    if row is not None:
        df = df.sort_values(by=row, kind='mergesort')

    # integer-like and string-like x columns get a strip plot, everything
    # else a scatter plot; `dodge` is only defined for the strip plot, so it
    # must not be forwarded to the scatter plot
    if df[x].dtype.kind in 'biuOSUV':
        plot_func = sns.stripplot
        plot_kwargs = {'dodge': True}
    else:
        plot_func = sns.scatterplot
        plot_kwargs = {}

    g = sns.FacetGrid(
        df,
        row=row,
        sharex=True,
        sharey='row',
        height=3,
        aspect=2,
        margin_titles=True
    )
    g.map_dataframe(
        plot_func,
        x=x,
        y=target,
        hue=hue,
        palette='viridis',
        **plot_kwargs
    )
    g.set_axis_labels(x, target)
    g.add_legend(title=hue)
    plt.savefig('%s-%s-%s.pdf' % (pipeline, process_name, target))
    plt.savefig('%s-%s-%s.png' % (pipeline, process_name, target))
    plt.show()

    if not plot_model_comparison:
        continue

    # extract performance dataset
    X, y, columns, _ = create_dataset(df, inputs, target)

    # train model
    model = create_pipeline(create_mlp(X.shape[1]))
    model.fit(X, y)

    # create dataframes of true values and model predictions; both are
    # copies so that the helper column below is not added to data_map
    df_true = df.copy()
    df_pred = df.copy()
    df_pred[target] = model.predict(X)

    # create merged dataframe
    df_true['data_type'] = 'true'
    df_pred['data_type'] = 'pred'
    data = pd.concat([df_true, df_pred])

    # create facet grid of data and model distributions
    if row is not None:
        data.sort_values(by=row, inplace=True, kind='mergesort')

    g = sns.FacetGrid(
        data,
        row=row,
        col='data_type',
        sharex=True,
        sharey='row',
        height=3,
        aspect=2,
        margin_titles=True
    )
    g.map_dataframe(
        sns.stripplot,
        x=x,
        y=target,
        hue=hue,
        palette='viridis'
    )
    g.set_axis_labels(x, target)
    g.add_legend(title=hue)
    plt.show()