In [None]:
%matplotlib inline
from matplotlib import pyplot as plt
# Figure export settings. `dir_postfix` is interpolated into the figure output
# directory name further down, and `render_format` selects the saved file type.
serif = False
render_format = "pdf"
dir_postfix = "" if serif else "sans"

plt.rcParams.update(
    {
        "font.family": "serif" if serif else "Liberation Sans",
        "font.size": 10,
        # Font type 42 embeds TrueType fonts in PDF/PS output.
        "pdf.fonttype": 42,
        "ps.fonttype": 42,
    }
)

from results import *

# Erratic Failure Analysis: Close Box Domain

In [2]:
# Experiment keys of every method reported in the main table.
close_exp_keys = [
    # Temporal consistency experiment keys.
    *get_temporal_consistency_exp_keys(
        pred_horizons=[16],
        sample_sizes=[32],
        error_fns=["mmd_rbf_all", "kde_kl_all_rev", "kde_kl_all_for", "mse_all"],
        aggr_fns=["min"],
    ),
    # Loss function experiment keys.
    *get_loss_function_exp_keys(
        loss_fns=["noise_pred_all", "temporal_noise_pred_all"],
        sample_sizes=[10],
    ),
    # Reconstruction experiment keys.
    *get_loss_function_exp_keys(
        loss_fns=["action_rec_all", "temporal_action_rec_all"],
        sample_sizes=[4],
    ),
    # Embedding experiment keys.
    *get_embedding_exp_keys(
        embeddings=["encoder_feat", "clip_feat", "resnet_feat"],
        score_fns=["mahal"],
    ),
    # Ensemble experiment keys.
    *get_ensemble_exp_keys(
        pred_horizons=[16],
        sample_sizes=[32],
        action_spaces=["all"],
    ),
]


def _load_close_seed(domain):
    """Compile the per-split metrics of one domain and aggregate them.

    Returns a `(per_split_metrics, aggregated_metrics)` pair for the "na" and
    "hh" splits, evaluated on `close_exp_keys`.
    """
    seed_metrics = compile_metrics(
        domain=domain,
        splits=["na", "hh"],
        exp_keys=close_exp_keys,
        return_test_data=True,
    )
    seed_aggr_metrics = aggregate_metrics(
        splits=["na", "hh"],
        exp_keys=close_exp_keys,
        data=seed_metrics,
    )
    return seed_metrics, seed_aggr_metrics


# Load results (main result, over three seeds).
close_metrics_0, close_aggr_metrics_0 = _load_close_seed("0527_close_4")
close_metrics_1, close_aggr_metrics_1 = _load_close_seed("0528_close_4")
close_metrics_2, close_aggr_metrics_2 = _load_close_seed("0529_close_4")

# Generate sentinel experiment keys.
stac_exp_keys = get_temporal_consistency_exp_keys(
    pred_horizons=[16],
    sample_sizes=[32],
    error_fns=["mmd_rbf_all"],
    aggr_fns=[""],
)
vlm_models = ["gpt-4o", "claude-3-5-sonnet-20240620", "gemini-1-5-pro"]
vlm_exp_keys = get_vlm_exp_keys(
    models=vlm_models,
    # Every model is queried with both prompt templates.
    templates={model: ["image_qa", "video_qa"] for model in vlm_models},
)

# Load results (VLM result, over one seed).
close_metrics_3 = compile_metrics(
    domain="0914_close_4",
    splits=["na", "hh"],
    exp_keys=stac_exp_keys + vlm_exp_keys,
    return_test_data=True,
    return_test_frame=True,
)
close_aggr_metrics_3 = aggregate_metrics(
    splits=["na", "hh"],
    exp_keys=stac_exp_keys + vlm_exp_keys,
    data=close_metrics_3,
)

## Main Table Result

In [None]:
# Divisor applied to "TP Time Mean" before printing.
# NOTE(review): presumably the same time scaling as the `time_mod=5.0` passed
# to `compute_sentinel_result` elsewhere in this notebook -- confirm.
TP_TIME_DIVISOR = 5
# Printed in place of a statistic when no value is available for it.
MISSING_STAT = -1


def _format_split_stat(metric, values):
    """Format the mean of `values` as one LaTeX table cell (with trailing `& `).

    Args:
        metric: Metric name; "TP Time Mean" is divided by TP_TIME_DIVISOR.
        values: Per-seed values of the metric; may be empty.

    Returns:
        The cell text. An empty `values` yields the MISSING_STAT sentinel,
        which is deliberately NOT time-scaled so it stays recognizable
        (previously it was divided as well and printed as -0.20).
    """
    if not values:
        return f"{MISSING_STAT:0.2f} & "
    stat = sum(values) / len(values)
    if metric == "TP Time Mean":
        stat = stat / TP_TIME_DIVISOR
    return f"{round(stat, 2):0.2f} & "


# Main-table rows: every method averaged over the three seeds.
# NOTE(review): these rows emit one empty column after "na" and two after
# "hh", while the VLM rows below emit two after each split -- confirm both
# layouts match the LaTeX tables they are pasted into.
seed_metrics = [close_metrics_0, close_metrics_1, close_metrics_2]
seed_aggr_metrics = [close_aggr_metrics_0, close_aggr_metrics_1, close_aggr_metrics_2]

for exp_key in close_exp_keys:
    row = f"{exp_key} & "

    # Dataset split result.
    for split in ["na", "hh"]:
        for metric in ["TPR", "TNR", "TP Time Mean"]:
            # Collect the metric over seeds.
            values = []
            for seed, d in enumerate(seed_metrics):
                metrics = d[split][exp_key]["metrics"]
                # NOTE(review): a zero TPR is excluded from the average for
                # the first seed only; the reason is not recorded here --
                # confirm this is intended.
                if metric == "TPR" and seed == 0 and metrics["TPR"] == 0:
                    continue
                # "TP Time Mean" is absent for some runs; skip those seeds.
                if metric == "TP Time Mean" and "TP Time Mean" not in metrics:
                    continue
                values.append(metrics[metric])
            row += _format_split_stat(metric, values)
        row += "& "
    row += "& "

    # Aggregate result, averaged over seeds.
    for metric in ["TPR", "TNR", "Accuracy"]:
        values = [d[exp_key]["metrics"][metric] for d in seed_aggr_metrics]
        stat = sum(values) / len(values)
        row += f"{stat:0.2f} & "

    # Drop the trailing "& " and terminate the LaTeX row.
    print(row[:-2] + "\\\\")


# VLM rows: a single seed, so no averaging is needed.
for exp_key in vlm_exp_keys:
    row = f"{exp_key} & "

    # Dataset split result.
    for split in ["na", "hh"]:
        metrics = close_metrics_3[split][exp_key]["metrics"]
        for metric in ["TPR", "TNR", "TP Time Mean"]:
            # Treat a missing "TP Time Mean" like the seed-averaged rows do
            # instead of raising KeyError.
            if metric == "TP Time Mean" and metric not in metrics:
                values = []
            else:
                values = [metrics[metric]]
            row += _format_split_stat(metric, values)
        row += "& & "

    # Aggregate result.
    for metric in ["TPR", "TNR", "Accuracy"]:
        stat = close_aggr_metrics_3[exp_key]["metrics"][metric]
        row += f"{stat:0.2f} & "

    print(row[:-2] + "\\\\")

In [None]:
# Generate sentinel experiment keys.
stac_exp_key = get_temporal_consistency_exp_keys(
    pred_horizons=[16],
    sample_sizes=[32],
    error_fns=["mmd_rbf_all"],
    aggr_fns=[""],
)[0]
# Use a cell-specific name: reassigning `vlm_exp_keys` here would shrink the
# list the main-table cell iterates over, so re-running that cell after this
# one would silently print a truncated table.
sentinel_vlm_exp_keys = get_vlm_exp_keys(
    models=["gpt-4o"],
    templates={"gpt-4o": ["video_qa"]}
)

# Compute sentinel results. The three return values are discarded; the call is
# kept for whatever output it produces.
_, _, _ = compute_sentinel_result(
    stac_exp_key=stac_exp_key,
    vlm_exp_keys_list=[sentinel_vlm_exp_keys],
    splits_list=[["na", "hh"]],
    metrics_list=[close_metrics_3],
    time_mod=5.0,
)

In [None]:
# Count test labels per split, pooled over all four result sets.
# Labels equal to True are reported as successes, False as failures.
# NOTE(review): `close_metrics_3` was compiled with the stac/VLM keys rather
# than `close_exp_keys` -- confirm `close_exp_keys[0]` is present in it.
exp_key = close_exp_keys[0]
all_seed_metrics = [close_metrics_0, close_metrics_1, close_metrics_2, close_metrics_3]

P_TOT = N_TOT = 0
for split in ["na", "hh"]:
    split_labels = [m[split][exp_key]["data"]["test_labels"] for m in all_seed_metrics]
    N = sum(np.sum(labels == True) for labels in split_labels)
    P = sum(np.sum(labels == False) for labels in split_labels)

    print(f"Split: {split} | Success: {N} | Failures: {P} | Rate: {N / (P + N):.2f}")
    N_TOT += N
    P_TOT += P

print(f"Split: Combined | Success: {N_TOT} | Failures: {P_TOT} | Rate: {N_TOT / (P_TOT + N_TOT):.2f}")


## Ablation Result: Cumulative Score Function

In [6]:
# Prepare normalized score curves and the detection threshold for the plot.
# NOTE(review): `quantile` and `data_utils` are not defined in any cell shown
# in this notebook -- presumably they come from `from results import *`;
# confirm, otherwise this cell fails on Restart & Run All.
mmd_exp_key = "pred_horizon_16_sample_size_32_error_fn_mmd_rbf_all"
close_metrics_0_hh = compile_metrics(
    domain="0527_close_4",
    splits=["hh"],
    exp_keys=[mmd_exp_key],
    return_test_data=True,
    return_test_frame=True,
    return_demo_frame=True,
)

# Extract test scores; the first two columns are dropped.
test_scores = get_detection_scores("hh", mmd_exp_key, close_metrics_0_hh)
p_scores = test_scores["P_scores"][:, 2:]
n_scores = test_scores["N_scores"][:, 2:]

# Shared min/max over both score sets, so both curves use one scale.
upper = max(p_scores.max(), n_scores.max())
lower = min(p_scores.min(), n_scores.min())


def _min_max_normalize(values):
    """Rescale `values` with the shared `lower`/`upper` score bounds."""
    return (values - lower) / (upper - lower)


# Normalize scores, then take the mean and the quantile band along axis 0.
p_scores: np.ndarray = _min_max_normalize(p_scores)
n_scores: np.ndarray = _min_max_normalize(n_scores)
p_mean, n_mean = p_scores.mean(axis=0), n_scores.mean(axis=0)
p_qh, p_ql = (np.quantile(p_scores, q, axis=0) for q in (quantile, 1 - quantile))
n_qh, n_ql = (np.quantile(n_scores, q, axis=0) for q in (quantile, 1 - quantile))

# Threshold: quantile of the demo cumulative scores, on the same scale.
demo_frame = close_metrics_0_hh["hh"][mmd_exp_key]["demo_frame"]
demo_cum_scores = data_utils.aggr_episode_key_data(demo_frame, f"{mmd_exp_key}_cum_score")
thresh = _min_max_normalize(np.quantile(demo_cum_scores, quantile))

In [None]:
# Plot the mean normalized score of failed vs. successful rollouts, with their
# quantile bands and the detection threshold, and save the figure.
fig, ax = plt.subplots(figsize=(7, 5.5))

failure_color = "orange"
success_color = "blue"
thresh_color = "red"

# Trajectory time is mapped onto [0, 1].
x = np.linspace(0, 1, len(p_mean))
ax.plot(x, p_mean, label="Policy Failure", color=failure_color, linewidth=4)
ax.plot(x, n_mean, label="Policy Success", color=success_color, linewidth=4)
ax.fill_between(x, p_ql, p_qh, color=failure_color, alpha=0.15)
ax.fill_between(x, n_ql, n_qh, color=success_color, alpha=0.15)
ax.axhline(y=thresh, color=thresh_color, linestyle='--', linewidth=5, label="Detection Threshold")

ax.set_title("Temporal Consistency Score", fontsize=26)
ax.set_ylabel("Normalized Score ($\\eta_t$)", fontsize=22)
ax.set_ybound(0, 1.0)
ax.tick_params(axis="y", labelsize=16)
# NOTE(review): the label says "(%)" but the ticks below run 0.0-1.0 --
# confirm which unit the figure should show.
ax.set_xlabel("Normalized Trajectory Time (%)", fontsize=22)
xticks = np.linspace(0, 1, 6)
xticklabels = [f"{tick:.1f}" for tick in xticks]
ax.set_xticks(xticks)
ax.set_xticklabels(xticklabels, fontsize=16)

ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_linewidth(3)
ax.spines['bottom'].set_linewidth(3)
ax.legend(loc='upper left', fancybox=True, framealpha=0.7, fontsize=20, edgecolor='gray')

plt.tight_layout()
save_path = CWD / ".." / f"figures_{dir_postfix}" / f"error-result.{render_format}"
# savefig does not create missing directories; make sure the target exists so
# the cell also works on a fresh checkout (assumes CWD is a pathlib.Path).
save_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(save_path, format=render_format, dpi=300, bbox_inches='tight', transparent=True)

plt.show()

#

# Hyperparameter Sweep: Close Box Domain
We conduct a hyperparameter sweep for the failure detector on a hold-out dataset. The selected hyperparameters must then generalize to the test sets above. 

In [8]:
# Generate temporal consistency experiment keys.
pred_horizons = [16]
sample_sizes = [32, 64, 128, 256]


def _sweep_exp_keys(error_fns):
    """Build the temporal consistency keys of one sweep ("min" aggregation)."""
    return get_temporal_consistency_exp_keys(
        pred_horizons=pred_horizons,
        sample_sizes=sample_sizes,
        error_fns=error_fns,
        aggr_fns=["min"],
    )


def _load_sweep(exp_keys):
    """Compile and aggregate the sweep-domain metrics for `exp_keys`.

    Returns a `(per_split_metrics, aggregated_metrics)` pair over the
    "na", "ll" and "hh" splits.
    """
    sweep_metrics = compile_metrics(
        domain="0525_close_4_sweep",
        splits=["na", "ll", "hh"],
        exp_keys=exp_keys,
    )
    sweep_metrics_aggr = aggregate_metrics(
        splits=["na", "ll", "hh"],
        exp_keys=exp_keys,
        data=sweep_metrics,
    )
    return sweep_metrics, sweep_metrics_aggr


# Error-function name suffixes swept for each family.
mmd_suffixes = ["", "_median", "_eig", "_0.1", "_0.5", "_1.0", "_5.0", "_10.0", "_100.0"]
kde_suffixes = ["", "_eig", "_0.1", "_0.5", "_5.0", "_10.0", "_100.0"]

# MMD sweeps.
mmd_error_fns = ["mmd_rbf_all" + suffix for suffix in mmd_suffixes]
mmd_exp_keys = _sweep_exp_keys(mmd_error_fns)

# KDE For. sweeps.
kde_for_error_fns = ["kde_kl_all_for" + suffix for suffix in kde_suffixes]
kde_for_exp_keys = _sweep_exp_keys(kde_for_error_fns)

# KDE Rev. sweeps.
kde_rev_error_fns = ["kde_kl_all_rev" + suffix for suffix in kde_suffixes]
kde_rev_exp_keys = _sweep_exp_keys(kde_rev_error_fns)

# Compile metrics.
mmd_metrics, mmd_metrics_aggr = _load_sweep(mmd_exp_keys)
kde_for_metrics, kde_for_metrics_aggr = _load_sweep(kde_for_exp_keys)
kde_rev_metrics, kde_rev_metrics_aggr = _load_sweep(kde_rev_exp_keys)

In [None]:
# MMD result: first 30 (method, score) pairs as ordered by sort_metrics.
mmd_ranking = zip(
    *sort_metrics(exp_keys=mmd_exp_keys, data=mmd_metrics_aggr, metric="Balanced Accuracy")
)
for method, score in list(mmd_ranking)[:30]:
    print("Method:", method, "| Score:", score)

In [None]:
# KDE For. result: first 30 (method, score) pairs as ordered by sort_metrics.
kde_for_ranking = zip(
    *sort_metrics(exp_keys=kde_for_exp_keys, data=kde_for_metrics_aggr, metric="Balanced Accuracy")
)
for method, score in list(kde_for_ranking)[:30]:
    print("Method:", method, "| Score:", score)

In [None]:
# KDE Rev. result: first 30 (method, score) pairs as ordered by sort_metrics.
kde_rev_ranking = zip(
    *sort_metrics(exp_keys=kde_rev_exp_keys, data=kde_rev_metrics_aggr, metric="Balanced Accuracy")
)
for method, score in list(kde_rev_ranking)[:30]:
    print("Method:", method, "| Score:", score)