In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from rich import print as printr
import numpy as np
from pathlib import Path
from instance_dac.utils.data_loading import load_performance_data, load_eval_data

# LEVELS OF AGGREGATION
# 1. eval episode
# 2. seed
# 3. instance

# data.csv missing? Run
# NOTE(review): the instruction above is truncated in the original; presumably the
# commented-out load_eval_data call further down regenerates the data -- confirm.

# Sigmoid settings. Every name assigned here is reassigned by the CMA-ES
# settings right below, so these values are currently inactive.
path = "../runs/Sigmoid"
agent_name = "ppo"
train_instance_set = "2D3M_train"
train_instance_set_id = "sigmoid_2D3M_train"
test_instance_set = "2D3M_test"
test_instance_set_id = "sigmoid_2D3M_test"
benchmark_id = "Sigmoid"

# CMA-ES settings. These assignments come last and are therefore the ones in effect.
path = "../runs/CMA-ES"
agent_name = "ppo_sb3"
train_instance_set = "seplow_train"
test_instance_set_id = "test"
train_instance_set_id = "train"
test_instance_set = "seplow_test"
benchmark_id = "CMA-ES"

# Run directory of the selected agent on the selected training instance set.
# `path` changes type here from str to pathlib.Path.
path = Path(path) / train_instance_set / agent_name
# data = load_eval_data(path=path, instance_set_id=test_instance_set_id, instance_set=test_instance_set)

# Load the evaluation results of the active test instance set from the working directory.
data = pd.read_csv(f"eval_data_{test_instance_set_id}.csv")

# Tag every row whose origin is exactly "selector" with its instance_set_id.
# The write goes through a single .loc call: the chained form
# data["origin"][ids] = ... assigns into an intermediate object, which triggers
# SettingWithCopyWarning and leaves `data` untouched under pandas Copy-on-Write.
ids = data["origin"] == "selector"
data.loc[ids, "origin"] = data.loc[ids, "instance_set_id"].apply(lambda x: f"selector_{x}")


In [None]:
def _selector_group_name(instance_set_id):
    """Build "selector__<id>" from an instance_set_id with its last two "__" fields removed."""
    # Presumably the two dropped fields are the run and seed identifiers -- confirm.
    kept_fields = instance_set_id.split("__")[:-2]
    return "selector__" + "__".join(kept_fields)


# Prepare to aggregate across runs and seeds: all selector rows that differ only
# in their last two "__"-separated fields end up with the same origin.
ids = data["origin"].str.startswith("selector")
data.loc[ids, "origin"] = data.loc[ids, "instance_set_id"].map(_selector_group_name)

In [None]:
# Only the last expression of a cell is rendered, so the distinct origins are
# printed explicitly; previously their value was computed and silently discarded.
print(data["origin"].unique())
data

In [None]:
# Aggregate across eval episodes
score_columns = ["origin", "seed", "instance", "overall_performance"]
is_selector = data["origin"].str.startswith("selector")

# Selector rows: average the eval episodes of each selector run separately, so that
# every (origin, seed, instance, selector_run) combination contributes one value.
# groupby drops rows whose selector_run is NaN.
selector_perf = (
    data[is_selector]
    .groupby(["origin", "seed", "instance", "selector_run"])["overall_performance"]
    .mean()
    .reset_index()
)

# Non-selector rows enter with their raw values. Building the frame in one concat
# (instead of one pd.Series per group, transposed afterwards) keeps numeric dtypes
# and avoids mixing whole DataFrames into a column-wise concat of Series.
perf = pd.concat(
    [selector_perf[score_columns], data.loc[~is_selector, score_columns]],
    ignore_index=True,
)

# One mean value per (origin, seed, instance).
perf = perf.groupby(["origin", "seed", "instance"])["overall_performance"].mean().to_frame()

# One score matrix per origin: rows are seeds, columns are instances.
perf_dict = {}
for origin, scores in perf["overall_performance"].groupby(level="origin"):
    # unstack aligns values by their (seed, instance) labels instead of relying on
    # the flat row order, and exposes missing combinations as NaN.
    matrix = scores.droplevel("origin").unstack("instance")
    if matrix.isna().to_numpy().any():
        raise ValueError(
            f"origin {origin!r} has no value for some (seed, instance) pairs; "
            "cannot build a complete score matrix"
        )
    perf_dict[origin] = matrix.to_numpy()

In [None]:
# Origins for which a score matrix is stored in perf_dict.
perf_dict.keys()

In [None]:
# Shape of every score matrix, keyed by origin. The previous lookup used a fixed
# key containing "2D3M_train"; that key is not guaranteed to exist for the
# benchmark selected in the configuration and would then raise KeyError.
{origin: scores.shape for origin, scores in perf_dict.items()}

In [None]:
from rliable import library as rly
from rliable import metrics
from rliable import plot_utils

metric_names = ['Median', 'IQM', 'Mean']#, 'Optimality Gap']

# perf_dict maps every origin to its score matrix; each origin is treated as one algorithm.
algorithms = list(perf_dict)


def aggregate_func(x):
    """Return the median, IQM and mean of score matrix `x`, in `metric_names` order."""
    point_estimates = [
        metrics.aggregate_median(x),
        metrics.aggregate_iqm(x),
        metrics.aggregate_mean(x),
        #metrics.aggregate_optimality_gap(x)
    ]
    return np.array(point_estimates)


# Point estimates and their interval estimates for every algorithm.
aggregate_scores, aggregate_score_cis = rly.get_interval_estimates(
    perf_dict,
    aggregate_func,
    reps=5000,
)


In [None]:
import pickle
# Persist the score matrices and the aggregate estimates, one pickle file each.
pickle_targets = [
    (f'perf_dict_{benchmark_id}.pickle', perf_dict),
    (f'aggregate_scores_{benchmark_id}.pickle', aggregate_scores),
    (f'aggregate_score_cis_{benchmark_id}.pickle', aggregate_score_cis),
]
for pickle_filename, payload in pickle_targets:
    with open(pickle_filename, 'wb') as handle:
        pickle.dump(payload, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Optional restriction of the plot to a few algorithms (currently disabled):
# best_selector = "selector_2D3M_train__Train__DS__Raw__A__0.7__9__2"
# algos = [a for a in algorithms if a in ["full", "oracle", "random", best_selector]]

# Plot the aggregate estimates of all algorithms and store the figure as a PDF.
interval_plot_options = dict(
    metric_names=metric_names,
    algorithms=algorithms,
    xlabel='Performance',
)
fig, axes = plot_utils.plot_interval_estimates(
    aggregate_scores,
    aggregate_score_cis,
    **interval_plot_options,
)
fig.savefig(f"plot_{benchmark_id}.pdf", dpi=300, bbox_inches="tight")

In [None]:
# One row per origin holding its point estimates; the values in aggregate_scores
# are matched to metric_names by position.
score_records = [
    {"origin": origin, **dict(zip(metric_names, estimates))}
    for origin, estimates in aggregate_scores.items()
]
df_scores = pd.DataFrame(score_records)

# Highest IQM
printr("Highest IQM")
# Show the two best rows because the best one is expected to be the oracle.
# nlargest returns them ordered best-first, also works when fewer than two rows
# exist, and ignores NaN scores (np.argpartition gave no ordering, raised for a
# single row and ranked NaN as the largest value).
top_two = df_scores.nlargest(2, "IQM")
printr(top_two)
printr(top_two["origin"])

# IQM Full
printr("IQM full")
printr(df_scores[df_scores["origin"] == "full"].iloc[0])

# Sort by IQM (ascending) and store the table next to the notebook.
df_scores = df_scores.sort_values(by="IQM")

df_scores.to_csv(f"scores_{benchmark_id}.csv", index=False)



In [None]:
# Thresholds at which the performance profiles are evaluated: 21 evenly spaced values in [1, 5].
tau_list = np.linspace(1, 5, 21)
score_distributions, score_distributions_cis = rly.create_performance_profile(
    perf_dict,
    tau_list=tau_list,
)

# Plot score distributions
fig, ax = plt.subplots(figsize=(7, 5))
profile_plot_options = dict(
    tau_list=tau_list,
    performance_profile_cis=score_distributions_cis,
    # colors=dict(zip(algorithms, sns.color_palette('colorblind'))),
    xlabel=r'Overall Performance $(\tau)$',
    ax=ax,
)
plot_utils.plot_performance_profiles(score_distributions, **profile_plot_options)

In [None]:
# Mean overall performance per (origin, instance), taken over all rows of `data`
# in that group. Note that this rebinds `perf`.
instance_means = data.groupby(["origin", "instance"])["overall_performance"].mean()
perf = instance_means.to_frame()
perf

In [None]:
from functools import partial

def l1_dist(a, b) -> float:
    """Return the signed difference ``a - b``.

    No absolute value is taken, so the result is negative whenever ``b``
    exceeds ``a``. Works element-wise when an argument is array-like.
    """
    signed_difference = a - b
    return signed_difference

def l2_dist(a, b) -> float:
    """Return the squared difference ``(a - b) ** 2`` (no square root is taken).

    Works element-wise when an argument is array-like.
    """
    difference = a - b
    return difference ** 2

def calc_dist(x: pd.DataFrame, distance_function, groundtruth: str = "oracle") -> pd.DataFrame:
    """Compare the ground-truth origin with every origin of one instance.

    Args:
        x: Rows of a single instance. After ``reset_index()`` the frame must
            provide the columns "origin", "instance" and "overall_performance".
        distance_function: Callable ``f(groundtruth_value, values)`` applied to the
            ground-truth performance and the "overall_performance" column. Its
            ``__name__`` is stored in the result.
        groundtruth: Origin used as the reference. Defaults to "oracle".

    Returns:
        A frame with one row per row of ``x`` and the columns "instance" (taken
        from the first row), "distance_name", "distance" and "compared"
        (``"<groundtruth> - <origin>"``).

    Raises:
        ValueError: If no row of ``x`` has the ground-truth origin.
    """
    rows = x.reset_index()

    is_groundtruth = rows["origin"] == groundtruth
    if not is_groundtruth.any():
        raise ValueError(
            f"origin {groundtruth!r} not found for instance(s) "
            f"{rows['instance'].unique().tolist()}"
        )
    # The first matching row is the reference; access is positional so it does
    # not depend on the index labels.
    groundtruth_performance = rows.loc[is_groundtruth, "overall_performance"].iloc[0]

    return pd.DataFrame({
        "instance": rows["instance"].iloc[0],
        "distance_name": distance_function.__name__,
        "distance": distance_function(groundtruth_performance, rows["overall_performance"]),
        "compared": [f"{groundtruth} - {origin}" for origin in rows["origin"]],
    })

distance_functions = [l1_dist, l2_dist]

# For every distance function, compare the oracle performance with the
# performance of each origin, instance by instance, and stack the results.
per_function_frames = []
for func in distance_functions:
    per_instance = perf.groupby("instance").apply(calc_dist, distance_function=func)
    per_function_frames.append(per_instance)
diffs_per_instance = pd.concat(per_function_frames, axis=0).reset_index(drop=True)
diffs_per_instance

In [None]:
# Sort df by agg_fun
agg_fun = np.mean
plotted_distance = "l2_dist"

# Restrict to the plotted distance BEFORE computing the order. diffs_per_instance
# holds one set of rows per distance function, so aggregating over all of them
# would rank the comparisons by a mix of l1_dist and l2_dist values.
df = diffs_per_instance[diffs_per_instance["distance_name"] == plotted_distance]

sorted_series = df.groupby("compared")["distance"].apply(agg_fun).sort_values()
sorter = list(sorted_series.index)
# Position lookup per comparison name (avoids a linear list.index call per row).
rank_of = {name: position for position, name in enumerate(sorter)}

df = df.sort_values(by="compared", key=lambda column: column.map(rank_of))

# Same figure layout twice: once as box plot, once as bar plot.
for plot_function, extra_options in ((sns.boxplot, {"fliersize": 2}), (sns.barplot, {})):
    fig = plt.figure(figsize=(12,5), dpi=300)
    ax = fig.add_subplot(111)
    ax = plot_function(data=df, x="compared", y="distance", ax=ax, **extra_options)
    ax.tick_params(axis='x', labelrotation=90, labelsize=5)
    # ax.set_ylim(0, ax.get_ylim()[1])
    plt.show()


In [None]:
agg_fun = np.mean
plotted_distance = "l2_dist"

# diffs_per_instance holds rows for every distance function; aggregate only one
# of them so that l1_dist and l2_dist values are not averaged together.
# NOTE(review): "l2_dist" is chosen to match the distance used in the box/bar
# plots of this notebook -- change plotted_distance if another one is intended.
selected_diffs = diffs_per_instance[diffs_per_instance["distance_name"] == plotted_distance]
diffs = selected_diffs.groupby("compared")["distance"].apply(agg_fun)
diffs = diffs.sort_values()

sns.set_style("whitegrid")
sns.set_palette("colorblind")
fig = plt.figure(figsize=(12,5), dpi=300)
ax = fig.add_subplot(111)
ax = sns.barplot(x=diffs.index, y=diffs.values, ax=ax)
ax.tick_params(axis='x', labelrotation=90, labelsize=5)
plt.show()
