In [None]:
%load_ext autoreload
%autoreload 2

# basic imports
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats


def get_data(dataset_name):
    """Load a benchmark CSV and return its negated losses as a 4-D array.

    Reads ``../../Datasets/<dataset_name>.csv``, prints the distinct
    ``arm_name`` values, discards rows whose ``arm_index`` is negative and
    arranges the remaining ``loss`` values by sorted
    (instance, arm_index, repetition, iteration).

    Parameters
    ----------
    dataset_name : str
        File name (without the ``.csv`` extension) inside ``../../Datasets/``.

    Returns
    -------
    numpy.ndarray
        Array of shape (n_instances, n_arms, n_repetitions, n_iterations)
        holding ``-loss``.

    Raises
    ------
    ValueError
        If the rows with a non-negative ``arm_index`` do not form a complete
        grid with exactly one row per key combination.
    """
    dataset = pd.read_csv("../../Datasets/" + dataset_name + ".csv")
    print(dataset["arm_name"].unique())

    key_columns = ["instance", "arm_index", "repetition", "iteration"]
    df = dataset.loc[dataset["arm_index"] >= 0, key_columns + ["loss"]]

    # Derive every dimension from the rows that are actually reshaped. Counting
    # on the unfiltered frame breaks the reshape as soon as a row with a
    # negative arm_index carries an instance, repetition or iteration value
    # that no valid arm has.
    shape = tuple(df[column].nunique(dropna=False) for column in key_columns)

    # With no duplicated key and a matching row count, every grid cell is
    # filled exactly once, so the sorted losses line up with the axes.
    if df.duplicated(subset=key_columns).any():
        raise ValueError(
            "dataset '" + dataset_name + "' has duplicated "
            "(instance, arm_index, repetition, iteration) rows"
        )
    expected_rows = int(np.prod(shape))
    if len(df) != expected_rows:
        raise ValueError(
            "dataset '" + dataset_name + "' has " + str(len(df))
            + " valid rows but a complete grid of shape " + str(shape)
            + " needs " + str(expected_rows)
        )

    real_data = df.sort_values(by=key_columns)["loss"].values.reshape(shape)
    return -real_data


# Load every dataset of interest; get_data returns one 4-D array
# (instance, arm, repetition, iteration) per dataset.
real_datas = [get_data("Reshuffling")]

# Stack independent copies of the per-dataset arrays along the first
# (instance) axis.
real_data = np.concatenate([loaded.copy() for loaded in real_datas])

In [None]:
# Extrema of the raw data for every (instance, arm) pair: one [min, max] row
# per pair, ordered instance-major.
min_max_results = np.asarray(
    [
        [real_data[cell].min(), real_data[cell].max()]
        for cell in np.ndindex(*real_data.shape[:2])
    ]
)

In [None]:
# Min-max normalise each instance jointly over its arms, repetitions and
# iterations, so that its values span [0, 1].
# Work in floating point: a plain copy of an integer array would truncate the
# normalised values back to integers on assignment.
norm_dtype = (
    real_data.dtype if np.issubdtype(real_data.dtype, np.floating) else np.float64
)
norm_real_data = real_data.astype(norm_dtype)  # astype returns a new array
# NOTE(review): this resets the per-(instance, arm) extrema computed in the
# previous cell and none of the visible cells fills it again — confirm whether
# it is still needed.
min_max_results = []
for i in range(real_data.shape[0]):
    lowest = real_data[i].min()
    highest = real_data[i].max()
    value_range = highest - lowest
    if value_range == 0:
        # A constant instance has no spread; map it to 0 instead of dividing
        # 0 by 0, which would fill the instance with NaN.
        norm_real_data[i] = 0
    else:
        norm_real_data[i] = (norm_real_data[i] - lowest) / value_range

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.interpolate import interp1d

arm_names = ["MLP", "Logreg", "CatBoost", "XGBoost"]

# Iteration windows, used as data[r, t_start:t_stop], whose performance
# distributions are compared in each figure.
time_points = [(0, 5), (5, 10), (10, 15), (15, 20), (20, 250)]

# savefig does not create missing directories, so make sure the target exists.
os.makedirs("./figures", exist_ok=True)

plt.rcParams.update({"font.size": 14})

for arm in range(norm_real_data.shape[1]):
    print(arm, arm_names[arm])

    # Pool instances and repetitions: one row per (instance, repetition)
    # trajectory, one column per iteration.
    data = norm_real_data[:, arm, :, :].reshape(
        norm_real_data.shape[0] * norm_real_data.shape[2], norm_real_data.shape[3]
    )
    num_reps = data.shape[0]

    # Common grid on which every per-trajectory eCDF is evaluated so that the
    # curves can be averaged point-wise.
    x_common = np.linspace(data.min(), data.max(), 100)

    plt.figure(figsize=(6, 4))

    for t_start, t_stop in time_points:
        ecdfs = []
        for r in range(num_reps):
            values = data[r, t_start:t_stop]  # one trajectory inside the window
            sorted_vals = np.sort(values)
            cdf = np.arange(1, len(sorted_vals) + 1) / len(sorted_vals)

            # Interpolate to the common x grid; outside the observed range the
            # curve is 0 below the smallest value and 1 above the largest.
            # NOTE(review): interp1d interpolates linearly between the sorted
            # sample points, so this is a piecewise-linear approximation of the
            # step-shaped eCDF, and repeated values in sorted_vals make the
            # interpolation ambiguous at those points — confirm this is intended.
            interp = interp1d(sorted_vals, cdf, bounds_error=False, fill_value=(0, 1))
            ecdfs.append(interp(x_common))

        # Average eCDF across all pooled trajectories
        mean_ecdf = np.mean(ecdfs, axis=0)
        plt.plot(x_common, mean_ecdf, label=f"t $\\in$ ({t_start}, {t_stop})")
        std_ecdf = np.std(ecdfs, axis=0)
        # Half-width of the band: 1.96 standard errors of the mean
        ci95 = 1.96 * std_ecdf / np.sqrt(num_reps)

        # Shade the 95% confidence band around the mean curve
        plt.fill_between(x_common, mean_ecdf - ci95, mean_ecdf + ci95, alpha=0.1)

    plt.xlabel("Performance")
    plt.ylabel("eCDF")
    plt.title("HPO Trajectories on " + arm_names[arm])
    plt.legend()
    plt.tight_layout()

    plt.savefig("./figures/Reshuffling_" + arm_names[arm] + ".pdf")