In [11]:
from pathlib import Path
import pandas as pd
from carps.analysis.gather_data import normalize_logs, get_interpolated_performance_df, load_logs, process_logs
from carps.analysis.utils import filter_only_final_performance

def load_set(paths: list[str], set_id: str = "unknown") -> tuple[pd.DataFrame, pd.DataFrame]:
    logs = []
    for p in paths:
        fn = Path(p) / "trajectory.parquet"
        if not fn.is_file():
            fn = Path(p) / "logs.parquet"
        logs.append(pd.read_parquet(fn))

    df = pd.concat(logs).reset_index(drop=True)
    df_cfg = pd.concat([pd.read_parquet(Path(p) / "logs_cfg.parquet") for p in paths]).reset_index(drop=True)
    df["set"] = set_id
    return df, df_cfg

scenario = "blackbox"
set_id = "full"
subset_size = 30
paths = ["../runs/SMAC3-BlackBoxFacade", "../runs/RandomSearch", "../runs/Nevergrad-CMA-ES"]

scenario = "multi-fidelity-objective"
set_id = "full"
subset_size = 9
paths = ["../runs_MOMF/SMAC3-MOMF-GP", "../runs_MOMF/RandomSearch", "../runs_MOMF/Nevergrad-DE"]

# scenario = "multi-objective"
# set_id = "full"
# subset_size = 10
# paths = ["../runs_MO/Optuna-MO", "../runs_MO/RandomSearch", "../runs_MO/Nevergrad-DE"]

# scenario = "multi-fidelity"
# set_id = "full"
# subset_size = 20
# paths = ["../runs/SMAC3-Hyperband", "../runs/SMAC3-MultiFidelityFacade", "../runs_/DEHB"]

fn = f"../data/{scenario}_{set_id}_logs.parquet"
if Path(fn).is_file():
    df = pd.read_parquet(fn)
else:
    D = []
    for rundir in paths:
        df, df_cfg = load_set([rundir], set_id="full")
        D.append(df)

    df = pd.concat(D).reset_index(drop=True)
    del D

    df = normalize_logs(df)

    normalize_performance = False
    perf_col = "trial_value__cost_inc_norm" if normalize_performance else "trial_value__cost_inc"

    df.to_parquet(f"../data/{scenario}_{set_id}_logs.parquet")
df = filter_only_final_performance(df=df)

FileNotFoundError: [Errno 2] No such file or directory: '../runs_MOMF/SMAC3-MOMF-GP/logs.parquet'

In [None]:
subset_size = 9
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit, ShuffleSplit
from carps.utils.pareto_front import pareto

problem_ids = df["problem_id"].unique()
# print(problem_ids)
benchmark_ids = [pid.split("/")[0] for pid in problem_ids]
n_problems = len(problem_ids)

print(n_problems)

n_splits = n_problems // (subset_size * 2)
print(n_splits)
X = problem_ids
y = benchmark_ids

seeds = np.arange(0, 5)

split_classes = [StratifiedShuffleSplit, ShuffleSplit]

new_subset_performance = []


for split_class in split_classes:
    for seed in seeds:
        sss = split_class(n_splits=n_splits, test_size=0.5, random_state=seed)
        sss.get_n_splits(X, y)
        for i, (train_index, test_index) in enumerate(sss.split(X, y)):
            # print(split_class, seed, i)

            ids_dev = problem_ids[train_index]
            ids_test = problem_ids[test_index]
            for _set_id, ids in zip(["dev", "test"], (ids_dev, ids_test)):
                new_df = df[df["problem_id"].isin(ids)].copy()
                set_id = f"split_{i}_seed_{seed}_subset_{_set_id}"
                print(set_id)
                new_df["set"] = set_id
                new_subset_performance.append(new_df)

df_new = pd.concat(new_subset_performance).reset_index(drop=True)

121
6
split_0_seed_0_subset_dev
split_0_seed_0_subset_test
split_1_seed_0_subset_dev
split_1_seed_0_subset_test
split_2_seed_0_subset_dev
split_2_seed_0_subset_test
split_3_seed_0_subset_dev
split_3_seed_0_subset_test
split_4_seed_0_subset_dev
split_4_seed_0_subset_test
split_5_seed_0_subset_dev
split_5_seed_0_subset_test
split_0_seed_1_subset_dev
split_0_seed_1_subset_test
split_1_seed_1_subset_dev
split_1_seed_1_subset_test
split_2_seed_1_subset_dev
split_2_seed_1_subset_test
split_3_seed_1_subset_dev
split_3_seed_1_subset_test
split_4_seed_1_subset_dev
split_4_seed_1_subset_test
split_5_seed_1_subset_dev
split_5_seed_1_subset_test
split_0_seed_2_subset_dev
split_0_seed_2_subset_test
split_1_seed_2_subset_dev
split_1_seed_2_subset_test
split_2_seed_2_subset_dev
split_2_seed_2_subset_test
split_3_seed_2_subset_dev
split_3_seed_2_subset_test
split_4_seed_2_subset_dev
split_4_seed_2_subset_test
split_5_seed_2_subset_dev
split_5_seed_2_subset_test
split_0_seed_3_subset_dev
split_0_seed_3

In [None]:
# _df = pd.concat([df, df_new]).reset_index(drop=True)
_df = df_new

In [None]:
from autorank._util import get_sorted_rank_groups, RankResult
from carps.analysis.plot_ranking import calc_critical_difference

rank_results = {}
for (scenario, set_id), gdf in _df.groupby(by=["scenario", "set"]):
    perf_col: str = "trial_value__cost_inc_norm"
    identifier = f"{scenario}_{set_id}"
    result = calc_critical_difference(gdf, identifier=identifier, figsize=(8, 3), perf_col=perf_col, plot_diagram=False)
    sorted_ranks, names, groups = get_sorted_rank_groups(result, reverse=False)
    rank_results[(scenario, set_id)] = result

Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.000042)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE  1.633333  0.007830  0.007803  0.001327  0.044524         0.0   
Optuna-MO     2.077778  0.009641  0.009613  0.002813  0.058713   -0.139513   
RandomSearch  2.288889  0.011237   0.01114  0.002854  0.051639   -0.238931   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch       small  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.000098)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE  1.673684  0.008228  0.008137  0.003283  0.046377         0.0   
Optuna-MO     2.031579  0.009992  0.009963  0.004852  0.053638   -0.130817   
RandomSearch  2.294737  0.011273  0.011179  0.006476  0.060899   -0.210063   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch       small  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.000080)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE  1.666667  0.007476  0.007172  0.001766   0.02972         0.0   
Optuna-MO     2.032258  0.008571  0.008465   0.00268  0.048084   -0.094145   
RandomSearch  2.301075  0.010482  0.010169  0.003702  0.045352    -0.23042   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch       small  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.000026)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE  1.659341  0.007503  0.007291  0.001327  0.037349         0.0   
Optuna-MO     2.000000  0.008597   0.00856  0.002813  0.052336   -0.092771   
RandomSearch  2.340659  0.010496  0.010396  0.002854  0.060729   -0.224817   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch       small  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.000049)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE  1.659341  0.007503  0.007412  0.002258  0.046377         0.0   
Optuna-MO     2.021978  0.009487  0.009454  0.003011  0.052336   -0.157504   
RandomSearch  2.318681  0.011273  0.011179  0.005451  0.060729   -0.268122   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch       small  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.000019)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE  1.641304  0.007414  0.007061  0.001766   0.02972         0.0   
Optuna-MO     2.032609  0.008520  0.008455  0.003011  0.048084   -0.095737   
RandomSearch  2.326087  0.010582  0.010227  0.003702  0.028433   -0.243195   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch       small  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.000056)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE  1.673913  0.007875  0.007668  0.002514  0.037349         0.0   
Optuna-MO     2.000000  0.008520  0.008486  0.003648  0.052336   -0.053728   
RandomSearch  2.326087  0.011087  0.010794  0.005525  0.051639   -0.231358   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch       small  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.000055)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE  1.655914  0.007491  0.007289   0.00182  0.037349         0.0   
Optuna-MO     2.043011  0.009795  0.009762  0.003648  0.051167   -0.180373   
RandomSearch  2.301075  0.011200    0.0111  0.005451  0.047586   -0.266407   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch       small  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.000058)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE  1.632184  0.007352   0.00715  0.001766   0.03076         0.0   
Optuna-MO     2.080460  0.008597   0.00856  0.003011  0.050737   -0.106412   
RandomSearch  2.287356  0.010683  0.010583  0.002854  0.045352   -0.248752   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch       small  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.000062)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE  1.647727  0.007535  0.007213  0.002514  0.044524         0.0   
Optuna-MO     2.045455  0.009641   0.00949  0.004782  0.051167    -0.16853   
RandomSearch  2.306818  0.011705  0.010872  0.005525  0.060729   -0.304841   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch       small  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.000019)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE  1.611111  0.007414  0.006987  0.002486  0.046377         0.0   
Optuna-MO     2.000000  0.009226  0.009102  0.003648  0.052571   -0.150638   
RandomSearch  2.388889  0.010828  0.010229  0.005519  0.060899   -0.262903   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch       small  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.001621)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE  1.684932  0.009913  0.009886  0.003887  0.070285         0.0   
Optuna-MO     2.041096  0.023827  0.023491  0.004907  0.070068   -0.520744   
RandomSearch  2.273973  0.015528  0.014663  0.005525  0.073889   -0.302839   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO         medium  
RandomSearch       small  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.000070)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE  1.620253  0.007084  0.006598  0.001017   0.02417         0.0   
Optuna-MO     2.075949  0.007520  0.007312  0.001679  0.033073   -0.042228   
RandomSearch  2.303797  0.009791  0.008878  0.002452  0.024613   -0.233356   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch       small  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.000098)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE  1.658537  0.007844  0.007784  0.002258  0.064612         0.0   
Optuna-MO     2.012195  0.009641  0.009606  0.003549  0.053638   -0.138652   
RandomSearch  2.329268  0.011087   0.01099  0.005451  0.068263   -0.229704   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch       small  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.000472)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE  1.698630  0.008545  0.008241  0.002486  0.070688         0.0   
Optuna-MO     1.958904  0.011259  0.011222  0.003011  0.065094   -0.185947   
RandomSearch  2.342466  0.011273  0.011086  0.005451   0.07539   -0.188411   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch  negligible  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.000017)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE  1.594937  0.007476  0.007136  0.001327  0.034136         0.0   
Optuna-MO     2.075949  0.008468  0.008436  0.002813  0.053638   -0.085711   
RandomSearch  2.329114  0.010683  0.010371  0.002854  0.051639   -0.243042   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch       small  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.000022)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE  1.674699  0.007567  0.007355  0.003283  0.037349         0.0   
Optuna-MO     1.939759  0.008966   0.00886  0.004782  0.050737   -0.115897   
RandomSearch  2.385542  0.011200  0.010888  0.006476  0.047586   -0.263793   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch       small  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.000969)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE  1.710843  0.007352  0.007049  0.001258  0.044524         0.0   
Optuna-MO     2.000000  0.008597  0.008504  0.002813  0.053638    -0.10745   
RandomSearch  2.289157  0.010482  0.010169  0.002692  0.060729   -0.241238   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch       small  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.001919)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE  1.707317  0.008420  0.008401   0.00182  0.065317         0.0   
Optuna-MO     2.036585  0.011295  0.011275  0.003549  0.058713   -0.195057   
RandomSearch  2.256098  0.013212  0.013118  0.004708  0.068263   -0.293463   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch       small  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.000041)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE  1.666667  0.007567  0.007364  0.001766  0.037349         0.0   
Optuna-MO     1.962963  0.008597  0.008504  0.003011  0.052336   -0.087345   
RandomSearch  2.370370  0.010973  0.010874  0.005451  0.060729   -0.247448   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch       small  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.000446)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE  1.691358  0.008545  0.008517  0.003887  0.064612         0.0   
Optuna-MO     2.000000  0.009992   0.00996  0.004593  0.052571   -0.105373   
RandomSearch  2.308642  0.013045  0.012733  0.005525  0.068263   -0.280234   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch       small  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.000365)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE  1.710843  0.007491  0.007289  0.002258  0.062983         0.0   
Optuna-MO     1.963855  0.009795  0.009765  0.004782  0.064963   -0.180343   
RandomSearch  2.325301  0.010683  0.010583  0.005525  0.065524   -0.236921   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch       small  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.002447)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE  1.743590  0.008578  0.008431  0.001327  0.064612         0.0   
Optuna-MO     1.961538  0.009641  0.009614  0.003549  0.064963   -0.079285   
RandomSearch  2.294872  0.011705  0.011613  0.005451  0.068263   -0.207817   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch       small  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.002844)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE    1.7500  0.007897   0.00764  0.002258  0.037349         0.0   
Optuna-MO       1.9625  0.008341   0.00827  0.003549  0.050737   -0.037561   
RandomSearch    2.2875  0.010590  0.010234  0.005451  0.047586   -0.201073   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch       small  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.000020)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE  1.658228  0.007491  0.007125  0.001327  0.065317         0.0   
Optuna-MO     1.949367  0.008571  0.008478  0.003011  0.063016   -0.092974   
RandomSearch  2.392405  0.010496  0.009696  0.003702   0.07337   -0.238188   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch       small  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.000122)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE    1.6375  0.008980  0.008952   0.00182  0.044524         0.0   
Optuna-MO       2.0625  0.011295  0.011268  0.003549  0.052571   -0.153469   
RandomSearch    2.3000  0.012591   0.01245  0.003702  0.060729    -0.22461   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch       small  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.001382)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE  1.701299  0.007503  0.007476  0.002258  0.065317         0.0   
Optuna-MO     2.012987  0.009992  0.009962  0.003549  0.058713   -0.190646   
RandomSearch  2.285714  0.012136  0.012108  0.004708   0.07337   -0.310561   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch       small  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.000044)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE  1.654321  0.007567  0.007355  0.001327  0.037349         0.0   
Optuna-MO     1.987654  0.008966  0.008933  0.003011  0.051167   -0.115337   
RandomSearch  2.358025  0.010683  0.010371  0.003702  0.047586   -0.233821   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch       small  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.000064)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE  1.670886  0.008228  0.008208  0.002258  0.070688         0.0   
Optuna-MO     1.962025  0.009487  0.009472  0.003648  0.065094   -0.095763   
RandomSearch  2.367089  0.011200  0.011172  0.004708   0.07539   -0.204484   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch       small  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.000353)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE  1.696203  0.007567   0.00708  0.002486  0.034136         0.0   
Optuna-MO     1.974684  0.008571  0.008362  0.003648  0.058713   -0.087422   
RandomSearch  2.329114  0.011200    0.0104  0.005525  0.051639   -0.275487   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch       small  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.000446)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE  1.691358  0.008184  0.007973   0.00182  0.046377         0.0   
Optuna-MO     2.000000  0.008571  0.008478  0.003011  0.053638   -0.031679   
RandomSearch  2.308642  0.011200  0.010888  0.004708  0.060899   -0.213183   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch       small  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.000195)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE  1.662651  0.008228  0.008137  0.002486  0.044524         0.0   
Optuna-MO     2.036145  0.009992  0.009962  0.004782  0.053638   -0.130823   
RandomSearch  2.301205  0.013001  0.012689  0.004708  0.060729   -0.302049   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch       small  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.000578)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE  1.746835  0.008545  0.008178  0.002258  0.062983         0.0   
Optuna-MO     1.911392  0.009795  0.009762  0.004593  0.058713    -0.09366   
RandomSearch  2.341772  0.011273  0.010961  0.005519  0.065524   -0.190317   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch  negligible  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.000786)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE  1.691358  0.007567  0.007475  0.001327  0.046377         0.0   
Optuna-MO     2.024691  0.009487  0.009457   0.00268  0.058713   -0.151929   
RandomSearch  2.283951  0.011273  0.011179  0.003702  0.060899   -0.262912   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch       small  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.000098)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE  1.658537  0.007414  0.007207  0.001766  0.037349         0.0   
Optuna-MO     2.012195  0.009226  0.009165  0.004593  0.050737   -0.148254   
RandomSearch  2.329268  0.010828  0.010578  0.005451  0.051639    -0.25443   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch       small  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.000142)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE    1.6500  0.009013  0.008954  0.002486  0.046377         0.0   
Optuna-MO       2.0375  0.009641  0.009613  0.003648  0.052571   -0.045585   
RandomSearch    2.3125  0.011237  0.010943  0.005519  0.060899    -0.15001   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch  negligible  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.002854)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE  1.692308  0.009044  0.009017   0.00182  0.062983         0.0   
Optuna-MO     2.089744  0.009739   0.00972  0.003011  0.053638    -0.05002   
RandomSearch  2.217949  0.011237  0.011199  0.004708  0.068263   -0.145452   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch  negligible  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.000005)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE    1.6375  0.007535  0.007213  0.001258  0.034136         0.0   
Optuna-MO       1.9500  0.009894  0.009829  0.003648  0.052571   -0.184561   
RandomSearch    2.4125  0.011237  0.010881  0.005451  0.051639    -0.27049   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch       small  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.000132)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE  1.671053  0.007218  0.007159  0.000963  0.044524         0.0   
Optuna-MO     1.973684  0.008584  0.008552  0.002491  0.053638   -0.116769   
RandomSearch  2.355263  0.010489  0.010345  0.002461  0.060899   -0.247961   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch       small  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.000076)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE  1.618421  0.007535  0.007044   0.00182  0.037349         0.0   
Optuna-MO     2.065789  0.008520  0.008363  0.003549  0.052336   -0.085913   
RandomSearch  2.315789  0.011237  0.010404  0.004708  0.051639   -0.281033   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch       small  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.002641)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE  1.708861  0.008545  0.008453  0.002258  0.065317         0.0   
Optuna-MO     2.037975  0.009992  0.009962  0.004593  0.058713   -0.105693   
RandomSearch  2.253165  0.013001  0.012814  0.006476  0.065524   -0.276927   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch       small  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.000042)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE  1.653846  0.007260  0.006925  0.001017   0.03076         0.0   
Optuna-MO     1.974359  0.008341   0.00827  0.001679  0.052571   -0.095565   
RandomSearch  2.371795  0.010119  0.009573  0.002692  0.051639   -0.230781   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch       small  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.000772)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE  1.717949  0.008206  0.007949  0.002258  0.062983         0.0   
Optuna-MO     1.961538  0.009894  0.009829  0.003648  0.052571   -0.127337   
RandomSearch  2.320513  0.011555  0.011458  0.005451  0.065524   -0.229036   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch       small  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.001267)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE  1.697368  0.008056  0.007996   0.00182  0.065317         0.0   
Optuna-MO     2.013158  0.009029  0.009001  0.003011  0.058713   -0.077089   
RandomSearch  2.289474  0.011237  0.011096  0.003702  0.065524   -0.221856   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch       small  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.000586)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE  1.670886  0.007503  0.007476  0.001327  0.062983         0.0   
Optuna-MO     2.050633  0.011259  0.011234  0.003011  0.063016   -0.265502   
RandomSearch  2.278481  0.011273  0.011179  0.002692  0.065524   -0.267413   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO          small  
RandomSearch       small  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.000075)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE  1.628205  0.008206  0.008179  0.000963  0.064612         0.0   
Optuna-MO     2.051282  0.009894  0.009869   0.00268   0.07138   -0.125585   
RandomSearch  2.320513  0.010978  0.010835  0.004708   0.07337   -0.194772   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch  negligible  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.000206)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE    1.6625  0.007875  0.007668  0.002486  0.064612         0.0   
Optuna-MO       2.0250  0.008781  0.008748  0.003549  0.063016    -0.07427   
RandomSearch    2.3125  0.011123  0.010673  0.005451  0.060899   -0.235728   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch       small  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.000133)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE  1.658537  0.007844  0.007586  0.002258  0.037349         0.0   
Optuna-MO     2.024390  0.009226  0.009157  0.003648  0.052571   -0.110899   
RandomSearch  2.317073  0.011237  0.010404  0.004708  0.051639   -0.251354   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch       small  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.003366)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE  1.741176  0.007567  0.007364  0.002486  0.044524         0.0   
Optuna-MO     2.000000  0.008597   0.00856  0.003011  0.052336   -0.087015   
RandomSearch  2.258824  0.011200    0.0111  0.005451  0.060729   -0.260192   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch       small  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.000188)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE  1.682353  0.007503  0.007291  0.002258  0.046377         0.0   
Optuna-MO     2.000000  0.009487   0.00945  0.004593  0.053638   -0.158516   
RandomSearch  2.317647  0.011200  0.010888  0.005519  0.060899   -0.269132   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch       small  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.000097)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE  1.628205  0.007218  0.006934  0.001131  0.034136         0.0   
Optuna-MO     2.064103  0.008584  0.008552  0.002491  0.052336   -0.118288   
RandomSearch  2.307692  0.010200    0.0096  0.002461  0.047586   -0.240151   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch       small  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.000055)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE      1.65  0.007529  0.007382  0.002258  0.044524         0.0   
Optuna-MO         2.00  0.010626  0.010591  0.004593  0.052336   -0.228806   
RandomSearch      2.35  0.012569  0.012107  0.005519  0.060899   -0.339016   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO          small  
RandomSearch       small  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.000340)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE  1.657895  0.007484  0.007162  0.001258  0.037349         0.0   
Optuna-MO     2.039474  0.009894  0.009795  0.003011  0.058713   -0.189477   
RandomSearch  2.302632  0.011087  0.010469  0.003702  0.051639   -0.270959   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch       small  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.000703)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE  1.714286  0.007476  0.007264  0.001327  0.034136         0.0   
Optuna-MO     1.961039  0.008571  0.008534  0.003549  0.051167   -0.093206   
RandomSearch  2.324675  0.010683  0.010583  0.002854  0.051639   -0.238346   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch       small  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.000031)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE  1.643678  0.007352  0.007049  0.001258   0.03076         0.0   
Optuna-MO     2.022989  0.008966  0.008873   0.00268  0.052571   -0.135797   
RandomSearch  2.333333  0.010496  0.010097  0.003702  0.045352   -0.243499   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch       small  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.000137)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE  1.717647  0.008545  0.008333  0.003887  0.064612         0.0   
Optuna-MO     1.929412  0.009795  0.009702  0.004852  0.052571   -0.093261   
RandomSearch  2.352941  0.013045  0.012646  0.007458  0.068263   -0.283457   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch       small  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.002263)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE  1.701299  0.007491    0.0074  0.001327  0.044524         0.0   
Optuna-MO     2.038961  0.008966  0.008933  0.002813  0.053638   -0.121237   
RandomSearch  2.259740  0.011200    0.0111  0.002461  0.060729   -0.265182   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch       small  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.000546)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE    1.6875  0.007897   0.00764  0.002486  0.062983         0.0   
Optuna-MO       2.0125  0.009894  0.009862  0.004593  0.058713   -0.152642   
RandomSearch    2.3000  0.011555  0.011411  0.005519  0.060899   -0.254032   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch       small  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.003504)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE  1.807692  0.008578  0.008321  0.003283  0.064612         0.0   
Optuna-MO     1.884615  0.009641  0.009576  0.004593  0.053638   -0.079914   
RandomSearch  2.307692  0.013190  0.012591  0.007458   0.07337   -0.291523   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch       small  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


Rejecting null hypothesis that data is normal for column Nevergrad-DE (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column Optuna-MO (p=0.000000<0.016667)
Rejecting null hypothesis that data is normal for column RandomSearch (p=0.000000<0.016667)
Using Friedman test as omnibus test
Rejecting null hypothesis that there is no difference between the distributions (p=0.000127)
Using Nemenyi post-hoc test. Differences are significant,if the distance between the mean ranks is greater than the critical distance.
              meanrank    median       mad  ci_lower  ci_upper effect_size  \
Nevergrad-DE  1.645570  0.007503  0.007412  0.001258   0.03076         0.0   
Optuna-MO     2.037975  0.008966  0.008935  0.002813  0.053638   -0.120181   
RandomSearch  2.316456  0.010683  0.010496  0.003702  0.051639   -0.236084   

               magnitude  
Nevergrad-DE  negligible  
Optuna-MO     negligible  
RandomSearch       small  


  if abs(sorted_ranks[i] - sorted_ranks[j]) <= critical_difference:


In [None]:
R = []
for i, (k, v) in enumerate(rank_results.items()):
    d = pd.DataFrame({
        "scenario": k[0],
        "set": k[1],
        **v.rankdf["meanrank"]
    }, index=[i]
    ).melt(id_vars=["scenario", "set"], var_name="optimizer_id", value_name="meanrank")
    d["order"] = d.rank(method="max", numeric_only=True).astype(int)
    R.append(d)
    # break

df_rank = pd.concat(R).reset_index(drop=True)
df_rank = df_rank[df_rank["set"] != "full"]
def rename(x: str) -> str:
    if x in ["dev", "test"]:
        return f"discrepancy_{x}"
    return x
df_rank["set"] = df_rank["set"].apply(rename)
df_rank

Unnamed: 0,scenario,set,optimizer_id,meanrank,order
0,multi-objective,split_0_seed_0_subset_dev,Nevergrad-DE,1.633333,1
1,multi-objective,split_0_seed_0_subset_dev,Optuna-MO,2.077778,2
2,multi-objective,split_0_seed_0_subset_dev,RandomSearch,2.288889,3
3,multi-objective,split_0_seed_0_subset_test,Nevergrad-DE,1.673684,1
4,multi-objective,split_0_seed_0_subset_test,Optuna-MO,2.031579,2
...,...,...,...,...,...
175,multi-objective,split_5_seed_4_subset_dev,Optuna-MO,1.884615,2
176,multi-objective,split_5_seed_4_subset_dev,RandomSearch,2.307692,3
177,multi-objective,split_5_seed_4_subset_test,Nevergrad-DE,1.645570,1
178,multi-objective,split_5_seed_4_subset_test,Optuna-MO,2.037975,2


In [None]:

original_optimizers = {
    "blackbox": ["RandomSearch", "SMAC3-BlackBoxFacade", "Nevergrad-CMA-ES"],
    "multi-objective": ["RandomSearch", "Optuna-MO", "Nevergrad-DE"],
    "multi-fidelity": ["SMAC3-Hyperband", "SMAC3-MultiFidelityFacade", "DEHB"],
    "multi-fidelity-objective": ["RandomSearch", "SMAC3-MOMF-GP", "Nevergrad-DE"],
}


is_different = {}
for scenario, gdf in df_rank.groupby(by="scenario"):
    set_origins = gdf["set"].apply(lambda x: "_".join(x.split("_")[:-2])).unique()   
    for set_origin in set_origins:
        origs = original_optimizers[scenario]
        df_dev = gdf[gdf["set"] == f"{set_origin}_subset_dev"]
        df_test = gdf[gdf["set"] == f"{set_origin}_subset_test"]
        order_dev = []
        order_test = []
        for orig in origs:
            order_dev.append(df_dev[df_dev["optimizer_id"] == orig]["order"].values[0])
            order_test.append(df_test[df_test["optimizer_id"] == orig]["order"].values[0])
        _is_different = order_dev != order_test
        is_different[(scenario, set_origin)] = _is_different
is_different

{('multi-objective', 'split_0_seed_0'): False,
 ('multi-objective', 'split_0_seed_1'): False,
 ('multi-objective', 'split_0_seed_2'): False,
 ('multi-objective', 'split_0_seed_3'): False,
 ('multi-objective', 'split_0_seed_4'): False,
 ('multi-objective', 'split_1_seed_0'): False,
 ('multi-objective', 'split_1_seed_1'): False,
 ('multi-objective', 'split_1_seed_2'): False,
 ('multi-objective', 'split_1_seed_3'): False,
 ('multi-objective', 'split_1_seed_4'): False,
 ('multi-objective', 'split_2_seed_0'): False,
 ('multi-objective', 'split_2_seed_1'): False,
 ('multi-objective', 'split_2_seed_2'): False,
 ('multi-objective', 'split_2_seed_3'): False,
 ('multi-objective', 'split_2_seed_4'): False,
 ('multi-objective', 'split_3_seed_0'): False,
 ('multi-objective', 'split_3_seed_1'): False,
 ('multi-objective', 'split_3_seed_2'): False,
 ('multi-objective', 'split_3_seed_3'): False,
 ('multi-objective', 'split_3_seed_4'): False,
 ('multi-objective', 'split_4_seed_0'): False,
 ('multi-obje

In [None]:
import pandas as pd
import numpy as np



fn_template = "ranks_per_set_{scenario}.csv"

decimal_places = 2

final_str = r"""
\begin{{table}}[h]
    \caption{{{caption}}}
    \label{{{label}}}
    \centering
    %\resizebox{{0.4\textwidth}}{{!}}{{
    {table_string}
    %}}
\end{{table}}
"""

float_format = lambda x: ("{:0." + str(decimal_places) + "f}").format(x) if not np.isnan(x) else "-"


for scenario, gdf in df_rank.groupby("scenario"):
    fn = fn_template.format(scenario=scenario)
    sorter = gdf[gdf["set"]=="dev"].sort_values("meanrank")["optimizer_id"].to_list()

    R = gdf.pivot_table(index="set", columns="optimizer_id", values="order").map(int)
    origs = original_optimizers[scenario]
    origs.sort(key=lambda x: sorter.index(x))
    cols = origs + [c for c in R.columns if c not in original_optimizers[scenario]]
    R = R[cols]


    MR = gdf.pivot_table(index="set", columns="optimizer_id", values="meanrank").map(lambda x: f"{x:.2f}" if not isinstance(x, str) else x)
    MR = MR[cols]
    for i, ((idx, row), (idx2, row2)) in enumerate(zip(MR.iterrows(), R.iterrows())):
        for j in range(len(row)):
            row.iloc[j] = row.iloc[j] + f" ({int(row2.iloc[j])})"

    print(MR)
    # table_str = MR.to_latex(float_format=float_format, na_rep="-").strip()
    # caption = f"Mean Ranking for Scenario {scenario}"
    # label = f"tab:ranking_validation_{scenario}"
    # table_str = final_str.format(table_string=table_str, label=label, caption=caption)
    # table_str = table_str.replace("_", "\_")

    # with open(fn + ".tex", "w") as file:
    #     file.write(table_str)
    # print(table_str)

ValueError: 'RandomSearch' is not in list

In [None]:
# from pathlib import Path
# import pandas as pd
# import numpy as np
# from sklearn.model_selection import StratifiedShuffleSplit, ShuffleSplit
# from carps.utils.pareto_front import pareto

# def read_set(fn: Path) -> pd.DataFrame:
#     df = pd.read_csv(fn)
#     df["problem_id"] = df["problem_id"].apply(lambda x: "bbob/" + x if x.startswith("noiseless") else x)
#     df["benchmark_id"] = df["problem_id"].apply(lambda x: x.split("/")[0])
#     df = df.melt(id_vars=["problem_id", "benchmark_id"], value_vars=p_cols, var_name="optimizer_id", value_name="performance")    
#     return df

# performance_fn = Path("/scratch/hpc-prf-intexml/cbenjamins/repos/CARP-S-Experiments/lib/CARP-S/subselection/data/BB/default/df_crit.csv")
# performance = pd.read_csv(performance_fn)
# performance["problem_id"] = performance["problem_id"].apply(lambda x: "bbob/" + x if x.startswith("noiseless") else x)
# problem_ids = list(performance["problem_id"])
# n_problems = len(problem_ids)
# subset_size = 30

# rankings = []

# performance["benchmark_id"] = performance["problem_id"].apply(lambda x: x.split("/")[0])
# p_cols = [c for c in performance.columns if c not in ["problem_id", "benchmark_id"]]
# performance = performance.melt(id_vars=["problem_id", "benchmark_id"], value_vars=p_cols, var_name="optimizer_id", value_name="performance")


# path_subset_dev = performance_fn.parent / f"subset_{subset_size}.csv"
# subset_dev = read_set(path_subset_dev)

# path_subset_test = performance_fn.parent / f"subset_complement_subset_{subset_size}.csv"
# subset_test = read_set(path_subset_test)

# problem_ids_dev = subset_dev["problem_id"].to_list()
# problem_ids_test = subset_test["problem_id"].to_list()

# seeds = np.arange(0, 10)
# performance["benchmark_id"] = performance["problem_id"].apply(lambda x: x.split("/")[0])
# benchmark_ids = list(performance["benchmark_id"].unique())


# def add_rank(df: pd.DataFrame) -> pd.DataFrame:
#     D = []
#     for problem_id, gdf in df.groupby(["problem_id"]):
#         pivot = gdf.pivot(index=["problem_id", "benchmark_id"], columns="optimizer_id", values="performance").reset_index()
#         if not all([p in pivot.columns for p in p_cols]):
#             print(gdf.columns)
#             continue
#         rank = pivot[p_cols].rank(axis=1).melt(value_vars=p_cols, var_name="optimizer_id", value_name="rank")
#         gdf = pd.merge(gdf, rank, on=["optimizer_id"])
#         D.append(gdf)
#     D = pd.concat(D).reset_index(drop=True)
#     return D

# def get_ranking(df: pd.DataFrame, set_id: str, split: int = -1, seed: int = -1) -> pd.DataFrame:
#     df = add_rank(df)
#     df["set"] = set_id
#     df["split"] = split
#     df["seed"] = seed
#     return df


# rankings = []

# aggregate_func = np.mean

# mean_ranking_full = get_ranking(performance, "full")
# rankings.append(mean_ranking_full)

# mean_ranking_dev = get_ranking(subset_dev, "discrepancy_dev")
# rankings.append(mean_ranking_dev)

# mean_ranking_test = get_ranking(subset_test, "discrepancy_test")
# rankings.append(mean_ranking_test)  

# n_splits = n_problems // (subset_size * 2)
# print(n_splits)
# X = performance.values
# y = performance["benchmark_id"].to_numpy()

# split_classes = [StratifiedShuffleSplit, ShuffleSplit]

# # for split_class in split_classes:
# #     for seed in seeds:
# #         sss = StratifiedShuffleSplit(n_splits=n_splits, test_size=0.5, random_state=seed)
# #         sss.get_n_splits(X, y)
# #         for i, (train_index, test_index) in enumerate(sss.split(X, y)):
# #             D_dev = performance.iloc[train_index]
# #             D_dev = get_ranking(D_dev, f"{split_class.__name__}_dev", i, seed)
# #             rankings.append(D_dev)

# #             D_test = performance.iloc[test_index]
# #             D_test = get_ranking(D_dev, f"{split_class.__name__}_dev", i, seed)
# #             rankings.append(D_test)


# df_ranking = pd.concat(rankings).reset_index(drop=True)

In [None]:
# import seaborn as sns
# import matplotlib.pyplot as plt

# sns.set_style("whitegrid")
# sns.set_palette("colorblind")

# gdf = df_ranking
# fig = plt.figure(figsize=(10,4))
# ax = fig.add_subplot(111)
# ax = sns.violinplot(data=gdf, x="optimizer_id", y="rank", hue="set", cut=0, ax=ax)
# # ax.set_title(f"use_mean: {gid}")
# plt.show()

In [None]:
# gdf = df_ranking
# fig = plt.figure(figsize=(10,4))
# ax = fig.add_subplot(111)
# ax = sns.histplot(data=gdf, x="performance", hue="optimizer_id", ax=ax,  element="step")
# ax = fig.add_subplot(211)
# ax = sns.histplot(data=gdf, x="rank", hue="optimizer_id", ax=ax,  element="step")
# # ax.set_title(f"use_mean: {gid}")
# plt.show()

In [None]:
# def distance_bhattacharaya(p, q):
#     return -np.log(np.sum(np.sqrt(p*q)))

# def distance_kullback_leibler(p, q):
#     return np.sum(p * np.log(p / q))

# # Performance
# distances = {}
# bin_edges = None
# for gid, gdf in df_ranking.groupby(["optimizer_id", "set"]):
#     values = gdf["performance"].values
#     q = np.histogram(values, bins=20, range=[0,1], density=True)
#     bin_edges = q[1]
#     distances[gid] = q[0]

# distributions = pd.DataFrame(distances).T
# distributions.index.name = ("optimizer_id", "set")
# distributions = distributions.reset_index(names=["optimizer_id", "set"])
# # print(distributions.columns)

# dist = {}
# for gid, gdf in distributions.groupby(by=["optimizer_id", "set"]):
#     for gid2, gdf2 in distributions.groupby(by=["optimizer_id", "set"]):
#         if gid[0] != gid2[0]:
#             continue
#         p = gdf.iloc[0].values[2:].astype(float)
#         q = gdf2.iloc[0].values[2:].astype(float)
#         d = distance_bhattacharaya(p, q)
#         # print(f"{gid} vs {gid2}: {d}")
#         dist[(gid, gid2)] = d

# heatmaps = {}
# n_sets = distributions["set"].nunique()
# sets = distributions["set"].unique()
# for ((opt, set_id), (opt, set_id2)), distance in dist.items():
#     # print(f"{opt} {set_id} vs {set_id2}: {distance}") 
#     if not opt in heatmaps:
#         heatmaps[opt] = np.zeros((n_sets, n_sets))
#     heatmaps[opt][np.where(sets == set_id)[0][0], np.where(sets == set_id2)[0][0]] = distance

# fig = plt.figure(figsize=(12, 4))
# axes = fig.subplots(1, len(heatmaps), sharex=True, sharey=True)
# for i, (opt, heatmap) in enumerate(heatmaps.items()):
#     ax = axes[i]
#     sns.heatmap(heatmap, ax=ax, xticklabels=sets, yticklabels=sets)
#     ax.set_title(opt)
# plt.show()

In [None]:


 
# for use_mean in [False]:
#     aggregate_func = lambda x: x
#     if use_mean:
#         aggregate_func = np.mean

    

#     mean_ranking_full = performance[p_cols].rank(axis=1).apply(aggregate_func)
#     mean_ranking_full = performance[p_cols]
#     mean_ranking_full["problem_id"] = performance["problem_id"] if not use_mean else "mean_problem"
#     mean_ranking_full["set"] = "full"
#     mean_ranking_full["split"] = -1
#     mean_ranking_full["seed"] = -1
#     mean_ranking_full["use_mean"] = use_mean
#     mean_ranking_full = melt(mean_ranking_full)
#     mean_ranking_full = add_performance(mean_ranking_full)
    

#     mean_ranking_dev = subset_dev[p_cols].rank(axis=1).apply(aggregate_func)
#     mean_ranking_dev["problem_id"] = subset_dev["problem_id"] if not use_mean else "mean_problem"
#     mean_ranking_dev["set"] = "discrepancy_dev"
#     mean_ranking_dev["split"] = -1
#     mean_ranking_dev["seed"] = -1
#     mean_ranking_dev["use_mean"] = use_mean
#     mean_ranking_dev = melt(mean_ranking_dev)
#     mean_ranking_dev = add_performance(mean_ranking_dev)

#     mean_ranking_test = subset_test[p_cols].rank(axis=1).apply(aggregate_func)
#     mean_ranking_test["problem_id"] = subset_test["problem_id"] if not use_mean else "mean_problem"
#     mean_ranking_test["set"] = "discrepancy_test"
#     mean_ranking_test["split"] = -1
#     mean_ranking_test["seed"] = -1
#     mean_ranking_test["use_mean"] = use_mean
#     mean_ranking_test = melt(mean_ranking_test)
#     mean_ranking_test = add_performance(mean_ranking_test)

#     # easiest
#     costs = performance[p_cols].values
#     ids = pareto(-costs)
#     pareto_performance = performance.iloc[ids]
#     mean_ranking_pareto_easiest = pareto_performance[p_cols].rank(axis=1).apply(aggregate_func)
#     mean_ranking_pareto_easiest["problem_id"] = pareto_performance["problem_id"] if not use_mean else "mean_problem"
#     mean_ranking_pareto_easiest["set"] = "pareto_full_easiest"
#     mean_ranking_pareto_easiest["split"] = -1
#     mean_ranking_pareto_easiest["seed"] = -1
#     mean_ranking_pareto_easiest["use_mean"] = use_mean
#     mean_ranking_pareto_easiest = melt(mean_ranking_pareto_easiest)
#     mean_ranking_pareto_easiest = add_performance(mean_ranking_pareto_easiest)

#     # hardest
#     costs = performance[p_cols].values
#     ids = pareto(costs)
#     pareto_performance = performance.iloc[ids]
#     mean_ranking_pareto_hardest = pareto_performance[p_cols].rank(axis=1).apply(aggregate_func)
#     mean_ranking_pareto_hardest["problem_id"] = pareto_performance["problem_id"] if not use_mean else "mean_problem"
#     mean_ranking_pareto_hardest["set"] = "pareto_full_hardest"
#     mean_ranking_pareto_hardest["split"] = -1
#     mean_ranking_pareto_hardest["seed"] = -1
#     mean_ranking_pareto_hardest["use_mean"] = use_mean
#     mean_ranking_pareto_hardest = melt(mean_ranking_pareto_hardest)
#     mean_ranking_pareto_hardest = add_performance(mean_ranking_pareto_hardest)



#     ranking = []
#     ranking.append(mean_ranking_full)
#     ranking.append(mean_ranking_dev)
#     ranking.append(mean_ranking_test)
#     ranking.append(mean_ranking_pareto_easiest)
#     ranking.append(mean_ranking_pareto_hardest)

#     n_splits = n_problems // (subset_size * 2)
#     print(n_splits)
#     X = performance.values
#     y = performance["benchmark_id"].to_numpy()

#     split_classes = [StratifiedShuffleSplit, ShuffleSplit]

#     for split_class in split_classes:
#         for seed in seeds:
#             sss = StratifiedShuffleSplit(n_splits=n_splits, test_size=0.5, random_state=seed)
#             sss.get_n_splits(X, y)
#             for i, (train_index, test_index) in enumerate(sss.split(X, y)):
#                 D_dev = performance.iloc[train_index][p_cols].rank(axis=1).apply(aggregate_func)
#                 D_dev["problem_id"] = performance.iloc[train_index]["problem_id"] if not use_mean else "mean_problem"
#                 D_dev["set"] = f"{split_class.__name__}_dev"
#                 D_dev["split"] = i
#                 D_dev["seed"] = seed
#                 D_dev["use_mean"] = use_mean
#                 D_dev = melt(D_dev)
#                 D_dev = add_performance(D_dev)
                
#                 D_test = performance.iloc[test_index][p_cols].rank(axis=1).apply(aggregate_func)
#                 D_test["problem_id"] = performance.iloc[test_index]["problem_id"] if not use_mean else "mean_problem"
#                 D_test["set"] = f"{split_class.__name__}_test"
#                 D_test["split"] = i
#                 D_test["seed"] = seed
#                 D_test["use_mean"] = use_mean
#                 D_test = melt(D_test)
#                 D_test = add_performance(D_test)
#                 ranking.extend([D_dev, D_test])

#     df_ranking = pd.DataFrame(ranking) if use_mean else pd.concat(ranking).reset_index(drop=True)
#     df_ranking = df_ranking.melt(id_vars=["set", "split", "seed", "problem_id", "use_mean"], value_vars=p_cols, var_name="optimizer_id", value_name="rank")
#     rankings.append(df_ranking)

# df_ranking = pd.concat(rankings).reset_index(drop=True)
# df_ranking
