# How to select a subset

Candidate subsets differ in their size (k) and in whether the log transform was applied; this notebook compares them and selects one subset per scenario.

In [None]:
# Optimizer ids considered for each scenario. The ranking code further down
# looks up the allowed optimizers for a scenario via `original_optimizers[scenario]`.
original_optimizers = {
    "blackbox": ["RandomSearch", "SMAC3-BlackBoxFacade", "Nevergrad-CMA-ES"],
    "multi-objective": ["RandomSearch", "Optuna-MO-TPE", "Nevergrad-DE"],
    "multi-fidelity": ["SMAC3-Hyperband", "SMAC3-MultiFidelityFacade", "DEHB"],
    "multi-fidelity-objective": ["RandomSearch", "SMAC3-MOMF-GP", "Nevergrad-DE"],
}

# Collect Subsets

In [None]:
from pathlib import Path
import pandas as pd
from ast import literal_eval
from create_subset_configs import fix_legacy_task_id

def parse_stats_str(stats_str: str) -> dict:
    """Parse a ``key=value, key=value`` stats line into a dictionary.

    Each comma-separated item is split at its first ``=``; the right-hand side
    is evaluated with ``ast.literal_eval`` so numbers, booleans, quoted strings
    etc. come back as Python objects.

    Args:
        stats_str: The raw line, optionally terminated by a newline.

    Returns:
        Mapping from stat name to its parsed value. Empty input (or empty
        items such as the one produced by a trailing comma) yields no entries.

    Raises:
        ValueError: If a non-empty item contains no ``=``, or if
            ``literal_eval`` rejects a value.
        SyntaxError: If a value is not a valid Python literal.

    Note:
        Items are separated on every comma, so values that themselves contain
        commas (e.g. list literals) are not supported.
    """
    stats = {}
    for item in stats_str.strip("\n").split(","):
        item = item.strip(" ")
        if not item:
            # Tolerate empty input and trailing/double commas.
            continue
        # Split on the first "=" only so values may contain "=" themselves.
        key, separator, raw_value = item.partition("=")
        if not separator:
            raise ValueError(f"Expected 'key=value' but got {item!r} in stats string {stats_str!r}.")
        stats[key.strip(" ")] = literal_eval(raw_value.strip(" "))
    return stats

# NOTE: `path_task_type` is assigned three times in a row, so only the last
# value ("data_subselection/MOMF/lognorm") is in effect after this block.
path_task_type = Path("data/2024_11")

path_task_type = Path("data_subselection/MF/lognorm")
path_task_type = Path("data_subselection/MOMF/lognorm")

# NOTE(review): the `load_subsets(...)` calls further down pass their arguments
# explicitly, so these module-level values look like scratch settings — confirm
# before relying on them.
is_df_crit_log = True
scenario = "momf"

def load_subsets(path_task_type: str | Path, is_df_crit_log: bool, scenario: str) -> pd.DataFrame:
    if isinstance(path_task_type, str):
        path_task_type = Path(path_task_type)
    data = []
    # gather old style:
    csv_files = [f for f in path_task_type.glob("*.csv") if f.name.startswith("subset")]
    for subset_file in csv_files:
        subset = pd.read_csv(subset_file)
        if "complement" in subset_file.name and not "complement_subset" in subset_file.name:
            continue
        if "problem_id" in subset.columns:
            subset = subset.rename(columns={"problem_id": "task_id"})

        subset_id = "dev" if "complement" not in subset_file.name else "test"

        subset["task_id"] = subset["task_id"].apply(fix_legacy_task_id)
        task_ids = list(subset["task_id"])
        txt_file = subset_file.with_suffix(".txt")
        if txt_file.exists():
            first_line = txt_file.read_text().split("\n")[0]
            stats = parse_stats_str(first_line)
        else:
            info_csv = subset_file.parent / "info.csv"
            all_stats = pd.read_csv(info_csv)
            stats_subset_id = "s1" if subset_id == "dev" else "s2"
            k_from_fn = subset_file.stem.split("_")[-1]
            all_stats["k"] = all_stats["k"].astype(int)
            stats = all_stats[(all_stats["which"]==stats_subset_id) & (all_stats["k"] == int(k_from_fn))].to_dict(orient="records")[0]
            del stats["which"]

        info = {
            "subset_id": subset_id,
            "scenario": scenario,
            "is_df_crit_log": is_df_crit_log,
            "task_ids": task_ids,
            **stats,
        }
        data.append(info)
    subset_data = pd.DataFrame(data)
    return subset_data


def load_subsets_v2():
    path = Path("data/2024_11")

    columns_bb = ["task_id", "RandomSearch", "SMAC3-BlackBoxFacade", "Nevergrad-CMA-ES"]
    columns_mo = ["task_id", "RandomSearch", "Optuna-MO-TPE", "Nevergrad-DE"]


    result_files = list(path.glob("*.txt"))
    result_files.sort()

    results = []
    for result_file in result_files:
        with result_file.open() as f:
            lines = f.readlines()
            # print(lines[0])
            subset_raw = "".join(lines[2:]).replace(" ", ",")
            stats_raw = lines[0].strip("\n").split(",")
            stats_raw = [x.strip(" ") for x in stats_raw]
            stats_raw = [x.split("=") for x in stats_raw]
            stats = {x[0]: literal_eval(x[1]) for x in stats_raw}

            stem = result_file.stem
            scenario = stem.split("_")[0]
            scenario = scenario.replace("Mo", "MO")
            is_df_crit_log = "log" in stem
            subset_id = "test" if "TEST" in stem else "dev"
            stats["subset_id"] = subset_id
            stats["scenario"] = scenario
            stats["is_df_crit_log"] = is_df_crit_log
            result = {
                "scenario": scenario,
                "is_df_crit_log": is_df_crit_log,
                "stats": stats,
                "subset": subset_raw,
                "subset_id": subset_id
            }

            subset_fn = path / f"{scenario}_{is_df_crit_log}_{subset_id}_{stats['k']}.csv"
            cols = columns_mo if "MO" in scenario else columns_bb
            col_line = ",".join(cols)
            subset_raw = col_line + "\n" + subset_raw
            with open(subset_fn, "w") as f:
                f.write(subset_raw)


            # print(result)
        results.append(result)
    results = pd.DataFrame(results)

    stats_df = pd.DataFrame(results["stats"].tolist())
    stats_df["task_ids"] = results["subset"].apply(lambda x: [l.split(",")[0] for l in x.split("\n") if l])
    stats_df["task_ids"] = stats_df["task_ids"].apply(lambda x: [fix_legacy_task_id(xk) for xk in x])
    stats_df.loc[stats_df["scenario"] == "BBv2", "scenario"] = "blackbox"
    stats_df.loc[stats_df["scenario"] == "MOv2", "scenario"] = "multi-objective"
    return stats_df


# Multi-fidelity and multi-fidelity-objective subsets are read from their
# `subset*.csv` folders; both are flagged as log-transformed (second argument).
subset_data_mf = load_subsets("data_subselection/MF/lognorm", True, "multi-fidelity")
subset_data_momf = load_subsets("data_subselection/MOMF/lognorm", True, "multi-fidelity-objective")
# The remaining subsets are parsed from the `.txt` result files instead.
subset_data_bb_mo = load_subsets_v2()

# Stack everything into one table and persist it; later cells read this CSV back.
subset_data = pd.concat([subset_data_mf, subset_data_momf, subset_data_bb_mo])
subset_data.to_csv("subset_data.csv", index=False)

# Calculate Ranks and Significances of Subselections

In [None]:
import ast

import pandas as pd
from autorank._util import get_sorted_rank_groups
from carps.analysis.plot_ranking import calc_critical_difference
from create_subset_configs import fix_legacy_task_id

# Load source data
# One performance table per scenario, each with a "problem_id" column and one column per optimizer.
# NOTE(review): the MOMF table is read from ".../MOMF/default/" while the MOMF
# subsets are collected from ".../MOMF/lognorm/" — confirm this is intended.
df_crit_fns = [
    "data/BBv2_norm/df_crit.csv",
    "data/MOv2_norm/df_crit.csv",
    "data_subselection/MOMF/default/df_crit.csv",
    "data_subselection/MF/lognorm/df_crit.csv",
]
# NOTE(review): `task_types` is not referenced anywhere else in the visible code.
task_types = ["blackbox", "multi-objective", "multi-fidelity-objective", "multi-fidelity"]
# Reshape every table to long format (task_id, optimizer_id, performance) and stack them.
df_crit = pd.concat([pd.read_csv(fn).melt(id_vars=["problem_id"], var_name="optimizer_id", value_name="performance") for fn in df_crit_fns]).rename({"problem_id": "task_id"}, axis=1)
# Bring legacy task ids to the current naming so they match the ids in subset_data.
df_crit["task_id"] = df_crit["task_id"].apply(fix_legacy_task_id)
# df_crit = df_crit.set_index("task_id")
# Written for inspection; this file is not read back in the visible code.
df_crit.to_csv("tmp_df_crit.csv", index=False)

# Re-read from disk: in the CSV the `task_ids` column holds string-encoded
# lists, which is why they are parsed with `ast.literal_eval` below.
subset_data = pd.read_csv("subset_data.csv")

def calc_ranks(rundata: pd.DataFrame, scenario: str, set_id: str, perf_col: str = "trial_value__cost_inc_norm"):
    """Run the critical-difference analysis for one subset and return its result.

    Thin wrapper around ``calc_critical_difference`` that never recomputes the
    performance table (``calc_df_crit=False``) and never plots
    (``plot_diagram=False``).

    Args:
        rundata: Performance table passed through to ``calc_critical_difference``
            unchanged. The caller in this notebook passes a table with one row
            per task and one column per optimizer.
        scenario: Scenario name; used only to build the identifier.
        set_id: Subset name (e.g. ``"dev"``/``"test"``); used only to build the
            identifier.
        perf_col: Name of the performance column forwarded to
            ``calc_critical_difference``.

    Returns:
        The result object of ``calc_critical_difference``. The notebook reads
        its ``pvalue``, ``alpha`` and ``rankdf`` attributes.
    """
    identifier = f"{scenario}_{set_id}"
    # The sorted rank groups were previously computed here and discarded;
    # only the raw result is needed by the caller.
    return calc_critical_difference(
        rundata,
        identifier=identifier,
        figsize=(8, 3),
        perf_col=perf_col,
        calc_df_crit=False,
        plot_diagram=False,
    )

# Build one decision row per (scenario, log flag, k): the summed discrepancy of
# its subsets, whether each subset shows significant differences, and whether
# dev and test yield the same rank order.
decision_data = []
for (scenario, is_df_crit_log, k), gdf in subset_data.groupby(["scenario", "is_df_crit_log", "k"]):
    # Sum of the "discrepancy" column over all subset rows of this group.
    discrepancy_sum = gdf["discrepancy"].sum()

    is_significant = {}
    # NOTE(review): `rank` is filled below but never read in the visible code.
    rank = {}
    rankorder = {}
    for set_id, sgdf in gdf.groupby("subset_id"):
        allowed_optimizers = original_optimizers[scenario]
        # task_ids is a string-encoded list (subset_data was read from CSV);
        # only the first row of the subset group is used.
        allowed_task_ids = ast.literal_eval(sgdf["task_ids"].iloc[0])
        # NOTE(review): the selection is by optimizer and task id only — not by
        # scenario or by `is_df_crit_log`; confirm task ids are unique across scenarios.
        rundata_subset = df_crit[
            # (rundata["task_type"] == scenario)
            (df_crit["optimizer_id"].isin(allowed_optimizers))
            & (df_crit["task_id"].isin(allowed_task_ids))        ]
        # Back to wide format: one row per task, one column per optimizer.
        rundata_subset = rundata_subset.pivot_table(index="task_id", columns="optimizer_id", values="performance")
        ranks = calc_ranks(rundata_subset, scenario, set_id)
        # from rich import inspect
        # inspect(ranks)
        is_significant["is_significant_" + set_id] = bool(ranks.pvalue < ranks.alpha)
        rank["rank_" + set_id] = ranks.rankdf["meanrank"]
        rankorder["rankorder_" + set_id] = ranks.rankdf["meanrank"].rank(ascending=True)
    # NOTE(review): raises KeyError if a group lacks either the "dev" or the "test" subset.
    rankorder_is_same = rankorder["rankorder_test"].equals(rankorder["rankorder_dev"])
    decision_data.append({
        "scenario": scenario,
        "is_df_crit_log": is_df_crit_log,
        "k": k,
        "discrepancy_sum": discrepancy_sum,
        "rankorder_is_same": rankorder_is_same,
        **is_significant
    })
# Persist the decision table; the next cell reads it back from this CSV.
decision_df = pd.DataFrame(decision_data)
decision_df.to_csv("decision_data.csv", index=False)
decision_df

# Decide

In [None]:
decision_df = pd.read_csv("decision_data.csv")
# Reload the subsets as well so this cell does not depend on earlier kernel
# state; read from CSV, `task_ids` stays a string-encoded list.
subset_data = pd.read_csv("subset_data.csv")

# Requirements/Filtering:
# (1) The rank order must stay the same between dev and test.
# (2) Dev and test must show significant differences based on the non-parametric test.
decision_df = decision_df[
    (decision_df["rankorder_is_same"] == True)
    & (decision_df["is_significant_dev"] == True)
    & (decision_df["is_significant_test"] == True)
]

# Decision Rule: Pick the k and log transformation with the lowest discrepancy sum of dev and test.
# `transform("min")` broadcasts each scenario's minimum back onto its rows, so
# the selection is a plain boolean mask: ties are kept, "scenario" stays a
# regular column and no MultiIndex is created (unlike `groupby(...).apply(...)`,
# whose handling of the grouping column is deprecated in recent pandas).
min_discrepancy = decision_df.groupby("scenario")["discrepancy_sum"].transform("min")
choices = (
    decision_df[decision_df["discrepancy_sum"] == min_discrepancy]
    .sort_values("scenario", kind="stable")  # same ordering as the grouped result
    .reset_index(drop=True)
)

# Add task ids from subset_data
# Create the columns up front (object dtype) so they exist even when no
# configuration passed the filter above.
for set_id in ["dev", "test"]:
    choices[f"task_ids_{set_id}"] = pd.Series(index=choices.index, dtype=object)
for idx, row in choices.iterrows():
    scenario = row["scenario"]
    is_df_crit_log = row["is_df_crit_log"]
    k = row["k"]
    gdf = subset_data[(subset_data["scenario"] == scenario) & (subset_data["is_df_crit_log"] == is_df_crit_log) & (subset_data["k"] == k)]
    for set_id in ["dev", "test"]:
        # First matching subset row; raises IndexError if the subset is missing.
        task_ids = gdf[gdf["subset_id"] == set_id]["task_ids"].iloc[0]
        choices.at[idx, f"task_ids_{set_id}"] = task_ids

choices.to_csv("choices.csv", index=False)
choices



In [None]:
# Create subset configs

from __future__ import annotations

from pathlib import Path

import fire
import numpy as np
import pandas as pd
from omegaconf import OmegaConf

def fix_legacy_task_id(task_id: str) -> str:
    """Translate a legacy task id into the current naming scheme.

    Ids starting with ``noiseless`` are prefixed with ``bbob/``. Afterwards the
    following substrings are rewritten, in this order, wherever they occur:

    * ``noiseless/``   -> removed
    * ``bb/tab/``      -> ``blackbox/tabular/``
    * ``MO/tab/``      -> ``multiobjective/tabular/``
    * ``hpobench/mf/`` -> ``hpobench/multifidelity/``

    Args:
        task_id: Task id in legacy or current notation.

    Returns:
        The task id with all legacy fragments replaced.
    """
    if task_id.startswith("noiseless"):
        task_id = "bbob/" + task_id

    legacy_to_current = (
        ("noiseless/", ""),
        ("bb/tab/", "blackbox/tabular/"),
        ("MO/tab/", "multiobjective/tabular/"),
        ("hpobench/mf/", "hpobench/multifidelity/"),
    )
    for legacy, current in legacy_to_current:
        task_id = task_id.replace(legacy, current)
    return task_id

def write_subset(task_ids: list[str], subset_id: str, scenario: str, config_target_path: str | Path = "carps/configs/task/subselection") -> None:
    """Write one task config per task id for a given subset of a scenario.

    The configs are written to
    ``<config_target_path>/<scenario without "-">/<subset_id>/subset_<task id>.yaml``.
    Each file is the original task config (looked up through ``index.csv``)
    with ``task_id`` replaced by ``{scenario}/{subset size}/{subset_id}/{original
    task id}`` and prefixed by a ``# @package _global_`` header plus
    ``task_type`` and ``subset_id`` entries.

    ``index.csv`` is expected two directory levels above the scenario folder
    (i.e. next to the parent of ``config_target_path``) and must provide the
    columns ``task_id`` and ``config_fn``.

    Args:
        task_ids: Task ids of the subset; legacy ids are converted first.
        subset_id: Name of the subset, used as folder name and in the new id.
        scenario: Scenario name, used as folder name (dashes removed), in the
            new id and as ``task_type``.
        config_target_path: Root folder for the generated subset configs.

    Raises:
        ValueError: If ``index.csv`` is missing or does not list one of the
            task ids.
    """
    config_target_path = (Path(config_target_path) / scenario.replace("-", "")).resolve()

    task_ids = [fix_legacy_task_id(task_id) for task_id in task_ids]
    subset_size = len(task_ids)

    index_fn = config_target_path.parent.parent / "index.csv"
    if not index_fn.is_file():
        raise ValueError(f"Could not find {index_fn}. ObjectiveFunction ids have not been indexed. Run `python -m carps.utils.index_configs`.")
    task_index = pd.read_csv(index_fn)

    # Build the lookup once; `setdefault` keeps the first entry of a duplicated task id.
    config_fn_by_task_id = {}
    for indexed_task_id, indexed_config_fn in zip(task_index["task_id"], task_index["config_fn"]):
        config_fn_by_task_id.setdefault(indexed_task_id, indexed_config_fn)

    not_found = [pid for pid in task_ids if pid not in config_fn_by_task_id]
    if not_found:
        raise ValueError(f"Could not find {not_found} in {index_fn}. ObjectiveFunction ids have not been indexed. Run `python -m carps.utils.index_configs`.")

    # Create the output folder only after validation, so a failed call leaves no empty folders.
    subset_dir = config_target_path / subset_id
    subset_dir.mkdir(exist_ok=True, parents=True)

    for task_id in task_ids:
        cfg = OmegaConf.load(config_fn_by_task_id[task_id])
        new_name = f"subset_{cfg.task_id}.yaml".replace("/", "_")
        cfg.task_id = f"{scenario}/{subset_size}/{subset_id}/{cfg.task_id}"
        yaml_str = OmegaConf.to_yaml(cfg)
        yaml_str = f"# @package _global_\ntask_type: {scenario}\nsubset_id: {subset_id}\n" + yaml_str
        # Explicit encoding so the written YAML does not depend on the platform default.
        (subset_dir / new_name).write_text(yaml_str, encoding="utf-8")


# Read the chosen subsets back; the task-id columns are stored as
# string-encoded lists and are turned into real lists with `ast.literal_eval`.
# NOTE(review): `ast` is not part of this cell's imports — it is only imported
# in an earlier cell; confirm when running this cell on its own.
choices = pd.read_csv("choices.csv")
choices["task_ids_dev"] = choices["task_ids_dev"].apply(ast.literal_eval)
choices["task_ids_test"] = choices["task_ids_test"].apply(ast.literal_eval)
# Write the dev and test configs for every chosen scenario.
# NOTE: the target path is given relative to the current working directory ("../carps/...").
for _, row in choices.iterrows():
    print(row)
    write_subset(row["task_ids_dev"], "dev", row["scenario"], config_target_path="../carps/configs/task/subselection")
    write_subset(row["task_ids_test"], "test", row["scenario"], config_target_path="../carps/configs/task/subselection")