# Combine the Runtime Measurements from the Experiments on All Datasets into One DataFrame

In [None]:
# Experiment names, one list per weighting setting. Each list covers a
# contiguous run of eight experiment numbers (the range end is exclusive).
experiments_with_weighting = ["exp-" + str(number) for number in range(118, 126)]
experiments_no_weighting = ["exp-" + str(number) for number in range(126, 134)]
experiments_0_1_weighting = ["exp-" + str(number) for number in range(134, 142)]
experiments_0_3_weighting = ["exp-" + str(number) for number in range(142, 150)]

## Papermill Parameters

In [None]:
# Root folder with one sub-folder of definition files per experiment
# (the dataset name is read from <experiment>/stage-00/global_params.yaml).
PARAM_DEFINITIONS_PATH = "../definitions"
# Root folder with the per-experiment outputs; the combined reports are
# written to a "meta_reports" folder below it.
PARAM_OUTPUT_PATH = "../outputs"

# Maps the label stored in the "weighting" column to the experiments whose
# results get that label. Experiments are read in this insertion order.
PARAM_WEIGHTINGS_EXPERIMENTS = {
    "1": experiments_with_weighting,
    "0.3": experiments_0_3_weighting,
    "0.1": experiments_0_1_weighting,
    "0": experiments_no_weighting,
}

# Stage folder and file name of the runtime statistics read for each experiment.
PARAM_INPUT_STAGE = "stage-06"
PARAM_INPUT_FILENAME = "statistics_merged_runtimes.csv"
# Only statistics rows whose "depth" column equals this value are used.
PARAM_DEPTH = 4

## Imports and Output Directories

In [None]:
import pandas as pd
import yaml
import os

# Folder layout for this report: <output root>/meta_reports/optimistic_estimates.
meta_reports_path = f"{PARAM_OUTPUT_PATH}/meta_reports"
out_dir = f"{meta_reports_path}/optimistic_estimates"

# makedirs creates every missing folder on the way (including meta_reports and
# the output root), and exist_ok=True makes re-running the notebook harmless
# without a racy "check, then create" sequence.
os.makedirs(out_dir, exist_ok=True)

## Load and Combine Results

In [None]:
def oe_to_string(oe):
    """Return a centred LaTeX table cell reading "On" or "Off".

    Any truthy ``oe`` gives the "On" cell, any falsy value the "Off" cell.
    """
    label = "On" if oe else "Off"
    return r"\multicolumn{1}{c}{" + label + "}"


def get_results(weighting_experiments):
    """Collect the runtime statistics of all given experiments in one DataFrame.

    For each experiment the dataset name is taken from its
    ``stage-00/global_params.yaml`` definition file, and the runtime statistics
    from ``PARAM_INPUT_FILENAME`` in its ``PARAM_INPUT_STAGE`` output folder.
    Only statistics rows whose ``depth`` equals ``PARAM_DEPTH`` are kept.

    Parameters
    ----------
    weighting_experiments : dict
        Maps a weighting label to an iterable of experiment names.

    Returns
    -------
    pandas.DataFrame
        Columns ``weighting``, ``dataset``, ``qf_name``,
        ``optimistic_estimate`` (converted with ``oe_to_string``),
        ``time_median`` and ``time_std``. An experiment without a statistics
        file contributes one row in which only ``weighting`` and ``dataset``
        are filled; a message naming the missing file is printed.

    Raises
    ------
    FileNotFoundError
        If the ``global_params.yaml`` of an experiment does not exist.
    """
    base_columns = ["weighting", "dataset"]
    result_columns = ["qf_name", "optimistic_estimate", "time_median", "time_std"]
    stage_dicts = []

    for weighting, experiments in weighting_experiments.items():
        for exp in experiments:
            # NOTE(review): FullLoader can build more than plain data types. That is
            # acceptable for project-owned definition files; switch to
            # yaml.safe_load if these files can ever come from an untrusted source.
            with open(f"{PARAM_DEFINITIONS_PATH}/{exp}/stage-00/global_params.yaml") as global_params_file:
                global_params = yaml.load(global_params_file, Loader=yaml.FullLoader)

            exp_base_row = {
                "weighting": weighting,
                "dataset": global_params["PARAM_DATASET_NAME"],
            }

            stage_path = f"{PARAM_OUTPUT_PATH}/{exp}/{PARAM_INPUT_STAGE}/{PARAM_INPUT_FILENAME}"
            # Only the read itself is guarded, so that errors while processing
            # the rows are never mistaken for a missing file.
            try:
                stage_df = pd.read_csv(stage_path)
            except FileNotFoundError:
                # Placeholder with the same columns as a real row, so the combined
                # table keeps one consistent schema even when results are missing.
                stage_dicts.append(exp_base_row | dict.fromkeys(result_columns))
                print(f"{stage_path} not found")
                continue

            for _, row in stage_df[stage_df["depth"] == PARAM_DEPTH].iterrows():
                row_part = {
                    "qf_name": row["qf_name"],
                    "optimistic_estimate": oe_to_string(row["optimistic_estimate"]),
                    "time_median": row["time_median"],
                    "time_std": row["time_std"],
                }
                stage_dicts.append(exp_base_row | row_part)

    # Naming the columns explicitly keeps the schema stable even for an empty result.
    return pd.DataFrame(stage_dicts, columns=base_columns + result_columns)


# Gather the results of all configured experiments and save the flat table
# first, so that the CSV keeps every field as an ordinary column.
all_outputs_df = get_results(PARAM_WEIGHTINGS_EXPERIMENTS)
combined_csv_path = f"{out_dir}/optimistic_estimate_runtimes_combined.csv"
all_outputs_df.to_csv(combined_csv_path, index=False)

# Index the table by the four fields that identify one runtime measurement.
all_outputs_df = all_outputs_df.set_index(["dataset", "qf_name", "weighting", "optimistic_estimate"])
all_outputs_df

## Turn into LaTeX Table Code

In [None]:
import numpy as np
import itertools
import matplotlib
import matplotlib.pyplot as plt

# Short display names of the quality functions; they form the first part of
# the table column keys and are looked up by the "qf_name" index value.
performance_measure_name_replacements = {
    "average_ranking_loss": "ARL",
    "roc_auc_score": "ROC AUC",
    "prc_auc_score": "PR AUC",
}


def str_format_time_median_speedup(this_median, this_std):
    """Format ``this_median`` as a fixed-point string with one decimal place.

    ``this_std`` is part of the signature but does not influence the result.
    """
    return format(this_median, ".1f")


# One dict of formatted table cells per dataset; the cell keys have the form
# "<measure>*<weighting>*<optimistic estimate>".
latex_time_median_dicts = {dataset: {} for dataset in all_outputs_df.index.levels[0]}

# Ratio std / median of every measurement, collected for the histogram.
std_over_median_list = []
# Measurements whose ratio exceeds this value are reported on stdout.
std_threshold = 0.1
for result_key in itertools.product(*all_outputs_df.index.levels):
    dataset, qf_name, weighting, optimistic_estimate = result_key
    # The product of the index levels also contains combinations without data.
    if result_key not in all_outputs_df.index:
        continue

    time_median_with_weighting = all_outputs_df.loc[result_key, "time_median"]
    time_std_with_weighting = all_outputs_df.loc[result_key, "time_std"]

    cell_key = f"{performance_measure_name_replacements[qf_name]}*{weighting}*{optimistic_estimate}"
    latex_time_median_dicts[dataset][cell_key] = str_format_time_median_speedup(
        time_median_with_weighting, time_std_with_weighting
    )

    relative_spread = time_std_with_weighting / time_median_with_weighting
    std_over_median_list.append(relative_spread)
    if relative_spread > std_threshold:
        print(dataset, f"has std/median > {std_threshold}")

# Histogram of the std / median ratios on a 3 x 3 inch figure with 8 pt font.
matplotlib.rcParams.update({"font.size": 8})
histogram_figure = plt.gcf()
histogram_figure.set_size_inches(3, 3)

plt.hist(std_over_median_list, bins=50)
plt.xlabel("(Standard Deviation)/Median")
plt.ylabel("Count")
# Extra margin on the left and bottom leaves room for the axis labels.
plt.subplots_adjust(left=0.2, bottom=0.15)

histogram_path = f"{out_dir}/optimistic_estimates_time_median_std_histogram.pdf"
plt.savefig(histogram_path)

# Dataset names mapped to LaTeX variants with manual line breaks. The order of
# the keys is also used to sort the rows of the generated table.
datasets_line_breaks_dict = {
    "UCI Credit Approval": r"UCI Credit Approval",
    "UCI Breast Cancer Wisconsin": r"UCI Breast\\Cancer Wisconsin",
    "Statlog (German Credit Data)": r"Statlog (German\\Credit Data)",
    "UCI Mushroom": r"UCI Mushroom",
    "UCI Credit Card Clients": r"UCI Credit\\Card Clients",
    "UCI Bank Marketing": r"UCI Bank\\Marketing",
    "OpenML Adult": r"OpenML Adult",
    "UCI Census-Income (KDD)": r"UCI Census-Income\\(KDD)",
}

# Header of the dataset column; the \cmidrule commands are emitted together
# with the header text because they are part of the column name.
latex_dataset_column_name = r"\cmidrule(lr){2-3}\cmidrule(lr){4-5}\cmidrule(lr){6-7}\cmidrule(lr){8-9}Dataset"

# Turn the per-dataset cell dicts into table rows that also carry the dataset name.
latex_time_median_dicts = [
    {latex_dataset_column_name: dataset} | rest_dict
    for dataset, rest_dict in latex_time_median_dicts.items()
]
latex_time_median_df = pd.DataFrame(latex_time_median_dicts).set_index(latex_dataset_column_name)

# Sort the rows by the position of the dataset name among the keys of
# datasets_line_breaks_dict, in descending order of that position.
dataset_order = list(datasets_line_breaks_dict.keys())
latex_time_median_df = latex_time_median_df.sort_index(
    ascending=False,
    key=lambda names: [dataset_order.index(name) for name in names],
)

# Split every "*"-separated column key into its parts; keys without a "*"
# separator are left out.
split_column_names = (column.split("*") for column in latex_time_median_df.columns.values)
columns_multiindex_lists = [parts for parts in split_column_names if len(parts) > 1]

print(columns_multiindex_lists)
# Transposing gives one array per level, which is the layout from_arrays expects.
level_arrays = np.array(columns_multiindex_lists).T
columns_multiindex = pd.MultiIndex.from_arrays(level_arrays)
latex_time_median_df.columns = columns_multiindex

# Write one LaTeX table per quality function found in the results.
for qf_name in all_outputs_df.index.levels[1]:
    measure_label = performance_measure_name_replacements[qf_name]
    # Selecting the top column level leaves the remaining two levels as columns.
    qf_latex_time_median_df = latex_time_median_df[measure_label].copy()

    columns = qf_latex_time_median_df.columns.values
    dataset_column_key = ("", latex_dataset_column_name)
    # The dataset names live in the index; copy them into a regular column and
    # move it to the front, because the table is written with index=False.
    qf_latex_time_median_df[dataset_column_key] = qf_latex_time_median_df.index
    qf_latex_time_median_df = qf_latex_time_median_df[[dataset_column_key, *columns]]

    table_path = f"{out_dir}/optimistic_estimates_time_median_table_{qf_name}.tex"
    qf_latex_time_median_df.to_latex(
        table_path,
        index=False,
        column_format=r"lrrrrrrrr",
        multicolumn_format="c",
    )
    print(qf_latex_time_median_df)