# Combine into One DataFrame the Speedups in Runtime Measurements from Experiments of All Datasets

## Papermill Parameters

In [None]:
# Locations of the experiment definitions and of the experiment outputs.
PARAM_DEFINITIONS_PATH = "../definitions"
PARAM_OUTPUT_PATH = "../outputs"

# Stage, file name and depth value selecting the rows to combine.
PARAM_INPUT_STAGE = "stage-07"
PARAM_INPUT_FILENAME = "speedups_statistics_merged_runtimes.csv"
PARAM_DEPTH = 4

# Experiment names grouped by the weight (alpha = beta) they were run with.
# The key order is kept: it is the order in which the groups are iterated.
PARAM_EXPERIMENTS_PER_WEIGHT = {
    # experiments including cover size and class balance weighting (alpha=1, beta=1)
    1: [f"exp-{exp_id}" for exp_id in range(118, 126)],
    # experiments including cover size and class balance weighting (alpha=0.3, beta=0.3)
    0.3: [f"exp-{exp_id}" for exp_id in range(142, 150)],
    # experiments including cover size and class balance weighting (alpha=0.1, beta=0.1)
    0.1: [f"exp-{exp_id}" for exp_id in range(134, 142)],
    # experiments without cover size and class balance weighting (alpha=0, beta=0)
    0: [f"exp-{exp_id}" for exp_id in range(126, 134)],
}

## Imports and Output Directories

In [None]:
import pandas as pd
import yaml
import os

# Create the directory tree that receives the combined reports.
# `os.makedirs` also creates missing parent directories (including
# PARAM_OUTPUT_PATH), and `exist_ok=True` avoids the check-then-create race of
# `os.path.exists` followed by `os.mkdir`.
meta_reports_path = f"{PARAM_OUTPUT_PATH}/meta_reports"
out_dir = f"{meta_reports_path}/optimistic_estimates"
os.makedirs(out_dir, exist_ok=True)

## Load and Combine Results

In [None]:
def get_results(input_experiments, weight):
    """Collect the speedup rows of several experiments into one DataFrame.

    For every experiment the dataset name is read from its
    ``stage-00/global_params.yaml`` and the CSV named by ``PARAM_INPUT_STAGE``
    and ``PARAM_INPUT_FILENAME`` is loaded. Only rows whose ``depth`` equals
    ``PARAM_DEPTH`` are kept. An experiment whose CSV does not exist
    contributes a single placeholder row with ``None`` values and a
    "not found" message is printed.

    The combined table is written to
    ``{out_dir}/optimistic_estimates_{weight}.csv`` before it is indexed.

    Args:
        input_experiments: Iterable of experiment directory names.
        weight: Used only to build the name of the CSV file that is written.

    Returns:
        DataFrame indexed by ``("dataset", "qf_name")`` with the columns
        ``num_visited_subgroups_speedup`` and ``time_speedup``.
    """
    speedup_columns = ["qf_name", "num_visited_subgroups_speedup", "time_speedup"]
    stage_dicts = []

    for exp in input_experiments:
        with open(f"{PARAM_DEFINITIONS_PATH}/{exp}/stage-00/global_params.yaml") as global_params_file:
            # NOTE(review): FullLoader accepts more than plain YAML types. If these
            # parameter files can ever come from an untrusted source, switch to
            # yaml.safe_load after confirming the files use no Python-specific tags.
            global_params = yaml.load(global_params_file, Loader=yaml.FullLoader)

        exp_base_row = {"dataset": global_params["PARAM_DATASET_NAME"]}
        stage_csv_path = f"{PARAM_OUTPUT_PATH}/{exp}/{PARAM_INPUT_STAGE}/{PARAM_INPUT_FILENAME}"

        # Keep the try body minimal: only a missing file is an expected failure.
        try:
            stage_df = pd.read_csv(stage_csv_path)
        except FileNotFoundError:
            # Placeholder row so the dataset of the missing experiment still appears.
            stage_dicts.append(exp_base_row | dict.fromkeys(speedup_columns))
            print(f"{stage_csv_path} not found")
            continue

        depth_rows = stage_df.loc[stage_df["depth"] == PARAM_DEPTH, speedup_columns]
        stage_dicts.extend(exp_base_row | row_part for row_part in depth_rows.to_dict("records"))

    # Explicit columns keep the frame well-formed even when no row was collected,
    # so the set_index call below does not fail on an empty experiment list.
    all_outputs_df = pd.DataFrame(stage_dicts, columns=["dataset", *speedup_columns])

    all_outputs_df.to_csv(f"{out_dir}/optimistic_estimates_{weight}.csv", index=False)
    all_outputs_df.set_index(["dataset", "qf_name"], inplace=True)

    return all_outputs_df


# Build one wide frame: the speedup columns of every weight are suffixed with
# that weight and joined on the shared ("dataset", "qf_name") index.
all_outputs_joined_df = None
for weight, experiments in PARAM_EXPERIMENTS_PER_WEIGHT.items():
    outputs_df = get_results(experiments, weight).rename(columns=lambda column: f"{column}_{weight}")

    if all_outputs_joined_df is not None:
        # validate="1:1" makes pandas raise if an index key is duplicated on either side.
        all_outputs_joined_df = all_outputs_joined_df.join(outputs_df, validate="1:1")
    else:
        # The first weight seeds the joined frame.
        all_outputs_joined_df = outputs_df

all_outputs_joined_df

## Turn into LaTeX Table Code

In [None]:
# Display labels for the quality-function names used in the LaTeX tables.
performance_measure_name_replacements = {
    None: "Missing",
    "average_ranking_loss": "ARL",
    "roc_auc_score": "ROC AUC",
    "prc_auc_score": "PR AUC",
}


def makecell(string):
    """Wrap ``string`` in a LaTeX ``\\makecell{...}`` command."""
    return "".join((r"\makecell{", string, "}"))


def rotatebox(string, rot):
    """Wrap ``string`` in a LaTeX ``\\rotatebox{rot}{...}`` command.

    ``rot`` is converted with ``str`` before it is inserted.
    """
    angle = str(rot)
    return "".join((r"\rotatebox{", angle, "}{", string, "}"))


def parbox(string, length_str):
    """Return ``string`` unchanged; ``length_str`` is currently ignored.

    The ``\\parbox`` wrapping is disabled. The previous implementation is kept
    below for reference.
    """
    # Disabled: r"\parbox{" + length_str + r"}{" + string + r"}"
    return string


def str_format_num_visited_speedup(this):
    """Format ``this`` times 100 with two decimals, followed by a LaTeX-escaped percent sign."""
    scaled = 100 * this
    return format(scaled, ".2f") + r"\%"


def str_format_time_speedup(this):
    """Format ``this`` as a fixed-point number with one decimal place."""
    return format(this, ".1f")


# Long-format rows (one per dataset, performance measure and weight) for the
# two LaTeX tables.
latex_num_visited_dicts = []
latex_time_dicts = []

for dataset in all_outputs_joined_df.index.levels[0]:
    for qf_name in all_outputs_joined_df.loc[dataset].index:
        # Skip index entries whose text form is "nan" (no quality function name).
        if str(qf_name).lower() == "nan":
            continue

        row_key = (dataset, qf_name)
        measure_label = performance_measure_name_replacements[qf_name]

        for weight in PARAM_EXPERIMENTS_PER_WEIGHT:
            # Cells shared by the rows of both tables.
            shared_cells = {
                "dataset": dataset,
                r"Performance\\Measure": measure_label,
                r"Weight (\alpha=\beta)": str(weight),
            }

            num_visited_speedup = all_outputs_joined_df.loc[row_key, f"num_visited_subgroups_speedup_{weight}"]
            latex_num_visited_dicts.append(
                {**shared_cells, "num_visited_speedup": str_format_num_visited_speedup(num_visited_speedup)}
            )

            time_speedup = all_outputs_joined_df.loc[row_key, f"time_speedup_{weight}"]
            latex_time_dicts.append(
                {**shared_cells, "time_speedup": str_format_time_speedup(time_speedup)}
            )

latex_time_df = pd.DataFrame(latex_time_dicts)
latex_num_visited_df = pd.DataFrame(latex_num_visited_dicts)

# Dataset name -> the same name with LaTeX line breaks (``\\``) for the table
# header. The key order is also used to order the table columns.
datasets_line_breaks_dict = {
    "UCI Credit Approval": "UCI Credit Approval",
    "UCI Breast Cancer Wisconsin": r"UCI Breast\\Cancer Wisconsin",
    "Statlog (German Credit Data)": r"Statlog (German\\Credit Data)",
    "UCI Mushroom": "UCI Mushroom",
    "UCI Credit Card Clients": r"UCI Credit\\Card Clients",
    "UCI Bank Marketing": r"UCI Bank\\Marketing",
    "OpenML Adult": "OpenML Adult",
    "UCI Census-Income (KDD)": r"UCI Census-Income\\(KDD)",
}

# Angle, in degrees, by which the dataset column headers are rotated.
rotation = 90


def _pivot_speedups_for_latex(long_df, value_column):
    """Reshape a long speedup table into the wide layout of the LaTeX tables.

    Args:
        long_df: Frame with the columns ``dataset``, ``Performance\\\\Measure``,
            ``Weight (\\alpha=\\beta)`` and ``value_column``.
        value_column: Name of the column holding the formatted cell values.

    Returns:
        Frame indexed by performance measure and weight, with one column per
        dataset. Columns are ordered by descending position of the dataset in
        ``datasets_line_breaks_dict`` and labelled with the rotated,
        line-broken dataset name.
    """
    index_columns = [r"Performance\\Measure", r"Weight (\alpha=\beta)"]
    wide_df = long_df.pivot(columns=["dataset"], index=index_columns, values=[value_column])

    # Drop the value-name level using the actual per-column labels. `levels[1]`
    # only holds the sorted unique labels and is not guaranteed to line up with
    # the column order.
    wide_df.columns = wide_df.columns.droplevel(0)

    # Build the position lookup once instead of re-listing the keys per column.
    dataset_positions = {name: position for position, name in enumerate(datasets_line_breaks_dict)}
    wide_df = wide_df.sort_index(
        axis="columns",
        ascending=False,
        key=lambda names: [dataset_positions[name] for name in names],
    )
    wide_df = wide_df.rename(
        columns=lambda name: rotatebox(makecell(parbox(datasets_line_breaks_dict[name], "3cm")), rotation)
    )
    wide_df.columns.name = None
    return wide_df


rearranged_latex_time_df = _pivot_speedups_for_latex(latex_time_df, "time_speedup")
rearranged_latex_num_visited_df = _pivot_speedups_for_latex(latex_num_visited_df, "num_visited_speedup")

# Both tables are exported with the same LaTeX options; only the frame and the
# target file name differ.
latex_export_options = {
    "column_format": r"lrrrrrrrrr",
    "hrules": True,
    "clines": "skip-last;data",
}
latex_exports = (
    (rearranged_latex_num_visited_df, "optimistic_estimates_num_visited_speedup_table.tex"),
    (rearranged_latex_time_df, "optimistic_estimates_time_median_speedup_table.tex"),
)
for latex_table_df, latex_table_filename in latex_exports:
    latex_table_df.style.to_latex(f"{out_dir}/{latex_table_filename}", **latex_export_options)

latex_num_visited_df