In [None]:
from __future__ import annotations

from multiprocessing import Pool
from pathlib import Path

import pandas as pd
from carps.analysis.gather_data import add_scenario_type
from omegaconf import OmegaConf

# Collect every task config below the config folder, skipping DUMMY tasks.
# The result is sorted because glob yields entries in filesystem order, which
# would otherwise make the row order of `tasks` (and of every table derived
# from it) differ between machines and runs.
config_folder = Path("../carps/configs/task")
paths = sorted(p for p in config_folder.glob("**/*.yaml") if "DUMMY" not in str(p))

def read_task(p) -> dict:
    """Load one task config file and return its `task` section as a dict.

    The top-level `benchmark_id` and `task_id` entries of the config are
    copied into the returned dict under the same keys.

    Args:
        p: Path of the YAML config file, passed to `OmegaConf.load`.

    Returns:
        The container form of `cfg.task`, extended with `benchmark_id` and
        `task_id`.
    """
    cfg = OmegaConf.load(p)
    return {
        **OmegaConf.to_container(cfg.task),
        "benchmark_id": cfg.benchmark_id,
        "task_id": cfg.task_id,
    }

# Parse all task configs with a pool of worker processes; each file yields one
# record (dict), and the records become the rows of the `tasks` frame.
# NOTE(review): read_task is defined in the notebook itself; under the "spawn"
# start method workers may not be able to import it -- confirm on macOS/Windows.
with Pool() as pool:
    tasks = pd.DataFrame(pool.map(read_task, paths))

# Flag tasks with more than one objective, then let carps derive the
# "scenario" column from the task columns (no column prefix is used here).
tasks = tasks.assign(is_multiobjective=tasks["n_objectives"] > 1)
tasks = add_scenario_type(tasks, task_prefix="")
print(tasks.columns)

In [None]:
import numpy as np


def is_subset(task_id: str) -> bool:
    """Return whether `task_id` belongs to one of the task subselections.

    A task counts as part of a subselection when its id starts with one of the
    scenario prefixes "blackbox", "multifidelity", "multiobjective" or "momf".

    Args:
        task_id: The task identifier to check.

    Returns:
        True if `task_id` starts with any of the scenario prefixes, else False.
    """
    scenarios = ("blackbox", "multifidelity", "multiobjective", "momf")
    # str.startswith accepts a tuple of prefixes and returns a plain Python
    # bool, matching the annotation (np.any would return numpy.bool_).
    return task_id.startswith(scenarios)

def add_set_id(task_id: str) -> str:
    """Derive the set name ("dev" or "test") from a task id.

    The id is searched for the substring "dev" first and "test" second; the
    first one found is returned.

    Args:
        task_id: The task identifier to inspect.

    Returns:
        "dev" or "test".

    Raises:
        ValueError: If the id contains neither "dev" nor "test".
    """
    for set_name in ("dev", "test"):
        if set_name in task_id:
            return set_name
    raise ValueError(f"Can't determine set for task_id {task_id}.")

# Keep only the tasks that belong to a subselection. The explicit .copy()
# makes `subselections` an independent frame, so adding the "set" column below
# is a plain assignment and not a write into a slice of `tasks` (which pandas
# flags with SettingWithCopyWarning).
subselections = tasks[tasks["task_id"].map(is_subset)].copy()
subselections["set"] = subselections["task_id"].map(add_set_id)

# Columns shown in every exported table.
cols_general = [
    "benchmark_id",
    "task_id",
    "dimensions",
    "n_trials",
    "search_space_n_floats",
    "search_space_n_integers",
    "search_space_n_categoricals",
    "search_space_n_ordinals",
]
# Extra columns for scenarios with fidelities / multiple objectives.
cols_MF = ["fidelity_type", "min_fidelity", "max_fidelity"]
cols_MO = ["n_objectives"]


# Extra columns per scenario; a scenario that is not listed here gets both the
# fidelity and the objective columns.
scenario_extra_cols = {
    "blackbox": [],
    "multi-fidelity": cols_MF,
    "multi-objective": cols_MO,
}

# Text substitutions applied to the generated LaTeX, in this exact order:
# shorten header names first, then escape the remaining underscores, then wrap
# the tabular in a centered \resizebox.
latex_replacements = [
    ("task_id", "task"),
    ("search_space_", ""),
    ("_", r"\_"),
    (r"\begin{tabular}", "\\centering\n\\resizebox{\\textwidth}{!}{\\begin{tabular}"),
    (r"\end{tabular}", "\\end{tabular}}"),
]

# One table per (scenario, set) group: written to tables/ and echoed.
for group_key, group_df in subselections.groupby(by=["scenario", "set"]):
    table_cols = cols_general + scenario_extra_cols.get(group_key[0], cols_MF + cols_MO)
    table_tex = group_df[table_cols].to_latex(
        index=False,
        caption="Selected tasks " + str(group_key),
        label=f"tab:selectedtasks-{'-'.join(group_key)}",
        float_format="%.2f",
    )
    for old, new in latex_replacements:
        table_tex = table_tex.replace(old, new)

    out_path = Path("tables") / f"selected_tasks_{'_'.join(group_key)}.tex"
    out_path.parent.mkdir(exist_ok=True, parents=True)
    out_path.write_text(table_tex)
    print(table_tex)

In [None]:
def print_cat(x):
    """Print a group's name followed by its total categorical count.

    Args:
        x: A frame-like object exposing a `name` attribute and a
            "search_space_n_categoricals" column (as handed over by
            `groupby(...).apply`).
    """
    print(x.name)
    n_categoricals = x["search_space_n_categoricals"].sum()
    print(n_categoricals)

# Print, for each benchmark, its name and the summed categorical count.
tasks.groupby(["benchmark_id"]).apply(print_cat)

In [None]:
# Overall size of the task collection, then the number of tasks per scenario.
print(f"number of all tasks {len(tasks)}")
print(f"number of benchmarks {tasks['benchmark_id'].nunique()}")
for scenario, scenario_tasks in tasks.groupby("scenario"):
    print(f"{scenario} {len(scenario_tasks)}")

In [None]:
# Show all column names of the task table as a plain list.
tasks.columns.tolist()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Global seaborn styling for the figures below: enlarged fonts (scale 2.2),
# the "whitegrid" style and the "colorblind" palette.
# NOTE(review): sns.set presumably also resets style and palette to seaborn's
# defaults, so it has to stay before the two calls after it -- confirm against
# the seaborn docs.
sns.set(font_scale=2.2)
sns.set_style("whitegrid")
sns.set_palette("colorblind")




# Headline counts over the task table.
n_multiobjective = tasks["n_objectives"].gt(1).sum()  # more than one objective
n_multifidelity = tasks["fidelity_type"].notna().sum()  # fidelity type is set
n_conditional_search_spaces = tasks["search_space_has_conditionals"].sum()
n_tasks = tasks.shape[0]


def make_pie(n: int, total: int, identifier: str):
    """Show a two-wedge pie chart of `n` out of `total`.

    Args:
        n: Number of items that fall into the highlighted category.
        total: Number of all items; must be non-zero because the share of
            `n` is computed as a percentage of it.
        identifier: Label of the highlighted category.
    """
    fig, ax = plt.subplots()
    sizes = [int(n), int(total - n)]
    # The second wedge covers the remainder (total - n), so it is labelled
    # "other"; calling it "total" would misdescribe what the wedge shows.
    labels = [f"{identifier} ({n*100/total:.0f}%)", "other"]
    ax.pie(sizes, labels=labels)
    plt.show()


def savefig(fnbase: str, fig, extensions=(".pdf", ".png"), dpi: int = 300):
    """Save `fig` once per file extension, creating the target folder if needed.

    Args:
        fnbase: Target file name without extension (a str or path-like
            object); each extension is appended to it verbatim.
        fig: Object providing `savefig(path, dpi=..., bbox_inches=...)`,
            e.g. a matplotlib figure.
        extensions: File extensions (including the leading dot) to write.
            Defaults to PDF and PNG.
        dpi: Resolution passed on to `fig.savefig`. Defaults to 300.
    """
    for extension in extensions:
        # f-string concatenation so that path-like `fnbase` values work too.
        fn = Path(f"{fnbase}{extension}")
        fn.parent.mkdir(exist_ok=True, parents=True)
        fig.savefig(fn, dpi=dpi, bbox_inches="tight")


def plot_pie_value_counts(tasks: pd.DataFrame, key_column: str):
    """Plot how often each value of `key_column` occurs, in percent.

    Despite the function name, the shares are drawn as a bar chart. The
    figure is stored through `savefig` under
    "figures/benchmarks/pie_<key_column>" and then shown.

    Args:
        tasks: Task table containing `key_column`.
        key_column: Name of the column whose value frequencies are plotted.
    """
    fig, ax = plt.subplots()
    counts = tasks[key_column].value_counts()
    raw_counts = counts.values
    percentages = raw_counts / np.sum(raw_counts) * 100
    ax = sns.barplot(x=list(counts.index), y=percentages, ax=ax)
    ax.set_ylabel("%")
    ax.set_title(key_column)
    savefig(f"figures/benchmarks/pie_{key_column}", fig)
    plt.show()

def plot_pie_hp_types(tasks: pd.DataFrame):
    """Draw a pie chart of hyperparameter types summed over all tasks.

    Each wedge is the column sum of one of the `search_space_n_*` columns.
    The figure is stored through `savefig` under
    "figures/benchmarks/pie_HPtypes" and then shown.

    Args:
        tasks: Task table containing the four `search_space_n_*` columns.
    """
    # Wedge label -> column holding the per-task count of that type.
    hp_type_columns = {
        "categorical": "search_space_n_categoricals",
        "ordinal": "search_space_n_ordinals",
        "integer": "search_space_n_integers",
        "float": "search_space_n_floats",
    }
    fig, ax = plt.subplots()
    totals = [tasks[column].sum() for column in hp_type_columns.values()]
    ax.pie(totals, labels=list(hp_type_columns))
    ax.set_title("HP Types")
    savefig("figures/benchmarks/pie_HPtypes", fig)
    plt.show()

def plot_dimensions(tasks: pd.DataFrame):
    """Plot a histogram of the "dimensions" column, coloured by benchmark.

    The figure is stored through `savefig` under
    "figures/benchmarks/histogram_dimensions" and then shown.

    Args:
        tasks: Task table containing the "dimensions" and "benchmark_id"
            columns.
    """
    fig, ax = plt.subplots()
    # Draw on the axes created above explicitly instead of relying on
    # pyplot's implicit "current axes" state.
    sns.histplot(data=tasks, x="dimensions", hue="benchmark_id", ax=ax)
    savefig("figures/benchmarks/histogram_dimensions", fig)
    plt.show()



# Histogram of the fidelity types (log-scaled counts, rotated tick labels).
ax = sns.histplot(data=tasks, x="fidelity_type")
ax.tick_params(rotation=90)
ax.set_yscale("log")
plt.show()

# Histogram over all objective names: gather the per-task objective entries
# (skipping tasks whose entry is None) into one flat array first.
has_objectives = tasks["objectives"].apply(lambda objs: objs is not None)
objectives = np.concatenate(tasks.loc[has_objectives, "objectives"])
ax = sns.histplot(objectives)
ax.tick_params(rotation=90)
ax.set_yscale("log")
plt.show()

print(n_tasks)

# Share of each value for a handful of task properties, one figure per column.
for key_column in (
    "is_multifidelity",
    "n_objectives",
    "search_space_has_conditionals",
    "objective_function_approximation",
    "domain",
):
    plot_pie_value_counts(tasks=tasks, key_column=key_column)

plot_pie_hp_types(tasks)
plot_dimensions(tasks)



