In [None]:
# Get subset tasks
from omegaconf import OmegaConf
from pathlib import Path
from multiprocessing import Pool
import pandas as pd

# Fields copied out of every subset task config.
keys = ["task_type", "subset_id", "task_id"]

# Every YAML file found anywhere below the subselection config directory.
subset_config_dir = Path("..") / "carps" / "configs" / "task" / "subselection"
subset_config_fns = [*subset_config_dir.rglob("*.yaml")]

def read_task_config(config_fn: Path) -> dict:
    """Load one config file and pull out the fields named in ``keys``.

    Args:
        config_fn: Path of the config file; it is read with ``OmegaConf.load``.

    Returns:
        A dict with one entry per name in the module-level ``keys`` list. Each
        value is whatever ``config.get`` returns for that name.
    """
    loaded = OmegaConf.load(config_fn)
    task_info = {}
    for field in keys:
        task_info[field] = loaded.get(field)
    return task_info

# Loaded in-process on purpose: a multiprocessing.Pool has to pickle
# `read_task_config` by reference, and worker processes started with the
# "spawn" or "forkserver" methods cannot import a function that only exists
# in the notebook's __main__, so the map call would fail or hang there.
task_infos = [read_task_config(config_fn) for config_fn in subset_config_fns]

task_df = pd.DataFrame(task_infos)
# The benchmark id is the fourth "/"-separated component of the task id, lower-cased.
task_df["benchmark_id"] = task_df["task_id"].apply(lambda x: x.split("/")[3].lower())
# Written to disk because the plotting cells reload this table from the CSV.
task_df.to_csv("subset_tasks.csv", index=False)
# Last expression of the cell: shows which task types were found.
task_df["task_type"].unique()

In [None]:
from create_subset_configs import fix_legacy_task_id

# CSV with the source performance table of each task type. Dicts preserve
# insertion order, so the tables are read and concatenated in this order.
source_table_by_task_type = {
    "blackbox": "../subselection/data/BBv2_lognorm/df_crit.csv",
    "multi-objective": "../subselection/data/MOv2_norm/df_crit.csv",
    "multi-fidelity-objective": "../subselection/data_subselection/MOMF/lognorm/df_crit.csv",
    "multi-fidelity": "../subselection/data_subselection/MF/lognorm/df_crit.csv",
}
task_types = list(source_table_by_task_type.keys())
paths = list(source_table_by_task_type.values())


def _load_source_table(task_type: str, path: str) -> pd.DataFrame:
    """Read one source table and attach the identifier columns.

    Args:
        task_type: Label stored in the ``task_type`` column of every row.
        path: Location of the CSV file to read.

    Returns:
        The table with ``problem_id`` renamed to ``task_id``, the task ids
        passed through ``fix_legacy_task_id``, and a ``benchmark_id`` column
        holding the lower-cased first "/"-separated component of the task id.
    """
    table = pd.read_csv(path)
    table["task_type"] = task_type
    table = table.rename(columns={"problem_id": "task_id"})
    table["task_id"] = table["task_id"].apply(fix_legacy_task_id)
    table["benchmark_id"] = table["task_id"].apply(lambda x: x.split("/")[0].lower())
    return table


source_df_list = [
    _load_source_table(task_type, path) for task_type, path in zip(task_types, paths)
]
source_df = pd.concat(source_df_list)
# Written to disk because the plotting cells reload this table from the CSV.
source_df.to_csv("source_df.csv", index=False)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import seaborn as sns
from carps.analysis.utils import setup_seaborn, savefig

setup_seaborn()

# Both tables are reloaded from the CSV files written by the earlier cells.
source_df = pd.read_csv("source_df.csv")
task_df = pd.read_csv("subset_tasks.csv")

id_columns = ["task_type", "task_id", "subset_id", "benchmark_id"]
# Every non-identifier column of the source table is used as a feature.
optimizer_columns = [c for c in source_df.columns if c not in id_columns]

task_types = task_df["task_type"].unique()

# One panel per task type instead of a hard-coded 1x4 grid. squeeze=False plus
# [0] yields a 1-D array of axes even when there is only one task type.
n_columns = max(len(task_types), 1)
fig = plt.figure(figsize=(3 * n_columns, 2))
axes = fig.subplots(1, n_columns, squeeze=False)[0]

# Plot styling shared by all panels, keyed by the label shown in the legend.
marker_shapes = {"Source": "o", "Dev": "*", "Test": "p"}
marker_sizes = {"Source": 60*1.25, "Dev": 50*1.25, "Test": 40*1.25}
color_palette = {"Source": "grey", "Dev": "mediumvioletred", "Test": "forestgreen"}


def get_2d_tensor(tsne, tensor_3d: np.ndarray) -> np.ndarray:
    """Project rows with an already fitted transformer.

    The parameter names are historical: this cell passes a fitted ``PCA`` as
    ``tsne`` and 2-D arrays (one row per task) as ``tensor_3d``.

    Args:
        tsne: Fitted object exposing ``transform``.
        tensor_3d: Array to project.

    Returns:
        Whatever ``tsne.transform(tensor_3d)`` returns.
    """
    return tsne.transform(tensor_3d)


def strip_subset_prefix(task_id):
    """Drop the first three "/"-separated components of a subset task id.

    Non-string values are returned unchanged. Missing ids come back from
    ``read_csv`` as float NaN, which is truthy and has no ``.split``, so a
    plain truthiness check is not enough to guard the split.
    """
    if isinstance(task_id, str):
        return "/".join(task_id.split("/")[3:])
    return task_id


for i, task_type in enumerate(task_types):
    # .copy() so the column added below lands on an independent frame rather
    # than on a slice of task_df (avoids SettingWithCopyWarning).
    task_subset_df = task_df[task_df["task_type"] == task_type].copy()
    source_subset_df = source_df[source_df["task_type"] == task_type]
    # Drop every column that has a missing value for any task of this type.
    source_subset_df = source_subset_df.dropna(axis=1, how="any")
    optimizer_columns_sub = [c for c in optimizer_columns if c in source_subset_df.columns]
    performance_tensor_source = source_subset_df[optimizer_columns_sub].to_numpy()
    print(performance_tensor_source.shape)

    task_subset_df["original_task_id"] = task_subset_df["task_id"].apply(strip_subset_prefix)

    # Subset label per source task id, built once per task type. When an id
    # occurs more than once, the first label wins.
    subset_by_task_id = (
        task_subset_df.drop_duplicates("original_task_id")
        .set_index("original_task_id")["subset_id"]
    )
    source_subset_task_df = source_subset_df[
        source_subset_df["task_id"].isin(subset_by_task_id.index)
    ].copy()
    source_subset_task_df["subset_id"] = source_subset_task_df["task_id"].map(subset_by_task_id)
    performance_tensor_subset = source_subset_task_df[optimizer_columns_sub].to_numpy()
    print(performance_tensor_subset.shape)

    # The projection is fitted on all source tasks of this type and then
    # applied to the dev/test tasks. random_state fixes the seed used when
    # scikit-learn selects its randomized SVD solver.
    pca = PCA(n_components=2, random_state=42)
    pca.fit(performance_tensor_source)

    is_dev = source_subset_task_df["subset_id"] == "dev"
    is_test = source_subset_task_df["subset_id"] == "test"
    tensor_2d_source = get_2d_tensor(pca, performance_tensor_source)
    tensor_2d_subset_dev = get_2d_tensor(
        pca, source_subset_task_df.loc[is_dev, optimizer_columns_sub].to_numpy()
    )
    tensor_2d_subset_test = get_2d_tensor(
        pca, source_subset_task_df.loc[is_test, optimizer_columns_sub].to_numpy()
    )

    # Long-format table for seaborn: one row per projected point.
    plot_df = pd.concat(
        [
            pd.DataFrame({"x0": points[:, 0], "x1": points[:, 1], "subset_id": label})
            for label, points in (
                ("Source", tensor_2d_source),
                ("Dev", tensor_2d_subset_dev),
                ("Test", tensor_2d_subset_test),
            )
        ],
        ignore_index=True,
    )

    # Scatter plot of the 2-D projection; colour, marker shape and marker size
    # all encode the subset label.
    ax = sns.scatterplot(
        data=plot_df, x="x0", y="x1", hue="subset_id", ax=axes[i],
        style="subset_id",
        markers=marker_shapes,
        size="subset_id",
        sizes=marker_sizes,
        linewidth=0.2,
        palette=color_palette,
        alpha=0.8,
    )

    ax.set_title(f"{task_type}")

    # Only the last panel keeps a legend, placed to the right of its axes.
    if i != len(task_types) - 1:
        ax.get_legend().remove()
    else:
        ax.legend(title="Subset", loc="upper right", bbox_to_anchor=(1.75, 1.0))

fig.set_tight_layout(True)
savefig(fig, "subset_pca")
plt.show()


    # break

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import seaborn as sns
from carps.analysis.utils import setup_seaborn, savefig
from omegaconf import DictConfig
from matplotlib.lines import Line2D

setup_seaborn()

# Both tables are reloaded from the CSV files written by the earlier cells.
source_df = pd.read_csv("source_df.csv")
task_df = pd.read_csv("subset_tasks.csv")

id_columns = ["task_type", "task_id", "subset_id", "benchmark_id"]
# Every non-identifier column of the source table is used as a feature.
optimizer_columns = [c for c in source_df.columns if c not in id_columns]

task_types = task_df["task_type"].unique()

# Three rows (projection, benchmark histogram, dimension histogram) and one
# column per task type instead of a hard-coded 3x4 grid. squeeze=False keeps
# `axes` two-dimensional even when there is only one task type.
n_columns = max(len(task_types), 1)
fig = plt.figure(figsize=(3 * n_columns, 2 * 3))
axes = fig.subplots(3, n_columns, squeeze=False)

# Plot styling shared by all panels, keyed by the label shown in the legend.
marker_shapes = {"Source": "o", "Dev": "*", "Test": "p"}
marker_sizes = {"Source": 60*1.25, "Dev": 50*1.25, "Test": 40*1.25}
color_palette = {"Source": "grey", "Dev": "mediumvioletred", "Test": "forestgreen"}
# Hand-built legend entries used for the two histogram rows.
custom_legend = [
    Line2D([0], [0], marker="s", color="w", markerfacecolor="mediumvioletred", markersize=10, label="Dev"),
    Line2D([0], [0], marker="s", color="w", markerfacecolor="forestgreen", markersize=10, label="Test")
]
# Maps the subset ids stored in the task table to the labels used in the plots.
renamer = {"source": "Source", "dev": "Dev", "test": "Test"}

# The task index is read once here rather than once per task row.
index_fn = Path("../carps/configs/task/index.csv")
if not index_fn.is_file():
    raise ValueError("ObjectiveFunction ids have not been indexed. Run `python -m carps.utils.index_configs`.")
task_index = pd.read_csv(index_fn)


def get_2d_tensor(tsne, tensor_3d: np.ndarray) -> np.ndarray:
    """Project rows with an already fitted transformer.

    The parameter names are historical: this cell passes a fitted ``PCA`` as
    ``tsne`` and 2-D arrays (one row per task) as ``tensor_3d``.

    Args:
        tsne: Fitted object exposing ``transform``.
        tensor_3d: Array to project.

    Returns:
        Whatever ``tsne.transform(tensor_3d)`` returns.
    """
    return tsne.transform(tensor_3d)


def strip_subset_prefix(task_id):
    """Drop the first three "/"-separated components of a subset task id.

    Non-string values are returned unchanged. Missing ids come back from
    ``read_csv`` as float NaN, which is truthy and has no ``.split``, so a
    plain truthiness check is not enough to guard the split.
    """
    if isinstance(task_id, str):
        return "/".join(task_id.split("/")[3:])
    return task_id


def load_task_cfg(task_id: str) -> DictConfig:
    """Load the ``task`` node of the config file indexed for ``task_id``.

    Args:
        task_id: Id to look up in the ``task_id`` column of the task index.

    Returns:
        The ``task`` attribute of the config loaded from the first matching
        ``config_fn`` entry.

    Raises:
        ValueError: If the id is not in the index or the indexed file is missing.
    """
    subset = task_index["config_fn"][task_index["task_id"] == task_id]
    if len(subset) == 0:
        raise ValueError(
            f"Can't find config_fn for {task_id}. Maybe the index is old. Run "
            "`python -m carps.utils.index_configs` to refresh."
        )
    config_fn = subset.iloc[0]
    if not Path(config_fn).is_file():
        raise ValueError(
            f"Can't find config_fn for {task_id}. Maybe the index is old. Run "
            "`python -m carps.utils.index_configs` to refresh."
        )
    cfg = OmegaConf.load(config_fn)
    return cfg.task


def get_dim(task_id: str) -> int:
    """Return ``metadata.dimensions`` from the task config of ``task_id``."""
    return load_task_cfg(task_id).metadata.dimensions


for i, task_type in enumerate(task_types):
    # .copy() so the columns added or overwritten below land on an independent
    # frame rather than on a slice of task_df (avoids SettingWithCopyWarning).
    task_subset_df = task_df[task_df["task_type"] == task_type].copy()

    source_subset_df = source_df[source_df["task_type"] == task_type]
    # Drop every column that has a missing value for any task of this type.
    source_subset_df = source_subset_df.dropna(axis=1, how="any")
    optimizer_columns_sub = [c for c in optimizer_columns if c in source_subset_df.columns]
    performance_tensor_source = source_subset_df[optimizer_columns_sub].to_numpy()
    print(performance_tensor_source.shape)

    task_subset_df["original_task_id"] = task_subset_df["task_id"].apply(strip_subset_prefix)

    # Subset label per source task id, built once per task type. When an id
    # occurs more than once, the first label wins.
    subset_by_task_id = (
        task_subset_df.drop_duplicates("original_task_id")
        .set_index("original_task_id")["subset_id"]
    )
    source_subset_task_df = source_subset_df[
        source_subset_df["task_id"].isin(subset_by_task_id.index)
    ].copy()
    source_subset_task_df["subset_id"] = source_subset_task_df["task_id"].map(subset_by_task_id)
    performance_tensor_subset = source_subset_task_df[optimizer_columns_sub].to_numpy()
    print(performance_tensor_subset.shape)

    # The projection is fitted on all source tasks of this type and then
    # applied to the dev/test tasks. random_state fixes the seed used when
    # scikit-learn selects its randomized SVD solver.
    pca = PCA(n_components=2, random_state=42)
    pca.fit(performance_tensor_source)

    is_dev = source_subset_task_df["subset_id"] == "dev"
    is_test = source_subset_task_df["subset_id"] == "test"
    tensor_2d_source = get_2d_tensor(pca, performance_tensor_source)
    tensor_2d_subset_dev = get_2d_tensor(
        pca, source_subset_task_df.loc[is_dev, optimizer_columns_sub].to_numpy()
    )
    tensor_2d_subset_test = get_2d_tensor(
        pca, source_subset_task_df.loc[is_test, optimizer_columns_sub].to_numpy()
    )

    # Long-format table for seaborn: one row per projected point.
    plot_df = pd.concat(
        [
            pd.DataFrame({"x0": points[:, 0], "x1": points[:, 1], "subset_id": label})
            for label, points in (
                ("Source", tensor_2d_source),
                ("Dev", tensor_2d_subset_dev),
                ("Test", tensor_2d_subset_test),
            )
        ],
        ignore_index=True,
    )

    # ROW 0: scatter plot of the 2-D projection; colour, marker shape and
    # marker size all encode the subset label.
    ax = sns.scatterplot(
        data=plot_df, x="x0", y="x1", hue="subset_id", ax=axes[0, i],
        style="subset_id",
        markers=marker_shapes,
        size="subset_id",
        sizes=marker_sizes,
        linewidth=0.2,
        palette=color_palette,
        alpha=0.8,
    )
    n_source = len(tensor_2d_source)
    n_dev = len(tensor_2d_subset_dev)
    n_test = len(tensor_2d_subset_test)
    ax.set_title(f"{task_type}\n(Source: {n_source}, Dev: {n_dev}, Test: {n_test})")
    # In every row only the last column keeps a legend, placed right of its axes.
    if i != len(task_types) - 1:
        ax.get_legend().remove()
    else:
        ax.legend(title="Subset", loc="upper right", bbox_to_anchor=(1.75, 1.0))

    # ROW 1: histogram of benchmark ids per subset.
    # The relabelling happens only after the lookup above, which matches on
    # the lower-case "dev"/"test" ids.
    task_subset_df["subset_id"] = task_subset_df["subset_id"].map(renamer)
    print(task_subset_df.columns)
    print(task_subset_df["benchmark_id"].count())
    ax = sns.histplot(data=task_subset_df, x="benchmark_id", hue="subset_id",
        multiple="dodge", shrink=0.8, ax=axes[1, i], palette=color_palette, discrete=True)
    if i != len(task_types) - 1:
        ax.get_legend().remove()
    else:
        ax.legend(handles=custom_legend, title="Subset", loc="upper right", bbox_to_anchor=(1.65, 1.0))

    # ROW 2: histogram of task dimensions per subset.
    dimensions_df = pd.DataFrame({
        "dimension": [get_dim(task_id) for task_id in task_subset_df["task_id"]],
        "subset_id": task_subset_df["subset_id"].to_numpy(),
    })
    ax = sns.histplot(data=dimensions_df, x="dimension", hue="subset_id",
        multiple="dodge", shrink=0.8, ax=axes[2, i], palette=color_palette, discrete=False, bins=10)
    if i != len(task_types) - 1:
        ax.get_legend().remove()
    else:
        ax.legend(handles=custom_legend, title="Subset", loc="upper right", bbox_to_anchor=(1.65, 1.0))

fig.set_tight_layout(True)
savefig(fig, "subset_stats")
plt.show()


    # break