In [None]:
import json
import pickle as pkl
from pathlib import Path
from typing import Any

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import torch
from sklearn.cluster import KMeans
from sklearn.manifold import MDS
from sklearn.metrics import calinski_harabasz_score

In [None]:
# Directory that every figure in this notebook is saved to. It is created up
# front (missing parents included) so the later savefig calls have a target.
PLOTS_PATH = Path("..", "results", "plots")
PLOTS_PATH.mkdir(exist_ok=True, parents=True)

In [None]:
def get_from_json(json_path: Path, key: str) -> Any:
    """Read a JSON file and return the value stored under a dotted key path.

    Args:
        json_path: Path of the JSON file to read.
        key: Dot-separated path of keys, e.g. ``"outer.inner"`` returns
            ``data["outer"]["inner"]``. Each part is used as-is (as a string)
            to index the current level.

    Returns:
        Whatever value sits at the end of the key path (number, string, list,
        dict, ...).

    Raises:
        KeyError: If a part of the path is missing from a JSON object.
        TypeError: If a part of the path is applied to a level that cannot be
            indexed with a string (e.g. a list or a number).
    """
    # JSON is UTF-8 by specification; do not rely on the platform default.
    with open(json_path, "r", encoding="utf-8") as file:
        output = json.load(file)
    for part in key.split("."):
        output = output[part]
    return output

In [None]:
# Root directory of the experiment outputs; the cells below read one
# sub-folder per problem ("synthetic", "uci", "tabrepo") from it.
RESULTS_PATH = Path("..") / "results"

### Synthetic

In [None]:
# Collect the metrics of every meta-model evaluated on the synthetic problem:
# one row per JSON file found under the meta-model's directory.
results_dfs = []
for method in (RESULTS_PATH / "synthetic").iterdir():
    if not method.is_dir():
        continue
    # List the metric files once so the MAE and correlation columns are
    # guaranteed to come from the same files in the same order (two separate
    # directory walks only line up if the traversal order happens to repeat).
    metric_files = list(method.rglob("*.json"))
    accuracies = [get_from_json(p, "mae_by_model") for p in metric_files]
    corr_performances = [
        get_from_json(p, "spearmanr_corr") for p in metric_files
    ]
    results_dfs.append(
        pd.DataFrame(
            {
                "problem": ["synthetic"] * len(accuracies),
                "meta-model": [method.stem] * len(accuracies),
                "mae": accuracies,
                "corr": corr_performances,
            }
        )
    )

# Median baseline: its MAE is read from the "mae_by_median" entry of the
# metrics.json stored in each sub-directory of the GBDSim results.
maes = [
    get_from_json(run_dir / "metrics.json", "mae_by_median")
    for run_dir in (RESULTS_PATH / "synthetic" / "gbdsim").iterdir()
    if run_dir.is_dir()  # ignore stray files (e.g. .DS_Store)
]
# No "corr" column here: after the concat the baseline rows hold NaN there.
results_dfs.append(
    pd.DataFrame(
        {
            "problem": ["synthetic"] * len(maes),
            "meta-model": ["median"] * len(maes),
            "mae": maes,
        }
    )
)
synthetic_results_df = pd.concat(results_dfs, ignore_index=True)

# Raw directory / baseline name -> label shown in the plots.
NAMES_MAPPING = {
    "gbdsim": "GBDSim (ours)",
    "dataset2vec": "Dataset2Vec",
    "median": "Median",
}

synthetic_results_df["meta-model"] = synthetic_results_df["meta-model"].map(
    NAMES_MAPPING
)

In [None]:
# Bar chart of the MAE per meta-model on the synthetic problem, drawn with
# seaborn's errorbar="se" setting, then written to PLOTS_PATH.
fig, ax = plt.subplots(figsize=(8, 5))

sns.barplot(
    data=synthetic_results_df,
    x="meta-model",
    y="mae",
    errorbar="se",
    ax=ax,
)
ax.set_xlabel("Meta-model", fontsize=14)
ax.set_ylabel("MAE (lower is better)", fontsize=14)
fig.savefig(PLOTS_PATH / "synthetic_mae.png", bbox_inches="tight")

In [None]:
# Bar chart of the correlation per meta-model on the synthetic problem.
# Rows without a correlation value (NaN in "corr") are left out of the plot.
fig, ax = plt.subplots(figsize=(8, 5))

has_corr = synthetic_results_df["corr"].notna()
sns.barplot(
    data=synthetic_results_df.loc[has_corr],
    x="meta-model",
    y="corr",
    errorbar="se",
    ax=ax,
)
ax.set_xlabel("Meta-model", fontsize=14)
ax.set_ylabel("Correlation (higher is better)", fontsize=14)
fig.savefig(PLOTS_PATH / "synthetic_corr.png", bbox_inches="tight")

In [None]:
# Gather the accuracy of each meta-model on the UCI problem: one row per JSON
# file found (recursively) under the meta-model's directory.
uci_root = RESULTS_PATH / "uci"
results_dfs = []
for method in uci_root.iterdir():
    if method.is_dir():
        accuracies = [
            get_from_json(json_file, "accuracy")
            for json_file in method.rglob("*.json")
        ]
        n_rows = len(accuracies)
        frame = pd.DataFrame(
            {
                "problem": ["uci"] * n_rows,
                "meta-model": [method.stem] * n_rows,
                "accuracy": accuracies,
            }
        )
        results_dfs.append(frame)
uci_results_df = pd.concat(results_dfs, ignore_index=True)

# Raw directory / baseline name -> label shown in the plots.
NAMES_MAPPING = {
    "gbdsim": "GBDSim (ours)",
    "dataset2vec": "Dataset2Vec",
    "median": "Median",
}

display_names = uci_results_df["meta-model"].map(NAMES_MAPPING)
uci_results_df["meta-model"] = display_names

### UCI

In [None]:
# Bar chart of the accuracy per meta-model on the UCI problem; the y-axis is
# pinned to [0.0, 0.9] before the figure is saved.
fig, ax = plt.subplots(figsize=(8, 5))

sns.barplot(
    data=uci_results_df,
    x="meta-model",
    y="accuracy",
    errorbar="se",
    ax=ax,
)
ax.set_xlabel("Meta-model", fontsize=14)
ax.set_ylabel("Accuracy (higher is better)", fontsize=14)
ax.set_ylim(0.0, 0.9)
fig.savefig(PLOTS_PATH / "uci_accuracy.png", bbox_inches="tight")

In [None]:
# Visualise the representations stored for the UCI problem: every panel is a
# 2-D MDS projection of one result directory, coloured by the stored labels.
# One row of panels per meta-model, one column per result directory.
# The grid size is fixed — assumes at most 2 meta-models with at most 5 result
# directories each; TODO confirm against the contents of results/uci.
n_rows, n_cols = 2, 5
fig, ax = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(25, 10))

# Sorted, directories only: a deterministic layout that ignores stray files.
method_dirs = sorted(p for p in (RESULTS_PATH / "uci").iterdir() if p.is_dir())
for i, method in enumerate(method_dirs):
    fig.text(
        -0.01,
        # Figure coordinates grow upwards while axes row 0 is the TOP row, so
        # the label of row i belongs at the vertical centre counted from the
        # top (0.75 for row 0, 0.25 for row 1).
        1.0 - (i + 0.5) / n_rows,
        NAMES_MAPPING[method.stem],
        va="center",
        rotation="vertical",
        fontsize=12,
    )
    run_dirs = sorted(p for p in method.iterdir() if p.is_dir())
    for j, result_path in enumerate(run_dirs):
        # NOTE(security): torch.load(weights_only=False) and pickle.load can
        # execute arbitrary code; only use them on result files you produced.
        with open(result_path / "representations.pkl", "rb") as file:
            representations = torch.load(file, weights_only=False)
        with open(result_path / "representation_labels.pkl", "rb") as file:
            labels = pkl.load(file)
        # Fixed seed so the projection (and the saved figure) is reproducible.
        embedding = MDS(n_components=2, random_state=42).fit_transform(
            representations
        )
        sns.scatterplot(
            x=embedding[:, 0],
            y=embedding[:, 1],
            hue=labels,
            ax=ax[i, j],
        )
        # Keep a single legend (top-left panel) for the whole figure.
        if not (i == 0 and j == 0):
            legend = ax[i, j].get_legend()
            if legend is not None:
                legend.remove()
fig.tight_layout()
fig.savefig(PLOTS_PATH / "uci_representations.png", bbox_inches="tight")

In [None]:
# Calinski-Harabasz index of the stored UCI representations: for every result
# directory the representations are clustered with KMeans (5 clusters, fixed
# seed) and scored against those cluster assignments.
# NOTE(review): the score uses the KMeans assignments, not the labels stored in
# representation_labels.pkl (the original cell loaded them and immediately
# overwrote them) — confirm this is the intended metric.
output_dfs = []
for method in (RESULTS_PATH / "uci").iterdir():
    if not method.is_dir():
        continue
    indexes = []
    for result_path in method.iterdir():
        if not result_path.is_dir():
            continue
        # NOTE(security): weights_only=False unpickles arbitrary objects; only
        # load result files you produced yourself.
        with open(result_path / "representations.pkl", "rb") as file:
            representations = torch.load(file, weights_only=False)
        clustering = KMeans(n_clusters=5, random_state=42)
        cluster_labels = clustering.fit_predict(representations)
        indexes.append(
            calinski_harabasz_score(representations, cluster_labels)
        )
    output_dfs.append(
        pd.DataFrame(
            {
                "problem": ["uci"] * len(indexes),
                "meta-model": [method.stem] * len(indexes),
                "calinski_harabasz_score": indexes,
            }
        )
    )
output_df = pd.concat(output_dfs, ignore_index=True)
output_df["meta-model"] = output_df["meta-model"].map(NAMES_MAPPING)

In [None]:
# Bar chart of the Calinski-Harabasz index per meta-model on the UCI problem;
# the y-axis is pinned to [0, 250] before the figure is saved.
fig, ax = plt.subplots(figsize=(8, 5))

sns.barplot(
    data=output_df,
    x="meta-model",
    y="calinski_harabasz_score",
    errorbar="se",
    ax=ax,
)
ax.set_xlabel("Meta-model", fontsize=14)
ax.set_ylabel("CH index (higher is better)", fontsize=14)
ax.set_ylim(0.0, 250.0)
fig.savefig(PLOTS_PATH / "uci_ch.png", bbox_inches="tight")

### TabRepo

In [None]:
# MAE of every meta-model on the TabRepo problem: one row per JSON file found
# (recursively) under the meta-model's directory.
results_dfs = []

for method in (RESULTS_PATH / "tabrepo").iterdir():
    if not method.is_dir():
        continue
    model_maes = [
        get_from_json(p, "metric_estimation_results.model_mae")
        for p in method.rglob("*.json")
    ]
    results_dfs.append(
        pd.DataFrame(
            {
                "problem": ["tabrepo"] * len(model_maes),
                "meta-model": [method.stem] * len(model_maes),
                "mae": model_maes,
            }
        )
    )

# NOTE(review): the "median" baseline below reads the very same key
# ("metric_estimation_results.model_mae") from the GBDSim results, so its rows
# duplicate GBDSim's MAE. The synthetic section reads a dedicated
# "mae_by_median" entry instead — confirm the name of the baseline entry in the
# TabRepo metrics files and use it here.
median_maes = [
    get_from_json(
        p,
        "metric_estimation_results.model_mae",
    )
    for p in (RESULTS_PATH / "tabrepo" / "gbdsim").rglob("*.json")
]

results_dfs.append(
    pd.DataFrame(
        {
            "problem": ["tabrepo"] * len(median_maes),
            "meta-model": ["median"] * len(median_maes),
            "mae": median_maes,
        }
    )
)

result_df = pd.concat(results_dfs, ignore_index=True)
result_df["meta-model"] = result_df["meta-model"].map(NAMES_MAPPING)

In [None]:
# Bar chart of the MAE per meta-model on the TabRepo problem; the y-axis is
# pinned to [0.0, 0.4] before the figure is saved.
fig, ax = plt.subplots(figsize=(8, 5))

sns.barplot(
    data=result_df,
    x="meta-model",
    y="mae",
    errorbar="se",
    ax=ax,
)
ax.set_xlabel("Meta-model", fontsize=14)
ax.set_ylabel("MAE (lower is better)", fontsize=14)
ax.set_ylim(0.0, 0.4)
fig.savefig(PLOTS_PATH / "tabrepo_mae.png", bbox_inches="tight")

In [None]:
# Spearman correlation of every meta-model on the TabRepo problem: one row per
# JSON file found (recursively) under the meta-model's directory.
# Note: this rebinds `result_df`, replacing the MAE table built earlier.
results_dfs = []

for method in (RESULTS_PATH / "tabrepo").iterdir():
    # Skip stray files, as the synthetic and UCI loaders do.
    if not method.is_dir():
        continue
    model_corrs = [
        get_from_json(p, "metric_estimation_results.spearmanr_corr_model")
        for p in method.rglob("*.json")
    ]
    results_dfs.append(
        pd.DataFrame(
            {
                "problem": ["tabrepo"] * len(model_corrs),
                "meta-model": [method.stem] * len(model_corrs),
                "corr": model_corrs,
            }
        )
    )

result_df = pd.concat(results_dfs, ignore_index=True)
result_df["meta-model"] = result_df["meta-model"].map(NAMES_MAPPING)

In [None]:
# Bar chart of the Spearman correlation per meta-model on the TabRepo problem;
# the y-axis is pinned to [0.0, 0.4] before the figure is saved.
fig, ax = plt.subplots(figsize=(8, 5))

sns.barplot(result_df, x="meta-model", y="corr", errorbar="se", ax=ax)
# Axis label fixed: the coefficient is named after Spearman (one "n").
ax.set_ylabel("Spearman correlation (higher is better)", fontsize=14)
ax.set_xlabel("Meta-model", fontsize=14)
ax.set_ylim(0.0, 0.4)
fig.savefig(PLOTS_PATH / "tabrepo_corr.png", bbox_inches="tight")

In [None]:
# Pipeline-selection results on the TabRepo problem. Every JSON file yields
# four rows: three baselines plus the meta-model whose directory the file is
# in. Because the baselines are read from the files of EVERY meta-model, they
# end up with one row per file across all meta-model directories.
# Note: this rebinds `result_df`, replacing the correlation table built earlier.
results_dfs = []

# (label used in the "method" column, key under "pipeline_selection_results")
selection_entries = [
    ("Landmarkers", "landmarkers"),
    ("Random pipeline", "random_pipeline"),
    ("Best from random dataset", "random_dataset"),
    ("meta-model", "model_based"),
]

for method in (RESULTS_PATH / "tabrepo").iterdir():
    # Skip stray files, as the synthetic and UCI loaders do.
    if not method.is_dir():
        continue
    for metrics_file in method.rglob("*.json"):
        # Parse the file once instead of once per entry.
        selection = get_from_json(metrics_file, "pipeline_selection_results")
        results_dfs.append(
            pd.DataFrame(
                {
                    "problem": ["tabrepo"] * len(selection_entries),
                    "method": [label for label, _ in selection_entries],
                    "avg_rank_of_selected_pipeline": [
                        selection[entry]["mean"]
                        for _, entry in selection_entries
                    ],
                    "model": [method.stem] * len(selection_entries),
                }
            )
        )


result_df = pd.concat(results_dfs, ignore_index=True)
result_df["model"] = result_df["model"].map(NAMES_MAPPING)

In [None]:
# Replace the generic "meta-model" method label with the display name of the
# model that produced the row, so each meta-model shows up as its own method.
is_meta_model_row = result_df["method"] == "meta-model"
result_df.loc[is_meta_model_row, "method"] = result_df.loc[
    is_meta_model_row, "model"
]

In [None]:
# Build the table that is plotted: meta-model rows are kept in full, while
# every other method (the baselines) is cut down to its first five rows.
# Baseline rows are read from the result files of every meta-model, so they
# occur more often than the meta-model rows — presumably 5 is the number of
# result files of a single meta-model; TODO confirm.
#
# The meta-model labels are taken from NAMES_MAPPING because the "method"
# column holds the mapped display names (e.g. "GBDSim (ours)"); comparing
# against the bare "GBDSim" never matched and truncated the GBDSim rows too.
# (The former `result_df.sort_values("method")` call was dropped: its result
# was discarded, so it had no effect.)
meta_model_labels = set(NAMES_MAPPING.values())
output_dfs = []
for method in result_df.method.unique():
    method_rows = result_df.loc[result_df.method == method]
    if method in meta_model_labels:
        output_dfs.append(method_rows)
    else:
        output_dfs.append(method_rows[:5])
output_df = pd.concat(output_dfs)

In [None]:
# Bar chart of the average rank of the selected pipeline per method on the
# TabRepo problem, with the bars in a fixed left-to-right order and the tick
# labels rotated so the long method names do not overlap.
method_order = [
    "Landmarkers",
    "GBDSim (ours)",
    "Dataset2Vec",
    "Best from random dataset",
    "Random pipeline",
]

fig, ax = plt.subplots(figsize=(8, 5))

sns.barplot(
    data=output_df,
    x="method",
    y="avg_rank_of_selected_pipeline",
    order=method_order,
    errorbar="se",
    ax=ax,
)
ax.set_xlabel("Meta-model", fontsize=14)
ax.set_ylabel("Normalized rank (lower is better)", fontsize=14)
ax.set_ylim(0.0, 0.2)
tick_labels = ax.get_xticklabels()
ax.set_xticklabels(tick_labels, rotation=45, ha="right")
fig.savefig(PLOTS_PATH / "tabrepo_rank.png", bbox_inches="tight")