In [None]:
import json

import numpy as np
import nevergrad as ng
import pandas as pd

from scipy.spatial.distance import cdist

In [None]:
# Number of pipeline names per candidate: passed as ``repetitions`` to the
# nevergrad ``Choice`` parameter and used by ``objective`` as the minimum
# accepted subset length.
SUBSET_SIZE = 50

In [None]:
# Pipeline selection and the train/test dataset split, both stored as JSON.
with open("data/tabrepo/selected_pipelines.json", "r") as f:
    selected_pipelines = json.load(f)
with open("data/tabrepo/split.json", "r") as f:
    splits = json.load(f)

# Rank table; every row carries a ``dataset_id`` that assigns it to a split.
raw_ranks = pd.read_csv("data/tabrepo/raw_ranks.csv")

# Keep the rows whose dataset belongs to the train / test split respectively.
train_ranks = raw_ranks[raw_ranks["dataset_id"].isin(splits["train"])]
test_ranks = raw_ranks[raw_ranks["dataset_id"].isin(splits["test"])]

In [None]:
def search_closest_by_index(
    dataset_row_index: pd.DataFrame,
    datasets_with_landmarkers_to_search_from: pd.DataFrame,
    dataset_index: pd.DataFrame,
) -> int:
    """Return the position of the best-ranked pipeline of the nearest dataset.

    The query row is compared against every row of ``dataset_index`` using
    ``cdist`` with its default (Euclidean) metric. The row of
    ``datasets_with_landmarkers_to_search_from`` at the same position as the
    nearest index row is then inspected, and the position of its smallest
    value among the columns ``1:-1`` is returned.

    Args:
        dataset_row_index: Single query row; its values are flattened into one
            vector, so it must hold exactly as many values as ``dataset_index``
            has columns.
        datasets_with_landmarkers_to_search_from: Frame to read ranks from.
            Its first and last columns are skipped (presumably an identifier
            and a non-rank column; verify against the data file).
        dataset_index: Feature rows to search; row ``i`` is assumed to
            correspond to row ``i`` of
            ``datasets_with_landmarkers_to_search_from``.

    Returns:
        Zero-based position, within columns ``1:-1``, of the lowest value in
        the nearest row. Ties resolve to the first occurrence.
    """
    query = dataset_row_index.values.reshape(1, -1)
    distances = cdist(query, dataset_index.values).ravel()
    closest_position = int(np.argmin(distances))

    # Select the row as a one-row frame (list indexer) so the rank columns are
    # converted to a common dtype without involving the skipped columns.
    closest_row_ranks = (
        datasets_with_landmarkers_to_search_from
        .iloc[[closest_position], 1:-1]
        .values
        .ravel()
    )
    return int(np.argmin(closest_row_ranks))


def objective(subset: set[str]) -> float:
    """Score a candidate subset of pipeline columns (lower is better).

    For every row of ``test_ranks`` the nearest row of ``train_ranks`` is
    located using only the columns named in ``subset``. The pipeline with the
    lowest rank in that training row is then looked up in the test row, and
    the mean of those looked-up ranks over all test rows is returned.

    Args:
        subset: Names of the rank columns used to measure similarity between
            datasets. The names are sorted before use.
            NOTE(review): the optimizer cell builds candidates with
            ``ng.p.Choice(..., repetitions=SUBSET_SIZE)``, which presumably
            yields a fixed-length tuple that may repeat names; if so the
            length guard below never fires and duplicates are kept. Confirm
            whether de-duplication is intended.

    Returns:
        ``1e6`` when ``subset`` has fewer than ``SUBSET_SIZE`` entries,
        otherwise the mean test rank of the recommended pipelines.
    """
    if len(subset) < SUBSET_SIZE:
        return 1e6

    # Everything below is independent of the test row, so compute it once
    # instead of once per iteration.
    columns = sorted(subset)
    train_features = train_ranks[columns]
    test_features = test_ranks[columns]
    # Same column window (1:-1) that search_closest_by_index ranks over.
    test_rank_matrix = test_ranks.iloc[:, 1:-1].values

    ranks = []
    # Iterate by position: a label lookup could return several rows if the
    # index ever contained repeated labels.
    for position in range(len(test_ranks)):
        best_pipeline_idx = search_closest_by_index(
            test_features.iloc[[position]],
            train_ranks,
            train_features,
        )
        ranks.append(test_rank_matrix[position, best_pipeline_idx])

    return np.mean(ranks).item()

In [None]:
# Seed for the optimizer's random state so that repeated runs are reproducible.
SEED = 0

# A candidate is SUBSET_SIZE names picked from the rank columns, i.e. every
# column of ``train_ranks`` except the first and the last.
param = ng.p.Choice(
    train_ranks.iloc[:, 1:-1].columns.tolist(), repetitions=SUBSET_SIZE
)
optimizer = ng.optimizers.OnePlusOne(
    parametrization=param, budget=100_000, num_workers=8
)
# Seed through the optimizer's parametrization, before any candidate is asked.
optimizer.parametrization.random_state.seed(SEED)

# NOTE(review): no executor is passed to ``minimize``, so evaluations
# presumably run one after another even though ``num_workers=8``; confirm
# against the nevergrad documentation and pass an executor if parallel
# evaluation is intended.
recommendation = optimizer.minimize(
    objective, batch_mode=True, max_time=3600, verbosity=1
)

In [None]:
# Persist the value of the recommendation returned by ``optimizer.minimize``
# as indented JSON.
with open("best_optimized.json", "w") as f:
    json.dump(recommendation.value, f, indent=4)