In [None]:
import json
import pandas as pd
from scipy.spatial.distance import cdist
import numpy as np
from tqdm import tqdm
import random
from typing import Any
import pickle as pkl
from typing import Callable
from functools import partial
from gbdsim.data.data_preprocessor import DataPreprocessor
from pathlib import Path
from tqdm import tqdm
import torch

In [None]:
# Pipelines whose rank columns are used to compare datasets in the baselines below.
selected_pipelines = json.loads(
    Path("data/tabrepo/selected_pipelines.json").read_text()
)
# Dataset ids grouped under the "train" and "test" keys.
splits = json.loads(Path("data/tabrepo/split.json").read_text())

# Rank table, partitioned row-wise by `dataset_id` according to `splits`.
raw_ranks = pd.read_csv("data/tabrepo/raw_ranks.csv")
is_train_row = raw_ranks["dataset_id"].isin(splits["train"])
is_test_row = raw_ranks["dataset_id"].isin(splits["test"])
train_ranks = raw_ranks.loc[is_train_row]
test_ranks = raw_ranks.loc[is_test_row]

## Baselines

In [None]:
def calculate_distances_to_rows_based_on_euclidean_distance(
    X: pd.DataFrame,
    y: pd.DataFrame,
    columns_to_select: list[str] | None = None,
) -> np.ndarray:
    if columns_to_select is not None:
        X = X[columns_to_select]
        y = y[columns_to_select]
    return cdist(
        y.values.reshape(1, -1),
        X.values,
    ).flatten()


def get_closest_dataset_idx(
    X: pd.DataFrame,
    y: pd.DataFrame,
    distance_calculator: Callable[[pd.DataFrame, pd.DataFrame], np.ndarray],
) -> int:
    """Return the position of the entry of ``X`` that lies closest to ``y``.

    ``distance_calculator(X, y)`` must yield one distance per candidate; the
    position of the smallest one is returned as a plain Python ``int`` (the
    first position wins on ties).
    """
    return int(np.argmin(distance_calculator(X, y)))


def search_closest_by_index(
    target_dataset: pd.DataFrame,
    dataset_base: pd.DataFrame,
    distance_calculator: Callable[[pd.DataFrame, pd.DataFrame], np.ndarray],
) -> int:
    """Pick the best-ranked pipeline of the base row closest to the target.

    Args:
        target_dataset: Query passed to ``distance_calculator`` as its second
            argument.
        dataset_base: Candidate rows. Every column except the first and the
            last is treated as a pipeline rank (presumably the outer columns
            hold identifiers/metadata — TODO confirm against the CSV schema).
        distance_calculator: Called as ``distance_calculator(dataset_base,
            target_dataset)``; must return one distance per base row.

    Returns:
        Zero-based position, within ``dataset_base.iloc[:, 1:-1]``, of the
        column holding the smallest value in the closest row.
    """
    closest_row_position = get_closest_dataset_idx(
        dataset_base, target_dataset, distance_calculator
    )
    # List-style row indexer keeps per-column dtypes while slicing the rank columns.
    closest_row_ranks = (
        dataset_base.iloc[[closest_row_position], 1:-1].to_numpy().ravel()
    )
    # Lower rank value is better, hence argmin.
    return int(np.argmin(closest_row_ranks))


def search_random_pipeline(
    target_dataset: pd.DataFrame, dataset_base: pd.DataFrame, *args, **kwargs
) -> int:
    """Baseline that picks a pipeline position uniformly at random.

    Args:
        target_dataset: Ignored; accepted so the signature matches the other
            search functions.
        dataset_base: Only its column count is used: every column except the
            first and the last counts as one pipeline.
        *args, **kwargs: Ignored.

    Returns:
        Random zero-based position within ``dataset_base.iloc[:, 1:-1]``,
        drawn from the global ``random`` generator.

    Raises:
        ValueError: If ``dataset_base`` has no pipeline columns.
    """
    num_pipelines = dataset_base.iloc[:, 1:-1].shape[1]
    if num_pipelines == 0:
        raise ValueError("dataset_base has no pipeline columns to sample from")
    return random.randrange(num_pipelines)


def search_best_pipeline_from_random_dataset(
    target_dataset: pd.DataFrame,
    dataset_base: pd.DataFrame,
    *args,
    **kwargs,
) -> int:
    """Baseline that returns the best pipeline of a randomly chosen base row.

    Delegates to ``search_closest_by_index`` with a distance function that
    ignores its inputs and returns random distances.

    Args:
        target_dataset: Forwarded to ``search_closest_by_index``; the random
            distance function never reads it.
        dataset_base: Candidate rows to choose from.
        *args, **kwargs: Ignored.

    Returns:
        Zero-based pipeline position, as returned by
        ``search_closest_by_index``.
    """

    def random_distances(X: pd.DataFrame, y: pd.DataFrame) -> np.ndarray:
        # One uniform draw per base row (numpy global RNG); the smallest draw
        # decides which row counts as "closest".
        return np.random.uniform(0, 1, (X.shape[0],))

    return search_closest_by_index(target_dataset, dataset_base, random_distances)

In [None]:
# Landmarker baseline: for each test dataset, find the training row closest in
# the `selected_pipelines` rank columns and reuse that row's best pipeline.
# NOTE(review): this cell reports the mean, while the random baselines and the
# model cell below report the median — confirm which statistic is intended.
landmarker_distance = partial(
    calculate_distances_to_rows_based_on_euclidean_distance,
    columns_to_select=selected_pipelines,
)

ranks = []
# Iterate over index labels only; the rows are re-read as one-row frames.
for idx in tqdm(test_ranks.index):
    row = test_ranks.loc[[idx]]
    best_pipeline_idx = search_closest_by_index(
        row[selected_pipelines],
        train_ranks,
        landmarker_distance,
    )
    # Rank the chosen pipeline achieves on this test dataset.
    best_pipeline_rank = row.iloc[:, 1:-1].values.reshape(-1)[
        best_pipeline_idx
    ]  # type: ignore
    ranks.append(best_pipeline_rank)
print(f"search by landmarkers = {np.mean(ranks):.4f}")

In [None]:
# Random-pipeline baseline: ignore the data and pick a pipeline uniformly at
# random, repeated several times per test dataset.
N_RANDOM_DRAWS = 1000  # draws per test dataset
# search_random_pipeline uses the global `random` generator; seed it so that
# Restart & Run All reproduces the reported number.
random.seed(0)

ranks = []
for idx in tqdm(test_ranks.index):
    row = test_ranks.loc[[idx]]
    # Loop-invariant per test dataset: slice once, not once per draw.
    row_landmarkers = row[selected_pipelines]
    row_ranks = row.iloc[:, 1:-1].values.reshape(-1)
    for _ in range(N_RANDOM_DRAWS):
        best_pipeline_idx = search_random_pipeline(
            row_landmarkers,
            train_ranks,
        )
        best_pipeline_rank = row_ranks[best_pipeline_idx]  # type: ignore
        ranks.append(best_pipeline_rank)
print(f"random pipeline = {np.median(ranks):.4f}")

In [None]:
# Random-dataset baseline: pick a training row at random and reuse its best
# pipeline, repeated several times per test dataset.
N_RANDOM_DRAWS = 1000  # draws per test dataset
# search_best_pipeline_from_random_dataset draws from numpy's global RNG; seed
# it so that Restart & Run All reproduces the reported number.
np.random.seed(0)

ranks = []
for idx in tqdm(test_ranks.index):
    row = test_ranks.loc[[idx]]
    # Loop-invariant per test dataset: slice once, not once per draw.
    row_landmarkers = row[selected_pipelines]
    row_ranks = row.iloc[:, 1:-1].values.reshape(-1)
    for _ in range(N_RANDOM_DRAWS):
        best_pipeline_idx = search_best_pipeline_from_random_dataset(
            row_landmarkers,
            train_ranks,
        )
        best_pipeline_rank = row_ranks[best_pipeline_idx]  # type: ignore
        ranks.append(best_pipeline_rank)
print(f"random dataset = {np.median(ranks):.4f}")

## Model

In [None]:
# Restore the pickled training artefact and keep only its `.model` attribute,
# which the evaluation cell below queries for dataset distances.
# pickle executes arbitrary code while loading: only open checkpoints that
# were produced by a trusted run.
checkpoint_path = Path(
    "results/tabrepo/dataset2vec/2025_05_13__16_40_26/final_model.pkl"
)
with checkpoint_path.open("rb") as checkpoint_file:
    model = pkl.load(checkpoint_file).model

In [None]:
# Load and preprocess every dataset file; the file stem is the dataset id.
# Path.iterdir() yields entries in arbitrary order, so sort by id to keep the
# dataset order (and thus argmin tie-breaking in the evaluation) reproducible.
dataset_paths = sorted(
    Path("data/tabrepo/datasets").iterdir(),
    key=lambda path: int(path.stem),
)
# Each entry is (dataset_id, preprocessed data); the evaluation cell unpacks
# the preprocessed data as an (X, y) pair.
datasets = [
    (
        int(path.stem),
        DataPreprocessor().preprocess_pandas_data(pd.read_csv(path)),
    )
    for path in dataset_paths
]

# Set lookup instead of scanning the split lists once per dataset.
train_ids = set(splits["train"])
test_ids = set(splits["test"])
train_datasets = [entry for entry in datasets if entry[0] in train_ids]
test_datasets = [entry for entry in datasets if entry[0] in test_ids]

In [None]:
# Model-based search: for every test dataset, ask the loaded model for its
# distance to each training dataset, take the closest one, and score that
# training dataset's best pipeline on the test dataset.
# NOTE(review): the printed label is hard-coded to "gbdsim" while the
# checkpoint path in the loading cell points at a dataset2vec run — make sure
# the label matches the model that is actually loaded.
ranks = []
with torch.no_grad():  # inference only; no autograd graph needed
    for dataset_id, (X, y) in tqdm(test_datasets):
        test_has_features = X.shape[1] != 0
        if test_has_features:
            # Move the test dataset to the GPU once, not once per train dataset.
            X_gpu, y_gpu = X.cuda(), y.cuda()

        distances = []
        dids = []
        for train_dataset_id, (X_train, y_train) in train_datasets:
            dids.append(train_dataset_id)
            if X_train.shape[1] == 0 or not test_has_features:
                # A dataset without feature columns cannot be compared;
                # infinity keeps it from being selected by argmin.
                distances.append(float("inf"))
                continue
            distance = model.calculate_dataset_distance(
                X_train.cuda(), y_train.cuda(), X_gpu, y_gpu
            )
            # .item() copies the scalar to the host as a Python number.
            distances.append(distance.item())

        closest_dataset_did = dids[np.argmin(distances)]
        # [1:-1] drops the first and last column of the flattened rank row.
        best_pipeline_idx = np.argmin(
            train_ranks.loc[
                train_ranks.dataset_id == closest_dataset_did
            ].values.flatten()[1:-1]
        )
        ranks.append(
            test_ranks.loc[
                test_ranks.dataset_id == dataset_id
            ].values.flatten()[1:-1][best_pipeline_idx]
        )
print(f"gbdsim = {np.median(ranks):.4f}")
# gbdsim = 0.1373
# dataset2vec = 0.1767

# dataset2vec median = 0.1326
# gbdsim median = 0.1594

In [None]:
# Summary statistics of whatever the model-evaluation cell last stored in
# `ranks`.
# NOTE(review): the heading is a fixed string — only run this cell after
# evaluating the Dataset2Vec checkpoint, otherwise the label is wrong.
rank_values = np.asarray(ranks)
print("Dataset2Vec")
print(f"mean = {rank_values.mean():.4f}")
print(f"median = {np.median(rank_values):.4f}")
print(f"stdev = {rank_values.std():.4f}")

# Dataset2Vec
# mean = 0.1480
# median = 0.1413
# stdev = 0.0555

In [None]:
# Summary statistics of whatever the model-evaluation cell last stored in
# `ranks`.
# NOTE(review): the heading is a fixed string — only run this cell after
# evaluating the GBDSIM checkpoint, otherwise the label is wrong.
rank_values = np.asarray(ranks)
print("GBDSIM")
print(f"mean = {rank_values.mean():.4f}")
print(f"median = {np.median(rank_values):.4f}")
print(f"stdev = {rank_values.std():.4f}")

# GBDSIM
# mean = 0.1631
# median = 0.1594
# stdev = 0.0554

In [None]:
# Re-read the full rank table (same file as `raw_ranks`) for ad-hoc inspection.
df = pd.read_csv("data/tabrepo/raw_ranks.csv")

In [None]:
# Display the rank table for a quick visual check.
df

In [None]:
# For every row of `df`, count how many of the columns 1:-1 attain that row's
# minimum, and sum the counts over all rows. A total equal to the number of
# rows means no row has a tie for its minimum.
# After the transpose, axis 0 runs over the original columns, so min(axis=0)
# is the per-row minimum.
scaled_ranks = (df.iloc[:, 1:-1] * 100).values.T
(scaled_ranks == scaled_ranks.min(axis=0)).sum()