In [None]:
import os

# Move the working directory up one level so that the relative "data/..." paths
# used by the later cells resolve against the parent directory.
# NOTE: the path is relative to the *current* directory, so re-running this
# cell in the same kernel moves up one more level each time.
os.chdir("..")

In [None]:
import numpy as np
import pandas as pd
import json
import random

from collections import Counter

In [None]:
# Read the list stored under the "train" key of the split file.
with open("data/tabrepo/split.json", "r") as split_file:
    train_split = json.load(split_file)["train"]

# Load data/tabrepo/raw_ranks.csv and keep only the rows whose `dataset_id`
# appears in the training split.
df = pd.read_csv("data/tabrepo/raw_ranks.csv").loc[
    lambda frame: frame["dataset_id"].isin(train_split)
]

In [None]:
# Positional slice: keep every column except the first and the last one.
# NOTE(review): presumably the first/last columns hold identifiers/metadata
# (e.g. dataset_id) rather than per-pipeline values — confirm against
# raw_ranks.csv, since this silently depends on the CSV column order.
value_columns = df.iloc[:, 1:-1]

In [None]:
# Model family of each value column: the text before the first underscore
# of the column name.
pipelines = [column.partition("_")[0] for column in value_columns.columns]
# Cell output: how many columns belong to each model family.
Counter(pipelines)

In [None]:
# For every row, the position of the column holding the smallest value
# (the lowest value is what this notebook treats as "best").
best_positions = np.argmin(value_columns.values, axis=1)
# Translate the positions into column names, one winner per row.
best_models = value_columns.columns[best_positions].tolist()
# Cell output: the 20 model families that win most often.
Counter(name.partition("_")[0] for name in best_models).most_common(20)

In [None]:
# The 30 pipelines that are most often the per-row winner, most frequent first.
top_winner_counts = Counter(best_models).most_common(30)
best_performing_models = [pipeline for pipeline, _ in top_winner_counts]

In [None]:
# Model families (the part of a pipeline column name before the first "_")
# whose pipelines are eligible for the highest-standard-deviation selection
# performed in the cell that builds `stds`.
relevant_models = [
    "NeuralNetFastAI",
    "CatBoost",
    "NeuralNetTorch",
    "LightGBM",
    "ExtraTrees",
    "XGBoost",
]

In [None]:
# One row per pipeline column: its name, its model family (text before the
# first underscore) and the standard deviation of its values across rows.
stds = pd.DataFrame(
    {
        "pipeline": value_columns.columns,
        "model": [name.split("_")[0] for name in value_columns.columns],
        "std": value_columns.std(axis=0),
    }
).reset_index(drop=True)

# Keep, for each relevant model family, the 5 pipelines with the largest std.
# This uses filter -> sort -> groupby().head() instead of groupby().apply():
# apply() on a frame that still contains the grouping column is deprecated in
# recent pandas and leaves "model" as both an index level and a column, which
# makes the follow-up `stds.model` lookup fragile.
# Sorting by model (ascending) then std (descending) reproduces the ordering
# the apply-based version produced: families alphabetically, highest std first.
stds = (
    stds.loc[stds["model"].isin(relevant_models)]
    .sort_values(["model", "std"], ascending=[True, False])
    .groupby("model", sort=False)
    .head(5)
    .reset_index(drop=True)
)
highest_std_pipelines = stds["pipeline"].tolist()

In [None]:
# Fixed seed so that Restart & Run All always produces the same ordering.
SHUFFLE_SEED = 0

# Merge both candidate lists. A pipeline can be in both (a frequent winner may
# also be a high-variance pipeline), so drop repeats while keeping first-seen
# order; dict.fromkeys preserves insertion order.
selected_models = list(
    dict.fromkeys(highest_std_pipelines + best_performing_models)
)
# Shuffle in place with a dedicated seeded generator: the result is
# reproducible and the global `random` state is left untouched.
random.Random(SHUFFLE_SEED).shuffle(selected_models)

In [None]:
# Persist the selected pipeline names as a JSON array (4-space indentation).
serialized_selection = json.dumps(selected_models, indent=4)
with open("data/tabrepo/selected_pipelines.json", "w") as out_file:
    out_file.write(serialized_selection)