In [1]:
import pandas as pd
import numpy as np
from kaggle_kl_div import score as kaggle_metric
from scipy.special import softmax
from scipy.optimize import minimize
from glob import glob
from tqdm import tqdm
import warnings

# Install a blanket "ignore" filter: no warning of any category is shown from here on.
warnings.simplefilter("ignore")


def sigmoid(x):
    """Return the logistic function ``1 / (1 + exp(-x))`` of ``x``.

    Works on scalars and, through NumPy broadcasting, on arrays (elementwise).
    """
    exp_neg = np.exp(-x)
    return 1 / (1 + exp_neg)


# Raise pandas' "display.max_columns" option to 100 for this session.
pd.set_option("display.max_columns", 100)


def calc_metric(oof):
    """Score one out-of-fold frame with ``kaggle_metric``.

    Reads the module-level ``true_cols`` and ``pred_cols`` column lists.
    A row-wise softmax is applied to the prediction columns before scoring.

    Args:
        oof: DataFrame containing every column named in ``true_cols`` and
            ``pred_cols``. It is left unmodified.

    Returns:
        The value returned by ``kaggle_metric`` for the prepared frames.
    """
    # Work on explicit copies: both frames get new columns and new labels
    # below, and assigning into an un-copied selection triggers
    # SettingWithCopyWarning.
    true = oof[true_cols].copy()
    pred = oof[pred_cols].copy()
    pred[pred_cols] = softmax(pred[pred_cols].values, 1)

    row_ids = list(range(len(oof)))
    true["id"] = row_ids
    pred["id"] = row_ids

    # Relabel both frames positionally (0..n-1 plus "id") so that solution and
    # submission share identical column names, whatever the number of classes.
    true.columns = list(range(len(true_cols))) + ["id"]
    pred.columns = list(range(len(pred_cols))) + ["id"]
    score = kaggle_metric(solution=true, submission=pred, row_id_column_name="id")
    return score


def kl_divergence(final_preds):
    """KL-style loss between the module-level ``true`` array and ``final_preds``.

    ``final_preds`` is softmax-normalised along axis 1 and shifted by a small
    constant so the ratio inside the log never divides by zero. For each of the
    ``len(pred_cols)`` columns the mean over rows of
    ``true * log(true / pred)`` is computed; the per-column means are summed.

    Reads the module-level names ``true`` (indexed as a 2-D array) and
    ``pred_cols``.
    """
    tiny = 1e-10
    probs = softmax(final_preds, 1) + tiny
    per_class = [
        np.mean(true[:, k] * np.log(true[:, k] / probs[:, k]))
        for k in range(len(pred_cols))
    ]
    return np.sum(per_class)


def loss_fn(weights):
    """Objective for the weight search: loss of the weighted blend.

    ``weights`` is rescaled to sum to one, each array in the module-level
    ``preds`` list is multiplied by its weight, the products are added up in
    list order, and the blend is scored with ``kl_divergence``.
    """
    normalized = np.array(weights) / np.sum(weights)
    blended = sum(w * p for w, p in zip(normalized, preds))
    return kl_divergence(blended)


def round_to_nearest(x, base=0.1):
    """Snap ``x`` to the closest multiple of ``base``.

    The number of steps is obtained with ``np.round``, so ties follow NumPy's
    rounding rule.
    """
    steps = np.round(x / base)
    return base * steps

In [20]:
# ariyasu
configs = [
    "finetune_hms_chris_fmax30_50sec_16ims_bandpass",
    "finetune_hms_chris_fmax30_30sec_8ims_bandpass",
    "finetune_hms_chris_fmax30_30sec_8ims_bandpass_spe_and_eeg",
    "finetune_hms_chris_fmax60_30sec_16ims_bandpass",
    "finetune_hms_chris_fmax90_10sec_8ims_bandpass",
    "finetune_hms_chris_fmax90_10sec_8ims_bandpass_spe_and_eeg",
]
true_cols = ["seizure", "lpd", "gpd", "lrda", "grda", "other"]
pred_cols = [f"pred_{c}" for c in true_cols]
# Vote columns of the training CSV, derived once instead of being listed twice.
vote_cols = [f"{c}_vote" for c in true_cols]
preds = []
folds = range(5)

# Keep one row per distinct (eeg_id, vote vector) combination.
df = pd.read_csv("hms/input/train_with_path.csv").drop_duplicates(
    ["eeg_id"] + vote_cols
)
df["vote_sum"] = df[vote_cols].sum(1)
# eeg_ids of rows whose votes add up to at least 10.
good_label_ids = df[df["vote_sum"] >= 10].eeg_id

reference_ids = None
for config in configs:
    oof = pd.concat(
        [pd.read_csv(f"hms/results/{config}/oof_fold{fold}.csv") for fold in folds]
    )
    oof = oof.sort_values("eeg_id")
    oof = oof[oof.eeg_id.isin(good_label_ids)]
    # The blend adds the prediction arrays row by row, so every config must
    # yield the same eeg_ids in the same order. Fail loudly instead of blending
    # misaligned rows.
    if reference_ids is None:
        reference_ids = oof.eeg_id.values
    elif not np.array_equal(oof.eeg_id.values, reference_ids):
        raise ValueError(
            f"OOF rows of {config} are not aligned with {configs[0]} (eeg_id mismatch)"
        )
    preds.append(oof[pred_cols].values)
# Targets are taken from the frame of the last config loaded above.
true = oof[true_cols].values

# tattaka
cols = [
    "oof_logits_seizure_vote",
    "oof_logits_lpd_vote",
    "oof_logits_gpd_vote",
    "oof_logits_lrda_vote",
    "oof_logits_grda_vote",
    "oof_logits_other_vote",
]
exp_dirs = [
    "hms/results/tattaka/0317/exp092/tiny_vit_21m_384_el30_mixup_50ep",
    "hms/results/tattaka/0317/exp094/caformer_s18_2_5d_256_el30_mixup_100ep",
]
for exp_dir in exp_dirs:
    # The last path component serves as the model's display name.
    c = exp_dir.split("/")[-1]
    # glob returns matches in arbitrary order; sort so the chosen file is
    # deterministic, and report a missing result file explicitly.
    result_paths = sorted(glob(f"{exp_dir}/**/result_df.csv", recursive=True))
    if not result_paths:
        raise FileNotFoundError(f"no result_df.csv found under {exp_dir}")
    # A dedicated name keeps the training frame `df` from being overwritten.
    tattaka_oof = pd.read_csv(result_paths[0]).sort_values("eeg_id")
    tattaka_oof = tattaka_oof[tattaka_oof.eeg_id.isin(good_label_ids)]
    # Predictions are blended row by row with the arrays already in `preds`,
    # so the eeg_id order must match the frame those arrays came from.
    if not np.array_equal(tattaka_oof.eeg_id.values, oof.eeg_id.values):
        raise ValueError(f"rows of {c} are not aligned with `oof` (eeg_id mismatch)")
    preds.append(tattaka_oof[cols].values)
    configs.append(c)

# bilzard
# NOTE(review): these "pl_*" columns are blended with the other models' outputs
# and passed through softmax downstream -- confirm they hold logits rather than
# probabilities.
cols = [
    "pl_seizure_vote",
    "pl_lpd_vote",
    "pl_gpd_vote",
    "pl_lrda_vote",
    "pl_grda_vote",
    "pl_other_vote",
]
for c in [
    "exp081_8ep_sc03",
    "eeg022_16ep_sc03c",
]:
    pseudo_label_df = pd.read_parquet(
        f"hms/results/bilzard/pseudo_label/{c}/train_pseudo_label.pqt"
    ).sort_values("eeg_id")
    pseudo_label_df = pseudo_label_df[pseudo_label_df.eeg_id.isin(good_label_ids)]
    # Predictions are blended row by row with the arrays already in `preds`,
    # so the eeg_id order must match the frame those arrays came from.
    if not np.array_equal(pseudo_label_df.eeg_id.values, oof.eeg_id.values):
        raise ValueError(f"rows of {c} are not aligned with `oof` (eeg_id mismatch)")
    preds.append(pseudo_label_df[cols].values)
    configs.append(c)

In [22]:
# Nelder-Mead
# Targets for kl_divergence, shifted by a small constant so true / pred and
# log(...) stay finite when a target entry is 0.
# `oof` is already sorted by eeg_id and row-aligned with the arrays in `preds`.
# It is deliberately not re-sorted here: the default sort is not stable and
# could reorder rows sharing an eeg_id, breaking that alignment.
true = oof[true_cols].values + 1e-10
# Start from a uniform blend.
starting_weights = [1 / len(preds)] * len(preds)
bounds = [(0, 1)] * len(preds)
# No `constraints=` argument: Nelder-Mead does not support constraints (SciPy
# ignores them for this method). The sum-to-one condition is already enforced
# inside loss_fn, which normalises the weights before blending.
res = minimize(
    loss_fn,
    starting_weights,
    method="Nelder-Mead",
    bounds=bounds,
)

In [None]:
# print
# Optimised weights, rescaled to sum to one (the same normalisation loss_fn applies).
weights = res["x"] / np.sum(res["x"])
print("weights:")
for w, c in zip(weights, configs):
    print(c, round(w, 4))

print("best score:", round(res["fun"], 4))

print("=" * 100)

# Snap every weight to round_to_nearest's default grid, then renormalise.
rounded_weights = round_to_nearest(weights)
rounded_total = np.sum(rounded_weights)
if rounded_total == 0:
    # Every weight rounded down to zero; renormalising would divide by zero,
    # so keep the unrounded weights instead.
    rounded_weights = weights
else:
    rounded_weights = rounded_weights / rounded_total
print("rounded_weights:")
for w, c in zip(rounded_weights, configs):
    print(c, round(w, 4))

# Blend with the rounded weights and store row-wise softmax probabilities in
# `oof` (this overwrites its prediction columns).
all_preds = [p * w for w, p in zip(rounded_weights, preds)]
oof[pred_cols] = softmax(np.sum(all_preds, 0), 1)

# Use dedicated names for the metric inputs: the module-level `true` is the
# NumPy target array read by kl_divergence/loss_fn, and rebinding it to a
# relabelled DataFrame would break any later re-run of the loss.
solution = oof[true_cols].copy()
submission = oof[pred_cols].copy()
row_ids = list(range(len(oof)))
solution["id"] = row_ids
submission["id"] = row_ids
# Positional labels 0..n-1 plus "id", identical on both frames.
metric_columns = list(range(len(true_cols))) + ["id"]
solution.columns = metric_columns
submission.columns = metric_columns
score = kaggle_metric(
    solution=solution, submission=submission, row_id_column_name="id"
)
print("score:", round(score, 4))