In [1]:
import pandas as pd

In [74]:
# Selection used throughout the notebook; both names are interpolated into the
# output file names written by the summary cells further down.
split = "gower_split"
type = 334  # NOTE(review): shadows the builtin `type`; kept because later cells reference this name

# Load every result row, then keep only the chosen split method and suite.
df = pd.read_csv("results_full.csv")
df = df[(df["split_method"] == split) & (df["suite_id"] == type)]

In [75]:
# Short display labels for the model names found in the results file.
# Regressor/classifier variants (and the per-metric GPBoost variants) share one label.
rename_map = {
    "ConstantPredictor": "Const.",
    "LinearRegressor": "Lin. Regr.",
    "LogisticRegressor": "Log. Regr.",
    "FTTransformer": "FT-Trans.",
    "ResNet": "ResNet",
    **dict.fromkeys(("LGBMRegressor", "LGBMClassifier"), "GBT"),
    **dict.fromkeys(("RandomForestRegressor", "RandomForestClassifier"), "RF"),
    **dict.fromkeys(("TabPFNRegressor", "TabPFNClassifier"), "TabPFN"),
    **dict.fromkeys(
        ("GPBoost_LogLoss", "GPBoost_Accuracy", "GPBoost_CRPS", "GPBoost_RMSE"),
        "GP",
    ),
}


Average relative difference, normalized accuracy, and average rank (RMSE)

In [132]:
# Task x model table of mean RMSE values, with short model labels as columns.
metric = "RMSE"
dfm = df.loc[df["metric"] == metric]

pivot_rmse = (
    dfm
    .pivot_table(values="value", index="task_id", columns="model", aggfunc="mean")
    .rename(columns=rename_map)
)

# Persist the per-task table next to the notebook.
pivot_rmse.to_csv(f"avg_{metric}_per_task_per_model.csv", float_format="%.3f")


In [133]:
def avg_relative_diff(pivot: pd.DataFrame, error_metric: bool = True) -> pd.Series:
    """Mean percentage gap between each model and the best model of every task.

    Parameters
    ----------
    pivot : pd.DataFrame
        One row per task, one column per model, cells hold the metric value.
    error_metric : bool, default True
        True when lower values are better (gap = (v - best) / best),
        False when higher values are better (gap = (best - v) / best).

    Returns
    -------
    pd.Series
        One value per column of ``pivot``, in percent, averaged over the rows
        (NaN cells are skipped by the mean).
    """
    if error_metric:
        best_per_task = pivot.min(axis=1)   # best is minimum
        gap = pivot.sub(best_per_task, axis=0)
    else:
        best_per_task = pivot.max(axis=1)   # best is maximum
        # Row-wise `best - v`. The frame has to drive the operation:
        # `Series.sub(DataFrame)` would match the task index against the model
        # columns and produce nothing but NaN.
        gap = pivot.rsub(best_per_task, axis=0)

    rel_diff = gap.div(best_per_task, axis=0) * 100

    # average across tasks (skipping any NaNs)
    return rel_diff.mean(axis=0)


# Per-model mean percentage gap to the best RMSE of each task.
avg_diff_rmse = avg_relative_diff(pivot=pivot_rmse, error_metric=True)
print("Average relative difference (RMSE):", avg_diff_rmse.round(2), sep="\n")


Average relative difference (RMSE):
model
Const.          335.43
Engression     2589.68
FT-Trans.        35.96
GPBoost         106.00
GBT             141.53
Lin. Regr.    96189.89
MLP             367.64
RF              153.44
ResNet          120.62
TabPFN           24.50
dtype: float64


In [134]:
def normalized_accuracy(pivot: pd.DataFrame,
                        error_metric: bool = True) -> pd.Series:
    """Average per-task score rescaled to [0, 1] between a reference and the best model.

    For every task (row) the best value maps to 1 and the reference value
    ``mid`` maps to 0; anything outside that range is clipped.

    Parameters
    ----------
    pivot : pd.DataFrame
        One row per task, one column per model, cells hold the metric value.
    error_metric : bool, default True
        True when lower values are better: ``best`` is the row minimum and
        ``mid`` the smallest of the three largest non-NaN values.
        False when higher values are better: ``best`` is the row maximum and
        ``mid`` the largest of the three smallest non-NaN values.

    Returns
    -------
    pd.Series
        Mean normalized score per column of ``pivot`` (fraction, not percent).
    """
    if error_metric:
        best = pivot.min(axis=1)
        mid = pivot.apply(
            lambda s: s.dropna().nlargest(3).min(),
            axis=1
        )
        # (mid - v) / (mid - best): 1 at the lowest error, 0 at the reference
        norm = (
            pivot
            .rsub(mid, axis=0)
            .div(mid - best, axis=0)
            .clip(0, 1)
        )
    else:
        best = pivot.max(axis=1)
        mid = pivot.apply(
            lambda s: s.dropna().nsmallest(3).max(),
            axis=1
        )
        # (v - mid) / (best - mid): 1 at the highest score, 0 at the reference.
        # The denominator must be `best - mid`; dividing by `mid - best` flips
        # the sign and clips the best model to 0.
        norm = (
            pivot
            .sub(mid, axis=0)
            .div(best - mid, axis=0)
            .clip(0, 1)
        )

    return norm.mean(axis=0)


# Normalized accuracy derived from RMSE, printed as a percentage.
avg_norm_acc_rmse = normalized_accuracy(pivot=pivot_rmse, error_metric=True)
print("Average normalized accuracy (from RMSE):", (100 * avg_norm_acc_rmse).round(2), sep="\n")


Average normalized accuracy (from RMSE):
model
Const.        15.72
Engression    42.19
FT-Trans.     61.80
GPBoost       28.86
GBT           75.12
Lin. Regr.     0.00
MLP           45.42
RF            60.94
ResNet        44.72
TabPFN        95.74
dtype: float64


In [135]:
def avg_rank(pivot: pd.DataFrame, error_metric: bool = True) -> pd.Series:
    """Mean rank of every model over all tasks.

    Models are ranked within each row of ``pivot``; ties share the average of
    their positions. With ``error_metric=True`` the smallest value gets rank 1,
    otherwise the largest does. Returns one mean rank per column.
    """
    rank_options = dict(axis="columns", method="average", ascending=error_metric)
    per_task_ranks = pivot.rank(**rank_options)
    return per_task_ranks.mean(axis="index")

# Mean rank per model on RMSE (rank 1 = lowest error).
avg_rank_rmse = avg_rank(pivot=pivot_rmse, error_metric=True)
print("Average rank (RMSE):", avg_rank_rmse.round(2), sep="\n")



Average rank (RMSE):
model
Const.        8.29
Engression    5.59
FT-Trans.     3.93
GPBoost       6.35
GBT           3.24
Lin. Regr.    9.18
MLP           5.65
RF            4.35
ResNet        5.71
TabPFN        1.65
dtype: float64


In [136]:
# Summary statistics per model for RMSE (lower is better).
avg_diff    = avg_relative_diff(pivot_rmse, error_metric=True)           # in %
avg_acc     = normalized_accuracy(pivot_rmse, error_metric=True) * 100   # convert to %
# Kept under its own name: binding the result to `avg_rank` would replace the
# function with a Series and make every later `avg_rank(...)` call fail.
avg_rank_summary = avg_rank(pivot_rmse, error_metric=True)

# One row per summary statistic, one column per model.
metrics = pd.DataFrame(
    [avg_diff, avg_acc, avg_rank_summary],
    index=["Avg. diff.", "Avg. acc.", "Avg. rank."]
)

# Stack the summary rows beneath the per-task rows.
pivot_with_summaries = pd.concat([pivot_rmse, metrics])

# Turn the row labels into a regular column; the rename covers the case where
# reset_index calls that column "index".
out = pivot_with_summaries.reset_index().rename(columns={"index":"task_id"})

out.to_csv(f"avg_{metric}_with_summary.csv", float_format="%.3f", index=False)

# Same table as LaTeX; `type` and `split` come from the data-loading cell.
latex = out.to_latex(index=False, escape=True, float_format="%.3f")
with open(f"{type}_{split}_{metric}_results.tex","w") as f:
    f.write(latex)

Average relative difference, normalized accuracy, and average rank (CRPS)

In [137]:
# Task x model table of mean CRPS values, with short model labels as columns.
metric1 = "CRPS"
dfm1 = df.loc[df["metric"] == metric1]

pivot_crps = (
    dfm1
    .pivot_table(values="value", index="task_id", columns="model", aggfunc="mean")
    .rename(columns=rename_map)
)

# Persist the per-task table next to the notebook.
pivot_crps.to_csv(f"avg_{metric1}_per_task_per_model.csv", float_format="%.3f")


In [138]:
def avg_relative_diff(pivot: pd.DataFrame, error_metric: bool = True) -> pd.Series:
    """Mean percentage gap between each model and the best model of every task.

    Parameters
    ----------
    pivot : pd.DataFrame
        One row per task, one column per model, cells hold the metric value.
    error_metric : bool, default True
        True when lower values are better (gap = (v - best) / best),
        False when higher values are better (gap = (best - v) / best).

    Returns
    -------
    pd.Series
        One value per column of ``pivot``, in percent, averaged over the rows
        (NaN cells are skipped by the mean).
    """
    if error_metric:
        best_per_task = pivot.min(axis=1)   # best is minimum
        gap = pivot.sub(best_per_task, axis=0)
    else:
        best_per_task = pivot.max(axis=1)   # best is maximum
        # Row-wise `best - v`. The frame has to drive the operation:
        # `Series.sub(DataFrame)` would match the task index against the model
        # columns and produce nothing but NaN.
        gap = pivot.rsub(best_per_task, axis=0)

    rel_diff = gap.div(best_per_task, axis=0) * 100

    # average across tasks (skipping any NaNs)
    return rel_diff.mean(axis=0)


# Per-model mean percentage gap to the best CRPS of each task.
avg_diff_crps = avg_relative_diff(pivot=pivot_crps, error_metric=True)
print("Average relative difference (CRPS):", avg_diff_crps.round(2), sep="\n")


Average relative difference (CRPS):
model
Const.          743.72
DGBT             71.88
DRF             152.01
Engression     2359.59
FT-Trans.        26.64
GPBoost         310.01
GBT              94.24
Lin. Regr.    49160.59
MLP             290.49
RF              108.34
ResNet          367.46
TabPFN           15.10
dtype: float64


In [139]:
def normalized_accuracy(pivot: pd.DataFrame,
                        error_metric: bool = True) -> pd.Series:
    """Average per-task score rescaled to [0, 1] between a reference and the best model.

    For every task (row) the best value maps to 1 and the reference value
    ``mid`` maps to 0; anything outside that range is clipped.

    Parameters
    ----------
    pivot : pd.DataFrame
        One row per task, one column per model, cells hold the metric value.
    error_metric : bool, default True
        True when lower values are better: ``best`` is the row minimum and
        ``mid`` the smallest of the three largest non-NaN values.
        False when higher values are better: ``best`` is the row maximum and
        ``mid`` the largest of the three smallest non-NaN values.

    Returns
    -------
    pd.Series
        Mean normalized score per column of ``pivot`` (fraction, not percent).
    """
    if error_metric:
        best = pivot.min(axis=1)
        mid = pivot.apply(
            lambda s: s.dropna().nlargest(3).min(),
            axis=1
        )
        # (mid - v) / (mid - best): 1 at the lowest error, 0 at the reference
        norm = (
            pivot
            .rsub(mid, axis=0)
            .div(mid - best, axis=0)
            .clip(0, 1)
        )
    else:
        best = pivot.max(axis=1)
        mid = pivot.apply(
            lambda s: s.dropna().nsmallest(3).max(),
            axis=1
        )
        # (v - mid) / (best - mid): 1 at the highest score, 0 at the reference.
        # The denominator must be `best - mid`; dividing by `mid - best` flips
        # the sign and clips the best model to 0.
        norm = (
            pivot
            .sub(mid, axis=0)
            .div(best - mid, axis=0)
            .clip(0, 1)
        )

    return norm.mean(axis=0)


# Normalized accuracy derived from CRPS, printed as a percentage.
avg_norm_acc_crps = normalized_accuracy(pivot=pivot_crps, error_metric=True)
print("Average normalized accuracy (from CRPS):", (100 * avg_norm_acc_crps).round(2), sep="\n")


Average normalized accuracy (from CRPS):
model
Const.        14.70
DGBT          71.29
DRF           70.29
Engression    18.80
FT-Trans.     77.77
GPBoost       40.86
GBT           71.46
Lin. Regr.     6.42
MLP           57.00
RF            63.31
ResNet        54.12
TabPFN        89.68
dtype: float64


In [140]:
def avg_rank(pivot: pd.DataFrame, error_metric: bool = True) -> pd.Series:
    """Mean rank of every model over all tasks.

    Models are ranked within each row of ``pivot``; ties share the average of
    their positions. With ``error_metric=True`` the smallest value gets rank 1,
    otherwise the largest does. Returns one mean rank per column.
    """
    rank_options = dict(axis="columns", method="average", ascending=error_metric)
    per_task_ranks = pivot.rank(**rank_options)
    return per_task_ranks.mean(axis="index")

# Mean rank per model on CRPS (rank 1 = lowest error).
avg_rank_crps = avg_rank(pivot=pivot_crps, error_metric=True)
print("Average rank (CRPS):", avg_rank_crps.round(2), sep="\n")



Average rank (CRPS):
model
Const.        10.41
DGBT           4.41
DRF            4.71
Engression     9.47
FT-Trans.      3.71
GPBoost        7.82
GBT            4.65
Lin. Regr.    10.24
MLP            6.47
RF             5.59
ResNet         6.88
TabPFN         2.18
dtype: float64


In [141]:
# Summary statistics per model for CRPS (lower is better).
avg_diff1 = avg_relative_diff(pivot_crps, error_metric=True)           # percent
avg_acc1 = 100 * normalized_accuracy(pivot_crps, error_metric=True)    # fraction -> percent
avg_rank1 = avg_rank(pivot_crps, error_metric=True)

# One row per summary statistic, one column per model.
summary_labels = ["Avg. diff.", "Avg. acc.", "Avg. rank."]
metrics = pd.DataFrame([avg_diff1, avg_acc1, avg_rank1], index=summary_labels)

# Stack the summary rows beneath the per-task rows and expose the labels as a column.
pivot_with_summaries1 = pd.concat([pivot_crps, metrics])
out1 = pivot_with_summaries1.reset_index()
out1 = out1.rename(columns={"index": "task_id"})

out1.to_csv(f"avg_{metric1}_with_summary.csv", index=False, float_format="%.3f")

# Same table as LaTeX; `type` and `split` come from the data-loading cell.
latex1 = out1.to_latex(index=False, escape=True, float_format="%.3f")
with open(f"{type}_{split}_{metric1}_results.tex", "w") as f:
    f.write(latex1)

Average relative difference, normalized accuracy, and average rank (LogLoss)

In [76]:
# Task x model table of mean LogLoss values, with short model labels as columns.
metric2 = "LogLoss"
dfm2 = df.loc[df["metric"] == metric2]

pivot_ll = (
    dfm2
    .pivot_table(values="value", index="task_id", columns="model", aggfunc="mean")
    .rename(columns=rename_map)
)

# Persist the per-task table next to the notebook.
pivot_ll.to_csv(f"avg_{metric2}_per_task_per_model.csv", float_format="%.3f")


In [77]:
def avg_relative_diff(pivot: pd.DataFrame, error_metric: bool = True) -> pd.Series:
    """Mean percentage gap between each model and the best model of every task.

    Parameters
    ----------
    pivot : pd.DataFrame
        One row per task, one column per model, cells hold the metric value.
    error_metric : bool, default True
        True when lower values are better (gap = (v - best) / best),
        False when higher values are better (gap = (best - v) / best).

    Returns
    -------
    pd.Series
        One value per column of ``pivot``, in percent, averaged over the rows
        (NaN cells are skipped by the mean).
    """
    if error_metric:
        best_per_task = pivot.min(axis=1)   # best is minimum
        gap = pivot.sub(best_per_task, axis=0)
    else:
        best_per_task = pivot.max(axis=1)   # best is maximum
        # Row-wise `best - v`. The frame has to drive the operation:
        # `Series.sub(DataFrame)` would match the task index against the model
        # columns and produce nothing but NaN.
        gap = pivot.rsub(best_per_task, axis=0)

    rel_diff = gap.div(best_per_task, axis=0) * 100

    return rel_diff.mean(axis=0)


# Per-model mean percentage gap to the best LogLoss of each task.
avg_diff_ll = avg_relative_diff(pivot=pivot_ll, error_metric=True)
print("Average relative difference (LogLoss):", avg_diff_ll.round(2), sep="\n")


Average relative difference (LogLoss):
model
Const.         46.43
Engression    152.45
FT-Trans.       6.57
GP             15.02
GBT             5.32
Log. Regr.     21.10
MLP            18.39
RF              5.50
ResNet         14.12
TabPFN          1.54
dtype: float64


In [78]:
def normalized_accuracy(pivot: pd.DataFrame,
                        error_metric: bool = True) -> pd.Series:
    """Average per-task score rescaled to [0, 1] between a reference and the best model.

    For every task (row) the best value maps to 1 and the reference value
    ``mid`` maps to 0; anything outside that range is clipped.

    Parameters
    ----------
    pivot : pd.DataFrame
        One row per task, one column per model, cells hold the metric value.
    error_metric : bool, default True
        True when lower values are better: ``best`` is the row minimum and
        ``mid`` the smallest of the three largest non-NaN values.
        False when higher values are better: ``best`` is the row maximum and
        ``mid`` the largest of the three smallest non-NaN values.

    Returns
    -------
    pd.Series
        Mean normalized score per column of ``pivot`` (fraction, not percent).
    """
    if error_metric:
        best = pivot.min(axis=1)
        mid = pivot.apply(
            lambda s: s.dropna().nlargest(3).min(),
            axis=1
        )
        # (mid - v) / (mid - best): 1 at the lowest error, 0 at the reference
        norm = (
            pivot
            .rsub(mid, axis=0)
            .div(mid - best, axis=0)
            .clip(0, 1)
        )
    else:
        best = pivot.max(axis=1)
        mid = pivot.apply(
            lambda s: s.dropna().nsmallest(3).max(),
            axis=1
        )
        # (v - mid) / (best - mid): 1 at the highest score, 0 at the reference.
        # The denominator must be `best - mid`; dividing by `mid - best` flips
        # the sign and clips the best model to 0.
        norm = (
            pivot
            .sub(mid, axis=0)
            .div(best - mid, axis=0)
            .clip(0, 1)
        )

    return norm.mean(axis=0)


# Normalized accuracy derived from LogLoss, printed as a percentage.
avg_norm_acc_ll = normalized_accuracy(pivot=pivot_ll, error_metric=True)
print("Average normalized accuracy (from LogLoss):", (100 * avg_norm_acc_ll).round(2), sep="\n")


Average normalized accuracy (from LogLoss):
model
Const.         2.03
Engression     0.00
FT-Trans.     66.13
GP            38.31
GBT           80.11
Log. Regr.    14.73
MLP           40.90
RF            77.53
ResNet        59.10
TabPFN        93.62
dtype: float64


In [79]:
def avg_rank(pivot: pd.DataFrame, error_metric: bool = True) -> pd.Series:
    """Mean rank of every model over all tasks.

    Models are ranked within each row of ``pivot``; ties share the average of
    their positions. With ``error_metric=True`` the smallest value gets rank 1,
    otherwise the largest does. Returns one mean rank per column.
    """
    rank_options = dict(axis="columns", method="average", ascending=error_metric)
    per_task_ranks = pivot.rank(**rank_options)
    return per_task_ranks.mean(axis="index")

# Mean rank per model on LogLoss (rank 1 = lowest loss).
avg_rank_ll = avg_rank(pivot=pivot_ll, error_metric=True)
print("Average rank (LogLoss):", avg_rank_ll.round(2), sep="\n")



Average rank (LogLoss):
model
Const.        9.14
Engression    8.86
FT-Trans.     4.17
GP            6.14
GBT           2.71
Log. Regr.    7.43
MLP           5.71
RF            3.14
ResNet        5.00
TabPFN        1.86
dtype: float64


In [80]:
# Summary statistics per model for LogLoss (lower is better).
avg_diff2 = avg_relative_diff(pivot_ll, error_metric=True)           # percent
avg_acc2 = 100 * normalized_accuracy(pivot_ll, error_metric=True)    # fraction -> percent
avg_rank2 = avg_rank(pivot_ll, error_metric=True)

# One row per summary statistic, one column per model.
summary_labels = ["Avg. diff.", "Avg. acc.", "Avg. rank."]
metrics = pd.DataFrame([avg_diff2, avg_acc2, avg_rank2], index=summary_labels)

# Stack the summary rows beneath the per-task rows and expose the labels as a column.
pivot_with_summaries2 = pd.concat([pivot_ll, metrics])
out2 = pivot_with_summaries2.reset_index()
out2 = out2.rename(columns={"index": "task_id"})

out2.to_csv(f"avg_{metric2}_with_summary.csv", index=False, float_format="%.3f")

# Same table as LaTeX; `type` and `split` come from the data-loading cell.
latex2 = out2.to_latex(index=False, escape=True, float_format="%.3f")
with open(f"{type}_{split}_{metric2}_results.tex", "w") as f:
    f.write(latex2)

Average relative difference, normalized accuracy, and average rank (Accuracy)

In [81]:
# Task x model table of mean Accuracy values, with short model labels as columns.
metric3 = "Accuracy"
dfm3 = df.loc[df["metric"] == metric3]

pivot_acc = (
    dfm3
    .pivot_table(values="value", index="task_id", columns="model", aggfunc="mean")
    .rename(columns=rename_map)
)

# Persist the per-task table next to the notebook (five decimals for accuracy).
pivot_acc.to_csv(f"avg_{metric3}_per_task_per_model.csv", float_format="%.5f")


In [82]:
def avg_relative_diff(pivot: pd.DataFrame, error_metric: bool = True) -> pd.Series:
    """Mean percentage gap between each model and the best model of every task.

    ``pivot`` holds one row per task and one column per model. With
    ``error_metric=True`` the row minimum is the best value and the gap is
    ``(v - best) / best``; otherwise the row maximum is the best value and the
    gap is ``(best - v) / best``. Returns the per-column mean in percent.
    """
    reference = pivot.min(axis=1) if error_metric else pivot.max(axis=1)

    # Signed distance to the per-task reference, aligned on the task index.
    gap = pivot.sub(reference, axis=0)
    if not error_metric:
        # For score metrics the best value is the largest, so flip the sign.
        gap = gap.mul(-1)

    percent_gap = gap.div(reference, axis=0) * 100
    return percent_gap.mean(axis=0)


# Per-model mean percentage gap to the best accuracy of each task.
avg_diff_accuracy = avg_relative_diff(pivot=pivot_acc, error_metric=False)
print("\nAverage relative difference (Accuracy):", avg_diff_accuracy.round(2), sep="\n")



Average relative difference (Accuracy):
model
Const.        41.83
Engression     8.03
FT-Trans.      6.59
GP             6.72
GBT            2.30
Log. Regr.     6.69
MLP            3.49
RF             2.50
ResNet         3.54
TabPFN         1.15
dtype: float64


In [83]:
def normalized_accuracy(pivot: pd.DataFrame,
                        error_metric: bool = True) -> pd.Series:
    """Average per-task score rescaled to [0, 1] between a reference and the best model.

    For each row of ``pivot`` the best value maps to 1 and the reference value
    maps to 0, with everything outside clipped. For error metrics the best is
    the row minimum and the reference is the smallest of the three largest
    non-NaN values; for score metrics the best is the row maximum and the
    reference is the largest of the three smallest non-NaN values.
    Returns the per-column mean as a fraction.
    """
    def third_worst_error(row):
        return row.dropna().nlargest(3).min()

    def third_worst_score(row):
        return row.dropna().nsmallest(3).max()

    if error_metric:
        top = pivot.min(axis=1)
        reference = pivot.apply(third_worst_error, axis=1)
    else:
        top = pivot.max(axis=1)
        reference = pivot.apply(third_worst_score, axis=1)

    # (reference - v) / (reference - top) is 1 at the best value and 0 at the
    # reference for both metric directions.
    distance = pivot.rsub(reference, axis=0)
    scaled = distance.div(reference - top, axis=0)
    bounded = scaled.clip(lower=0, upper=1)

    return bounded.mean(axis=0)


# Normalized accuracy derived from Accuracy, printed as a percentage.
avg_norm_acc_acc = normalized_accuracy(pivot=pivot_acc, error_metric=False)
print("Average normalized accuracy (from Accuracy):", (100 * avg_norm_acc_acc).round(3), sep="\n")


Average normalized accuracy (from Accuracy):
model
Const.         0.000
Engression    18.091
FT-Trans.     29.494
GP            15.650
GBT           65.321
Log. Regr.    13.713
MLP           51.136
RF            69.918
ResNet        59.839
TabPFN        82.953
dtype: float64


In [84]:
def avg_rank(pivot: pd.DataFrame, error_metric: bool = True) -> pd.Series:
    """Mean rank of every model over all tasks.

    Models are ranked within each row of ``pivot``; ties share the average of
    their positions. With ``error_metric=True`` the smallest value gets rank 1,
    otherwise the largest does. Returns one mean rank per column.
    """
    rank_options = dict(axis="columns", method="average", ascending=error_metric)
    per_task_ranks = pivot.rank(**rank_options)
    return per_task_ranks.mean(axis="index")

# Mean rank per model on Accuracy (rank 1 = highest accuracy).
avg_rank_acc = avg_rank(pivot=pivot_acc, error_metric=False)
print("Average rank (Accuracy):", avg_rank_acc.round(2), sep="\n")



Average rank (Accuracy):
model
Const.        9.86
Engression    7.07
FT-Trans.     6.08
GP            7.29
GBT           3.71
Log. Regr.    6.86
MLP           4.14
RF            3.07
ResNet        3.71
TabPFN        2.64
dtype: float64


In [85]:
# Summary statistics per model for Accuracy (higher is better).
avg_diff3 = avg_relative_diff(pivot_acc, error_metric=False)           # percent
avg_acc3 = 100 * normalized_accuracy(pivot_acc, error_metric=False)    # fraction -> percent
avg_rank3 = avg_rank(pivot_acc, error_metric=False)

# One row per summary statistic, one column per model.
summary_labels = ["Avg. diff.", "Avg. acc.", "Avg. rank."]
metrics = pd.DataFrame([avg_diff3, avg_acc3, avg_rank3], index=summary_labels)

# Stack the summary rows beneath the per-task rows and expose the labels as a column.
pivot_with_summaries3 = pd.concat([pivot_acc, metrics])
out3 = pivot_with_summaries3.reset_index()
out3 = out3.rename(columns={"index": "task_id"})

out3.to_csv(f"avg_{metric3}_with_summary.csv", index=False, float_format="%.3f")

# Same table as LaTeX; `type` and `split` come from the data-loading cell.
latex3 = out3.to_latex(index=False, escape=True, float_format="%.3f")
with open(f"{type}_{split}_{metric3}_results.tex", "w") as f:
    f.write(latex3)