In [1]:
import pandas as pd

In [2]:
# Read the raw results table and keep only the rows whose `split_method`
# column equals "random_split".
df = (
    pd.read_csv("results_tabz.csv")
    .loc[lambda frame: frame["split_method"] == "random_split"]
)

In [3]:
# Raw model identifier -> short display label.
# Several identifiers share one label: the regressor/classifier variants of a
# model family, and every GPBoost_* entry (all shown as "GP").
rename_map = {
    "ConstantPredictor": "Const.",
    "FTTransformer": "FT-Trans.",
    "LGBMRegressor": "GBT",
    "LGBMClassifier": "GBT",
    "LinearRegressor": "Lin. Regr.",
    "RandomForestRegressor": "RF",
    "RandomForestClassifier": "RF",
    "ResNet": "ResNet",
    "TabPFNRegressor": "TabPFN",
    "TabPFNClassifier": "TabPFN",
    "LogisticRegressor": "Log. Regr.",
    "GPBoost_LogLoss": "GP",
    "GPBoost_Accuracy": "GP",
    "GPBoost_CRPS": "GP",
    "GPBoost_RMSE": "GP",
}


Average Difference and Average Relative Difference: LogLoss

In [4]:
# Display labels applied to the model columns of the LogLoss pivot table.
# Identifiers that are not listed here keep their original name when renamed.
rename_map_ll = {
    "ConstantPredictor": "Const.",
    "FTTransformer": "FT-Trans.",
    "LGBMClassifier": "GBT",
    "RandomForestClassifier": "RF",
    "ResNet": "ResNet",
    "TabPFNClassifier": "TabPFN",
    "LogisticRegressor": "Log. Regr.",
    "GPBoost_LogLoss": "GP",
}


In [5]:
# Restrict the results to the LogLoss rows.
metric2 = "LogLoss"
dfm2 = df.loc[df["metric"] == metric2]

# One row per task_id, one column per model; repeated (task, model) entries
# are averaged. Model columns are then relabelled for display.
pivot_ll = (
    dfm2
    .pivot_table(index="task_id", columns="model", values="value", aggfunc="mean")
    .rename(columns=rename_map_ll)
)

# Persist the per-task table with three decimals.
pivot_ll.to_csv(f"avg_{metric2}_per_task_per_model_IP_tabz.csv", float_format="%.3f")


In [6]:
def avg_relative_diff(pivot: pd.DataFrame, error_metric: bool = True) -> pd.Series:
    """Mean percentage gap between each column and the best value of every row.

    Parameters
    ----------
    pivot : pd.DataFrame
        Score table. Every row is compared against its own best entry; in this
        notebook rows are tasks and columns are models.
    error_metric : bool, default True
        True when lower values are better: the row minimum is the reference
        and the gap is ``(v - best) / best * 100``.
        False when higher values are better: the row maximum is the reference
        and the gap is ``(best - v) / best * 100``.

    Returns
    -------
    pd.Series
        One value per column of ``pivot``: the gap averaged over all rows.

    Notes
    -----
    The gap is divided by the row's best value, so a best value of 0 produces
    inf/NaN entries for that row.
    """
    if error_metric:
        best_per_task = pivot.min(axis=1)   # best is minimum
        # (v - best) / best * 100
        rel_diff = (pivot.sub(best_per_task, axis=0)
                         .div(best_per_task, axis=0)
                         * 100)
    else:
        best_per_task = pivot.max(axis=1)   # best is maximum
        # (best - v) / best * 100
        # Keep the DataFrame on the left-hand side: `rsub(..., axis=0)` matches
        # `best_per_task` against the row index. Calling
        # `best_per_task.sub(pivot, axis=0)` instead defers to DataFrame
        # arithmetic, which aligns the Series with the *columns* and yields NaN.
        rel_diff = (pivot.rsub(best_per_task, axis=0)
                         .div(best_per_task, axis=0)
                         * 100)

    return rel_diff.mean(axis=0)


# LogLoss is an error metric, so the per-task reference is the smallest value.
avg_diff_ll = avg_relative_diff(pivot_ll, error_metric=True)

print(
    "Average relative difference (LogLoss) - Interpolation:",
    avg_diff_ll.round(2),
    sep="\n",
)


Average relative difference (LogLoss) - Interpolation:
model
Const.        51642.82
Engression       28.06
FT-Trans.       131.26
GP             1513.32
GBT               8.16
Log. Regr.     2564.13
MLP              87.14
RF             2924.21
ResNet           89.00
TabPFN          253.53
dtype: float64


In [7]:
def normalized_accuracy(pivot: pd.DataFrame,
                        error_metric: bool = True) -> pd.Series:
    """Mean per-column score after rescaling every row to the range [0, 1].

    Within each row, the best value maps to 1 and a reference value ``mid``
    maps to 0; anything worse than ``mid`` is clipped to 0. ``mid`` is the
    third-worst non-missing value of the row (or the best of whatever is
    available when the row has fewer than three values).

    Parameters
    ----------
    pivot : pd.DataFrame
        Score table; in this notebook rows are tasks and columns are models.
    error_metric : bool, default True
        True when lower values are better (best = row minimum, ``mid`` = third
        largest value). False when higher values are better (best = row
        maximum, ``mid`` = third smallest value).

    Returns
    -------
    pd.Series
        One value in [0, 1] per column of ``pivot``: the rescaled score
        averaged over all rows.

    Notes
    -----
    When ``mid`` equals ``best`` in a row, the division by zero yields NaN or
    a clipped infinity for that row.
    """
    if error_metric:
        best = pivot.min(axis=1)

        # Third-largest (i.e. third-worst) error of each row.
        mid  = pivot.apply(
            lambda s: s.dropna().nlargest(3).min(),
            axis=1
        )
        # norm per cell = (mid - err) / (mid - best), clipped to [0, 1]
        norm = (
            pivot
            .rsub(mid,       axis=0)    # mid[t] - pivot.loc[t, m]
            .div(mid - best, axis=0)
            .clip(0, 1)
        )
    else:
        best = pivot.max(axis=1)
        # Third-smallest (i.e. third-worst) score of each row.
        mid  = pivot.apply(
            lambda s: s.dropna().nsmallest(3).max(),
            axis=1
        )
        # norm per cell = (acc - mid) / (best - mid), clipped to [0, 1].
        # Numerator and denominator must carry the same sign convention;
        # dividing (acc - mid) by (mid - best) would invert the scale and
        # give the best model 0 after clipping.
        norm = (
            pivot
            .sub(mid,         axis=0)   # pivot.loc[t, m] - mid[t]
            .div(best - mid,  axis=0)
            .clip(0, 1)
        )

    return norm.mean(axis=0)


# Rescaled LogLoss score per model (lower LogLoss is better); shown in percent.
avg_norm_acc_ll = normalized_accuracy(pivot_ll, error_metric=True)
print(
    "Average normalized accuracy (from LogLoss) - Interpolation:",
    (100 * avg_norm_acc_ll).round(2),
    sep="\n",
)


Average normalized accuracy (from LogLoss) - Interpolation:
model
Const.         0.00
Engression    60.62
FT-Trans.     19.85
GP            49.77
GBT           82.59
Log. Regr.    42.14
MLP           36.07
RF            69.05
ResNet        21.30
TabPFN        94.07
dtype: float64


In [8]:
def avg_rank(pivot: pd.DataFrame, error_metric: bool = True) -> pd.Series:
    """Average rank of every column, with ranks computed separately per row.

    Rank 1 is the best entry of a row: the smallest value when
    ``error_metric`` is True, the largest value otherwise. Tied entries share
    the mean of the ranks they occupy.

    Returns a Series with one mean rank per column of ``pivot``.
    """
    # Rank across the columns of each row; direction follows the metric type.
    per_row_ranks = pivot.rank(axis=1, method="average", ascending=error_metric)
    return per_row_ranks.mean(axis=0)

# Rank 1 goes to the lowest LogLoss within each task.
avg_rank_ll = avg_rank(pivot_ll, error_metric=True)
print(
    "Average rank (LogLoss) - Interpolation:",
    avg_rank_ll.round(2),
    sep="\n",
)



Average rank (LogLoss) - Interpolation:
model
Const.        8.79
Engression    4.46
FT-Trans.     6.62
GP            4.76
GBT           2.86
Log. Regr.    5.24
MLP           5.35
RF            3.95
ResNet        6.58
TabPFN        1.66
dtype: float64


In [9]:
# Per-model LogLoss summaries (lower is better).
avg_diff2 = avg_relative_diff(pivot_ll, error_metric=True)  # already in %
avg_acc2 = normalized_accuracy(pivot_ll, error_metric=True) * 100  # fraction -> %
avg_rank2 = avg_rank(pivot_ll, error_metric=True)

# One row per summary statistic.
metrics = pd.DataFrame(
    [avg_diff2, avg_acc2, avg_rank2],
    index=["Avg. diff.", "Avg. acc.", "Avg. rank."],
)

# Stack the summary rows underneath the per-task rows.
pivot_with_summaries2 = pd.concat([pivot_ll, metrics])

# Move the row labels into a regular column; if reset_index names that column
# "index", relabel it "task_id".
out2 = (
    pivot_with_summaries2
    .reset_index()
    .rename(columns={"index": "task_id"})
)

# Export the combined table as CSV and as a LaTeX tabular.
out2.to_csv(f"avg_{metric2}_with_summary_IP.csv", float_format="%.3f", index=False)

latex2 = out2.to_latex(index=False, escape=True, float_format="%.3f")
with open(f"tabz_{metric2}_IP_results.tex", "w") as f:
    f.write(latex2)

Average Difference and Average Relative Difference: Accuracy

In [10]:
# Display labels applied to the model columns of the Accuracy pivot table.
# Identifiers that are not listed here keep their original name when renamed.
rename_map_acc = {
    "ConstantPredictor": "Const.",
    "FTTransformer": "FT-Trans.",
    "LGBMClassifier": "GBT",
    "RandomForestClassifier": "RF",
    "ResNet": "ResNet",
    "TabPFNClassifier": "TabPFN",
    "LogisticRegressor": "Log. Regr.",
    "GPBoost_Accuracy": "GP",
}


In [11]:
# Restrict the results to the Accuracy rows.
metric3 = "Accuracy"
dfm3 = df.loc[df["metric"] == metric3]

# One row per task_id, one column per model; repeated (task, model) entries
# are averaged. Model columns are then relabelled for display.
pivot_acc = (
    dfm3
    .pivot_table(index="task_id", columns="model", values="value", aggfunc="mean")
    .rename(columns=rename_map_acc)
)

# Persist the per-task table with five decimals.
pivot_acc.to_csv(f"avg_{metric3}_per_task_per_model_IP_tabz.csv", float_format="%.5f")


In [12]:
def avg_relative_diff(pivot: pd.DataFrame, error_metric: bool = True) -> pd.Series:
    """Mean percentage gap between each column and the best value of every row.

    With ``error_metric=True`` the row minimum is the reference and the gap is
    ``(v - best) / best * 100``; with ``error_metric=False`` the row maximum is
    the reference and the gap is ``(best - v) / best * 100``.

    Returns a Series with one averaged gap per column of ``pivot``.

    Note: this definition shadows the function of the same name defined in an
    earlier cell.
    """
    # Reference value of each row: the minimum for errors, the maximum for scores.
    pick_best = pivot.min if error_metric else pivot.max
    best_per_task = pick_best(axis=1)

    # Signed distance to the reference, flipped for scores so it is non-negative.
    gap = pivot.sub(best_per_task, axis=0)
    if not error_metric:
        gap = gap.mul(-1)

    rel_diff = gap.div(best_per_task, axis=0) * 100
    return rel_diff.mean(axis=0)


# Accuracy is a score (higher is better), so the per-task reference is the maximum.
avg_diff_accuracy = avg_relative_diff(pivot_acc, error_metric=False)

print(
    "\nAverage relative difference (Accuracy) - Interpolation:",
    avg_diff_accuracy.round(2),
    sep="\n",
)



Average relative difference (Accuracy) - Interpolation:
model
Const.        36.48
Engression     6.88
FT-Trans.     33.92
GP             7.82
GBT            3.87
Log. Regr.     9.72
MLP           19.58
RF             7.70
ResNet        25.95
TabPFN         2.41
dtype: float64


In [13]:
def normalized_accuracy(pivot: pd.DataFrame,
                        error_metric: bool = True) -> pd.Series:
    """Mean per-column score after rescaling every row to the range [0, 1].

    Within each row, the best value maps to 1 and a reference value ``mid``
    maps to 0; anything worse than ``mid`` is clipped to 0. ``mid`` is the
    third-worst non-missing value of the row: the third largest when
    ``error_metric`` is True (lower is better), the third smallest otherwise.

    Returns a Series with one averaged rescaled score per column of ``pivot``.

    Note: this definition shadows the function of the same name defined in an
    earlier cell.
    """
    def _third_worst(row: pd.Series):
        # Ignore missing entries before picking the reference value.
        present = row.dropna()
        if error_metric:
            return present.nlargest(3).min()
        return present.nsmallest(3).max()

    best = pivot.min(axis=1) if error_metric else pivot.max(axis=1)
    mid = pivot.apply(_third_worst, axis=1)

    # (mid - value) / (mid - best): equals 1 at the best value and 0 at mid,
    # for both metric directions.
    mid_minus_value = pivot.rsub(mid, axis=0)
    norm = mid_minus_value.div(mid - best, axis=0).clip(0, 1)

    return norm.mean(axis=0)


# Rescaled Accuracy score per model (higher Accuracy is better); shown in percent.
avg_norm_acc_acc = normalized_accuracy(pivot_acc, error_metric=False)
print(
    "Average normalized accuracy (from Accuracy) - Interpolation:",
    (100 * avg_norm_acc_acc).round(3),
    sep="\n",
)


Average normalized accuracy (from Accuracy) - Interpolation:
model
Const.         4.585
Engression    64.750
FT-Trans.     15.326
GP            58.877
GBT           73.427
Log. Regr.    47.811
MLP           23.894
RF            57.630
ResNet        19.633
TabPFN        84.867
dtype: float64


In [14]:
def avg_rank(pivot: pd.DataFrame, error_metric: bool = True) -> pd.Series:
    """Average rank of every column, with ranks computed separately per row.

    Rank 1 is the best entry of a row: the smallest value when
    ``error_metric`` is True, the largest value otherwise. Tied entries share
    the mean of the ranks they occupy.

    Returns a Series with one mean rank per column of ``pivot``.

    Note: this definition shadows the function of the same name defined in an
    earlier cell.
    """
    rank_options = {"axis": 1, "method": "average", "ascending": error_metric}
    row_wise_ranks = pivot.rank(**rank_options)
    return row_wise_ranks.mean(axis=0)

# Rank 1 goes to the highest Accuracy within each task.
avg_rank_acc = avg_rank(pivot_acc, error_metric=False)
print(
    "Average rank (Accuracy) -Interpolation:",
    avg_rank_acc.round(2),
    sep="\n",
)



Average rank (Accuracy) -Interpolation:
model
Const.        8.45
Engression    3.68
FT-Trans.     7.36
GP            4.39
GBT           3.75
Log. Regr.    5.27
MLP           6.25
RF            4.52
ResNet        6.82
TabPFN        2.27
dtype: float64


In [15]:
# Per-model Accuracy summaries (higher is better).
avg_diff3 = avg_relative_diff(pivot_acc, error_metric=False)  # already in %
avg_acc3 = normalized_accuracy(pivot_acc, error_metric=False) * 100  # fraction -> %
avg_rank3 = avg_rank(pivot_acc, error_metric=False)

# One row per summary statistic.
metrics = pd.DataFrame(
    [avg_diff3, avg_acc3, avg_rank3],
    index=["Avg. diff.", "Avg. acc.", "Avg. rank."],
)

# Stack the summary rows underneath the per-task rows.
pivot_with_summaries3 = pd.concat([pivot_acc, metrics])

# Move the row labels into a regular column; if reset_index names that column
# "index", relabel it "task_id".
out3 = (
    pivot_with_summaries3
    .reset_index()
    .rename(columns={"index": "task_id"})
)

# Export the combined table as CSV and as a LaTeX tabular.
out3.to_csv(f"avg_{metric3}_with_summary_IP.csv", float_format="%.3f", index=False)

latex3 = out3.to_latex(index=False, escape=True, float_format="%.3f")
with open(f"tabz_{metric3}_IP_results.tex", "w") as f:
    f.write(latex3)