In [1]:
import pandas as pd

In [2]:
# Load every recorded result, then discard the rows whose split method is
# "random_split" so that all later cells only see the remaining splits.
df = pd.read_csv("combined_results.csv")
df = df.loc[df["split_method"] != "random_split"]

In [None]:
metric = "RMSE"
dfm = df.loc[df["metric"] == metric]

# Mean value per task over all remaining rows, written out as a two-column
# table (task_id, avg_<metric>).
avg_per_task = (
    dfm.groupby("task_id")["value"]
    .mean()
    .reset_index(name=f"avg_{metric}")
)
avg_per_task.to_csv(f"avg_{metric}_per_task.csv", index=False)


Average Relative Difference, Normalized Accuracy and Average Rank: RMSE

In [3]:
metric = "RMSE"
dfm = df.loc[df["metric"] == metric]

# One row per task, one column per model; repeated (task, model) entries are
# collapsed to their mean.
pivot_rmse = pd.pivot_table(
    dfm,
    values="value",
    index="task_id",
    columns="model",
    aggfunc="mean",
)

# Persist the table with three decimals.
pivot_rmse.to_csv(f"avg_{metric}_per_task_per_model.csv", float_format="%.3f")


In [4]:
def avg_relative_diff(pivot: pd.DataFrame, error_metric: bool = True) -> pd.Series:
    """Mean percentage gap between each model and the best model of every task.

    Parameters
    ----------
    pivot : pd.DataFrame
        One row per task, one column per model; cells hold the metric value.
    error_metric : bool, default True
        True when lower values are better (the row minimum is the reference),
        False when higher values are better (the row maximum is the reference).

    Returns
    -------
    pd.Series
        Indexed by the columns of ``pivot``: the gap to the per-task best
        value, in percent of that best value, averaged over the rows.

    Notes
    -----
    The gap is divided by the best value itself, so a task whose best value is
    zero or close to zero produces an infinite or very large percentage.
    """
    if error_metric:
        best_per_task = pivot.min(axis=1)   # best is minimum
        # (v - best) / best * 100
        gap = pivot.sub(best_per_task, axis=0)
    else:
        best_per_task = pivot.max(axis=1)   # best is maximum
        # (best - v) / best * 100
        # The subtraction must be driven from the DataFrame side: writing it
        # as Series.sub(DataFrame) matches the task index against the model
        # columns and yields an all-NaN frame.
        gap = pivot.rsub(best_per_task, axis=0)

    rel_diff = gap.div(best_per_task, axis=0) * 100

    # average across tasks (skipping any NaNs)
    return rel_diff.mean(axis=0)


# RMSE is treated as an error metric: the lowest value per task is the reference.
avg_diff_rmse = avg_relative_diff(pivot_rmse, error_metric=True)
print("Average relative difference (RMSE):", avg_diff_rmse.round(2), sep="\n")


Average relative difference (RMSE):
model
ConstantPredictor           332.93
Engression                 9070.44
FTTransformer                49.61
LGBMRegressor                47.43
LinearRegressor          103915.06
MLP                         149.23
RandomForestRegressor        54.33
ResNet                      124.98
TabPFNRegressor              21.36
dtype: float64


In [5]:
def normalized_accuracy(pivot: pd.DataFrame,
                        error_metric: bool = True) -> pd.Series:
    """Average per-task score rescaled so that the best model gets 1.

    For every task (row) the best value maps to 1 and the third-worst value
    (``mid``) maps to 0; the result is clipped to [0, 1], so models worse than
    ``mid`` also score 0. The rescaled scores are then averaged over tasks.

    Parameters
    ----------
    pivot : pd.DataFrame
        One row per task, one column per model; cells hold the metric value.
    error_metric : bool, default True
        True when lower values are better, False when higher values are better.

    Returns
    -------
    pd.Series
        Indexed by the columns of ``pivot``; mean rescaled score per model.
    """
    if error_metric:
        best = pivot.min(axis=1)
        # third-largest error of the task, i.e. the third-worst model
        mid = pivot.apply(
            lambda s: s.dropna().nlargest(3).min(),
            axis=1
        )
    else:
        best = pivot.max(axis=1)
        # third-smallest score of the task, i.e. the third-worst model
        mid = pivot.apply(
            lambda s: s.dropna().nsmallest(3).max(),
            axis=1
        )

    # (mid - v) / (mid - best) is the same ratio as (v - mid) / (best - mid),
    # so a single expression covers both directions. An additional sign flip
    # in the higher-is-better case would invert the scale (best -> 0).
    norm = (
        pivot
        .rsub(mid,       axis=0)    # mid[t] - pivot.loc[t, m]
        .div(mid - best, axis=0)
        .clip(0, 1)
    )

    return norm.mean(axis=0)


avg_norm_acc_rmse = normalized_accuracy(pivot_rmse, error_metric=True)
# The scores are fractions; they are shown as percentages.
print("Average normalized accuracy (from RMSE):", avg_norm_acc_rmse.mul(100).round(2), sep="\n")


Average normalized accuracy (from RMSE):
model
ConstantPredictor         9.07
Engression               38.39
FTTransformer            62.46
LGBMRegressor            69.62
LinearRegressor           4.35
MLP                      46.72
RandomForestRegressor    60.42
ResNet                   37.16
TabPFNRegressor          89.75
dtype: float64


In [6]:
def avg_rank(pivot: pd.DataFrame, error_metric: bool = True) -> pd.Series:
    """Mean rank of every model across tasks.

    Models are ranked within each task (row); tied models share the average
    of the ranks they span. With ``error_metric=True`` the smallest value
    receives rank 1, otherwise the largest value does.

    Returns a Series indexed by the columns of ``pivot``.
    """
    per_task_rank = pivot.rank(axis=1, method="average", ascending=error_metric)
    return per_task_rank.mean(axis=0)

# Lower RMSE is better, so rank 1 goes to the smallest value of each task.
avg_rank_rmse = avg_rank(pivot_rmse, error_metric=True)
print("Average rank (RMSE):", avg_rank_rmse.round(2), sep="\n")



Average rank (RMSE):
model
ConstantPredictor        7.69
Engression               5.31
FTTransformer            3.91
LGBMRegressor            3.36
LinearRegressor          7.42
MLP                      5.11
RandomForestRegressor    4.08
ResNet                   5.37
TabPFNRegressor          2.03
dtype: float64


Average Relative Difference, Normalized Accuracy and Average Rank: CRPS

In [7]:
metric1 = "CRPS"
dfm1 = df.loc[df["metric"] == metric1]

# Task-by-model table of CRPS values; repeated (task, model) entries are
# collapsed to their mean.
pivot_crps = pd.pivot_table(
    dfm1,
    values="value",
    index="task_id",
    columns="model",
    aggfunc="mean",
)

# Persist the table with three decimals.
pivot_crps.to_csv(f"avg_{metric1}_per_task_per_model.csv", float_format="%.3f")


In [8]:
def avg_relative_diff(pivot: pd.DataFrame, error_metric: bool = True) -> pd.Series:
    """Average, over tasks, of each model's percentage distance to the task's best value.

    Parameters
    ----------
    pivot : pd.DataFrame
        Rows are tasks, columns are models, cells are metric values.
    error_metric : bool, default True
        If True the row minimum is the best value, otherwise the row maximum.

    Returns
    -------
    pd.Series
        One entry per column of ``pivot``: mean of ``|v - best| / best * 100``
        taken with the sign convention that worse models get positive values.

    Notes
    -----
    Because the distance is expressed relative to the best value, a best value
    of zero (or nearly zero) makes the percentage infinite (or very large).
    """
    if error_metric:
        best_per_task = pivot.min(axis=1)   # best is minimum
        # (v - best)/best * 100
        rel_diff = (pivot.sub(best_per_task, axis=0)
                         .div(best_per_task, axis=0)
                         * 100)
    else:
        best_per_task = pivot.max(axis=1)   # best is maximum
        # (best - v)/best * 100
        # rsub keeps the row-wise (task) alignment; calling .sub on the Series
        # with the DataFrame as argument aligns tasks against model columns
        # and returns only NaN.
        rel_diff = (pivot.rsub(best_per_task, axis=0)
                         .div(best_per_task, axis=0)
                         * 100)

    # average across tasks (skipping any NaNs)
    return rel_diff.mean(axis=0)


# CRPS is treated as an error metric: the lowest value per task is the reference.
avg_diff_crps = avg_relative_diff(pivot_crps, error_metric=True)
print("Average relative difference (CRPS):", avg_diff_crps.round(2), sep="\n")


Average relative difference (CRPS):
model
ConstantPredictor          556.73
DGBT                        64.48
DRF                         84.20
Engression               14765.58
FTTransformer               62.12
LGBMRegressor               65.11
LinearRegressor          72921.62
MLP                         83.01
RandomForestRegressor       75.62
ResNet                     156.40
TabPFNRegressor             25.32
dtype: float64


In [9]:
def normalized_accuracy(pivot: pd.DataFrame,
                        error_metric: bool = True) -> pd.Series:
    """Mean of the per-task scores after rescaling them to the [0, 1] range.

    Within each task the best value becomes 1 and the third-worst value
    (``mid``) becomes 0. Values beyond either end are clipped, so every model
    at or below the third-worst one scores 0 on that task.

    Parameters
    ----------
    pivot : pd.DataFrame
        Rows are tasks, columns are models, cells are metric values.
    error_metric : bool, default True
        If True smaller values are better, otherwise larger values are better.

    Returns
    -------
    pd.Series
        One entry per column of ``pivot``: the rescaled score averaged over rows.
    """
    if error_metric:
        best = pivot.min(axis=1)
        # worst three errors of the row; the smallest of them is the cut-off
        mid = pivot.apply(
            lambda s: s.dropna().nlargest(3).min(),
            axis=1
        )
    else:
        best = pivot.max(axis=1)
        # worst three scores of the row; the largest of them is the cut-off
        mid = pivot.apply(
            lambda s: s.dropna().nsmallest(3).max(),
            axis=1
        )

    # norm = (mid - v) / (mid - best), algebraically equal to
    # (v - mid) / (best - mid). Numerator and denominator change sign together
    # when the metric direction changes, so no extra negation is applied.
    numerator = pivot.rsub(mid, axis=0)    # mid[t] - pivot.loc[t, m]
    norm = numerator.div(mid - best, axis=0).clip(0, 1)

    return norm.mean(axis=0)


avg_norm_acc_crps = normalized_accuracy(pivot_crps, error_metric=True)
# The scores are fractions; they are shown as percentages.
print("Average normalized accuracy (from CRPS):", avg_norm_acc_crps.mul(100).round(2), sep="\n")


Average normalized accuracy (from CRPS):
model
ConstantPredictor         5.75
DGBT                     59.88
DRF                      56.96
Engression               53.80
FTTransformer            53.79
LGBMRegressor            56.74
LinearRegressor           6.68
MLP                      34.01
RandomForestRegressor    43.87
ResNet                   29.95
TabPFNRegressor          76.39
dtype: float64


In [10]:
def avg_rank(pivot: pd.DataFrame, error_metric: bool = True) -> pd.Series:
    """Rank the models inside every task and average those ranks per model.

    ``error_metric=True`` ranks ascending (smallest value is rank 1);
    ``error_metric=False`` ranks descending (largest value is rank 1).
    Ties receive the average of the ranks they occupy.
    """
    rank_kwargs = {"axis": 1, "method": "average", "ascending": error_metric}
    return pivot.rank(**rank_kwargs).mean(axis=0)

# Lower CRPS is better, so rank 1 goes to the smallest value of each task.
avg_rank_crps = avg_rank(pivot_crps, error_metric=True)
print("Average rank (CRPS):", avg_rank_crps.round(2), sep="\n")



Average rank (CRPS):
model
ConstantPredictor        10.00
DGBT                      4.14
DRF                       4.86
Engression                5.20
FTTransformer             5.00
LGBMRegressor             4.56
LinearRegressor           9.19
MLP                       6.89
RandomForestRegressor     5.72
ResNet                    6.83
TabPFNRegressor           2.72
dtype: float64


Average Relative Difference, Normalized Accuracy and Average Rank: LogLoss

In [11]:
metric2 = "LogLoss"
dfm2 = df.loc[df["metric"] == metric2]

# Task-by-model table of LogLoss values; repeated (task, model) entries are
# collapsed to their mean.
pivot_ll = pd.pivot_table(
    dfm2,
    values="value",
    index="task_id",
    columns="model",
    aggfunc="mean",
)

# Persist the table with three decimals.
pivot_ll.to_csv(f"avg_{metric2}_per_task_per_model.csv", float_format="%.3f")


In [12]:
def avg_relative_diff(pivot: pd.DataFrame, error_metric: bool = True) -> pd.Series:
    """Per-model mean of the relative gap (in percent) to the best value of each task.

    Parameters
    ----------
    pivot : pd.DataFrame
        Task-by-model table of metric values (tasks on the index).
    error_metric : bool, default True
        Direction of the metric: True means the smallest value of a row is the
        best one, False means the largest value is.

    Returns
    -------
    pd.Series
        Indexed by the columns of ``pivot``. The row-wise gap to the best
        value is expressed in percent of that best value and averaged over
        the rows; missing cells do not contribute to the average.

    Notes
    -----
    A best value at or near zero makes the percentage infinite or very large,
    since the gap is divided by the best value.
    """
    best_per_task = pivot.min(axis=1) if error_metric else pivot.max(axis=1)

    if error_metric:
        # (v - best)
        gap = pivot.sub(best_per_task, axis=0)
    else:
        # (best - v), computed from the DataFrame side so that the Series is
        # aligned with the task index; Series.sub(DataFrame) would align it
        # with the model columns instead and return NaN everywhere.
        gap = pivot.rsub(best_per_task, axis=0)

    # ... / best * 100
    rel_diff = gap.div(best_per_task, axis=0) * 100

    return rel_diff.mean(axis=0)


# LogLoss is treated as an error metric: the lowest value per task is the reference.
avg_diff_ll = avg_relative_diff(pivot_ll, error_metric=True)
print("Average relative difference (LogLoss):", avg_diff_ll.round(2), sep="\n")


Average relative difference (LogLoss):
model
ConstantPredictor         13722.54
Engression                  200.07
FTTransformer                17.19
LGBMClassifier              267.34
LogisticRegressor           185.24
MLP                          22.99
RandomForestClassifier      267.34
ResNet                       34.74
TabPFNClassifier              4.92
dtype: float64


In [13]:
def normalized_accuracy(pivot: pd.DataFrame,
                        error_metric: bool = True) -> pd.Series:
    """Per-model average of task scores normalised between the third-worst and best model.

    On each task the best value is mapped to 1 and the third-worst value
    (``mid``) to 0, with clipping to [0, 1] in between and beyond.

    Parameters
    ----------
    pivot : pd.DataFrame
        Task-by-model table of metric values (tasks on the index).
    error_metric : bool, default True
        True if lower values are better, False if higher values are better.

    Returns
    -------
    pd.Series
        Indexed by the columns of ``pivot``; normalised score averaged over tasks.
    """
    def third_worst(row: pd.Series) -> float:
        # Among the three worst non-missing values of the row, return the one
        # closest to the best end of the scale.
        values = row.dropna()
        if error_metric:
            return values.nlargest(3).min()
        return values.nsmallest(3).max()

    best = pivot.min(axis=1) if error_metric else pivot.max(axis=1)
    mid = pivot.apply(third_worst, axis=1)

    # norm_acc per cell = (mid - v) / (mid - best) == (v - mid) / (best - mid).
    # The ratio is direction-agnostic, so the higher-is-better case must not
    # negate the numerator (doing so would score the best model 0).
    norm = (
        pivot
        .rsub(mid,       axis=0)    # mid[t] - pivot.loc[t, m]
        .div(mid - best, axis=0)
        .clip(0, 1)
    )

    return norm.mean(axis=0)


avg_norm_acc_ll = normalized_accuracy(pivot_ll, error_metric=True)
# The scores are fractions; they are shown as percentages.
print("Average normalized accuracy (from LogLoss):", avg_norm_acc_ll.mul(100).round(2), sep="\n")


Average normalized accuracy (from LogLoss):
model
ConstantPredictor          0.00
Engression                31.36
FTTransformer             70.78
LGBMClassifier            12.07
LogisticRegressor         33.00
MLP                       62.61
RandomForestClassifier    12.07
ResNet                    60.47
TabPFNClassifier          99.31
dtype: float64


In [14]:
def avg_rank(pivot: pd.DataFrame, error_metric: bool = True) -> pd.Series:
    """Average position of each model when models are ranked task by task.

    The ranking runs along each row. ``error_metric`` is passed straight to
    ``ascending``: True gives rank 1 to the smallest value, False to the
    largest. Tied values share the average of their ranks.
    """
    ranked = pivot.rank(
        method="average",
        ascending=error_metric,
        axis=1,
    )
    mean_rank = ranked.mean(axis=0)
    return mean_rank

# Lower LogLoss is better, so rank 1 goes to the smallest value of each task.
avg_rank_ll = avg_rank(pivot_ll, error_metric=True)
print("Average rank (LogLoss):", avg_rank_ll.round(2), sep="\n")



Average rank (LogLoss):
model
ConstantPredictor         8.96
Engression                5.52
FTTransformer             3.18
LGBMClassifier            6.41
LogisticRegressor         5.48
MLP                       3.70
RandomForestClassifier    6.41
ResNet                    3.74
TabPFNClassifier          1.35
dtype: float64


Average Relative Difference, Normalized Accuracy and Average Rank: Accuracy

In [15]:
metric3 = "Accuracy"
dfm3 = df.loc[df["metric"] == metric3]

# Task-by-model table of Accuracy values; repeated (task, model) entries are
# collapsed to their mean.
pivot_acc = pd.pivot_table(
    dfm3,
    values="value",
    index="task_id",
    columns="model",
    aggfunc="mean",
)

# Persist the table with five decimals.
pivot_acc.to_csv(f"avg_{metric3}_per_task_per_model.csv", float_format="%.5f")


In [16]:
def avg_relative_diff(pivot: pd.DataFrame, error_metric: bool = True) -> pd.Series:
    """Mean percentage gap of every model to the per-task best value.

    ``pivot`` holds one row per task and one column per model. The best value
    of a row is its minimum when ``error_metric`` is True and its maximum
    otherwise. The gap of each cell to that value is expressed in percent of
    the best value and averaged over the rows, giving one number per model.
    """
    reference = pivot.min(axis=1) if error_metric else pivot.max(axis=1)

    # v - best: non-negative for error metrics, non-positive otherwise
    shortfall = pivot.sub(reference, axis=0)
    if not error_metric:
        # turn it into best - v so that worse models get positive gaps
        shortfall = shortfall.mul(-1)

    percent_gap = shortfall.div(reference, axis=0) * 100
    return percent_gap.mean(axis=0)


# Accuracy is a score (higher is better): the highest value per task is the reference.
avg_diff_accuracy = avg_relative_diff(pivot_acc, error_metric=False)
print("\nAverage relative difference (Accuracy):", avg_diff_accuracy.round(2), sep="\n")



Average relative difference (Accuracy):
model
ConstantPredictor         45.96
Engression                17.99
FTTransformer              3.30
LGBMClassifier             1.17
LogisticRegressor          9.17
MLP                        4.32
RandomForestClassifier     2.06
ResNet                     4.11
TabPFNClassifier           0.24
dtype: float64


In [17]:
def normalized_accuracy(pivot: pd.DataFrame,
                        error_metric: bool = True) -> pd.Series:
    """Average per-task score rescaled so that the best model gets 1.

    For each task (row) the best value maps to 1 and the third-worst value
    (``mid``) maps to 0; results are clipped to [0, 1]. ``error_metric=True``
    means lower values are better, ``False`` means higher values are better.
    Returns one averaged score per column of ``pivot``.
    """
    if error_metric:
        best = pivot.min(axis=1)

        def pick_mid(row):
            # smallest of the three largest errors
            return row.dropna().nlargest(3).min()
    else:
        best = pivot.max(axis=1)

        def pick_mid(row):
            # largest of the three smallest scores
            return row.dropna().nsmallest(3).max()

    mid = pivot.apply(pick_mid, axis=1)

    # (mid - v) / (mid - best); equal to (v - mid) / (best - mid), which is
    # why the same expression is used for both metric directions.
    scaled = pivot.rsub(mid, axis=0).div(mid - best, axis=0)

    return scaled.clip(0, 1).mean(axis=0)


avg_norm_acc_acc = normalized_accuracy(pivot_acc, error_metric=False)
# The scores are fractions; they are shown as percentages.
print("Average normalized accuracy (from Accuracy):", avg_norm_acc_acc.mul(100).round(3), sep="\n")


Average normalized accuracy (from Accuracy):
model
ConstantPredictor          0.000
Engression                 2.183
FTTransformer             57.816
LGBMClassifier            85.567
LogisticRegressor          9.175
MLP                       45.448
RandomForestClassifier    75.468
ResNet                    50.169
TabPFNClassifier          94.547
dtype: float64


In [18]:
def avg_rank(pivot: pd.DataFrame, error_metric: bool = True) -> pd.Series:
    """Mean within-task rank per model.

    Each row of ``pivot`` is ranked on its own, ties sharing their average
    rank. The smallest value is rank 1 when ``error_metric`` is True; the
    largest value is rank 1 when it is False. The ranks are then averaged
    over rows, one result per column.
    """
    smallest_first = error_metric
    row_ranks = pivot.rank(axis=1, method="average", ascending=smallest_first)
    return row_ranks.mean(axis=0)

# Higher accuracy is better, so rank 1 goes to the largest value of each task.
avg_rank_acc = avg_rank(pivot_acc, error_metric=False)
print("Average rank (Accuracy):", avg_rank_acc.round(2), sep="\n")



Average rank (Accuracy):
model
ConstantPredictor         8.96
Engression                7.70
FTTransformer             4.23
LGBMClassifier            2.61
LogisticRegressor         6.65
MLP                       4.87
RandomForestClassifier    3.26
ResNet                    4.74
TabPFNClassifier          1.78
dtype: float64
