In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import os
import numpy as np
import pandas as pd
import time
import sys

In [3]:
import matplotlib.pyplot as plt

In [4]:
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 1000

In [5]:
## our hyperparameters optimization for inherently interpretable models
intelligible_bench = pd.read_csv("benchmark_inherently_interpretable.csv",low_memory=False)

## results of hyperparameters optimization for full-complexity models from Grinsztajn et al. (2022)
all_bench = pd.read_csv("benchmark_total.csv",low_memory=False)

### Average number of iteration (for TabSRALinear)

In [6]:
run_df_tabsra = intelligible_bench[(intelligible_bench.model_name=='TabSRA')&(intelligible_bench.model__module__n_head<3) & (intelligible_bench.model__module__n_hidden_encoder==1)]
res = run_df_tabsra.groupby(['benchmark','data__keyword'],as_index=False).agg({'mean_test_score':'count'})
res.mean_test_score.mean()

96.96610169491525

### Get hyperparameter tuning samples from Grinsztajn et al. (2022) for full-complexity models

In [7]:
## This function id used to sample hyperparameter optimization samples form from Grinsztajn et al. (2022)
## The sampling factor is 3 for XGBoost meaning that the number of iterations for XGBoost is 3x the one of TabSRALinear
## in addition we also add the default hyperparameter configuration from Grinsztajn et al. (2022). 
## Therefore we have 3x+1 iteration for XGBoost. 
def sample_from_benchmark(reference_data, bencharmark_data, models=['XGBoost'], sampling_factor = [3],  random_state= 42, metric ='mean_r2_test'):

    tasks = list(reference_data.benchmark.value_counts().index)
    result = []
    #Choice  of the model
    for target_model_id, target_model in enumerate(models):
        bencharmark_data_model = bencharmark_data[(bencharmark_data.model_name==target_model) & (~ bencharmark_data[metric].isnull())]
        target_result = [bencharmark_data_model[bencharmark_data_model.hp=='default']]

        ## choice of the task 
        for id_task, task in enumerate(tasks):
            #target_result = [bencharmark_data_model[(bencharmark_data_model.hp=='default') & ]]
            reference_data_task = reference_data[reference_data.benchmark==task]
            tmp_group = reference_data_task.groupby(['data__keyword']).agg({metric:'count'})
            data_names = tmp_group.index.to_list()
            N_sample_reference = tmp_group.values.flatten().tolist()

            ### choice of the data 
            for data_id, data_name in enumerate(data_names):
                condition = (bencharmark_data_model.hp!='default') & (bencharmark_data_model.data__keyword==data_name)& (bencharmark_data_model.benchmark==task)
                target_result.append(bencharmark_data_model[condition].sample(n= N_sample_reference[data_id]*sampling_factor[target_model_id], random_state = random_state))
        result.append(pd.concat(target_result))
    return pd.concat(result)
        

In [8]:
models = ['XGBoost', 'FT Transformer', 'MLP', 'SAINT', 'Resnet', 'RandomForest']
sampling_factor = [3, 1, 2 ,1 , 1, 2]
all_bench_x = sample_from_benchmark(reference_data=run_df_tabsra, bencharmark_data=all_bench,models=models,sampling_factor = sampling_factor,metric ='mean_test_score')

In [9]:
bench_with_tabr = pd.concat([intelligible_bench,all_bench_x])
bench_with_tabr.shape

(111428, 219)

###  Results

In [10]:
### This function helps to create boostrap like estimate of the test score
### the sampling ratio is 50% and we reapet it for 10 different random seeds
def bootstrap_result(result_data,sampling_ratio=0.5, group_critera = 'model_name',val_metric = 'mean_r2_val', metric ='mean_r2_test', normalize = 'max', clip_0 = True, random_states = [42,43]):
    tasks = list(result_data.benchmark.value_counts().index)
    all_result_all_task = []
    for id_task, task in enumerate(tasks):
        result_data_task = result_data[result_data.benchmark==task]
        tmp_group = result_data_task.groupby(['data__keyword']).agg({metric:'count'})
        data_names = tmp_group.index.to_list()
        all_result = []
        for ind_seed, seed in  enumerate(random_states):   
            result = []
            for data_id, data_name in enumerate(data_names):
                result_target_data_default = result_data_task[(result_data_task.hp=='default') & (result_data_task.data__keyword==data_name)]
                result_target_data_random = result_data_task[(result_data_task.hp!='default') & (result_data_task.data__keyword==data_name)]
                tmp = result_target_data_random.groupby(group_critera).agg({metric:'count'})
                dict_sample_model_per_data = dict(zip(tmp.index.to_list(),(sampling_ratio*tmp.values.flatten()).astype(int).tolist()))
                tmp_sampled = result_target_data_random.groupby(group_critera).apply(lambda group: group.sample(dict_sample_model_per_data[group.name], random_state = seed)).reset_index(drop = True)
                result_target_data_default = result_target_data_default.append(tmp_sampled)
                # sample the best validation performance for each model 
                idx_best_val = result_target_data_default.groupby([group_critera])[val_metric].transform(max) == result_target_data_default[val_metric]
                result_target_data_default = result_target_data_default[idx_best_val].drop_duplicates(subset=group_critera, keep="last")
                result_target_data_default["rank"] = result_target_data_default[metric].rank(ascending=False)
                if clip_0:
                    result_target_data_default[metric] = result_target_data_default[metric].clip(0)
                if normalize=='max':
                    result_target_data_default[metric+'_normalized'] = result_target_data_default[metric]/result_target_data_default[metric].max() 
                if normalize=='minmax':
                    result_target_data_default[metric+'_normalized'] = (result_target_data_default[metric] - result_target_data_default[metric].min())/(result_target_data_default[metric].max() - result_target_data_default[metric].min())
                result_target_data_default["bootstrap_seed"] = seed
                
                result.append(result_target_data_default)
            all_result.append(pd.concat(result))
        all_result_all_task.append(pd.concat(all_result))
    return pd.concat(all_result_all_task)

### Comparison of models

In [11]:
## For default TabSRALinear is ( N_hidden_encoder =1) and N_head in {1,2}
## we optimize this for 9 days/216 hours
## TabSRA refers to the TabSRALinear model 
cond_1 = (bench_with_tabr.model_name=='TabSRA')&(bench_with_tabr.model__module__n_hidden_encoder==1)&(bench_with_tabr.model__module__n_head<3)
cond = cond_1|(bench_with_tabr.model_name!='TabSRA')
inter_model = bench_with_tabr[cond]

boost_x_normalize_max = bootstrap_result(result_data = inter_model, random_states = [42,43,44,45,46,47,48,49,50, 51],normalize='max', val_metric = 'mean_val_score', metric ='mean_test_score')
boost_x_normalize_max_all = bootstrap_result(result_data = inter_model, sampling_ratio=1.0,random_states = [42],normalize='max', val_metric = 'mean_val_score', metric ='mean_test_score')

In [12]:
boost_x_normalize_max.shape, boost_x_normalize_max_all.shape

((6490, 222), (649, 222))

#### Bootstrap like results (for 10 different random seeds)

In [13]:
res = boost_x_normalize_max.groupby(["model_name"],as_index=False).agg({'rank':["min","max","mean","median"],'mean_test_score'+'_normalized':["mean","median", "std"],'mean_time':["mean", "median"]})
res.iloc[[0,1,2,4,9,5,7,8,3,6,10]].style.background_gradient(cmap='coolwarm')

Unnamed: 0_level_0,model_name,rank,rank,rank,rank,mean_test_score_normalized,mean_test_score_normalized,mean_test_score_normalized,mean_time,mean_time
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean,median,mean,median,std,mean,median
0,DecisionTree,2.0,11.0,9.479661,10.0,0.869981,0.916109,0.163403,0.29369,0.03244
1,EBM,1.0,9.0,4.577966,4.0,0.961211,0.98532,0.066698,97.837115,19.737442
2,EBM_S,1.0,10.0,6.750847,7.0,0.93295,0.958367,0.086356,23.99701,5.1444
4,Linear,6.0,11.0,10.702542,11.0,0.761312,0.839935,0.231866,21.124014,19.715953
9,TabSRA,1.0,11.0,7.288983,8.0,0.903323,0.97281,0.19684,47.575668,38.073438
5,MLP,1.0,11.0,6.118644,7.0,0.925977,0.978015,0.158691,24.165454,19.255583
7,Resnet,1.0,11.0,6.250847,7.0,0.911317,0.976046,0.195338,95.122911,53.211523
8,SAINT,1.0,11.0,4.776271,5.0,0.948525,0.984996,0.092687,216.052733,126.841212
3,FT Transformer,1.0,10.0,4.382203,4.0,0.94588,0.988338,0.109301,126.589095,77.465159
6,RandomForest,1.0,9.0,3.500847,3.0,0.987247,0.994001,0.019212,39.029824,8.251971


#### Results for all iterations

In [14]:
res = boost_x_normalize_max_all.groupby(["model_name"],as_index=False).agg({'rank':["min","max","mean","median"],'mean_test_score'+'_normalized':["mean","median", "std"],'mean_time':["mean", "median"]})
res.iloc[[0,1,2,4,9,5,7,8,3,6,10]].style.background_gradient(cmap='coolwarm')

Unnamed: 0_level_0,model_name,rank,rank,rank,rank,mean_test_score_normalized,mean_test_score_normalized,mean_test_score_normalized,mean_time,mean_time
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean,median,mean,median,std,mean,median
0,DecisionTree,3.0,11.0,9.525424,10.0,0.870741,0.916359,0.169162,0.281834,0.031518
1,EBM,1.0,9.0,4.627119,4.0,0.961282,0.984411,0.066816,70.59688,18.138514
2,EBM_S,1.0,10.0,6.754237,7.0,0.933045,0.955212,0.086136,21.101231,5.268758
4,Linear,7.0,11.0,10.728814,11.0,0.764432,0.840108,0.230635,21.837197,20.461714
9,TabSRA,1.0,11.0,7.09322,7.0,0.91104,0.975628,0.181012,48.167476,38.073438
5,MLP,1.0,11.0,6.135593,7.0,0.929507,0.97799,0.140413,23.840109,16.969646
7,Resnet,2.0,11.0,6.237288,7.0,0.912925,0.975918,0.185455,118.734121,70.919175
8,SAINT,1.0,10.0,4.838983,5.0,0.955264,0.984211,0.074576,253.252389,131.435309
3,FT Transformer,1.0,9.0,4.330508,4.0,0.950594,0.988798,0.096353,164.44565,82.789054
6,RandomForest,1.0,9.0,3.59322,3.0,0.987696,0.994122,0.018216,36.496975,7.918278


#### Result on all iterations
- x $\approx$ 95 per dataset for TabSRALinear
- we consider 3x for XGBoost, 2x for RF, MLP and 1x for FT-Transformer, SAINT, ResNet
- For all inherenlty interpretable models, Decision Tree (DT) , Linear/Logistic Regression (LR), Explainable Boosting Machine (EBM)
and EBM without interactions terms (EBM_S), we use the same budget of optimization: 9days/216hours on 64-core processor CPU. 

## Ablation study (for TabSRALinear)

In [15]:
def sample_data(base_data, group_critera = 'model__module__dim_head', metric='mean_test_score',random_state=42):

    tasks = list(base_data.benchmark.value_counts().index)
    result = []
    ## choice of the task 
    for id_task, task in enumerate(tasks):
        target_result = []
        reference_data_task = base_data[base_data.benchmark==task]
        tmp_group = reference_data_task.groupby(['data__keyword']).agg({metric:'count'})
        data_names = tmp_group.index.to_list()
        #N_sample_reference = tmp_group.values.flatten().tolist()

        ### choice of the data 
        for data_id, data_name in enumerate(data_names):
            condition = reference_data_task.data__keyword==data_name
            N_sample = reference_data_task[condition].groupby(group_critera).agg({metric:'count'})[metric].min()
            target_result.append(reference_data_task[condition].groupby(group_critera).sample(n= N_sample, random_state = random_state))
        result.append(pd.concat(target_result))
    return pd.concat(result)
        

### Study influence of dimension $d_k$ 
we study the influence of $d_k$  when the number of ensembles/heads is one 

In [16]:
inter_dim_attention = inter_model[inter_model.model__module__n_head==1]
inter_dim_attention =  sample_data(inter_dim_attention)

In [17]:
tmp = inter_dim_attention.groupby(['data__keyword','model__module__dim_head'],as_index=False).agg({'mean_val_score':'count'}).rename(columns={'mean_val_score':'Nb'})
tmp.Nb.mean(),tmp.Nb.median(),tmp.Nb.max()

(17.666666666666668, 16.0, 36)

**For each for each configuration of dataset, n head==1 and $d_k \in \{4,8,12\}$, we run $\approx 17$ hyperparameters optimizations iterations**

In [18]:
boost_x_normalize_max_all = bootstrap_result(result_data = inter_dim_attention,group_critera = 'model__module__dim_head', sampling_ratio=1.0,random_states = [42],normalize='max', val_metric = 'mean_val_score', metric ='mean_test_score')
boost_x_normalize_max_all_train = bootstrap_result(result_data = inter_dim_attention,group_critera = 'model__module__dim_head', sampling_ratio=1.0,random_states = [42],normalize='max', val_metric = 'mean_train_score', metric ='mean_train_score')

#### test score (corresponding to the best validation validation configuration)

In [19]:
res = boost_x_normalize_max_all.groupby(["model__module__dim_head"],as_index=False).agg({'rank':["min","max","mean","median"],'mean_test_score'+'_normalized':["mean","median", "std"],'mean_time':["mean", "median"]})
res.style.background_gradient(cmap='coolwarm')

Unnamed: 0_level_0,model__module__dim_head,rank,rank,rank,rank,mean_test_score_normalized,mean_test_score_normalized,mean_test_score_normalized,mean_time,mean_time
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean,median,mean,median,std,mean,median
0,4.0,1.0,3.0,2.20339,2.0,0.933491,0.994062,0.182908,35.945906,29.875256
1,8.0,1.0,3.0,1.991525,2.0,0.961429,0.997392,0.142988,39.42855,37.528634
2,12.0,1.0,3.0,1.805085,2.0,0.957837,0.999291,0.167891,40.719496,34.766769


#### best training configuration

In [20]:
boost_x_normalize_max_all_train.groupby(["model__module__dim_head"],as_index=False).agg({'rank':["min","max","mean","median"],'mean_train_score'+'_normalized':["mean","median", "std"],'mean_time':["mean", "median", "count"]}) \
.style.background_gradient(cmap='coolwarm')

Unnamed: 0_level_0,model__module__dim_head,rank,rank,rank,rank,mean_train_score_normalized,mean_train_score_normalized,mean_train_score_normalized,mean_time,mean_time,mean_time
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean,median,mean,median,std,mean,median,count
0,4.0,1.0,3.0,2.542373,3.0,0.909784,0.984292,0.172646,34.735301,28.964525,59
1,8.0,1.0,3.0,1.915254,2.0,0.953021,0.994483,0.108158,33.176924,28.571159,59
2,12.0,1.0,3.0,1.542373,1.0,0.960809,1.0,0.174095,38.541131,30.238892,59


### Study influence of H: number of  ensembles/heads 
we study the influence of $H$  when the number of the encoder's hidden layer is set one 

In [21]:
cond_2 = (bench_with_tabr.model_name=='TabSRA')&(bench_with_tabr.model__module__n_hidden_encoder==1)
n_head_attention = bench_with_tabr[cond_2]
n_head_attention =  sample_data(n_head_attention,group_critera='model__module__n_head')

In [22]:
tmp = n_head_attention.groupby(['data__keyword','model__module__n_head'],as_index=False).agg({'mean_val_score':'count'}).rename(columns={'mean_val_score':'Nb'})
tmp.Nb.min(),tmp.Nb.mean(),tmp.Nb.median(),tmp.Nb.max()

(10, 29.866666666666667, 26.0, 65)

**For each for each configuration of dataset, n hidden_encoder==1, $H  \in \{1,2,3,4,5,6\}$, we run $\approx 30$ hyperparameters optimizations iterations**

In [23]:
boost_x_normalize_max_all = bootstrap_result(result_data = n_head_attention,group_critera = 'model__module__n_head', sampling_ratio=1.0,random_states = [42],normalize='max', val_metric = 'mean_val_score', metric ='mean_test_score')
boost_x_normalize_max_all_train = bootstrap_result(result_data = n_head_attention,group_critera = 'model__module__n_head', sampling_ratio=1.0,random_states = [42],normalize='max', val_metric = 'mean_train_score', metric ='mean_train_score')

#### test score (corresponding to the best validation validation configuration)

In [24]:
res = boost_x_normalize_max_all.groupby(["model__module__n_head"],as_index=False).agg({'rank':["min","max","mean","median"],'mean_test_score'+'_normalized':["mean","median", "std"],'mean_time':["mean", "median"]})
res.style.background_gradient(cmap='coolwarm')

Unnamed: 0_level_0,model__module__n_head,rank,rank,rank,rank,mean_test_score_normalized,mean_test_score_normalized,mean_test_score_normalized,mean_time,mean_time
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean,median,mean,median,std,mean,median
0,1.0,1.0,6.0,4.169492,5.0,0.953517,0.993783,0.141901,40.758109,36.329584
1,2.0,1.0,6.0,3.542373,4.0,0.961215,0.996075,0.154157,48.419314,38.817081
2,3.0,1.0,6.0,3.457627,3.0,0.95347,0.995228,0.16214,63.702822,57.670847
3,4.0,1.0,6.0,3.127119,3.0,0.962566,0.996454,0.15968,61.878182,53.617516
4,5.0,1.0,6.0,3.20339,3.0,0.976632,0.996632,0.103416,91.68299,62.888771
5,6.0,1.0,6.0,3.5,4.0,0.968803,0.996276,0.116104,99.672029,73.969212


#### best training configuration

In [26]:
boost_x_normalize_max_all_train.groupby(["model__module__n_head"],as_index=False).agg({'rank':["min","max","mean","median"],'mean_train_score'+'_normalized':["mean","median", "std"],'mean_time':["mean", "median", "count"]}) \
.style.background_gradient(cmap='coolwarm')

Unnamed: 0_level_0,model__module__n_head,rank,rank,rank,rank,mean_train_score_normalized,mean_train_score_normalized,mean_train_score_normalized,mean_time,mean_time,mean_time
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean,median,mean,median,std,mean,median,count
0,1.0,1.0,6.0,4.491525,5.0,0.951747,0.983669,0.077268,34.376824,27.852051,59
1,2.0,1.0,6.0,3.466102,3.5,0.953981,0.986448,0.107931,51.992523,37.784909,59
2,3.0,1.0,6.0,3.525424,4.0,0.936132,0.992044,0.146533,58.965438,32.882006,59
3,4.0,1.0,6.0,3.330508,3.0,0.937038,0.991321,0.155277,56.913128,46.199217,59
4,5.0,1.0,6.0,3.033898,3.0,0.941421,0.993779,0.157068,80.318409,61.332792,59
5,6.0,1.0,6.0,3.152542,3.0,0.958266,0.992529,0.114083,88.165148,50.360974,59


### Study influence of n_hidden_encoder: the number of hidden layer in the Query/Key encoder 
we study the influence of n_hidden_encoder for n_head $\in \{1,2\}$ and $d_k \in \{4,8,12\}$

In [27]:
cond_3 = (bench_with_tabr.model_name=='TabSRA')&(bench_with_tabr.model__module__n_hidden_encoder==1)&(bench_with_tabr.model__module__n_head<3)
cond = cond_3|(bench_with_tabr.model__module__n_hidden_encoder!=1)&(bench_with_tabr.model__module__n_head<3)
n_hidden_encoder_attention = bench_with_tabr[cond]
n_hidden_encoder_attention['model__module__n_hidden_encoder'] = n_hidden_encoder_attention['model__module__n_hidden_encoder'].fillna(2)

In [28]:
tmp = n_hidden_encoder_attention.groupby(['data__keyword','model__module__n_hidden_encoder'],as_index=False).agg({'mean_val_score':'count'}).rename(columns={'mean_val_score':'Nb'})
tmp.Nb.mean(),tmp.Nb.median(),tmp.Nb.max()

(115.57777777777778, 97.0, 242)

**For each for each configuration of dataset, n head $\in \{1,2\}$ and $d_k \in \{4,8,12\}$, we run $\approx 115$ hyperparameters optimizations iterations** in order to study the influence of the n_hidden_encoder 

In [29]:
n_hidden_encoder_attention =  sample_data(n_hidden_encoder_attention,group_critera='model__module__n_hidden_encoder')

In [30]:
tmp = n_head_attention.groupby(['data__keyword','model__module__n_hidden_encoder'],as_index=False).agg({'mean_val_score':'count'}).rename(columns={'mean_val_score':'Nb'})
tmp.Nb.min(),tmp.Nb.mean(),tmp.Nb.median(),tmp.Nb.max()

(60, 179.2, 156.0, 390)

In [31]:
boost_x_normalize_max_all = bootstrap_result(result_data = n_hidden_encoder_attention,group_critera = 'model__module__n_hidden_encoder', sampling_ratio=1.0,random_states = [42],normalize='max', val_metric = 'mean_val_score', metric ='mean_test_score')
boost_x_normalize_max_all_train = bootstrap_result(result_data = n_hidden_encoder_attention,group_critera = 'model__module__n_hidden_encoder', sampling_ratio=1.0,random_states = [42],normalize='max', val_metric = 'mean_train_score', metric ='mean_train_score')

In [32]:
boost_x_normalize_max_all.groupby(["model__module__n_hidden_encoder"],as_index=False).agg({'rank':["min","max","mean","median"],'mean_test_score'+'_normalized':["mean","median", "std"],'mean_time':["mean", "median", "count"]}) \
.style.background_gradient(cmap='coolwarm')

Unnamed: 0_level_0,model__module__n_hidden_encoder,rank,rank,rank,rank,mean_test_score_normalized,mean_test_score_normalized,mean_test_score_normalized,mean_time,mean_time,mean_time
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean,median,mean,median,std,mean,median,count
0,1.0,1.0,2.0,1.474576,1.0,0.993707,1.0,0.018899,49.427056,39.931599,59
1,2.0,1.0,2.0,1.525424,2.0,0.988527,0.999424,0.042065,54.597869,45.27572,59


In [33]:
boost_x_normalize_max_all_train.groupby(["model__module__n_hidden_encoder"],as_index=False).agg({'rank':["min","max","mean","median"],'mean_train_score'+'_normalized':["mean","median", "std"],'mean_time':["mean", "median", "count"]}) \
.style.background_gradient(cmap='coolwarm')

Unnamed: 0_level_0,model__module__n_hidden_encoder,rank,rank,rank,rank,mean_train_score_normalized,mean_train_score_normalized,mean_train_score_normalized,mean_time,mean_time,mean_time
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean,median,mean,median,std,mean,median,count
0,1.0,1.0,2.0,1.389831,1.0,0.984088,1.0,0.052872,59.673125,34.181583,59
1,2.0,1.0,2.0,1.610169,2.0,0.979664,0.99738,0.048702,51.173171,44.120896,59


In [34]:
res = boost_x_normalize_max_all.groupby(["model__module__n_hidden_encoder","rank"]).agg({'data__keyword':["count"]})
res.style.background_gradient(cmap='coolwarm')

Unnamed: 0_level_0,Unnamed: 1_level_0,data__keyword
Unnamed: 0_level_1,Unnamed: 1_level_1,count
model__module__n_hidden_encoder,rank,Unnamed: 2_level_2
1.0,1.0,31
1.0,2.0,28
2.0,1.0,28
2.0,2.0,31


In [38]:
#using all iterations instead of the bootstrap (with 10 different random seeds)
#boost_x_normalize_max_all = bootstrap_result(result_data = inter_model, sampling_ratio=1.0,random_states = [42], val_metric = 'mean_val_score', metric ='mean_test_score')

In [49]:
boost_x_normalize_max = bootstrap_result(result_data = inter_model, random_states = [42,43,44,45,46,47,48,49,50, 51],normalize='max', val_metric = 'mean_val_score', metric ='mean_test_score')


In [50]:
boost_x_normalize_max.shape

(6490, 222)

In [51]:
res = boost_x_normalize_max.groupby(['benchmark','data__keyword','model_name'],as_index=False).agg({'mean_test_score':["mean","std"],'mean_time':"mean"}).round(3)
#res = res.fillna(0)

In [52]:
res

Unnamed: 0_level_0,benchmark,data__keyword,model_name,mean_test_score,mean_test_score,mean_time
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,mean,std,mean
0,categorical_classification_medium,albert,DecisionTree,0.637,0.003,0.117
1,categorical_classification_medium,albert,EBM,0.658,0.001,187.308
2,categorical_classification_medium,albert,EBM_S,0.652,0.0,14.31
3,categorical_classification_medium,albert,FT Transformer,0.653,0.001,70.966
4,categorical_classification_medium,albert,Linear,0.635,0.0,6.272
5,categorical_classification_medium,albert,MLP,0.652,0.001,12.17
6,categorical_classification_medium,albert,RandomForest,0.654,0.001,7.306
7,categorical_classification_medium,albert,Resnet,0.65,0.001,22.604
8,categorical_classification_medium,albert,SAINT,0.652,0.001,70.149
9,categorical_classification_medium,albert,TabSRA,0.643,0.003,44.022


### Example of Difficult tasks for TabSRALinear (and MLP like architecture)

In [53]:
features = ['model_name','mean_test_score','std_test_score','mean_time']

In [57]:
res[(res.benchmark=='numerical_regression_medium')&(res.data__keyword=='delays_zurich_transport')].iloc[[0,1,2,4,9,5,7,8,3,6,10]]

Unnamed: 0_level_0,benchmark,data__keyword,model_name,mean_test_score,mean_test_score,mean_time
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,mean,std,mean
506,numerical_regression_medium,delays_zurich_transport,DecisionTree,0.018,0.0,0.017
507,numerical_regression_medium,delays_zurich_transport,EBM,0.027,0.0,17.792
508,numerical_regression_medium,delays_zurich_transport,EBM_S,0.026,0.0,4.725
510,numerical_regression_medium,delays_zurich_transport,Linear,0.005,0.0,9.96
515,numerical_regression_medium,delays_zurich_transport,TabSRA,0.012,0.001,47.965
511,numerical_regression_medium,delays_zurich_transport,MLP,0.013,0.001,22.449
513,numerical_regression_medium,delays_zurich_transport,Resnet,0.011,0.001,50.765
514,numerical_regression_medium,delays_zurich_transport,SAINT,0.021,0.001,58.392
509,numerical_regression_medium,delays_zurich_transport,FT Transformer,0.019,0.001,51.534
512,numerical_regression_medium,delays_zurich_transport,RandomForest,0.031,0.002,5.921


In [47]:
res[(res.benchmark=='numerical_regression_medium')&(res.data__keyword=='yprop_4_1')].iloc[[0,1,2,4,9,5,7,8,3,6,10]]

Unnamed: 0_level_0,benchmark,data__keyword,model_name,mean_test_score,mean_test_score,mean_time
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,mean,std,mean
638,numerical_regression_medium,yprop_4_1,DecisionTree,0.026,0.007,0.006
639,numerical_regression_medium,yprop_4_1,EBM,0.056,0.002,4.138
640,numerical_regression_medium,yprop_4_1,EBM_S,0.048,0.001,1.686
642,numerical_regression_medium,yprop_4_1,Linear,0.043,0.003,19.606
647,numerical_regression_medium,yprop_4_1,TabSRA,0.023,0.02,26.594
643,numerical_regression_medium,yprop_4_1,MLP,0.014,0.01,9.927
645,numerical_regression_medium,yprop_4_1,Resnet,0.013,0.017,28.791
646,numerical_regression_medium,yprop_4_1,SAINT,0.057,0.005,165.168
641,numerical_regression_medium,yprop_4_1,FT Transformer,0.045,0.005,162.342
644,numerical_regression_medium,yprop_4_1,RandomForest,0.092,0.002,5.152
