In [1]:
import os
import numpy as np
import pandas as pd
import time
import sys

In [2]:
import matplotlib.pyplot as plt

In [3]:
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 1000

In [4]:
## our hyperparameters optimization for inherently interpretable models
intelligible_bench = pd.read_csv("results/Model_Intelligible_Update.csv",low_memory=False)

## results of hyperparameters optimization for full-complexity models from Grinsztajn et al. (2022)
all_bench = pd.read_csv("results/benchmark_total.csv",low_memory=False)

### Average number of iteration (for TabSRALinear)

In [5]:
run_df_tabsra = intelligible_bench[(intelligible_bench.model_name=='TabSRA')&(intelligible_bench.model__module__n_head<3) & (intelligible_bench.model__module__n_hidden_encoder==1)]
res = run_df_tabsra.groupby(['benchmark','data__keyword'],as_index=False).agg({'mean_test_score':'count'})
res.mean_test_score.mean()

96.96610169491525

### Get hyperparameter tuning samples from Grinsztajn et al. (2022) for full-complexity models

In [6]:
## This function id used to sample hyperparameter optimization samples form from Grinsztajn et al. (2022)
## The sampling factor is 3 for XGBoost meaning that the number of iterations for XGBoost is 3x the one of TabSRALinear
## in addition we also add the default hyperparameter configuration from Grinsztajn et al. (2022). 
## Therefore we have 3x+1 iteration for XGBoost. 
def sample_from_benchmark(reference_data, bencharmark_data, models=['XGBoost'], sampling_factor = [3],  random_state= 42, metric ='mean_r2_test'):

    tasks = list(reference_data.benchmark.value_counts().index)
    result = []
    #Choice  of the model
    for target_model_id, target_model in enumerate(models):
        bencharmark_data_model = bencharmark_data[(bencharmark_data.model_name==target_model) & (~ bencharmark_data[metric].isnull())]
        target_result = [bencharmark_data_model[bencharmark_data_model.hp=='default']]

        ## choice of the task 
        for id_task, task in enumerate(tasks):
            #target_result = [bencharmark_data_model[(bencharmark_data_model.hp=='default') & ]]
            reference_data_task = reference_data[reference_data.benchmark==task]
            tmp_group = reference_data_task.groupby(['data__keyword']).agg({metric:'count'})
            data_names = tmp_group.index.to_list()
            N_sample_reference = tmp_group.values.flatten().tolist()

            ### choice of the data 
            for data_id, data_name in enumerate(data_names):
                condition = (bencharmark_data_model.hp!='default') & (bencharmark_data_model.data__keyword==data_name)& (bencharmark_data_model.benchmark==task)
                target_result.append(bencharmark_data_model[condition].sample(n= N_sample_reference[data_id]*sampling_factor[target_model_id], random_state = random_state))
        result.append(pd.concat(target_result))
    return pd.concat(result)
        

In [7]:
models = ['XGBoost', 'FT Transformer', 'MLP', 'SAINT', 'Resnet', 'RandomForest']
sampling_factor = [3, 1, 2 ,1 , 1, 2]
all_bench_x = sample_from_benchmark(reference_data=run_df_tabsra, bencharmark_data=all_bench,models=models,sampling_factor = sampling_factor,metric ='mean_test_score')

In [8]:
bench_with_tabr = pd.concat([intelligible_bench,all_bench_x])
bench_with_tabr.shape

(131317, 220)

###  Results

In [9]:
### This function helps to create boostrap like estimate of the test score
### the sampling ratio is 50% and we reapet it for 10 different random seeds
def bootstrap_result(result_data,sampling_ratio=0.5, group_critera = 'model_name',val_metric = 'mean_r2_val', metric ='mean_r2_test', normalize = 'max', clip_0 = True, random_states = [42,43]):
    tasks = list(result_data.benchmark.value_counts().index)
    all_result_all_task = []
    for id_task, task in enumerate(tasks):
        result_data_task = result_data[result_data.benchmark==task]
        tmp_group = result_data_task.groupby(['data__keyword']).agg({metric:'count'})
        data_names = tmp_group.index.to_list()
        all_result = []
        for ind_seed, seed in  enumerate(random_states):   
            result = []
            for data_id, data_name in enumerate(data_names):
                result_target_data_default = result_data_task[(result_data_task.hp=='default') & (result_data_task.data__keyword==data_name)]
                result_target_data_random = result_data_task[(result_data_task.hp!='default') & (result_data_task.data__keyword==data_name)]
                tmp = result_target_data_random.groupby(group_critera).agg({metric:'count'})
                dict_sample_model_per_data = dict(zip(tmp.index.to_list(),(sampling_ratio*tmp.values.flatten()).astype(int).tolist()))
                tmp_sampled = result_target_data_random.groupby(group_critera).apply(lambda group: group.sample(dict_sample_model_per_data[group.name], random_state = seed)).reset_index(drop = True)
                result_target_data_default = result_target_data_default.append(tmp_sampled)
                # sample the best validation performance for each model 
                idx_best_val = result_target_data_default.groupby([group_critera])[val_metric].transform(max) == result_target_data_default[val_metric]
                result_target_data_default = result_target_data_default[idx_best_val].drop_duplicates(subset=group_critera, keep="last")
                result_target_data_default["rank"] = result_target_data_default[metric].rank(ascending=False)
                if clip_0:
                    result_target_data_default[metric] = result_target_data_default[metric].clip(0)
                if normalize=='max':
                    result_target_data_default[metric+'_normalized'] = result_target_data_default[metric]/result_target_data_default[metric].max() 
                if normalize=='minmax':
                    result_target_data_default[metric+'_normalized'] = (result_target_data_default[metric] - result_target_data_default[metric].min())/(result_target_data_default[metric].max() - result_target_data_default[metric].min())
                result_target_data_default["bootstrap_seed"] = seed
                
                result.append(result_target_data_default)
            all_result.append(pd.concat(result))
        all_result_all_task.append(pd.concat(all_result))
    return pd.concat(all_result_all_task)

### Comparison of models

In [10]:
## For default TabSRALinear is ( N_hidden_encoder =1) and N_head in {1,2}
## we optimize this for 9 days/216 hours
cond_1 = (bench_with_tabr.model_name=='TabSRA')&(bench_with_tabr.model__module__n_hidden_encoder==1)&(bench_with_tabr.model__module__n_head<3)
cond = cond_1|(bench_with_tabr.model_name!='TabSRA')
inter_model = bench_with_tabr[cond]

boost_x_normalize_max = bootstrap_result(result_data = inter_model, random_states = [42,43,44,45,46,47,48,49,50, 51],normalize='max', val_metric = 'mean_val_score', metric ='mean_test_score')
boost_x_normalize_max_all = bootstrap_result(result_data = inter_model, sampling_ratio=1.0,random_states = [42],normalize='max', val_metric = 'mean_val_score', metric ='mean_test_score')

In [11]:
boost_x_normalize_max.shape, boost_x_normalize_max_all.shape

((7080, 223), (708, 223))

#### Bootstrap like results (for 10 different random seeds)

In [12]:
res = boost_x_normalize_max.groupby(["model_name"],as_index=False).agg({'rank':["min","max","mean","median"],'mean_test_score'+'_normalized':["mean","median", "std"],'mean_time':["mean", "median"]})
res.iloc[[1,3,2,5,10,6,8,9,4,7,11,0]].style.background_gradient(cmap='coolwarm')

Unnamed: 0_level_0,model_name,rank,rank,rank,rank,mean_test_score_normalized,mean_test_score_normalized,mean_test_score_normalized,mean_time,mean_time
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean,median,mean,median,std,mean,median
1,DecisionTree,2.0,12.0,10.476271,11.0,0.86809,0.907437,0.1634,0.29369,0.03244
3,EBM_S,1.0,11.0,7.692373,8.0,0.930764,0.95521,0.086872,23.99701,5.1444
2,EBM,1.0,10.0,5.477119,5.0,0.95893,0.98196,0.067106,97.837115,19.737442
5,Linear,7.0,12.0,11.700847,12.0,0.759739,0.83926,0.231781,21.124014,19.715953
10,TabSRA,1.0,12.0,8.225424,9.0,0.901333,0.971269,0.19661,47.575668,38.073438
6,MLP,1.0,12.0,6.991525,8.0,0.923804,0.972794,0.158786,24.165454,19.255583
8,Resnet,1.0,12.0,7.120339,8.0,0.909464,0.975321,0.195154,95.122911,53.211523
9,SAINT,1.0,12.0,5.625424,6.0,0.946328,0.982436,0.093217,216.052733,126.841212
4,FT Transformer,1.0,11.0,5.20339,5.0,0.943688,0.984018,0.109301,126.589095,77.465159
7,RandomForest,1.0,10.0,4.214407,4.0,0.984898,0.992013,0.021005,39.029824,8.251971


#### Results for all iterations

In [13]:
res = boost_x_normalize_max_all.groupby(["model_name"],as_index=False).agg({'rank':["min","max","mean","median"],'mean_test_score'+'_normalized':["mean","median", "std"],'mean_time':["mean", "median"]})
res.iloc[[1,3,2,5,10,6,8,9,4,7,11,0]].style.background_gradient(cmap='coolwarm')

Unnamed: 0_level_0,model_name,rank,rank,rank,rank,mean_test_score_normalized,mean_test_score_normalized,mean_test_score_normalized,mean_time,mean_time
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean,median,mean,median,std,mean,median
1,DecisionTree,4.0,12.0,10.525424,11.0,0.869102,0.908604,0.169047,0.281834,0.031518
3,EBM_S,1.0,11.0,7.686441,8.0,0.931311,0.95499,0.086172,21.101231,5.268758
2,EBM,1.0,10.0,5.525424,5.0,0.959473,0.98227,0.066571,70.59688,18.138514
5,Linear,8.0,12.0,11.728814,12.0,0.763039,0.840034,0.230408,21.837197,20.461714
10,TabSRA,2.0,12.0,8.008475,8.0,0.909282,0.973625,0.180661,48.167476,38.073438
6,MLP,1.0,12.0,7.016949,8.0,0.927743,0.974647,0.140171,23.840109,16.969646
8,Resnet,2.0,12.0,7.101695,8.0,0.911412,0.975764,0.185223,118.734121,70.919175
9,SAINT,1.0,11.0,5.669492,6.0,0.953479,0.981906,0.074529,253.252389,131.435309
4,FT Transformer,1.0,10.0,5.118644,5.0,0.948776,0.98421,0.095895,164.44565,82.789054
7,RandomForest,1.0,10.0,4.322034,4.0,0.985875,0.993003,0.01932,36.496975,7.918278


#### Result on all iterations
- x $\approx$ 95 per dataset for TabSRALinear
- we consider $\approx$ 3x for CatBoost, XGBoost, 2x for RF, MLP and 1x for FT-Transformer, SAINT, ResNet
- For all inherenlty interpretable models, Decision Tree (DT) , Linear/Logistic Regression (LR), Explainable Boosting Machine (EBM)
and EBM without interactions terms (EBM_S), we use the same budget of optimization: 9days/216hours on 64-core processor CPU. 

## Ablation study (for TabSRALinear)

In [14]:
def sample_data(base_data, group_critera = 'model__module__dim_head', metric='mean_test_score',random_state=42):

    tasks = list(base_data.benchmark.value_counts().index)
    result = []
    ## choice of the task 
    for id_task, task in enumerate(tasks):
        target_result = []
        reference_data_task = base_data[base_data.benchmark==task]
        tmp_group = reference_data_task.groupby(['data__keyword']).agg({metric:'count'})
        data_names = tmp_group.index.to_list()
        #N_sample_reference = tmp_group.values.flatten().tolist()

        ### choice of the data 
        for data_id, data_name in enumerate(data_names):
            condition = reference_data_task.data__keyword==data_name
            N_sample = reference_data_task[condition].groupby(group_critera).agg({metric:'count'})[metric].min()
            target_result.append(reference_data_task[condition].groupby(group_critera).sample(n= N_sample, random_state = random_state))
        result.append(pd.concat(target_result))
    return pd.concat(result)
        

### Study influence of dimension $d_k$ 
we study the influence of $d_k$  when the number of ensembles/heads is one 

In [15]:
inter_dim_attention = inter_model[inter_model.model__module__n_head==1]
inter_dim_attention =  sample_data(inter_dim_attention)

In [16]:
tmp = inter_dim_attention.groupby(['data__keyword','model__module__dim_head'],as_index=False).agg({'mean_val_score':'count'}).rename(columns={'mean_val_score':'Nb'})
tmp.Nb.mean(),tmp.Nb.median(),tmp.Nb.max()

(17.666666666666668, 16.0, 36)

**For each for each configuration of dataset, n head==1 and $d_k \in \{4,8,12\}$, we run $\approx 17$ hyperparameters optimizations iterations**

In [17]:
boost_x_normalize_max_all = bootstrap_result(result_data = inter_dim_attention,group_critera = 'model__module__dim_head', sampling_ratio=1.0,random_states = [42],normalize='max', val_metric = 'mean_val_score', metric ='mean_test_score')
boost_x_normalize_max_all_train = bootstrap_result(result_data = inter_dim_attention,group_critera = 'model__module__dim_head', sampling_ratio=1.0,random_states = [42],normalize='max', val_metric = 'mean_train_score', metric ='mean_train_score')

#### test score (corresponding to the best validation validation configuration)

In [18]:
res = boost_x_normalize_max_all.groupby(["model__module__dim_head"],as_index=False).agg({'rank':["min","max","mean","median"],'mean_test_score'+'_normalized':["mean","median", "std"],'mean_time':["mean", "median"]})
res.style.background_gradient(cmap='coolwarm')

Unnamed: 0_level_0,model__module__dim_head,rank,rank,rank,rank,mean_test_score_normalized,mean_test_score_normalized,mean_test_score_normalized,mean_time,mean_time
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean,median,mean,median,std,mean,median
0,4.0,1.0,3.0,2.20339,2.0,0.933491,0.994062,0.182908,35.945906,29.875256
1,8.0,1.0,3.0,1.991525,2.0,0.961429,0.997392,0.142988,39.42855,37.528634
2,12.0,1.0,3.0,1.805085,2.0,0.957837,0.999291,0.167891,40.719496,34.766769


#### best training configuration

In [19]:
boost_x_normalize_max_all_train.groupby(["model__module__dim_head"],as_index=False).agg({'rank':["min","max","mean","median"],'mean_train_score'+'_normalized':["mean","median", "std"],'mean_time':["mean", "median", "count"]}) \
.style.background_gradient(cmap='coolwarm')

Unnamed: 0_level_0,model__module__dim_head,rank,rank,rank,rank,mean_train_score_normalized,mean_train_score_normalized,mean_train_score_normalized,mean_time,mean_time,mean_time
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean,median,mean,median,std,mean,median,count
0,4.0,1.0,3.0,2.542373,3.0,0.909784,0.984292,0.172646,34.735301,28.964525,59
1,8.0,1.0,3.0,1.915254,2.0,0.953021,0.994483,0.108158,33.176924,28.571159,59
2,12.0,1.0,3.0,1.542373,1.0,0.960809,1.0,0.174095,38.541131,30.238892,59


### Study influence of H: number of  ensembles/heads 
we study the influence of $H$  when the number of the encoder's hidden layer is set one 

In [20]:
cond_2 = (bench_with_tabr.model_name=='TabSRA')&(bench_with_tabr.model__module__n_hidden_encoder==1)
n_head_attention = bench_with_tabr[cond_2]
n_head_attention =  sample_data(n_head_attention,group_critera='model__module__n_head')

In [21]:
tmp = n_head_attention.groupby(['data__keyword','model__module__n_head'],as_index=False).agg({'mean_val_score':'count'}).rename(columns={'mean_val_score':'Nb'})
tmp.Nb.min(),tmp.Nb.mean(),tmp.Nb.median(),tmp.Nb.max()

(10, 29.866666666666667, 26.0, 65)

**For each for each configuration of dataset, n hidden_encoder==1, $H  \in \{1,2,3,4,5,6\}$, we run $\approx 30$ hyperparameters optimizations iterations**

In [22]:
boost_x_normalize_max_all = bootstrap_result(result_data = n_head_attention,group_critera = 'model__module__n_head', sampling_ratio=1.0,random_states = [42],normalize='max', val_metric = 'mean_val_score', metric ='mean_test_score')
boost_x_normalize_max_all_train = bootstrap_result(result_data = n_head_attention,group_critera = 'model__module__n_head', sampling_ratio=1.0,random_states = [42],normalize='max', val_metric = 'mean_train_score', metric ='mean_train_score')

#### test score (corresponding to the best validation validation configuration)

In [23]:
res = boost_x_normalize_max_all.groupby(["model__module__n_head"],as_index=False).agg({'rank':["min","max","mean","median"],'mean_test_score'+'_normalized':["mean","median", "std"],'mean_time':["mean", "median"]})
res.style.background_gradient(cmap='coolwarm')

Unnamed: 0_level_0,model__module__n_head,rank,rank,rank,rank,mean_test_score_normalized,mean_test_score_normalized,mean_test_score_normalized,mean_time,mean_time
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean,median,mean,median,std,mean,median
0,1.0,1.0,6.0,4.169492,5.0,0.953517,0.993783,0.141901,40.758109,36.329584
1,2.0,1.0,6.0,3.542373,4.0,0.961215,0.996075,0.154157,48.419314,38.817081
2,3.0,1.0,6.0,3.457627,3.0,0.95347,0.995228,0.16214,63.702822,57.670847
3,4.0,1.0,6.0,3.127119,3.0,0.962566,0.996454,0.15968,61.878182,53.617516
4,5.0,1.0,6.0,3.20339,3.0,0.976632,0.996632,0.103416,91.68299,62.888771
5,6.0,1.0,6.0,3.5,4.0,0.968803,0.996276,0.116104,99.672029,73.969212


#### best training configuration

In [24]:
boost_x_normalize_max_all_train.groupby(["model__module__n_head"],as_index=False).agg({'rank':["min","max","mean","median"],'mean_train_score'+'_normalized':["mean","median", "std"],'mean_time':["mean", "median", "count"]}) \
.style.background_gradient(cmap='coolwarm')

Unnamed: 0_level_0,model__module__n_head,rank,rank,rank,rank,mean_train_score_normalized,mean_train_score_normalized,mean_train_score_normalized,mean_time,mean_time,mean_time
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean,median,mean,median,std,mean,median,count
0,1.0,1.0,6.0,4.491525,5.0,0.951747,0.983669,0.077268,34.376824,27.852051,59
1,2.0,1.0,6.0,3.466102,3.5,0.953981,0.986448,0.107931,51.992523,37.784909,59
2,3.0,1.0,6.0,3.525424,4.0,0.936132,0.992044,0.146533,58.965438,32.882006,59
3,4.0,1.0,6.0,3.330508,3.0,0.937038,0.991321,0.155277,56.913128,46.199217,59
4,5.0,1.0,6.0,3.033898,3.0,0.941421,0.993779,0.157068,80.318409,61.332792,59
5,6.0,1.0,6.0,3.152542,3.0,0.958266,0.992529,0.114083,88.165148,50.360974,59


### Study influence of n_hidden_encoder: the number of hidden layer in the Query/Key encoder 
we study the influence of n_hidden_encoder for n_head $\in \{1,2\}$ and $d_k \in \{4,8,12\}$

In [25]:
cond_3 = (bench_with_tabr.model_name=='TabSRA')&(bench_with_tabr.model__module__n_hidden_encoder==1)&(bench_with_tabr.model__module__n_head<3)
cond = cond_3|(bench_with_tabr.model__module__n_hidden_encoder!=1)&(bench_with_tabr.model__module__n_head<3)
n_hidden_encoder_attention = bench_with_tabr[cond]
n_hidden_encoder_attention['model__module__n_hidden_encoder'] = n_hidden_encoder_attention['model__module__n_hidden_encoder'].fillna(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  n_hidden_encoder_attention['model__module__n_hidden_encoder'] = n_hidden_encoder_attention['model__module__n_hidden_encoder'].fillna(2)


In [26]:
tmp = n_hidden_encoder_attention.groupby(['data__keyword','model__module__n_hidden_encoder'],as_index=False).agg({'mean_val_score':'count'}).rename(columns={'mean_val_score':'Nb'})
tmp.Nb.mean(),tmp.Nb.median(),tmp.Nb.max()

(115.57777777777778, 97.0, 242)

**For each for each configuration of dataset, n head $\in \{1,2\}$ and $d_k \in \{4,8,12\}$, we run $\approx 115$ hyperparameters optimizations iterations** in order to study the influence of the n_hidden_encoder 

In [27]:
n_hidden_encoder_attention =  sample_data(n_hidden_encoder_attention,group_critera='model__module__n_hidden_encoder')

In [28]:
tmp = n_head_attention.groupby(['data__keyword','model__module__n_hidden_encoder'],as_index=False).agg({'mean_val_score':'count'}).rename(columns={'mean_val_score':'Nb'})
tmp.Nb.min(),tmp.Nb.mean(),tmp.Nb.median(),tmp.Nb.max()

(60, 179.2, 156.0, 390)

In [29]:
boost_x_normalize_max_all = bootstrap_result(result_data = n_hidden_encoder_attention,group_critera = 'model__module__n_hidden_encoder', sampling_ratio=1.0,random_states = [42],normalize='max', val_metric = 'mean_val_score', metric ='mean_test_score')
boost_x_normalize_max_all_train = bootstrap_result(result_data = n_hidden_encoder_attention,group_critera = 'model__module__n_hidden_encoder', sampling_ratio=1.0,random_states = [42],normalize='max', val_metric = 'mean_train_score', metric ='mean_train_score')

In [30]:
boost_x_normalize_max_all.groupby(["model__module__n_hidden_encoder"],as_index=False).agg({'rank':["min","max","mean","median"],'mean_test_score'+'_normalized':["mean","median", "std"],'mean_time':["mean", "median", "count"]}) \
.style.background_gradient(cmap='coolwarm')

Unnamed: 0_level_0,model__module__n_hidden_encoder,rank,rank,rank,rank,mean_test_score_normalized,mean_test_score_normalized,mean_test_score_normalized,mean_time,mean_time,mean_time
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean,median,mean,median,std,mean,median,count
0,1.0,1.0,2.0,1.474576,1.0,0.993707,1.0,0.018899,49.427056,39.931599,59
1,2.0,1.0,2.0,1.525424,2.0,0.988527,0.999424,0.042065,54.597869,45.27572,59


In [31]:
boost_x_normalize_max_all_train.groupby(["model__module__n_hidden_encoder"],as_index=False).agg({'rank':["min","max","mean","median"],'mean_train_score'+'_normalized':["mean","median", "std"],'mean_time':["mean", "median", "count"]}) \
.style.background_gradient(cmap='coolwarm')

Unnamed: 0_level_0,model__module__n_hidden_encoder,rank,rank,rank,rank,mean_train_score_normalized,mean_train_score_normalized,mean_train_score_normalized,mean_time,mean_time,mean_time
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean,median,mean,median,std,mean,median,count
0,1.0,1.0,2.0,1.389831,1.0,0.984088,1.0,0.052872,59.673125,34.181583,59
1,2.0,1.0,2.0,1.610169,2.0,0.979664,0.99738,0.048702,51.173171,44.120896,59


In [32]:
res = boost_x_normalize_max_all.groupby(["model__module__n_hidden_encoder","rank"]).agg({'data__keyword':["count"]})
res.style.background_gradient(cmap='coolwarm')

Unnamed: 0_level_0,Unnamed: 1_level_0,data__keyword
Unnamed: 0_level_1,Unnamed: 1_level_1,count
model__module__n_hidden_encoder,rank,Unnamed: 2_level_2
1.0,1.0,31
1.0,2.0,28
2.0,1.0,28
2.0,2.0,31


In [33]:
boost_x_normalize_max_all = bootstrap_result(result_data = inter_model, sampling_ratio=1.0,random_states = [42], val_metric = 'mean_val_score', metric ='mean_test_score')

In [34]:
boost_x_normalize_max = bootstrap_result(result_data = inter_model, random_states = [42,43,44,45,46,47,48,49,50, 51],normalize='max', val_metric = 'mean_val_score', metric ='mean_test_score')


In [35]:
boost_x_normalize_max.shape

(7080, 223)

In [36]:
res = boost_x_normalize_max.groupby(['benchmark','data__keyword','model_name'],as_index=False).agg({'mean_test_score':["mean","std"],'mean_time':"mean"}).round(3)
#res = res.fillna(0)

### Additional statitics

In [37]:
boost_x_normalize_max_test = res.copy()

In [38]:
benchmarks_data = boost_x_normalize_max_test.benchmark.unique()
boost_x_normalize_max_test.columns = ['benchmark','data__keyword','model_name','mean_test_score','std_test_score','mean_time']
TabSRA_best_EBM = []
for bench_name in benchmarks_data:
    tmp = boost_x_normalize_max_test[boost_x_normalize_max_test.benchmark==bench_name]
    for data_name in tmp.data__keyword.unique():
        if tmp[(tmp.model_name=='TabSRA') & (tmp.data__keyword==data_name)]['mean_test_score'].iloc[0]>1.05*tmp[(tmp.model_name=='EBM') & (tmp.data__keyword==data_name)]['mean_test_score'].iloc[0]:
            TabSRA_best_EBM.append([bench_name,data_name])

In [39]:
benchmarks_data = boost_x_normalize_max_test.benchmark.unique()
boost_x_normalize_max_test.columns = ['benchmark','data__keyword','model_name','mean_test_score','std_test_score','mean_time']
CatBoost_best_EBM = []
for bench_name in benchmarks_data:
    tmp = boost_x_normalize_max_test[boost_x_normalize_max_test.benchmark==bench_name]
    for data_name in tmp.data__keyword.unique():
        if tmp[(tmp.model_name=='CatBoost') & (tmp.data__keyword==data_name)]['mean_test_score'].iloc[0]>1.05*tmp[(tmp.model_name=='EBM') & (tmp.data__keyword==data_name)]['mean_test_score'].iloc[0]:
            CatBoost_best_EBM.append([bench_name,data_name])
            


In [40]:
benchmarks_data = boost_x_normalize_max_test.benchmark.unique()
boost_x_normalize_max_test.columns = ['benchmark','data__keyword','model_name','mean_test_score','std_test_score','mean_time']
EBM_best_TabSRA = []
for bench_name in benchmarks_data:
    tmp = boost_x_normalize_max_test[boost_x_normalize_max_test.benchmark==bench_name]
    for data_name in tmp.data__keyword.unique():
        if tmp[(tmp.model_name=='EBM') & (tmp.data__keyword==data_name)]['mean_test_score'].iloc[0]>1.1*tmp[(tmp.model_name=='TabSRA') & (tmp.data__keyword==data_name)]['mean_test_score'].iloc[0]:
            EBM_best_TabSRA.append([bench_name,data_name])
            

In [41]:
benchmarks_data = boost_x_normalize_max_test.benchmark.unique()
boost_x_normalize_max_test.columns = ['benchmark','data__keyword','model_name','mean_test_score','std_test_score','mean_time']
XGBoost_best_CatBoost = []
CatBoost_best_XGBoost = []
for bench_name in benchmarks_data:
    tmp = boost_x_normalize_max_test[boost_x_normalize_max_test.benchmark==bench_name]
    for data_name in tmp.data__keyword.unique():
        if tmp[(tmp.model_name=='XGBoost') & (tmp.data__keyword==data_name)]['mean_test_score'].iloc[0]>1.0*tmp[(tmp.model_name=='CatBoost') & (tmp.data__keyword==data_name)]['mean_test_score'].iloc[0]:
            XGBoost_best_CatBoost.append([bench_name,data_name])
        else:
            CatBoost_best_XGBoost.append([bench_name,data_name])

In [42]:
TabSRA_best_EBM

[['categorical_classification_medium', 'covertype'],
 ['numerical_regression_medium', 'pol'],
 ['numerical_regression_medium', 'sulfur']]

In [43]:
CatBoost_best_EBM

[['categorical_classification_medium', 'covertype'],
 ['categorical_classification_medium', 'electricity'],
 ['categorical_classification_medium', 'eye_movements'],
 ['categorical_regression_medium', 'topo_2_1'],
 ['numerical_classification_medium', 'covertype'],
 ['numerical_regression_medium', 'pol'],
 ['numerical_regression_medium', 'sulfur'],
 ['numerical_regression_medium', 'wine_quality'],
 ['numerical_regression_medium', 'yprop_4_1']]

In [44]:
EBM_best_TabSRA

[['categorical_regression_medium', 'Airlines_DepDelay_1M'],
 ['categorical_regression_medium', 'delays_zurich_transport'],
 ['categorical_regression_medium', 'seattlecrime6'],
 ['categorical_regression_medium', 'topo_2_1'],
 ['numerical_regression_medium', 'delays_zurich_transport'],
 ['numerical_regression_medium', 'nyc-taxi-green-dec-2016'],
 ['numerical_regression_medium', 'wine_quality'],
 ['numerical_regression_medium', 'yprop_4_1']]

### Results per dataset

In [45]:
features = ['benchmark','data__keyword','Model','Mean Test Score','Std Test Score','Mean Runing Time']
dict_replace_name = {'DecisionTree':'DT','Linear':'LR', 'TabSRA':'TabSRALinear',
                     'Resnet':'ResNet','FT Transformer':'FT-Transformer','RandomForest':'Random Forest'
                    }

In [46]:
res.columns = features
res.replace(dict_replace_name,inplace=True)
res.benchmark.value_counts()

numerical_regression_medium          228
categorical_regression_medium        204
numerical_classification_medium      192
categorical_classification_medium     84
Name: benchmark, dtype: int64

In [48]:
res_NR = res[res.benchmark=='numerical_regression_medium'].pivot_table(values='Mean Test Score',columns='data__keyword',index='Model')
res_NR

data__keyword,Ailerons,Bike_Sharing_Demand,Brazilian_houses,MiamiHousing2016,abalone,cpu_act,delays_zurich_transport,diamonds,elevators,house_16H,house_sales,houses,medical_charges,nyc-taxi-green-dec-2016,pol,sulfur,superconduct,wine_quality,yprop_4_1
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
CatBoost,0.857,0.703,0.996,0.936,0.546,0.986,0.028,0.946,0.913,0.488,0.887,0.85,0.979,0.536,0.991,0.877,0.908,0.495,0.089
DT,0.772,0.636,0.982,0.812,0.489,0.967,0.018,0.94,0.688,0.326,0.794,0.706,0.978,0.435,0.951,0.783,0.804,0.28,0.026
EBM,0.841,0.683,0.99,0.924,0.533,0.982,0.027,0.945,0.887,0.496,0.876,0.817,0.979,0.538,0.923,0.769,0.888,0.392,0.056
EBM_S,0.826,0.652,0.984,0.894,0.512,0.979,0.026,0.944,0.859,0.468,0.842,0.781,0.979,0.516,0.845,0.733,0.883,0.336,0.048
FT-Transformer,0.842,0.679,0.998,0.921,0.568,0.98,0.019,0.943,0.915,0.47,0.88,0.832,0.979,0.451,0.994,0.859,0.878,0.362,0.045
LR,0.819,0.28,0.803,0.72,0.476,0.666,0.005,0.929,0.815,0.229,0.743,0.674,0.819,0.287,0.706,0.523,0.742,0.24,0.043
MLP,0.836,0.674,0.994,0.909,0.576,0.977,0.013,0.942,0.915,0.481,0.865,0.815,0.98,0.454,0.963,0.843,0.893,0.39,0.014
Random Forest,0.839,0.687,0.993,0.924,0.551,0.983,0.031,0.945,0.837,0.502,0.87,0.829,0.979,0.556,0.989,0.844,0.908,0.502,0.092
ResNet,0.838,0.201,0.997,0.913,0.565,0.981,0.011,0.944,0.899,0.489,0.866,0.825,0.979,0.469,0.955,0.807,0.892,0.366,0.013
SAINT,0.571,0.685,0.994,0.921,0.564,0.985,0.021,0.943,0.917,0.489,0.877,0.825,0.979,0.49,0.995,0.787,0.895,0.371,0.057


In [50]:
res_CR = res[res.benchmark=='categorical_regression_medium'].pivot_table(values='Mean Test Score',columns='data__keyword',index='Model')
res_CR.iloc[[1,3,2,5,10,6,8,9,4,7,11,0]]

data__keyword,Airlines_DepDelay_1M,Allstate_Claims_Severity,Bike_Sharing_Demand,Brazilian_houses,Mercedes_Benz_Greener_Manufacturing,SGEMM_GPU_kernel_performance,abalone,analcatdata_supreme,delays_zurich_transport,diamonds,house_sales,medical_charges,nyc-taxi-green-dec-2016,particulate-matter-ukair-2017,seattlecrime6,topo_2_1,visualizing_soil
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
DT,0.037,0.384,0.789,0.984,0.574,1.0,0.497,0.98,0.056,0.964,0.795,0.978,0.441,0.636,0.18,0.009,1.0
EBM_S,0.046,0.51,0.744,0.983,0.549,1.0,0.514,0.981,0.069,0.987,0.853,0.979,0.521,0.67,0.18,0.049,0.935
EBM,0.048,0.517,0.924,0.991,0.563,1.0,0.537,0.983,0.073,0.989,0.885,0.979,0.563,0.679,0.186,0.053,0.993
LR,0.033,0.482,0.366,0.826,0.533,0.699,0.445,0.74,0.008,0.952,0.751,0.819,0.318,0.533,0.04,0.0,0.871
TabSRALinear,0.041,0.507,0.927,0.979,0.547,0.999,0.529,0.958,0.052,0.983,0.879,0.978,0.533,0.652,0.061,0.0,0.996
MLP,0.041,0.514,0.935,0.994,0.558,1.0,0.577,0.981,0.061,0.987,0.878,0.98,0.47,0.659,0.171,0.025,1.0
ResNet,0.04,0.512,0.934,0.996,0.567,1.0,0.575,0.978,0.057,0.987,0.882,0.979,0.486,0.662,0.176,0.019,0.998
SAINT,0.045,0.521,0.94,0.994,0.553,1.0,0.561,0.979,0.065,0.989,0.889,0.979,0.498,0.669,0.18,0.054,1.0
FT-Transformer,0.045,0.519,0.933,0.996,0.561,1.0,0.559,0.98,0.063,0.99,0.891,0.979,0.47,0.671,0.179,0.039,1.0
Random Forest,0.045,0.499,0.935,0.993,0.577,1.0,0.554,0.981,0.076,0.988,0.875,0.979,0.567,0.673,0.182,0.07,1.0


In [53]:
res_NC = res[res.benchmark=='numerical_classification_medium'].pivot_table(values='Mean Test Score',columns='data__keyword',index='Model')
res_NC.iloc[[1,3,2,5,10,6,8,9,4,7,11,0]]

data__keyword,Bioresponse,Diabetes130US,Higgs,MagicTelescope,MiniBooNE,bank-marketing,california,covertype,credit,default-of-credit-card-clients,electricity,eye_movements,heloc,house_16H,jannis,pol
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
DT,0.68,0.601,0.649,0.788,0.869,0.771,0.839,0.745,0.752,0.699,0.775,0.566,0.693,0.823,0.715,0.915
EBM_S,0.772,0.605,0.686,0.828,0.915,0.799,0.879,0.752,0.767,0.713,0.821,0.59,0.722,0.87,0.747,0.948
EBM,0.775,0.606,0.707,0.85,0.924,0.803,0.89,0.777,0.77,0.716,0.829,0.614,0.721,0.877,0.763,0.978
LR,0.735,0.599,0.636,0.768,0.842,0.742,0.831,0.627,0.706,0.678,0.74,0.556,0.71,0.821,0.724,0.855
TabSRALinear,0.767,0.605,0.679,0.85,0.918,0.789,0.877,0.793,0.744,0.709,0.792,0.584,0.719,0.873,0.749,0.983
MLP,0.763,0.604,0.685,0.85,0.933,0.789,0.868,0.78,0.771,0.708,0.807,0.578,0.719,0.877,0.745,0.944
ResNet,0.766,0.604,0.689,0.858,0.936,0.786,0.87,0.79,0.772,0.706,0.808,0.581,0.718,0.873,0.744,0.948
SAINT,0.758,0.604,0.705,0.847,0.937,0.793,0.88,0.801,0.763,0.714,0.819,0.583,0.718,0.888,0.767,0.979
FT-Transformer,0.748,0.605,0.703,0.857,0.934,0.794,0.885,0.799,0.775,0.714,0.816,0.584,0.721,0.88,0.763,0.981
Random Forest,0.794,0.604,0.707,0.853,0.927,0.797,0.892,0.824,0.772,0.718,0.859,0.645,0.717,0.881,0.772,0.982


In [56]:
res_CC = res[res.benchmark=='categorical_classification_medium'].pivot_table(values='Mean Test Score',columns='data__keyword',index='Model')
res_CC.iloc[[1,3,2,5,10,6,8,9,4,7,11,0]]

data__keyword,albert,compas-two-years,covertype,default-of-credit-card-clients,electricity,eye_movements,road-safety
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
DT,0.637,0.66,0.76,0.698,0.767,0.565,0.727
EBM_S,0.652,0.675,0.776,0.712,0.827,0.596,0.732
EBM,0.658,0.672,0.799,0.717,0.838,0.616,0.749
LR,0.635,0.667,0.772,0.679,0.74,0.567,0.697
TabSRALinear,0.643,0.668,0.849,0.711,0.806,0.602,0.748
MLP,0.652,0.678,0.834,0.709,0.819,0.586,0.756
ResNet,0.65,0.676,0.833,0.704,0.824,0.589,0.761
SAINT,0.652,0.674,0.847,0.712,0.83,0.593,0.765
FT-Transformer,0.653,0.682,0.858,0.715,0.831,0.59,0.769
Random Forest,0.654,0.679,0.859,0.717,0.876,0.658,0.76
