In [61]:
import numpy as np
import pandas as pd
from scipy.stats import wilcoxon
import common_datasets.binary_classification as binclas

In [62]:
# Load the per-fold AUC results; the path is relative to the kernel's working directory.
# NOTE(review): the loaded frame shows an 'Unnamed: 0' column, which usually means the CSV
# was written together with its index — confirm whether index_col=0 is wanted here.
data = pd.read_csv('bupa-ml.csv')

In [63]:
# Preview the first rows of the raw frame; 'sparam' and 'cparam' are still plain strings here.
data.head()

Unnamed: 0.1,Unnamed: 0,name,fold,sparam,classifier,cparam,auc
0,0,bupa,0,"{'n_neighbors': 5, 'proportion': 1.0, 'random_...",DecisionTreeClassifier,"{'max_depth': 4, 'random_state': 5}",0.7375
1,1,bupa,0,"{'n_neighbors': 5, 'proportion': 1.0, 'random_...",DecisionTreeClassifier,"{'max_depth': 6, 'random_state': 5}",0.675431
2,2,bupa,0,"{'n_neighbors': 5, 'proportion': 1.0, 'random_...",DecisionTreeClassifier,"{'max_depth': 8, 'random_state': 5}",0.681034
3,3,bupa,0,"{'n_neighbors': 5, 'proportion': 1.0, 'random_...",RandomForestClassifier,"{'max_depth': 4, 'random_state': 5}",0.859483
4,4,bupa,0,"{'n_neighbors': 5, 'proportion': 1.0, 'random_...",RandomForestClassifier,"{'max_depth': 6, 'random_state': 5}",0.858621


In [64]:
# Turn the stringified parameter dicts in 'sparam' and 'cparam' into Python objects.
# NOTE(review): eval() executes whatever code the CSV cells contain, so this is only
# acceptable while 'bupa-ml.csv' is a trusted file. If every cell is a plain literal,
# ast.literal_eval would be a safer replacement — confirm against the full data first.
data['sparam'] = data['sparam'].apply(eval)
data['cparam'] = data['cparam'].apply(eval)

In [65]:
def remove_key(dict, key):
    """Return a copy of ``dict`` with ``key`` removed.

    The mapping passed in is left untouched, so calling this from
    ``Series.apply`` no longer edits the objects stored in the source column
    as a side effect. The copy is shallow: nested values are shared with the
    input.

    Parameters
    ----------
    dict : dict
        Mapping to copy. The name shadows the builtin; it is kept unchanged so
        existing keyword callers keep working.
    key : hashable
        Key to drop from the copy.

    Returns
    -------
    dict
        A new dictionary holding every item of the input except ``key``.

    Raises
    ------
    KeyError
        If ``key`` is not present in the input.
    """
    trimmed = dict.copy()
    # ``del`` (rather than ``pop`` with a default) keeps the KeyError on a missing key.
    del trimmed[key]
    return trimmed

In [66]:
# Lift the metric-learning method out of the nested 'nn_params' entry into its own column.
data['metric'] = data['sparam'].apply(
    lambda params: params['nn_params']['metric_learning_method']
)

# Drop 'nn_params' from the sampler parameters, then stringify them so that
# they can be used as a groupby / merge key (dicts are not hashable).
data['sparam'] = (
    data['sparam']
    .apply(lambda params: remove_key(params, 'nn_params'))
    .apply(str)
)

# Same stringification for the classifier parameters.
data['cparam'] = data['cparam'].apply(str)

In [67]:
# List the columns to confirm that 'metric' was added next to the original ones.
data.columns

Index(['Unnamed: 0', 'name', 'fold', 'sparam', 'classifier', 'cparam', 'auc',
       'metric'],
      dtype='object')

In [68]:
# Preview again: 'metric' is now a separate column and 'sparam' no longer carries 'nn_params'.
data.head()

Unnamed: 0.1,Unnamed: 0,name,fold,sparam,classifier,cparam,auc,metric
0,0,bupa,0,"{'n_neighbors': 5, 'proportion': 1.0, 'random_...",DecisionTreeClassifier,"{'max_depth': 4, 'random_state': 5}",0.7375,id
1,1,bupa,0,"{'n_neighbors': 5, 'proportion': 1.0, 'random_...",DecisionTreeClassifier,"{'max_depth': 6, 'random_state': 5}",0.675431,id
2,2,bupa,0,"{'n_neighbors': 5, 'proportion': 1.0, 'random_...",DecisionTreeClassifier,"{'max_depth': 8, 'random_state': 5}",0.681034,id
3,3,bupa,0,"{'n_neighbors': 5, 'proportion': 1.0, 'random_...",RandomForestClassifier,"{'max_depth': 4, 'random_state': 5}",0.859483,id
4,4,bupa,0,"{'n_neighbors': 5, 'proportion': 1.0, 'random_...",RandomForestClassifier,"{'max_depth': 6, 'random_state': 5}",0.858621,id


In [69]:
# One entry per (dataset, sampler params, classifier, classifier params, metric):
# the list of AUC values ordered by fold, so that lists from different metrics
# can later be compared position by position.
grouped = (
    data
    .groupby(['name', 'sparam', 'classifier', 'cparam', 'metric'])
    .apply(lambda runs: runs.sort_values('fold')['auc'].values.tolist())
)

In [70]:
# Flatten the group keys back into columns and give the list column (labelled 0) a name.
grouped = grouped.reset_index().rename(columns={0: 'auc'})

# Split by metric: 'MI_weighted' rows go to `determ`, 'id' rows go to `rand`.
determ = grouped.loc[grouped['metric'] == 'MI_weighted'].drop(columns='metric')
rand = grouped.loc[grouped['metric'] == 'id'].drop(columns='metric')

# Put both metrics side by side per configuration:
# 'auc_det' holds the MI_weighted AUC list, 'auc' holds the id AUC list.
merged = determ.rename(columns={'auc': 'auc_det'}).merge(
    rand,
    on=['name', 'sparam', 'classifier', 'cparam'],
)

In [71]:
# Summary statistics of each per-fold AUC list; np.std uses its default ddof=0.
# The underscore-prefixed loop names are throwaway.
for _target, _source, _reducer in (
    ('auc_mean_det', 'auc_det', np.mean),
    ('auc_std_det', 'auc_det', np.std),
    ('auc_min_det', 'auc_det', np.min),
    ('auc_max_det', 'auc_det', np.max),
    ('auc_mean', 'auc', np.mean),
    ('auc_std', 'auc', np.std),
    ('auc_min', 'auc', np.min),
    ('auc_max', 'auc', np.max),
):
    merged[_target] = merged[_source].apply(_reducer)

# One-sided paired Wilcoxon signed-rank tests of 'auc_det' against 'auc', row by row.
# 'p_l' uses alternative='less', 'p_g' uses alternative='greater'.
for _target, _alternative in (('p_l', 'less'), ('p_g', 'greater')):
    merged[_target] = merged.apply(
        lambda row: wilcoxon(
            row['auc_det'],
            row['auc'],
            zero_method='zsplit',
            alternative=_alternative,
        ).pvalue,
        axis=1,
    )

# Flag the rows whose p-value falls below the 0.05 level.
merged['f_l'] = merged['p_l'] < 0.05
merged['f_g'] = merged['p_g'] < 0.05

In [72]:
def model_selection(pdf):
    """Compare the best '_det' configuration with the best unsuffixed one.

    The row with the highest ``auc_mean_det`` and the row with the highest
    ``auc_mean`` are picked independently (first match on ties), so they may
    be different rows of ``pdf``. Their per-fold lists (``auc_det`` and
    ``auc``) are then compared with one-sided Wilcoxon signed-rank tests.

    Parameters
    ----------
    pdf : pandas.DataFrame
        Needs the columns ``auc_det``, ``auc``, ``auc_mean_det``,
        ``auc_std_det``, ``auc_mean`` and ``auc_std``.

    Returns
    -------
    pandas.Series
        Mean and standard deviation of both selected rows, the p-values for
        the 'less' and 'greater' alternatives, and the medians of both lists.
    """
    det_means = pdf['auc_mean_det']
    ran_means = pdf['auc_mean']

    # First row reaching the maximum of each mean column.
    best_det = pdf[det_means == det_means.max()].iloc[0]
    best_ran = pdf[ran_means == ran_means.max()].iloc[0]

    det_folds = best_det['auc_det']
    ran_folds = best_ran['auc']

    def one_sided_pvalue(alternative):
        # Paired test of the selected '_det' folds against the selected unsuffixed folds.
        outcome = wilcoxon(det_folds, ran_folds, zero_method='zsplit', alternative=alternative)
        return outcome.pvalue

    summary = {}
    summary['auc_mean_det'] = best_det['auc_mean_det']
    summary['auc_mean'] = best_ran['auc_mean']
    summary['auc_std_det'] = best_det['auc_std_det']
    summary['auc_std'] = best_ran['auc_std']
    summary['p_l'] = one_sided_pvalue('less')
    summary['p_g'] = one_sided_pvalue('greater')
    summary['auc_median_det'] = np.median(det_folds)
    summary['auc_median'] = np.median(ran_folds)
    return pd.Series(summary)

In [73]:
# Run model_selection on each classifier's rows; the result has one summary row per classifier.
merged.groupby('classifier').apply(model_selection)

Unnamed: 0_level_0,auc_mean_det,auc_mean,auc_std_det,auc_std,p_l,p_g,auc_median_det,auc_median
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
DecisionTreeClassifier,0.661172,0.661109,0.060384,0.059722,0.443313,0.556687,0.662931,0.664224
KNeighborsClassifier,0.659195,0.658767,0.055252,0.055262,0.665874,0.334126,0.661207,0.657543
RandomForestClassifier,0.762595,0.764598,0.052691,0.052473,0.002484,0.997516,0.765517,0.76681
SVC,0.65013,0.650833,0.065829,0.065979,0.001734,0.998266,0.651724,0.652155


In [74]:
# For every classifier, show the configuration(s) with the highest mean AUC in the '_det' columns.
merged.groupby('classifier').apply(
    lambda rows: rows.loc[rows['auc_mean_det'] == rows['auc_mean_det'].max()]
)

Unnamed: 0_level_0,Unnamed: 1_level_0,name,sparam,classifier,cparam,auc_det,auc,auc_mean_det,auc_std_det,auc_min_det,auc_max_det,auc_mean,auc_std,auc_min,auc_max,p_l,p_g,f_l,f_g
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
DecisionTreeClassifier,0,bupa,"{'n_neighbors': 5, 'proportion': 1.0, 'random_...",DecisionTreeClassifier,"{'max_depth': 4, 'random_state': 5}","[0.6857758620689656, 0.5900862068965518, 0.638...","[0.7375, 0.6668103448275862, 0.7, 0.6504310344...",0.661172,0.060384,0.45819,0.855603,0.661109,0.059722,0.475,0.833621,0.443313,0.556687,False,False
KNeighborsClassifier,4,bupa,"{'n_neighbors': 5, 'proportion': 1.0, 'random_...",KNeighborsClassifier,{'n_neighbors': 3},"[0.6133620689655173, 0.6340517241379311, 0.630...","[0.6051724137931035, 0.6189655172413793, 0.637...",0.659195,0.055252,0.482759,0.853448,0.658767,0.055262,0.492241,0.853448,0.665874,0.334126,False,False
RandomForestClassifier,10,bupa,"{'n_neighbors': 5, 'proportion': 1.0, 'random_...",RandomForestClassifier,"{'max_depth': 8, 'random_state': 5}","[0.8543103448275862, 0.7698275862068966, 0.755...","[0.8551724137931035, 0.7422413793103448, 0.755...",0.762595,0.052691,0.603448,0.928448,0.764598,0.052473,0.606034,0.923276,0.002484,0.997516,True,False
SVC,19,bupa,"{'n_neighbors': 5, 'proportion': 1.0, 'random_...",SVC,"{'C': 0.1, 'probability': True, 'random_state'...","[0.746551724137931, 0.6741379310344828, 0.6362...","[0.7543103448275862, 0.6715517241379311, 0.634...",0.65013,0.065829,0.437069,0.850862,0.650833,0.065979,0.434483,0.850862,0.001734,0.998266,True,False


In [75]:
# Overall best configuration(s) by mean AUC in the '_det' columns, across all classifiers.
merged.loc[merged['auc_mean_det'] == merged['auc_mean_det'].max()]

Unnamed: 0,name,sparam,classifier,cparam,auc_det,auc,auc_mean_det,auc_std_det,auc_min_det,auc_max_det,auc_mean,auc_std,auc_min,auc_max,p_l,p_g,f_l,f_g
10,bupa,"{'n_neighbors': 5, 'proportion': 1.0, 'random_...",RandomForestClassifier,"{'max_depth': 8, 'random_state': 5}","[0.8543103448275862, 0.7698275862068966, 0.755...","[0.8551724137931035, 0.7422413793103448, 0.755...",0.762595,0.052691,0.603448,0.928448,0.764598,0.052473,0.606034,0.923276,0.002484,0.997516,True,False


In [76]:
# For every classifier, show the configuration(s) with the highest mean AUC in the unsuffixed columns.
merged.groupby('classifier').apply(
    lambda rows: rows.loc[rows['auc_mean'] == rows['auc_mean'].max()]
)

Unnamed: 0_level_0,Unnamed: 1_level_0,name,sparam,classifier,cparam,auc_det,auc,auc_mean_det,auc_std_det,auc_min_det,auc_max_det,auc_mean,auc_std,auc_min,auc_max,p_l,p_g,f_l,f_g
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
DecisionTreeClassifier,0,bupa,"{'n_neighbors': 5, 'proportion': 1.0, 'random_...",DecisionTreeClassifier,"{'max_depth': 4, 'random_state': 5}","[0.6857758620689656, 0.5900862068965518, 0.638...","[0.7375, 0.6668103448275862, 0.7, 0.6504310344...",0.661172,0.060384,0.45819,0.855603,0.661109,0.059722,0.475,0.833621,0.443313,0.556687,False,False
KNeighborsClassifier,4,bupa,"{'n_neighbors': 5, 'proportion': 1.0, 'random_...",KNeighborsClassifier,{'n_neighbors': 3},"[0.6133620689655173, 0.6340517241379311, 0.630...","[0.6051724137931035, 0.6189655172413793, 0.637...",0.659195,0.055252,0.482759,0.853448,0.658767,0.055262,0.492241,0.853448,0.665874,0.334126,False,False
RandomForestClassifier,10,bupa,"{'n_neighbors': 5, 'proportion': 1.0, 'random_...",RandomForestClassifier,"{'max_depth': 8, 'random_state': 5}","[0.8543103448275862, 0.7698275862068966, 0.755...","[0.8551724137931035, 0.7422413793103448, 0.755...",0.762595,0.052691,0.603448,0.928448,0.764598,0.052473,0.606034,0.923276,0.002484,0.997516,True,False
SVC,19,bupa,"{'n_neighbors': 5, 'proportion': 1.0, 'random_...",SVC,"{'C': 0.1, 'probability': True, 'random_state'...","[0.746551724137931, 0.6741379310344828, 0.6362...","[0.7543103448275862, 0.6715517241379311, 0.634...",0.65013,0.065829,0.437069,0.850862,0.650833,0.065979,0.434483,0.850862,0.001734,0.998266,True,False


In [82]:
# Same per-classifier selection as above, kept in `tmp` so that the full
# (untruncated) classifier parameters of the last row can be read.
tmp = merged.groupby('classifier').apply(
    lambda rows: rows.loc[rows['auc_mean'] == rows['auc_mean'].max()]
)
tmp['cparam'].iloc[-1]

"{'C': 0.1, 'probability': True, 'random_state': 5}"

In [77]:
# Overall best configuration(s) by mean AUC in the unsuffixed columns, across all classifiers.
merged.loc[merged['auc_mean'] == merged['auc_mean'].max()]

Unnamed: 0,name,sparam,classifier,cparam,auc_det,auc,auc_mean_det,auc_std_det,auc_min_det,auc_max_det,auc_mean,auc_std,auc_min,auc_max,p_l,p_g,f_l,f_g
10,bupa,"{'n_neighbors': 5, 'proportion': 1.0, 'random_...",RandomForestClassifier,"{'max_depth': 8, 'random_state': 5}","[0.8543103448275862, 0.7698275862068966, 0.755...","[0.8551724137931035, 0.7422413793103448, 0.755...",0.762595,0.052691,0.603448,0.928448,0.764598,0.052473,0.606034,0.923276,0.002484,0.997516,True,False


In [78]:
# Averages over all configurations. 'f_l' and 'f_g' are boolean flags, so their
# mean is the fraction of configurations with a p-value below 0.05.
merged.loc[
    :,
    ['auc_mean', 'auc_std', 'auc_mean_det', 'auc_std_det', 'p_l', 'p_g', 'f_l', 'f_g'],
].mean()

auc_mean        0.610534
auc_std         0.052280
auc_mean_det    0.610252
auc_std_det     0.052778
p_l             0.405746
p_g             0.594254
f_l             0.250000
f_g             0.100000
dtype: float64

In [79]:
# Average of the per-configuration worst and best fold AUC, for both column sets.
merged.loc[
    :,
    ['auc_min', 'auc_max', 'auc_min_det', 'auc_max_det'],
].mean()

auc_min        0.429138
auc_max        0.770366
auc_min_det    0.425280
auc_max_det    0.773621
dtype: float64

In [80]:
# Median over configurations of the per-configuration mean and standard deviation of AUC.
merged.loc[
    :,
    ['auc_mean', 'auc_std', 'auc_mean_det', 'auc_std_det'],
].median()

auc_mean        0.630787
auc_std         0.057362
auc_mean_det    0.630810
auc_std_det     0.057514
dtype: float64