In [316]:
import numpy as np
import pandas as pd
from scipy.stats import wilcoxon
import common_datasets.binary_classification as binclas

In [317]:
# Load the results table; the preview below shows one AUC value per
# (name, fold, sparam, classifier, cparam) row.
data = pd.read_csv('haberman.csv')

In [318]:
# Preview the first rows of the raw table.
data.head()

Unnamed: 0.1,Unnamed: 0,name,fold,sparam,classifier,cparam,auc
0,0,haberman,0,"{'n_neighbors': 3, 'proportion': 0.5, 'random_...",DecisionTreeClassifier,"{'max_depth': 2, 'random_state': 5}",0.604575
1,1,haberman,0,"{'n_neighbors': 3, 'proportion': 0.5, 'random_...",DecisionTreeClassifier,"{'max_depth': 4, 'random_state': 5}",0.594771
2,2,haberman,0,"{'n_neighbors': 3, 'proportion': 0.5, 'random_...",DecisionTreeClassifier,"{'max_depth': 6, 'random_state': 5}",0.639216
3,3,haberman,0,"{'n_neighbors': 3, 'proportion': 0.5, 'random_...",DecisionTreeClassifier,"{'max_depth': 8, 'random_state': 5}",0.666667
4,4,haberman,0,"{'n_neighbors': 3, 'proportion': 0.5, 'random_...",RandomForestClassifier,"{'max_depth': 2, 'random_state': 5}",0.601961


In [319]:
# Turn the stringified parameter dictionaries back into Python objects.
# NOTE(review): eval() executes whatever code is stored in the CSV cells. If the
# stored values are plain Python literals, ast.literal_eval would be a safer
# drop-in -- confirm against the full file contents before switching.
data['sparam'] = data['sparam'].apply(eval)
data['cparam'] = data['cparam'].apply(eval)

In [320]:
def remove_key(dict, key):
    """Delete ``key`` from ``dict`` in place and hand the same object back.

    Note that the first parameter is named ``dict`` and therefore hides the
    builtin of the same name inside this function.

    Parameters
    ----------
    dict : dict
        Mapping to modify; it is mutated, not copied.
    key : hashable
        Key to remove.

    Returns
    -------
    dict
        The very object that was passed in, now without ``key``.

    Raises
    ------
    KeyError
        If ``key`` is not present.
    """
    # pop() without a default raises KeyError for a missing key, like ``del``.
    dict.pop(key)
    return dict

In [321]:
# Read the within-simplex sampling mode out of the nested 'ss_params' entry
# first, because the next step removes that entry from the dictionaries.
sampler_params = data['sparam']
data['deterministic'] = sampler_params.apply(
    lambda params: params['ss_params']['within_simplex_sampling']
)

# Drop 'ss_params' and store both parameter sets as strings so that they can
# be used as group-by / merge keys later on.
data['sparam'] = sampler_params.apply(
    lambda params: str(remove_key(params, 'ss_params'))
)
data['cparam'] = data['cparam'].apply(str)

In [322]:
# Check the column names now that 'deterministic' has been added.
data.columns

Index(['Unnamed: 0', 'name', 'fold', 'sparam', 'classifier', 'cparam', 'auc',
       'deterministic'],
      dtype='object')

In [323]:
# Preview the table again, now including the 'deterministic' column.
data.head()

Unnamed: 0.1,Unnamed: 0,name,fold,sparam,classifier,cparam,auc,deterministic
0,0,haberman,0,"{'n_neighbors': 3, 'proportion': 0.5, 'random_...",DecisionTreeClassifier,"{'max_depth': 2, 'random_state': 5}",0.604575,random
1,1,haberman,0,"{'n_neighbors': 3, 'proportion': 0.5, 'random_...",DecisionTreeClassifier,"{'max_depth': 4, 'random_state': 5}",0.594771,random
2,2,haberman,0,"{'n_neighbors': 3, 'proportion': 0.5, 'random_...",DecisionTreeClassifier,"{'max_depth': 6, 'random_state': 5}",0.639216,random
3,3,haberman,0,"{'n_neighbors': 3, 'proportion': 0.5, 'random_...",DecisionTreeClassifier,"{'max_depth': 8, 'random_state': 5}",0.666667,random
4,4,haberman,0,"{'n_neighbors': 3, 'proportion': 0.5, 'random_...",RandomForestClassifier,"{'max_depth': 2, 'random_state': 5}",0.601961,random


In [324]:
# Collapse the folds: for every (dataset, sampler params, classifier,
# classifier params, sampling mode) combination keep the AUC values as one
# list ordered by fold index.
config_columns = ['name', 'sparam', 'classifier', 'cparam', 'deterministic']
grouped = (
    data
    .groupby(config_columns)
    .apply(lambda fold_rows: fold_rows.sort_values(by='fold')['auc'].values.tolist())
)

In [325]:
# Turn the group keys back into ordinary columns; the unnamed result column of
# the group-by shows up as 0 and is renamed to 'auc'.
grouped = grouped.reset_index().rename(columns={0: 'auc'})

# Split the table by sampling mode ...
is_deterministic = grouped['deterministic'] == 'deterministic'
is_random = grouped['deterministic'] == 'random'
determ = grouped.loc[is_deterministic].drop(columns='deterministic')
rand = grouped.loc[is_random].drop(columns='deterministic')

# ... and put both modes of the same configuration side by side:
# 'auc_det' holds the deterministic fold scores, 'auc' the random ones.
merged = determ.rename(columns={'auc': 'auc_det'}).merge(
    rand, on=['name', 'sparam', 'classifier', 'cparam']
)

In [326]:
# Per-configuration summaries of the fold-wise AUC lists. The deterministic
# columns are written first and the random ones second, which keeps the column
# order auc_{mean,std,min,max}_det followed by auc_{mean,std,min,max}.
fold_statistics = (('mean', np.mean), ('std', np.std), ('min', np.min), ('max', np.max))
for scores_column, suffix in (('auc_det', '_det'), ('auc', '')):
    for stat_label, stat_func in fold_statistics:
        merged['auc_' + stat_label + suffix] = merged[scores_column].apply(stat_func)

# One-sided Wilcoxon signed-rank tests on the paired fold scores
# (deterministic minus random): 'p_l' uses alternative='less',
# 'p_g' uses alternative='greater'.
for pvalue_column, alternative in (('p_l', 'less'), ('p_g', 'greater')):
    merged[pvalue_column] = merged.apply(
        lambda row: wilcoxon(row['auc_det'], row['auc'],
                             zero_method='zsplit', alternative=alternative).pvalue,
        axis=1,
    )

# Flag the configurations whose p-value falls below 0.05.
merged['f_l'] = merged['p_l'].lt(0.05)
merged['f_g'] = merged['p_g'].lt(0.05)

In [327]:
def model_selection(pdf):
    """Compare the best deterministic with the best random configuration.

    The row with the highest ``auc_mean_det`` and the row with the highest
    ``auc_mean`` are selected independently (the first row wins on ties), so
    the two may belong to different parameter settings. Their per-fold AUC
    lists are then compared with one-sided Wilcoxon signed-rank tests.

    Parameters
    ----------
    pdf : pandas.DataFrame
        Table with the columns ``auc_det``, ``auc`` (per-fold score
        sequences), ``auc_mean_det``, ``auc_std_det``, ``auc_mean`` and
        ``auc_std``.

    Returns
    -------
    pandas.Series
        Entries, in this order: ``auc_mean_det``, ``auc_mean``,
        ``auc_std_det``, ``auc_std``, ``p_l`` (alternative ``'less'``),
        ``p_g`` (alternative ``'greater'``), ``auc_median_det``,
        ``auc_median``.
    """
    det_is_best = pdf['auc_mean_det'] == pdf['auc_mean_det'].max()
    ran_is_best = pdf['auc_mean'] == pdf['auc_mean'].max()
    best_det = pdf[det_is_best].iloc[0]
    best_ran = pdf[ran_is_best].iloc[0]

    det_scores = best_det['auc_det']
    ran_scores = best_ran['auc']

    # Both tests use the same paired samples; only the alternative differs.
    p_values = {
        side: wilcoxon(det_scores, ran_scores,
                       zero_method='zsplit', alternative=side).pvalue
        for side in ('less', 'greater')
    }

    summary = {
        'auc_mean_det': best_det['auc_mean_det'],
        'auc_mean': best_ran['auc_mean'],
        'auc_std_det': best_det['auc_std_det'],
        'auc_std': best_ran['auc_std'],
        'p_l': p_values['less'],
        'p_g': p_values['greater'],
        'auc_median_det': np.median(det_scores),
        'auc_median': np.median(ran_scores),
    }
    return pd.Series(summary)

In [328]:
# For each classifier, compare its best deterministic configuration with its
# best random configuration (each selected by mean AUC in model_selection).
merged.groupby('classifier').apply(model_selection)

Unnamed: 0_level_0,auc_mean_det,auc_mean,auc_std_det,auc_std,p_l,p_g,auc_median_det,auc_median
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
DecisionTreeClassifier,0.665536,0.665514,0.065766,0.060871,0.627707,0.372293,0.666319,0.670139
KNeighborsClassifier,0.643067,0.643093,0.071446,0.069934,0.446855,0.553145,0.646446,0.649163
RandomForestClassifier,0.710325,0.708644,0.06055,0.062652,0.909747,0.090253,0.711458,0.711111
SVC,0.719221,0.718938,0.070995,0.070885,0.810147,0.189853,0.723897,0.722917


In [329]:
# For each classifier, show the row(s) with the highest mean AUC under
# deterministic sampling; tied rows are all kept.
merged.groupby('classifier').apply(lambda pdf: pdf[pdf['auc_mean_det'] == pdf['auc_mean_det'].max()])

Unnamed: 0_level_0,Unnamed: 1_level_0,name,sparam,classifier,cparam,auc_det,auc,auc_mean_det,auc_std_det,auc_min_det,auc_max_det,auc_mean,auc_std,auc_min,auc_max,p_l,p_g,f_l,f_g
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
DecisionTreeClassifier,155,haberman,"{'n_neighbors': 7, 'proportion': 1.0, 'random_...",DecisionTreeClassifier,"{'max_depth': 4, 'random_state': 5}","[0.5908496732026143, 0.7548611111111111, 0.653...","[0.630718954248366, 0.8020833333333333, 0.7270...",0.665536,0.065766,0.474306,0.807639,0.660699,0.073328,0.434722,0.826389,0.915268,0.084732,False,False
KNeighborsClassifier,116,haberman,"{'n_neighbors': 5, 'proportion': 1.5, 'random_...",KNeighborsClassifier,{'n_neighbors': 5},"[0.550326797385621, 0.6277777777777778, 0.5958...","[0.565359477124183, 0.6069444444444444, 0.5687...",0.643067,0.071446,0.419444,0.796528,0.643093,0.069934,0.390278,0.797917,0.446855,0.553145,False,False
RandomForestClassifier,120,haberman,"{'n_neighbors': 5, 'proportion': 1.5, 'random_...",RandomForestClassifier,"{'max_depth': 4, 'random_state': 5}","[0.5947712418300654, 0.7388888888888889, 0.75,...","[0.5934640522875817, 0.7486111111111111, 0.730...",0.710325,0.06055,0.491503,0.873611,0.707879,0.059607,0.51634,0.866667,0.984935,0.015065,False,True
SVC,150,haberman,"{'n_neighbors': 7, 'proportion': 0.5, 'random_...",SVC,"{'C': 0.01, 'probability': True, 'random_state...","[0.6261437908496732, 0.751388888888889, 0.6930...","[0.615686274509804, 0.75, 0.6958333333333333, ...",0.719221,0.070995,0.462745,0.851634,0.7187,0.07096,0.458824,0.84902,0.928957,0.071043,False,False


In [330]:
# Overall best row(s) by mean AUC under deterministic sampling.
merged[merged['auc_mean_det'] == merged['auc_mean_det'].max()]

Unnamed: 0,name,sparam,classifier,cparam,auc_det,auc,auc_mean_det,auc_std_det,auc_min_det,auc_max_det,auc_mean,auc_std,auc_min,auc_max,p_l,p_g,f_l,f_g
150,haberman,"{'n_neighbors': 7, 'proportion': 0.5, 'random_...",SVC,"{'C': 0.01, 'probability': True, 'random_state...","[0.6261437908496732, 0.751388888888889, 0.6930...","[0.615686274509804, 0.75, 0.6958333333333333, ...",0.719221,0.070995,0.462745,0.851634,0.7187,0.07096,0.458824,0.84902,0.928957,0.071043,False,False


In [331]:
# For each classifier, show the row(s) with the highest mean AUC under random
# sampling; tied rows are all kept.
merged.groupby('classifier').apply(lambda pdf: pdf[pdf['auc_mean'] == pdf['auc_mean'].max()])

Unnamed: 0_level_0,Unnamed: 1_level_0,name,sparam,classifier,cparam,auc_det,auc,auc_mean_det,auc_std_det,auc_min_det,auc_max_det,auc_mean,auc_std,auc_min,auc_max,p_l,p_g,f_l,f_g
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
DecisionTreeClassifier,176,haberman,"{'n_neighbors': 7, 'proportion': 1.5, 'random_...",DecisionTreeClassifier,"{'max_depth': 2, 'random_state': 5}","[0.5215686274509804, 0.8076388888888888, 0.655...","[0.5875816993464053, 0.8083333333333332, 0.672...",0.665476,0.063516,0.49085,0.85,0.665514,0.060871,0.49085,0.822222,0.583773,0.416227,False,False
KNeighborsClassifier,116,haberman,"{'n_neighbors': 5, 'proportion': 1.5, 'random_...",KNeighborsClassifier,{'n_neighbors': 5},"[0.550326797385621, 0.6277777777777778, 0.5958...","[0.565359477124183, 0.6069444444444444, 0.5687...",0.643067,0.071446,0.419444,0.796528,0.643093,0.069934,0.390278,0.797917,0.446855,0.553145,False,False
RandomForestClassifier,164,haberman,"{'n_neighbors': 7, 'proportion': 1.0, 'random_...",RandomForestClassifier,"{'max_depth': 4, 'random_state': 5}","[0.6052287581699347, 0.75, 0.7305555555555555,...","[0.5830065359477125, 0.7361111111111112, 0.773...",0.708954,0.061871,0.500654,0.879167,0.708644,0.062652,0.51634,0.879167,0.69542,0.30458,False,False
SVC,153,haberman,"{'n_neighbors': 7, 'proportion': 0.5, 'random_...",SVC,"{'C': 0.1, 'probability': True, 'random_state'...","[0.6235294117647059, 0.7486111111111112, 0.691...","[0.6143790849673203, 0.75, 0.701388888888889, ...",0.719117,0.071229,0.465359,0.854248,0.718938,0.070885,0.458824,0.84902,0.6654,0.3346,False,False


In [332]:
# Overall best row(s) by mean AUC under random sampling.
merged[merged['auc_mean'] == merged['auc_mean'].max()]

Unnamed: 0,name,sparam,classifier,cparam,auc_det,auc,auc_mean_det,auc_std_det,auc_min_det,auc_max_det,auc_mean,auc_std,auc_min,auc_max,p_l,p_g,f_l,f_g
153,haberman,"{'n_neighbors': 7, 'proportion': 0.5, 'random_...",SVC,"{'C': 0.1, 'probability': True, 'random_state'...","[0.6235294117647059, 0.7486111111111112, 0.691...","[0.6143790849673203, 0.75, 0.701388888888889, ...",0.719117,0.071229,0.465359,0.854248,0.718938,0.070885,0.458824,0.84902,0.6654,0.3346,False,False


In [333]:
# Column means over all configurations. For the boolean flags f_l / f_g the
# mean is the fraction of configurations with a p-value below 0.05.
merged[['auc_mean', 'auc_std', 'auc_mean_det', 'auc_std_det', 'p_l', 'p_g', 'f_l', 'f_g']].mean()

auc_mean        0.608057
auc_std         0.077244
auc_mean_det    0.607343
auc_std_det     0.077029
p_l             0.431314
p_g             0.568686
f_l             0.141414
f_g             0.055556
dtype: float64

In [334]:
# Average of the per-configuration worst and best fold AUC, for both sampling modes.
merged[['auc_min', 'auc_max', 'auc_min_det', 'auc_max_det']].mean()

auc_min        0.381989
auc_max        0.802708
auc_min_det    0.380450
auc_max_det    0.799888
dtype: float64

In [335]:
# Medians over all configurations of the per-configuration mean and std of AUC.
merged[['auc_mean', 'auc_std', 'auc_mean_det', 'auc_std_det']].median()

auc_mean        0.632868
auc_std         0.070529
auc_mean_det    0.631631
auc_std_det     0.070056
dtype: float64