In [48]:
import ast

import common_datasets.binary_classification as binclas
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import wilcoxon

In [49]:
# Load the per-fold experiment results. Judging by the `data.head()` output
# further down, the columns are name, fold, oversampler, sparam, classifier,
# cparam and auc, plus a leftover 'Unnamed: 0' index column.
# NOTE(review): the path is relative to the kernel's working directory.
data = pd.read_csv('appendicitis-reg.csv')

In [50]:
# Collapse the per-fold rows into one row per
# (dataset, oversampler, oversampler params, classifier, classifier params)
# combination; the 'auc' cell of each row is the list of AUCs ordered by fold.
grouped = (
    data
    .groupby(['name', 'oversampler', 'sparam', 'classifier', 'cparam'])
    .apply(lambda fold_rows: pd.Series(
        {'auc': fold_rows.sort_values('fold')['auc'].values.tolist()}))
    .reset_index(drop=False)
)

In [51]:
# Mean AUC across folds for every configuration (each 'auc' cell is a list).
grouped['auc_mean'] = grouped['auc'].apply(np.mean)

In [52]:
def extract_reg_param(row):
    """Return the regularization-like hyperparameter of the row's classifier.

    The parameter looked up depends on ``row['classifier']``:
    ``C`` for SVC, ``max_depth`` for DecisionTreeClassifier and
    RandomForestClassifier, ``n_neighbors`` for KNeighborsClassifier.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``'classifier'`` (str) and ``'cparam'``. ``'cparam'`` is
        either the string form of a Python dict literal (as stored in the
        CSV) or an already-parsed dict.

    Returns
    -------
    The value stored under the classifier-specific key, or ``None`` when the
    classifier is not one of the four handled types.

    Raises
    ------
    KeyError
        If the parameter dict lacks the expected key.
    ValueError, SyntaxError
        If ``'cparam'`` is a string that is not a plain Python literal.
    """
    # Which entry of the parameter dict plays the regularization role for
    # each classifier type.
    reg_param_keys = {
        'SVC': 'C',
        'DecisionTreeClassifier': 'max_depth',
        'RandomForestClassifier': 'max_depth',
        'KNeighborsClassifier': 'n_neighbors',
    }

    key = reg_param_keys.get(row['classifier'])
    if key is None:
        return None

    cparam = row['cparam']
    if isinstance(cparam, str):
        # The strings come from a file on disk: literal_eval only accepts
        # literals, whereas eval() would execute any expression found there.
        cparam = ast.literal_eval(cparam)
    return cparam[key]

def extract_classifier_subparam(row):
    """Return a label for the SVC kernel variant of a row, ``''`` otherwise.

    For SVC rows the label is the kernel name (``'rbf'`` when the parameter
    dict has no ``'kernel'`` entry) followed by the ``'degree'`` value, if
    any — e.g. ``'poly2'``. Every other classifier yields an empty string.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``'classifier'`` and ``'cparam'``. ``'cparam'`` is either
        the string form of a Python dict literal or an already-parsed dict.

    Returns
    -------
    str
    """
    if row['classifier'] != 'SVC':
        return ''

    cparam = row['cparam']
    if isinstance(cparam, str):
        # Parse once, and with literal_eval rather than eval(): the string is
        # read from a file and must not be executed as code.
        cparam = ast.literal_eval(cparam)

    return cparam.get('kernel', 'rbf') + str(cparam.get('degree', ''))

In [53]:
# Attach the classifier-specific regularization-like parameter
# (C / max_depth / n_neighbors) to every aggregated configuration.
grouped['reg_param'] = grouped.apply(extract_reg_param, axis=1)

In [54]:
# For every classifier, line up the mean AUC without oversampling (NoSMOTE)
# against the mean AUC with SMOTE for each parameter setting, and print one
# table per kernel variant, ordered by the regularization parameter.
for classifier in ['SVC', 'DecisionTreeClassifier', 'RandomForestClassifier', 'KNeighborsClassifier']:
    filtered = grouped[grouped['classifier'] == classifier]
    nosmote = filtered[filtered['oversampler'] == 'NoSMOTE']
    smote = filtered[filtered['oversampler'] == 'SMOTE']
    # pd.merge defaults to an inner join, so only parameter settings present
    # for both oversamplers are kept; the SMOTE mean is renamed so the two
    # means can sit side by side.
    merged = pd.merge(nosmote,
                        smote[['reg_param', 'cparam', 'auc_mean']].rename(columns={'auc_mean': 'auc_mean_smote'}),
                        on=['reg_param', 'cparam'])
    # Kernel name + degree for SVC rows, '' for all other classifiers.
    merged['kernel'] = merged.apply(extract_classifier_subparam, axis=1)

    kernels = merged['kernel'].drop_duplicates().values

    for kernel in kernels:
        print(merged[merged['kernel'] == kernel].sort_values('reg_param')[['classifier', 'kernel', 'reg_param', 'auc_mean', 'auc_mean_smote']])
# NOTE: after the loop `merged` still holds the frame of the last classifier
# in the list (KNeighborsClassifier).

   classifier kernel  reg_param  auc_mean  auc_mean_smote
0         SVC  poly2      0.001  0.711359        0.250122
3         SVC  poly2      0.002  0.713850        0.250703
6         SVC  poly2      0.005  0.713638        0.261456
9         SVC  poly2      0.010  0.710585        0.657913
12        SVC  poly2      0.020  0.710629        0.746131
15        SVC  poly2      0.050  0.710712        0.750371
18        SVC  poly2      0.100  0.713918        0.751929
21        SVC  poly2      0.200  0.723368        0.752891
24        SVC  poly2      0.500  0.725665        0.755069
27        SVC  poly2      1.000  0.720097        0.755713
30        SVC  poly2      2.000  0.718424        0.746403
   classifier kernel  reg_param  auc_mean  auc_mean_smote
1         SVC  poly3      0.001  0.821631        0.155966
4         SVC  poly3      0.002  0.821949        0.402381
7         SVC  poly3      0.005  0.820960        0.857509
10        SVC  poly3      0.010  0.820712        0.848819
13        SVC 

In [55]:
# Rich display of the rows whose kernel label is '' (i.e. non-SVC rows).
# `merged` here is whatever the preceding loop left behind from its final
# iteration — the KNeighborsClassifier comparison, as the output shows.
merged[merged['kernel'] == ''].sort_values('reg_param')[['classifier', 'reg_param', 'auc_mean', 'auc_mean_smote']]

Unnamed: 0,classifier,reg_param,auc_mean,auc_mean_smote
2,KNeighborsClassifier,1.0,0.733643,0.709665
11,KNeighborsClassifier,5.0,0.779324,0.790846
12,KNeighborsClassifier,9.0,0.804013,0.809659
0,KNeighborsClassifier,13.0,0.81514,0.81761
1,KNeighborsClassifier,17.0,0.834174,0.82769
3,KNeighborsClassifier,21.0,0.830619,0.835022
4,KNeighborsClassifier,25.0,0.829724,0.834254
5,KNeighborsClassifier,29.0,0.843328,0.837459
6,KNeighborsClassifier,33.0,0.849118,0.842318
7,KNeighborsClassifier,37.0,0.849035,0.847653


In [56]:
# Add the regularization-like parameter to the raw per-fold rows as well.
data['reg_param'] = data.apply(extract_reg_param, axis=1)

In [57]:
# Quick look at the first rows, including the new 'reg_param' column.
data.head()

Unnamed: 0.1,Unnamed: 0,name,fold,oversampler,sparam,classifier,cparam,auc,reg_param
0,0,appendicitis,0,SMOTE,"{'n_neighbors': 5, 'proportion': 1.0, 'random_...",DecisionTreeClassifier,"{'max_depth': 3, 'random_state': 5}",0.564706,3.0
1,1,appendicitis,0,SMOTE,"{'n_neighbors': 5, 'proportion': 1.0, 'random_...",DecisionTreeClassifier,"{'max_depth': 5, 'random_state': 5}",0.6,5.0
2,2,appendicitis,0,SMOTE,"{'n_neighbors': 5, 'proportion': 1.0, 'random_...",DecisionTreeClassifier,"{'max_depth': 7, 'random_state': 5}",0.670588,7.0
3,3,appendicitis,0,SMOTE,"{'n_neighbors': 5, 'proportion': 1.0, 'random_...",DecisionTreeClassifier,"{'max_depth': 9, 'random_state': 5}",0.670588,9.0
4,4,appendicitis,0,SMOTE,"{'n_neighbors': 5, 'proportion': 1.0, 'random_...",DecisionTreeClassifier,"{'max_depth': 11, 'random_state': 5}",0.670588,11.0


In [58]:
# Turn the stringified parameter dicts into Python objects.
# NOTE(review): eval() executes whatever expression the CSV contains. If the
# stored values are plain literals, ast.literal_eval would be a safer
# replacement — confirm against the full 'sparam' strings, which are
# truncated in the displayed output.
# The cell cannot be re-run as is: eval() only accepts strings/code objects,
# so it raises TypeError on the already-parsed dicts.
data['sparam'] = data['sparam'].apply(eval)
data['cparam'] = data['cparam'].apply(eval)

In [59]:
def remove_key(dict, key):
    """Remove ``key`` from the given mapping in place and return that mapping.

    The argument itself is modified (no copy is made), and the very same
    object is returned so the call can be used inside ``Series.apply``.
    Raises ``KeyError`` when ``key`` is absent.

    Note: the first parameter is named ``dict`` and therefore shadows the
    builtin inside this function.
    """
    dict.pop(key)
    return dict

In [60]:
# Pull the metric-learning method out of the nested 'nn_params' entry of the
# oversampler parameters into its own column, drop that entry (remove_key
# mutates the dicts in place), then convert both parameter columns back to
# strings — presumably so they can be used as groupby/merge keys below.
# NOTE(review): the saved output of this cell is `KeyError: 'nn_params'`, so
# the 'sparam' dicts of the currently loaded CSV apparently have no
# 'nn_params' entry. Confirm which results file this part of the notebook is
# meant to run on.
data['metric'] = data['sparam'].apply(lambda x: x['nn_params']['metric_learning_method'])
data['sparam'] = data['sparam'].apply(lambda x: remove_key(x, 'nn_params'))
data['cparam'] = data['cparam'].apply(str)
data['sparam'] = data['sparam'].apply(str)

KeyError: 'nn_params'

In [None]:
# List the columns available after preprocessing.
# NOTE(review): the saved output has no 'oversampler' or 'reg_param' column,
# unlike the frame shown earlier — it looks like it stems from a different
# run; confirm.
data.columns

Index(['Unnamed: 0', 'name', 'fold', 'sparam', 'classifier', 'cparam', 'auc',
       'metric'],
      dtype='object')

In [None]:
# Preview the preprocessed rows.
# NOTE(review): the saved output shows the 'bupa' dataset although the
# notebook loads 'appendicitis-reg.csv' — the output looks stale; confirm.
data.head()

Unnamed: 0.1,Unnamed: 0,name,fold,sparam,classifier,cparam,auc,metric
0,0,bupa,0,"{'n_neighbors': 5, 'proportion': 1.0, 'random_...",DecisionTreeClassifier,"{'max_depth': 4, 'random_state': 5}",0.7375,id
1,1,bupa,0,"{'n_neighbors': 5, 'proportion': 1.0, 'random_...",DecisionTreeClassifier,"{'max_depth': 6, 'random_state': 5}",0.675431,id
2,2,bupa,0,"{'n_neighbors': 5, 'proportion': 1.0, 'random_...",DecisionTreeClassifier,"{'max_depth': 8, 'random_state': 5}",0.681034,id
3,3,bupa,0,"{'n_neighbors': 5, 'proportion': 1.0, 'random_...",RandomForestClassifier,"{'max_depth': 4, 'random_state': 5}",0.859483,id
4,4,bupa,0,"{'n_neighbors': 5, 'proportion': 1.0, 'random_...",RandomForestClassifier,"{'max_depth': 6, 'random_state': 5}",0.858621,id


In [None]:
# One entry per (dataset, oversampler params, classifier, classifier params,
# metric) combination, holding the list of AUCs ordered by fold.
grouped = (
    data
    .groupby(['name', 'sparam', 'classifier', 'cparam', 'metric'])
    .apply(lambda fold_rows: fold_rows.sort_values('fold')['auc'].values.tolist())
)

In [None]:
# Flatten the group keys back into columns and name the list column 'auc'.
grouped = grouped.reset_index(drop=False).rename(columns={0: 'auc'})

# Split by metric-learning method ...
determ = grouped.loc[grouped['metric'] == 'MI_weighted'].drop(columns=['metric'])
rand = grouped.loc[grouped['metric'] == 'id'].drop(columns=['metric'])

# ... and pair the two variants of every configuration on one row:
# 'auc_det' holds the 'MI_weighted' fold AUCs, 'auc' the 'id' ones.
merged = determ.rename(columns={'auc': 'auc_det'}).merge(
    rand, on=['name', 'sparam', 'classifier', 'cparam'])

In [None]:
# Fold-level summary statistics: target column -> (source list column, reducer).
fold_stats = {
    'auc_mean_det': ('auc_det', np.mean),
    'auc_std_det': ('auc_det', np.std),
    'auc_min_det': ('auc_det', np.min),
    'auc_max_det': ('auc_det', np.max),
    'auc_mean': ('auc', np.mean),
    'auc_std': ('auc', np.std),
    'auc_min': ('auc', np.min),
    'auc_max': ('auc', np.max),
}
for column, (source, func) in fold_stats.items():
    merged[column] = merged[source].apply(func)

# One-sided paired Wilcoxon signed-rank tests of 'auc_det' against 'auc',
# computed row by row on the fold AUC lists.
for column, alternative in {'p_l': 'less', 'p_g': 'greater'}.items():
    merged[column] = merged.apply(
        lambda row: wilcoxon(row['auc_det'], row['auc'],
                             zero_method='zsplit',
                             alternative=alternative).pvalue,
        axis=1)

# Flags marking p-values below the 0.05 threshold.
for flag, p_col in {'f_l': 'p_l', 'f_g': 'p_g'}.items():
    merged[flag] = merged[p_col] < 0.05

In [None]:
def model_selection(pdf):
    """Compare the best 'det' configuration with the best plain configuration.

    Two rows of ``pdf`` are picked independently: the first row reaching the
    maximum of ``'auc_mean_det'`` and the first row reaching the maximum of
    ``'auc_mean'``. Their fold AUC lists (``'auc_det'`` of the former,
    ``'auc'`` of the latter) are then compared.

    Parameters
    ----------
    pdf : pandas.DataFrame
        Needs the columns ``auc_det``, ``auc``, ``auc_mean_det``,
        ``auc_mean``, ``auc_std_det`` and ``auc_std``.

    Returns
    -------
    pandas.Series
        Mean and std of both winners, the one-sided Wilcoxon p-values
        (``p_l``: alternative 'less', ``p_g``: alternative 'greater') and the
        medians of both winners' fold AUCs.
    """
    def _first_best(column):
        # First row (in frame order) attaining the column maximum.
        top = pdf[column].max()
        return pdf.loc[pdf[column] == top].iloc[0]

    best_det = _first_best('auc_mean_det')
    best_ran = _first_best('auc_mean')

    det_scores = best_det['auc_det']
    ran_scores = best_ran['auc']

    p_values = {
        side: wilcoxon(det_scores, ran_scores,
                       zero_method='zsplit', alternative=side).pvalue
        for side in ('less', 'greater')
    }

    summary = {
        'auc_mean_det': best_det['auc_mean_det'],
        'auc_mean': best_ran['auc_mean'],
        'auc_std_det': best_det['auc_std_det'],
        'auc_std': best_ran['auc_std'],
        'p_l': p_values['less'],
        'p_g': p_values['greater'],
        'auc_median_det': np.median(det_scores),
        'auc_median': np.median(ran_scores),
    }
    return pd.Series(summary)

In [None]:
# Per classifier: pick the best configuration of each metric variant by mean
# AUC and compare the two winners (see model_selection).
merged.groupby('classifier').apply(model_selection)

Unnamed: 0_level_0,auc_mean_det,auc_mean,auc_std_det,auc_std,p_l,p_g,auc_median_det,auc_median
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
DecisionTreeClassifier,0.661172,0.661109,0.060384,0.059722,0.443313,0.556687,0.662931,0.664224
KNeighborsClassifier,0.659195,0.658767,0.055252,0.055262,0.665874,0.334126,0.661207,0.657543
RandomForestClassifier,0.762595,0.764598,0.052691,0.052473,0.002484,0.997516,0.765517,0.76681
SVC,0.65013,0.650833,0.065829,0.065979,0.001734,0.998266,0.651724,0.652155


In [None]:
# Per classifier, the row(s) with the highest mean AUC of the 'MI_weighted'
# variant (auc_mean_det); ties would all be listed.
merged.groupby('classifier').apply(lambda pdf: pdf[pdf['auc_mean_det'] == pdf['auc_mean_det'].max()])

Unnamed: 0_level_0,Unnamed: 1_level_0,name,sparam,classifier,cparam,auc_det,auc,auc_mean_det,auc_std_det,auc_min_det,auc_max_det,auc_mean,auc_std,auc_min,auc_max,p_l,p_g,f_l,f_g
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
DecisionTreeClassifier,0,bupa,"{'n_neighbors': 5, 'proportion': 1.0, 'random_...",DecisionTreeClassifier,"{'max_depth': 4, 'random_state': 5}","[0.6857758620689656, 0.5900862068965518, 0.638...","[0.7375, 0.6668103448275862, 0.7, 0.6504310344...",0.661172,0.060384,0.45819,0.855603,0.661109,0.059722,0.475,0.833621,0.443313,0.556687,False,False
KNeighborsClassifier,4,bupa,"{'n_neighbors': 5, 'proportion': 1.0, 'random_...",KNeighborsClassifier,{'n_neighbors': 3},"[0.6133620689655173, 0.6340517241379311, 0.630...","[0.6051724137931035, 0.6189655172413793, 0.637...",0.659195,0.055252,0.482759,0.853448,0.658767,0.055262,0.492241,0.853448,0.665874,0.334126,False,False
RandomForestClassifier,10,bupa,"{'n_neighbors': 5, 'proportion': 1.0, 'random_...",RandomForestClassifier,"{'max_depth': 8, 'random_state': 5}","[0.8543103448275862, 0.7698275862068966, 0.755...","[0.8551724137931035, 0.7422413793103448, 0.755...",0.762595,0.052691,0.603448,0.928448,0.764598,0.052473,0.606034,0.923276,0.002484,0.997516,True,False
SVC,19,bupa,"{'n_neighbors': 5, 'proportion': 1.0, 'random_...",SVC,"{'C': 0.1, 'probability': True, 'random_state'...","[0.746551724137931, 0.6741379310344828, 0.6362...","[0.7543103448275862, 0.6715517241379311, 0.634...",0.65013,0.065829,0.437069,0.850862,0.650833,0.065979,0.434483,0.850862,0.001734,0.998266,True,False


In [None]:
# Overall best configuration by mean AUC of the 'MI_weighted' variant.
merged[merged['auc_mean_det'] == merged['auc_mean_det'].max()]

Unnamed: 0,name,sparam,classifier,cparam,auc_det,auc,auc_mean_det,auc_std_det,auc_min_det,auc_max_det,auc_mean,auc_std,auc_min,auc_max,p_l,p_g,f_l,f_g
10,bupa,"{'n_neighbors': 5, 'proportion': 1.0, 'random_...",RandomForestClassifier,"{'max_depth': 8, 'random_state': 5}","[0.8543103448275862, 0.7698275862068966, 0.755...","[0.8551724137931035, 0.7422413793103448, 0.755...",0.762595,0.052691,0.603448,0.928448,0.764598,0.052473,0.606034,0.923276,0.002484,0.997516,True,False


In [None]:
# Per classifier, the row(s) with the highest mean AUC of the 'id' variant
# (auc_mean); ties would all be listed.
merged.groupby('classifier').apply(lambda pdf: pdf[pdf['auc_mean'] == pdf['auc_mean'].max()])

Unnamed: 0_level_0,Unnamed: 1_level_0,name,sparam,classifier,cparam,auc_det,auc,auc_mean_det,auc_std_det,auc_min_det,auc_max_det,auc_mean,auc_std,auc_min,auc_max,p_l,p_g,f_l,f_g
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
DecisionTreeClassifier,0,bupa,"{'n_neighbors': 5, 'proportion': 1.0, 'random_...",DecisionTreeClassifier,"{'max_depth': 4, 'random_state': 5}","[0.6857758620689656, 0.5900862068965518, 0.638...","[0.7375, 0.6668103448275862, 0.7, 0.6504310344...",0.661172,0.060384,0.45819,0.855603,0.661109,0.059722,0.475,0.833621,0.443313,0.556687,False,False
KNeighborsClassifier,4,bupa,"{'n_neighbors': 5, 'proportion': 1.0, 'random_...",KNeighborsClassifier,{'n_neighbors': 3},"[0.6133620689655173, 0.6340517241379311, 0.630...","[0.6051724137931035, 0.6189655172413793, 0.637...",0.659195,0.055252,0.482759,0.853448,0.658767,0.055262,0.492241,0.853448,0.665874,0.334126,False,False
RandomForestClassifier,10,bupa,"{'n_neighbors': 5, 'proportion': 1.0, 'random_...",RandomForestClassifier,"{'max_depth': 8, 'random_state': 5}","[0.8543103448275862, 0.7698275862068966, 0.755...","[0.8551724137931035, 0.7422413793103448, 0.755...",0.762595,0.052691,0.603448,0.928448,0.764598,0.052473,0.606034,0.923276,0.002484,0.997516,True,False
SVC,19,bupa,"{'n_neighbors': 5, 'proportion': 1.0, 'random_...",SVC,"{'C': 0.1, 'probability': True, 'random_state'...","[0.746551724137931, 0.6741379310344828, 0.6362...","[0.7543103448275862, 0.6715517241379311, 0.634...",0.65013,0.065829,0.437069,0.850862,0.650833,0.065979,0.434483,0.850862,0.001734,0.998266,True,False


In [None]:
# The table display truncates long strings, so print the full classifier
# parameters of the last per-classifier winner (the last row of the frame).
tmp = merged.groupby('classifier').apply(lambda pdf: pdf[pdf['auc_mean'] == pdf['auc_mean'].max()])
tmp.iloc[-1]['cparam']

"{'C': 0.1, 'probability': True, 'random_state': 5}"

In [None]:
# Overall best configuration by mean AUC of the 'id' variant.
merged[merged['auc_mean'] == merged['auc_mean'].max()]

Unnamed: 0,name,sparam,classifier,cparam,auc_det,auc,auc_mean_det,auc_std_det,auc_min_det,auc_max_det,auc_mean,auc_std,auc_min,auc_max,p_l,p_g,f_l,f_g
10,bupa,"{'n_neighbors': 5, 'proportion': 1.0, 'random_...",RandomForestClassifier,"{'max_depth': 8, 'random_state': 5}","[0.8543103448275862, 0.7698275862068966, 0.755...","[0.8551724137931035, 0.7422413793103448, 0.755...",0.762595,0.052691,0.603448,0.928448,0.764598,0.052473,0.606034,0.923276,0.002484,0.997516,True,False


In [None]:
# Column-wise averages over all configurations. For the boolean f_l / f_g
# columns the mean is the fraction of configurations with p < 0.05.
merged[['auc_mean', 'auc_std', 'auc_mean_det', 'auc_std_det', 'p_l', 'p_g', 'f_l', 'f_g']].mean()

auc_mean        0.610534
auc_std         0.052280
auc_mean_det    0.610252
auc_std_det     0.052778
p_l             0.405746
p_g             0.594254
f_l             0.250000
f_g             0.100000
dtype: float64

In [None]:
# Average of the per-configuration worst-fold and best-fold AUCs for both variants.
merged[['auc_min', 'auc_max', 'auc_min_det', 'auc_max_det']].mean()

auc_min        0.429138
auc_max        0.770366
auc_min_det    0.425280
auc_max_det    0.773621
dtype: float64

In [None]:
# Medians over all configurations of the per-configuration mean and std AUC.
merged[['auc_mean', 'auc_std', 'auc_mean_det', 'auc_std_det']].median()

auc_mean        0.630787
auc_std         0.057362
auc_mean_det    0.630810
auc_std_det     0.057514
dtype: float64