In [1]:
# Author: Ahmet Yildirim
# Date: 13.09.2019

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris
import warnings

In [3]:
# Notebook-wide configuration.
# Every warning category is silenced for the rest of the session.
warnings.filterwarnings("ignore")

# Let pandas render up to 500 rows and 50 columns before truncating a frame.
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 50)

In [4]:
def create_grid_search(est, p_grid, X_train, y_train, scr, refit, n=2):
    """Build an unfitted ``GridSearchCV`` around ``est``.

    Parameters
    ----------
    est : estimator handed to ``GridSearchCV`` as ``estimator``.
    p_grid : parameter grid handed to ``GridSearchCV`` as ``param_grid``.
    X_train, y_train : accepted for call-site compatibility but not used here;
        nothing is fitted inside this function.
    scr : value forwarded as ``scoring``.
    refit : value forwarded as ``refit``.
    n : number of stratified folds for the search's own cross-validation.

    Returns
    -------
    The configured ``GridSearchCV`` instance (single job, silent).
    """
    # Shuffled stratified folds with a fixed seed so repeated runs split identically.
    inner_folds = StratifiedKFold(n_splits=n, shuffle=True, random_state=1)
    search_options = {
        'estimator': est,
        'param_grid': p_grid,
        'scoring': scr,
        'n_jobs': 1,
        'cv': inner_folds,
        'verbose': 0,
        'refit': refit,
    }
    return GridSearchCV(**search_options)

In [5]:
def compute_nested_score(est, X_train, y_train, scr, n=5):
    """Cross-validate ``est`` and summarise every reported metric.

    ``est`` is evaluated with ``cross_validate`` on shuffled, seeded stratified
    folds. When ``est`` is itself a grid search (as it is in this notebook), the
    folds here act as the outer loop of a nested cross-validation.

    Parameters
    ----------
    est : estimator to evaluate.
    X_train, y_train : data passed to ``cross_validate`` as ``X`` and ``y``.
    scr : value forwarded as ``scoring``.
    n : number of stratified folds.

    Returns
    -------
    dict with two entries, ``'mean_score'`` and ``'std_score'``; each maps every
    column returned by ``cross_validate`` (timings, test and train scores) to its
    mean / pandas standard deviation across the folds.
    """
    outer_folds = StratifiedKFold(n_splits=n, shuffle=True, random_state=1)
    fold_results = cross_validate(
        est,
        X=X_train,
        y=y_train,
        cv=outer_folds,
        n_jobs=1,
        scoring=scr,
        return_train_score=True,
    )
    # One row per fold, one column per metric.
    per_fold = pd.DataFrame(fold_results)

    summary = {}
    summary['mean_score'] = per_fold.mean().to_dict()
    summary['std_score'] = per_fold.std().to_dict()
    return summary

In [6]:
def compare_models(est_dict, X_train, y_train, scr, n=5):
    """Score every estimator in ``est_dict`` with ``compute_nested_score``.

    Parameters
    ----------
    est_dict : mapping of display name -> estimator.
    X_train, y_train, scr, n : forwarded unchanged to ``compute_nested_score``
        for each estimator.

    Returns
    -------
    dict mapping each name in ``est_dict`` (in iteration order) to the summary
    returned by ``compute_nested_score`` for that estimator.
    """
    comparison = {}
    for label, candidate in est_dict.items():
        comparison[label] = compute_nested_score(candidate, X_train, y_train, scr, n)
    return comparison

In [7]:
# Load the iris dataset; features and labels are both cast to float32.
dataset = load_iris()
X = dataset['data'].astype(np.float32)
y = dataset['target'].astype(np.float32)

# Stratified 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=1,
    stratify=y,
)

In [8]:
# Base estimators; the hyper-parameters listed in the grids below are the ones searched over.
logreg = LogisticRegression(
    multi_class='multinomial',
    solver='newton-cg',
    random_state=1,
)
knn = KNeighborsClassifier(algorithm='ball_tree', leaf_size=50)
svm = SVC(random_state=1)

# Candidate C values shared by the logistic-regression and SVM grids: 1e-4 ... 1e3.
C_VALUES = np.power(10., np.arange(-4, 4))

# Logistic regression: L2 penalty only, C varies.
p_grid_logreg = [{'penalty': ['l2'], 'C': C_VALUES}]

# KNN: 1-9 neighbours, Minkowski power 1 or 2.
p_grid_knn = [{'n_neighbors': list(range(1, 10)), 'p': [1, 2]}]

# SVM: an RBF sub-grid (C x gamma, gamma in 1e-5 ... 1e-1) and a linear sub-grid (C only).
p_grid_svm = [
    {'kernel': ['rbf'], 'C': C_VALUES, 'gamma': np.power(10., np.arange(-5, 0))},
    {'kernel': ['linear'], 'C': C_VALUES},
]

# Parallel lists: position i of each list describes the same model.
est_names = ['LogisticRegression', 'KNN', 'SVM']
est_list = [logreg, knn, svm]
p_grid_list = [p_grid_logreg, p_grid_knn, p_grid_svm]

# Metric name -> scorer used for the model comparison.
scoring = {'accuracy': 'accuracy'}

In [9]:
# One (not yet fitted) accuracy-driven grid search per model, keyed by model name.
# Each search uses 2 inner stratified folds.
grid_search_dict = {
    model_name: create_grid_search(
        estimator, param_grid, X_train, y_train, scr='accuracy', refit='accuracy', n=2
    )
    for model_name, estimator, param_grid in zip(est_names, est_list, p_grid_list)
}

In [10]:
# Compare the grid searches with a 5-fold outer cross-validation on the training split.
# Each outer fold re-runs the grid search, so tuning happens inside the evaluation loop.
result = compare_models(
    est_dict=grid_search_dict,
    X_train=X_train,
    y_train=y_train,
    scr=scoring,
    n=5,
)

In [11]:
# Comparison table: one row per model, with the mean and std across outer folds of the
# timings and of train/test accuracy (accuracy is the only metric in `scoring`).
pd.concat({
    model_name: pd.DataFrame(stats).unstack().to_frame().T
    for model_name, stats in result.items()
})

Unnamed: 0_level_0,Unnamed: 1_level_0,mean_score,mean_score,mean_score,mean_score,std_score,std_score,std_score,std_score
Unnamed: 0_level_1,Unnamed: 1_level_1,fit_time,score_time,test_accuracy,train_accuracy,fit_time,score_time,test_accuracy,train_accuracy
KNN,0,0.121391,0.00095,0.966667,0.96875,0.006577,1.4e-05,0.034861,0.023292
LogisticRegression,0,0.303104,0.000191,0.941667,0.972917,0.034667,2e-05,0.047507,0.011877
SVM,0,0.131478,0.00025,0.966667,0.977083,0.01545,8.2e-05,0.034861,0.017116


In [12]:
# Take the SVM grid search, fit it on the full training split and keep the winning parameters.
best_algo = grid_search_dict['SVM']
best_algo.fit(X_train, y_train)
best_params = best_algo.best_params_

# Accuracy of the fitted search on the training split and on the held-out test split.
train_predictions = best_algo.predict(X_train)
test_predictions = best_algo.predict(X_test)
train_acc = accuracy_score(y_true=y_train, y_pred=train_predictions)
test_acc = accuracy_score(y_true=y_test, y_pred=test_predictions)

In [15]:
# Express the scores as percentages before printing the final report.
cv_accuracy_pct = 100 * best_algo.best_score_
train_accuracy_pct = 100 * train_acc
test_accuracy_pct = 100 * test_acc

print('Accuracy {0:.2f} (average over CV test folds)'.format(cv_accuracy_pct))
print('Best Parameters: {}'.format(best_params))
print('Training Accuracy: {0:.2f}'.format(train_accuracy_pct))
print('Test Accuracy: {0:.2f}'.format(test_accuracy_pct))

Accuracy 98.33 (average over CV test folds)
Best Parameters: {'C': 10.0, 'gamma': 0.1, 'kernel': 'rbf'}
Training Accuracy: 97.50
Test Accuracy: 96.67
