In [1]:
# Author: Ahmet Yildirim
# Date: 30.04.2019

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris
import warnings

# Notebook-wide settings: suppress all warnings and raise the pandas display limits
# so that up to 500 rows and 50 columns are rendered before truncation.
warnings.filterwarnings("ignore")
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 50)

In [3]:
def find_optimal_parameters(est, p_grid, X_train, y_train, n=2, scr='accuracy'):
    """Build an unfitted grid search over ``p_grid`` for ``est``.

    Nothing is fitted here: ``X_train`` and ``y_train`` are accepted but not
    used, and the configured ``GridSearchCV`` object is returned for the
    caller to fit (directly or through cross-validation).

    Parameters
    ----------
    est : estimator forwarded as ``GridSearchCV(estimator=...)``.
    p_grid : parameter grid forwarded as ``param_grid``.
    X_train, y_train : unused by this function.
    n : number of splits for the shuffled, stratified K-fold splitter
        (default 2).
    scr : value forwarded as ``scoring`` (default ``'accuracy'``).

    Returns
    -------
    A ``GridSearchCV`` instance configured with ``n_jobs=1``, ``verbose=0``
    and ``refit=True``.
    """
    # Fixed seed so the fold assignment is the same on every call.
    inner_folds = StratifiedKFold(n_splits=n, shuffle=True, random_state=1)

    search_options = dict(
        estimator=est,
        param_grid=p_grid,
        scoring=scr,
        n_jobs=1,
        cv=inner_folds,
        verbose=0,
        refit=True,
    )
    return GridSearchCV(**search_options)

In [4]:
def compute_nested_score(est, X_train, y_train, n):
    """Cross-validate ``est`` and summarise the fold scores as percentages.

    ``est`` is scored with ``cross_val_score`` on a shuffled, stratified
    ``n``-fold split (seed fixed to 1). When ``est`` is itself a grid search,
    as in this notebook, this split acts as the outer loop of a nested
    cross-validation.

    Parameters
    ----------
    est : estimator handed to ``cross_val_score``.
    X_train, y_train : data and targets forwarded as ``X`` and ``y``.
    n : number of outer folds.

    Returns
    -------
    dict with keys ``'mean_score'`` and ``'std_score'``: the mean and the
    standard deviation of the fold scores, each multiplied by 100.
    """
    outer_folds = StratifiedKFold(n_splits=n, shuffle=True, random_state=1)
    fold_scores = cross_val_score(est, X=X_train, y=y_train, cv=outer_folds, n_jobs=1)

    mean_pct = fold_scores.mean() * 100
    std_pct = fold_scores.std() * 100
    return {'mean_score': mean_pct, 'std_score': std_pct}

In [5]:
def compare_models(est_dict, X_train, y_train, n=5):
    """Score every estimator in ``est_dict`` with ``compute_nested_score``.

    Parameters
    ----------
    est_dict : mapping of display name -> estimator.
    X_train, y_train : data and targets forwarded unchanged to
        ``compute_nested_score``.
    n : number of folds forwarded to ``compute_nested_score`` (default 5).

    Returns
    -------
    dict mapping each name in ``est_dict`` (in its iteration order) to the
    value returned by ``compute_nested_score`` for that estimator.
    """
    scores_by_name = {}
    for label, candidate in est_dict.items():
        scores_by_name[label] = compute_nested_score(candidate, X_train, y_train, n)
    return scores_by_name

In [6]:
# Load the iris dataset, cast features and labels to float32, and split it
# 80/20 into train and test sets, stratified on the labels (seed fixed to 1).
dataset = load_iris()
X = dataset['data'].astype(np.float32)
y = dataset['target'].astype(np.float32)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=1,
    stratify=y,
)

In [7]:
# Base estimators, before any hyper-parameter search.
# NOTE(review): `multi_class` may be deprecated in newer scikit-learn releases —
# confirm against the installed version.
logreg = LogisticRegression(
    multi_class='multinomial',
    solver='newton-cg',
    random_state=1,
)
knn = KNeighborsClassifier(
    algorithm='ball_tree',
    leaf_size=50,
)
svm = SVC(random_state=1)

# Search spaces, one list of grids per estimator.
# C spans 1e-4 .. 1e3 and gamma spans 1e-5 .. 1e-1, both in powers of ten.
p_grid_logreg = [
    {'penalty': ['l2'], 'C': np.power(10., np.arange(-4, 4))},
]

p_grid_knn = [
    {'n_neighbors': list(range(1, 10)), 'p': [1, 2]},
]

p_grid_svm = [
    {
        'kernel': ['rbf'],
        'C': np.power(10., np.arange(-4, 4)),
        'gamma': np.power(10., np.arange(-5, 0)),
    },
    {
        'kernel': ['linear'],
        'C': np.power(10., np.arange(-4, 4)),
    },
]

# Parallel lists: position i of each list describes the same estimator.
est_names = ['LogisticRegression', 'KNN', 'SVM']
est_list = [logreg, knn, svm]
p_grid_list = [p_grid_logreg, p_grid_knn, p_grid_svm]

In [8]:
# Map each estimator name to the grid-search wrapper returned by
# find_optimal_parameters (2-fold inner split, accuracy scoring).
tuned_est_dict = {
    est_name: find_optimal_parameters(estimator, grid, X_train, y_train, n=2, scr='accuracy')
    for est_name, estimator, grid in zip(est_names, est_list, p_grid_list)
}

In [9]:
# Score every grid-search wrapper with a 5-fold outer cross-validation on the training split.
result = compare_models(
    est_dict=tuned_est_dict,
    X_train=X_train,
    y_train=y_train,
    n=5,
)

In [12]:
# One row per estimator, ranked by highest mean score, ties broken by lowest spread.
(
    pd.DataFrame(result)
    .transpose()
    .sort_values(by=['mean_score', 'std_score'], ascending=[False, True])
)

Unnamed: 0,mean_score,std_score
SVM,96.666667,3.118048
KNN,96.666667,3.118048
LogisticRegression,94.166667,4.249183
