# Optimizing model parameters

In [6]:
from sklearn import datasets

iris = datasets.load_iris()
X = iris.data[:,2:]
y = iris.target
from sklearn.model_selection import train_test_split, cross_val_score

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y)  # split into 75:25 by default

In [7]:
from sklearn.neighbors import KNeighborsClassifier
### Number of nearest neighbors
knn_clf = KNeighborsClassifier()

In [8]:
knn_clf.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [9]:
from sklearn.metrics import accuracy_score

In [11]:
# To evaluate all the hyper parameter combinations we can write a nested for loops as below 
best_score = 0
for nn in [2, 3, 4, 5, 6, 7, 8, 9]: 
    for algo in ['auto', 'ball_tree', 'kd_tree', 'brute']: 
        # for each combination of parameters, train a model 
        knn_clf = KNeighborsClassifier(n_neighbors = nn , algorithm = algo)
        knn_clf.fit(X_train, y_train)
        # evaluate the model on the test data
        score = knn_clf.score( X_test, y_test) 
        # if the score is higher than previous iteration score, store the score and parameters 
        if score > best_score: 
            best_score = score 
            best_parameters = {' n_neighbors': nn, 'algorithm': algo} 
            
            
print(" Best score: {:.2f}".format( best_score)) 
print(" Best parameters: {}".format( best_parameters))



 Best score: 0.97
 Best parameters: {' n_neighbors': 3, 'algorithm': 'auto'}


In [12]:
# The problem with this approach it is not reliable as it is based on only one test data
# and the best hyper parameter values are influenced by test data 
# Instead we can split data into three parts - Training, Validation and Testing. Do the hyper parameter evaluation on training 
# and validation data. Check the best hyper parameters on the test set.  
# When we split data into three and sample is small, this reduced the number of data points available for training!


# Use Grid Search with Cross Validation

In [12]:
param_grid = {'n_neighbors': list(range(2,9)) , 'algorithm' : ['auto', 'ball_tree', 'kd_tree', 'brute']}

In [13]:
from sklearn.model_selection import GridSearchCV
gs = GridSearchCV(knn_clf,param_grid,cv=10)

In [14]:
gs.fit(X_train, y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='brute', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=9, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=None,
             param_grid={'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                         'n_neighbors': [2, 3, 4, 5, 6, 7, 8]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [15]:
gs.best_params_    # Check if this is the same combination obtained thru one single train test above in the for next loop

{'algorithm': 'auto', 'n_neighbors': 2}

In [16]:
gs.best_estimator_   # To obtain an instance of the model with the best hyper parameters. This is automatically copied /used to
                     # update our model with the best hyper parameters

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=2, p=2,
                     weights='uniform')

In [17]:
gs.cv_results_['params']

[{'algorithm': 'auto', 'n_neighbors': 2},
 {'algorithm': 'auto', 'n_neighbors': 3},
 {'algorithm': 'auto', 'n_neighbors': 4},
 {'algorithm': 'auto', 'n_neighbors': 5},
 {'algorithm': 'auto', 'n_neighbors': 6},
 {'algorithm': 'auto', 'n_neighbors': 7},
 {'algorithm': 'auto', 'n_neighbors': 8},
 {'algorithm': 'ball_tree', 'n_neighbors': 2},
 {'algorithm': 'ball_tree', 'n_neighbors': 3},
 {'algorithm': 'ball_tree', 'n_neighbors': 4},
 {'algorithm': 'ball_tree', 'n_neighbors': 5},
 {'algorithm': 'ball_tree', 'n_neighbors': 6},
 {'algorithm': 'ball_tree', 'n_neighbors': 7},
 {'algorithm': 'ball_tree', 'n_neighbors': 8},
 {'algorithm': 'kd_tree', 'n_neighbors': 2},
 {'algorithm': 'kd_tree', 'n_neighbors': 3},
 {'algorithm': 'kd_tree', 'n_neighbors': 4},
 {'algorithm': 'kd_tree', 'n_neighbors': 5},
 {'algorithm': 'kd_tree', 'n_neighbors': 6},
 {'algorithm': 'kd_tree', 'n_neighbors': 7},
 {'algorithm': 'kd_tree', 'n_neighbors': 8},
 {'algorithm': 'brute', 'n_neighbors': 2},
 {'algorithm': 'bru

In [18]:
gs.cv_results_['mean_test_score']  # These scores are more reliable as they are the averaged output of CV scores

array([0.95606061, 0.94621212, 0.9469697 , 0.95530303, 0.95530303,
       0.95530303, 0.95530303, 0.95606061, 0.94621212, 0.9469697 ,
       0.95530303, 0.95530303, 0.95530303, 0.95530303, 0.95606061,
       0.94621212, 0.9469697 , 0.95530303, 0.95530303, 0.95530303,
       0.95530303, 0.9469697 , 0.94621212, 0.9469697 , 0.95530303,
       0.95530303, 0.95530303, 0.94621212])

In [19]:
# Select the best model as the final model

final_model = gs.best_estimator_   # Note: This is done automatically if the refit option is true. 

In [20]:
final_model.score(X_test , y_test)  

0.9473684210526315