# Hyperparameter Tuning

Grid Search Cross-Validation

In [16]:
from sklearn import datasets
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [18]:
# Load the breast-cancer dataset and pull out the feature matrix and the labels.
bcd = datasets.load_breast_cancer()
x, y = bcd.data, bcd.target

# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Baseline k-NN classifier with a hand-picked k=8, trained on the training split
# (fit() returns the estimator itself, so the call can be chained).
knn = KNeighborsClassifier(n_neighbors=8).fit(x_train, y_train)
y_prediction = knn.predict(x_test)

In [19]:
from sklearn.model_selection import GridSearchCV

# Candidate neighbourhood sizes to try: k = 1, 2, ..., 49.
param_grid = {'n_neighbors': np.arange(1, 50)}

# Exhaustive search over param_grid using 5-fold cross-validation.
# NOTE: the search is fit on the full dataset (x, y) rather than on x_train, so
# x_test is not an untouched hold-out set for this tuned model.
knn = KNeighborsClassifier()
knn_cv = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5).fit(x, y)

# best_params_ is the winning setting; best_score_ is the mean cross-validated
# score of that setting (it is not a score on a separate test set).
print(knn_cv.best_params_, knn_cv.best_score_, sep="\n")

{'n_neighbors': 13}
0.9332401800962584


In [21]:
from scipy.stats import randint # randint(1, 9).rvs(2)

#from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import RandomizedSearchCV

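# GridSearchCV can be computationally expensive, especially when searching a large hyperparameter space with several hyperparameters. RandomizedSearchCV instead samples a fixed number of parameter settings (n_iter, default 10) from the given lists/distributions.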

# Search space for RandomizedSearchCV: a dictionary with parameter names (strings)
# as keys and, as values, either a list of candidates or a distribution to sample
# from. Distributions must provide an rvs() method.
param = {
    "max_depth": [3, None],
    "max_features": randint(1, 9),      # integers 1..8 (upper bound is exclusive); a list such as [2, 4, 6, 7] also works
    "min_samples_leaf": randint(1, 9),  # integers 1..8
}

# Both the tree (random feature subsets when max_features < n_features) and the
# search (random parameter draws) are stochastic. Seed them so that
# Restart & Run All reproduces the same result; 42 matches the train/test split seed.
tree = DecisionTreeClassifier(random_state=42)

# cv=5 is passed explicitly. cv=None would use scikit-learn's default, which is
# 5-fold since version 0.22 (it was 3-fold in older releases).
tree_cv = RandomizedSearchCV(tree, param, cv=5, random_state=42)

# Tune on the training split only, so x_test stays unseen during the search.
tree_cv.fit(x_train, y_train)

print(tree_cv.best_params_)
print(tree_cv.best_score_)  # mean cross-validated score of the best setting

# Evaluate the refit best estimator on the held-out test split.
y_pred = tree_cv.predict(x_test)  # predictions kept for later inspection
score = tree_cv.score(x_test, y_test)

print(score)

{'max_depth': None, 'max_features': 3, 'min_samples_leaf': 4}
0.9252747252747252
0.9122807017543859
