<a href="https://colab.research.google.com/github/agarwalpratik/aiml/blob/main/Hyperparameter_Tuning_Iris.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#What is Hyperparameters?
#
# They are function parameters that we set in the Model Constructor
#

# Goal of Hyperparameter tuning is to IDENTIFY/DISCOVER the best parameter values to achieve the optimal score


In [1]:
import pandas as pd
import numpy as np

In [2]:
data= pd.read_csv('https://gist.githubusercontent.com/curran/a08a1080b88344b0c8a7/raw/639388c2cbc2120a14dcf466e85730eb8be498bb/iris.csv')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [4]:
data.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [5]:
features = data.iloc[:,:-1].values
label = data.iloc[:,-1].values

In [6]:
# In sklearn cross validation is implemented in teh function named cross_val_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

model = KNeighborsClassifier()

scores = cross_val_score(model,
                         features,
                         label,
                         cv=5) #In theory, CV value can be any natural number. (5,10)

In [7]:
scores

array([0.96666667, 1.        , 0.93333333, 0.96666667, 1.        ])

In [8]:
scores.mean()

0.9733333333333334

In [9]:
#Goal: To identify the best Hyperparamter values for KNN that can give score >= 0.97
'''
KNeighborsClassifier(
  n_neighbors= Any positive integers,
  weights= ‘uniform’, ‘distance’
  algorithm= ‘auto’, ‘ball_tree’, ‘kd_tree’, ‘brute’
  leaf_size= Any positive integers,
  p= 1,2,
  metric= 'minkowski'‘cityblock’‘cosine’'euclidean’‘haversine’‘l1’‘l2’‘manhattan’‘nan_euclidean’

'''

"\nKNeighborsClassifier(\n  n_neighbors= Any positive integers,\n  weights= ‘uniform’, ‘distance’\n  algorithm= ‘auto’, ‘ball_tree’, ‘kd_tree’, ‘brute’\n  leaf_size= Any positive integers,\n  p= 1,2,\n  metric= 'minkowski'‘cityblock’‘cosine’'euclidean’‘haversine’‘l1’‘l2’‘manhattan’‘nan_euclidean’\n\n"

In [10]:
#Grid Search

#Step1: Create parameter list and parameter grid

weightParameter = ['uniform','distance']
n_neighborsParameter = np.arange(3,31)
algorithmParameter =['auto', 'ball_tree', 'kd_tree', 'brute']
metricParameter = ['minkowski','cityblock','cosine','euclidean','haversine','l1','l2','manhattan','nan_euclidean']
pParameter = np.arange(1,3)

#For ParamGrid
# key = parameterName
# value = valid Parameter arg

paramGrid = dict(n_neighbors=n_neighborsParameter,
                 weights=weightParameter,
                 algorithm=algorithmParameter,
                 metric=metricParameter,
                 p=pParameter)


#Step2: Initialize Algo

from sklearn.neighbors import KNeighborsClassifier
modelGridSearch = KNeighborsClassifier()

#Step3: Perform GridSearch

from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(modelGridSearch,
                    param_grid=paramGrid,
                    cv=5)

grid.fit(features,label)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py", line 590, in compute
    unique_Y_labels=np.array(unique_Y_labels, dtype=np.intp),
ValueError: invalid literal for int() with base 10: 'setosa'

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 455, in __call__
    return estimator.score(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 764, in score
    return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_classification.py", line 259, in predict
    probabilities = self.predict_proba(X)
  File "/usr/local/

In [11]:
grid.best_score_

0.9866666666666667

In [12]:
grid.best_estimator_

In [13]:
grid.best_params_

{'algorithm': 'auto',
 'metric': 'minkowski',
 'n_neighbors': 10,
 'p': 2,
 'weights': 'distance'}

In [14]:
#Randomized Search Method
# Some of the combinations are randomly selected

from sklearn.model_selection import RandomizedSearchCV

grid = RandomizedSearchCV(modelGridSearch,
                          param_distributions=paramGrid,
                          cv=5)

grid.fit(features,label)

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 455, in __call__
    return estimator.score(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 764, in score
    return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_classification.py", line 259, in predict
    probabilities = self.predict_proba(X)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_classification.py", line 343, in predict_proba
    probabilities = ArgKminClassMode.compute(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py", line 590, in compute
    unique_Y_labels=np.array(unique_Y_labels, dt

In [15]:
grid.best_score_

0.9800000000000001

In [16]:
grid.best_params_

{'weights': 'uniform',
 'p': 1,
 'n_neighbors': 10,
 'metric': 'euclidean',
 'algorithm': 'kd_tree'}

# Try Cross Validation and Hyperparameter tuning for 50 Startups dataset (all possible algos). Create a report showcasing best param and best score for all possible algos

1. LinearRegression

2. KNeighborsRegressor

3. DecisionTreeRegressor

4. RandomForestRegressor

5. BaggingRegressor (in combination with LinearRegression, KNeighborsRegressor, DecisionTreeRegressor)