# Using GridSearchCV/RandomizedSearchCV to find the best model and tune hyperparameters: wine dataset
- The target has three classes: 0, 1, 2

In [70]:
import pandas as pd

# Load the wine dataset from the CSV file (path is relative to the notebook).
df = pd.read_csv("data_wine.csv")

# Preview 10 random rows; the fixed seed keeps the preview identical across re-runs.
print(df.sample(10, random_state=42))

     alcohol  malic_acid   ash  alcalinity_of_ash  magnesium  total_phenols  \
145    13.16        3.57  2.15               21.0        102           1.50   
120    11.45        2.40  2.42               20.0         96           2.90   
74     11.96        1.09  2.30               21.0        101           3.38   
92     12.69        1.53  2.26               20.7         80           1.38   
23     12.85        1.60  2.52               17.8         95           2.48   
158    14.34        1.68  2.70               25.0         98           2.80   
133    12.70        3.55  2.36               21.5        106           1.70   
169    13.40        4.60  2.86               25.0        112           1.98   
28     13.87        1.90  2.80               19.4        107           2.95   
50     13.05        1.73  2.04               12.4         92           2.72   

     flavanoids  nonflavanoid_phenols  proanthocyanins  color_intensity   hue  \
145        0.55                  0.43            

### Or load it from scikit-learn
- Uncomment the cell below if loading the CSV above does not work

In [52]:
# Fallback loader, left commented out on purpose: uncomment every code line
# below to build `df` from scikit-learn's wine dataset instead of the CSV.
# import pandas as pd
# from sklearn.datasets import load_wine

# as_frame=True is what makes the `.frame` attribute used below available
# wine_df = load_wine(as_frame=True)

# df = wine_df.frame
# print(df)

     alcohol  malic_acid   ash  alcalinity_of_ash  magnesium  total_phenols  \
0      14.23        1.71  2.43               15.6      127.0           2.80   
1      13.20        1.78  2.14               11.2      100.0           2.65   
2      13.16        2.36  2.67               18.6      101.0           2.80   
3      14.37        1.95  2.50               16.8      113.0           3.85   
4      13.24        2.59  2.87               21.0      118.0           2.80   
..       ...         ...   ...                ...        ...            ...   
173    13.71        5.65  2.45               20.5       95.0           1.68   
174    13.40        3.91  2.48               23.0      102.0           1.80   
175    13.27        4.28  2.26               20.0      120.0           1.59   
176    13.17        2.59  2.37               20.0      120.0           1.65   
177    14.13        4.10  2.74               24.5       96.0           2.05   

     flavanoids  nonflavanoid_phenols  proanthocyan

In [71]:
# Dimensions as (rows, columns); the recorded run shows (178, 14).
print(df.shape)

(178, 14)


In [53]:
# Step 1: separate the features (X) from the labels (y).
# Only a handful of columns are used for now; more can be added later.
feature_columns = [
    "alcohol",
    "malic_acid",
    "ash",
    "alcalinity_of_ash",
    "magnesium",
    "total_phenols",
]

X = df[feature_columns]
y = df["target"]

In [54]:
# Step 2: print the feature matrix, then the label vector
for heading, values in (("X:\n", X), ("y:\n", y)):
    print(heading, values)

X:
      alcohol  malic_acid   ash  alcalinity_of_ash  magnesium  total_phenols
0      14.23        1.71  2.43               15.6      127.0           2.80
1      13.20        1.78  2.14               11.2      100.0           2.65
2      13.16        2.36  2.67               18.6      101.0           2.80
3      14.37        1.95  2.50               16.8      113.0           3.85
4      13.24        2.59  2.87               21.0      118.0           2.80
..       ...         ...   ...                ...        ...            ...
173    13.71        5.65  2.45               20.5       95.0           1.68
174    13.40        3.91  2.48               23.0      102.0           1.80
175    13.27        4.28  2.26               20.0      120.0           1.59
176    13.17        2.59  2.37               20.0      120.0           1.65
177    14.13        4.10  2.74               24.5       96.0           2.05

[178 rows x 6 columns]
y:
 0      0
1      0
2      0
3      0
4      0
      ..
17

## Approach 1 (BAD): Use train_test_split and manually tune parameters by trial and error

In [55]:
# Split the data into training (70%) and testing (30%) sets.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=42,  # fixed seed: the same rows land in each split on every run
    stratify=y,       # keep the class proportions of y the same in both splits
)

In [56]:
print("Shape of training dataset:", X_train.shape) # 70% of the rows -> (124, 6) in the recorded run
print("Shape of testing dataset:", X_test.shape) # remaining 30% -> (54, 6) in the recorded run

Shape of training dataset: (124, 6)
Shape of testing dataset: (54, 6)


In [72]:
from sklearn.svm import SVC

# Manual tuning: edit one hyperparameter at a time, re-run the cell and compare scores.
model = SVC(kernel='linear', # options: 'linear', 'poly', 'rbf', 'sigmoid'
            C=50,            # options to try: 10, 20, ..., 50
            gamma='auto',    # options: 'auto', 'scale'
            degree = 4,      # NOTE(review): per the SVC docs this only affects kernel='poly' - confirm
            decision_function_shape = 'ovr'
) # Number of combinations = 4 kernels x 5 C values x 2 gammas = 40

# Fit on the training split, then score on the held-out test split.
model.fit(X_train,y_train)
model.score(X_test, y_test)

0.8703703703703703

### How many combinations of hyperparameters ?
- **4 x 5 x 2 = 40**
- This would take a long time if done manually

## Approach 2: Use K Fold Cross validation

**Manually try supplying models with different parameters to the cross_val_score function with 5-fold cross-validation**

In [58]:
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
import numpy as np

In [59]:
# Candidate 1: linear kernel with C=1, scored with 5-fold cross-validation
linear_svc = SVC(kernel='linear', C=1, gamma='auto')
score = cross_val_score(linear_svc, X, y, cv=5)

print(score)           # one score per fold
print(np.mean(score))  # average over the folds

[0.75       0.97222222 0.75       0.94285714 0.88571429]
0.8601587301587301


In [60]:
# Candidate 2: RBF kernel with C=10 (C values to try here: 1, 10)
rbf_svc = SVC(kernel='rbf', C=10, gamma='auto')
score = cross_val_score(rbf_svc, X, y, cv=5)

print(score)           # one score per fold
print(np.mean(score))  # average over the folds

[0.75       0.77777778 0.69444444 0.8        0.82857143]
0.7701587301587302


In [61]:
# Candidate 3: RBF kernel with C=20
rbf_svc_c20 = SVC(kernel='rbf', C=20, gamma='auto')
score = cross_val_score(rbf_svc_c20, X, y, cv=5)

print(score)           # one score per fold
print(np.mean(score))  # average over the folds

[0.75       0.77777778 0.69444444 0.77142857 0.82857143]
0.7644444444444445


### We ran the above on only a few hyperparameter settings

**From the limited results above, kernel='linear' with C=1 gives the best mean score (about 0.86) of the three settings tried**

## Approach 3: Use GridSearchCV
**GridSearchCV does exactly the same thing as above, but with a single call**

In [73]:
from sklearn.model_selection import GridSearchCV

# Every (C, kernel) pair in the grid is evaluated with 5-fold cross-validation.
param_grid = {
    'C': [1, 10, 20],
    'kernel': ['rbf', 'linear'],
}
model_gs = GridSearchCV(
    estimator=SVC(gamma='auto'),
    param_grid=param_grid,
    cv=5,
    return_train_score=False,
)

model_gs.fit(X, y)

In [63]:
# Let's see the CV results as a table (the recorded run shows one row per C/kernel pair).
# NOTE(review): this rebinds `df`, which held the wine dataset until now -
# reload the data before re-running any earlier cell that reads `df`.
df = pd.DataFrame(model_gs.cv_results_)
print(df)

   mean_fit_time  std_fit_time  mean_score_time  std_score_time param_C  \
0       0.004398      0.000787         0.002807        0.000752       1   
1       0.015718      0.007386         0.001993        0.000008       1   
2       0.004308      0.000577         0.002617        0.000502      10   
3       0.036823      0.014200         0.002199        0.000400      10   
4       0.004194      0.000402         0.002810        0.000405      20   
5       0.040441      0.012825         0.002195        0.000421      20   

  param_kernel                         params  split0_test_score  \
0          rbf      {'C': 1, 'kernel': 'rbf'}           0.722222   
1       linear   {'C': 1, 'kernel': 'linear'}           0.750000   
2          rbf     {'C': 10, 'kernel': 'rbf'}           0.750000   
3       linear  {'C': 10, 'kernel': 'linear'}           0.777778   
4          rbf     {'C': 20, 'kernel': 'rbf'}           0.750000   
5       linear  {'C': 20, 'kernel': 'linear'}           0.777778  

In [64]:
# Let's see only the columns that matter when comparing candidates.
# The table is built straight from `model_gs.cv_results_` so this cell does not
# depend on whatever `df` currently holds (other cells rebind that name).

pd.DataFrame(model_gs.cv_results_)[['param_C','param_kernel','mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,rbf,0.758889
1,1,linear,0.860159
2,10,rbf,0.770159
3,10,linear,0.882381
4,20,rbf,0.764444
5,20,linear,0.89381


In [65]:
# Hyperparameter combination with the highest mean cross-validated score
model_gs.best_params_

{'C': 20, 'kernel': 'linear'}

In [66]:
# Mean cross-validated score achieved by the best combination
model_gs.best_score_

0.8938095238095238

## Now let's use GridSearchCV on different models with different hyperparameters

In [67]:
# Step 1) Pick 3 models and, for each one, the hyperparameter grid to search.
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Maps a display name to {'model': estimator to tune, 'params': values to try}.
model_params = {
    'svm': {
        'model': SVC(gamma='auto'),
        'params' : {
            'C': [1,10,20],
            'kernel': ['rbf','linear']
        }
    },
    'random_forest': {
        # Fixed seed so the forest's cross-validated scores are repeatable.
        'model': RandomForestClassifier(random_state=42),
        'params' : {
            'n_estimators': [1,5,10]
        }
    },
    'logistic_regression' : {
        # `multi_class` is left at its default ('auto', the value passed before):
        # passing it explicitly is deprecated in recent scikit-learn releases.
        'model': LogisticRegression(solver='liblinear'),
        'params': {
            'C': [1,5,10]
        }
    }
}


In [74]:
# Step 2) Run GridSearchCV for each of the 3 models above and collect each
# model's best score and best parameters.
scores = []

for model_name, mp in model_params.items():
    # A loop-local name keeps the earlier `model_gs` search intact.
    search = GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    search.fit(X, y)
    scores.append({
        'model': model_name,
        'best_score': search.best_score_,
        'best_params': search.best_params_
    })

# Stored under its own name so the `df` used by other cells is not overwritten.
scores_df = pd.DataFrame(scores, columns=['model','best_score','best_params'])
print(scores_df)

                 model  best_score                    best_params
0                  svm    0.893810  {'C': 20, 'kernel': 'linear'}
1        random_forest    0.876984           {'n_estimators': 10}
2  logistic_regression    0.803651                      {'C': 10}


### Conclusion: SVM with C=20 and kernel='linear' is the best model

## Approach 4: Use RandomizedSearchCV
- **Use RandomizedSearchCV to reduce the number of iterations** by trying only a random subset of the parameter combinations. This is useful when there are too many combinations to try and each training run takes a long time.
- It helps reduce the cost of computation
- It does not guarantee finding the best model


In [69]:
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV

# Same search space as the grid search, but only `n_iter` randomly chosen
# combinations are evaluated. No random_state is passed, so the sampled
# candidates can change from run to run.
param_distributions = {
    'C': [1, 10, 20],
    'kernel': ['rbf', 'linear'],
}
model_rs = RandomizedSearchCV(
    estimator=SVC(gamma='auto'),
    param_distributions=param_distributions,
    n_iter=2,
    cv=5,
    return_train_score=False,
)
model_rs.fit(X, y)

rs_results = pd.DataFrame(model_rs.cv_results_)
rs_results[['param_C','param_kernel','mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,10,linear,0.882381
1,20,rbf,0.764444


### Conclusion: The best SVC hyperparameters found here are C=10 with kernel='linear'. (The result may be different in your case.)
- The result may differ from that of GridSearchCV because only a random subset of the combinations is evaluated