### Hyperparameter Tuning
* Optimize model performance through tuning hyperparameters 
    * This is difficult to intuitively do 
* Any parameter can be tuned
    * To get the names of all parameters use model.get_params() 


In [15]:
#practice using random forest on wine quality csv

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, HalvingGridSearchCV
df = pd.read_csv("winequality-red.csv")
df = df.drop_duplicates()

df.head()
X  = df.drop('quality', axis = 1)
y = df['quality']


In [21]:
stats = pd.DataFrame(columns = ['hp tuning','accuracy', 'precision', 'recall'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 16)
model = RandomForestClassifier()
model.fit(X_train, y_train)
pred = model.predict(X_test)
acc = accuracy_score(y_test, pred)
pre = precision_score(y_test, pred, average = 'weighted')
re = recall_score(y_test, pred, average = 'weighted')
stats.loc[len(stats.index)] = ['default', acc, pre, re]

stats.head()

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,hp tuning,accuracy,precision,recall
0,default,0.552941,0.515443,0.552941


In [24]:
print(model.get_params())


{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


#### Exhaustive Grid Search
* Try every possible combination for given range of hyperparameters 
* Use GridSearchCV(model, param_grid)
    * param_grid is a list of dictionaries, where each dict is the parameter name and values to try
    * param_grid = [{'param' : [values], 'param2' : [values]}]
* This approach scales poorly with bigger HP spaces 


In [None]:
param_grid = [{
               'n_estimators' : range(10,1000,10)
            #    'max_samples' : [x/10 for x in range(1,11)]
               }]
exhaustive = GridSearchCV(model, param_grid = param_grid, scoring= ('accuracy'))

exhaustive.fit(X_train,y_train)

#### Randomized Grid Search
* Randomly search the grid space for HPs
* Use RandomizedSearchCV(model, param_dist)
    * param_dist is a list of dictionaries, where each dict is the parameter names and distributions


#### Bayesian Optimization
* Update HP given new information (conditional probabilities)
* Find probability of hypothesis given the data 
* Maximize the acquisition function with each HP value 