## [作業重點]
了解如何使用 Sklearn 中的 hyper-parameter search 找出最佳的超參數

### 作業
請使用不同的資料集，並使用 hyper-parameter search 的方式，看能不能找出最佳的超參數組合

In [1]:
from sklearn import datasets, metrics
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
import numpy as np
import pandas as pd

def sklearn_to_df(sklearn_dataset):
    df = pd.DataFrame(sklearn_dataset.data)
    df['target'] = pd.Series(sklearn_dataset.target)
    return df

wine = datasets.load_wine()

df_wine = sklearn_to_df(wine)
df_wine.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,target
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0


In [2]:
X = df_wine
y = df_wine['target'][:,np.newaxis]
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
clf = GradientBoostingClassifier()
clf.fit(x_train, y_train.ravel())
y_pred = clf.predict(x_test)
acc = metrics.accuracy_score(y_test, y_pred)
print("Accuracy: ", acc)
print("Feature importance: ", clf.feature_importances_)

Accuracy:  1.0
Feature importance:  [4.95139251e-03 2.84092978e-05 1.12749294e-04 6.31036617e-05
 1.12181964e-02 6.69533517e-06 1.18828665e-03 5.87567563e-05
 9.42780938e-04 2.66174885e-01 2.82330961e-03 2.51146938e-05
 3.29867103e-02 6.79419610e-01]


In [3]:
n_estimators = [100, 200, 300]
max_depth = [1, 3, 5]
param_grid = dict(n_estimators=n_estimators, max_depth=max_depth)
grid_search = GridSearchCV(clf, param_grid, scoring="accuracy", n_jobs=-1, verbose=1)
grid_result = grid_search.fit(x_train, y_train.ravel())

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:    2.4s finished


In [4]:
print("Best Accuracy: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best Accuracy: 1.000000 using {'n_estimators': 100, 'max_depth': 1}


In [5]:
grid_result.best_params_

{'max_depth': 1, 'n_estimators': 100}

In [6]:
clf_bestparam = GradientBoostingClassifier(max_depth=grid_result.best_params_['max_depth'],
                                           n_estimators=grid_result.best_params_['n_estimators'])
clf_bestparam.fit(x_train, y_train.ravel())
y_pred = clf_bestparam.predict(x_test)
acc = metrics.accuracy_score(y_test, y_pred)
print("Accuracy: ", acc)
print("Feature importance: ", clf.feature_importances_)

Accuracy:  1.0
Feature importance:  [4.95139251e-03 2.84092978e-05 1.12749294e-04 6.31036617e-05
 1.12181964e-02 6.69533517e-06 1.18828665e-03 5.87567563e-05
 9.42780938e-04 2.66174885e-01 2.82330961e-03 2.51146938e-05
 3.29867103e-02 6.79419610e-01]
