## [作業重點]
了解如何使用 Sklearn 中的 hyper-parameter search 找出最佳的超參數

### 作業
請使用不同的資料集，並使用 hyper-parameter search 的方式，看能不能找出最佳的超參數組合

In [1]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn import datasets, metrics
digits = datasets.load_digits()

  from numpy.core.umath_tests import inner1d


In [2]:
x_train, x_test, y_train, y_test = train_test_split(digits.data, digits.target, test_size=0.25, random_state=4)
clf = GradientBoostingClassifier()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
acc = metrics.accuracy_score(y_test, y_pred)
print("Acuuracy: ", acc)
print (metrics.classification_report(y_test, y_pred))

Acuuracy:  0.9666666666666667
             precision    recall  f1-score   support

          0       0.96      0.98      0.97        48
          1       0.95      0.95      0.95        39
          2       1.00      0.98      0.99        48
          3       0.98      0.93      0.95        43
          4       1.00      0.95      0.98        42
          5       0.96      0.96      0.96        49
          6       1.00      0.97      0.99        39
          7       0.98      1.00      0.99        49
          8       0.89      0.98      0.93        43
          9       0.96      0.96      0.96        50

avg / total       0.97      0.97      0.97       450



In [3]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
n_estimators = [100,200,300]
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_features = ['auto', 'sqrt']
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

In [4]:
clf=GradientBoostingClassifier()
clf_random=RandomizedSearchCV(estimator=clf,param_distributions=random_grid,n_iter=50, cv=3, verbose=2, random_state=42, n_jobs=-1)
clf_random.fit(x_train,y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 11.5min finished


RandomizedSearchCV(cv=3, error_score='raise',
          estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False),
          fit_params=None, iid=True, n_iter=50, n_jobs=-1,
          param_distributions={'n_estimators': [100, 200, 300], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In [5]:
clf_random.best_params_

{'n_estimators': 300,
 'min_samples_split': 5,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': 60}

In [7]:
clf2 = GradientBoostingClassifier(n_estimators=300,min_samples_split=5,min_samples_leaf=4,max_features='sqrt',max_depth=60)
clf2.fit(x_train, y_train)
y_pred = clf2.predict(x_test)
acc = metrics.accuracy_score(y_test, y_pred)
print("Acuuracy: ", acc)
print (metrics.classification_report(y_test, y_pred))

Acuuracy:  0.9822222222222222
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        48
          1       0.95      0.97      0.96        39
          2       1.00      1.00      1.00        48
          3       0.95      0.98      0.97        43
          4       1.00      1.00      1.00        42
          5       0.98      0.96      0.97        49
          6       1.00      0.97      0.99        39
          7       1.00      1.00      1.00        49
          8       0.98      0.95      0.96        43
          9       0.96      0.98      0.97        50

avg / total       0.98      0.98      0.98       450



In [8]:
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
n_estimators = [280,300,320]
max_features = ['sqrt']
min_samples_split = [4, 5, 6]
min_samples_leaf = [3, 4, 5]
max_depth=[50,60,70]

param_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

grid_search = GridSearchCV(clf, param_grid, scoring="neg_mean_squared_error", n_jobs=-1, verbose=1)
grid_result = grid_search.fit(x_train, y_train)

Fitting 3 folds for each of 81 candidates, totalling 243 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 11.4min
[Parallel(n_jobs=-1)]: Done 243 out of 243 | elapsed: 13.2min finished


In [11]:
print(grid_result.best_params_)
clf_bestparam = GradientBoostingClassifier(max_depth=grid_result.best_params_['max_depth'],
                                           n_estimators=grid_result.best_params_['n_estimators'],
                                         max_features=grid_result.best_params_['max_features'],
                                         min_samples_split=grid_result.best_params_['min_samples_split'],
                                         min_samples_leaf=grid_result.best_params_['min_samples_leaf'])
clf_bestparam.fit(x_train,y_train)
y_pred=clf_bestparam.predict(x_test)
acc = metrics.accuracy_score(y_test, y_pred)
print("Acuuracy: ", acc)
print (metrics.classification_report(y_test, y_pred))


{'max_depth': 60, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 6, 'n_estimators': 280}
Acuuracy:  0.9844444444444445
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        48
          1       0.95      1.00      0.97        39
          2       1.00      1.00      1.00        48
          3       0.98      0.95      0.96        43
          4       1.00      1.00      1.00        42
          5       0.98      0.98      0.98        49
          6       1.00      0.97      0.99        39
          7       1.00      1.00      1.00        49
          8       0.98      0.98      0.98        43
          9       0.96      0.96      0.96        50

avg / total       0.98      0.98      0.98       450

