## [作業重點]
了解如何使用 Sklearn 中的 hyper-parameter search 找出最佳的超參數

### 作業
請使用不同的資料集，並使用 hyper-parameter search 的方式，看能不能找出最佳的超參數組合

In [16]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model, metrics
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split, KFold, GridSearchCV

# 如果是分類問題，請使用 DecisionTreeClassifier，若為回歸問題，請使用 DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import train_test_split

# 讀取 Wine 資料集
wine = datasets.load_wine()

# 切分訓練集/測試集
x_train, x_test, y_train, y_test = train_test_split(wine.data, wine.target, test_size=0.2, random_state=4)

# 建立一個線性回歸模型
regr = RandomForestRegressor(n_estimators =10)

# 將訓練資料丟進去模型訓練
regr.fit(x_train, y_train)

# 將測試資料丟進模型得到預測結果
y_pred = regr.predict(x_test)

acc = regr.score(x_test, y_test)
print("Acuuracy: ", acc)
print(wine.feature_names)
print("Feature importance: ", regr.feature_importances_)

Acuuracy:  0.9564835164835166
['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']
Feature importance:  [0.09763779 0.00308877 0.00210554 0.00546034 0.00240367 0.
 0.48141455 0.         0.00442492 0.04766857 0.03003609 0.1489494
 0.17681037]


In [17]:
# 設定要訓練的超參數組合
n_estimators = [100, 200, 300]
max_depth = [1, 3, 5]
max_features = np.linspace(0.01,1.0)
param_grid = dict(n_estimators=n_estimators, max_depth=max_depth, max_features=max_features)

## 建立搜尋物件，放入模型及參數組合字典 (n_jobs=-1 會使用全部 cpu 平行運算)
grid_search = GridSearchCV(regr, param_grid, scoring="neg_mean_squared_error", n_jobs=-1, verbose=1)

# 開始搜尋最佳參數
grid_result = grid_search.fit(x_train, y_train)


Fitting 3 folds for each of 450 candidates, totalling 1350 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 352 tasks      | elapsed:   14.1s
[Parallel(n_jobs=-1)]: Done 852 tasks      | elapsed:   35.3s
[Parallel(n_jobs=-1)]: Done 1350 out of 1350 | elapsed:   59.2s finished


In [18]:
# 印出最佳結果與最佳參數
print("Best Accuracy: %f using %s" % (grid_result.best_score_, grid_result.best_params_))


Best Accuracy: -0.037221 using {'max_depth': 5, 'max_features': 0.4342857142857143, 'n_estimators': 300}


In [19]:
grid_result

GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                             max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=10, n_jobs=None,
                                             oob_score=False, random_state=No...
       0.51510204, 0.53530612, 0.5555102 , 0.57571429, 0.59591837,
       0.61612245, 0.63632653, 0.65653061, 0.67673469, 0.69693878,
       0.71714286, 0.73734694, 0.7

In [20]:
regr = RandomForestRegressor(n_estimators =100, max_features= 0.39387755102040817, max_depth = 5)

# 將訓練資料丟進去模型訓練
regr.fit(x_train, y_train)

# 將測試資料丟進模型得到預測結果
y_pred = regr.predict(x_test)

acc = regr.score(x_test, y_test)
print("Acuuracy: ", acc)
print(boston.feature_names)
print("Feature importance: ", regr.feature_importances_)

Acuuracy:  0.9720330232186079
['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']
Feature importance:  [0.06935057 0.01232787 0.00433598 0.01845472 0.01442094 0.05135944
 0.25034775 0.00140773 0.0265639  0.0615348  0.08920366 0.18564556
 0.21504709]
