## [作業重點]
了解如何使用 Sklearn 中的 hyper-parameter search 找出最佳的超參數

### 作業
請使用不同的資料集，並使用 hyper-parameter search 的方式，看能不能找出最佳的超參數組合

In [1]:
# 將需要的都import進來
import os
import copy
import time
import math
import numpy             as np
import pandas            as pd
import seaborn           as sns
import datetime          as dt
import warnings
import matplotlib.pyplot as plt
from scipy                   import stats
from itertools               import compress
from sklearn.tree            import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics         import roc_curve,mean_squared_error,r2_score,accuracy_score,precision_score,recall_score,fbeta_score
from sklearn.ensemble        import GradientBoostingRegressor,GradientBoostingClassifier,RandomForestClassifier,RandomForestRegressor
from sklearn.datasets        import load_boston,load_wine,load_breast_cancer
from sklearn.linear_model    import LogisticRegression,LinearRegression,Lasso,Ridge
from sklearn.preprocessing   import LabelEncoder, MinMaxScaler, StandardScaler,OneHotEncoder
from sklearn.model_selection import cross_val_score,train_test_split,KFold,GridSearchCV
from IPython.display         import YouTubeVideo

# Short names for frequently used metrics; MME / LE / OHE are ready-made
# preprocessing instances rather than aliases of the classes.
MSE  = mean_squared_error
ACC  = accuracy_score
MME  = MinMaxScaler()
LE   = LabelEncoder()
OHE  = OneHotEncoder()

# Notebook-wide settings.
# NOTE(review): the 'ignore' filter silences every warning for the rest of the
# session, including any deprecation or data-conversion warnings from libraries.
warnings.filterwarnings('ignore')
# IPython magic (not plain Python): only valid inside a notebook / IPython session.
%matplotlib inline

# Path of the data folder, stored as data_folder.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
data_folder = 'C:/Users/Ynitsed/Documents/GitHub/2nd-ML100Days/data'

In [2]:
# Load the breast_cancer dataset bundled with scikit-learn.
t001 = load_breast_cancer()

# Wrap features (X) and labels (Y) in DataFrames so they are easy to inspect.
train_X_t1 = pd.DataFrame(data=t001.data, columns=t001.feature_names)
train_Y_t1 = pd.DataFrame(t001.target, columns=["target"])

# Show the shape and the first rows of X, then of Y.
for frame in (train_X_t1, train_Y_t1):
    print(frame.shape)
    print(frame.head())

(569, 30)
   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0           0.27760          0.3001              0.14710         0.2419   
1           0.07864          0.0869              0.07017         0.1812   
2           0.15990          0.1974              0.12790         0.2069   
3           0.28390          0.2414              0.10520         0.2597   
4           0.13280          0.1980              0.10430         0.1809   

   mean fractal dimension  ...  worst radius  worst texture  worst perimeter  \
0 

In [3]:
# Hold out 10% of the rows as the test set; the seed keeps the split fixed.
x_train, x_test, y_train, y_test = train_test_split(
    train_X_t1,
    train_Y_t1,
    test_size=0.1,
    random_state=4,
)

# Inspect each piece of the split: shape first, then the first rows.
for part in (x_train, y_train, x_test, y_test):
    print(part.shape)
    print(part.head())

(512, 30)
     mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
345        10.26         14.71           66.20      321.6          0.09882   
319        12.43         17.00           78.60      477.3          0.07557   
72         17.20         24.52          114.20      929.4          0.10710   
298        14.26         18.17           91.22      633.1          0.06576   
568         7.76         24.54           47.92      181.0          0.05263   

     mean compactness  mean concavity  mean concave points  mean symmetry  \
345           0.09159         0.03581              0.02037         0.1633   
319           0.03454         0.01342              0.01699         0.1472   
72            0.18300         0.16920              0.07944         0.1927   
298           0.05220         0.02475              0.01374         0.1635   
568           0.04362         0.00000              0.00000         0.1587   

     mean fractal dimension  ...  worst radius  worst text

In [4]:
# GBC1: baseline gradient-boosting classifier with a hand-picked max_depth=5.
# The arguments `loss='deviance'`, `min_impurity_split` and `presort` were
# dropped: they were all left at their default values and are no longer
# accepted by recent scikit-learn releases, so omitting them keeps the same
# model while letting the cell run on both old and new versions.
# random_state is fixed so the baseline is reproducible from run to run.
GBC1 = GradientBoostingClassifier(learning_rate=0.1,
                                  n_estimators=100,
                                  subsample=1.0,
                                  criterion='friedman_mse',
                                  min_samples_split=2,
                                  min_samples_leaf=1,
                                  min_weight_fraction_leaf=0.0,
                                  max_depth=5,
                                  min_impurity_decrease=0.0,
                                  init=None,
                                  random_state=4,
                                  max_features=None,
                                  verbose=0,
                                  max_leaf_nodes=None,
                                  warm_start=False,
                                  validation_fraction=0.1,
                                  n_iter_no_change=None,
                                  tol=0.0001)
# y_train is a single-column DataFrame; flatten it to the 1-D label array
# that classifiers expect instead of relying on an implicit conversion.
GBC1.fit(x_train, y_train.values.ravel())
# Accuracy on the training data (1.0 here means the training set is fit perfectly).
print(GBC1.score(x_train, y_train))
# Feed x_test to the fitted classifier to obtain the predicted labels y_pred.
y_pred = GBC1.predict(x_test)
print(y_pred.shape)
print(pd.DataFrame(y_pred).head())
# Compare the predictions with the true labels y_test.
# For 0/1 labels the mean squared error equals the misclassification rate.
print("Mean squared error: %.2f"% MSE(y_test, y_pred))
print("Accuracy: ", ACC(y_test, y_pred))

1.0
(57,)
   0
0  0
1  1
2  0
3  0
4  0
Mean squared error: 0.11
Accuracy:  0.8947368421052632


In [5]:
# Hyper-parameter grid to search: 3 values of n_estimators x 3 values of
# max_depth = 9 candidate combinations.
n = [50, 100, 150]
d = [1, 3, 5]
grid_param = dict(n_estimators=n, max_depth=d)

# Build the search object from the model and the parameter grid
# (n_jobs=-1 uses every CPU core in parallel).
# cv=3 is stated explicitly: the library default is 5-fold in current
# scikit-learn releases, which would silently turn the 27 fits described
# below into 45.
grid_search = GridSearchCV(GBC1, grid_param, scoring="accuracy", cv=3, n_jobs=-1, verbose=1)

# Run the search. y_train is a single-column DataFrame, so flatten it to 1-D.
grid_result = grid_search.fit(x_train, y_train.values.ravel())

# 3-fold cross-validation x 9 parameter combinations = 27 model fits in total
# (plus one final refit of the best combination on the whole training set).

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:    5.0s finished


In [6]:
# Report the best mean cross-validated accuracy and the parameters that achieved it.
best_score = grid_result.best_score_
best_params = grid_result.best_params_
print("Best Accuracy: %f using %s" % (best_score, best_params))

Best Accuracy: 0.958984 using {'max_depth': 1, 'n_estimators': 150}


In [7]:
# Rebuild the classifier with the best hyper-parameters found by the search.
# Unpacking best_params_ picks up every searched parameter, so this cell keeps
# working if the grid is extended; random_state is fixed so the result is
# reproducible from run to run.
GBC2 = GradientBoostingClassifier(random_state=4, **grid_result.best_params_)
# Fit on the full training set (y flattened from a single-column DataFrame to 1-D).
GBC2.fit(x_train, y_train.values.ravel())
# Accuracy on the training data.
print(GBC2.score(x_train, y_train))
# Feed x_test to the fitted classifier to obtain the predicted labels y_pred.
y_pred = GBC2.predict(x_test)
print(y_pred.shape)
print(pd.DataFrame(y_pred).head())
# Compare the predictions with the true labels y_test.
# For 0/1 labels the mean squared error equals the misclassification rate.
print("Mean squared error: %.2f"% MSE(y_test, y_pred))
print("Accuracy: ", ACC(y_test, y_pred))

0.9921875
(57,)
   0
0  1
1  1
2  0
3  0
4  0
Mean squared error: 0.09
Accuracy:  0.9122807017543859


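於是發現 GridSearchCV 選出的超參數組合（max_depth=1、n_estimators=150）在測試集上的準確率為 0.912，略高於原本自行設定的參數（max_depth=5、n_estimators=100）的 0.895。不過測試集只有 57 筆資料，兩者實際上只差 1 筆預測正確的樣本，因此這個差距僅供參考，不足以斷定搜尋出的參數一定較佳。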