# 機器學習模型超參數最佳化


**資料集:**  
&nbsp; sklearn中的波士頓房價資料集

**機器學習演算法:**  
&nbsp; 隨機森林(Random forest, RF), 支持向量機(support vector machine, SVM), k最近鄰居(k-nearest neighbor, KNN)

**超參數最佳化演算法:**  
&nbsp; 網格搜尋(Grid search), 隨機搜尋(Random search)

**效能衡量:**  
&nbsp; 均方誤差(Mean squared error, MSE)

In [1]:
# 導入所需套件
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score  #衡量分數
from sklearn.neighbors import KNeighborsRegressor  #KNN
from sklearn.svm import SVR 

from sklearn.model_selection import GridSearchCV
from scipy.stats import randint as sp_randint
from scipy import stats  #處理亂數產生
from sklearn.model_selection import RandomizedSearchCV

In [2]:
# Load the Boston housing dataset as a (features, target) pair.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 — confirm the installed version still provides it.
X, y = datasets.load_boston(return_X_y=True)

In [3]:
# Display the full object returned by the loader as the notebook cell output.
datasets.load_boston()

{'data': array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,
         4.9800e+00],
        [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,
         9.1400e+00],
        [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,
         4.0300e+00],
        ...,
        [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
         5.6400e+00],
        [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
         6.4800e+00],
        [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
         7.8800e+00]]),
 'target': array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,
        18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,
        15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,
        13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,
        21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,
        35.4, 24.7, 3

## 機器學習模式基準線: 以超參數預設值迴歸

In [3]:
# Baseline: random forest regressor with default hyperparameters.
clf = RandomForestRegressor()
# 3-fold cross-validation; the scorer returns negated MSE, so the sign is
# flipped before printing.
scores = cross_val_score(clf, X, y, cv=3, scoring='neg_mean_squared_error')
print("MSE:" + str(-scores.mean()))

# Without hyperparameter tuning, the error is large.

MSE:29.324592211561935


In [4]:
# Baseline: support vector regression with gamma='scale' and every other
# setting left at its default.
clf = SVR(gamma='scale')
scores = cross_val_score(
    estimator=clf,
    X=X,
    y=y,
    scoring='neg_mean_squared_error',
    cv=3,
)
# The scorer yields negated MSE; negate the mean to report a positive error.
print("MSE:", -scores.mean(), sep="")
# Without hyperparameter tuning, the error is large.

MSE:77.42951812579331


In [5]:
# Baseline: k-nearest-neighbours regressor with default hyperparameters,
# scored by 3-fold cross-validation.
clf = KNeighborsRegressor()
scores = cross_val_score(
    estimator=clf,
    X=X,
    y=y,
    scoring='neg_mean_squared_error',
    cv=3,
)
# Scores are negated MSE values, so the mean is negated before printing.
print("MSE:", -scores.mean(), sep="")
# Without hyperparameter tuning, the error is large.

MSE:81.48773186343571


## 超參數演算法1: 網格搜尋(Grid Search)
直接指定超參數的組合範圍，搜尋所有給定的超參數配置，每一組參數都訓練完成，再根據驗證集的結果選擇最佳參數。

**優點:**
* 容易實現  

**缺點:**  
* 費時
* 只對類別型(categorical)超參數有效率

In [6]:
# Grid search over a random forest regressor.
# Hyperparameter configuration space: the listed values are the candidates
# handed to GridSearchCV.
rf_params = dict(
    n_estimators=[10, 20, 30],
    max_depth=[15, 20, 30, 50],
)
# Candidate dimensions that are currently disabled:
#   'max_features': ['sqrt', 0.5]
#   'min_samples_leaf': [1, 2, 4, 8]
#   'bootstrap': [True, False]
#   'criterion': ['mse', 'mae']
clf = RandomForestRegressor(random_state=0)
grid = GridSearchCV(
    estimator=clf,
    param_grid=rf_params,
    scoring='neg_mean_squared_error',
    cv=3,
)
grid.fit(X, y)
print(grid.best_params_)  # best hyperparameter combination found
# best_score_ is a negated MSE; negate it to print a positive error.
print("MSE:", -grid.best_score_, sep="")

{'max_depth': 30, 'n_estimators': 10}
MSE:28.068229100920448


In [7]:
# Grid search over support vector regression.
# The search space keeps the name rf_params used by the other cells.
rf_params = dict(
    C=[1, 10, 100],
    kernel=['poly', 'rbf', 'sigmoid'],
    epsilon=[0.01, 0.1, 1],
)
clf = SVR(gamma='scale')
grid = GridSearchCV(
    estimator=clf,
    param_grid=rf_params,
    scoring='neg_mean_squared_error',
    cv=3,
)
grid.fit(X, y)
print(grid.best_params_)
# best_score_ is a negated MSE; negate it to print a positive error.
print("MSE:", -grid.best_score_, sep="")

{'C': 100, 'epsilon': 0.01, 'kernel': 'poly'}
MSE:67.07644831331122


In [8]:
# Grid search over the k-nearest-neighbours regressor.
# Only the neighbour count is searched.
rf_params = dict(n_neighbors=[2, 3, 5, 7, 10])
clf = KNeighborsRegressor()
grid = GridSearchCV(
    estimator=clf,
    param_grid=rf_params,
    scoring='neg_mean_squared_error',
    cv=3,
)
grid.fit(X, y)
print(grid.best_params_)
# best_score_ is a negated MSE; negate it to print a positive error.
print("MSE:", -grid.best_score_, sep="")

{'n_neighbors': 5}
MSE:81.48773186343571


## 超參數演算法2: 隨機搜尋(Random Search)
指定超參數的範圍，用均勻分布進行參數抽樣，在搜尋空間中隨機搜尋超參數組合，用抽到的參數進行訓練，再根據驗證集的結果選擇最佳參數。

**優點:**
* 比網格搜尋更有效率
* 支援平行處理

**缺點:**  
* 沒有考慮先前結果
* 對條件超參數沒有效率

In [9]:
# Random search over a random forest regressor.
# Hyperparameter configuration space: integer hyperparameters are sampled from
# scipy randint distributions; the criterion is picked from a list.
rf_params = {
    'n_estimators': sp_randint(10, 100),
    'max_features': sp_randint(1, 13),
    'max_depth': sp_randint(5, 50),
    'min_samples_split': sp_randint(2, 11),
    'min_samples_leaf': sp_randint(1, 11),
    # NOTE(review): newer scikit-learn releases rename these criteria to
    # 'squared_error' / 'absolute_error' — confirm against the installed version.
    'criterion': ['mse', 'mae'],
}
n_iter_search = 20  # number of sampled configurations; increase if time permits
clf = RandomForestRegressor(random_state=0)
# Seed the parameter sampler as well as the estimator; otherwise every run
# draws different configurations and the reported result cannot be reproduced.
Random = RandomizedSearchCV(
    clf,
    param_distributions=rf_params,
    n_iter=n_iter_search,
    cv=3,
    scoring='neg_mean_squared_error',
    random_state=0,
)
Random.fit(X, y)
print(Random.best_params_)
# best_score_ is a negated MSE; negate it to print a positive error.
print("MSE:" + str(-Random.best_score_))

{'criterion': 'mse', 'max_depth': 37, 'max_features': 5, 'min_samples_leaf': 3, 'min_samples_split': 7, 'n_estimators': 64}
MSE:26.689227867179586


In [10]:
# Random search over support vector regression.
# C and epsilon are sampled from continuous uniform distributions; the kernel
# is picked from a list. The name rf_params is the one used by the other cells.
rf_params = {
    'C': stats.uniform(0, 50),
    'kernel': ['poly', 'rbf', 'sigmoid'],
    'epsilon': stats.uniform(0, 1),
}
n_iter_search = 20  # number of sampled configurations
clf = SVR(gamma='scale')
# Seed the parameter sampler; otherwise every run draws different
# configurations and the reported result cannot be reproduced.
Random = RandomizedSearchCV(
    clf,
    param_distributions=rf_params,
    n_iter=n_iter_search,
    cv=3,
    scoring='neg_mean_squared_error',
    random_state=0,
)
Random.fit(X, y)
print(Random.best_params_)
# best_score_ is a negated MSE; negate it to print a positive error.
print("MSE:" + str(-Random.best_score_))

{'C': 37.112506273972016, 'epsilon': 0.035832345868149984, 'kernel': 'poly'}
MSE:59.69355396806511


In [12]:
# Random search over the k-nearest-neighbours regressor.
# Only the neighbour count is searched, sampled from a scipy randint
# distribution.
rf_params = {
    'n_neighbors': sp_randint(1, 20),
}
n_iter_search = 10  # number of sampled configurations
clf = KNeighborsRegressor()
# Seed the parameter sampler; otherwise every run draws different
# configurations and the reported result cannot be reproduced.
Random = RandomizedSearchCV(
    clf,
    param_distributions=rf_params,
    n_iter=n_iter_search,
    cv=3,
    scoring='neg_mean_squared_error',
    random_state=0,
)
Random.fit(X, y)
print(Random.best_params_)
# best_score_ is a negated MSE; negate it to print a positive error.
print("MSE:" + str(-Random.best_score_))

{'n_neighbors': 6}
MSE:80.83005201647829
