### GridSearchCV

In [2]:
from sklearn import datasets, svm
from sklearn.model_selection import GridSearchCV
# Load the iris data set that the grid search is fitted on.
iris = datasets.load_iris()

# Parameter grid: 2 kernels x 3 values of C = 6 combinations to evaluate.
params = dict(
    kernel=('rbf', 'linear'),
    C=[1, 5, 10],
)

# Base estimator; the search overrides `kernel` and `C` per combination.
svc = svm.SVC()

# Evaluate every combination in the grid on the full iris data.
clf = GridSearchCV(svc, params)
clf.fit(iris.data, iris.target)

# Last expression of the cell: display the best estimator found.
clf.best_estimator_



SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

### RandomizedSearchCV

In [4]:
import numpy as np
from time import time
from scipy.stats import randint as sp_randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier

# Helper that prints the best results of a hyper-parameter search.
def report(results, n_top=3):
    """Print the candidates ranked 1..n_top from a search result mapping.

    Args:
        results: Mapping providing 'rank_test_score', 'mean_test_score',
            'std_test_score' and 'params', each indexable by candidate
            position (called below with a search object's ``cv_results_``).
        n_top: Highest rank to print. Defaults to 3.

    Returns:
        None. The report is written to standard output.
    """
    for rank in range(1, n_top + 1):
        # Every candidate whose rank equals `rank` is printed, so a rank
        # shared by several candidates yields several entries.
        for idx in np.flatnonzero(results['rank_test_score'] == rank):
            print("Model with rank: {0}".format(rank))
            mean_score = results['mean_test_score'][idx]
            std_score = results['std_test_score'][idx]
            print("Mean valdation score: {0:.3f} (std: {1:.3f})".format(mean_score, std_score))
            chosen = results['params'][idx]
            print("Params: {0}".format(chosen))
            print("")
            
# Load the digits data set and split it into features and labels.
digits = load_digits()
X = digits.data
y = digits.target

# Build the classifier whose hyper-parameters are searched.
clf = RandomForestClassifier(n_estimators=20)

# Hyper-parameters and the values / distributions they are sampled from.
param_dist = dict(
    max_depth=[3, None],
    max_features=sp_randint(1, 11),
    min_samples_split=sp_randint(2, 11),
    min_samples_leaf=sp_randint(1, 11),
    bootstrap=[True, False],
    criterion=["gini", "entropy"],
)

# Run the randomized search over the hyper-parameter space.
n_iter_search = 20
random_search = RandomizedSearchCV(
    estimator=clf,
    param_distributions=param_dist,
    n_iter=n_iter_search,
)

# Time only the fit itself.
start = time()
random_search.fit(X, y)
print("RandomizedSearchCV took %.2f seconds for %d candidates params settings"
      % (time() - start, n_iter_search))

report(random_search.cv_results_)



RandomizedSearchCV took 2.64 seconds for 20 candidates params settings
Model with rank: 1
Mean valdation score: 0.929 (std: 0.005)
Params: {'bootstrap': False, 'criterion': 'gini', 'max_depth': None, 'max_features': 4, 'min_samples_leaf': 3, 'min_samples_split': 3}

Model with rank: 2
Mean valdation score: 0.928 (std: 0.016)
Params: {'bootstrap': False, 'criterion': 'gini', 'max_depth': None, 'max_features': 8, 'min_samples_leaf': 2, 'min_samples_split': 5}

Model with rank: 3
Mean valdation score: 0.924 (std: 0.016)
Params: {'bootstrap': True, 'criterion': 'gini', 'max_depth': None, 'max_features': 10, 'min_samples_leaf': 3, 'min_samples_split': 5}



### 超参数优化中随机搜索和网格搜索的对比实验

以随机森林分类器为优化对象，除了树的数量以外，所有影响分类器的参数都被搜索了，随机搜索和网格搜索都在同一个超参数空间对随机森林分类器进行优化。虽然超参数设置组合比较相似，但是随机搜索的运行时间明显比网格搜索要少。随机搜索得到的超参数组合性能稍差一些，但很大程度是由噪声引起的。在实际工程中，一般只选择几个比较重要的参数组合进行优化。

In [5]:
import numpy as np
from time import time
from scipy.stats import randint as sp_randint
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier

# Helper that prints the best results of a hyper-parameter search.
def report(results, n_top=3):
    """Print the candidates ranked 1..n_top from a search result mapping.

    Args:
        results: Mapping providing 'rank_test_score', 'mean_test_score',
            'std_test_score' and 'params', each indexable by candidate
            position (called below with a search object's ``cv_results_``).
        n_top: Highest rank to print. Defaults to 3.

    Returns:
        None. The report is written to standard output.
    """
    for rank in range(1, n_top + 1):
        # Every candidate whose rank equals `rank` is printed, so a rank
        # shared by several candidates yields several entries.
        for idx in np.flatnonzero(results['rank_test_score'] == rank):
            print("Model with rank: {0}".format(rank))
            mean_score = results['mean_test_score'][idx]
            std_score = results['std_test_score'][idx]
            print("Mean valdation score: {0:.3f} (std: {1:.3f})".format(mean_score, std_score))
            chosen = results['params'][idx]
            print("Params: {0}".format(chosen))
            print("")
            
# Load the digits data set and split it into features and labels.
digits = load_digits()
X, y = digits.data, digits.target
# Build the classifier whose hyper-parameters are searched by both strategies.
clf = RandomForestClassifier(n_estimators=20)

print(">>>>RandomizedSearchCV测试结果<<<<")
# Hyper-parameters and the values / distributions they are sampled from.
param_dist = {"max_depth": [3, None],
             "max_features": sp_randint(1,11),
             "min_samples_split": sp_randint(2,11),
             "min_samples_leaf": sp_randint(1,11),
             "bootstrap": [True, False],
             "criterion": ["gini","entropy"]}

# Run the randomized search over the hyper-parameter space.
n_iter_search = 20
random_search = RandomizedSearchCV(estimator=clf,param_distributions=param_dist,n_iter=n_iter_search)
start = time()
random_search.fit(X, y)
print("RandomizedSearchCV took %.2f seconds for %d candidates params settings"
     % ((time()-start), n_iter_search))

report(random_search.cv_results_)

print(">>>>GridSearchCV测试结果<<<<")
# Explicit grid over the same hyper-parameters:
# 2 * 3 * 3 * 3 * 2 * 2 = 216 combinations.
params_grid = {"max_depth": [3, None],
             "max_features": [1,3,10],
             "min_samples_split": [2,3,10],
             "min_samples_leaf": [1,3,10],
             "bootstrap": [True, False],
             "criterion": ["gini","entropy"]}

# Run the exhaustive grid search over the hyper-parameter space.
grid_search = GridSearchCV(estimator=clf,param_grid=params_grid)
start = time()
grid_search.fit(X, y)
# Report the number of combinations the grid search actually evaluated.
# The previous code printed n_iter_search (20), which only describes the
# randomized search and under-reported the grid's candidate count.
print("GridSearchCV took %.2f seconds for %d candidates params settings"
     % ((time()-start), len(grid_search.cv_results_['params'])))
report(grid_search.cv_results_)

>>>>RandomizedSearchCV测试结果<<<<




RandomizedSearchCV took 2.66 seconds for 20 candidates params settings
Model with rank: 1
Mean valdation score: 0.928 (std: 0.012)
Params: {'bootstrap': False, 'criterion': 'gini', 'max_depth': None, 'max_features': 10, 'min_samples_leaf': 1, 'min_samples_split': 3}

Model with rank: 2
Mean valdation score: 0.926 (std: 0.009)
Params: {'bootstrap': False, 'criterion': 'gini', 'max_depth': None, 'max_features': 7, 'min_samples_leaf': 3, 'min_samples_split': 6}

Model with rank: 3
Mean valdation score: 0.919 (std: 0.004)
Params: {'bootstrap': False, 'criterion': 'gini', 'max_depth': None, 'max_features': 6, 'min_samples_leaf': 4, 'min_samples_split': 10}

>>>>GridSearchCV测试结果<<<<




GridSearchCV took 29.36 seconds for 20 candidates params settings
Model with rank: 1
Mean valdation score: 0.934 (std: 0.007)
Params: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': None, 'max_features': 10, 'min_samples_leaf': 1, 'min_samples_split': 3}

Model with rank: 2
Mean valdation score: 0.931 (std: 0.013)
Params: {'bootstrap': True, 'criterion': 'entropy', 'max_depth': None, 'max_features': 10, 'min_samples_leaf': 1, 'min_samples_split': 2}

Model with rank: 3
Mean valdation score: 0.930 (std: 0.008)
Params: {'bootstrap': True, 'criterion': 'gini', 'max_depth': None, 'max_features': 3, 'min_samples_leaf': 1, 'min_samples_split': 2}

