In [1]:
import pandas as pd
import sklearn
sklearn.__version__

'0.22.2.post1'

In [2]:
# 重复性设置
seed = 2020

# 基本使用
## 参数不冲突时
参数不冲突时，直接用一个字典传递参数和要对应的候选值给GridSearchCV即可  
我这里的参数冲突指的是类似下面这种情况：  
① 参数取值受限
参数a='a'时，参数b只能取'b'  
参数a='A'时，参数b能取'b'或'B'  
② 参数互斥
参数 a 或 b 二者只能选一个  

In [12]:
from sklearn import datasets
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
iris = datasets.load_iris()
model = SVC(random_state=seed)

# 需调参数及候选值
parameters = {
    'C': [0.1, 1, 10], 
    'kernel': ['rbf', 'linear']
}

# 评价依据
## 可用评价指标
## https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
## 使用自定义评价指标
from sklearn.metrics import make_scorer
def custom_loss_func(y_true, y_pred):
    return len(y_true[y_true!=y_pred])/len(y_true)
# greater_is_better=False，指标越小越好
# needs_proba=False，指标通过标签计算，不是通过概率
loss_socre = make_scorer(custom_loss_func, greater_is_better=False, needs_proba=False)
scores = {
    'acc': 'accuracy',         # 准确率
    'f1_mi': 'f1_micro',       # 一种多分类f1值
    'loss': loss_socre         # 自定义评价指标
}

# 网格搜索实例
gs = GridSearchCV(
    model,
    parameters,
    cv=5,                      # 交叉验证数
    scoring=scores,            # 验证集上的评价指标
    refit='f1_mi',             # 在此指标下，用最优表现的参数重新训练模型
#     return_train_score=True,   # gs.cv_results_额外保存训练集的评价结果
    verbose=1,                 # 日志信息，默认0不输出
    n_jobs=2                   # 并行加速
)

# 一共要跑的任务数=参数1候选值*...*参数i候选值*交叉验证数
# 这里就是3*2*5=30
gs.fit(iris.data, iris.target)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  30 out of  30 | elapsed:    0.0s finished


GridSearchCV(cv=5, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=2020, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=2,
             param_grid={'C': [0.1, 1, 10], 'kernel': ['rbf', 'linear']},
             pre_dispatch='2*n_jobs', refit='f1_mi', return_train_score=False,
             scoring={'acc': 'accuracy', 'f1_mi': 'f1_micro',
                      'loss': make_scorer(custom_loss_func, greater_is_better=False)},
             verbose=1)

In [13]:
print("最优参数")
print(gs.best_params_)
print("最佳模型的评分")
print(gs.best_score_)
print("最优模型")
best_model = gs.best_estimator_  # GridSearchCV的refit参数不能为False

最优参数
{'C': 1, 'kernel': 'linear'}
最佳模型的评分
0.9800000000000001
最优模型


In [14]:
gs.cv_results_.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_C', 'param_kernel', 'params', 'split0_test_acc', 'split1_test_acc', 'split2_test_acc', 'split3_test_acc', 'split4_test_acc', 'mean_test_acc', 'std_test_acc', 'rank_test_acc', 'split0_test_f1_mi', 'split1_test_f1_mi', 'split2_test_f1_mi', 'split3_test_f1_mi', 'split4_test_f1_mi', 'mean_test_f1_mi', 'std_test_f1_mi', 'rank_test_f1_mi', 'split0_test_loss', 'split1_test_loss', 'split2_test_loss', 'split3_test_loss', 'split4_test_loss', 'mean_test_loss', 'std_test_loss', 'rank_test_loss'])

In [15]:
"""
用表格查看训练信息
"""
cv_results = pd.DataFrame(gs.cv_results_)
# 查看其他指标的结果和参数，比如这里按平均准确率排序
cv_results = cv_results.sort_values(by="mean_test_acc", ascending=False)
shown_columns = ["mean_test_"+col for col in scores.keys()] + ["params"]
cv_results[shown_columns].head(3)

Unnamed: 0,mean_test_acc,mean_test_f1_mi,mean_test_loss,params
3,0.98,0.98,-0.02,"{'C': 1, 'kernel': 'linear'}"
4,0.98,0.98,-0.02,"{'C': 10, 'kernel': 'rbf'}"
1,0.973333,0.973333,-0.026667,"{'C': 0.1, 'kernel': 'linear'}"


## 参数冲突时

参数冲突时，互斥参数搜索空间用不同字典来描述  
将这些字典放到列表中再传递给GridSearchCV

In [7]:
from sklearn import datasets
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
iris = datasets.load_iris()
model = SVC(random_state=seed)

parameters = [
    {
        'C': [0.1, 1, 10], 
        'kernel': ['rbf', 'linear']
    },
    {
        'C': [0.1, 1, 10],
        'kernel': ['poly'],
        'degree': [1, 3, 5]
    }
]

gs = GridSearchCV(
    model,
    parameters,
    cv=5,
    scoring='accuracy',
    verbose=1,
    n_jobs=2,
)

gs.fit(iris.data, iris.target)

Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  75 out of  75 | elapsed:    0.0s finished


GridSearchCV(cv=5, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=2020, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=2,
             param_grid=[{'C': [0.1, 1, 10], 'kernel': ['rbf', 'linear']},
                         {'C': [0.1, 1, 10], 'degree': [1, 3, 5],
                          'kernel': ['poly']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=1)

In [8]:
print("最优参数")
print(gs.best_params_)
print("最佳模型的评分")
print(gs.best_score_)
print("最优模型")
best_model = gs.best_estimator_

最优参数
{'C': 0.1, 'degree': 3, 'kernel': 'poly'}
最佳模型的评分
0.9866666666666667
最优模型


# 特征选择+模型复合调参

管道可以用来连接多个操作，比如特征选择+模型训练，数据处理+模型训练等等    
如果这些操作也有参数可调，可以用 GridSearchCV 对它们一起调参

In [9]:
from sklearn import datasets
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
iris = datasets.load_iris()

pipe = Pipeline([
    ('selector', SelectKBest()),       # 特征选择
    ('model', SVC(random_state=seed))  # 模型
])

# “双下划线”指定要调整的部件及其参数
parameters = [
    {
        'selector__score_func': [chi2, f_classif],
        'selector__k': [2, 3, 4],
        'model__C': [0.1, 1, 10], 
        'model__kernel': ['rbf', 'linear']
    },
    {
        'selector__score_func': [chi2, f_classif],
        'selector__k': [2, 3, 4],
        'model__C': [0.1, 1, 10],
        'model__kernel': ['poly'],
        'model__degree': [1, 3, 5]
    }
]


gs = GridSearchCV(
    pipe,
    parameters,
    cv=5,
    scoring='accuracy',
    verbose=1,
    n_jobs=2,
)

gs.fit(iris.data, iris.target)

Fitting 5 folds for each of 90 candidates, totalling 450 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done 440 tasks      | elapsed:    0.9s
[Parallel(n_jobs=2)]: Done 447 out of 450 | elapsed:    0.9s remaining:    0.0s
[Parallel(n_jobs=2)]: Done 450 out of 450 | elapsed:    0.9s finished


GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('selector',
                                        SelectKBest(k=10,
                                                    score_func=<function f_classif at 0x0000028493B1A5E8>)),
                                       ('model',
                                        SVC(C=1.0, break_ties=False,
                                            cache_size=200, class_weight=None,
                                            coef0=0.0,
                                            decision_function_shape='ovr',
                                            degree=3, gamma='scale',
                                            kernel='rbf', max_iter=-1,
                                            probability=False,
                                            random_state=2020, shr...
                          'selector__score_func': [<function chi2 at 0x0000028493B1A168>,
          

In [10]:
print("最优参数")
print(gs.best_params_)
print("最佳模型的评分")
print(gs.best_score_)
print("最优组合")
# best_pipe = gs.best_estimator_
best_selector = gs.best_estimator_[0]
best_model = gs.best_estimator_[1]

最优参数
{'model__C': 0.1, 'model__degree': 3, 'model__kernel': 'poly', 'selector__k': 4, 'selector__score_func': <function chi2 at 0x0000028493B1A168>}
最佳模型的评分
0.9866666666666667
最优组合


In [11]:
best_model

SVC(C=0.1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='poly',
    max_iter=-1, probability=False, random_state=2020, shrinking=True,
    tol=0.001, verbose=False)