# 模型调参
- 网格调参
- 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the feature matrix X and the label vector y from local pickle files.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only load files from a trusted source.
import pickle
# encoding='gbk' is passed for the feature file only — presumably it contains GBK-encoded byte strings; TODO confirm.
with open('29_features.pkl','rb') as f:
    X = pickle.load(f, encoding = 'gbk')
with open('new_label.pkl','rb') as f:
    y = pickle.load(f)

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples as the test set; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state = 2018)

In [4]:
import warnings
# Suppress every warning for the rest of the notebook.
# NOTE(review): this also hides library deprecation and convergence warnings raised by the cells below.
warnings.filterwarnings('ignore')

## 简化版模型评估

In [8]:
from sklearn.metrics import accuracy_score, roc_auc_score

def model_metrics(clf, X_train, X_test, y_train, y_test):
    """Print accuracy and ROC AUC of a fitted classifier on the train and test sets.

    Accuracy is computed from ``clf.predict``; AUC is computed from column 1 of
    ``clf.predict_proba``. Each metric is printed on one line, train value
    first, formatted to four decimals. Nothing is returned.
    """
    # Hard label predictions, then the probability column used for AUC.
    pred_train, pred_test = clf.predict(X_train), clf.predict(X_test)
    proba_train = clf.predict_proba(X_train)[:, 1]
    proba_test = clf.predict_proba(X_test)[:, 1]

    # (line tag, metric function, train-side input, test-side input)
    report = (
        ('[准确率]', accuracy_score, pred_train, pred_test),
        ('[auc值]', roc_auc_score, proba_train, proba_test),
    )
    for tag, metric, on_train, on_test in report:
        print(tag, end=' ')
        print('训练集：', '%.4f' % metric(y_train, on_train), end=' ')
        print('测试集：', '%.4f' % metric(y_test, on_test))

## 网格调参法
- 筛选依据为 scoring = 'roc_auc'

In [5]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier

## lr

In [6]:
# Grid-search the regularisation strength C and the penalty type for logistic
# regression, ranking candidates by 5-fold cross-validated ROC AUC.
# solver='liblinear' is set explicitly because the grid contains 'l1': the
# 'lbfgs' solver (scikit-learn's default since 0.22) does not support 'l1', so
# every 'l1' candidate would fail to fit. 'liblinear' supports both penalties
# and was the default solver before 0.22.
lr = LogisticRegression(solver='liblinear')
param = {'C': [1e-3, 0.01, 0.1, 1, 10, 100, 1e3], 'penalty': ['l1', 'l2']}

gsearch = GridSearchCV(lr, param_grid=param, scoring='roc_auc', cv=5)
gsearch.fit(X_train, y_train)

print('最佳参数：',gsearch.best_params_)
# best_score_ is the mean cross-validated AUC of the best candidate (CV folds of the training split).
print('训练集的最佳分数：', gsearch.best_score_)
# score() evaluates the refit best estimator on the held-out test set with the same scorer (AUC).
print('测试集的最佳分数：', gsearch.score(X_test, y_test))

最佳参数： {'C': 0.01, 'penalty': 'l2'}
训练集的最佳分数： 0.7963302528408653
测试集的最佳分数： 0.7840234525784274


In [9]:
# Refit logistic regression with C=0.01 and an L2 penalty, then report train/test metrics.
# The solver is pinned to 'liblinear' (scikit-learn's default before 0.22) so the
# model does not silently switch to 'lbfgs' on newer scikit-learn versions.
lr = LogisticRegression(C=0.01, penalty='l2', solver='liblinear')
lr.fit(X_train, y_train)
model_metrics(lr, X_train, X_test, y_train, y_test)

[准确率] 训练集： 0.8028 测试集： 0.7835
[auc值] 训练集： 0.8038 测试集： 0.7840


## SVM模型

> 调参范围可设为{'gamma':[0.001,0.01,0.1,1,10,100], 'C':[0.001,0.01,0.1,1,10,100]}。鉴于时间原因，下面网格搜索时选用较小区间。

In [10]:
# Linear-kernel SVM: grid-search C, ranked by 5-fold cross-validated ROC AUC.
svm_linear = svm.SVC(kernel = 'linear', probability=True)
param = {'C':[0.01,0.1,1]}
gsearch = GridSearchCV(svm_linear, param_grid = param,scoring ='roc_auc', cv=5)
gsearch.fit(X_train, y_train)

print('最佳参数：',gsearch.best_params_)
# best_score_ is the mean cross-validated AUC of the best candidate.
print('训练集的最佳分数：', gsearch.best_score_)
# score() applies the same AUC scorer to the held-out test set.
print('测试集的最佳分数：', gsearch.score(X_test, y_test))

最佳参数： {'C': 0.01}
训练集的最佳分数： 0.8013832599472702
测试集的最佳分数： 0.7883895131086143


In [11]:
# Refit the linear SVM with C=0.01 and report train/test metrics.
# probability=True is needed because model_metrics calls predict_proba.
svm_linear = svm.SVC(C = 0.01, kernel = 'linear', probability=True)
svm_linear.fit(X_train, y_train)
model_metrics(svm_linear, X_train, X_test, y_train, y_test)

[准确率] 训练集： 0.7926 测试集： 0.7744
[auc值] 训练集： 0.8082 测试集： 0.7884


In [12]:
# Polynomial-kernel SVM: grid-search C, ranked by 5-fold cross-validated ROC AUC.
svm_poly = svm.SVC(kernel = 'poly', probability=True)
param = {'C':[0.01,0.1,1]}
gsearch = GridSearchCV(svm_poly, param_grid = param,scoring ='roc_auc', cv=5)
gsearch.fit(X_train, y_train)

print('最佳参数：',gsearch.best_params_)
# best_score_ is the mean cross-validated AUC of the best candidate.
print('训练集的最佳分数：', gsearch.best_score_)
# score() applies the same AUC scorer to the held-out test set.
print('测试集的最佳分数：', gsearch.score(X_test, y_test))

最佳参数： {'C': 0.01}
训练集的最佳分数： 0.7768374763498004
测试集的最佳分数： 0.76365371975838


In [13]:
# Refit the polynomial-kernel SVM with C=0.01 and report train/test metrics.
# probability=True is needed because model_metrics calls predict_proba.
svm_poly =  svm.SVC(C = 0.01, kernel = 'poly', probability=True)
svm_poly.fit(X_train, y_train)
model_metrics(svm_poly, X_train, X_test, y_train, y_test)

[准确率] 训练集： 0.7610 测试集： 0.7547
[auc值] 训练集： 0.8156 测试集： 0.7637


In [14]:
# Gaussian (RBF) kernel SVM — noted by the author as the longest-running search.
# No kernel is passed, so SVC's default kernel is used; gamma and C are searched jointly.
svm_rbf = svm.SVC(probability=True)
param = {'gamma':[0.01,0.1,1,10], 
         'C':[0.01,0.1,1]}
gsearch = GridSearchCV(svm_rbf, param_grid = param,scoring ='roc_auc', cv=5)
gsearch.fit(X_train, y_train)

print('最佳参数：',gsearch.best_params_)
# best_score_ is the mean cross-validated AUC of the best candidate.
print('训练集的最佳分数：', gsearch.best_score_)
# score() applies the same AUC scorer to the held-out test set.
print('测试集的最佳分数：', gsearch.score(X_test, y_test))

最佳参数： {'C': 0.1, 'gamma': 0.01}
训练集的最佳分数： 0.7976347244291928
测试集的最佳分数： 0.7843677297528509


In [15]:
# Refit the Gaussian-kernel SVM with gamma=0.01, C=0.1 and report train/test metrics.
# probability=True is needed because model_metrics calls predict_proba.
svm_rbf =  svm.SVC(gamma = 0.01, C =0.1 , probability=True)
svm_rbf.fit(X_train, y_train)
model_metrics(svm_rbf, X_train, X_test, y_train, y_test)

[准确率] 训练集： 0.7824 测试集： 0.7730
[auc值] 训练集： 0.8090 测试集： 0.7844


In [16]:
# Sigmoid-kernel SVM: grid-search C, ranked by 5-fold cross-validated ROC AUC.
svm_sigmoid = svm.SVC(kernel = 'sigmoid',probability=True)
param = {'C':[0.01,0.1,1]}
gsearch = GridSearchCV(svm_sigmoid, param_grid = param,scoring ='roc_auc', cv=5)
gsearch.fit(X_train, y_train)

print('最佳参数：',gsearch.best_params_)
# best_score_ is the mean cross-validated AUC of the best candidate.
print('训练集的最佳分数：', gsearch.best_score_)
# score() applies the same AUC scorer to the held-out test set.
print('测试集的最佳分数：', gsearch.score(X_test, y_test))

最佳参数： {'C': 0.01}
训练集的最佳分数： 0.7968126158756947
测试集的最佳分数： 0.7812379372580931


In [17]:
# Refit the sigmoid-kernel SVM with C=0.01 and report train/test metrics.
# probability=True is needed because model_metrics calls predict_proba.
svm_sigmoid =  svm.SVC(C = 0.01, kernel = 'sigmoid',probability=True)
svm_sigmoid.fit(X_train, y_train)
model_metrics(svm_sigmoid, X_train, X_test, y_train, y_test)

[准确率] 训练集： 0.7493 测试集： 0.7484
[auc值] 训练集： 0.7994 测试集： 0.7812


## 决策树模型
- max_depth
- min_samples_split
- min_samples_leaf
- max_features

1）首先对决策树最大深度max_depth和内部节点再划分所需最小样本数min_samples_split进行网格搜索。

In [19]:
# Step 1: jointly search max_depth (3, 5, ..., 13) and min_samples_split (100, 300, 500, 700),
# ranked by 5-fold cross-validated ROC AUC.
# The max_depth / min_samples_split values in the constructor are overridden by the grid.
param = {'max_depth':range(3,14,2), 'min_samples_split':range(100,801,200)}
gsearch = GridSearchCV(DecisionTreeClassifier(max_depth=8,min_samples_split=300,min_samples_leaf=20, max_features='sqrt',random_state =2018),
                       param_grid = param,scoring ='roc_auc', cv=5)

gsearch.fit(X_train, y_train)
# Per-candidate scores: see gsearch.cv_results_ (grid_scores_ was removed in scikit-learn 0.20).
gsearch.best_params_, gsearch.best_score_

({'max_depth': 5, 'min_samples_split': 100}, 0.7473700728471221)

2）对内部节点再划分所需最小样本数min_samples_split和叶子节点最少样本数min_samples_leaf一起调参。

In [21]:
# Step 2: with max_depth fixed at 5, jointly search min_samples_split (80, 90, ..., 190)
# and min_samples_leaf (20, 30, ..., 70), ranked by 5-fold cross-validated ROC AUC.
param = {'min_samples_split':range(80,200,10), 'min_samples_leaf':range(20,80,10)}
gsearch = GridSearchCV(DecisionTreeClassifier(max_depth=5,min_samples_split=100,min_samples_leaf=20, max_features='sqrt', random_state =2018),
                       param_grid = param,scoring ='roc_auc', cv=5)

gsearch.fit(X_train, y_train)
# Per-candidate scores: see gsearch.cv_results_ (grid_scores_ was removed in scikit-learn 0.20).
gsearch.best_params_, gsearch.best_score_

({'min_samples_leaf': 60, 'min_samples_split': 140}, 0.7524371932564738)

3）再对最大特征数max_features进行网格搜索

In [24]:
# Step 3: search max_features over 6, 8, ..., 28 with the other tree parameters fixed.
# The grid replaces the constructor's max_features='sqrt' for every candidate.
param = {'max_features':range(6,29,2)}
gsearch = GridSearchCV(DecisionTreeClassifier(max_depth=5,min_samples_split=140,min_samples_leaf=60, max_features='sqrt', random_state =2018),
                       param_grid = param,scoring ='roc_auc', cv=5)

gsearch.fit(X_train, y_train)
# Per-candidate scores: see gsearch.cv_results_ (grid_scores_ was removed in scikit-learn 0.20).
gsearch.best_params_, gsearch.best_score_

({'max_features': 16}, 0.74727256841388)

max_features 不如不调，用原来的。

In [23]:
# Final decision tree: max_depth=5, min_samples_split=140, min_samples_leaf=60,
# keeping max_features='sqrt'; fit on the training split and report train/test metrics.
dt = DecisionTreeClassifier(max_depth=5,min_samples_split=140,min_samples_leaf=60,max_features='sqrt', random_state =2018)
dt.fit(X_train, y_train)
model_metrics(dt, X_train, X_test, y_train, y_test)

[准确率] 训练集： 0.7890 测试集： 0.7687
[auc值] 训练集： 0.7885 测试集： 0.7310


## 随机森林
- n_estimators 树的数量。旧版sklearn默认为10（0.22起默认为100）；数量增加时精度起初递增显著，但不是越多越好。
- max_depth 不输入不限制。建议在10-100之间。
- min_samples_split 不超过这个值可以不划分。
- min_samples_leaf 如果叶子节点小于它，则被剪枝。
- max_features 允许单个决策树使用特征的最大数量。RandomForestClassifier默认取总特征数的平方根（'auto'/'sqrt'）；None表示使用全部特征；0.2表示总数的20%。  
以上，样本较少时，max_depth , min_samples_split, min_samples_leaf可以不考虑。

In [25]:
# Baseline: random forest with default hyperparameters, with out-of-bag scoring enabled.
rf0 = RandomForestClassifier(oob_score=True, random_state=2018)
rf0.fit(X_train, y_train)
print('袋外分数：', rf0.oob_score_)
model_metrics(rf0, X_train, X_test, y_train, y_test)

袋外分数： 0.7538322813345356
[准确率] 训练集： 0.9856 测试集： 0.7758
[auc值] 训练集： 0.9994 测试集： 0.7449


> 1) 首先对n_estimators进行搜索。

In [27]:
# Step 1: search the number of trees over 10, 20, ..., 140 by 5-fold cross-validated ROC AUC.
# n_estimators=120 in the constructor is overridden by the grid.
param = {'n_estimators':range(10,150,10)}

gsearch = GridSearchCV(estimator = RandomForestClassifier(n_estimators=120, max_depth=5, min_samples_split=140, 
                                                          min_samples_leaf=60, max_features = 'sqrt',random_state=2018), 
                       param_grid = param, scoring='roc_auc', cv=5)

gsearch.fit(X_train, y_train)
gsearch.best_params_, gsearch.best_score_

({'n_estimators': 120}, 0.7877997885691934)

> 2) 对rf的最大深度max_depth和内部节点再划分所需最小样本数min_samples_split进行网格搜索。

In [28]:
# Step 2: with n_estimators fixed at 120, jointly search max_depth (3..14) and
# min_samples_split (100, 200, ..., 800) by 5-fold cross-validated ROC AUC.
param = {'max_depth':range(3,15,1), 'min_samples_split':range(100,801,100)}
gsearch = GridSearchCV(estimator = RandomForestClassifier(n_estimators=120, max_depth=5, min_samples_split=140, 
                                                          min_samples_leaf=60, max_features = 'sqrt',random_state=2018), 
                       param_grid = param, scoring='roc_auc', cv=5)

gsearch.fit(X_train, y_train)
gsearch.best_params_, gsearch.best_score_

({'max_depth': 8, 'min_samples_split': 100}, 0.7921046250511624)

> 3) 对min_samples_split和min_samples_leaf一起调参。

In [29]:
# Step 3: with max_depth fixed at 8, jointly search min_samples_split (50, 60, ..., 90)
# and min_samples_leaf (30, 40, ..., 90) by 5-fold cross-validated ROC AUC.
param = {'min_samples_split':range(50,100,10), 'min_samples_leaf':range(30,100,10)}
gsearch = GridSearchCV(estimator = RandomForestClassifier(n_estimators=120, max_depth=8, min_samples_split=140, 
                                                          min_samples_leaf=60, max_features = 'sqrt',random_state=2018), 
                       param_grid = param, scoring='roc_auc', cv=5)

gsearch.fit(X_train, y_train)
gsearch.best_params_, gsearch.best_score_

({'min_samples_leaf': 30, 'min_samples_split': 50}, 0.7940500722352857)

根据结果还能再进一步逐步求精。

In [31]:
 param = {'min_samples_split':range(30,55,5), 'min_samples_leaf':range(10,35,5)}
gsearch = GridSearchCV(estimator = RandomForestClassifier(n_estimators=120, max_depth=8, min_samples_split=50, 
                                                          min_samples_leaf=30, max_features = 'sqrt',random_state=2018), 
                       param_grid = param, scoring='roc_auc', cv=5)

gsearch.fit(X_train, y_train)
gsearch.best_params_, gsearch.best_score_

({'min_samples_leaf': 20, 'min_samples_split': 30}, 0.7964826564547423)

参数汇总再训练。

In [32]:
# Final random forest with the tuned values; oob_score=True additionally exposes
# the out-of-bag score, which is printed before the train/test metrics.
rf = RandomForestClassifier(n_estimators=120, max_depth=8, min_samples_split=30,
                            min_samples_leaf=20, max_features = 'sqrt',oob_score=True, random_state=2018)
rf.fit(X_train, y_train)
print('袋外分数：', rf.oob_score_)
model_metrics(rf, X_train, X_test, y_train, y_test)

袋外分数： 0.7935076645626691
[准确率] 训练集： 0.8206 测试集： 0.7821
[auc值] 训练集： 0.8829 测试集： 0.7782


## xgboost


**梯度提升树**是一种加法模型，以决策树作为基学习器。每一步的学习目标是拟合残差：用损失函数对当前模型输出的负梯度来近似残差（在平方误差损失下，负梯度恰好等于残差）。

**xgboost核心原理**： 
- 也是一种梯度提升树模型。
- 每一步主要利用上一步的模型来学习得到新模型
- 每一步训练用的目标函数包括损失项（控制偏差）和正则项（控制模型复杂度，即方差）；传统的gbdt不包括正则项。
- 优化目标函数不是像LR等模型一样寻找最佳参数。而是在函数空间中寻找最优模型（函数）
- 目标函数用二阶泰勒展开式近似，方便优化求解ft(x)  
[一文理解GBDT和xgboost的原理](https://zhuanlan.zhihu.com/p/29765582)

- max_depth = 5 :这个参数的取值最好在3-10之间。起始值选为5，也可以选择其它的值。起始值在4-6之间都是不错的选择。
- min_child_weight = 1:在这里选了一个比较小的值，因为这是一个极不平衡的分类问题。因此，某些叶子节点下的值会比较小。
- gamma = 0: 起始值也可以选其它比较小的值，在0.1到0.2之间就可以。这个参数后继也是要调整的。
- subsample, colsample_bytree = 0.8: 这个是最常见的初始值了。典型值的范围在0.5-0.9之间。

In [34]:
# Baseline: XGBoost classifier with default parameters.
xgb0 = XGBClassifier()
xgb0.fit(X_train, y_train)

model_metrics(xgb0, X_train, X_test, y_train, y_test)

[准确率] 训练集： 0.8464 测试集： 0.7849
[auc值] 训练集： 0.8970 测试集： 0.7758


1) 首先从步长(learning rate)和迭代次数(n_estimators)入手。

开始选择一个较小的步长来网格搜索最好的迭代次数。这里，我们将步长初始值设置为0.1, 对于迭代次数进行网格搜索。

In [35]:
# Step 1: with learning_rate fixed at 0.1, search the number of boosting rounds
# (20, 40, ..., 180) by 5-fold cross-validated ROC AUC. n_estimators=140 in the
# constructor is only a placeholder; the grid overrides it.
# The former iid=False argument is dropped: GridSearchCV deprecated `iid` in
# scikit-learn 0.22 and removed it in 0.24, where passing it raises TypeError.
# Current versions always average fold scores the way iid=False did.
param_test = {'n_estimators': range(20, 200, 20)}
gsearch = GridSearchCV(
    estimator=XGBClassifier(learning_rate=0.1, n_estimators=140, max_depth=5,
                            min_child_weight=1, gamma=0, subsample=0.8,
                            colsample_bytree=0.8, objective='binary:logistic',
                            nthread=4, scale_pos_weight=1, seed=27),
    param_grid=param_test, scoring='roc_auc', n_jobs=4, cv=5)

gsearch.fit(X_train, y_train)
# Per-candidate results are available in gsearch.cv_results_.
gsearch.best_params_, gsearch.best_score_

({'n_estimators': 40}, 0.7948353841027499)

2) max_depth 和 min_child_weight 参数调优

In [36]:
# Step 2: with n_estimators fixed at 40, jointly search max_depth (3, 5, 7, 9) and
# min_child_weight (1, 3, ..., 11) by 5-fold cross-validated ROC AUC.
# The former iid=False argument is dropped: GridSearchCV deprecated `iid` in
# scikit-learn 0.22 and removed it in 0.24, where passing it raises TypeError.
# Current versions always average fold scores the way iid=False did.
param_test = {'max_depth': range(3, 10, 2), 'min_child_weight': range(1, 12, 2)}

gsearch = GridSearchCV(
    estimator=XGBClassifier(learning_rate=0.1, n_estimators=40, max_depth=5,
                            min_child_weight=1, gamma=0, subsample=0.8,
                            colsample_bytree=0.8, objective='binary:logistic',
                            nthread=4, scale_pos_weight=1, seed=27),
    param_grid=param_test, scoring='roc_auc', n_jobs=4, cv=5)

gsearch.fit(X_train, y_train)
# Per-candidate results are available in gsearch.cv_results_.
gsearch.best_params_, gsearch.best_score_

({'max_depth': 3, 'min_child_weight': 11}, 0.8028955602250047)

3) gamma参数调优

In [37]:
# Step 3: with max_depth=3 and min_child_weight=11 fixed, search gamma over
# 0.0, 0.1, ..., 0.5 by 5-fold cross-validated ROC AUC.
# The former iid=False argument is dropped: GridSearchCV deprecated `iid` in
# scikit-learn 0.22 and removed it in 0.24, where passing it raises TypeError.
# Current versions always average fold scores the way iid=False did.
param_test = {'gamma': [i / 10 for i in range(0, 6)]}

gsearch = GridSearchCV(
    estimator=XGBClassifier(learning_rate=0.1, n_estimators=40, max_depth=3,
                            min_child_weight=11, gamma=0, subsample=0.8,
                            colsample_bytree=0.8, objective='binary:logistic',
                            nthread=4, scale_pos_weight=1, seed=27),
    param_grid=param_test, scoring='roc_auc', n_jobs=4, cv=5)

gsearch.fit(X_train, y_train)
# Per-candidate results are available in gsearch.cv_results_.
gsearch.best_params_, gsearch.best_score_

({'gamma': 0.0}, 0.8028955602250047)

4）调整subsample 和 colsample_bytree 参数

In [38]:
# Step 4: jointly search subsample and colsample_bytree, each over 0.5, 0.6, ..., 0.9,
# by 5-fold cross-validated ROC AUC.
# The former iid=False argument is dropped: GridSearchCV deprecated `iid` in
# scikit-learn 0.22 and removed it in 0.24, where passing it raises TypeError.
# Current versions always average fold scores the way iid=False did.
param_test = {'subsample': [i / 10 for i in range(5, 10)],
              'colsample_bytree': [i / 10 for i in range(5, 10)]}

gsearch = GridSearchCV(
    estimator=XGBClassifier(learning_rate=0.1, n_estimators=40, max_depth=3,
                            min_child_weight=11, gamma=0, subsample=0.8,
                            colsample_bytree=0.8, objective='binary:logistic',
                            nthread=4, scale_pos_weight=1, seed=27),
    param_grid=param_test, scoring='roc_auc', n_jobs=4, cv=5)

gsearch.fit(X_train, y_train)
# Per-candidate results are available in gsearch.cv_results_.
gsearch.best_params_, gsearch.best_score_

({'colsample_bytree': 0.8, 'subsample': 0.8}, 0.8028955602250047)

从这里可以看出来，subsample 和 colsample_bytree 的理想取值都是0.8。现在，我们以0.05为步长，在这个值附近尝试取值。

In [39]:
# Step 4 (fine): repeat the subsample / colsample_bytree search in steps of 0.05
# around 0.8, i.e. over 0.75, 0.80, 0.85 for each parameter.
# The former iid=False argument is dropped: GridSearchCV deprecated `iid` in
# scikit-learn 0.22 and removed it in 0.24, where passing it raises TypeError.
# Current versions always average fold scores the way iid=False did.
param_test = {'subsample': [i / 100 for i in range(75, 86, 5)],
              'colsample_bytree': [i / 100 for i in range(75, 86, 5)]}

gsearch = GridSearchCV(
    estimator=XGBClassifier(learning_rate=0.1, n_estimators=40, max_depth=3,
                            min_child_weight=11, gamma=0, subsample=0.8,
                            colsample_bytree=0.8, objective='binary:logistic',
                            nthread=4, scale_pos_weight=1, seed=27),
    param_grid=param_test, scoring='roc_auc', n_jobs=4, cv=5)

gsearch.fit(X_train, y_train)
# Per-candidate results are available in gsearch.cv_results_.
gsearch.best_params_, gsearch.best_score_

({'colsample_bytree': 0.8, 'subsample': 0.8}, 0.8028955602250047)

5）正则化参数调优

> #'reg_lambda': [0.2, 0.4, 0.6, 0.8, 1]

In [40]:
# Step 5: search the L1 regularisation weight reg_alpha over a coarse set of values
# by 5-fold cross-validated ROC AUC.
# The former iid=False argument is dropped: GridSearchCV deprecated `iid` in
# scikit-learn 0.22 and removed it in 0.24, where passing it raises TypeError.
# Current versions always average fold scores the way iid=False did.
param_test = {'reg_alpha': [1e-5, 1e-2, 0.1, 0, 1, 100]}

gsearch = GridSearchCV(
    estimator=XGBClassifier(learning_rate=0.1, n_estimators=40, max_depth=3,
                            min_child_weight=11, gamma=0, subsample=0.8,
                            colsample_bytree=0.8, objective='binary:logistic',
                            nthread=4, scale_pos_weight=1, seed=27),
    param_grid=param_test, scoring='roc_auc', n_jobs=4, cv=5)

gsearch.fit(X_train, y_train)
# Per-candidate results are available in gsearch.cv_results_.
gsearch.best_params_, gsearch.best_score_

({'reg_alpha': 1e-05}, 0.8028955602250047)

6）回到第1）步，降低学习速率, 调整迭代次数

In [42]:
# Step 6: lower learning_rate to 0.01 and re-search the number of boosting rounds
# over 160, 165, ..., 195. n_estimators=40 in the constructor is overridden by the grid.
# The former iid=False argument is dropped: GridSearchCV deprecated `iid` in
# scikit-learn 0.22 and removed it in 0.24, where passing it raises TypeError.
# Current versions always average fold scores the way iid=False did.
param_test = {'n_estimators': range(160, 200, 5)}

gsearch = GridSearchCV(
    estimator=XGBClassifier(learning_rate=0.01, n_estimators=40, max_depth=3,
                            min_child_weight=11, gamma=0, subsample=0.8,
                            colsample_bytree=0.8, objective='binary:logistic',
                            nthread=4, scale_pos_weight=1, seed=27),
    param_grid=param_test, scoring='roc_auc', n_jobs=4, cv=5)

gsearch.fit(X_train, y_train)
# Per-candidate results are available in gsearch.cv_results_.
gsearch.best_params_, gsearch.best_score_

({'n_estimators': 195}, 0.7925502253322702)

效果变坏，不采用。

In [43]:
# Final XGBoost model: learning_rate=0.1 with 40 boosting rounds, shallow trees
# (max_depth=3, min_child_weight=11) and a small L1 penalty; fit on the training
# split and report train/test accuracy and AUC.
xgb = XGBClassifier(
    learning_rate=0.1,
    n_estimators=40,
    max_depth=3,
    min_child_weight=11,
    gamma=0.0,
    subsample=0.8,
    reg_alpha=1e-05,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)
xgb.fit(X_train, y_train)

model_metrics(xgb, X_train, X_test, y_train, y_test)

[准确率] 训练集： 0.8224 测试集： 0.7877
[auc值] 训练集： 0.8490 测试集： 0.7801


测试集AUC：0.7758 --> 0.7801，略有提升（测试集准确率：0.7849 --> 0.7877）。