In [1]:
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb
import xgboost
import pandas as pd
import time

# gdbt
### 以下过程类似对全部的参数使用GridSearchCV。通过找出每一个参数的最优值，重新代入，寻找其中的关系。这里把这些分开主要用于分析。（时间与能力有限，后续增改）
[参数说明链接](http://www.cnblogs.com/DjangoBlog/p/6201663.html)

In [2]:
data = pd.read_csv('data_all.csv')
y_data = pd.DataFrame(data['status']).values.ravel()
x_data = data.drop(columns='status').values

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.3, random_state=2018)
print(data.shape)
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

(4754, 85)
(3327, 84) (3327,) (1427, 84) (1427,)


## 1、loss 验证LR和adaboost适用的场景

In [3]:
t_start = time.time()

gdbt_loss_param = {'loss':['deviance', 'exponential']}

gdbt = GradientBoostingClassifier(random_state=2018)
gdbt_loss_gridcv = GridSearchCV(cv=10, estimator=gdbt, param_grid=gdbt_loss_param)
gdbt_loss_gridcv.fit(X=x_train, y=y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=2018, subsample=1.0, verbose=0,
              warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'loss': ['deviance', 'exponential']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [4]:
print(gdbt_loss_gridcv.best_estimator_)
print(gdbt_loss_gridcv.best_score_)
print(gdbt_loss_gridcv.cv_results_)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='exponential', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=2018, subsample=1.0, verbose=0,
              warm_start=False)
0.7953110910730388
{'mean_fit_time': array([5.40070891, 5.63952253]), 'std_fit_time': array([0.49382256, 0.84632721]), 'mean_score_time': array([0.00400028, 0.00380025]), 'std_score_time': array([0.00044723, 0.00074835]), 'param_loss': masked_array(data=['deviance', 'exponential'],
             mask=[False, False],
       fill_value='?',
            dtype=object), 'params': [{'loss': 'deviance'}, {'loss': 'exponential'}], 'split0_test_score': array([0.80538922, 0.81736527]), 'split1_test_score': array([0.78742515, 0.7784

In [5]:
score = gdbt_loss_gridcv.score(x_test, y_test)
print(score)

0.7876664330763841


In [6]:
joblib.dump(gdbt_loss_gridcv, 'gdbt_loss_gridcv.m')

t_end = time.time()
print('gdbt_loss_gridcv 训练结束,耗时:{}min'.format((t_end - t_start) / 60))

gdbt_loss_gridcv 训练结束,耗时:1.9544617891311646min


## 结论

### 整体现象看不出这两个的差距

## 2、n_estimators对数据的影响

In [4]:
t_start = time.time()

gdbt_n_estimators_param = {'n_estimators':range(100, 300, 50)}

gdbt = GradientBoostingClassifier(loss='exponential', random_state=2018)
gdbt_n_estimators_gridcv = GridSearchCV(cv=10, estimator=gdbt, param_grid=gdbt_n_estimators_param)
gdbt_n_estimators_gridcv.fit(X=x_train, y=y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='exponential', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=2018, subsample=1.0, verbose=0,
              warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': range(100, 300, 50)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [5]:
print(gdbt_n_estimators_gridcv.best_estimator_)
print(gdbt_n_estimators_gridcv.best_score_)
print(gdbt_n_estimators_gridcv.cv_results_)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='exponential', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=2018, subsample=1.0, verbose=0,
              warm_start=False)
0.7956116621581004
{'mean_fit_time': array([1.0595    , 1.62320004, 2.01839995, 2.50299997]), 'std_fit_time': array([0.05248852, 0.16937228, 0.10659093, 0.04318323]), 'mean_score_time': array([0.00100002, 0.00139995, 0.00140004, 0.00160005]), 'std_score_time': array([1.16800773e-07, 4.89872184e-04, 4.89891666e-04, 4.89950069e-04]), 'param_n_estimators': masked_array(data=[100, 150, 200, 250],
             mask=[False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'n_estimators': 100}

In [6]:
score = gdbt_n_estimators_gridcv.score(x_test, y_test)
print(score)

0.7876664330763841


In [7]:
joblib.dump(gdbt_n_estimators_gridcv, 'gdbt_n_estimators_gridcv.m')

t_end = time.time()
print('gdbt_n_estimators_gridcv 训练结束,耗时:{}min'.format((t_end - t_start) / 60))

gdbt_n_estimators_gridcv 训练结束,耗时:1.2338833332061767min


## 结论

### 测试中超过了3便会降低精确度，但是训练中会提高精度，可能是小范围波动

## 3、max_depth对数据的影响(由于随机森林中的深度为10，这里就缩小范围)

In [8]:
t_start = time.time()

gdbt_max_depth_param = {'max_depth':range(3, 15, 3)}

gdbt = GradientBoostingClassifier(loss='exponential', n_estimators=100, random_state=2018)
gdbt_max_depth_gridcv = GridSearchCV(cv=10, estimator=gdbt, param_grid=gdbt_max_depth_param)
gdbt_max_depth_gridcv.fit(X=x_train, y=y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='exponential', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=2018, subsample=1.0, verbose=0,
              warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': range(3, 15, 3)}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring=None, verbose=0)

In [9]:
print(gdbt_max_depth_gridcv.best_estimator_)
print(gdbt_max_depth_gridcv.best_score_)
print(gdbt_max_depth_gridcv.cv_results_)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='exponential', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=2018, subsample=1.0, verbose=0,
              warm_start=False)
0.7956116621581004
{'mean_fit_time': array([ 1.10250003,  3.10949993,  7.44559999, 13.83940003]), 'std_fit_time': array([0.1120779 , 0.08190146, 0.25873241, 0.49493799]), 'mean_score_time': array([0.00119998, 0.00160003, 0.00309999, 0.00449998]), 'std_score_time': array([0.00060009, 0.00048993, 0.00029999, 0.00049994]), 'param_max_depth': masked_array(data=[3, 6, 9, 12],
             mask=[False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'max_depth': 3}, {'max_depth': 6}, {'max_d

In [10]:
score = gdbt_max_depth_gridcv.score(x_test, y_test)
print(score)

0.7876664330763841


In [11]:
joblib.dump(gdbt_max_depth_gridcv, 'gdbt_max_depth_gridcv.m')

t_end = time.time()
print('gdbt_max_depth_gridcv 训练结束,耗时:{}min'.format((t_end - t_start) / 60))

gdbt_max_depth_gridcv 训练结束,耗时:4.287033331394196min


## 结论

### 当取值为9 ，12时出现过拟合现象。

## 4、max_features对数据的影响

In [17]:
t_start = time.time()

gdbt_max_features_param = {'max_features':[0.2, 0.4, 0.6, 0.8]}

gdbt = GradientBoostingClassifier(loss='exponential', n_estimators=100, max_depth=3, random_state=2018)
gdbt_max_features_gridcv = GridSearchCV(cv=10, estimator=gdbt, param_grid=gdbt_max_features_param)
gdbt_max_features_gridcv.fit(X=x_train, y=y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='exponential', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=2018, subsample=1.0, verbose=0,
              warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_features': [0.2, 0.4, 0.6, 0.8]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [18]:
print(gdbt_max_features_gridcv.best_estimator_)
print(gdbt_max_features_gridcv.best_score_)
print(gdbt_max_features_gridcv.cv_results_)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='exponential', max_depth=3,
              max_features=0.2, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=2018, subsample=1.0, verbose=0,
              warm_start=False)
0.7962128043282236
{'mean_fit_time': array([1.28197331, 2.01731534, 2.80096016, 3.55750349]), 'std_fit_time': array([0.04400351, 0.01898424, 0.01423323, 0.03134842]), 'mean_score_time': array([0.00350018, 0.00360019, 0.00350032, 0.00360014]), 'std_score_time': array([0.00050006, 0.00048984, 0.00050001, 0.00048995]), 'param_max_features': masked_array(data=[0.2, 0.4, 0.6, 0.8],
             mask=[False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'max_features': 0.2}, {'max_features'

In [19]:
score = gdbt_max_features_gridcv.score(x_test, y_test)
print(score)

0.7869656622284513


In [20]:
joblib.dump(gdbt_max_features_gridcv, 'gdbt_max_features_gridcv.m')

t_end = time.time()
print('gdbt_max_features_gridcv 训练结束,耗时:{}min'.format((t_end - t_start) / 60))

gdbt_max_features_gridcv 训练结束,耗时:1.657478137811025min


## 结论

### 小范围波动，现象不明朗，需扩大取值精度

## 5、min_samples_split  根据随机森林缩小范围

In [21]:
t_start = time.time()

gdbt_min_samples_split_param = {'min_samples_split':range(2,8,2)}

gdbt = GradientBoostingClassifier(loss='exponential', n_estimators=100, max_depth=3, max_features=0.2, random_state=2018)
gdbt_min_samples_split_gridcv = GridSearchCV(cv=10, estimator=gdbt, param_grid=gdbt_min_samples_split_param)
gdbt_min_samples_split_gridcv.fit(X=x_train, y=y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='exponential', max_depth=3,
              max_features=0.2, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=2018, subsample=1.0, verbose=0,
              warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'min_samples_split': range(2, 8, 2)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [22]:
print(gdbt_min_samples_split_gridcv.best_estimator_)
print(gdbt_min_samples_split_gridcv.best_score_)
print(gdbt_min_samples_split_gridcv.cv_results_)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='exponential', max_depth=3,
              max_features=0.2, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=4,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=2018, subsample=1.0, verbose=0,
              warm_start=False)
0.7974150886684701
{'mean_fit_time': array([1.27957318, 1.24467115, 1.24397116]), 'std_fit_time': array([0.04019492, 0.00857023, 0.0133907 ]), 'mean_score_time': array([0.00350025, 0.00350015, 0.00350015]), 'std_score_time': array([0.00050008, 0.00049999, 0.00050004]), 'param_min_samples_split': masked_array(data=[2, 4, 6],
             mask=[False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'min_samples_split': 2}, {'min_samples_split': 4}, {'min_samples_split': 6}], 'split0_test_score':

In [23]:
score = gdbt_min_samples_split_gridcv.score(x_test, y_test)
print(score)

0.7869656622284513


In [24]:
joblib.dump(gdbt_min_samples_split_gridcv, 'gdbt_min_samples_split_gridcv.m')

t_end = time.time()
print('gdbt_min_samples_split_gridcv 训练结束,耗时:{}min'.format((t_end - t_start) / 60))

gdbt_min_samples_split_gridcv 训练结束,耗时:0.6690549333890279min


## 结论

#### 取值为4时是最大值，但因为预测结果比训练结果小可能是过拟合，可再缩小范围测试

## 6、min_samples_leaf 根据随机森林缩小范围

In [25]:
t_start = time.time()

gdbt_min_samples_leaf_param = {'min_samples_leaf':range(2,6,2)}

gdbt = GradientBoostingClassifier(loss='exponential', n_estimators=100, max_depth=3, 
                                  max_features=0.2, min_samples_split=4, random_state=2018)

gdbt_min_samples_leaf_gridcv = GridSearchCV(cv=10, estimator=gdbt, param_grid=gdbt_min_samples_leaf_param)
gdbt_min_samples_leaf_gridcv.fit(X=x_train, y=y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='exponential', max_depth=3,
              max_features=0.2, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=4,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=2018, subsample=1.0, verbose=0,
              warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'min_samples_leaf': range(2, 6, 2)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [26]:
print(gdbt_min_samples_leaf_gridcv.best_estimator_)
print(gdbt_min_samples_leaf_gridcv.best_score_)
print(gdbt_min_samples_leaf_gridcv.cv_results_)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='exponential', max_depth=3,
              max_features=0.2, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=4, min_samples_split=4,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=2018, subsample=1.0, verbose=0,
              warm_start=False)
0.7989179440937781
{'mean_fit_time': array([1.28267329, 1.24267108]), 'std_fit_time': array([0.05104056, 0.01323107]), 'mean_score_time': array([0.00360024, 0.00360022]), 'std_score_time': array([0.00048993, 0.00048996]), 'param_min_samples_leaf': masked_array(data=[2, 4],
             mask=[False, False],
       fill_value='?',
            dtype=object), 'params': [{'min_samples_leaf': 2}, {'min_samples_leaf': 4}], 'split0_test_score': array([0.80838323, 0.80838323]), 'split1_test_score': array([0.78443114, 0.7754491 ]),

In [27]:
score = gdbt_min_samples_leaf_gridcv.score(x_test, y_test)
print(score)

0.7883672039243167


In [28]:
joblib.dump(gdbt_min_samples_leaf_gridcv, 'gdbt_min_samples_leaf_gridcv.m')

t_end = time.time()
print('gdbt_min_samples_leaf_gridcv 训练结束,耗时:{}min'.format((t_end - t_start) / 60))

gdbt_min_samples_leaf_gridcv 训练结束,耗时:0.45765950679779055min


## 结论

### 取值范围过小，分析不出实际数据。

## 7、max_leaf_nodes

In [29]:
t_start = time.time()

gdbtmax_leaf_nodes_param = {'max_features':range(10,50,10)}

gdbt = GradientBoostingClassifier(loss='exponential', n_estimators=100, max_depth=3,  max_features=0.2, 
                                  min_samples_split=4, min_samples_leaf=4, random_state=2018)

gdbt_max_leaf_nodes_gridcv = GridSearchCV(cv=10, estimator=gdbt, param_grid=gdbtmax_leaf_nodes_param)
gdbt_max_leaf_nodes_gridcv.fit(X=x_train, y=y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='exponential', max_depth=3,
              max_features=0.2, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=4, min_samples_split=4,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=2018, subsample=1.0, verbose=0,
              warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_features': range(10, 50, 10)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [30]:
print(gdbt_max_leaf_nodes_gridcv.best_estimator_)
print(gdbt_max_leaf_nodes_gridcv.best_score_)
print(gdbt_max_leaf_nodes_gridcv.cv_results_)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='exponential', max_depth=3,
              max_features=20, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=4, min_samples_split=4,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=2018, subsample=1.0, verbose=0,
              warm_start=False)
0.7995190862639014
{'mean_fit_time': array([1.05946062, 1.4515831 , 1.90790911, 2.35713482]), 'std_fit_time': array([0.06904769, 0.03117306, 0.01247307, 0.02490931]), 'mean_score_time': array([0.00380018, 0.00380013, 0.00400021, 0.0035002 ]), 'std_score_time': array([4.00018710e-04, 6.00051901e-04, 1.28392334e-07, 5.00035320e-04]), 'param_max_features': masked_array(data=[10, 20, 30, 40],
             mask=[False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'max_features': 10}, {'max

In [31]:
score = gdbt_max_leaf_nodes_gridcv.score(x_test, y_test)
print(score)

0.7939733707077785


In [32]:
joblib.dump(gdbt_max_leaf_nodes_gridcv, 'gdbt_max_leaf_nodes_gridcv.m')

t_end = time.time()
print('gdbt_max_leaf_nodes_gridcv 训练结束,耗时:{}min'.format((t_end - t_start) / 60))

gdbt_max_leaf_nodes_gridcv 训练结束,耗时:1.1800341566403707min


## 结论

### 在20取值比较稳定，但可能是小范围波动，需扩大取值精度验证。