In [1]:
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb
import xgboost
import pandas as pd
import time

# 随机森林
### 以下过程类似对全部的参数使用GridSearchCV。通过找出每一个参数的最优值，重新代入，寻找其中的关系。这里把这些分开主要用于分析。（时间与能力有限，后续增改）
[参数说明链接](http://www.cnblogs.com/gczr/p/7141712.html)

In [2]:
data = pd.read_csv('data_all.csv')
y_data = pd.DataFrame(data['status']).values.ravel()
x_data = data.drop(columns='status').values

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.3, random_state=2018)
print(data.shape)
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

(4754, 85)
(3327, 84) (3327,) (1427, 84) (1427,)


## 1、n_estimators(建立子树的数量) 对数据的影响

In [5]:
t_start = time.time()

rf_estimator_param = {'n_estimators':range(10,250,10)}

rf = RandomForestClassifier(class_weight='balanced', random_state=2018)
rf_estimator_gridcv = GridSearchCV(cv=10, estimator=rf, param_grid=rf_estimator_param)
rf_estimator_gridcv.fit(X=x_train, y=y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=2018,
            verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': range(10, 250, 10)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [6]:
print(rf_estimator_gridcv.best_estimator_)
print(rf_estimator_gridcv.best_score_)
print(rf_estimator_gridcv.cv_results_)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=190, n_jobs=1, oob_score=False, random_state=2018,
            verbose=0, warm_start=False)
0.789600240456868
{'mean_fit_time': array([0.34241958, 0.71184063, 1.01035788, 1.42328141, 1.67539585,
       2.02741592, 2.3478344 , 2.74510696, 3.07054999, 3.2928884 ,
       3.62520738, 4.03963101, 4.35964947, 4.65896661, 5.00438619,
       5.29290271, 5.62661781, 6.05404634, 6.30736084, 6.71918433,
       6.87479317, 7.23431382, 7.78207648, 8.10686364]), 'std_fit_time': array([0.01550061, 0.07510669, 0.04193337, 0.07363952, 0.05132945,
       0.08585945, 0.05618875, 0.12854428, 0.17207562, 0.10007081,
       0.02830702, 0.11452399, 0.10597978, 0.09698498, 0.0866

In [7]:
score = rf_estimator_gridcv.score(x_test, y_test)
print(score)

0.775052557813595


In [9]:
joblib.dump(rf_estimator_gridcv, 'rf_estimator_gridcv.m')

t_end = time.time()
print('rf_estimator_gridcv训练结束,耗时:{}min'.format((t_end - t_start) / 60))

rf_estimator_gridcv训练结束,耗时:17.91760152578354min


## 结论

### n_estimators在cv=0取值150时，精确度达到极大值，后面开始下降。可认为是整个的最佳范围。在=210以及后面渠道0.8，猜测可能是过拟合（根据最后结果）。

## 2、max_features 对数据的影响(这里以float为例，可以表示出max_features的大小对数据量的影响)

In [16]:
t_start = time.time()

rf_max_features_param = {'max_features':[0.1, 0.3, 0.5, 0.7, 0.9]}

rf = RandomForestClassifier(n_estimators=190, class_weight='balanced', random_state=2018)
rf_max_features_gridcv = GridSearchCV(cv=10, estimator=rf, param_grid=rf_max_features_param)
rf_max_features_gridcv.fit(X=x_train, y=y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=190, n_jobs=1, oob_score=False, random_state=2018,
            verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_features': [0.1, 0.3, 0.5, 0.7, 0.9]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [17]:
print(rf_max_features_gridcv.best_estimator_)
print(rf_max_features_gridcv.best_score_)
print(rf_max_features_gridcv.cv_results_)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features=0.3,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=190, n_jobs=1, oob_score=False, random_state=2018,
            verbose=0, warm_start=False)
0.7902013826269912
{'mean_fit_time': array([ 5.77363031, 14.73814483, 23.74505808, 32.48694415, 42.2097142 ]), 'std_fit_time': array([0.14267415, 0.27507797, 0.36675816, 0.37314334, 0.72284201]), 'mean_score_time': array([0.08300478, 0.07810411, 0.0762043 , 0.07410378, 0.07300425]), 'std_score_time': array([0.02506934, 0.01049288, 0.00917442, 0.0034188 , 0.00412332]), 'param_max_features': masked_array(data=[0.1, 0.3, 0.5, 0.7, 0.9],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'max_features': 0.1}

In [18]:
score = rf_max_features_gridcv.score(x_test, y_test)
print(score)

0.7820602662929222


In [19]:
joblib.dump(rf_max_features_gridcv, 'rf_max_features_gridcv.m')

t_end = time.time()
print('rf_max_features_gridcv训练结束,耗时:{}min'.format((t_end - t_start) / 60))

rf_max_features_gridcv训练结束,耗时:20.44069657723109min


## 结论

### 随着max_features增加，整体的差异性减小，但是模型分数差不太多，可能是特征不稳定或者取得值没有达到上线。(从cv=0看这个一直来回跳变。)

## 3、max_depth对数据的影响

In [20]:
t_start = time.time()

rf_max_depth_param = {'max_depth':range(10,80,10)}

rf = RandomForestClassifier(n_estimators=190, max_features=0.3, class_weight='balanced', random_state=2018)
rf_max_depth_gridcv = GridSearchCV(cv=10, estimator=rf, param_grid=rf_max_depth_param)
rf_max_depth_gridcv.fit(X=x_train, y=y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features=0.3,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=190, n_jobs=1, oob_score=False, random_state=2018,
            verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': range(10, 80, 10)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [21]:
print(rf_max_depth_gridcv.best_estimator_)
print(rf_max_depth_gridcv.best_score_)
print(rf_max_depth_gridcv.cv_results_)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=10, max_features=0.3,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=190, n_jobs=1, oob_score=False, random_state=2018,
            verbose=0, warm_start=False)
0.7917042380522994
{'mean_fit_time': array([12.37800794, 14.77227204, 14.94415474, 14.9457232 , 14.70872943,
       14.83544853, 14.6692997 ]), 'std_fit_time': array([0.43896812, 0.26140357, 0.28290771, 0.21678725, 0.14510314,
       0.18178236, 0.14032072]), 'mean_score_time': array([0.07230418, 0.07172358, 0.07450428, 0.07780399, 0.07364385,
       0.07900448, 0.07284379]), 'std_score_time': array([0.01135014, 0.00379056, 0.0053154 , 0.00722269, 0.00425868,
       0.01058363, 0.00474544]), 'param_max_depth': masked_array(data=[10, 20, 30, 40, 50, 60, 70],
             

In [22]:
score = rf_max_depth_gridcv.score(x_test, y_test)
print(score)

0.7813594954449895


In [23]:
joblib.dump(rf_max_depth_gridcv, 'rf_max_depth_gridcv.m')

t_end = time.time()
print('rf_max_depth_gridcv训练结束,耗时:{}min'.format((t_end - t_start) / 60))

rf_max_depth_gridcv训练结束,耗时:17.56621067126592min


## 结论

### 值可能没有达到上限，各个精确度差不太多。但是因为在各个cv情况下变化量大，可能是特征不稳定。

## 4、 min_samples_split对数据的影响

In [24]:
t_start = time.time()

rf_min_samples_split_param = {'min_samples_split':range(2,10,2)}

rf = RandomForestClassifier(n_estimators=190, max_features=0.3, max_depth=10, class_weight='balanced', random_state=2018)
rf_min_samples_split_gridcv = GridSearchCV(cv=10, estimator=rf, param_grid=rf_min_samples_split_param)
rf_min_samples_split_gridcv.fit(X=x_train, y=y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=10, max_features=0.3,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=190, n_jobs=1, oob_score=False, random_state=2018,
            verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'min_samples_split': range(2, 10, 2)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [25]:
print(rf_min_samples_split_gridcv.best_estimator_)
print(rf_min_samples_split_gridcv.best_score_)
print(rf_min_samples_split_gridcv.cv_results_)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=10, max_features=0.3,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=190, n_jobs=1, oob_score=False, random_state=2018,
            verbose=0, warm_start=False)
0.7917042380522994
{'mean_fit_time': array([12.44071157, 12.36494257, 12.69827557, 12.4584456 ]), 'std_fit_time': array([0.16049592, 0.11859008, 0.27759162, 0.20234719]), 'mean_score_time': array([0.07370427, 0.07018378, 0.07840455, 0.070104  ]), 'std_score_time': array([0.00694366, 0.00499665, 0.01908087, 0.00457087]), 'param_min_samples_split': masked_array(data=[2, 4, 6, 8],
             mask=[False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'min_samples_split': 2}, {'min_samples_split': 4}, {'min_samples_split': 6}, {'min_sam

In [26]:
score = rf_min_samples_split_gridcv.score(x_test, y_test)
print(score)

0.7813594954449895


In [27]:
joblib.dump(rf_min_samples_split_gridcv, 'rf_min_samples_split_gridcv.m')

t_end = time.time()
print('rf_min_samples_split_gridcv 训练结束,耗时:{}min'.format((t_end - t_start) / 60))

rf_min_samples_split_gridcv 训练结束,耗时:8.816150160630544min


## 结论

### 整体效果不明朗，可能取值没有达到上限， 可能特征不稳定

## 5、min_samples_leaf 对数据的影响

In [28]:
t_start = time.time()

rf_min_samples_leaf_param = {'min_samples_leaf':range(2,10,2)}

rf = RandomForestClassifier(n_estimators=190, max_features=0.3, max_depth=10, min_samples_split=2, class_weight='balanced', random_state=2018)
rf_min_samples_leaf_gridcv = GridSearchCV(cv=10, estimator=rf, param_grid=rf_min_samples_leaf_param)
rf_min_samples_leaf_gridcv.fit(X=x_train, y=y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=10, max_features=0.3,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=190, n_jobs=1, oob_score=False, random_state=2018,
            verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'min_samples_leaf': range(2, 10, 2)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [29]:
print(rf_min_samples_leaf_gridcv.best_estimator_)
print(rf_min_samples_leaf_gridcv.best_score_)
print(rf_min_samples_leaf_gridcv.cv_results_)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=10, max_features=0.3,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=2,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=190, n_jobs=1, oob_score=False, random_state=2018,
            verbose=0, warm_start=False)
0.7947099489029156
{'mean_fit_time': array([12.40980978, 11.58576264, 11.13383682, 10.71811311]), 'std_fit_time': array([0.37287417, 0.12415501, 0.14534085, 0.20399659]), 'mean_score_time': array([0.07010403, 0.07620435, 0.07280416, 0.07310419]), 'std_score_time': array([0.0037804 , 0.01041983, 0.01008811, 0.00763514]), 'param_min_samples_leaf': masked_array(data=[2, 4, 6, 8],
             mask=[False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'min_samples_leaf': 2}, {'min_samples_leaf': 4}, {'min_samples_leaf': 6}, {'min_samples

In [30]:
score = rf_min_samples_leaf_gridcv.score(x_test, y_test)
print(score)

0.7820602662929222


In [31]:
joblib.dump(rf_min_samples_leaf_gridcv, 'rf_min_samples_leaf_gridcv.m')

t_end = time.time()
print('rf_min_samples_leaf_gridcv 训练结束,耗时:{}min'.format((t_end - t_start) / 60))

rf_min_samples_leaf_gridcv 训练结束,耗时:8.12963165442149min


## 结论

### 特征不稳定， 值并未达到上限。

## 6、max_leaf_nodes 对数据的影响

In [3]:
t_start = time.time()

rf_max_leaf_nodes_param = {'max_leaf_nodes':range(10,50,10)}

rf = RandomForestClassifier(n_estimators=190, max_features=0.3, max_depth=10, min_samples_split=2, 
                            min_samples_leaf=2, class_weight='balanced', random_state=2018)
rf_max_leaf_nodesf_gridcv = GridSearchCV(cv=10, estimator=rf, param_grid=rf_max_leaf_nodes_param)
rf_max_leaf_nodesf_gridcv.fit(X=x_train, y=y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=10, max_features=0.3,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=2,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=190, n_jobs=1, oob_score=False, random_state=2018,
            verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_leaf_nodes': range(10, 50, 10)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [4]:
print(rf_max_leaf_nodesf_gridcv.best_estimator_)
print(rf_max_leaf_nodesf_gridcv.best_score_)
print(rf_max_leaf_nodesf_gridcv.cv_results_)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=10, max_features=0.3,
            max_leaf_nodes=40, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=2,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=190, n_jobs=1, oob_score=False, random_state=2018,
            verbose=0, warm_start=False)
0.7679591223324316
{'mean_fit_time': array([ 6.77908766,  8.76840155,  9.47484188, 10.06327558]), 'std_fit_time': array([0.27612991, 0.64582644, 0.7413639 , 0.42569193]), 'mean_score_time': array([0.06620388, 0.06870403, 0.06930397, 0.07360418]), 'std_score_time': array([0.00667564, 0.01185856, 0.01162008, 0.01672328]), 'param_max_leaf_nodes': masked_array(data=[10, 20, 30, 40],
             mask=[False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'max_leaf_nodes': 10}, {'max_leaf_nodes': 20}, {'max_leaf_nodes': 30}, {'max_leaf_nodes

In [5]:
score = rf_max_leaf_nodesf_gridcv.score(x_test, y_test)
print(score)

0.747021723896286


In [6]:
joblib.dump(rf_max_leaf_nodesf_gridcv, 'rf_max_leaf_nodesf_gridcv.m')

t_end = time.time()
print('rf_max_leaf_nodesf_gridcv 训练结束,耗时:{}min'.format((t_end - t_start) / 60))

rf_max_leaf_nodesf_gridcv 训练结束,耗时:6.275425597031911min


## 结论

### 这个的效果会比较明朗，随着_max_leaf_nodes增加，精确度会提高，但并未达到上限