In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

print('------------start------------')

------------start------------


In [2]:
train = pd.read_csv('/Applications/py_workspace/data_mining/data/train_modified.csv')
target = 'Disbursed'  #Disbursed的值就是二分类的输出
IDcol = 'ID'
train['Disbursed'].value_counts()

0    19680
1      320
Name: Disbursed, dtype: int64

In [5]:
#可见样本分类严重不均匀,除了id列，所有的列都是样本特征
x_columns = [x for x in train.columns if x not in [target, IDcol]]
X = train[x_columns]
y = train['Disbursed']

In [6]:
#先用默认参数做一次训练
gbm0 = GradientBoostingClassifier(random_state=10)
gbm0.fit(X,y)
y_pred = gbm0.predict(X)
y_predprob = gbm0.predict_proba(X)[:,1]
print('Accuracy: %0.4f' % metrics.accuracy_score(y.values, y_pred))
print('auc score(train): %f' %metrics.roc_auc_score(y, y_predprob))

Accuracy: 0.9852
auc score(train): 0.900531


In [9]:
#开始调参，优化训练，先从步长和迭代次数开始，采用gridsearchcv，从小达到网格搜索，将learning_rate设为0.1
#迭代次数 n_estimator, 弱学习器的个数，个数太少，容易欠拟合，太多容易过拟合，默认为100
# learning_rate: 弱学习器的权重缩减系数，即步长，正则化项，防止过拟合，较小的步长则需要更多的迭代次数，通常和n_estimator一起调整
# min_samples_split : 这个值限制了子树继续划分的次数，当某个节点的样本数少于min_samples_split时，停止划分
# min_samples_leaf: 叶子结点最少的样本数，如果某叶子结点数小于这个值，则会被剪枝，默认为1， 样本树较大时，增大这个值
# max_features : 划分时考虑的最大特征树, 可以使用很多种类型的值，控制决策树的生成时间
# subsample：子采用比例
param_test1 = {'n_estimators': range(20, 81, 10)}
gbm1 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate = 0.1,  min_samples_split=300,
                                  min_samples_leaf=20,max_depth=8,max_features='sqrt', subsample=0.8,random_state=10), param_grid= param_test1, scoring='roc_auc', iid= False, cv=5 )
gbm1.fit(X, y)
gbm1.best_params_, gbm1.best_score_




({'n_estimators': 40}, 0.8145309324186991)

In [10]:
# 调整决策树的参数，对决策树的最大深度ma_depth 和内部结点再划分所需最小样本树进行网格搜索

param_test2 = {'max_depth': range(3,14,2), 'min_samples_split': range(100, 801, 200)}

gbm2 = GridSearchCV(estimator= GradientBoostingClassifier(n_estimators= 40, learning_rate= 0.1, min_samples_leaf=20, max_features='sqrt', subsample=0.8, random_state=10), param_grid=param_test2, iid=False, cv=5)

gbm2.fit(X, y)
gbm2.cv_results_, gbm2.best_params_, gbm2.best_score_

({'mean_fit_time': array([0.22730985, 0.2181242 , 0.21406155, 0.21741529, 0.30348263,
         0.29530864, 0.29138165, 0.28919148, 0.38285651, 0.36578426,
         0.35447736, 0.34656487, 0.46921711, 0.42896695, 0.40579576,
         0.38906217, 0.53773556, 0.47845616, 0.44556098, 0.42233095,
         0.59803023, 0.50959234, 0.47933025, 0.4334208 ]),
  'std_fit_time': array([0.01446254, 0.00369881, 0.00110199, 0.00148393, 0.00201615,
         0.00189224, 0.00285452, 0.00322006, 0.00432313, 0.00295606,
         0.00212676, 0.00307929, 0.00461129, 0.00522346, 0.00294563,
         0.00229689, 0.00541514, 0.00255906, 0.00781387, 0.00402202,
         0.00986126, 0.00643464, 0.01329363, 0.00549807]),
  'mean_score_time': array([0.00506563, 0.00497694, 0.00518098, 0.00527287, 0.00662603,
         0.00582681, 0.00612812, 0.00613432, 0.0074677 , 0.00775104,
         0.00763683, 0.00737038, 0.00900726, 0.00881925, 0.00791292,
         0.00863433, 0.01010642, 0.00956855, 0.00916805, 0.00923438,
  

In [12]:
#最大深度是3，由于叶子节点的划分还和决策树的其他指标有关，因此，先不做确认
param_test3 = {'min_samples_split':range(800,1900,200), 'min_samples_leaf':range(60,101,10)}
gbm3 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1, n_estimators=40,max_depth=3,
                                     max_features='sqrt', subsample=0.8, random_state=10), 
                       param_grid = param_test3, scoring='roc_auc',iid=False, cv=5)
gbm3.fit(X,y)
gbm3.cv_results_, gbm3.best_params_, gbm3.best_score_

({'mean_fit_time': array([0.22166729, 0.21133714, 0.20978651, 0.20958757, 0.21032152,
         0.21053214, 0.21085687, 0.21102891, 0.21864595, 0.21742163,
         0.21753922, 0.2160871 , 0.2202188 , 0.21830788, 0.22087827,
         0.21463389, 0.21873069, 0.21540976, 0.21838326, 0.21990676,
         0.2178792 , 0.21853156, 0.21745596, 0.21676245, 0.2184711 ,
         0.21941423, 0.2189116 , 0.21889458, 0.21893268, 0.21587968]),
  'std_fit_time': array([0.01355302, 0.00076657, 0.00121727, 0.00146292, 0.00086583,
         0.00186807, 0.00068864, 0.00052917, 0.00183057, 0.0004226 ,
         0.0015694 , 0.00175484, 0.00188206, 0.00113489, 0.00292743,
         0.00507548, 0.00135179, 0.00355505, 0.00120116, 0.00250634,
         0.00185482, 0.00238392, 0.00130623, 0.00074965, 0.00211425,
         0.00084706, 0.00239854, 0.00104598, 0.00212828, 0.00082923]),
  'mean_score_time': array([0.00649691, 0.00652828, 0.00625606, 0.00614395, 0.00633559,
         0.00633922, 0.00613837, 0.00657387, 0.

In [13]:
gbm4 = GradientBoostingClassifier(learning_rate=0.1, n_estimators=40,max_depth=3, min_samples_leaf =100, 
               min_samples_split =1400, max_features='sqrt', subsample=0.8, random_state=10)
gbm4.fit(X,y)
y_pred = gbm4.predict(X)
y_predprob = gbm4.predict_proba(X)[:,1]
print ("Accuracy : %.4g" % metrics.accuracy_score(y.values, y_pred))
print ("AUC Score (Train): %f" % metrics.roc_auc_score(y, y_predprob))

Accuracy : 0.984
AUC Score (Train): 0.836524


In [15]:
#调参后准确率有所下降，主要是我们只用了80%的样本，接下来继续调整max_features

param_test4 = {'max_features': range(7, 20, 2)}
gbm5 = GridSearchCV(estimator=GradientBoostingClassifier(n_estimators=40, min_samples_leaf= 100, learning_rate=0.1, min_samples_split=1400, subsample=0.8, random_state=10), param_grid=param_test4, iid=False, cv=5)
gbm5.fit(X,y)
gbm5.best_params_, gbm5.best_score_

({'max_features': 7}, 0.984)

In [16]:
# 网格搜索subsample

param_test5 = {'subsample':[0.6,0.7,0.75,0.8,0.85,0.9]}
gbm6 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1, n_estimators=40,max_depth=3, min_samples_leaf =100, 
               min_samples_split =1400, max_features=3, random_state=10), 
                       param_grid = param_test5, scoring='roc_auc',iid=False, cv=5)
gbm6.fit(X,y)
gbm6.best_params_, gbm6.best_score_

({'subsample': 0.9}, 0.8033639799288619)

In [17]:
#减小步长，增大迭代次数
gbm6 = GradientBoostingClassifier(learning_rate=0.05, n_estimators=80,max_depth=3, min_samples_leaf =100, 
               min_samples_split =1400, max_features='sqrt', subsample=0.9, random_state=10)
gbm6.fit(X,y)
y_pred = gbm6.predict(X)
y_predprob = gbm6.predict_proba(X)[:,1]
print ("Accuracy : %.4g" % metrics.accuracy_score(y.values, y_pred))
print ("AUC Score (Train): %f" % metrics.roc_auc_score(y, y_predprob))

Accuracy : 0.984
AUC Score (Train): 0.838592


In [20]:
#减小步长，增大迭代次数
gbm6 = GradientBoostingClassifier(learning_rate=0.01, n_estimators=320,max_depth=3, min_samples_leaf =100, 
               min_samples_split =1400, max_features='sqrt', subsample=0.9, random_state=10)
gbm6.fit(X,y)
y_pred = gbm6.predict(X)
y_predprob = gbm6.predict_proba(X)[:,1]
print ("Accuracy : %.4g" % metrics.accuracy_score(y.values, y_pred))
print ("AUC Score (Train): %f" % metrics.roc_auc_score(y, y_predprob))

Accuracy : 0.984
AUC Score (Train): 0.839670


In [21]:
#减小步长，增大迭代次数
gbm7 = GradientBoostingClassifier(learning_rate=0.01, n_estimators=400,max_depth=3, min_samples_leaf =100, 
               min_samples_split =1400, max_features='sqrt', subsample=0.9, random_state=10)
gbm7.fit(X,y)
y_pred = gbm7.predict(X)
y_predprob = gbm7.predict_proba(X)[:,1]
print ("Accuracy : %.4g" % metrics.accuracy_score(y.values, y_pred))
print ("AUC Score (Train): %f" % metrics.roc_auc_score(y, y_predprob))

Accuracy : 0.984
AUC Score (Train): 0.844555


In [None]:
#可以继续调整步长和迭代次数，但是感觉这次训练从找到的最有max_depth=3时就有问题了，不再继续训练，总体思路基本如上