In [1]:
#http://www.jianshu.com/p/02cfaae3fd01

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# 导入数据
X_train, X_test, y_train, y_test = train_test_split(load_boston().data, load_boston().target, test_size=0.2)


"""初始化算法，设置参数

一些主要参数
loss: 损失函数，GBDT回归器可选'ls', 'lad', 'huber', 'quantile'。
learning_rate: 学习率/步长。
n_estimators: 迭代次数，和learning_rate存在trade-off关系。
criterion: 衡量分裂质量的公式，一般默认即可。
subsample: 样本采样比例。
max_features: 最大特征数或比例。

决策树相关参数包括max_depth, min_samples_split, min_samples_leaf, min_weight_fraction_leaf, max_leaf_nodes, min_impurity_split, 多数用来设定决策树分裂停止条件。

verbose: 日志level。
具体说明和其它参数请参考官网API。
"""
reg_model = GradientBoostingRegressor(
    loss='ls',
    learning_rate=0.02,
    n_estimators=200,
    subsample=0.8,
    max_features=0.8,
    max_depth=3,
    verbose=2
)

# 训练模型
reg_model.fit(X_train, y_train)

# 评估模型
prediction_train = reg_model.predict(X_train)
rmse_train = mean_squared_error(y_train, prediction_train)
prediction_test = reg_model.predict(X_test)
rmse_test = mean_squared_error(y_test, prediction_test)
print "RMSE for training dataset is %f, for testing dataset is %f." % (rmse_train, rmse_test)
"""
Output:
RMSE for training dataset is 4.239157, for testing dataset is 10.749044.
"""


      Iter       Train Loss      OOB Improve   Remaining Time 
         1          78.2922           2.8162            0.70s
         2          81.7426           2.0762            0.43s
         3          74.3407           2.5626            0.32s
         4          73.8844           2.5643            0.26s
         5          70.8368           2.1406            0.23s
         6          70.1122           1.6746            0.20s
         7          61.0798           2.5003            0.19s
         8          67.7408           1.3437            0.17s
         9          61.0342           2.2634            0.16s
        10          55.4914           2.5608            0.16s
        11          59.4668           1.8777            0.15s
        12          58.2374           1.4695            0.15s
        13          51.6274           1.7085            0.14s
        14          54.1443           1.4752            0.14s
        15          51.5884           1.1835            0.13s
       

'Output:\nRMSE for training dataset is 4.239157, for testing dataset is 10.749044.\n'

In [4]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import load_iris
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

# 导入数据
X_train, X_test, y_train, y_test = train_test_split(load_iris().data, load_iris().target, test_size=0.2)


"""初始化算法，设置参数

一些主要参数
loss: 损失函数，GBDT分类器可选'deviance', 'exponential'。
learning_rate: 学习率/步长。
n_estimators: 迭代次数，和learning_rate存在trade-off关系。
criterion: 衡量分裂质量的公式，一般默认即可。
subsample: 样本采样比例。
max_features: 最大特征数或比例。

决策树相关参数包括max_depth, min_samples_split, min_samples_leaf, min_weight_fraction_leaf, max_leaf_nodes, min_impurity_split, 多数用来设定决策树分裂停止条件。

verbose: 日志level。
具体说明和其它参数请参考官网API。
"""
clf_model = GradientBoostingClassifier(
    loss='deviance',
    learning_rate=0.01,
    n_estimators=50,
    subsample=0.8,
    max_features=1,
    max_depth=3,
    verbose=2
)

# 训练模型
clf_model.fit(X_train, y_train)

# 评估模型
prediction_train = clf_model.predict(X_train)
cm_train = confusion_matrix(y_train, prediction_train)
prediction_test = clf_model.predict(X_test)
cm_test = confusion_matrix(y_test, prediction_test)
print "Confusion matrix for training dataset is \n%s\n for testing dataset is \n%s." % (cm_train, cm_test)
"""Output:
Confusion matrix for training dataset is 
[[40  0  0]
 [ 0 40  1]
 [ 0  1 38]]
 for testing dataset is 
[[10  0  0]
 [ 0  8  1]
 [ 0  0 11]].
"""


      Iter       Train Loss      OOB Improve   Remaining Time 
         1         103.6549           0.4223            0.07s
         2         102.0569           0.3399            0.07s
         3         100.6466           0.3734            0.07s
         4          98.7931           0.4003            0.06s
         5          97.4915           0.3363            0.07s
         6          95.8733           0.3883            0.07s
         7          94.4909           0.3542            0.07s
         8          93.3805           0.2342            0.06s
         9          91.9886           0.3094            0.06s
        10          90.6541           0.3563            0.06s
        11          89.2891           0.3070            0.06s
        12          88.1824           0.3236            0.06s
        13          86.5349           0.3320            0.06s
        14          85.5641           0.3242            0.05s
        15          83.9235           0.2834            0.05s
       

'Output:\nConfusion matrix for training dataset is \n[[40  0  0]\n [ 0 40  1]\n [ 0  1 38]]\n for testing dataset is \n[[10  0  0]\n [ 0  8  1]\n [ 0  0 11]].\n'