In [1]:
# 导入常用包
import xgboost as xgb
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import SelectFromModel

In [2]:
# Load the scikit-learn breast-cancer dataset and pull out the
# feature matrix (X) and the label vector (Y).
cancer = datasets.load_breast_cancer()
X, Y = cancer.data, cancer.target

In [3]:
# Dataset overview (uncomment one of the lines below to inspect shapes / raw arrays)
# X.shape
# Y.shape
# X, Y

In [4]:
# Hold out one fifth of the samples as the test set; the fixed
# random_state keeps the split reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=8,
)

In [5]:
# Wrap each split in an XGBoost DMatrix with its labels attached.
xgb_test = xgb.DMatrix(data=X_test, label=Y_test)
xgb_train = xgb.DMatrix(data=X_train, label=Y_train)

In [6]:
# Booster configuration passed to xgb.train.
params = dict(
    objective="binary:logistic",  # binary classification, probability output
    booster="gbtree",             # tree-based booster
    eta=1,                        # learning rate (shrinkage applied per round)
    max_depth=3,                  # maximum depth of each tree
    eval_metric="auc",            # metric reported for every watchlist entry
)

In [7]:
# Datasets evaluated after every boosting round, each paired with the
# label that prefixes its metric in the training log.
watchlist = [
    (xgb_test, "eval"),
    (xgb_train, "train"),
]

In [8]:
# Train for 20 boosting rounds, logging the eval metric on every
# watchlist entry after each round.
bst = xgb.train(
    params=params,
    dtrain=xgb_train,
    num_boost_round=20,
    evals=watchlist,
)

[0]	eval-auc:0.96116	train-auc:0.97537
[1]	eval-auc:0.98306	train-auc:0.99491
[2]	eval-auc:0.99536	train-auc:0.99795
[3]	eval-auc:0.99521	train-auc:0.99932
[4]	eval-auc:0.99680	train-auc:0.99987
[5]	eval-auc:0.99776	train-auc:0.99994
[6]	eval-auc:0.99744	train-auc:1.00000
[7]	eval-auc:0.99808	train-auc:1.00000
[8]	eval-auc:0.99808	train-auc:1.00000
[9]	eval-auc:0.99744	train-auc:1.00000
[10]	eval-auc:0.99808	train-auc:1.00000
[11]	eval-auc:0.99776	train-auc:1.00000
[12]	eval-auc:0.99712	train-auc:1.00000
[13]	eval-auc:0.99776	train-auc:1.00000
[14]	eval-auc:0.99808	train-auc:1.00000
[15]	eval-auc:0.99808	train-auc:1.00000
[16]	eval-auc:0.99744	train-auc:1.00000
[17]	eval-auc:0.99744	train-auc:1.00000
[18]	eval-auc:0.99776	train-auc:1.00000
[19]	eval-auc:0.99776	train-auc:1.00000


# 通过前n棵树进行预测 和 通过全部树进行预测 的比较

In [9]:
# Predict using only the first 10 boosting rounds.
# `iteration_range=(0, 10)` selects the half-open interval [0, 10) of rounds.
# It replaces `ntree_limit`, which was deprecated in XGBoost 1.4 and removed
# in 2.0 (passing it there raises a TypeError). With one tree per round, as in
# this binary gbtree model, both select the same 10 trees.
pred_1 = bst.predict(xgb_test, iteration_range=(0, 10))

# AUC on the test set for the 10-round prediction
roc_auc_score(Y_test, pred_1)

0.9974424552429668

In [10]:
# Predict with every tree in the trained booster.
pred_2 = bst.predict(data=xgb_test)

# AUC on the test set for the full-model prediction; it differs from the
# AUC obtained with only the first 10 trees.
roc_auc_score(y_true=Y_test, y_score=pred_2)

0.9977621483375958

# 预测叶子节点索引

In [11]:
# Predict leaf indices using the first 10 trees.
# Number of leaf-index columns = number of trees used for prediction.
# `iteration_range=(0, 10)` replaces `ntree_limit`, which was deprecated in
# XGBoost 1.4 and removed in 2.0; it selects boosting rounds [0, 10).

leaf_index = bst.predict(xgb_test, iteration_range=(0, 10), pred_leaf=True)
print(leaf_index.shape)
print(leaf_index)

(114, 10)
[[7 7 8 ... 1 1 3]
 [7 8 8 ... 1 1 3]
 [7 7 7 ... 3 1 3]
 ...
 [7 7 8 ... 1 1 4]
 [7 7 7 ... 3 1 3]
 [7 8 8 ... 1 1 3]]


In [12]:
# Predict leaf indices with the whole model (all 20 trees).
# Number of leaf-index columns = number of trees used for prediction.

leaf_index = bst.predict(data=xgb_test, pred_leaf=True)
for shown in (leaf_index.shape, leaf_index):
    print(shown)

(114, 20)
[[7 7 8 ... 2 1 1]
 [7 8 8 ... 2 1 2]
 [7 7 7 ... 1 1 1]
 ...
 [7 7 8 ... 2 1 1]
 [7 7 7 ... 1 1 2]
 [7 8 8 ... 2 1 2]]
