In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import xgboost as xgb

In [2]:
rng = np.random.RandomState(1994)

In [4]:
dtrain = xgb.DMatrix('agaricus.txt.train')
dtest = xgb.DMatrix('agaricus.txt.test')

### 使用 GBLinear作为booster

In [11]:
params = {'silent': 1, 'objective': 'binary:logistic',
          'booster': 'gblinear', 'alpha': 0.0001, 'lambda': 1}

In [12]:
watchlist = [(dtest, 'eval'), (dtrain, 'train')]

In [13]:
num_boost_round = 4

In [14]:
bst = xgb.train(params, dtrain, num_boost_round, watchlist)

[0]	eval-error:0.019863	train-error:0.015047
[1]	eval-error:0.005587	train-error:0.00476
[2]	eval-error:0	train-error:0.001842
[3]	eval-error:0	train-error:0.000614


In [15]:
preds = bst.predict(dtest)

### 使用 DART(Dropout + Gradient Boosting) 作为booster

In [16]:
params = {'max_depth': 5, 'objective': 'binary:logistic', 'booster': 'dart', 'silent': 1}

In [17]:
watchlist = [(dtest, 'eval'), (dtrain, 'train')]

In [18]:
num_boost_round = 5

In [19]:
bst = xgb.train(params, dtrain, num_boost_round, watchlist)

[0]	eval-error:0	train-error:0.000614
[1]	eval-error:0	train-error:0.001228
[2]	eval-error:0	train-error:0.000614
[3]	eval-error:0	train-error:0.000614
[4]	eval-error:0	train-error:0


In [20]:
preds = bst.predict(dtest, ntree_limit=num_boost_round)

In [21]:
labels = dtest.get_label()

In [22]:
print sum(1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i]) / float(len(preds))

0.0


In [23]:
params['learning_rate'] = 0.1
params['rate_drop'] = 0.1

In [24]:
preds_list = []

In [25]:
for p in [[p0, p1] for p0 in ['uniform', 'weighted'] for p1 in ['tree', 'forest']]:
    params['sample_type'] = p[0]
    params['normalize_type'] = p[1]
    bst = xgb.train(params, dtrain, num_boost_round, watchlist)
    preds = bst.predict(dtest, ntree_limit=num_boost_round)
    err = sum(1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
    preds_list.append(preds)

[0]	eval-error:0	train-error:0.000614
[1]	eval-error:0	train-error:0.001228
[2]	eval-error:0	train-error:0.001228
[3]	eval-error:0	train-error:0.001228
[4]	eval-error:0	train-error:0.001228
[0]	eval-error:0	train-error:0.000614
[1]	eval-error:0	train-error:0.001228
[2]	eval-error:0	train-error:0.001228
[3]	eval-error:0	train-error:0.001228
[4]	eval-error:0	train-error:0.001228
[0]	eval-error:0	train-error:0.000614
[1]	eval-error:0	train-error:0.001228
[2]	eval-error:0	train-error:0.001228
[3]	eval-error:0	train-error:0.001228
[4]	eval-error:0	train-error:0.001228
[0]	eval-error:0	train-error:0.000614
[1]	eval-error:0	train-error:0.001228
[2]	eval-error:0	train-error:0.001228
[3]	eval-error:0	train-error:0.001228
[4]	eval-error:0	train-error:0.001228


## 参数eta(shrinkage)

In [28]:
watchlist = [(dtest, 'eval'), (dtrain, 'train')]

In [29]:
num_boost_round = 5

In [30]:
params = {'max_detph': 2, 'eta': 0, 'silent': 1, 'objective': 'binary:logistic'}

In [31]:
evals_result ={}

In [37]:
# 将eta设置为0，看看学习率是否起作用, 为每一次迭代过程设置学习率
bst = xgb.train(params, dtrain, num_boost_round, watchlist, learning_rates=[0.8, 0.7, 0.6, 0.5, 0.4],
                evals_result=evals_result) #evals_result保存每一次迭代的计算结果

[0]	eval-error:0	train-error:0.000614
[1]	eval-error:0	train-error:0.000614
[2]	eval-error:0	train-error:0
[3]	eval-error:0	train-error:0
[4]	eval-error:0	train-error:0


In [38]:
eval_erros = list(map(float, evals_result['eval']['error']))

In [39]:
for err in eval_erros:
    print err

0.0
0.0
0.0
0.0
0.0


In [40]:
# 将learning_rate设置为0，看看learning_rate是否起作用
params = {'max_depth': 2, 'learning_rate': 0, 'silent': 1, 'objective': 'binary:logistic'}

In [41]:
evals_result = {}

In [42]:
bst = xgb.train(params, dtrain, num_boost_round, watchlist, learning_rates=[0.8, 0.7, 0.6, 0.5, 0.4],
                evals_result=evals_result)

[0]	eval-error:0.042831	train-error:0.046522
[1]	eval-error:0.036623	train-error:0.037617
[2]	eval-error:0.015518	train-error:0.013358
[3]	eval-error:0.027312	train-error:0.021495
[4]	eval-error:0.021105	train-error:0.015661


In [43]:
eval_erros = list(map(float, evals_result['eval']['error']))

In [45]:
# 错误在减小，起作用了
for error in eval_erros:
    print error

0.042831
0.036623
0.015518
0.027312
0.021105


In [46]:
params = {'max_depth': 2, 'silent': 1, 'objective': 'binary:logistic'}

In [47]:
evals_result = {}

In [48]:
bst = xgb.train(params, dtrain, num_boost_round, watchlist, learning_rates=[0,0,0,0,0],
                evals_result=evals_result)

[0]	eval-error:0.481688	train-error:0.482113
[1]	eval-error:0.481688	train-error:0.482113
[2]	eval-error:0.481688	train-error:0.482113
[3]	eval-error:0.481688	train-error:0.482113
[4]	eval-error:0.481688	train-error:0.482113


In [49]:
eval_errors = list(map(float, evals_result['eval']['error']))

In [51]:
# 学习率为0， 模型不照
for error in eval_errors:
    print error

0.481688
0.481688
0.481688
0.481688
0.481688


#### 自定义衰减函数作为learning_rates

In [53]:
# 使用自定义衰减函数作为learning_rates
def eta_decay(ithround, num_boost_round):
    return num_boost_round / (ithround + 1)

In [55]:
bst = xgb.train(params, dtrain, num_boost_round, watchlist, learning_rates=eta_decay)

[0]	eval-error:0.042831	train-error:0.046522
[1]	eval-error:0.042831	train-error:0.046522
[2]	eval-error:0.042831	train-error:0.046522
[3]	eval-error:0.042831	train-error:0.046522
[4]	eval-error:0.042831	train-error:0.046522


## 自定义损失函数

In [56]:
params = {'max_depth': 2, 'eta': 1, 'silent': 1}

In [57]:
watchlist = [(dtest, 'eval'), (dtrain, 'train')]

In [58]:
num_boost_round = 2

In [71]:
# 返回梯度和Hess矩阵
def logregobj(preds, dtrain):
    labels = dtrain.get_label()
    preds = 1.0 / (1.0 + np.exp(-preds))
    grad = preds - labels
    hess = preds * (1.0 - preds)
    return grad, hess

In [72]:
def evalerror(preds, dtrain):
    labels = dtrain.get_label()
    return 'error', float(sum(labels != (preds > 0.0))) / len(labels)

In [73]:
bst = xgb.train(params, dtrain, num_boost_round, watchlist, logregobj, evalerror)

[0]	eval-error:0.042831	train-error:0.046522
[1]	eval-error:0.021726	train-error:0.022263


In [74]:
preds = bst.predict(dtest)

In [75]:
labels = dtest.get_label()

In [76]:
# 结合交叉验证
xgb.cv(params, dtrain, num_boost_round, nfold=5, seed=0,
       obj=logregobj, feval=evalerror)

Unnamed: 0,test-error-mean,test-error-std,train-error-mean,train-error-std
0,0.05576,0.015827,0.050691,0.009194
1,0.021198,0.003813,0.021313,0.002075


In [77]:
# 测试train()中maximize 参数

In [78]:
def neg_evalerror(preds, dtrain):
    labels = dtrain.get_label()
    return 'error', float(sum(labels == (preds > 0.0))) / len(labels)

In [79]:
bst2 = xgb.train(params, dtrain, num_boost_round, watchlist, logregobj, neg_evalerror, maximize=True)

[0]	eval-error:0.957169	train-error:0.953478
[1]	eval-error:0.978274	train-error:0.977737


In [80]:
preds = bst2.predict(dtest)

### 同时使用多个评价指标

In [87]:
watchlist = [(dtest, 'eval'), (dtrain, 'train')]

In [88]:
params = {'max-depth': 2, 'eta': 0.2, 'silent': 1, 'objective': 'binary:logistic'}

In [89]:
params['eval_metric'] = ['auc', 'logloss', 'error']

In [90]:
evals_result = {}

In [91]:
bst = xgb.train(params, dtrain, num_boost_round, watchlist, evals_result=evals_result)

[0]	eval-auc:1	eval-logloss:0.514188	eval-error:0	train-auc:0.999238	train-logloss:0.514426	train-error:0.000614
[1]	eval-auc:1	eval-logloss:0.393842	eval-error:0	train-auc:0.999238	train-logloss:0.39443	train-error:0.001228


In [92]:
len(evals_result['eval'])

3