In [1]:
%matplotlib inline
import numpy as np
import xgboost as xgb
import pandas as pd

# 基本的使用

## 二分类

In [132]:
dtrain = xgb.DMatrix('agaricus.txt.train')

In [133]:
dtest = xgb.DMatrix('agaricus.txt.test')

In [134]:
param = {'max_detph': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'}

### 设定验证集，输出每一次迭代表现

In [135]:
watchlist = [(dtest, 'eval'), (dtrain, 'train')]

In [136]:
num_boost_round = 5

In [137]:
bst = xgb.train(param, dtrain, num_boost_round, watchlist)

[0]	eval-error:0	train-error:0.000614
[1]	eval-error:0	train-error:0
[2]	eval-error:0	train-error:0
[3]	eval-error:0	train-error:0
[4]	eval-error:0	train-error:0


### 保存每一次迭代的训练集和验证集结果

In [64]:
# 使用train()中的evals_result参数
res2 = {}
xgb.train(param, dtrain, num_round, watchlist, evals_result=res2)

[0]	eval-error:0	train-error:0.000614
[1]	eval-error:0	train-error:0
[2]	eval-error:0	train-error:0
[3]	eval-error:0	train-error:0
[4]	eval-error:0	train-error:0


<xgboost.core.Booster at 0x103bed590>

In [65]:
for key, value in res2.iteritems():
    print key, value

train {'error': [0.000614, 0.0, 0.0, 0.0, 0.0]}
eval {'error': [0.0, 0.0, 0.0, 0.0, 0.0]}


### 进行预测

In [76]:
# 得到概率值
preds = bst.predict(dtest)
print preds

[ 0.0033117   0.99624288  0.0033117  ...,  0.99777812  0.00309621
  0.99777812]


In [77]:
# 得到测试集中的y值
labels = dtest.get_label()

In [79]:
print sum(1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i]) / float(len(preds))

0.0


### 保存模型

In [80]:
bst.save_model('xgb.model')

In [81]:
dtest.save_binary('dtest.buffer')

### 读取模型和数据

In [82]:
bst2 = xgb.Booster(model_file='xgb.model')

In [83]:
dtest2 = xgb.DMatrix('dtest.buffer')

In [84]:
preds2 = bst2.predict(dtest2)

In [85]:
assert np.sum(np.abs(preds2 - preds)) == 0

## 多分类

In [86]:
dtrain = xgb.DMatrix('agaricus.txt.train')
dtest = xgb.DMatrix('agaricus.txt.test')

In [87]:
# 通过num_class参数设定多分类的类别数即可
param = {'max_depth': 2, 'eta': 1, 'silent': 1, 'num_class': 2}

In [88]:
watchlist = [(dtest, 'eval'), (dtrain, 'train')]

In [89]:
num_boost_round = 5

In [90]:
bst = xgb.train(param, dtrain, num_boost_round, watchlist)

[0]	eval-merror:0.042831	train-merror:0.046522
[1]	eval-merror:0.021726	train-merror:0.022263
[2]	eval-merror:0.006207	train-merror:0.007063
[3]	eval-merror:0.018001	train-merror:0.0152
[4]	eval-merror:0.006207	train-merror:0.007063


### 预测

In [96]:
# 多分类，得到类别值
preds = bst.predict(dtest)

print preds

[ 0.  1.  0. ...,  1.  0.  1.]


In [97]:
labels = dtest.get_label()

In [98]:
err = sum(1 for i in range(len(preds)) if preds[i] != labels[i]) / float(len(preds))

In [99]:
print err

0.00620732464308


### 保存模型和数据

In [100]:
dtest.save_binary('dtest.buffer')
bst.save_model('xgb.model')

### 从本地读取模型和数据

In [101]:
bst2 = xgb.Booster(model_file='xgb.model')

In [102]:
dtest2 = xgb.DMatrix('dtest.buffer')

In [103]:
preds2 = bst2.predict(dtest2)

In [104]:
assert np.sum(np.abs(preds2 - preds)) == 0

## DMatrix基本使用

In [105]:
data = np.random.randn(5, 5)

In [106]:
data

array([[-0.1440365 , -1.53901335,  1.52256072,  0.24961937,  0.00816078],
       [ 1.50922483,  1.61828149,  0.48580408,  0.64529202, -0.38986845],
       [ 1.6335635 , -0.63496332, -1.37183114,  0.70000331, -1.37422337],
       [ 0.45820332, -1.45275691, -0.31891843, -0.63120163, -0.50904278],
       [ 1.54513591, -0.21328514,  1.57111063, -0.32387795,  1.64972363]])

In [107]:
dm = xgb.DMatrix(data)

In [108]:
dm.feature_names = list('abcde')

In [109]:
assert dm.feature_names == list('abcde')

In [110]:
dm.feature_types = 'q'

In [111]:
assert dm.feature_types == list('qqqqq')

In [114]:
dm.feature_types = list('qiqiq')

In [121]:
# All feature_names must be {int, float, i, q}
dm.feature_types = 'qi' 

ValueError: All feature_names must be {int, float, i, q}

### 特征名字

In [138]:
data = np.random.randn(100, 5)

In [139]:
target = np.array([0, 1] * 50)

In [140]:
cases = [['Feature1', 'Feature2', 'Feature3', 'Feature4', 'Feature5'],
         [u'特征1', u'特征2', u'特征3', u'特征4', u'特征5']]

In [145]:
for features in cases:
    dm = xgb.DMatrix(data, label=target,
                     feature_names=features)
    
    params = {'objective': 'multi:softprob',
              'eval_metric': 'mlogloss',
              'eta': 0.3,
              'num_class': 2}
    bst = xgb.train(params, dm, num_boost_round=10)
    scores = bst.get_fscore() # 得到特征的重要性
    print type(scores)
    for feature in list(sorted(k for k in scores)):
        print feature
    dummy = np.random.randn(5, 5)
    dm = xgb.DMatrix(dummy, feature_names=features)
    bst.predict(dm)
    
    
    

<type 'dict'>
Feature1
Feature2
Feature3
Feature4
Feature5
<type 'dict'>
特征1
特征2
特征3
特征4
特征5


### 特征重要性

In [147]:
data = np.random.randn(100, 5)
target = np.array([0, 1]*50)

In [148]:
features = ['Feature1', 'Feature2', 'Feature3', 'Feature4', 'Feature5']

In [149]:
dm = xgb.DMatrix(data, label=target,
                 feature_names=features)

In [150]:
params = {'objective': 'multi:softprob',
          'eval_metric': 'mlogloss',
          'eta': 0.3,
          'num_class': 2}

In [151]:
bst = xgb.train(params, dm, num_boost_round=20)

In [152]:
scores1 = bst.get_score()

In [153]:
scores2 = bst.get_score(importance_type='weight')

In [154]:
scores3 = bst.get_score(importance_type='cover')

In [155]:
scores4 = bst.get_score(importance_type='gain')

In [156]:
fscores = bst.get_fscore()

In [157]:
assert scores1 == fscores

## 交叉验证

In [158]:
dm = xgb.DMatrix('agaricus.txt.train')

In [159]:
params = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'}

In [169]:
# 只需要向cv方法提供训练集即可，内部会自动划分为训练集和验证集, 返回 dict
cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=False)

In [170]:
type(cv)

dict

In [171]:
len(cv)

4

In [172]:
for key, value in cv.iteritems():
    print key

train-error-std
test-error-mean
test-error-std
train-error-mean
