In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import xgboost as xgb
from preprocessing import *

## 1. 获取训练数据

In [2]:
train, test, features = read_test_train()

Read people.csv...
Load train.csv...
Load test.csv...
Process tables...
Merge...


## 2. 训练模型

In [None]:
#### 直接利用特征进行xgboost的构造

target = 'outcome'
random_state = 0
eta = 0.3
max_depth = 5
subsample = 0.5
colsample_bytree = 1
start_time = time.time()

print('XGBoost params. ETA: {}, MAX_DEPTH: {}, SUBSAMPLE: {}, COLSAMPLE_BY_TREE: {}'.format(eta, max_depth, subsample, colsample_bytree))
params = {
    "objective": "binary:logistic",
    "booster" : "gbtree",
    "eval_metric": "auc",
    "eta": eta,
    "tree_method": 'exact',
    "max_depth": max_depth,
    "subsample": subsample,
    "colsample_bytree": colsample_bytree,
    "silent": 1,
    "seed": random_state,
}
num_boost_round = 900
early_stopping_rounds = 10
test_size = 0.5

X_train, X_valid = train_test_split(train, test_size=test_size, random_state=random_state)
# X_train = X_train.sample(20000) # 数据量较大挑选部分数据
# X_valid = X_valid.sample(10000) # 数据量较大挑选部分数据
print('Length train:', len(X_train.index))
print('Length valid:', len(X_valid.index))
y_train = X_train[target]
y_valid = X_valid[target]
dtrain = xgb.DMatrix(X_train[features], y_train)
dvalid = xgb.DMatrix(X_valid[features], y_valid)

watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=early_stopping_rounds, verbose_eval=True)

print("Validating...")
check = gbm.predict(xgb.DMatrix(X_valid[features]), ntree_limit=gbm.best_iteration+1)
score = roc_auc_score(X_valid[target].values, check)
print('Check error value: {:.6f}'.format(score))

XGBoost params. ETA: 0.3, MAX_DEPTH: 5, SUBSAMPLE: 0.5, COLSAMPLE_BY_TREE: 1
Length train: 1098645
Length valid: 1098646
[0]	train-auc:0.914297	eval-auc:0.913981
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 10 rounds.
[1]	train-auc:0.920316	eval-auc:0.920048
[2]	train-auc:0.923772	eval-auc:0.923377
[3]	train-auc:0.925308	eval-auc:0.924991
[4]	train-auc:0.926667	eval-auc:0.926345
[5]	train-auc:0.928248	eval-auc:0.927909
[6]	train-auc:0.930125	eval-auc:0.929885
[7]	train-auc:0.931899	eval-auc:0.931652
[8]	train-auc:0.93319	eval-auc:0.932924
[9]	train-auc:0.933634	eval-auc:0.933403
[10]	train-auc:0.934548	eval-auc:0.934319
[11]	train-auc:0.935871	eval-auc:0.935621
[12]	train-auc:0.936683	eval-auc:0.936424
[13]	train-auc:0.937376	eval-auc:0.93711
[14]	train-auc:0.938037	eval-auc:0.937733
[15]	train-auc:0.938984	eval-auc:0.938662
[16]	train-auc:0.939545	eval-auc:0.939206
[17]	train-auc:0.940203	eval-auc:0.9

### 利用全量数据，合适参数进行训练

In [22]:
target = 'outcome'
random_state = 0
eta = 0.3
max_depth = 8
subsample = 0.5
colsample_bytree = 1
start_time = time.time()

print('XGBoost params. ETA: {}, MAX_DEPTH: {}, SUBSAMPLE: {}, COLSAMPLE_BY_TREE: {}'.format(eta, max_depth, subsample, colsample_bytree))
params = {
    "objective": "binary:logistic",
    "booster" : "gbtree",
    "eval_metric": "auc",
    "eta": eta,
    "tree_method": 'exact',
    "max_depth": max_depth,
    "subsample": subsample,
    "colsample_bytree": colsample_bytree,
    "silent": 1,
    "seed": random_state,
}
num_boost_round = 465
early_stopping_rounds = 10
test_size = 0.5

# X_train, X_valid = train_test_split(train, test_size=test_size, random_state=random_state)
X_train = X_train.sample(200000) # 数据量较大挑选部分数据
X_valid = X_valid.sample(100000) # 数据量较大挑选部分数据
print('Length train:', len(X_train.index))
print('Length valid:', len(X_valid.index))
y_train = X_train[target]
y_valid = X_valid[target]
dtrain = xgb.DMatrix(X_train[features], y_train)
dvalid = xgb.DMatrix(X_valid[features], y_valid)

watchlist = [(dtrain, 'train')]
gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=early_stopping_rounds, verbose_eval=True)

XGBoost params. ETA: 0.3, MAX_DEPTH: 8, SUBSAMPLE: 0.5, COLSAMPLE_BY_TREE: 1
Length train: 200000
Length valid: 100000
[0]	train-auc:0.931049
Will train until train-auc hasn't improved in 10 rounds.
[1]	train-auc:0.937465
[2]	train-auc:0.943374
[3]	train-auc:0.945279
[4]	train-auc:0.947434
[5]	train-auc:0.949003
[6]	train-auc:0.952291
[7]	train-auc:0.953214
[8]	train-auc:0.95423
[9]	train-auc:0.955256
[10]	train-auc:0.956974
[11]	train-auc:0.957965
[12]	train-auc:0.959535
[13]	train-auc:0.960425
[14]	train-auc:0.96087
[15]	train-auc:0.962182
[16]	train-auc:0.963014
[17]	train-auc:0.964198
[18]	train-auc:0.965488
[19]	train-auc:0.966445
[20]	train-auc:0.967798
[21]	train-auc:0.968202
[22]	train-auc:0.969522
[23]	train-auc:0.97078
[24]	train-auc:0.97165
[25]	train-auc:0.97212
[26]	train-auc:0.972788
[27]	train-auc:0.973315
[28]	train-auc:0.974395
[29]	train-auc:0.975147
[30]	train-auc:0.975858
[31]	train-auc:0.976469
[32]	train-auc:0.977293
[33]	train-auc:0.978305
[34]	train-auc:0.978896

[327]	train-auc:0.999733
[328]	train-auc:0.999738
[329]	train-auc:0.99974
[330]	train-auc:0.999747
[331]	train-auc:0.999753
[332]	train-auc:0.999755
[333]	train-auc:0.999758
[334]	train-auc:0.999762
[335]	train-auc:0.999764
[336]	train-auc:0.999767
[337]	train-auc:0.999768
[338]	train-auc:0.99977
[339]	train-auc:0.999775
[340]	train-auc:0.999777
[341]	train-auc:0.999781
[342]	train-auc:0.999786
[343]	train-auc:0.999788
[344]	train-auc:0.99979
[345]	train-auc:0.999795
[346]	train-auc:0.999796
[347]	train-auc:0.9998
[348]	train-auc:0.999803
[349]	train-auc:0.999806
[350]	train-auc:0.999809
[351]	train-auc:0.999812
[352]	train-auc:0.999816
[353]	train-auc:0.999818
[354]	train-auc:0.999823
[355]	train-auc:0.999823
[356]	train-auc:0.999828
[357]	train-auc:0.999832
[358]	train-auc:0.999835
[359]	train-auc:0.999837
[360]	train-auc:0.999841
[361]	train-auc:0.999842
[362]	train-auc:0.999844
[363]	train-auc:0.999846
[364]	train-auc:0.999849
[365]	train-auc:0.999853
[366]	train-auc:0.999853
[367]

## 3. 保存模型

In [20]:
gbm.save_model('redhat_bussiness_verone_20181130_2.model')

In [None]:
enc # one hot
enc.fit_transform(Xtrain)
enc.transform(Xtest)

In [None]:
alldata
xtrain
xtest

In [None]:
enc.fit(alldata)
enc.transform(xtrain)
enc.transform(xtest)