In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import xgboost as xgb
from preprocessing import *

## 1. 获取训练数据

In [2]:
train, test, features = read_test_train()

Read people.csv...
Load train.csv...
Load test.csv...
Process tables...
Merge...


## 2. 训练模型

In [5]:
#### 直接利用特征进行xgboost的构造

target = 'outcome'
random_state = 0
eta = 0.3
max_depth = 5
subsample = 0.5
colsample_bytree = 1
start_time = time.time()

print('XGBoost params. ETA: {}, MAX_DEPTH: {}, SUBSAMPLE: {}, COLSAMPLE_BY_TREE: {}'.format(eta, max_depth, subsample, colsample_bytree))
params = {
    "objective": "binary:logistic",
    "booster" : "gbtree",
    "eval_metric": "auc",
    "eta": eta,
    "tree_method": 'exact',
    "max_depth": max_depth,
    "subsample": subsample,
    "colsample_bytree": colsample_bytree,
    "silent": 1,
    "seed": random_state,
}
num_boost_round = 900
early_stopping_rounds = 10
test_size = 0.5

X_train, X_valid = train_test_split(train, test_size=test_size, random_state=random_state)
# X_train = X_train.sample(20000) # 数据量较大挑选部分数据
# X_valid = X_valid.sample(10000) # 数据量较大挑选部分数据
print('Length train:', len(X_train.index))
print('Length valid:', len(X_valid.index))
y_train = X_train[target]
y_valid = X_valid[target]
dtrain = xgb.DMatrix(X_train[features], y_train)
dvalid = xgb.DMatrix(X_valid[features], y_valid)

watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=early_stopping_rounds, verbose_eval=True)

print("Validating...")
check = gbm.predict(xgb.DMatrix(X_valid[features]), ntree_limit=gbm.best_iteration+1)
score = roc_auc_score(X_valid[target].values, check)
print('Check error value: {:.6f}'.format(score))

XGBoost params. ETA: 0.3, MAX_DEPTH: 5, SUBSAMPLE: 0.5, COLSAMPLE_BY_TREE: 1
Length train: 1098645
Length valid: 1098646
[0]	train-auc:0.914297	eval-auc:0.913981
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 10 rounds.
[1]	train-auc:0.920316	eval-auc:0.920048
[2]	train-auc:0.923772	eval-auc:0.923377
[3]	train-auc:0.925308	eval-auc:0.924991
[4]	train-auc:0.926667	eval-auc:0.926345
[5]	train-auc:0.928248	eval-auc:0.927909
[6]	train-auc:0.930125	eval-auc:0.929885
[7]	train-auc:0.931899	eval-auc:0.931652
[8]	train-auc:0.93319	eval-auc:0.932924
[9]	train-auc:0.933634	eval-auc:0.933403
[10]	train-auc:0.934548	eval-auc:0.934319
[11]	train-auc:0.935871	eval-auc:0.935621
[12]	train-auc:0.936683	eval-auc:0.936424
[13]	train-auc:0.937376	eval-auc:0.93711
[14]	train-auc:0.938037	eval-auc:0.937733
[15]	train-auc:0.938984	eval-auc:0.938662
[16]	train-auc:0.939545	eval-auc:0.939206
[17]	train-auc:0.940203	eval-auc:0.9

[188]	train-auc:0.979729	eval-auc:0.978268
[189]	train-auc:0.979768	eval-auc:0.978298
[190]	train-auc:0.979872	eval-auc:0.978395
[191]	train-auc:0.980035	eval-auc:0.978541
[192]	train-auc:0.980119	eval-auc:0.978622
[193]	train-auc:0.980157	eval-auc:0.978652
[194]	train-auc:0.980219	eval-auc:0.978715
[195]	train-auc:0.980251	eval-auc:0.978741
[196]	train-auc:0.980255	eval-auc:0.978735
[197]	train-auc:0.980337	eval-auc:0.978811
[198]	train-auc:0.980376	eval-auc:0.978843
[199]	train-auc:0.980482	eval-auc:0.978938
[200]	train-auc:0.980578	eval-auc:0.979038
[201]	train-auc:0.980629	eval-auc:0.979083
[202]	train-auc:0.980658	eval-auc:0.979125
[203]	train-auc:0.980725	eval-auc:0.979193
[204]	train-auc:0.980741	eval-auc:0.979208
[205]	train-auc:0.980759	eval-auc:0.979219
[206]	train-auc:0.980801	eval-auc:0.97926
[207]	train-auc:0.980866	eval-auc:0.979317
[208]	train-auc:0.980905	eval-auc:0.97934
[209]	train-auc:0.980946	eval-auc:0.979373
[210]	train-auc:0.980966	eval-auc:0.979392
[211]	train-a

[380]	train-auc:0.987917	eval-auc:0.986018
[381]	train-auc:0.98797	eval-auc:0.986067
[382]	train-auc:0.988009	eval-auc:0.986097
[383]	train-auc:0.988065	eval-auc:0.986161
[384]	train-auc:0.988078	eval-auc:0.986172
[385]	train-auc:0.988092	eval-auc:0.986179
[386]	train-auc:0.98816	eval-auc:0.986256
[387]	train-auc:0.98817	eval-auc:0.986257
[388]	train-auc:0.988187	eval-auc:0.986272
[389]	train-auc:0.988224	eval-auc:0.98631
[390]	train-auc:0.988236	eval-auc:0.986326
[391]	train-auc:0.988268	eval-auc:0.986362
[392]	train-auc:0.988311	eval-auc:0.986398
[393]	train-auc:0.988322	eval-auc:0.986405
[394]	train-auc:0.988387	eval-auc:0.986472
[395]	train-auc:0.988394	eval-auc:0.986481
[396]	train-auc:0.988412	eval-auc:0.986499
[397]	train-auc:0.988427	eval-auc:0.986508
[398]	train-auc:0.988436	eval-auc:0.986517
[399]	train-auc:0.988469	eval-auc:0.98655
[400]	train-auc:0.988489	eval-auc:0.986569
[401]	train-auc:0.988544	eval-auc:0.986623
[402]	train-auc:0.988569	eval-auc:0.986643
[403]	train-auc:

[572]	train-auc:0.991975	eval-auc:0.990012
[573]	train-auc:0.991998	eval-auc:0.990033
[574]	train-auc:0.992018	eval-auc:0.990057
[575]	train-auc:0.992035	eval-auc:0.99008
[576]	train-auc:0.992058	eval-auc:0.990106
[577]	train-auc:0.992057	eval-auc:0.990102
[578]	train-auc:0.992072	eval-auc:0.990116
[579]	train-auc:0.992091	eval-auc:0.990133
[580]	train-auc:0.992105	eval-auc:0.990147
[581]	train-auc:0.992136	eval-auc:0.990176
[582]	train-auc:0.99214	eval-auc:0.990179
[583]	train-auc:0.992154	eval-auc:0.990192
[584]	train-auc:0.992163	eval-auc:0.9902
[585]	train-auc:0.992172	eval-auc:0.990207
[586]	train-auc:0.992179	eval-auc:0.990215
[587]	train-auc:0.992195	eval-auc:0.99023
[588]	train-auc:0.992214	eval-auc:0.990251
[589]	train-auc:0.992234	eval-auc:0.990278
[590]	train-auc:0.99226	eval-auc:0.990308
[591]	train-auc:0.99227	eval-auc:0.990319
[592]	train-auc:0.992285	eval-auc:0.990337
[593]	train-auc:0.992301	eval-auc:0.990355
[594]	train-auc:0.992306	eval-auc:0.990359
[595]	train-auc:0.

[764]	train-auc:0.994116	eval-auc:0.992087
[765]	train-auc:0.994117	eval-auc:0.992087
[766]	train-auc:0.994119	eval-auc:0.99209
[767]	train-auc:0.994123	eval-auc:0.992093
[768]	train-auc:0.994134	eval-auc:0.992105
[769]	train-auc:0.994143	eval-auc:0.992111
[770]	train-auc:0.994149	eval-auc:0.992113
[771]	train-auc:0.99416	eval-auc:0.992123
[772]	train-auc:0.994156	eval-auc:0.992118
[773]	train-auc:0.994158	eval-auc:0.992122
[774]	train-auc:0.994171	eval-auc:0.992136
[775]	train-auc:0.994182	eval-auc:0.99215
[776]	train-auc:0.994192	eval-auc:0.992158
[777]	train-auc:0.9942	eval-auc:0.992165
[778]	train-auc:0.994212	eval-auc:0.992179
[779]	train-auc:0.994216	eval-auc:0.992184
[780]	train-auc:0.994223	eval-auc:0.992193
[781]	train-auc:0.994229	eval-auc:0.992201
[782]	train-auc:0.994238	eval-auc:0.992211
[783]	train-auc:0.994246	eval-auc:0.992221
[784]	train-auc:0.994249	eval-auc:0.992224
[785]	train-auc:0.994261	eval-auc:0.992238
[786]	train-auc:0.994268	eval-auc:0.992247
[787]	train-auc:

### 利用全量数据，合适参数进行训练

In [22]:
target = 'outcome'
random_state = 0
eta = 0.3
max_depth = 8
subsample = 0.5
colsample_bytree = 1
start_time = time.time()

print('XGBoost params. ETA: {}, MAX_DEPTH: {}, SUBSAMPLE: {}, COLSAMPLE_BY_TREE: {}'.format(eta, max_depth, subsample, colsample_bytree))
params = {
    "objective": "binary:logistic",
    "booster" : "gbtree",
    "eval_metric": "auc",
    "eta": eta,
    "tree_method": 'exact',
    "max_depth": max_depth,
    "subsample": subsample,
    "colsample_bytree": colsample_bytree,
    "silent": 1,
    "seed": random_state,
}
num_boost_round = 465
early_stopping_rounds = 10
test_size = 0.5

# X_train, X_valid = train_test_split(train, test_size=test_size, random_state=random_state)
X_train = X_train.sample(200000) # 数据量较大挑选部分数据
X_valid = X_valid.sample(100000) # 数据量较大挑选部分数据
print('Length train:', len(X_train.index))
print('Length valid:', len(X_valid.index))
y_train = X_train[target]
y_valid = X_valid[target]
dtrain = xgb.DMatrix(X_train[features], y_train)
dvalid = xgb.DMatrix(X_valid[features], y_valid)

watchlist = [(dtrain, 'train')]
gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=early_stopping_rounds, verbose_eval=True)

XGBoost params. ETA: 0.3, MAX_DEPTH: 8, SUBSAMPLE: 0.5, COLSAMPLE_BY_TREE: 1
Length train: 200000
Length valid: 100000
[0]	train-auc:0.931049
Will train until train-auc hasn't improved in 10 rounds.
[1]	train-auc:0.937465
[2]	train-auc:0.943374
[3]	train-auc:0.945279
[4]	train-auc:0.947434
[5]	train-auc:0.949003
[6]	train-auc:0.952291
[7]	train-auc:0.953214
[8]	train-auc:0.95423
[9]	train-auc:0.955256
[10]	train-auc:0.956974
[11]	train-auc:0.957965
[12]	train-auc:0.959535
[13]	train-auc:0.960425
[14]	train-auc:0.96087
[15]	train-auc:0.962182
[16]	train-auc:0.963014
[17]	train-auc:0.964198
[18]	train-auc:0.965488
[19]	train-auc:0.966445
[20]	train-auc:0.967798
[21]	train-auc:0.968202
[22]	train-auc:0.969522
[23]	train-auc:0.97078
[24]	train-auc:0.97165
[25]	train-auc:0.97212
[26]	train-auc:0.972788
[27]	train-auc:0.973315
[28]	train-auc:0.974395
[29]	train-auc:0.975147
[30]	train-auc:0.975858
[31]	train-auc:0.976469
[32]	train-auc:0.977293
[33]	train-auc:0.978305
[34]	train-auc:0.978896

[327]	train-auc:0.999733
[328]	train-auc:0.999738
[329]	train-auc:0.99974
[330]	train-auc:0.999747
[331]	train-auc:0.999753
[332]	train-auc:0.999755
[333]	train-auc:0.999758
[334]	train-auc:0.999762
[335]	train-auc:0.999764
[336]	train-auc:0.999767
[337]	train-auc:0.999768
[338]	train-auc:0.99977
[339]	train-auc:0.999775
[340]	train-auc:0.999777
[341]	train-auc:0.999781
[342]	train-auc:0.999786
[343]	train-auc:0.999788
[344]	train-auc:0.99979
[345]	train-auc:0.999795
[346]	train-auc:0.999796
[347]	train-auc:0.9998
[348]	train-auc:0.999803
[349]	train-auc:0.999806
[350]	train-auc:0.999809
[351]	train-auc:0.999812
[352]	train-auc:0.999816
[353]	train-auc:0.999818
[354]	train-auc:0.999823
[355]	train-auc:0.999823
[356]	train-auc:0.999828
[357]	train-auc:0.999832
[358]	train-auc:0.999835
[359]	train-auc:0.999837
[360]	train-auc:0.999841
[361]	train-auc:0.999842
[362]	train-auc:0.999844
[363]	train-auc:0.999846
[364]	train-auc:0.999849
[365]	train-auc:0.999853
[366]	train-auc:0.999853
[367]

## 3. 保存模型

In [20]:
gbm.save_model('redhat_bussiness_verone_20181130_2.model')

In [None]:
enc # one hot
enc.fit_transform(Xtrain)
enc.transform(Xtest)

In [None]:
alldata
xtrain
xtest

In [None]:
enc.fit(alldata)
enc.transform(xtrain)
enc.transform(xtest)