# xgboost.ensemble.py
from __future__ import absolute_import
from __future__ import print_function
import numpy as np
import xgboost as xgb
import cPickle as pickle
from misc import load_data, preprocess_labels, preprocess_data
import paths
import os
# Train a multi-class XGBoost model on the training file and then either
# cross-validate it or fit on the full training set and pickle the predicted
# class probabilities for the test file.

np.random.seed(1337)  # for reproducibility

mode = 'submission'  # crossvalidate|submission
model_name = 'dump_xgboost_ensemble'

print("Loading data...")
X_train, labels = load_data(paths.train_file, train=True)
y_train = preprocess_labels(labels, categorical=False)
X_train = preprocess_data(X_train)
dtrain = xgb.DMatrix(X_train, label=y_train)

# NOTE(review): this assumes preprocess_labels encodes classes as 0..K-1, so
# the largest label plus one is the class count -- confirm in misc.
n_classes = int(max(y_train)) + 1
n_round = 332

# Metrics reported during training / cross validation.  They are kept out of
# the parameter dict because a dict cannot hold the same key twice; the Python
# API takes several metrics as repeated ('eval_metric', name) pairs instead.
eval_metrics = ['mlogloss', 'merror']

param = {
    'objective': 'multi:softprob',
    'num_class': n_classes,
    'max_depth': 16,
    'eta': 0.05,
    # The XGBoost parameter is spelled 'subsample'; the previous key
    # 'sub_sample' is not a booster parameter, so no row subsampling happened.
    'subsample': 0.9,
    'colsample_bytree': 0.8,
    'min_child_weight': 4,
}

if mode == 'crossvalidate':
    # cross validate
    print("Cross validating...")
    cv_results = xgb.cv(param, dtrain, n_round, nfold=3, seed=0,
                        metrics=eval_metrics, show_stdv=False)
    # Print the returned scores so the run produces visible output even when
    # xgb.cv does not log per-round progress itself.
    print(cv_results)
else:
    # full training and submission
    print('Full training...')
    watchlist = [(dtrain, 'train')]
    # Append one ('eval_metric', name) pair per metric so that both mlogloss
    # and merror are evaluated on the watchlist.
    train_params = list(param.items()) + [('eval_metric', metric)
                                          for metric in eval_metrics]
    bst = xgb.train(train_params, dtrain, n_round, watchlist)

    # predict
    print('Predicting...')
    # `ids` is returned by load_data but is not needed for the pickled dump.
    X_test, ids = load_data(paths.test_file, train=False)
    X_test = preprocess_data(X_test)
    dtest = xgb.DMatrix(X_test)
    # With the 'multi:softprob' objective the prediction holds per-class
    # probabilities for every test row.
    probs = bst.predict(dtest)

    # Persist the raw probabilities as <model_path>/<model_name>.pkl.
    with open(os.path.join(paths.model_path, model_name + '.pkl'), 'wb') as f:
        pickle.dump(probs, f, protocol=pickle.HIGHEST_PROTOCOL)