In [43]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 21 22:53:33 2016

@author: caoxiang
"""

import numpy as np
import pandas as pd
import xgboost as xgb
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from math import exp, log
from bayes_opt import BayesianOptimization


DATA_TRAIN_PATH = 'train.csv'
DATA_TEST_PATH = 'test.csv'
shift = 200

def timer(start_time = None):
    """
    define a timer, compute time. Need initial start_time = timer() first
    """
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        tmin, tsec = divmod((datetime.now() - start_time).total_seconds(), 60)
        print(' Time taken: %i minutes and %s seconds.' %
              (tmin, round(tsec, 2)))
        

def scale_data(X, scaler=None):
    """
    standardize variable
    """
    if not scaler:
        scaler = StandardScaler()
        scaler.fit(X)
    X = scaler.transform(X)
    return X, scaler


def load_data(path_train = DATA_TRAIN_PATH, path_test = DATA_TEST_PATH, shift=200):
    train_loader = pd.read_csv(path_train)
    train = train_loader.drop(['id', 'loss'], axis=1)
    test_loader = pd.read_csv(path_test)
    test = test_loader.drop(['id'], axis=1)
    ntrain = train.shape[0]
    ntest = test.shape[0]
    train_test = pd.concat((train, test)).reset_index(drop=True)
    numeric_feats = train_test.dtypes[train_test.dtypes != 'object'].index

    features = train.columns
    cats = [feat for feat in features if 'cat' in feat]
    
    # factorize categorical features
    for feat in cats:
        train_test[feat] = pd.factorize(train_test[feat], sort=True)[0] #.factorize() returns a tuple, [0] is array, [1] is index
    x_train = train_test.iloc[:ntrain, :]
    x_test = train_test.iloc[ntrain:, :]
    train_test_scaled, scaler = scale_data(train_test)
    train, _ = scale_data(x_train, scaler)
    test, _ = scale_data(x_test, scaler)

    train_labels = np.log(np.array(train_loader['loss'] + shift))
    train_ids = train_loader['id'].values.astype(np.int32)
    test_ids = test_loader['id'].values.astype(np.int32)

    return train, train_labels, test, train_ids, test_ids
    
train, target, test, _, ids = load_data()
d_train_full = xgb.DMatrix(train, label=target)
#d_train_full = xgb.DMatrix(train[0:3000, ], label=target[0:3000])
d_test = xgb.DMatrix(test)

In [6]:
def xgb_object(
               gamma,
               min_child_weight,
               colsample_bytree,
               subsample,
               max_depth,
               best_score,
               train = d_train_full):
    params = {}
    params['booster'] = 'gbtree'
    params['objective'] = "reg:linear"
    params['eval_metric'] = 'mae'
    params['eta'] = 0.1
    params['gamma'] = gamma
    params['min_child_weight'] = int(min_child_weight)
    params['colsample_bytree'] = colsample_bytree
    params['subsample'] = subsample
    params['max_depth'] = int(max_depth)
    params['max_delta_step'] = 0
    params['silent'] = 1
    params['random_state'] = 1989
    params['alpha'] = 1
    params['best_score'] = best_score
    
    clf_cv = xgb.cv(params, train, num_boost_round=100000, nfold=3,
                    early_stopping_rounds = 25, verbose_eval=30)
    
    object_score = min(clf_cv['test-mae-mean'])
    return -1 * object_score

In [7]:
start_time = timer()
# tuning parameters through bayes-opt

# bayes optimization params
init_points = 25
n_iter = 5
kappa = 2
acq = 'ei'
xi = 0.0

# xgboost params. We don't need define them here, as there are set in xgb_object
# params = {}
# params['booster'] = 'gbtree'
# params['objective'] = "reg:linear"
# params['eval_metric'] = 'mae'
# params['eta'] = 0.1
# params['max_delta_step'] = 0
# params['silent'] = 1
# params['random_state'] = 1989
# params['alpha'] = 1

# xgboost params need tuning
bayes_ranges = {
                'min_child_weight': (1,10),
                'colsample_bytree': (0.5, 0.5),
                'subsample': (0.8, 0.8),
                'max_depth': (10,15),
                'best_score': (1,10),
                'gamma': (1,10)
                }

xgbBO = BayesianOptimization(xgb_object, bayes_ranges)
xgbBO.maximize(init_points = init_points, n_iter = n_iter, kappa = kappa, acq = acq, xi = xi)
timer(start_time)

[31mInitialization[0m
[94m------------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   best_score |   colsample_bytree |     gamma |   max_depth |   min_child_weight |   subsample | 
[0]	train-mae:6.57002+0.00198388	test-mae:6.56999+0.00448705
[30]	train-mae:0.446409+0.00034011	test-mae:0.452444+0.00170476
[60]	train-mae:0.36133+0.000669874	test-mae:0.377695+0.000115535
[90]	train-mae:0.352722+0.000305682	test-mae:0.374635+0.00014994
[120]	train-mae:0.348018+8.14916e-05	test-mae:0.373467+0.000207879
[150]	train-mae:0.344775+0.000191046	test-mae:0.372937+0.000109149
[180]	train-mae:0.342408+0.000329463	test-mae:0.372658+0.000175853
[210]	train-mae:0.340234+0.000289179	test-mae:0.372484+0.000221805
[240]	train-mae:0.338583+0.00019222	test-mae:0.372445+0.00022617
[270]	train-mae:0.337005+8.34839e-05	test-mae:0.372386+0.000231192
[300]	train-mae:0.335508+0.00016959	test-mae:0.372354+0.0

  " state: %s" % convergence_dict)


[31mBayesian Optimization[0m
[94m------------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   best_score |   colsample_bytree |     gamma |   max_depth |   min_child_weight |   subsample | 
[0]	train-mae:6.57002+0.00198217	test-mae:6.56999+0.00448575
[30]	train-mae:0.447083+0.00040175	test-mae:0.45199+0.00171526
[60]	train-mae:0.36575+0.000204275	test-mae:0.377626+0.000296862
[90]	train-mae:0.359532+0.000295947	test-mae:0.374579+0.000237954
[120]	train-mae:0.356762+0.000469852	test-mae:0.37352+0.000283998
[150]	train-mae:0.35496+0.000421218	test-mae:0.373029+0.000217414
[180]	train-mae:0.353663+0.000340865	test-mae:0.372681+0.000241287
[210]	train-mae:0.352558+0.000314478	test-mae:0.372492+0.000283227
[240]	train-mae:0.351469+0.000290704	test-mae:0.372318+0.000273822
[270]	train-mae:0.350598+0.000321011	test-mae:0.372209+0.000289132
[300]	train-mae:0.349707+0.000383357	test-mae:0.37

  " state: %s" % convergence_dict)


   27 | 04m37s |   -0.37314 |       9.7720 |             0.5000 |    1.1816 |     12.2393 |             5.3981 |      0.8000 | 
[0]	train-mae:6.57002+0.00198388	test-mae:6.56999+0.00448705
[30]	train-mae:0.448862+0.000280061	test-mae:0.453168+0.00159608
[60]	train-mae:0.367781+0.0003429	test-mae:0.378651+0.00017626
[90]	train-mae:0.361077+0.000367225	test-mae:0.375208+0.000230029
[120]	train-mae:0.357804+0.000308774	test-mae:0.37397+0.000243018
[150]	train-mae:0.355803+0.000322859	test-mae:0.373409+0.000135383
[180]	train-mae:0.354172+0.00026372	test-mae:0.373025+0.000191148
[210]	train-mae:0.352907+0.000293012	test-mae:0.372799+0.000191005
[240]	train-mae:0.351718+0.000343012	test-mae:0.372635+0.000183787
[270]	train-mae:0.350646+0.000260095	test-mae:0.37253+0.000215509
[300]	train-mae:0.349781+6.04152e-05	test-mae:0.372402+0.000228622
[330]	train-mae:0.349002+9.56068e-05	test-mae:0.372305+0.000214967
[360]	train-mae:0.348353+8.93843e-05	test-mae:0.372303+0.000219797
   28 | 06m25s | 

In [45]:
params = {}
params['booster'] = 'gbtree'
params['objective'] = "reg:linear"
params['eval_metric'] = 'mae'
params['eta'] = 0.1
params['max_delta_step'] = 0
params['silent'] = 1
params['random_state'] = 1989
params['alpha'] = 1

tuned_params = {'best_score': 1.1106,
                'gamma': 2.3452,
                'max_depth': 13.8907,
                'min_child_weight': 9.6712,
                'colsample_bytree': 0.5,
                'subsample': 0.8
    
}

In [46]:
#tuned_params = xgbBO.res['max']['max_params']
params.update(tuned_params)
params['eta'] = 0.01
params['max_depth'] = int(params['max_depth'])

####################################
#  Build Model
####################################
watchlist = [(d_train_full, 'train_full')]

clf = xgb.train(params,
                d_train_full,
                2230,
                watchlist,
                verbose_eval = 100
                )
                # early_stopping_rounds=early_stopping
                # I didn't use early_stopping here, as I tuned the best # iteration

        
y_pred = np.exp(clf.predict(d_test, ntree_limit=clf.best_ntree_limit)) - shift # remember to subtract shift
      

now = datetime.now()

result = pd.DataFrame(y_pred, columns=['loss'])
result["id"] = ids
result = result.set_index("id")
folds = 1
sub_file = 'submission_' + str(folds) +'fold-average-xgb_' + '_bayes_'+str(
    now.strftime("%Y-%m-%d-%H-%M")) + '.csv'

print("\n Writing submission: %s" % sub_file)
result.to_csv(sub_file, index=True, index_label='id')

[0]	train_full-mae:7.22686
[100]	train_full-mae:2.64646
[200]	train_full-mae:0.981137
[300]	train_full-mae:0.484066
[400]	train_full-mae:0.388969
[500]	train_full-mae:0.370423
[600]	train_full-mae:0.364833
[700]	train_full-mae:0.362112
[800]	train_full-mae:0.36034
[900]	train_full-mae:0.359028
[1000]	train_full-mae:0.357945
[1100]	train_full-mae:0.357089
[1200]	train_full-mae:0.356394
[1300]	train_full-mae:0.355788
[1400]	train_full-mae:0.355196
[1500]	train_full-mae:0.354689
[1600]	train_full-mae:0.354217
[1700]	train_full-mae:0.353786
[1800]	train_full-mae:0.3534
[1900]	train_full-mae:0.353051
[2000]	train_full-mae:0.352716
[2100]	train_full-mae:0.352378
[2200]	train_full-mae:0.352043


NameError: name 'folds' is not defined