In [1]:
import time
import numpy as np
import pandas as pd
import operator
from collections import Counter

import xgboost as xgb
#import lightgbm as lgb
from sklearn.metrics import log_loss
from xgboost.sklearn import XGBClassifier

import matplotlib.pyplot as plt
from contextlib import contextmanager
@contextmanager
def timer(name):
    start_time = time.time()
    yield
    print(f'[{name} done in {time.time() - start_time:.2f} s]')

In [3]:
train = pd.read_csv('data/processed_train.csv')
test = pd.read_csv('data/processed_test.csv')
print(Counter(train['day']).keys())
print(Counter(test['day']).keys())

dict_keys([17, 18, 19, 20, 21, 22, 23, 24])
dict_keys([25, 24])


In [4]:
# sort data according to day and time
train = train.sort_values(by = ['day','time']).reset_index().iloc[:, 1:]
# convert hour to time slot
# is_midnight: 0, is_morning: 1, is_afternoon: 2, is_night: 3
def f(x):
    if x <= 7:
        return 0
    elif x > 7 and x <= 13:
        return 1
    elif x > 13 and x <= 19:
        return 2
    else:
        return 3
train['hour'] = train['hour'].apply(lambda x: f(x))

In [5]:
# delete some features
# check importance of features
exclude_features = ['instance_id','context_id', 'context_timestamp', 'is_trade','datetime', 'day', 'time']
df_train = train[(train['day'] >= 17) & (train['day'] <= 22)]
df_val = train[(train['day'] >= 23) & (train['day'] <= 24)]
y_train = df_train['is_trade']
x_train = df_train.drop(exclude_features, axis = 1)
y_val = df_val['is_trade']
x_val = df_val.drop(exclude_features, axis = 1)

In [6]:
def logloss(act, pred):
    epsilon = 1e-15
    pred = np.maximum(epsilon, pred)
    pred = np.minimum(1-epsilon, pred)
    ll = sum(act*np.log(pred) + (1-act)*np.log(1-pred))
    ll = ll * -1.0/len(act)
    return ll

In [17]:
# prepare for training
params = {'max_depth':8, # 3-10
          'eta':0.05, # 0.01-0.2 analogous to learning rate
          'subsample':0.7, #0.5-1 denote the fraction of observations to be randomly samples for each tree.
          'colsample_bytree':0.5, #0.5-1 denote the fraction of columns to be randomly samples for each tree.
          'min_child_weight':10,
          
          'seed':123,
          'nthread':25,
          'eval_metric':'logloss', #rmse, logloss, auc
          'objective':'binary:logistic',
          'silent':1
          
          # handle imbalanced dataset
          #'scale_pos_weight':0.5
          # 'max_delta_step': 1
          }
num_boost_round = 500

dtrain = xgb.DMatrix(x_train, y_train)
dvalid = xgb.DMatrix(x_val, y_val)
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]

In [24]:
with timer('Train XBOOST'):
    gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, 
                    early_stopping_rounds=200, verbose_eval=50)

[0]	train-logloss:0.648111	eval-logloss:0.647771
Multiple eval metrics have been passed: 'eval-logloss' will be used for early stopping.

Will train until eval-logloss hasn't improved in 200 rounds.
[50]	train-logloss:0.108223	eval-logloss:0.100862
[100]	train-logloss:0.084422	eval-logloss:0.078325
[150]	train-logloss:0.080109	eval-logloss:0.076948
[200]	train-logloss:0.077294	eval-logloss:0.076755
[250]	train-logloss:0.075364	eval-logloss:0.076745
[300]	train-logloss:0.073384	eval-logloss:0.076756
[350]	train-logloss:0.071698	eval-logloss:0.0768
[400]	train-logloss:0.069882	eval-logloss:0.076883
Stopping. Best iteration:
[224]	train-logloss:0.076325	eval-logloss:0.076727

[Train XBOOST done in 839.52 s]


In [43]:
# Paramter Tunning:
# According to previous training, we choose num_boost_round = 224

# Tune max_depth and min_child_weight
# max_depth: 3-10   tried (4,10,1)
# min_child_weight: 1-10  tried (2,18,2)

max_depths = range(9,10,1)
min_child_weights = range(14,15,1)

iter_ = 0
best_iter = 0
best_logloss = 0.07672
best_model = None

for max_depth in max_depths:
    for min_child_weight in min_child_weights:
        with timer('One round:'):
            print('----------- -------------')
            print('max_depth = ',max_depth)
            print('min_child_weight = ',min_child_weight)
            params1 = dict(params,min_child_weight = min_child_weight,max_depth = max_depth)
            model = xgb.train(params1,dtrain,num_boost_round = 224,evals = watchlist, 
                              early_stopping_rounds=200, verbose_eval = False)
            print('Model %d best logloss = %.6f'%(iter_,model.best_score))
            
            if model.best_score < best_logloss:
                best_logloss = model.best_score
                best_iter = iter_
                best_model = model
                print('Best so far!')
                print('Minimal logloss', best_logloss)
            
            iter_ += 1
    

----------- -------------
max_depth =  9
min_child_weight =  14
Model 0 best logloss = 0.076642
Best so far!
Minimal logloss 0.076642
[One round: done in 502.60 s]


In [49]:
# Tune scale_pos_weight: default is 1
# We tried 0.5-0.9
# the best_logloss is still the default one

# Tune gamma: default is 0, larger gamma, more conservative
# 'gamma':[i/10.0 for i in range(0,5)]
# the best_logloss is still the default one

params1 = dict(params,max_depth = 9, min_child_weight = 14)
gammas = [i/10.0 for i in range(1,6)]

iter_ = 0
best_iter = 0
best_logloss = 0.076642
best_model = None

for gamma in gammas:
    with timer('One round:'):
            print('----------- -------------')
            print('gamma = ',gamma)
            params2 = dict(params1,gamma = gamma)
            model = xgb.train(params1,dtrain,num_boost_round = 224,evals = watchlist, 
                              early_stopping_rounds=200, verbose_eval = False)
            print('Model %d best logloss = %.6f'%(iter_,model.best_score))
            
            if model.best_score < best_logloss:
                best_logloss = model.best_score
                best_iter = iter_
                best_model = model
                print('Best so far!')
                print('Minimal logloss', best_logloss)
            
            iter_ += 1
    

----------- -------------
gamma =  0.1
Model 0 best logloss = 0.076642
[One round: done in 502.83 s]
----------- -------------
gamma =  0.2
Model 1 best logloss = 0.076642
[One round: done in 520.05 s]
----------- -------------
gamma =  0.3
Model 2 best logloss = 0.076642
[One round: done in 492.09 s]
----------- -------------
gamma =  0.4
Model 3 best logloss = 0.076642
[One round: done in 508.93 s]
----------- -------------
gamma =  0.5
Model 4 best logloss = 0.076642
[One round: done in 493.35 s]


In [52]:
# Tune subsample and colsample_bytree
# 'subsample':[i/10.0 for i in range(6,10)],
# 'colsample_bytree':[i/10.0 for i in range(6,10)]

# subsample =  0.8
# colsample =  0.5
# best_logloss = 0.076573

params1 = dict(params,max_depth = 9, min_child_weight = 14)

subsamples = [i/10.0 for i in range(5,10)]
colsample_bytrees = [i/10.0 for i in range(5,10)]

iter_ = 0
best_iter = 0
best_logloss = 0.076642
best_model = None

for subsample in subsamples:
    for colsample_bytree in colsample_bytrees:
        with timer('One round:'):
            print('----------- -------------')
            print('subsample = ',subsample)
            print('colsample = ',colsample_bytree)
            params1 = dict(params1,subsample = subsample,colsample = colsample_bytree)
            model = xgb.train(params1,dtrain,num_boost_round = 224,evals = watchlist, 
                              early_stopping_rounds=200, verbose_eval = False)
            print('Model %d best logloss = %.6f'%(iter_,model.best_score))
            
            if model.best_score < best_logloss:
                best_logloss = model.best_score
                best_iter = iter_
                best_model = model
                print('Best so far!')
                print('Minimal logloss', best_logloss)
            
            iter_ += 1

----------- -------------
subsample =  0.5
colsample =  0.5
Model 0 best logloss = 0.076845
[One round: done in 483.47 s]
----------- -------------
subsample =  0.5
colsample =  0.6
Model 1 best logloss = 0.076845
[One round: done in 483.78 s]
----------- -------------
subsample =  0.5
colsample =  0.7
Model 2 best logloss = 0.076845
[One round: done in 527.93 s]
----------- -------------
subsample =  0.5
colsample =  0.8
Model 3 best logloss = 0.076845
[One round: done in 484.71 s]
----------- -------------
subsample =  0.5
colsample =  0.9
Model 4 best logloss = 0.076845
[One round: done in 515.17 s]
----------- -------------
subsample =  0.6
colsample =  0.5
Model 5 best logloss = 0.076805
[One round: done in 524.16 s]
----------- -------------
subsample =  0.6
colsample =  0.6
Model 6 best logloss = 0.076805
[One round: done in 498.08 s]
----------- -------------
subsample =  0.6
colsample =  0.7
Model 7 best logloss = 0.076805
[One round: done in 509.16 s]
----------- ------------

In [64]:
# Tuning Regularization Parameters
# lambda: [1e-5, 1e-2, 0.1, 1, 100]  L2 (similar to ridge regression) default = 1
# alpha:  L1 (similar to lasso regression)  default = 0
params2 = dict(params1,subsample = 0.8, colsample_bytree = 0.5)

#lambdas = [1e-5, 1e-2, 0.1, 1, 100]
#lambdas = [0.5, 0.8, 1.5, 1.8, 2.5]
#lambdas = [0.51, 0.52, 0.53, 0.54, 0.55]
lambdas = [0.5]

iter_ = 0
best_iter = 0
best_logloss = 0.076573
best_model = None

for la in lambdas:
    with timer('One round:'):
            print('----------- -------------')
            print('lambda = ',la)
            params2['lambda'] = la
            model = xgb.train(params2,dtrain,num_boost_round = 224,evals = watchlist, 
                              early_stopping_rounds=200, verbose_eval = False)
            print('Model %d best logloss = %.6f'%(iter_,model.best_score))
            
            if model.best_score < best_logloss:
                best_logloss = model.best_score
                best_iter = iter_
                best_model = model
                print('Best so far!')
                print('Minimal logloss', best_logloss)
            
            iter_ += 1




----------- -------------
lambda =  0.5
Model 0 best logloss = 0.076546
Best so far!
Minimal logloss 0.076546
[One round: done in 514.38 s]


We can get only optimize the model in a small extent by parameter tunning. 
The best logloss of this model is 0.076546
If we want to greatly improve the model, we need to reply on other techniques, such as feature engineering, ensemble of model, stacking and etc.