In [5]:
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import roc_auc_score
import matplotlib
import matplotlib.pyplot as py

import log

%matplotlib inline
seed = 0
np.random.seed(seed)

In [2]:
# preparing training data.
train_sample_size = 10000
test_size = 1000

train = np.load('../../nn/data/train_scaled.npy')

train = train[:train_sample_size,:]
labels = train[:,0]   
train = train[:,1:]

zero_important_features = [3, 5, 15, 22, 23, 24, 25, 26, 35, 39, 45, 48, 49, 56, 57, 58, 59, 60,\
                           62, 68, 71, 72, 73, 74, 79, 80, 83, 88, 91, 94, 95, 96, 97, 118, 119, 120]

above_0_important_features = [ i for i in range(train.shape[1]) if i not in zero_important_features ]
train = train[:, above_0_important_features]

x_train, x_valid, y_train, y_valid = train_test_split(train,labels,random_state=seed,test_size=test_size)

dtrain = xgb.DMatrix(x_train, label=y_train)
dvalid = xgb.DMatrix(x_valid, label=y_valid)

del train, labels, x_train, x_valid, y_train, y_valid

In [None]:
# preparing test data.
if 1 :
    test = np.load('../../nn/data/test_scaled.npy')
    test = test[:, above_0_important_features]
    dtest = xgb.DMatrix(test)
    del test

In [22]:
# defining Gini's Score function.
def gini(labels, preds) :
    return roc_auc_score(labels, preds) * 2. - 1
#----------------------------
def gini_xgb(preds, dtrain):
    return [('gini', gini(dtrain.get_label(), preds))]

In [103]:
v = bla[2].cvfolds[4]

In [106]:
v.bst.attribu

{}

In [109]:
count

24

#  Automated Parameter Tuning

In [128]:
ROUNDS = 10
ESROUNDS = 30
params = {
    'max_delta_step' : 0,
    'scale_pos_weight' : 5.3, #1,
    
    'max_depth' : 10,
    'min_child_weight' : 7,
    
    'subsample' : 0.9,
    'colsample_bytree' : 0.5,
    
    'reg_alpha' : 5,
    'reg_lambda' : 0.0001,
    
    'gamma' : 0,
    
    'eta' : 0.1,
    
    'objective' : "binary:logistic",
    'n_jobs' : -1,
    'eval_metric' : 'auc',
    'random_seed' : seed
}

In [107]:
# implement callback with the help of dspace's kernel 
#    https://www.kaggle.com/danielpace/save-xgboost-out-of-fold-predictions
class OOFCallback:
    def  __init__(self, dev_preds_dict, test_preds_dict, dev_gini_score, maximize=True):
        """
        :param dict dev_preds_dict: Should be an empty dict which can later be
            retrieved.
        :param bool maximize: If True, higher metric scores treated as better.
        """
        self.best_eval_metric = None
        self.dev_preds_dict = dev_preds_dict
        self.test_preds_dict = test_preds_dict
        self.dev_gini_score = dev_gini_score
        self.maximize=maximize

    def __call__(self, cbenv):
        global bla
        bla.append(cbenv)
        current_val_score = cbenv.evaluation_result_list[1][1]
        if self.best_eval_metric is None:
            self.best_eval_metric = current_val_score
        if self.maximize:
            if current_val_score >= self.best_eval_metric:
                self.best_eval_metric = current_val_score
                self._compute_oof_preds(cbenv.cvfolds)
        elif current_val_score <= self.best_eval_metric:
            self.best_eval_metric = current_val_score
            self._compute_oof_preds(cbenv.cvfolds)

    def _compute_oof_preds(self, cvfolds):
        global count
        count+=1
        for i, fold in enumerate(cvfolds):
            self.dev_preds_dict[i] = fold.bst.predict(dvalid)
            #self.test_preds_dict[i] = fold.bst.predict(dtest)
            #self.dev_preds_dict[i] = fold.bst.predict(fold.dtest)
            
        self.dev_preds = []
        for p in self.dev_preds_dict.values() :
            if len(self.dev_preds) == 0 :
                self.dev_preds = p
            else :
                self.dev_preds += p
        self.dev_preds /= len(self.dev_preds_dict)
        self.dev_gini_score = gini_xgb(self.dev_preds, dvalid)[0][1]
        #global bla
        #bla = self.dev_gini_score
            
bla = []
count = 0
dev_preds_dict = {}
test_preds_dict = {}
dev_gini_score = ()

In [126]:
class EarlyStopingByTest : 
    def __init__(self) :
        self.best_score = -1
    def __call__(self, cbenv) :
        cvfolds = cbenv.cvfolds[0]
        if cbenv.iteration == 10 :
            p = cvfolds.bst.predict(dvalid)
            print roc_auc_score(dvalid.get_label(), p)

In [131]:
def fpreproc(dtrain_, dtest_, param_):
    label = dtrain_.get_label()
    ratio = float(np.sum(label == 0)) / np.sum(label == 1)
    param_['scale_pos_weight'] = ratio
    return (dtrain_, dvalid, param_)

def step_cv(names, values) : 
    
    for p,v in zip(names, values) :
        if p not in params.keys() :
            raise "Error %s not in parameters" % p
        params[p] = v
    
    t = np.datetime64('now')
    
    cv_results = xgb.cv(params, dtrain, 
                    num_boost_round=ROUNDS, early_stopping_rounds=ESROUNDS, seed=seed,
                    nfold=5, stratified = True,
                    metrics=('auc'),
                    fpreproc=fpreproc,
                    verbose_eval = True,
                    callbacks=[EarlyStopingByTest()]
                    #callbacks=[OOFCallback(dev_preds_dict, test_preds_dict, dev_gini_score)]
                    )
    print dev_gini_score
    return (np.max(cv_results['test-auc-mean']),
            np.max(cv_results['train-auc-mean']),
            np.argmax(cv_results['test-auc-mean']),
            (np.datetime64('now') - t).astype('int') / 60.
            )

In [80]:
def tune_params(names, values) :
    best_score = -1
    log.msg('**** Grid Start *********')
    
    for grid in values :
        
        msg = "CV with "
        for i,v in enumerate(grid):
            msg+= "%s = %g, " % (names[i], v)
        log.msg(msg)
        
        train_sc, test_sc, n_trees, step_time = step_cv(names, grid)
        
        log.msg('train score : %g, test score : %g, n_trees : %d, step time : %.1f minutes'\
               % (train_sc, test_sc, n_trees, step_time) )
        
        if test_sc > best_score or (test_sc == best_score and train_sc > best_train) :
            best_score = test_sc
            best_train = train_sc
            best_n_trees = n_trees
            best_params = grid
            
    best_gini = (best_score * 2) - 1
    
    log.msg('****** End of grid *********')
    msg =   'best auc score : %g, '\
            'best gini score : %g, '\
            'best train score : %g, '\
            'best n_trees = %d, '\
           % ( best_score, best_gini, best_train, best_n_trees )
    for i,v in enumerate(best_params) :
        msg += 'best %s = %g, ' % (names[i], v)
    log.msg(msg)
    print msg
    
    return best_params

In [11]:
log.LOG_PATH = './logs/'
try :
    log.close()
except :
    pass
log.init('tuning_params-test.log')
    
log.msg('------------------initialized-----------------')

'Error: log is already closed.'

In [None]:
test_pred = oof_preds_dict[0] + oof_preds_dict[1] + oof_preds_dict[2] + \
        oof_preds_dict[3] + oof_preds_dict[4]
test_pred /= 5

In [132]:
#--------------------------------------------------------------
# Adjusting unbalanced data. [max_delta_step, scale_pos_weight]
#--------------------------------------------------------------
names = ('max_delta_step', 'scale_pos_weight')
grid_params = [
    (max_delta_step, scale_pos_weight)
    for max_delta_step in [ 0 ]
    for scale_pos_weight in [1] #np.arange(4.9,6,.2)  #[ 4, 5, 6, 7, 8 ]   #[26, 1, 1.5, 2, 5, 10]
]
best_delta_step, best_pos_weight = tune_params(names, grid_params)
##################


[0]	train-auc:0.863512+0.0233552	test-auc:0.573402+0.0285418
[1]	train-auc:0.93946+0.00998388	test-auc:0.568656+0.0385598
[2]	train-auc:0.964108+0.00602881	test-auc:0.599739+0.0332878
[3]	train-auc:0.977283+0.00463323	test-auc:0.609945+0.02874
[4]	train-auc:0.984842+0.00400307	test-auc:0.618419+0.0257277
[5]	train-auc:0.990637+0.00298849	test-auc:0.612646+0.0336474
[6]	train-auc:0.993983+0.00218101	test-auc:0.611739+0.0464987
[7]	train-auc:0.99603+0.00148255	test-auc:0.611223+0.0404277
[8]	train-auc:0.997392+0.00115834	test-auc:0.610866+0.037119
[9]	train-auc:0.998186+0.000678573	test-auc:0.61378+0.0288023
()
best auc score : 0.998186, best gini score : 0.996372, best train score : 0.618419, best n_trees = 4, best max_delta_step = 0, best scale_pos_weight = 1, 


In [130]:
#--------------------------------------------------------------
# Adjusting unbalanced data. [max_delta_step, scale_pos_weight]
#--------------------------------------------------------------
names = ('max_delta_step', 'scale_pos_weight')
grid_params = [
    (max_delta_step, scale_pos_weight)
    for max_delta_step in [ 0 ]
    for scale_pos_weight in [1] #np.arange(4.9,6,.2)  #[ 4, 5, 6, 7, 8 ]   #[26, 1, 1.5, 2, 5, 10]
]
best_delta_step, best_pos_weight = tune_params(names, grid_params)
##################


[0]	train-auc:0.863512+0.0233552	test-auc:0.530074+0.0544463
[1]	train-auc:0.93946+0.00998388	test-auc:0.530636+0.071868
[2]	train-auc:0.964108+0.00602881	test-auc:0.524733+0.0643895
[3]	train-auc:0.977283+0.00463323	test-auc:0.524042+0.056755
[4]	train-auc:0.984842+0.00400307	test-auc:0.53829+0.0528797
[5]	train-auc:0.990637+0.00298849	test-auc:0.533814+0.0415573
[6]	train-auc:0.993983+0.00218101	test-auc:0.539765+0.0410964
[7]	train-auc:0.99603+0.00148255	test-auc:0.539492+0.0376436
[8]	train-auc:0.997392+0.00115834	test-auc:0.537794+0.0376569
[9]	train-auc:0.998186+0.000678573	test-auc:0.537036+0.0360671
()
best auc score : 0.998186, best gini score : 0.996372, best train score : 0.539765, best n_trees = 6, best max_delta_step = 0, best scale_pos_weight = 1, 


In [None]:
#-------------------------------------------------------------------------------
names = ('max_depth', 'min_child_weight')
grid_params = [
    (max_depth, min_child_weight)
    for max_depth in np.arange(1,10,2)
    for min_child_weight in np.arange(1,10,2)
]
best_depth, best_child_weight = tune_params(names, grid_params)
#####
grid_params = [
    (max_depth, min_child_weight)
    for max_depth in [best_depth-1, best_depth, best_depth+1]
    for min_child_weight in [best_child_weight-1, best_child_weight, best_child_weight+1]
]
best_depth, best_child_weight = tune2param(names, grid_params)
#######
if best_depth == 10 or best_child_weight == 10 :
    if best_depth == 10 : max_depth_r = np.arange(10,17,2)
    elif best_depth <10 : max_depth_r = [best_depth]
    if best_child_weight == 10 : min_child_weight_r = np.arange(10,17,2)
    elif best_child_weight < 10 : min_child_weight_r = [best_child_weight]
    grid_params = [
        (max_depth, min_child_weight)
        for max_depth in max_depth_r
        for min_child_weight in min_child_weight_r
    ]
    best_depth, best_child_weight = tune2param(names, grid_params)
##########
if best_depth > 10 or best_child_weight > 10 :
    if best_depth > 10 : max_depth_r = [ best_depth-1, best_depth, best_depth+1 ]
    else : max_depth_r = [best_depth]
    if best_child_weight > 10 : 
        min_child_weight_r = [best_child_weight-1,best_child_weight,best_child_weight+1]
    else : min_child_weight_r = [best_child_weight]
    grid_params = [
        (max_depth, min_child_weight)
        for max_depth in max_depth_r
        for min_child_weight in min_child_weight_r
    ]
    best_depth, best_child_weight = tune2param(names, grid_params)
###########    
params['max_depth'] = best_depth
params['min_child_weight'] = best_child_weight

In [None]:
#--------------------------------------------------------------------------------------------
name = 'gamma'
grid_params = np.arange(1,10,2)
best_gamma = tune1param(name, grid_params)
######
grid_params = np.arange(best_gamma-1, best_gamma+1.1, 0.2)
best_gamma = tune1param(name, grid_params)
#######
grid_params = np.arange(best_gamma-.1, best_gamma+.11, 0.1)
best_gamma = tune1param(name, grid_params)
########
params['gamma'] = best_gamma

In [None]:
#------------------------------------------------------
name = 'max_delta_step'
grid_params = [1,2]
best_delta_step = tune1paramparam(ngrid_params_params grid_params)
##########
grid_params = np.arange(.1,1,.2)
best_delta_step = tune1param(name, grid_params)
##########
#grid_params = [ best_params[0] - .1, best_params[0], best_params[0] + .1 ]
grid_params = [best_delta_step - .1, best_delta_step, best_delta_step + .1]
best_delta_step = tune1param(name, grid_params)
###########
params['max_delta_step'] = best_delta_step

In [None]:
#---------------------------------------------------------------------------------------
names = ('subsample', 'colsample_bytree')
grid_params = [
    (subsample, colsample_bytree)
    for subsample in np.arange(.1,1,.2)
    for colsample_bytree in np.arange(.1,1,.2)
]
best_subsample, best_colsample = tune2param(names, grid_params)

grid_params = [
    (subsample, colsample_bytree)
    for subsample in np.arange(best_subsample-.1,best_subsample+.11, 0.05)
    for colsample_bytree in np.arange(best_colsample-.1, best_colsample+.11, 0.05)
]
best_subsample, best_colsample = tune2param(names, grid_params)
params['subsample'] = best_subsample
params['col_sample_bytree'] = best_colsample


In [None]:
#--------------------------------------------------------------------------------------------
names = ('reg_alpha', 'reg_lambda')
grid_params = [
    (alpha, lambd)
    for alpha in np.arange(1,10,2)
    for lambd in [ 1 * 10 ** -i for i in [0,1,2,3,4] ]
]
best_alpha, best_lambd = tune2param(names, grid_params)
######
grid_params = [
    (alpha, best_lambd)
    for alpha in np.arange(best_alpha-1, best_alpha+1.1, 0.5)
]
best_alpha, best_lambd = tune2param(names, grid_params)
##########
params['reg_alpha'] = best_alpha
params['reg_lambda'] = best_lambd


In [None]:
#-----------------------------------------------------------------------------
name = 'eta'
grid_params = [.1, .2, .3]
best_eta = tune1param(name, grid_params)
#######
params['eta'] = best_eta

In [None]:
log.close()

# Done tuning

------------------

# Making Submision Data

In [None]:
tune1param('eta', [.1])
#----------
test_preds = []
for p in test_preds_dict.values() :
    if len(test_preds) == 0 :
        test_preds = p
    else :
        test_preds += p
test_preds /= len(test_preds_dict)
#---------
try :
    _ = len(ids)
except :
    ids = np.load('../../nn/data/test_ids.npy')

with open('../data/submission.csv', 'wb') as f :
	f.write('id,target')
    
	for i,p in zip(ids,test_preds) :
		f.write('\n%d,%.4f'%(i,p))

# prediction validation set

In [None]:
tune1param('eta', [.1])
dev_preds = []
for p in dev_preds_dict.values() :
    if len(dev_preds) == 0 :
        dev_preds = p
    else :
        dev_preds += p
dev_preds /= len(dev_preds_dict)
print gini_xgb(dev_preds, dvalid)[0][1]