In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import roc_auc_score
import matplotlib
import matplotlib.pyplot as py

import log

%matplotlib inline
# One seed for the whole notebook: it seeds numpy's global RNG here and is
# reused below for train_test_split, the xgboost params dict and xgb.cv.
seed = 0
np.random.seed(seed)

In [None]:
# Build the train / validation DMatrix objects used by the tuning cells.
train_sample_size = 10000
val_size = 3000

# The first column of the stored array holds the labels, the rest are features.
data = np.load('../../nn/data/train_scaled.npy')
#data = data[:train_sample_size,:]
labels = data[:, 0]
train = data[:, 1:]

# Feature columns to drop. Indices are relative to `train`, i.e. counted after
# the label column has been split off.
zero_important_features = [
    3, 5, 15, 22, 23, 24, 25, 26, 35, 39, 45, 48, 49, 56, 57, 58, 59, 60,
    62, 68, 71, 72, 73, 74, 79, 80, 83, 88, 91, 94, 95, 96, 97, 118, 119, 120,
]

# Every remaining column index, in ascending order.
above_0_important_features = []
for i in range(train.shape[1]):
    if i not in zero_important_features:
        above_0_important_features.append(i)
train = train[:, above_0_important_features]

# Hold out `val_size` rows for validation; the split is reproducible via `seed`.
x_train, x_valid, y_train, y_valid = train_test_split(
    train, labels, test_size=val_size, random_state=seed)

dtrain = xgb.DMatrix(x_train, label=y_train)
dvalid = xgb.DMatrix(x_valid, label=y_valid)

# Only the DMatrix objects are used from here on; drop the numpy intermediates.
del data, train, labels, x_train, x_valid, y_train, y_valid

In [None]:
# Test-set DMatrix. The branch is switched off, so `dtest` is not created
# unless the condition is edited by hand.
if False:
    # Keep the same feature columns that were kept for the training data.
    dtest = xgb.DMatrix(
        np.load('../../nn/data/test_scaled.npy')[:, above_0_important_features])

In [None]:
# defining Gini's Score function.
def gini(labels, preds):
    """Return ``2 * AUC - 1`` for ``preds`` scored against ``labels``.

    Both arguments are handed to ``roc_auc_score`` unchanged.
    """
    auc = roc_auc_score(labels, preds)
    return 2. * auc - 1
#----------------------------
def gini_xgb(preds, dtrain):
    """Score ``preds`` against the labels stored in ``dtrain``.

    Returns a one-element list ``[('gini', value)]`` where ``value`` comes
    from ``gini``.
    """
    score = gini(dtrain.get_label(), preds)
    return [('gini', score)]

In [None]:
# Scratch cell. NOTE(review): `bla` is not defined anywhere in the visible
# notebook, so this only works with leftover kernel state — confirm or remove.
v = bla[3].cvfold


In [None]:
# Scratch cell: looks at the same [1][1] entry that GetBestCVFolds.__call__
# reads from its callback environment. Depends on `v` from the scratch cell above.
v.evaluation_result_list[1][1]

In [28]:
# Scratch check. NOTE(review): relies on `p` and `params` already existing in
# the kernel; the recorded output below shows the assertion failing.
assert p not in params.keys()

AssertionError: 

# Automated Parameter Tuning

In [22]:
# Boosting budget for every CV run; both values are passed to xgb.cv in step_cv
# as num_boost_round and early_stopping_rounds respectively.
ROUNDS = 400
ESROUNDS = 30

# Baseline booster parameters. step_cv overwrites individual entries in place
# while a grid is being evaluated, so this dict always holds the values of the
# most recent CV run.
params = {
    # class-imbalance handling
    'max_delta_step' : 0,
    'scale_pos_weight' : 5.3, #1,

    # tree complexity
    'max_depth' : 10,
    'min_child_weight' : 7,

    # row / column subsampling
    'subsample' : 0.9,
    'colsample_bytree' : 0.5,

    # regularisation
    'reg_alpha' : 5,
    'reg_lambda' : 0.0001,

    'gamma' : 0,

    'eta' : 0.1,

    'objective' : "binary:logistic",
    # NOTE(review): the native-API name for the thread count is 'nthread';
    # confirm the installed xgboost accepts 'n_jobs' here.
    'n_jobs' : -1,
    'eval_metric' : 'auc',
    # xgboost's parameter name for the RNG seed is 'seed'. The previous key,
    # 'random_seed', is not an xgboost parameter, so the seed was never applied.
    'seed' : seed
}

In [None]:
class GetBestCVFolds:
    """Callback passed to ``xgb.cv`` that remembers the best-scoring folds.

    On every call it reads ``cbenv.evaluation_result_list[1][1]`` (presumably
    the mean test metric of the round — confirm against the installed xgboost)
    and, when that value beats the best seen so far, publishes
    ``cbenv.cvfolds`` through the module-level ``cvfolds`` variable.
    """

    def __init__(self):
        # Lower than any score the callback is expected to see.
        self.best_score = -1

    def __call__(self, cbenv):
        global cvfolds
        score = cbenv.evaluation_result_list[1][1]
        if not (score > self.best_score):
            return
        self.best_score = score
        # This stores a reference to the fold list, not a copy of it.
        cvfolds = cbenv.cvfolds


# Filled in by GetBestCVFolds; None means no folds have been captured.
cvfolds = None

In [None]:
def fpreproc(dtrain_, dtest_, param_):
    """Per-fold hook for ``xgb.cv``: set ``scale_pos_weight`` from the fold's labels.

    The weight is (number of labels equal to 0) / (number of labels equal
    to 1) in ``dtrain_``. It is written into ``param_`` in place, replacing
    whatever ``'scale_pos_weight'`` value ``param_`` already carried.

    Returns the three arguments as a tuple; only ``param_`` is modified.
    """
    y = dtrain_.get_label()
    n_negative = np.sum(y == 0)
    n_positive = np.sum(y == 1)
    param_['scale_pos_weight'] = float(n_negative) / n_positive
    return dtrain_, dtest_, param_

def step_cv(names, values) :
    """Run one 3-fold stratified ``xgb.cv`` with the given parameter overrides.

    Each ``names[i]`` is set to ``values[i]`` in the module-level ``params``
    dict (the change persists after the call). ``xgb.cv`` is then run on
    ``dtrain``, and the fold boosters captured by ``GetBestCVFolds`` are
    averaged on ``dvalid``.

    Returns a 5-tuple, in this order:
        (max of ``test-auc-mean``,
         max of ``train-auc-mean``,
         index of the round with the highest ``test-auc-mean``,
         elapsed time in minutes,
         gini of the averaged fold predictions on ``dvalid``)

    Raises:
        ValueError: if a name is not already a key of ``params``.
    """
    global cvfolds

    # Validate every name first so a bad one never leaves `params` half-updated.
    unknown = [p for p in names if p not in params]
    if unknown:
        # Raising a plain string (as before) is itself a TypeError.
        raise ValueError("Error %s not in parameters" % unknown[0])
    for p, v in zip(names, values):
        params[p] = v

    t = np.datetime64('now')
    # Reset so that a stale fold list from a previous run is never reused.
    cvfolds = None

    cv_results = xgb.cv(params, dtrain,
                    num_boost_round=ROUNDS, early_stopping_rounds=ESROUNDS, seed=seed,
                    nfold=3, stratified=True,
                    # ('auc') is just the string 'auc'; spelled plainly here.
                    metrics='auc',
                    fpreproc=fpreproc,
                    verbose_eval=False,
                    callbacks=[GetBestCVFolds()]
                    )
    #-----------------------
    assert cvfolds is not None, "GetBestCVFolds did not capture any folds"
    # NOTE(review): the callback keeps a reference to the fold list rather than
    # a snapshot of the boosters — confirm whether predict() below reflects the
    # best round or the last round trained before early stopping.
    val_preds = np.zeros(dvalid.num_row())
    for fold in cvfolds:
        val_preds += fold.bst.predict(dvalid)
    val_preds /= len(cvfolds)
    val_score = gini(dvalid.get_label(), val_preds)
    #--------------------------------------
    test_auc = cv_results['test-auc-mean']
    # datetime64('now') has second resolution, so the difference is in seconds.
    elapsed_minutes = (np.datetime64('now') - t).astype('int') / 60.
    return (np.max(test_auc),
            np.max(cv_results['train-auc-mean']),
            np.argmax(test_auc),
            elapsed_minutes,
            val_score
            )

In [None]:
def tune_params(names, values) :
    """Evaluate every grid point in ``values`` with ``step_cv`` and return the best.

    Args:
        names: parameter names, in the same order as the entries of each
            grid point.
        values: iterable of grid points; each grid point is a sequence with
            one numeric value per name.

    Returns:
        The grid point with the highest validation score reported by
        ``step_cv``. On ties the earliest grid point is kept.

    Raises:
        ValueError: if ``values`` contains no grid points.
    """
    values = list(values)
    if not values:
        raise ValueError("tune_params: empty grid for %s" % (tuple(names),))

    best_sc = -1
    best_train = best_test = best_n_trees = best_params = None
    log.msg('**** Grid Start *********')

    for grid in values :

        msg = "CV with "
        for name, v in zip(names, grid):
            msg += "%s = %g, " % (name, v)
        log.msg(msg)

        # step_cv returns the fold-test score FIRST and the train score second.
        # They used to be unpacked the other way round, which logged each
        # score under the other's label.
        test_sc, train_sc, n_trees, step_time, val_sc = step_cv(names, grid)

        log.msg('train score : %g, fold-dev score : %g, val score : %g, n_trees : %d,'\
                'step time : %.1f minutes'\
               % (train_sc, test_sc, val_sc, n_trees, step_time) )

        # The first grid point always becomes the baseline, so the best_*
        # values are defined no matter what scores come back.
        if best_params is None or val_sc > best_sc :
            best_sc = val_sc
            best_train = train_sc
            best_test = test_sc
            best_n_trees = n_trees
            best_params = grid

    log.msg('****** End of grid *********')
    msg =   'best val score : %g, '\
            'best test score : %g, '\
            'best train score : %g, '\
            'best n_trees = %d, '\
           % ( best_sc, best_test, best_train, best_n_trees )
    for name, v in zip(names, best_params) :
        msg += 'best %s = %g, ' % (name, v)
    log.msg(msg)
    # Function-call form prints the same text on Python 2 and Python 3.
    print(msg)

    return best_params

In [None]:
# Scratch cell: displays the module-level `cvfolds`, which is None until
# GetBestCVFolds has captured a fold list during a step_cv run.
cvfolds

In [None]:
# (Re)open the tuning log. Closing first lets the cell be re-run; the close is
# best-effort (presumably it fails when no log is open yet — confirm in `log`).
log.LOG_PATH = './logs/'
try:
    log.close()
except Exception:
    # Narrowed from a bare `except` so Ctrl-C / SystemExit are not swallowed.
    pass
log.init('tuning_params-5.log')

log.msg('------------------initialized-----------------')

In [18]:
# Scratch check: parentheses without a trailing comma do not build a tuple, so
# `n` is just the string 's' (the next cell's recorded output is False).
n = ('s')

In [19]:
# Recorded output is False: `n` is a str, not a tuple.
type(n) == tuple

False

In [None]:
#--------------------------------------------------------------
# Adjusting unbalanced data. [max_delta_step, scale_pos_weight]
#--------------------------------------------------------------
# The grid is currently the single point (0, 1); the trailing comments keep
# the ranges tried in earlier sweeps.
# NOTE(review): fpreproc, which step_cv passes to xgb.cv, assigns
# param_['scale_pos_weight'] itself — confirm that the value swept here is the
# one the folds are actually trained with.
names = ('max_delta_step', 'scale_pos_weight')
grid_params = [
    (max_delta_step, scale_pos_weight)
    for max_delta_step in [ 0 ]
    for scale_pos_weight in [1] #np.arange(4.9,6,.2)  #[ 4, 5, 6, 7, 8 ]   #[26, 1, 1.5, 2, 5, 10]
]
best_delta_step, best_pos_weight = tune_params(names, grid_params)
##################


In [None]:
#--------------------------------------------------------------
# Adjusting unbalanced data. [max_delta_step, scale_pos_weight]
#--------------------------------------------------------------
# NOTE(review): this cell repeats the preceding cell line for line, so running
# both performs the same single-point CV twice — confirm whether one can go.
names = ('max_delta_step', 'scale_pos_weight')
grid_params = [
    (max_delta_step, scale_pos_weight)
    for max_delta_step in [ 0 ]
    for scale_pos_weight in [1] #np.arange(4.9,6,.2)  #[ 4, 5, 6, 7, 8 ]   #[26, 1, 1.5, 2, 5, 10]
]
best_delta_step, best_pos_weight = tune_params(names, grid_params)
##################


In [None]:
#-------------------------------------------------------------------------------
# Staged search for max_depth / min_child_weight:
#   1. coarse grid over the odd values 1..9,
#   2. +/-1 neighbourhood of the coarse winner,
#   3. if a winner landed on 10, extend that axis upwards (10, 12, 14, 16),
#   4. if a winner is above 10, refine it once more by +/-1.
# Every stage goes through tune_params, the tuner defined in this notebook;
# `tune2param` is not defined anywhere in it.
names = ('max_depth', 'min_child_weight')
grid_params = [
    (max_depth, min_child_weight)
    for max_depth in np.arange(1,10,2)
    for min_child_weight in np.arange(1,10,2)
]
best_depth, best_child_weight = tune_params(names, grid_params)
#####
grid_params = [
    (max_depth, min_child_weight)
    for max_depth in [best_depth-1, best_depth, best_depth+1]
    for min_child_weight in [best_child_weight-1, best_child_weight, best_child_weight+1]
]
best_depth, best_child_weight = tune_params(names, grid_params)
#######
# After stage 2 neither value can exceed 10, so "not 10" means "below 10".
if best_depth == 10 or best_child_weight == 10 :
    if best_depth == 10 : max_depth_r = np.arange(10,17,2)
    else : max_depth_r = [best_depth]
    if best_child_weight == 10 : min_child_weight_r = np.arange(10,17,2)
    else : min_child_weight_r = [best_child_weight]
    grid_params = [
        (max_depth, min_child_weight)
        for max_depth in max_depth_r
        for min_child_weight in min_child_weight_r
    ]
    best_depth, best_child_weight = tune_params(names, grid_params)
##########
if best_depth > 10 or best_child_weight > 10 :
    if best_depth > 10 : max_depth_r = [ best_depth-1, best_depth, best_depth+1 ]
    else : max_depth_r = [best_depth]
    if best_child_weight > 10 :
        min_child_weight_r = [best_child_weight-1,best_child_weight,best_child_weight+1]
    else : min_child_weight_r = [best_child_weight]
    grid_params = [
        (max_depth, min_child_weight)
        for max_depth in max_depth_r
        for min_child_weight in min_child_weight_r
    ]
    best_depth, best_child_weight = tune_params(names, grid_params)
###########
# step_cv leaves the LAST grid point in `params`; store the winner explicitly.
params['max_depth'] = best_depth
params['min_child_weight'] = best_child_weight

In [None]:
#--------------------------------------------------------------------------------------------
# Staged search for gamma: coarse grid, then steps of 0.2 and finally 0.1
# around the current winner.
# tune_params works on tuples of parameters, so the single parameter is wrapped
# in 1-tuples and the winner is unpacked with [0]; `tune1param` is not defined
# anywhere in this notebook. Negative candidates are dropped because xgboost
# only accepts gamma >= 0.
name = 'gamma'
grid_params = [(g,) for g in np.arange(1,10,2)]
best_gamma = tune_params((name,), grid_params)[0]
######
grid_params = [(g,) for g in np.arange(best_gamma-1, best_gamma+1.1, 0.2) if g >= 0]
best_gamma = tune_params((name,), grid_params)[0]
#######
grid_params = [(g,) for g in np.arange(best_gamma-.1, best_gamma+.11, 0.1) if g >= 0]
best_gamma = tune_params((name,), grid_params)[0]
########
# step_cv leaves the LAST grid point in `params`; store the winner explicitly.
params['gamma'] = best_gamma

In [None]:
#------------------------------------------------------
# Staged search for max_delta_step. The single parameter is wrapped in
# 1-tuples for tune_params and the winner unpacked with [0]; `tune1param` is
# not defined anywhere in this notebook, and the first call below had been
# garbled into a syntax error.
name = 'max_delta_step'
grid_params = [(s,) for s in [1,2]]
best_delta_step = tune_params((name,), grid_params)[0]
##########
# NOTE(review): this second sweep replaces the winner of the [1, 2] sweep
# without comparing scores across the two — confirm that is intended.
grid_params = [(s,) for s in np.arange(.1,1,.2)]
best_delta_step = tune_params((name,), grid_params)[0]
##########
grid_params = [(s,) for s in [best_delta_step - .1, best_delta_step, best_delta_step + .1]]
best_delta_step = tune_params((name,), grid_params)[0]
###########
# step_cv leaves the LAST grid point in `params`; store the winner explicitly.
params['max_delta_step'] = best_delta_step

In [None]:
#---------------------------------------------------------------------------------------
# Two-stage search for subsample / colsample_bytree: coarse grid
# (0.1, 0.3, ..., 0.9), then steps of 0.05 around the winner.
# Both stages use tune_params; `tune2param` is not defined in this notebook.
names = ('subsample', 'colsample_bytree')
grid_params = [
    (subsample, colsample_bytree)
    for subsample in np.arange(.1,1,.2)
    for colsample_bytree in np.arange(.1,1,.2)
]
best_subsample, best_colsample = tune_params(names, grid_params)

# Refinement candidates are rounded to two decimals (removes float drift from
# arange) and restricted to (0, 1], the valid range for both parameters, so a
# winner at the edge of the coarse grid cannot produce an invalid value.
subsample_r = [
    s for s in np.round(np.arange(best_subsample-.1, best_subsample+.11, 0.05), 2)
    if 0 < s <= 1
]
colsample_r = [
    c for c in np.round(np.arange(best_colsample-.1, best_colsample+.11, 0.05), 2)
    if 0 < c <= 1
]
grid_params = [
    (subsample, colsample_bytree)
    for subsample in subsample_r
    for colsample_bytree in colsample_r
]
best_subsample, best_colsample = tune_params(names, grid_params)
params['subsample'] = best_subsample
# The key must match the one in `params`; 'col_sample_bytree' added a new,
# unused entry and left the real parameter at the last grid value.
params['colsample_bytree'] = best_colsample


In [None]:
#--------------------------------------------------------------------------------------------
# Two-stage search for reg_alpha / reg_lambda: a coarse grid over alpha in
# 1, 3, ..., 9 and lambda in 1, 0.1, ..., 1e-4, then alpha alone is refined in
# steps of 0.5 with lambda fixed at its winner.
# Both stages use tune_params; `tune2param` is not defined in this notebook.
names = ('reg_alpha', 'reg_lambda')
grid_params = [
    (alpha, lambd)
    for alpha in np.arange(1,10,2)
    for lambd in [ 1 * 10 ** -i for i in [0,1,2,3,4] ]
]
best_alpha, best_lambd = tune_params(names, grid_params)
######
grid_params = [
    (alpha, best_lambd)
    for alpha in np.arange(best_alpha-1, best_alpha+1.1, 0.5)
]
best_alpha, best_lambd = tune_params(names, grid_params)
##########
# step_cv leaves the LAST grid point in `params`; store the winner explicitly.
params['reg_alpha'] = best_alpha
params['reg_lambda'] = best_lambd


In [None]:
#-----------------------------------------------------------------------------
# Single sweep over the learning rate. The parameter is wrapped in 1-tuples
# for tune_params and the winner unpacked with [0]; `tune1param` is not
# defined anywhere in this notebook.
name = 'eta'
grid_params = [(e,) for e in [.1, .2, .3]]
best_eta = tune_params((name,), grid_params)[0]
#######
params['eta'] = best_eta

In [None]:
# Tuning is finished: close the log opened earlier with log.init(...).
log.close()

# Done tuning

------------------

# Making Submission Data

In [None]:
# NOTE(review): `tune1param` and `test_preds_dict` are not defined in the
# visible notebook; presumably this call fills `test_preds_dict` with one
# prediction array per model — confirm before running on a fresh kernel.
tune1param('eta', [.1])
#----------
# Average the stored test predictions. np.mean builds a new array, so the
# arrays held in test_preds_dict are left untouched. The previous running
# `+=` accumulated into the first stored array in place, which made the cell
# give a different answer when re-run.
test_preds = np.mean(list(test_preds_dict.values()), axis=0)
#---------
# Load the test ids only when they are not already usable in the kernel.
try:
    len(ids)
except (NameError, TypeError):
    ids = np.load('../../nn/data/test_ids.npy')

# Binary mode is kept from the original (Python 2: no newline translation).
# NOTE(review): under Python 3 this needs text mode or encoded bytes.
with open('../data/submission.csv', 'wb') as f:
    f.write('id,target')

    for i, p in zip(ids, test_preds):
        f.write('\n%d,%.4f' % (i, p))

# Predicting the validation set

In [None]:
# NOTE(review): `tune1param` and `dev_preds_dict` are not defined in the
# visible notebook; presumably this call fills `dev_preds_dict` with one
# prediction array per model — confirm before running on a fresh kernel.
tune1param('eta', [.1])
# Average the stored validation predictions. np.mean builds a new array, so
# the arrays held in dev_preds_dict are not modified (the previous running
# `+=` accumulated into the first stored array in place).
dev_preds = np.mean(list(dev_preds_dict.values()), axis=0)
# gini_xgb returns [('gini', value)]; show just the value. The function-call
# form of print behaves the same on Python 2 and Python 3.
print(gini_xgb(dev_preds, dvalid)[0][1])