In [2]:
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb

import matplotlib
import matplotlib.pyplot as py

import log

%matplotlib inline
seed = 0
np.random.seed(seed)

In [3]:
# preparing training data.
train_sample_size = 1000
test_size = 0.0001

train = np.load('../../nn/data/train_scaled.npy')

#train = train[:train_sample_size,:]
labels = train[:,0]   
train = train[:,1:]

zero_important_features = [3, 5, 15, 22, 23, 24, 25, 26, 35, 39, 45, 48, 49, 56, 57, 58, 59, 60,\
                           62, 68, 71, 72, 73, 74, 79, 80, 83, 88, 91, 94, 95, 96, 97, 118, 119, 120]

above_0_important_features = [ i for i in range(train.shape[1]) if i not in zero_important_features ]
train = train[:, above_0_important_features]

x_train, x_valid, y_train, y_valid = train_test_split(train,labels,random_state=seed,test_size=test_size)

dtrain = xgb.DMatrix(x_train, label=y_train)
dvalid = xgb.DMatrix(x_valid, label=y_valid)

del train, labels, x_train, x_valid, y_train, y_valid

In [None]:
# preparing test data.
if 0 :
    test = np.load('../../nn/data/test_scaled.npy')
    test = test[:, above_0_important_features]
    dtest = xgb.DMatrix(test)
    del test

In [4]:
# from CPMP's kernel https://www.kaggle.com/cpmpml/extremely-fast-gini-computation

def gini(y_true, y_prob):
    y_true = np.asarray(y_true)    
    y_true = y_true[np.argsort(y_prob)]
    ntrue = 0
    gini = 0
    delta = 0
    n = len(y_true)
    for i in range(n-1, -1, -1):
        
        y_i = y_true[i]
        
        ntrue += y_i
        gini += y_i * delta
        delta += 1 - y_i
    gini = 1 - 2 * gini / (ntrue * (n - ntrue))
    
    return gini


# Funcitons from olivier's kernel
# https://www.kaggle.com/ogrellier/xgb-classifier-upsampling-lb-0-283

def gini_xgb(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = gini(labels, preds)
    return [('gini', gini_score)]




## Trying Xgb.booster mode

In [None]:
ROUNDS = 400
ESROUNDS = 50
params = {
    'max_delta_step' : 0,
    'scale_pos_weight' : 1, #1,
    
    'max_depth' : 10,
    'min_child_weight' : 7,
    
    'subsample' : 1,
    'colsample_bytree' : 1,
    
    'reg_alpha' : 0,
    'reg_lambda' : 1,
    
    'gamma' : 0,
    
    'eta' : 0.3,
    
    'objective' : "binary:logistic",
    'n_jobs' : -1,
    'eval_metric' : 'auc',
    'random_seed' : seed
}

In [None]:
t = np.datetime64('now')
model = xgb.train(params, dtrain,
                  num_boost_round=ROUNDS,
                  early_stopping_rounds=ESROUNDS,
                  evals = [(dvalid, 'eval')],
                  feval = (gini_xgb), maximize = True,
                  verbose_eval = 10
                 )
print np.datetime64('now') - t 

In [None]:
dvalid.num_row()

In [None]:
labels[labels == 1].shape[0] / float(labels.shape[0]) * 100

In [None]:
t = np.datetime64('now')
cv_results = xgb.cv(params, dtrain, 
                    num_boost_round=ROUNDS, early_stopping_rounds=ESROUNDS, seed=seed,
                    nfold=3, stratified = True,
                    #feval=gini_xgb, maximize=True,
                    verbose_eval = 10,
                    as_pandas = False
                   )
print np.datetime64('now') - t

In [None]:
train_sc, test_sc, n_trees, step_time = step_cv({})

#  Automated Parameter Tuning

In [5]:
ROUNDS = 400
ESROUNDS = 50
params = {
    'max_delta_step' : 0,
    'scale_pos_weight' : 1, #1,
    
    'max_depth' : 10,
    'min_child_weight' : 7,
    
    'subsample' : 1,
    'colsample_bytree' : 1,
    
    'reg_alpha' : 0,
    'reg_lambda' : 1,
    
    'gamma' : 0,
    
    'eta' : 0.3,
    
    'objective' : "binary:logistic",
    'n_jobs' : -1,
    'eval_metric' : 'auc',
    'random_seed' : seed
}

In [6]:
def step_cv(parameters) : 
    
    for p,v in parameters.items() :
        if p not in params.keys() :
            raise "Error %s not in parameters" % p
        params[p] = v
    
    t = np.datetime64('now')
    
    cv_results = xgb.cv(params, dvalid, 
                    num_boost_round=ROUNDS, early_stopping_rounds=ESROUNDS, seed=seed,
                    nfold=3, stratified = True,
                    verbose_eval = False
                    )
    
    return (np.max(cv_results['test-auc-mean']),
            np.max(cv_results['train-auc-mean']),
            np.argmax(cv_results['test-auc-mean']),
            (np.datetime64('now') - t).astype('int') / 60.
            )

In [16]:
def tune2param(names, values) :
    best_score = -1
    log.msg('**** Grid Start *********')
    for p1, p2 in values :
        
        log.msg('CV with %s = %f, and %s = %f' % (names[0], p1, names[1], p2))
        train_sc, test_sc, n_trees, step_time = step_cv({names[0]:p1,names[1]:p2})
        log.msg('train score : %.3f, test score : %.3f, n_trees : %d, step time : %1f minutes'\
               % (train_sc, test_sc, n_trees, step_time) )
        if test_sc > best_score :
            best_score = test_sc
            best_train = train_sc
            best_n_trees = n_trees
            best_params = (p1, p2)
            
    log.msg('****** End of grid *********')
    msg =   'best test score : %.4f, '\
            'best train score : %.4f, '\
            'best %s = %.1f, '\
            'best %s = %.1f '\
            'and n_trees = %d'\
           % ( best_score, best_train,\
              names[0], best_params[0],\
              names[1], best_params[1],\
              best_n_trees )
    log.msg(msg)
    print msg
    return best_params

def tune1param(name, values) :
    best_score = -1
    log.msg('**** Grid Start *********')
    for p in values :
        
        log.msg('CV with %s = %f' % (name, p))
        train_sc, test_sc, n_trees, step_time = step_cv({name:p})
        log.msg('train score : %.3f, test score : %.3f, n_trees : %d, step time : %1f minutes'\
               % (train_sc, test_sc, n_trees, step_time) )
        if test_sc > best_score :
            best_score = test_sc
            best_train = train_sc
            best_n_trees = n_trees
            best_param = p
            
    log.msg('****** End of grid *********')
    msg =   'best test score : %.4f, '\
            'best train score : %.4f, '\
            'best %s = %.1f, '\
            'and n_trees = %d' \
           % ( best_score, best_train, name, best_param, best_n_trees )
    log.msg(msg)
    print msg
    return best_param

In [8]:
log.LOG_PATH = './logs/'
try :
    log.close()
except :
    pass
log.init('tuning_params-test.log')
    
log.msg('------------------initialized-----------------')

'Error: log is already closed.'

In [13]:
#-------------------------------------------------------------------------------
names = ('max_depth', 'min_child_weight')
grid_params = [
    (max_depth, min_child_weight)
    for max_depth in np.arange(1,10,2)
    for min_child_weight in np.arange(1,10,2)
]
best_depth, best_child_weight = tune2param(names, grid_params)
#####
grid_params = [
    (max_depth, min_child_weight)
    for max_depth in [best_depth-1, best_depth, best_depth+1]
    for min_child_weight in [best_child_weight-1, best_child_weight, best_child_weight+1]
]
best_depth, best_child_weight = tune2param(names, grid_params)
#######
if best_depth == 10 or best_child_weight == 10 :
    if best_depth == 10 : max_depth_r = np.arange(10,17,2)
    elif best_depth <10 : max_depth_r = [best_depth]
    if best_child_weight == 10 : min_child_weight_r = np.arange(10,17,2)
    elif best_child_weight < 10 : min_child_weight_r = [best_child_weight]
    grid_params = [
        (max_depth, min_child_weight)
        for max_depth in max_depth_r
        for min_child_weight in min_child_weight_r
    ]
    best_depth, best_child_weight = tune2param(names, grid_params)
##########
if best_depth > 10 or best_child_weight > 10 :
    if best_depth > 10 : max_depth_r = [ best_depth-1, best_depth, best_depth+1 ]
    else : max_depth_r = [best_depth]
    if best_child_weight > 10 : 
        min_child_weight_r = [best_child_weight-1,best_child_weight,best_child_weight+1]
    else : min_child_weight_r = [best_child_weight]
    grid_params = [
        (max_depth, min_child_weight)
        for max_depth in max_depth_r
        for min_child_weight in min_child_weight_r
    ]
    best_depth, best_child_weight = tune2param(names, grid_params)
###########    
params['max_depth'] = best_depth
params['min_child_weight'] = best_child_weight

best test score : 1.0000,best train score : 0.5789best max_depth = 1.0,best min_child_weight = 1.0 and n_trees = 30
best test score : 1.0000,best train score : 0.7096best max_depth = 1.0,best min_child_weight = 0.0 and n_trees = 63


In [None]:
#---------------------------------------------------------------------------------------
names = ('subsample', 'colsample_bytree')
grid_params = [
    (subsample, colsample_bytree)
    for subsample in np.arange(.1,1,.2)
    for colsample_bytree in np.arange(.1,1,.2)
]
best_subsample, best_colsample = tune2param(names, grid_params)

grid_params = [
    (subsample, colsample_bytree)
    for subsample in np.arange(best_subsample-.1,best_subsample+.11, 0.05)
    for colsample_bytree in np.arange(best_colsample-.1, best_colsample+.11, 0.05)
]
best_subsample, best_colsample = tune2param(names, grid_params)
params['subsample'] = best_subsample
params['col_sample_bytree'] = best_colsample


In [None]:
#--------------------------------------------------------------------------------------------
names = ('reg_alpha', 'reg_lambda')
grid_params = [
    (alpha, lambd)
    for alpha in np.arange(1,10,2)
    for lambd in [ 1 * 10 ** -i for i in [0,1,2,3,4] ]
]
best_alpha, best_lambd = tune2param(names, grid_params)
######
grid_params = [
    (alpha, best_lambd)
    for alpha in np.arange(best_alpha-1, best_alpha+11, 0.5)
]
best_alpha, best_lambd = tune2param(names, grid_params)
##########
params['reg_alpha'] = best_alpha
params['reg_lambda'] = best_lambd


In [17]:
#--------------------------------------------------------------------------------------------
name = 'gamma'
grid_params = np.arange(1,10,2)
best_gamma = tune1param(name, grid_params)
######
grid_params = np.arange(best_gamma-1, best_gamma+11, 0.5)
best_gamma = tune1param(name, grid_params)
#######
params['gamma'] = best_gamma
#-----------------------------------------------------------------------------
name = 'eta'
grid_params = [.1, .2, .3]
best_eta = tune1param(name, grid_params)
#######
params['eta'] = best_eta

best test score : 0.6111, best train score : 0.4727, best gamma = 1.0, and n_trees = 0
best test score : 1.0000, best train score : 0.7096, best gamma = 0.0, and n_trees = 87
best test score : 1.0000, best train score : 0.7096, best eta = 0.2, and n_trees = 87


In [None]:
log.close()

# Done tuning

------------------

In [None]:
(gini(dtrain.get_label(),model.predict(dtrain)) + 1 ) / 2

In [None]:
np.argmax(cv_results['train-gini-mean'])

In [None]:
pred = model.predict(dtest)

In [None]:
pred

In [None]:
try :
    _ = len(ids)
except :
    ids = np.load('../../nn/data/test_ids.npy')

with open('../data/submission1.csv', 'wb') as f :
	f.write('id,target')
    
	for i,p in zip(ids,pred) :
		f.write('\n%d,%.4f'%(i,p))