In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
#from sklearn.metrics import roc_auc_score
#import matplotlib
#import matplotlib.pyplot as py

#import log
from xgb_tuner import xgb_tuner

#%matplotlib inline
# Single seed reused below: it seeds NumPy's global RNG here, is passed as
# random_state to the train/validation split, and is stored in the params dict.
seed = 0
np.random.seed(seed)

In [None]:
# Build the training / validation DMatrix pair used by the manual tuning cells.
train_sample_size = 10000   # only referenced by the disabled sub-sampling line below
val_size = 3000             # rows held out for validation

raw = np.load('../../nn/data/train_scaled.npy')
#raw = raw[:train_sample_size,:]

# Column 0 is used as the label vector; every other column is a feature.
targets = raw[:, 0]
features = raw[:, 1:]

# Feature columns to drop (presumably found to have zero importance in an
# earlier run -- the list itself is the only evidence here).
zero_important_features = [3, 5, 15, 22, 23, 24, 25, 26, 35, 39, 45, 48, 49, 56, 57, 58, 59, 60,
                           62, 68, 71, 72, 73, 74, 79, 80, 83, 88, 91, 94, 95, 96, 97, 118, 119, 120]

# Ascending list of every column index that is not in the drop list.
above_0_important_features = sorted(set(range(features.shape[1])) - set(zero_important_features))
features = features[:, above_0_important_features]

fit_x, hold_x, fit_y, hold_y = train_test_split(
    features, targets, test_size=val_size, random_state=seed
)

dtrain = xgb.DMatrix(fit_x, label=fit_y)
dvalid = xgb.DMatrix(hold_x, label=hold_y)

# Only the DMatrix objects are needed from here on; release the arrays.
del raw, targets, features, fit_x, hold_x, fit_y, hold_y

In [None]:
# Test-set preparation. Switched off by the always-false guard; when enabled it
# loads the test array, keeps the same columns as the training features and
# wraps the result in a DMatrix.
if False:
    held_out = np.load('../../nn/data/test_scaled.npy')
    dtest = xgb.DMatrix(held_out[:, above_0_important_features])
    del held_out

In [None]:
from xgb_parameterizing import parametizer

In [None]:
def preproc(train, test=None) :
    """Turn the raw training array (and an optional test array) into DMatrix objects.

    Parameters
    ----------
    train : 2-D array
        Column 0 is used as the label vector; the remaining columns are the
        features.
    test : 2-D array or None, optional
        Test features. They are reduced to the same columns as the training
        features, mirroring what the stand-alone test-data cell does.
        Assumes the test array carries no label column -- TODO confirm.

    Returns
    -------
    tuple
        (dtrain, dvalid, dtest); dtest is None when no test array is given.
    """
    # Imported inside the function so it does not rely on notebook-level imports.
    from sklearn.model_selection import train_test_split
    import xgboost as xgb

    dev_size = 5000  # rows held out for the validation DMatrix

    labels = train[:, 0]
    features = train[:, 1:]

    # Feature columns to drop (presumably zero importance in an earlier run).
    zero_important_features = [3, 5, 15, 22, 23, 24, 25, 26, 35, 39, 45, 48, 49, 56, 57, 58, 59, 60,
                               62, 68, 71, 72, 73, 74, 79, 80, 83, 88, 91, 94, 95, 96, 97, 118, 119, 120]
    dropped = set(zero_important_features)
    above_0_important_features = [i for i in range(features.shape[1]) if i not in dropped]
    features = features[:, above_0_important_features]

    x_train, x_dev, y_train, y_dev = train_test_split(
        features, labels, random_state=0, test_size=dev_size
    )

    dtrain = xgb.DMatrix(x_train, label=y_train)
    dvalid = xgb.DMatrix(x_dev, label=y_dev)

    # `test` is compared with None explicitly: the truth value of a NumPy
    # array with more than one element is ambiguous and raises ValueError.
    dtest = None
    if test is not None :
        # Keep the same columns as the training features so the booster sees
        # matching feature counts at prediction time.
        dtest = xgb.DMatrix(test[:, above_0_important_features])

    return dtrain, dvalid, dtest

In [None]:
# Hand the raw .npy path and the preproc callback to the parametizer.
# NOTE(review): parametizer comes from xgb_parameterizing (not shown here);
# presumably it loads the file and calls preproc on the array -- confirm.
rizer = parametizer('../../nn/data/train_scaled.npy', preproc=preproc, log_file_index=0)

In [None]:

# NOTE(review): doall() is defined in xgb_parameterizing (not shown here);
# presumably it runs every tuning stage in sequence -- confirm.
rizer.doall()

In [None]:
# NOTE(review): defined in xgb_parameterizing (not shown here); presumably
# runs only the model-complexity stage -- confirm.
rizer.tune_model_complexity()


#  Automated Parameter Tuning

In [None]:
# Boosting budget handed to xgb_tuner: ROUNDS is passed as `rounds` and
# ESROUNDS as `srounds` (presumably the early-stopping patience -- confirm).
ROUNDS = 400

ESROUNDS = 50

# Baseline booster parameters; the tuning cells overwrite individual entries.
params = {
    'max_delta_step' : 0,
    'scale_pos_weight' : 1, # calculated for each fold. #neg / #pos

    'max_depth' : 6,
    'min_child_weight' : 1,

    'subsample' : 1,
    'colsample_bytree' : 1,

    'reg_alpha' : 0,#5
    'reg_lambda' : 1,#0.0001,

    'gamma' : 0,

    'eta' : 0.3,

    'objective' : "binary:logistic",
    'n_jobs' : -1,
    'eval_metric' : 'auc',
    # XGBoost's documented seed parameter is `seed`; `random_seed` is not a
    # recognised key, so the notebook seed never reached the booster.
    # NOTE(review): confirm xgb_tuner does not read params['random_seed'].
    'seed' : seed
}

In [None]:
# Build the tuner from the DMatrix pair and the baseline params.
# NOTE(review): xgb_tuner is imported from a local module not shown here; the
# meaning of logging / log_file_index / rounds / srounds should be confirmed there.
tuner = xgb_tuner(dtrain, dvalid, params, logging=True, log_file_index=6, rounds=ROUNDS,srounds=ESROUNDS)
# Drop the notebook-level names; re-running this cell therefore requires
# re-running the data-preparation cell first.
del dtrain, dvalid

In [None]:
#--------------------------------------------------------------
# Class-imbalance knobs: max_delta_step and scale_pos_weight.
#--------------------------------------------------------------
names = ['max_delta_step', 'scale_pos_weight']

# Only the pair (0, 1) is listed. Alternative scale_pos_weight ranges kept from
# the commented-out code: np.arange(4.9,6,.2), [4, 5, 6, 7, 8], [26, 1, 1.5, 2, 5, 10].
grid_params = [(step, weight) for step in (0,) for weight in (1,)]

# The search call itself is left disabled:
#best_delta_step, best_pos_weight = tune_params(names, grid_params)


In [None]:

# Scratch check: the odd values 1..9 as two sets, with 2, 3 and 4 merged into p1.
p2 = set(np.arange(1,10,2))
p1 = set(np.arange(1,10,2))
p1.update((2, 3, 4))


In [None]:
# Echo p1 so the notebook displays the merged set.
p1

In [None]:
#-------------------------------------------------------------------------------
# Tree complexity: max_depth and min_child_weight, tuned jointly in up to four
# passes. Every pass calls tuner(names, grid) and unpacks the returned pair.
#-------------------------------------------------------------------------------
names = ['max_depth', 'min_child_weight']

# Pass 1 - coarse grid over the odd values 1, 3, 5, 7, 9 for both parameters.
grid_params = [
    (max_depth, min_child_weight)
    for max_depth in np.arange(1,10,2)
    for min_child_weight in np.arange(1,10,2)
]
best_depth, best_child_weight = tuner(names, grid_params)

# Pass 2 - +/-1 neighbourhood of the coarse winner. A max_depth of 0 is
# skipped: XGBoost reads 0 as "no depth limit" (and the exact tree method
# rejects it), so it is not a meaningful neighbour of depth 1.
grid_params = [
    (max_depth, min_child_weight)
    for max_depth in [best_depth-1, best_depth, best_depth+1] if max_depth >= 1
    for min_child_weight in [best_child_weight-1, best_child_weight, best_child_weight+1]
]
best_depth, best_child_weight = tuner(names, grid_params)

# Pass 3 - 10 is the largest value pass 2 can produce (9 + 1). If a parameter
# landed on it, extend that parameter upwards (10, 12, 14, 16) and keep the
# other one fixed at its current best.
if best_depth == 10 or best_child_weight == 10 :
    max_depth_r = np.arange(10,17,2) if best_depth == 10 else [best_depth]
    min_child_weight_r = np.arange(10,17,2) if best_child_weight == 10 else [best_child_weight]
    grid_params = [
        (max_depth, min_child_weight)
        for max_depth in max_depth_r
        for min_child_weight in min_child_weight_r
    ]
    best_depth, best_child_weight = tuner(names, grid_params)

# Pass 4 - if the extended search moved a parameter above 10, refine that
# parameter by +/-1 around its new best value.
if best_depth > 10 or best_child_weight > 10 :
    if best_depth > 10 :
        max_depth_r = [best_depth-1, best_depth, best_depth+1]
    else :
        max_depth_r = [best_depth]
    if best_child_weight > 10 :
        min_child_weight_r = [best_child_weight-1, best_child_weight, best_child_weight+1]
    else :
        min_child_weight_r = [best_child_weight]
    grid_params = [
        (max_depth, min_child_weight)
        for max_depth in max_depth_r
        for min_child_weight in min_child_weight_r
    ]
    best_depth, best_child_weight = tuner(names, grid_params)

# Store the winners in the shared parameter dict for the later tuning cells.
params['max_depth'] = best_depth
params['min_child_weight'] = best_child_weight

In [None]:
#assert xgb_tuner.cvfolds != None

In [None]:
# Re-run the tuner on whatever `names` / `grid_params` currently hold, i.e. the
# values left behind by the last tuning cell that was executed.
tuner(names, grid_params)

-------------------

In [None]:
#--------------------------------------------------------------
# Class-imbalance knobs: max_delta_step and scale_pos_weight.
#--------------------------------------------------------------
# NOTE(review): tune_params is not defined or imported in any visible cell; the
# cells above call `tuner(names, grid_params)` instead -- confirm which is intended.
names = ('max_delta_step', 'scale_pos_weight')

# Only the pair (0, 1) is listed. Alternative scale_pos_weight ranges kept from
# the commented-out code: np.arange(4.9,6,.2), [4, 5, 6, 7, 8], [26, 1, 1.5, 2, 5, 10].
grid_params = [(step, weight) for step in (0,) for weight in (1,)]

best_delta_step, best_pos_weight = tune_params(names, grid_params)


In [None]:
#-------------------------------------------------------------------------------
# Tree complexity: max_depth and min_child_weight, tuned jointly in up to four
# passes. Every pass calls tuner(names, grid) and unpacks the returned pair.
#-------------------------------------------------------------------------------
names = ['max_depth', 'min_child_weight']

# Pass 1 - coarse grid over the odd values 1, 3, 5, 7, 9 for both parameters.
grid_params = [
    (max_depth, min_child_weight)
    for max_depth in np.arange(1,10,2)
    for min_child_weight in np.arange(1,10,2)
]
best_depth, best_child_weight = tuner(names, grid_params)

# Pass 2 - +/-1 neighbourhood of the coarse winner. A max_depth of 0 is
# skipped: XGBoost reads 0 as "no depth limit" (and the exact tree method
# rejects it), so it is not a meaningful neighbour of depth 1.
grid_params = [
    (max_depth, min_child_weight)
    for max_depth in [best_depth-1, best_depth, best_depth+1] if max_depth >= 1
    for min_child_weight in [best_child_weight-1, best_child_weight, best_child_weight+1]
]
best_depth, best_child_weight = tuner(names, grid_params)

# Pass 3 - 10 is the largest value pass 2 can produce (9 + 1). If a parameter
# landed on it, extend that parameter upwards (10, 12, 14, 16) and keep the
# other one fixed at its current best.
if best_depth == 10 or best_child_weight == 10 :
    max_depth_r = np.arange(10,17,2) if best_depth == 10 else [best_depth]
    min_child_weight_r = np.arange(10,17,2) if best_child_weight == 10 else [best_child_weight]
    grid_params = [
        (max_depth, min_child_weight)
        for max_depth in max_depth_r
        for min_child_weight in min_child_weight_r
    ]
    best_depth, best_child_weight = tuner(names, grid_params)

# Pass 4 - if the extended search moved a parameter above 10, refine that
# parameter by +/-1 around its new best value.
if best_depth > 10 or best_child_weight > 10 :
    if best_depth > 10 :
        max_depth_r = [best_depth-1, best_depth, best_depth+1]
    else :
        max_depth_r = [best_depth]
    if best_child_weight > 10 :
        min_child_weight_r = [best_child_weight-1, best_child_weight, best_child_weight+1]
    else :
        min_child_weight_r = [best_child_weight]
    grid_params = [
        (max_depth, min_child_weight)
        for max_depth in max_depth_r
        for min_child_weight in min_child_weight_r
    ]
    best_depth, best_child_weight = tuner(names, grid_params)

# Store the winners in the shared parameter dict for the later tuning cells.
params['max_depth'] = best_depth
params['min_child_weight'] = best_child_weight

In [None]:
#--------------------------------------------------------------------------------------------
# gamma: a coarse grid followed by two successively finer grids centred on the
# current best value.
# NOTE(review): tune1param is not defined or imported in any visible cell -- confirm.
#--------------------------------------------------------------------------------------------
name = 'gamma'

# Pass 1 - odd values 1, 3, 5, 7, 9.
grid_params = np.arange(1,10,2)
best_gamma = tune1param(name, grid_params)

# Pass 2 - step 0.2 over [best-1, best+1]; the 1.1 keeps the upper end inside
# the half-open arange.
grid_params = np.arange(best_gamma-1, best_gamma+1.1, 0.2)
grid_params = grid_params[grid_params >= 0]
best_gamma = tune1param(name, grid_params)

# Pass 3 - step 0.1 around the pass-2 winner. gamma must be >= 0 in XGBoost,
# so the -0.1 candidate produced when the winner is 0 is dropped.
grid_params = np.arange(best_gamma-.1, best_gamma+.11, 0.1)
grid_params = grid_params[grid_params >= 0]
best_gamma = tune1param(name, grid_params)

params['gamma'] = best_gamma

In [None]:
#------------------------------------------------------
# max_delta_step: three passes of tune1param over different candidate lists.
# NOTE(review): tune1param is not defined or imported in any visible cell -- confirm.
#------------------------------------------------------
name = 'max_delta_step'

# Pass 1 - integer candidates. (The call on this line was garbled into invalid
# syntax; restored to the same form as the other passes.)
grid_params = [1,2]
best_delta_step = tune1param(name, grid_params)

# Pass 2 - fractional candidates 0.1, 0.3, ..., 0.9.
# NOTE(review): this grid does not depend on the pass-1 result and its winner
# overwrites best_delta_step; confirm tune1param compares against earlier
# passes, otherwise a pass-1 winner of 1 or 2 is lost here.
grid_params = np.arange(.1,1,.2)
best_delta_step = tune1param(name, grid_params)

# Pass 3 - +/-0.1 around the pass-2 winner.
grid_params = [best_delta_step - .1, best_delta_step, best_delta_step + .1]
best_delta_step = tune1param(name, grid_params)

params['max_delta_step'] = best_delta_step

In [None]:
#---------------------------------------------------------------------------------------
# Row / column sampling: subsample and colsample_bytree, tuned jointly.
# NOTE(review): tune2param is not defined or imported in any visible cell -- confirm.
#---------------------------------------------------------------------------------------
names = ('subsample', 'colsample_bytree')

# Pass 1 - coarse grid 0.1, 0.3, 0.5, 0.7, 0.9 for both parameters.
grid_params = [
    (subsample, colsample_bytree)
    for subsample in np.arange(.1,1,.2)
    for colsample_bytree in np.arange(.1,1,.2)
]
best_subsample, best_colsample = tune2param(names, grid_params)

# Pass 2 - step 0.05 over [best-0.1, best+0.1]. Candidates are rounded to two
# decimals to remove floating-point drift, then restricted to (0, 1], the
# valid range for both parameters: a coarse winner of 0.1 would otherwise
# yield 0.0, and a winner of 0.9 can yield a value marginally above 1.
subsample_fine = np.round(np.arange(best_subsample-.1, best_subsample+.11, 0.05), 2)
colsample_fine = np.round(np.arange(best_colsample-.1, best_colsample+.11, 0.05), 2)
grid_params = [
    (subsample, colsample_bytree)
    for subsample in subsample_fine if 0 < subsample <= 1
    for colsample_bytree in colsample_fine if 0 < colsample_bytree <= 1
]
best_subsample, best_colsample = tune2param(names, grid_params)

params['subsample'] = best_subsample
# Key spelled as in the params dict ('colsample_bytree'); the misspelled
# 'col_sample_bytree' left the real entry at its initial value.
params['colsample_bytree'] = best_colsample


In [None]:
#--------------------------------------------------------------------------------------------
# Regularisation: reg_alpha and reg_lambda.
# NOTE(review): tune2param is not defined or imported in any visible cell -- confirm.
#--------------------------------------------------------------------------------------------
names = ('reg_alpha', 'reg_lambda')

# Pass 1 - alpha over the odd values 1..9, lambda over 1, 0.1, 0.01, 0.001, 0.0001.
grid_params = [
    (alpha, lambd)
    for alpha in np.arange(1,10,2)
    for lambd in [1 * 10 ** -exponent for exponent in range(5)]
]
best_alpha, best_lambd = tune2param(names, grid_params)

# Pass 2 - lambda stays at its pass-1 winner; alpha is refined in steps of 0.5
# over [best-1, best+1].
grid_params = [(alpha, best_lambd) for alpha in np.arange(best_alpha-1, best_alpha+1.1, 0.5)]
best_alpha, best_lambd = tune2param(names, grid_params)

# Store both winners in the shared parameter dict.
params.update({'reg_alpha': best_alpha, 'reg_lambda': best_lambd})


In [None]:
#-----------------------------------------------------------------------------
# Learning rate: choose among three candidate eta values and store the winner.
# NOTE(review): tune1param is not defined or imported in any visible cell -- confirm.
#-----------------------------------------------------------------------------
name, grid_params = 'eta', [.1, .2, .3]
best_eta = tune1param(name, grid_params)
params.update({'eta': best_eta})

In [None]:
# NOTE(review): `import log` is commented out in the first cell, so `log` is not
# bound by any visible cell -- confirm where this handle comes from.
log.close()

# Done tuning

------------------

# Making Submission Data

In [None]:
# NOTE(review): neither tune1param nor test_preds_dict is defined in any visible
# cell; presumably this run is what fills test_preds_dict -- confirm.
tune1param('eta', [.1])
#----------
# Average the stored prediction vectors element-wise. np.mean builds a new
# array, so the arrays held in test_preds_dict stay untouched; accumulating
# with `+=` on the first stored array would modify it in place and give a
# different result every time the cell is re-run.
test_preds = np.mean(list(test_preds_dict.values()), axis=0)
#---------
# Load the test ids only if a usable `ids` is not already in the namespace.
try :
    len(ids)
except (NameError, TypeError) :
    ids = np.load('../../nn/data/test_ids.npy')

# Text mode: the rows are written as str, which a file opened with 'wb'
# rejects on Python 3. Each row is preceded by its newline, so the file has no
# trailing newline.
with open('../data/submission.csv', 'w') as f :
    f.write('id,target')

    for i, p in zip(ids, test_preds) :
        f.write('\n%d,%.4f' % (i, p))