In [2]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
# from numba import jit
import time
import gc

In [3]:
# Compute gini
# from CPMP's kernel https://www.kaggle.com/cpmpml/extremely-fast-gini-computation
# @jit
def eval_gini(y_true, y_prob):
    """Gini score of ``y_prob`` as a ranking of the labels in ``y_true``.

    The labels are reordered by ascending ``y_prob`` and scanned from the
    highest-scored sample downwards.  For every label the scan accumulates
    ``label * (sum of (1 - label) over samples scored above it)``, and the
    total is normalised by ``ntrue * (n - ntrue)``.

    y_true : array-like of labels (the ``1 - label`` term assumes 0/1 values)
    y_prob : array-like of scores, same length as ``y_true``
    Returns ``1 - 2 * total / (ntrue * (n - ntrue))``.
    """
    labels = np.asarray(y_true)
    ranked = labels[np.argsort(y_prob)]
    n_samples = len(ranked)
    n_positive = 0
    misordered = 0
    negatives_above = 0
    # Walk from the highest score to the lowest
    for label in ranked[::-1]:
        n_positive += label
        misordered += label * negatives_above
        negatives_above += 1 - label
    return 1 - 2 * misordered / (n_positive * (n_samples - n_positive))

In [4]:
# Functions from olivier's kernel
# https://www.kaggle.com/ogrellier/xgb-classifier-upsampling-lb-0-283

def gini_xgb(preds, dtrain):
    """Return the negated Gini of ``preds`` as a one-element ``[('gini', value)]`` list.

    preds : predicted scores
    dtrain : object exposing ``get_label()`` (presumably an xgboost DMatrix,
        with this function passed as a custom eval metric -- confirm at the call site)
    """
    # The sign is flipped, so a higher Gini gives a lower reported value
    return [('gini', -eval_gini(dtrain.get_label(), preds))]


def add_noise(series, noise_level):
    """Multiply each element of ``series`` by ``1 + noise_level * z``, z ~ N(0, 1).

    One independent draw from the global numpy random state is used per
    element; ``noise_level == 0`` leaves the values unchanged.
    """
    jitter = np.random.randn(len(series))
    scale = 1 + noise_level * jitter
    return series * scale


def _merge_category_means(series, lookup, prior, out_name):
    """Replace each category in ``series`` by its 'average' value from ``lookup``."""
    encoded = pd.merge(
        series.to_frame(series.name),
        lookup,
        on=series.name,
        how='left')['average'].rename(out_name).fillna(prior)
    # pd.merge does not keep the index so restore it
    encoded.index = series.index
    return encoded


def target_encode(trn_series=None,    # Revised to encode validation series
                  val_series=None,
                  tst_series=None,
                  target=None,
                  min_samples_leaf=1,
                  smoothing=1,
                  noise_level=0):
    """
    Target-encode one categorical feature using statistics from the training series.

    Smoothing is computed like in the following paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf

    For each category seen in ``trn_series`` the encoded value is
        prior * (1 - w) + category_mean * w,
        w = 1 / (1 + exp(-(count - min_samples_leaf) / smoothing))
    where ``prior`` is the overall mean of ``target``.  Categories without
    training statistics are encoded as ``prior``.

    trn_series : training categorical feature as a pd.Series
    val_series : validation categorical feature as a pd.Series (optional)
    tst_series : test categorical feature as a pd.Series (optional)
    target : target data as a pd.Series
    min_samples_leaf (int) : minimum samples to take category average into account
    smoothing (int) : smoothing effect to balance categorical average vs prior
    noise_level : multiplicative gaussian noise level passed to add_noise

    Returns a (train, validation, test) tuple of encoded series named
    ``<feature>_mean``, each keeping the index of its input.  An entry is None
    when the corresponding input series was None.
    """
    assert len(trn_series) == len(target)
    # Every series is merged on the training feature name, so the names must agree
    for other in (val_series, tst_series):
        assert other is None or other.name == trn_series.name
    temp = pd.concat([trn_series, target], axis=1)
    # Per-category target mean and number of samples
    stats = temp.groupby(by=trn_series.name)[target.name].agg(["mean", "count"])
    # Sigmoid weight: the bigger the count, the less the prior is taken into account
    weight = 1 / (1 + np.exp(-(stats["count"] - min_samples_leaf) / smoothing))
    prior = target.mean()
    # Two-column lookup frame: <feature name>, 'average'
    lookup = (prior * (1 - weight) + stats["mean"] * weight).rename('average').reset_index()
    out_name = trn_series.name + '_mean'
    encoded = [
        None if series is None else _merge_category_means(series, lookup, prior, out_name)
        for series in (trn_series, val_series, tst_series)
    ]
    # Noise is drawn in train, validation, test order
    return tuple(
        None if series is None else add_noise(series, noise_level)
        for series in encoded
    )

In [5]:
# Read data
# Load both CSVs; cells containing the string "-1" are parsed as missing values (NaN).
# For a quick smoke run, append .iloc[0:200, :] to the train_df line.
train_df = pd.read_csv(filepath_or_buffer='train.csv', na_values="-1")
test_df = pd.read_csv(filepath_or_buffer='test.csv', na_values="-1")

In [6]:
# from olivier
# Columns fed to the model.  The trailing numbers are copied from the source list
# (NOTE(review): presumably feature importance vs. shadow-feature importance -- confirm).
train_features = [
    "ps_car_13",      # 1571.65 / shadow 609.23
    "ps_reg_03",      # 1408.42 / shadow 511.15
    "ps_ind_05_cat",  # 1387.87 / shadow  84.72
    "ps_ind_03",      # 1219.47 / shadow 230.55
    "ps_ind_15",      #  922.18 / shadow 242.00
    "ps_reg_02",      #  920.65 / shadow 267.50
    "ps_car_14",      #  798.48 / shadow 549.58
    "ps_car_12",      #  731.93 / shadow 293.62
    "ps_car_01_cat",  #  698.07 / shadow 178.72
    "ps_car_07_cat",  #  694.53 / shadow  36.35
    "ps_ind_17_bin",  #  620.77 / shadow  23.15
    "ps_car_03_cat",  #  611.73 / shadow  50.67
    "ps_reg_01",      #  598.60 / shadow 178.57
    "ps_car_15",      #  593.35 / shadow 226.43
    "ps_ind_01",      #  547.32 / shadow 154.58
    "ps_ind_16_bin",  #  475.37 / shadow  34.17
    "ps_ind_07_bin",  #  435.28 / shadow  28.92
    "ps_car_06_cat",  #  398.02 / shadow 212.43
    "ps_car_04_cat",  #  376.87 / shadow  76.98
    "ps_ind_06_bin",  #  370.97 / shadow  36.13
    "ps_car_09_cat",  #  214.12 / shadow  81.38
    "ps_car_02_cat",  #  203.03 / shadow  26.67
    "ps_ind_02_cat",  #  189.47 / shadow  65.68
    "ps_car_11",      #  173.28 / shadow  76.45
    "ps_car_05_cat",  #  172.75 / shadow  62.92
    "ps_calc_09",     #  169.13 / shadow 129.72
    "ps_calc_05",     #  148.83 / shadow 120.68
    "ps_ind_08_bin",  #  140.73 / shadow  27.63
    "ps_car_08_cat",  #  120.87 / shadow  28.82
    "ps_ind_09_bin",  #  113.92 / shadow  27.05
    "ps_ind_04_cat",  #  107.27 / shadow  37.43
    "ps_ind_18_bin",  #   77.42 / shadow  25.97
    "ps_ind_12_bin",  #   39.67 / shadow  15.52
    "ps_ind_14",      #   37.37 / shadow  16.65
]

# add combinations
# Column pairs; each pair is later joined into one additional label-encoded feature
combs = [
    ('ps_reg_01', partner)
    for partner in ('ps_car_02_cat', 'ps_car_04_cat')
]

In [7]:
# Process data
# Keep the ids and the target aside before the frames are narrowed to the model features
id_test = test_df['id'].values
id_train = train_df['id'].values
y = train_df['target']

start = time.time()
for n_c, (f1, f2) in enumerate(combs):
    name1 = f1 + "_plus_" + f2
    print('current feature %60s %4d in %5.1f'
          % (name1, n_c + 1, (time.time() - start) / 60), end='')
    print('\r' * 75, end='')
    # Join the two source columns as strings to form one combined category
    train_df[name1] = train_df[f1].map(str) + "_" + train_df[f2].map(str)
    test_df[name1] = test_df[f1].map(str) + "_" + test_df[f2].map(str)
    # Label Encode: fit on train + test so both frames share the same integer codes
    lbl = LabelEncoder()
    lbl.fit(list(train_df[name1].values) + list(test_df[name1].values))
    train_df[name1] = lbl.transform(list(train_df[name1].values))
    test_df[name1] = lbl.transform(list(test_df[name1].values))

    # train_features lives across cells: only register the column once, so that
    # re-running this cell (after reloading the data) does not duplicate columns in X
    if name1 not in train_features:
        train_features.append(name1)

# From here on test_df only holds the model features (the 'id' column is gone)
X = train_df[train_features]
test_df = test_df[train_features]

# Names of the categorical features (every column whose name contains "_cat")
f_cats = [f for f in X.columns if "_cat" in f]

current feature                                 ps_reg_01_plus_ps_car_04_cat    2 in   0.1

In [8]:
# Zero-initialised prediction holders: the validation one mirrors y (same index),
# the test one starts as a plain scalar (presumably accumulated into by later cells)
y_valid_pred = y * 0
y_test_pred = 0

In [9]:
# set the seed of random number generator, which is useful for creating simulations 
# or random objects that can be reproduced.
# sklearn.cross_validation was removed from scikit-learn; model_selection is the
# supported location (and the one the import cell at the top already uses)
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, accuracy_score, f1_score, roc_auc_score
import random
import xgboost as xgb

SEED = 3
random.seed(SEED)
np.random.seed(SEED)

cv_folds = 3
early_stopping_rounds = 200
model = xgb.XGBClassifier(seed=SEED)
# Set the learning rate BEFORE snapshotting the parameters, otherwise xgb_param
# would still carry the constructor's default learning rate
model.set_params(learning_rate=0.01)
xgb_param = model.get_xgb_params()
# Last expression: display the configured estimator
model



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.01, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=3, silent=True,
       subsample=1)

In [9]:
# set the seed of random number generator, which is useful for creating simulations 
# or random objects that can be reproduced.
# sklearn.cross_validation was removed from scikit-learn; model_selection is the
# supported location (and the one the import cell at the top already uses)
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, accuracy_score, f1_score, roc_auc_score
import random
import xgboost as xgb

SEED = 3
random.seed(SEED)
np.random.seed(SEED)

# NOTE(review): an earlier cell holds this same setup with cv_folds = 3 and whichever
# cell ran last wins; the grid-search logs further down report 3 folds
cv_folds = 5
early_stopping_rounds = 200
model = xgb.XGBClassifier(seed=SEED)
# Set the learning rate BEFORE snapshotting the parameters, otherwise xgb_param
# (used by the xgb.cv call below) would still carry the default learning rate
model.set_params(learning_rate=0.01)
xgb_param = model.get_xgb_params()
# Last expression: display the configured estimator
model

# Uncomment to re-run the cross-validated search for the number of boosting rounds:
# xgtrain = xgb.DMatrix(X, y)
# cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round = 2000, nfold = cv_folds, metrics = 'auc', early_stopping_rounds = early_stopping_rounds, seed = SEED)
# print (cvresult)
# print ("Optimal number of trees (estimators) is %i" % cvresult.shape[0])

     test-auc-mean  test-auc-std  train-auc-mean  train-auc-std
0         0.583251      0.001703        0.585571       0.003303
1         0.598575      0.005206        0.600856       0.001901
2         0.601517      0.002868        0.604387       0.001977
3         0.602701      0.004071        0.605352       0.001240
4         0.604170      0.002749        0.607322       0.001575
5         0.605909      0.003594        0.608810       0.000637
6         0.605935      0.003851        0.608465       0.000756
7         0.606223      0.003595        0.609088       0.000836
8         0.606379      0.003853        0.609122       0.000639
9         0.607044      0.003354        0.609871       0.000927
10        0.607384      0.003022        0.610391       0.001059
11        0.607921      0.003217        0.610493       0.000852
12        0.608179      0.003356        0.610860       0.001033
13        0.608204      0.003123        0.611185       0.001096
14        0.608621      0.003312        

In [11]:
%%time
# sklearn.grid_search was removed from scikit-learn; GridSearchCV lives in model_selection
from sklearn.model_selection import GridSearchCV

# Stage 1: tune the tree shape (max_depth x min_child_weight) by cross-validated ROC AUC.
# NOTE(review): 355 rounds is presumably the optimum reported by the xgb.cv run -- confirm
model.set_params(n_estimators = 355)
param_test1 = {
    'max_depth': [4,5,6,7],
    'min_child_weight': [1,2,3,4]
}
# NOTE(review): iid=False is kept so scores match the original run; scikit-learn >= 0.24
# no longer accepts this argument (unweighted fold averaging is the only behaviour there)
gsearch1 = GridSearchCV(estimator = model, param_grid = param_test1, scoring = 'roc_auc', n_jobs = 32, iid = False, cv = cv_folds, verbose = 10)
gsearch1.fit(X,y)
# Per-candidate scores are available as gsearch1.cv_results_
print (gsearch1.best_params_)
print (gsearch1.best_score_)

Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] max_depth=4, min_child_weight=1 .................................
[CV] max_depth=4, min_child_weight=1 .................................
[CV] max_depth=4, min_child_weight=1 .................................
[CV] max_depth=4, min_child_weight=2 .................................
[CV] max_depth=4, min_child_weight=2 .................................
[CV] max_depth=4, min_child_weight=2 .................................
[CV] max_depth=4, min_child_weight=3 .................................
[CV] max_depth=4, min_child_weight=3 .................................
[CV] max_depth=4, min_child_weight=3 .................................
[CV] max_depth=4, min_child_weight=4 .................................
[CV] max_depth=4, min_child_weight=4 .................................
[CV] max_depth=4, min_child_weight=4 .................................
[CV] max_depth=5, min_child_weight=1 .................................
[CV] max_depth=5

[Parallel(n_jobs=32)]: Done   5 out of  48 | elapsed: 12.9min remaining: 111.2min


[CV] ........ max_depth=4, min_child_weight=4, score=0.627277 -12.9min
[CV] max_depth=7, min_child_weight=1 .................................
[CV] ........ max_depth=4, min_child_weight=4, score=0.626878 -12.9min
[CV] max_depth=7, min_child_weight=1 .................................
[CV] ........ max_depth=4, min_child_weight=3, score=0.627324 -12.9min
[CV] max_depth=7, min_child_weight=2 .................................
[CV] ........ max_depth=4, min_child_weight=4, score=0.625495 -13.1min
[CV] max_depth=7, min_child_weight=2 .................................
[CV] ........ max_depth=4, min_child_weight=2, score=0.625445 -13.3min
[CV] max_depth=7, min_child_weight=2 .................................


[Parallel(n_jobs=32)]: Done  10 out of  48 | elapsed: 13.4min remaining: 50.8min


[CV] ........ max_depth=4, min_child_weight=2, score=0.627354 -13.4min
[CV] max_depth=7, min_child_weight=3 .................................
[CV] ........ max_depth=4, min_child_weight=3, score=0.625536 -13.8min
[CV] max_depth=7, min_child_weight=3 .................................
[CV] ........ max_depth=5, min_child_weight=4, score=0.630105 -16.1min
[CV] max_depth=7, min_child_weight=3 .................................
[CV] ........ max_depth=5, min_child_weight=3, score=0.630084 -16.1min
[CV] max_depth=7, min_child_weight=4 .................................
[CV] ........ max_depth=5, min_child_weight=2, score=0.630071 -16.2min
[CV] max_depth=7, min_child_weight=4 .................................


[Parallel(n_jobs=32)]: Done  15 out of  48 | elapsed: 16.3min remaining: 36.0min


[CV] ........ max_depth=5, min_child_weight=2, score=0.630129 -16.3min
[CV] max_depth=7, min_child_weight=4 .................................
[CV] ........ max_depth=5, min_child_weight=3, score=0.630062 -16.4min
[CV] ........ max_depth=5, min_child_weight=4, score=0.628515 -16.4min
[CV] ........ max_depth=5, min_child_weight=1, score=0.630002 -16.5min
[CV] ........ max_depth=5, min_child_weight=4, score=0.630049 -16.5min


[Parallel(n_jobs=32)]: Done  20 out of  48 | elapsed: 16.6min remaining: 23.2min


[CV] ........ max_depth=5, min_child_weight=1, score=0.630178 -16.5min
[CV] ........ max_depth=5, min_child_weight=1, score=0.628372 -16.6min
[CV] ........ max_depth=5, min_child_weight=2, score=0.628437 -16.9min
[CV] ........ max_depth=5, min_child_weight=3, score=0.628533 -16.9min
[CV] ........ max_depth=6, min_child_weight=2, score=0.631934 -19.1min


[Parallel(n_jobs=32)]: Done  25 out of  48 | elapsed: 19.3min remaining: 17.8min


[CV] ........ max_depth=6, min_child_weight=1, score=0.632263 -19.2min
[CV] ........ max_depth=6, min_child_weight=3, score=0.631743 -19.2min
[CV] ........ max_depth=6, min_child_weight=1, score=0.630252 -19.4min
[CV] ........ max_depth=6, min_child_weight=2, score=0.631576 -19.5min
[CV] ........ max_depth=6, min_child_weight=1, score=0.631434 -19.6min


[Parallel(n_jobs=32)]: Done  30 out of  48 | elapsed: 19.7min remaining: 11.8min


[CV] ........ max_depth=6, min_child_weight=3, score=0.631457 -19.5min
[CV] ........ max_depth=6, min_child_weight=2, score=0.630273 -19.5min
[CV] ........ max_depth=6, min_child_weight=4, score=0.630547 -12.4min
[CV] ........ max_depth=6, min_child_weight=4, score=0.631480 -12.5min
[CV] ........ max_depth=6, min_child_weight=3, score=0.630307 -13.7min


[Parallel(n_jobs=32)]: Done  35 out of  48 | elapsed: 26.2min remaining:  9.7min


[CV] ........ max_depth=6, min_child_weight=4, score=0.631939 -13.5min
[CV] ........ max_depth=7, min_child_weight=2, score=0.632862 -13.5min
[CV] ........ max_depth=7, min_child_weight=1, score=0.630654 -13.8min
[CV] ........ max_depth=7, min_child_weight=3, score=0.632687 -13.0min
[CV] ........ max_depth=7, min_child_weight=2, score=0.630350 -13.6min


[Parallel(n_jobs=32)]: Done  40 out of  48 | elapsed: 26.9min remaining:  5.4min


[CV] ........ max_depth=7, min_child_weight=3, score=0.630812 -13.6min
[CV] ........ max_depth=7, min_child_weight=1, score=0.632928 -14.9min
[CV] ........ max_depth=7, min_child_weight=2, score=0.631130 -15.3min
[CV] ........ max_depth=7, min_child_weight=1, score=0.631247 -15.3min
[CV] ........ max_depth=7, min_child_weight=4, score=0.633112 -12.8min


[Parallel(n_jobs=32)]: Done  45 out of  48 | elapsed: 29.1min remaining:  1.9min


[CV] ........ max_depth=7, min_child_weight=3, score=0.631009 -13.1min
[CV] ........ max_depth=7, min_child_weight=4, score=0.630965 -12.9min
[CV] ........ max_depth=7, min_child_weight=4, score=0.630689 -13.1min


[Parallel(n_jobs=32)]: Done  48 out of  48 | elapsed: 29.3min finished


{'max_depth': 7, 'min_child_weight': 1}
0.6316094344978929
CPU times: user 12min 38s, sys: 2.7 s, total: 12min 41s
Wall time: 41min 36s


In [19]:
%%time
# Stage 2: tune gamma with the tree shape fixed at the stage-1 winners
# (max_depth=7, min_child_weight=1).  Reuses model, X, y and cv_folds from earlier cells.
model.set_params(n_estimators=355, max_depth=7, min_child_weight=1)
param_test2 = {
    'gamma': [tenth / 10.0 for tenth in range(9)],   # 0.0, 0.1, ..., 0.8
}
gsearch2 = GridSearchCV(
    estimator=model,
    param_grid=param_test2,
    scoring='roc_auc',
    n_jobs=32,
    iid=False,
    cv=cv_folds,
    verbose=1,
)
gsearch2.fit(X, y)
print(gsearch2.best_params_)
print(gsearch2.best_score_)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=32)]: Done  20 out of  27 | elapsed: 20.2min remaining:  7.1min
[Parallel(n_jobs=32)]: Done  27 out of  27 | elapsed: 20.3min finished


{'gamma': 0.6}
0.6317915186237932
CPU times: user 12min 32s, sys: 2.13 s, total: 12min 34s
Wall time: 32min 40s


In [20]:
%%time
# Stage 3: tune row / column subsampling on top of the gamma chosen in stage 2
bestgama = gsearch2.best_params_['gamma']
model.set_params(
    n_estimators=355,
    learning_rate=0.01,
    max_depth=7,
    min_child_weight=1,
    gamma=bestgama,
)

# Both ratios are searched over 0.6, 0.7, ..., 1.0
param_test3 = {
    'subsample': [tenth / 10.0 for tenth in range(6, 11)],
    'colsample_bytree': [tenth / 10.0 for tenth in range(6, 11)],
}
gsearch3 = GridSearchCV(
    estimator=model,
    param_grid=param_test3,
    scoring='roc_auc',
    n_jobs=32,
    iid=False,
    cv=cv_folds,
    verbose=25,
)
gsearch3.fit(X, y)
print(gsearch3.best_params_)
print(gsearch3.best_score_)

Fitting 3 folds for each of 25 candidates, totalling 75 fits
[CV] colsample_bytree=0.6, subsample=0.6 .............................
[CV] colsample_bytree=0.6, subsample=0.6 .............................
[CV] colsample_bytree=0.6, subsample=0.6 .............................
[CV] colsample_bytree=0.6, subsample=0.7 .............................
[CV] colsample_bytree=0.6, subsample=0.7 .............................
[CV] colsample_bytree=0.6, subsample=0.7 .............................
[CV] colsample_bytree=0.6, subsample=0.8 .............................
[CV] colsample_bytree=0.6, subsample=0.8 .............................
[CV] colsample_bytree=0.6, subsample=0.8 .............................
[CV] colsample_bytree=0.6, subsample=0.9 .............................
[CV] colsample_bytree=0.6, subsample=0.9 .............................
[CV] colsample_bytree=0.6, subsample=0.9 .............................
[CV] colsample_bytree=0.6, subsample=1.0 .............................
[CV] colsample_b

[Parallel(n_jobs=32)]: Done   1 tasks      | elapsed: 14.1min


[CV] .... colsample_bytree=0.6, subsample=0.7, score=0.636130 -14.1min
[CV] colsample_bytree=0.8, subsample=0.7 .............................


[Parallel(n_jobs=32)]: Done   2 tasks      | elapsed: 14.1min


[CV] .... colsample_bytree=0.6, subsample=0.6, score=0.636382 -14.2min
[CV] colsample_bytree=0.8, subsample=0.7 .............................


[Parallel(n_jobs=32)]: Done   3 tasks      | elapsed: 14.2min


[CV] .... colsample_bytree=0.6, subsample=0.9, score=0.635012 -14.2min
[CV] colsample_bytree=0.8, subsample=0.7 .............................


[Parallel(n_jobs=32)]: Done   4 tasks      | elapsed: 14.2min


[CV] .... colsample_bytree=0.6, subsample=0.8, score=0.635466 -14.3min
[CV] colsample_bytree=0.8, subsample=0.8 .............................


[Parallel(n_jobs=32)]: Done   5 tasks      | elapsed: 14.3min


[CV] .... colsample_bytree=0.6, subsample=0.7, score=0.635603 -14.4min
[CV] colsample_bytree=0.8, subsample=0.8 .............................


[Parallel(n_jobs=32)]: Done   6 tasks      | elapsed: 14.4min


[CV] .... colsample_bytree=0.6, subsample=0.8, score=0.633358 -14.4min
[CV] colsample_bytree=0.8, subsample=0.8 .............................


[Parallel(n_jobs=32)]: Done   7 tasks      | elapsed: 14.4min


[CV] .... colsample_bytree=0.6, subsample=0.6, score=0.635880 -14.5min
[CV] colsample_bytree=0.8, subsample=0.9 .............................


[Parallel(n_jobs=32)]: Done   8 tasks      | elapsed: 14.5min


[CV] .... colsample_bytree=0.6, subsample=0.7, score=0.633447 -14.6min
[CV] colsample_bytree=0.8, subsample=0.9 .............................


[Parallel(n_jobs=32)]: Done   9 tasks      | elapsed: 14.6min


[CV] .... colsample_bytree=0.6, subsample=0.6, score=0.633555 -14.7min
[CV] colsample_bytree=0.8, subsample=0.9 .............................


[Parallel(n_jobs=32)]: Done  10 tasks      | elapsed: 14.7min


[CV] .... colsample_bytree=0.6, subsample=1.0, score=0.635476 -14.7min
[CV] colsample_bytree=0.8, subsample=1.0 .............................


[Parallel(n_jobs=32)]: Done  11 tasks      | elapsed: 14.8min


[CV] .... colsample_bytree=0.6, subsample=0.8, score=0.636553 -14.7min
[CV] colsample_bytree=0.8, subsample=1.0 .............................


[Parallel(n_jobs=32)]: Done  12 tasks      | elapsed: 14.8min


[CV] .... colsample_bytree=0.6, subsample=1.0, score=0.634544 -15.0min
[CV] colsample_bytree=0.8, subsample=1.0 .............................
[CV] .... colsample_bytree=0.6, subsample=1.0, score=0.633440 -15.1min
[CV] colsample_bytree=0.9, subsample=0.6 .............................
[CV] .... colsample_bytree=0.6, subsample=0.9, score=0.633232 -15.3min
[CV] colsample_bytree=0.9, subsample=0.6 .............................
[CV] .... colsample_bytree=0.7, subsample=0.9, score=0.635122 -16.3min
[CV] colsample_bytree=0.9, subsample=0.6 .............................


[Parallel(n_jobs=32)]: Done  16 out of  75 | elapsed: 16.4min remaining: 60.5min


[CV] .... colsample_bytree=0.7, subsample=0.7, score=0.635557 -16.5min
[CV] colsample_bytree=0.9, subsample=0.7 .............................
[CV] .... colsample_bytree=0.7, subsample=0.6, score=0.635592 -16.8min
[CV] colsample_bytree=0.9, subsample=0.7 .............................
[CV] .... colsample_bytree=0.7, subsample=0.8, score=0.632746 -16.9min
[CV] colsample_bytree=0.9, subsample=0.7 .............................
[CV] .... colsample_bytree=0.7, subsample=0.7, score=0.635437 -16.9min
[CV] colsample_bytree=0.9, subsample=0.8 .............................


[Parallel(n_jobs=32)]: Done  20 out of  75 | elapsed: 17.0min remaining: 46.9min


[CV] .... colsample_bytree=0.7, subsample=1.0, score=0.634387 -16.9min
[CV] colsample_bytree=0.9, subsample=0.8 .............................
[CV] .... colsample_bytree=0.7, subsample=1.0, score=0.634003 -17.2min
[CV] colsample_bytree=0.9, subsample=0.8 .............................
[CV] .... colsample_bytree=0.7, subsample=1.0, score=0.632716 -17.3min
[CV] colsample_bytree=0.9, subsample=0.9 .............................
[CV] .... colsample_bytree=0.7, subsample=0.8, score=0.635412 -17.4min
[CV] colsample_bytree=0.9, subsample=0.9 .............................


[Parallel(n_jobs=32)]: Done  24 out of  75 | elapsed: 17.5min remaining: 37.3min


[CV] .... colsample_bytree=0.7, subsample=0.9, score=0.634269 -17.5min
[CV] colsample_bytree=0.9, subsample=0.9 .............................
[CV] .... colsample_bytree=0.7, subsample=0.8, score=0.634909 -17.7min
[CV] colsample_bytree=0.9, subsample=1.0 .............................
[CV] .... colsample_bytree=0.7, subsample=0.9, score=0.633120 -17.8min
[CV] colsample_bytree=0.9, subsample=1.0 .............................
[CV] .... colsample_bytree=0.7, subsample=0.6, score=0.635160 -18.0min
[CV] colsample_bytree=0.9, subsample=1.0 .............................


[Parallel(n_jobs=32)]: Done  28 out of  75 | elapsed: 18.1min remaining: 30.4min


[CV] .... colsample_bytree=0.7, subsample=0.7, score=0.633023 -18.0min
[CV] colsample_bytree=1.0, subsample=0.6 .............................
[CV] .... colsample_bytree=0.7, subsample=0.6, score=0.633108 -19.0min
[CV] colsample_bytree=1.0, subsample=0.6 .............................
[CV] .... colsample_bytree=0.8, subsample=0.6, score=0.635413 -19.5min
[CV] colsample_bytree=1.0, subsample=0.6 .............................
[CV] .... colsample_bytree=0.8, subsample=0.6, score=0.635732 -19.8min
[CV] colsample_bytree=1.0, subsample=0.7 .............................


[Parallel(n_jobs=32)]: Done  32 out of  75 | elapsed: 20.0min remaining: 26.9min


[CV] .... colsample_bytree=0.8, subsample=1.0, score=0.633994 -17.9min
[CV] colsample_bytree=1.0, subsample=0.7 .............................
[CV] .... colsample_bytree=0.8, subsample=0.7, score=0.635404 -18.7min
[CV] colsample_bytree=1.0, subsample=0.7 .............................
[CV] .... colsample_bytree=0.8, subsample=0.8, score=0.632896 -18.5min
[CV] colsample_bytree=1.0, subsample=0.8 .............................
[CV] .... colsample_bytree=0.8, subsample=0.9, score=0.634766 -18.4min
[CV] colsample_bytree=1.0, subsample=0.8 .............................


[Parallel(n_jobs=32)]: Done  36 out of  75 | elapsed: 33.0min remaining: 35.7min


[CV] .... colsample_bytree=0.8, subsample=1.0, score=0.633313 -18.2min
[CV] colsample_bytree=1.0, subsample=0.8 .............................
[CV] .... colsample_bytree=0.8, subsample=0.8, score=0.635111 -18.8min
[CV] colsample_bytree=1.0, subsample=0.9 .............................
[CV] .... colsample_bytree=0.8, subsample=0.6, score=0.632900 -19.1min
[CV] colsample_bytree=1.0, subsample=0.9 .............................
[CV] .... colsample_bytree=0.8, subsample=0.9, score=0.633225 -18.6min
[CV] colsample_bytree=1.0, subsample=0.9 .............................


[Parallel(n_jobs=32)]: Done  40 out of  75 | elapsed: 33.3min remaining: 29.1min


[CV] .... colsample_bytree=0.8, subsample=0.8, score=0.635174 -18.9min
[CV] colsample_bytree=1.0, subsample=1.0 .............................
[CV] .... colsample_bytree=0.8, subsample=0.9, score=0.633947 -18.9min
[CV] colsample_bytree=1.0, subsample=1.0 .............................
[CV] .... colsample_bytree=0.8, subsample=0.7, score=0.635436 -19.5min
[CV] colsample_bytree=1.0, subsample=1.0 .............................
[CV] .... colsample_bytree=0.8, subsample=0.7, score=0.633191 -19.4min


[Parallel(n_jobs=32)]: Done  44 out of  75 | elapsed: 33.6min remaining: 23.7min


[CV] .... colsample_bytree=0.8, subsample=1.0, score=0.632141 -18.5min
[CV] .... colsample_bytree=0.9, subsample=0.6, score=0.634861 -21.1min
[CV] .... colsample_bytree=0.9, subsample=0.6, score=0.635439 -21.6min
[CV] .... colsample_bytree=0.9, subsample=0.8, score=0.634416 -20.5min


[Parallel(n_jobs=32)]: Done  48 out of  75 | elapsed: 37.6min remaining: 21.1min


[CV] .... colsample_bytree=0.9, subsample=0.8, score=0.634549 -20.8min
[CV] .... colsample_bytree=0.9, subsample=1.0, score=0.633669 -19.9min
[CV] .... colsample_bytree=0.9, subsample=0.9, score=0.634232 -20.4min
[CV] .... colsample_bytree=0.9, subsample=0.6, score=0.632884 -21.6min


[Parallel(n_jobs=32)]: Done  52 out of  75 | elapsed: 38.0min remaining: 16.8min


[CV] .... colsample_bytree=0.9, subsample=1.0, score=0.632353 -20.2min
[CV] .... colsample_bytree=0.9, subsample=0.9, score=0.633773 -20.6min
[CV] .... colsample_bytree=0.9, subsample=0.7, score=0.634868 -21.5min
[CV] .... colsample_bytree=0.9, subsample=0.7, score=0.634866 -21.2min


[Parallel(n_jobs=32)]: Done  56 out of  75 | elapsed: 38.1min remaining: 12.9min


[CV] .... colsample_bytree=0.9, subsample=0.9, score=0.632629 -20.5min
[CV] .... colsample_bytree=0.9, subsample=1.0, score=0.632070 -20.1min
[CV] .... colsample_bytree=0.9, subsample=0.8, score=0.632720 -20.9min
[CV] .... colsample_bytree=0.9, subsample=0.7, score=0.633162 -21.5min


[Parallel(n_jobs=32)]: Done  60 out of  75 | elapsed: 38.5min remaining:  9.6min


[CV] .... colsample_bytree=1.0, subsample=0.6, score=0.634964 -21.7min
[CV] .... colsample_bytree=1.0, subsample=0.6, score=0.635033 -21.1min
[CV] .... colsample_bytree=1.0, subsample=0.7, score=0.634536 -20.4min
[CV] .... colsample_bytree=1.0, subsample=0.6, score=0.632946 -21.0min


[Parallel(n_jobs=32)]: Done  64 out of  75 | elapsed: 40.7min remaining:  7.0min


[CV] .... colsample_bytree=1.0, subsample=0.7, score=0.634453 -12.4min
[CV] .... colsample_bytree=1.0, subsample=0.8, score=0.632748 -12.4min
[CV] .... colsample_bytree=1.0, subsample=0.9, score=0.633831 -12.4min
[CV] .... colsample_bytree=1.0, subsample=1.0, score=0.633068 -12.5min


[Parallel(n_jobs=32)]: Done  68 out of  75 | elapsed: 46.0min remaining:  4.7min


[CV] .... colsample_bytree=1.0, subsample=1.0, score=0.631619 -12.7min
[CV] .... colsample_bytree=1.0, subsample=0.9, score=0.633647 -13.1min
[CV] .... colsample_bytree=1.0, subsample=0.8, score=0.634367 -13.3min
[CV] .... colsample_bytree=1.0, subsample=0.8, score=0.634589 -13.4min


[Parallel(n_jobs=32)]: Done  72 out of  75 | elapsed: 46.3min remaining:  1.9min


[CV] .... colsample_bytree=1.0, subsample=0.9, score=0.632603 -13.1min
[CV] .... colsample_bytree=1.0, subsample=1.0, score=0.630687 -12.9min
[CV] .... colsample_bytree=1.0, subsample=0.7, score=0.632767 -13.7min


[Parallel(n_jobs=32)]: Done  75 out of  75 | elapsed: 46.6min finished


{'colsample_bytree': 0.6, 'subsample': 0.6}
0.6352723459471995
CPU times: user 10min 1s, sys: 3.6 s, total: 10min 5s
Wall time: 56min 4s


In [24]:
%%time
# Stage 4: tune the L1 regularisation term (reg_alpha) with every earlier choice applied
bestgama = gsearch2.best_params_['gamma']
subsampleratio = gsearch3.best_params_['subsample']
colsamplebytree = gsearch3.best_params_['colsample_bytree']

model.set_params(
    subsample=subsampleratio,
    colsample_bytree=colsamplebytree,
    n_estimators=355,
    learning_rate=0.01,
    max_depth=7,
    min_child_weight=1,
    gamma=bestgama,
)

param_test4 = {
    'reg_alpha': [1e-6, 3e-6, 5e-6, 8e-6, 1e-5],
}
gsearch4 = GridSearchCV(
    estimator=model,
    param_grid=param_test4,
    scoring='roc_auc',
    n_jobs=32,
    iid=False,
    cv=cv_folds,
    verbose=2,
)
gsearch4.fit(X, y)
print(gsearch4.best_params_)
print(gsearch4.best_score_)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] reg_alpha=1e-06 .................................................
[CV] reg_alpha=1e-06 .................................................
[CV] reg_alpha=1e-06 .................................................
[CV] reg_alpha=3e-06 .................................................
[CV] reg_alpha=3e-06 .................................................
[CV] reg_alpha=3e-06 .................................................
[CV] reg_alpha=5e-06 .................................................
[CV] reg_alpha=5e-06 .................................................
[CV] reg_alpha=5e-06 .................................................
[CV] reg_alpha=8e-06 .................................................
[CV] reg_alpha=8e-06 .................................................
[CV] reg_alpha=8e-06 .................................................
[CV] reg_alpha=1e-05 .................................................
[CV] reg_alpha=1e

[Parallel(n_jobs=32)]: Done   8 out of  15 | elapsed:  8.6min remaining:  7.6min


[CV] ........................................ reg_alpha=5e-06 - 8.6min
[CV] ........................................ reg_alpha=8e-06 - 8.6min
[CV] ........................................ reg_alpha=5e-06 - 8.6min
[CV] ........................................ reg_alpha=3e-06 - 8.7min
[CV] ........................................ reg_alpha=8e-06 - 8.7min
[CV] ........................................ reg_alpha=3e-06 - 8.7min
[CV] ........................................ reg_alpha=8e-06 - 8.8min


[Parallel(n_jobs=32)]: Done  15 out of  15 | elapsed:  8.9min finished


{'reg_alpha': 8e-06}
0.6352723571594193
CPU times: user 8min 57s, sys: 1.36 s, total: 8min 58s
Wall time: 17min 43s


In [25]:
%%time
# Stage 6: search the learning rate with every previously tuned parameter applied
bestgama = gsearch2.best_params_['gamma']
subsampleratio = gsearch3.best_params_['subsample']
colsamplebytree = gsearch3.best_params_['colsample_bytree']
bestalpha = gsearch4.best_params_['reg_alpha']

model.set_params(
    subsample=subsampleratio,
    colsample_bytree=colsamplebytree,
    n_estimators=355,
    learning_rate=0.01,
    max_depth=7,
    min_child_weight=1,
    gamma=bestgama,
    reg_alpha=bestalpha,
)

# Candidates 0.010, 0.013, ..., 0.028
param_test6 = {
    'learning_rate': [thousandth / 1000 for thousandth in range(10, 30, 3)],
}
# NOTE(review): this search uses a literal cv=2 instead of cv_folds -- confirm that is intended
gsearch6 = GridSearchCV(
    estimator=model,
    param_grid=param_test6,
    scoring='roc_auc',
    n_jobs=32,
    iid=False,
    cv=2,
    verbose=200,
)
gsearch6.fit(X, y)
print(gsearch6.best_params_)
print(gsearch6.best_score_)

Fitting 2 folds for each of 7 candidates, totalling 14 fits
Pickling array (shape=(36,), dtype=object).
Memmaping (shape=(17, 595212), dtype=float64) to new file /dev/shm/joblib_memmaping_pool_8285_139713382738688/8285-139710009241784-c8320333ef89c199cb2d02fd012d0c1e.pkl
[CV] learning_rate=0.01 ..............................................
Memmaping (shape=(19, 595212), dtype=int64) to new file /dev/shm/joblib_memmaping_pool_8285_139713382738688/8285-139710009241784-03bdd15f4dd52eb08719e21b90244dbe.pkl
Pickling array (shape=(17,), dtype=object).
Pickling array (shape=(19,), dtype=object).
Pickling array (shape=(17,), dtype=int64).
Pickling array (shape=(19,), dtype=int64).
Memmaping (shape=(595212,), dtype=int64) to new file /dev/shm/joblib_memmaping_pool_8285_139713382738688/8285-139710009241784-398dd08d7946d7598ece4fcdc755b613.pkl
Memmaping (shape=(297606,), dtype=int64) to new file /dev/shm/joblib_memmaping_pool_8285_139713382738688/8285-139710009241784-f564ceb6e86b12862f2605b301ac

[CV] learning_rate=0.022 .............................................
Memmaping (shape=(19, 595212), dtype=int64) to old file /dev/shm/joblib_memmaping_pool_8285_139713382738688/8285-139710009241784-03bdd15f4dd52eb08719e21b90244dbe.pkl
Pickling array (shape=(17,), dtype=object).
Pickling array (shape=(19,), dtype=object).
Pickling array (shape=(17,), dtype=int64).
Pickling array (shape=(19,), dtype=int64).
Memmaping (shape=(595212,), dtype=int64) to old file /dev/shm/joblib_memmaping_pool_8285_139713382738688/8285-139710009241784-398dd08d7946d7598ece4fcdc755b613.pkl
Memmaping (shape=(297606,), dtype=int64) to old file /dev/shm/joblib_memmaping_pool_8285_139713382738688/8285-139710009241784-f564ceb6e86b12862f2605b301acae5d.pkl
Memmaping (shape=(297606,), dtype=int64) to old file /dev/shm/joblib_memmaping_pool_8285_139713382738688/8285-139710009241784-e3f6c41bad597244b36677ef9378d1e5.pkl
Pickling array (shape=(36,), dtype=object).
Memmaping (shape=(17, 595212), dtype=float64) to old fil

[CV] .................... learning_rate=0.016, score=0.637798 - 5.3min
[CV] .................... learning_rate=0.022, score=0.638142 - 5.3min
[Parallel(n_jobs=32)]: Done  14 out of  14 | elapsed:  5.4min remaining:    0.0s
[Parallel(n_jobs=32)]: Done  14 out of  14 | elapsed:  5.4min finished
{'learning_rate': 0.019}
0.6376907986679627
CPU times: user 9min 2s, sys: 1.25 s, total: 9min 3s
Wall time: 14min 19s


In [29]:
%%time
bestgama = gsearch2.best_params_['gamma']
subsampleratio = gsearch3.best_params_['subsample']
colsamplebytree = gsearch3.best_params_['colsample_bytree']
bestalpha = gsearch4.best_params_['reg_alpha']
learnrate = gsearch6.best_params_['learning_rate']

model.set_params(subsample = subsampleratio)
model.set_params(colsample_bytree = colsamplebytree)
model.set_params(n_estimators = 355)
model.set_params(max_depth = 7)
model.set_params(min_child_weight = 1)
model.set_params(gamma = bestgama)
model.set_params(reg_alpha = bestalpha)
model.set_params(learning_rate = learnrate)

param_test7 = {
'reg_lambda': [i/1000 for i in range(2,14,2)]
}
gsearch7 = GridSearchCV(estimator = model, param_grid = param_test7, scoring = 'roc_auc',n_jobs = 32, iid = False, cv = 2, verbose = 200)
gsearch7.fit(X,y)
print (gsearch7.best_params_)
print (gsearch7.best_score_)

Fitting 2 folds for each of 6 candidates, totalling 12 fits
Pickling array (shape=(36,), dtype=object).
Memmaping (shape=(17, 595212), dtype=float64) to new file /dev/shm/joblib_memmaping_pool_8285_139710009253960/8285-139710009295928-c8320333ef89c199cb2d02fd012d0c1e.pkl
[CV] reg_lambda=0.002 ................................................
Memmaping (shape=(19, 595212), dtype=int64) to new file /dev/shm/joblib_memmaping_pool_8285_139710009253960/8285-139710009295928-03bdd15f4dd52eb08719e21b90244dbe.pkl
Pickling array (shape=(17,), dtype=object).
Pickling array (shape=(19,), dtype=object).
Pickling array (shape=(17,), dtype=int64).
Pickling array (shape=(19,), dtype=int64).
Memmaping (shape=(595212,), dtype=int64) to new file /dev/shm/joblib_memmaping_pool_8285_139710009253960/8285-139710009295928-398dd08d7946d7598ece4fcdc755b613.pkl
Memmaping (shape=(297606,), dtype=int64) to new file /dev/shm/joblib_memmaping_pool_8285_139710009253960/8285-139710009295928-f564ceb6e86b12862f2605b301ac

[CV] reg_lambda=0.01 .................................................
Memmaping (shape=(19, 595212), dtype=int64) to old file /dev/shm/joblib_memmaping_pool_8285_139710009253960/8285-139710009295928-03bdd15f4dd52eb08719e21b90244dbe.pkl
Pickling array (shape=(17,), dtype=object).
Pickling array (shape=(19,), dtype=object).
Pickling array (shape=(17,), dtype=int64).
Pickling array (shape=(19,), dtype=int64).
Memmaping (shape=(595212,), dtype=int64) to old file /dev/shm/joblib_memmaping_pool_8285_139710009253960/8285-139710009295928-398dd08d7946d7598ece4fcdc755b613.pkl
Memmaping (shape=(297606,), dtype=int64) to old file /dev/shm/joblib_memmaping_pool_8285_139710009253960/8285-139710009295928-f564ceb6e86b12862f2605b301acae5d.pkl
Memmaping (shape=(297606,), dtype=int64) to old file /dev/shm/joblib_memmaping_pool_8285_139710009253960/8285-139710009295928-e3f6c41bad597244b36677ef9378d1e5.pkl
Pickling array (shape=(36,), dtype=object).
Memmaping (shape=(17, 595212), dtype=float64) to old fil

In [None]:
# Build the XGBoost classifier used by the cross-validation loop.
# The number of boosting rounds and the learning rate come from the
# notebook-level MAX_ROUNDS / LEARNING_RATE constants; everything else is
# fixed here.
model = XGBClassifier(**{
    'n_estimators': MAX_ROUNDS,
    'max_depth': 4,
    'objective': "binary:logistic",
    'learning_rate': LEARNING_RATE,
    'subsample': .8,
    'min_child_weight': 6,
    'colsample_bytree': .8,
    'scale_pos_weight': 1.6,
    'gamma': 10,
    'reg_alpha': 8,
    'reg_lambda': 1.3,
})

In [None]:
%%time
list1 = [17, 42, 159, 267, 379, 589, 1099, 2113, 3127, 4179, 5199, 6001, 7666, 8017, 9527]
for rseed in list1:
    kf = KFold(n_splits = 5, random_state = rseed, shuffle = True)
    for i, (train_index, test_index) in enumerate(kf.split(train_df)):

        # Create data for this fold
        y_train, y_valid = y.iloc[train_index].copy(), y.iloc[test_index]
        X_train, X_valid = X.iloc[train_index,:].copy(), X.iloc[test_index,:].copy()
        X_test = test_df.copy()
        print( "\nFold ", i)

        # Enocode data
        for f in f_cats:
            X_train[f + "_avg"], X_valid[f + "_avg"], X_test[f + "_avg"] = target_encode(
                                                            trn_series=X_train[f],
                                                            val_series=X_valid[f],
                                                            tst_series=X_test[f],
                                                            target=y_train,
                                                            min_samples_leaf=200,
                                                            smoothing=10,
                                                            noise_level=0
                                                            )
        # Run model for this fold
        if OPTIMIZE_ROUNDS:
            eval_set=[(X_valid,y_valid)]
            fit_model = model.fit( X_train, y_train, 
                                   eval_set=eval_set,
                                   eval_metric=gini_xgb,
                                   early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                                   verbose=False
                                 )
            print( "  Best N trees = ", model.best_ntree_limit )
            print( "  Best gini = ", model.best_score )
        else:
            fit_model = model.fit( X_train, y_train )

        # Generate validation predictions for this fold
        pred = fit_model.predict_proba(X_valid)[:,1]
        print( "  Gini = ", eval_gini(y_valid, pred) )
        y_valid_pred.iloc[test_index] = pred

        # Accumulate test set predictions
        y_test_pred += fit_model.predict_proba(X_test)[:,1]

        del X_test, X_train, X_valid, y_train

    y_test_pred /= K  # Average test set predictions

    print( "\nGini for full training set:", eval_gini(y, y_valid_pred))
    print('+'*50)
    
    # Save validation predictions for stacking/ensembling
    val = pd.DataFrame()
    val['id'] = id_train
    val['target'] = y_valid_pred.values
    val.to_csv('{}_xgbvalid.csv'.format(rseed), float_format='%.6f', index=False)
    
    # Create submission file
    sub = pd.DataFrame()
    sub['id'] = id_test
    sub['target'] = y_test_pred
    sub.to_csv('{}_xgbsub.csv'.format(rseed), float_format='%.6f', index=False)

In [None]:
# Shell escape: show the contents of the 15_fold_xgb directory.
!ls 15_fold_xgb

In [None]:
import os
# Capture the `ls` listing of 15_fold_xgb as a list of lines (each entry keeps
# its trailing newline). The `with` block closes the pipe once the lines have
# been read instead of leaving it open for the rest of the session.
with os.popen('ls 15_fold_xgb') as listing:
    csvnames = listing.readlines()

In [None]:
# Display the listing lines collected in `csvnames`.
csvnames

In [None]:
# Try the file-name clean-up on the first listing entry: split() drops all
# whitespace (including the trailing newline), join() glues the pieces back
# together, and the final slice removes the last four characters.
scv1 = csvnames[0].split()
str1 = ''.join(scv1)
str1[:-4]

In [None]:
# Collect the 'target' column of every CSV listed in `csvnames` into one
# frame, one column per file, named after the file without its extension.
total = pd.DataFrame()
for csvname in csvnames:
    # Remove only the surrounding whitespace (the listing lines end with a
    # newline) so that a file name containing inner spaces is left intact.
    csvname1 = csvname.strip()
    # Skip anything that is not a CSV file; blindly cutting four characters
    # off such an entry would produce a path that does not exist.
    if not csvname1.endswith('.csv'):
        continue
    str1 = csvname1[:-4]
    sub = pd.read_csv('15_fold_xgb/{}.csv'.format(str1))
    # NOTE(review): columns are aligned on the row index of the first file
    # read; this assumes every file has the same number of rows - confirm
    # that only the test-set submissions live in this directory.
    total[str1] = sub['target']

In [None]:
# Print a summary of the columns collected in `total`.
total.info()

In [None]:
# Row-wise summary over the individual prediction columns only.
# Selecting the columns explicitly keeps the freshly added 'mean' column out
# of the spread calculation (it would otherwise be counted as one more
# sample and shrink the result) and makes the cell safe to re-run, because
# the summary columns from a previous run are never fed back in.
pred_cols = [col for col in total.columns if col not in ('mean', 'variance')]
total['mean'] = total[pred_cols].mean(axis=1)
# The column keeps its historical name 'variance' because later cells read
# it, but the value stored is the standard deviation.
total['variance'] = total[pred_cols].std(axis=1)
total.head()

In [None]:
# Read just the 'id' column of the test set and attach the row-wise mean
# prediction as 'target' (values are matched on the row index).
test_sub2 = (
    pd.read_csv('input/test.csv', usecols=['id'])
    .assign(target=total['mean'])
)

In [None]:
# Write the id/target frame to disk; index=False keeps the row index out of the file.
test_sub2.to_csv('mean_15_fold_xgb.csv',index=False)

In [None]:
# Render matplotlib figures inside the notebook output.
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

# Plot the 'variance' column of `total` against its row index, one dot per row.
# Despite its name, this column is filled from total.std(axis=1).
total['variance'].plot(style='.')

In [None]:
# Plot the 'mean' column of `total` against its row index, one dot per row.
total['mean'].plot(style='.')