## Imports

In [1]:
import datetime
import os

In [2]:
import sklearn_evaluation as skeval
import xgboost as xgb

In [3]:
from scipy.stats import randint, uniform
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import *
from xgboost import XGBClassifier

## Config

In [4]:
RANDOM_SEED = 42

In [5]:
data_folder = os.path.abspath(os.path.join(os.curdir, os.pardir, 'data')) + os.path.sep
aux_data_folder = os.path.join(data_folder, 'aux') + os.path.sep
preproc_data_folder = os.path.join(data_folder, 'preproc') + os.path.sep
features_data_folder = os.path.join(data_folder, 'features') + os.path.sep
submissions_data_folder = os.path.join(data_folder, 'submissions') + os.path.sep

## Read Data

In [6]:
X = load(features_data_folder + 'X_train_summary_stats.pickle')

In [7]:
y = load(features_data_folder + 'y_train.pickle')

In [8]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.15,
    random_state=RANDOM_SEED,
    stratify=y
)

In [9]:
print('X train:', X_train.shape)
print('y train:', y_train.shape)
print('X val:  ', X_val.shape)
print('y val:  ', y_val.shape)

X train: (343646, 12)
y train: (343646,)
X val:   (60644, 12)
y val:   (60644,)


## Train Model

In [10]:
positive_imbalance_ratio = np.count_nonzero(y_train == 0) / np.count_nonzero(y_train == 1)

In [11]:
print('Positive imbalance ratio:', positive_imbalance_ratio)

Positive imbalance ratio: 1.7085825983463778


In [12]:
xgb_params = {
    'objective': 'binary:logistic',
    'n_estimators': 100,
    'seed': RANDOM_SEED,
    'learning_rate': 0.15,
    'max_depth': 6,
    'subsample': 0.75,
    'colsample_bytree': 0.4,
    'scale_pos_weight': positive_imbalance_ratio,
    'silent': 1,
}

### Search for the best parameters

In [13]:
random_search_parameters = {
    'n_estimators': randint(200, 1500),
    'learning_rate': uniform(0.01, 0.1 - 0.01),
    'max_depth': randint(5, 16),
    'subsample': uniform(0.2, 1.0 - 0.2),
    'colsample_bytree': uniform(0.2, 1.0 - 0.2),
}

In [14]:
model = XGBClassifier(**xgb_params)

In [15]:
def log_loss_scorer(estimator, X, y):
    return -log_loss(y, estimator.predict_proba(X)[:, -1])

In [16]:
randomized_search = RandomizedSearchCV(
    model,
    random_search_parameters,
    n_jobs=1,
    n_iter=250,
    cv=3,
    scoring=log_loss_scorer,
    refit=True,
    verbose=2,
    random_state=RANDOM_SEED,
)

In [None]:
%%time
model = randomized_search.fit(X_train, y_train)

Fitting 3 folds for each of 250 candidates, totalling 750 fits
[CV] colsample_bytree=0.7426732318, learning_rate=0.0330769850738, max_depth=9, n_estimators=778, subsample=0.290430557588 
[CV]  colsample_bytree=0.7426732318, learning_rate=0.0330769850738, max_depth=9, n_estimators=778, subsample=0.290430557588, total=  49.9s
[CV] colsample_bytree=0.7426732318, learning_rate=0.0330769850738, max_depth=9, n_estimators=778, subsample=0.290430557588 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   52.4s remaining:    0.0s


[CV]  colsample_bytree=0.7426732318, learning_rate=0.0330769850738, max_depth=9, n_estimators=778, subsample=0.290430557588, total=  49.0s
[CV] colsample_bytree=0.7426732318, learning_rate=0.0330769850738, max_depth=9, n_estimators=778, subsample=0.290430557588 
[CV]  colsample_bytree=0.7426732318, learning_rate=0.0330769850738, max_depth=9, n_estimators=778, subsample=0.290430557588, total=  48.5s
[CV] colsample_bytree=0.483088516261, learning_rate=0.0483813466299, max_depth=5, n_estimators=774, subsample=0.348129242398 
[CV]  colsample_bytree=0.483088516261, learning_rate=0.0483813466299, max_depth=5, n_estimators=774, subsample=0.348129242398, total=  27.3s
[CV] colsample_bytree=0.483088516261, learning_rate=0.0483813466299, max_depth=5, n_estimators=774, subsample=0.348129242398 
[CV]  colsample_bytree=0.483088516261, learning_rate=0.0483813466299, max_depth=5, n_estimators=774, subsample=0.348129242398, total=  26.5s
[CV] colsample_bytree=0.483088516261, learning_rate=0.0483813466

[CV]  colsample_bytree=0.823312697817, learning_rate=0.0792435557242, max_depth=14, n_estimators=440, subsample=0.306663570056, total=  49.6s
[CV] colsample_bytree=0.359774110236, learning_rate=0.065439990793, max_depth=11, n_estimators=1392, subsample=0.43874656664 
[CV]  colsample_bytree=0.359774110236, learning_rate=0.065439990793, max_depth=11, n_estimators=1392, subsample=0.43874656664, total= 1.8min
[CV] colsample_bytree=0.359774110236, learning_rate=0.065439990793, max_depth=11, n_estimators=1392, subsample=0.43874656664 
[CV]  colsample_bytree=0.359774110236, learning_rate=0.065439990793, max_depth=11, n_estimators=1392, subsample=0.43874656664, total= 1.7min
[CV] colsample_bytree=0.359774110236, learning_rate=0.065439990793, max_depth=11, n_estimators=1392, subsample=0.43874656664 
[CV]  colsample_bytree=0.359774110236, learning_rate=0.065439990793, max_depth=11, n_estimators=1392, subsample=0.43874656664, total= 1.7min
[CV] colsample_bytree=0.986150310568, learning_rate=0.081

[CV]  colsample_bytree=0.973316072118, learning_rate=0.0939355714511, max_depth=12, n_estimators=1403, subsample=0.855738826669, total= 3.6min
[CV] colsample_bytree=0.973316072118, learning_rate=0.0939355714511, max_depth=12, n_estimators=1403, subsample=0.855738826669 
[CV]  colsample_bytree=0.973316072118, learning_rate=0.0939355714511, max_depth=12, n_estimators=1403, subsample=0.855738826669, total= 3.6min
[CV] colsample_bytree=0.765327444916, learning_rate=0.0488237553815, max_depth=9, n_estimators=918, subsample=0.702899200562 
[CV]  colsample_bytree=0.765327444916, learning_rate=0.0488237553815, max_depth=9, n_estimators=918, subsample=0.702899200562, total= 1.5min
[CV] colsample_bytree=0.765327444916, learning_rate=0.0488237553815, max_depth=9, n_estimators=918, subsample=0.702899200562 
[CV]  colsample_bytree=0.765327444916, learning_rate=0.0488237553815, max_depth=9, n_estimators=918, subsample=0.702899200562, total= 1.5min
[CV] colsample_bytree=0.765327444916, learning_rate=

[CV]  colsample_bytree=0.833375591408, learning_rate=0.0362171205358, max_depth=6, n_estimators=516, subsample=0.349443263511, total=  28.3s
[CV] colsample_bytree=0.872920938772, learning_rate=0.0465768255771, max_depth=12, n_estimators=1035, subsample=0.508063724843 
[CV]  colsample_bytree=0.872920938772, learning_rate=0.0465768255771, max_depth=12, n_estimators=1035, subsample=0.508063724843, total= 2.2min
[CV] colsample_bytree=0.872920938772, learning_rate=0.0465768255771, max_depth=12, n_estimators=1035, subsample=0.508063724843 
[CV]  colsample_bytree=0.872920938772, learning_rate=0.0465768255771, max_depth=12, n_estimators=1035, subsample=0.508063724843, total= 2.2min
[CV] colsample_bytree=0.872920938772, learning_rate=0.0465768255771, max_depth=12, n_estimators=1035, subsample=0.508063724843 
[CV]  colsample_bytree=0.872920938772, learning_rate=0.0465768255771, max_depth=12, n_estimators=1035, subsample=0.508063724843, total= 2.2min
[CV] colsample_bytree=0.710533679926, learning

[CV]  colsample_bytree=0.82906766124, learning_rate=0.0921354754131, max_depth=6, n_estimators=1362, subsample=0.620101385766, total= 1.5min
[CV] colsample_bytree=0.82906766124, learning_rate=0.0921354754131, max_depth=6, n_estimators=1362, subsample=0.620101385766 
[CV]  colsample_bytree=0.82906766124, learning_rate=0.0921354754131, max_depth=6, n_estimators=1362, subsample=0.620101385766, total= 1.5min
[CV] colsample_bytree=0.82906766124, learning_rate=0.0921354754131, max_depth=6, n_estimators=1362, subsample=0.620101385766 
[CV]  colsample_bytree=0.82906766124, learning_rate=0.0921354754131, max_depth=6, n_estimators=1362, subsample=0.620101385766, total= 1.4min
[CV] colsample_bytree=0.78982315387, learning_rate=0.0753708641234, max_depth=5, n_estimators=755, subsample=0.531609656792 
[CV]  colsample_bytree=0.78982315387, learning_rate=0.0753708641234, max_depth=5, n_estimators=755, subsample=0.531609656792, total=  39.6s
[CV] colsample_bytree=0.78982315387, learning_rate=0.0753708

[CV]  colsample_bytree=0.454132825679, learning_rate=0.055854562911, max_depth=11, n_estimators=539, subsample=0.90496561648, total=  54.5s
[CV] colsample_bytree=0.454132825679, learning_rate=0.055854562911, max_depth=11, n_estimators=539, subsample=0.90496561648 
[CV]  colsample_bytree=0.454132825679, learning_rate=0.055854562911, max_depth=11, n_estimators=539, subsample=0.90496561648, total=  54.0s
[CV] colsample_bytree=0.250863194647, learning_rate=0.0444364914437, max_depth=7, n_estimators=570, subsample=0.706725088069 
[CV]  colsample_bytree=0.250863194647, learning_rate=0.0444364914437, max_depth=7, n_estimators=570, subsample=0.706725088069, total=  30.8s
[CV] colsample_bytree=0.250863194647, learning_rate=0.0444364914437, max_depth=7, n_estimators=570, subsample=0.706725088069 
[CV]  colsample_bytree=0.250863194647, learning_rate=0.0444364914437, max_depth=7, n_estimators=570, subsample=0.706725088069, total=  30.1s
[CV] colsample_bytree=0.250863194647, learning_rate=0.0444364

[CV]  colsample_bytree=0.424328909529, learning_rate=0.0753139282853, max_depth=14, n_estimators=1016, subsample=0.904661406497, total= 2.1min
[CV] colsample_bytree=0.720954053869, learning_rate=0.0343713255706, max_depth=6, n_estimators=700, subsample=0.26426686567 
[CV]  colsample_bytree=0.720954053869, learning_rate=0.0343713255706, max_depth=6, n_estimators=700, subsample=0.26426686567, total=  29.9s
[CV] colsample_bytree=0.720954053869, learning_rate=0.0343713255706, max_depth=6, n_estimators=700, subsample=0.26426686567 
[CV]  colsample_bytree=0.720954053869, learning_rate=0.0343713255706, max_depth=6, n_estimators=700, subsample=0.26426686567, total=  28.8s
[CV] colsample_bytree=0.720954053869, learning_rate=0.0343713255706, max_depth=6, n_estimators=700, subsample=0.26426686567 
[CV]  colsample_bytree=0.720954053869, learning_rate=0.0343713255706, max_depth=6, n_estimators=700, subsample=0.26426686567, total=  29.2s
[CV] colsample_bytree=0.67119383936, learning_rate=0.014982832

[CV]  colsample_bytree=0.261030053799, learning_rate=0.0349602198589, max_depth=8, n_estimators=1020, subsample=0.262899661864, total=  40.3s
[CV] colsample_bytree=0.261030053799, learning_rate=0.0349602198589, max_depth=8, n_estimators=1020, subsample=0.262899661864 
[CV]  colsample_bytree=0.261030053799, learning_rate=0.0349602198589, max_depth=8, n_estimators=1020, subsample=0.262899661864, total=  41.5s
[CV] colsample_bytree=0.261030053799, learning_rate=0.0349602198589, max_depth=8, n_estimators=1020, subsample=0.262899661864 
[CV]  colsample_bytree=0.261030053799, learning_rate=0.0349602198589, max_depth=8, n_estimators=1020, subsample=0.262899661864, total=  41.3s
[CV] colsample_bytree=0.621890003856, learning_rate=0.0483856138297, max_depth=6, n_estimators=1331, subsample=0.390345192713 
[CV]  colsample_bytree=0.621890003856, learning_rate=0.0483856138297, max_depth=6, n_estimators=1331, subsample=0.390345192713, total= 1.0min
[CV] colsample_bytree=0.621890003856, learning_rate

[CV]  colsample_bytree=0.607437800025, learning_rate=0.072487646921, max_depth=8, n_estimators=1461, subsample=0.737009630891, total= 1.9min
[CV] colsample_bytree=0.607437800025, learning_rate=0.072487646921, max_depth=8, n_estimators=1461, subsample=0.737009630891 
[CV]  colsample_bytree=0.607437800025, learning_rate=0.072487646921, max_depth=8, n_estimators=1461, subsample=0.737009630891, total= 1.9min
[CV] colsample_bytree=0.245121282028, learning_rate=0.0907120707718, max_depth=12, n_estimators=1474, subsample=0.537052586653 
[CV]  colsample_bytree=0.245121282028, learning_rate=0.0907120707718, max_depth=12, n_estimators=1474, subsample=0.537052586653, total= 1.7min
[CV] colsample_bytree=0.245121282028, learning_rate=0.0907120707718, max_depth=12, n_estimators=1474, subsample=0.537052586653 
[CV]  colsample_bytree=0.245121282028, learning_rate=0.0907120707718, max_depth=12, n_estimators=1474, subsample=0.537052586653, total= 1.8min
[CV] colsample_bytree=0.245121282028, learning_rat

[CV]  colsample_bytree=0.743960867674, learning_rate=0.0631699764914, max_depth=9, n_estimators=354, subsample=0.777421162285, total=  33.4s
[CV] colsample_bytree=0.690269369134, learning_rate=0.0561044504673, max_depth=9, n_estimators=566, subsample=0.522779309236 
[CV]  colsample_bytree=0.690269369134, learning_rate=0.0561044504673, max_depth=9, n_estimators=566, subsample=0.522779309236, total=  47.1s
[CV] colsample_bytree=0.690269369134, learning_rate=0.0561044504673, max_depth=9, n_estimators=566, subsample=0.522779309236 
[CV]  colsample_bytree=0.690269369134, learning_rate=0.0561044504673, max_depth=9, n_estimators=566, subsample=0.522779309236, total=  46.4s
[CV] colsample_bytree=0.690269369134, learning_rate=0.0561044504673, max_depth=9, n_estimators=566, subsample=0.522779309236 
[CV]  colsample_bytree=0.690269369134, learning_rate=0.0561044504673, max_depth=9, n_estimators=566, subsample=0.522779309236, total=  47.9s
[CV] colsample_bytree=0.287842570735, learning_rate=0.0636

[CV]  colsample_bytree=0.213479392584, learning_rate=0.0222026211483, max_depth=8, n_estimators=546, subsample=0.396016641255, total=  22.7s
[CV] colsample_bytree=0.213479392584, learning_rate=0.0222026211483, max_depth=8, n_estimators=546, subsample=0.396016641255 
[CV]  colsample_bytree=0.213479392584, learning_rate=0.0222026211483, max_depth=8, n_estimators=546, subsample=0.396016641255, total=  23.5s
[CV] colsample_bytree=0.901456969416, learning_rate=0.0367452317147, max_depth=15, n_estimators=1287, subsample=0.767119349425 
[CV]  colsample_bytree=0.901456969416, learning_rate=0.0367452317147, max_depth=15, n_estimators=1287, subsample=0.767119349425, total= 3.8min
[CV] colsample_bytree=0.901456969416, learning_rate=0.0367452317147, max_depth=15, n_estimators=1287, subsample=0.767119349425 
[CV]  colsample_bytree=0.901456969416, learning_rate=0.0367452317147, max_depth=15, n_estimators=1287, subsample=0.767119349425, total= 3.9min
[CV] colsample_bytree=0.901456969416, learning_rat

[CV]  colsample_bytree=0.954935784868, learning_rate=0.0581214346879, max_depth=8, n_estimators=470, subsample=0.625238497971, total=  44.4s
[CV] colsample_bytree=0.507318384433, learning_rate=0.0595722344915, max_depth=7, n_estimators=668, subsample=0.346551942917 
[CV]  colsample_bytree=0.507318384433, learning_rate=0.0595722344915, max_depth=7, n_estimators=668, subsample=0.346551942917, total=  33.4s
[CV] colsample_bytree=0.507318384433, learning_rate=0.0595722344915, max_depth=7, n_estimators=668, subsample=0.346551942917 
[CV]  colsample_bytree=0.507318384433, learning_rate=0.0595722344915, max_depth=7, n_estimators=668, subsample=0.346551942917, total=  32.5s
[CV] colsample_bytree=0.507318384433, learning_rate=0.0595722344915, max_depth=7, n_estimators=668, subsample=0.346551942917 
[CV]  colsample_bytree=0.507318384433, learning_rate=0.0595722344915, max_depth=7, n_estimators=668, subsample=0.346551942917, total=  33.2s
[CV] colsample_bytree=0.212405430923, learning_rate=0.0491

[CV]  colsample_bytree=0.223712369088, learning_rate=0.0751628470179, max_depth=11, n_estimators=1107, subsample=0.436656651761, total= 1.1min
[CV] colsample_bytree=0.223712369088, learning_rate=0.0751628470179, max_depth=11, n_estimators=1107, subsample=0.436656651761 
[CV]  colsample_bytree=0.223712369088, learning_rate=0.0751628470179, max_depth=11, n_estimators=1107, subsample=0.436656651761, total= 1.1min
[CV] colsample_bytree=0.223712369088, learning_rate=0.0751628470179, max_depth=11, n_estimators=1107, subsample=0.436656651761 
[CV]  colsample_bytree=0.223712369088, learning_rate=0.0751628470179, max_depth=11, n_estimators=1107, subsample=0.436656651761, total= 1.1min
[CV] colsample_bytree=0.690451056442, learning_rate=0.0673847018669, max_depth=7, n_estimators=1434, subsample=0.853700262863 
[CV]  colsample_bytree=0.690451056442, learning_rate=0.0673847018669, max_depth=7, n_estimators=1434, subsample=0.853700262863, total= 1.7min
[CV] colsample_bytree=0.690451056442, learning

[CV]  colsample_bytree=0.328715428044, learning_rate=0.0380657567921, max_depth=15, n_estimators=640, subsample=0.321872164755, total=  52.0s
[CV] colsample_bytree=0.328715428044, learning_rate=0.0380657567921, max_depth=15, n_estimators=640, subsample=0.321872164755 
[CV]  colsample_bytree=0.328715428044, learning_rate=0.0380657567921, max_depth=15, n_estimators=640, subsample=0.321872164755, total=  52.4s
[CV] colsample_bytree=0.594402688085, learning_rate=0.0638673853224, max_depth=9, n_estimators=1176, subsample=0.262436028417 
[CV]  colsample_bytree=0.594402688085, learning_rate=0.0638673853224, max_depth=9, n_estimators=1176, subsample=0.262436028417, total= 1.1min
[CV] colsample_bytree=0.594402688085, learning_rate=0.0638673853224, max_depth=9, n_estimators=1176, subsample=0.262436028417 
[CV]  colsample_bytree=0.594402688085, learning_rate=0.0638673853224, max_depth=9, n_estimators=1176, subsample=0.262436028417, total= 1.1min
[CV] colsample_bytree=0.594402688085, learning_rate

[CV]  colsample_bytree=0.680239571644, learning_rate=0.0232917252299, max_depth=11, n_estimators=1457, subsample=0.832011112383, total= 2.8min
[CV] colsample_bytree=0.363805306486, learning_rate=0.0501576343033, max_depth=6, n_estimators=1294, subsample=0.657940437045 
[CV]  colsample_bytree=0.363805306486, learning_rate=0.0501576343033, max_depth=6, n_estimators=1294, subsample=0.657940437045, total= 1.0min
[CV] colsample_bytree=0.363805306486, learning_rate=0.0501576343033, max_depth=6, n_estimators=1294, subsample=0.657940437045 
[CV]  colsample_bytree=0.363805306486, learning_rate=0.0501576343033, max_depth=6, n_estimators=1294, subsample=0.657940437045, total= 1.0min
[CV] colsample_bytree=0.363805306486, learning_rate=0.0501576343033, max_depth=6, n_estimators=1294, subsample=0.657940437045 
[CV]  colsample_bytree=0.363805306486, learning_rate=0.0501576343033, max_depth=6, n_estimators=1294, subsample=0.657940437045, total= 1.0min
[CV] colsample_bytree=0.474571348421, learning_rat

[CV]  colsample_bytree=0.672149416146, learning_rate=0.0939471273687, max_depth=7, n_estimators=568, subsample=0.871069337669, total=  41.8s
[CV] colsample_bytree=0.672149416146, learning_rate=0.0939471273687, max_depth=7, n_estimators=568, subsample=0.871069337669 
[CV]  colsample_bytree=0.672149416146, learning_rate=0.0939471273687, max_depth=7, n_estimators=568, subsample=0.871069337669, total=  41.1s
[CV] colsample_bytree=0.672149416146, learning_rate=0.0939471273687, max_depth=7, n_estimators=568, subsample=0.871069337669 
[CV]  colsample_bytree=0.672149416146, learning_rate=0.0939471273687, max_depth=7, n_estimators=568, subsample=0.871069337669, total=  42.0s
[CV] colsample_bytree=0.425991863122, learning_rate=0.042257871996, max_depth=5, n_estimators=229, subsample=0.322442816397 
[CV]  colsample_bytree=0.425991863122, learning_rate=0.042257871996, max_depth=5, n_estimators=229, subsample=0.322442816397, total=   8.0s
[CV] colsample_bytree=0.425991863122, learning_rate=0.042257

[CV]  colsample_bytree=0.232528419937, learning_rate=0.0892021321109, max_depth=6, n_estimators=585, subsample=0.631248891064, total=  24.2s
[CV] colsample_bytree=0.232528419937, learning_rate=0.0892021321109, max_depth=6, n_estimators=585, subsample=0.631248891064 
[CV]  colsample_bytree=0.232528419937, learning_rate=0.0892021321109, max_depth=6, n_estimators=585, subsample=0.631248891064, total=  23.8s
[CV] colsample_bytree=0.403013456012, learning_rate=0.0142991696674, max_depth=14, n_estimators=1391, subsample=0.220917964779 
[CV]  colsample_bytree=0.403013456012, learning_rate=0.0142991696674, max_depth=14, n_estimators=1391, subsample=0.220917964779, total= 1.7min
[CV] colsample_bytree=0.403013456012, learning_rate=0.0142991696674, max_depth=14, n_estimators=1391, subsample=0.220917964779 
[CV]  colsample_bytree=0.403013456012, learning_rate=0.0142991696674, max_depth=14, n_estimators=1391, subsample=0.220917964779, total= 1.7min
[CV] colsample_bytree=0.403013456012, learning_rat

[CV]  colsample_bytree=0.734281791524, learning_rate=0.0167149150546, max_depth=15, n_estimators=1016, subsample=0.304551753953, total= 1.9min
[CV] colsample_bytree=0.678732826005, learning_rate=0.0648992342528, max_depth=14, n_estimators=1391, subsample=0.867860416444 
[CV]  colsample_bytree=0.678732826005, learning_rate=0.0648992342528, max_depth=14, n_estimators=1391, subsample=0.867860416444, total= 3.4min
[CV] colsample_bytree=0.678732826005, learning_rate=0.0648992342528, max_depth=14, n_estimators=1391, subsample=0.867860416444 
[CV]  colsample_bytree=0.678732826005, learning_rate=0.0648992342528, max_depth=14, n_estimators=1391, subsample=0.867860416444, total= 3.4min
[CV] colsample_bytree=0.678732826005, learning_rate=0.0648992342528, max_depth=14, n_estimators=1391, subsample=0.867860416444 
[CV]  colsample_bytree=0.678732826005, learning_rate=0.0648992342528, max_depth=14, n_estimators=1391, subsample=0.867860416444, total= 3.4min
[CV] colsample_bytree=0.959034946255, learni

[CV]  colsample_bytree=0.499712294604, learning_rate=0.0722763175434, max_depth=9, n_estimators=652, subsample=0.742656533113, total=  50.3s
[CV] colsample_bytree=0.499712294604, learning_rate=0.0722763175434, max_depth=9, n_estimators=652, subsample=0.742656533113 
[CV]  colsample_bytree=0.499712294604, learning_rate=0.0722763175434, max_depth=9, n_estimators=652, subsample=0.742656533113, total=  51.0s
[CV] colsample_bytree=0.499712294604, learning_rate=0.0722763175434, max_depth=9, n_estimators=652, subsample=0.742656533113 
[CV]  colsample_bytree=0.499712294604, learning_rate=0.0722763175434, max_depth=9, n_estimators=652, subsample=0.742656533113, total=  51.1s
[CV] colsample_bytree=0.617381410851, learning_rate=0.0255536625222, max_depth=7, n_estimators=630, subsample=0.675734179255 
[CV]  colsample_bytree=0.617381410851, learning_rate=0.0255536625222, max_depth=7, n_estimators=630, subsample=0.675734179255, total=  41.7s
[CV] colsample_bytree=0.617381410851, learning_rate=0.0255

[CV]  colsample_bytree=0.389543575717, learning_rate=0.0319707016746, max_depth=12, n_estimators=1146, subsample=0.442335724613, total= 1.6min
[CV] colsample_bytree=0.389543575717, learning_rate=0.0319707016746, max_depth=12, n_estimators=1146, subsample=0.442335724613 
[CV]  colsample_bytree=0.389543575717, learning_rate=0.0319707016746, max_depth=12, n_estimators=1146, subsample=0.442335724613, total= 1.6min
[CV] colsample_bytree=0.622689411135, learning_rate=0.0516392417043, max_depth=12, n_estimators=969, subsample=0.931189213634 
[CV]  colsample_bytree=0.622689411135, learning_rate=0.0516392417043, max_depth=12, n_estimators=969, subsample=0.931189213634, total= 2.0min
[CV] colsample_bytree=0.622689411135, learning_rate=0.0516392417043, max_depth=12, n_estimators=969, subsample=0.931189213634 
[CV]  colsample_bytree=0.622689411135, learning_rate=0.0516392417043, max_depth=12, n_estimators=969, subsample=0.931189213634, total= 1.9min
[CV] colsample_bytree=0.622689411135, learning_r

[CV]  colsample_bytree=0.754316984371, learning_rate=0.0190557353305, max_depth=8, n_estimators=1089, subsample=0.859716447761, total= 1.7min
[CV] colsample_bytree=0.385656622367, learning_rate=0.0555418870383, max_depth=9, n_estimators=574, subsample=0.411909269565 
[CV]  colsample_bytree=0.385656622367, learning_rate=0.0555418870383, max_depth=9, n_estimators=574, subsample=0.411909269565, total=  34.6s
[CV] colsample_bytree=0.385656622367, learning_rate=0.0555418870383, max_depth=9, n_estimators=574, subsample=0.411909269565 
[CV]  colsample_bytree=0.385656622367, learning_rate=0.0555418870383, max_depth=9, n_estimators=574, subsample=0.411909269565, total=  35.1s
[CV] colsample_bytree=0.385656622367, learning_rate=0.0555418870383, max_depth=9, n_estimators=574, subsample=0.411909269565 
[CV]  colsample_bytree=0.385656622367, learning_rate=0.0555418870383, max_depth=9, n_estimators=574, subsample=0.411909269565, total=  34.6s
[CV] colsample_bytree=0.350129696773, learning_rate=0.074

[CV]  colsample_bytree=0.421855979629, learning_rate=0.0821641454036, max_depth=12, n_estimators=1214, subsample=0.364735670202, total= 1.7min
[CV] colsample_bytree=0.421855979629, learning_rate=0.0821641454036, max_depth=12, n_estimators=1214, subsample=0.364735670202 
[CV]  colsample_bytree=0.421855979629, learning_rate=0.0821641454036, max_depth=12, n_estimators=1214, subsample=0.364735670202, total= 1.7min
[CV] colsample_bytree=0.421855979629, learning_rate=0.0821641454036, max_depth=12, n_estimators=1214, subsample=0.364735670202 
[CV]  colsample_bytree=0.421855979629, learning_rate=0.0821641454036, max_depth=12, n_estimators=1214, subsample=0.364735670202, total= 1.7min
[CV] colsample_bytree=0.561536857289, learning_rate=0.090411024451, max_depth=6, n_estimators=1008, subsample=0.496162120285 
[CV]  colsample_bytree=0.561536857289, learning_rate=0.090411024451, max_depth=6, n_estimators=1008, subsample=0.496162120285, total=  51.9s
[CV] colsample_bytree=0.561536857289, learning_r

[CV]  colsample_bytree=0.883208373731, learning_rate=0.0801382547217, max_depth=10, n_estimators=1272, subsample=0.774922559607, total= 2.5min
[CV] colsample_bytree=0.883208373731, learning_rate=0.0801382547217, max_depth=10, n_estimators=1272, subsample=0.774922559607 
[CV]  colsample_bytree=0.883208373731, learning_rate=0.0801382547217, max_depth=10, n_estimators=1272, subsample=0.774922559607, total= 2.5min
[CV] colsample_bytree=0.763391059783, learning_rate=0.0917077332588, max_depth=8, n_estimators=1217, subsample=0.250856529716 
[CV]  colsample_bytree=0.763391059783, learning_rate=0.0917077332588, max_depth=8, n_estimators=1217, subsample=0.250856529716, total= 1.3min
[CV] colsample_bytree=0.763391059783, learning_rate=0.0917077332588, max_depth=8, n_estimators=1217, subsample=0.250856529716 
[CV]  colsample_bytree=0.763391059783, learning_rate=0.0917077332588, max_depth=8, n_estimators=1217, subsample=0.250856529716, total= 1.4min
[CV] colsample_bytree=0.763391059783, learning_r

[CV]  colsample_bytree=0.940090684067, learning_rate=0.0345623240299, max_depth=5, n_estimators=1183, subsample=0.406017239136, total= 1.1min
[CV] colsample_bytree=0.631187594724, learning_rate=0.0170475045762, max_depth=5, n_estimators=634, subsample=0.670643668177 
[CV]  colsample_bytree=0.631187594724, learning_rate=0.0170475045762, max_depth=5, n_estimators=634, subsample=0.670643668177, total=  38.8s
[CV] colsample_bytree=0.631187594724, learning_rate=0.0170475045762, max_depth=5, n_estimators=634, subsample=0.670643668177 
[CV]  colsample_bytree=0.631187594724, learning_rate=0.0170475045762, max_depth=5, n_estimators=634, subsample=0.670643668177, total=  31.5s
[CV] colsample_bytree=0.631187594724, learning_rate=0.0170475045762, max_depth=5, n_estimators=634, subsample=0.670643668177 
[CV]  colsample_bytree=0.631187594724, learning_rate=0.0170475045762, max_depth=5, n_estimators=634, subsample=0.670643668177, total=  32.2s
[CV] colsample_bytree=0.679791311028, learning_rate=0.064

[CV]  colsample_bytree=0.585818230682, learning_rate=0.0936435564334, max_depth=5, n_estimators=800, subsample=0.937721045305, total=  45.9s
[CV] colsample_bytree=0.585818230682, learning_rate=0.0936435564334, max_depth=5, n_estimators=800, subsample=0.937721045305 
[CV]  colsample_bytree=0.585818230682, learning_rate=0.0936435564334, max_depth=5, n_estimators=800, subsample=0.937721045305, total=  42.7s
[CV] colsample_bytree=0.585818230682, learning_rate=0.0936435564334, max_depth=5, n_estimators=800, subsample=0.937721045305 
[CV]  colsample_bytree=0.585818230682, learning_rate=0.0936435564334, max_depth=5, n_estimators=800, subsample=0.937721045305, total=  42.3s


[Parallel(n_jobs=1)]: Done 750 out of 750 | elapsed: 938.4min finished


CPU times: user 5d 3h 30min 8s, sys: 21min 45s, total: 5d 3h 51min 54s
Wall time: 15h 43min 14s


In [33]:
randomized_search.grid_scores_



[mean: -0.47500, std: 0.00043, params: {'colsample_bytree': 0.74267323180031286, 'learning_rate': 0.033076985073781255, 'max_depth': 9, 'n_estimators': 778, 'subsample': 0.29043055758760983},
 mean: -0.48568, std: 0.00053, params: {'colsample_bytree': 0.48308851626110311, 'learning_rate': 0.048381346629868781, 'max_depth': 5, 'n_estimators': 774, 'subsample': 0.34812924239791454},
 mean: -0.47722, std: 0.00034, params: {'colsample_bytree': 0.77879984040248496, 'learning_rate': 0.02613874089232876, 'max_depth': 8, 'n_estimators': 653, 'subsample': 0.66637526827723126},
 mean: -0.46532, std: 0.00059, params: {'colsample_bytree': 0.97239137419137323, 'learning_rate': 0.030531319747579694, 'max_depth': 11, 'n_estimators': 1034, 'subsample': 0.74313271520891644},
 mean: -0.48386, std: 0.00077, params: {'colsample_bytree': 0.24265369029979145, 'learning_rate': 0.031516681584218519, 'max_depth': 11, 'n_estimators': 828, 'subsample': 0.72564506619397329},
 mean: -0.48464, std: 0.00031, params:

In [22]:
randomized_search.best_params_

{'colsample_bytree': 0.47800317678119192,
 'learning_rate': 0.023478574372627188,
 'max_depth': 15,
 'n_estimators': 1399,
 'subsample': 0.90242029937509605}

In [None]:
# skeval.plot.grid_search(
#     randomized_search.grid_scores_,
#     change='n_estimators',
#     kind='bar'
# )

In [23]:
# skeval.plot.grid_search(
#     search_result,
#     change=['colsample_bytree', 'subsample'],
#     subset={
#         'num_boost_round': 500,
#         'learning_rate': 0.05,
#         'max_depth': 10,
#     }
# )

## Evaluate Model

In [24]:
y_pred_train = model.predict(X_train)
y_pred_proba_train = model.predict_proba(X_train)[:, -1]

In [25]:
y_pred_val = model.predict(X_val)
y_pred_proba_val = model.predict_proba(X_val)[:, -1]

In [26]:
continuous_metrics = [log_loss, roc_auc_score]

In [27]:
binary_metrics = [accuracy_score, precision_score, recall_score]

### Train

In [28]:
for metric in continuous_metrics:
    print('{:20s}: {:10.5f}'.format(metric.__name__, metric(y_train, y_pred_proba_train)))
for metric in binary_metrics:
    print('{:20s}: {:10.5f}'.format(metric.__name__, metric(y_train, y_pred_train)))

log_loss            :    0.25620
roc_auc_score       :    0.99130
accuracy_score      :    0.91013
precision_score     :    0.80595
recall_score        :    0.99652


### Validation

In [29]:
for metric in continuous_metrics:
    print('{:20s}: {:10.5f}'.format(metric.__name__, metric(y_val, y_pred_proba_val)))
for metric in binary_metrics:
    print('{:20s}: {:10.5f}'.format(metric.__name__, metric(y_val, y_pred_val)))

log_loss            :    0.45364
roc_auc_score       :    0.85295
accuracy_score      :    0.75820
precision_score     :    0.63311
recall_score        :    0.82063


In [None]:
raise ValueError('Stopping before the test set')

### Test

In [34]:
X_test = load(features_data_folder + 'X_test_summary_stats.pickle')

In [35]:
y_test = model.predict_proba(X_test)[:, -1]

In [36]:
submission_id = datetime.datetime.now().strftime('%Y-%m-%d-%H%M')

In [37]:
df_submission = pd.DataFrame({
    'test_id': range(len(y_test)),
    'is_duplicate': y_test
})

In [38]:
df_submission = df_submission[['test_id', 'is_duplicate']]

In [39]:
df_submission.head(10)

Unnamed: 0,test_id,is_duplicate
0,0,0.212164
1,1,0.63657
2,2,0.775008
3,3,0.000105
4,4,0.186864
5,5,0.005855
6,6,0.662158
7,7,0.601435
8,8,0.564478
9,9,0.25498


In [40]:
df_submission.to_csv(submissions_data_folder + submission_id + '-submission-draft.csv', header=True, index=None)