In [None]:
# Build the train/validation DMatrix buffers that the training cells reload.
#
# The imports live inside this cell because it sits above the notebook's
# shared import cell: on a fresh kernel (Restart -> Run All) `pd` and `xgb`
# would otherwise be undefined when this cell runs.
import gc

import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split

# Two feature tables are joined column-wise into one training matrix.
# NOTE(review): this assumes both tables share the same row index - confirm,
# since a mismatch would make concat(axis=1) introduce NaN-padded rows.
df1 = pd.read_hdf('feats_selected/train_engineered_933cols.hdf5', 'table')
df2 = pd.read_hdf('feats_selected/train_init_761.hdf5', 'table')
df_train = pd.concat([df1, df2], axis=1)

# Only the target column is read from the original numeric training file.
y = pd.read_csv('../../input_orig/train_numeric.csv', usecols=['Response'])

# 80/20 split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df_train, y, test_size=0.2, random_state=42, stratify=y.values)

# Each large intermediate is dropped as soon as it has been consumed so the
# kernel does not hold several copies of the feature matrix at once.
del df_train
gc.collect()

dtrain = xgb.DMatrix(X_train, label=y_train)

del X_train
del y_train
gc.collect()

dval = xgb.DMatrix(X_test, label=y_test)

del X_test
del y_test
gc.collect()

# Persist both matrices in xgboost's binary format for fast reloading.
dtrain.save_binary("feats_selected/train_with_engg_1694.buffer")
dval.save_binary("feats_selected/val_with_engg_1694.buffer")

In [8]:
import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn.metrics import roc_auc_score

%matplotlib inline

from sklearn.metrics import matthews_corrcoef
import matplotlib.pyplot as plt
from numba import jit

#@jit
def mcc(tp, tn, fp, fn):
    """Matthews correlation coefficient from confusion-matrix counts.

    Parameters
    ----------
    tp, tn, fp, fn : int or float
        Counts of true positives, true negatives, false positives and
        false negatives.

    Returns
    -------
    number
        ``(tp*tn - fp*fn) / sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))``, or ``0``
        when the denominator is zero.
    """
    # Promote to float before multiplying. The denominator is a product of
    # four sums and easily exceeds 2**64 for realistic dataset sizes: Python
    # ints of that size make np.sqrt raise TypeError, and NumPy integer
    # scalars overflow. Float inputs are unaffected by this conversion.
    tp, tn, fp, fn = float(tp), float(tn), float(fp), float(fn)

    numerator = tp * tn - fp * fn
    denominator = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
    if denominator == 0:
        return 0
    return numerator / np.sqrt(denominator)

#@jit
def eval_mcc(y_true, y_prob, show=False):
    """Find the best Matthews correlation coefficient over all thresholds.

    Scores are sorted ascending and every distinct score is tried as a
    threshold (items scoring ``>=`` the threshold are predicted positive).
    The confusion-matrix counts are updated incrementally while sweeping.

    Parameters
    ----------
    y_true : array-like
        Binary labels; entries equal to 1 are treated as positive.
    y_prob : array-like
        Predicted scores, same length as ``y_true``.
    show : bool, default False
        When true, also print the sklearn MCC at the chosen threshold next to
        the swept value, plot the MCC curve, and return extra details.

    Returns
    -------
    float
        The best MCC found, when ``show`` is false.
    tuple
        ``(best_proba, best_mcc, y_pred)`` when ``show`` is true, where
        ``y_pred`` is the 0/1 prediction array at threshold ``best_proba``.
    """
    # Work on plain ndarrays so indexing is always positional: a pandas
    # Series would index by label and could pick the wrong rows.
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)

    order = np.argsort(y_prob)
    y_true_sort = y_true[order]
    y_prob_sort = y_prob[order]

    n = y_true.shape[0]
    nump = 1.0 * np.sum(y_true)  # number of positive
    numn = n - nump  # number of negative

    # Start with the lowest threshold: everything is predicted positive.
    tp = nump
    tn = 0.0
    fp = numn
    fn = 0.0

    best_mcc = 0.0
    best_proba = -1
    prev_proba = -1
    mccs = np.zeros(n)
    for i in range(n):
        # all items with idx < i are predicted negative while others are predicted positive
        # only evaluate mcc when probability changes
        proba = y_prob_sort[i]
        # The first item is always evaluated. Relying only on the -1 sentinel
        # would skip it when the smallest score is exactly -1 and leave
        # new_mcc unbound.
        if i == 0 or proba != prev_proba:
            prev_proba = proba
            new_mcc = mcc(tp, tn, fp, fn)
            if new_mcc >= best_mcc:
                best_mcc = new_mcc
                best_proba = proba
        mccs[i] = new_mcc
        # Item i falls below the next threshold, so it becomes a negative
        # prediction.
        if y_true_sort[i] == 1:
            tp -= 1.0
            fn += 1.0
        else:
            fp -= 1.0
            tn += 1.0

    if show:
        y_pred = (y_prob >= best_proba).astype(int)
        score = matthews_corrcoef(y_true, y_pred)
        print(score, best_mcc)
        plt.plot(mccs)
        return best_proba, best_mcc, y_pred
    else:
        return best_mcc

def mcc_eval(y_prob, dtrain):
    """Score predictions against a DMatrix's labels with the best-threshold MCC.

    Parameters
    ----------
    y_prob : array-like
        Predicted scores for the rows of ``dtrain``.
    dtrain : object exposing ``get_label()``
        Source of the true labels (an ``xgb.DMatrix`` in this notebook).

    Returns
    -------
    tuple
        ``('MCC', best_mcc)`` where ``best_mcc`` comes from ``eval_mcc``.
    """
    labels = dtrain.get_label()
    return 'MCC', eval_mcc(labels, y_prob)

In [2]:
# Reload the cached train/validation matrices from their binary buffers, plus
# the label column of the original training file (used for the class prior).
TRAIN_BUFFER_PATH = 'feats_selected/train_with_engg_1694.buffer'
VAL_BUFFER_PATH = 'feats_selected/val_with_engg_1694.buffer'
LABELS_CSV_PATH = '../../input_orig/train_numeric.csv'

dtrain = xgb.DMatrix(TRAIN_BUFFER_PATH)
dval = xgb.DMatrix(VAL_BUFFER_PATH)
y = pd.read_csv(LABELS_CSV_PATH, usecols=['Response'])

In [10]:
# Experiment 1: gbtree, depth 18, each tree sees 20% of the columns.

# Fraction of positive labels in the full label file; used as the initial
# prediction (base_score).
prior = (np.sum(y) / float(len(y))).values[0]

xgb_params = dict(
    colsample_bytree=0.2,
    colsample_bylevel=1,
    subsample=1,
    learning_rate=0.1,
    max_depth=18,
    min_child_weight=1,
    gamma=10,
    base_score=prior,
    # tree_method='exact',
    objective='binary:logistic',
    eval_metric='auc',
    silent=0,
    seed=12,
    nthread=20,
)

# The validation set is listed last; per the training log, 'eval-auc' is the
# metric that drives early stopping.
evals = [(dtrain, 'train'), (dval, 'eval')]
bst = xgb.train(
    xgb_params,
    dtrain,
    num_boost_round=50000,
    early_stopping_rounds=20,
    evals=evals,
    verbose_eval=1,
)

# Score the validation set using only the trees up to the best iteration.
val_pred = bst.predict(dval, ntree_limit=bst.best_ntree_limit)
print(roc_auc_score(dval.get_label(), val_pred))
print(mcc_eval(val_pred, dval))
# Number of features that received a gain-based importance score.
print(len(bst.get_score(fmap='', importance_type='gain')))

[0]	train-auc:0.784501	eval-auc:0.703434
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 20 rounds.
[1]	train-auc:0.819222	eval-auc:0.71451
[2]	train-auc:0.843085	eval-auc:0.718414
[3]	train-auc:0.856376	eval-auc:0.720891
[4]	train-auc:0.866151	eval-auc:0.725695
[5]	train-auc:0.939423	eval-auc:0.878264
[6]	train-auc:0.941586	eval-auc:0.875633
[7]	train-auc:0.944149	eval-auc:0.874603
[8]	train-auc:0.946216	eval-auc:0.872077
[9]	train-auc:0.947721	eval-auc:0.869796
[10]	train-auc:0.949678	eval-auc:0.867558
[11]	train-auc:0.950314	eval-auc:0.865927
[12]	train-auc:0.951924	eval-auc:0.863984
[13]	train-auc:0.953692	eval-auc:0.862947
[14]	train-auc:0.955025	eval-auc:0.861777
[15]	train-auc:0.957422	eval-auc:0.860727
[16]	train-auc:0.958061	eval-auc:0.859314
[17]	train-auc:0.959047	eval-auc:0.858546
[18]	train-auc:0.962876	eval-auc:0.882074
[19]	train-auc:0.964296	eval-auc:0.880821
[20]	train-auc:0.964976	eval-a

In [12]:
# Experiment 2: gbtree, depth 36, every tree sees all columns; early-stopping
# patience raised to 30 rounds.

# Fraction of positive labels in the full label file; used as the initial
# prediction (base_score).
prior = (np.sum(y) / float(len(y))).values[0]

xgb_params = dict(
    colsample_bytree=1,
    colsample_bylevel=1,
    subsample=1,
    learning_rate=0.1,
    max_depth=36,
    min_child_weight=1,
    gamma=10,
    base_score=prior,
    # tree_method='exact',
    objective='binary:logistic',
    eval_metric='auc',
    silent=0,
    seed=12,
    nthread=20,
)

# The validation set is listed last; per the training log, 'eval-auc' is the
# metric that drives early stopping.
evals = [(dtrain, 'train'), (dval, 'eval')]
bst1 = xgb.train(
    xgb_params,
    dtrain,
    num_boost_round=50000,
    early_stopping_rounds=30,
    evals=evals,
    verbose_eval=1,
)

# Score the validation set using only the trees up to the best iteration.
val_pred = bst1.predict(dval, ntree_limit=bst1.best_ntree_limit)
print(roc_auc_score(dval.get_label(), val_pred))
print(mcc_eval(val_pred, dval))
# Number of features that received a gain-based importance score.
print(len(bst1.get_score(fmap='', importance_type='gain')))

[0]	train-auc:0.919535	eval-auc:0.889102
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 30 rounds.
[1]	train-auc:0.925261	eval-auc:0.890808
[2]	train-auc:0.930378	eval-auc:0.891239
[3]	train-auc:0.935743	eval-auc:0.891633
[4]	train-auc:0.938973	eval-auc:0.894532
[5]	train-auc:0.942111	eval-auc:0.895575
[6]	train-auc:0.943281	eval-auc:0.897164
[7]	train-auc:0.944884	eval-auc:0.898274
[8]	train-auc:0.951088	eval-auc:0.898273
[9]	train-auc:0.957818	eval-auc:0.899334
[10]	train-auc:0.960299	eval-auc:0.899173
[11]	train-auc:0.962072	eval-auc:0.899687
[12]	train-auc:0.964235	eval-auc:0.899485
[13]	train-auc:0.965958	eval-auc:0.898895
[14]	train-auc:0.967728	eval-auc:0.898208
[15]	train-auc:0.969705	eval-auc:0.897983
[16]	train-auc:0.971222	eval-auc:0.89804
[17]	train-auc:0.972165	eval-auc:0.898402
[18]	train-auc:0.975603	eval-auc:0.899143
[19]	train-auc:0.977977	eval-auc:0.900148
[20]	train-auc:0.979551	eval-a

In [14]:
# Experiment 3: DART booster, depth 12, learning rate 0.05, with
# rate_drop=0.2 and skip_drop=0.5.

# Fraction of positive labels in the full label file; used as the initial
# prediction (base_score).
prior = (np.sum(y) / float(len(y))).values[0]

xgb_params = dict(
    booster='dart',
    colsample_bytree=1,
    colsample_bylevel=1,
    subsample=1,
    learning_rate=0.05,
    max_depth=12,
    min_child_weight=1,
    gamma=10,
    base_score=prior,
    rate_drop=0.2,
    skip_drop=0.5,
    # tree_method='exact',
    objective='binary:logistic',
    eval_metric='auc',
    silent=0,
    seed=12,
    nthread=20,
)

# The validation set is listed last; per the training log, 'eval-auc' is the
# metric that drives early stopping.
evals = [(dtrain, 'train'), (dval, 'eval')]
bst2 = xgb.train(
    xgb_params,
    dtrain,
    num_boost_round=50000,
    early_stopping_rounds=20,
    evals=evals,
    verbose_eval=1,
)

# Score the validation set using only the trees up to the best iteration.
val_pred = bst2.predict(dval, ntree_limit=bst2.best_ntree_limit)
print(roc_auc_score(dval.get_label(), val_pred))
print(mcc_eval(val_pred, dval))
# Number of features that received a gain-based importance score.
print(len(bst2.get_score(fmap='', importance_type='gain')))

[0]	train-auc:0.734959	eval-auc:0.686321
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 20 rounds.
[1]	train-auc:0.737817	eval-auc:0.689537
[2]	train-auc:0.75577	eval-auc:0.69453
[3]	train-auc:0.743058	eval-auc:0.698339
[4]	train-auc:0.778599	eval-auc:0.696613
[5]	train-auc:0.78503	eval-auc:0.700314
[6]	train-auc:0.784991	eval-auc:0.706388
[7]	train-auc:0.790628	eval-auc:0.707113
[8]	train-auc:0.792573	eval-auc:0.709718
[9]	train-auc:0.794649	eval-auc:0.708958
[10]	train-auc:0.790641	eval-auc:0.710519
[11]	train-auc:0.794658	eval-auc:0.7104
[12]	train-auc:0.806329	eval-auc:0.714667
[13]	train-auc:0.808178	eval-auc:0.716171
[14]	train-auc:0.808481	eval-auc:0.717265
[15]	train-auc:0.809829	eval-auc:0.716704
[16]	train-auc:0.810579	eval-auc:0.717706
[17]	train-auc:0.817238	eval-auc:0.718392
[18]	train-auc:0.818323	eval-auc:0.721169
[19]	train-auc:0.817052	eval-auc:0.719687
[20]	train-auc:0.818208	eval-auc:0