In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb

%matplotlib inline

from sklearn.metrics import matthews_corrcoef
import matplotlib.pyplot as plt
from numba import jit

@jit
def mcc(tp, tn, fp, fn):
    """Return the Matthews correlation coefficient of a 2x2 confusion matrix.

    Parameters
    ----------
    tp, tn, fp, fn : int or float
        Counts of true positives, true negatives, false positives and
        false negatives.

    Returns
    -------
    float
        ``(tp*tn - fp*fn) / sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))``, or
        ``0.0`` when that denominator is zero.
    """
    # Do all arithmetic in floating point: with fixed-width integer counts
    # the four-factor product below can wrap around on large sample sizes.
    tp_f = float(tp)
    tn_f = float(tn)
    fp_f = float(fp)
    fn_f = float(fn)
    sup = tp_f * tn_f - fp_f * fn_f
    inf = (tp_f + fp_f) * (tp_f + fn_f) * (tn_f + fp_f) * (tn_f + fn_f)
    if inf == 0:
        # Same float return type as the regular branch.
        return 0.0
    return sup / np.sqrt(inf)

@jit
def _mcc_threshold_scan(y_true, y_prob):
    """Try every distinct score as a cut-off and record the MCC of each.

    Private numeric kernel of ``eval_mcc``. It only works on arrays and
    scalars (no printing, plotting or scikit-learn calls) so that it can be
    compiled without falling back to Python objects.

    Parameters
    ----------
    y_true : 1-D array
        Labels; an entry equal to 1 counts as positive, anything else as
        negative.
    y_prob : 1-D array
        Scores, same length as ``y_true``.

    Returns
    -------
    (best_proba, best_mcc, mccs)
        ``best_proba`` is the score at which the highest MCC was reached
        (``-1.0`` if none was evaluated), ``best_mcc`` is that MCC, and
        ``mccs[i]`` is the MCC most recently evaluated when the i-th
        smallest score was visited.
    """
    order = np.argsort(y_prob)
    n = y_true.shape[0]
    nump = float(np.sum(y_true))  # number of positives, forced to double precision
    numn = n - nump               # number of negatives
    # Start from "everything predicted positive" and move one sample at a
    # time to the negative side in order of increasing score.
    tp = nump
    tn = 0.0
    fp = numn
    fn = 0.0
    best_mcc = 0.0
    best_proba = -1.0
    prev_proba = -1.0  # sentinel: assumes no score is exactly -1
    new_mcc = 0.0      # defined up front so mccs[i] never reads an unset value
    mccs = np.zeros(n)
    for i in range(n):
        # Samples ranked below i are predicted negative, the rest positive.
        pos = order[i]
        proba = y_prob[pos]
        # Only re-evaluate when the score changes: tied scores cannot be
        # separated by a threshold.
        if proba != prev_proba:
            prev_proba = proba
            new_mcc = mcc(tp, tn, fp, fn)
            if new_mcc >= best_mcc:
                best_mcc = new_mcc
                best_proba = proba
        mccs[i] = new_mcc
        if y_true[pos] == 1:
            tp -= 1.0
            fn += 1.0
        else:
            fp -= 1.0
            tn += 1.0
    return best_proba, best_mcc, mccs


def eval_mcc(y_true, y_prob, show=False):
    """Find the score threshold that maximises the Matthews correlation.

    The threshold sweep runs in the compiled helper ``_mcc_threshold_scan``.
    This wrapper stays plain Python because the reporting branch calls
    scikit-learn, ``print`` and matplotlib, and because the return type
    depends on ``show``; neither can be compiled by numba without its
    object-mode fallback.

    Parameters
    ----------
    y_true : 1-D array
        Labels (1 = positive).
    y_prob : 1-D array
        Predicted scores, same length as ``y_true``.
    show : bool, default False
        When true, also print the scikit-learn MCC at the chosen threshold
        next to the sweep's best MCC and plot the MCC curve.

    Returns
    -------
    float
        The best MCC, when ``show`` is false.
    (best_proba, best_mcc, y_pred)
        When ``show`` is true: the chosen threshold, the best MCC and the
        0/1 predictions ``y_prob >= best_proba``.
    """
    best_proba, best_mcc, mccs = _mcc_threshold_scan(y_true, y_prob)
    if not show:
        return best_mcc
    y_pred = (y_prob >= best_proba).astype(int)
    score = matthews_corrcoef(y_true, y_pred)
    print(score, best_mcc)
    plt.plot(mccs)
    return best_proba, best_mcc, y_pred

def mcc_eval(y_prob, dtrain):
    """Custom evaluation callback reporting the best achievable MCC.

    Parameters
    ----------
    y_prob : array
        Predictions handed over by the caller.
    dtrain : object exposing ``get_label()``
        Data container the predictions were made on.

    Returns
    -------
    tuple
        ``('MCC', value)`` where ``value`` is ``eval_mcc`` applied to the
        container's labels and ``y_prob``.
    """
    labels = dtrain.get_label()
    return 'MCC', eval_mcc(labels, y_prob)

In [2]:
# Load the training DMatrix from a file path relative to the notebook's
# working directory.
# NOTE(review): presumably a pre-built XGBoost binary buffer containing the
# labels as well (get_label() is called on it later) — confirm how it was produced.
dtrain = xgb.DMatrix('../input_hdf/train_with_faron_sel.buffer')

In [3]:
# Share of the labels that sum up as positive; used further down as the
# model's `base_score`.
y = dtrain.get_label()
prior = np.sum(y) / float(len(y))

In [6]:
# Booster configuration: no row/column subsampling, very deep trees, and the
# class prior as the starting score.
# NOTE(review): newer XGBoost releases replaced `silent` with `verbosity` and
# `feval` with `custom_metric` — confirm against the installed version.
xgb_params1 = dict(
    colsample_bytree=1,
    colsample_bylevel=1,
    subsample=1,
    learning_rate=0.1,
    max_depth=48,
    min_child_weight=1,
    gamma=0,
    base_score=prior,
    tree_method='exact',
    objective='binary:logistic',
    silent=0,
    seed=12,
    nthread=22,
)

# The only watch-list entry is the training matrix itself, so the MCC logged
# below is a training-set score and says nothing about generalisation.
evals = [(dtrain, 'train')]

bst1 = xgb.train(
    xgb_params1,
    dtrain,
    num_boost_round=100,
    evals=evals,
    verbose_eval=2,
    feval=mcc_eval,
    maximize=True,
)

[0]	train-MCC:0.349939
[2]	train-MCC:0.473154
[4]	train-MCC:0.556565
[6]	train-MCC:0.628336
[8]	train-MCC:0.70539
[10]	train-MCC:0.757008
[12]	train-MCC:0.79259
[14]	train-MCC:0.818238
[16]	train-MCC:0.834695
[18]	train-MCC:0.849717
[20]	train-MCC:0.871542
[22]	train-MCC:0.891927
[24]	train-MCC:0.909823
[26]	train-MCC:0.927451
[28]	train-MCC:0.942184
[30]	train-MCC:0.953322
[32]	train-MCC:0.962653
[34]	train-MCC:0.971831
[36]	train-MCC:0.97793
[38]	train-MCC:0.980547
[40]	train-MCC:0.985143
[42]	train-MCC:0.986468
[44]	train-MCC:0.989185
[46]	train-MCC:0.990859
[48]	train-MCC:0.991873
[50]	train-MCC:0.992535
[52]	train-MCC:0.993562
[54]	train-MCC:0.995253
[56]	train-MCC:0.996421
[58]	train-MCC:0.996927
[60]	train-MCC:0.997734
[62]	train-MCC:0.998245
[64]	train-MCC:0.998539
[66]	train-MCC:0.998904
[68]	train-MCC:0.999122
[70]	train-MCC:0.999342
[72]	train-MCC:0.999488
[74]	train-MCC:0.999561
[76]	train-MCC:0.999561
[78]	train-MCC:0.999561
[80]	train-MCC:0.999561
[82]	train-MCC:0.999561


In [14]:
# Resume boosting from bst1 for 10 more rounds, logging every round.
bst2 = xgb.train(
    xgb_params1,
    dtrain,
    xgb_model=bst1,
    num_boost_round=10,
    evals=evals,
    verbose_eval=1,
    feval=mcc_eval,
    maximize=True,
)

[0]	train-MCC:0.999561
[1]	train-MCC:0.999561
[2]	train-MCC:0.999561
[3]	train-MCC:0.999561
[4]	train-MCC:0.999561
[5]	train-MCC:0.999561
[6]	train-MCC:0.999561
[7]	train-MCC:0.999561
[8]	train-MCC:0.999561
[9]	train-MCC:0.999561


In [18]:
# Resume boosting from bst2 for another 10 rounds, logging every round.
bst3 = xgb.train(
    xgb_params1,
    dtrain,
    xgb_model=bst2,
    num_boost_round=10,
    evals=evals,
    verbose_eval=1,
    feval=mcc_eval,
    maximize=True,
)

[0]	train-MCC:0.999561
[1]	train-MCC:0.999561
[2]	train-MCC:0.999561
[3]	train-MCC:0.999561
[4]	train-MCC:0.999561
[5]	train-MCC:0.999561
[6]	train-MCC:0.999561
[7]	train-MCC:0.999561
[8]	train-MCC:0.999562
[9]	train-MCC:0.999635


In [7]:
# Write a text dump of bst1 (with per-node statistics, feature names taken
# from xgb1.fmap) and save the model itself to disk.
# NOTE(review): this persists bst1, whereas the test predictions in this
# notebook are made with bst3 — confirm that is the model meant to be kept.
bst1.dump_model('427646mcc_pred.dump', fmap='xgb1.fmap', with_stats=True)
bst1.save_model('427646_pred.model')

In [8]:
# Load the test DMatrix from a file path relative to the notebook's working
# directory.
# NOTE(review): presumably built with the same feature set as the training
# buffer — confirm.
dtest = xgb.DMatrix('../input_hdf/test_with_faron_sel.buffer')

In [9]:
# Decision threshold applied to the test predictions.
# NOTE(review): hard-coded; presumably copied from an earlier
# eval_mcc(..., show=True) run that is not in this notebook — TODO confirm
# which model it was tuned on and recompute it here.
best_proba = 0.29268866777420044

In [19]:
# generate predictions at the chosen threshold
# Use >= so that a score exactly equal to the threshold is labelled positive,
# matching how eval_mcc builds its predictions (y_prob >= best_proba).
preds = (bst3.predict(dtest) >= best_proba).astype(np.int8)

In [20]:
# Build the submission: take the sample file (first column as index), set
# its "Response" column to the thresholded predictions, and write it
# gzip-compressed.
# NOTE(review): assumes the rows of dtest are in the same order as the
# sample submission — confirm.
sub = pd.read_csv(
    "../input_orig/sample_submission.csv",
    index_col=0,
).assign(Response=preds)
sub.to_csv("427646_pred_bst3_submission.csv.gz", compression="gzip")

In [21]:
# Sanity check: preds holds 0/1 values, so the sum is the number of test rows
# predicted positive.
preds.sum()

2185