In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb

%matplotlib inline

from sklearn.metrics import matthews_corrcoef
import matplotlib.pyplot as plt
from numba import jit

@jit
def mcc(tp, tn, fp, fn):
    """Matthews correlation coefficient from the four confusion-matrix counts.

    tp, tn, fp, fn are the true-positive, true-negative, false-positive and
    false-negative counts. Returns 0 when the product under the square root
    is zero (the coefficient is undefined there), otherwise
    (tp*tn - fp*fn) / sqrt((tp+fp)(tp+fn)(tn+fp)(tn+fn)).
    """
    denom_sq = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
    # Guard first: a zero product means at least one marginal total is empty.
    if denom_sq == 0:
        return 0
    numer = tp * tn - fp * fn
    return numer / np.sqrt(denom_sq)

@jit(nopython=True)
def _mcc_threshold_scan(y_true_sorted, y_prob_sorted):
    """Scan every distinct score as a decision threshold and keep the best MCC.

    Both inputs must already be ordered by ascending score. At step i all
    items before position i are predicted negative and the rest positive.

    Returns (best_proba, best_mcc, mccs), where mccs[i] is the MCC of the
    most recently evaluated threshold at position i.
    """
    n = y_true_sorted.shape[0]
    nump = 1.0 * np.sum(y_true_sorted)  # number of positives
    numn = n - nump                     # number of negatives
    # Start with everything predicted positive.
    tp = nump
    tn = 0.0
    fp = numn
    fn = 0.0
    best_mcc = 0.0
    best_proba = -1.0
    prev_proba = 0.0
    new_mcc = 0.0
    mccs = np.zeros(n)
    for i in range(n):
        proba = y_prob_sorted[i]
        # Only re-evaluate when the score changes, so tied scores share one
        # threshold. `i == 0` forces an evaluation on the first item no matter
        # what its score is (no sentinel value that a real score could match).
        if i == 0 or proba != prev_proba:
            prev_proba = proba
            new_mcc = mcc(tp, tn, fp, fn)
            if new_mcc >= best_mcc:
                best_mcc = new_mcc
                best_proba = proba
        mccs[i] = new_mcc
        # Item i moves to the "predicted negative" side for the next step.
        if y_true_sorted[i] == 1:
            tp -= 1.0
            fn += 1.0
        else:
            fp -= 1.0
            tn += 1.0
    return best_proba, best_mcc, mccs


def eval_mcc(y_true, y_prob, show=False):
    """Best Matthews correlation coefficient over all score thresholds.

    Parameters
    ----------
    y_true : array-like of 0/1 labels.
    y_prob : array-like of scores, same length as y_true.
    show : when True, also print and plot diagnostics.

    Returns
    -------
    best_mcc when show is False; otherwise the tuple
    (best_proba, best_mcc, y_pred), where y_pred is (y_prob >= best_proba)
    as integers.

    The numeric scan lives in a nopython-compiled helper. This entry point is
    deliberately left undecorated: it calls print, matplotlib and scikit-learn
    and returns two different shapes, none of which can be compiled in
    nopython mode.
    """
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)
    order = np.argsort(y_prob)
    best_proba, best_mcc, mccs = _mcc_threshold_scan(y_true[order], y_prob[order])
    if show:
        y_pred = (y_prob >= best_proba).astype(int)
        # Cross-check the scanned optimum against scikit-learn's implementation.
        score = matthews_corrcoef(y_true, y_pred)
        print(score, best_mcc)
        plt.plot(mccs)
        return best_proba, best_mcc, y_pred
    return best_mcc

def mcc_eval(y_prob, dtrain):
    """Custom-metric adapter around eval_mcc.

    Reads the labels from `dtrain.get_label()`, scores `y_prob` against them
    with eval_mcc, and returns the pair ('MCC', value). This notebook passes
    the function as `feval` to xgb.train.
    """
    labels = dtrain.get_label()
    return 'MCC', eval_mcc(labels, y_prob)

In [None]:
# Build the training DMatrix from a file at a path relative to the working
# directory. The file's contents (feature set, row order) are not visible here.
dtrain = xgb.DMatrix('input_hdf/train_with_faron.buffer')

In [3]:
# Read only the 'Id' and 'Response' columns of the raw training CSV; this frame
# is what the train/validation split below is computed on.
df1 = pd.read_csv('input_orig/train_numeric.csv', usecols=['Id','Response'])

In [4]:
# `sklearn.cross_validation` was removed in scikit-learn 0.20; the function now
# lives in `sklearn.model_selection`. Fall back to the old module only on
# installations that predate the move.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out 10% of the rows, stratified on Response, with a fixed seed.
# Only the resulting indices (X_train.index / X_test.index) are used further
# down, to slice the DMatrix.
X_train, X_test, y_train, y_test = train_test_split(
    df1,
    df1.Response.values,
    test_size=0.1,
    random_state=42,
    stratify=df1.Response.values)

In [5]:
# Labels of `dtrain` as it stands at this point in the notebook, and their mean
# (for 0/1 labels this is the positive rate). `prior` is later passed to
# xgboost as 'base_score'.
y = dtrain.get_label()
prior = np.sum(y) / float(len(y))

In [6]:
# Validation DMatrix: the rows of `dtrain` selected by the held-out split.
# NOTE(review): this uses df1's index values as row positions in `dtrain`, so it
# assumes the DMatrix rows are in the same order as train_numeric.csv — confirm.
# This cell must run before `dtrain` is replaced by its own training slice.
dvalid = dtrain.slice(X_test.index)

In [7]:
# Sanity check: the validation DMatrix should have as many rows as X_test.
dvalid.num_row(), X_test.shape

(118375, (118375, 2))

In [8]:
# Replace `dtrain` with only its training rows.
# NOTE(review): this rebinds `dtrain` to a slice of itself, so the cell is not
# idempotent — running it twice slices the already-reduced matrix with indices
# that refer to the full one. Re-load `dtrain` before re-running.
dtrain = dtrain.slice(X_train.index)

In [9]:
# Sanity check: the training DMatrix should have as many rows as X_train.
dtrain.num_row(), X_train.shape

(1065372, (1065372, 2))

In [14]:
# Ratio of non-positive to positive labels in `y` (label sum taken as the
# positive count). Not referenced by any parameter dict visible in this notebook.
n_pos = y.sum()
spw = (y.shape[0] - n_pos) / n_pos

In [28]:
# Run 5: depth-9 trees, no row/column subsampling, gamma 0.2.
xgb_params5 = {
    'seed': 12,
    'colsample_bytree': 1,
    'gamma': 0.2,
    'silent': 0,
    'subsample': 1,
    'learning_rate': 0.1,
    'objective': 'binary:logistic',
    'max_depth': 9,
    'min_child_weight': 1,
    'eval_metric': 'auc',
    'max_delta_step': 1,
    'base_score': prior,   # mean label computed earlier in the notebook
    'nthread': 16
}

# Watchlist: metrics are reported for both sets on every round.
evals5 = [(dtrain, 'train'), (dvalid, 'eval')]

# Up to 500 rounds, scored with the custom MCC metric (higher is better);
# training stops once the metric has not improved for 20 rounds.
bst5 = xgb.train(
    params=xgb_params5,
    dtrain=dtrain,
    num_boost_round=500,
    evals=evals5,
    early_stopping_rounds=20,
    verbose_eval=1,
    feval=mcc_eval,
    maximize=True)

[0]	train-MCC:0.101656	eval-MCC:0.074093
Multiple eval metrics have been passed: 'eval-MCC' will be used for early stopping.

Will train until eval-MCC hasn't improved in 20 rounds.
[1]	train-MCC:0.147531	eval-MCC:0.114974
[2]	train-MCC:0.156966	eval-MCC:0.125581
[3]	train-MCC:0.200676	eval-MCC:0.169536
[4]	train-MCC:0.215927	eval-MCC:0.188348
[5]	train-MCC:0.224383	eval-MCC:0.203386
[6]	train-MCC:0.226055	eval-MCC:0.204349
[7]	train-MCC:0.233152	eval-MCC:0.210605
[8]	train-MCC:0.237714	eval-MCC:0.216841
[9]	train-MCC:0.240379	eval-MCC:0.219236
[10]	train-MCC:0.241404	eval-MCC:0.221061
[11]	train-MCC:0.246147	eval-MCC:0.223414
[12]	train-MCC:0.249211	eval-MCC:0.22447
[13]	train-MCC:0.252353	eval-MCC:0.225822
[14]	train-MCC:0.254265	eval-MCC:0.226966
[15]	train-MCC:0.255546	eval-MCC:0.229781
[16]	train-MCC:0.257099	eval-MCC:0.22923
[17]	train-MCC:0.258208	eval-MCC:0.230459
[18]	train-MCC:0.259757	eval-MCC:0.232465
[19]	train-MCC:0.259909	eval-MCC:0.23308
[20]	train-MCC:0.262731	eval-MCC

In [29]:
import operator

# (feature, score) pairs from bst5.get_fscore(), sorted ascending by score.
importance = sorted(bst5.get_fscore().items(), key=operator.itemgetter(1))

# Number of features that received a score.
len(importance)

509

In [30]:
# Write the booster `bst5` to disk.
# NOTE(review): another cell further down also calls save_model('0001.model'),
# on `bst`; whichever cell runs last overwrites this file.
bst5.save_model('0001.model')

In [27]:
# Run 4: shallow depth-4 trees with 70% row and column subsampling.
xgb_params4 = {
    'seed': 12,
    'colsample_bytree': 0.7,
    'silent': 0,
    'subsample': 0.7,
    'learning_rate': 0.1,
    'objective': 'binary:logistic',
    'max_depth': 4,
    'min_child_weight': 1,
    'eval_metric': 'auc',
    'max_delta_step': 1,
    'base_score': prior,   # mean label computed earlier in the notebook
    'nthread': 16
}

# Watchlist: metrics are reported for both sets on every round.
evals4 = [(dtrain, 'train'), (dvalid, 'eval')]

# Up to 500 rounds, scored with the custom MCC metric (higher is better);
# training stops once the metric has not improved for 20 rounds.
bst4 = xgb.train(
    params=xgb_params4,
    dtrain=dtrain,
    num_boost_round=500,
    evals=evals4,
    early_stopping_rounds=20,
    verbose_eval=1,
    feval=mcc_eval,
    maximize=True)

[0]	train-MCC:0.144084	eval-MCC:0.132538
Multiple eval metrics have been passed: 'eval-MCC' will be used for early stopping.

Will train until eval-MCC hasn't improved in 20 rounds.
[1]	train-MCC:0.187442	eval-MCC:0.180184
[2]	train-MCC:0.189108	eval-MCC:0.184737
[3]	train-MCC:0.195746	eval-MCC:0.191437
[4]	train-MCC:0.198772	eval-MCC:0.1933
[5]	train-MCC:0.208298	eval-MCC:0.204426
[6]	train-MCC:0.208369	eval-MCC:0.204613
[7]	train-MCC:0.20871	eval-MCC:0.204613
[8]	train-MCC:0.211193	eval-MCC:0.207869
[9]	train-MCC:0.211882	eval-MCC:0.210031
[10]	train-MCC:0.211882	eval-MCC:0.210031
[11]	train-MCC:0.215175	eval-MCC:0.214675
[12]	train-MCC:0.215658	eval-MCC:0.214675
[13]	train-MCC:0.220175	eval-MCC:0.216512
[14]	train-MCC:0.220175	eval-MCC:0.216512
[15]	train-MCC:0.221749	eval-MCC:0.217925
[16]	train-MCC:0.221749	eval-MCC:0.217925
[17]	train-MCC:0.221749	eval-MCC:0.218163
[18]	train-MCC:0.221776	eval-MCC:0.218402
[19]	train-MCC:0.221776	eval-MCC:0.219202
[20]	train-MCC:0.223836	eval-MCC

In [24]:
# Run 3: depth-9 trees with 70% row and column subsampling.
xgb_params3 = {
    'seed': 12,
    'colsample_bytree': 0.7,
    'silent': 0,
    'subsample': 0.7,
    'learning_rate': 0.1,
    'objective': 'binary:logistic',
    'max_depth': 9,
    'min_child_weight': 1,
    'eval_metric': 'auc',
    'max_delta_step': 1,
    'base_score': prior,   # mean label computed earlier in the notebook
    'nthread': 16
}

# Watchlist: metrics are reported for both sets on every round.
evals3 = [(dtrain, 'train'), (dvalid, 'eval')]

# Up to 500 rounds, scored with the custom MCC metric (higher is better);
# training stops once the metric has not improved for 20 rounds.
bst3 = xgb.train(
    params=xgb_params3,
    dtrain=dtrain,
    num_boost_round=500,
    evals=evals3,
    early_stopping_rounds=20,
    verbose_eval=1,
    feval=mcc_eval,
    maximize=True)

[0]	train-MCC:0.103177	eval-MCC:0.081876
Multiple eval metrics have been passed: 'eval-MCC' will be used for early stopping.

Will train until eval-MCC hasn't improved in 20 rounds.
[1]	train-MCC:0.182421	eval-MCC:0.160426
[2]	train-MCC:0.197017	eval-MCC:0.179417
[3]	train-MCC:0.209835	eval-MCC:0.19641
[4]	train-MCC:0.210657	eval-MCC:0.19955
[5]	train-MCC:0.214396	eval-MCC:0.20313
[6]	train-MCC:0.215537	eval-MCC:0.205143
[7]	train-MCC:0.219624	eval-MCC:0.209549
[8]	train-MCC:0.221824	eval-MCC:0.216904
[9]	train-MCC:0.22407	eval-MCC:0.218846
[10]	train-MCC:0.226918	eval-MCC:0.219914
[11]	train-MCC:0.227383	eval-MCC:0.220761
[12]	train-MCC:0.228241	eval-MCC:0.223141
[13]	train-MCC:0.230206	eval-MCC:0.224955
[14]	train-MCC:0.231882	eval-MCC:0.229305
[15]	train-MCC:0.23181	eval-MCC:0.227301
[16]	train-MCC:0.233107	eval-MCC:0.226944
[17]	train-MCC:0.234163	eval-MCC:0.226425
[18]	train-MCC:0.235145	eval-MCC:0.228305
[19]	train-MCC:0.235975	eval-MCC:0.230719
[20]	train-MCC:0.238121	eval-MCC:0

In [25]:
import operator

# (feature, score) pairs from bst3.get_fscore(), sorted ascending by score.
importance = sorted(bst3.get_fscore().items(), key=operator.itemgetter(1))

# Number of features that received a score.
len(importance)

694

In [21]:
# Run 2: depth-9 trees, no row/column subsampling.
xgb_params2 = {
    'seed': 12,
    'colsample_bytree': 1,
    'silent': 0,
    'subsample': 1,
    'learning_rate': 0.1,
    'objective': 'binary:logistic',
    'max_depth': 9,
    'min_child_weight': 1,
    'eval_metric': 'auc',
    'max_delta_step': 1,
    'base_score': prior,   # mean label computed earlier in the notebook
    'nthread': 16
}

# Watchlist: metrics are reported for both sets on every round.
evals2 = [(dtrain, 'train'), (dvalid, 'eval')]

# Up to 500 rounds, scored with the custom MCC metric (higher is better);
# training stops once the metric has not improved for 10 rounds.
bst2 = xgb.train(
    params=xgb_params2,
    dtrain=dtrain,
    num_boost_round=500,
    evals=evals2,
    early_stopping_rounds=10,
    verbose_eval=1,
    feval=mcc_eval,
    maximize=True)

[0]	train-MCC:0.101656	eval-MCC:0.074093
Multiple eval metrics have been passed: 'eval-MCC' will be used for early stopping.

Will train until eval-MCC hasn't improved in 10 rounds.
[1]	train-MCC:0.147531	eval-MCC:0.114974
[2]	train-MCC:0.156966	eval-MCC:0.125581
[3]	train-MCC:0.202549	eval-MCC:0.172481
[4]	train-MCC:0.215927	eval-MCC:0.188477
[5]	train-MCC:0.224383	eval-MCC:0.203386
[6]	train-MCC:0.226112	eval-MCC:0.204349
[7]	train-MCC:0.233421	eval-MCC:0.211701
[8]	train-MCC:0.238044	eval-MCC:0.216841
[9]	train-MCC:0.240607	eval-MCC:0.219236
[10]	train-MCC:0.241677	eval-MCC:0.221061
[11]	train-MCC:0.245943	eval-MCC:0.223414
[12]	train-MCC:0.249157	eval-MCC:0.224918
[13]	train-MCC:0.251824	eval-MCC:0.225822
[14]	train-MCC:0.254092	eval-MCC:0.227197
[15]	train-MCC:0.254845	eval-MCC:0.227661
[16]	train-MCC:0.256775	eval-MCC:0.227391
[17]	train-MCC:0.257856	eval-MCC:0.229457
[18]	train-MCC:0.258837	eval-MCC:0.230342
[19]	train-MCC:0.260623	eval-MCC:0.231514
[20]	train-MCC:0.261112	eval-

In [22]:
import operator

# (feature, score) pairs from bst2.get_fscore(), sorted ascending by score.
importance = sorted(bst2.get_fscore().items(), key=operator.itemgetter(1))

# Number of features that received a score.
len(importance)

487

In [18]:
# Run 1: depth-6 trees, no row/column subsampling.
xgb_params1 = {
    'seed': 12,
    'colsample_bytree': 1,
    'silent': 0,
    'subsample': 1,
    'learning_rate': 0.1,
    'objective': 'binary:logistic',
    'max_depth': 6,
    'min_child_weight': 1,
    'eval_metric': 'auc',
    'max_delta_step': 1,
    'base_score': prior,   # mean label computed earlier in the notebook
    'nthread': 16
}

# Watchlist: metrics are reported for both sets on every round.
evals1 = [(dtrain, 'train'), (dvalid, 'eval')]

# Up to 500 rounds, scored with the custom MCC metric (higher is better);
# training stops once the metric has not improved for 10 rounds.
bst1 = xgb.train(
    params=xgb_params1,
    dtrain=dtrain,
    num_boost_round=500,
    evals=evals1,
    early_stopping_rounds=10,
    verbose_eval=1,
    feval=mcc_eval,
    maximize=True)

[0]	train-MCC:0.158945	eval-MCC:0.140281
Multiple eval metrics have been passed: 'eval-MCC' will be used for early stopping.

Will train until eval-MCC hasn't improved in 10 rounds.
[1]	train-MCC:0.185775	eval-MCC:0.169967
[2]	train-MCC:0.200763	eval-MCC:0.183731
[3]	train-MCC:0.205663	eval-MCC:0.190944
[4]	train-MCC:0.212093	eval-MCC:0.197412
[5]	train-MCC:0.216434	eval-MCC:0.20162
[6]	train-MCC:0.218415	eval-MCC:0.202657
[7]	train-MCC:0.219836	eval-MCC:0.209805
[8]	train-MCC:0.223189	eval-MCC:0.214376
[9]	train-MCC:0.224532	eval-MCC:0.216003
[10]	train-MCC:0.226384	eval-MCC:0.218062
[11]	train-MCC:0.22895	eval-MCC:0.218062
[12]	train-MCC:0.229841	eval-MCC:0.219536
[13]	train-MCC:0.229854	eval-MCC:0.219749
[14]	train-MCC:0.230071	eval-MCC:0.22068
[15]	train-MCC:0.23023	eval-MCC:0.220102
[16]	train-MCC:0.231573	eval-MCC:0.222234
[17]	train-MCC:0.231351	eval-MCC:0.223159
[18]	train-MCC:0.233571	eval-MCC:0.225959
[19]	train-MCC:0.234366	eval-MCC:0.226972
[20]	train-MCC:0.235227	eval-MCC:

In [19]:
import operator

# (feature, score) pairs from bst1.get_fscore(), sorted ascending by score.
importance = sorted(bst1.get_fscore().items(), key=operator.itemgetter(1))

# Number of features that received a score.
len(importance)

350

In [18]:
# Baseline run: depth-6 trees, no row/column subsampling.
# NOTE(review): the output stored under this cell reports a patience of 5
# rounds while the call below passes 10, so that output was produced by an
# earlier version of this cell.
xgb_params = {
    'seed': 12,
    'colsample_bytree': 1,
    'silent': 0,
    'subsample': 1,
    'learning_rate': 0.1,
    'objective': 'binary:logistic',
    'max_depth': 6,
    'min_child_weight': 1,
    'eval_metric': 'auc',
    'max_delta_step': 1,
    'base_score': prior,   # mean label computed earlier in the notebook
    'nthread': 16
}

# Watchlist: metrics are reported for both sets on every round.
evals = [(dtrain, 'train'), (dvalid, 'eval')]

# Up to 500 rounds, scored with the custom MCC metric (higher is better);
# training stops once the metric has not improved for 10 rounds.
bst = xgb.train(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=500,
    evals=evals,
    early_stopping_rounds=10,
    verbose_eval=1,
    feval=mcc_eval,
    maximize=True)

[0]	train-MCC:0.227742	eval-MCC:0.214714
Multiple eval metrics have been passed: 'eval-MCC' will be used for early stopping.

Will train until eval-MCC hasn't improved in 5 rounds.
[1]	train-MCC:0.228179	eval-MCC:0.218644
[2]	train-MCC:0.236488	eval-MCC:0.22626
[3]	train-MCC:0.237394	eval-MCC:0.22762
[4]	train-MCC:0.242003	eval-MCC:0.22974
[5]	train-MCC:0.245214	eval-MCC:0.232234
[6]	train-MCC:0.248151	eval-MCC:0.234204
[7]	train-MCC:0.2492	eval-MCC:0.234689
[8]	train-MCC:0.253065	eval-MCC:0.2352
[9]	train-MCC:0.255618	eval-MCC:0.23766
[10]	train-MCC:0.25734	eval-MCC:0.236629
[11]	train-MCC:0.259203	eval-MCC:0.238108
[12]	train-MCC:0.262757	eval-MCC:0.238861
[13]	train-MCC:0.266244	eval-MCC:0.243309
[14]	train-MCC:0.268314	eval-MCC:0.244251
[15]	train-MCC:0.272012	eval-MCC:0.244229
[16]	train-MCC:0.273699	eval-MCC:0.242851
[17]	train-MCC:0.275936	eval-MCC:0.245157
[18]	train-MCC:0.27842	eval-MCC:0.246093
[19]	train-MCC:0.27852	eval-MCC:0.249438
[20]	train-MCC:0.281149	eval-MCC:0.249768

In [20]:
import operator

# (feature, score) pairs from bst.get_fscore(), sorted ascending by score.
importance = sorted(bst.get_fscore().items(), key=operator.itemgetter(1))

# Number of features that received a score.
len(importance)

In [26]:
# Write the booster `bst` to disk.
# NOTE(review): an earlier cell also calls save_model('0001.model'), on `bst5`;
# whichever cell runs last overwrites the file.
bst.save_model('0001.model')