In [1]:
import pandas as pd
import xgboost as xgb
import numpy as np

%matplotlib inline

from sklearn.metrics import matthews_corrcoef
import matplotlib.pyplot as plt
from numba import jit

#@jit
def mcc(tp, tn, fp, fn):
    """Matthews correlation coefficient from the four confusion-matrix counts.

    tp, tn, fp and fn are the true-positive, true-negative, false-positive
    and false-negative counts.  If any of the four marginal totals is zero
    the denominator vanishes, and 0 is returned instead of dividing by zero.
    """
    denominator = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
    if denominator == 0:
        return 0
    numerator = tp * tn - fp * fn
    return numerator / np.sqrt(denominator)

#@jit
def eval_mcc(y_true, y_prob, show=False):
    """Scan every distinct score as a threshold and return the best MCC.

    Samples are sorted by score.  Starting from "everything predicted
    positive", samples are moved one at a time to the negative side while the
    confusion counts are updated incrementally, so the scan is a single pass
    after the sort.

    Parameters
    ----------
    y_true : array-like
        Labels; entries equal to 1 count as positives, everything else as
        negatives.
    y_prob : array-like
        Predicted scores, one per label.
    show : bool, optional
        When true, also print the sklearn MCC at the chosen threshold next to
        the scanned value, plot the per-position MCC curve and return extra
        details.

    Returns
    -------
    float
        When ``show`` is false: the best MCC found (never below the starting
        value 0.0).
    tuple
        When ``show`` is true: ``(best_proba, best_mcc, y_pred)`` where
        ``y_pred`` is ``y_prob >= best_proba`` as an int array.
    """
    # Work on plain arrays so every index below is positional, even when a
    # pandas Series (label-based indexing) or a list is passed in.
    labels = np.asarray(y_true)
    scores = np.asarray(y_prob)
    idx = np.argsort(scores)
    y_true_sort = labels[idx]
    n = labels.shape[0]
    nump = 1.0 * np.sum(labels)  # number of positives
    numn = n - nump  # number of negatives
    # Threshold below every score: all samples are predicted positive.
    tp = nump
    tn = 0.0
    fp = numn
    fn = 0.0
    best_mcc = 0.0
    best_proba = -1
    prev_proba = None
    new_mcc = 0.0
    mccs = np.zeros(n)
    for i in range(n):
        # all items with idx < i are predicted negative while others are predicted positive
        # only evaluate mcc when probability changes
        proba = scores[idx[i]]
        # Always evaluate at the first position; relying on a numeric sentinel
        # would skip it whenever the smallest score happened to equal the
        # sentinel, leaving new_mcc unset.
        if i == 0 or proba != prev_proba:
            prev_proba = proba
            new_mcc = mcc(tp, tn, fp, fn)
            # >= keeps the highest threshold among ties.
            if new_mcc >= best_mcc:
                best_mcc = new_mcc
                best_proba = proba
        mccs[i] = new_mcc
        # Sample i now moves to the predicted-negative side.
        if y_true_sort[i] == 1:
            tp -= 1.0
            fn += 1.0
        else:
            fp -= 1.0
            tn += 1.0
    if show:
        y_pred = (scores >= best_proba).astype(int)
        # Cross-check the scanned value against sklearn's implementation.
        score = matthews_corrcoef(labels, y_pred)
        print(score, best_mcc)
        plt.plot(mccs)
        return best_proba, best_mcc, y_pred
    else:
        return best_mcc

def mcc_eval(y_prob, dtrain):
    """Evaluation callback: best-threshold MCC of ``y_prob`` against the labels of ``dtrain``.

    Labels are read with ``dtrain.get_label()`` and scored with ``eval_mcc``.
    Returns the pair ``('MCC', value)``; the notebook passes this function to
    ``xgb.train`` as ``feval``.
    """
    labels = dtrain.get_label()
    return 'MCC', eval_mcc(labels, y_prob)

In [3]:
import sklearn
# Print the installed scikit-learn version.
print('The scikit-learn version is {}.'.format(sklearn.__version__))

The scikit-learn version is 0.18.


In [4]:
# Load the feature frame from the HDF5 file (stored under the key 'table').
df_train = pd.read_hdf('../../search/feats/tr_time_diff_adjacent12.hdf5','table')

In [5]:
# Read only the 'Response' column from the original training CSV; it is used as the label below.
y = pd.read_csv('../../input_orig/train_numeric.csv', usecols=['Response'])

In [6]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows, stratified on the Response values, with a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(df_train, y, test_size=0.2, random_state=42, stratify=y.values)

In [7]:
# The full frame is no longer needed once it has been split; drop the
# reference and run the garbage collector before building the DMatrix objects.
del df_train
import gc
gc.collect()

# Training matrix; the pandas inputs are released as soon as it exists.
dtrain = xgb.DMatrix(X_train, label=y_train)
del X_train, y_train
gc.collect()

# Validation matrix, handled the same way.
dval = xgb.DMatrix(X_test, label=y_test)
del X_test, y_test
gc.collect()

9

In [None]:
# Mean of Response over all labels; handed to xgboost as base_score.
prior = (np.sum(y) / float(len(y))).values[0]

# Baseline: depth-12 trees, no row/column subsampling, AUC as the metric.
xgb_params = dict(
    colsample_bytree=1,
    colsample_bylevel=1,
    subsample=1,
    learning_rate=0.1,
    max_depth=12,
    min_child_weight=1,
    gamma=0,
    base_score=prior,
    # tree_method='exact',
    objective='binary:logistic',
    eval_metric='auc',
    silent=0,
    seed=12,
    nthread=4,
)

evals = [(dtrain, 'train'), (dval, 'eval')]
# Up to 50000 rounds, with early stopping after 20 rounds without improvement.
bst = xgb.train(
    xgb_params,
    dtrain,
    num_boost_round=50000,
    evals=evals,
    early_stopping_rounds=20,
    verbose_eval=1,
)

In [None]:
# Predict on the validation matrix, limiting the model to bst.best_ntree_limit trees.
val_pred = bst.predict(dval, ntree_limit=bst.best_ntree_limit)

In [None]:
# Best-threshold MCC of the baseline predictions against the validation labels.
mcc_eval(val_pred, dval)

In [None]:
#eval_mcc(dval.get_label(), val_pred, show=1)

In [None]:
# Mean of Response over all labels; handed to xgboost as base_score.
prior = (np.sum(y) / float(len(y))).values[0]

# Same settings as the xgb_params dict, but this run also reports the custom
# MCC metric and tells xgboost to maximize it.
xgb_params1 = dict(
    colsample_bytree=1,
    colsample_bylevel=1,
    subsample=1,
    learning_rate=0.1,
    max_depth=12,
    min_child_weight=1,
    gamma=0,
    base_score=prior,
    # tree_method='exact',
    objective='binary:logistic',
    eval_metric='auc',
    silent=0,
    seed=12,
    nthread=4,
)

evals = [(dtrain, 'train'), (dval, 'eval')]
# Up to 50000 rounds, with early stopping after 20 rounds without improvement.
bst1 = xgb.train(
    xgb_params1,
    dtrain,
    num_boost_round=50000,
    evals=evals,
    feval=mcc_eval,
    maximize=True,
    early_stopping_rounds=20,
    verbose_eval=1,
)

In [12]:
# Mean of Response over all labels; handed to xgboost as base_score.
prior = (np.sum(y) / float(len(y))).values[0]

# A single boosting round with num_parallel_tree=10.
# NOTE(review): subsample and both colsample_* settings are 1, so the ten
# parallel trees may all be fitted on identical data -- confirm this is intended.
xgb_params2 = dict(
    colsample_bytree=1,
    colsample_bylevel=1,
    subsample=1,
    learning_rate=0.1,
    max_depth=12,
    min_child_weight=1,
    gamma=0,
    base_score=prior,
    num_parallel_tree=10,
    # tree_method='exact',
    objective='binary:logistic',
    eval_metric='auc',
    silent=0,
    seed=12,
    nthread=4,
)

evals = [(dtrain, 'train'), (dval, 'eval')]
# With num_boost_round=1 the 20-round early-stopping window can never be reached.
bst2 = xgb.train(
    xgb_params2,
    dtrain,
    num_boost_round=1,
    evals=evals,
    feval=mcc_eval,
    maximize=True,
    early_stopping_rounds=20,
    verbose_eval=1,
)

[0]	train-MCC:0.164093	eval-MCC:0.147757
Multiple eval metrics have been passed: 'eval-MCC' will be used for early stopping.

Will train until eval-MCC hasn't improved in 20 rounds.


In [18]:
# Mean of Response over all labels; handed to xgboost as base_score.
prior = (np.sum(y) / float(len(y))).values[0]

# One more round of 10 parallel trees, started from bst2 (passed as xgb_model).
xgb_params3 = dict(
    colsample_bytree=1,
    colsample_bylevel=1,
    subsample=1,
    learning_rate=0.1,
    max_depth=12,
    min_child_weight=1,
    gamma=0,
    base_score=prior,
    num_parallel_tree=10,
    # tree_method='exact',
    objective='binary:logistic',
    eval_metric='auc',
    silent=0,
    seed=12,
    nthread=4,
)

evals = [(dtrain, 'train'), (dval, 'eval')]
# With num_boost_round=1 the 20-round early-stopping window can never be reached.
bst3 = xgb.train(
    xgb_params3,
    dtrain,
    num_boost_round=1,
    evals=evals,
    feval=mcc_eval,
    maximize=True,
    early_stopping_rounds=20,
    verbose_eval=1,
    xgb_model=bst2,
)

[0]	train-MCC:0.187311	eval-MCC:0.152543
Multiple eval metrics have been passed: 'eval-MCC' will be used for early stopping.

Will train until eval-MCC hasn't improved in 20 rounds.


In [13]:
# Validation predictions from bst2.
val_pred2 = bst2.predict(dval)

In [14]:
from sklearn.metrics import roc_auc_score
# ROC AUC of the bst2 predictions against the validation labels.
roc_auc_score(dval.get_label(), val_pred2)

0.61968873949198322

In [15]:
# Best-threshold MCC of the bst2 predictions against the validation labels.
mcc_eval(val_pred2, dval)

('MCC', 0.14775725336057771)

In [16]:
# Number of entries in bst2's gain-importance mapping
# (presumably one per feature used in at least one split -- confirm).
len(bst2.get_score(fmap='', importance_type='gain'))

101

In [19]:
# Score bst3 on the validation matrix: ROC AUC, best-threshold MCC, and the
# number of entries in its gain-importance mapping.
val_pred3 = bst3.predict(dval)
print(roc_auc_score(dval.get_label(), val_pred3))
print(mcc_eval(val_pred3, dval))
print(len(bst3.get_score(fmap='', importance_type='gain')))

0.628627757203
('MCC', 0.15254323873498979)
175


In [20]:
# Mean of Response over all labels; handed to xgboost as base_score.
prior = (np.sum(y) / float(len(y))).values[0]

# One round of 20 parallel trees, started from bst3 (passed as xgb_model).
# The names xgb_params3 and val_pred3 are reused here for the bst4 run.
xgb_params3 = dict(
    colsample_bytree=1,
    colsample_bylevel=1,
    subsample=1,
    learning_rate=0.1,
    max_depth=12,
    min_child_weight=1,
    gamma=0,
    base_score=prior,
    num_parallel_tree=20,
    # tree_method='exact',
    objective='binary:logistic',
    eval_metric='auc',
    silent=0,
    seed=12,
    nthread=4,
)

evals = [(dtrain, 'train'), (dval, 'eval')]
# With num_boost_round=1 the 20-round early-stopping window can never be reached.
bst4 = xgb.train(
    xgb_params3,
    dtrain,
    num_boost_round=1,
    evals=evals,
    feval=mcc_eval,
    maximize=True,
    early_stopping_rounds=20,
    verbose_eval=1,
    xgb_model=bst3,
)

# Validation ROC AUC, best-threshold MCC, and gain-importance entry count.
val_pred3 = bst4.predict(dval)
print(roc_auc_score(dval.get_label(), val_pred3))
print(mcc_eval(val_pred3, dval))
print(len(bst4.get_score(fmap='', importance_type='gain')))

[0]	train-MCC:0.204316	eval-MCC:0.157513
Multiple eval metrics have been passed: 'eval-MCC' will be used for early stopping.

Will train until eval-MCC hasn't improved in 20 rounds.
0.635648390595
('MCC', 0.15751290159005946)
233


In [21]:
# Mean of Response over all labels; handed to xgboost as base_score.
prior = (np.sum(y) / float(len(y))).values[0]

# One round of 10 parallel trees, started from bst4 (passed as xgb_model).
# The names xgb_params3 and val_pred3 are reused here for the bst5 run.
xgb_params3 = dict(
    colsample_bytree=1,
    colsample_bylevel=1,
    subsample=1,
    learning_rate=0.1,
    max_depth=12,
    min_child_weight=1,
    gamma=0,
    base_score=prior,
    num_parallel_tree=10,
    # tree_method='exact',
    objective='binary:logistic',
    eval_metric='auc',
    silent=0,
    seed=12,
    nthread=4,
)

evals = [(dtrain, 'train'), (dval, 'eval')]
# With num_boost_round=1 the 20-round early-stopping window can never be reached.
bst5 = xgb.train(
    xgb_params3,
    dtrain,
    num_boost_round=1,
    evals=evals,
    feval=mcc_eval,
    maximize=True,
    early_stopping_rounds=20,
    verbose_eval=1,
    xgb_model=bst4,
)

# Validation ROC AUC, best-threshold MCC, and gain-importance entry count.
val_pred3 = bst5.predict(dval)
print(roc_auc_score(dval.get_label(), val_pred3))
print(mcc_eval(val_pred3, dval))
print(len(bst5.get_score(fmap='', importance_type='gain')))

[0]	train-MCC:0.209037	eval-MCC:0.157331
Multiple eval metrics have been passed: 'eval-MCC' will be used for early stopping.

Will train until eval-MCC hasn't improved in 20 rounds.
0.640980759579
('MCC', 0.15733091946411609)
277


In [22]:
# Mean of Response over all labels; handed to xgboost as base_score.
prior = (np.sum(y) / float(len(y))).values[0]

# Fresh model (no xgb_model): one round of 50 parallel trees, this time with
# 70% row subsampling and 70% per-tree column subsampling.
# The names xgb_params3 and val_pred3 are reused here for the bst6 run.
xgb_params3 = dict(
    colsample_bytree=0.7,
    colsample_bylevel=1,
    subsample=0.7,
    learning_rate=0.1,
    max_depth=12,
    min_child_weight=1,
    gamma=0,
    base_score=prior,
    num_parallel_tree=50,
    # tree_method='exact',
    objective='binary:logistic',
    eval_metric='auc',
    silent=0,
    seed=12,
    nthread=4,
)

evals = [(dtrain, 'train'), (dval, 'eval')]
# With num_boost_round=1 the 20-round early-stopping window can never be reached.
bst6 = xgb.train(
    xgb_params3,
    dtrain,
    num_boost_round=1,
    evals=evals,
    feval=mcc_eval,
    maximize=True,
    early_stopping_rounds=20,
    verbose_eval=1,
)

# Validation ROC AUC, best-threshold MCC, and gain-importance entry count.
val_pred3 = bst6.predict(dval)
print(roc_auc_score(dval.get_label(), val_pred3))
print(mcc_eval(val_pred3, dval))
print(len(bst6.get_score(fmap='', importance_type='gain')))

[0]	train-MCC:0.168643	eval-MCC:0.154525
Multiple eval metrics have been passed: 'eval-MCC' will be used for early stopping.

Will train until eval-MCC hasn't improved in 20 rounds.
0.655206296743
('MCC', 0.15452478072626402)
536


In [None]:
def create_feature_map(features, fmap_path='xgb_1173_date_feats.fmap'):
    """Write a feature-map file listing the given feature names.

    One tab-separated line ``index, name, q`` is written per feature, with
    indices assigned in iteration order starting at 0.

    Parameters
    ----------
    features : iterable
        Feature names, in the order their indices should be assigned.
    fmap_path : str, optional
        Destination file; any existing file is overwritten.  The default is
        the file name that ``get_importance`` reads.
    """
    # The context manager closes the file even if a write raises.
    with open(fmap_path, 'w') as outfile:
        for i, feat in enumerate(features):
            outfile.write('{0}\t{1}\tq\n'.format(i, feat))

from operator import itemgetter
def get_importance(gbm, features):
    """Return ``gbm.get_fscore`` results as (name, score) pairs, highest score first.

    The feature-map file is rewritten from ``features`` first, and that file
    is then passed to ``get_fscore`` as ``fmap``.
    """
    create_feature_map(features)
    scores = gbm.get_fscore(fmap='xgb_1173_date_feats.fmap')
    return sorted(scores.items(), key=lambda pair: pair[1], reverse=True)

In [None]:
#get_importance(bst1, bst1.feature_names)
# Write the feature-map file from bst1's feature names.
create_feature_map(bst1.feature_names)

# Save the bst1 booster to disk under the given file name.
bst1.save_model('xgb_1173_date_0168mcc_0671auc.model')

In [None]:
#get_importance(bst1, bst1.feature_names)