In [43]:
%matplotlib inline
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from lib import XGBHelper as xgbh
# Fixed seed so the train / validation / test partition is identical after
# every "Restart Kernel -> Run All"; without it each run scores a different split.
SEED = 0

data = np.load("Data/processed_data_150mb.np")
X = data[:, :-1]                        # every column except the last is a feature
y = np.array(data[:, -1], dtype=int)    # last column is the label, cast to int

# First carve off 30% of the rows as a hold-out set, then halve that hold-out
# into the test and validation splits (15% of the rows each).
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.30, random_state=SEED
)
X_test, X_val, y_test, y_val = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=SEED
)

# Wrap each split in the DMatrix container consumed by xgboost.
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(X_test, label=y_test)

In [44]:
from xgboost import cv

In [45]:
# Booster configuration; used by the cross-validation below and, through
# `plst`, by the later xgb.train call.
param = {
    'max_depth': 3,                    # depth of tree
    'eta': 0.3,                        # shrinkage parameter
    'silent': 0,                       # not silent
    'objective': 'binary:logistic',
    'nthread': 7,                      # number of threads used
}

plst = param.items()
print(param)

# Watchlist passed to xgb.train in a later cell.
# NOTE(review): the entry labelled 'eval' points at dtest, while dval is built
# earlier and not used in any visible cell -- confirm this is intended.
evallist = [(dtrain, 'train'), (dtest, 'eval')]

# 10-fold cross-validation on the training matrix for 160 boosting rounds,
# reporting progress every 5 rounds.
cross_val = cv(
    param,
    dtrain,
    num_boost_round=160,
    nfold=10,
    stratified=False,
    folds=None,
    metrics=(),
    obj=None,
    feval=None,
    maximize=False,
    early_stopping_rounds=None,
    fpreproc=None,
    as_pandas=True,
    verbose_eval=5,
    show_stdv=True,
    seed=0,
    callbacks=None,
    shuffle=True,
)

{'max_depth': 3, 'eta': 0.3, 'silent': 0, 'objective': 'binary:logistic', 'nthread': 7}
[0]	train-error:0.179995+0.000768324	test-error:0.183286+0.00743744
[5]	train-error:0.165496+0.00260266	test-error:0.170433+0.00580026
[10]	train-error:0.152309+0.00156104	test-error:0.158099+0.0049536
[15]	train-error:0.144152+0.00172742	test-error:0.150429+0.00672566
[20]	train-error:0.140087+0.00178554	test-error:0.147043+0.0061539
[25]	train-error:0.137258+0.00145373	test-error:0.145108+0.00547633
[30]	train-error:0.135787+0.00178357	test-error:0.14345+0.00548694
[35]	train-error:0.134071+0.00123099	test-error:0.143449+0.0056859
[40]	train-error:0.132409+0.00117832	test-error:0.143104+0.00509052
[45]	train-error:0.131303+0.00109255	test-error:0.142862+0.00548108
[50]	train-error:0.12981+0.00110879	test-error:0.14231+0.00539924
[55]	train-error:0.128731+0.00103833	test-error:0.141826+0.00573896
[60]	train-error:0.127637+0.00123472	test-error:0.140859+0.00594658
[65]	train-error:0.126609+0.0012249

In [46]:
# Best cross-validated accuracy: one minus the lowest mean test error
# recorded over the boosting rounds.
lowest_cv_error = min(cross_val['test-error-mean'])
1 - lowest_cv_error

0.8606271

In [47]:
def get_error_values(y_pred, y_test, thresholds):
    """Compute per-class error rates of thresholded scores.

    A sample is predicted as class 1 when its score is strictly greater than
    the threshold, and as class 0 otherwise.

    Parameters
    ----------
    y_pred : array-like
        One score per sample.
    y_test : array-like
        True labels; only entries equal to 1 or 0 are counted.
    thresholds : iterable
        Cut-off values to evaluate, in the order the results are returned.

    Returns
    -------
    (error_1, error_0) : tuple of lists
        ``error_1[i]`` is the fraction of label-1 samples whose score is not
        above ``thresholds[i]``; ``error_0[i]`` is the fraction of label-0
        samples whose score is above it. A class with no samples yields NaN.

    NOTE(review): the visible call site in this notebook uses
    ``xgbh.get_error_values`` rather than this local definition.
    """
    # Accept plain lists as well as arrays; boolean-mask indexing needs ndarrays.
    y_pred = np.asarray(y_pred)
    y_test = np.asarray(y_test)

    # The class masks do not depend on the threshold, so select once up front
    # instead of re-masking on every loop iteration.
    scores_1 = y_pred[y_test == 1]
    scores_0 = y_pred[y_test == 0]

    def _rate(hits, total):
        # A rate over zero samples is undefined: return NaN explicitly rather
        # than going through a 0/0 division.
        return 1.0 * hits / total if total else float("nan")

    accuracy_1 = [_rate(np.sum(scores_1 > thresh), len(scores_1)) for thresh in thresholds]
    accuracy_0 = [_rate(np.sum(scores_0 <= thresh), len(scores_0)) for thresh in thresholds]

    error_1 = list(1 - np.array(accuracy_1, dtype=float))
    error_0 = list(1 - np.array(accuracy_0, dtype=float))
    return error_1, error_0

In [48]:
# Fit the booster on the training matrix with the configuration prepared earlier.
num_round = 150
bst = xgb.train(
    plst,
    dtrain,
    num_round,
    evallist,
    verbose_eval=False,
)

# output_margin=True requests raw, untransformed margin scores rather than
# probabilities, so y_pred is not confined to [0, 1].
y_pred = bst.predict(
    dtest,
    ntree_limit=bst.best_ntree_limit,
    output_margin=True,
)

# Candidate cut-offs: the distinct scores after rounding to two decimals,
# in ascending order.
rounded_scores = np.round(y_pred, 2)
thresholds = sorted(np.unique(rounded_scores))

# Error curves over those cut-offs, computed by the project helper module.
error_cuv, error_ger = xgbh.get_error_values(y_pred, y_test, thresholds)

In [69]:
# Accuracy on the label-1 samples at a single fixed cut-off: the share of
# them whose score lies strictly above `thresh`.
# NOTE(review): y_pred is produced with output_margin=True, so it appears to
# hold raw margins rather than probabilities -- confirm 0.5 is the intended
# cut-off on that scale.
thresh = 0.5
y_test_i = y_test[y_test == 1]
y_pred_i = y_pred[y_test == 1]
correct = (y_pred_i > thresh).sum()
accuracy_1 = [1.0 * correct / len(y_test_i)]

In [76]:
# Display the one-element list holding the label-1 accuracy computed above.
accuracy_1

[0.8338810875410816]

In [51]:
# Accuracy on the label-0 samples at a single fixed cut-off: the share of
# them whose score is at or below `thresh`.
# NOTE(review): y_pred is produced with output_margin=True, so it appears to
# hold raw margins rather than probabilities -- confirm 0.5 is the intended
# cut-off on that scale.
thresh = 0.5
y_test_i = y_test[y_test == 0]
y_pred_i = y_pred[y_test == 0]
correct = (y_pred_i <= thresh).sum()
accuracy_0 = [1.0 * correct / len(y_test_i)]

In [77]:
# Display the one-element list holding the label-0 accuracy computed above.
accuracy_0

[0.8882661996497373]

In [78]:
# Gather both per-class accuracies into one list: label-0 first, then label-1.
accuracy = [*accuracy_0, *accuracy_1]

In [79]:
# Display the combined list of per-class accuracies.
accuracy

[0.8882661996497373, 0.8338810875410816]

In [80]:
# Half the sum of the entries in `accuracy`, i.e. the unweighted mean of the
# two per-class accuracies gathered above.
0.5 * sum(accuracy)

0.8610736435954094