## K-Fold CV XGBoost

In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# metrics
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score

from scipy.stats import uniform, randint

from sklearn.datasets import load_breast_cancer, load_diabetes, load_wine
from sklearn.metrics import auc, accuracy_score, confusion_matrix, mean_squared_error
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split


from sklearn.model_selection import cross_val_predict

import xgboost as xgb


In [36]:
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    """Return the (unnormalised) Gini score of ``pred`` as a ranking of ``actual``.

    Rows are ordered by descending ``pred``; ties are broken by original
    position so the result is deterministic. The score is derived from the
    cumulative share of ``actual`` captured along that ordering.

    Parameters
    ----------
    actual : sequence of numbers
        Ground-truth values (e.g. 0/1 labels).
    pred : sequence of numbers
        Predicted scores; must have the same length as ``actual``.
    cmpcol, sortcol : int
        Unused. Kept only so existing callers that pass them keep working.

    Returns
    -------
    float
        The Gini score. Divide by ``gini(actual, actual)`` to normalise.
    """
    assert len(actual) == len(pred)
    n_samples = len(actual)

    # Columns: 0 = actual, 1 = prediction, 2 = original position.
    # The builtin ``float`` replaces ``np.float``, which was removed in NumPy 1.24.
    table = np.asarray(np.c_[actual, pred, np.arange(n_samples)], dtype=float)

    # lexsort treats the LAST key as primary: sort by prediction descending,
    # then by original position ascending for ties.
    order = np.lexsort((table[:, 2], -1 * table[:, 1]))
    table = table[order]

    total_losses = table[:, 0].sum()
    gini_sum = table[:, 0].cumsum().sum() / total_losses

    gini_sum -= (n_samples + 1) / 2.
    return gini_sum / n_samples

def gini_normalized(a, p):
    """Return the Gini score of predictions ``p`` scaled by the best attainable score.

    The denominator is the Gini obtained when the actual values ``a`` are
    used as their own predictions.
    """
    model_gini = gini(a, p)
    perfect_gini = gini(a, a)
    return model_gini / perfect_gini

def gini_xgb(preds, dtrain):
    """Custom evaluation callback: normalised Gini of ``preds`` on ``dtrain``.

    ``dtrain`` must expose ``get_label()``. Returns a one-element list holding
    a ``('gini', score)`` pair.
    """
    score = gini_normalized(dtrain.get_label(), preds)
    return [('gini', score)]

In [49]:
# Seed for the hold-out split; the only visible use is the commented-out
# train_test_split call further down in this notebook.
RANDOM_SEED = 12
# NOTE: despite the name this is a fraction (0.2 == 20 %), not a percentage;
# the commented-out split passes it as train_test_split's `test_size`.
TEST_SIZE_PERCENT = 0.2

In [81]:
# Read the labelled training rows and the unlabelled test rows.
training = pd.read_csv('test/training-person2.csv')
test = pd.read_csv('test/test-person2.csv')

# One-column frame holding the test ids.
sumbit = test['person'].to_frame()

# Test ids and training labels as plain arrays.
id_test = test['person'].values
target_train = training['label'].values

# Feature matrices: every column except the label / id column.
# `test` is rebound from the DataFrame to its feature array here.
train = np.array(training.drop('label', axis=1))
test = np.array(test.drop('person', axis=1))

# Collects one list of test-set predictions per CV fold.
xgb_preds = []

In [82]:
#y = training['label']
#X = training.drop(axis=1, labels=['label'])
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE_PERCENT, random_state=RANDOM_SEED)

#target_train = y_train.values
#id_test = y_test.values

#train =  np.array(X_train)
#test = np.array(X_test)

In [83]:
# Create the K-fold cross-validation splitter (K = 5 here).
# Rows are shuffled before splitting; the fixed random_state keeps the
# fold assignment the same from run to run.
K = 5
kf = KFold(n_splits = K, random_state = 3228, shuffle = True)

In [84]:
# Booster settings, also from the1owl's kernel:
# https://www.kaggle.com/the1owl/forza-baseline
# Defined once: they do not change between folds.
xgb_params = {'eta': 0.02, 'max_depth': 4, 'subsample': 0.9, 'colsample_bytree': 0.9, 'objective': 'binary:logistic', 'eval_metric': 'auc', 'seed': 99, 'silent': True}

# The test matrix is the same for every fold, so build it a single time.
d_test = xgb.DMatrix(test)

# Start from an empty list so that re-running this cell does not stack a
# second set of fold predictions on top of the previous run's.
xgb_preds = []

for train_index, test_index in kf.split(train):
    # Split features and labels into this fold's training / validation parts.
    train_X, valid_X = train[train_index], train[test_index]
    train_y, valid_y = target_train[train_index], target_train[test_index]

    d_train = xgb.DMatrix(train_X, train_y)
    d_valid = xgb.DMatrix(valid_X, valid_y)

    # Both sets are scored each round; per the captured training log the
    # custom 'valid-gini' metric is the one used for early stopping.
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    model = xgb.train(xgb_params, d_train, 5000,  watchlist, feval=gini_xgb, maximize=True, verbose_eval=50, early_stopping_rounds=200)

    # NOTE(review): predict() is not restricted to the best early-stopping
    # iteration here; depending on the installed xgboost version this may
    # use every trained round -- confirm and limit if needed.
    xgb_pred = model.predict(d_test)
    xgb_preds.append(list(xgb_pred))

[0]	train-auc:0.828779	valid-auc:0.820564	train-gini:0.678786	valid-gini:0.659834
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 200 rounds.
[50]	train-auc:0.875065	valid-auc:0.853479	train-gini:0.75021	valid-gini:0.706909
[100]	train-auc:0.882441	valid-auc:0.85585	train-gini:0.764959	valid-gini:0.71169
[150]	train-auc:0.889755	valid-auc:0.860211	train-gini:0.779512	valid-gini:0.720401
[200]	train-auc:0.896598	valid-auc:0.860495	train-gini:0.793196	valid-gini:0.72098
[250]	train-auc:0.905682	valid-auc:0.860956	train-gini:0.811363	valid-gini:0.721912
[300]	train-auc:0.914537	valid-auc:0.861449	train-gini:0.829075	valid-gini:0.722897
[350]	train-auc:0.921571	valid-auc:0.861484	train-gini:0.843142	valid-gini:0.722967
[400]	train-auc:0.926989	valid-auc:0.861254	train-gini:0.853977	valid-gini:0.722508
[450]	train-auc:0.93216	valid-auc:0.860679	train-gini:0.864321	valid-gini:0.721357
[500]	train-auc:0.9368

In [85]:
# Average the test-set predictions of the fold models, element by element.
# A vectorised mean replaces the manual accumulator, which rebound the global
# name `sum` to a number and so broke the builtin sum() for later cells.
fold_preds = np.asarray(xgb_preds, dtype=np.float64)
if fold_preds.ndim != 2 or fold_preds.shape[0] == 0:
    raise ValueError("xgb_preds must hold one equally sized prediction list per fold")

# Mean over the fold axis -> one averaged prediction per test row.
preds = fold_preds.mean(axis=0).tolist()

output = pd.DataFrame({'person': id_test, 'label': preds})

In [86]:
#roc_auc_score(y_test, output['label'])

ValueError: Found input variables with inconsistent numbers of samples: [3883, 19415]

In [None]:
# Save the `output` frame as CSV; index=False leaves the row index out of the file.
output.to_csv('test/sumbit-XGB-CV.csv', index=False)