In [12]:
import pandas as pd
import numpy as np 
import xgboost as xgb
from collections import Counter
import re
import sklearn.cross_validation as cv

In [13]:
def Gini(y_pred, dtrain):
    y_pred = y_pred**power
    y_true = dtrain.get_label()
    # check and get number of samples
    assert y_true.shape == y_pred.shape
    n_samples = y_true.shape[0]
    
    # sort rows on prediction column 
    # (from largest to smallest)
    arr = np.array([y_true, y_pred]).transpose()
    true_order = arr[arr[:,0].argsort()][::-1,0]
    pred_order = arr[arr[:,1].argsort()][::-1,0]
    
    # get Lorenz curves
    L_true = np.cumsum(true_order) / np.sum(true_order)
    L_pred = np.cumsum(pred_order) / np.sum(pred_order)
    L_ones = np.linspace(0, 1, n_samples)
    
    # get Gini coefficients (area between curves)
    G_true = np.sum(L_ones - L_true)
    G_pred = np.sum(L_ones - L_pred)
    
    # normalize to true Gini coefficient
    return 'Gini', -G_pred/G_true

## Train

In [15]:
import warnings
warnings.filterwarnings('ignore')
num_rounds = 10000
params = {  "objective": "reg:linear"
          , "eta": 0.005
          , "min_child_weight": 10
          , "subsample": 0.6
          , "colsample_bytree": 0.7
          , "scale_pos_weight": 1
          , "silent": 1
          , "max_depth": 9}

powers=np.linspace(.01,.1,10)

tr = pd.read_csv('./train.csv', sep = ',', index_col = 'Id')
te =  pd.read_csv('./test.csv', sep = ',',index_col = 'Id')
labels = tr['Hazard'].values

full =  pd.concat(objs = [tr,te])
full_cat = full.select_dtypes(include = ['object'])
full_num = full.select_dtypes(exclude = ['object'])
full_cat = full_cat.apply(func = lambda x: np.unique(x, return_inverse =1)[1], axis=0)

full = pd.concat([full_num,full_cat], axis=1)
split = np.isnan(full.Hazard)

train = full[~split].drop('Hazard', axis=1).values

gini = []
for power in powers:
    for j in range(2):
        folds = cv.StratifiedKFold(labels, n_folds = 5, shuffle=True)
        for train_ind, val_ind in folds: 
            xgtrain = xgb.DMatrix(train[train_ind,:], label=labels[train_ind])
            xgval = xgb.DMatrix(train[val_ind,:], label=labels[val_ind])
            watchlist = [(xgtrain, 'train'),(xgval, 'val')]
            model = xgb.train(params, xgtrain, num_rounds, watchlist,
                              feval= Gini, early_stopping_rounds=200,verbose_eval=False)
            gini.append(model.best_score)
    print('power: {} | Mean Gini: {:.6f} | Std Gini: {:.6f} '.format(
        power, np.mean(gini),np.std(gini)))

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[950]	train-Gini:-0.630695	val-Gini:-0.391872

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[968]	train-Gini:-0.637550	val-Gini:-0.376222

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[798]	train-Gini:-0.607020	val-Gini:-0.386726

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[1005]	train-Gini:-0.632584	val-Gini:-0.388299

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[1475]	train-Gini:-0.685013	val-Gini:-0.384649

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[1193]	train-Gini:-0.655291	val-Gini:-0.396362

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[1435]	train-Gini:-0.677241	val-Gini:-0.376105

Will train until val error hasn't decreased in 200 rounds.
Stopping. Bes

power: 0.01 | Mean Gini: -0.384928 | Std Gini: 0.008290 
power: 0.020000000000000004 | Mean Gini: -0.384925 | Std Gini: 0.007362 

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[795]	train-Gini:-0.607674	val-Gini:-0.395667

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[1376]	train-Gini:-0.672974	val-Gini:-0.381493

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[1076]	train-Gini:-0.649309	val-Gini:-0.396219

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[1149]	train-Gini:-0.653598	val-Gini:-0.387812

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[1076]	train-Gini:-0.641494	val-Gini:-0.367336

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[1020]	train-Gini:-0.638288	val-Gini:-0.387319

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[1158]	train-Gini:-0.652146	val-Gini:-0.388984

Will train until val error hasn't decreased in 200 rounds.
Stopping. B


power: 0.030000000000000006 | Mean Gini: -0.385146 | Std Gini: 0.007491 

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[1086]	train-Gini:-0.643335	val-Gini:-0.394832

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[834]	train-Gini:-0.618937	val-Gini:-0.406384

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[1044]	train-Gini:-0.642444	val-Gini:-0.370408

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[981]	train-Gini:-0.633825	val-Gini:-0.377045

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[971]	train-Gini:-0.634371	val-Gini:-0.371610

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[1012]	train-Gini:-0.639515	val-Gini:-0.374025

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[796]	train-Gini:-0.611632	val-Gini:-0.379681

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best


power: 0.04000000000000001 | Mean Gini: -0.384899 | Std Gini: 0.008804 

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[1116]	train-Gini:-0.646534	val-Gini:-0.387639

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[928]	train-Gini:-0.632578	val-Gini:-0.386975

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[1224]	train-Gini:-0.660216	val-Gini:-0.385389

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[1004]	train-Gini:-0.634015	val-Gini:-0.384392

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[1129]	train-Gini:-0.649275	val-Gini:-0.373915

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[1154]	train-Gini:-0.653121	val-Gini:-0.375964

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[1194]	train-Gini:-0.656363	val-Gini:-0.399969

Will train until val error hasn't decreased in 200 rounds.
Stopping. B


power: 0.05000000000000001 | Mean Gini: -0.384782 | Std Gini: 0.008730 

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[1009]	train-Gini:-0.638037	val-Gini:-0.389312

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[1030]	train-Gini:-0.638216	val-Gini:-0.391969

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[1070]	train-Gini:-0.648776	val-Gini:-0.382673

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[1002]	train-Gini:-0.636288	val-Gini:-0.382058

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[1041]	train-Gini:-0.640752	val-Gini:-0.376162

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[824]	train-Gini:-0.616081	val-Gini:-0.381456

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[1259]	train-Gini:-0.659730	val-Gini:-0.395192

Will train until val error hasn't decreased in 200 rounds.
Stopping. B


power: 0.06000000000000001 | Mean Gini: -0.384649 | Std Gini: 0.008688 

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[1029]	train-Gini:-0.641185	val-Gini:-0.391760

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[1016]	train-Gini:-0.638605	val-Gini:-0.392990

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[972]	train-Gini:-0.634529	val-Gini:-0.390355

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[1289]	train-Gini:-0.662712	val-Gini:-0.377194

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[1092]	train-Gini:-0.648718	val-Gini:-0.371997

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[1221]	train-Gini:-0.661512	val-Gini:-0.390883

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[1035]	train-Gini:-0.640896	val-Gini:-0.370275

Will train until val error hasn't decreased in 200 rounds.
Stopping. B


power: 0.07 | Mean Gini: -0.384535 | Std Gini: 0.008783 

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[838]	train-Gini:-0.619237	val-Gini:-0.382713

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[906]	train-Gini:-0.626280	val-Gini:-0.388499

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[1126]	train-Gini:-0.648282	val-Gini:-0.391446

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[1098]	train-Gini:-0.644902	val-Gini:-0.377008

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[942]	train-Gini:-0.629051	val-Gini:-0.378802

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[1336]	train-Gini:-0.670779	val-Gini:-0.385372

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[1016]	train-Gini:-0.638954	val-Gini:-0.394461

Will train until val error hasn't decreased in 200 rounds.
Stopping. Bes


power: 0.08 | Mean Gini: -0.384462 | Std Gini: 0.008453 

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[925]	train-Gini:-0.622436	val-Gini:-0.393613

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[1003]	train-Gini:-0.635522	val-Gini:-0.384255

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[1235]	train-Gini:-0.661302	val-Gini:-0.391751

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[1279]	train-Gini:-0.665146	val-Gini:-0.393032

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[704]	train-Gini:-0.606141	val-Gini:-0.359338

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[1010]	train-Gini:-0.635855	val-Gini:-0.382951

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[820]	train-Gini:-0.614578	val-Gini:-0.390701

Will train until val error hasn't decreased in 200 rounds.
Stopping. Bes


power: 0.09000000000000001 | Mean Gini: -0.384421 | Std Gini: 0.008718 

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[1173]	train-Gini:-0.653043	val-Gini:-0.387405

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[847]	train-Gini:-0.616115	val-Gini:-0.396582

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[1184]	train-Gini:-0.656802	val-Gini:-0.371064

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[1056]	train-Gini:-0.644974	val-Gini:-0.385846

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[940]	train-Gini:-0.630210	val-Gini:-0.386233

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[1312]	train-Gini:-0.668545	val-Gini:-0.381466

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[1170]	train-Gini:-0.649841	val-Gini:-0.387338

Will train until val error hasn't decreased in 200 rounds.
Stopping. Be


power: 0.1 | Mean Gini: -0.384495 | Std Gini: 0.008500 


In [6]:
print('CV repeats: {} | Mean Gini: {:.6f} | Std Gini: {:.6f} '.format(
        len(gini), np.mean(gini),np.std(gini))) # power =.5

CV repeats: 10 | Mean Gini: -0.384185 | Std Gini: 0.004694 


In [4]:
print('CV repeats: {} | Mean Gini: {:.6f} | Std Gini: {:.6f} '.format(
        len(gini), np.mean(gini),np.std(gini))) # power =.9

CV repeats: 10 | Mean Gini: -0.383559 | Std Gini: 0.008555 


In [6]:
print('CV repeats: {} | Mean Gini: {:.6f} | Std Gini: {:.6f} '.format(
        len(gini), np.mean(gini),np.std(gini)))

CV repeats: 10 | Mean Gini: -0.384105 | Std Gini: 0.010152 
