In [1]:
import pandas as pd
import numpy as np 
import xgboost as xgb
import sklearn.cross_validation as cv

In [2]:
def Gini(y_pred, dtrain):
    """Evaluation callback returning the negated normalized Gini coefficient.

    Parameters
    ----------
    y_pred : array-like, 1-D
        Model predictions, one per sample.
    dtrain : object exposing ``get_label()``
        Container whose ``get_label()`` returns the true targets in the same
        order (and with the same shape) as ``y_pred``.

    Returns
    -------
    (str, float)
        The metric name ``'Gini'`` and ``-G_pred / G_true``. The value is
        negated, so a perfect ranking yields -1.0 and a perfectly reversed
        ranking yields +1.0.

    Raises
    ------
    AssertionError
        If labels and predictions do not have the same shape.

    Notes
    -----
    If all labels are equal, or they sum to zero, the normalization is 0/0
    and the returned value is ``nan``.
    """
    # Promote to float64 so the cumulative sums below do not accumulate
    # single-precision rounding error.
    y_true = np.asarray(dtrain.get_label(), dtype=np.float64)
    y_pred = np.asarray(y_pred, dtype=np.float64)

    # check and get number of samples
    assert y_true.shape == y_pred.shape
    n_samples = y_true.shape[0]

    # True targets ordered from largest to smallest, once by their own value
    # (best achievable ranking) and once by the model's predictions.
    true_order = y_true[np.argsort(y_true)[::-1]]
    pred_order = y_true[np.argsort(y_pred)[::-1]]

    # Lorenz curves: cumulative share of the total target after k samples.
    L_true = np.cumsum(true_order) / np.sum(true_order)
    L_pred = np.cumsum(pred_order) / np.sum(pred_order)
    # Line of equality: after k of n samples the expected share is k/n.
    # (Starting this line at 0 instead of 1/n misaligns it with the curves
    # above and biases the normalized score.)
    L_ones = np.arange(1, n_samples + 1) / float(n_samples)

    # Gini coefficients (area between the equality line and each curve)
    G_true = np.sum(L_ones - L_true)
    G_pred = np.sum(L_ones - L_pred)

    # Normalize by the best achievable Gini and flip the sign.
    return 'Gini', -G_pred / G_true

In [11]:
import warnings
# NOTE: this silences every warning for the rest of the kernel session,
# not only for this cell.
warnings.filterwarnings('ignore')

# Upper bound on boosting rounds; early stopping (see xgb.train below)
# normally ends training sooner.
num_rounds = 10000
params = {
    "objective": "reg:linear",
    "eta": 0.003,
    "min_child_weight": 75,
    "subsample": 0.6,
    "colsample_bytree": 0.7,
    "scale_pos_weight": 1,
    "silent": 1,
    "max_depth": 9,
}

# Both files are indexed by their 'Id' column; 'Hazard' is the target.
tr = pd.read_csv('./train.csv', sep=',', index_col='Id')
te = pd.read_csv('./test.csv', sep=',', index_col='Id')

labels = tr['Hazard'].values

# Stack train and test so that object-typed columns are integer-encoded
# with one shared mapping.
full = pd.concat(objs=[tr, te])
full_cat = full.select_dtypes(include=['object'])
full_num = full.select_dtypes(exclude=['object'])
# np.unique(..., return_inverse=...)[1] replaces each value by the position
# of that value among the column's sorted unique values.
full_cat = full_cat.apply(func=lambda x: np.unique(x, return_inverse=1)[1], axis=0)

full = pd.concat([full_num, full_cat], axis=1)
# Rows with a missing Hazard are treated as the test rows
# (presumably test.csv has no Hazard column -- confirm).
split = np.isnan(full.Hazard)

train = full[~split].drop('Hazard', axis=1).values
test = full[split].drop('Hazard', axis=1).values
xgtest = xgb.DMatrix(test)  # not used by the cross-validation below

# Repeated 5-fold cross-validation: 3 repeats x 5 folds = 15 scores.
gini = []
for j in range(3):
    # Seed the shuffle with the repeat index: every repeat gets a different
    # partition, yet re-running the cell reproduces the same folds.
    folds = cv.StratifiedKFold(labels, n_folds=5, shuffle=True, random_state=j)
    for train_ind, val_ind in folds:
        xgtrain = xgb.DMatrix(train[train_ind, :], label=labels[train_ind])
        xgval = xgb.DMatrix(train[val_ind, :], label=labels[val_ind])
        # Early stopping is passed the watchlist with 'val' as its last entry.
        watchlist = [(xgtrain, 'train'), (xgval, 'val')]
        model = xgb.train(params, xgtrain, num_rounds, watchlist,
                          feval=Gini, early_stopping_rounds=200, verbose_eval=False)
        gini.append(model.best_score)

# Gini() reports the negated coefficient, so flip the sign of the mean back.
print('Learning rate: {} | Min child weight: {} | Mean Gini: {:.6f} | Std Gini: {:.6f} '.format(
        params['eta'], params['min_child_weight'], -np.mean(gini), np.std(gini)))

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[1737]	train-Gini:-0.525610	val-Gini:-0.402534

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[2352]	train-Gini:-0.556182	val-Gini:-0.389157

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[2730]	train-Gini:-0.568586	val-Gini:-0.379306

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[2606]	train-Gini:-0.564527	val-Gini:-0.391209

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[2357]	train-Gini:-0.553556	val-Gini:-0.384982

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[2356]	train-Gini:-0.552285	val-Gini:-0.413391

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[2897]	train-Gini:-0.576318	val-Gini:-0.387026

Will train until val error hasn't decreased in 200 rounds.
Stopping. 

Learning rate: 0.003 | Min child weight: 75 | Mean Gini: 0.388075 | Std Gini: 0.010083 


In [29]:
import warnings
warnings.filterwarnings('ignore')

# Fixed number of boosting rounds for the final fits: xgb.train below gets
# no watchlist and no early stopping.
num_rounds = 3000

# Read both files with 'Id' as the index; keep the test ids for the
# submission files written at the end.
tr = pd.read_csv('./train.csv', index_col='Id', sep=',')
te = pd.read_csv('./test.csv', index_col='Id', sep=',')
test_id = te.index
labels = tr['Hazard'].values

# Encode object-typed columns as integer codes over train and test together.
full = pd.concat(objs=[tr, te])
full_num = full.select_dtypes(exclude=['object'])
full_cat = full.select_dtypes(include=['object']).apply(
    func=lambda x: np.unique(x, return_inverse=True)[1], axis=0)
full = pd.concat([full_num, full_cat], axis=1)

# Rows with a missing Hazard are treated as the test rows
# (presumably test.csv has no Hazard column -- confirm).
split = np.isnan(full.Hazard)
features = full.drop('Hazard', axis=1)
train = features[~split].values
test = features[split].values

xgtest = xgb.DMatrix(test)
xgtrain = xgb.DMatrix(train, label=labels)

# Settings shared by every run; only the seed changes between runs.
base_params = {
    "objective": "reg:linear",
    "eta": 0.003,
    "min_child_weight": 75,
    "subsample": 0.6,
    "colsample_bytree": 0.7,
    "scale_pos_weight": 1,
    "silent": 1,
    "max_depth": 9,
}

# Fit ten models on the full training set, one per seed, and write each
# model's test predictions to its own submission file.
for i in range(10):
    params = dict(base_params, seed=i)
    model = xgb.train(params, xgtrain, num_rounds, verbose_eval=False)
    pred = model.predict(xgtest)
    preds = pd.DataFrame({'Hazard': pred}, index=test_id)
    preds.to_csv('sub{}.csv'.format(i))

In [30]:
# Blend the ten per-seed submissions: load each file's Hazard column,
# stack them as rows and average column-wise (one mean per test row).
submission_files = ['sub{}.csv'.format(i) for i in range(10)]
stacked = np.vstack([pd.read_csv(path).Hazard.values for path in submission_files])
pred = stacked.mean(axis=0)

# Re-attach the test ids (test_id comes from the cell that wrote the files).
preds = pd.DataFrame({'Hazard': pred}, index=test_id)
preds.to_csv('sub_final.csv')