In [1]:
import pandas as pd
import numpy as np 
import xgboost as xgb
import patsy
import matplotlib.pyplot as plt
from collections import Counter
import re
%matplotlib inline

In [2]:
def Gini(y_pred, dtrain):
    """Evaluation metric: negated normalized Gini coefficient.

    Follows the ``(predictions, data) -> (name, value)`` shape; in this
    notebook it is passed to ``xgb.train`` as ``feval``.

    Parameters
    ----------
    y_pred : array
        Predicted values; must have the same ``shape`` as the labels.
    dtrain : object with ``get_label()``
        Container from which the true labels are read.

    Returns
    -------
    tuple
        ``('Gini', -G_pred / G_true)``. The ratio is negated, presumably so
        that a better ranking gives a *smaller* value for a minimising
        early-stopping criterion -- confirm against the caller.

    Raises
    ------
    AssertionError
        If labels and predictions differ in shape.
    """
    y_true = dtrain.get_label()
    assert y_true.shape == y_pred.shape
    n_samples = y_true.shape[0]

    # Column 0 holds the labels, column 1 the predictions.
    paired = np.array([y_true, y_pred]).transpose()
    actual = paired[:, 0]

    # Straight reference line from 0 to 1 that each Lorenz curve is compared to.
    diagonal = np.linspace(0, 1, n_samples)

    def _gap_to_diagonal(sort_key):
        # Labels ranked from the largest to the smallest value of `sort_key`.
        ranked = actual[sort_key.argsort()][::-1]
        # Lorenz curve: running share of the label total.
        lorenz = np.cumsum(ranked) / np.sum(ranked)
        # Summed gap between the reference line and the curve.
        return np.sum(diagonal - lorenz)

    G_true = _gap_to_diagonal(paired[:, 0])  # ranking by the labels themselves
    G_pred = _gap_to_diagonal(paired[:, 1])  # ranking induced by the predictions

    # Normalise by the Gini of the perfect ranking, then negate.
    return 'Gini', -G_pred / G_true

## Train

In [None]:
# Backward feature elimination: train a model, find the feature with the
# lowest split count, drop it, and repeat with the reduced feature set.

# xgboost parameters shared by every training run below.
params = {  "objective": "reg:linear"
          , "eta": 0.003
          , "min_child_weight": 6
          , "subsample": 0.6
          , "colsample_bytree": 0.7
          , "scale_pos_weight": 1
          , "silent": 1
          , "max_depth": 9}

# Extracts the numeric part of a feature key such as 'f12'. The DMatrix
# objects below are built from bare numpy arrays, so the model is expected to
# report features by position rather than by column name.
digs = re.compile(r'\d+')

# Column names eliminated so far; grows by one per iteration.
cat_to_drop = []

tr = pd.read_csv('./train.csv', sep = ',', index_col = 'Id')
te =  pd.read_csv('./test.csv', sep = ',',index_col = 'Id')

# Loop-invariant settings.
offset = 10000      # the first `offset` labelled rows form the validation set
num_rounds = 10000  # maximum number of boosting rounds per run

for i in range(32):
    # Rebuild the combined frame without the columns eliminated so far.
    # drop() is used in its returning form: `inplace` must be a real bool in
    # current pandas, so the former `inplace = 1` raised ValueError.
    full = pd.concat(objs = [tr, te]).drop(cat_to_drop, axis=1)
    full_cat = full.select_dtypes(include = ['object'])
    full_num = full.select_dtypes(exclude = ['object'])
    # Integer-encode every object column: each value is replaced by its
    # position among the column's sorted unique values.
    full_cat = full_cat.apply(func = lambda x: np.unique(x, return_inverse=True)[1], axis=0)

    full = pd.concat([full_num, full_cat], axis=1)
    # Rows without a Hazard value are excluded from training
    # (presumably the rows that came from test.csv -- confirm).
    split = np.isnan(full.Hazard)
    labels = full.loc[~split,'Hazard'].values
    train_frame = full[~split].drop('Hazard', axis=1)
    train = train_frame.values
    # Take the names from the exact frame whose values are trained on, so
    # position k in `train` always corresponds to feature_names[k] regardless
    # of where the Hazard column sits in `full`.
    feature_names = train_frame.columns
    xgtrain = xgb.DMatrix(train[offset:,:], label=labels[offset:])
    xgval = xgb.DMatrix(train[:offset,:], label=labels[:offset])
    watchlist = [(xgtrain, 'train'),(xgval, 'val')]
    model = xgb.train(params, xgtrain, num_rounds, watchlist,
                      feval=Gini, early_stopping_rounds=120, verbose_eval=False)
    # Features ordered from most to least used; the last entry is eliminated.
    # NOTE(review): get_fscore() may omit features that were never split on;
    # such features are not considered here -- confirm this is intended.
    importance = Counter(model.get_fscore()).most_common()
    # Convert the key (e.g. 'f12') to an integer position. Indexing with the
    # list of strings returned by re.findall does not yield a single name.
    least_imp_id = int(digs.search(importance[-1][0]).group())
    least_imp_name = feature_names[least_imp_id]
    # model.best_score is the value produced by the Gini feval above, i.e. the
    # negated normalized Gini.
    print('Gini: {} | Dropped: {} | Least Important: {}'.format(model.best_score, cat_to_drop,least_imp_name))
    cat_to_drop.append(least_imp_name)