In [1]:
import pandas as pd
import numpy as np 
import xgboost as xgb
import patsy
import sklearn.cross_validation as cv

In [3]:
def Gini(y_pred, dtrain):
    y_true = dtrain.get_label()
    # check and get number of samples
    assert y_true.shape == y_pred.shape
    n_samples = y_true.shape[0]
    
    # sort rows on prediction column 
    # (from largest to smallest)
    arr = np.array([y_true, y_pred]).transpose()
    true_order = arr[arr[:,0].argsort()][::-1,0]
    pred_order = arr[arr[:,1].argsort()][::-1,0]
    
    # get Lorenz curves
    L_true = np.cumsum(true_order) / np.sum(true_order)
    L_pred = np.cumsum(pred_order) / np.sum(pred_order)
    L_ones = np.linspace(0, 1, n_samples)
    
    # get Gini coefficients (area between curves)
    G_true = np.sum(L_ones - L_true)
    G_pred = np.sum(L_ones - L_pred)
    
    # normalize to true Gini coefficient
    return 'Gini', -G_pred/G_true

In [16]:
pd.options.mode.chained_assignment = None

train = pd.read_csv('./train.csv', sep = ',', index_col = 'Id')
test = pd.read_csv('./test.csv', sep = ',',index_col = 'Id')

full = pd.concat(objs = [train, test])

full.drop(['T1_V10', 'T1_V13', 'T2_V7', 'T2_V10'], axis=1, inplace = 1)

num_mask = np.array([True if obj != 'object' else False for obj in full.dtypes])
full_num = full.iloc[:,num_mask]
full_cat = full.iloc[:, ~num_mask]

full_num['T1_V5'] = np.unique(full_cat['T1_V5'], return_inverse = 1)[1]
full_cat.drop('T1_V5', axis=1, inplace = 1)


full_num['T1_V9'] = np.unique(full_cat['T1_V9'], return_inverse = 1)[1]
full_cat.drop('T1_V9', axis=1, inplace = 1)

full_num['T2_V13'] = np.unique(full_cat['T2_V13'], return_inverse = 1)[1]
full_cat.drop('T2_V13', axis=1, inplace = 1)

cat_names = full_cat.columns

form = ' + '.join(cat_names)
form += ' - 1'
x_dummies = patsy.dmatrix(form, full_cat, return_type='dataframe')
full_dummies = pd.concat([full_num, x_dummies], axis = 1)

full_dummies = pd.concat([full_num, x_dummies], axis = 1)


split = np.isnan(full_dummies.Hazard)
labels = full_dummies.loc[~split,'Hazard'].values
train = full_dummies[~split].drop('Hazard', axis=1).values
test  = full_dummies[split].drop('Hazard', axis=1).values

params = {  "objective": "reg:linear"
          , "eta": 0.005
          , "min_child_weight": 6
          , "subsample": .33
          , "colsample_bytree": 0.7
          , "scale_pos_weight": 1
          , "silent": 1
          , "max_depth": 9}

offset = 10000
num_rounds = 10000

xgtest = xgb.DMatrix(test)

xgtrain = xgb.DMatrix(train[offset:,:], label=labels[offset:])
xgval = xgb.DMatrix(train[:offset,:], label=labels[:offset])

watchlist = [(xgtrain, 'train'),(xgval, 'val')]
model = xgb.train(params, xgtrain, num_rounds, watchlist, feval= Gini, early_stopping_rounds=120)

Will train until val error hasn't decreased in 120 rounds.
[0]	train-Gini:-0.284853	val-Gini:-0.227208
[1]	train-Gini:-0.338759	val-Gini:-0.262746
[2]	train-Gini:-0.363201	val-Gini:-0.290216
[3]	train-Gini:-0.372932	val-Gini:-0.298757
[4]	train-Gini:-0.389592	val-Gini:-0.310049
[5]	train-Gini:-0.393427	val-Gini:-0.311001
[6]	train-Gini:-0.399563	val-Gini:-0.314693
[7]	train-Gini:-0.400118	val-Gini:-0.318665
[8]	train-Gini:-0.404634	val-Gini:-0.326219
[9]	train-Gini:-0.407975	val-Gini:-0.327418
[10]	train-Gini:-0.409731	val-Gini:-0.329142
[11]	train-Gini:-0.414790	val-Gini:-0.332995
[12]	train-Gini:-0.415588	val-Gini:-0.335316
[13]	train-Gini:-0.415568	val-Gini:-0.336238
[14]	train-Gini:-0.417464	val-Gini:-0.339261
[15]	train-Gini:-0.419472	val-Gini:-0.341020
[16]	train-Gini:-0.422869	val-Gini:-0.344833
[17]	train-Gini:-0.423597	val-Gini:-0.346129
[18]	train-Gini:-0.425729	val-Gini:-0.346317
[19]	train-Gini:-0.427102	val-Gini:-0.347515
[20]	train-Gini:-0.427641	val-Gini:-0.347482
[21]	t

In [17]:
pd.options.mode.chained_assignment = None

train = pd.read_csv('./train.csv', sep = ',', index_col = 'Id')
test = pd.read_csv('./test.csv', sep = ',',index_col = 'Id')

full = pd.concat(objs = [train, test])

full.drop(['T1_V10', 'T1_V13', 'T2_V7', 'T2_V10'], axis=1, inplace = 1)

num_mask = np.array([True if obj != 'object' else False for obj in full.dtypes])
full_num = full.iloc[:,num_mask]
full_cat = full.iloc[:, ~num_mask]

full_num['T1_V5'] = np.unique(full_cat['T1_V5'], return_inverse = 1)[1]
full_cat.drop('T1_V5', axis=1, inplace = 1)


full_num['T1_V9'] = np.unique(full_cat['T1_V9'], return_inverse = 1)[1]
full_cat.drop('T1_V9', axis=1, inplace = 1)

full_num['T2_V13'] = np.unique(full_cat['T2_V13'], return_inverse = 1)[1]
full_cat.drop('T2_V13', axis=1, inplace = 1)

cat_names = full_cat.columns

form = ' + '.join(cat_names)
form += ' - 1'
x_dummies = patsy.dmatrix(form, full_cat, return_type='dataframe')
full_dummies = pd.concat([full_num, x_dummies], axis = 1)

full_dummies = pd.concat([full_num, x_dummies], axis = 1)


split = np.isnan(full_dummies.Hazard)
labels = full_dummies.loc[~split,'Hazard'].values
train = full_dummies[~split].drop('Hazard', axis=1).values
test  = full_dummies[split].drop('Hazard', axis=1).values


params = {  "objective": "reg:linear"
          , "eta": 0.005
          , "min_child_weight": 6
          , "subsample": .6
          , "colsample_bytree": 0.7
          , "scale_pos_weight": 1
          , "silent": 1
          , "max_depth": 9}

offset = 10000
num_rounds = 10000

xgtest = xgb.DMatrix(test)

xgtrain = xgb.DMatrix(train[offset:,:], label=labels[offset:])
xgval = xgb.DMatrix(train[:offset,:], label=labels[:offset])

watchlist = [(xgtrain, 'train'),(xgval, 'val')]
model = xgb.train(params, xgtrain, num_rounds, watchlist, feval= Gini, early_stopping_rounds=120)

Will train until val error hasn't decreased in 120 rounds.
[0]	train-Gini:-0.328489	val-Gini:-0.264809
[1]	train-Gini:-0.381890	val-Gini:-0.284881
[2]	train-Gini:-0.406407	val-Gini:-0.305192
[3]	train-Gini:-0.412499	val-Gini:-0.316261
[4]	train-Gini:-0.423578	val-Gini:-0.328871
[5]	train-Gini:-0.427206	val-Gini:-0.338680
[6]	train-Gini:-0.430300	val-Gini:-0.342160
[7]	train-Gini:-0.434014	val-Gini:-0.345149
[8]	train-Gini:-0.434605	val-Gini:-0.348525
[9]	train-Gini:-0.437750	val-Gini:-0.349716
[10]	train-Gini:-0.439217	val-Gini:-0.351670
[11]	train-Gini:-0.439882	val-Gini:-0.354691
[12]	train-Gini:-0.439584	val-Gini:-0.355801
[13]	train-Gini:-0.439363	val-Gini:-0.355556
[14]	train-Gini:-0.439129	val-Gini:-0.356493
[15]	train-Gini:-0.439350	val-Gini:-0.356939
[16]	train-Gini:-0.441439	val-Gini:-0.357349
[17]	train-Gini:-0.441137	val-Gini:-0.357835
[18]	train-Gini:-0.444544	val-Gini:-0.357306
[19]	train-Gini:-0.445883	val-Gini:-0.357275
[20]	train-Gini:-0.446708	val-Gini:-0.357944
[21]	t

In [49]:
params = {  "objective": "reg:linear"
          , "eta": 0.007
          , "min_child_weight": 50
          , "subsample": .6
          , "colsample_bytree": 0.7
          , "scale_pos_weight": 1
          , "silent": 1
          , "max_depth": 9}

mod_xgb = xgb.cv(params, xgtrain, 1000, nfold=5, feval= Gini)

[0]	cv-test-Gini:-0.266589+0.018396	cv-train-Gini:-0.315598+0.012733
[1]	cv-test-Gini:-0.313436+0.009594	cv-train-Gini:-0.362785+0.003220
[2]	cv-test-Gini:-0.323955+0.007002	cv-train-Gini:-0.377855+0.004752
[3]	cv-test-Gini:-0.330663+0.008455	cv-train-Gini:-0.389623+0.002572
[4]	cv-test-Gini:-0.335413+0.008405	cv-train-Gini:-0.394848+0.004125
[5]	cv-test-Gini:-0.336951+0.010141	cv-train-Gini:-0.398500+0.005551
[6]	cv-test-Gini:-0.340719+0.010666	cv-train-Gini:-0.403685+0.004654
[7]	cv-test-Gini:-0.342293+0.011294	cv-train-Gini:-0.406965+0.003371
[8]	cv-test-Gini:-0.345454+0.011675	cv-train-Gini:-0.409463+0.002783
[9]	cv-test-Gini:-0.346835+0.011880	cv-train-Gini:-0.411546+0.002551
[10]	cv-test-Gini:-0.348142+0.010889	cv-train-Gini:-0.412994+0.004185
[11]	cv-test-Gini:-0.349359+0.011054	cv-train-Gini:-0.414169+0.004943
[12]	cv-test-Gini:-0.350000+0.010011	cv-train-Gini:-0.415294+0.005452
[13]	cv-test-Gini:-0.350887+0.009204	cv-train-Gini:-0.416364+0.004992
[14]	cv-test-Gini:-0.350647+0.

In [None]:
preds = model.predict(xgb.DMatrix(test), ntree_limit=model.best_iteration)

In [23]:
test

array([[  2.,   2.,  13., ...,   0.,   0.,   0.],
       [ 10.,   1.,  10., ...,   0.,   0.,   0.],
       [  9.,   2.,  20., ...,   0.,   0.,   0.],
       ..., 
       [ 13.,   1.,  24., ...,   0.,   1.,   0.],
       [  9.,   1.,   7., ...,   0.,   0.,   0.],
       [  4.,   3.,   6., ...,   0.,   0.,   0.]])

In [29]:
df_preds = pd.DataFrame({"Hazard": preds}, index = full_dummies[split].index)
df_preds.to_csv('sub.csv')

In [75]:
import warnings
warnings.filterwarnings('ignore')

params = {  "objective": "reg:linear"
          , "eta": 0.003
          , "min_child_weight": 50
          , "subsample": .6
          , "colsample_bytree": 0.7
          , "scale_pos_weight": 1
          , "silent": 1
          , "max_depth": 9}

num_rounds = 10000

train = pd.read_csv('./train.csv', sep = ',', index_col = 'Id')
test  = pd.read_csv('./test.csv',  sep = ',', index_col = 'Id')
full = pd.concat(objs = [train, test])
full.drop(['T1_V10', 'T1_V13', 'T2_V7', 'T2_V10'], axis=1, inplace = 1)

num_mask = np.array([True if obj != 'object' else False for obj in full.dtypes])
full_num = full.iloc[:,  num_mask]
full_cat = full.iloc[:, ~num_mask]

cat_names = full_cat.columns

for cat in cat_names:
    # grouppify specific category and add them to numerical features
    full_num[cat] = np.unique(full_cat[cat], return_inverse = 1)[1]
    # drop grouped variable from categorical variables
    full_cat.drop(cat, axis=1, inplace = 1)
    # define names of categorical variables
    cat_to_dummy = full_cat.columns
    form  = ' + '.join(cat_to_dummy)
    form += ' - 1'
    x_dummies = patsy.dmatrix(form, full_cat, return_type='dataframe')
    full_dummies = pd.concat([full_num, x_dummies], axis = 1)
    split = np.isnan(full_dummies.Hazard)
    train = full_dummies.loc[~split,:]
    x = train.drop('Hazard', axis=1).values
    y = train.Hazard.values
    gini = []
    for train_ind, val_ind in cv.StratifiedKFold(y, n_folds =5, shuffle=True): 
        xgtrain = xgb.DMatrix(x[train_ind,:], label=y[train_ind])
        xgval = xgb.DMatrix(x[val_ind,:], label=y[val_ind])
        watchlist = [(xgtrain, 'train'),(xgval, 'val')]
        model = xgb.train(params, xgtrain, num_rounds, watchlist,
                          feval= Gini, early_stopping_rounds=120,verbose_eval=False)
        gini.append(model.best_score)
    print('Grouped: {} | #var: {} | Mean Gini: {:.5f} | Std Gini: {:.5f}'.format(
            cat, train.shape[1],-np.mean(gini), np.std(gini)))
    # prepare for new clean run
    full_num = full.iloc[:,num_mask]
    full_cat = full.iloc[:, ~num_mask]

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2115]	train-Gini:-0.525297	val-Gini:-0.398206

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2024]	train-Gini:-0.521995	val-Gini:-0.387469

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2392]	train-Gini:-0.534783	val-Gini:-0.387453

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2464]	train-Gini:-0.534067	val-Gini:-0.374878

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2165]	train-Gini:-0.525314	val-Gini:-0.395057

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2772]	train-Gini:-0.543482	val-Gini:-0.399579

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2457]	train-Gini:-0.539569	val-Gini:-0.375882

Will train until val error hasn't decreased in 120 rounds.
Stopping. 

Grouped: T1_V11 | #var: 83 | Mean Gini: 0.38861 | Std Gini: 0.00806
Grouped: T1_V12 | #var: 91 | Mean Gini: 0.38802 | Std Gini: 0.00802

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2231]	train-Gini:-0.528838	val-Gini:-0.387613

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[1983]	train-Gini:-0.516513	val-Gini:-0.388435

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2081]	train-Gini:-0.520911	val-Gini:-0.382127

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2199]	train-Gini:-0.526127	val-Gini:-0.371372

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2592]	train-Gini:-0.537623	val-Gini:-0.405283




Grouped: T1_V15 | #var: 87 | Mean Gini: 0.38697 | Std Gini: 0.01100

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[1776]	train-Gini:-0.513227	val-Gini:-0.386526

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2565]	train-Gini:-0.541228	val-Gini:-0.394119

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[1995]	train-Gini:-0.520642	val-Gini:-0.389356

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[3401]	train-Gini:-0.570350	val-Gini:-0.392949

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2157]	train-Gini:-0.529126	val-Gini:-0.383990




Grouped: T1_V16 | #var: 77 | Mean Gini: 0.38939 | Std Gini: 0.00381

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2575]	train-Gini:-0.534278	val-Gini:-0.389038

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2415]	train-Gini:-0.528724	val-Gini:-0.391188

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2504]	train-Gini:-0.533580	val-Gini:-0.393580

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2868]	train-Gini:-0.545331	val-Gini:-0.397139

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2515]	train-Gini:-0.540052	val-Gini:-0.375562




Grouped: T1_V17 | #var: 93 | Mean Gini: 0.38930 | Std Gini: 0.00738

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[1852]	train-Gini:-0.512642	val-Gini:-0.390877

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[1540]	train-Gini:-0.503172	val-Gini:-0.396577

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2909]	train-Gini:-0.551662	val-Gini:-0.374309

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2826]	train-Gini:-0.547084	val-Gini:-0.394321

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2178]	train-Gini:-0.526846	val-Gini:-0.384912




Grouped: T1_V4 | #var: 87 | Mean Gini: 0.38820 | Std Gini: 0.00798

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2100]	train-Gini:-0.524880	val-Gini:-0.390365

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[1799]	train-Gini:-0.511202	val-Gini:-0.379277

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2201]	train-Gini:-0.527695	val-Gini:-0.395137

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2638]	train-Gini:-0.540631	val-Gini:-0.385250

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2685]	train-Gini:-0.542651	val-Gini:-0.381453




Grouped: T1_V5 | #var: 85 | Mean Gini: 0.38630 | Std Gini: 0.00581

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[1636]	train-Gini:-0.505916	val-Gini:-0.389606

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2290]	train-Gini:-0.525658	val-Gini:-0.389132

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2466]	train-Gini:-0.534287	val-Gini:-0.392306

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2160]	train-Gini:-0.524087	val-Gini:-0.387677

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2264]	train-Gini:-0.523872	val-Gini:-0.383671




Grouped: T1_V6 | #var: 93 | Mean Gini: 0.38848 | Std Gini: 0.00283

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[1941]	train-Gini:-0.521763	val-Gini:-0.392037

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2144]	train-Gini:-0.525714	val-Gini:-0.386615

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2225]	train-Gini:-0.530802	val-Gini:-0.389943

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2375]	train-Gini:-0.533141	val-Gini:-0.384697

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2685]	train-Gini:-0.543356	val-Gini:-0.379281




Grouped: T1_V7 | #var: 91 | Mean Gini: 0.38651 | Std Gini: 0.00442

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2270]	train-Gini:-0.533990	val-Gini:-0.383983

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[1845]	train-Gini:-0.521814	val-Gini:-0.380342

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[1429]	train-Gini:-0.504364	val-Gini:-0.371928

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2720]	train-Gini:-0.546754	val-Gini:-0.397183

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2795]	train-Gini:-0.548813	val-Gini:-0.396559




Grouped: T1_V8 | #var: 91 | Mean Gini: 0.38600 | Std Gini: 0.00970

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2687]	train-Gini:-0.540221	val-Gini:-0.379838

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2237]	train-Gini:-0.525847	val-Gini:-0.390476

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[1990]	train-Gini:-0.518266	val-Gini:-0.390973

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2418]	train-Gini:-0.529144	val-Gini:-0.398134

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2462]	train-Gini:-0.532460	val-Gini:-0.381755




Grouped: T1_V9 | #var: 89 | Mean Gini: 0.38824 | Std Gini: 0.00668

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[1950]	train-Gini:-0.516854	val-Gini:-0.392467

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2253]	train-Gini:-0.529700	val-Gini:-0.375245

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2908]	train-Gini:-0.545041	val-Gini:-0.384207

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2243]	train-Gini:-0.524143	val-Gini:-0.391892

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2121]	train-Gini:-0.521519	val-Gini:-0.388295




Grouped: T2_V11 | #var: 93 | Mean Gini: 0.38642 | Std Gini: 0.00632

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2159]	train-Gini:-0.523268	val-Gini:-0.398845

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2393]	train-Gini:-0.530367	val-Gini:-0.392865

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2270]	train-Gini:-0.527006	val-Gini:-0.389631

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2458]	train-Gini:-0.529420	val-Gini:-0.392005

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[1892]	train-Gini:-0.517747	val-Gini:-0.364490




Grouped: T2_V12 | #var: 93 | Mean Gini: 0.38757 | Std Gini: 0.01193

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2590]	train-Gini:-0.540773	val-Gini:-0.393516

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2287]	train-Gini:-0.526798	val-Gini:-0.397983

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2028]	train-Gini:-0.525292	val-Gini:-0.381077

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2097]	train-Gini:-0.521309	val-Gini:-0.379702

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2401]	train-Gini:-0.529463	val-Gini:-0.388712




Grouped: T2_V13 | #var: 90 | Mean Gini: 0.38820 | Std Gini: 0.00703

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2577]	train-Gini:-0.536459	val-Gini:-0.392641

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[1799]	train-Gini:-0.513252	val-Gini:-0.369195

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[3168]	train-Gini:-0.554330	val-Gini:-0.398878

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2435]	train-Gini:-0.529965	val-Gini:-0.394709

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2112]	train-Gini:-0.520414	val-Gini:-0.383458




Grouped: T2_V3 | #var: 93 | Mean Gini: 0.38778 | Std Gini: 0.01057

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2156]	train-Gini:-0.524293	val-Gini:-0.389857

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2594]	train-Gini:-0.534495	val-Gini:-0.403693

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2186]	train-Gini:-0.527351	val-Gini:-0.396378

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2270]	train-Gini:-0.525262	val-Gini:-0.374137

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2167]	train-Gini:-0.528032	val-Gini:-0.379566




Grouped: T2_V5 | #var: 89 | Mean Gini: 0.38873 | Std Gini: 0.01078


In [7]:
import warnings
warnings.filterwarnings('ignore')

params = {  "objective": "reg:linear"
          , "eta": 0.003
          , "min_child_weight": 30
          , "subsample": .6
          , "colsample_bytree": 0.7
          , "scale_pos_weight": 1
          , "silent": 1
          , "max_depth": 9}

num_rounds = 10000

train = pd.read_csv('./train.csv', sep = ',', index_col = 'Id')
test  = pd.read_csv('./test.csv',  sep = ',', index_col = 'Id')
full = pd.concat(objs = [train, test])
full.drop(['T1_V10', 'T1_V13', 'T2_V7', 'T2_V10'], axis=1, inplace = 1)

num_mask = np.array([True if obj != 'object' else False for obj in full.dtypes])
full_num = full.iloc[:,  num_mask]
full_cat = full.iloc[:, ~num_mask]

full_num['T1_V16'] = np.unique(full_cat['T1_V16'], return_inverse = 1)[1]
full_cat.drop('T1_V16', axis=1, inplace = 1)

# define names of categorical variables
cat_to_dummy = full_cat.columns
form  = ' + '.join(cat_to_dummy)
form += ' - 1'
x_dummies = patsy.dmatrix(form, full_cat, return_type='dataframe')
full_dummies = pd.concat([full_num, x_dummies], axis = 1)
split = np.isnan(full_dummies.Hazard)
train = full_dummies.loc[~split,:]
x = train.drop('Hazard', axis=1).values
y = train.Hazard.values
gini = []
for train_ind, val_ind in cv.StratifiedKFold(y, n_folds =5, shuffle=True): 
    xgtrain = xgb.DMatrix(x[train_ind,:], label=y[train_ind])
    xgval = xgb.DMatrix(x[val_ind,:], label=y[val_ind])
    watchlist = [(xgtrain, 'train'),(xgval, 'val')]
    model = xgb.train(params, xgtrain, num_rounds, watchlist,
                      feval= Gini, early_stopping_rounds=120,verbose_eval=False)
    gini.append(model.best_score)
print('#var: {} | Mean Gini: {:.5f} | Std Gini: {:.5f}'.format(
       train.shape[1],-np.mean(gini), np.std(gini)))

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[1766]	train-Gini:-0.537245	val-Gini:-0.386697

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[1924]	train-Gini:-0.544609	val-Gini:-0.407066

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2321]	train-Gini:-0.558419	val-Gini:-0.377846

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2132]	train-Gini:-0.551359	val-Gini:-0.383012

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[1499]	train-Gini:-0.523787	val-Gini:-0.381367



#var: 77 | Mean Gini: 0.38720 | Std Gini: 0.01033


In [8]:
import warnings
warnings.filterwarnings('ignore')

params = {  "objective": "reg:linear"
          , "eta": 0.003
          , "min_child_weight": 50
          , "subsample": .6
          , "colsample_bytree": 0.7
          , "scale_pos_weight": 1
          , "silent": 1
          , "max_depth": 9}

num_rounds = 10000

train = pd.read_csv('./train.csv', sep = ',', index_col = 'Id')
test  = pd.read_csv('./test.csv',  sep = ',', index_col = 'Id')
full = pd.concat(objs = [train, test])
full.drop(['T1_V10', 'T1_V13', 'T2_V7', 'T2_V10'], axis=1, inplace = 1)

num_mask = np.array([True if obj != 'object' else False for obj in full.dtypes])
full_num = full.iloc[:,  num_mask]
full_cat = full.iloc[:, ~num_mask]

full_num['T1_V16'] = np.unique(full_cat['T1_V16'], return_inverse = 1)[1]
full_cat.drop('T1_V16', axis=1, inplace = 1)

# define names of categorical variables
cat_to_dummy = full_cat.columns
form  = ' + '.join(cat_to_dummy)
form += ' - 1'
x_dummies = patsy.dmatrix(form, full_cat, return_type='dataframe')
full_dummies = pd.concat([full_num, x_dummies], axis = 1)
split = np.isnan(full_dummies.Hazard)
train = full_dummies.loc[~split,:]
x = train.drop('Hazard', axis=1).values
y = train.Hazard.values
gini = []
for train_ind, val_ind in cv.StratifiedKFold(y, n_folds =5, shuffle=True): 
    xgtrain = xgb.DMatrix(x[train_ind,:], label=y[train_ind])
    xgval = xgb.DMatrix(x[val_ind,:], label=y[val_ind])
    watchlist = [(xgtrain, 'train'),(xgval, 'val')]
    model = xgb.train(params, xgtrain, num_rounds, watchlist,
                      feval= Gini, early_stopping_rounds=120,verbose_eval=False)
    gini.append(model.best_score)
print('#var: {} | Mean Gini: {:.5f} | Std Gini: {:.5f}'.format(
       train.shape[1],-np.mean(gini), np.std(gini)))

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[1997]	train-Gini:-0.522474	val-Gini:-0.394452

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2258]	train-Gini:-0.529541	val-Gini:-0.392903

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2372]	train-Gini:-0.532027	val-Gini:-0.411575

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2078]	train-Gini:-0.527605	val-Gini:-0.372250

Will train until val error hasn't decreased in 120 rounds.
Stopping. Best iteration:
[2160]	train-Gini:-0.530497	val-Gini:-0.374791



#var: 77 | Mean Gini: 0.38919 | Std Gini: 0.01440


In [None]:
import warnings
warnings.filterwarnings('ignore')

params = {  "objective": "reg:linear"
          , "eta": 0.003
          , "min_child_weight": 50
          , "subsample": .6
          , "colsample_bytree": 0.7
          , "scale_pos_weight": 1
          , "silent": 1
          , "max_depth": 9}

num_rounds = 10000

train = pd.read_csv('./train.csv', sep = ',', index_col = 'Id')
labels = train['Hazard'].values
test  = pd.read_csv('./test.csv',  sep = ',', index_col = 'Id')
full = pd.concat(objs = [train, test])
full.drop(['T1_V10', 'T1_V13', 'T2_V7', 'T2_V10'], axis=1, inplace = 1)

num_mask = np.array([True if obj != 'object' else False for obj in full.dtypes])
full_num = full.iloc[:,  num_mask]
full_cat = full.iloc[:, ~num_mask]

full_num['T1_V16'] = np.unique(full_cat['T1_V16'], return_inverse = 1)[1]
full_cat.drop('T1_V16', axis=1, inplace = 1)

# define names of categorical variables
cat_to_dummy = full_cat.columns
form  = ' + '.join(cat_to_dummy)
form += ' - 1'
x_dummies = patsy.dmatrix(form, full_cat, return_type='dataframe')
full_dummies = pd.concat([full_num, x_dummies], axis = 1)
split = np.isnan(full_dummies.Hazard)
train = full_dummies.loc[~split,:]
x = train.drop('Hazard', axis=1).values
y = train.Hazard.values
params_subsamples = [.4,.6,.8]
params_colsample_bytrees = [.5,.7, 9]
for params_subsample in params_subsamples:
    for params_colsample_bytree in params_colsample_bytrees:
        params["subsample"] = params_subsample
        params["colsample_bytree"] = params_colsample_bytree
        gini = []
        for j in range(3):
            folds = cv.StratifiedKFold(labels, n_folds = 5, shuffle=True)
            for train_ind, val_ind in folds: 
                xgtrain = xgb.DMatrix(x[train_ind,:], label=y[train_ind])
                xgval = xgb.DMatrix(x[val_ind,:], label=y[val_ind])
                watchlist = [(xgtrain, 'train'),(xgval, 'val')]
                model = xgb.train(params, xgtrain, num_rounds, watchlist,
                                  feval= Gini, early_stopping_rounds=200,verbose_eval=False)
                gini.append(model.best_score)
        print('Subsample: {} | Sample By Tree: {} | Mean Gini: {:.5f} | Std Gini: {:.5f}'.format(
               params["subsample"],params["colsample_bytree"], -np.mean(gini), np.std(gini)))

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[2594]	train-Gini:-0.518721	val-Gini:-0.388994

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[2728]	train-Gini:-0.522800	val-Gini:-0.401975

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[1891]	train-Gini:-0.498170	val-Gini:-0.385631

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[2591]	train-Gini:-0.517783	val-Gini:-0.390827

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[2438]	train-Gini:-0.516616	val-Gini:-0.381357

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[1798]	train-Gini:-0.497300	val-Gini:-0.377504

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[2539]	train-Gini:-0.515189	val-Gini:-0.402137

Will train until val error hasn't decreased in 200 rounds.
Stopping. 

Subsample: 0.4 | Sample By Tree: 0.5 | Mean Gini: 0.38905 | Std Gini: 0.00769
Subsample: 0.4 | Sample By Tree: 0.7 | Mean Gini: 0.38815 | Std Gini: 0.01404

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[1798]	train-Gini:-0.500503	val-Gini:-0.409518

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[2507]	train-Gini:-0.529581	val-Gini:-0.389950

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[1497]	train-Gini:-0.492629	val-Gini:-0.378893

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[2184]	train-Gini:-0.516617	val-Gini:-0.373910

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[2437]	train-Gini:-0.524363	val-Gini:-0.385562

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[2457]	train-Gini:-0.524752	val-Gini:-0.400418

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[2021]	train-Gini:-0.512051	val-Gini:-0.380726

Will train until val error hasn't decreased in 200 rounds.
Stopping. 


Subsample: 0.4 | Sample By Tree: 9 | Mean Gini: 0.38797 | Std Gini: 0.01060

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[2744]	train-Gini:-0.544498	val-Gini:-0.390931

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[2463]	train-Gini:-0.532840	val-Gini:-0.391194

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[2771]	train-Gini:-0.541960	val-Gini:-0.384555

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[2794]	train-Gini:-0.544340	val-Gini:-0.393827

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[2002]	train-Gini:-0.519439	val-Gini:-0.380415

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[3272]	train-Gini:-0.556709	val-Gini:-0.390332

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[2968]	train-Gini:-0.548925	val-Gini:-0.399305

Will train until val error hasn't decreased in 200 rounds.
Stopping. 


Subsample: 0.6 | Sample By Tree: 0.5 | Mean Gini: 0.38916 | Std Gini: 0.00676

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[2403]	train-Gini:-0.535666	val-Gini:-0.388144

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[2053]	train-Gini:-0.524323	val-Gini:-0.377815

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[2585]	train-Gini:-0.543456	val-Gini:-0.396411

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[1468]	train-Gini:-0.502460	val-Gini:-0.386257

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[2750]	train-Gini:-0.546287	val-Gini:-0.399146

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[1914]	train-Gini:-0.517633	val-Gini:-0.384379

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[2756]	train-Gini:-0.546843	val-Gini:-0.383903

Will train until val error hasn't decreased in 200 rounds.
Stopping. 


Subsample: 0.6 | Sample By Tree: 0.7 | Mean Gini: 0.38913 | Std Gini: 0.00775

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[2334]	train-Gini:-0.540156	val-Gini:-0.409335

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[1720]	train-Gini:-0.513285	val-Gini:-0.389253

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[2117]	train-Gini:-0.529760	val-Gini:-0.388528

