In [1]:
import pandas as pd
import patsy
import numpy as np 
import xgboost as xgb
import sklearn.cross_validation as cv
import sklearn.ensemble as ens

In [2]:
def Gini(y_pred, dtrain):
    '''
    Custom evaluation metric for XGBoost early stopping.
    Used as parameter feval = Gini

    y_pred: predictions from a model, ndarray
    dtrain: DMatrix whose get_label() returns the true labels
    Returns: tuple ('Gini', value) where value is the negated normalized
             Gini score computed by Gini_predict. The sign is flipped so
             that a better ordering yields a smaller value (minimization).
    Raises: AssertionError (from Gini_predict) when labels and predictions
            do not have the same shape.
    '''
    # The scoring arithmetic used to be duplicated here line for line;
    # delegate to Gini_predict so both entry points always agree.
    # Gini_predict is defined in a later cell; the name is resolved when
    # this function is called, i.e. during training, not at definition time.
    return 'Gini', -Gini_predict(dtrain.get_label(), y_pred)

In [3]:
def Gini_predict(y_true, y_pred):
    '''
    Normalized Gini score of a set of predictions.

    y_true: true labels, ndarray
    y_pred: predictions, ndarray of the same shape as y_true
    Returns: G_pred / G_true, where each G is the summed gap between a
             reference line running from 0 to 1 and the cumulative label
             share taken in descending order of predictions (G_pred) or of
             the labels themselves (G_true). A prediction that orders the
             labels exactly as the labels order themselves scores 1.
    Raises: AssertionError if the two arrays differ in shape.
    There is no guard against a zero label sum or a zero G_true.
    '''
    # Both inputs must line up element for element.
    assert y_true.shape == y_pred.shape
    n_samples = y_true.shape[0]

    # Two-column table: column 0 = labels, column 1 = predictions.
    paired = np.array([y_true, y_pred]).transpose()

    # Reference line the cumulative shares are compared against.
    diagonal = np.linspace(0, 1, n_samples)

    def area_gap(key_col):
        # Labels re-ordered by the chosen column, largest key first.
        ranked = paired[paired[:, key_col].argsort()][::-1, 0]
        # Cumulative share of the label total captured so far.
        lorenz = np.cumsum(ranked) / np.sum(ranked)
        return np.sum(diagonal - lorenz)

    gap_true = area_gap(0)   # ordering by the labels themselves
    gap_pred = area_gap(1)   # ordering by the predictions

    # Express the prediction's gap relative to the best achievable one.
    return gap_pred / gap_true

#### Load data

In [4]:
# Read the raw train / test tables, indexed by their 'Id' column.
tr = pd.read_csv('./train.csv', sep = ',', index_col = 'Id')
te = pd.read_csv('./test.csv', sep = ',', index_col = 'Id')
# Target values for the training rows, kept in file order.
labels = tr['Hazard'].values

# Stack train on top of test so both go through the same feature encoding,
# then discard the four columns this analysis does not use.
# drop() returns a new frame here: the previous `inplace = 1` passed an int
# where pandas requires a bool and is rejected by current pandas versions.
full = pd.concat(objs = [tr, te]).drop(['T1_V10', 'T1_V13', 'T2_V7', 'T2_V10'], axis=1)

# Split columns by dtype: object columns are treated as categorical,
# everything else (including Hazard) as numeric.
full_cat = full.select_dtypes(include = ['object'])
full_num = full.select_dtypes(exclude = ['object'])

#### Prepare train and test with GROUPED categorical variables

In [5]:
def _level_codes(column):
    # Second item of np.unique(..., return_inverse=True): for every value,
    # its position among the column's sorted unique levels.
    return np.unique(column, return_inverse=True)[1]

# Replace each categorical column by its integer level codes.
x_cat = full_cat.apply(_level_codes, axis=0)

full_gr = pd.concat([full_num, x_cat], axis=1)

# True for rows whose Hazard is NaN.
# presumably these are exactly the rows that came from test.csv — confirm
# that the test file carries no Hazard column.
split = np.isnan(full.Hazard)

# Feature matrices without the target column.
features_gr = full_gr.drop('Hazard', axis=1)
train_gr = features_gr[~split].values
test_gr = features_gr[split].values

#### Prepare train and test sets with DUMMIFIED categorical variables

In [6]:
cat_names = full_cat.columns
# Formula listing every categorical column, followed by ' - 1'
# (patsy formula syntax for omitting the intercept — confirm against the
# patsy documentation).
form = ' + '.join(cat_names) + ' - 1'
x_dummies = patsy.dmatrix(form, full_cat, return_type='dataframe')

# Numeric columns side by side with the indicator columns.
full_dum = pd.concat([full_num, x_dummies], axis = 1)

# Same train/test separation as the grouped variant: `split` is the boolean
# mask built from missing Hazard values in the grouped-encoding cell.
features_dum = full_dum.drop('Hazard', axis=1)
train_dum = features_dum[~split].values
test_dum = features_dum[split].values

#### Prepare RandomForest

In [7]:
ntrees = 300
seed = 1

# Constructor arguments shared by both forests.
rf_common = dict(n_estimators = ntrees, random_state = seed, n_jobs = -1,
                 min_samples_leaf = 13)

# Forest for the dummy-encoded features: limits the features tried per split.
model_rf_dum = ens.RandomForestRegressor(max_features = 36, **rf_common)
# Forest for the integer-encoded features: limits tree depth instead.
model_rf_gr = ens.RandomForestRegressor(max_depth = 10, **rf_common)

#### Prepare XGBOOST

In [8]:
# Round count handed to xgb.train in the training cell, which also sets
# early_stopping_rounds, so this acts as an upper bound.
num_rounds = 10000

# Booster settings shared by both XGBoost models.
params = {
    "objective": "reg:linear",
    "eta": 0.01,
    "min_child_weight": 10,
    "subsample": 0.6,
    "colsample_bytree": 0.7,
    "scale_pos_weight": 1,
    "silent": 1,
    "max_depth": 9,
}

#### Train

In [9]:
# Out-of-fold predictions, one array per fold, in the same fold order as
# true_label so the blending cells can zip them together.
preds_xgb_dum = []
preds_xgb_gr = []
preds_rf_dum = []
preds_rf_gr = []
true_label = []
i = 1  # running fold number, used only in the progress line

# Two repetitions of 5-fold stratified CV -> 10 folds in total.
for j in range(2):
    # Seed the shuffle so the folds (and every score derived from them) are
    # reproducible on a re-run. `seed` is the constant from the RandomForest
    # cell; offsetting by j keeps the two repetitions on different partitions.
    folds = cv.StratifiedKFold(labels, n_folds = 5, shuffle=True, random_state = seed + j)
    for train_ind, val_ind in folds:
        # Prepare group and dummy train sets
        X_train_dum = train_dum[train_ind,:]
        X_train_gr  = train_gr[train_ind,:]
        y_train = labels[train_ind]
        # Prepare group and dummy val sets
        X_val_dum = train_dum[val_ind,:]
        X_val_gr  = train_gr[val_ind,:]
        y_val = labels[val_ind]
        true_label.append(y_val)
        # Prepare group and dummy train sets for XGB
        xgtrain_dum = xgb.DMatrix(X_train_dum, label=y_train)
        xgtrain_gr = xgb.DMatrix(X_train_gr, label=y_train)
        # Prepare group and dummy validation sets
        xgval_dum = xgb.DMatrix(X_val_dum, label=y_val)
        xgval_gr = xgb.DMatrix(X_val_gr, label=y_val)
        # Prepare group and dummy watchlists; 'val' is listed last in each
        watchlist_dum = [(xgtrain_dum, 'train'),(xgval_dum, 'val')]
        watchlist_gr =  [(xgtrain_gr,  'train'),( xgval_gr, 'val')]
        # Train models; Gini returns a negated score so lower is better
        model_xgb_dum = xgb.train(params, xgtrain_dum, num_rounds, watchlist_dum,
                          feval= Gini, early_stopping_rounds=200,verbose_eval=False)
        model_xgb_gr  = xgb.train(params,  xgtrain_gr, num_rounds, watchlist_gr,
                          feval= Gini, early_stopping_rounds=200,verbose_eval=False)
        # The two forest objects are re-fitted on every fold
        model_rf_dum.fit(X_train_dum, y_train)
        model_rf_gr.fit(X_train_gr, y_train)
        # Predictions
        # NOTE(review): predict() is called without an iteration limit —
        # confirm whether the installed xgboost scores with the best or the
        # last boosting round after early stopping.
        xgb_preds_dum = model_xgb_dum.predict(xgval_dum)
        xgb_preds_gr = model_xgb_gr.predict(xgval_gr)
        rf_preds_dum = model_rf_dum.predict(X_val_dum)
        rf_preds_gr = model_rf_gr.predict(X_val_gr)
        # Store predictions
        preds_xgb_dum.append(xgb_preds_dum)
        preds_xgb_gr.append(xgb_preds_gr)
        preds_rf_dum.append(rf_preds_dum)
        preds_rf_gr.append(rf_preds_gr)
        # Per-fold validation score of each of the four models
        print('Fold {} | xgb_dum Gini {:.6f} | xgb_gr Gini {:.6f} | rf_dum Gini {:.6f}| rf_gr Gini {:.6f}'.\
              format(i, Gini_predict(y_val, xgb_preds_dum), Gini_predict(y_val, xgb_preds_gr),
                    Gini_predict(y_val, rf_preds_dum), Gini_predict(y_val, rf_preds_gr)))
        i+=1

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[444]	train-Gini:-0.558615	val-Gini:-0.379389

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[350]	train-Gini:-0.585418	val-Gini:-0.381076

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[798]	train-Gini:-0.621474	val-Gini:-0.382971

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[456]	train-Gini:-0.608957	val-Gini:-0.379251



Fold 1 | xgb_dum Gini 0.378579 | xgb_gr Gini 0.379300 | rf_dum Gini 0.368210| rf_gr Gini 0.361626
Fold 2 | xgb_dum Gini 0.381201 | xgb_gr Gini 0.378162 | rf_dum Gini 0.374937| rf_gr Gini 0.349831

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[731]	train-Gini:-0.610658	val-Gini:-0.398193

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[618]	train-Gini:-0.641604	val-Gini:-0.398758




Fold 3 | xgb_dum Gini 0.396228 | xgb_gr Gini 0.397232 | rf_dum Gini 0.384540| rf_gr Gini 0.365492

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[533]	train-Gini:-0.576768	val-Gini:-0.375483

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[435]	train-Gini:-0.603679	val-Gini:-0.380246




Fold 4 | xgb_dum Gini 0.374024 | xgb_gr Gini 0.378591 | rf_dum Gini 0.366498| rf_gr Gini 0.363551

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[958]	train-Gini:-0.645224	val-Gini:-0.382779

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[712]	train-Gini:-0.652474	val-Gini:-0.383195




Fold 5 | xgb_dum Gini 0.381011 | xgb_gr Gini 0.380921 | rf_dum Gini 0.369180| rf_gr Gini 0.346270

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[754]	train-Gini:-0.608814	val-Gini:-0.403695

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[391]	train-Gini:-0.590178	val-Gini:-0.406180




Fold 6 | xgb_dum Gini 0.401558 | xgb_gr Gini 0.405818 | rf_dum Gini 0.391899| rf_gr Gini 0.374201

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[400]	train-Gini:-0.549120	val-Gini:-0.374839

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[495]	train-Gini:-0.614370	val-Gini:-0.374261




Fold 7 | xgb_dum Gini 0.372869 | xgb_gr Gini 0.373369 | rf_dum Gini 0.364712| rf_gr Gini 0.354090

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[595]	train-Gini:-0.586525	val-Gini:-0.378129

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[737]	train-Gini:-0.663595	val-Gini:-0.378470




Fold 8 | xgb_dum Gini 0.376586 | xgb_gr Gini 0.375888 | rf_dum Gini 0.361978| rf_gr Gini 0.347157

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[613]	train-Gini:-0.591066	val-Gini:-0.387793

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[799]	train-Gini:-0.670023	val-Gini:-0.380674




Fold 9 | xgb_dum Gini 0.386841 | xgb_gr Gini 0.378102 | rf_dum Gini 0.372040| rf_gr Gini 0.350744

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[555]	train-Gini:-0.585723	val-Gini:-0.370061

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[540]	train-Gini:-0.632172	val-Gini:-0.375268




Fold 10 | xgb_dum Gini 0.369123 | xgb_gr Gini 0.373858 | rf_dum Gini 0.366221| rf_gr Gini 0.355270


In [72]:
# Grid search for blending the two XGBoost models:
#   pred = w * xgb_dum**p1 + (1 - w) * xgb_gr**p2
# scored by the mean per-fold Gini over the stored validation folds.
gini = 0
weights_xgb = {}
final = list(zip(true_label, preds_xgb_dum, preds_xgb_gr))
exponent_grid = np.linspace(0,1,21)
for w in np.linspace(0,1,11):
    for p1 in exponent_grid:
        for p2 in exponent_grid:
            gini_fold = [
                Gini_predict(y_fold, w * (pred_a**p1) + (1-w) *(pred_b**p2))
                for y_fold, pred_a, pred_b in final
            ]
            mean_gini = np.mean(gini_fold)
            # Keep only strict improvements over the best mean so far.
            if  mean_gini > gini:
                gini = mean_gini
                weights_xgb.update(w=w, p1=p1, p2=p2, gini=gini)
    print('Weight is: {}'.format(w))

weights_xgb['gini']

Weight is: 0.0
Weight is: 0.1
Weight is: 0.2
Weight is: 0.30000000000000004
Weight is: 0.4
Weight is: 0.5
Weight is: 0.6000000000000001
Weight is: 0.7000000000000001
Weight is: 0.8
Weight is: 0.9
Weight is: 1.0


0.38505675068841427

In [73]:
# Show the best XGB-dummy / XGB-grouped blend settings found by the grid search.
weights_xgb

{'gini': 0.38505675068841427,
 'p1': 0.050000000000000003,
 'p2': 0.050000000000000003,
 'w': 0.5}

In [69]:
# Grid search for blending XGBoost (dummy features) with RandomForest
# (dummy features):
#   pred = w * xgb_dum**p1 + (1 - w) * rf_dum**p2
# scored by the mean per-fold Gini over the stored validation folds.
gini = 0
weights_rf = {}
final = list(zip(true_label, preds_xgb_dum, preds_rf_dum))
exponent_grid = np.linspace(0,1,21)
for w in np.linspace(0,1,11):
    for p1 in exponent_grid:
        for p2 in exponent_grid:
            gini_fold = [
                Gini_predict(y_fold, w * (pred_xgb**p1) + (1-w) *(pred_rf**p2))
                for y_fold, pred_xgb, pred_rf in final
            ]
            mean_gini = np.mean(gini_fold)
            # Keep only strict improvements over the best mean so far.
            if  mean_gini > gini:
                gini = mean_gini
                weights_rf.update(w=w, p1=p1, p2=p2, gini=gini)
    print('Weight is: {}'.format(w))

weights_rf['gini']

Weight is: 0.0
Weight is: 0.1
Weight is: 0.2
Weight is: 0.30000000000000004
Weight is: 0.4
Weight is: 0.5
Weight is: 0.6000000000000001
Weight is: 0.7000000000000001
Weight is: 0.8
Weight is: 0.9
Weight is: 1.0


0.38311750407569795

In [71]:
# Show the best XGB / RandomForest power-blend settings. The search cell
# stores them in `weights_rf`; the bare name `weights` is not defined
# anywhere in this notebook and fails on a fresh kernel.
weights_rf

{'gini': 0.38311750407569795,
 'p1': 0.050000000000000003,
 'p2': 0.050000000000000003,
 'w': 0.70000000000000007}

In [74]:
# Plain linear blend of XGBoost and RandomForest (both on dummy features),
# searched over 11 evenly spaced weights:
#   pred = w * xgb_dum + (1 - w) * rf_dum
gini = 0
weights_rf2 = {}
final = list(zip(true_label, preds_xgb_dum, preds_rf_dum))
for w in np.linspace(0,1,11):
    gini_fold = [
        Gini_predict(y_fold, w * pred_xgb + (1-w) * pred_rf)
        for y_fold, pred_xgb, pred_rf in final
    ]
    mean_gini = np.mean(gini_fold)
    # Keep only strict improvements over the best mean so far.
    if  mean_gini > gini:
        gini = mean_gini
        weights_rf2.update(w=w, gini=gini)
    print('Weight is: {}'.format(w))

weights_rf2['gini']

Weight is: 0.0
Weight is: 0.1
Weight is: 0.2
Weight is: 0.30000000000000004
Weight is: 0.4
Weight is: 0.5
Weight is: 0.6000000000000001
Weight is: 0.7000000000000001
Weight is: 0.8
Weight is: 0.9
Weight is: 1.0


0.38306744208795834

In [76]:
# Same linear XGBoost / RandomForest blend as the 11-point search, on a
# finer grid of 101 weights; the result overwrites weights_rf2.
#   pred = w * xgb_dum + (1 - w) * rf_dum
gini = 0
weights_rf2 = {}
final = list(zip(true_label, preds_xgb_dum, preds_rf_dum))
for w in np.linspace(0,1,101):
    gini_fold = [
        Gini_predict(y_fold, w * pred_xgb + (1-w) * pred_rf)
        for y_fold, pred_xgb, pred_rf in final
    ]
    mean_gini = np.mean(gini_fold)
    # Keep only strict improvements over the best mean so far.
    if  mean_gini > gini:
        gini = mean_gini
        weights_rf2.update(w=w, gini=gini)
    print('Weight is: {}'.format(w))

weights_rf2['gini']

Weight is: 0.0
Weight is: 0.01
Weight is: 0.02
Weight is: 0.03
Weight is: 0.04
Weight is: 0.05
Weight is: 0.06
Weight is: 0.07
Weight is: 0.08
Weight is: 0.09
Weight is: 0.1
Weight is: 0.11
Weight is: 0.12
Weight is: 0.13
Weight is: 0.14
Weight is: 0.15
Weight is: 0.16
Weight is: 0.17
Weight is: 0.18
Weight is: 0.19
Weight is: 0.2
Weight is: 0.21
Weight is: 0.22
Weight is: 0.23
Weight is: 0.24
Weight is: 0.25
Weight is: 0.26
Weight is: 0.27
Weight is: 0.28
Weight is: 0.29
Weight is: 0.3
Weight is: 0.31
Weight is: 0.32
Weight is: 0.33
Weight is: 0.34
Weight is: 0.35000000000000003
Weight is: 0.36
Weight is: 0.37
Weight is: 0.38
Weight is: 0.39
Weight is: 0.4
Weight is: 0.41000000000000003
Weight is: 0.42
Weight is: 0.43
Weight is: 0.44
Weight is: 0.45
Weight is: 0.46
Weight is: 0.47000000000000003
Weight is: 0.48
Weight is: 0.49
Weight is: 0.5
Weight is: 0.51
Weight is: 0.52
Weight is: 0.53
Weight is: 0.54
Weight is: 0.55
Weight is: 0.56
Weight is: 0.5700000000000001
Weight is: 0.58
Wei

0.38308043968880068

In [77]:
# Show the best linear XGB / RandomForest blend weight and its mean Gini.
weights_rf2

{'gini': 0.38308043968880068, 'w': 0.72999999999999998}