In [1]:
import pandas as pd
import patsy
import numpy as np 
import xgboost as xgb
import sklearn.cross_validation as cv
import sklearn.ensemble as ens

In [2]:
def Gini(y_pred, dtrain):
    '''
    A helper function for XGBOOST early stopping
    Used as parameter feval = Gini
    y-pred: predicctions from a model, ndarray
    dtrain: true labels, DMatrix
    Returns: metrics label and value of a "Gini" metrics of missordered labels
    The returned value is negative to ensure minimization
    '''
    y_true = dtrain.get_label()
    # check and get number of samples
    assert y_true.shape == y_pred.shape
    n_samples = y_true.shape[0]
    
    # sort rows on prediction column 
    # (from largest to smallest)
    arr = np.array([y_true, y_pred]).transpose()
    true_order = arr[arr[:,0].argsort()][::-1,0]
    pred_order = arr[arr[:,1].argsort()][::-1,0]
    
    # get Lorenz curves
    L_true = np.cumsum(true_order) / np.sum(true_order)
    L_pred = np.cumsum(pred_order) / np.sum(pred_order)
    L_ones = np.linspace(0, 1, n_samples)
    
    # get Gini coefficients (area between curves)
    G_true = np.sum(L_ones - L_true)
    G_pred = np.sum(L_ones - L_pred)
    
    # normalize to true Gini coefficient
    return 'Gini', -G_pred/G_true

In [3]:
def Gini_predict(y_true, y_pred):
    '''
    more general than above as accepts labels and returns score
    '''
    # check and get number of samples
    assert y_true.shape == y_pred.shape
    n_samples = y_true.shape[0]
    
    # sort rows on prediction column 
    # (from largest to smallest)
    arr = np.array([y_true, y_pred]).transpose()
    true_order = arr[arr[:,0].argsort()][::-1,0]
    pred_order = arr[arr[:,1].argsort()][::-1,0]
    
    # get Lorenz curves
    L_true = np.cumsum(true_order) / np.sum(true_order)
    L_pred = np.cumsum(pred_order) / np.sum(pred_order)
    L_ones = np.linspace(0, 1, n_samples)
    
    # get Gini coefficients (area between curves)
    G_true = np.sum(L_ones - L_true)
    G_pred = np.sum(L_ones - L_pred)
    
    # normalize to true Gini coefficient
    return G_pred/G_true

#### Load data

In [4]:
tr = pd.read_csv('./train.csv', sep = ',', index_col = 'Id')
te =  pd.read_csv('./test.csv', sep = ',',index_col = 'Id')
labels = tr['Hazard'].values

full =  pd.concat(objs = [tr,te])
full.drop(['T1_V10', 'T1_V13', 'T2_V7', 'T2_V10'], axis=1, inplace = 1)
full_cat = full.select_dtypes(include = ['object'])
full_num = full.select_dtypes(exclude = ['object'])

#### Prepare train and test with GROUPED categorical variables

In [5]:
x_cat = full_cat.apply(func = lambda x: np.unique(x, return_inverse =1)[1], axis=0)

full_gr = pd.concat([full_num,x_cat], axis=1)
split = np.isnan(full.Hazard)

train_gr = full_gr[~split].drop('Hazard', axis=1).values
test_gr = full_gr[split].drop('Hazard', axis=1).values

#### Prepare train and test sets with DUMMIFIED categorical variables

In [6]:
cat_names = full_cat.columns    
form = ' + '.join(cat_names)
form += ' - 1'
x_dummies = patsy.dmatrix(form, full_cat, return_type='dataframe')
full_dum = pd.concat([full_num, x_dummies], axis = 1)

train_dum = full_dum[~split].drop('Hazard', axis=1).values
test_dum = full_dum[split].drop('Hazard', axis=1).values

#### Prepare XGBOOST

In [8]:
num_rounds = 10000
params = {  "objective": "reg:linear"
          , "eta": 0.005
          , "min_child_weight": 5
          , "subsample": 0.6
          , "colsample_bytree": 0.7
          , "scale_pos_weight": 1
          , "silent": 1
          , "max_depth": 9}

#### Train

In [16]:
preds_xgb_dum = []
preds_xgb_gr = []
preds_rf_dum = []
preds_rf_gr = []
true_label = []
tr_ind = []
i = 1

for j in range(2):
#    folds = cv.StratifiedKFold(labels, n_folds = 5, shuffle=True)
    folds = cv.KFold(len(labels), n_folds = 5, shuffle=True)
    for train_ind, val_ind in folds:
        # Prepare group and dummy train sets
        X_train_dum = train_dum[train_ind,:]
        X_train_gr  = train_gr[train_ind,:]
        y_train = labels[train_ind]
        # Prepare group and dummy val sets
        X_val_dum = train_dum[val_ind,:]
        X_val_gr  = train_gr[val_ind,:]
        y_val = labels[val_ind]
        true_label.append(y_val)
        tr_ind.append(train_ind)
        # Prepare group and dummy train sets for XGB
        xgtrain_dum = xgb.DMatrix(X_train_dum, label=y_train)
        xgtrain_gr = xgb.DMatrix(X_train_gr, label=y_train)
        # Prepare group and dummy validation sets
        xgval_dum = xgb.DMatrix(X_val_dum, label=y_val)
        xgval_gr = xgb.DMatrix(X_val_gr, label=y_val)
        # Prepare group and dummy watchlists
        watchlist_dum = [(xgtrain_dum, 'train'),(xgval_dum, 'val')]
        watchlist_gr =  [(xgtrain_gr,  'train'),( xgval_gr, 'val')]
        # Train models
        model_xgb_dum = xgb.train(params, xgtrain_dum, num_rounds, watchlist_dum,
                          feval= Gini, early_stopping_rounds=200,verbose_eval=False)
        model_xgb_gr  = xgb.train(params,  xgtrain_gr, num_rounds, watchlist_gr,
                          feval= Gini, early_stopping_rounds=200,verbose_eval=False)
#        model_rf_dum.fit(X_train_dum, y_train)
#        model_rf_gr.fit(X_train_gr, y_train)
        # Predictions
        xgb_preds_dum = model_xgb_dum.predict(xgval_dum, ntree_limit = model_xgb_dum.best_iteration)
        xgb_preds_gr = model_xgb_gr.predict(xgval_gr, ntree_limit = model_xgb_gr.best_iteration)
#        rf_preds_dum = model_rf_dum.predict(X_val_dum)
#        rf_preds_gr = model_rf_gr.predict(X_val_gr)
        # Store predictions
        preds_xgb_dum.append(xgb_preds_dum)
        preds_xgb_gr.append(xgb_preds_gr)
#        preds_rf_dum.append(rf_preds_dum)
#        preds_rf_gr.append(rf_preds_gr)
#        print('Fold {} | xgb_dum Gini {:.6f} | xgb_gr Gini {:.6f} | rf_dum Gini {:.6f}| rf_gr Gini {:.6f}'.\
#              format(i, Gini_predict(y_val, xgb_preds_dum), Gini_predict(y_val, xgb_preds_gr),
#                    Gini_predict(y_val, rf_preds_dum), Gini_predict(y_val, rf_preds_gr)))
        print('Fold {} | xgb_dum Gini {:.6f} | xgb_gr Gini {:.6f}'.\
              format(i, Gini_predict(y_val, xgb_preds_dum), Gini_predict(y_val, xgb_preds_gr)))
        i+=1

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[642]	train-Gini:-0.595043	val-Gini:-0.377767

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[577]	train-Gini:-0.631964	val-Gini:-0.376365

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[561]	train-Gini:-0.579630	val-Gini:-0.399703

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[553]	train-Gini:-0.627691	val-Gini:-0.397538



Fold 1 | xgb_dum Gini 0.377742 | xgb_gr Gini 0.376320
Fold 2 | xgb_dum Gini 0.399600 | xgb_gr Gini 0.397472

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[549]	train-Gini:-0.577391	val-Gini:-0.384281

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[456]	train-Gini:-0.609361	val-Gini:-0.388140




Fold 3 | xgb_dum Gini 0.384262 | xgb_gr Gini 0.388032

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[590]	train-Gini:-0.591883	val-Gini:-0.381508

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[639]	train-Gini:-0.648066	val-Gini:-0.387097




Fold 4 | xgb_dum Gini 0.381424 | xgb_gr Gini 0.387088

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[555]	train-Gini:-0.581607	val-Gini:-0.374217

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[495]	train-Gini:-0.613150	val-Gini:-0.378541




Fold 5 | xgb_dum Gini 0.374183 | xgb_gr Gini 0.378476

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[664]	train-Gini:-0.597836	val-Gini:-0.373530

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[625]	train-Gini:-0.640274	val-Gini:-0.374968




Fold 6 | xgb_dum Gini 0.373484 | xgb_gr Gini 0.374873

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[649]	train-Gini:-0.596450	val-Gini:-0.378931

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[612]	train-Gini:-0.639278	val-Gini:-0.380457




Fold 7 | xgb_dum Gini 0.378895 | xgb_gr Gini 0.380413

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[569]	train-Gini:-0.579364	val-Gini:-0.396235

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[560]	train-Gini:-0.624949	val-Gini:-0.397234




Fold 8 | xgb_dum Gini 0.396157 | xgb_gr Gini 0.397173

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[613]	train-Gini:-0.590951	val-Gini:-0.378158

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[489]	train-Gini:-0.615664	val-Gini:-0.376962




Fold 9 | xgb_dum Gini 0.378085 | xgb_gr Gini 0.376907

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[755]	train-Gini:-0.614914	val-Gini:-0.395411

Will train until val error hasn't decreased in 200 rounds.
Stopping. Best iteration:
[596]	train-Gini:-0.638028	val-Gini:-0.394493




Fold 10 | xgb_dum Gini 0.395361 | xgb_gr Gini 0.394421


In [17]:
gini = 0
weights_xgb = {}
final = list(zip(true_label, preds_xgb_dum, preds_xgb_gr))
for w in np.linspace(0,1,21):
    for p1 in np.linspace(0,1,51):
        for p2 in np.linspace(0,1,51):
            gini_fold = []
            for item in final:
                pred = w * (item[1]**p1) + (1-w) *(item[2]**p2)
                gini_fold.append(Gini_predict(item[0], pred))
            mean_gini = np.mean(gini_fold)
            if  mean_gini > gini:
                gini = mean_gini
                weights_xgb['w'] = w
                weights_xgb['p1']= p1
                weights_xgb['p2']= p2
                weights_xgb['gini'] = gini
    print('Weight is: {}'.format(w))
                    
weights_xgb['gini']

Weight is: 0.0
Weight is: 0.1
Weight is: 0.2
Weight is: 0.30000000000000004
Weight is: 0.4
Weight is: 0.5
Weight is: 0.6000000000000001
Weight is: 0.7000000000000001
Weight is: 0.8
Weight is: 0.9
Weight is: 1.0


0.38742693389735827

In [18]:
weights_xgb

{'gini': 0.38742693389735827,
 'p1': 0.050000000000000003,
 'p2': 0.20000000000000001,
 'w': 0.80000000000000004}

In [31]:
np.mean(preds_xgb_dum)

ValueError: operands could not be broadcast together with shapes (10200,) (10199,) 