# 1. Feature Engineering (obtained beforehand)

In [1]:
import pandas as pd
import numpy as np

def FeatureEngineering(trainpath,testpath):
    """Load the train/test CSV files and return plain NumPy feature matrices.

    Columns whose name starts with ``ps_calc_`` (as found in the training
    file) are dropped from both frames, together with ``id`` and, for the
    training frame, ``target``.

    Parameters
    ----------
    trainpath : path of the training CSV; must contain ``id`` and ``target``.
    testpath : path of the test CSV; must contain ``id``.

    Returns
    -------
    (trainX, trainy, testX) : tuple of numpy arrays
        Training features, training labels (the ``target`` column) and
        test features.
    """
    # Load both files up front.
    train_df = pd.read_csv(trainpath)
    test_df = pd.read_csv(testpath)

    # Keep the labels before the column is removed; the test ids are read
    # here as well but are not part of the return value.
    labels = train_df['target'].values
    test_ids = test_df['id'].values

    # The list of "calc" columns is derived from the training frame only and
    # then applied to both frames.
    calc_columns = train_df.columns[train_df.columns.str.startswith('ps_calc_')]
    train_no_calc = train_df.drop(calc_columns, axis=1)
    test_no_calc = test_df.drop(calc_columns, axis=1)

    # Strip the identifier (and the label) so only features remain.
    train_features = train_no_calc.drop(['id','target'], axis=1)
    test_features = test_no_calc.drop(['id'], axis=1)

    return np.array(train_features), np.array(labels), np.array(test_features)

# 2. Metric

In [2]:
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    """Compute the un-normalized Gini score of ``pred`` against ``actual``.

    The rows are ordered by descending prediction (ties keep their original
    relative order), the cumulative share of ``actual`` is summed, and the
    value expected from a random ordering, ``(n + 1) / 2``, is subtracted.

    Parameters
    ----------
    actual : sequence of numbers, the observed values.
    pred : sequence of numbers of the same length, the predicted scores.
    cmpcol, sortcol : unused; kept so existing callers keep working.

    Returns
    -------
    float
        The Gini score. The division by ``sum(actual)`` is not guarded, so
        the result is not finite when the actual values sum to zero.

    Raises
    ------
    AssertionError
        If ``actual`` and ``pred`` differ in length.
    """
    assert( len(actual) == len(pred) )
    n = len(actual)
    # Columns: actual, prediction, original position (tie-breaker).
    # The builtin ``float`` is used because the ``np.float`` alias no longer
    # exists in current NumPy releases.
    table = np.asarray(np.c_[ actual, pred, np.arange(n) ], dtype=float)
    # lexsort uses the LAST key as primary: descending prediction, then
    # ascending original position.
    table = table[ np.lexsort((table[:,2], -1*table[:,1])) ]
    total_losses = table[:,0].sum()
    gini_sum = table[:,0].cumsum().sum() / total_losses

    gini_sum -= (n + 1) / 2.
    return gini_sum / n
 
def gini_normalized(a, p):
    """Return the Gini score of predictions ``p`` scaled by the score of a perfect ordering.

    The numerator is ``gini(a, p)``; the denominator is ``gini(a, a)``, i.e.
    the score obtained when the actual values themselves are used as the
    predictions.
    """
    achieved = gini(a, p)
    best_possible = gini(a, a)
    return achieved / best_possible

def gini_coefficient(preds,dtrain):
    """Evaluation callback returning the negated normalized Gini score.

    Parameters
    ----------
    preds : predicted scores.
    dtrain : object exposing ``get_label()`` that yields the true labels.

    Returns
    -------
    tuple
        ``('gini', value)`` where ``value`` is ``-gini_normalized(labels, preds)``;
        the sign is flipped, so a better ranking gives a smaller value.
    """
    labels = dtrain.get_label()
    negated_score = -gini_normalized(labels, preds)
    return 'gini', negated_score

# 3. Class for stacking

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split

class Clf4Stack_xgb(object):
    """Generate stacking features from one classifier with stratified K-fold.

    For every fold the model is refit, the held-out fold receives its
    out-of-fold positive-class probabilities, and the test-set probabilities
    are accumulated and finally averaged over the folds.

    Parameters
    ----------
    model : estimator providing ``fit``, ``predict_proba`` and
        ``best_ntree_limit`` as used in ``fit_predict``.
    metric : forwarded to ``model.fit`` as ``eval_metric``.
    early_stopping_rounds : forwarded to ``model.fit``.
    test_size : share of each fold's training part that is split off as the
        second evaluation set.
    verbose : forwarded to ``model.fit``.
    n_splits : number of stratified folds.
    """

    def __init__(self, model, metric, early_stopping_rounds=100, test_size=0.25, verbose=False, n_splits=5):
        self.model = model
        self.metric = metric
        self.n_splits = n_splits
        self.test_size = test_size
        self.early_stopping_rounds = early_stopping_rounds
        self.verbose = verbose

    def _positive_proba(self, X):
        """Return the second ``predict_proba`` column for ``X``.

        The prediction is requested with ``ntree_limit`` set to the model's
        ``best_ntree_limit`` plus 50.
        """
        tree_limit = self.model.best_ntree_limit + 50
        return self.model.predict_proba(X, ntree_limit=tree_limit)[:, 1]

    def fit_predict(self, trainX, trainy, testX):
        """Fit the model fold by fold and fill ``train4stack`` / ``test4stack``.

        ``trainX`` and ``trainy`` are indexed with integer index arrays, so
        they are expected to be NumPy arrays (or to support that indexing).
        After the call ``self.train4stack`` holds one out-of-fold probability
        per training row and ``self.test4stack`` the fold-averaged
        probability per test row.
        """
        self.train4stack = np.zeros(len(trainX))
        self.test4stack = np.zeros(len(testX))

        folds = StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=0)

        for fold_no, (fit_idx, holdout_idx) in enumerate(folds.split(trainX, trainy), start=1):
            print("=====Round {0}/{1}=====".format(fold_no, self.n_splits))
            X_fold, y_fold = trainX[fit_idx], trainy[fit_idx]
            X_holdout = trainX[holdout_idx]

            # Carve an evaluation set out of the fold's training part; it is
            # listed last in eval_set.
            X_fit, X_eval, y_fit, y_eval = train_test_split(
                X_fold, y_fold, test_size=self.test_size, random_state=99)

            fit_options = dict(
                eval_set=[(X_fit, y_fit), (X_eval, y_eval)],
                eval_metric=self.metric,
                early_stopping_rounds=self.early_stopping_rounds,
                verbose=self.verbose,
            )
            self.model.fit(X_fit, y_fit, **fit_options)

            # Out-of-fold prediction first, then the contribution to the
            # running test-set sum.
            self.train4stack[holdout_idx] = self._positive_proba(X_holdout)
            self.test4stack += self._positive_proba(testX)

        # Turn the accumulated sum into the mean over folds.
        self.test4stack /= self.n_splits

    def output(self,train_file_name='train4stack.csv',
                    test_file_name='test4stack.csv',
                    col_name='F4stack'):
        """Write both prediction vectors to CSV, each as a single column ``col_name``.

        The training file is written before the test file; no index column
        is written.
        """
        destinations = (('train4stack', train_file_name),
                        ('test4stack', test_file_name))
        for attribute, file_name in destinations:
            frame = pd.DataFrame({col_name: getattr(self, attribute)})
            frame.to_csv(file_name, index=False)

# 4. Run

## 4.1 Prepare data

In [4]:
# Input CSV locations, relative to the notebook's working directory.
trainpath = "train.csv"
testpath = "test.csv"

# Load both files and reduce them to NumPy arrays: training features,
# training labels, and test features.
trainX, trainy, testX = FeatureEngineering(trainpath,testpath)

## 4.2 Build Model (optimal hyperparameters obtained beforehand)

In [8]:
import xgboost as xgb
import lightgbm as lgb
from xgboost import XGBClassifier

# Hyperparameters applied to the classifier below via set_params.
params = {'learning_rate': 0.02, 
          'n_estimators': 2000,
          'max_depth': 4, 
          'subsample': 0.9,
          'colsample_bytree': 0.9, 
          'colsample_bylevel':0.7,
          'min_child_weight':100,
          'objective': 'binary:logistic', 
          'seed': 99, 
          'silent': True}

# NOTE(review): this rebinds `xgb`, which the import at the top of this cell
# bound to the xgboost module; from here on `xgb` is the classifier instance
# and the module alias is no longer reachable under that name.
xgb = XGBClassifier()
xgb.set_params(**params)

XGBClassifier(base_score=0.5, colsample_bylevel=0.7, colsample_bytree=0.9,
       gamma=0, learning_rate=0.02, max_delta_step=0, max_depth=4,
       min_child_weight=100, missing=None, n_estimators=2000, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=99, silent=True, subsample=0.9)

## 4.3 Generate data for stacking

In [9]:
%%time
# Wrap the configured classifier for stacking. `gini_coefficient` is handed
# to model.fit as the eval metric, and `verbose=100` is likewise forwarded to
# model.fit; 5 stratified folds are used.
C4S = Clf4Stack_xgb(xgb, 
                    gini_coefficient, 
                    early_stopping_rounds=100, 
                    test_size=0.25, 
                    verbose=100, 
                    n_splits=5)

Wall time: 0 ns


In [10]:
%%time
# Fit fold by fold, filling C4S.train4stack (out-of-fold predictions) and
# C4S.test4stack (fold-averaged test predictions).
C4S.fit_predict(trainX, trainy, testX)

# Write both vectors with output()'s defaults: train4stack.csv and
# test4stack.csv, single column 'F4stack'.
C4S.output()

=====Round 1/5=====
[0]	validation_0-gini:-0.200038	validation_1-gini:-0.179792
Multiple eval metrics have been passed: 'validation_1-gini' will be used for early stopping.

Will train until validation_1-gini hasn't improved in 100 rounds.
[100]	validation_0-gini:-0.257237	validation_1-gini:-0.234429
[200]	validation_0-gini:-0.277127	validation_1-gini:-0.249765
[300]	validation_0-gini:-0.295401	validation_1-gini:-0.261181
[400]	validation_0-gini:-0.306533	validation_1-gini:-0.267397
[500]	validation_0-gini:-0.315217	validation_1-gini:-0.27096
[600]	validation_0-gini:-0.322254	validation_1-gini:-0.273083
[700]	validation_0-gini:-0.328146	validation_1-gini:-0.274274
[800]	validation_0-gini:-0.333728	validation_1-gini:-0.275217
[900]	validation_0-gini:-0.338815	validation_1-gini:-0.275758
[1000]	validation_0-gini:-0.343453	validation_1-gini:-0.276294
[1100]	validation_0-gini:-0.348444	validation_1-gini:-0.276361
Stopping. Best iteration:
[1040]	validation_0-gini:-0.345522	validation_1-gin