# 1. Feature Engineering (Obtained before hand)

In [11]:
import pandas as pd
import numpy as np

def FeatureEngineering(trainpath,testpath):
    #### Load Data
    train = pd.read_csv(trainpath)
    test = pd.read_csv(testpath)

    ### 
    y = train['target'].values
    testid= test['id'].values
    
    ### Drop calc
    unwanted = train.columns[train.columns.str.startswith('ps_calc_')]
    train = train.drop(unwanted, axis=1)  
    test = test.drop(unwanted, axis=1)

    train.drop(['id','target'],axis=1,inplace=True)
    test.drop(['id'],axis=1,inplace=True)


    
    trainX = np.array(train)
    testX = np.array(test)
    trainy = np.array(y)
    
    return trainX, trainy, testX

# 2. Metric

In [12]:
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    assert( len(actual) == len(pred) )
    all = np.asarray(np.c_[ actual, pred, np.arange(len(actual)) ], dtype=np.float)
    all = all[ np.lexsort((all[:,2], -1*all[:,1])) ]
    totalLosses = all[:,0].sum()
    giniSum = all[:,0].cumsum().sum() / totalLosses
 
    giniSum -= (len(actual) + 1) / 2.
    return giniSum / len(actual)
 
def gini_normalized(a, p):
    return gini(a, p) / gini(a, a)

def gini_coefficient(preds,dtrain):
    y = dtrain.get_label()
    return 'gini', -gini_normalized(y,preds)

# 3. Class for stacking

In [26]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split

class Clf4Stack_xgb(object):
    def __init__(self, model, metric, nrounds, early_stopping_rounds=100, test_size=0.25, verbose=False, n_splits=5):
        self.n_splits = n_splits
        self.model = model
        self.metric = metric
        self.nrounds = nrounds
        self.early_stopping_rounds = early_stopping_rounds
        self.test_size = test_size
        self.verbose = verbose
        

    def fit_predict(self, trainX, trainy, testX):

        self.train4stack = np.zeros(len(trainX))
        self.test4stack = np.zeros(len(testX))

        skf = StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=44)

        for i, (train_index,test_index) in enumerate(skf.split(trainX, trainy)):
            print("=====Round {0}/{1}=====".format(i+1,self.n_splits))
            X_train, X_test = trainX[train_index], trainX[test_index]
            y_train, y_test = trainy[train_index], trainy[test_index]

            x1, x2, y1, y2 = train_test_split(X_train, y_train, test_size=self.test_size, random_state=99)
            
            nrounds=2000
            d_train = xgb.DMatrix(X_train, y_train) 
            d_valid = xgb.DMatrix(X_test, y_test) 
            watchlist = [(d_train, 'train'), (d_valid, 'valid')]
            xgb_model = xgb.train(params, d_train, nrounds, watchlist, early_stopping_rounds=100, 
                                  feval=gini_coefficient, maximize=True, verbose_eval=100)
            #sub['target'] += xgb_model.predict(xgb.DMatrix(test[features].values),
            # ntree_limit=xgb_model.best_ntree_limit+50) / (2*kfold)
            
            y_pred = self.model.predict_proba(X_test,ntree_limit=self.model.best_ntree_limit+50)[:,1]
            self.train4stack[test_index] = y_pred
            self.test4stack += self.model.predict_proba(testX,ntree_limit=self.model.best_ntree_limit+50)[:,1]
        
        self.test4stack /= self.n_splits
            
    def output(self,train_file_name='train4stack.csv',
                    test_file_name='test4stack.csv',
                    col_name='F4stack'):

        pd.DataFrame({col_name:self.train4stack}).to_csv(train_file_name,index=False) 
        pd.DataFrame({col_name:self.test4stack}).to_csv(test_file_name,index=False)

# 4. Run

## 4.1 Prepare data

In [22]:
trainpath = "train.csv"
testpath = "test.csv"

trainX, trainy, testX = FeatureEngineering(trainpath,testpath)

## 4.2 Build Model (the optimal hyperparameters obtained before hand)

In [27]:
import xgboost as xgb
import lightgbm as lgb
from xgboost import XGBClassifier

params = {'eta': 0.02, 'max_depth': 4, 'subsample': 0.9, 'colsample_bytree': 0.9, 
          'objective': 'binary:logistic', 'eval_metric': 'auc', 'silent': True}


#xgb = XGBClassifier()
#xgb.set_params(**params)

## 4.3 Generate data for stacking

In [28]:
%%time
C4S = Clf4Stack_xgb(xgb, 
                    gini_coefficient, 
                    nrounds=2000,
                    early_stopping_rounds=100, 
                    test_size=0.25, 
                    verbose=100, 
                    n_splits=5)

Wall time: 0 ns


In [29]:
%%time
C4S.fit_predict(trainX, trainy, testX)

C4S.output()

=====Round 1/5=====
[0]	train-gini:-0.191351	valid-gini:-0.171783
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 rounds.
[100]	train-gini:-0.250998	valid-gini:-0.237442
Stopping. Best iteration:
[0]	train-gini:-0.191351	valid-gini:-0.171783



AttributeError: module 'xgboost' has no attribute 'predict_proba'