In [1]:
import os
import random

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import catboost as cb
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier,Pool, cv
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import mean_squared_error, accuracy_score, roc_auc_score, f1_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import KFold,StratifiedKFold, cross_val_score, train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the modelling frames for training and scoring.
train = pd.read_csv("trainF.csv")
test = pd.read_csv("testF.csv")
# Only the `id` column of test.csv is used (it labels the submission rows),
# so parse just that column instead of loading the whole file and then
# rebinding `test_id` from a DataFrame to a Series.
test_id = pd.read_csv("test.csv", usecols=["id"])["id"]
train.shape, test.shape

((78369, 66), (50226, 65))

In [3]:
# Class balance of the binary label (the output below shows positives are rare).
train.loc[:, "redemption_status"].value_counts()

0    77640
1      729
Name: redemption_status, dtype: int64

In [4]:
# Separate the label from the feature frame.
# Guarded and non-mutating: re-running this cell after the column has already
# been removed is a no-op instead of raising KeyError.
if 'redemption_status' in train.columns:
    target = train['redemption_status']
    train = train.drop(columns='redemption_status')

In [5]:
# Hyper-parameters unpacked into LGBMClassifier(**params) by run_LGB.
# NOTE(review): per the LightGBM parameter docs, `subsample` is an alias of
# `bagging_fraction` and `colsample_bytree` of `feature_fraction`; both members
# of each pair are set (to the same value) — confirm that is intentional.
# NOTE(review): the training log reports a patience of 1000 rounds, i.e. the
# `early_stopping_round` below rather than the 200 passed to fit() — confirm
# which one is wanted.
lgb_params = dict(
    # objective / evaluation
    metric='auc',
    objective='binary',
    early_stopping_round=1000,
    # row / column sub-sampling
    bagging_fraction=0.4,
    colsample_bytree=0.4,
    feature_fraction=0.4,
    # regularisation
    lambda_l1=0.0,
    lambda_l2=0.0,
    # boosting schedule and tree shape
    learning_rate=0.001,
    max_depth=15,
    min_child_samples=100,
    min_child_weight=50,
    min_split_gain=0.0010640577871197205,
    n_estimators=1520,
    num_leaves=196,
    subsample=0.4,
    # class-imbalance handling
    is_unbalance=True,
)


In [6]:
# Hyper-parameters unpacked into XGBClassifier(**params) by run_XGB.
# The model seed is not fixed here ('seed': 1993 was disabled); do_bag sets
# a `seed` entry for every bagging run.
# NOTE(review): both `eta` and `learning_rate` are given with different values;
# XGBoost documents them as aliases — confirm which one should apply.
# NOTE(review): `num_leaves` looks like a LightGBM parameter name — confirm
# XGBoost actually consumes it.
xgb_params = dict(
    booster='gbtree',
    eta=0.1,
    nthread=4,
    silent=True,
    scale_pos_weight=106,
    colsample_bytree=0.547582701158404,
    gamma=7.99661203221007,
    learning_rate=0.036024836896883045,
    max_delta_step=9.545974437155495,
    max_depth=4,
    min_child_weight=3.9108711107457483,
    n_estimators=1015,
    num_leaves=100,
    reg_alpha=6.095828143522551,
    reg_lambda=2.9017833667247137,
    subsample=0.9458396927859323,
)

In [7]:
def run_XGB(params,trainX, targetX, testX, seed = 0):
    """Fit XGBoost under 5-fold stratified CV and average the test predictions.

    One ``XGBClassifier`` is trained per fold with the held-out fold as the
    early-stopping evaluation set; the held-out AUC of each fold is printed.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded to ``XGBClassifier``.
    trainX : pandas.DataFrame
        Training features; rows are selected positionally with ``.iloc``.
    targetX : array-like
        Labels aligned row-for-row with ``trainX`` (Series or ndarray).
    testX : pandas.DataFrame
        Rows to score with every fold model.
    seed : int, default 0
        ``random_state`` of the shuffled ``StratifiedKFold`` split.

    Returns
    -------
    numpy.ndarray
        For each row of ``testX``, the mean over the folds of the last
        ``predict_proba`` column.
    """
    # The splitter yields *positions*. Indexing a Series with them is a label
    # lookup and only matches on a default RangeIndex, so work on a plain
    # array of labels instead.
    labels = np.asarray(targetX)
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    fold_test_preds = []
    for train_index, valid_index in splitter.split(trainX, labels):
        x_train, x_valid = trainX.iloc[train_index], trainX.iloc[valid_index]
        y_train, y_valid = labels[train_index], labels[valid_index]
        # NOTE(review): the model's random_state is fixed while do_bag also
        # puts a per-run `seed` into params — confirm which one should win.
        m = XGBClassifier(**params, random_state=1993)
        m.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], eval_metric='auc',
              early_stopping_rounds=200, verbose=200)
        # Score the held-out fold once and reuse the value for the log line.
        fold_auc = roc_auc_score(y_valid, m.predict_proba(x_valid)[:, -1])
        print("err: ", fold_auc)
        fold_test_preds.append(m.predict_proba(testX)[:, -1])
    return np.mean(fold_test_preds, 0)

In [8]:
def run_LGB(params, trainX, targetX, testX, seed = 0):
    """Fit LightGBM under 5-fold stratified CV and average the test predictions.

    One ``LGBMClassifier`` is trained per fold with the held-out fold as the
    early-stopping evaluation set; the held-out AUC of each fold is printed.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded to ``LGBMClassifier``.
    trainX : pandas.DataFrame
        Training features; rows are selected positionally with ``.iloc``.
    targetX : array-like
        Labels aligned row-for-row with ``trainX`` (Series or ndarray).
    testX : pandas.DataFrame
        Rows to score with every fold model.
    seed : int, default 0
        ``random_state`` of the shuffled ``StratifiedKFold`` split.

    Returns
    -------
    numpy.ndarray
        For each row of ``testX``, the mean over the folds of the last
        ``predict_proba`` column.
    """
    # The splitter yields *positions*. Indexing a Series with them is a label
    # lookup and only matches on a default RangeIndex, so work on a plain
    # array of labels instead.
    labels = np.asarray(targetX)
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    fold_test_preds = []
    for train_index, valid_index in splitter.split(trainX, labels):
        x_train, x_valid = trainX.iloc[train_index], trainX.iloc[valid_index]
        y_train, y_valid = labels[train_index], labels[valid_index]
        m = LGBMClassifier(**params, random_state=1993)
        # NOTE(review): if params carries its own early-stopping setting, it
        # may take precedence over the 200 passed here — confirm.
        m.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], eval_metric='auc',
              early_stopping_rounds=200, verbose=200)
        # Score the held-out fold once and reuse the value for the log line.
        fold_auc = roc_auc_score(y_valid, m.predict_proba(x_valid)[:, -1])
        print("err: ", fold_auc)
        fold_test_preds.append(m.predict_proba(testX)[:, -1])
    return np.mean(fold_test_preds, 0)

In [9]:
def do_bag(train, test,target, params, seeds, model_type = "LGB"):
    """Seed-bag a CV model: run it once per seed and average the test predictions.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Feature frames handed through to ``run_LGB`` / ``run_XGB``.
    target : array-like
        Labels for ``train``.
    params : dict
        Base model parameters. The dict is copied for every run, so the
        caller's object is never modified.
    seeds : iterable of int
        One full cross-validated run is performed per seed.
    model_type : {"LGB", "XGB"}, default "LGB"
        Which runner to use.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the per-seed prediction vectors, one value per
        row of ``test``.

    Raises
    ------
    ValueError
        If ``model_type`` is not ``"LGB"`` or ``"XGB"``, or ``seeds`` is empty.
    """
    if model_type not in ("LGB", "XGB"):
        raise ValueError("`model_type` must be either `LGB` or `XGB`")
    seeds = list(seeds)
    if not seeds:
        # Averaging over zero runs would divide by zero.
        raise ValueError("`seeds` must contain at least one seed")

    preds = np.zeros(test.shape[0])
    for i, seed in enumerate(seeds):
        print("#" * 18)
        print(f"RUN - {i+1} , SEED - {seed}")
        print("#" * 18)

        np.random.seed(seed)
        random.seed(seed)
        # Python reads PYTHONHASHSEED at interpreter start-up, so this only
        # affects child processes, not hashing in the running kernel.
        os.environ["PYTHONHASHSEED"]=str(seed)

        # Work on a copy so seed entries do not leak into the caller's dict.
        run_params = dict(params)
        if model_type == 'LGB':
            run_params.update({
                "feature_fraction_seed" : seed,
                "bagging_fraction_seed" : seed
            })
            preds_ = run_LGB(run_params, train, target, test, seed=seed)
        else:
            run_params.update({"seed" : seed})
            preds_ = run_XGB(run_params, train, target, test, seed=seed)
        # Accumulate this run's predictions. The previous `preds += preds`
        # added the accumulator to itself, so the result was always all zeros.
        preds += preds_
    return preds / len(seeds)

In [10]:
# Seeds for bagging: do_bag performs one cross-validated run per entry.
lgb_seed = [
    2019, 2031, 90,
    192, 83123, 5601,
]
xgb_seed = [
    2119, 1031, 190,
    13192, 23123, 5603,
]

In [None]:
# `os` and `random` (needed inside do_bag) are imported in the notebook's
# import cell rather than here, so do_bag no longer depends on this cell
# having been executed first.

# Weights of the weighted average that blends the two bagged models.
LGB_WEIGHT, XGB_WEIGHT = 0.6, 0.4

p1 = do_bag(train, test, target, lgb_params, lgb_seed)
p2 = do_bag(train, test, target, xgb_params, xgb_seed, model_type = "XGB")
final_preds = LGB_WEIGHT * p1 + XGB_WEIGHT * p2

##################
RUN - 1 , SEED - 2019
##################




Training until validation scores don't improve for 1000 rounds.
[200]	valid_0's auc: 0.930758
[400]	valid_0's auc: 0.933549
[600]	valid_0's auc: 0.933994
[800]	valid_0's auc: 0.934185
[1000]	valid_0's auc: 0.934062
[1200]	valid_0's auc: 0.933868
[1400]	valid_0's auc: 0.933679
Did not meet early stopping. Best iteration is:
[859]	valid_0's auc: 0.934222
err:  0.9342215652855116




Training until validation scores don't improve for 1000 rounds.
[200]	valid_0's auc: 0.942513
[400]	valid_0's auc: 0.945359
[600]	valid_0's auc: 0.94627
[800]	valid_0's auc: 0.946931
[1000]	valid_0's auc: 0.947289
[1200]	valid_0's auc: 0.947454
[1400]	valid_0's auc: 0.947552
Did not meet early stopping. Best iteration is:
[1519]	valid_0's auc: 0.947631
err:  0.9476310579915732


In [None]:
# Assemble the submission frame: test ids next to the blended predictions.
submit = pd.DataFrame({'id': test_id, 'redemption_status': final_preds})
submit.head()

In [None]:
# index=False: the frame already has an explicit `id` column; without it pandas
# writes the positional index as an extra unnamed first column.
submit.to_csv("bagged.csv", index=False)