In [15]:
import os
import random

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import KFold,StratifiedKFold, cross_val_score, train_test_split


import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier,Pool, cv

from sklearn.metrics import mean_squared_error, accuracy_score, roc_auc_score, f1_score

In [16]:
# Load the modelling tables. test.csv is read only for its `id` column, which
# is used later when the submission frame is assembled.
train, test = (pd.read_csv(csv_path) for csv_path in ("trainF.csv", "testF.csv"))
test_id = pd.read_csv("test.csv")['id']
(train.shape, test.shape)

((78369, 66), (50226, 65))

In [17]:
# Class balance of the label column (positives are a small minority).
train.redemption_status.value_counts()

0    77640
1      729
Name: redemption_status, dtype: int64

In [18]:
# Split the label off the feature frame (in place, so `train` keeps its identity).
# Guarded so that re-running this cell after the column has already been removed
# is a no-op instead of a KeyError; a genuinely missing label still fails loudly.
if 'redemption_status' in train.columns:
    target = train.pop('redemption_status')
elif 'target' not in globals():
    raise KeyError("'redemption_status' column not found in `train`")

In [19]:
# Hyper-parameters forwarded as keyword arguments to LGBMClassifier in run_LGB.
# NOTE(review): the recorded training log says "don't improve for 1000 rounds",
#   so `early_stopping_round` here appears to take precedence over the
#   early_stopping_rounds=200 passed to fit() in run_LGB -- confirm which is intended.
# NOTE(review): `subsample`/`bagging_fraction` and `colsample_bytree`/
#   `feature_fraction` carry identical values and are presumably aliases of one
#   another in LightGBM -- verify against the LightGBM parameter docs.
lgb_params = dict(
    metric='auc',
    objective='binary',
    early_stopping_round=1000,
    bagging_fraction=0.4,
    colsample_bytree=0.4,
    feature_fraction=0.4,
    lambda_l1=0.0,
    lambda_l2=0.0,
    learning_rate=0.001,
    max_depth=15,
    min_child_samples=100,
    min_child_weight=50,
    min_split_gain=0.0010640577871197205,
    n_estimators=1520,
    num_leaves=196,
    subsample=0.4,
    is_unbalance=True,
)


In [20]:
# Hyper-parameters forwarded as keyword arguments to XGBClassifier in run_XGB.
# No `seed` entry here: do_bag injects one per bagging run.
# `scale_pos_weight` is close to the negative/positive ratio of the label
# (77640 / 729) shown by the value_counts() output above.
# NOTE(review): `eta` and `learning_rate` both look like step-size settings but
#   hold different values -- confirm which one XGBoost actually honours.
# NOTE(review): `num_leaves` looks like a LightGBM-style name -- verify that
#   XGBoost uses it at all.
xgb_params = dict(
    booster='gbtree',
    eta=0.1,
    nthread=4,
    silent=True,
    scale_pos_weight=106,
    colsample_bytree=0.547582701158404,
    gamma=7.99661203221007,
    learning_rate=0.036024836896883045,
    max_delta_step=9.545974437155495,
    max_depth=4,
    min_child_weight=3.9108711107457483,
    n_estimators=1015,
    num_leaves=100,
    reg_alpha=6.095828143522551,
    reg_lambda=2.9017833667247137,
    subsample=0.9458396927859323,
)

In [21]:
def run_XGB(params,trainX, targetX, testX, seed = 0):
    """Train XGBoost with 5-fold stratified CV and return fold-averaged test predictions.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded to ``XGBClassifier``.
    trainX : pandas.DataFrame
        Training features; rows are selected positionally with ``.iloc``.
    targetX : array-like
        Labels aligned row-for-row with ``trainX``. Converted to a NumPy array
        so folds are selected by position regardless of any pandas index.
    testX : pandas.DataFrame
        Rows to score with every fold model.
    seed : int
        ``random_state`` of the shuffled ``StratifiedKFold`` split.

    Returns
    -------
    numpy.ndarray
        Mean over the 5 fold models of the last ``predict_proba`` column on ``testX``.

    Notes
    -----
    The held-out fold is also the early-stopping ``eval_set``, so the AUC
    printed per fold is measured on data that influenced where training stopped.
    """
    # Positional labels: avoids label-based Series lookups going wrong when the
    # target does not carry a default 0..n-1 index.
    labels = np.asarray(targetX)
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    fold_test_preds = []
    for train_index, valid_index in splitter.split(trainX, labels):
        x_train, x_valid = trainX.iloc[train_index], trainX.iloc[valid_index]
        y_train, y_valid = labels[train_index], labels[valid_index]
        m = XGBClassifier(**params, random_state=1993)
        m.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], eval_metric='auc',
              early_stopping_rounds=200, verbose=200)
        valid_preds = m.predict_proba(x_valid)[:, -1]
        print("err: ", roc_auc_score(y_valid, valid_preds))
        fold_test_preds.append(m.predict_proba(testX)[:, -1])
    return np.mean(fold_test_preds, 0)

In [22]:
def run_LGB(params, trainX, targetX, testX, seed = 0):
    """Train LightGBM with 5-fold stratified CV and return fold-averaged test predictions.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded to ``LGBMClassifier``.
    trainX : pandas.DataFrame
        Training features; rows are selected positionally with ``.iloc``.
    targetX : array-like
        Labels aligned row-for-row with ``trainX``. Converted to a NumPy array
        so folds are selected by position regardless of any pandas index.
    testX : pandas.DataFrame
        Rows to score with every fold model.
    seed : int
        ``random_state`` of the shuffled ``StratifiedKFold`` split.

    Returns
    -------
    numpy.ndarray
        Mean over the 5 fold models of the last ``predict_proba`` column on ``testX``.

    Notes
    -----
    The held-out fold is also the early-stopping ``eval_set``, so the AUC
    printed per fold is measured on data that influenced where training stopped.
    """
    # Positional labels: avoids label-based Series lookups going wrong when the
    # target does not carry a default 0..n-1 index.
    labels = np.asarray(targetX)
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    fold_test_preds = []
    for train_index, valid_index in splitter.split(trainX, labels):
        x_train, x_valid = trainX.iloc[train_index], trainX.iloc[valid_index]
        y_train, y_valid = labels[train_index], labels[valid_index]
        m = LGBMClassifier(**params, random_state=1993)
        m.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], eval_metric='auc',
              early_stopping_rounds=200, verbose=200)
        valid_preds = m.predict_proba(x_valid)[:, -1]
        print("err: ", roc_auc_score(y_valid, valid_preds))
        fold_test_preds.append(m.predict_proba(testX)[:, -1])
    return np.mean(fold_test_preds, 0)

In [23]:
def do_bag(train, test,target, params, seeds, model_type = "LGB"):
    """Seed-bag a CV model: run it once per seed and average the test predictions.

    Parameters
    ----------
    train, test : passed through to ``run_LGB`` / ``run_XGB`` as the training
        and scoring feature tables.
    target : labels for ``train``, passed through unchanged.
    params : dict
        Base model parameters. The dict is copied for every run, so the
        caller's object is never modified.
    seeds : iterable of int
        One full CV run is trained per seed.
    model_type : {"LGB", "XGB"}
        Which runner to use.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the per-seed prediction vectors.

    Raises
    ------
    ValueError
        If ``model_type`` is neither ``"LGB"`` nor ``"XGB"``.
    """
    if model_type not in ("LGB", "XGB"):
        raise ValueError("`model_type` must be either `LGB` or `XGB`")
    preds = []
    for i, seed in enumerate(seeds):
        print("#" * 18)
        print(f"RUN - {i+1} , SEED - {seed}")
        print("#" * 18)

        # Seed every global RNG the run might touch.
        np.random.seed(seed)
        random.seed(seed)
        # NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so this
        # presumably only affects processes spawned after this point.
        os.environ["PYTHONHASHSEED"]=str(seed)

        # Work on a per-run copy so seed keys do not leak into the caller's dict.
        run_params = dict(params)
        if model_type == 'LGB':
            run_params.update({
                "feature_fraction_seed" : seed,
                "bagging_fraction_seed" : seed
            })
            p = run_LGB(run_params, train, target, test, seed=seed)
        else:
            run_params.update({"seed" : seed})
            p = run_XGB(run_params, train, target, test, seed=seed)
        preds.append(p)
    return np.mean(preds,0)

In [30]:
# One full CV run is trained per seed (see do_bag); the LightGBM and XGBoost
# bags use separate seed lists.
lgb_seed = [
    2019, 2031, 90,
    192, 83123, 5601,
]
xgb_seed = [
    2119, 1031, 190,
    13192, 23123, 5603,
]

In [31]:
# `os` and `random` (used inside do_bag) are imported in the notebook's top
# import cell rather than here, so do_bag no longer depends on this cell
# having been executed first.

# Bag each model family over its own seed list, then blend the two averaged
# prediction vectors with fixed weights (0.6 LightGBM, 0.4 XGBoost).
p1 = do_bag(train, test, target, lgb_params, lgb_seed)
p2 = do_bag(train, test, target, xgb_params, xgb_seed, model_type = "XGB")
final_preds = 0.6 * p1 + 0.4 * p2 

##################
RUN - 1 , SEED - 2019
##################




Training until validation scores don't improve for 1000 rounds.
[200]	valid_0's auc: 0.930758
[400]	valid_0's auc: 0.933549
[600]	valid_0's auc: 0.933994
[800]	valid_0's auc: 0.934185
[1000]	valid_0's auc: 0.934062
[1200]	valid_0's auc: 0.933868
[1400]	valid_0's auc: 0.933679
Did not meet early stopping. Best iteration is:
[859]	valid_0's auc: 0.934222
err:  0.9342215652855116




Training until validation scores don't improve for 1000 rounds.
[200]	valid_0's auc: 0.942513
[400]	valid_0's auc: 0.945359
[600]	valid_0's auc: 0.94627
[800]	valid_0's auc: 0.946931
[1000]	valid_0's auc: 0.947289
[1200]	valid_0's auc: 0.947454
[1400]	valid_0's auc: 0.947552
Did not meet early stopping. Best iteration is:
[1519]	valid_0's auc: 0.947631
err:  0.9476310579915732




Training until validation scores don't improve for 1000 rounds.
[200]	valid_0's auc: 0.945328
[400]	valid_0's auc: 0.946446
[600]	valid_0's auc: 0.947238
[800]	valid_0's auc: 0.947495
[1000]	valid_0's auc: 0.94759
[1200]	valid_0's auc: 0.947731
[1400]	valid_0's auc: 0.947758
Did not meet early stopping. Best iteration is:
[1520]	valid_0's auc: 0.947923
err:  0.9479228419893715




Training until validation scores don't improve for 1000 rounds.
[200]	valid_0's auc: 0.922017
[400]	valid_0's auc: 0.925762
[600]	valid_0's auc: 0.927242
[800]	valid_0's auc: 0.928091
[1000]	valid_0's auc: 0.928659
[1200]	valid_0's auc: 0.929215
[1400]	valid_0's auc: 0.929644
Did not meet early stopping. Best iteration is:
[1520]	valid_0's auc: 0.929858
err:  0.929858479247387




Training until validation scores don't improve for 1000 rounds.
[200]	valid_0's auc: 0.940304
[400]	valid_0's auc: 0.942384
[600]	valid_0's auc: 0.943
[800]	valid_0's auc: 0.943154
[1000]	valid_0's auc: 0.94331
[1200]	valid_0's auc: 0.943477
[1400]	valid_0's auc: 0.943449
Did not meet early stopping. Best iteration is:
[1509]	valid_0's auc: 0.943602
err:  0.9436022135763649
##################
RUN - 2 , SEED - 2031
##################




Training until validation scores don't improve for 1000 rounds.
[200]	valid_0's auc: 0.936105
[400]	valid_0's auc: 0.93897
[600]	valid_0's auc: 0.939343
[800]	valid_0's auc: 0.939799
[1000]	valid_0's auc: 0.940259
[1200]	valid_0's auc: 0.940556
[1400]	valid_0's auc: 0.941188
Did not meet early stopping. Best iteration is:
[1500]	valid_0's auc: 0.941403
err:  0.9414030244966229




Training until validation scores don't improve for 1000 rounds.
[200]	valid_0's auc: 0.92791
[400]	valid_0's auc: 0.930782
[600]	valid_0's auc: 0.931855
[800]	valid_0's auc: 0.932759
[1000]	valid_0's auc: 0.93355
[1200]	valid_0's auc: 0.934009
[1400]	valid_0's auc: 0.934508
Did not meet early stopping. Best iteration is:
[1520]	valid_0's auc: 0.934801
err:  0.9348011634307976




Training until validation scores don't improve for 1000 rounds.
[200]	valid_0's auc: 0.931875
[400]	valid_0's auc: 0.933088
[600]	valid_0's auc: 0.933581
[800]	valid_0's auc: 0.933983
[1000]	valid_0's auc: 0.934076
[1200]	valid_0's auc: 0.934295
[1400]	valid_0's auc: 0.934378
Did not meet early stopping. Best iteration is:
[1505]	valid_0's auc: 0.934507
err:  0.9345071739606049




Training until validation scores don't improve for 1000 rounds.
[200]	valid_0's auc: 0.938043
[400]	valid_0's auc: 0.93939
[600]	valid_0's auc: 0.939153
[800]	valid_0's auc: 0.939543
[1000]	valid_0's auc: 0.939868
[1200]	valid_0's auc: 0.940154
[1400]	valid_0's auc: 0.940392
Did not meet early stopping. Best iteration is:
[1490]	valid_0's auc: 0.940549
err:  0.9405490655854558




Training until validation scores don't improve for 1000 rounds.
[200]	valid_0's auc: 0.938523
[400]	valid_0's auc: 0.941165
[600]	valid_0's auc: 0.941778
[800]	valid_0's auc: 0.942154
[1000]	valid_0's auc: 0.942299
[1200]	valid_0's auc: 0.942325
[1400]	valid_0's auc: 0.942346
Did not meet early stopping. Best iteration is:
[1520]	valid_0's auc: 0.942467
err:  0.9424672227255769
##################
RUN - 3 , SEED - 90
##################




Training until validation scores don't improve for 1000 rounds.
[200]	valid_0's auc: 0.922991
[400]	valid_0's auc: 0.924965
[600]	valid_0's auc: 0.926113
[800]	valid_0's auc: 0.926628
[1000]	valid_0's auc: 0.927183
[1200]	valid_0's auc: 0.927653
[1400]	valid_0's auc: 0.928054
Did not meet early stopping. Best iteration is:
[1519]	valid_0's auc: 0.928168
err:  0.9281682052042091




Training until validation scores don't improve for 1000 rounds.
[200]	valid_0's auc: 0.939149
[400]	valid_0's auc: 0.941963
[600]	valid_0's auc: 0.942759
[800]	valid_0's auc: 0.943371
[1000]	valid_0's auc: 0.943698
[1200]	valid_0's auc: 0.943878
[1400]	valid_0's auc: 0.943981
Did not meet early stopping. Best iteration is:
[1323]	valid_0's auc: 0.94413
err:  0.9441298705652361




Training until validation scores don't improve for 1000 rounds.
[200]	valid_0's auc: 0.931359
[400]	valid_0's auc: 0.936643
[600]	valid_0's auc: 0.938166
[800]	valid_0's auc: 0.938959
[1000]	valid_0's auc: 0.939496
[1200]	valid_0's auc: 0.9398
[1400]	valid_0's auc: 0.940128
Did not meet early stopping. Best iteration is:
[1509]	valid_0's auc: 0.940356
err:  0.9403560867509333




Training until validation scores don't improve for 1000 rounds.
[200]	valid_0's auc: 0.939227
[400]	valid_0's auc: 0.941712
[600]	valid_0's auc: 0.942737
[800]	valid_0's auc: 0.943314
[1000]	valid_0's auc: 0.943876
[1200]	valid_0's auc: 0.944241
[1400]	valid_0's auc: 0.944639
Did not meet early stopping. Best iteration is:
[1505]	valid_0's auc: 0.944765
err:  0.944765267162104




Training until validation scores don't improve for 1000 rounds.
[200]	valid_0's auc: 0.94173
[400]	valid_0's auc: 0.943102
[600]	valid_0's auc: 0.94344
[800]	valid_0's auc: 0.943862
[1000]	valid_0's auc: 0.943927
[1200]	valid_0's auc: 0.943982
[1400]	valid_0's auc: 0.944066
Did not meet early stopping. Best iteration is:
[1493]	valid_0's auc: 0.944198
err:  0.9441982447725132
##################
RUN - 4 , SEED - 192
##################




Training until validation scores don't improve for 1000 rounds.
[200]	valid_0's auc: 0.934336
[400]	valid_0's auc: 0.936983
[600]	valid_0's auc: 0.937731
[800]	valid_0's auc: 0.938482
[1000]	valid_0's auc: 0.938984
[1200]	valid_0's auc: 0.939529
[1400]	valid_0's auc: 0.939936
Did not meet early stopping. Best iteration is:
[1518]	valid_0's auc: 0.940235
err:  0.9402347857692335




Training until validation scores don't improve for 1000 rounds.
[200]	valid_0's auc: 0.936461
[400]	valid_0's auc: 0.937882
[600]	valid_0's auc: 0.937995
[800]	valid_0's auc: 0.938033
[1000]	valid_0's auc: 0.938241
[1200]	valid_0's auc: 0.938483
[1400]	valid_0's auc: 0.93842
Did not meet early stopping. Best iteration is:
[1140]	valid_0's auc: 0.938579
err:  0.9385789170954104




Training until validation scores don't improve for 1000 rounds.
[200]	valid_0's auc: 0.932657
[400]	valid_0's auc: 0.935112
[600]	valid_0's auc: 0.935731
[800]	valid_0's auc: 0.935823
[1000]	valid_0's auc: 0.936127
[1200]	valid_0's auc: 0.935935
[1400]	valid_0's auc: 0.935974
Did not meet early stopping. Best iteration is:
[1015]	valid_0's auc: 0.9362
err:  0.9362000945706562




Training until validation scores don't improve for 1000 rounds.
[200]	valid_0's auc: 0.937683
[400]	valid_0's auc: 0.940625
[600]	valid_0's auc: 0.941695
[800]	valid_0's auc: 0.942227
[1000]	valid_0's auc: 0.942951
[1200]	valid_0's auc: 0.943192
[1400]	valid_0's auc: 0.943391
Did not meet early stopping. Best iteration is:
[1512]	valid_0's auc: 0.943522
err:  0.9435218218260605




Training until validation scores don't improve for 1000 rounds.
[200]	valid_0's auc: 0.938872
[400]	valid_0's auc: 0.942011
[600]	valid_0's auc: 0.943525
[800]	valid_0's auc: 0.943988
[1000]	valid_0's auc: 0.944319
[1200]	valid_0's auc: 0.944317
[1400]	valid_0's auc: 0.944329
Did not meet early stopping. Best iteration is:
[1515]	valid_0's auc: 0.944443
err:  0.944442741921157
##################
RUN - 5 , SEED - 83123
##################




Training until validation scores don't improve for 1000 rounds.
[200]	valid_0's auc: 0.938097
[400]	valid_0's auc: 0.93971
[600]	valid_0's auc: 0.940703
[800]	valid_0's auc: 0.941166
[1000]	valid_0's auc: 0.94136
[1200]	valid_0's auc: 0.941789
[1400]	valid_0's auc: 0.94204
Did not meet early stopping. Best iteration is:
[1503]	valid_0's auc: 0.942258
err:  0.9422583066912268




Training until validation scores don't improve for 1000 rounds.
[200]	valid_0's auc: 0.92164
[400]	valid_0's auc: 0.924653
[600]	valid_0's auc: 0.925779
[800]	valid_0's auc: 0.926348
[1000]	valid_0's auc: 0.926933
[1200]	valid_0's auc: 0.927437
[1400]	valid_0's auc: 0.927846
Did not meet early stopping. Best iteration is:
[1496]	valid_0's auc: 0.928137
err:  0.928137108043446




Training until validation scores don't improve for 1000 rounds.
[200]	valid_0's auc: 0.930818
[400]	valid_0's auc: 0.931671
[600]	valid_0's auc: 0.931603
[800]	valid_0's auc: 0.931667
[1000]	valid_0's auc: 0.931689
[1200]	valid_0's auc: 0.931944
[1400]	valid_0's auc: 0.931966
Did not meet early stopping. Best iteration is:
[1519]	valid_0's auc: 0.932185
err:  0.9321848115291511




Training until validation scores don't improve for 1000 rounds.
[200]	valid_0's auc: 0.940153
[400]	valid_0's auc: 0.942778
[600]	valid_0's auc: 0.943609
[800]	valid_0's auc: 0.94364
[1000]	valid_0's auc: 0.943933
[1200]	valid_0's auc: 0.944011
[1400]	valid_0's auc: 0.944025
Did not meet early stopping. Best iteration is:
[1518]	valid_0's auc: 0.944118
err:  0.9441177404670661




Training until validation scores don't improve for 1000 rounds.
[200]	valid_0's auc: 0.944264
[400]	valid_0's auc: 0.948354
[600]	valid_0's auc: 0.949947
[800]	valid_0's auc: 0.950515
[1000]	valid_0's auc: 0.950917
[1200]	valid_0's auc: 0.951486
[1400]	valid_0's auc: 0.95169
Did not meet early stopping. Best iteration is:
[1495]	valid_0's auc: 0.951823
err:  0.9518227362362095
##################
RUN - 6 , SEED - 5601
##################




Training until validation scores don't improve for 1000 rounds.
[200]	valid_0's auc: 0.938751
[400]	valid_0's auc: 0.941483
[600]	valid_0's auc: 0.942708
[800]	valid_0's auc: 0.943393
[1000]	valid_0's auc: 0.943789
[1200]	valid_0's auc: 0.943947
[1400]	valid_0's auc: 0.944297
Did not meet early stopping. Best iteration is:
[1388]	valid_0's auc: 0.944302
err:  0.944301676864771




Training until validation scores don't improve for 1000 rounds.
[200]	valid_0's auc: 0.934643
[400]	valid_0's auc: 0.93757
[600]	valid_0's auc: 0.938081
[800]	valid_0's auc: 0.938424
[1000]	valid_0's auc: 0.938836
[1200]	valid_0's auc: 0.939138
[1400]	valid_0's auc: 0.939505
Did not meet early stopping. Best iteration is:
[1519]	valid_0's auc: 0.939687
err:  0.9396869464264289




Training until validation scores don't improve for 1000 rounds.
[200]	valid_0's auc: 0.933482
[400]	valid_0's auc: 0.935987
[600]	valid_0's auc: 0.936927
[800]	valid_0's auc: 0.937594
[1000]	valid_0's auc: 0.937967
[1200]	valid_0's auc: 0.93837
[1400]	valid_0's auc: 0.938713
Did not meet early stopping. Best iteration is:
[1520]	valid_0's auc: 0.938851
err:  0.9388506312944183




Training until validation scores don't improve for 1000 rounds.
[200]	valid_0's auc: 0.935889
[400]	valid_0's auc: 0.939743
[600]	valid_0's auc: 0.94089
[800]	valid_0's auc: 0.941113
[1000]	valid_0's auc: 0.941578
[1200]	valid_0's auc: 0.941727
[1400]	valid_0's auc: 0.941993
Did not meet early stopping. Best iteration is:
[1507]	valid_0's auc: 0.942234
err:  0.9422338259476473




Training until validation scores don't improve for 1000 rounds.
[200]	valid_0's auc: 0.934852
[400]	valid_0's auc: 0.937726
[600]	valid_0's auc: 0.938904
[800]	valid_0's auc: 0.939581
[1000]	valid_0's auc: 0.940486
[1200]	valid_0's auc: 0.940942
[1400]	valid_0's auc: 0.941524
Did not meet early stopping. Best iteration is:
[1518]	valid_0's auc: 0.941927
err:  0.9419267085931532
##################
RUN - 1 , SEED - 2119
##################
[0]	validation_0-auc:0.877646
Will train until validation_0-auc hasn't improved in 200 rounds.
[200]	validation_0-auc:0.942691
[400]	validation_0-auc:0.94561
[600]	validation_0-auc:0.945545
[800]	validation_0-auc:0.94664
[1000]	validation_0-auc:0.947582
[1014]	validation_0-auc:0.947616
err:  0.9476385565977148
[0]	validation_0-auc:0.856305
Will train until validation_0-auc hasn't improved in 200 rounds.
[200]	validation_0-auc:0.915408
[400]	validation_0-auc:0.915006
Stopping. Best iteration:
[268]	validation_0-auc:0.917035

err:  0.9170352011037948
[0]	

err:  0.9264009601744616
[0]	validation_0-auc:0.904574
Will train until validation_0-auc hasn't improved in 200 rounds.
[200]	validation_0-auc:0.951067
[400]	validation_0-auc:0.953764
[600]	validation_0-auc:0.95448
[800]	validation_0-auc:0.954807
Stopping. Best iteration:
[647]	validation_0-auc:0.955043

err:  0.9550434301623933
[0]	validation_0-auc:0.877417
Will train until validation_0-auc hasn't improved in 200 rounds.
[200]	validation_0-auc:0.933727
[400]	validation_0-auc:0.935183
Stopping. Best iteration:
[294]	validation_0-auc:0.935753

err:  0.9357532658635219
[0]	validation_0-auc:0.857741
Will train until validation_0-auc hasn't improved in 200 rounds.
[200]	validation_0-auc:0.930139
[400]	validation_0-auc:0.932267
Stopping. Best iteration:
[370]	validation_0-auc:0.933037

err:  0.933037226609642
[0]	validation_0-auc:0.860669
Will train until validation_0-auc hasn't improved in 200 rounds.
[200]	validation_0-auc:0.933493
[400]	validation_0-auc:0.937399
[600]	validation_0-auc:0.

In [32]:
# Assemble the submission frame: one row per test id with its blended score.
submit = pd.DataFrame({'id': test_id, 'redemption_status': final_preds})
submit.head()

Unnamed: 0,id,redemption_status
0,3,0.65768
1,4,0.048866
2,5,0.599021
3,8,0.006927
4,10,0.010786


In [33]:
# Write only the `id` and `redemption_status` columns; without index=False the
# frame's row index would be saved as an extra unnamed leading column.
submit.to_csv("bagged.csv", index=False)