In [30]:
import numpy as np
import pandas as pd
import os, sys, gc, warnings, random, datetime

from sklearn import metrics
from sklearn.preprocessing import LabelEncoder


import lightgbm as lgb
from sklearn.metrics import roc_auc_score

import math
warnings.filterwarnings('ignore')
from sklearn.model_selection import KFold

In [2]:
train=pd.read_csv('train_final2.csv')
test=pd.read_csv('test_final2.csv')

In [3]:
def seed_everything(seed=0):
    random.seed(seed)
    np.random.seed(seed)

In [4]:
SEED = 42
seed_everything(SEED)
TARGET = 'isFraud'  
print(train.shape, test.shape)

(590540, 603) (506691, 602)


In [17]:
X = train.sort_values('TransactionDT').drop(['isFraud', 'TransactionDT','TransactionID','uid','uid2','bank_type'], axis=1)
y = train.sort_values('TransactionDT')['isFraud']

X_test = test.drop(['TransactionDT','TransactionID','uid','uid2','bank_type'], axis=1)


179

In [18]:
obj_cols = X.dtypes
obj_cols[obj_cols=='object']

M1       object
id_35    object
dtype: object

In [19]:
f = lambda x: 1 if x==True else 0 if x==False else x

In [20]:
X[['M1','id_35']]=X[['M1','id_35']].applymap(f)

In [21]:
X_test[['M1','id_35']]=X_test[['M1','id_35']].applymap(f)

In [22]:
X.shape

(590540, 597)

In [18]:
params = {
                    'objective':'binary',
                    'boosting_type':'gbdt',
                    'metric':'auc',
                    'n_jobs':-1,
                    'learning_rate':0.01,
                    'num_leaves': 2**8,
                    'max_depth':-1,
                    'tree_learner':'serial',
                    'colsample_bytree': 0.7,
                    'subsample_freq':1,
                    'subsample':0.7,
                    'max_bin':255,
                    'verbose':-1,
                    'seed': SEED,
                } 

In [29]:
%%time

# Kfold oof - Out Of Fold
# as we will use one fold as validation
# and stop training when validation metric
# stops improve
#shuffle=True : https://www.linkedin.com/pulse/data-shuffling-why-important-machine-learning-how-do-deepak-n-gowda 
NFOLDS = 5
folds = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)

columns = X.columns
splits = folds.split(X, y)
y_preds = np.zeros(X_test.shape[0])
y_oof = np.zeros(X.shape[0])
score = 0

feature_importances = pd.DataFrame()
feature_importances['feature'] = columns
  
for fold_n, (train_index, valid_index) in enumerate(splits):
    X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
    
    dtrain = lgb.Dataset(X_train, label=y_train)
    dvalid = lgb.Dataset(X_valid, label=y_valid)

    clf = lgb.train(params, dtrain, 10000, valid_sets = [dtrain, dvalid], verbose_eval=200, early_stopping_rounds=500)
    
    feature_importances[f'fold_{fold_n + 1}'] = clf.feature_importance()
    
    y_pred_valid = clf.predict(X_valid)
    y_oof[valid_index] = y_pred_valid
    print(f"Fold {fold_n + 1} | AUC: {roc_auc_score(y_valid, y_pred_valid)}")
    
    score += roc_auc_score(y_valid, y_pred_valid) / NFOLDS
    y_preds += clf.predict(X_test) / NFOLDS
    
    del X_train, X_valid, y_train, y_valid
    gc.collect()
    
print(f"\nMean AUC = {score}")
print(f"Out of folds AUC = {roc_auc_score(y, y_oof)}")

Training until validation scores don't improve for 500 rounds.
[200]	training's auc: 0.963606	valid_1's auc: 0.942321
[400]	training's auc: 0.989132	valid_1's auc: 0.960787
[600]	training's auc: 0.99617	valid_1's auc: 0.96748
[800]	training's auc: 0.998536	valid_1's auc: 0.971005
[1000]	training's auc: 0.999427	valid_1's auc: 0.972926
[1200]	training's auc: 0.999786	valid_1's auc: 0.974152
[1400]	training's auc: 0.999921	valid_1's auc: 0.975162
[1600]	training's auc: 0.999974	valid_1's auc: 0.975887
[1800]	training's auc: 0.999992	valid_1's auc: 0.976428
[2000]	training's auc: 0.999998	valid_1's auc: 0.976831
[2200]	training's auc: 0.999999	valid_1's auc: 0.977159
[2400]	training's auc: 1	valid_1's auc: 0.977391
[2600]	training's auc: 1	valid_1's auc: 0.97757
[2800]	training's auc: 1	valid_1's auc: 0.977815
[3000]	training's auc: 1	valid_1's auc: 0.977866
[3200]	training's auc: 1	valid_1's auc: 0.977954
[3400]	training's auc: 1	valid_1's auc: 0.977984
[3600]	training's auc: 1	valid_1's

In [None]:
sub = pd.read_csv('sample_submission.csv')
sub['isFraud'] = y_preds
sub.to_csv("lgb_kfold.csv", index=False)
## 1st: Training score goes upto 1 and it's not normal situation
## we definitely should stop before 
## 2nd: Our LB probing gave 0.9442 and it is too far away from validation score
## some difference is normal, but such gap is too big

In [31]:
%%time
from sklearn.model_selection import StratifiedKFold
print('#'*20)
print('StratifiedKFold training...')
# Same as normal kfold but we can be sure
# that our target is perfectly distribuited
# over folds
NFOLDS = 5
folds = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

columns = X.columns
splits = folds.split(X, y)
y_preds = np.zeros(X_test.shape[0])
y_oof = np.zeros(X.shape[0])
score = 0

feature_importances = pd.DataFrame()
feature_importances['feature'] = columns
  
for fold_n, (train_index, valid_index) in enumerate(splits):
    X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
    
    dtrain = lgb.Dataset(X_train, label=y_train)
    dvalid = lgb.Dataset(X_valid, label=y_valid)

    clf = lgb.train(params, dtrain, 10000, valid_sets = [dtrain, dvalid], verbose_eval=200, early_stopping_rounds=500)
    
    feature_importances[f'fold_{fold_n + 1}'] = clf.feature_importance()
    
    y_pred_valid = clf.predict(X_valid)
    y_oof[valid_index] = y_pred_valid
    print(f"Fold {fold_n + 1} | AUC: {roc_auc_score(y_valid, y_pred_valid)}")
    
    score += roc_auc_score(y_valid, y_pred_valid) / NFOLDS
    y_preds += clf.predict(X_test) / NFOLDS
    
    del X_train, X_valid, y_train, y_valid
    gc.collect()
    
print(f"\nMean AUC = {score}")
print(f"Out of folds AUC = {roc_auc_score(y, y_oof)}")

####################
StratifiedKFold training...
Training until validation scores don't improve for 500 rounds.
[200]	training's auc: 0.964852	valid_1's auc: 0.936883
[400]	training's auc: 0.989432	valid_1's auc: 0.956731
[600]	training's auc: 0.996356	valid_1's auc: 0.963665
[800]	training's auc: 0.998636	valid_1's auc: 0.967067
[1000]	training's auc: 0.999461	valid_1's auc: 0.969175
[1200]	training's auc: 0.9998	valid_1's auc: 0.970594
[1400]	training's auc: 0.999927	valid_1's auc: 0.97152
[1600]	training's auc: 0.999976	valid_1's auc: 0.972292
[1800]	training's auc: 0.999993	valid_1's auc: 0.972776
[2000]	training's auc: 0.999998	valid_1's auc: 0.97325
[2200]	training's auc: 1	valid_1's auc: 0.973603
[2400]	training's auc: 1	valid_1's auc: 0.973865
[2600]	training's auc: 1	valid_1's auc: 0.974113
[2800]	training's auc: 1	valid_1's auc: 0.974239
[3000]	training's auc: 1	valid_1's auc: 0.974469
[3200]	training's auc: 1	valid_1's auc: 0.974497
[3400]	training's auc: 1	valid_1's auc: 0.

In [32]:
sub = pd.read_csv('sample_submission.csv')
sub['isFraud'] = y_preds
sub.to_csv("lgb_Strat.csv", index=False) #0.9445
## We have same "problems" here as in normal kfold
## 1st: Training score goes upto 1 and it's not normal situation
## we definitely should stop before 
## 2nd: Our LB probing gave 0.9445 and it is too far away from validation score
## some difference is normal, but such gap is too big

In [33]:
#LBO (last block out)
#For Time series data (what we have here) we can use (sometimes) last Time block as validation subset 
# and track mean early stopping round.

In [5]:
f = lambda x: 1 if x==True else 0 if x==False else x
train[['M1','id_35']]=train[['M1','id_35']].applymap(f)
test[['M1','id_35']]=test[['M1','id_35']].applymap(f)

In [6]:
train=train.drop(['TransactionID','uid','uid2','bank_type'], axis=1)

In [8]:
START_DATE = datetime.datetime.strptime('2017-11-30', '%Y-%m-%d')

In [9]:
## We need Divide Train Set by Time blocks
## Convert TransactionDT to Months
## And use last month as Validation
train['DT_M'] = train['TransactionDT'].apply(lambda x: (START_DATE + datetime.timedelta(seconds = x)))
train['DT_M'] = (train['DT_M'].dt.year-2017)*12 + train['DT_M'].dt.month 

main_train_set = train[train['DT_M']<(train['DT_M'].max())].reset_index(drop=True)
validation_set = train[train['DT_M']==train['DT_M'].max()].reset_index(drop=True)

In [10]:
# Main Data
X= main_train_set.drop(['isFraud', 'TransactionDT'], axis=1)
y = main_train_set['isFraud']
# Validation Data
v_X=validation_set.drop(['isFraud', 'TransactionDT'], axis=1)
v_y=validation_set['isFraud']
test['DT_M'] = test['TransactionDT'].apply(lambda x: (START_DATE + datetime.timedelta(seconds = x)))
test['DT_M'] = (test['DT_M'].dt.year-2017)*12 + test['DT_M'].dt.month 
X_test = test.drop(['TransactionDT','TransactionID','uid','uid2','bank_type'], axis=1)

In [11]:
X.shape

(501214, 598)

In [12]:
v_X.shape

(89326, 598)

In [13]:
X_test.shape

(506691, 598)

In [20]:
NFOLDS = 5
folds = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)

estimators_bestround = []
y_preds = np.zeros(X_test.shape[0])
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
    print('Fold:',fold_+1)
    tr_x, tr_y = X.iloc[trn_idx,:], y[trn_idx]    
    train_data = lgb.Dataset(tr_x, label=tr_y)
    valid_data = lgb.Dataset(v_X, label=v_y)  

    estimator = lgb.train(
            params,
            train_data,
            10000,
            valid_sets = [train_data, valid_data],
            verbose_eval=200, early_stopping_rounds=500
        )
    estimators_bestround.append(estimator.current_iteration())
    y_preds += estimator.predict(X_test) / NFOLDS
    del estimator,tr_x, tr_y, train_data,valid_data
    gc.collect()

Fold: 1
Training until validation scores don't improve for 500 rounds.
[200]	training's auc: 0.968971	valid_1's auc: 0.912174
[400]	training's auc: 0.99219	valid_1's auc: 0.928999
[600]	training's auc: 0.997864	valid_1's auc: 0.933664
[800]	training's auc: 0.999347	valid_1's auc: 0.934857
[1000]	training's auc: 0.999802	valid_1's auc: 0.935005
[1200]	training's auc: 0.999944	valid_1's auc: 0.935198
[1400]	training's auc: 0.999985	valid_1's auc: 0.935251
[1600]	training's auc: 0.999997	valid_1's auc: 0.934984
Early stopping, best iteration is:
[1268]	training's auc: 0.999964	valid_1's auc: 0.935446
Fold: 2
Training until validation scores don't improve for 500 rounds.
[200]	training's auc: 0.967858	valid_1's auc: 0.912896
[400]	training's auc: 0.992214	valid_1's auc: 0.929305
[600]	training's auc: 0.997732	valid_1's auc: 0.935173
[800]	training's auc: 0.999281	valid_1's auc: 0.937163
[1000]	training's auc: 0.999771	valid_1's auc: 0.938183
[1200]	training's auc: 0.999932	valid_1's auc: 0

In [None]:
sub = pd.read_csv('sample_submission.csv')
sub['isFraud'] = y_preds
sub.to_csv("lgb_lbo.csv", index=False) #0.9387

In [37]:
corrected_lgb_params = params.copy()

In [38]:
corrected_lgb_params['n_estimators'] = int(np.mean(estimators_bestround))
corrected_lgb_params['early_stopping_rounds'] = None
print('#'*10)
print('Mean Best round:', corrected_lgb_params['n_estimators'])

##########
Mean Best round: 1601


In [42]:
X= train.drop(['isFraud', 'TransactionDT','groups'], axis=1)
y = train['isFraud']

In [43]:
X.shape

(590540, 598)

In [44]:
corrected_lgb_params

{'boosting_type': 'gbdt',
 'colsample_bytree': 0.7,
 'early_stopping_rounds': None,
 'learning_rate': 0.01,
 'max_bin': 255,
 'max_depth': -1,
 'metric': 'auc',
 'n_estimators': 1601,
 'n_jobs': -1,
 'num_leaves': 256,
 'objective': 'binary',
 'seed': 42,
 'subsample': 0.7,
 'subsample_freq': 1,
 'tree_learner': 'serial',
 'verbose': -1}

In [45]:
NUMBER_OF_MODELS = 3
y_preds = np.zeros(X_test.shape[0])
for current_model in range(NUMBER_OF_MODELS):
    print('Model:',current_model+1)
    SEED += 1
    seed_everything(SEED)    
    train_data = lgb.Dataset(X, label=y)

    estimator = lgb.train(
            corrected_lgb_params,
            train_data
        )
    
    y_preds += estimator.predict(X_test) / NUMBER_OF_MODELS

Model: 1
Model: 2
Model: 3


In [46]:
sub = pd.read_csv('sample_submission.csv')
sub['isFraud'] = y_preds
sub.to_csv("lgb_lbo2.csv", index=False)  #0.9455

In [22]:
#GroupKFold
#The folds are approximately balanced in the sense that the number of distinct groups is approximately the same in each fold.

#Why we may use it? Let's imagine that we want to separate train data by time blocks groups or client IDs or something else. 
# With GroupKFold we can be sure that our validation fold will contain groupIDs that are not in main train set. 
#Sometimes it helps to deal with "dataleakage" and overfit.

In [26]:
from sklearn.model_selection import GroupKFold
NFOLDS = 5
folds = GroupKFold(n_splits=NFOLDS)

## We need Divide Train Set by Time blocks
## Convert TransactionDT to Months
# Main Data
train['groups'] = train['TransactionDT'].apply(lambda x: (START_DATE + datetime.timedelta(seconds = x)))
train['groups'] = (train['groups'].dt.year-2017)*12 + train['groups'].dt.month 
split_groups = train['groups']

In [27]:
X= train.drop(['isFraud', 'TransactionDT','groups'], axis=1)
y = train['isFraud']

In [31]:
columns = X.columns
splits = folds.split(X, y, groups=split_groups)
y_preds = np.zeros(X_test.shape[0])
y_oof = np.zeros(X.shape[0])
score = 0

feature_importances = pd.DataFrame()
feature_importances['feature'] = columns
  
for fold_n, (train_index, valid_index) in enumerate(splits):
    X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
    
    dtrain = lgb.Dataset(X_train, label=y_train)
    dvalid = lgb.Dataset(X_valid, label=y_valid)

    clf = lgb.train(params, dtrain, 10000, valid_sets = [dtrain, dvalid], verbose_eval=200, early_stopping_rounds=500)
    
    feature_importances[f'fold_{fold_n + 1}'] = clf.feature_importance()
    
    y_pred_valid = clf.predict(X_valid)
    y_oof[valid_index] = y_pred_valid
    print(f"Fold {fold_n + 1} | AUC: {roc_auc_score(y_valid, y_pred_valid)}")
    
    score += roc_auc_score(y_valid, y_pred_valid) / NFOLDS
    y_preds += clf.predict(X_test) / NFOLDS
    
    del X_train, X_valid, y_train, y_valid
    gc.collect()
    
print(f"\nMean AUC = {score}")
print(f"Out of folds AUC = {roc_auc_score(y, y_oof)}")

Training until validation scores don't improve for 500 rounds.
[200]	training's auc: 0.966919	valid_1's auc: 0.894047
[400]	training's auc: 0.990188	valid_1's auc: 0.90792
[600]	training's auc: 0.996667	valid_1's auc: 0.913338
[800]	training's auc: 0.998783	valid_1's auc: 0.915384
[1000]	training's auc: 0.99955	valid_1's auc: 0.916772
[1200]	training's auc: 0.999835	valid_1's auc: 0.917604
[1400]	training's auc: 0.999942	valid_1's auc: 0.918081
[1600]	training's auc: 0.999981	valid_1's auc: 0.918398
[1800]	training's auc: 0.999994	valid_1's auc: 0.918534
[2000]	training's auc: 0.999998	valid_1's auc: 0.918736
[2200]	training's auc: 1	valid_1's auc: 0.918661
[2400]	training's auc: 1	valid_1's auc: 0.918661
[2600]	training's auc: 1	valid_1's auc: 0.918542
[2800]	training's auc: 1	valid_1's auc: 0.91819
Early stopping, best iteration is:
[2411]	training's auc: 1	valid_1's auc: 0.918777
Fold 1 | AUC: 0.9186551265190323
Training until validation scores don't improve for 500 rounds.
[200]	tr

In [32]:
sub = pd.read_csv('sample_submission.csv')
sub['isFraud'] = y_preds
sub.to_csv("lgb_group.csv", index=False) #0.9472

In [33]:
from sklearn.model_selection import GroupKFold
NFOLDS = 5
folds = GroupKFold(n_splits=NFOLDS)

## We need Divide Train Set by Time blocks
## Convert TransactionDT to Months
# Main Data
train['groups'] = ''
for col in ['card1','card2','card3','card5','addr1','addr2',]:
    train['groups'] = '_' + train[col].astype(str)
split_groups = train['groups']

In [34]:
columns = X.columns
splits = folds.split(X, y, groups=split_groups)
y_preds = np.zeros(X_test.shape[0])
y_oof = np.zeros(X.shape[0])
score = 0

feature_importances = pd.DataFrame()
feature_importances['feature'] = columns
  
for fold_n, (train_index, valid_index) in enumerate(splits):
    X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
    
    dtrain = lgb.Dataset(X_train, label=y_train)
    dvalid = lgb.Dataset(X_valid, label=y_valid)

    clf = lgb.train(params, dtrain, 10000, valid_sets = [dtrain, dvalid], verbose_eval=200, early_stopping_rounds=500)
    
    feature_importances[f'fold_{fold_n + 1}'] = clf.feature_importance()
    
    y_pred_valid = clf.predict(X_valid)
    y_oof[valid_index] = y_pred_valid
    print(f"Fold {fold_n + 1} | AUC: {roc_auc_score(y_valid, y_pred_valid)}")
    
    score += roc_auc_score(y_valid, y_pred_valid) / NFOLDS
    y_preds += clf.predict(X_test) / NFOLDS
    
    del X_train, X_valid, y_train, y_valid
    gc.collect()
    
print(f"\nMean AUC = {score}")
print(f"Out of folds AUC = {roc_auc_score(y, y_oof)}")

Training until validation scores don't improve for 500 rounds.
[200]	training's auc: 0.985615	valid_1's auc: 0.696925
[400]	training's auc: 0.998088	valid_1's auc: 0.693199
[600]	training's auc: 0.999895	valid_1's auc: 0.688065
Early stopping, best iteration is:
[192]	training's auc: 0.984733	valid_1's auc: 0.698358
Fold 1 | AUC: 0.6983583512294759
Training until validation scores don't improve for 500 rounds.
[200]	training's auc: 0.967338	valid_1's auc: 0.891433
[400]	training's auc: 0.992707	valid_1's auc: 0.894243
[600]	training's auc: 0.998273	valid_1's auc: 0.897631
[800]	training's auc: 0.999599	valid_1's auc: 0.900997
[1000]	training's auc: 0.999897	valid_1's auc: 0.903263
[1200]	training's auc: 0.999968	valid_1's auc: 0.903569
[1400]	training's auc: 0.999993	valid_1's auc: 0.90365
[1600]	training's auc: 0.999998	valid_1's auc: 0.90416
[1800]	training's auc: 1	valid_1's auc: 0.904409
[2000]	training's auc: 1	valid_1's auc: 0.904053
[2200]	training's auc: 1	valid_1's auc: 0.9038

In [35]:
sub = pd.read_csv('sample_submission.csv')
sub['isFraud'] = y_preds
sub.to_csv("lgb_group2.csv", index=False) #0.9404