In [1]:
import xgboost as xgb
import numpy as np
import pandas as pd
import glob
from sklearn.utils import shuffle
from sklearn import metrics
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.preprocessing import StandardScaler
import time
import optuna
import matplotlib.pyplot as plt

In [2]:
data_path = "/data/rjoshi/erdos/"

In [3]:
files = glob.glob(f"{data_path}/*.csv")
files = [f for f in files if "sample_submission" not in f]

In [4]:
df_test_transaction = pd.read_csv(files[0])
df_train_transaction = pd.read_csv(files[3])

In [30]:
cols_to_use = ['card1', 'V304', 'V123', 'card3', 'V34', 'V74', 'V94', 'V52', 'V257', 'isFraud']

In [59]:
df_train_to_use = df_train_transaction[cols_to_use]
df_train_to_use = df_train_to_use.fillna(-999)
sample_weights = compute_sample_weight(class_weight="balanced", y=df_train_to_use['isFraud'])

In [60]:
split_index1 = int(df_train_to_use.shape[0]*0.8)
split_index2 = int(df_train_to_use.shape[0]*0.9)
train_df, val_df, test_df = df_train_to_use[:split_index1], df_train_to_use[split_index1:split_index2], df_train_to_use[split_index2:]
sample_weights_train, sample_weights_val = sample_weights[:split_index1], sample_weights[split_index1:split_index2]
y_train  = train_df.pop('isFraud')
y_test = test_df.pop('isFraud')
y_val = val_df.pop('isFraud')
x_train = train_df
x_test = test_df
x_val = val_df

In [39]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
#skf = StratifiedKFold()
model = xgb.XGBClassifier(callbacks = [xgb.callback.EarlyStopping(3)], random_state = 42)
#for i, (train_index, val_index) in enumerate(skf.split()):
model.fit(x_train, y_train, eval_set=[(x_train, y_train), (x_val,y_val)])

[0]	validation_0-logloss:0.17643	validation_1-logloss:0.17091
[1]	validation_0-logloss:0.15714	validation_1-logloss:0.15176
[2]	validation_0-logloss:0.14399	validation_1-logloss:0.13920
[3]	validation_0-logloss:0.13579	validation_1-logloss:0.13090
[4]	validation_0-logloss:0.13026	validation_1-logloss:0.12546
[5]	validation_0-logloss:0.12680	validation_1-logloss:0.12192
[6]	validation_0-logloss:0.12466	validation_1-logloss:0.11981
[7]	validation_0-logloss:0.12304	validation_1-logloss:0.11861
[8]	validation_0-logloss:0.12212	validation_1-logloss:0.11775
[9]	validation_0-logloss:0.12123	validation_1-logloss:0.11715
[10]	validation_0-logloss:0.12075	validation_1-logloss:0.11688
[11]	validation_0-logloss:0.12002	validation_1-logloss:0.11691
[12]	validation_0-logloss:0.11961	validation_1-logloss:0.11666
[13]	validation_0-logloss:0.11947	validation_1-logloss:0.11662
[14]	validation_0-logloss:0.11887	validation_1-logloss:0.11636
[15]	validation_0-logloss:0.11869	validation_1-logloss:0.11629
[1

In [42]:
y_pred_train = model.predict(x_train)
y_pred_val = model.predict(x_val)
print(metrics.classification_report(y_train, y_pred_train, target_names=["Not Fraud", "Fraud"]))
print(metrics.classification_report(y_val, y_pred_val, target_names=["Not Fraud", "Fraud"]))
fpr, tpr, thresholds = metrics.roc_curve(y_val, y_pred_val)
print(metrics.auc(fpr, tpr))

              precision    recall  f1-score   support

   Not Fraud       0.97      1.00      0.99    455833
       Fraud       0.83      0.22      0.35     16599

    accuracy                           0.97    472432
   macro avg       0.90      0.61      0.67    472432
weighted avg       0.97      0.97      0.96    472432

              precision    recall  f1-score   support

   Not Fraud       0.97      1.00      0.99     57203
       Fraud       0.69      0.17      0.27      1851

    accuracy                           0.97     59054
   macro avg       0.83      0.58      0.63     59054
weighted avg       0.96      0.97      0.96     59054

0.5825148076759961


In [61]:
# Hyper-parameter optimization
def optuna_objective(trial, x_train, y_train, sample_weights, x_validation, y_validation, sample_weights_val = None, metric='f1'):
    metric = metric.lower()
    params = {
       'tree_method': 'hist',
       'max_depth': trial.suggest_int('max_depth', 3, 12),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0.6, 1.0),
        'lambda': trial.suggest_float('lambda', 0.001, 25, log=True),
        'reg_alpha': trial.suggest_float('alpha', 0, 25),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.8),
        'objective': 'binary:logistic',
        'eval_metric': ['logloss', 'auc', 'aucpr'],
        'n_estimators': trial.suggest_int('n_estimators', 40, 100),
        'max_delta_step': trial.suggest_float('max_delta_step', 1, 9),
        'early_stopping_rounds': 5
    }

    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, f'validation_1-logloss')
    bdt_model = xgb.XGBClassifier(**params, callbacks=[pruning_callback])
    bdt_model.fit(x_train, y_train, sample_weight=sample_weights, eval_set=[(x_train, y_train), (x_validation,y_validation)], 
                  sample_weight_eval_set=[sample_weights, sample_weights_val])
    
    y_pred_val =  bdt_model.predict(x_validation)
    #f1Score = metrics.f1_score(y_validation, y_pred_val, average="weighted", sample_weight = sample_weights_val)
    f1Score = metrics.f1_score(y_validation, y_pred_val)
    print(f1Score)
    return f1Score

def hyper_parameter_optimization(x_train, y_train, x_validation, y_validation, sample_weights = None, sample_weights_val=None, metric='f1', 
                                    ntrials = 50):
    sampler = optuna.samplers.TPESampler(seed=123)
    #sampler = optuna.samplers.QMCSampler(seed=123)
    pruner = optuna.pruners.HyperbandPruner()
    study = optuna.create_study(direction='maximize', sampler=sampler, pruner=pruner)
    study.optimize(lambda trial: optuna_objective(trial, x_train, y_train, sample_weights, x_validation, y_validation, sample_weights_val, 
                                                  metric=metric), n_trials = ntrials)
    best_params = study.best_params
    return best_params

In [None]:
hyper_parameter_optimization(x_train, y_train, x_val, y_val, sample_weights = sample_weights_train)

In [67]:
# Best parameters based on f1 score of validation set
best_params = {'max_depth': 9,
 'subsample': 0.8893665432759819,
 'colsample_bynode': 0.6064516826780068,
 'lambda': 0.41141085381708425,
 'alpha': 13.919629809857218,
 'learning_rate': 0.12800875567163347,
 'n_estimators': 49,
 'max_delta_step': 6.564236230167287}

In [76]:
df_cv_train = df_train_to_use[:split_index2]
y_cv_train = df_cv_train.pop('isFraud')
x_cv_train = df_cv_train
sample_weights_cv = sample_weights[:split_index2]
del df_cv_train

In [79]:
skf = StratifiedKFold()
model = xgb.XGBClassifier(callbacks = [xgb.callback.EarlyStopping(3)], random_state = 42, eval_metric =['logloss', 'auc', 'aucpr'],
                          **best_params)
for i, (train_index, val_index) in enumerate(skf.split(x_cv_train, y_cv_train)):
    print(f"-------------------------- \nFold {i}\n----------------------------")
    x_train_fold, x_val_fold = x_cv_train.iloc[train_index], x_cv_train.iloc[val_index]
    y_train_fold, y_val_fold = y_cv_train.iloc[train_index], y_cv_train.iloc[val_index]
    sample_weights_fold = sample_weights_cv[train_index]
    model.fit(x_train_fold, y_train_fold, sample_weight=sample_weights_fold, eval_set=[(x_train_fold, y_train_fold), (x_val_fold, y_val_fold)]) 

-------------------------- 
Fold 0
----------------------------
[0]	validation_0-logloss:0.66177	validation_0-auc:0.74197	validation_0-aucpr:0.30775	validation_1-logloss:0.67604	validation_1-auc:0.65109	validation_1-aucpr:0.20918
[1]	validation_0-logloss:0.63697	validation_0-auc:0.75859	validation_0-aucpr:0.33272	validation_1-logloss:0.66045	validation_1-auc:0.66781	validation_1-aucpr:0.23716
[2]	validation_0-logloss:0.61743	validation_0-auc:0.76358	validation_0-aucpr:0.33695	validation_1-logloss:0.64989	validation_1-auc:0.67344	validation_1-aucpr:0.23983
[3]	validation_0-logloss:0.60010	validation_0-auc:0.76979	validation_0-aucpr:0.34275	validation_1-logloss:0.64024	validation_1-auc:0.68029	validation_1-aucpr:0.24604
[4]	validation_0-logloss:0.58603	validation_0-auc:0.77225	validation_0-aucpr:0.34476	validation_1-logloss:0.63226	validation_1-auc:0.68263	validation_1-aucpr:0.24935
[5]	validation_0-logloss:0.57374	validation_0-auc:0.77297	validation_0-aucpr:0.34811	validation_1-logloss: