In [1]:
import random
import copy
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn
from sklearn import metrics
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Colab-only step: mount Google Drive at /content/drive. The dataset pickles
# loaded in the next cell are read from under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load the merged train and test datasets from the mounted Google Drive.
# The directory is kept in one place so that moving the data only requires
# editing a single line.
DATA_DIR = '/content/drive/My Drive/Colab Notebooks/ieee-fraud-detection/Data'

# NOTE: unpickling can execute arbitrary code; only load pickle files that
# come from a trusted source.
train_data = pd.read_pickle(f'{DATA_DIR}/train_fe.pkl')
test_data = pd.read_pickle(f'{DATA_DIR}/test_fe.pkl')

In [5]:
# Quick look at the `card3_FE` column of the training frame.
# NOTE(review): the "_FE" suffix presumably marks an engineered feature produced upstream - confirm.
train_data["card3_FE"]

0         0.882729
1         0.882729
2         0.882729
3         0.882729
4         0.882729
            ...   
590535    0.882729
590536    0.882729
590537    0.882729
590538    0.882729
590539    0.882729
Name: card3_FE, Length: 590540, dtype: float64

In [None]:
def hyperparam_vs_auc(train_roc_auc, cv_roc_auc):
    """
    Plot training and cross-validation ROC-AUC against hyperparameter index.

    Each sequence is drawn as a line and then again as scatter markers, with
    the position in the sequence used as the x coordinate.

    Parameters
    ----------
    train_roc_auc : sequence of float
        Training ROC-AUC values, one per hyperparameter setting.
    cv_roc_auc : sequence of float
        Cross-validation ROC-AUC values, one per hyperparameter setting.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    # (scores, line label, marker label) for each curve, in drawing order.
    curves = (
        (train_roc_auc, 'Train AUC', 'Train AUC points'),
        (cv_roc_auc, 'CV AUC', 'CV AUC points'),
    )

    # Lines first, markers second, so artists are added in the same order
    # as before (this fixes colour-cycle and legend ordering).
    for scores, line_label, _ in curves:
        plt.plot(range(len(scores)), scores, label=line_label)

    for scores, _, marker_label in curves:
        plt.scatter(range(len(scores)), scores, label=marker_label)

    # One tick per training entry.
    tick_positions = range(len(train_roc_auc))
    plt.xticks(tick_positions)

    plt.legend()
    plt.xlabel("Hyperparameter Index")
    plt.ylabel("AUC")
    plt.title("\n Hyperparameter vs ROC-AUC \n")
    plt.grid()
    plt.show()

In [None]:
# Separate the `isFraud` target from the predictor columns of the training frame.
y_train = train_data['isFraud']
X_train = train_data.drop(columns=['isFraud'])

# The test frame is used as the feature matrix as-is.
X_test = test_data

# Drop the original names. Re-running this cell without reloading the data
# will therefore raise a NameError.
del train_data, test_data

In [None]:
# Hyperparameter search space: 3 values per list, 3**4 = 81 combinations in total.

# Boosting step size candidates.
learning_rate = [0.02, 0.3, 0.1]

# Maximum tree depth candidates.
max_depth = [8, 12, 16]

# Row-sampling ratio candidates.
subsample = [0.6, 0.8, 1]

# Column-sampling ratio (per tree) candidates.
colsample_bytree = [0.6, 0.8, 1]

In [None]:
# Exhaustive grid search: run 3-fold xgb.cv for every combination of the four
# hyperparameter lists and record the final (train AUC, CV AUC) per combination.
#
# `results` maps a human-readable label of the combination to a tuple
# (train-auc-mean, test-auc-mean) taken from the last row of the CV history.
results = {}

dtrain = xgb.DMatrix(X_train, label=y_train)

# Flattened grid, in the same order the nested loops would visit it
# (learning_rate outermost, colsample_bytree innermost).
param_grid = [
  (rate, depth, sample, colsample)
  for rate in learning_rate
  for depth in max_depth
  for sample in subsample
  for colsample in colsample_bytree
]

for run_idx, (rate, depth, sample, colsample) in enumerate(param_grid, start=1):

  name = f"learning_rate : {rate} max_depth : {depth} subsample : {sample} colsample_bytree : {colsample}"

  # Announce the combination before training so that the evaluation log that
  # follows - and any error raised mid-run - can be attributed to it.
  print(f"[{run_idx}/{len(param_grid)}] {name}")

  params = {
    'objective' : 'binary:logistic',
    'eval_metric' : 'auc',
    'learning_rate' : rate,
    'max_depth' : depth,
    'subsample' : sample,
    'colsample_bytree' : colsample,
    'tree_method' : 'gpu_hist',
    'random_state' : 3,
  }

  history = xgb.cv(
      params,
      dtrain,
      num_boost_round = 3000,
      nfold = 3,
      metrics ='auc',
      early_stopping_rounds = 100,
      verbose_eval=100,
      seed=3,
      shuffle = False  # rows are not shuffled before the folds are created
  )

  # Per the xgb.cv documentation, with early stopping the last history entry
  # represents the best iteration, so the last row is the score to keep.
  final_round = history.iloc[-1]
  results[name] = (final_round['train-auc-mean'], final_round['test-auc-mean'])

[0]	train-auc:0.81863+0.00087	test-auc:0.80471+0.00660
[100]	train-auc:0.89960+0.00424	test-auc:0.86987+0.00717
[200]	train-auc:0.94470+0.00145	test-auc:0.89431+0.01057
[300]	train-auc:0.96862+0.00108	test-auc:0.90968+0.01142
[400]	train-auc:0.97847+0.00103	test-auc:0.91551+0.01259
[500]	train-auc:0.98393+0.00079	test-auc:0.91827+0.01277
[600]	train-auc:0.98765+0.00062	test-auc:0.91972+0.01321
[700]	train-auc:0.99046+0.00044	test-auc:0.92095+0.01301
[800]	train-auc:0.99252+0.00024	test-auc:0.92129+0.01355
[900]	train-auc:0.99412+0.00015	test-auc:0.92159+0.01338
[1000]	train-auc:0.99537+0.00014	test-auc:0.92142+0.01373
[1054]	train-auc:0.99597+0.00013	test-auc:0.92147+0.01347
[0]	train-auc:0.80739+0.00603	test-auc:0.78052+0.02079
[100]	train-auc:0.89907+0.00485	test-auc:0.86912+0.00646
[200]	train-auc:0.94556+0.00215	test-auc:0.89491+0.01095
[300]	train-auc:0.96951+0.00149	test-auc:0.91041+0.01247
[400]	train-auc:0.97888+0.00111	test-auc:0.91639+0.01249
[500]	train-auc:0.98423+0.00080	t

XGBoostError: ignored