<a href="https://colab.research.google.com/github/anbinh-bui/Graph-Neural-Networks-for-Credit-Card-Fraud-Detection/blob/main/IEEE_Credit_Card_Fraud_Detection_Classic_ML_Modelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
import random

import os
from tqdm import tqdm
from os.path import isfile

import sklearn
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn import preprocessing
import joblib  # Import joblib directly instead of sklearn.externals.joblib
from sklearn.decomposition import TruncatedSVD

%pip install bayesian-optimization==1.4.1
from bayes_opt import BayesianOptimization
from bayes_opt.event import Events
from bayes_opt.util import load_logs

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set()

import time
import datetime

import warnings
warnings.filterwarnings("ignore")

# List files in the input directory (if it is present in this environment).
# isdir() rather than exists(): os.listdir() raises if the path is a regular file.
input_dir = "../input"
if os.path.isdir(input_dir):
    print(os.listdir(input_dir))
else:
    print(f"Directory {input_dir} does not exist.")
print()

# Print library versions so the recorded outputs can be tied to an environment.
print("pandas:", pd.__version__)
print("numpy:", np.__version__)
print("sklearn:", sklearn.__version__)
print()
print("lightgbm:", lgb.__version__)
print("xgboost:", xgb.__version__)

# joblib is imported as a standalone package (not via sklearn.externals);
# label its version like the others so the output line is unambiguous.
print("joblib:", joblib.__version__)


Collecting bayesian-optimization==1.4.1
  Downloading bayesian_optimization-1.4.1-py3-none-any.whl (18 kB)
Collecting colorama (from bayesian-optimization==1.4.1)
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Reason for being yanked: https://github.com/fmfn/BayesianOptimization/pull/388[0m[33m
[0mInstalling collected packages: colorama, bayesian-optimization
Successfully installed bayesian-optimization-1.4.1 colorama-0.4.6
Directory ../input does not exist.

pandas: 2.0.3
numpy: 1.25.2
sklearn: 1.2.2

lightgbm: 4.1.0
xgboost: 2.0.3
1.4.2


In [None]:
# Read the four raw CSV tables. Each one is indexed by TransactionID so the
# identity tables can later be matched to the transaction tables on the index.
train_identity = pd.read_csv(
    '/content/train_identity.csv',
    index_col='TransactionID',
)
test_identity = pd.read_csv(
    '/content/test_identity.csv',
    index_col='TransactionID',
)

train_transaction = pd.read_csv(
    '/content/train_transaction.csv',
    index_col='TransactionID',
)
test_transaction = pd.read_csv(
    '/content/test_transaction.csv',
    index_col='TransactionID',
)


In [None]:
# Left-join the identity columns onto every transaction row, matching on the
# shared index; transactions without an identity record keep NaN in those columns.
train = pd.merge(
    train_transaction,
    train_identity,
    how='left',
    left_index=True,
    right_index=True,
)
test = pd.merge(
    test_transaction,
    test_identity,
    how='left',
    left_index=True,
    right_index=True,
)

print(
    f'Shape of train set: {train.shape}',
    f'Shape of test set: {test.shape}',
    sep='\n',
)

Shape of train set: (17474, 433)
Shape of test set: (16492, 432)


In [None]:
# Keep a reproducible 10% random sample of the training rows and renumber them
# 0..n-1 (the original index labels are discarded).
train = (
    train
    .sample(frac=0.1, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# Separate the label from the features and replace every missing value with
# the sentinel -1 in both frames.
y = train['isFraud'].to_numpy()

train = train.drop(columns='isFraud').fillna(-1)
test = test.copy().fillna(-1)

# The raw tables are no longer needed once merged; drop the references.
del train_transaction, train_identity, test_transaction, test_identity

In [None]:
# One-hot encode the non-numeric columns of each frame independently.
one_hot_encoded_training_predictors = pd.get_dummies(train)
one_hot_encoded_test_predictors = pd.get_dummies(test)

# join='left' on axis=1 keeps exactly the training columns: test-only dummy
# columns are dropped and train-only ones are added to test.
# fill_value=0 makes those added columns "category absent" (0) instead of NaN;
# without it the test frame carries NaN into scaling and the SVD projection.
train, test = one_hot_encoded_training_predictors.align(one_hot_encoded_test_predictors,
                                                        join='left',
                                                        axis=1,
                                                        fill_value=0)


In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds their range.

    Integer columns are cast to the first of int8/int16/int32/int64 whose limits
    contain the column's min and max (limits inclusive). Float columns are cast
    to float16 or float32 when their range fits, otherwise to float64. Columns
    of any other dtype are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place (columns are reassigned).
    verbose : bool, default True
        When true, print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call-chaining convenience.

    Notes
    -----
    Only the value *range* is checked. Casting to float16/float32 can still
    round values, because those types carry fewer significant digits.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a column spanning exactly [-128, 127] fits int8.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Fall back to float64 when the range fits neither smaller type
            # (this also covers all-NaN columns, where every comparison is False).
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio: a frame that reports 0 bytes would divide by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Shrink both feature frames by downcasting their numeric columns
# (train is processed first, then test, so the log lines appear in that order).
train, test = (reduce_mem_usage(frame) for frame in (train, test))

Mem. usage decreased to  1.86 Mb (68.2% reduction)
Mem. usage decreased to 41.03 Mb (46.5% reduction)


In [None]:
# Snapshot the feature column names as a plain list and show how many there are.
cols = train.columns.tolist()
len(cols)

710

In [None]:
# Standardise every feature column. The scaler's statistics are learned from
# the training frame only and then reused, unchanged, on the test frame.
# (Alternatives that were considered: MinMaxScaler, RobustScaler.)
scaler = StandardScaler()

train[cols] = scaler.fit(train[cols]).transform(train[cols])
test[cols] = scaler.transform(test[cols])

In [None]:
# Number of SVD components kept as model inputs.
N = 50

# Project the scaled training features onto N components; the seed fixes the
# randomised solver. The cell's displayed value is the total share of variance
# retained by those components.
svd = TruncatedSVD(
    n_components=N,
    random_state=42,
)
X = svd.fit_transform(train[cols], y)
np.sum(svd.explained_variance_ratio_)

0.6638059995504755

In [None]:
# Gather the target and the SVD components in one frame for inspection.
# The component columns are taken from X's own width rather than a hard-coded
# 50, so this cell stays correct when N changes, and the frame is built in a
# single call instead of one column insertion per component.
# copy=True keeps `df` independent of X (edits to df must not leak into X).
df = pd.DataFrame(X, columns=range(X.shape[1]), copy=True)
df.insert(0, "target", y)

df.tail()

Unnamed: 0,target,0,1,2,3,4,5,6,7,8,...,40,41,42,43,44,45,46,47,48,49
1742,0,-4.16927,-6.137926,0.225054,1.735267,0.242071,1.482906,0.776729,1.194452,-0.223661,...,0.052586,0.138284,-0.606346,-0.424987,-0.085323,0.892422,-1.335924,-0.181874,0.955157,0.59786
1743,0,-6.43222,-1.313934,1.425615,-3.090562,-1.71805,1.30442,3.28692,-4.076557,0.392986,...,-0.610437,-0.382381,0.137573,0.449639,-0.151563,-0.239922,1.222568,0.372538,-0.88579,-0.624837
1744,0,-6.737579,6.572508,-4.502834,4.637856,-0.016813,1.456501,-1.617009,1.661543,0.024143,...,0.416063,-0.607013,1.360209,-1.083716,-0.37309,0.766927,-0.645248,-0.661573,0.155829,-0.405337
1745,0,-6.284942,-0.615041,2.654329,-0.671305,0.386302,-0.230218,2.683456,-1.789377,-0.868489,...,-1.345255,0.626161,0.947301,0.048771,0.49873,0.084642,1.528615,-0.270982,-0.015048,0.282759
1746,0,26.988478,-3.380037,-0.421206,-2.845088,-3.659114,-2.730503,-4.279836,-0.980839,-2.602258,...,-0.768825,-1.460621,-1.614932,1.379103,1.385379,1.165956,-2.657414,1.353836,-2.474972,0.166621


Logistic Regression

In [None]:
%%time

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
arch = "reg"

train[arch] = 0

for i, (train_index, valid_index) in enumerate(skf.split(X, y)):

    X_train = X[train_index]
    X_valid = X[valid_index]

    y_train = y[train_index]
    y_valid = y[valid_index]

    reg = LogisticRegression(C=1,
                             solver="newton-cg",
                             penalty="l2",
                             n_jobs=-1,
                             max_iter=100).fit(X_train, y_train)

    y_pred = reg.predict_proba(X_valid)[:,1]
    train.loc[valid_index, arch] = y_pred
    print(i, "ROC AUC:", round(roc_auc_score(y_valid, y_pred), 5))

print()
print("OOF ROC AUC:", round(roc_auc_score(y, train[arch]), 5))
print()

0 ROC AUC: 0.63088
1 ROC AUC: 0.79176
2 ROC AUC: 0.88595
3 ROC AUC: 0.75882
4 ROC AUC: 0.83235

OOF ROC AUC: 0.77124

CPU times: user 194 ms, sys: 300 ms, total: 495 ms
Wall time: 4.9 s


Random Forest

In [None]:
%%time

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
arch = "rfc"

train[arch] = 0
test[arch] = 0

for i, (train_index, valid_index) in enumerate(skf.split(X, y)):

    X_train = X[train_index]
    X_valid = X[valid_index]

    y_train = y[train_index]
    y_valid = y[valid_index]

    rfc = RandomForestClassifier(n_estimators=100,
                                 criterion='gini',
                                 n_jobs=-1).fit(X_train, y_train)

    y_pred = rfc.predict_proba(X_valid)[:,1]
    train.loc[valid_index, arch] = y_pred
    print(i, "ROC AUC:", round(roc_auc_score(y_valid, y_pred), 5))

print()
print("OOF ROC AUC:", round(roc_auc_score(y, train[arch]), 5))
print()

0 ROC AUC: 0.79794
1 ROC AUC: 0.64647
2 ROC AUC: 0.78366
3 ROC AUC: 0.72271
4 ROC AUC: 0.68676

OOF ROC AUC: 0.72666

CPU times: user 6.44 s, sys: 150 ms, total: 6.59 s
Wall time: 11.1 s


LGBM

In [None]:
from sklearn.model_selection import StratifiedKFold

# Parameters for LightGBM
params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'metric': 'auc',
    'seed': 42,
    'max_depth': -1,
    'verbose': -1,
    'n_jobs': -1
}

# Training length: `rounds` is only an upper bound. Early stopping ends a fold
# once the validation AUC has not improved for `early_stop_rounds` rounds.
rounds = 10000
early_stop_rounds = 300

# Prepare data for LightGBM
X = svd.fit_transform(train[cols], y)  # Use SVD transformed data
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
arch = "lgbm"

# Float placeholders: both columns receive probabilities, and assigning floats
# into an integer column is deprecated in newer pandas.
train[arch] = 0.0
test[arch] = 0.0

# The test-set projection does not depend on the fold, so compute it once
# rather than inside the loop. NaNs are replaced first because the SVD
# transform cannot take them.
test_filled = test[cols].fillna(-1)
X_test = svd.transform(test_filled)

for i, (train_index, valid_index) in enumerate(skf.split(X, y)):
    X_train, X_valid = X[train_index], X[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]

    # Create LightGBM datasets
    d_train = lgb.Dataset(X_train, label=y_train)
    d_valid = lgb.Dataset(X_valid, label=y_valid, reference=d_train)

    # Train the model
    model = lgb.train(params,
                      d_train,
                      num_boost_round=rounds,
                      valid_sets=[d_valid],
                      callbacks=[
                          lgb.early_stopping(stopping_rounds=early_stop_rounds),
                          lgb.log_evaluation(period=100)
                      ])

    # Predict the held-out rows with the best iteration and store them as
    # out-of-fold predictions. valid_index holds row positions; .loc is
    # label-based, so this relies on train having a default 0..n-1 index.
    y_pred = model.predict(X_valid, num_iteration=model.best_iteration)
    train.loc[valid_index, arch] = y_pred
    print(f"Fold {i + 1} ROC AUC: {round(roc_auc_score(y_valid, y_pred), 5)}")

    # Each fold's model adds 1/n_splits of its test prediction, so the column
    # ends up holding the mean over folds.
    test_pred = model.predict(X_test, num_iteration=model.best_iteration)
    test[arch] += test_pred / skf.n_splits

print()
print("OOF ROC AUC:", round(roc_auc_score(y, train[arch]), 5))
print()


Training until validation scores don't improve for 300 rounds
[100]	valid_0's auc: 0.863824
[200]	valid_0's auc: 0.867941
[300]	valid_0's auc: 0.859412
Early stopping, best iteration is:
[8]	valid_0's auc: 0.881176
Fold 1 ROC AUC: 0.88118
Training until validation scores don't improve for 300 rounds
[100]	valid_0's auc: 0.680588
[200]	valid_0's auc: 0.677647
[300]	valid_0's auc: 0.677353
Early stopping, best iteration is:
[67]	valid_0's auc: 0.701471
Fold 2 ROC AUC: 0.70147
Training until validation scores don't improve for 300 rounds
[100]	valid_0's auc: 0.815359
[200]	valid_0's auc: 0.829412
[300]	valid_0's auc: 0.827451
[400]	valid_0's auc: 0.821569
Early stopping, best iteration is:
[147]	valid_0's auc: 0.841176
Fold 3 ROC AUC: 0.84118
Training until validation scores don't improve for 300 rounds
[100]	valid_0's auc: 0.706209
[200]	valid_0's auc: 0.699346
[300]	valid_0's auc: 0.687255
Early stopping, best iteration is:
[84]	valid_0's auc: 0.712745
Fold 4 ROC AUC: 0.71275
Training u