<a href="https://colab.research.google.com/github/anbinh-bui/Graph-Neural-Networks-for-Credit-Card-Fraud-Detection/blob/main/Classic%20ML%20Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
import random

import os
from tqdm import tqdm
from os.path import isfile

import sklearn
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn import preprocessing
import joblib  # Import joblib directly instead of sklearn.externals.joblib
from sklearn.decomposition import TruncatedSVD

%pip install bayesian-optimization==1.4.1
from bayes_opt import BayesianOptimization
from bayes_opt.event import Events
from bayes_opt.util import load_logs

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set()

import time
import datetime

import warnings
warnings.filterwarnings("ignore")

# List files in the input directory
input_dir = "../input"
if os.path.exists(input_dir):
    print(os.listdir(input_dir))
else:
    print(f"Directory {input_dir} does not exist.")
print()

# Print library versions
print("pandas:", pd.__version__)
print("numpy:", np.__version__)
print("sklearn:", sklearn.__version__)
print()
print("lightgbm:", lgb.__version__)
print("xgboost:", xgb.__version__)

# Handle joblib deprecation warning
print(joblib.__version__)


Collecting bayesian-optimization==1.4.1
  Downloading bayesian_optimization-1.4.1-py3-none-any.whl (18 kB)
Collecting colorama (from bayesian-optimization==1.4.1)
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Reason for being yanked: https://github.com/fmfn/BayesianOptimization/pull/388[0m[33m
[0mInstalling collected packages: colorama, bayesian-optimization
Successfully installed bayesian-optimization-1.4.1 colorama-0.4.6
Directory ../input does not exist.

pandas: 2.0.3
numpy: 1.25.2
sklearn: 1.2.2

lightgbm: 4.1.0
xgboost: 2.0.3
1.4.2


In [2]:
train_transaction = pd.read_csv('/content/train_transaction.csv', index_col='TransactionID')
test_transaction = pd.read_csv('/content/test_transaction.csv', index_col='TransactionID')

train_identity = pd.read_csv('/content/train_identity.csv', index_col='TransactionID')
test_identity = pd.read_csv('/content/test_identity.csv', index_col='TransactionID')


In [3]:
train = train_transaction.merge(train_identity, how='left', left_index=True, right_index=True)
test = test_transaction.merge(test_identity, how='left', left_index=True, right_index=True)

print(f'Shape of train set: {train.shape}')
print(f'Shape of test set: {test.shape}')

Shape of train set: (41530, 433)
Shape of test set: (41866, 432)


In [4]:
train = train.sample(frac=1, random_state=42)
train.reset_index(drop=True, inplace=True)

In [5]:
y = train.isFraud.values

train = train.drop('isFraud', axis=1)
test = test.copy()
train = train.fillna(-1)
test = test.fillna(-1)
del train_transaction, train_identity, test_transaction, test_identity

In [6]:
one_hot_encoded_training_predictors = pd.get_dummies(train)
one_hot_encoded_test_predictors = pd.get_dummies(test)
train, test = one_hot_encoded_training_predictors.align(one_hot_encoded_test_predictors,
                                                                    join='left',
                                                                    axis=1)


In [7]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [8]:
#Reduce memory
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

Mem. usage decreased to 71.29 Mb (56.7% reduction)
Mem. usage decreased to 246.52 Mb (26.8% reduction)


In [9]:
cols = list(train.columns)
len(cols)

1349

In [10]:
scaler = StandardScaler() #MinMaxScaler StandardScaler RobustScaler

train[cols] = scaler.fit_transform(train[cols])
test[cols] = scaler.transform(test[cols])

In [11]:
N = 50

svd = TruncatedSVD(n_components=N, random_state=42)
X = svd.fit_transform(train[cols], y)
svd.explained_variance_ratio_.sum()

0.33218849573252385

In [12]:
df = pd.DataFrame()
df["target"] = y

for i in range(50):
    df[i] = X[:,i]

df.tail()

Unnamed: 0,target,0,1,2,3,4,5,6,7,8,...,40,41,42,43,44,45,46,47,48,49
41525,0,-3.931403,-6.390133,-0.378094,-0.739131,5.078942,0.90475,0.146746,2.917129,2.085497,...,0.618575,0.615482,-0.943608,-0.086375,0.24909,0.058517,0.1297,-0.508904,-0.001805,-0.349203
41526,0,-5.179286,-4.740266,1.263651,4.150465,7.658622,-3.974839,-1.167574,0.44048,0.010661,...,-0.025686,0.318546,-1.545833,0.736259,0.670077,0.080393,0.063427,0.163561,0.416911,-0.086231
41527,0,-7.08637,0.317322,0.341879,1.58618,2.550078,-2.067627,-0.248387,-2.410086,-3.081503,...,-0.894179,-0.467677,-0.464813,-0.095301,0.186778,-0.207279,0.030415,0.02209,-0.283772,-0.028923
41528,0,-7.079897,0.682491,-0.592107,-0.681009,2.021262,0.531182,1.149648,-0.976707,-2.455856,...,-0.02398,0.543899,-0.513808,0.138184,0.620513,-0.264506,0.157638,-0.060483,0.073515,-0.11996
41529,0,-7.083493,0.646407,-0.539995,-0.65678,1.858491,0.481196,1.211968,-1.021635,-2.518545,...,0.066178,0.550838,-0.499764,0.130495,0.616233,-0.215519,0.168253,-0.021169,0.044999,-0.09706


Logistic Regression

In [13]:
%%time

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
arch = "reg"

train[arch] = 0

for i, (train_index, valid_index) in enumerate(skf.split(X, y)):

    X_train = X[train_index]
    X_valid = X[valid_index]

    y_train = y[train_index]
    y_valid = y[valid_index]

    reg = LogisticRegression(C=1,
                             solver="newton-cg",
                             penalty="l2",
                             n_jobs=-1,
                             max_iter=100).fit(X_train, y_train)

    y_pred = reg.predict_proba(X_valid)[:,1]
    train.loc[valid_index, arch] = y_pred
    print(i, "ROC AUC:", round(roc_auc_score(y_valid, y_pred), 5))

print()
print("OOF ROC AUC:", round(roc_auc_score(y, train[arch]), 5))
print()

0 ROC AUC: 0.81715
1 ROC AUC: 0.8097
2 ROC AUC: 0.82677
3 ROC AUC: 0.8061
4 ROC AUC: 0.78896

OOF ROC AUC: 0.80956

CPU times: user 360 ms, sys: 562 ms, total: 922 ms
Wall time: 8.87 s


Random Forest

In [14]:
%%time

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
arch = "rfc"

train[arch] = 0
test[arch] = 0

for i, (train_index, valid_index) in enumerate(skf.split(X, y)):

    X_train = X[train_index]
    X_valid = X[valid_index]

    y_train = y[train_index]
    y_valid = y[valid_index]

    rfc = RandomForestClassifier(n_estimators=100,
                                 criterion='gini',
                                 n_jobs=-1).fit(X_train, y_train)

    y_pred = rfc.predict_proba(X_valid)[:,1]
    train.loc[valid_index, arch] = y_pred
    print(i, "ROC AUC:", round(roc_auc_score(y_valid, y_pred), 5))

print()
print("OOF ROC AUC:", round(roc_auc_score(y, train[arch]), 5))
print()

0 ROC AUC: 0.83163
1 ROC AUC: 0.85955
2 ROC AUC: 0.86347
3 ROC AUC: 0.7999
4 ROC AUC: 0.81556

OOF ROC AUC: 0.8338

CPU times: user 6min 36s, sys: 2.43 s, total: 6min 39s
Wall time: 4min 18s


LGBM

In [17]:
from sklearn.model_selection import StratifiedKFold

# Parameters for LightGBM
params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'metric': 'auc',
    'seed': 42,
    'max_depth': -1,
    'verbose': -1,
    'n_jobs': -1
}

# Parameters for training to prevent eary_stop_rounds
rounds = 10000
early_stop_rounds = 300

# Prepare data for LightGBM
X = svd.fit_transform(train[cols], y)  # Use SVD transformed data
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
arch = "lgbm"

train[arch] = 0
test[arch] = 0

for i, (train_index, valid_index) in enumerate(skf.split(X, y)):
    X_train, X_valid = X[train_index], X[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]

    # Create LightGBM datasets
    d_train = lgb.Dataset(X_train, label=y_train)
    d_valid = lgb.Dataset(X_valid, label=y_valid, reference=d_train)

    # Train the model
    model = lgb.train(params,
                      d_train,
                      num_boost_round=rounds,
                      valid_sets=[d_valid],
                      callbacks=[
                          lgb.early_stopping(stopping_rounds=early_stop_rounds),
                          lgb.log_evaluation(period=100)
                      ])

    # Predict and store the results
    y_pred = model.predict(X_valid, num_iteration=model.best_iteration)
    train.loc[valid_index, arch] = y_pred
    print(f"Fold {i + 1} ROC AUC: {round(roc_auc_score(y_valid, y_pred), 5)}")

    # Ensure no NaNs in the test data before transforming
    test_filled = test[cols].fillna(-1)

    # Predict on test set and average predictions
    test_pred = model.predict(svd.transform(test_filled), num_iteration=model.best_iteration)
    test[arch] += test_pred / skf.n_splits

print()
print("OOF ROC AUC:", round(roc_auc_score(y, train[arch]), 5))
print()

Training until validation scores don't improve for 300 rounds
[100]	valid_0's auc: 0.860424
[200]	valid_0's auc: 0.86803
[300]	valid_0's auc: 0.871949
[400]	valid_0's auc: 0.870211
[500]	valid_0's auc: 0.872746
[600]	valid_0's auc: 0.871365
[700]	valid_0's auc: 0.872601
Early stopping, best iteration is:
[446]	valid_0's auc: 0.873275
Fold 1 ROC AUC: 0.87327
Training until validation scores don't improve for 300 rounds
[100]	valid_0's auc: 0.880637
[200]	valid_0's auc: 0.885164
[300]	valid_0's auc: 0.891171
[400]	valid_0's auc: 0.888425
[500]	valid_0's auc: 0.889885
[600]	valid_0's auc: 0.888812
Early stopping, best iteration is:
[305]	valid_0's auc: 0.891623
Fold 2 ROC AUC: 0.89162
Training until validation scores don't improve for 300 rounds
[100]	valid_0's auc: 0.878884
[200]	valid_0's auc: 0.87748
[300]	valid_0's auc: 0.876839
Early stopping, best iteration is:
[80]	valid_0's auc: 0.880714
Fold 3 ROC AUC: 0.88071
Training until validation scores don't improve for 300 rounds
[100]	va