In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, roc_curve, auc, precision_recall_curve, accuracy_score, roc_auc_score, log_loss

from sklearn.model_selection import train_test_split

from src.data_prep import prepare_data
# from src.models import logistic_regression_model, decision_tree_model, random_forest_model, lightgbm_model
# from src.train import train_sklearn_model
from src.evaluation import evaluate_model

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import joblib

import torch
import torch.nn as nn
import torch.optim as optim

from imblearn.over_sampling import SMOTE

import lightgbm as lgb

# Set seaborn's global plotting style to "whitegrid" for the plots drawn later on.
sns.set(style="whitegrid")

In [2]:
# Read the CSV and separate the 'Class' column (the target) from the feature columns.
df = pd.read_csv('data/creditcard/creditcard.csv')
y = df['Class']
X = df.drop(columns='Class')

# Two successive splits with the same seed:
#   1. (X, y)             -> train / test
#   2. (X_train, y_train) -> fit / val   (val is used for early stopping later on)
# NOTE(review): neither split is stratified; with a positive rate this low it may be
# worth passing `stratify=` so each part keeps the same class balance -- confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
X_fit, X_val, y_fit, y_val = train_test_split(X_train, y_train, random_state=42)

# Preview the first training rows (last expression of the cell, so it is displayed).
X_train.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
83225,59741.0,-1.648591,1.22813,1.370169,-1.735542,-0.029455,-0.484129,0.918645,-0.43875,0.982144,...,0.384201,-0.218076,-0.203458,-0.213015,0.011372,-0.304481,0.632063,-0.262968,-0.099863,38.42
52800,45648.0,-0.234775,-0.493269,1.236728,-2.338793,-1.176733,0.885733,-1.960981,-2.363412,-2.694774,...,0.364679,-1.495358,-0.083066,0.074612,-0.347329,0.5419,-0.433294,0.089293,0.212029,61.2
21293,31579.0,1.134626,-0.77446,-0.16339,-0.533358,-0.604555,-0.244482,-0.212682,0.040782,-1.136627,...,-0.396476,-0.684454,-1.855269,0.171997,-0.387783,-0.062985,0.245118,-0.061178,0.01218,110.95
133600,80455.0,0.069514,1.017753,1.033117,1.384376,0.223233,-0.310845,0.597287,-0.127658,-0.701533,...,0.14876,0.097023,0.369957,-0.219266,-0.124941,-0.049749,-0.112946,0.11444,0.066101,10.0
38225,39302.0,-0.199441,0.610092,-0.114437,0.256565,2.290752,4.008475,-0.12353,1.038374,-0.075846,...,0.292972,-0.019733,0.165463,-0.080978,1.020656,-0.30073,-0.269595,0.481769,0.254114,22.0


In [31]:
# Baseline: LightGBM trained with the built-in binary log-loss objective.
# The validation Dataset is created with the training Dataset as its reference.
fit = lgb.Dataset(X_fit, label=y_fit)
val = lgb.Dataset(X_val, label=y_val, reference=fit)

bce_params = {
    'objective': 'binary',
    'learning_rate': 0.01,
}

# Stop once the validation metric has not improved for 20 rounds; log every 100 rounds.
training_callbacks = [
    lgb.early_stopping(stopping_rounds=20),
    lgb.log_evaluation(period=100),
]

model = lgb.train(
    bce_params,
    fit,
    num_boost_round=400,
    valid_sets=[fit, val],
    valid_names=['fit', 'val'],
    callbacks=training_callbacks,
)

# Score the held-out test set and report ranking quality (ROC AUC) and log-loss.
y_pred = model.predict(X_test)
test_auc = roc_auc_score(y_test, y_pred)
test_logloss = log_loss(y_test, y_pred)

print()
print(f"Test's ROC AUC: {test_auc:.5f}")
print(f"Test's logloss: {test_logloss:.5f}")

[LightGBM] [Info] Number of positive: 283, number of negative: 159920
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012547 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7650
[LightGBM] [Info] Number of data points in the train set: 160203, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.001767 -> initscore=-6.336982
[LightGBM] [Info] Start training from score -6.336982
Training until validation scores don't improve for 20 rounds
[100]	fit's binary_logloss: 0.0018981	val's binary_logloss: 0.0035569
[200]	fit's binary_logloss: 0.00080822	val's binary_logloss: 0.00283644
[300]	fit's binary_logloss: 0.000396519	val's binary_logloss: 0.00264941
Early stopping, best iteration is:
[352]	fit's binary_logloss: 0.000281286	val's binary_logloss: 0.00261413

Test's ROC AUC: 0.97772
Test's logloss: 0.00237


In [32]:
# Pick the ROC operating point that maximises Youden's J statistic (TPR - FPR).
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]

# Convert continuous predictions to binary predictions based on the optimal threshold.
# roc_curve treats scores >= threshold as positive, so the same comparison is used here.
# NOTE: the threshold is selected on the test labels themselves, so any metric computed
# from these binary predictions is optimistic.
y_pred_binary = (y_pred >= optimal_threshold).astype(int)

# Save binary predictions and true labels
binary_predictions = pd.DataFrame({"y_true": y_test, "y_pred": y_pred_binary})
binary_predictions.to_csv("artifacts/predictions/lightgbm_predictions_bce.csv", index=False)

print(f"Optimal Threshold: {optimal_threshold:.5f}")
print("Binary predictions saved successfully!")

Optimal Threshold: 0.00080
Binary predictions saved successfully!


In [33]:
import numpy as np
from scipy import optimize
from scipy import special

class FocalLoss:
    """Binary focal loss plus the derivatives needed for a LightGBM custom objective.

    For a label ``y`` in {0, 1} and a predicted probability ``p`` the per-sample
    loss is ``-a_t * (1 - p_t) ** gamma * log(p_t)``, where ``p_t`` is the
    probability assigned to the true class (``p`` if ``y`` else ``1 - p``) and
    ``a_t`` is the class weight (``alpha`` if ``y`` else ``1 - alpha``, or 1
    when ``alpha`` is None). With ``gamma=0`` and ``alpha=None`` this is the
    ordinary binary log-loss.
    """

    def __init__(self, gamma, alpha=None):
        """Store the focusing parameter ``gamma`` and the optional class weight ``alpha``."""
        self.alpha = alpha
        self.gamma = gamma

    def at(self, y):
        """Return the per-sample class weight ``a_t`` for labels ``y``."""
        if self.alpha is None:
            # Force a float result: ``np.ones_like`` would otherwise inherit the
            # label dtype, and for boolean labels the unary minus applied to the
            # weights in ``__call__`` raises a TypeError in NumPy.
            return np.ones_like(y, dtype=float)
        return np.where(y, self.alpha, 1 - self.alpha)

    def pt(self, y, p):
        """Return ``p_t``: the probability given to the true class.

        ``p`` is clipped away from 0 and 1 so that ``log(p_t)`` and negative
        powers of ``1 - p_t`` stay finite.
        """
        p = np.clip(p, 1e-15, 1 - 1e-15)
        return np.where(y, p, 1 - p)

    def __call__(self, y_true, y_pred):
        """Return the element-wise focal loss for labels ``y_true`` and probabilities ``y_pred``."""
        at = self.at(y_true)
        pt = self.pt(y_true, y_pred)
        return -at * (1 - pt) ** self.gamma * np.log(pt)

    def grad(self, y_true, y_pred):
        """Return the first derivative of the loss with respect to the raw score (log-odds).

        ``y_pred`` is the probability, i.e. the sigmoid of the raw score. For
        ``gamma=0`` and ``alpha=None`` the expression reduces to ``p - y``.
        """
        y = 2 * y_true - 1  # {0, 1} -> {-1, 1}
        at = self.at(y_true)
        pt = self.pt(y_true, y_pred)
        g = self.gamma
        return at * y * (1 - pt) ** g * (g * pt * np.log(pt) + pt - 1)

    def hess(self, y_true, y_pred):
        """Return the second derivative of the loss with respect to the raw score.

        The gradient is written as ``u * v``; the product rule gives its
        derivative with respect to ``p_t`` and the final factor
        ``y * p_t * (1 - p_t)`` is ``d p_t / d score`` (chain rule). For
        ``gamma=0`` and ``alpha=None`` the result reduces to ``p * (1 - p)``.
        """
        y = 2 * y_true - 1  # {0, 1} -> {-1, 1}
        at = self.at(y_true)
        pt = self.pt(y_true, y_pred)
        g = self.gamma

        # grad = u * v, with du and dv the derivatives of u and v w.r.t. p_t.
        u = at * y * (1 - pt) ** g
        du = -at * y * g * (1 - pt) ** (g - 1)
        v = g * pt * np.log(pt) + pt - 1
        dv = g * np.log(pt) + g + 1

        return (du * v + u * dv) * y * (pt * (1 - pt))

    def init_score(self, y_true):
        """Return the log-odds of the constant probability that minimises the total loss.

        A bounded scalar search over (0, 1) finds the single probability ``p``
        with the smallest summed loss on ``y_true``; the value returned is
        ``log(p / (1 - p))``.
        """
        res = optimize.minimize_scalar(
            lambda p: self(y_true, p).sum(),
            bounds=(0, 1),
            method='bounded'
        )
        p = res.x
        log_odds = np.log(p / (1 - p))
        return log_odds

    def lgb_obj(self, preds, train_data):
        """Custom objective callback: return ``(grad, hess)`` for raw scores ``preds``.

        Labels are read from ``train_data.get_label()`` and the raw scores are
        mapped to probabilities with the sigmoid before differentiating.
        """
        y = train_data.get_label()
        p = special.expit(preds)
        return self.grad(y, p), self.hess(y, p)

    def lgb_eval(self, preds, train_data):
        """Custom metric callback: return ``('focal_loss', mean loss, is_higher_better)``."""
        y = train_data.get_label()
        p = special.expit(preds)
        is_higher_better = False
        return 'focal_loss', self(y, p).mean(), is_higher_better

In [34]:
# gamma=0 with no class weighting makes the focal loss equal to the plain log-loss.
fl = FocalLoss(alpha=None, gamma=0)

# The starting score comes from the training labels only. It is a deterministic scalar
# optimisation over every label, so compute it once and reuse it for the fit set, the
# validation set and the final prediction rather than re-running the search each time.
base_score = fl.init_score(y_fit)

fit = lgb.Dataset(
    X_fit, y_fit,
    init_score=np.full(len(y_fit), base_score, dtype=float)
)

val = lgb.Dataset(
    X_val, y_val,
    init_score=np.full(len(y_val), base_score, dtype=float),
    reference=fit
)

model = lgb.train(
    params={
        'learning_rate': 0.01,
        'objective': fl.lgb_obj
    },
    train_set=fit,
    num_boost_round=10000,
    valid_sets=(fit, val),
    valid_names=('fit', 'val'),
    callbacks=[
        lgb.early_stopping(stopping_rounds=20),
        lgb.log_evaluation(period=100)
    ],
    feval=fl.lgb_eval
)

# Add the starting score back to the booster's output and apply the sigmoid to obtain
# probabilities on the test set.
y_pred = special.expit(base_score + model.predict(X_test))

print()
print(f"Test's ROC AUC: {roc_auc_score(y_test, y_pred):.5f}")
print(f"Test's logloss: {log_loss(y_test, y_pred):.5f}")

[LightGBM] [Info] Using self-defined objective function
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011478 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7650
[LightGBM] [Info] Number of data points in the train set: 160203, number of used features: 30
[LightGBM] [Info] Using self-defined objective function
Training until validation scores don't improve for 20 rounds
[100]	fit's focal_loss: 0.00190475	val's focal_loss: 0.00356043
[200]	fit's focal_loss: 0.000811846	val's focal_loss: 0.00285806
[300]	fit's focal_loss: 0.000401933	val's focal_loss: 0.00267161
Early stopping, best iteration is:
[345]	fit's focal_loss: 0.000297174	val's focal_loss: 0.00263719

Test's ROC AUC: 0.97948
Test's logloss: 0.00237


In [35]:
# Pick the ROC operating point that maximises Youden's J statistic (TPR - FPR).
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]

# Convert continuous predictions to binary predictions based on the optimal threshold.
# roc_curve treats scores >= threshold as positive, so the same comparison is used here.
# NOTE: the threshold is selected on the test labels themselves, so any metric computed
# from these binary predictions is optimistic.
y_pred_binary = (y_pred >= optimal_threshold).astype(int)

# Save binary predictions and true labels
binary_predictions = pd.DataFrame({"y_true": y_test, "y_pred": y_pred_binary})
binary_predictions.to_csv("artifacts/predictions/lightgbm_predictions_focal_loss.csv", index=False)

print(f"Optimal Threshold: {optimal_threshold:.5f}")
print("Binary predictions saved successfully!")

Optimal Threshold: 0.00074
Binary predictions saved successfully!
