In [13]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from scipy import optimize
from scipy import special

from sklearn.metrics import confusion_matrix, roc_curve, auc, precision_recall_curve, accuracy_score, roc_auc_score, log_loss

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler

from src.data_prep import prepare_data
# from src.models import logistic_regression_model, decision_tree_model, random_forest_model, lightgbm_model
# from src.train import train_sklearn_model
from src.evaluation import evaluate_model

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import joblib

import torch
import torch.nn as nn
import torch.optim as optim

from imblearn.over_sampling import SMOTE

import lightgbm as lgb

sns.set(style="whitegrid")

In [14]:
# df = pd.read_csv('data/creditcard/creditcard.csv')
# X = df.drop(columns='Class')
# y = df['Class']

df = pd.read_csv('data/loan/loandefault.csv')
df = df.drop(["LoanID"],axis = 1)

encoder = LabelEncoder()

for i in df.columns[9:16]:
    df[i] = encoder.fit_transform(df[i])
    
x,y = df.iloc[:,:16], df.iloc[:,16]
scaler = MinMaxScaler()
X = scaler.fit_transform(x)
#######################


X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    random_state=42,
)

X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train,
    random_state=42
)

# X_train.head()

In [15]:
fit = lgb.Dataset(X_fit, y_fit)
val = lgb.Dataset(X_val, y_val, reference=fit)

model = lgb.train(
    params={
        'learning_rate': 0.01,
        'objective': 'binary'
    },
    train_set=fit,
    num_boost_round=400,
    valid_sets=(fit, val),
    valid_names=('fit', 'val'),
  callbacks=[
    lgb.early_stopping(stopping_rounds=20),
    lgb.log_evaluation(period=100)
    ]
)

y_pred = model.predict(X_test)

print()
print(f"Test's ROC AUC: {roc_auc_score(y_test, y_pred):.5f}")
print(f"Test's logloss: {log_loss(y_test, y_pred):.5f}")

[LightGBM] [Info] Number of positive: 16695, number of negative: 126937
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002868 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1303
[LightGBM] [Info] Number of data points in the train set: 143632, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.116235 -> initscore=-2.028582
[LightGBM] [Info] Start training from score -2.028582
Training until validation scores don't improve for 20 rounds
[100]	fit's binary_logloss: 0.325066	val's binary_logloss: 0.325833
[200]	fit's binary_logloss: 0.315383	val's binary_logloss: 0.318095
[300]	fit's binary_logloss: 0.310531	val's binary_logloss: 0.315022
[400]	fit's binary_logloss: 0.30747	val's binary_logloss: 0.313654
Did not meet early stopping. Best iteration is:
[400]	fit's binary_logloss: 0.30747	val's binary_logloss: 0.

In [157]:
fpr, tpr, thresholds = roc_curve(y_test, y_pred)

optimal_idx = (tpr - fpr).argmax()
optimal_threshold = thresholds[optimal_idx]

y_pred_binary = (y_pred >= optimal_threshold).astype(int)
# y_pred_binary = (y_pred >= 0.5).astype(int)

# Save binary predictions and true labels
# binary_predictions = pd.DataFrame({"y_true": y_test, "y_pred": y_pred_binary})
# binary_predictions.to_csv("artifacts/predictions/lightgbm_bce_predictions.csv", index=False)

predictions = pd.DataFrame({"y_true": y_test, "y_pred": y_pred})
predictions.to_csv("artifacts/lightgbm_preds/loan/lightgbm_bce_continuous_predictions.csv", index=False)

# print(f"Optimal Threshold: {optimal_threshold:.5f}")
print("Binary predictions saved successfully!")

Binary predictions saved successfully!


In [6]:


class FocalLoss:

    def __init__(self, gamma, alpha=None):
        self.alpha = alpha
        self.gamma = gamma

    def at(self, y):
        if self.alpha is None:
            return np.ones_like(y)
        return np.where(y, self.alpha, 1 - self.alpha)

    def pt(self, y, p):
        p = np.clip(p, 1e-15, 1 - 1e-15)
        return np.where(y, p, 1 - p)

    def __call__(self, y_true, y_pred):
        at = self.at(y_true)
        pt = self.pt(y_true, y_pred)
        return -at * (1 - pt) ** self.gamma * np.log(pt)

    def grad(self, y_true, y_pred):
        y = 2 * y_true - 1  # {0, 1} -> {-1, 1}
        at = self.at(y_true)
        pt = self.pt(y_true, y_pred)
        g = self.gamma
        return at * y * (1 - pt) ** g * (g * pt * np.log(pt) + pt - 1)

    def hess(self, y_true, y_pred):
        y = 2 * y_true - 1  # {0, 1} -> {-1, 1}
        at = self.at(y_true)
        pt = self.pt(y_true, y_pred)
        g = self.gamma

        u = at * y * (1 - pt) ** g
        du = -at * y * g * (1 - pt) ** (g - 1)
        v = g * pt * np.log(pt) + pt - 1
        dv = g * np.log(pt) + g + 1

        return (du * v + u * dv) * y * (pt * (1 - pt))

    def init_score(self, y_true):
        res = optimize.minimize_scalar(
            lambda p: self(y_true, p).sum(),
            bounds=(0, 1),
            method='bounded'
        )
        p = res.x
        log_odds = np.log(p / (1 - p))
        return log_odds

    def lgb_obj(self, preds, train_data):
        y = train_data.get_label()
        p = special.expit(preds)
        return self.grad(y, p), self.hess(y, p)

    def lgb_eval(self, preds, train_data):
        y = train_data.get_label()
        p = special.expit(preds)
        is_higher_better = False
        return 'focal_loss', self(y, p).mean(), is_higher_better

In [159]:
fl = FocalLoss(alpha=None, gamma=0)

fit = lgb.Dataset(
    X_fit, y_fit,
    init_score=np.full_like(y_fit, fl.init_score(y_fit), dtype=float)
)

val = lgb.Dataset(
    X_val, y_val,
    init_score=np.full_like(y_val, fl.init_score(y_fit), dtype=float),
    reference=fit
)

model = lgb.train(
    params={
        'learning_rate': 0.01,
        'objective': fl.lgb_obj
    },
    train_set=fit,
    num_boost_round=10000,
    valid_sets=(fit, val),
    valid_names=('fit', 'val'),
    callbacks=[
        lgb.early_stopping(stopping_rounds=20),
        lgb.log_evaluation(period=100)
    ],
    feval=fl.lgb_eval
)

y_pred = special.expit(fl.init_score(y_fit) + model.predict(X_test))

print()
print(f"Test's ROC AUC: {roc_auc_score(y_test, y_pred):.5f}")
print(f"Test's logloss: {log_loss(y_test, y_pred):.5f}")

[LightGBM] [Info] Using self-defined objective function
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002856 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1303
[LightGBM] [Info] Number of data points in the train set: 143632, number of used features: 16
[LightGBM] [Info] Using self-defined objective function
Training until validation scores don't improve for 20 rounds
[100]	fit's focal_loss: 0.325066	val's focal_loss: 0.325833
[200]	fit's focal_loss: 0.315383	val's focal_loss: 0.318095
[300]	fit's focal_loss: 0.310531	val's focal_loss: 0.315022
[400]	fit's focal_loss: 0.30747	val's focal_loss: 0.313654
[500]	fit's focal_loss: 0.305233	val's focal_loss: 0.312942
[600]	fit's focal_loss: 0.303328	val's focal_loss: 0.312637
[700]	fit's focal_loss: 0.301592	val's focal_loss: 0.312523
[800]	fit's focal_loss: 0.299999	val's focal_loss: 0.3

In [160]:
fpr, tpr, thresholds = roc_curve(y_test, y_pred)

optimal_idx = (tpr - fpr).argmax()
optimal_threshold = thresholds[optimal_idx]

y_pred_binary = (y_pred >= optimal_threshold).astype(int)

# y_pred_binary = (y_pred >= 0.5).astype(int)

# binary_predictions = pd.DataFrame({"y_true": y_test, "y_pred": y_pred_binary})
# binary_predictions.to_csv("artifacts/predictions/lightgbm_focal_loss_predictions.csv", index=False)

predictions = pd.DataFrame({"y_true": y_test, "y_pred": y_pred})
predictions.to_csv("artifacts/lightgbm_preds/loan/lightgbm_focal_loss_continuous_predictions.csv", index=False)

print(f"Optimal Threshold: {optimal_threshold:.5f}")
print("Binary predictions saved successfully!")

Optimal Threshold: 0.11746
Binary predictions saved successfully!


## ROC-Star

In [16]:
class ROCStarLoss:

    def __init__(self, delta=2, gamma=0.4):

        self.delta = delta

        self.gamma = gamma

        self.epoch_true = None

        self.epoch_pred = None

        self.bce_epoch = 0

        self.BCE = nn.BCELoss()

 

    def calc_loss(self, y_true, y_pred):

       

        # If first epoch, BCE Loss

        if self.bce_epoch > 0:

            self.bce_epoch -= 1

            self.epoch_true = y_true.clone()

            self.epoch_pred = y_pred.clone()

 

            return self.BCE(y_pred, y_true)

 

        # B/W cmparison for Appeal/No Appeal - [B(+), W(-)]

 

        pos_ind = y_true >= 0.5

        neg_ind = y_true < 0.5

 

        B = y_pred[pos_ind]

        W = y_pred[neg_ind]

 

        B_shifted = B - self.gamma

 

        # Batch-wise loss calculatoin

        batch_size = 30000

        loss = 0.0

 

        for i in range(0, len(W), batch_size):

            W_batch = W[i:i + batch_size]

 

            comparisons_batch = W_batch.unsqueeze(1) - B_shifted.unsqueeze(0)

 

            loss += torch.sum((torch.clamp(comparisons_batch, min=0))**2)

 

        # end code #

 

        # full dataset loss

        # comparisons = W.unsqueeze(1) - B_shifted.unsqueeze(0)

 

        # loss_matrix = torch.clamp(comparisons, min=0)

        # loss2 = torch.sum(loss_matrix ** 2)

        # print("Losses for full and batch-wise")

        # print(loss)

        # print(loss2)

        # end code #

 

        return loss

 

    def init_score(self, y_true):

        p = np.mean(y_true)

        p = np.clip(p, 1e-15, 1 - 1e-15)

        log_odds = np.log(p / (1 - p))

        return log_odds

 

    def grad(self, y_true, y_pred):

        B_ind = y_true >= 0.5

        W_ind = y_true < 0.5

 

        B = y_pred[B_ind]

        W = y_pred[W_ind]

 

        dB = B * (1 - B)

        dW = W * (1 - W)

 

        # Batching for GPU Memory saving

        batch_size = 30000

 

        sum_comparisons_B = torch.zeros_like(B)

        sum_comparisons_W = torch.zeros_like(W)

 

        for i in range(0, len(W), batch_size):

            W_batch = W[i: i + batch_size]

 

            comparisons_B_batch = W_batch.unsqueeze(1) - B.unsqueeze(0) + self.gamma

 

            sum_comparisons_B_batch = torch.clamp(comparisons_B_batch, min=0).sum(dim=0)

            sum_comparisons_W_batch = torch.clamp(comparisons_B_batch, min=0).sum(dim=1)

 

            sum_comparisons_B += sum_comparisons_B_batch

            sum_comparisons_W[i:i + batch_size] += sum_comparisons_W_batch

 

        dLdx = -2 * sum_comparisons_B * dB

        dLdy = 2 * sum_comparisons_W * dW

 

        # end code #

 

        # Normal matrix code for full dataset

        # comparisons = W.unsqueeze(1) - B.unsqueeze(0) + self.gamma

        # comparisons = torch.clamp(comparisons, min=0)

 

        # dLdx2 = -2 * torch.sum(comparisons, dim=0) * dB

        # dLdy2 = 2 * torch.sum(comparisons, dim=1) * dW

 

        # print('Sums')

        # print(torch.all(torch.isclose(dLdx, dLdx2)))

        # print(torch.all(torch.isclose(dLdy, dLdy2)))

        # end code #

 

        dL = torch.zeros_like(y_pred)

        dL[B_ind] = dLdx

        dL[W_ind] = dLdy

 

        return dL

 

 

    def hess(self, y_true, y_pred):

        B_ind = y_true >= 0.5

        W_ind = y_true < 0.5

 

        B = y_pred[B_ind]

        W = y_pred[W_ind]

 

        B_hessian = torch.zeros_like(B)

        W_hessian = torch.zeros_like(W)

 

        batch_size = 20000

 

        for i in range(0, len(W), batch_size):

            W_batch = W[i:i + batch_size]

 

            margin_matrix_batch = W_batch.unsqueeze(1) - B.unsqueeze(0) + self.gamma

            hessian_matrix_batch = torch.clamp(margin_matrix_batch, min=0)

 

            sum_dx_batch = torch.sum(hessian_matrix_batch, dim=0)

            sum_dy_batch = torch.sum(hessian_matrix_batch, dim=1)

 

            B_hessian += -2*B*(1-B)*sum_dx_batch

            B_hessian += 4*B*((1-B)**2)*sum_dx_batch

            B_hessian += torch.sum(2*((B*(1-B))**2) * (hessian_matrix_batch != 0), dim=0)

 

            W_hessian[i:i + batch_size] += 2*W_batch*(1-W_batch)*sum_dy_batch

            W_hessian[i:i + batch_size] += -4*W_batch*((1-W_batch)**2)*sum_dy_batch

            W_hessian[i:i + batch_size] += torch.sum(2*((W_batch*(1-W_batch))**2) * (hessian_matrix_batch != 0).T, dim=0)

 

        # Entire Dataset

 

        # margin_matrix = W.unsqueeze(1) - B.unsqueeze(0) + self.gamma

        # hessian_matrix = torch.clamp(margin_matrix, min=0)

 

        # sum_dx = torch.sum(hessian_matrix, dim=0)

        # sum_dy = torch.sum(hessian_matrix, dim=1)

 

        # B_hessian2 = -2*B*(1-B)*sum_dx + 4*B*((1-B)**2)*sum_dx + torch.sum(2*((B*(1-B))**2) * (hessian_matrix != 0), dim=0)

        # W_hessian2 = 2*W*(1-W)*sum_dy - 4*W*((1-W)**2)*sum_dy + torch.sum(2*((W*(1-W))**2) * (hessian_matrix != 0).T, dim=0)

 

        # end code #

 

        # print('Hessians')

        # if not torch.all(torch.isclose(B_hessian, B_hessian2, atol=1e-1)).item():

        #     print(B_hessian)

        #     print(B_hessian2)

        # print(torch.all(torch.isclose(W_hessian, W_hessian2)))

 

        hessians = torch.zeros_like(y_pred)

        hessians[B_ind] = B_hessian

        hessians[W_ind] = W_hessian

 

        return hessians

 

 

    def calc_grad_hess(self, y_true, y_pred):

        y_true = torch.tensor(y_true, dtype=torch.float32).cuda()

        y_pred = torch.tensor(y_pred, dtype=torch.float32).cuda()

 

        grad = self.grad(y_true, y_pred)

        hess = self.hess(y_true, y_pred)

 

        # del y_true

        # del y_pred

 

        grad = grad.cpu().detach().numpy()

        hess = hess.cpu().detach().numpy()

 

        # return grad, np.ones(grad.shape)

        return grad, hess

 

    def rocstar_obj(self, preds, train_data):

        y = train_data.get_label()

        p = special.expit(preds)

 

        grad, hess = self.calc_grad_hess(y, p)

 

        return grad, hess

 

    def rocstar_eval(self, preds, train_data):

        y = train_data.get_label()

        p = special.expit(preds)

 

        loss_metric = 'bce_loss' if self.bce_epoch > 0 else 'rocstar_loss'

        loss = self.calc_loss(torch.tensor(y, dtype=torch.float32).cuda(), torch.tensor(p, dtype=torch.float32).cuda())

        is_higher_better = False

 

        return loss_metric, loss.item(), is_higher_better

In [17]:
def train_model(X_train, y_train, X_fit, X_val, y_fit, y_val):
    fit = lgb.Dataset(X_fit, y_fit, free_raw_data=False)
    val = lgb.Dataset(X_val, y_val, reference=fit, free_raw_data=False)
    gamma = 0.8
    
    rocstar = ROCStarLoss(gamma=gamma)

    params = {
        'learning_rate': 0.03,
        'num_leaves': 100,
        'gamma': gamma,
        'is_unbalance': True,
        'objective': rocstar.rocstar_obj
    }

    print('#######   Training LightGBM with roc-star   #######')
    model = lgb.train(
        params=params,
        train_set=fit,
        num_boost_round=10000,
        valid_sets=(fit, val),
        valid_names=('fit', 'val'),
        callbacks=[
            lgb.early_stopping(stopping_rounds=30),
            lgb.log_evaluation(period=1)
        ],
        feval=rocstar.rocstar_eval,
    )
    return model

In [18]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    random_state=42
)

X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train,
    random_state=42
)

rocstar_model = train_model(X_train, y_train, X_fit, X_val, y_fit, y_val)

#######   Training LightGBM with roc-star   #######
[LightGBM] [Info] Using self-defined objective function
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002256 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1303
[LightGBM] [Info] Number of data points in the train set: 143632, number of used features: 16
[LightGBM] [Info] Using self-defined objective function
[1]	fit's rocstar_loss: 1.32837e+09	val's rocstar_loss: 1.47072e+08
Training until validation scores don't improve for 30 rounds
[2]	fit's rocstar_loss: 1.30106e+09	val's rocstar_loss: 1.44307e+08
[3]	fit's rocstar_loss: 1.27426e+09	val's rocstar_loss: 1.41592e+08
[4]	fit's rocstar_loss: 1.2478e+09	val's rocstar_loss: 1.38931e+08
[5]	fit's rocstar_loss: 1.222e+09	val's rocstar_loss: 1.36314e+08
[6]	fit's rocstar_loss: 1.19621e+09	val's rocstar_loss: 1.33732e+08
[7]	fit's rocsta

In [19]:
def predict_results(model, X_test, y_test):
    y_pred = special.expit(model.predict(X_test))
    fpr, tpr, thresholds = roc_curve(y_test, y_pred)

    optimal_idx = (tpr - fpr).argmax()
    optimal_threshold = thresholds[optimal_idx]
    
    y_pred_binary = (y_pred >= optimal_threshold).astype(int)
    # y_pred_binary = (y_pred > 0.5).astype(int)
    print(f"Test's ROC AUC: {roc_auc_score(y_test, y_pred):.5f}")
    print(f"Test's logloss: {log_loss(y_test, y_pred):.5f}")

    return y_pred, y_pred_binary

In [20]:
y_pred, y_pred_binary = predict_results(rocstar_model, X_test, y_test)

# binary_predictions = pd.DataFrame({"y_true": y_test, "y_pred": y_pred_binary})
# binary_predictions.to_csv("artifacts/predictions/lightgbm_rocstar_predictions.csv", index=False)

predictions = pd.DataFrame({"y_true": y_test, "y_pred": y_pred})
predictions.to_csv("artifacts/lightgbm_preds/loan/lightgbm_rocstar_continuous_predictions.csv", index=False)

Test's ROC AUC: 0.74620
Test's logloss: 0.86630
