### Libraries

In [1]:
from scipy.stats import norm
import kagglehub
import pandas as pd
import numpy as np
import os
import random

  from .autonotebook import tqdm as notebook_tqdm


### Functions

In [2]:
# Enable copy-on-write mode for pandas DataFrames
pd.set_option("mode.copy_on_write", True)


# Downloads and loads the dataset from Kaggle and returns the CSV path
def download_data():
    path = kagglehub.dataset_download("uciml/default-of-credit-card-clients-dataset")
    files = os.listdir(path)
    csv_file = files[0]
    return os.path.join(path, csv_file)


# Reads in the dataset
def load_data(csv_path: str) -> pd.DataFrame:
    return pd.read_csv(csv_path, encoding="ISO-8859-1")

### Data

In [3]:
data_ = load_data(download_data())
print(data_.shape)
data_.head()

(30000, 25)


Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,1,20000.0,2,2,1,24,2,2,-1,-1,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


In [4]:
df = data_[["default.payment.next.month", "LIMIT_BAL", "ID"]].copy()
df.rename(
    columns={
        "LIMIT_BAL": "CREDIT_LINE",
        "default.payment.next.month": "DEFAULT_STATUS",
    },
    inplace=True,
)
print(df.shape)
df.head()

(30000, 3)


Unnamed: 0,DEFAULT_STATUS,CREDIT_LINE,ID
0,1,20000.0,1
1,1,120000.0,2
2,0,90000.0,3
3,0,50000.0,4
4,0,50000.0,5


In [5]:
credit_lines = df.CREDIT_LINE.values
credit_lines

array([ 20000., 120000.,  90000., ...,  30000.,  80000.,  50000.],
      shape=(30000,))

### Parameter values

In [6]:
correlation_factor = 0.04  # Correlation between defaults
# payment_default = [0.03, 0.15]          # Payment Default Range
payment_default = 0.05  # Payment Default
threshold = norm.ppf(payment_default)

In [7]:
# n_default = 0
# total_amount_losses = []
# n_losses = []

N = len(credit_lines)
n_simulations = 1000

total_losses = np.zeros(n_simulations)
n_user_default = np.zeros(n_simulations)

### Simulation

In [8]:
for i in range(n_simulations):
    systematic_factor = np.random.normal(0, 1)  # Economic environment
    idiosyncratic_factor = np.random.normal(0, 1, N)  # Individual factor

    # Vasicek Formula
    X = (
        np.sqrt(correlation_factor) * systematic_factor
        + np.sqrt(1 - correlation_factor) * idiosyncratic_factor
    )

    flag = (X < threshold).astype(int)
    n_user_default[i] = np.sum(flag)
    total_losses[i] = np.sum(flag * credit_lines)

In [9]:
print(f"Default Rate: {payment_default:.2%}")
print(f"Correlation rate: {correlation_factor}\n")

print(f"Number of simulations: {n_simulations}\n")
print(f"Mean default rate: {np.mean(n_user_default)/N:.2%}")
print(f"Mean loss amount: ${np.mean(total_losses):,.2f}\n")

print(
    f"Worst scenario: {np.max(n_user_default):.0f} defaults (${np.max(total_losses):,.0f})"
)
print(
    f"Best scenario: {np.min(n_user_default):.0f} defaults (${np.min(total_losses):,.0f})"
)

Default Rate: 5.00%
Correlation rate: 0.04

Number of simulations: 1000

Mean default rate: 5.15%
Mean loss amount: $258,850,442.08

Worst scenario: 5282 defaults ($875,270,000)
Best scenario: 311 defaults ($49,660,000)
