In [2]:
# Credit Risk Modeling - Python Template
# ---------------------------------------
import pandas as pd
import numpy as np

# Models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb

# Load dataset (replace with your file)
df = pd.read_csv("credit_data.csv")

# -----------------------------
# Step 1: Data Preprocessing
# -----------------------------
# Example cleanup (adapt for your dataset)
df = df.dropna(subset=["loan_amount", "income"])   # drop rows missing key values
df.fillna(0, inplace=True)  # impute other missing values with 0

# Encode categoricals
df = pd.get_dummies(df, drop_first=True)

# -----------------------------
# Step 2: Feature Engineering
# -----------------------------
if "credit_limit" in df.columns and "current_debt" in df.columns:
    df["credit_utilization"] = df["current_debt"] / (df["credit_limit"] + 1)

if "total_debt" in df.columns and "income" in df.columns:
    df["debt_to_income"] = df["total_debt"] / (df["income"] + 1)

# Define target and features
target = "default"   # <-- replace with actual target column (1 = default, 0 = non-default)
X = df.drop(columns=[target])
y = df[target]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# -----------------------------
# Step 3: Logistic Regression
# -----------------------------
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

log_reg = LogisticRegression(class_weight="balanced", max_iter=1000)
log_reg.fit(X_train_scaled, y_train)

y_pred_lr = log_reg.predict(X_test_scaled)
y_prob_lr = log_reg.predict_proba(X_test_scaled)[:, 1]

print("Logistic Regression Results:")
print("AUC:", roc_auc_score(y_test, y_prob_lr))
print(classification_report(y_test, y_pred_lr))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lr))

# -----------------------------
# Step 4: LightGBM
# -----------------------------
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

params = {
    "objective": "binary",
    "metric": "auc",
    "boosting": "gbdt",
    "is_unbalance": True,  # handles class imbalance
    "learning_rate": 0.05,
    "num_leaves": 31,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "verbose": -1
}

lgb_model = lgb.train(
    params,
    train_data,
    valid_sets=[test_data],
    num_boost_round=500,
    callbacks=[
        lgb.early_stopping(50),     # replaces early_stopping_rounds
        lgb.log_evaluation(50)      # replaces verbose_eval
    ]
)


y_prob_lgb = lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration)
y_pred_lgb = (y_prob_lgb > 0.5).astype(int)

print("\nLightGBM Results:")
print("AUC:", roc_auc_score(y_test, y_prob_lgb))
print(classification_report(y_test, y_pred_lgb))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lgb))

# -----------------------------
# Step 5: Predict Probability for New Borrower
# -----------------------------
# Example new applicant (replace with real values)
new_applicant = pd.DataFrame([{
    "loan_amount": 5000,
    "income": 45000,
    "current_debt": 2000,
    "credit_limit": 10000,
    "total_debt": 15000,
    # include all other features your dataset has!
}])

# Feature engineering for new applicant
new_applicant["credit_utilization"] = new_applicant["current_debt"] / (new_applicant["credit_limit"] + 1)
new_applicant["debt_to_income"] = new_applicant["total_debt"] / (new_applicant["income"] + 1)

# Align features with training data
new_applicant = new_applicant.reindex(columns=X.columns, fill_value=0)

# Logistic Regression probability
new_app_scaled = scaler.transform(new_applicant)
prob_default_lr = log_reg.predict_proba(new_app_scaled)[:, 1][0]

# LightGBM probability
prob_default_lgb = lgb_model.predict(new_applicant, num_iteration=lgb_model.best_iteration)[0]

print(f"\nPredicted Probability of Default (Logistic Regression): {prob_default_lr:.3f}")
print(f"Predicted Probability of Default (LightGBM): {prob_default_lgb:.3f}")


Logistic Regression Results:
AUC: 0.7791533233728691
              precision    recall  f1-score   support

           0       0.88      0.77      0.82     23216
           1       0.45      0.63      0.52      6784

    accuracy                           0.74     30000
   macro avg       0.66      0.70      0.67     30000
weighted avg       0.78      0.74      0.75     30000

Confusion Matrix:
 [[17897  5319]
 [ 2500  4284]]
Training until validation scores don't improve for 50 rounds
[50]	valid_0's auc: 0.778943
Early stopping, best iteration is:
[13]	valid_0's auc: 0.779585

LightGBM Results:
AUC: 0.779585181449155
              precision    recall  f1-score   support

           0       0.82      0.98      0.89     23216
           1       0.76      0.26      0.39      6784

    accuracy                           0.81     30000
   macro avg       0.79      0.62      0.64     30000
weighted avg       0.81      0.81      0.78     30000

Confusion Matrix:
 [[22656   560]
 [ 4992  1792