# Credit Risk PD/LGD Model

This notebook demonstrates credit risk modeling for:
- **PD** (Probability of Default) - Classification
- **LGD** (Loss Given Default) - Regression
- **EL** (Expected Loss) = PD × LGD × EAD, where **EAD** is the Exposure at Default

Author: Avni Derashree

In [None]:
import sys
sys.path.insert(0, '..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from src.data_loader import prepare_credit_data
from src.feature_engineering import FeatureEngineer, calculate_information_value
from src.pd_model import train_all_pd_models, model_comparison_table
from src.lgd_model import train_all_lgd_models, lgd_comparison_table
from src.evaluation import create_ks_table, calculate_ks_statistic
from src.visualization import (
    plot_roc_curve, plot_ks_chart, plot_calibration_curve,
    plot_feature_importance, plot_lift_chart
)

%matplotlib inline
plt.style.use('seaborn-v0_8-whitegrid')

## 1. Load Credit Data

In [None]:
# Load the synthetic credit dataset as train/test features and targets
X_train, X_test, y_train, y_test = prepare_credit_data(use_synthetic=True)

# Summarise the split sizes and the training-set default rate, one per line
print(
    f"Training samples: {len(X_train)}",
    f"Test samples: {len(X_test)}",
    f"Default rate: {y_train['default'].mean():.2%}",
    sep="\n",
)

In [None]:
# Preview the first few rows of the training features to sanity-check the load.
# Being the last expression in the cell, the result is rendered as the cell output.
X_train.head()

## 2. Feature Engineering

In [None]:
# Fit the feature pipeline on the training split, then apply the already-fitted
# transformer to the test split
fe = FeatureEngineer()
X_train_fe = fe.fit_transform(X_train)
X_test_fe = fe.transform(X_test)

# Column counts before and after feature engineering
print(
    f"Original features: {X_train.shape[1]}",
    f"Engineered features: {X_train_fe.shape[1]}",
    sep="\n",
)

In [None]:
# Information Value analysis of the features against the default flag.
# NOTE(review): this is computed on the raw X_train, not on the engineered
# X_train_fe used to train the PD models below — confirm that is intended.
iv_df = calculate_information_value(X_train, y_train['default'])
# Show the first 15 rows (presumably ordered by IV, strongest first — verify
# against src.feature_engineering).
iv_df.head(15)

## 3. Train PD Models

In [None]:
# Train multiple PD models on the engineered training features with the binary
# default flag as target; the test split is passed in alongside (presumably so
# each model is scored on held-out data — confirm in src.pd_model).
# pd_results is a mapping whose values are per-model result objects.
pd_results = train_all_pd_models(
    X_train_fe, y_train['default'],
    X_test_fe, y_test['default']
)

# Compare models: last expression in the cell, so the table is rendered as output
model_comparison_table(pd_results)

In [None]:
# Choose the PD model with the highest ROC AUC; on ties, max() keeps the first
# one encountered in pd_results
best_pd = max(pd_results.values(), key=lambda result: result.roc_auc)
print(best_pd)

## 4. Model Evaluation

In [None]:
# Get predictions: score the engineered test features with the selected model.
# [:, 1] keeps the second probability column (presumably P(default = 1),
# assuming a scikit-learn-style binary classifier — confirm).
pd_predictions = best_pd.model.predict_proba(X_test_fe)[:, 1]

# ROC Curve for the selected model: observed test defaults vs. predicted PDs
plot_roc_curve(y_test['default'].values, pd_predictions, best_pd.model_name)
plt.show()

In [None]:
# KS Chart: observed test defaults vs. predicted PDs from the selected model
# (presumably the Kolmogorov-Smirnov separation plot — see src.visualization).
plot_ks_chart(y_test['default'].values, pd_predictions)
plt.show()

In [None]:
# Calibration Curve: compares predicted PDs with observed test defaults for the
# selected model (presumably predicted vs. realised default rate per bin —
# see src.visualization).
plot_calibration_curve(y_test['default'].values, pd_predictions, best_pd.model_name)
plt.show()

In [None]:
# Feature Importance: plot the importances stored on the selected PD model's
# result object, limited via top_n=15 (presumably the 15 strongest features).
plot_feature_importance(best_pd.feature_importance, top_n=15)
plt.show()

In [None]:
# Lift Chart: observed test defaults vs. predicted PDs from the selected model
# (presumably lift per score band relative to the overall default rate —
# see src.visualization).
plot_lift_chart(y_test['default'].values, pd_predictions)
plt.show()

## 5. KS Table Analysis

In [None]:
# Create KS table from the observed test defaults and the predicted PDs
ks_table = create_ks_table(y_test['default'].values, pd_predictions)
# Display only the summary columns; as the last expression in the cell, the
# selection is rendered as the cell output.
ks_table[['decile', 'total', 'events', 'event_rate', 'cum_event_pct', 'ks']]

## 6. LGD Modeling

In [None]:
# Boolean masks selecting the defaulted loans (default flag == 1) in each split
default_train = y_train['default'].eq(1)
default_test = y_test['default'].eq(1)

# Training split: features and 'lgd' target restricted to defaulted loans
X_train_lgd, y_train_lgd = X_train[default_train], y_train.loc[default_train, 'lgd']
# Test split: same restriction using the test-set mask
X_test_lgd, y_test_lgd = X_test[default_test], y_test.loc[default_test, 'lgd']

print(f"LGD training samples: {len(X_train_lgd)}")

In [None]:
# Train LGD models.
# A separate FeatureEngineer is fitted on the defaulted-loan subset only, so
# the LGD features do not reuse the transformer fitted for the PD models.
fe_lgd = FeatureEngineer()
X_train_lgd_fe = fe_lgd.fit_transform(X_train_lgd)
X_test_lgd_fe = fe_lgd.transform(X_test_lgd)

# Fit the LGD models on the defaulted training loans; the defaulted test loans
# are passed alongside (presumably for held-out scoring — confirm in src.lgd_model).
lgd_results = train_all_lgd_models(
    X_train_lgd_fe, y_train_lgd,
    X_test_lgd_fe, y_test_lgd
)

# Last expression in the cell: the comparison table is rendered as output
lgd_comparison_table(lgd_results)

## 7. Expected Loss Calculation

In [None]:
# Take the LGD model with the lowest RMSE and use its mean predicted LGD as a
# single LGD value applied to every test loan
best_lgd = min(lgd_results.values(), key=lambda result: result.rmse)
mean_lgd = best_lgd.mean_lgd_pred

# Exposure at default: the loan amount when that column exists, otherwise a
# flat 10,000 per loan
if 'loan_amount' in X_test.columns:
    ead = X_test['loan_amount'].values
else:
    ead = np.full(len(X_test), 10000.0)

# Expected loss per loan: EL = PD × LGD × EAD
expected_loss = pd_predictions * mean_lgd * ead

print(f"Average PD: {pd_predictions.mean():.2%}")
print(f"Average LGD: {mean_lgd:.2%}")
print(f"Total Expected Loss: ${expected_loss.sum():,.0f}")

## 8. Risk Segmentation

In [None]:
# Create risk buckets: one row per test loan with its predicted PD and expected loss
risk_df = pd.DataFrame({
    'pd': pd_predictions,
    'expected_loss': expected_loss
})

# Assign each loan to a PD band: [0, 5%], (5%, 15%], (15%, 30%], (30%, 100%].
# include_lowest=True closes the first band on the left, so a predicted PD of
# exactly 0.0 lands in 'Low' instead of becoming NaN and silently dropping out
# of the summary below (groupby excludes NaN keys).
risk_df['risk_bucket'] = pd.cut(
    risk_df['pd'],
    bins=[0, 0.05, 0.15, 0.30, 1.0],
    labels=['Low', 'Medium', 'High', 'Very High'],
    include_lowest=True
)

# Per-bucket loan count, mean PD, and total / mean expected loss.
# observed=False keeps every bucket in the table even when it is empty, and
# states that choice explicitly rather than relying on the deprecated implicit
# default for categorical groupers.
risk_df.groupby('risk_bucket', observed=False).agg({
    'pd': ['count', 'mean'],
    'expected_loss': ['sum', 'mean']
}).round(2)