# Regime-Aware Credit PD Pipeline

This notebook follows the step-by-step instructions in README.md to build a regime-aware credit probability-of-default (PD) modeling pipeline.

## Outline
1. Read README.md File
2. Parse Instructions from README.md
3. Execute Instructions Programmatically

In [None]:
# Section 1: Read README.md File
# Pull the entire README into memory as UTF-8 text; later cells parse this string.
with open('README.md', mode='r', encoding='utf-8') as readme_file:
    readme_content = readme_file.read()

# Only the leading 1000 characters are shown, as a quick preview.
readme_preview = readme_content[:1000]
print(readme_preview)

In [None]:
# Section 2: Parse Instructions from README.md
import re

# Emoji markers U+1F4C0 .. U+1F4C9 that may introduce a step in the README.
# Python's `re` has no `\u{...}` escape (that spelling raises
# "incomplete escape \u"); code points above U+FFFF need the 8-digit
# `\UXXXXXXXX` form, and the ten contiguous markers collapse into one class.
step_marker = r'[\U0001F4C0-\U0001F4C9]'

# A match is either a lone marker emoji, or a "STEP <n> ..." block that runs
# (lazily, across newlines) up to a blank line, the next marker line, or the end.
step_pattern = rf'{step_marker}|STEP \d+.*?(?=\n\n|\n{step_marker}|$)'

# Extract step instructions from README.md
steps = re.findall(step_pattern, readme_content, re.DOTALL)

if not steps:
    # Fallback: split in front of every 'STEP <n>' heading, discarding
    # empty / whitespace-only pieces (e.g. the chunk before the first heading).
    steps = [
        chunk
        for chunk in re.split(r'(?=STEP \d+)', readme_content)
        if chunk.strip()
    ]

# Preview at most 500 characters of each extracted step.
for i, step in enumerate(steps):
    print(f"\n--- Step {i+1} ---\n{step[:500]}")

## Section 3: Execute Instructions Programmatically

The following cells implement each step of the README.md instructions: environment setup, data loading, HMM fitting, merging, modeling, calibration, and comparison.

In [None]:
# STEP 0 — Setup Environment
# Data handling and plotting libraries.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# scikit-learn pieces: data splitting, the logistic baseline, metrics and calibration.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
# Hidden Markov model (fitted in STEP 3) and gradient boosting (STEP 6 onwards).
from hmmlearn.hmm import GaussianHMM
import xgboost as xgb

# Reaching this line means every import above succeeded.
print('Environment setup complete. Libraries imported.')

In [None]:
# STEP 1 — Load Borrower Dataset
# For prototype, generate synthetic borrower data: one record per month-end date.
np.random.seed(42)  # fixed seed so every Restart & Run All yields the same draws
num_records = 1000

# Use the MonthEnd offset object rather than the 'M' string alias: the alias is
# deprecated in pandas 2.2+ (renamed 'ME'), while the offset object produces the
# same month-end stamps on old and new pandas alike.
dates = pd.date_range('2015-01-01', periods=num_records, freq=pd.offsets.MonthEnd())

# Column order fixes the order of the random draws; keep it stable.
borrower_df = pd.DataFrame({
    'date': dates,
    'leverage': np.random.normal(2, 0.5, num_records),
    'interest_coverage': np.random.normal(5, 1.5, num_records),
    'asset_growth': np.random.normal(0.05, 0.02, num_records),
    'default': np.random.binomial(1, 0.08, num_records)
})

# `dates` is already datetime64, so no extra conversion is needed; sorting keeps
# the frame in time order for the positional train/test split used later.
borrower_df = borrower_df.sort_values('date').reset_index(drop=True)
print(borrower_df.head())

In [None]:
# STEP 2 — Create Synthetic Macro Data
# One macro observation per distinct borrower date.
macro_dates = borrower_df['date'].unique()
macro_df = pd.DataFrame({'date': macro_dates})

# Create regime periods as half-open positional row ranges [start, end),
# each with the mean level of every macro variable inside that range.
regime_periods = [
    (0, 300, {'gdp_growth': 0.03, 'unemployment': 0.04, 'interest_rate': 0.03}),  # strong growth
    (300, 700, {'gdp_growth': 0.02, 'unemployment': 0.06, 'interest_rate': 0.04}), # moderate
    (700, 1000, {'gdp_growth': 0.01, 'unemployment': 0.08, 'interest_rate': 0.05}) # low growth
]

# Standard deviation of the Gaussian noise added around each regime mean.
macro_noise_sd = {'gdp_growth': 0.005, 'unemployment': 0.01, 'interest_rate': 0.005}

for macro_col in macro_noise_sd:
    macro_df[macro_col] = 0.0

n_macro_rows = len(macro_df)
for start, end, values in regime_periods:
    # Positional, end-exclusive slicing. Label-based `.loc[start:end]` is
    # end-INclusive, which made neighbouring periods overwrite each other's
    # boundary row and made the last period (label 1000 does not exist) select
    # fewer rows than the number of values drawn, raising a ValueError.
    stop = min(end, n_macro_rows)
    n_rows = stop - start
    if n_rows <= 0:
        continue  # period lies entirely beyond the available dates
    for macro_col, noise_sd in macro_noise_sd.items():
        col_pos = macro_df.columns.get_loc(macro_col)
        macro_df.iloc[start:stop, col_pos] = np.random.normal(values[macro_col], noise_sd, n_rows)

print(macro_df.head())

In [None]:
# STEP 3 — Fit Hidden Markov Model
from sklearn.preprocessing import StandardScaler

# Macro series that drive the latent regime.
macro_vars = ['gdp_growth', 'unemployment', 'interest_rate']

# Standardise the macro series before handing them to the HMM.
scaler = StandardScaler()
scaler.fit(macro_df[macro_vars])
macro_scaled = scaler.transform(macro_df[macro_vars])

# NOTE(review): scaler and HMM are both fitted on the full timeline, so the
# regime labels for early rows are informed by later rows — confirm this
# look-ahead is acceptable for the prototype.
hmm = GaussianHMM(n_components=3, covariance_type='full', random_state=42).fit(macro_scaled)
macro_df = macro_df.assign(regime=hmm.predict(macro_scaled))

print('Transition matrix:')
print(hmm.transmat_)

# Mean of each (unscaled) macro variable within each decoded regime.
regime_means = macro_df.groupby('regime')[macro_vars].mean()
print('Average macro values per regime:')
print(regime_means)

In [None]:
# STEP 4 — Merge Regime with Borrower Data
# Attach the regime decoded for each date to the borrower rows.
# - Any 'regime' column left by a previous run of this cell is dropped first, so
#   re-executing the cell does not produce 'regime_x' / 'regime_y' duplicates.
# - validate='many_to_one' makes pandas raise if macro_df ever holds a date twice,
#   which would otherwise silently duplicate borrower rows.
borrower_df = (
    borrower_df
    .drop(columns=['regime'], errors='ignore')
    .merge(macro_df[['date', 'regime']], on='date', how='left', validate='many_to_one')
)

# A left join leaves NaN where a borrower date has no macro row; fail loudly here
# rather than feeding missing regimes into the models.
assert borrower_df['regime'].notna().all(), 'some borrower dates have no regime label'

print(borrower_df.head())

In [None]:
# STEP 5 — Train Baseline PD Model (No Regime)
# Borrower-level predictors only; the regime label is left out of this model.
features = ['leverage', 'interest_coverage', 'asset_growth']

# Split by date: the first 70% of rows train the model, the remainder tests it
# (relies on borrower_df still being in date order).
split_idx = int(0.7 * len(borrower_df))
train_df, test_df = borrower_df.iloc[:split_idx], borrower_df.iloc[split_idx:]

X_train, y_train = train_df[features], train_df['default']
X_test, y_test = test_df[features], test_df['default']

# Fit the logistic baseline, then score the hold-out rows.
lr = LogisticRegression().fit(X_train, y_train)
test_scores_lr = lr.predict_proba(X_test)
pred_proba_lr = test_scores_lr[:, 1]  # column 1 = probability of class 1 (default)
pred_lr = lr.predict(X_test)

auc_lr = roc_auc_score(y_test, pred_proba_lr)
cm_lr = confusion_matrix(y_test, pred_lr)

print(f'Baseline Logistic Regression AUC: {auc_lr:.3f}')
print('Confusion Matrix:')
print(cm_lr)

# Calibration check: predicted PD against the realised 0/1 outcome.
fig_lr, ax_lr = plt.subplots(figsize=(6,4))
ax_lr.scatter(pred_proba_lr, y_test, alpha=0.2)
ax_lr.set_xlabel('Predicted PD')
ax_lr.set_ylabel('Actual Default')
ax_lr.set_title('Default vs Predicted PD (Baseline)')
plt.show()

In [None]:
# STEP 6 — Train Regime-Aware PD Model (XGBoost)
from sklearn.preprocessing import LabelEncoder

# Encode the HMM regime label as an integer category column.
regime_encoder = LabelEncoder()
borrower_df['regime_cat'] = regime_encoder.fit_transform(borrower_df['regime'])

features_regime = features + ['regime_cat']

# Slice the design matrices from borrower_df itself: train_df / test_df were cut
# in STEP 5, before 'regime_cat' existed, so indexing them with features_regime
# raises KeyError. Re-using split_idx keeps the rows aligned with y_train / y_test.
X_train_regime = borrower_df.iloc[:split_idx][features_regime]
X_test_regime = borrower_df.iloc[split_idx:][features_regime]

xgb_model = xgb.XGBClassifier(max_depth=3, learning_rate=0.05, n_estimators=100, use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train_regime, y_train)
pred_proba_xgb = xgb_model.predict_proba(X_test_regime)[:, 1]  # P(default)
pred_xgb = xgb_model.predict(X_test_regime)

auc_xgb = roc_auc_score(y_test, pred_proba_xgb)
print(f'Regime-Aware XGBoost AUC: {auc_xgb:.3f}')

In [None]:
# STEP 7 — Add Monotonic Constraints
monotone_constraints = [1, -1, 0, 0]  # leverage (increasing), interest_coverage (decreasing), asset_growth (none), regime_cat (none)

# One constraint per model input, in column order; a mismatch would silently
# constrain the wrong feature, so check it up front.
assert len(monotone_constraints) == X_train_regime.shape[1], 'need exactly one constraint per feature'

# XGBoost documents the string "(1,-1,0,0)", tuple and dict forms for this
# parameter; a plain Python list is not among them, so pass the string form.
monotone_constraints_param = '(' + ','.join(str(c) for c in monotone_constraints) + ')'

xgb_monotone = xgb.XGBClassifier(max_depth=3, learning_rate=0.05, n_estimators=100, monotone_constraints=monotone_constraints_param, use_label_encoder=False, eval_metric='logloss')
xgb_monotone.fit(X_train_regime, y_train)
pred_proba_xgb_monotone = xgb_monotone.predict_proba(X_test_regime)[:, 1]  # P(default)
auc_xgb_monotone = roc_auc_score(y_test, pred_proba_xgb_monotone)
print(f'Monotone XGBoost AUC: {auc_xgb_monotone:.3f}')

In [None]:
# STEP 8 — Probability Calibration
# Calibrate with internal cross-validation rather than cv='prefit'.
# With cv='prefit' the isotonic map was fitted on the very rows xgb_monotone had
# been trained on, so it was learned from overfit (over-confident) scores.
# With cv=5, CalibratedClassifierCV clones xgb_monotone, refits each clone on
# four folds and fits the calibrator on the held-out fold; test-time
# probabilities are averaged over the five calibrated clones.
calibrator = CalibratedClassifierCV(xgb_monotone, method='isotonic', cv=5)
calibrator.fit(X_train_regime, y_train)
pred_proba_calibrated = calibrator.predict_proba(X_test_regime)[:, 1]  # P(default)

# Calibration curve: calibrated vs raw monotone-model probabilities on the test set.
plt.figure(figsize=(6,4))
prob_true, prob_pred = calibration_curve(y_test, pred_proba_calibrated, n_bins=10)
plt.plot(prob_pred, prob_true, marker='o', label='Calibrated')
prob_true_raw, prob_pred_raw = calibration_curve(y_test, pred_proba_xgb_monotone, n_bins=10)
plt.plot(prob_pred_raw, prob_true_raw, marker='x', label='Raw')
plt.xlabel('Predicted PD')
plt.ylabel('Actual Default Rate')
plt.title('Calibration Curve')
plt.legend()
plt.show()

auc_calibrated = roc_auc_score(y_test, pred_proba_calibrated)
print(f'Calibrated XGBoost AUC: {auc_calibrated:.3f}')

In [None]:
# STEP 9 — Compare Results
# One row per fitted model, in the order the models were built. The monotone
# model from STEP 7 is listed as well, and the calibrated row is described as
# what it is: the monotone model wrapped by the STEP 8 calibrator.
results = pd.DataFrame({
    'Model': ['Baseline Logistic', 'XGBoost', 'Monotone XGBoost', 'Calibrated XGBoost'],
    'AUC': [auc_lr, auc_xgb, auc_xgb_monotone, auc_calibrated],
    'Comments': [
        'No regime',
        'With regime',
        'With regime + monotone constraints',
        'With regime + monotone constraints + calibration',
    ]
})
print(results)


In [None]:
# STEP 10 — Print Insights
# Transition probabilities between the regimes learned by the HMM.
print('Regime transition matrix:')
print(hmm.transmat_)

# Observed share of defaults among borrower rows in each regime.
print('Default rate per regime:')
default_rate_per_regime = borrower_df.groupby('regime')['default'].mean()
print(default_rate_per_regime)

print('AUC comparison:')
print(results)

# Importances come from the unconstrained regime-aware model (xgb_model),
# reported in the same order as its input columns.
print('Feature importance from XGBoost:')
importances = xgb_model.feature_importances_
for feature_name, importance in zip(features_regime, importances):
    importance_line = f'{feature_name}: {importance:.3f}'
    print(importance_line)