In [None]:
"""
03_modeling_PD_LGD_EAD.py

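Trains random-forest models to predict Probability of Default (PD), Loss Given Default (LGD),
and Exposure at Default (EAD). Drops rows with missing feature or target values, makes a
stratified 70/30 train-test split for the PD classifier and reports its classification
metrics and AUC, then fits the LGD and EAD regressors on defaulted loans only (using
simulated LGD and EAD targets) and writes the scored dataset to CSV.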
"""


In [None]:
import pandas as pd
import numpy as np

# Load the engineered loan dataset; both date columns are parsed as datetimes.
df = pd.read_csv(
    '../data/engineered_loan_data.csv',
    parse_dates=['OriginationDate', 'DefaultDate'],
)

# Predictor columns shared by the PD, LGD and EAD models.
features = [
    'CreditScore',
    'Income',
    'LoanAmount',
    'CustomerAge',
    'DTI',
    'LTI',
    'GDP_Growth_3MAvg',
    'Unemployment_3MAvg',
    'Inflation_3MAvg',
]

# Discard, in place, every row that is missing a predictor or the default flag.
model_columns = features + ['Defaulted']
df.dropna(subset=model_columns, inplace=True)

# Default-flag target and feature matrix for the PD classifier.
y_pd = df['Defaulted']
X = df[features]

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score

# Hold out 30% of the loans to evaluate the PD classifier. Stratifying on the
# target keeps the default rate the same in the train and test partitions.
pd_split = train_test_split(X, y_pd, test_size=0.3, random_state=42, stratify=y_pd)
X_train, X_test, y_train, y_test = pd_split

# Class balance of each partition and of the full target.
for split_heading, split_target in (
    ("Train class distribution:", y_train),
    ("Test class distribution:", y_test),
    ("Full data class distribution:", y_pd),
):
    print(split_heading)
    print(split_target.value_counts())

# Sanity checks: missing values per column, missing values per default class,
# and the default-flag counts taken straight from the DataFrame.
missing_per_column = df[features + ['Defaulted']].isna().sum()
print(missing_per_column)
missing_per_class = df.groupby('Defaulted')[features].apply(lambda grp: grp.isna().sum())
print(missing_per_class)
print(df['Defaulted'].value_counts())


# Fit the PD classifier on the training partition only (fit returns the estimator).
pd_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out partition: class probabilities first, then hard labels.
# The second probability column belongs to the second class in
# pd_model.classes_ (Defaulted == 1 when the labels are 0/1).
test_class_probabilities = pd_model.predict_proba(X_test)
y_proba = test_class_probabilities[:, 1]
y_pred = pd_model.predict(X_test)

print("PD Model classification report:")
print(classification_report(y_test, y_pred))
pd_auc = roc_auc_score(y_test, y_proba)
print("PD Model AUC:", pd_auc)


# Score every loan with the fitted PD classifier and keep the probability of
# the second class as the PD column. X also contains the rows the classifier
# was trained on, so the PD values for those rows are in-sample estimates.
full_class_probabilities = pd_model.predict_proba(X)
df['PD'] = full_class_probabilities[:, 1]

# Prepare LGD modeling on only defaulted loans
df_defaulted = df[df['Defaulted'] == 1].copy()

# Simulate RecoveryRate and LGD.
# RecoveryRate is drawn from Beta(5, 2) rather than observed, so
# LGD = 1 - RecoveryRate is generated independently of the features.
# The global NumPy seed set here also drives any later np.random draws.
np.random.seed(42)
df_defaulted['RecoveryRate'] = np.random.beta(5, 2, size=len(df_defaulted))
df_defaulted['LGD'] = 1 - df_defaulted['RecoveryRate']

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# LGD model (regression on defaulted loans only)
X_lgd = df_defaulted[features]
y_lgd = df_defaulted['LGD']

# Hold-out evaluation: fit a separate regressor on 70% of the defaulted loans
# and report its MSE on the remaining 30%. The estimators are seeded through
# random_state, so the global NumPy random stream is left untouched. Skipped
# when there are too few defaulted loans to split.
if len(df_defaulted) >= 2:
    X_lgd_train, X_lgd_test, y_lgd_train, y_lgd_test = train_test_split(
        X_lgd, y_lgd, test_size=0.3, random_state=42
    )
    lgd_eval_model = RandomForestRegressor(n_estimators=100, random_state=42)
    lgd_eval_model.fit(X_lgd_train, y_lgd_train)
    lgd_holdout_mse = mean_squared_error(y_lgd_test, lgd_eval_model.predict(X_lgd_test))
    print("LGD Model hold-out MSE:", lgd_holdout_mse)

# Final LGD model: fitted on all defaulted loans, exactly as before.
lgd_model = RandomForestRegressor(n_estimators=100, random_state=42)
lgd_model.fit(X_lgd, y_lgd)

# Predict LGD for every loan, defaulted or not.
df['LGD'] = lgd_model.predict(df[features])

# EAD modeling on defaulted loans only
# EAD is simulated as LoanAmount scaled by a factor drawn uniformly from
# [0.9, 1.1). The draw uses the global NumPy random stream, so it is only
# reproducible when it runs after the np.random.seed(42) call earlier in this
# script, with the same draws in between.
df_defaulted['EAD'] = df_defaulted['LoanAmount'] * np.random.uniform(0.9, 1.1, size=len(df_defaulted))

X_ead = df_defaulted[features]
y_ead = df_defaulted['EAD']

# Hold-out evaluation: fit a separate regressor on 70% of the defaulted loans
# and report its MSE on the remaining 30%. Skipped when there are too few
# defaulted loans to split.
if len(df_defaulted) >= 2:
    X_ead_train, X_ead_test, y_ead_train, y_ead_test = train_test_split(
        X_ead, y_ead, test_size=0.3, random_state=42
    )
    ead_eval_model = RandomForestRegressor(n_estimators=100, random_state=42)
    ead_eval_model.fit(X_ead_train, y_ead_train)
    ead_holdout_mse = mean_squared_error(y_ead_test, ead_eval_model.predict(X_ead_test))
    print("EAD Model hold-out MSE:", ead_holdout_mse)

# Final EAD model: fitted on all defaulted loans, exactly as before.
ead_model = RandomForestRegressor(n_estimators=100, random_state=42)
ead_model.fit(X_ead, y_ead)

# Predict EAD for every loan, defaulted or not.
df['EAD'] = ead_model.predict(df[features])


# Write the DataFrame, including the PD, LGD and EAD columns added above,
# to CSV without the index column.
model_outputs_path = '../data/model_outputs.csv'
df.to_csv(model_outputs_path, index=False)


Train class distribution:
Defaulted
0    2811
1     689
Name: count, dtype: int64
Test class distribution:
Defaulted
0    1205
1     295
Name: count, dtype: int64
Full data class distribution:
Defaulted
0    4016
1     984
Name: count, dtype: int64
CreditScore           0
Income                0
LoanAmount            0
CustomerAge           0
DTI                   0
LTI                   0
GDP_Growth_3MAvg      0
Unemployment_3MAvg    0
Inflation_3MAvg       0
Defaulted             0
dtype: int64
           CreditScore  Income  LoanAmount  CustomerAge  DTI  LTI  \
Defaulted                                                           
0                    0       0           0            0    0    0   
1                    0       0           0            0    0    0   

           GDP_Growth_3MAvg  Unemployment_3MAvg  Inflation_3MAvg  
Defaulted                                                         
0                         0                   0                0  
1                   