In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score

# --- 1. Load Data ---
# The CSV must be available in the working directory (e.g. uploaded to the
# Colab session) before this cell runs.
try:
    df = pd.read_csv("Task 3 and 4_Loan_Data.csv")
except FileNotFoundError:
    # Report the missing file in plain words, then let the error propagate so
    # later cells do not run against an undefined `df`.
    print("Error: 'Task 3 and 4_Loan_Data.csv' not found.")
    raise
else:
    print("Data loaded successfully.")
    print(df.head())

# --- 2. Define Features and Target ---
# `customer_id` is an identifier, not a predictor, so it is left out of FEATURES.
TARGET = 'default'
FEATURES = [
    'credit_lines_outstanding',
    'loan_amt_outstanding',
    'total_debt_outstanding',
    'income',
    'years_employed',
    'fico_score',
]

X, y = df[FEATURES], df[TARGET]

# 80/20 train/test split; stratifying on the target keeps the default rate
# the same in both partitions, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"\nTraining set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

Data loaded successfully.
   customer_id  credit_lines_outstanding  loan_amt_outstanding  \
0      8153374                         0           5221.545193   
1      7442532                         5           1958.928726   
2      2256073                         0           3363.009259   
3      4885975                         0           4766.648001   
4      4700614                         1           1345.827718   

   total_debt_outstanding       income  years_employed  fico_score  default  
0             3915.471226  78039.38546               5         605        0  
1             8228.752520  26648.43525               2         572        1  
2             2027.830850  65866.71246               4         602        0  
3             2501.730397  74356.88347               5         612        0  
4             1768.826187  23448.32631               6         631        0  

Training set size: 8000
Test set size: 2000


In [None]:
# --- 3. Build and Train PD Model (Logistic Regression) ---
# Two-step pipeline: standardise the features, then fit a logistic regression
# on the scaled values. Keeping both steps in one object means the scaling
# learned on the training data is reapplied automatically at prediction time.
pd_model_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('log_reg', LogisticRegression(random_state=42, solver='liblinear')),
])

print("Training the Logistic Regression (PD) model...")
pd_model_pipeline.fit(X_train, y_train)
print("Training complete.")

# --- 4. Define the Expected Loss (EL) Function ---

# Constants for EL calculation.
# LGD (Loss Given Default) is the share of the exposure that is NOT recovered:
# a 10% recovery rate gives an LGD of 0.90 (90%).
RECOVERY_RATE = 0.10
LGD = 1.0 - RECOVERY_RATE

def calculate_expected_loss(loan_properties: dict, model_pipeline: Pipeline = pd_model_pipeline) -> tuple:
    """
    Calculates the Expected Loss (EL) for a single loan based on its properties.

    Formula: EL = PD * EAD * LGD

    Args:
        loan_properties (dict): A dictionary containing the borrower's properties,
                                including the loan_amt_outstanding (used as EAD).
                                It must contain every name listed in FEATURES:
                                'credit_lines_outstanding', 'loan_amt_outstanding',
                                'total_debt_outstanding', 'income', 'years_employed',
                                'fico_score'. Keys not in FEATURES are ignored.
        model_pipeline (Pipeline): The trained scikit-learn pipeline (PD model).
                                Note: the default is bound to the
                                `pd_model_pipeline` object that exists when this
                                function is defined. If the model is later
                                rebuilt into a new object, pass it explicitly or
                                re-run this definition.

    Returns:
        tuple: `(expected_loss, pd_prediction, ead, lgd)` as plain floats, where
               `expected_loss` is in currency units, `pd_prediction` is the
               predicted probability of the positive class (default=1), `ead`
               is the loan amount outstanding and `lgd` is the module-level
               LGD constant.

    Raises:
        ValueError: If one or more required features are missing from
                    `loan_properties`.
    """
    # 0. Fail early with a precise message. Without this check a missing key is
    #    silently turned into NaN by the DataFrame constructor and only surfaces
    #    later as an opaque error from inside the estimator.
    missing_features = [name for name in FEATURES if name not in loan_properties]
    if missing_features:
        raise ValueError(
            f"loan_properties is missing required feature(s): {missing_features}"
        )

    # 1. Convert the input dictionary to a one-row DataFrame with the columns in
    #    the same order the model was trained on.
    loan_df = pd.DataFrame([loan_properties], columns=FEATURES)

    # 2. Get EAD (Exposure at Default) - proxied by loan_amt_outstanding
    ead = float(loan_df['loan_amt_outstanding'].iloc[0])

    # 3. Predict PD (Probability of Default)
    # predict_proba(...)[:, 1] gives the probability of the positive class (default=1)
    pd_prediction = float(model_pipeline.predict_proba(loan_df)[:, 1][0])

    # 4. Calculate EL
    expected_loss = pd_prediction * ead * LGD

    return expected_loss, pd_prediction, ead, LGD

Training the Logistic Regression (PD) model...
Training complete.


In [None]:
# --- 5. Example Usage ---

# Example 1: A borrower with high income, low debt, and good FICO score (Low Default Risk)
example_loan_low_risk = dict(
    credit_lines_outstanding=0,
    loan_amt_outstanding=5000.00,
    total_debt_outstanding=3000.00,
    income=100000.00,
    years_employed=10,
    fico_score=750,
)

# Example 2: A borrower with low income, high debt, and poor FICO score (High Default Risk)
example_loan_high_risk = dict(
    credit_lines_outstanding=5,
    loan_amt_outstanding=15000.00,
    total_debt_outstanding=25000.00,
    income=30000.00,
    years_employed=1,
    fico_score=550,
)


def _print_loan_analysis(heading, ead, lgd, default_prob, expected_loss):
    """Print the EAD / LGD / PD / EL summary for one loan under `heading`."""
    print(heading)
    print(f"EAD (Loan Amount): ${ead:,.2f}")
    print(f"LGD (Loss Given Default): {lgd:.2f}")
    print(f"Predicted PD: {default_prob:.4f} (or {default_prob*100:.2f}%)")
    print(f"Calculated Expected Loss (EL): ${expected_loss:,.2f}")


# Calculate the Expected Loss for both examples, then report them with the
# shared helper so the two summaries cannot drift apart.
el_low, pd_low, ead_low, lgd_low = calculate_expected_loss(example_loan_low_risk)
el_high, pd_high, ead_high, lgd_high = calculate_expected_loss(example_loan_high_risk)

_print_loan_analysis("\n--- Low-Risk Loan Analysis ---", ead_low, lgd_low, pd_low, el_low)
_print_loan_analysis("\n--- High-Risk Loan Analysis ---", ead_high, lgd_high, pd_high, el_high)

# --- Optional: Print Model Performance on Test Set ---
# Score the held-out rows with the probability of the positive class
# (default=1) and summarise ranking quality with ROC-AUC.
y_pred_proba = pd_model_pipeline.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"\n--- Model Performance on Test Set ---")
print(f"The model's ROC-AUC score is: {roc_auc:.4f} (A measure of its ability to rank defaults.)")


--- Low-Risk Loan Analysis ---
EAD (Loan Amount): $5,000.00
LGD (Loss Given Default): 0.90
Predicted PD: 0.0000 (or 0.00%)
Calculated Expected Loss (EL): $0.00

--- High-Risk Loan Analysis ---
EAD (Loan Amount): $15,000.00
LGD (Loss Given Default): 0.90
Predicted PD: 1.0000 (or 100.00%)
Calculated Expected Loss (EL): $13,500.00

--- Model Performance on Test Set ---
The model's ROC-AUC score is: 1.0000 (A measure of its ability to rank defaults.)
