In [29]:
!pip install xgboost scikit-learn pandas numpy shap




In [30]:
# ============================
# Imports
# ============================
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, classification_report

# Global constants
RECOVERY_RATE = 0.10
LGD = 1 - RECOVERY_RATE   # Loss Given Default = 0.90


# ============================
# Preprocessing function
# ============================
def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """
    Preprocess loan borrower data:
      - missing values
      - derived features
      - one-hot encoding
    """
    df = df.copy()

    # Missing handling
    df["credit_lines_na"] = df["credit_lines_outstanding"].isna().astype(int)
    df["credit_lines_outstanding"] = df["credit_lines_outstanding"].fillna(
        df["credit_lines_outstanding"].median()
    )

    if "years_employed" in df.columns:
        df["years_employed"] = df["years_employed"].fillna(
            df["years_employed"].median()
        )
    else:
        df["years_employed"] = 0

    df["income"] = df["income"].replace(0, np.nan)

    # ===============================
    # Derived Features
    # ===============================

    # Debt-to-income ratio
    df["debt_to_income"] = (
        df["total_debt_outstanding"] / df["income"]
    )

    # Avg balance per credit line
    df["avg_balance_per_line"] = (
        df["loan_amt_outstanding"]
        / df["credit_lines_outstanding"].replace(0, np.nan)
    )

    # Employment features
    df["low_experience_flag"] = (df["years_employed"] < 1).astype(int)

    df["employment_bucket"] = pd.cut(
        df["years_employed"],
        bins=[-1, 1, 3, 7, 15, 100],
        labels=["<1yr", "1-3", "4-7", "8-15", "15+"]
    )

    df = pd.get_dummies(df, columns=["employment_bucket"], drop_first=True)

    # Clean up infinities / NaNs
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.fillna(df.median(numeric_only=True), inplace=True)

    return df
# ============================
# Load data
# ============================
df_raw = pd.read_csv(r"C:/Users/athil/JPM quantitative research/Task3/Task 3 and 4_Loan_Data.csv")

target_col = "default"


df = preprocess(df_raw)

# Features = everything except target
feature_cols = [c for c in df.columns 
                if c not in ["customerid", target_col]]


X = df[feature_cols]
y = df[target_col]


# ============================
# Train-test split
# ============================
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y
)

# ============================
# Train XGBoost model
# ============================
model = XGBClassifier(
    n_estimators=300,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="binary:logistic",
    eval_metric="auc",
    random_state=42
)

model.fit(X_train, y_train)

# ============================
# Evaluate model
# ============================
y_proba = model.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_proba)
print("AUC:", round(auc, 3))

print("\nClassification Report:")
print(classification_report(y_test, (y_proba > 0.5).astype(int)))


# ============================
# EAD (Exposure At Default)
# ============================
def calculate_ead(outstanding: float) -> float:
    """
    No credit line is provided.
    For installment loans:
        EAD = loan_amt_outstanding
    """
    return outstanding


# ============================
# Expected Loss function
# ============================
def expected_loss(
    borrower_data: dict,
    model: XGBClassifier,
    feature_cols: list,
    lgd: float = LGD
):
    customer_id = borrower_data.get("customerid", None)

    df_new = pd.DataFrame([borrower_data])
    df_new = preprocess(df_new)

    # Drop ID column if present
    if "customerid" in df_new.columns:
        df_new = df_new.drop(columns=["customerid"])

    # Align with training columns
    df_new = df_new.reindex(columns=feature_cols, fill_value=0)

    # PD
    pd_val = float(model.predict_proba(df_new)[0, 1])

    # EAD = outstanding only
    outstanding = borrower_data["loan_amt_outstanding"]
    ead = outstanding
    el = pd_val * ead * lgd

    return {
        "customerid": customer_id,
        "PD": round(pd_val, 4),
        "EAD": round(ead, 2),
        "LGD": lgd,
        "Expected_Loss": round(el, 2)
    }


# ============================
# Example borrower test
# ============================
example_borrower = {
    "customerid": 10234,
    "income": 75000,
    "fico_score": 680,
    "loan_amt_outstanding": 16000,
    "total_debt_outstanding": 72000,
    "credit_lines_outstanding": 3,
    "years_employed": 2
}

result = expected_loss(
    example_borrower,
    model,
    feature_cols
)

print(result)



AUC: 1.0

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2037
           1       1.00      0.99      0.99       463

    accuracy                           1.00      2500
   macro avg       1.00      0.99      1.00      2500
weighted avg       1.00      1.00      1.00      2500

{'customerid': 10234, 'PD': 0.9958, 'EAD': 16000, 'LGD': 0.9, 'Expected_Loss': 14339.64}


In [None]:
#Create a function that processes a whole DataFrame
def compute_expected_loss_for_dataframe(df_raw, model, feature_cols, lgd=LGD):
    """
    df_raw: original unprocessed loan dataset (with borrower features)
    model: trained XGBoost model
    feature_cols: model input columns
    lgd: loss given default (default 0.90)

    Returns:
        A DataFrame with columns:
        ['customerid', 'PD', 'EAD', 'LGD', 'Expected_Loss']
    """

    # Preprocess the entire dataset at once
    df_processed = preprocess(df_raw)

    # Drop customerid from model input if present
    df_model = df_processed.copy()
    if "customerid" in df_model.columns:
        customer_ids = df_model["customerid"]
        df_model = df_model.drop(columns=["customerid"])
    else:
        customer_ids = np.arange(len(df_model))

    # Align with training feature columns
    df_model = df_model.reindex(columns=feature_cols, fill_value=0)

    # Predict PD for every row
    pd_values = model.predict_proba(df_model)[:, 1]

    # EAD = loan_amt_outstanding (no credit line case)
    ead_values = df_raw["loan_amt_outstanding"].values

    # Expected Loss
    el_values = pd_values * ead_values * lgd

    # Create output DataFrame
    output = pd.DataFrame({
        "customerid": customer_ids,
        "PD": pd_values.round(4),
        "EAD": ead_values.round(2),
        "LGD": lgd,
        "Expected_Loss": el_values.round(2)
    })

    return output


In [None]:
#Run Expected Loss on the Entire Dataset
portfolio_el = compute_expected_loss_for_dataframe(
    df_raw=df_raw,
    model=model,
    feature_cols=feature_cols,
    lgd=LGD
)

portfolio_el.head()


Unnamed: 0,customerid,PD,EAD,LGD,Expected_Loss
0,0,0.0,5221.55,0.9,0.12
1,1,0.9999,1958.93,0.9,1762.82
2,2,0.0,3363.01,0.9,0.06
3,3,0.0,4766.65,0.9,0.05
4,4,0.0,1345.83,0.9,0.03


In [None]:
#Save to CSV
portfolio_el.to_csv("Portfolio_Expected_Loss_Output.csv", index=False)


In [None]:
#Total Expected Loss for whole portfolio
total_el = portfolio_el["Expected_Loss"].sum()
print("Total Expected Loss for portfolio:", round(total_el, 2))


Total Expected Loss for portfolio: 7414592.09
