In [10]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, classification_report

# =========================================
# 1. Load data
# =========================================

df = pd.read_csv(r"C:/Users/athil/JPM quantitative research/Task3/Task 3 and 4_Loan_Data.csv")

# Make sure these column names match your file:
fico_col = "fico_score"
target_col = "default"

fico = df[fico_col].values
y = df[target_col].values.astype(int)

# =========================================
# 2. Create fast FICO buckets using quantiles
# =========================================

N_BUCKETS = 10   # you can change to 5, 7, etc.

# qcut will create ~equal-sized buckets in terms of number of rows
# retbins=True gives us the bin edges so we can reuse them later
df["fico_bucket"], bins = pd.qcut(
    df[fico_col],
    q=N_BUCKETS,
    retbins=True,
    duplicates="drop"
)

# buckets are categorical intervals like (650.0, 680.0]
# convert them into ordered integer ratings

# First, get codes 0..K-1 in ascending FICO order
df["bucket_code"] = df["fico_bucket"].cat.codes   # 0 = lowest FICO bucket

# We want rating 1 = BEST (highest FICO)
# So invert: rating = K - code
K = df["bucket_code"].max() + 1
df["rating"] = K - df["bucket_code"]

print("Sample of FICO → bucket → rating:")
print(df[[fico_col, "fico_bucket", "rating"]].head())

# =========================================
# 3. Train PD model on rating (categorical)
# =========================================

# One-hot encode rating for logistic regression
X = pd.get_dummies(df["rating"].astype("category"),
                   prefix="rating",
                   drop_first=True)

feature_cols = X.columns  # save for later
y = df[target_col]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
    stratify=y
)

logit = LogisticRegression(max_iter=1000)
logit.fit(X_train, y_train)

# =========================================
# 4. Evaluate the model
# =========================================

y_proba = logit.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_proba)
print("\nAUC:", round(auc, 3))

print("\nClassification report (threshold=0.5):")
print(classification_report(y_test, (y_proba > 0.5).astype(int)))

# =========================================
# 5. Helper: map new FICO → rating, using the same bins
# =========================================

def map_fico_to_rating_quick(score, bins):
    """
    score : numeric fico score
    bins  : array of bin edges from qcut

    Returns rating where 1 = best, K = worst.
    """
    # np.digitize gives bin index 1..len(bins)-1
    code = np.digitize(score, bins[1:-1], right=True)  # 0..K-1
    K = len(bins) - 1
    rating = K - code
    return rating


# =========================================
# 6. Predict PD for a new FICO score
# =========================================

def predict_pd_for_new_fico(fico_score, bins, model, feature_cols):
    """
    Given a new FICO score, map it to rating and compute PD.
    """
    rating = map_fico_to_rating_quick(fico_score, bins)

    tmp = pd.DataFrame({"rating": [rating]})
    X_new = pd.get_dummies(tmp["rating"].astype("category"),
                           prefix="rating",
                           drop_first=True)

    # align with training columns
    X_new = X_new.reindex(columns=feature_cols, fill_value=0)

    pd_val = float(model.predict_proba(X_new)[0, 1])

    return {
        "fico_score": fico_score,
        "rating": rating,
        "PD": round(pd_val, 4)
    }

# =========================================
# 7. Example predictions
# =========================================

for s in [620, 680, 720, 780, 820]:
    print(predict_pd_for_new_fico(s, bins, logit, feature_cols))


Sample of FICO → bucket → rating:
   fico_score     fico_bucket  rating
0         605  (587.0, 607.0]       8
1         572  (560.0, 587.0]       9
2         602  (587.0, 607.0]       8
3         612  (607.0, 623.0]       7
4         631  (623.0, 638.0]       6

AUC: 0.709

Classification report (threshold=0.5):
              precision    recall  f1-score   support

           0       0.84      0.93      0.89      2445
           1       0.45      0.24      0.31       555

    accuracy                           0.80      3000
   macro avg       0.65      0.59      0.60      3000
weighted avg       0.77      0.80      0.78      3000

{'fico_score': 620, 'rating': np.int64(7), 'PD': 0.0545}
{'fico_score': 680, 'rating': np.int64(3), 'PD': 0.0545}
{'fico_score': 720, 'rating': np.int64(1), 'PD': 0.0545}
{'fico_score': 780, 'rating': np.int64(1), 'PD': 0.0545}
{'fico_score': 820, 'rating': np.int64(1), 'PD': 0.0545}


In [None]:
#add PD for all rows in the dataset
# X is the full feature matrix for all rows (built earlier from df["rating"])
# feature_cols = X.columns

# 1) Predict PD for every row in the dataset
df["PD_estimated"] = logit.predict_proba(X)[:, 1]

# 2) Optional: check a few rows
print(df[["fico_score", "rating", "PD_estimated"]].head())


   fico_score  rating  PD_estimated
0         605       8      0.242023
1         572       9      0.313827
2         602       8      0.242023
3         612       7      0.168794
4         631       6      0.168713


In [None]:
#new CSV
df.to_csv("mortgage_with_PD.csv", index=False)


In [None]:
#Expected Loss for each row
LGD = 0.9  
EAD_col = "loan_amt_outstanding"  

df["Expected_Loss"] = df["PD_estimated"] * df[EAD_col] * LGD


In [None]:
#Total Expected Loss 
total_EL = df["Expected_Loss"].sum()
print("Total Expected Loss", round(total_EL, 2))


Total Expected Loss for mortgage portfolio: 6982996.29


In [None]:
#Because the exact dynamic-programming optimization of the quantization boundaries
#  is computationally heavy for large loan portfolios, 
# I adopted a fast quantile-based bucketing strategy. 
# FICO scores are split into K buckets with roughly equal populations using qcut,
#  then mapped to ordinal ratings where rating 1 represents the best credit quality.
#  These ratings are one-hot encoded and used as categorical inputs 
# to a logistic regression PD model.”