In [None]:

import os
import math
import json
import numpy as np
import pandas as pd

from pathlib import Path

# Display options: keep wide loan-level frames readable in cell output.
pd.set_option('display.max_columns', 120)
pd.set_option('display.width', 160)

# Project root. Defaults to the Colab Drive mount this notebook was written against;
# set the FREDDIE_MAC_DIR environment variable to run it from another location.
BASE_DIR = Path(os.environ.get('FREDDIE_MAC_DIR', '/content/drive/MyDrive/freddie mac'))

# Output folder for the feature tables written by the final cell (created if missing).
FEATURES_DIR = BASE_DIR / 'features'
FEATURES_DIR.mkdir(parents=True, exist_ok=True)

# Input: merged panel read by the next cell. Fail early with a clear message if it is absent.
MERGED_PATH = BASE_DIR / 'merged_loan_performance_2010_2024.parquet'
assert MERGED_PATH.exists(), f"Missing merged panel at {MERGED_PATH}"

In [None]:
# Read the merged panel from MERGED_PATH into memory.
df = pd.read_parquet(MERGED_PATH)

# Orientation: overall shape, the first 40 column names, and a 3-row preview.
print("Loaded merged panel:", df.shape)
print(list(df.columns[:40]))
df.head(3)

Loaded merged panel: (37681816, 63)
['loan_sequence_number', 'monthly_reporting_period', 'current_actual_upb', 'current_loan_delinquency_status', 'loan_age', 'remaining_months_to_legal_maturity', 'defect_settlement_date', 'modification_flag', 'zero_balance_code', 'zero_balance_effective_date', 'current_interest_rate', 'current_deferred_upb', 'due_date_of_last_paid_installment_ddlpi', 'mi_recoveries', 'net_sales_proceeds', 'non_mi_recoveries', 'expenses', 'legal_costs', 'maintenance_and_preservation_costs', 'taxes_and_insurance', 'miscellaneous_expenses', 'actual_loss_calculation', 'modification_cost', 'step_modification_flag', 'deferred_payment_plan', 'estimated_loan_to_value_eltv', 'zero_balance_removal_upb', 'delinquent_accrued_interest', 'delinquency_due_to_disaster', 'borrower_assistance_status_code', 'current_month_modification_cost', 'interest_bearing_upb', 'credit_score', 'first_payment_date', 'first_time_homebuyer_flag', 'maturity_date', 'metropolitan_statistical_area_msa_or_me

Unnamed: 0,loan_sequence_number,monthly_reporting_period,current_actual_upb,current_loan_delinquency_status,loan_age,remaining_months_to_legal_maturity,defect_settlement_date,modification_flag,zero_balance_code,zero_balance_effective_date,current_interest_rate,current_deferred_upb,due_date_of_last_paid_installment_ddlpi,mi_recoveries,net_sales_proceeds,non_mi_recoveries,expenses,legal_costs,maintenance_and_preservation_costs,taxes_and_insurance,miscellaneous_expenses,actual_loss_calculation,modification_cost,step_modification_flag,deferred_payment_plan,estimated_loan_to_value_eltv,zero_balance_removal_upb,delinquent_accrued_interest,delinquency_due_to_disaster,borrower_assistance_status_code,current_month_modification_cost,interest_bearing_upb,credit_score,first_payment_date,first_time_homebuyer_flag,maturity_date,metropolitan_statistical_area_msa_or_metropolitan_division,mortgage_insurance_percentage_mi_%,number_of_units,occupancy_status,original_combined_loan_to_value_cltv,original_debt_to_income_dti_ratio,original_upb,original_loan_to_value_ltv,original_interest_rate,channel,prepayment_penalty_mortgage_ppm_flag,amortization_type_formerly_product_type,property_state,property_type,postal_code,loan_purpose,original_loan_term,number_of_borrowers,seller_name,servicer_name,super_conforming_flag,pre_harp_loan_sequence_number,program_indicator,harp_indicator,property_valuation_method,interest_only_i_o_indicator,mortgage_insurance_cancellation_indicator
0,F10Q10000014,2010-04-01,216000.0,0.0,0,180,,,,NaT,4.375,0.0,,,,,,,,,,,,,,,,,,,,216000.0,784.0,2010-05-01,N,2025-04-01,45780,0,1,P,90.0,38.0,216000,80,4.375,R,N,FRM,OH,SF,43600,N,180,2,Other sellers,U.S. BANK N.A.,,,9,,9,N,9
1,F10Q10000014,2010-05-01,215000.0,0.0,1,179,,,,NaT,4.375,0.0,,,,,,,,,,,,,,,,,,,,215000.0,784.0,2010-05-01,N,2025-04-01,45780,0,1,P,90.0,38.0,216000,80,4.375,R,N,FRM,OH,SF,43600,N,180,2,Other sellers,U.S. BANK N.A.,,,9,,9,N,9
2,F10Q10000014,2010-06-01,214000.0,0.0,2,178,,,,NaT,4.375,0.0,,,,,,,,,,,,,,,,,,,,214000.0,784.0,2010-05-01,N,2025-04-01,45780,0,1,P,90.0,38.0,216000,80,4.375,R,N,FRM,OH,SF,43600,N,180,2,Other sellers,U.S. BANK N.A.,,,9,,9,N,9


In [None]:
# --- Column name resolver (handles slight naming diffs from earlier notebooks) ---
def first_present(cols, columns=None):
    """
    Return the first candidate name that exists among the available columns.

    Parameters
    ----------
    cols : iterable of str
        Candidate column names, in order of preference.
    columns : container of str, optional
        Column names to search. When omitted, the global ``df.columns`` is used
        (the original behaviour), so existing calls are unaffected.

    Returns
    -------
    str or None
        The first entry of ``cols`` found in the available columns, or ``None``
        when no candidate is present.
    """
    available = df.columns if columns is None else columns
    return next((name for name in cols if name in available), None)

# Logical field -> actual column name in `df`.
# Each field lists its accepted spellings in order of preference; `first_present`
# picks the first one that exists and yields None when none of them do.
COL = {
    field: first_present(candidates)
    for field, candidates in {
        'loan_id':        ['loan_sequence_number','loan_id'],
        'report_month':   ['monthly_reporting_period', 'reporting_month', 'period'],
        'upb':            ['current_actual_upb', 'curr_actual_upb', 'upb'],
        'delinquency':    ['current_loan_delinquency_status','current_delinquency_status','loan_delinquency_status'],
        'zero_bal_code':  ['zero_balance_code','zero_bal_code'],
        'zero_bal_upb':   ['zero_balance_removal_upb','zero_balance_upb','removal_upb'],
        'actual_loss':    ['actual_loss_calculation','actual_loss'],
        'mi_recov':       ['mi_recoveries','mi_recovery'],
        'non_mi_recov':   ['non_mi_recoveries','non_mi_recovery'],
        'net_sale_proc':  ['net_sales_proceeds','net_sale_proceeds'],
        'expenses':       ['expenses'],
        'legal_costs':    ['legal_costs'],
        'maint_costs':    ['maintenance_and_preservation_costs','maintenance_costs'],
        'taxes_ins':      ['taxes_and_insurance'],
        'orig_upb':       ['original_upb','orig_upb'],
        'credit_score':   ['credit_score','fico'],
        'dti':            ['original_debt_to_income_dti_ratio','original_dti','dti'],
        'ltv':            ['original_loan_to_value_ltv','original_ltv','ltv'],
        'cltv':           ['original_combined_loan_to_value_cltv','original_cltv','cltv'],
        'interest_rate':  ['original_interest_rate','interest_rate'],
        'first_pay_date': ['first_payment_date'],
        'maturity_date':  ['maturity_date'],
        'loan_age':       ['loan_age'],
        'channel':        ['channel'],
        'occupancy':      ['occupancy_status','occupancy'],
        'property_type':  ['property_type'],
        'loan_purpose':   ['loan_purpose'],
        'state':          ['property_state','state'],
    }.items()
}

# Show the resolved mapping so unresolved (null) fields are visible at a glance.
print(json.dumps(COL, indent=2))


{
  "loan_id": "loan_sequence_number",
  "report_month": "monthly_reporting_period",
  "upb": "current_actual_upb",
  "delinquency": "current_loan_delinquency_status",
  "zero_bal_code": "zero_balance_code",
  "zero_bal_upb": "zero_balance_removal_upb",
  "actual_loss": "actual_loss_calculation",
  "mi_recov": "mi_recoveries",
  "non_mi_recov": "non_mi_recoveries",
  "net_sale_proc": "net_sales_proceeds",
  "expenses": "expenses",
  "legal_costs": "legal_costs",
  "maint_costs": "maintenance_and_preservation_costs",
  "taxes_ins": "taxes_and_insurance",
  "orig_upb": "original_upb",
  "credit_score": "credit_score",
  "dti": "original_debt_to_income_dti_ratio",
  "ltv": "original_loan_to_value_ltv",
  "cltv": "original_combined_loan_to_value_cltv",
  "interest_rate": "original_interest_rate",
  "first_pay_date": "first_payment_date",
  "maturity_date": "maturity_date",
  "loan_age": "loan_age",
  "channel": "channel",
  "occupancy": "occupancy_status",
  "property_type": "property_type

In [None]:

# Enforce key types
# The dtype checks use pandas' own predicates rather than np.issubdtype: the numpy check
# raises TypeError on pandas extension dtypes (nullable Int64, tz-aware datetimes,
# category, string), which parquet round-trips can produce.
if COL['report_month'] and not pd.api.types.is_datetime64_any_dtype(df[COL['report_month']]):
    # Unparseable values become NaT instead of raising.
    df[COL['report_month']] = pd.to_datetime(df[COL['report_month']], errors='coerce')

for num_col in [COL['upb'], COL['orig_upb'], COL['delinquency'], COL['credit_score'], COL['dti'], COL['ltv'], COL['cltv']]:
    # Skip unresolved fields (None); non-numeric entries are coerced to NaN.
    if num_col and not pd.api.types.is_numeric_dtype(df[num_col]):
        df[num_col] = pd.to_numeric(df[num_col], errors='coerce')

# Order the panel by loan, then reporting month, and rebuild a clean 0..n-1 index
# (later cells use index labels returned by groupby().idxmin()).
df = df.sort_values([c for c in [COL['loan_id'], COL['report_month']] if c]).reset_index(drop=True)

In [None]:
# -----------------------------
# 1) PD LABELS (vectorized)
# -----------------------------
# Builds the loan-level table `loan_default` with:
#   - pd_default_flag        : True if any monthly row is 90+ DPD OR a default-type zero balance
#   - __first_90dpd_month    : earliest reporting month with delinquency >= 3
#   - __first_liq_month      : earliest reporting month with a default-type zero-balance code
#                              (and removal UPB > 0 when that column is available)
#   - __first_default_month  : the earlier of the two months above
#
# Side effects on `df`: the reporting month is snapped to month start, the three event
# columns are coerced to numeric, and a helper column `__default_flag_row` is added.

import numpy as np
import pandas as pd

DEFAULT_ZB_CODES = {3, 6, 9, 96, 97}  # foreclosure/REO/repurchase/etc.

# --- Required keys ---
loan_id, rpt_col = COL['loan_id'], COL['report_month']
assert loan_id and rpt_col, "COL must define 'loan_id' and 'report_month'."

# --- Reporting month: parse to datetime, then snap to the first day of the month ---
df[rpt_col] = (
    pd.to_datetime(df[rpt_col], errors='coerce')
      .dt.to_period('M')
      .dt.to_timestamp()
)

# --- Event columns: coerce to numeric so the comparisons below are well defined ---
for _key in ('delinquency', 'zero_bal_code', 'zero_bal_upb'):
    _name = COL.get(_key)
    if _name in df.columns:  # an unresolved field (None) is simply skipped
        df[_name] = pd.to_numeric(df[_name], errors='coerce')

# --- Row-level event masks ---
# 90+ days past due: delinquency status of 3 or more
if COL.get('delinquency'):
    cond_90dpd = df[COL['delinquency']] >= 3
else:
    cond_90dpd = pd.Series(False, index=df.index)

# Liquidation: default-type zero-balance code, additionally requiring a positive
# removal UPB whenever that column was resolved
if not COL.get('zero_bal_code'):
    cond_liq = pd.Series(False, index=df.index)
else:
    cond_liq = df[COL['zero_bal_code']].isin(DEFAULT_ZB_CODES)
    if COL.get('zero_bal_upb'):
        cond_liq = cond_liq & (df[COL['zero_bal_upb']] > 0)

# Either event marks the monthly row as a default observation
df['__default_flag_row'] = cond_90dpd | cond_liq


def _first_event_month(mask, out_name):
    """Earliest reporting month per loan among the rows selected by `mask`."""
    hits = df.loc[mask, [loan_id, rpt_col]]
    return hits.groupby(loan_id)[rpt_col].min().rename(out_name).reset_index()


# --- Collapse to one row per loan ---
# Lifetime flag: True if any monthly row is flagged
pd_flag = df.groupby(loan_id)['__default_flag_row'].max()
pd_flag = pd_flag.rename('pd_default_flag').reset_index()

# Event timings (loans without the event are absent here and become NaT after the merge)
first_90dpd = _first_event_month(cond_90dpd, '__first_90dpd_month')
first_liq = _first_event_month(cond_liq, '__first_liq_month')

# Attach both timings to the flag table
loan_default = pd_flag
for _timing in (first_90dpd, first_liq):
    loan_default = loan_default.merge(_timing, on=loan_id, how='left')

# Convenience anchor: row-wise minimum of the two event months
loan_default['__first_default_month'] = loan_default[['__first_90dpd_month','__first_liq_month']].min(axis=1)

# Sanity counts
print(
    "Loans:", len(loan_default),
    "| lifetime defaults:", int(loan_default['pd_default_flag'].sum()),
    "| with 90+ DPD:", int(loan_default['__first_90dpd_month'].notna().sum()),
    "| with liquidation:", int(loan_default['__first_liq_month'].notna().sum()),
)
loan_default.head(3)


Loans: 737500 | lifetime defaults: 26001 | with 90+ DPD: 24617 | with liquidation: 2914


Unnamed: 0,loan_sequence_number,pd_default_flag,__first_90dpd_month,__first_liq_month,__first_default_month
0,F10Q10000014,False,NaT,NaT,NaT
1,F10Q10000069,False,NaT,NaT,NaT
2,F10Q10000089,False,NaT,NaT,NaT


In [None]:
# Preview the first 10 loans whose lifetime default flag is set
loan_default.loc[loan_default['pd_default_flag'].eq(True)].head(10)

Unnamed: 0,loan_sequence_number,pd_default_flag,__first_90dpd_month,__first_liq_month,__first_default_month
62,F10Q10001999,True,2021-04-01,NaT,2021-04-01
115,F10Q10003941,True,2018-01-01,NaT,2018-01-01
121,F10Q10004168,True,2019-02-01,NaT,2019-02-01
130,F10Q10004288,True,2012-05-01,2016-06-01,2012-05-01
189,F10Q10005845,True,2012-01-01,2012-04-01,2012-01-01
221,F10Q10006805,True,2012-04-01,NaT,2012-04-01
244,F10Q10007579,True,2020-07-01,NaT,2020-07-01
405,F10Q10013522,True,2015-01-01,NaT,2015-01-01
406,F10Q10013529,True,2015-09-01,NaT,2015-09-01
410,F10Q10013616,True,2011-10-01,NaT,2011-10-01


In [None]:
#2) --- LGD/EAD at LIQUIDATION (zero-balance) month, not first 90+ DPD ---
#
# Builds `df_lgd_ead`: the rows of each loan's first liquidation month, with
#   EAD_at_default = zero-balance removal UPB when > 0, else current UPB
#   LGD            = loss / EAD, clipped to [0, 1]; NaN when no loss information exists
#
# Changes relative to the earlier version of this cell:
#   * Actual loss is sign-flipped before dividing by EAD. The preview of this table showed
#     ActualLoss = -37174.83 against a removal UPB of 31856.20, i.e. losses are stored as
#     negative amounts, so the unflipped ratio was clipped to 0 for almost every loan.
#     NOTE(review): confirm the sign convention against the Freddie Mac user guide.
#   * When actual loss is missing AND no recovery/expense field is populated, LGD is left
#     as NaN instead of defaulting to 1.0 (EAD - 0 + 0). Those loans have unknown loss.
#   * `expenses` is treated as the total of the itemized cost fields, so it is no longer
#     added on top of them. NOTE(review): per the user guide `expenses` aggregates legal,
#     maintenance, tax/insurance and miscellaneous costs — confirm.
#   * A missing zero-balance code column yields an empty table instead of a KeyError, and
#     the empty table carries the same columns as the populated one.

DEFAULT_ZB_CODES = {3, 6, 9, 96, 97}

# Multiplier that turns the stored actual-loss value into a positive loss amount.
ACTUAL_LOSS_SIGN = -1.0

# Column order of the output table (identical for the empty and non-empty case).
LGD_EAD_COLUMNS = [
    COL.get("loan_id") or "loan_id",
    "default_month", "EAD_at_default", "LGD",
    "UPB_at_default", "ZeroBalanceUPB", "ActualLoss",
]

# 1) Identify the first liquidation month per loan
zb_col = COL["zero_bal_code"]
zbu_col = COL["zero_bal_upb"]
rm_col = COL["report_month"]

# Make sure zero-balance UPB is numeric to test > 0 (all-NaN when the column is unresolved)
zbu_num = pd.to_numeric(df[zbu_col], errors="coerce") if zbu_col else pd.Series(np.nan, index=df.index)
if zb_col:
    is_liq_row = df[zb_col].isin(DEFAULT_ZB_CODES) & (zbu_num > 0)
else:
    # Without a zero-balance code there is no way to identify liquidations.
    is_liq_row = pd.Series(False, index=df.index)

first_liq = (
    df.loc[is_liq_row, [COL["loan_id"], rm_col]]
      .groupby(COL["loan_id"])[rm_col]
      .min()
      .rename("__first_liq_month")
      .reset_index()
)

# 2) Pull the panel rows that fall in each loan's first liquidation month
liq_rows = df.merge(first_liq, on=COL["loan_id"], how="inner")
liq_rows = liq_rows[liq_rows[rm_col] == liq_rows["__first_liq_month"]].copy()


def tonum(col):
    """Numeric view of `liq_rows[col]`; an all-NaN Series when the column is unresolved or absent."""
    return pd.to_numeric(liq_rows[col], errors="coerce") if (col and col in liq_rows.columns) else pd.Series(np.nan, index=liq_rows.index)


if len(liq_rows):
    upb         = tonum(COL.get("upb"))                   # may be 0 in liquidation rows
    zero_upb    = tonum(COL.get("zero_bal_upb"))
    actual_loss = tonum(COL.get("actual_loss"))

    # Raw loss components, kept with their NaNs so "nothing reported" stays detectable.
    mi, nmi, nsp = (tonum(COL.get(k)) for k in ("mi_recov", "non_mi_recov", "net_sale_proc"))
    exp_, leg_, mnt_, tax_ = (tonum(COL.get(k)) for k in ("expenses", "legal_costs", "maint_costs", "taxes_ins"))

    recov = mi.fillna(0.0) + nmi.fillna(0.0) + nsp.fillna(0.0)

    # Costs always increase the loss, so magnitudes are used regardless of the stored sign.
    # Prefer the reported total; fall back to the itemized sum only where the total is missing.
    itemized = leg_.abs().fillna(0.0) + mnt_.abs().fillna(0.0) + tax_.abs().fillna(0.0)
    add_exp = exp_.abs().fillna(itemized)

    # True where at least one recovery/expense field is populated for the row.
    has_components = pd.concat([mi, nmi, nsp, exp_, leg_, mnt_, tax_], axis=1).notna().any(axis=1)

    # EAD at liquidation: prefer zero_upb (>0), else upb
    ead_liq = zero_upb.where(zero_upb > 0, np.nan).fillna(upb)

    # Loss at liquidation: prefer the reported actual loss when present & zero_upb>0;
    # otherwise reconstruct it from components, but only where some component exists.
    fallback_loss_liq = (ead_liq - recov + add_exp).clip(lower=0.0).where(has_components)
    use_actual_liq = actual_loss.notna() & zero_upb.gt(0)
    loss_liq = np.where(use_actual_liq, ACTUAL_LOSS_SIGN * actual_loss, fallback_loss_liq)

    # Severity; gains (negative loss) floor at 0 and losses above EAD cap at 1. NaN stays NaN.
    lgd_liq = np.where(ead_liq > 0, loss_liq / ead_liq, np.nan)
    lgd_liq = np.clip(lgd_liq, 0.0, 1.0)

    df_lgd_ead = pd.DataFrame({
        COL["loan_id"]:     liq_rows[COL["loan_id"]].values,
        "default_month":    liq_rows["__first_liq_month"].values,   # liquidation month
        "EAD_at_default":   ead_liq.values.astype(float),
        "LGD":              lgd_liq.astype(float),
        "UPB_at_default":   upb.values.astype(float),
        "ZeroBalanceUPB":   zero_upb.values.astype(float),
        "ActualLoss":       actual_loss.values.astype(float),      # raw value, sign as stored
    })
else:
    df_lgd_ead = pd.DataFrame(columns=LGD_EAD_COLUMNS)

print("LGD/EAD rows (liquidation):", df_lgd_ead.shape)
df_lgd_ead.head(3)


LGD/EAD rows (liquidation): (2914, 7)


Unnamed: 0,loan_sequence_number,default_month,EAD_at_default,LGD,UPB_at_default,ZeroBalanceUPB,ActualLoss
0,F10Q10004288,2016-06-01,31856.2,0.0,0.0,31856.2,-37174.83
1,F10Q10005845,2012-04-01,132147.94,0.0,0.0,132147.94,-1293.5
2,F10Q10020146,2010-08-01,196000.0,1.0,0.0,196000.0,


In [None]:
# Sanity checks on the liquidation table `df_lgd_ead`.
_lgd = df_lgd_ead['LGD']
_actual = df_lgd_ead['ActualLoss']

# 1) Uniqueness: the number of distinct loans should equal the number of rows
n_loans = df_lgd_ead[COL['loan_id']].nunique()
print("unique loans:", n_loans, " rows:", len(df_lgd_ead))

# 2) Mass at the clipping bounds (exactly 0 or exactly 1)
print("LGD==0:", _lgd.eq(0).sum(),
      "LGD==1:", _lgd.eq(1).sum())

# 3) Coverage of the reported actual-loss field
print("ActualLoss present:", _actual.notna().sum(),
      "missing:", _actual.isna().sum())

# 4) Distribution summary with tail percentiles
print(_lgd.describe(percentiles=[.01,.05,.25,.5,.75,.95,.99]))


unique loans: 2914  rows: 2914
LGD==0: 1212 LGD==1: 1610
ActualLoss present: 1304 missing: 1610
count    2914.000000
mean        0.555384
std         0.494942
min         0.000000
1%          0.000000
5%          0.000000
25%         0.000000
50%         1.000000
75%         1.000000
95%         1.000000
99%         1.000000
max         1.000000
Name: LGD, dtype: float64


In [None]:
# ---------------------------------------------
# 3) PD FEATURES (Origination-based, lifetime)
#    - One row per loan (first observed month = origination snapshot)
#    - Merge lifetime PD flag + event months
#    - Null out "not available" codes, then add DTI/LTV bins
# ---------------------------------------------

# First observed row per loan (origination snapshot proxy)
first_idx = df.groupby(COL['loan_id'])[COL['report_month']].idxmin()
first_rows = df.loc[first_idx].copy()

# Origination attributes to carry forward; unresolved fields (None) are skipped
keep_cols = [c for c in [
    COL['loan_id'], COL['orig_upb'], COL['credit_score'], COL['dti'], COL['ltv'], COL['cltv'],
    COL['interest_rate'], COL['channel'], COL['occupancy'], COL['property_type'],
    COL['loan_purpose'], COL['state'], COL['first_pay_date'], COL['maturity_date']
] if c]

pd_feats = first_rows[keep_cols].copy()

# Merge labels (lifetime PD + timing anchors).
# `validate` guards against accidental row duplication if the label table ever
# contains more than one row per loan.
pd_feats = pd_feats.merge(
    loan_default[[COL['loan_id'], 'pd_default_flag', '__first_90dpd_month', '__first_liq_month', '__first_default_month']],
    on=COL['loan_id'], how='left', validate='many_to_one'
)

# Clean DTI special values and outliers (anything above 200 is treated as missing)
if COL['dti'] in pd_feats.columns:
    _dti = pd.to_numeric(pd_feats[COL['dti']], errors='coerce')
    pd_feats[COL['dti']] = _dti.mask(_dti > 200)

# Null out the "Not Available" codes of the other numeric origination fields so they are
# not read as real values (an LTV of 999 would otherwise land in the '>95' bin below).
# NOTE(review): 999 (LTV/CLTV) and 9999 (credit score) are the codes documented in the
# Freddie Mac user guide; this is a no-op if an upstream notebook already removed them.
for _field, _na_code in (('ltv', 999), ('cltv', 999), ('credit_score', 9999)):
    _name = COL[_field]
    if _name in pd_feats.columns:
        _vals = pd.to_numeric(pd_feats[_name], errors='coerce')
        pd_feats[_name] = _vals.mask(_vals == _na_code)

# Add DTI/LTV bins for interpretability (right-closed intervals; NaN stays NaN)
if COL['dti'] in pd_feats.columns:
    pd_feats['dti_bin'] = pd.cut(
        pd_feats[COL['dti']],
        bins=[-np.inf, 20, 30, 36, 43, 50, np.inf],
        labels=['<=20','20-30','30-36','36-43','43-50','>50']
    )

if COL['ltv'] in pd_feats.columns:
    pd_feats['ltv_bin'] = pd.cut(
        pd_feats[COL['ltv']],
        bins=[-np.inf, 60, 70, 80, 90, 95, np.inf],
        labels=['<=60','60-70','70-80','80-90','90-95','>95']
    )

print("PD feature table:", pd_feats.shape)
pd_feats.head(3)


PD feature table: (737500, 20)


Unnamed: 0,loan_sequence_number,original_upb,credit_score,original_debt_to_income_dti_ratio,original_loan_to_value_ltv,original_combined_loan_to_value_cltv,original_interest_rate,channel,occupancy_status,property_type,loan_purpose,property_state,first_payment_date,maturity_date,pd_default_flag,__first_90dpd_month,__first_liq_month,__first_default_month,dti_bin,ltv_bin
0,F10Q10000014,216000,784.0,38.0,80,90.0,4.375,R,P,SF,N,OH,2010-05-01,2025-04-01,False,NaT,NaT,NaT,36-43,70-80
1,F10Q10000069,200000,795.0,35.0,67,67.0,4.5,R,P,SF,N,KS,2010-03-01,2025-02-01,False,NaT,NaT,NaT,30-36,60-70
2,F10Q10000089,146000,784.0,47.0,55,55.0,4.5,R,P,SF,N,FL,2010-03-01,2025-02-01,False,NaT,NaT,NaT,43-50,<=60





Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.



In [None]:
# ----------------------
# 4) WoE for categoricals (Logit-friendly)
# ----------------------

def compute_woe_iv(df_in, feature_col, target_col='pd_default_flag', eps=1e-6, clip=5.0):
    """
    Compute Weight of Evidence (WoE) and Information Value (IV) for a categorical
    feature against a binary target.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Must contain `feature_col` and `target_col`.
    feature_col : str
        Categorical feature; it is cast to `category` on a copy. Rows whose
        feature value is missing are excluded by the group-by.
    target_col : str
        Binary target where a truthy/1 value marks a default ("bad").
    eps : float
        Floor applied to the totals and to each level's share so that empty
        cells do not produce division by zero or log(0).
    clip : float
        WoE values are limited to [-clip, clip].

    Returns
    -------
    mapping : dict
        {category -> WoE}, with WoE = ln(share of goods / share of bads).
    iv : float
        Sum over levels of (share of goods - share of bads) * WoE.
    table : pandas.DataFrame
        Per-level summary (count, default, good, dist_good, dist_bad, woe, iv).
    """
    tmp = df_in[[feature_col, target_col]].copy()
    tmp[feature_col] = tmp[feature_col].astype('category')

    # `observed` is passed explicitly: relying on the implicit default for categorical
    # groupers raises a FutureWarning in recent pandas and the default is changing.
    # observed=False keeps the existing result (declared-but-unused levels get WoE 0).
    agg = (
        tmp.groupby(feature_col, observed=False)[target_col]
           .agg(['count', 'sum'])
           .rename(columns={'sum': 'default'})
    )
    agg['good'] = agg['count'] - agg['default']

    total_good = agg['good'].sum()
    total_bad  = agg['default'].sum()
    # Share of all goods / bads that falls in each level, floored at eps.
    agg['dist_good'] = (agg['good'] / max(eps, total_good)).clip(eps, 1.0)
    agg['dist_bad']  = (agg['default'] / max(eps, total_bad)).clip(eps, 1.0)

    agg['woe'] = np.log(agg['dist_good'] / agg['dist_bad']).clip(-clip, clip)
    agg['iv']  = (agg['dist_good'] - agg['dist_bad']) * agg['woe']

    mapping = agg['woe'].to_dict()
    iv = float(agg['iv'].sum())
    return mapping, iv, agg.reset_index()

def apply_woe(df_in, col, mapping, fill_value=0.0, new_name=None):
    """
    Encode a categorical column with a fitted WoE mapping.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Frame containing `col`.
    col : str
        Column to encode.
    mapping : dict
        {category -> WoE}, e.g. the first value returned by `compute_woe_iv`.
    fill_value : float
        Value used for categories absent from `mapping` and for missing inputs.
    new_name : str, optional
        Name given to the returned Series; defaults to ``f"{col}_woe"``.

    Returns
    -------
    pandas.Series
        Float Series aligned to `df_in.index` and named `new_name`.
    """
    new_name = new_name or f"{col}_woe"
    encoded = df_in[col].map(mapping).astype(float)
    # Previously `new_name` was computed but never applied to the result.
    return encoded.fillna(fill_value).rename(new_name)

# Categorical predictors: resolved fields that are present in the feature table
CAT_VARS = [
    name
    for name in (COL['channel'], COL['occupancy'], COL['property_type'], COL['loan_purpose'], COL['state'])
    if name and name in pd_feats.columns
]

# WoE is fitted only on rows that have a label
fit_base = pd_feats.dropna(subset=['pd_default_flag'])
woe_info = {}

for c in CAT_VARS:
    # Best effort per variable: a failure is reported and the remaining variables still run
    try:
        m, iv, tbl = compute_woe_iv(fit_base, feature_col=c, target_col='pd_default_flag')
        # Encode every loan; levels missing from the mapping get 0.0
        pd_feats[f"{c}_woe"] = apply_woe(pd_feats, c, m, fill_value=0.0)
        woe_info[c] = dict(iv=float(iv), levels=int(len(m)))
        print(f"Woe ready for {c}: IV={iv:.4f}, levels={len(m)}")
    except Exception as e:
        print(f"Skipped WoE for {c}: {e}")

woe_info


  agg = tmp.groupby(feature_col)[target_col].agg(['count','sum']).rename(columns={'sum':'default'})
  agg = tmp.groupby(feature_col)[target_col].agg(['count','sum']).rename(columns={'sum':'default'})
  agg = tmp.groupby(feature_col)[target_col].agg(['count','sum']).rename(columns={'sum':'default'})


Woe ready for channel: IV=0.0037, levels=3
Woe ready for occupancy_status: IV=0.0036, levels=3
Woe ready for property_type: IV=0.0068, levels=5
Woe ready for loan_purpose: IV=0.0064, levels=3
Woe ready for property_state: IV=0.0697, levels=54


  agg = tmp.groupby(feature_col)[target_col].agg(['count','sum']).rename(columns={'sum':'default'})
  agg = tmp.groupby(feature_col)[target_col].agg(['count','sum']).rename(columns={'sum':'default'})


{'channel': {'iv': 0.0036837006717230295, 'levels': 3},
 'occupancy_status': {'iv': 0.003622327538280056, 'levels': 3},
 'property_type': {'iv': 0.006768844398866535, 'levels': 5},
 'loan_purpose': {'iv': 0.006423098408643666, 'levels': 3},
 'property_state': {'iv': 0.06974253194195361, 'levels': 54}}

In [None]:
# ---------------------------------
# 6) Save out feature tables + metadata
# ---------------------------------

# Output locations
pd_path = FEATURES_DIR / 'features_pd.parquet'
lgd_path = FEATURES_DIR / 'features_lgd_ead.parquet'
meta_path = FEATURES_DIR / 'feature_meta.json'
FEATURES_DIR.mkdir(parents=True, exist_ok=True)

# Drop the convenience anchor before writing.
# NOTE(review): '__first_90dpd_month' and '__first_liq_month' are still written; they are
# outcome timings, so downstream models should not use them as predictors.
pd_out_cols = [c for c in pd_feats.columns if c != '__first_default_month']

# Save PD feature set (one row per loan)
pd_feats[pd_out_cols].to_parquet(pd_path, index=False)

# Save LGD/EAD training rows (defaults only) — `df_lgd_ead` is produced by the *liquidation* cell.
# The globals() check lets this cell run even if that cell was skipped.
has_lgd_ead = bool('df_lgd_ead' in globals() and len(df_lgd_ead))
if has_lgd_ead:
    df_lgd_ead.to_parquet(lgd_path, index=False)

# Diagnostics / metadata
meta = {
    'n_loans_pd': int(pd_feats.shape[0]),
    'n_defaults_lifetime': int(pd_feats['pd_default_flag'].sum() if 'pd_default_flag' in pd_feats.columns else 0),
    'n_with_90dpd': int(pd_feats['__first_90dpd_month'].notna().sum()) if '__first_90dpd_month' in pd_feats.columns else None,
    'n_with_liquidation': int(pd_feats['__first_liq_month'].notna().sum()) if '__first_liq_month' in pd_feats.columns else None,
    'woe_vars': {k: {'iv': float(v['iv']), 'levels': int(v['levels'])} for k, v in (woe_info or {}).items()},
    'has_lgd_ead': has_lgd_ead,
    'lgd_ead_rows': int(len(df_lgd_ead)) if 'df_lgd_ead' in globals() else 0,
}
import json
# Explicit encoding so the file is written identically regardless of the platform default.
with open(meta_path, 'w', encoding='utf-8') as f:
    json.dump(meta, f, indent=2)

print("Saved:")
print(" -", pd_path)
# The two cases are printed separately: the former one-line conditional expression put the
# fallback message after the " -" bullet, producing a doubled " -  -(...)" prefix.
if has_lgd_ead:
    print(" -", lgd_path)
else:
    print(" - (no liquidation rows → no LGD/EAD file)")
print(" -", meta_path)


Saved:
 - /content/drive/MyDrive/freddie mac/features/features_pd.parquet
 - /content/drive/MyDrive/freddie mac/features/features_lgd_ead.parquet
 - /content/drive/MyDrive/freddie mac/features/feature_meta.json
