# Data Preparation: Time-Varying Covariates for Fine-Gray Model

This notebook creates the **loan-month panel dataset** required for the Fine-Gray competing risks model with time-varying covariates.

**Outputs:**
- `data/processed/loan_month_panel.parquet` - Complete panel with time-varying covariates

**Key steps:**
1. Load raw Freddie Mac performance data (monthly records)
2. Merge with origination data (static features)
3. Merge with national macroeconomic data
4. Merge with state-level data (unemployment, HPI)
5. Calculate derived features (refinance incentive, current LTV)
6. Define event coding for competing risks

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from glob import glob
import warnings
warnings.filterwarnings('ignore')

# Import column definitions
import sys
sys.path.insert(0, '..')
from src.data.columns import (
    ORIGINATION_COLUMNS, ORIGINATION_DTYPES,
    PERFORMANCE_COLUMNS, PERFORMANCE_DTYPES,
    ZERO_BALANCE_CODE_MAP, MATURITY_THRESHOLD_MONTHS
)

# Widen pandas' notebook rendering so the many-column panel stays readable
pd.options.display.max_columns = 50
pd.options.display.width = 200

## Configuration

Set parameters for data processing.

In [None]:
# Directory layout, relative to the notebook's working directory
RAW_DATA_DIR, PROCESSED_DATA_DIR, EXTERNAL_DATA_DIR = map(
    Path,
    ('../data/raw', '../data/processed', '../data/external'),
)

# Origination years to include: 1999 through 2025 inclusive.
# For a quick test run, override with a short list such as [2019, 2020].
VINTAGES = [*range(1999, 2026)]

# Per-vintage loan sample size; None keeps every loan (e.g. 5000 for testing)
SAMPLE_PER_VINTAGE = None

print(f"Processing vintages: {min(VINTAGES)} - {max(VINTAGES)}")
print(f"Sample per vintage: {SAMPLE_PER_VINTAGE or 'All'}")

## Step 1: Load Macroeconomic Data

Load the FRED data first so we can merge it with loan-level data.

In [None]:
# National FRED panel: move the index into a 'date' column, make it a real
# datetime, and derive the monthly Period key used for every later join
macro_national = (
    pd.read_parquet(EXTERNAL_DATA_DIR / 'fred_monthly_panel.parquet')
    .rename_axis('date')
    .reset_index()
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .assign(year_month=lambda frame: frame['date'].dt.to_period('M'))
)

print(f"National macro data: {macro_national.shape}")
print(f"Date range: {macro_national['date'].min()} to {macro_national['date'].max()}")
print(f"\nColumns: {macro_national.columns.tolist()}")

In [None]:
# State unemployment arrives wide (one column per state); index -> 'date',
# then add the monthly Period join key
state_unemp = (
    pd.read_parquet(EXTERNAL_DATA_DIR / 'state_unemployment.parquet')
    .rename_axis('date')
    .reset_index()
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .assign(year_month=lambda frame: frame['date'].dt.to_period('M'))
)

# Reshape to one row per (month, state) so it can be joined on both keys
state_unemp_long = state_unemp.melt(
    id_vars=['date', 'year_month'],
    var_name='state_col',
    value_name='state_unemployment',
)
# The state key is the source column name with its '_unemployment' part removed
state_unemp_long = state_unemp_long.assign(
    property_state=state_unemp_long['state_col'].str.replace('_unemployment', '')
)[['year_month', 'property_state', 'state_unemployment']]

print(f"State unemployment: {state_unemp_long.shape}")
print("Sample:")
print(state_unemp_long.head())

In [None]:
# --- State HPI levels: wide file -> long (year_month, property_state, state_hpi)
state_hpi = (
    pd.read_parquet(EXTERNAL_DATA_DIR / 'state_hpi.parquet')
    .rename_axis('date')
    .reset_index()
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .assign(year_month=lambda frame: frame['date'].dt.to_period('M'))
)
state_hpi_long = state_hpi.melt(
    id_vars=['date', 'year_month'],
    var_name='state_col',
    value_name='state_hpi',
)
# State key = source column name with '_hpi' removed
state_hpi_long = state_hpi_long.assign(
    property_state=state_hpi_long['state_col'].str.replace('_hpi', '')
)[['year_month', 'property_state', 'state_hpi']]

# --- State HPI year-over-year changes: same reshape, different value column
state_hpi_yoy = (
    pd.read_parquet(EXTERNAL_DATA_DIR / 'state_hpi_yoy.parquet')
    .rename_axis('date')
    .reset_index()
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .assign(year_month=lambda frame: frame['date'].dt.to_period('M'))
)
state_hpi_yoy_long = state_hpi_yoy.melt(
    id_vars=['date', 'year_month'],
    var_name='state_col',
    value_name='state_hpi_yoy',
)
# State key = source column name with '_hpi_yoy' removed
state_hpi_yoy_long = state_hpi_yoy_long.assign(
    property_state=state_hpi_yoy_long['state_col'].str.replace('_hpi_yoy', '')
)[['year_month', 'property_state', 'state_hpi_yoy']]

# --- Attach the YoY change to the level table (levels drive the row set)
state_hpi_long = state_hpi_long.merge(
    state_hpi_yoy_long,
    how='left',
    on=['year_month', 'property_state'],
)

print(f"State HPI: {state_hpi_long.shape}")
print("Sample:")
print(state_hpi_long.head())

## Step 2: Load and Process Loan Data

Load origination and performance data, then create the loan-month panel.

In [None]:
def load_origination_data(vintage: int) -> pd.DataFrame:
    """Read the origination file for one vintage year.

    Searches RAW_DATA_DIR recursively for
    ``sample_<vintage>/sample_orig_<vintage>.txt`` and parses the first match
    as a pipe-delimited file without a header row, using ORIGINATION_COLUMNS
    and ORIGINATION_DTYPES. Empty and single-space fields are read as missing.

    Prints a warning and returns an empty DataFrame when no file is found.
    """
    relative_name = f'sample_{vintage}/sample_orig_{vintage}.txt'
    orig_path = next(RAW_DATA_DIR.glob(f'**/{relative_name}'), None)

    if orig_path is None:
        print(f"  Warning: No origination file for {vintage}")
        return pd.DataFrame()

    return pd.read_csv(
        orig_path,
        sep='|',
        names=ORIGINATION_COLUMNS,
        dtype=ORIGINATION_DTYPES,
        na_values=['', ' '],
    )


def load_performance_data(vintage: int) -> pd.DataFrame:
    """Read the monthly performance (servicing) file for one vintage year.

    Searches RAW_DATA_DIR recursively for
    ``sample_<vintage>/sample_svcg_<vintage>.txt`` and parses the first match
    as a pipe-delimited file without a header row, using PERFORMANCE_COLUMNS
    and PERFORMANCE_DTYPES. Empty and single-space fields are read as missing.

    Prints a warning and returns an empty DataFrame when no file is found.
    """
    relative_name = f'sample_{vintage}/sample_svcg_{vintage}.txt'
    svcg_path = next(RAW_DATA_DIR.glob(f'**/{relative_name}'), None)

    if svcg_path is None:
        print(f"  Warning: No performance file for {vintage}")
        return pd.DataFrame()

    return pd.read_csv(
        svcg_path,
        sep='|',
        names=PERFORMANCE_COLUMNS,
        dtype=PERFORMANCE_DTYPES,
        na_values=['', ' '],
    )


def parse_reporting_period(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with parsed reporting-period columns.

    ``monthly_reporting_period`` is read as a ``YYYYMM`` value and stored as
    ``reporting_date`` (datetime); ``year_month`` holds the same month as a
    monthly Period. The input frame is not modified.
    """
    reporting_date = pd.to_datetime(
        df['monthly_reporting_period'].astype(str),
        format='%Y%m',
    )
    return df.assign(
        reporting_date=reporting_date,
        year_month=reporting_date.dt.to_period('M'),
    )


def _value_or_default(value, default):
    """Return *default* when *value* is None, NaN/NA, or zero; else *value*."""
    # A plain `value or default` misses NaN (NaN is truthy), and NaN is how
    # pandas represents a missing number after a read or a left merge.
    if value is None or pd.isna(value):
        return default
    return value or default


def determine_event_type(row):
    """Determine event type from zero_balance_code and loan_age.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row or dict)
        Must provide ``zero_balance_code``; ``loan_age`` and
        ``orig_loan_term`` are read with ``.get`` and default to 0 and 360
        when absent, missing (None/NaN) or zero.

    Returns
    -------
    str
        ``'active'`` when there is no zero-balance code; ``'matured'`` or
        ``'prepay'`` for code ``'01'``; otherwise the ZERO_BALANCE_CODE_MAP
        entry for the code, or ``'other'`` if the code is not in the map.
    """
    zb_code = row['zero_balance_code']

    # No zero-balance code: the loan is still active in this record
    if pd.isna(zb_code) or zb_code == '':
        return 'active'

    if zb_code == '01':
        # Code '01' is split by age: a payoff within MATURITY_THRESHOLD_MONTHS
        # of the original term counts as maturity, anything earlier as prepay
        loan_age = _value_or_default(row.get('loan_age', 0), 0)
        orig_term = _value_or_default(row.get('orig_loan_term', 360), 360)
        if loan_age >= orig_term - MATURITY_THRESHOLD_MONTHS:
            return 'matured'
        return 'prepay'

    return ZERO_BALANCE_CODE_MAP.get(zb_code, 'other')


print("Functions defined.")

In [None]:
def process_vintage(vintage: int, sample_n: int = None) -> pd.DataFrame:
    """
    Process a single vintage into a loan-month panel.

    Loads the vintage's origination and performance data, optionally
    restricts both to a reproducible random sample of loans, left-joins the
    static origination fields onto every monthly performance record, and adds
    the event type plus delinquency indicators.

    Parameters
    ----------
    vintage : int
        Vintage year to process
    sample_n : int, optional
        Number of loans to sample (for testing). A falsy value (None, 0) or a
        value not smaller than the number of loans keeps every loan.

    Returns
    -------
    pd.DataFrame
        Loan-month panel, one row per performance record. An empty DataFrame
        is returned when the origination or performance data for the vintage
        is empty (the loaders return an empty frame when no file is found).
    """
    print(f"Processing vintage {vintage}...")

    # Load origination data
    orig_df = load_origination_data(vintage)
    if orig_df.empty:
        return pd.DataFrame()

    # Load performance data
    perf_df = load_performance_data(vintage)
    if perf_df.empty:
        return pd.DataFrame()

    # Sample loans if requested (fixed seed so reruns pick the same loans)
    if sample_n and sample_n < len(orig_df):
        sampled_loans = orig_df['loan_sequence_number'].sample(n=sample_n, random_state=42)
        orig_df = orig_df[orig_df['loan_sequence_number'].isin(sampled_loans)]
        perf_df = perf_df[perf_df['loan_sequence_number'].isin(sampled_loans)]

    print(f"  Loans: {len(orig_df):,}, Performance records: {len(perf_df):,}")

    # Parse dates in performance data
    perf_df = parse_reporting_period(perf_df)

    # Select key columns from origination (static features)
    orig_cols = [
        'loan_sequence_number',
        'credit_score',
        'orig_ltv',
        'orig_cltv',
        'orig_dti',
        'orig_upb',
        'orig_interest_rate',
        'orig_loan_term',
        'first_payment_date',
        'property_state',
        'occupancy_status',
        'loan_purpose',
        'property_type',
        'channel',
        'num_borrowers',
        'first_time_homebuyer',
        'mi_pct'
    ]
    orig_df = orig_df[orig_cols].copy()
    orig_df['vintage_year'] = vintage

    # Parse first_payment_date (YYYYMM); unparseable values become NaT.
    # Its month is the key later used to look up origination-time HPI.
    orig_df['first_payment_date'] = pd.to_datetime(
        orig_df['first_payment_date'].astype(str),
        format='%Y%m',
        errors='coerce'
    )
    orig_df['orig_year_month'] = orig_df['first_payment_date'].dt.to_period('M')

    # Select key columns from performance (time-varying)
    perf_cols = [
        'loan_sequence_number',
        'year_month',
        'reporting_date',
        'current_actual_upb',
        'current_loan_delinquency_status',
        'loan_age',
        'remaining_months_to_maturity',
        'current_interest_rate',
        'zero_balance_code',
        'modification_flag',
        'eltv'
    ]
    perf_df = perf_df[perf_cols].copy()

    # Merge origination with performance (keep every performance record)
    panel = perf_df.merge(orig_df, on='loan_sequence_number', how='left')

    # Determine event type for each record.
    # determine_event_type returns 'active' whenever zero_balance_code is
    # missing or empty, so the slow row-wise apply is only run on the rows
    # that actually carry a code; everything else is filled in directly.
    zb_code = panel['zero_balance_code']
    has_zb_code = zb_code.notna() & zb_code.ne('')
    event_type = pd.Series('active', index=panel.index, dtype=object)
    if has_zb_code.any():  # apply(axis=1) on zero rows does not yield a Series
        event_type.loc[has_zb_code] = panel.loc[has_zb_code].apply(
            determine_event_type, axis=1
        )
    panel['event_type'] = event_type

    # Parse delinquency status: 'X'/'XX' are mapped to 0 before conversion.
    # NOTE(review): any other non-numeric code is coerced to NaN and then
    # filled with 0, so it is also flagged as current below - confirm this is
    # intended for codes such as REO markers.
    panel['delinquency_status'] = pd.to_numeric(
        panel['current_loan_delinquency_status'].replace({'X': '0', 'XX': '0'}),
        errors='coerce'
    ).fillna(0).astype(int)

    # Create delinquency flags
    panel['is_current'] = (panel['delinquency_status'] == 0).astype(int)
    panel['is_30_day_delinquent'] = (panel['delinquency_status'] == 1).astype(int)
    panel['is_60_day_delinquent'] = (panel['delinquency_status'] == 2).astype(int)
    panel['is_90_plus_delinquent'] = (panel['delinquency_status'] >= 3).astype(int)

    print(f"  Panel records: {len(panel):,}")

    return panel


print("Process function defined.")

In [None]:
# Process all vintages, keeping only those that produced records
panels = []

for vintage in VINTAGES:
    panel = process_vintage(vintage, sample_n=SAMPLE_PER_VINTAGE)
    if not panel.empty:
        panels.append(panel)

# pd.concat on an empty list fails with an opaque "No objects to concatenate";
# raise the same exception type with a message that points at the likely cause
if not panels:
    raise ValueError(
        "No vintage produced any loan-month records; "
        "check RAW_DATA_DIR and VINTAGES."
    )

# Combine all vintages
print("\nCombining all vintages...")
loan_month_panel = pd.concat(panels, ignore_index=True)

print(f"\nTotal panel size: {len(loan_month_panel):,} loan-months")
print(f"Unique loans: {loan_month_panel['loan_sequence_number'].nunique():,}")

In [None]:
# Quick sanity report on the combined panel
print("=== Panel Summary ===")
print(f"\nDate range: {loan_month_panel['reporting_date'].min()} to {loan_month_panel['reporting_date'].max()}")

# Terminal records are those whose event type is anything other than 'active'
print("\nEvent type distribution (terminal events only):")
terminal = loan_month_panel.loc[loan_month_panel['event_type'].ne('active')]
print(terminal['event_type'].value_counts())

# Ten most frequent delinquency statuses
print("\nDelinquency status distribution:")
print(loan_month_panel['delinquency_status'].value_counts().iloc[:10])

## Step 3: Merge Macroeconomic Data

In [None]:
# Select key macro columns to merge
macro_cols = [
    'year_month',
    'UNRATE',
    'MORTGAGE30US',
    'MORTGAGE15US',
    'FEDFUNDS',
    'DGS10',
    'UMCSENT',
    'hpi_yoy_change',
    'inflation_yoy',
    'mortgage_rate_yoy_change',
    'mortgage_spread',
    'yield_curve_slope',
    'gdp_growth',
    'HOUST'
]

# Only include columns that exist, but say which requested ones are absent
# rather than dropping them silently
available_macro_cols = [c for c in macro_cols if c in macro_national.columns]
missing_macro_cols = [c for c in macro_cols if c not in macro_national.columns]
if missing_macro_cols:
    print(f"Warning: Missing macro columns: {missing_macro_cols}")
macro_to_merge = macro_national[available_macro_cols].copy()

# Merge national macro data.
# validate='many_to_one' makes pandas raise if a month appears more than once
# in the macro table, which would otherwise silently duplicate loan-month rows.
print("Merging national macro data...")
loan_month_panel = loan_month_panel.merge(
    macro_to_merge,
    on='year_month',
    how='left',
    validate='many_to_one'
)

print(f"Panel after national merge: {len(loan_month_panel):,} records")
print(f"MORTGAGE30US coverage: {loan_month_panel['MORTGAGE30US'].notna().mean():.1%}")

In [None]:
# Merge state-level unemployment.
# validate='many_to_one' makes pandas raise if a (month, state) key is
# duplicated in the lookup table, which would otherwise silently duplicate
# loan-month rows in the panel.
print("Merging state unemployment...")
loan_month_panel = loan_month_panel.merge(
    state_unemp_long,
    on=['year_month', 'property_state'],
    how='left',
    validate='many_to_one'
)

print(f"State unemployment coverage: {loan_month_panel['state_unemployment'].notna().mean():.1%}")

# Merge state-level HPI (level and YoY change), with the same key check
print("Merging state HPI...")
loan_month_panel = loan_month_panel.merge(
    state_hpi_long,
    on=['year_month', 'property_state'],
    how='left',
    validate='many_to_one'
)

print(f"State HPI coverage: {loan_month_panel['state_hpi'].notna().mean():.1%}")

## Step 4: Calculate Derived Features

In [None]:
# Origination-time HPI per loan (denominator of the current-LTV approximation).
# Step 1: one row per loan with its origination month and property state.
orig_hpi = (
    loan_month_panel
    .groupby('loan_sequence_number')[['orig_year_month', 'property_state']]
    .first()
    .reset_index()
)

# Step 2: look up the state HPI for that month, renamed so it lines up with
# the loan-level keys and does not collide with the current-month HPI column.
hpi_at_origination = state_hpi_long[['year_month', 'property_state', 'state_hpi']].rename(
    columns={'year_month': 'orig_year_month', 'state_hpi': 'orig_state_hpi'}
)
orig_hpi = orig_hpi.merge(
    hpi_at_origination,
    how='left',
    on=['orig_year_month', 'property_state'],
)

# Step 3: broadcast the per-loan value onto every month of the panel.
loan_month_panel = loan_month_panel.merge(
    orig_hpi[['loan_sequence_number', 'orig_state_hpi']],
    how='left',
    on='loan_sequence_number',
)

print(f"Origination HPI coverage: {loan_month_panel['orig_state_hpi'].notna().mean():.1%}")

In [None]:
# Derived, loan-month-level covariates

# Refinance incentive: note rate at origination minus the 30-year market rate
# for the month (positive = the borrower could refinance into a lower rate)
loan_month_panel['refinance_incentive'] = loan_month_panel['orig_interest_rate'].sub(
    loan_month_panel['MORTGAGE30US']
)

# Same incentive, bucketed into right-closed intervals
incentive_edges = [-np.inf, -1, 0, 0.5, 1, 1.5, 2, np.inf]
incentive_labels = ['<-1%', '-1-0%', '0-0.5%', '0.5-1%', '1-1.5%', '1.5-2%', '>2%']
loan_month_panel['refi_incentive_bucket'] = pd.cut(
    loan_month_panel['refinance_incentive'],
    bins=incentive_edges,
    labels=incentive_labels,
)

# Approximate current LTV by scaling original LTV with the state HPI move:
#   current_LTV = orig_LTV * (orig_HPI / current_HPI)
# so rising prices lower the LTV (more equity)
loan_month_panel['hpi_ratio'] = loan_month_panel['orig_state_hpi'].div(
    loan_month_panel['state_hpi']
)
loan_month_panel['current_ltv_approx'] = loan_month_panel['orig_ltv'].mul(
    loan_month_panel['hpi_ratio']
)

# Equity flag: approximate current LTV below 80
loan_month_panel['has_equity'] = loan_month_panel['current_ltv_approx'].lt(80).astype(int)

# Underwater flag: approximate current LTV above 100
loan_month_panel['is_underwater'] = loan_month_panel['current_ltv_approx'].gt(100).astype(int)

# Spread of the loan's current rate over the 30-year market rate
loan_month_panel['rate_spread'] = loan_month_panel['current_interest_rate'].sub(
    loan_month_panel['MORTGAGE30US']
)

# Squared loan age, to allow non-linear seasoning effects
loan_month_panel['loan_age_squared'] = loan_month_panel['loan_age'].pow(2)

# Burnout proxy: flags months where the incentive exceeds 0.5.
# Simplification - it looks at the current month only, not the loan's history.
loan_month_panel['had_refi_opportunity'] = (
    loan_month_panel['refinance_incentive'].gt(0.5).astype(int)
)

print("Derived features calculated.")
print("\nRefinance incentive stats:")
print(loan_month_panel['refinance_incentive'].describe())

In [None]:
# Share of non-missing values for the covariates the models rely on most
print("=== Feature Coverage ===")
key_features = [
    'refinance_incentive',
    'current_ltv_approx',
    'state_unemployment',
    'state_hpi_yoy',
    'MORTGAGE30US',
    'yield_curve_slope',
]

# Features absent from the panel are skipped without a message
present_features = [name for name in key_features if name in loan_month_panel.columns]
coverage_by_feature = loan_month_panel[present_features].notna().mean()

for feat, coverage in coverage_by_feature.items():
    print(f"{feat}: {coverage:.1%}")

## Step 5: Event Coding for Competing Risks

Create event indicators for the Fine-Gray model:
- `event_prepay`: 1 if prepaid, 0 otherwise
- `event_default`: 1 if defaulted, 0 otherwise
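- `is_terminal`: 1 if the record carries any terminal event (including maturity), 0 if the loan is still active
- `event_code`: numeric coding — 0 = censored/active (matured loans are also coded 0), 1 = prepay, 2 = default, 3 = other termination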

In [None]:
# One 0/1 indicator column per terminal outcome
single_outcome_flags = {
    'event_prepay': 'prepay',
    'event_default': 'default',
    'event_matured': 'matured',
}
for flag_column, outcome in single_outcome_flags.items():
    loan_month_panel[flag_column] = loan_month_panel['event_type'].eq(outcome).astype(int)

# 'other' and 'defect' terminations share a single indicator
loan_month_panel['event_other'] = loan_month_panel['event_type'].isin(['other', 'defect']).astype(int)

# Numeric event coding for Fine-Gray
# 0 = censored/active, 1 = prepay (primary), 2 = default (competing),
# 3 = other termination
def code_competing_event(event_type):
    """Map an event-type label to its numeric competing-risks code.

    'prepay' -> 1, 'default' -> 2, 'other'/'defect' -> 3. 'matured' is
    treated as censored (0), and so is every label not listed here,
    including 'active'.
    """
    event_codes = {
        'prepay': 1,
        'default': 2,
        'matured': 0,  # treated as censored
        'other': 3,
        'defect': 3,
    }
    return event_codes.get(event_type, 0)

# Numeric competing-risks code for every loan-month
loan_month_panel['event_code'] = loan_month_panel['event_type'].map(code_competing_event)

# Terminal flag: 1 for every record whose event type is not 'active'
loan_month_panel['is_terminal'] = loan_month_panel['event_type'].ne('active').astype(int)

print("=== Event Coding ===")
print("\nEvent code distribution (terminal records only):")
terminal = loan_month_panel.loc[loan_month_panel['is_terminal'].eq(1)]
print(terminal['event_code'].value_counts().sort_index())
print("\n0=censored/matured, 1=prepay, 2=default, 3=other")

## Step 6: Final Cleanup and Save

In [None]:
# Output schema, declared group by group; the groups are concatenated below
# in this order to form the saved column order.
identifier_cols = [
    'loan_sequence_number',
    'year_month',
    'reporting_date',
    'vintage_year',
]
time_varying_loan_cols = [
    'loan_age',
    'loan_age_squared',
    'remaining_months_to_maturity',
    'current_actual_upb',
    'current_interest_rate',
    'delinquency_status',
    'is_current',
    'is_30_day_delinquent',
    'is_60_day_delinquent',
    'is_90_plus_delinquent',
    'modification_flag',
]
static_loan_cols = [
    'credit_score',
    'orig_ltv',
    'orig_cltv',
    'orig_dti',
    'orig_upb',
    'orig_interest_rate',
    'orig_loan_term',
    'property_state',
    'occupancy_status',
    'loan_purpose',
    'property_type',
    'channel',
    'num_borrowers',
    'first_time_homebuyer',
    'mi_pct',
]
derived_cols = [
    'refinance_incentive',
    'refi_incentive_bucket',
    'current_ltv_approx',
    'has_equity',
    'is_underwater',
    'rate_spread',
    'had_refi_opportunity',
]
national_macro_cols = [
    'UNRATE',
    'MORTGAGE30US',
    'FEDFUNDS',
    'DGS10',
    'UMCSENT',
    'hpi_yoy_change',
    'inflation_yoy',
    'mortgage_rate_yoy_change',
    'mortgage_spread',
    'yield_curve_slope',
    'gdp_growth',
]
state_macro_cols = [
    'state_unemployment',
    'state_hpi',
    'state_hpi_yoy',
]
event_cols = [
    'event_type',
    'event_code',
    'event_prepay',
    'event_default',
    'event_matured',
    'is_terminal',
]

final_columns = [
    *identifier_cols,
    *time_varying_loan_cols,
    *static_loan_cols,
    *derived_cols,
    *national_macro_cols,
    *state_macro_cols,
    *event_cols,
]

# Keep what the panel actually has; report anything requested but absent
panel_columns = set(loan_month_panel.columns)
available_columns = [c for c in final_columns if c in panel_columns]
missing_columns = [c for c in final_columns if c not in panel_columns]

if missing_columns:
    print(f"Warning: Missing columns: {missing_columns}")

loan_month_final = loan_month_panel.loc[:, available_columns].copy()

print(f"\nFinal panel: {len(loan_month_final):,} records, {len(available_columns)} columns")

In [None]:
# Save to parquet
output_path = PROCESSED_DATA_DIR / 'loan_month_panel.parquet'
# Create the output directory first: on a fresh checkout it may not exist,
# and failing here would throw away the whole processing run
output_path.parent.mkdir(parents=True, exist_ok=True)
loan_month_final.to_parquet(output_path, index=False)
print(f"Saved to {output_path}")
print(f"File size: {output_path.stat().st_size / 1e9:.2f} GB")

In [None]:
# Closing report on the saved panel
banner = "=" * 60
print(banner)
print("LOAN-MONTH PANEL SUMMARY")
print(banner)
print(f"\nTotal records: {len(loan_month_final):,}")
print(f"Unique loans: {loan_month_final['loan_sequence_number'].nunique():,}")
print(f"Date range: {loan_month_final['reporting_date'].min()} to {loan_month_final['reporting_date'].max()}")
print(f"Vintages: {loan_month_final['vintage_year'].min()} - {loan_month_final['vintage_year'].max()}")

# Outcome counts over the records flagged as terminal
print("\n=== Terminal Events ===")
terminal = loan_month_final.loc[loan_month_final['is_terminal'].eq(1)]
print(terminal['event_type'].value_counts())

# Descriptive statistics for three headline covariates, rounded to 2 decimals
print("\n=== Key Feature Stats ===")
headline_features = ['refinance_incentive', 'current_ltv_approx', 'state_unemployment']
print(loan_month_final[headline_features].describe().round(2))

## Next Steps

The loan-month panel is now ready for modeling:

1. **Notebook 04**: Non-parametric CIF estimation (Aalen-Johansen)
2. **Notebook 05**: Cause-specific Cox models
3. **Notebook 06**: Fine-Gray competing risks model

Key considerations:
- Use `loan_age` as the time variable (months since origination)
- For competing risks: `event_code` = 1 (prepay), 2 (default), 3 (other termination), 0 (censored; matured loans are also coded 0)
- Split by vintage for out-of-time validation