# Freddie Mac Survival Analysis - Data Exploration

This notebook explores the preprocessed survival analysis data from Freddie Mac Single Family Loans.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from lifelines import KaplanMeierFitter, CoxPHFitter
from lifelines.statistics import logrank_test

# Apply seaborn's 'whitegrid' style to the figures drawn in this notebook.
sns.set_style('whitegrid')
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

## Load Data

In [None]:
# Read the preprocessed survival dataset into `df`.
# Parquet is the preferred format (faster to read); when pandas cannot import
# a parquet engine (e.g. pyarrow is not installed) it raises ImportError and
# the CSV copy of the same data is read instead.
parquet_path = '../data/processed/survival_data.parquet'
csv_path = '../data/processed/survival_data.csv'

try:
    df = pd.read_parquet(parquet_path)
except ImportError:
    df = pd.read_csv(csv_path)

print(f"Dataset shape: {df.shape}")
# Last expression of the cell: rich display of the first rows.
df.head()

In [None]:
# Outcome mix: number of loans per event type, plus the share of loans whose
# `event` indicator is 0 (reported as the censoring rate).
event_type_counts = df['event_type'].value_counts()
censoring_rate = df['event'].eq(0).mean()

print("=== Event Distribution ===")
print(event_type_counts)
print(f"\nCensoring rate: {censoring_rate:.2%}")

## Summary Statistics by Vintage

Explore key metrics per vintage year to understand how loan characteristics and outcomes vary over time.

In [None]:
# Summary statistics by vintage year: one row per `vintage_year`.
# Named aggregation produces the final column names directly, so no
# MultiIndex flattening / renaming step is needed afterwards.
vintage_stats = df.groupby('vintage_year').agg(
    n_loans=('loan_sequence_number', 'count'),
    avg_duration=('duration', 'mean'),
    median_duration=('duration', 'median'),
    event_rate=('event', 'mean'),
    avg_fico=('credit_score', 'mean'),
    avg_ltv=('orig_ltv', 'mean'),
    avg_dti=('orig_dti', 'mean'),
    avg_rate=('orig_interest_rate', 'mean'),
    avg_upb=('orig_upb', 'mean'),
).round(2)

# Event type breakdown: loan counts per (vintage, event_type), converted to
# row percentages so each vintage sums to 100 across all event types present.
event_breakdown = df.groupby(['vintage_year', 'event_type']).size().unstack(fill_value=0)
event_breakdown_pct = event_breakdown.div(event_breakdown.sum(axis=1), axis=0) * 100

# An event type may be entirely absent from the data (e.g. a sample with no
# defaults). `DataFrame.get(name, 0)` would then return the plain int 0, and
# calling `.round(2)` on it raises AttributeError, so check for the column
# explicitly and fall back to a constant 0.0 percentage.
for event_name in ('default', 'prepay', 'censored'):
    if event_name in event_breakdown_pct.columns:
        vintage_stats[f'{event_name}_pct'] = event_breakdown_pct[event_name].round(2)
    else:
        vintage_stats[f'{event_name}_pct'] = 0.0

print("=== Summary Statistics by Vintage Year ===\n")
display(vintage_stats)

In [None]:
# Visualize key metrics by vintage on a 2x3 grid.
# Each panel is described by one row of `panel_specs`; rows are listed in the
# same row-major order in which `axes.flat` walks the grid.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

vintages = vintage_stats.index

panel_specs = [
    # (kind, values, color, y-axis label, title)
    ('bar', vintage_stats['default_pct'], 'indianred',
     'Default Rate (%)', 'Default Rate by Vintage'),
    ('bar', vintage_stats['prepay_pct'], 'steelblue',
     'Prepayment Rate (%)', 'Prepayment Rate by Vintage'),
    ('line', vintage_stats['avg_fico'], 'darkgreen',
     'Average FICO', 'Average FICO Score by Vintage'),
    ('line', vintage_stats['avg_ltv'], 'darkorange',
     'Average LTV (%)', 'Average LTV by Vintage'),
    ('line', vintage_stats['avg_rate'], 'purple',
     'Average Interest Rate (%)', 'Average Interest Rate by Vintage'),
    # Average UPB is divided by 1000 so the axis reads in $K.
    ('line', vintage_stats['avg_upb'] / 1000, 'teal',
     'Average UPB ($K)', 'Average Loan Amount by Vintage'),
]

for panel, (kind, values, color, y_label, title) in zip(axes.flat, panel_specs):
    if kind == 'bar':
        panel.bar(vintages, values, color=color, alpha=0.7)
    else:
        panel.plot(vintages, values, marker='o', color=color)
    panel.set_xlabel('Vintage Year')
    panel.set_ylabel(y_label)
    panel.set_title(title)
    panel.tick_params(axis='x', rotation=45)

fig.tight_layout()
fig.savefig('../reports/figures/vintage_trends.png', dpi=150)
plt.show()

## Duration Distribution

In [None]:
# Two side-by-side histograms of loan duration:
# left = all loans pooled, right = one overlaid histogram per event type.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_overall, ax_by_event = axes

# Overall duration distribution
ax_overall.hist(df['duration'], bins=50, edgecolor='black', alpha=0.7)
ax_overall.set_xlabel('Duration (months)')
ax_overall.set_ylabel('Count')
ax_overall.set_title('Distribution of Loan Duration')

# Duration by event type, in order of first appearance in the data
for event_type in df['event_type'].unique():
    durations = df.loc[df['event_type'] == event_type, 'duration']
    ax_by_event.hist(durations, bins=50, alpha=0.5, label=event_type)
ax_by_event.set_xlabel('Duration (months)')
ax_by_event.set_ylabel('Count')
ax_by_event.set_title('Duration by Event Type')
ax_by_event.legend()

fig.tight_layout()
plt.show()

## Kaplan-Meier Survival Curves

In [None]:
# Kaplan-Meier estimate over every loan, using `event` as the observed-event
# indicator (any event type counts as an event here).
kmf = KaplanMeierFitter()
kmf.fit(df['duration'], event_observed=df['event'], label='All Loans')

# Headline numbers: median survival and survival probability at fixed
# horizons, skipping horizons beyond the longest observed duration.
longest_duration = df['duration'].max()
horizons = [t for t in [12, 24, 36, 60, 120] if t <= longest_duration]

print(f"Median survival time: {kmf.median_survival_time_:.1f} months")
print(f"\nSurvival probabilities:")
for t in horizons:
    print(f"  At {t:3d} months: {kmf.predict(t):.1%}")

# Survival curve figure
fig, ax = plt.subplots(figsize=(10, 6))
kmf.plot_survival_function(ax=ax)
ax.set_xlabel('Time (months)')
ax.set_ylabel('Survival Probability')
ax.set_title('Kaplan-Meier Survival Curve - All Loans (1999-2025)')
ax.grid(True, alpha=0.3)
fig.tight_layout()
fig.savefig('../reports/figures/km_overall.png', dpi=150)
plt.show()

In [None]:
# One Kaplan-Meier curve per vintage cohort, all drawn on a shared axis.
fig, ax = plt.subplots(figsize=(12, 7))
# Inclusive (first_year, last_year) ranges defining each cohort.
vintage_groups = [(1999, 2005), (2006, 2008), (2009, 2015), (2016, 2020), (2021, 2025)]

for start, end in vintage_groups:
    in_cohort = df['vintage_year'].between(start, end)
    if not in_cohort.any():
        # No loans originated in this range; nothing to fit.
        continue
    cohort = df.loc[in_cohort, ['duration', 'event']]
    kmf_group = KaplanMeierFitter()
    kmf_group.fit(cohort['duration'],
                  event_observed=cohort['event'],
                  label=f'{start}-{end}')
    kmf_group.plot_survival_function(ax=ax)

ax.set_xlabel('Time (months)')
ax.set_ylabel('Survival Probability')
ax.set_title('Kaplan-Meier Survival Curves by Vintage Year')
ax.legend(title='Vintage')
ax.grid(True, alpha=0.3)
fig.tight_layout()
fig.savefig('../reports/figures/km_by_vintage.png', dpi=150)
plt.show()

In [None]:
# One Kaplan-Meier curve per FICO band, matched against the `fico_band` column.
fig, ax = plt.subplots(figsize=(12, 7))
fico_bands = ['<620', '620-679', '680-739', '740-779', '780+']

for band in fico_bands:
    band_loans = df.loc[df['fico_band'] == band, ['duration', 'event']]
    if len(band_loans) <= 100:
        # Bands with 100 loans or fewer are not plotted.
        continue
    kmf_fico = KaplanMeierFitter()
    kmf_fico.fit(band_loans['duration'],
                 event_observed=band_loans['event'],
                 label=f'FICO {band}')
    kmf_fico.plot_survival_function(ax=ax)

ax.set_xlabel('Time (months)')
ax.set_ylabel('Survival Probability')
ax.set_title('Kaplan-Meier Survival Curves by FICO Score')
ax.legend(title='FICO Band')
ax.grid(True, alpha=0.3)
fig.tight_layout()
fig.savefig('../reports/figures/km_by_fico.png', dpi=150)
plt.show()

In [None]:
# One Kaplan-Meier curve per LTV band, matched against the `ltv_band` column.
fig, ax = plt.subplots(figsize=(12, 7))
ltv_bands = ['<=60', '61-70', '71-80', '81-90', '91-95', '>95']

for band in ltv_bands:
    is_band = df['ltv_band'] == band
    n_in_band = is_band.sum()
    if n_in_band <= 100:
        # Bands with 100 loans or fewer are not plotted.
        continue
    durations = df.loc[is_band, 'duration']
    observed = df.loc[is_band, 'event']
    kmf_ltv = KaplanMeierFitter()
    kmf_ltv.fit(durations, event_observed=observed, label=f'LTV {band}')
    kmf_ltv.plot_survival_function(ax=ax)

ax.set_xlabel('Time (months)')
ax.set_ylabel('Survival Probability')
ax.set_title('Kaplan-Meier Survival Curves by LTV')
ax.legend(title='LTV Band')
ax.grid(True, alpha=0.3)
fig.tight_layout()
fig.savefig('../reports/figures/km_by_ltv.png', dpi=150)
plt.show()

## Cox Proportional Hazards Model

In [None]:
# Build the modelling frame for the Cox regression: duration, event indicator
# and four origination covariates, keeping only rows with no missing values.
# The covariates are renamed to short labels for a cleaner model summary.
covariate_labels = {
    'credit_score': 'FICO',
    'orig_ltv': 'LTV',
    'orig_dti': 'DTI',
    'orig_interest_rate': 'Interest_Rate',
}
cox_cols = ['duration', 'event', *covariate_labels]
cox_df = df[cox_cols].dropna().rename(columns=covariate_labels)

print(f"Data for Cox model: {len(cox_df):,} loans")

In [None]:
# Fit a Cox proportional hazards model on the prepared frame; every column
# other than `duration` and `event` is used as a covariate.
cph = CoxPHFitter().fit(cox_df, duration_col='duration', event_col='event')

# Tabular summary of the fitted model
cph.print_summary()

In [None]:
# Plot the fitted Cox model. `cph.plot` is called without `hazard_ratios=True`,
# so this should show the coefficients (log hazard ratios), which matches the
# figure title; note the output file name still says "hazard_ratios".
fig, ax = plt.subplots(figsize=(10, 6))
cph.plot(ax=ax)
ax.set_title('Cox Proportional Hazards Model - Coefficient Plot')
fig.tight_layout()
fig.savefig('../reports/figures/cox_hazard_ratios.png', dpi=150)
plt.show()

## Competing Risks Analysis

Default and prepayment are competing events, so this section analyzes them separately, fitting one cause-specific survival curve per event type.

In [None]:
# Competing Risks: Cause-specific survival curves
#
# For each cause, every loan stays in the risk set and any outcome other than
# the cause of interest is recoded as censored at its observed duration.
# Loans that ended with the competing event must NOT be filtered out:
# dropping them removes their at-risk time and biases the cause-specific
# Kaplan-Meier curve (and the event shares printed below).
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Default-specific survival (prepayments and everything else count as censored)
df_default = df.assign(event=(df['event_type'] == 'default').astype(int))
kmf_default = KaplanMeierFitter()
kmf_default.fit(df_default['duration'], event_observed=df_default['event'], label='Default')
kmf_default.plot_survival_function(ax=axes[0])
axes[0].set_xlabel('Time (months)')
axes[0].set_ylabel('Survival Probability (no default)')
axes[0].set_title('Cause-Specific Survival: Default')
axes[0].grid(True, alpha=0.3)

# Prepayment-specific survival (defaults and everything else count as censored)
df_prepay = df.assign(event=(df['event_type'] == 'prepay').astype(int))
kmf_prepay = KaplanMeierFitter()
kmf_prepay.fit(df_prepay['duration'], event_observed=df_prepay['event'], label='Prepayment')
kmf_prepay.plot_survival_function(ax=axes[1])
axes[1].set_xlabel('Time (months)')
axes[1].set_ylabel('Survival Probability (no prepay)')
axes[1].set_title('Cause-Specific Survival: Prepayment')
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../reports/figures/km_competing_risks.png', dpi=150)
plt.show()

# Event counts and their share of ALL loans (both frames have one row per loan).
print(f"Default events: {df_default['event'].sum():,} ({df_default['event'].mean():.2%})")
print(f"Prepayment events: {df_prepay['event'].sum():,} ({df_prepay['event'].mean():.2%})")

## Next Steps

1. **Feature Engineering**: Create additional features (time-varying covariates, macroeconomic indicators)
2. **ML Models**: Implement Random Survival Forests, Gradient Boosted models
3. **Deep Learning**: Train DeepSurv or Cox-Time neural network models
4. **Model Evaluation**: Use concordance index, Brier score, calibration plots