# Freddie Mac Survival Analysis - Data Exploration

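This notebook explores the preprocessed Freddie Mac single-family loan data with survival-analysis tools: duration distributions, Kaplan-Meier survival curves (overall and by LTV group), a Cox proportional hazards model, and a cause-specific view of default vs. prepayment.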

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from lifelines import KaplanMeierFitter, CoxPHFitter
from lifelines.statistics import logrank_test

# Apply the 'seaborn-v0_8-whitegrid' matplotlib style sheet to every figure below.
plt.style.use('seaborn-v0_8-whitegrid')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

## Load Data

In [None]:
# Read the preprocessed survival table from parquet into the notebook-wide
# frame `df`, report its (rows, columns) shape, and preview the first rows.
df = pd.read_parquet('../data/processed/survival_data.parquet')
print("Dataset shape: {}".format(df.shape))
df.head(5)

In [None]:
# Event mix: number of loans per event_type value.
print("=== Event Distribution ===")
event_type_counts = df['event_type'].value_counts()
print(event_type_counts)

# Share of loans whose event flag is 0, i.e. no event was observed.
censored_share = df['event'].eq(0).mean()
print("\nCensoring rate: {:.2%}".format(censored_share))

## Duration Distribution

In [None]:
# Two-panel view of loan durations: all loans (left) and split by event type (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Compute one set of bin edges from the full duration column and reuse it for
# every histogram. Passing bins=50 to each call would give every subset its own
# range and bin width, so the overlaid bar heights would not be comparable.
duration_bins = np.histogram_bin_edges(df['duration'], bins=50)

# Overall duration distribution
axes[0].hist(df['duration'], bins=duration_bins, edgecolor='black', alpha=0.7)
axes[0].set_xlabel('Duration (months)')
axes[0].set_ylabel('Count')
axes[0].set_title('Distribution of Loan Duration')

# Duration by event type.
# groupby skips rows whose event_type is missing (no empty 'nan' series in the
# legend) and yields groups in sorted key order, so the legend is stable across
# runs. observed=True avoids empty groups if event_type is a categorical column.
for event_type, durations in df.groupby('event_type', observed=True)['duration']:
    axes[1].hist(durations, bins=duration_bins, alpha=0.5, label=event_type)
axes[1].set_xlabel('Duration (months)')
axes[1].set_ylabel('Count')
axes[1].set_title('Duration by Event Type')
axes[1].legend()

plt.tight_layout()
plt.show()

## Kaplan-Meier Survival Curves

In [None]:
# Kaplan-Meier estimate over the whole loan population.
# fit() returns the fitter itself, so construction and fitting can be chained.
kmf = KaplanMeierFitter().fit(
    df['duration'],
    event_observed=df['event'],
    label='All Loans',
)

fig, ax = plt.subplots(figsize=(10, 6))
kmf.plot_survival_function(ax=ax)
ax.set(
    xlabel='Time (months)',
    ylabel='Survival Probability',
    title='Kaplan-Meier Survival Curve - All Loans',
)
plt.show()

In [None]:
# Survival curves by LTV group
#
# pd.cut uses right-closed intervals here: (0, 70], (70, 80], (80, 90], (90, inf).
# The top bin is open-ended so that loans with LTV above 100 land in '>90%',
# as the label promises; with an upper edge of 100 they would become NaN and
# silently disappear from the plot.
# NOTE(review): if the processed data still carries a sentinel code for unknown
# LTV, mask it before binning -- confirm against the preprocessing step.
ltv_bins = [0, 70, 80, 90, np.inf]
ltv_labels = ['<70%', '70-80%', '80-90%', '>90%']
df['ltv_group'] = pd.cut(df['ltv'], bins=ltv_bins, labels=ltv_labels)

fig, ax = plt.subplots(figsize=(10, 6))

# Grouping on the categorical column walks the groups in category order
# (lowest LTV first), so the legend order no longer depends on row order.
# Rows without an LTV group are skipped; observed=True skips empty groups.
for group, loans in df.groupby('ltv_group', observed=True):
    kmf = KaplanMeierFitter()
    kmf.fit(loans['duration'], event_observed=loans['event'], label=f'LTV {group}')
    kmf.plot_survival_function(ax=ax)

ax.set_xlabel('Time (months)')
ax.set_ylabel('Survival Probability')
ax.set_title('Survival Curves by LTV Group')
plt.show()

## Cox Proportional Hazards Model

In [None]:
# Columns handed to the Cox model: duration, event flag, then the covariates.
cox_cols = ['duration', 'event', 'credit_score', 'ltv', 'dti', 'orig_interest_rate']
# Complete-case subset: drop every loan with a missing value in any of them.
cox_df = df.loc[:, cox_cols].dropna()

print("Data for Cox model: {} loans".format(len(cox_df)))

In [None]:
# Fit the Cox proportional hazards model on the complete-case frame.
# fit() returns the fitter, so it is bound to `cph` in a single expression.
cph = CoxPHFitter().fit(cox_df, duration_col='duration', event_col='event')

# Coefficient table and fit statistics
cph.print_summary()

In [None]:
# Plot hazard ratios
# CoxPHFitter.plot() draws the log hazard ratios (the raw coefficients) by
# default; hazard_ratios=True plots exp(coef) with its confidence interval so
# the figure shows what the title says.
cph.plot(hazard_ratios=True)
plt.title('Cox Model - Hazard Ratios')
plt.tight_layout()
plt.show()

## Competing Risks Analysis

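Cause-specific Kaplan-Meier curves for default and prepayment, each fitted with the other event treated as censoring. Because the two events compete, 1 − KM from these curves overstates the probability of each event and should not be read as a cumulative incidence.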

In [None]:
# Cause-specific event flags: 1 when the loan ended in that event, 0 otherwise,
# so in each fit the other event counts as censored.
event_labels = df['event_type']
df['event_default'] = event_labels.eq('default').astype(int)
df['event_prepay'] = event_labels.eq('prepay').astype(int)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# One Kaplan-Meier fit per cause (fit() returns the fitter itself).
kmf_default = KaplanMeierFitter().fit(
    df['duration'], event_observed=df['event_default'], label='Default'
)
kmf_prepay = KaplanMeierFitter().fit(
    df['duration'], event_observed=df['event_prepay'], label='Prepayment'
)

# Left panel: default, right panel: prepayment.
panels = [
    (axes[0], kmf_default, 'Cause-Specific Survival: Default'),
    (axes[1], kmf_prepay, 'Cause-Specific Survival: Prepayment'),
]
for panel_ax, fitter, panel_title in panels:
    fitter.plot_survival_function(ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Time (months)')

plt.tight_layout()
plt.show()

## Next Steps

1. **Feature Engineering**: Create additional features (time-varying covariates, macroeconomic indicators)
2. **ML Models**: Implement Random Survival Forests, Gradient Boosted models
3. **Deep Learning**: Train DeepSurv or Cox-Time neural network models
4. **Model Evaluation**: Use concordance index, Brier score, calibration plots