# Cause-Specific Cox Proportional Hazards Models

This notebook fits **cause-specific Cox models** for prepayment and default. In cause-specific analysis, competing events are treated as censored.

## Key Concepts

- **Cause-specific hazard**: Instantaneous rate of event k among loans still at risk (i.e., no event of any type yet)
- **Interpretation**: Effect of covariates on event intensity/rate
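- **Limitation**: A single cause-specific model doesn't directly give cumulative incidence predictions, because the probability of event k also depends on the hazards of the competing events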

## Models to Fit
1. Cause-specific Cox for **prepayment** (default treated as censored)
2. Cause-specific Cox for **default** (prepayment treated as censored)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Survival analysis
from lifelines import CoxPHFitter
from lifelines.statistics import logrank_test
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.metrics import concordance_index_censored

# Competing risks module
import sys
sys.path.insert(0, '..')
from src.competing_risks import (
    fit_cause_specific_cox,
    CauseSpecificCox,
)
from src.competing_risks.cause_specific import fit_both_cause_specific

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Apply seaborn's 'whitegrid' style to the figures drawn in this notebook
sns.set_style('whitegrid')
# IPython magic: render matplotlib figures inline in the cell output
%matplotlib inline

## Load Data

In [None]:
# Load the loan-level survival data
DATA_DIR = Path('../data/processed')

df = pd.read_parquet(DATA_DIR / 'survival_data.parquet')
print(f"Loaded {len(df):,} loans")

# Numeric competing-risks code for each event_type label.
# Labels that share a code are pooled (e.g. 'censored' and 'matured' -> 0).
event_map = {
    'censored': 0,
    'prepay': 1,
    'default': 2,
    'matured': 0,
    'other': 3,
    'defect': 3,
}
df['event_code'] = df['event_type'].map(event_map)

# .map() yields NaN for any label missing from event_map (and for null
# labels); those rows are removed by the complete-case dropna() used to build
# the modelling frame. Report them here so the loss is visible, not silent.
unmapped_types = df.loc[df['event_code'].isna(), 'event_type']
if not unmapped_types.empty:
    print(f"\nWARNING: {len(unmapped_types):,} loans have an event_type not in event_map:")
    print(unmapped_types.value_counts(dropna=False))

print(f"\nEvent distribution:")
print(df['event_type'].value_counts())

In [None]:
# Raw covariates taken from the loan-level frame
feature_cols = [
    'credit_score',
    'orig_ltv',
    'orig_dti',
    'orig_interest_rate',
    'orig_upb',
]

# Complete cases only: duration, event code and every covariate must be present
df_complete = df[['duration', 'event_code'] + feature_cols].dropna()
print(f"Complete cases: {len(df_complete):,} loans")

# np.log is -inf at 0 and NaN below 0; such values must not reach the Cox fit,
# so loans with a non-positive balance are excluded (and reported).
nonpositive_upb = df_complete['orig_upb'] <= 0
if nonpositive_upb.any():
    print(f"Dropping {int(nonpositive_upb.sum()):,} loans with non-positive orig_upb")

# .assign returns a new frame, so this cell can be re-run safely and avoids
# assigning a column onto the result of a filter (chained-assignment warning).
df_model = df_complete.loc[~nonpositive_upb].assign(
    log_upb=lambda frame: np.log(frame['orig_upb'])
)

# Model covariates: same order as feature_cols, with orig_upb replaced by log_upb
feature_cols_final = [col for col in feature_cols if col != 'orig_upb'] + ['log_upb']

In [None]:
# Hold out 20% of the modelling frame for evaluation; the fixed random_state
# keeps the partition identical across re-runs.
train_df, test_df = train_test_split(
    df_model,
    test_size=0.2,
    random_state=42,
)

# Report the size of each partition
print(f"Training set: {len(train_df):,} loans")
print(f"Test set: {len(test_df):,} loans")

## Cause-Specific Cox Model for Prepayment

Fit Cox model where:
- Event = prepayment (event_code = 1)
- Default and other events treated as censored

In [None]:
# Training frame for the prepayment model: 'event' is 1 only when the loan
# prepaid (event_code == 1); every other outcome counts as censored.
train_prepay = train_df.assign(event=train_df['event_code'].eq(1).astype(int))

# Columns handed to lifelines: covariates plus the duration/event pair
prepay_fit_cols = [*feature_cols_final, 'duration', 'event']

# Cox PH fit with penalizer=0.01
cph_prepay = CoxPHFitter(penalizer=0.01)
cph_prepay.fit(
    train_prepay[prepay_fit_cols],
    duration_col='duration',
    event_col='event',
)

print("=== Cause-Specific Cox Model: PREPAYMENT ===")
cph_prepay.print_summary()

In [None]:
# Plot hazard ratios for the prepayment model.
# CoxPHFitter.plot() draws log hazard ratios (raw coefficients) by default;
# hazard_ratios=True makes the figure match its title and file name.
fig, ax = plt.subplots(figsize=(10, 6))
cph_prepay.plot(hazard_ratios=True, ax=ax)
ax.set_title('Cause-Specific Cox Model: Prepayment\nHazard Ratios (95% CI)')
plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists
prepay_fig_path = Path('../reports/figures/cox_prepay_hazard_ratios.png')
prepay_fig_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(prepay_fig_path, dpi=150)
plt.show()

In [None]:
# Held-out frame with the same prepayment indicator used for training
test_prepay = test_df.assign(event=test_df['event_code'].eq(1).astype(int))

# Partial hazards from the fitted model serve as the per-loan risk score
risk_prepay = cph_prepay.predict_partial_hazard(test_prepay[feature_cols_final]).values

# Concordance on the test set; the result is a tuple whose first element is
# the C-index printed below.
c_index_prepay = concordance_index_censored(
    event_indicator=test_prepay['event'].astype(bool),
    event_time=test_prepay['duration'],
    estimate=risk_prepay.flatten(),
)

print(f"Prepayment Model - Test C-index: {c_index_prepay[0]:.4f}")

## Cause-Specific Cox Model for Default

Fit Cox model where:
- Event = default (event_code = 2)
- Prepayment and other events treated as censored

In [None]:
# Training frame for the default model: 'event' is 1 only when the loan
# defaulted (event_code == 2); every other outcome counts as censored.
train_default = train_df.assign(event=train_df['event_code'].eq(2).astype(int))

# Columns handed to lifelines: covariates plus the duration/event pair
default_fit_cols = [*feature_cols_final, 'duration', 'event']

# Cox PH fit with penalizer=0.01
cph_default = CoxPHFitter(penalizer=0.01)
cph_default.fit(
    train_default[default_fit_cols],
    duration_col='duration',
    event_col='event',
)

print("=== Cause-Specific Cox Model: DEFAULT ===")
cph_default.print_summary()

In [None]:
# Plot hazard ratios for the default model.
# CoxPHFitter.plot() draws log hazard ratios (raw coefficients) by default;
# hazard_ratios=True makes the figure match its title and file name.
fig, ax = plt.subplots(figsize=(10, 6))
cph_default.plot(hazard_ratios=True, ax=ax)
ax.set_title('Cause-Specific Cox Model: Default\nHazard Ratios (95% CI)')
plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists
default_fig_path = Path('../reports/figures/cox_default_hazard_ratios.png')
default_fig_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(default_fig_path, dpi=150)
plt.show()

In [None]:
# Held-out frame with the same default indicator used for training
test_default = test_df.assign(event=test_df['event_code'].eq(2).astype(int))

# Partial hazards from the fitted model serve as the per-loan risk score
risk_default = cph_default.predict_partial_hazard(test_default[feature_cols_final]).values

# Concordance on the test set; the result is a tuple whose first element is
# the C-index printed below.
c_index_default = concordance_index_censored(
    event_indicator=test_default['event'].astype(bool),
    event_time=test_default['duration'],
    estimate=risk_default.flatten(),
)

print(f"Default Model - Test C-index: {c_index_default[0]:.4f}")

## Compare Coefficients: Prepayment vs Default

In [None]:
# Columns of the lifelines summary table that feed the comparison
summary_cols = ['coef', 'exp(coef)', 'p']

# Pull coefficient, hazard ratio and p-value for each model and label them by cause
prepay_coefs = cph_prepay.summary.loc[:, summary_cols].copy()
prepay_coefs.columns = ['coef_prepay', 'hr_prepay', 'p_prepay']

default_coefs = cph_default.summary.loc[:, summary_cols].copy()
default_coefs.columns = ['coef_default', 'hr_default', 'p_default']

# Side-by-side table (joined on the covariate index) with two contrast columns:
# difference of coefficients and ratio of hazard ratios, prepayment vs default.
comparison = prepay_coefs.join(default_coefs)
comparison = comparison.assign(
    coef_diff=comparison['coef_prepay'] - comparison['coef_default'],
    hr_ratio=comparison['hr_prepay'] / comparison['hr_default'],
)

print("=== Coefficient Comparison: Prepayment vs Default ===")
print(comparison.round(4).to_string())

In [None]:
# Grouped bar chart of the cause-specific coefficients, one pair of bars per covariate
fig, ax = plt.subplots(figsize=(12, 6))

# Bar positions and tick labels both come from `comparison`, so each label is
# guaranteed to sit under the bars it describes regardless of row order.
covariate_labels = [str(name) for name in comparison.index]
x = np.arange(len(comparison))
width = 0.35

bars1 = ax.bar(x - width/2, comparison['coef_prepay'], width,
               label='Prepayment', color='steelblue', alpha=0.7)
bars2 = ax.bar(x + width/2, comparison['coef_default'], width,
               label='Default', color='indianred', alpha=0.7)

# Zero line separates hazard-increasing from hazard-decreasing coefficients
ax.axhline(y=0, color='black', linestyle='-', linewidth=0.5)
ax.set_xlabel('Feature')
ax.set_ylabel('Coefficient')
ax.set_title('Cause-Specific Cox Coefficients: Prepayment vs Default')
ax.set_xticks(x)
ax.set_xticklabels(covariate_labels, rotation=45, ha='right')
ax.legend()
ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists
comparison_fig_path = Path('../reports/figures/cox_coefficient_comparison.png')
comparison_fig_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(comparison_fig_path, dpi=150)
plt.show()

# NOTE(review): these observations are fixed text, not computed from the
# fitted models -- confirm them against the comparison table after each re-fit.
print("\nKey Observations:")
print("- Credit score: Negative for both (higher FICO = lower hazard)")
print("- LTV: Positive for default (higher LTV = more default risk)")
print("- Interest rate: Different effects for prepay vs default")

## Predicted Survival Curves

In [None]:
# Three illustrative borrowers that differ only in credit score; all other
# covariates are held at the same values.
profiles = pd.DataFrame({
    'credit_score': [620, 700, 780],
    'orig_ltv': [80, 80, 80],
    'orig_dti': [35, 35, 35],
    'orig_interest_rate': [5.0, 5.0, 5.0],
    'log_upb': [np.log(250000), np.log(250000), np.log(250000)],
}, index=['Low FICO', 'Medium FICO', 'High FICO'])

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# One panel per cause-specific model: (fitted model, title, y-axis label)
panels = [
    (cph_prepay, 'Prepayment: Survival Function by FICO', 'Survival Probability (no prepay)'),
    (cph_default, 'Default: Survival Function by FICO', 'Survival Probability (no default)'),
]

for ax, (model, panel_title, y_label) in zip(axes, panels):
    # Each curve comes from a single cause-specific model, with the competing
    # event treated as censoring during fitting.
    model.predict_survival_function(profiles).plot(ax=ax)
    ax.set_title(panel_title)
    ax.set_xlabel('Time (months)')
    ax.set_ylabel(y_label)
    ax.legend(title='Profile')
    ax.grid(True, alpha=0.3)

plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists
survival_fig_path = Path('../reports/figures/cox_survival_by_fico.png')
survival_fig_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(survival_fig_path, dpi=150)
plt.show()

## Model Summary

In [None]:
# Text recap of the held-out performance of both cause-specific models.
# Element [0] of each concordance result is reported as the C-index and
# element [1] as the number of concordant pairs.
banner = "=" * 60

print(banner)
print("CAUSE-SPECIFIC COX MODELS SUMMARY")
print(banner)

# Prepayment model metrics
print(f"\nPrepayment Model:")
print(f"  Test C-index: {c_index_prepay[0]:.4f}")
print(f"  Concordant pairs: {c_index_prepay[1]:,}")

# Default model metrics
print(f"\nDefault Model:")
print(f"  Test C-index: {c_index_default[0]:.4f}")
print(f"  Concordant pairs: {c_index_default[1]:,}")

# NOTE(review): the findings below are fixed text, not derived from the
# fitted coefficients -- confirm them against the model summaries.
print(f"\nKey Findings:")
print("  - Higher FICO reduces both prepayment and default hazard")
print("  - Higher LTV increases default hazard significantly")
print("  - Interest rate effect differs between prepay and default")

## Save Models for Later Use

In [None]:
import pickle

# Directory that receives the serialized models (created if absent)
MODELS_DIR = Path('../models')
MODELS_DIR.mkdir(exist_ok=True)

# Pickle each fitted cause-specific model under its own file name
fitted_models = (
    ('cox_prepay.pkl', cph_prepay),
    ('cox_default.pkl', cph_default),
)
for file_name, fitted_model in fitted_models:
    with open(MODELS_DIR / file_name, 'wb') as f:
        pickle.dump(fitted_model, f)

print(f"Models saved to {MODELS_DIR}")

## Next Steps

1. **Notebook 06**: Fit Fine-Gray subdistribution hazard model
2. **Notebook 07**: Compare Fine-Gray vs cause-specific Cox results

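**Important**: These cause-specific models estimate the hazard among those at risk, but do NOT directly predict cumulative incidence. A cumulative incidence prediction requires either combining the cause-specific hazards of *all* competing events or modelling the subdistribution hazard directly with the Fine-Gray model (next notebook).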