# Non-parametric Cumulative Incidence Estimation

This notebook estimates the **Cumulative Incidence Function (CIF)** using the **Aalen-Johansen estimator**, which is the proper non-parametric estimator for competing risks.

## Key Concepts

- **CIF**: Probability of experiencing event k by time t, accounting for competing risks
- **1 - Kaplan-Meier ≠ CIF**: When competing risks exist, 1-KM overestimates cumulative incidence because it treats competing events as censoring
- **Aalen-Johansen**: Non-parametric estimator that properly handles competing risks

## Outputs
- CIF curves for prepayment and default
- CIF by key segments (vintage, FICO, LTV)
- Comparison of CIF vs 1-KM to show the difference

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Competing risks module
import sys
sys.path.insert(0, '..')
from src.competing_risks import (
    estimate_cif_aalen_johansen,
    plot_cumulative_incidence,
)
from src.competing_risks.cumulative_incidence import (
    estimate_cif_by_group,
    plot_cif_comparison,
    compare_cif_vs_kaplan_meier,
)

# Lifelines
from lifelines import AalenJohansenFitter, KaplanMeierFitter

sns.set_style('whitegrid')
%matplotlib inline

## Load Data

We need one record per loan with terminal event information.

In [None]:
# Read the loan-level survival table (one row per loan)
DATA_DIR = Path('../data/processed')
survival_path = DATA_DIR / 'survival_data.parquet'

df = pd.read_parquet(survival_path)

# Report the portfolio size, then the raw terminal-event mix
n_loans = len(df)
print(f"Loaded {n_loans:,} loans")
print("\nEvent distribution:")
print(df['event_type'].value_counts())

Loaded 1,324,950 loans

Event distribution:
event_type
prepay      941535
censored    345972
default      19728
matured       7113
other         7016
defect        3586
Name: count, dtype: int64


In [None]:
# Create numeric event codes for the competing-risks estimators.
#   0 = censored (matured loans are folded into this code)
#   1 = prepay
#   2 = default
#   3 = other terminal outcomes ('other' and 'defect' share this code)
event_map = {
    'censored': 0,
    'prepay': 1,
    'default': 2,
    'matured': 0,  # Treat matured as censored
    'other': 3,
    'defect': 3,
}
event_code = df['event_type'].map(event_map)

# Series.map() yields NaN for any label that has no key in event_map.
# Fail loudly here rather than hand NaN event codes to the estimators.
unmapped_mask = event_code.isna()
if unmapped_mask.any():
    unmapped_labels = df.loc[unmapped_mask, 'event_type'].unique().tolist()
    raise ValueError(
        f"event_type values with no entry in event_map: {unmapped_labels}"
    )

# Every label is mapped at this point, so an integer dtype is safe.
df['event_code'] = event_code.astype('int64')

print("Event code distribution:")
print(df['event_code'].value_counts().sort_index())
print("\n0=censored/matured, 1=prepay, 2=default, 3=other")

Event code distribution:
event_code
0    353085
1    941535
2     19728
3     10602
Name: count, dtype: int64

0=censored/matured, 1=prepay, 2=default, 3=other


## Overall Cumulative Incidence

Estimate CIF for prepayment and default across the entire portfolio.

In [None]:
# lifelines' AalenJohansenFitter breaks tied event times by adding a small
# random jitter to the durations. Fixing the seed makes the fitted curves,
# and the statistics printed from them, identical on every Restart & Run All.
JITTER_SEED = 42

# Estimate CIF for prepayment (event=1)
# Note: calculate_variance=False is critical for large datasets -
# the variance calculation is O(n²) and will hang with 1M+ observations
ajf_prepay = AalenJohansenFitter(calculate_variance=False, seed=JITTER_SEED)
ajf_prepay.fit(df['duration'], df['event_code'], event_of_interest=1)

# Estimate CIF for default (event=2)
ajf_default = AalenJohansenFitter(calculate_variance=False, seed=JITTER_SEED)
ajf_default.fit(df['duration'], df['event_code'], event_of_interest=2)

# Helper function to get CIF at a specific time
def cif_at_time(ajf, t):
    """Evaluate a fitted CIF step function at time ``t``.

    Parameters
    ----------
    ajf : object
        Fitted estimator exposing ``cumulative_density_``, a DataFrame indexed
        by time whose first column holds the CIF values.
    t : int or float
        Time at which to read the curve, in the same units as the index.

    Returns
    -------
    float
        CIF value at the latest index time that is ``<= t`` (the curve is read
        as a right-continuous step function, not by nearest neighbour), or
        ``0.0`` when ``t`` precedes every time in the index.
    """
    cdf = ajf.cumulative_density_
    at_or_before = cdf.index <= t
    if not at_or_before.any():
        return 0.0
    # Take the maximum instead of the last element so the result does not
    # depend on the index being sorted.
    # NOTE(review): lifelines jitters tied event times by a tiny random amount,
    # so events recorded at exactly `t` may sit just after `t` in the index and
    # be excluded here - confirm whether horizons should be padded slightly.
    last_time = cdf.index[at_or_before].max()
    return float(cdf.loc[last_time].values[0])

# Report both event CIFs at a handful of horizon months
print("=== Cumulative Incidence at Key Time Points ===")
horizon_months = (12, 24, 36, 60, 120)
for month in horizon_months:
    prepay_share = cif_at_time(ajf_prepay, month)
    default_share = cif_at_time(ajf_default, month)
    print(f"  At {month:3d} months: Prepay={prepay_share:.1%}, Default={default_share:.2%}")

In [None]:
# Draw the portfolio-level CIF of each event on its own panel
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (axis, fitted estimator, line colour, panel title) - prepayment left, default right
overall_panels = (
    (axes[0], ajf_prepay, 'steelblue', 'Cumulative Incidence: Prepayment'),
    (axes[1], ajf_default, 'indianred', 'Cumulative Incidence: Default'),
)
for panel, fitter, line_color, panel_title in overall_panels:
    fitter.plot(ax=panel, color=line_color)
    panel.set_title(panel_title)
    panel.set_xlabel('Time (months)')
    panel.set_ylabel('Cumulative Incidence')
    panel.grid(True, alpha=0.3)
    # Each panel shows a single curve, so its legend is removed
    panel.legend().remove()

plt.tight_layout()
plt.savefig('../reports/figures/cif_overall.png', dpi=150)
plt.show()

## CIF vs 1 - Kaplan-Meier Comparison

**Important**: When competing risks exist, `1 - Kaplan-Meier` is NOT the same as the cumulative incidence function. This comparison demonstrates the difference.

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# One panel per event: (axis, fitted AJ estimator, event code, CIF colour, title)
comparison_panels = [
    (axes[0], ajf_prepay, 1, 'steelblue', 'Prepayment: CIF vs 1-Kaplan-Meier'),
    (axes[1], ajf_default, 2, 'indianred', 'Default: CIF vs 1-Kaplan-Meier'),
]

for ax, aj_fitter, event_id, cif_color, panel_title in comparison_panels:
    # CIF (correct)
    aj_curve = aj_fitter.cumulative_density_
    ax.step(aj_curve.index, aj_curve.values.flatten(), where='post',
            label='CIF (Aalen-Johansen)', linewidth=2, color=cif_color)

    # 1 - KM (incorrect for competing risks): only `event_id` counts as an
    # event, every other outcome is passed to Kaplan-Meier as not observed
    is_event = (df['event_code'] == event_id).astype(int)
    kmf = KaplanMeierFitter()
    kmf.fit(df['duration'], is_event)
    km_curve = kmf.survival_function_
    ax.step(km_curve.index, 1 - km_curve.values.flatten(), where='post',
            label='1 - KM (incorrect)', linewidth=2, linestyle='--', color='orange')

    ax.set_xlabel('Time (months)')
    ax.set_ylabel('Cumulative Incidence')
    ax.set_title(panel_title)
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../reports/figures/cif_vs_km_comparison.png', dpi=150)
plt.show()

print("\nNote: 1-KM overestimates cumulative incidence when competing risks exist.")
print("This is because KM treats competing events as censored, ignoring that")
print("those subjects can never experience the event of interest.")

## CIF by Vintage Year

Compare cumulative incidence across different origination vintages.

In [None]:
# Origination-year buckets; each value is an inclusive (first_year, last_year) pair
vintage_groups = {
    '1999-2003': (1999, 2003),
    '2004-2008': (2004, 2008),
    '2009-2015': (2009, 2015),
    '2016-2020': (2016, 2020),
    '2021-2025': (2021, 2025),
}

fig, axes = plt.subplots(1, 2, figsize=(14, 6))
colors = plt.cm.viridis(np.linspace(0, 1, len(vintage_groups)))

# (axis, event code, panel title) - prepayment on the left, default on the right
vintage_panels = [
    (axes[0], 1, 'Prepayment CIF by Vintage'),
    (axes[1], 2, 'Default CIF by Vintage'),
]

for ax, event_id, panel_title in vintage_panels:
    for (name, (start, end)), color in zip(vintage_groups.items(), colors):
        in_bucket = df['vintage_year'].between(start, end)
        if in_bucket.sum() <= 100:
            # Buckets with 100 loans or fewer are not plotted
            continue
        ajf = AalenJohansenFitter(calculate_variance=False)
        ajf.fit(
            df.loc[in_bucket, 'duration'],
            df.loc[in_bucket, 'event_code'],
            event_of_interest=event_id,
        )
        ajf.plot(ax=ax, label=name, color=color)

    ax.set_title(panel_title)
    ax.set_xlabel('Time (months)')
    ax.set_ylabel('Cumulative Incidence')
    ax.legend(title='Vintage')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../reports/figures/cif_by_vintage.png', dpi=150)
plt.show()

## CIF by FICO Score Band

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
fico_bands = ['<620', '620-679', '680-739', '740-779', '780+']
# One RdYlGn colour per band, sampled between 0.1 and 0.9 of the colormap
colors = plt.cm.RdYlGn(np.linspace(0.1, 0.9, len(fico_bands)))

# (axis, event code, panel title) - prepayment on the left, default on the right
fico_panels = [
    (axes[0], 1, 'Prepayment CIF by FICO Score'),
    (axes[1], 2, 'Default CIF by FICO Score'),
]

for ax, event_id, panel_title in fico_panels:
    for band, color in zip(fico_bands, colors):
        in_band = df['fico_band'] == band
        if in_band.sum() <= 100:
            # Bands with 100 loans or fewer are not plotted
            continue
        ajf = AalenJohansenFitter(calculate_variance=False)
        ajf.fit(
            df.loc[in_band, 'duration'],
            df.loc[in_band, 'event_code'],
            event_of_interest=event_id,
        )
        ajf.plot(ax=ax, label=f'FICO {band}', color=color)

    ax.set_title(panel_title)
    ax.set_xlabel('Time (months)')
    ax.set_ylabel('Cumulative Incidence')
    ax.legend(title='FICO Band')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../reports/figures/cif_by_fico.png', dpi=150)
plt.show()

## CIF by LTV Band

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
ltv_bands = ['<=60', '61-70', '71-80', '81-90', '91-95', '>95']
# One coolwarm colour per band, evenly spaced across the full colormap
colors = plt.cm.coolwarm(np.linspace(0, 1, len(ltv_bands)))

# (axis, event code, panel title) - prepayment on the left, default on the right
ltv_panels = [
    (axes[0], 1, 'Prepayment CIF by LTV'),
    (axes[1], 2, 'Default CIF by LTV'),
]

for ax, event_id, panel_title in ltv_panels:
    for band, color in zip(ltv_bands, colors):
        in_band = df['ltv_band'] == band
        if in_band.sum() <= 100:
            # Bands with 100 loans or fewer are not plotted
            continue
        ajf = AalenJohansenFitter(calculate_variance=False)
        ajf.fit(
            df.loc[in_band, 'duration'],
            df.loc[in_band, 'event_code'],
            event_of_interest=event_id,
        )
        ajf.plot(ax=ax, label=f'LTV {band}', color=color)

    ax.set_title(panel_title)
    ax.set_xlabel('Time (months)')
    ax.set_ylabel('Cumulative Incidence')
    ax.legend(title='LTV Band')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../reports/figures/cif_by_ltv.png', dpi=150)
plt.show()

## Summary Statistics

In [None]:
# Create summary table of CIF at key time points by segment
SUMMARY_HORIZONS = (12, 36, 60)                   # months reported in the table
SUMMARY_EVENTS = (('Prepay', 1), ('Default', 2))  # (column prefix, event code)


def _cif_at_times(df_subset, event, times, seed=42):
    """Fit one Aalen-Johansen estimator and read its CIF at several times.

    Parameters
    ----------
    df_subset : DataFrame with 'duration' and 'event_code' columns.
    event : event code passed to the fitter as ``event_of_interest``.
    times : iterable of time points at which to read the fitted curve.
    seed : seed for the fitter's random tie-breaking jitter, so repeated runs
        give the same numbers.

    Returns
    -------
    list of float
        CIF values aligned with ``times``. Every entry is NaN when the subset
        is empty or when fitting/lookup raises.
    """
    import warnings  # local import: only used on the failure path

    times = list(times)
    if len(df_subset) == 0:
        return [np.nan] * len(times)
    try:
        ajf = AalenJohansenFitter(calculate_variance=False, seed=seed)
        ajf.fit(df_subset['duration'], df_subset['event_code'], event_of_interest=event)
        # Use cif_at_time helper defined earlier
        return [cif_at_time(ajf, t) for t in times]
    except Exception as exc:
        # Best-effort: keep building the table, but report what went wrong.
        # `Exception` (not a bare except) lets KeyboardInterrupt stop a long fit.
        warnings.warn(
            f"CIF estimation failed for event={event} on {len(df_subset):,} rows: {exc!r}"
        )
        return [np.nan] * len(times)


def get_cif_at_time(df_subset, event, time):
    """Get CIF at a specific time point (NaN if it cannot be estimated)."""
    return _cif_at_times(df_subset, event, [time])[0]


# Summary by vintage
summary_rows = []
for name, (start, end) in vintage_groups.items():
    mask = (df['vintage_year'] >= start) & (df['vintage_year'] <= end)
    df_sub = df[mask]

    row = {'Segment': name, 'N': len(df_sub)}
    for label, event in SUMMARY_EVENTS:
        # One fit per (segment, event); all horizons are read from that fit
        horizon_values = _cif_at_times(df_sub, event, SUMMARY_HORIZONS)
        for months, value in zip(SUMMARY_HORIZONS, horizon_values):
            row[f'{label} {months}m'] = value
    summary_rows.append(row)

summary_df = pd.DataFrame(summary_rows)

# Format as percentages. Default columns get two decimals, matching the
# precision of the overall key-time-point printout earlier in the notebook.
for col in summary_df.columns:
    if col not in ['Segment', 'N']:
        pct_format = "{:.2%}" if col.startswith('Default') else "{:.1%}"
        summary_df[col] = summary_df[col].apply(
            lambda x, fmt=pct_format: fmt.format(x) if pd.notna(x) else "N/A"
        )

print("=== Cumulative Incidence by Vintage ===")
print(summary_df.to_string(index=False))

## Next Steps

Now that we have non-parametric CIF estimates:

1. **Notebook 05**: Fit cause-specific Cox models for prepayment and default
2. **Notebook 06**: Fit Fine-Gray subdistribution hazard model
3. **Notebook 07**: Compare model predictions against these non-parametric baselines

Key observations from this analysis:
- Prepayment CIF is much higher than default CIF (as expected)
- 2004-2008 vintages show highest default rates
- Lower FICO bands show higher default incidence and lower prepayment incidence
- Higher LTV is associated with both lower prepayment incidence and higher default incidence