# 07 - Model Validation (ASOP 56 Compliant)

## Objectives
1. Out-of-time holdout validation (train 2012-2017, test 2018-2019)
2. Calibration analysis by decile
3. ML vs Traditional actuarial comparison
4. Stability analysis

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import matplotlib.pyplot as plt
import json
from pathlib import Path
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Input/output locations, relative to the notebook's working directory
DATA_PATH = '../data/ilec_cleaned.parquet'
MODEL_PATH = '../models/lgbm_mortality_offset_poisson.txt'
OUTPUT_DIR = Path('..') / 'knowledge_base' / 'methodology'

# Categorical predictors (the columns that get label-encoded)
CAT_FEATURES = [
    'Sex',
    'Smoker_Status',
    'Insurance_Plan',
    'Face_Amount_Band',
    'Preferred_Class',
    'SOA_Post_Lvl_Ind',
    'SOA_Antp_Lvl_TP',
    'SOA_Guar_Lvl_TP',
]

# Full predictor list: the three numeric columns first, then the categoricals,
# in the column order used to build the model matrices
FEATURES = ['Attained_Age', 'Issue_Age', 'Duration'] + CAT_FEATURES

## 1. Load Data and Model

In [None]:
# Read the cleaned dataset and report its size and calendar-year coverage
print('Loading data...')
df = pd.read_parquet(DATA_PATH)
n_records = len(df)
first_year, last_year = df['Year'].min(), df['Year'].max()
print(f'Total records: {n_records:,}')
print(f'Year range: {first_year} - {last_year}')

# Restore the previously saved LightGBM booster from its text dump
model = lgb.Booster(model_file=MODEL_PATH)
tree_count = model.num_trees()
print(f'Model loaded: {tree_count} trees')

## 2. Out-of-Time Validation

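Refit a fresh model on 2012-2017 only, then validate it on the 2018-2019 holdout. (The booster loaded from disk above is not the model being scored in this section.)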

In [None]:
# Chronological partition: rows up to and including 2017 are used for fitting,
# rows from 2018 onward are held out. Copies are taken so later column
# assignments do not write back into `df`.
is_train_year = df['Year'] <= 2017
is_test_year = df['Year'] >= 2018
train_df = df.loc[is_train_year].copy()
test_df = df.loc[is_test_year].copy()

n_train, n_test = len(train_df), len(test_df)
print(f'Train: {n_train:,} records (2012-2017)')
print(f'Test:  {n_test:,} records (2018-2019)')

In [None]:
# Encode categoricals as integer codes shared by the train and holdout frames.
from sklearn.preprocessing import LabelEncoder

encoders = {}
for col in CAT_FEATURES:
    train_labels = train_df[col].astype(str)
    test_labels = test_df[col].astype(str)
    train_levels = train_labels.unique()
    test_levels = test_labels.unique()

    # LabelEncoder.transform raises ValueError on labels it was not fitted on,
    # and an out-of-time holdout is exactly where new levels can appear.
    # Surface them instead of crashing.
    unseen = sorted(set(test_levels) - set(train_levels))
    if unseen:
        print(f'⚠ {col}: {len(unseen)} level(s) appear only in the holdout: {unseen}')

    # Fit the vocabulary on the union of levels (labels only, no target
    # information). When the holdout has no new levels the resulting codes are
    # identical to fitting on the training labels alone.
    encoder = LabelEncoder()
    encoder.fit(np.concatenate([train_levels, test_levels]))
    encoders[col] = encoder

    train_df[col] = encoder.transform(train_labels)
    test_df[col] = encoder.transform(test_labels)

In [None]:
# Refit a model using only the 2012-2017 rows so that 2018-2019 is a true holdout.

# Poisson objective with a fixed, small configuration
params = dict(
    objective='poisson',
    metric='poisson',
    max_depth=6,
    learning_rate=0.1,
    min_child_samples=100,
    verbosity=-1,
)

# Design matrices and targets for both periods
X_train, X_test = train_df[FEATURES], test_df[FEATURES]
y_train, y_test = train_df['Death_Count'], test_df['Death_Count']

# log(exposure) is supplied as the initial score, i.e. it acts as the offset
# of the Poisson model.
# NOTE(review): a row with zero Policies_Exposed would give log(0) = -inf here;
# presumably the cleaned data has none - confirm.
offset_train = np.log(train_df['Policies_Exposed'])
offset_test = np.log(test_df['Policies_Exposed'])

# NOTE(review): the label-encoded columns are passed as plain integer columns
# (no categorical_feature argument), so they are presumably treated as numeric
# by LightGBM - confirm this matches how the saved model was trained.
train_data = lgb.Dataset(data=X_train, label=y_train, init_score=offset_train)

# Fixed 100 boosting rounds, no early stopping
oot_model = lgb.train(params=params, train_set=train_data, num_boost_round=100)
print('Out-of-time model trained')

In [None]:
# Score the holdout: the booster's prediction is scaled by each row's exposure
# to obtain expected deaths (the prediction is used as a per-exposure rate).
predicted_rate = oot_model.predict(X_test)
test_df['Expected_Deaths'] = predicted_rate * test_df['Policies_Exposed']

# Aggregate actual-to-expected ratio over the whole holdout
total_actual = test_df['Death_Count'].sum()
total_expected = test_df['Expected_Deaths'].sum()
oot_ae = total_actual / total_expected

report_lines = (
    '=== Out-of-Time Validation (2018-2019) ===',
    f'Actual Deaths:   {total_actual:,.0f}',
    f'Expected Deaths: {total_expected:,.0f}',
    f'A/E Ratio:       {oot_ae:.4f}',
    '',
)
print('\n'.join(report_lines))

# A/E within ±5% of 1.0 counts as calibrated; otherwise report the direction
# (A/E above 1 means more deaths occurred than the model expected).
within_tolerance = 0.95 <= oot_ae <= 1.05
if not within_tolerance:
    direction = 'underestimation' if oot_ae > 1 else 'overestimation'
    print(f'⚠ Model shows {direction} on holdout')
else:
    print('✓ Model is well-calibrated on holdout data')

In [None]:
# Per-calendar-year totals of actual and expected deaths on the holdout
yearly_totals = test_df.groupby('Year')[['Death_Count', 'Expected_Deaths']].sum()
oot_yearly = yearly_totals.reset_index()

# Actual-to-expected ratio for each holdout year
oot_yearly['AE_Ratio'] = oot_yearly['Death_Count'].div(oot_yearly['Expected_Deaths'])

print('=== Holdout A/E by Year ===')
print(oot_yearly.to_string(index=False))

## 3. Calibration by Decile

In [None]:
# Bucket holdout rows into deciles of the model's predicted rate.
# duplicates='drop' lets qcut merge tied bin edges, so fewer than 10 groups
# are possible.
test_df['pred_rate'] = test_df['Expected_Deaths'] / test_df['Policies_Exposed']
test_df['decile'] = pd.qcut(test_df['pred_rate'], 10, labels=False, duplicates='drop')

# Totals per decile
calibration = test_df.groupby('decile').agg({
    'Death_Count': 'sum',
    'Expected_Deaths': 'sum',
    'Policies_Exposed': 'sum'
}).reset_index()

# Both rates are exposure-weighted (sum / sum) so they are directly comparable:
# actual_rate / pred_rate equals AE_Ratio for every decile. An unweighted mean
# of row-level predicted rates would not be on the same basis as actual_rate.
calibration['pred_rate'] = calibration['Expected_Deaths'] / calibration['Policies_Exposed']
calibration['actual_rate'] = calibration['Death_Count'] / calibration['Policies_Exposed']
calibration['AE_Ratio'] = calibration['Death_Count'] / calibration['Expected_Deaths']

print('=== Calibration by Predicted Rate Decile ===')
print(calibration[['decile', 'pred_rate', 'actual_rate', 'AE_Ratio']].to_string(index=False))

In [None]:
# Calibration plot: A/E ratio per predicted-rate decile with a ±5% reference band
fig, ax = plt.subplots(figsize=(10, 6))

ax.bar(calibration['decile'], calibration['AE_Ratio'],
       color='steelblue', alpha=0.7, label='A/E Ratio')
ax.axhline(1.0, color='red', linestyle='--', label='Perfect Calibration')
# Only one of the two band lines carries a label so the legend shows a single entry
ax.axhline(0.95, color='orange', linestyle=':', alpha=0.7)
ax.axhline(1.05, color='orange', linestyle=':', alpha=0.7, label='±5% Band')

ax.set_xlabel('Predicted Rate Decile (0=Lowest, 9=Highest)')
ax.set_ylabel('A/E Ratio')
ax.set_title('Model Calibration by Predicted Rate Decile\n(Out-of-Time: 2018-2019)')
ax.legend()
# Fixed axis range: bars for deciles with A/E outside 0.8-1.2 are clipped
ax.set_ylim(0.8, 1.2)

plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists
plot_path = Path('../data/plots/validation_calibration_decile.png')
plot_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(plot_path, dpi=150)
plt.show()
print('✓ Saved: validation_calibration_decile.png')

## 4. ML vs Traditional Actuarial Comparison

In [None]:
# Traditional approach: A/E by Age×Sex
# This simulates a basic actuarial table adjustment

# Calculate simple age-sex factors from training data
age_bins = pd.cut(train_df['Attained_Age'], bins=[0, 30, 50, 65, 80, 120], 
                  labels=['<30', '30-49', '50-64', '65-79', '80+'])
train_df['age_band'] = age_bins

# Simple factor table
trad_factors = train_df.groupby(['age_band', 'Sex']).agg({
    'Death_Count': 'sum',
    'Policies_Exposed': 'sum'
}).reset_index()
trad_factors['rate'] = trad_factors['Death_Count'] / trad_factors['Policies_Exposed']

print('=== Traditional Actuarial Factors (Age×Sex) ===')
print(trad_factors.pivot(index='age_band', columns='Sex', values='rate').round(6))

In [None]:
# Apply traditional factors to test data
test_df['age_band'] = pd.cut(test_df['Attained_Age'], bins=[0, 30, 50, 65, 80, 120],
                             labels=['<30', '30-49', '50-64', '65-79', '80+'])
test_df = test_df.merge(trad_factors[['age_band', 'Sex', 'rate']].rename(columns={'rate': 'trad_rate'}),
                        on=['age_band', 'Sex'], how='left')
test_df['trad_expected'] = test_df['trad_rate'] * test_df['Policies_Exposed']

# Compare
ml_ae = test_df['Death_Count'].sum() / test_df['Expected_Deaths'].sum()
trad_ae = test_df['Death_Count'].sum() / test_df['trad_expected'].sum()

ml_rmse = np.sqrt(mean_squared_error(test_df['Death_Count'], test_df['Expected_Deaths']))
trad_rmse = np.sqrt(mean_squared_error(test_df['Death_Count'], test_df['trad_expected']))

print('=== ML vs Traditional Comparison (On Holdout) ===')
print(f'{"Metric":<20} {"ML (LightGBM)":<15} {"Traditional":<15}')
print(f'{"A/E Ratio":<20} {ml_ae:<15.4f} {trad_ae:<15.4f}')
print(f'{"RMSE":<20} {ml_rmse:<15.4f} {trad_rmse:<15.4f}')
print()
if abs(ml_ae - 1) < abs(trad_ae - 1):
    print('✓ ML model shows better calibration')
else:
    print('⚠ Traditional approach shows better calibration')

## 5. Save Validation Results

In [None]:
# Save validation results
# The metrics are numpy scalars. json cannot serialise numpy.bool_ (the result
# of comparing two numpy floats) or non-float64 numpy numbers, so every value
# is coerced to a built-in int/float/bool before it goes into the payload.
validation_results = {
    "out_of_time_validation": {
        "train_period": "2012-2017",
        "test_period": "2018-2019",
        "total_actual": int(total_actual),
        "total_expected": round(float(total_expected), 0),
        "ae_ratio": round(float(oot_ae), 4),
        "status": "calibrated" if 0.95 <= oot_ae <= 1.05 else "needs_review"
    },
    "calibration_by_decile": calibration[['decile', 'AE_Ratio']].to_dict('records'),
    "ml_vs_traditional": {
        "ml_ae": round(float(ml_ae), 4),
        "trad_ae": round(float(trad_ae), 4),
        "ml_rmse": round(float(ml_rmse), 4),
        "trad_rmse": round(float(trad_rmse), 4),
        "ml_better": bool(abs(ml_ae - 1) < abs(trad_ae - 1))
    }
}

# Serialise before opening the file so a serialisation error cannot leave a
# truncated JSON file behind, and make sure the output directory exists.
payload = json.dumps(validation_results, indent=2)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
output_path = OUTPUT_DIR / 'validation_results.json'
with open(output_path, 'w', encoding='utf-8') as f:
    f.write(payload)

print(f'✓ Saved: {output_path}')

In [None]:
# Final recap of the three headline checks
print('=== Validation Summary ===')
print(f'Out-of-Time A/E: {oot_ae:.4f}')

ml_wins = validation_results["ml_vs_traditional"]["ml_better"]
print(f'ML vs Traditional: {"ML better" if ml_wins else "Traditional better"}')

# Every decile's A/E must fall strictly inside (0.8, 1.2); both outcomes are
# printed under the same 'Calibration:' label so the summary lines line up.
deciles_ok = all(0.8 < x < 1.2 for x in calibration['AE_Ratio'])
print(f'Calibration: {"All deciles within acceptable range" if deciles_ok else "Some deciles outside range"}')