# Generate Calibration Store

This notebook generates model calibration metrics:
1. `overall_ae.json` - Overall A/E (actual-to-expected deaths) ratio
2. `yearly_ae.json` - A/E by observation year, plus the year factors read from `year_factors_offset.csv`

**Output:** `knowledge_base/calibration/`

In [None]:
import pandas as pd
import numpy as np
import json
import os
import lightgbm as lgb

# All paths below are relative, so they resolve against the working directory
# the notebook kernel is started from.

# Directory that receives the generated calibration JSON files.
OUTPUT_DIR = '../../knowledge_base/calibration'
# Parquet dataset loaded into `df` by the next cell.
DATA_PATH = '../../data/ilec_cleaned.parquet'
# LightGBM model file handed to `lgb.Booster(model_file=...)`.
MODEL_PATH = '../../models/lgbm_mortality_offset_poisson.txt'
# CSV of year factors; later cells read its `Year` and `Year_Factor` columns.
YEAR_FACTORS_PATH = '../../models/year_factors_offset.csv'

# Create the output directory (and any missing parents); `exist_ok=True`
# makes re-running this cell a no-op when the directory already exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f'Output directory: {OUTPUT_DIR}')

## 1. Load Data and Model

In [None]:
# Read the three inputs used by the rest of the notebook and report their sizes.

# Row-level dataset (parquet).
df = pd.read_parquet(DATA_PATH)
print('Data shape:', df.shape)

# LightGBM booster restored from its saved model file.
model = lgb.Booster(model_file=MODEL_PATH)
tree_count = model.num_trees()
print(f'Model loaded: {tree_count} trees')

# Year factors table; the message below reports its row count as "years"
# (presumably one row per year -- verify against the CSV).
year_factors_df = pd.read_csv(YEAR_FACTORS_PATH)
factor_rows = len(year_factors_df)
print(f'Year factors: {factor_rows} years')

In [None]:
# Columns passed to the model: numeric features first, then categorical ones.
NUMERICAL_FEATURES = ['Attained_Age', 'Issue_Age', 'Duration']
CATEGORICAL_FEATURES = [
    'Sex',
    'Smoker_Status',
    'Insurance_Plan',
    'Face_Amount_Band',
    'Preferred_Class',
    'SOA_Post_Lvl_Ind',
    'SOA_Antp_Lvl_TP',
    'SOA_Guar_Lvl_TP',
]
FEATURES = [*NUMERICAL_FEATURES, *CATEGORICAL_FEATURES]

# Build the feature matrix as a new frame (df itself is left untouched) with
# every categorical column cast to pandas' 'category' dtype in one call.
X = df[FEATURES].astype({name: 'category' for name in CATEGORICAL_FEATURES})

print(f'Features: {len(FEATURES)}')

## 2. Calculate Predictions

In [None]:
# Score every row, then turn the predicted rate into an expected death count
# by scaling it with the row's exposure (`Policies_Exposed`).
# NOTE(review): the year factors loaded from YEAR_FACTORS_PATH are not applied
# to these expected deaths anywhere in this notebook -- confirm that is intended.
df['Predicted_Rate'] = model.predict(X)
df['Expected_Deaths'] = df['Policies_Exposed'] * df['Predicted_Rate']

# Quick sanity check: portfolio-level expected vs. actual death totals.
for label, column in (('Expected', 'Expected_Deaths'), ('Actual', 'Death_Count')):
    print(f'{label} Deaths: {df[column].sum():,.0f}')

## 3. Overall A/E

In [None]:
# Overall A/E: total observed deaths divided by total model-expected deaths.
total_actual = df['Death_Count'].sum()
total_expected = df['Expected_Deaths'].sum()

# Fail fast on unusable totals. With numpy scalars a zero or NaN denominator
# does not raise: it produces inf/NaN. NaN fails both threshold comparisons
# below, so it would be reported as "well-calibrated" and then written to disk
# as NaN/Infinity, which is not valid JSON.
if not (
    np.isfinite(total_actual)
    and np.isfinite(total_expected)
    and total_expected > 0
):
    raise ValueError(
        'Cannot compute overall A/E: need finite totals and positive expected '
        f'deaths (actual={total_actual!r}, expected={total_expected!r})'
    )

overall_ae = total_actual / total_expected

# A/E within +/-5% of 1.0 is treated as well-calibrated; above that band the
# model predicted too few deaths, below it too many.
if overall_ae > 1.05:
    interpretation = "Model underestimates mortality"
elif overall_ae < 0.95:
    interpretation = "Model overestimates mortality"
else:
    interpretation = "Model is well-calibrated"

print(f'Overall A/E: {overall_ae:.4f} - {interpretation}')

# Save
overall_ae_data = {
    "overall_ae": round(overall_ae, 4),
    "total_actual_deaths": int(total_actual),
    "total_expected_deaths": round(total_expected, 0),
    "interpretation": interpretation,
    # NOTE(review): hard-coded label, not derived from the loaded data --
    # update it if the dataset's observation years change.
    "data_period": "2009-2019"
}

with open(f'{OUTPUT_DIR}/overall_ae.json', 'w', encoding='utf-8') as f:
    json.dump(overall_ae_data, f, indent=2)
print('✓ overall_ae.json saved')

## 4. Yearly A/E

In [None]:
# Sum deaths, expected deaths and exposure within each observation year, then
# derive the per-year A/E ratio from the summed columns.
summed_columns = ['Death_Count', 'Expected_Deaths', 'Policies_Exposed']
yearly = df.groupby('Observation_Year').agg(dict.fromkeys(summed_columns, 'sum'))
yearly['ae_ratio'] = yearly['Death_Count'] / yearly['Expected_Deaths']

print('A/E by Year:')
# `display` is the notebook's rich-output helper; values are rounded for
# presentation only, `yearly` itself keeps full precision.
display(yearly.round(4))

In [None]:
# Assemble the yearly calibration payload and write it out as JSON.

# One entry per observation year, keyed by the year as a string.
by_year = {
    str(int(obs_year)): {
        "ae_ratio": round(stats['ae_ratio'], 4),
        "actual_deaths": int(stats['Death_Count']),
        "expected_deaths": round(stats['Expected_Deaths'], 0),
    }
    for obs_year, stats in yearly.iterrows()
}

# Year factors are copied through from the CSV, rounded to four decimals.
year_factors = {
    str(int(record['Year'])): round(record['Year_Factor'], 4)
    for _, record in year_factors_df.iterrows()
}

yearly_ae_data = {"by_year": by_year, "year_factors": year_factors}

with open(f'{OUTPUT_DIR}/yearly_ae.json', 'w') as out_file:
    json.dump(yearly_ae_data, out_file, indent=2)
print('✓ yearly_ae.json saved')

## 5. Verify

In [None]:
# List every JSON file currently in the output directory with its size.
# Note this reports all *.json files found there, not only the two files
# written by this notebook.
print('Generated files:')
json_names = [name for name in os.listdir(OUTPUT_DIR) if name.endswith('.json')]
for name in json_names:
    n_bytes = os.path.getsize(os.path.join(OUTPUT_DIR, name))
    print(f'  ✓ {name} ({n_bytes} bytes)')