# 04. Generate Risk Segments

使用決策樹識別風險區段，生成 RAG 知識庫文件。

**兩層架構：**
- **Layer A (Coverage)**: 覆蓋整體人群的基礎區段
- **Layer B (Spotlight)**: 識別異常區段（風險顯著偏離整體平均）

**兩個指標：**
- `ae_ratio`: Actual / Expected (模型校準度)
- `relative_risk`: Segment Rate / Overall Rate (風險水平)

In [35]:
import pandas as pd
import numpy as np
import json
import os
import lightgbm as lgb
from sklearn.tree import DecisionTreeRegressor, export_text
from sklearn.preprocessing import LabelEncoder

# Paths
DATA_PATH = '../../data/ilec_cleaned.parquet'
MODEL_PATH = '../../models/lgbm_mortality_offset_poisson.txt'
OUTPUT_DIR = '../../knowledge_base/segments'
SPOTLIGHT_DIR = f'{OUTPUT_DIR}/spotlight'

os.makedirs(SPOTLIGHT_DIR, exist_ok=True)
print('Setup complete')

Setup complete


In [36]:
# Load data (10% sample)
print('Loading data...')
df_full = pd.read_parquet(DATA_PATH)
print(f'Full data: {len(df_full):,} rows')

df = df_full.sample(frac=0.1, random_state=42).reset_index(drop=True)
print(f'Sampled 10%: {len(df):,} rows')
del df_full

Loading data...
Full data: 45,501,036 rows
Sampled 10%: 4,550,104 rows


In [37]:
# Load model
print('Loading LightGBM model...')
model = lgb.Booster(model_file=MODEL_PATH)
print(f'Model loaded: {model.num_trees()} trees')

Loading LightGBM model...
Model loaded: 718 trees


In [38]:
# Prepare features and predict
FEATURES = ['Attained_Age', 'Issue_Age', 'Duration', 'Sex', 'Smoker_Status', 
            'Insurance_Plan', 'Face_Amount_Band', 'Preferred_Class', 
            'SOA_Post_Lvl_Ind', 'SOA_Antp_Lvl_TP', 'SOA_Guar_Lvl_TP']

CATEGORICAL = ['Sex', 'Smoker_Status', 'Insurance_Plan', 'Face_Amount_Band', 
               'Preferred_Class', 'SOA_Post_Lvl_Ind', 'SOA_Antp_Lvl_TP', 'SOA_Guar_Lvl_TP']

# Encode for prediction
X = df[FEATURES].copy()
encoders_model = {}
for col in CATEGORICAL:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    encoders_model[col] = le

# Predict
df['Predicted_Rate'] = model.predict(X.values)
df['Expected_Deaths'] = df['Predicted_Rate'] * df['Policies_Exposed']
df['Actual_Rate'] = df['Death_Count'] / df['Policies_Exposed'].replace(0, np.nan)

# Calculate overall rate
overall_rate = df['Death_Count'].sum() / df['Policies_Exposed'].sum()
print(f'Overall death rate: {overall_rate:.6f}')
print(f'Predictions complete. Range: [{df["Predicted_Rate"].min():.6f}, {df["Predicted_Rate"].max():.6f}]')

Overall death rate: 0.009666
Predictions complete. Range: [0.000021, 0.901006]


In [39]:
# Helper: Extract leaf path as rules
def get_leaf_path(tree, leaf_id, feature_names, encoders):
    """Extract decision rules for a leaf node."""
    tree_ = tree.tree_
    feature_name = feature_names
    
    # Find path to leaf
    def find_path(node, target, path=[]):
        if node == target:
            return path
        left = tree_.children_left[node]
        right = tree_.children_right[node]
        if left != -1:
            result = find_path(left, target, path + [(node, 'left')])
            if result:
                return result
        if right != -1:
            result = find_path(right, target, path + [(node, 'right')])
            if result:
                return result
        return None
    
    path = find_path(0, leaf_id)
    if not path:
        return []
    
    rules = []
    for node, direction in path:
        feat_idx = tree_.feature[node]
        thresh = tree_.threshold[node]
        feat_name = feature_name[feat_idx]
        
        if direction == 'left':
            rules.append(f"{feat_name} <= {thresh:.1f}")
        else:
            rules.append(f"{feat_name} > {thresh:.1f}")
    
    return rules

## Layer A: Coverage Tree

In [40]:
# Coverage tree features (interpretable)
COVERAGE_FEATURES = ['Attained_Age', 'Duration', 'Smoker_Status', 'Insurance_Plan']

# Encode
X_coverage = df[COVERAGE_FEATURES].copy()
encoders_cov = {}
for col in ['Smoker_Status', 'Insurance_Plan']:
    le = LabelEncoder()
    X_coverage[col] = le.fit_transform(X_coverage[col].astype(str))
    encoders_cov[col] = le

# Target: A/E ratio (model calibration)
y_coverage = df['Death_Count'] / df['Expected_Deaths'].replace(0, np.nan)
y_coverage = y_coverage.fillna(1.0).clip(0.1, 10)  # Handle NaN and extremes

# Train
coverage_tree = DecisionTreeRegressor(
    max_depth=4,
    min_samples_leaf=10000,
    random_state=42
)
coverage_tree.fit(X_coverage, y_coverage, sample_weight=df['Policies_Exposed'])
print(f'Coverage tree: {coverage_tree.get_n_leaves()} leaves')

Coverage tree: 16 leaves


In [41]:
# Generate coverage registry
df['Coverage_Leaf'] = coverage_tree.apply(X_coverage)

coverage_agg = df.groupby('Coverage_Leaf').agg({
    'Policies_Exposed': 'sum',
    'Death_Count': 'sum',
    'Expected_Deaths': 'sum'
}).reset_index()

# Calculate both metrics
coverage_agg['actual_rate'] = coverage_agg['Death_Count'] / coverage_agg['Policies_Exposed']
coverage_agg['ae_ratio'] = coverage_agg['Death_Count'] / coverage_agg['Expected_Deaths']
coverage_agg['relative_risk'] = coverage_agg['actual_rate'] / overall_rate

# Build registry
coverage_registry = {
    "layer": "A",
    "name": "Coverage Segments",
    "purpose": "Baseline risk segments covering entire population",
    "metrics": {
        "ae_ratio": "Actual Deaths / Expected Deaths (model calibration)",
        "relative_risk": "Segment Rate / Overall Rate (risk vs population)"
    },
    "overall_rate": round(overall_rate, 6),
    "n_segments": len(coverage_agg),
    "segments": []
}

for _, row in coverage_agg.iterrows():
    leaf_id = int(row['Coverage_Leaf'])
    rules = get_leaf_path(coverage_tree, leaf_id, COVERAGE_FEATURES, encoders_cov)
    
    segment = {
        "segment_id": f"COV_{leaf_id:03d}",
        "leaf_id": leaf_id,
        "rules": rules,
        "rule_text": " AND ".join(rules) if rules else "ROOT",
        "statistics": {
            "exposure": int(row['Policies_Exposed']),
            "actual_deaths": int(row['Death_Count']),
            "expected_deaths": round(row['Expected_Deaths'], 1),
            "actual_rate": round(row['actual_rate'], 6),
            "ae_ratio": round(row['ae_ratio'], 3),
            "relative_risk": round(row['relative_risk'], 3)
        },
        "interpretation": {
            "model": f"A/E = {row['ae_ratio']:.2f} ({'underestimate' if row['ae_ratio'] > 1.05 else 'overestimate' if row['ae_ratio'] < 0.95 else 'calibrated'})",
            "risk": f"RR = {row['relative_risk']:.2f} ({'higher' if row['relative_risk'] > 1.1 else 'lower' if row['relative_risk'] < 0.9 else 'average'} risk)"
        }
    }
    coverage_registry["segments"].append(segment)

# Save
with open(f'{OUTPUT_DIR}/coverage_registry.json', 'w') as f:
    json.dump(coverage_registry, f, indent=2)

# Save tree rules
with open(f'{OUTPUT_DIR}/coverage_tree_rules.txt', 'w') as f:
    f.write(export_text(coverage_tree, feature_names=COVERAGE_FEATURES))

print(f'✓ coverage_registry.json ({len(coverage_registry["segments"])} segments)')

✓ coverage_registry.json (16 segments)


## Layer B: Spotlight Tree

In [42]:
# Spotlight features (more granular)
SPOTLIGHT_FEATURES = ['Attained_Age', 'Duration', 'Smoker_Status', 'Insurance_Plan', 'Face_Amount_Band']

# Encode
X_spotlight = df[SPOTLIGHT_FEATURES].copy()
encoders_spot = {}
for col in ['Smoker_Status', 'Insurance_Plan', 'Face_Amount_Band']:
    le = LabelEncoder()
    X_spotlight[col] = le.fit_transform(X_spotlight[col].astype(str))
    encoders_spot[col] = le

# Target: Relative Risk (vs population average)
y_spotlight = df['Actual_Rate'] / overall_rate
y_spotlight = y_spotlight.fillna(1.0).clip(0.1, 10)

# Train
spotlight_tree = DecisionTreeRegressor(
    max_depth=6,
    min_samples_leaf=5000,
    random_state=42
)
spotlight_tree.fit(X_spotlight, y_spotlight, sample_weight=df['Policies_Exposed'])
print(f'Spotlight tree: {spotlight_tree.get_n_leaves()} leaves')

Spotlight tree: 59 leaves


In [43]:
# Aggregate spotlight
df['Spotlight_Leaf'] = spotlight_tree.apply(X_spotlight)

spotlight_agg = df.groupby('Spotlight_Leaf').agg({
    'Policies_Exposed': 'sum',
    'Death_Count': 'sum',
    'Expected_Deaths': 'sum'
}).reset_index()

# Calculate both metrics
spotlight_agg['actual_rate'] = spotlight_agg['Death_Count'] / spotlight_agg['Policies_Exposed']
spotlight_agg['ae_ratio'] = spotlight_agg['Death_Count'] / spotlight_agg['Expected_Deaths']
spotlight_agg['relative_risk'] = spotlight_agg['actual_rate'] / overall_rate

spotlight_agg['credibility'] = spotlight_agg['Policies_Exposed'].apply(
    lambda x: 'high' if x >= 50000 else ('medium' if x >= 10000 else 'low')
)

# Filter anomalous segments (RR significantly different from 1.0)
RR_THRESHOLD = 0.15  # |RR - 1| > 15%
anomalous = spotlight_agg[
    (abs(spotlight_agg['relative_risk'] - 1) > RR_THRESHOLD) &
    (spotlight_agg['credibility'].isin(['high', 'medium']))
].copy()

print(f'Spotlight leaves: {len(spotlight_agg)}')
print(f'Anomalous (|RR-1| > {RR_THRESHOLD}): {len(anomalous)}')

Spotlight leaves: 59
Anomalous (|RR-1| > 0.15): 54


In [44]:
# Generate spotlight files
spotlight_summary = {
    "layer": "B",
    "name": "Risk Spotlight",
    "purpose": "Identify segments with significant risk deviation from population",
    "metrics": {
        "ae_ratio": "Actual Deaths / Expected Deaths (model calibration)",
        "relative_risk": "Segment Rate / Overall Rate (risk vs population)"
    },
    "rr_threshold": RR_THRESHOLD,
    "overall_rate": round(overall_rate, 6),
    "n_anomalous": len(anomalous),
    "anomalies": []
}

for _, row in anomalous.iterrows():
    leaf_id = int(row['Spotlight_Leaf'])
    rules = get_leaf_path(spotlight_tree, leaf_id, SPOTLIGHT_FEATURES, encoders_spot)
    
    # Classify
    risk_level = 'high_risk' if row['relative_risk'] > 1 else 'low_risk'
    rr_deviation = abs(row['relative_risk'] - 1) * 100
    
    if row['ae_ratio'] > 1.05:
        model_status = 'underestimate'
    elif row['ae_ratio'] < 0.95:
        model_status = 'overestimate'
    else:
        model_status = 'calibrated'
    
    detail = {
        "segment_id": f"SPOT_{leaf_id:03d}",
        "leaf_id": leaf_id,
        "risk_level": risk_level,
        "model_status": model_status,
        "rules": rules,
        "rule_text": " AND ".join(rules) if rules else "ROOT",
        "statistics": {
            "exposure": int(row['Policies_Exposed']),
            "actual_deaths": int(row['Death_Count']),
            "expected_deaths": round(row['Expected_Deaths'], 1),
            "actual_rate": round(row['actual_rate'], 6),
            "ae_ratio": round(row['ae_ratio'], 3),
            "relative_risk": round(row['relative_risk'], 3)
        },
        "credibility": row['credibility'],
        "interpretation": {
            "risk": f"Mortality is {row['relative_risk']:.2f}x the population average ({rr_deviation:.1f}% {'higher' if risk_level == 'high_risk' else 'lower'}).",
            "model": f"Model {'underestimates' if model_status == 'underestimate' else 'overestimates' if model_status == 'overestimate' else 'is calibrated for'} this segment (A/E={row['ae_ratio']:.2f})."
        }
    }
    
    # Save individual file
    with open(f'{SPOTLIGHT_DIR}/SPOT_{leaf_id:03d}.json', 'w') as f:
        json.dump(detail, f, indent=2)
    
    spotlight_summary["anomalies"].append({
        "segment_id": f"SPOT_{leaf_id:03d}",
        "rule_text": detail["rule_text"],
        "ae_ratio": detail["statistics"]["ae_ratio"],
        "relative_risk": detail["statistics"]["relative_risk"],
        "risk_level": risk_level,
        "model_status": model_status
    })

# Save summary
with open(f'{OUTPUT_DIR}/spotlight_summary.json', 'w') as f:
    json.dump(spotlight_summary, f, indent=2)

# Save tree rules
with open(f'{OUTPUT_DIR}/spotlight_tree_rules.txt', 'w') as f:
    f.write(export_text(spotlight_tree, feature_names=SPOTLIGHT_FEATURES))

print(f'✓ spotlight_summary.json ({len(anomalous)} anomalies)')
print(f'✓ {len(anomalous)} detail files in spotlight/')

✓ spotlight_summary.json (54 anomalies)
✓ 54 detail files in spotlight/


## Summary

In [45]:
print('='*60)
print('GENERATED FILES')
print('='*60)
print(f'\nOverall death rate: {overall_rate:.6f}')
print(f'\nLayer A (Coverage):')
print(f'  ✓ coverage_registry.json ({len(coverage_registry["segments"])} segments)')
print(f'  ✓ coverage_tree_rules.txt')
print(f'\nLayer B (Spotlight):')
print(f'  ✓ spotlight_summary.json ({len(spotlight_summary["anomalies"])} anomalies)')
print(f'  ✓ spotlight_tree_rules.txt')
print(f'  ✓ {len(os.listdir(SPOTLIGHT_DIR))} files in spotlight/')

print('\n' + '='*60)
print('TOP ANOMALIES')
print('(RR = Relative Risk, A/E = Model Calibration)')
print('='*60)
for a in sorted(spotlight_summary['anomalies'], key=lambda x: abs(x['relative_risk']-1), reverse=True)[:10]:
    print(f"\n{a['segment_id']}: RR={a['relative_risk']:.2f}, A/E={a['ae_ratio']:.2f}")
    print(f"  Risk: {a['risk_level']}, Model: {a['model_status']}")
    print(f"  Rule: {a['rule_text'][:100]}{'...' if len(a['rule_text']) > 100 else ''}")

GENERATED FILES

Overall death rate: 0.009666

Layer A (Coverage):
  ✓ coverage_registry.json (16 segments)
  ✓ coverage_tree_rules.txt

Layer B (Spotlight):
  ✓ spotlight_summary.json (54 anomalies)
  ✓ spotlight_tree_rules.txt
  ✓ 61 files in spotlight/

TOP ANOMALIES
(RR = Relative Risk, A/E = Model Calibration)

SPOT_096: RR=14.47, A/E=0.99
  Risk: high_risk, Model: calibrated
  Rule: Attained_Age > 78.5 AND Attained_Age > 84.5 AND Insurance_Plan <= 1.5 AND Face_Amount_Band <= 1.5 AN...

SPOT_115: RR=12.87, A/E=0.97
  Risk: high_risk, Model: calibrated
  Rule: Attained_Age > 78.5 AND Attained_Age > 84.5 AND Insurance_Plan > 1.5 AND Duration > 21.5 AND Face_Am...

SPOT_102: RR=12.46, A/E=1.01
  Risk: high_risk, Model: calibrated
  Rule: Attained_Age > 78.5 AND Attained_Age > 84.5 AND Insurance_Plan <= 1.5 AND Face_Amount_Band > 1.5 AND...

SPOT_114: RR=12.24, A/E=1.01
  Risk: high_risk, Model: calibrated
  Rule: Attained_Age > 78.5 AND Attained_Age > 84.5 AND Insurance_Plan > 1.5 AN