In [None]:
# %%
"""
Premium Optimization Modeling Pipeline
=====================================

This notebook implements the complete pipeline for premium optimization,
including claim probability prediction and risk-based premium calculation.

Business Objective:
- Predict optimal premium values based on risk factors
- Build a claim probability model for risk assessment
- Develop risk-based pricing framework
"""

# %%
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import sys
import os

# Add src to Python path
sys.path.append('../src')

from modeling.data_preprocessor import DataPreprocessor
from modeling.feature_engineer import FeatureEngineer
from modeling.model_trainer import ModelTrainer
from modeling.model_evaluator import ModelEvaluator
from modeling.model_interpreter import ModelInterpreter

# NOTE(review): this silences every warning category for the whole notebook,
# including deprecation and convergence warnings from the modelling libraries.
warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8')

# %%
# Load the data
# Relative to the notebook's working directory; adjust as needed.
RAW_DATA_PATH = '../data/raw/insurance_data.csv'

print("Loading insurance data...")
data = pd.read_csv(RAW_DATA_PATH)
print(f"Data shape: {data.shape}")

# %%
# Initialize preprocessing pipeline
preprocessor = DataPreprocessor(random_state=42)
feature_engineer = FeatureEngineer()

print("Preparing premium optimization dataset...")
# Prepare data for premium optimization (all policies).
# Each stage below gets its own name instead of re-binding `premium_data`,
# so any cell can be re-run without feeding its own output back in.
premium_data_prepared = preprocessor.prepare_premium_optimization_data(data)
print(f"Premium optimization dataset shape: {premium_data_prepared.shape}")

# %%
# Feature Engineering
print("Engineering features for premium optimization...")
# NOTE(review): assumes the feature-engineering methods return a new frame
# rather than mutating their argument in place -- confirm in FeatureEngineer.
premium_data_risk = feature_engineer.create_risk_features(premium_data_prepared)
# `premium_data` is the fully engineered frame that later cells read.
premium_data = feature_engineer.create_interaction_features(premium_data_risk)

print(f"Data shape after feature engineering: {premium_data.shape}")

# %%
# Data preprocessing
print("Encoding categorical features...")
premium_data_encoded = preprocessor.encode_categorical_features(premium_data, encoding_strategy='mixed')

# %%
# PART 1: CLAIM PROBABILITY MODELING
print("\n" + "="*50)
print("PART 1: CLAIM PROBABILITY MODELING")
print("="*50)

# Prepare features and target for claim probability.
# The target itself ('HasClaim') must be excluded from the feature matrix;
# leaving it in lets every model read the answer directly and makes all
# downstream metrics meaningless.
# NOTE(review): any engineered column derived from TotalClaims would leak the
# target in the same way -- verify what create_risk_features adds.
exclude_columns = ['HasClaim', 'TotalClaims', 'TotalPremium', 'CalculatedPremiumPerTerm',
                  'PolicyID', 'UnderwrittenCoverID', 'TransactionMonth']
feature_columns = [col for col in premium_data_encoded.columns if col not in exclude_columns]

X_prob = premium_data_encoded[feature_columns]
y_prob = premium_data_encoded['HasClaim']  # Binary target

print(f"Claim probability - Feature matrix shape: {X_prob.shape}")
print(f"Claim probability - Target distribution: {y_prob.value_counts()}")

# %%
# Train-test split for claim probability
# 80/20 split; the binary target is passed as `stratify`
# (presumably forwarded to a stratified splitter -- confirm in DataPreprocessor.split_data).
X_prob_train, X_prob_test, y_prob_train, y_prob_test = preprocessor.split_data(
    X_prob, y_prob, test_size=0.2, stratify=y_prob
)

# Scale features
# NOTE(review): presumably the scaler is fitted on the training partition only
# and then applied to the test partition -- confirm in scale_features.
X_prob_train_scaled, X_prob_test_scaled = preprocessor.scale_features(X_prob_train, X_prob_test)

# Feature selection for claim probability
# Selection is given only the training partition; the test partition is then
# restricted to the same selected columns.
X_prob_train_selected, prob_selected_features = feature_engineer.select_features(
    X_prob_train_scaled, y_prob_train, method='mutual_info', k=20, problem_type='classification'
)
X_prob_test_selected = X_prob_test_scaled[prob_selected_features]

print(f"Selected features for claim probability: {prob_selected_features}")

# %%
# Train claim probability models
print("Training claim probability models...")

prob_trainer = ModelTrainer(random_state=42)

# Fit the three classifiers in a fixed order (decision tree, random forest,
# XGBoost), all on the selected training features and without hyperparameter
# tuning. The mapping is keyed by model name.
prob_models = {
    model_key: train_fn(
        X_prob_train_selected, y_prob_train,
        problem_type='classification',
        tune_hyperparameters=False,
    )
    for model_key, train_fn in (
        ('decision_tree', prob_trainer.train_decision_tree),
        ('random_forest', prob_trainer.train_random_forest),
        ('xgboost', prob_trainer.train_xgboost),
    )
}

print("Claim probability models trained!")

# %%
# Evaluate claim probability models
print("Evaluating claim probability models...")

prob_evaluator = ModelEvaluator()


def _evaluate_claim_model(name, fitted_model):
    """Announce and score one classifier on the held-out claim-probability data."""
    print(f"\nEvaluating {name} for claim probability...")
    # NOTE(review): y_train is passed before X_train here -- verify this
    # positional order against evaluate_classification_model's signature.
    return prob_evaluator.evaluate_classification_model(
        fitted_model, name, X_prob_test_selected, y_prob_test,
        y_prob_train, X_prob_train_selected
    )


# One evaluation result per trained model, keyed by model name.
prob_evaluation_results = {
    name: _evaluate_claim_model(name, fitted_model)
    for name, fitted_model in prob_models.items()
}

# Compare models
# NOTE(review): accuracy can be dominated by the majority class when claims
# are rare; consider ranking on another metric if compare_models supports one.
prob_comparison = prob_evaluator.compare_models(metric='accuracy', problem_type='classification')
print("\nClaim Probability Model Comparison:")
print(prob_comparison)

# %%
# PART 2: PREMIUM PREDICTION MODELING
print("\n" + "="*50)
print("PART 2: PREMIUM PREDICTION MODELING")
print("="*50)

# Prepare features and target for premium prediction.
# Excluded from the feature matrix:
#   * claim outcome columns (not known at pricing time),
#   * identifier / date columns,
#   * the regression target 'CalculatedPremiumPerTerm' and the alternate
#     premium column 'TotalPremium' -- keeping either one would let the model
#     read the answer straight from its inputs.
exclude_columns_premium = ['TotalClaims', 'HasClaim', 'PolicyID',
                          'UnderwrittenCoverID', 'TransactionMonth',
                          'CalculatedPremiumPerTerm', 'TotalPremium']
feature_columns_premium = [col for col in premium_data_encoded.columns
                          if col not in exclude_columns_premium]

X_premium = premium_data_encoded[feature_columns_premium]
y_premium = premium_data_encoded['CalculatedPremiumPerTerm']  # or TotalPremium

print(f"Premium prediction - Feature matrix shape: {X_premium.shape}")
print(f"Premium prediction - Target statistics:")
print(f"Mean: {y_premium.mean():.2f}")
print(f"Std: {y_premium.std():.2f}")

# %%
# Train-test split for premium prediction
# 80/20 split with no `stratify` argument. This is a separate call from the
# claim-probability split, so the two test partitions are not guaranteed to
# contain the same rows.
X_prem_train, X_prem_test, y_prem_train, y_prem_test = preprocessor.split_data(
    X_premium, y_premium, test_size=0.2
)

# Scale features
# NOTE(review): presumably the scaler is fitted on the training partition only
# and then applied to the test partition -- confirm in scale_features.
X_prem_train_scaled, X_prem_test_scaled = preprocessor.scale_features(X_prem_train, X_prem_test)

# Feature selection for premium prediction
# Selection is given only the training partition; the test partition is then
# restricted to the same selected columns.
X_prem_train_selected, prem_selected_features = feature_engineer.select_features(
    X_prem_train_scaled, y_prem_train, method='mutual_info', k=20, problem_type='regression'
)
X_prem_test_selected = X_prem_test_scaled[prem_selected_features]

print(f"Selected features for premium prediction: {prem_selected_features}")

# %%
# Train premium prediction models
print("Training premium prediction models...")

prem_trainer = ModelTrainer(random_state=42)

# Options shared by every regressor fitted through train_all_models.
prem_training_options = {
    'problem_type': 'regression',
    'tune_hyperparameters': False,
    'cv_folds': 5,
}

# Train all regression models
prem_models = prem_trainer.train_all_models(
    X_prem_train_selected, y_prem_train, **prem_training_options
)

print("Premium prediction models trained!")

# %%
# Evaluate premium prediction models
print("Evaluating premium prediction models...")

prem_evaluator = ModelEvaluator()


def _evaluate_premium_model(name, fitted_model):
    """Announce and score one regressor on the held-out premium data."""
    print(f"\nEvaluating {name} for premium prediction...")
    # NOTE(review): y_train is passed before X_train here -- verify this
    # positional order against evaluate_regression_model's signature.
    return prem_evaluator.evaluate_regression_model(
        fitted_model, name, X_prem_test_selected, y_prem_test,
        y_prem_train, X_prem_train_selected
    )


# One evaluation result per trained model, keyed by model name.
prem_evaluation_results = {
    name: _evaluate_premium_model(name, fitted_model)
    for name, fitted_model in prem_models.items()
}

# Compare models
prem_comparison = prem_evaluator.compare_models(metric='rmse', problem_type='regression')
print("\nPremium Prediction Model Comparison:")
print(prem_comparison)

# %%
# PART 3: RISK-BASED PREMIUM FRAMEWORK
print("\n" + "="*50)
print("PART 3: RISK-BASED PREMIUM FRAMEWORK")
print("="*50)

# Get best models
# The first row of each comparison table is taken as the winner.
# NOTE(review): this assumes compare_models returns rows sorted best-first
# (highest accuracy, lowest RMSE) -- confirm in ModelEvaluator.
best_prob_model_name = prob_comparison['model'].iloc[0]
best_prem_model_name = prem_comparison['model'].iloc[0]

# Look the fitted estimators up by the winning names.
best_prob_model, best_prem_model = (
    prob_models[best_prob_model_name],
    prem_models[best_prem_model_name],
)

print(f"Best claim probability model: {best_prob_model_name}")
print(f"Best premium prediction model: {best_prem_model_name}")

# %%
# Implement risk-based pricing framework
def calculate_risk_based_premium(prob_model, severity_model, X_features, 
                                expense_loading=0.15, profit_margin=0.10,
                                base_severity=None, severity_features=None):
    """
    Calculate risk-based premiums.

    Premium = (Claim Probability * Expected Claim Severity)
              * (1 + Expense Loading + Profit Margin)

    Parameters
    ----------
    prob_model : object
        Fitted classifier exposing ``predict_proba``; column 1 of its output
        is used as the claim probability.
    severity_model : object or None
        Optional fitted regressor exposing ``predict``. When given, it supplies
        a per-row expected claim severity. When ``None``, a single portfolio
        average severity is used for every row.
    X_features : array-like
        Feature matrix passed to ``prob_model``.
    expense_loading : float, default 0.15
        Additive loading for expenses.
    profit_margin : float, default 0.10
        Additive loading for profit.
    base_severity : float or None, default None
        Average claim severity used when ``severity_model`` is ``None``.
        If omitted, it is computed from the notebook-level ``premium_data``
        frame as the mean of the strictly positive ``TotalClaims`` values.
    severity_features : array-like or None, default None
        Feature matrix passed to ``severity_model``. Defaults to ``X_features``;
        pass it explicitly when the severity model was trained on a different
        feature set.

    Returns
    -------
    tuple
        ``(final_premium, claim_prob, risk_premium)``: the loaded premium, the
        predicted claim probability, and the unloaded risk premium
        (probability * severity).

    Raises
    ------
    ValueError
        If the average severity is NaN (for example, no positive claims in the
        data), which would otherwise silently yield all-NaN premiums.
    """
    # Predict claim probability (probability of the positive class).
    claim_prob = prob_model.predict_proba(X_features)[:, 1]

    if severity_model is not None:
        # Per-row severity from the supplied model.
        model_input = X_features if severity_features is None else severity_features
        expected_severity = np.asarray(severity_model.predict(model_input), dtype=float)
    else:
        if base_severity is None:
            # Fallback: portfolio average over policies that had a claim.
            claim_amounts = premium_data['TotalClaims']
            base_severity = claim_amounts[claim_amounts > 0].mean()
        expected_severity = float(base_severity)
        if np.isnan(expected_severity):
            raise ValueError(
                "Average claim severity is NaN; no positive claims available "
                "to estimate it. Pass base_severity or a severity_model."
            )

    # Calculate base risk premium
    risk_premium = claim_prob * expected_severity

    # Add expense loading and profit margin
    final_premium = risk_premium * (1 + expense_loading + profit_margin)

    return final_premium, claim_prob, risk_premium

# %%
# Apply risk-based pricing to test set
print("Calculating risk-based premiums...")

risk_premiums, claim_probabilities, base_risk_premiums = calculate_risk_based_premium(
    best_prob_model, None, X_prob_test_selected
)

# The claim-probability and premium models were evaluated on two independent
# train/test splits (one stratified, one not), so their test partitions hold
# different policies in a different order. Pairing the columns by position
# would compare the premium of one policy with the risk of another. Instead,
# label each side with the row labels of its own test partition and keep only
# the policies that were held out by BOTH models.
# NOTE(review): assumes split_data preserves the original row labels and that
# scaling / feature selection keep the row order -- confirm in DataPreprocessor.
risk_side = pd.DataFrame({
    'risk_based_premium': np.asarray(risk_premiums),
    'claim_probability': np.asarray(claim_probabilities),
    'base_risk_premium': np.asarray(base_risk_premiums),
}, index=X_prob_test.index)

premium_side = pd.DataFrame({
    'actual_premium': y_prem_test.values,
    'predicted_premium': np.asarray(best_prem_model.predict(X_prem_test_selected)),
}, index=y_prem_test.index)

# Duplicate labels would make the join multiply rows instead of pairing them.
if not (risk_side.index.is_unique and premium_side.index.is_unique):
    raise ValueError("Test-set row labels are not unique; cannot align the two test sets.")

# Create results DataFrame (same column order as before: actual, predicted,
# risk-based, claim probability, base risk premium).
results_df = premium_side.join(risk_side, how='inner')

if results_df.empty:
    raise ValueError("The two test sets share no policies; cannot compare premiums row by row.")

print(f"Policies present in both test sets: {len(results_df)}")
print("Risk-based pricing results:")
print(results_df.head())

# %%
# Analyze pricing results
print("\nPricing Analysis:")
# Mean of each premium column, reported in dollars with two decimals.
for premium_label, premium_column in (
    ('actual', 'actual_premium'),
    ('predicted', 'predicted_premium'),
    ('risk-based', 'risk_based_premium'),
):
    print(f"Average {premium_label} premium: ${results_df[premium_column].mean():.2f}")

# Summary statistics of the predicted claim probabilities.
print(f"\nClaim probability distribution:")
claim_probability_column = results_df['claim_probability']
for stat_label, stat_method in (('Mean', 'mean'), ('Std', 'std'), ('Min', 'min'), ('Max', 'max')):
    print(f"{stat_label}: {getattr(claim_probability_column, stat_method)():.4f}")

# %%
# Visualize pricing results
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
# Row-major unpacking: top-left, top-right, bottom-left, bottom-right.
ax_predicted, ax_probability, ax_risk, ax_box = axes.ravel()

actual_premium = results_df['actual_premium']
# End points of the y = x reference line, spanning the actual-premium range.
identity_span = [actual_premium.min(), actual_premium.max()]

# Actual vs Predicted Premium
ax_predicted.scatter(actual_premium, results_df['predicted_premium'], alpha=0.6)
ax_predicted.plot(identity_span, identity_span, 'r--')
ax_predicted.set(xlabel='Actual Premium', ylabel='Predicted Premium',
                 title='Actual vs Predicted Premium')

# Claim Probability Distribution
ax_probability.hist(results_df['claim_probability'], bins=30, edgecolor='black', alpha=0.7)
ax_probability.set(xlabel='Claim Probability', ylabel='Frequency',
                   title='Distribution of Claim Probabilities')

# Risk-based vs Actual Premium
ax_risk.scatter(actual_premium, results_df['risk_based_premium'], alpha=0.6)
ax_risk.plot(identity_span, identity_span, 'r--')
ax_risk.set(xlabel='Actual Premium', ylabel='Risk-based Premium',
            title='Actual vs Risk-based Premium')

# Premium Comparison: reshape the three premium columns to long form so each
# premium type becomes one box.
premium_comparison_data = results_df[
    ['actual_premium', 'predicted_premium', 'risk_based_premium']
].melt(var_name='premium_type', value_name='premium')
sns.boxplot(data=premium_comparison_data, x='premium_type', y='premium', ax=ax_box)
ax_box.set_title('Premium Distribution Comparison')
ax_box.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# %%
# Model Interpretation for Premium Models
print("Performing model interpretation...")

# Interpret claim probability model
# A separate interpreter instance is used per model (one here, one for the
# premium model below).
prob_interpreter = ModelInterpreter()

# Feature importance for the best classifier, computed against the held-out
# test partition; `top_n=15` is passed through to the interpreter.
prob_feature_importance = prob_interpreter.analyze_feature_importance(
    best_prob_model, best_prob_model_name, X_prob_test_selected, y_prob_test, 
    feature_names=prob_selected_features, top_n=15
)

# SHAP analysis for claim probability
# Both the training and test partitions are passed in.
# NOTE(review): presumably the training data serves as SHAP background and
# `sample_size=100` caps the rows explained -- confirm in ModelInterpreter.
prob_shap_values, prob_shap_importance = prob_interpreter.analyze_shap_values(
    best_prob_model, best_prob_model_name, X_prob_train_selected, X_prob_test_selected, 
    feature_names=prob_selected_features, sample_size=100
)

# %%
# Interpret premium prediction model
prem_interpreter = ModelInterpreter()

# Same two analyses as above, applied to the best premium regressor and the
# premium test partition.
prem_feature_importance = prem_interpreter.analyze_feature_importance(
    best_prem_model, best_prem_model_name, X_prem_test_selected, y_prem_test, 
    feature_names=prem_selected_features, top_n=15
)

# SHAP analysis for premium prediction
prem_shap_values, prem_shap_importance = prem_interpreter.analyze_shap_values(
    best_prem_model, best_prem_model_name, X_prem_train_selected, X_prem_test_selected, 
    feature_names=prem_selected_features, sample_size=100
)

# %%
# Business Insights
print("Generating business insights...")

# Build the insights for both models first, then report them.
prob_insights = prob_interpreter.generate_business_insights(
    best_prob_model_name, feature_names=prob_selected_features
)
prem_insights = prem_interpreter.generate_business_insights(
    best_prem_model_name, feature_names=prem_selected_features
)

# Print the top five drivers of each model under its own heading.
for section_heading, model_insights in (
    ("\nClaim Probability Model - Key Risk Drivers:", prob_insights),
    ("\nPremium Prediction Model - Key Price Drivers:", prem_insights),
):
    print(section_heading)
    for driver in model_insights['key_risk_drivers'][:5]:
        print(f"- {driver['feature']}: {driver['business_meaning']}")

# %%
# Save Models and Results
print("Saving models and results...")

# Create directories
# Every output location is created up front: DataFrame.to_csv does not create
# missing parent directories, so writing into a non-existent '../reports'
# would fail after all the modelling work has already been done.
os.makedirs('../models/premium_optimization', exist_ok=True)
os.makedirs('../reports/figures/modeling', exist_ok=True)  # also creates '../reports'

# Save claim probability models
prob_trainer.save_models('../models/premium_optimization/claim_probability_model')

# Save premium prediction models
prem_trainer.save_models('../models/premium_optimization/premium_prediction_model')

# Save results
results_df.to_csv('../reports/premium_optimization_results.csv', index=False)

# Generate reports
# NOTE(review): both evaluators write to the same directory; if
# generate_evaluation_report uses fixed file names, the second call may
# overwrite the first -- confirm in ModelEvaluator.
prob_evaluation_report = prob_evaluator.generate_evaluation_report(
    save_path='../reports/figures/modeling'
)

prem_evaluation_report = prem_evaluator.generate_evaluation_report(
    save_path='../reports/figures/modeling'
)

print("Premium optimization modeling pipeline completed successfully!")

# %%
# Final Results Summary
print("\n" + "="*60)
print("PREMIUM OPTIMIZATION MODELING SUMMARY")
print("="*60)

# Headline metrics of the winning classifier.
print(f"\nBest Claim Probability Model: {best_prob_model_name}")
best_prob_metrics = prob_evaluation_results[best_prob_model_name]
print(f"Accuracy: {best_prob_metrics['accuracy']:.4f}")
print(f"F1 Score: {best_prob_metrics['f1_score']:.4f}")
# ROC AUC is reported only when the evaluation result contains it.
if 'roc_auc' in best_prob_metrics:
    print(f"ROC AUC: {best_prob_metrics['roc_auc']:.4f}")

# Headline metrics of the winning regressor.
print(f"\nBest Premium Prediction Model: {best_prem_model_name}")
best_prem_metrics = prem_evaluation_results[best_prem_model_name]
print(f"RMSE: {best_prem_metrics['rmse']:.2f}")
print(f"R² Score: {best_prem_metrics['r2_score']:.4f}")
print(f"MAPE: {best_prem_metrics['mape']:.2f}%")

print(f"\nRisk-based Pricing Results:")
print(f"Average claim probability: {results_df['claim_probability'].mean():.4f}")
print(f"Premium accuracy improvement: Available in saved results")

# Where the artefacts of this run were written.
print("\nModel files saved to: ../models/premium_optimization/")
print("Results saved to: ../reports/premium_optimization_results.csv")

  from .autonotebook import tqdm as notebook_tqdm


Loading insurance data...
Data shape: (1000098, 52)
Preparing premium optimization dataset...
