In [None]:
import pandas as pd
import numpy as np
import sys
sys.path.append('./src') # Add src directory to Python path

from data_preprocessing import preprocess_data, split_data
from modeling import train_evaluate_regression_model, train_evaluate_classification_model
from model_interpretation import plot_shap_summary, get_feature_importance

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from xgboost import XGBClassifier # For classification task
from sklearn.linear_model import LogisticRegression # For classification task
from sklearn.ensemble import RandomForestClassifier # For classification task

- --- Load Data ---

In [None]:
# Read the raw claims history and convert TransactionDate with pd.to_datetime
# in a single chained expression.
df = pd.read_csv('data/raw/historical_insurance_claims.csv').assign(
    TransactionDate=lambda frame: pd.to_datetime(frame['TransactionDate'])
)

- --- 1. Claim Severity Prediction (Regression) ---

In [None]:
print("--- Claim Severity Prediction Model ---")
# Severity is modelled only on rows that actually produced a claim.
df_claims_only = df[df['TotalClaims'] > 0].copy()

# Target and excluded columns for the claim-severity model.
target_severity = 'TotalClaims'
# Anything derived from the claim amount must stay out of the feature matrix,
# otherwise it leaks the target.
features_to_exclude_severity = [
    'UnderwrittenCoverID', 'PolicyID', 'TransactionDate', 'PostalCode',
    # Premium columns are withheld in this run. They could be strong predictors;
    # remove them from this list to evaluate that variant.
    'TotalPremium', 'CalculatedPremiumPerTerm',
    # Derived claim metrics. 'ClaimSeverity' is listed to match the exclusions
    # used by the claim-probability model.
    'ClaimOccurred', 'ClaimFrequency', 'ClaimSeverity', 'Margin', 'LossRatio',
]

X_severity_raw, y_severity, preprocessor_severity, severity_features = \
    preprocess_data(df_claims_only, target_severity, features_to_exclude=features_to_exclude_severity)

# Split BEFORE fitting the preprocessor so that statistics learned by it
# never see the test rows (avoids train/test leakage).
# TODO confirm: assumes split_data accepts the raw feature frame unchanged.
X_train_sev_raw, X_test_sev_raw, y_train_sev, y_test_sev = split_data(X_severity_raw, y_severity)

# Fit on the training rows only; the test rows are transformed with the fitted state.
# TODO confirm: assumes the encoder inside the preprocessor tolerates categories
# that appear only in the test rows.
X_train_sev_matrix = preprocessor_severity.fit_transform(X_train_sev_raw)
X_test_sev_matrix = preprocessor_severity.transform(X_test_sev_raw)
# Feature names after transformation, needed for the SHAP plots below.
feature_names_severity = preprocessor_severity.get_feature_names_out()

# The transformer may return a SciPy sparse matrix; densify it before
# wrapping in a DataFrame.
if hasattr(X_train_sev_matrix, 'toarray'):
    X_train_sev_matrix = X_train_sev_matrix.toarray()
if hasattr(X_test_sev_matrix, 'toarray'):
    X_test_sev_matrix = X_test_sev_matrix.toarray()

# Keep the original row labels so features and targets stay aligned.
X_train_sev = pd.DataFrame(X_train_sev_matrix, columns=feature_names_severity, index=X_train_sev_raw.index)
X_test_sev = pd.DataFrame(X_test_sev_matrix, columns=feature_names_severity, index=X_test_sev_raw.index)

# Candidate regressors, seeded where the estimator supports it.
models_severity = {
    'Linear Regression': LinearRegression(),
    'Random Forest Regressor': RandomForestRegressor(random_state=42),
    'XGBoost Regressor': XGBRegressor(random_state=42)
}

# Train each candidate and keep both the fitted model and its metrics.
results_severity = {}
for name, model in models_severity.items():
    trained_model, metrics = train_evaluate_regression_model(model, X_train_sev, y_train_sev, X_test_sev, y_test_sev, name)
    results_severity[name] = {'model': trained_model, 'metrics': metrics}

# Compare Models
print("\n--- Claim Severity Model Comparison ---")
for name, res in results_severity.items():
    print(f"{name}: RMSE={res['metrics']['rmse']:.4f}, R2={res['metrics']['r2']:.4f}")

# Interpretability: the XGBoost model is hard-coded here rather than chosen by
# metric, so check the comparison above before treating it as the best model.
best_severity_model = results_severity['XGBoost Regressor']['model']
plot_shap_summary(best_severity_model, X_test_sev, feature_names_severity, plot_type='bar')
plot_shap_summary(best_severity_model, X_test_sev, feature_names_severity, plot_type='dot')

- --- 2. Premium Optimization (Regression - predicting CalculatedPremiumPerTerm) ---

In [None]:
print("\n--- Premium Optimization Model (Predicting CalculatedPremiumPerTerm) ---")
target_premium = 'CalculatedPremiumPerTerm'
features_to_exclude_premium = [
    'UnderwrittenCoverID', 'PolicyID', 'TransactionDate', 'PostalCode',
    # Actual claims and aggregated premiums are withheld from the features.
    'TotalClaims', 'TotalPremium',
    # Derived metrics. 'ClaimSeverity' is listed to match the exclusions used
    # by the claim-probability model.
    'ClaimOccurred', 'ClaimFrequency', 'ClaimSeverity', 'Margin', 'LossRatio',
]

X_premium_raw, y_premium, preprocessor_premium, premium_features = \
    preprocess_data(df, target_premium, features_to_exclude=features_to_exclude_premium)

# Split BEFORE fitting the preprocessor so that statistics learned by it
# never see the test rows (avoids train/test leakage).
# TODO confirm: assumes split_data accepts the raw feature frame unchanged.
X_train_prem_raw, X_test_prem_raw, y_train_prem, y_test_prem = split_data(X_premium_raw, y_premium)

# Fit on the training rows only; the test rows are transformed with the fitted state.
# TODO confirm: assumes the encoder inside the preprocessor tolerates categories
# that appear only in the test rows.
X_train_prem_matrix = preprocessor_premium.fit_transform(X_train_prem_raw)
X_test_prem_matrix = preprocessor_premium.transform(X_test_prem_raw)
feature_names_premium = preprocessor_premium.get_feature_names_out()

# The transformer may return a SciPy sparse matrix; densify it before
# wrapping in a DataFrame.
if hasattr(X_train_prem_matrix, 'toarray'):
    X_train_prem_matrix = X_train_prem_matrix.toarray()
if hasattr(X_test_prem_matrix, 'toarray'):
    X_test_prem_matrix = X_test_prem_matrix.toarray()

# Keep the original row labels so features and targets stay aligned.
X_train_prem = pd.DataFrame(X_train_prem_matrix, columns=feature_names_premium, index=X_train_prem_raw.index)
X_test_prem = pd.DataFrame(X_test_prem_matrix, columns=feature_names_premium, index=X_test_prem_raw.index)

# Candidate regressors, seeded where the estimator supports it.
models_premium = {
    'Linear Regression': LinearRegression(),
    'Random Forest Regressor': RandomForestRegressor(random_state=42),
    'XGBoost Regressor': XGBRegressor(random_state=42)
}

# Train each candidate and keep both the fitted model and its metrics.
results_premium = {}
for name, model in models_premium.items():
    trained_model, metrics = train_evaluate_regression_model(model, X_train_prem, y_train_prem, X_test_prem, y_test_prem, name)
    results_premium[name] = {'model': trained_model, 'metrics': metrics}

print("\n--- Premium Optimization Model Comparison ---")
for name, res in results_premium.items():
    print(f"{name}: RMSE={res['metrics']['rmse']:.4f}, R2={res['metrics']['r2']:.4f}")

# Feature importance: the XGBoost model is hard-coded here rather than chosen
# by metric, so check the comparison above before treating it as the best model.
best_premium_model = results_premium['XGBoost Regressor']['model']
plot_shap_summary(best_premium_model, X_test_prem, feature_names_premium, plot_type='bar')



- --- 3. Advanced Task: Probability of Claim Prediction (Classification) ---

In [None]:
print("\n--- Probability of Claim Prediction Model ---")
# Work on a copy so the shared `df` is not mutated by adding the target column.
df_clf = df.copy()
# Binary target: 1 when the row has a positive claim amount, else 0.
df_clf['ClaimOccurred'] = (df_clf['TotalClaims'] > 0).astype(int)

target_clf = 'ClaimOccurred'
# TotalClaims and the metrics derived from it would reveal the target directly,
# so they are withheld together with the premium columns.
features_to_exclude_clf = ['UnderwrittenCoverID', 'PolicyID', 'TransactionDate', 'PostalCode',
                           'TotalClaims', 'TotalPremium', 'CalculatedPremiumPerTerm',
                           'ClaimFrequency', 'ClaimSeverity', 'Margin', 'LossRatio']

X_clf_raw, y_clf, preprocessor_clf, clf_features = \
    preprocess_data(df_clf, target_clf, features_to_exclude=features_to_exclude_clf)

# Split BEFORE fitting the preprocessor so that statistics learned by it
# never see the test rows (avoids train/test leakage).
# TODO confirm: assumes split_data accepts the raw feature frame unchanged.
X_train_clf_raw, X_test_clf_raw, y_train_clf, y_test_clf = split_data(X_clf_raw, y_clf)

# Fit on the training rows only; the test rows are transformed with the fitted state.
# TODO confirm: assumes the encoder inside the preprocessor tolerates categories
# that appear only in the test rows.
X_train_clf_matrix = preprocessor_clf.fit_transform(X_train_clf_raw)
X_test_clf_matrix = preprocessor_clf.transform(X_test_clf_raw)
feature_names_clf = preprocessor_clf.get_feature_names_out()

# The transformer may return a SciPy sparse matrix; densify it before
# wrapping in a DataFrame.
if hasattr(X_train_clf_matrix, 'toarray'):
    X_train_clf_matrix = X_train_clf_matrix.toarray()
if hasattr(X_test_clf_matrix, 'toarray'):
    X_test_clf_matrix = X_test_clf_matrix.toarray()

# Keep the original row labels so features and targets stay aligned.
X_train_clf = pd.DataFrame(X_train_clf_matrix, columns=feature_names_clf, index=X_train_clf_raw.index)
X_test_clf = pd.DataFrame(X_test_clf_matrix, columns=feature_names_clf, index=X_test_clf_raw.index)

models_clf = {
    'Logistic Regression': LogisticRegression(random_state=42, solver='liblinear'),
    'Random Forest Classifier': RandomForestClassifier(random_state=42),
    # XGBClassifier (not XGBRegressor) so that predict() returns class labels
    # and predict_proba() is available, like the other two classifiers.
    'XGBoost Classifier': XGBClassifier(objective='binary:logistic', random_state=42)
}

# All candidates share the same classification training/evaluation helper.
results_clf = {}
for name, model in models_clf.items():
    trained_model, metrics = train_evaluate_classification_model(model, X_train_clf, y_train_clf, X_test_clf, y_test_clf, name)
    results_clf[name] = {'model': trained_model, 'metrics': metrics}

print("\n--- Probability of Claim Model Comparison ---")
for name, res in results_clf.items():
    print(f"{name}: Accuracy={res['metrics']['accuracy']:.4f}, F1-Score={res['metrics']['f1']:.4f}")

# Feature importance: the XGBoost model is hard-coded here rather than chosen
# by metric, so check the comparison above before treating it as the best model.
best_clf_model = results_clf['XGBoost Classifier']['model']
plot_shap_summary(best_clf_model, X_test_clf, feature_names_clf, plot_type='bar')
