# Task 4 — Severity Modeling Comparison

Evaluate baseline and ensemble models for claim severity, premium prediction, and claim probability using the shared preprocessing and evaluation utilities.

In [39]:
import sys
import os
from pathlib import Path
import importlib

# Make the parent of the current working directory importable.
# Assumes the notebook is launched from a direct subdirectory of the repository
# root (so that `src` can be imported) — TODO confirm for other launch locations.
project_root = Path.cwd().parent
root_entry = str(project_root)
if root_entry not in sys.path:
    sys.path.append(root_entry)

import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.inspection import permutation_importance

# Import the module object, then re-execute it so that edits made to
# src/modeling.py after its first import in this kernel are picked up before the
# names in the `from src.modeling import (...)` statement below are bound.
import src.modeling
importlib.reload(src.modeling)

from src.modeling import (
    load_data,
    preprocess_for_severity,
    preprocess_for_premium,
    preprocess_for_claim_probability,
    train_test_split_df,
    build_severity_preprocessor,
    evaluate_regression_models,
    evaluate_classification_models,
    build_default_severity_models,
    build_default_premium_models,
    build_default_classification_models,
)

In [28]:
# Prints a fixed marker; presumably a leftover liveness check for the kernel.
print("kernel check")

kernel check


In [40]:
# Load the raw data. `nrows` is presumably an upper bound on the rows read —
# confirm against load_data in src.modeling.
SAMPLE_ROWS = 200_000
df = load_data(nrows=SAMPLE_ROWS)

# Build the severity feature matrix and target, then split with test_size=0.2
# and a fixed random_state so the partition is repeatable.
X, y = preprocess_for_severity(df)
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split_df(X, y, **split_options)

# The preprocessor is built from the training split only.
preprocessor = build_severity_preprocessor(X_train)

print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

  - pd.to_datetime(engineered.get('VehicleIntroDate', pd.NaT), errors='coerce').dt.year


Train shape: (500, 21), Test shape: (126, 21)


In [41]:
# Run every default severity model through the shared evaluation helper,
# which returns a metrics table and the trained models keyed by name.
models = build_default_severity_models()
results_df, trained_models = evaluate_regression_models(
    models, X_train, y_train, X_test, y_test, preprocessor
)

# Format a copy for display so results_df keeps its numeric metrics for the
# cells that follow.
formatted_results = results_df.copy()
if not formatted_results.empty:
    for column, template in (("rmse", "{:.2f}"), ("r2", "{:.3f}")):
        formatted_results[column] = formatted_results[column].map(template.format)
formatted_results

Unnamed: 0,model,rmse,r2
0,LinearRegression,23026.73,-0.048
1,RandomForestRegressor,28190.25,-0.571
2,XGBRegressor,34322.84,-1.329


In [42]:
# Summarise the top-ranked severity model: RMSE and R² are read from results_df,
# MAE is computed here from fresh predictions on the test split.
if not results_df.empty:
    # NOTE(review): assumes evaluate_regression_models returns rows ordered
    # best-first, so row 0 is the winner — confirm in src.modeling.
    best_row = results_df.iloc[0]
    best_model_name = best_row["model"]
    best_pipeline = trained_models[best_model_name]
    y_pred = best_pipeline.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    residuals = y_test - y_pred  # not used below; kept for ad-hoc inspection
    diagnostics = pd.DataFrame(
        {
            "metric": ["rmse", "mae", "r2"],
            "value": [best_row["rmse"], mae, best_row["r2"]],
        }
    )
    print(f"Best model: {best_model_name}")
    # IPython only auto-renders the last *top-level* expression of a cell; an
    # expression nested inside `if` is silently discarded, so display explicitly.
    display(diagnostics)
else:
    print("No models evaluated. Check model dependencies (e.g., xgboost).")

Best model: LinearRegression


In [43]:
# Permutation importance of the raw input columns for the top-ranked severity
# pipeline, evaluated on the held-out split.
if not results_df.empty:
    best_model_name = results_df.iloc[0]["model"]
    best_pipeline = trained_models[best_model_name]
    # The full pipeline is scored, so columns of X_test are permuted before
    # preprocessing and each importance lines up with one X_test column.
    importance = permutation_importance(
        best_pipeline,
        X_test,
        y_test,
        n_repeats=5,
        random_state=42,
        n_jobs=-1,
    )
    importance_df = (
        pd.DataFrame(
            {
                "feature": X_test.columns,
                "importance_mean": importance.importances_mean,
                "importance_std": importance.importances_std,
            }
        )
        .sort_values("importance_mean", ascending=False)
        .reset_index(drop=True)
    )
    # A bare expression inside `if` is not rendered by IPython; display it
    # explicitly so the table actually appears in the notebook.
    display(importance_df.head(15))
else:
    print("Permutation importance skipped because no models were trained.")

## Premium Prediction Models

In [44]:
# Premium target: same split/evaluate workflow as the severity section.
Xp, yp = preprocess_for_premium(df)
premium_split_options = {"test_size": 0.2, "random_state": 42}
Xp_train, Xp_test, yp_train, yp_test = train_test_split_df(
    Xp, yp, **premium_split_options
)

# build_severity_preprocessor is reused here on the premium training features.
# NOTE(review): the saved run reports R² close to 1 for the tree models — check
# preprocess_for_premium for premium-derived features leaking the target.
premium_preprocessor = build_severity_preprocessor(Xp_train)
premium_models = build_default_premium_models()
premium_results_df, premium_trained_models = evaluate_regression_models(
    premium_models, Xp_train, yp_train, Xp_test, yp_test, premium_preprocessor
)

# Format a copy for display; premium_results_df stays numeric.
premium_formatted = premium_results_df.copy()
if not premium_formatted.empty:
    for column, template in (("rmse", "{:.2f}"), ("r2", "{:.3f}")):
        premium_formatted[column] = premium_formatted[column].map(template.format)
premium_formatted

  - pd.to_datetime(engineered.get('VehicleIntroDate', pd.NaT), errors='coerce').dt.year


Unnamed: 0,model,rmse,r2
1,RandomForestRegressor,11.33,0.997
2,XGBRegressor,17.15,0.994
0,LinearRegression,54.64,0.942


In [45]:
# Summarise the top-ranked premium model: RMSE and R² are read from
# premium_results_df, MAE is computed here from predictions on the test split.
if not premium_results_df.empty:
    # NOTE(review): assumes row 0 of premium_results_df is the best model —
    # confirm the ordering produced by evaluate_regression_models.
    best_premium_row = premium_results_df.iloc[0]
    best_premium_model = best_premium_row["model"]
    best_premium_pipeline = premium_trained_models[best_premium_model]
    premium_preds = best_premium_pipeline.predict(Xp_test)
    premium_mae = mean_absolute_error(yp_test, premium_preds)
    premium_diagnostics = pd.DataFrame(
        {
            "metric": ["rmse", "mae", "r2"],
            "value": [
                best_premium_row["rmse"],
                premium_mae,
                best_premium_row["r2"],
            ],
        }
    )
    print(f"Best premium model: {best_premium_model}")
    # A bare expression inside `if` is not rendered by IPython; display it
    # explicitly so the diagnostics table appears.
    display(premium_diagnostics)
else:
    print("No premium models evaluated. Check inputs or dependencies.")

Best premium model: RandomForestRegressor


## Claim Probability Classification

In [46]:
# Claim-probability target: the split passes stratify=yc through to
# train_test_split_df along with the usual test_size and random_state.
Xc, yc = preprocess_for_claim_probability(df)
Xc_train, Xc_test, yc_train, yc_test = train_test_split_df(
    Xc, yc, test_size=0.2, random_state=42, stratify=yc
)

# build_severity_preprocessor is reused here on the classification features.
classification_preprocessor = build_severity_preprocessor(Xc_train)
classification_models = build_default_classification_models()
classification_results_df, classification_trained_models = evaluate_classification_models(
    classification_models,
    Xc_train,
    yc_train,
    Xc_test,
    yc_test,
    classification_preprocessor,
)

# Render every metric with three decimals on a copy; the numeric frame is
# left untouched for the following cell.
classification_formatted = classification_results_df.copy()
if not classification_formatted.empty:
    three_decimals = "{:.3f}".format
    for metric in ("accuracy", "precision", "recall", "f1"):
        classification_formatted[metric] = classification_formatted[metric].map(
            three_decimals
        )
classification_formatted

  - pd.to_datetime(engineered.get('VehicleIntroDate', pd.NaT), errors='coerce').dt.year


Unnamed: 0,model,accuracy,precision,recall,f1
1,RandomForestClassifier,0.989,0.022,0.056,0.031
0,LogisticRegression,0.78,0.013,0.912,0.025
2,XGBClassifier,0.997,0.0,0.0,0.0


In [47]:
# Report the metrics of the top-ranked classifier as a two-column table
# (metric, value).
if not classification_results_df.empty:
    # NOTE(review): assumes row 0 of classification_results_df is the best
    # model — confirm the ordering produced by evaluate_classification_models.
    best_classifier_row = classification_results_df.iloc[0]
    best_classifier_name = best_classifier_row["model"]
    best_classifier_pipeline = classification_trained_models[best_classifier_name]
    # The row Series is named after its original index label, which is not
    # necessarily 0 once the results are sorted. Name the Series and its index
    # explicitly instead of renaming a column called 0 after reset_index().
    best_classifier_metrics = (
        best_classifier_row[["accuracy", "precision", "recall", "f1"]]
        .rename("value")
        .rename_axis("metric")
        .reset_index()
    )
    print(f"Best classifier: {best_classifier_name}")
    # A bare expression inside `if` is not rendered by IPython; display it
    # explicitly so the table appears.
    display(best_classifier_metrics)
else:
    print("No classification models evaluated. Check inputs or dependencies.")

Best classifier: RandomForestClassifier


## Model Interpretability with SHAP

In [48]:
import shap
import numpy as np

# Rank the transformed features of the top-ranked severity model by mean
# absolute SHAP value.
if not results_df.empty:
    # Resolve the pipeline here rather than relying on a variable left behind
    # by an earlier cell, so this cell only needs results_df / trained_models.
    best_pipeline = trained_models[results_df.iloc[0]["model"]]

    # Limit background and evaluation sets for performance
    background = X_train.sample(min(500, len(X_train)), random_state=42)
    evaluation_data = X_test.head(200)

    # Pull the fitted steps out of the pipeline. A distinct name is used so the
    # global `preprocessor` built in the data-preparation cell is not rebound,
    # which would change what the severity cell receives when re-run.
    fitted_preprocessor = best_pipeline.named_steps["preprocessor"]
    model = best_pipeline.named_steps["model"]

    X_bg_transformed = fitted_preprocessor.transform(background)
    X_eval_transformed = fitted_preprocessor.transform(evaluation_data)

    # Prefer the transformer's own output names; fall back to positional names
    # when the preprocessor does not expose get_feature_names_out.
    try:
        feature_names = fitted_preprocessor.get_feature_names_out()
    except AttributeError:
        feature_names = [f"feature_{i}" for i in range(X_bg_transformed.shape[1])]

    # Explain the inner model on already-transformed data, so attributions are
    # per transformed feature rather than per raw input column.
    explainer = shap.Explainer(model, X_bg_transformed)
    shap_values = explainer(X_eval_transformed)

    # Handle Explanation object vs list (for classification) vs array
    if hasattr(shap_values, 'values'):
        vals = shap_values.values
    else:
        vals = shap_values

    # If classification (list of arrays or array with extra dim), take positive class
    if isinstance(vals, list):
        vals = vals[1]
    elif vals.ndim > 2:
        vals = vals[:, :, 1]  # Assuming binary classification, index 1 is positive

    shap_importance = (
        pd.DataFrame(
            {
                "feature": feature_names,
                "mean_abs_shap": np.abs(vals).mean(0),
            }
        )
        .sort_values("mean_abs_shap", ascending=False)
        .reset_index(drop=True)
    )

    print("Top 10 Features by SHAP Importance:")
    display(shap_importance.head(10))
else:
    print("SHAP skipped because severity models are unavailable.")

Top 10 Features by SHAP Importance:


Unnamed: 0,feature,mean_abs_shap
0,cat__Model_QUANTUM 2.7 SESFIKILE 16s,16753.495663
1,num__kilowatts,14864.731561
2,num__VehicleIntroAge,11469.062831
3,cat__Model_QUANTUM 2.7 SESFIKILE 15s,10435.629706
4,num__CalculatedPremiumPerTerm,8338.591309
5,num__TotalPremium,5851.157664
6,cat__Model_QUANTUM 2.7 SESFIKILE 14s,5339.394935
7,cat__Model_HiACE SUPER 16 F/Lift,5174.41055
8,cat__CoverType_Windscreen,4568.256582
9,num__SumInsured,4334.40574
