imports

In [4]:
%pip install lime

Note: you may need to restart the kernel to use updated packages.


In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import root_mean_squared_error, r2_score, roc_auc_score, f1_score, precision_recall_curve
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
import xgboost as xgb
import shap
import lime.lime_tabular
sns.set_theme(style="whitegrid", palette="colorblind")
SEED = 42
np.random.seed(SEED)

Loading data

In [6]:
df = pd.read_csv(
    "D:/kifiya AI/Insurance-Risk-Analytics-and-predictive-modeling/data/MachineLearningRating_v3.txt",
    sep="|",
    engine="python"
)
print("Rows:", df.shape[0])

Rows: 1000099


Feature engineering

In [7]:
df["VehicleAge"] = 2025 - df["RegistrationYear"]
df["VehicleAge"] = df["VehicleAge"].clip(0, 30)
df["LogCustomValue"] = np.log1p(df["CustomValueEstimate"])
df["PremiumPerValue"] = df["TotalPremium"] / (df["CustomValueEstimate"] + 1)

Missing audit

In [8]:
miss = df.isna().mean().sort_values(ascending=False)
miss[miss>0]

NumberOfVehiclesInFleet     1.000000e+00
CrossBorder                 9.993021e-01
PremiumPerValue             7.795658e-01
LogCustomValue              7.795658e-01
CustomValueEstimate         7.795658e-01
Converted                   6.418385e-01
Rebuilt                     6.418385e-01
WrittenOff                  6.418385e-01
NewVehicle                  1.532808e-01
Bank                        1.459476e-01
AccountType                 4.022902e-02
Gender                      9.536056e-03
MaritalStatus               8.259182e-03
make                        5.529453e-04
VehicleIntroDate            5.529453e-04
bodytype                    5.529453e-04
kilowatts                   5.529453e-04
cubiccapacity               5.529453e-04
Cylinders                   5.529453e-04
Model                       5.529453e-04
NumberOfDoors               5.529453e-04
VehicleType                 5.529453e-04
mmcode                      5.529453e-04
CapitalOutstanding          2.999703e-06
Section         

Pick columns

In [48]:
num_cols = ["VehicleAge", "LogCustomValue", "PremiumPerValue", "SumInsured"]
cat_cols = ["Province", "Gender", "VehicleType", "CoverType", "MaritalStatus"]
target_sev = "TotalClaims"
target_prob = "HasClaim"   

Severity dataset

In [49]:
sev_df = df[df["TotalClaims"] > 0].copy()
sev_df["LogTotalClaims"] = np.log1p(sev_df["TotalClaims"])
X_sev = sev_df[num_cols + cat_cols]
y_sev = sev_df["LogTotalClaims"]        

Probability dataset

In [50]:
df["HasClaim"] = (df["TotalClaims"] > 0).astype(int)
X_prob = df[num_cols + cat_cols]
y_prob = df["HasClaim"]

Split

In [51]:
X_prob_train, X_prob_test, y_prob_train, y_prob_test = train_test_split(
    X_prob, y_prob, test_size=0.2, random_state=SEED, stratify=y_prob)
X_sev_train,  X_sev_test,  y_sev_train,  y_sev_test  = train_test_split(
    X_sev, y_sev,  test_size=0.2, random_state=SEED)

 Pre-processing pipeline

In [52]:
numeric_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="median")),
    ("scale",  StandardScaler())
])
cat_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])
preprocessor = ColumnTransformer([
    ("num", numeric_pipe, num_cols),
    ("cat", cat_pipe, cat_cols)
])

LR severity

In [53]:
lr_sev = Pipeline([("prep", preprocessor),
                   ("model", LinearRegression())])
lr_sev.fit(X_sev_train, y_sev_train)
pred_lr_sev = lr_sev.predict(X_sev_test)
rmse_lr = root_mean_squared_error(y_sev_test, pred_lr_sev)
r2_lr   = r2_score(y_sev_test, pred_lr_sev)
print(f"LR Severity – RMSE: {rmse_lr:.3f} | R²: {r2_lr:.3f}")

LR Severity – RMSE: 0.956 | R²: 0.656


RF severity

In [54]:
rf_sev = Pipeline([("prep", preprocessor),
                   ("model", RandomForestRegressor(n_estimators=300,
                                                  max_depth=None,
                                                  random_state=SEED))])
rf_sev.fit(X_sev_train, y_sev_train)
pred_rf_sev = rf_sev.predict(X_sev_test)
rmse_rf = root_mean_squared_error(y_sev_test, pred_rf_sev)
r2_rf   = r2_score(y_sev_test, pred_rf_sev)
print(f"RF Severity – RMSE: {rmse_rf:.3f} | R²: {r2_rf:.3f}")

RF Severity – RMSE: 1.060 | R²: 0.576


XGB severity

In [55]:
xgb_sev = Pipeline([("prep", preprocessor),
                    ("model", xgb.XGBRegressor(n_estimators=300,
                                               max_depth=6,
                                               learning_rate=0.1,
                                               random_state=SEED))])
xgb_sev.fit(X_sev_train, y_sev_train)
pred_xgb_sev = xgb_sev.predict(X_sev_test)
rmse_xgb = root_mean_squared_error(y_sev_test, pred_xgb_sev)
r2_xgb   = r2_score(y_sev_test, pred_xgb_sev)
print(f"XGB Severity – RMSE: {rmse_xgb:.3f} | R²: {r2_xgb:.3f}")

XGB Severity – RMSE: 1.048 | R²: 0.586


Severity leaderboard

In [56]:
severity_leaderboard = pd.DataFrame({
    "Model": ["LinearRegression", "RandomForest", "XGBoost"],
    "RMSE_log":  [rmse_lr, rmse_rf, rmse_xgb],
    "R2":        [r2_lr, r2_rf, r2_xgb]
}).sort_values("RMSE_log")
severity_leaderboard.style.background_gradient(subset=["RMSE_log"], cmap="Reds")

Unnamed: 0,Model,RMSE_log,R2
0,LinearRegression,0.955736,0.655652
2,XGBoost,1.047657,0.586228
1,RandomForest,1.060325,0.576162


Imbalance

In [57]:
y_prob_train.value_counts(normalize=True)


HasClaim
0    0.997213
1    0.002787
Name: proportion, dtype: float64

XGB probability

In [58]:
xgb_clf = Pipeline([("prep", preprocessor),
                    ("model", xgb.XGBClassifier(n_estimators=300,
                                                max_depth=6,
                                                scale_pos_weight=(y_prob_train==0).sum()/(y_prob_train==1).sum(),
                                                random_state=SEED))])
xgb_clf.fit(X_prob_train, y_prob_train)
pred_prob = xgb_clf.predict_proba(X_prob_test)[:,1]
auc = roc_auc_score(y_prob_test, pred_prob)
print(f"XGB Probability – AUC: {auc:.3f}")

XGB Probability – AUC: 0.871


Combined leaderboard

In [60]:
summary = pd.concat([
    severity_leaderboard.assign(Task="Severity"),
    pd.DataFrame({"Model":["XGBoost"], "RMSE_log":[np.nan], "R2":[np.nan], "AUC":[auc], "Task":["Probability"]})
], ignore_index=True)
summary

Unnamed: 0,Model,RMSE_log,R2,Task,AUC
0,LinearRegression,0.955736,0.655652,Severity,
1,XGBoost,1.047657,0.586228,Severity,
2,RandomForest,1.060325,0.576162,Severity,
3,XGBoost,,,Probability,0.870588


Risk-based premium

In [61]:
features = num_cols + cat_cols

# Subset inputs to exact feature columns
X_prob = df[features]
X_sev = sev_df[features]

prob_claim = xgb_clf.predict_proba(X_prob)[:,1]
sev_claim  = np.expm1(xgb_sev.predict(X_sev))

pricing_df = df.copy()
pricing_df["PredProb"] = prob_claim
pricing_df["PredSev"]  = 0

mask = pricing_df["HasClaim"] == 1
pricing_df.loc[mask, "PredSev"] = sev_claim.values  # make sure indices align or use .values

# For non-claim policies, use mean severity
mean_sev = df.loc[df["HasClaim"]==1, "TotalClaims"].mean()
pricing_df["PredSev"].replace(0, mean_sev, inplace=True)

EXPENSE_LOAD = 250
PROFIT_MARGIN = 0.05
pricing_df["RiskPremium"] = pricing_df["PredProb"] * pricing_df["PredSev"]
pricing_df["SuggestedPremium"] = pricing_df["RiskPremium"] * (1 + PROFIT_MARGIN) + EXPENSE_LOAD


ValueError: Feature shape mismatch, expected: 33, got 51