In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
import pandas as pd
import numpy as np
import os
import shap
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import warnings
import sys
warnings.filterwarnings("ignore")
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
DATA_DIR = '/content/drive/My Drive'
data = pd.read_csv(os.path.join(DATA_DIR, 'MachineLearningRating_v3.txt'), sep='|')
print(f"Initial data shape: {data.shape}")

numerical_columns_to_coerce = [
    'TotalPremium', 'TotalClaims', 'CustomValueEstimate', 'CapitalOutstanding',
    'Cylinders', 'cubiccapacity', 'kilowatts', 'NumberOfDoors', 'SumInsured',
    'CalculatedPremiumPerTerm', 'ExcessSelected', 'NumberOfVehiclesInFleet',
    'RegistrationYear'
]

for col in numerical_columns_to_coerce:
    if col in data.columns:
        data[col] = pd.to_numeric(data[col], errors='coerce')

# --- Feature Engineering ---
current_date = pd.to_datetime('2025-06-17')

if 'TransactionMonth' in data.columns:
    data['TransactionMonth'] = pd.to_datetime(data['TransactionMonth'], errors='coerce')
    data['PolicyAgeMonths'] = (current_date - data['TransactionMonth']).dt.days / 30.44
else:
    data['PolicyAgeMonths'] = np.nan

if 'RegistrationYear' in data.columns:
    data['VehicleAgeYears'] = (current_date.year - data['RegistrationYear'])
else:
    data['VehicleAgeYears'] = np.nan

if 'VehicleIntroDate' in data.columns:
    data['VehicleIntroDate'] = pd.to_datetime(data['VehicleIntroDate'], errors='coerce')
    data['VehicleModelAgeYears'] = (current_date.year - data['VehicleIntroDate'].dt.year)
else:
    data['VehicleModelAgeYears'] = np.nan

# Interaction features
if 'TotalPremium' in data.columns and 'SumInsured' in data.columns:
    temp_sum_insured = data['SumInsured'].replace(0, np.nan)
    data['PremiumPerSumInsured'] = data['TotalPremium'] / temp_sum_insured
else:
    data['PremiumPerSumInsured'] = np.nan

if 'TotalClaims' in data.columns and 'SumInsured' in data.columns:
    temp_sum_insured = data['SumInsured'].replace(0, np.nan)
    data['ClaimsPerSumInsured'] = data['TotalClaims'] / temp_sum_insured
else:
    data['ClaimsPerSumInsured'] = np.nan

categorical_cols_to_impute_unknown = [
    'IsVATRegistered', 'Citizenship', 'LegalType', 'Title', 'Language', 'Bank',
    'AccountType', 'MaritalStatus', 'Gender', 'Country', 'Province', 'PostalCode',
    'MainCrestaZone', 'SubCrestaZone', 'ItemType', 'mmcode', 'VehicleType', 'make',
    'Model', 'bodytype', 'AlarmImmobiliser', 'TrackingDevice', 'NewVehicle',
    'WrittenOff', 'Rebuilt', 'Converted', 'CrossBorder', 'CoverCategory',
    'CoverType', 'CoverGroup', 'Section', 'Product', 'StatutoryClass', 'StatutoryRiskType',
    'TermFrequency'
]

for col in categorical_cols_to_impute_unknown:
    if col in data.columns:
        data[col] = data[col].astype(str)
        data[col] = data[col].replace('nan', 'Unknown')
        data[col] = data[col].replace('None', 'Unknown')

print(f"Data shape after initial categorical imputation: {data.shape}")

data.dropna(subset=['TotalPremium', 'TotalClaims', 'CalculatedPremiumPerTerm'], inplace=True)
print(f"Data shape after dropping rows with NaN in critical targets: {data.shape}")

# Calculate HasClaim and ClaimSeverity AFTER cleaning TotalClaims
data['HasClaim'] = (data['TotalClaims'] > 0).astype(int)
data['ClaimSeverity'] = data.apply(lambda row: row['TotalClaims'] if row['HasClaim'] == 1 else np.nan, axis=1)

# Calculate Margin
data['Margin'] = data['TotalPremium'] - data['TotalClaims']

print("Data preparation for predictive modeling complete. Head of data with new metrics:")
print(data[['TotalPremium', 'TotalClaims', 'HasClaim', 'ClaimSeverity', 'Margin']].head())

# --- Separate target variables for each modeling goal ---
columns_to_drop_for_X = [
    'UnderwrittenCoverID', 'PolicyID', 'TransactionMonth', 'VehicleIntroDate',
    'TotalPremium', 'TotalClaims', 'Margin', 'HasClaim',
    'RegistrationYear', 'ClaimSeverity'
]

# 1. Claim Severity Prediction (Regression on TotalClaims where claims > 0)
claim_severity_df = data[data['TotalClaims'] > 0].copy()
print(f"Shape of claim_severity_df (TotalClaims > 0): {claim_severity_df.shape}")

X_severity = claim_severity_df.drop(columns=columns_to_drop_for_X, errors='ignore')
y_severity = claim_severity_df['TotalClaims']

print(f"Shape of X_severity before pipeline processing: {X_severity.shape}")
# 2. Premium Optimization (Naive: Predict CalculatedPremiumPerTerm)
premium_optimization_df = data.copy()
X_premium = premium_optimization_df.drop(columns=columns_to_drop_for_X + ['CalculatedPremiumPerTerm'], errors='ignore')
y_premium = premium_optimization_df['CalculatedPremiumPerTerm']

print(f"Shape of X_premium before pipeline processing: {X_premium.shape}")

# 3. Probability of Claim (Binary Classification)
X_claim_prob = data.drop(columns=columns_to_drop_for_X + ['CalculatedPremiumPerTerm'], errors='ignore')
y_claim_prob = data['HasClaim']

print(f"Shape of X_claim_prob before pipeline processing: {X_claim_prob.shape}")

# --- Train-Test Split (with checks for sufficient data) ---
X_train_severity, X_test_severity, y_train_severity, y_test_severity = pd.DataFrame(), pd.DataFrame(), pd.Series(), pd.Series()
X_train_premium, X_test_premium, y_train_premium, y_test_premium = pd.DataFrame(), pd.DataFrame(), pd.Series(), pd.Series()
X_train_claim_prob, X_test_claim_prob, y_train_claim_prob, y_test_claim_prob = pd.DataFrame(), pd.DataFrame(), pd.Series(), pd.Series()


if not y_severity.empty and len(X_severity) > 1:
    X_train_severity, X_test_severity, y_train_severity, y_test_severity = train_test_split(X_severity, y_severity, test_size=0.2, random_state=42)
else:
    print("Warning: Insufficient data for Claim Severity Prediction after NaN removal. Skipping split.")

if not y_premium.empty and len(X_premium) > 1:
    X_train_premium, X_test_premium, y_train_premium, y_test_premium = train_test_split(X_premium, y_premium, test_size=0.2, random_state=42)
else:
    print("Warning: Insufficient data for Premium Optimization after NaN removal. Skipping split.")

if not y_claim_prob.empty and len(X_claim_prob) > 1:
    if y_claim_prob.nunique() > 1:
        X_train_claim_prob, X_test_claim_prob, y_train_claim_prob, y_test_claim_prob = train_test_split(X_claim_prob, y_claim_prob, test_size=0.2, random_state=42, stratify=y_claim_prob)
    else:
        print("Warning: Only one class present for Claim Probability after NaN removal. Cannot stratify. Splitting without stratification.")
        X_train_claim_prob, X_test_claim_prob, y_train_claim_prob, y_test_claim_prob = train_test_split(X_claim_prob, y_claim_prob, test_size=0.2, random_state=42)
else:
    print("Warning: Insufficient data for Claim Probability Prediction after NaN removal. Skipping split.")


# --- Create preprocessing pipelines for numerical and categorical features ---

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')), # Imputes NaNs with the median of the training data
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')), # Imputes NaNs/missing categories with 'Unknown'
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])


# Preprocessor for Severity Model
if not X_train_severity.empty:
    numerical_features_severity = X_train_severity.select_dtypes(include=np.number).columns.tolist()
    categorical_features_severity = X_train_severity.select_dtypes(include=['object', 'bool']).columns.tolist()
    preprocessor_severity = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features_severity),
            ('cat', categorical_transformer, categorical_features_severity)
        ],
        remainder='passthrough'
    )
    print(f"\nSeverity Preprocessor: Num features: {numerical_features_severity}, Cat features: {categorical_features_severity}")
else:
    preprocessor_severity = None
    print("Severity preprocessor not created due to insufficient training data.")


# Preprocessor for Premium Model
if not X_train_premium.empty:
    numerical_features_premium = X_train_premium.select_dtypes(include=np.number).columns.tolist()
    categorical_features_premium = X_train_premium.select_dtypes(include=['object', 'bool']).columns.tolist()
    preprocessor_premium = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features_premium),
            ('cat', categorical_transformer, categorical_features_premium)
        ],
        remainder='passthrough'
    )
    print(f"Premium Preprocessor: Num features: {numerical_features_premium}, Cat features: {categorical_features_premium}")
else:
    preprocessor_premium = None
    print("Premium preprocessor not created due to insufficient training data.")


# Preprocessor for Claim Probability Model
if not X_train_claim_prob.empty:
    numerical_features_claim_prob = X_train_claim_prob.select_dtypes(include=np.number).columns.tolist()
    categorical_features_claim_prob = X_train_claim_prob.select_dtypes(include=['object', 'bool']).columns.tolist()
    preprocessor_claim_prob = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features_claim_prob),
            ('cat', categorical_transformer, categorical_features_claim_prob)
        ],
        remainder='passthrough'
    )
    print(f"Claim Probability Preprocessor: Num features: {numerical_features_claim_prob}, Cat features: {categorical_features_claim_prob}")
else:
    preprocessor_claim_prob = None
    print("Claim probability preprocessor not created due to insufficient training data.")

print("\nData preprocessing setup complete for modeling.")

Initial data shape: (1000098, 52)
Data shape after initial categorical imputation: (1000098, 57)
Data shape after dropping rows with NaN in critical targets: (1000098, 57)
Data preparation for predictive modeling complete. Head of data with new metrics:
   TotalPremium  TotalClaims  HasClaim  ClaimSeverity      Margin
0     21.929825          0.0         0            NaN   21.929825
1     21.929825          0.0         0            NaN   21.929825
2      0.000000          0.0         0            NaN    0.000000
3    512.848070          0.0         0            NaN  512.848070
4      0.000000          0.0         0            NaN    0.000000
Shape of claim_severity_df (TotalClaims > 0): (2788, 60)
Shape of X_severity before pipeline processing: (2788, 50)
Shape of X_premium before pipeline processing: (1000098, 49)
Shape of X_claim_prob before pipeline processing: (1000098, 49)

Severity Preprocessor: Num features: ['Cylinders', 'cubiccapacity', 'kilowatts', 'NumberOfDoors', 'CustomVal

# New Section

In [3]:
print("\n--- Claim Severity Prediction Models ---")

models_severity = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=42, n_estimators=100),
    'XGBoost': xgb.XGBRegressor(random_state=42, n_estimators=100)
}

results_severity = {}

for name, model in models_severity.items():
    print(f"\nTraining {name} for Claim Severity...")
    pipeline = Pipeline(steps=[('preprocessor', preprocessor_severity),
                               ('regressor', model)])
    pipeline.fit(X_train_severity, y_train_severity)
    y_pred_severity = pipeline.predict(X_test_severity)

    rmse_severity = np.sqrt(mean_squared_error(y_test_severity, y_pred_severity))
    r2_severity = r2_score(y_test_severity, y_pred_severity)

    results_severity[name] = {'RMSE': rmse_severity, 'R2': r2_severity}
    print(f"{name} - RMSE: {rmse_severity:.2f}, R-squared: {r2_severity:.2f}")
    if name == 'XGBoost': 
        best_model_severity_pipeline = pipeline
        best_model_severity_name = name

print("\n--- Claim Severity Model Evaluation Summary ---")
for name, metrics in results_severity.items():
    print(f"{name}: RMSE={metrics['RMSE']:.2f}, R2={metrics['R2']:.2f}")


--- Claim Severity Prediction Models ---

Training Linear Regression for Claim Severity...
Linear Regression - RMSE: 38377.20, R-squared: 0.08

Training Random Forest for Claim Severity...
Random Forest - RMSE: 7866.83, R-squared: 0.96

Training XGBoost for Claim Severity...
XGBoost - RMSE: 7904.32, R-squared: 0.96

--- Claim Severity Model Evaluation Summary ---
Linear Regression: RMSE=38377.20, R2=0.08
Random Forest: RMSE=7866.83, R2=0.96
XGBoost: RMSE=7904.32, R2=0.96


In [4]:
print("\n--- Premium Optimization (Predicting CalculatedPremiumPerTerm) Models ---")

models_premium = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=42, n_estimators=10),
    'XGBoost': xgb.XGBRegressor(random_state=42, n_estimators=100)
}

results_premium = {}

for name, model in models_premium.items():
    print(f"\nTraining {name} for Premium Prediction...")
    pipeline = Pipeline(steps=[('preprocessor', preprocessor_premium),
                               ('regressor', model)])
    pipeline.fit(X_train_premium, y_train_premium)
    y_pred_premium = pipeline.predict(X_test_premium)

    rmse_premium = np.sqrt(mean_squared_error(y_test_premium, y_pred_premium))
    r2_premium = r2_score(y_test_premium, y_pred_premium)

    results_premium[name] = {'RMSE': rmse_premium, 'R2': r2_premium}
    print(f"{name} - RMSE: {rmse_premium:.2f}, R-squared: {r2_premium:.2f}")

    if name == 'XGBoost':
        best_model_premium_pipeline = pipeline
        best_model_premium_name = name

print("\n--- Premium Prediction Model Evaluation Summary ---")
for name, metrics in results_premium.items():
    print(f"{name}: RMSE={metrics['RMSE']:.2f}, R2={metrics['R2']:.2f}")


--- Premium Optimization (Predicting CalculatedPremiumPerTerm) Models ---

Training Linear Regression for Premium Prediction...
Linear Regression - RMSE: 208.27, R-squared: 0.46

Training Random Forest for Premium Prediction...
Random Forest - RMSE: 12.53, R-squared: 1.00

Training XGBoost for Premium Prediction...
XGBoost - RMSE: 25.99, R-squared: 0.99

--- Premium Prediction Model Evaluation Summary ---
Linear Regression: RMSE=208.27, R2=0.46
Random Forest: RMSE=12.53, R2=1.00
XGBoost: RMSE=25.99, R2=0.99
