# Task 4: Predictive Modeling & Interpretability

This notebook builds two models:
1. Claim Severity Model – Predict the `TotalClaims` amount for records with a positive claim amount (regression).
2. Claim Probability Model – Predict whether a policyholder will make a claim (classification).

In [None]:
import pandas as pd
import numpy as np

# Load the pipe-delimited raw data file, parsing TransactionMonth as a date
df = pd.read_csv(
    'data/raw/MachineLearningRating_v3.txt',
    sep='|',
    parse_dates=['TransactionMonth'],
)

# Derived columns, appended in the order LossRatio, Margin, HasClaim.
# LossRatio is left as NaN wherever the premium is not strictly positive.
df = df.assign(
    LossRatio=lambda d: np.where(
        d['TotalPremium'] > 0,
        d['TotalClaims'] / d['TotalPremium'],
        np.nan,
    ),
    Margin=lambda d: d['TotalPremium'] - d['TotalClaims'],
    HasClaim=lambda d: d['TotalClaims'].gt(0).astype(int),
)

## OOP-Based Model Class

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score, classification_report

class ClaimModel:
    """End-to-end random-forest workflow for a single claim target.

    Typical usage is ``prepare()`` -> ``build()`` -> ``train()`` ->
    ``evaluate()``. ``prepare`` and ``build`` do not depend on each other,
    but both must have run before ``train``.

    Args:
        df: Source data. A private copy is stored, so the caller's frame is
            never modified by this class.
        features: Column names used as model inputs.
        target: Column name of the value to predict.
        model_type: ``'regression'`` selects ``RandomForestRegressor``; any
            other value selects ``RandomForestClassifier``.
    """

    # One seed for both the train/test split and the forest, so that repeated
    # runs on the same data print the same metrics.
    RANDOM_STATE = 42

    def __init__(self, df, features, target, model_type='regression'):
        # Imported here so the class does not rely on a module-level alias.
        from pandas.api.types import is_numeric_dtype

        self.df = df.copy()
        self.features = features
        self.target = target
        self.model_type = model_type
        self.pipeline = None
        self.X_train, self.X_test, self.y_train, self.y_test = None, None, None, None
        # Anything that is not numeric (object, string, category, ...) is
        # one-hot encoded. Testing only for dtype == 'object' would send
        # category / string-dtype columns to the median imputer, which fails.
        self.num_features = [col for col in features if is_numeric_dtype(self.df[col])]
        self.cat_features = [col for col in features if col not in self.num_features]

    def prepare(self):
        """Create the 80/20 train/test split of the feature and target columns.

        Only rows whose *target* is missing are discarded. Rows with missing
        feature values are kept on purpose: the imputers configured in
        ``build()`` fill them, whereas dropping them here would make those
        imputers dead code and throw away every partially-populated row.
        """
        data = self.df[[*self.features, self.target]].dropna(subset=[self.target])
        X = data[self.features]
        y = data[self.target]
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=0.2, random_state=self.RANDOM_STATE
        )

    def build(self):
        """Assemble the preprocessing + random-forest pipeline (not yet fitted).

        Numeric features are median-imputed; the remaining features have
        missing values replaced by the literal ``'Unknown'`` and are then
        one-hot encoded, with categories unseen during fitting ignored.
        """
        num_pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy='median'))
        ])
        cat_pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])
        preprocessor = ColumnTransformer([
            ('num', num_pipeline, self.num_features),
            ('cat', cat_pipeline, self.cat_features)
        ])
        if self.model_type == 'regression':
            model = RandomForestRegressor(random_state=self.RANDOM_STATE)
        else:
            model = RandomForestClassifier(random_state=self.RANDOM_STATE)
        self.pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('model', model)
        ])

    def train(self):
        """Fit the pipeline on the training split.

        Requires ``prepare()`` and ``build()`` to have been called first.
        """
        self.pipeline.fit(self.X_train, self.y_train)

    def evaluate(self):
        """Print hold-out metrics for the fitted pipeline.

        Regression prints RMSE and R2. Classification prints ROC AUC and the
        per-class classification report; the ROC AUC computation assumes a
        binary target.
        """
        y_pred = self.pipeline.predict(self.X_test)
        if self.model_type == 'regression':
            # Take the root explicitly: the ``squared=False`` shortcut is not
            # accepted by recent scikit-learn releases.
            rmse = mean_squared_error(self.y_test, y_pred) ** 0.5
            print(f'RMSE: {rmse:.2f}')
            print(f'R2: {r2_score(self.y_test, y_pred):.3f}')
        else:
            # ROC AUC ranks samples by score, so it needs the predicted
            # probability of the positive class (second column), not the
            # hard 0/1 labels, which collapse the curve to a single point.
            y_score = self.pipeline.predict_proba(self.X_test)[:, 1]
            print(f'ROC AUC: {roc_auc_score(self.y_test, y_score):.3f}')
            print(classification_report(self.y_test, y_pred))

## Model 1: Claim Severity Regression

In [None]:
# Vehicle age measured against the fixed reference year 2015
df['VehicleAge'] = 2015 - df['RegistrationYear']

# Severity is modelled only on the rows flagged as having a claim
df_severity = df.loc[df['HasClaim'].eq(1)].copy()

features_sev = [
    'VehicleAge',
    'Cubiccapacity',
    'Kilowatts',
    'NumberOfDoors',
    'Province',
    'VehicleType',
    'Gender',
    'CustomValueEstimate',
]
target_sev = 'TotalClaims'

# Split, assemble, fit, then print the regression metrics
model_sev = ClaimModel(
    df=df_severity,
    features=features_sev,
    target=target_sev,
    model_type='regression',
)
model_sev.prepare()
model_sev.build()
model_sev.train()
model_sev.evaluate()

## Model 2: Claim Probability Classification

In [None]:
# Same predictors as the severity model, here fitted on every row of df
features_prob = [
    'VehicleAge',
    'Cubiccapacity',
    'Kilowatts',
    'NumberOfDoors',
    'Province',
    'VehicleType',
    'Gender',
    'CustomValueEstimate',
]
target_prob = 'HasClaim'

# Split, assemble, fit, then print the classification metrics
model_prob = ClaimModel(
    df=df,
    features=features_prob,
    target=target_prob,
    model_type='classification',
)
model_prob.prepare()
model_prob.build()
model_prob.train()
model_prob.evaluate()