In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from lightgbm import LGBMClassifier
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical

from pathlib import Path

In [2]:
cwd = Path.cwd()
home = cwd.parent
data = home / 'data'

In [3]:
# Load the data
data = pd.read_csv(data / "claim_data_group3_2024.csv")

In [4]:
# Feature engineering
data['AgeBucket'] = pd.cut(data['DrivAge'], bins=[18, 25, 35, 45, 55, 65, 75, 100], labels=['18-25', '26-35', '36-45', '46-55', '56-65', '66-75', '75+'])
data['VehAgeBucket'] = pd.cut(data['VehAge'], bins=[-1, 2, 5, 10, 15, 20, 100], labels=['0-2', '3-5', '6-10', '11-15', '16-20', '20+'])
data['BonusMalusBucket'] = pd.cut(data['BonusMalus'], bins=[50, 75, 100, 125, 150, 175, 200, 225], labels=['50-75', '76-100', '101-125', '126-150', '151-175', '176-200', '201+'])

# Create target variable
data['HasClaimed'] = (data['ClaimAmount'] > 0).astype(int)

In [5]:
# Split features and target
X = data.drop(['IDpol', 'ClaimAmount', 'ClaimNb', 'HasClaimed', 'Exposure'], axis=1)
y = data['HasClaimed']

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Identify numeric and categorical columns
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

In [6]:
# Create preprocessing steps
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Create a pipeline with SMOTE and LightGBM
model = ImbPipeline([
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('feature_selection', SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42))),
    ('classifier', LGBMClassifier(random_state=42))
])

In [7]:
# Define the parameter space for Bayesian optimization
param_space = {
    'classifier__n_estimators': Integer(100, 1000),
    'classifier__max_depth': Integer(3, 6),
    'classifier__learning_rate': Real(0.01, 0.1, prior='log-uniform'),
    'classifier__num_leaves': Integer(20, 200),
    'classifier__class_weight': Categorical(['balanced', None])
}

# Perform Bayesian optimization
bayes_search = BayesSearchCV(
    model,
    param_space,
    n_iter=10,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
    random_state=42
)
bayes_search.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 77037, number of negative: 77037
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003621 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2546
[LightGBM] [Info] Number of data points in the train set: 154074, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [10]:
# Get the best model
best_model = bayes_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

In [15]:
from sklearn.metrics import classification_report, confusion_matrix

In [16]:
confusion_matrix(y_test, y_pred)

array([[19253,     6],
       [  739,     2]], dtype=int64)

In [14]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98     19259
           1       0.25      0.00      0.01       741

    accuracy                           0.96     20000
   macro avg       0.61      0.50      0.49     20000
weighted avg       0.94      0.96      0.94     20000



In [11]:
# Calculate metrics
roc_auc = roc_auc_score(y_test, y_pred_proba)
avg_precision = average_precision_score(y_test, y_pred_proba)
f1 = f1_score(y_test, y_pred)

print(f"ROC AUC: {roc_auc:.4f}")
print(f"Average Precision: {avg_precision:.4f}")
print(f"F1 Score: {f1:.4f}")

# Feature importance
feature_importance = pd.DataFrame({
    'feature': best_model.named_steps['preprocessor'].get_feature_names_out(),
    'importance': best_model.named_steps['classifier'].feature_importances_
})
feature_importance = feature_importance.sort_values('importance', ascending=False).head(10)
print("\nTop 10 Important Features:")
print(feature_importance)

# Print best parameters
print("\nBest Parameters:")
print(bayes_search.best_params_)

ROC AUC: 0.6215
Average Precision: 0.0658
F1 Score: 0.0053


ValueError: All arrays must be of the same length