# Model Implementation

## Initialize Packages

In [21]:
import sys
import pandas as pd
import numpy as np
import os
import rich
import xgboost as xgb
import json
import joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score

## Load Data and Assign Predictors

### Load Base Data and Final Data

In [10]:
model_data = pd.read_csv('../backend/model_data.csv')
final_model_data = pd.read_csv('../backend/train_df_final.csv')

In [12]:
final_model_data['claim_cst_per_exposure'] = final_model_data['claimcst0'] / final_model_data['exposure']
final_model_data.head(5)

Unnamed: 0,driving_history_score,credit_score,marital_status,time_driven,area,agecat_grouped,gender,veh_color,max_power,engine_type,veh_age,veh_body,veh_value,claimcst0,numclaims,clm,exposure,claim_cst_per_exposure
0,73.0,646.516469,0.0,1.0,1.0,2.0,0.0,0.0,161,2.0,1.0,0.0,5.8,2023.198184,1,1,0.362191,5585.998969
1,88.0,635.400369,1.0,2.0,0.0,1.0,0.0,4.0,100,0.0,3.0,0.0,5.67,3600.172234,1,1,0.632068,5695.85954
2,55.0,646.463131,0.0,2.0,2.0,1.0,0.0,3.0,74,3.0,1.0,0.0,5.9,2021.144067,1,1,0.36746,5500.307127
3,98.0,645.598794,0.0,3.0,1.0,1.0,1.0,0.0,121,0.0,2.0,0.0,4.79,4006.845492,1,1,0.802184,4994.920513
4,66.0,657.348612,1.0,2.0,2.0,2.0,1.0,1.0,75,3.0,2.0,0.0,6.68,2542.953931,1,1,0.485009,5243.10606


In [6]:
# Define the list of predictors
# Create predictor list
veh_pred_lst = ['veh_value', 'veh_body', 'veh_age', 'engine_type', 'max_power', 'veh_color']
policy_pred_lst = ['gender', 'agecat', 'e_bill' ]
driving_behavior_pred_lst = ['area', 'time_of_week_driven', 'time_driven']
demo_pred_lst = ['marital_status', 'low_education_ind', 'credit_score', 'driving_history_score']
pred_lst = veh_pred_lst + policy_pred_lst + driving_behavior_pred_lst + demo_pred_lst # Split the data into training and validation sets
# pred_lst = ['engine_type', 'gender', 'credit_score', 'veh_age', 'agecat', 'area']

train_data = model_data.loc[model_data['sample'] == '1|bld']
val_data = model_data.loc[model_data['sample'] == '2|val']
rich.print( train_data.shape, val_data.shape )
train_data.head(5)

Unnamed: 0,id,fold,sample,veh_value,exposure,veh_body,veh_age,gender,area,agecat,...,e_bill,time_of_week_driven,time_driven,trm_len,credit_score,low_education_ind,clm,numclaims,claimcst0,expected_loss
0,1,2,1|bld,5.8,0.362191,SUV,2,F,B,1,...,0,weekday,12pm - 6pm,6,646.516469,0.0,1,1,2023.198184,5585.998969
1,2,3,1|bld,5.67,0.632068,STNWG,4,F,A,2,...,0,weekend,6am - 12pm,12,635.400369,0.0,1,1,3600.172234,5695.85954
2,3,1,1|bld,5.9,0.36746,SEDAN,2,F,C,2,...,0,weekday,6am - 12pm,12,646.463131,0.0,1,1,2021.144067,5500.307127
3,4,2,1|bld,4.79,0.802184,STNWG,3,M,B,4,...,1,weekday,6pm - 12am,12,645.598794,0.0,1,1,4006.845492,4994.920513
4,5,2,1|bld,6.68,0.485009,SEDAN,3,M,C,1,...,0,weekday,6am - 12pm,12,657.348612,0.0,1,1,2542.953931,5243.10606


In [5]:
# Define features and target
target = 'expected_loss'

X_train = train_data[pred_lst].copy()
y_train = train_data[target]

# Random Forest

In [13]:
for col in X_train.select_dtypes("object"):
    X_train[col] = X_train[col].astype("category").cat.codes

In [None]:
# Train Random Forest
rf = RandomForestRegressor(
    n_estimators=300,      # number of trees
    max_depth=None,        # no depth limit (fully grown trees)
    random_state=42,
    n_jobs=1
)

rf.fit(X_train, y_train)

# In-sample predictions
y_pred = rf.predict(X_train)

# Evaluate on training data
rmse = root_mean_squared_error(y_train, y_pred)
mae = mean_absolute_error(y_train, y_pred)
r2 = r2_score(y_train, y_pred)

print("Training Model Metrics")
print(f"RMSE: {rmse:.2f}")
print(f"MAE: {mae:.2f}")
print(f"R²: {r2:.3f}")

Training Model Metrics
RMSE: 1479.17
MAE: 671.65
R²: 0.859


# XGBoost

## Frequency-Severity Modeling

In [15]:
# XGBoost Tweedie Regression with Cross-Validation Early Stopping and Native Categorical Support

final_preds = [
    'driving_history_score', # was supposed to drop, but this feels important for insurance
    'credit_score',
    # 'low_education_ind', # dropped based on results of varclushi
    'marital_status',
    'time_driven',
    'area',
    'agecat_grouped',
    'gender',
    'veh_color',
    'max_power',
    'engine_type',
    'veh_age',
    'veh_body',
    'veh_value' # were supposed to drop this but it doesn't make much sense to do
]

# Define features and target
y = final_model_data['claim_cst_per_exposure']
X = final_model_data[final_preds].copy()
# weights = train_data['exposure']  # Assuming exposure is in years, convert to months

# Convert object columns to pandas Categorical dtype for XGBoost native categorical support
for col in final_preds:
    if X[col].dtype == 'object':
        X[col] = X[col].astype('category')

# XGBoost DMatrix with offset
dtrain = xgb.DMatrix(X, label=y, enable_categorical=True)

# # Compute offset (log of exposure, or any other offset variable)
# offset = np.log(train_data['exposure'])
# dtrain.set_base_margin(offset)

# Define XGBoost parameters for Tweedie regression
params = {
    'objective': 'reg:tweedie',
    'eval_metric': 'tweedie-nloglik',
    'tweedie_variance_power': 1.5,
    'learning_rate': 0.01,
    'max_depth': 5,
    'min_child_weight': 100,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'lambda': 1.0,
    'alpha': 0.0,
    'nthread': -1,
    'seed': 42,
    'tree_method': 'hist',
    'enable_categorical': True
}

# Cross-validation with early stopping
cv_results = xgb.cv(
    params,
    dtrain,
    num_boost_round=5000,
    nfold=5,  # 5-fold cross-validation
    early_stopping_rounds=50,
    seed=42,
    verbose_eval=50
 )

# The best number of boosting rounds is:
best_num_boost_round = len(cv_results)
print(f"Best num_boost_round from CV: {best_num_boost_round}")

# Train final model on all data using best_num_boost_round
model = xgb.train(
    params,
    dtrain,
    # num_boost_round=500
    num_boost_round=best_num_boost_round
 )

XGBoostError: [21:04:12] /Users/runner/work/xgboost/xgboost/src/metric/elementwise_metric.cu:321: Check failed: param != nullptr: tweedie-nloglik must be in format tweedie-nloglik@rho
Stack trace:
  [bt] (0) 1   libxgboost.dylib                    0x00000001330d92dc dmlc::LogMessageFatal::~LogMessageFatal() + 124
  [bt] (1) 2   libxgboost.dylib                    0x00000001333602ec xgboost::metric::EvalTweedieNLogLik::EvalTweedieNLogLik(char const*) + 172
  [bt] (2) 3   libxgboost.dylib                    0x000000013335f8bc std::__1::__function::__func<xgboost::metric::$_10, std::__1::allocator<xgboost::metric::$_10>, xgboost::Metric* (char const*)>::operator()(char const*&&) + 48
  [bt] (3) 4   libxgboost.dylib                    0x000000013336439c xgboost::Metric* xgboost::CreateMetricImpl<xgboost::MetricReg>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>> const&) + 1316
  [bt] (4) 5   libxgboost.dylib                    0x0000000133363de8 xgboost::Metric::Create(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>> const&, xgboost::Context const*) + 32
  [bt] (5) 6   libxgboost.dylib                    0x0000000133336400 xgboost::LearnerConfiguration::ConfigureMetrics(std::__1::vector<std::__1::pair<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>>, std::__1::allocator<std::__1::pair<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>>>> const&) + 216
  [bt] (6) 7   libxgboost.dylib                    0x000000013332a130 xgboost::LearnerConfiguration::Configure() + 2012
  [bt] (7) 8   libxgboost.dylib                    0x0000000133101a60 XGBoosterBoostedRounds + 100
  [bt] (8) 9   libffi.dylib                        0x000000019b1d0050 ffi_call_SYSV + 80



## Risk Segmentation

### Quantiles

In [19]:
train_data['predicted_expected_loss'] = y_pred

train_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['predicted_expected_loss'] = y_pred


Unnamed: 0,id,fold,sample,veh_value,exposure,veh_body,veh_age,gender,area,agecat,...,time_of_week_driven,time_driven,trm_len,credit_score,low_education_ind,clm,numclaims,claimcst0,expected_loss,predicted_expected_loss
0,1,2,1|bld,5.8,0.362191,SUV,2,F,B,1,...,weekday,12pm - 6pm,6,646.516469,0.0,1,1,2023.198184,5585.998969,4190.744101
1,2,3,1|bld,5.67,0.632068,STNWG,4,F,A,2,...,weekend,6am - 12pm,12,635.400369,0.0,1,1,3600.172234,5695.85954,5273.055427
2,3,1,1|bld,5.9,0.36746,SEDAN,2,F,C,2,...,weekday,6am - 12pm,12,646.463131,0.0,1,1,2021.144067,5500.307127,4220.612971
3,4,2,1|bld,4.79,0.802184,STNWG,3,M,B,4,...,weekday,6pm - 12am,12,645.598794,0.0,1,1,4006.845492,4994.920513,3422.488947
4,5,2,1|bld,6.68,0.485009,SEDAN,3,M,C,1,...,weekday,6am - 12pm,12,657.348612,0.0,1,1,2542.953931,5243.10606,3485.267952


In [20]:
train_data["expected_loss"].quantile([0.33, 0.66])

0.33    0.0
0.66    0.0
Name: expected_loss, dtype: float64

### Fixed Thresholds

### KMeans Clustering

## Save Altered Data

In [None]:
model_data.to_csv('../project/model_data.csv', index=False)
inference_data.to_csv('../project/inference_data.csv', index=False)

## Export Model Artifacts

In [24]:
# 1) Save the trained RandomForestRegressor
joblib.dump(rf, "../project/models/rf_model.pkl")

# 2) Save the exact feature column order used to fit the model
feature_columns = list(X_train.columns)  # if you trained with a DataFrame
with open("../project/models/feature_columns.json", "w") as f:
    json.dump(feature_columns, f)

# 3) (Optional) Create simple risk cutoffs from training predictions