# FNOL Model Build

In [2]:
import pandas as pd
from catboost import CatBoostRegressor, CatBoostClassifier, Pool, cv
from sklearn.metrics import mean_absolute_error, log_loss
import numpy as np
# from sklearn.model_selection import train_test_split

In [3]:
# Load the raw claims dataset that every later cell works from.
DATA_PATH = './data/Data_Scientist_Interview_Task.csv'
df = pd.read_csv(DATA_PATH)

In [4]:
# Columns judged to provide no useful information for modelling.
uninformative_cols = ['Claim Number', 'Notifier', 'Loss_code', 'Loss_description', 'Inception_to_loss']

# Columns deliberately left out of this modelling round. Possible future use:
#   - date_of_loss: could arguably supply day-of-week information
#   - Time_hour: could be banded to separate rush hours, late nights, etc.
deferred_cols = ['date_of_loss', 'Time_hour']

df = df.drop(columns=uninformative_cols).drop(columns=deferred_cols)

In [5]:
# Make "unknown" explicit in two categorical columns:
#   - missing weather conditions are filled with the label 'N/K'
#   - a '#' placeholder in the at-fault column is replaced with 'n/k'
df = df.assign(
    Weather_conditions=df['Weather_conditions'].fillna('N/K'),
    PH_considered_TP_at_fault=df['PH_considered_TP_at_fault'].replace('#', 'n/k'),
)

In [6]:
# Wherever 'Incurred' is below zero, reset both target columns on that row
# ('Incurred' and 'Capped Incurred') to 0.
negative_incurred = df['Incurred'].lt(0)
df.loc[negative_incurred, ['Incurred', 'Capped Incurred']] = 0

## Create Attritional Model

In [7]:
# Frame for the attritional claims model: everything except the 'Incurred'
# column, which leaves 'Capped Incurred' in place to serve as the target.
df_att = df.drop(columns='Incurred')

In [8]:

# Split the attritional frame into predictors and the 'Capped Incurred' target
# once, then reuse both pieces below.
att_features = df_att.drop(columns=['Capped Incurred'])
att_target = df_att['Capped Incurred']

feature_names = att_features.columns.tolist()
# Object-dtype columns are handed to CatBoost as categorical features.
cat_features = att_features.select_dtypes(include=['object']).columns.tolist()

data_pool = Pool(
    data=att_features,
    label=att_target,
    feature_names=feature_names,
    cat_features=cat_features,
)

In [9]:
# Cross-validation settings for the attritional model: Tweedie objective with
# variance power 1.99, a budget of 1000 iterations, a fixed seed, no logging.
params = dict(
    objective="Tweedie:variance_power=1.99",
    iterations=1000,
    random_seed=69,
    verbose=False,
)

In [10]:
# 4-fold cross-validation of the attritional settings; the overfitting
# detector stops the run after waiting 15 iterations.
cv_settings = dict(fold_count=4, early_stopping_rounds=15)
scores = cv(pool=data_pool, params=params, **cv_settings)

Stopped by overfitting detector  (15 iterations wait)


In [11]:
# Choose the number of trees from the cross-validation curve itself instead of
# assuming the run ended exactly 15 rows after its best iteration.
# `len(scores) - 15` is only meaningful when early stopping actually fired with
# a patience of 15: if CV used the full iteration budget it would throw away
# 15 useful trees, and it goes stale if `early_stopping_rounds` is changed.
test_mean_col = next(
    col for col in scores.columns
    if col.startswith("test-") and col.endswith("-mean")
)
# Assumes `cv` returned its default DataFrame with one row per iteration,
# starting at iteration 0, so the tree count is the best row position + 1.
optimal_iterations = int(np.argmin(scores[test_mean_col].to_numpy())) + 1
optimal_iterations

551

In [12]:
# Refit the attritional model on the full pool. The settings repeat the
# cross-validation ones, except that the iteration count is the value chosen
# from the CV results.
params = dict(
    objective="Tweedie:variance_power=1.99",
    iterations=optimal_iterations,
    random_seed=69,
    verbose=False,
)

att_model = CatBoostRegressor(**params)
att_model.fit(data_pool)

<catboost.core.CatBoostRegressor at 0x7f4a316499a0>

In [13]:
# Mean absolute error (MAE) of the attritional model.
# Predictions are made on the same pool the model was fitted on, so this is a
# training-set figure rather than a hold-out estimate.

att_model_preds = att_model.predict(data_pool)

# scikit-learn's signature is (y_true, y_pred). MAE is symmetric, so the value
# is unchanged, but the documented order matches the log_loss call used for
# the large-loss propensity model.
mae = mean_absolute_error(df_att['Capped Incurred'], att_model_preds)

mae

4465.370717733324

In [14]:
# Write the fitted attritional model to disk in "cbm" format.
model_name = "fnol_attritional_model.cbm"
att_model.save_model(fname=model_name, format="cbm")

## Create Large Loss Propensity Model

In [15]:
# Large-loss flag: 1 where 'Incurred' exceeds 'Capped Incurred', otherwise 0.
exceeds_cap = df['Incurred'] > df['Capped Incurred']
df['Large_Prop'] = exceeds_cap.astype(int)

In [16]:
# Share of rows flagged as large losses (mean of the 0/1 flag).
df['Large_Prop'].mean()

0.025744376544012484

In [17]:
# Frame for the large-loss propensity model: both monetary columns are
# removed, leaving the 'Large_Prop' flag as the label column.
monetary_cols = ['Incurred', 'Capped Incurred']
df_large = df.drop(columns=monetary_cols)

In [18]:

# Split the propensity frame into predictors and the 'Large_Prop' label once,
# then reuse both pieces below.
large_features = df_large.drop(columns=['Large_Prop'])
large_target = df_large['Large_Prop']

feature_names = large_features.columns.tolist()
# Object-dtype columns are handed to CatBoost as categorical features.
cat_features = large_features.select_dtypes(include=['object']).columns.tolist()

data_pool = Pool(
    data=large_features,
    label=large_target,
    feature_names=feature_names,
    cat_features=cat_features,
)

In [19]:
# Cross-validation settings for the large-loss propensity classifier:
# Logloss objective, a budget of 1000 iterations, a fixed seed, no logging.
params = dict(
    objective="Logloss",
    iterations=1000,
    random_seed=69,
    verbose=False,
)

In [20]:
# 4-fold cross-validation of the propensity settings; the overfitting
# detector stops the run after waiting 15 iterations.
cv_settings = dict(fold_count=4, early_stopping_rounds=15)
scores = cv(pool=data_pool, params=params, **cv_settings)

Stopped by overfitting detector  (15 iterations wait)


In [21]:
# Choose the number of trees from the cross-validation curve itself instead of
# assuming the run ended exactly 15 rows after its best iteration.
# `len(scores) - 15` is only meaningful when early stopping actually fired with
# a patience of 15: if CV used the full iteration budget it would throw away
# 15 useful trees, and it goes stale if `early_stopping_rounds` is changed.
test_mean_col = next(
    col for col in scores.columns
    if col.startswith("test-") and col.endswith("-mean")
)
# Assumes `cv` returned its default DataFrame with one row per iteration,
# starting at iteration 0, so the tree count is the best row position + 1.
optimal_iterations = int(np.argmin(scores[test_mean_col].to_numpy())) + 1
optimal_iterations

221

In [22]:
# Refit the propensity classifier on the full pool. The settings repeat the
# cross-validation ones, except that the iteration count is the value chosen
# from the CV results.
params = dict(
    objective="Logloss",
    iterations=optimal_iterations,
    random_seed=69,
    verbose=False,
)

large_model = CatBoostClassifier(**params)
large_model.fit(data_pool)

<catboost.core.CatBoostClassifier at 0x7f4a316083a0>

In [23]:
# Log loss of the propensity classifier, measured on the same pool it was
# fitted on.
class_probs = large_model.predict_proba(data_pool)
# Second column of predict_proba: used as the probability of Large_Prop == 1.
large_model_preds = class_probs[:, 1]

logloss = log_loss(y_true=df_large['Large_Prop'], y_pred=large_model_preds)
logloss

0.04804899206887887

In [24]:
# Average predicted large-loss probability across all rows.
np.mean(large_model_preds)

0.02563057459931494

In [25]:
# Write the fitted propensity classifier to disk in "cbm" format.
model_name = "fnol_large_model.cbm"
large_model.save_model(fname=model_name, format="cbm")

## Create Large Loss Model

In [26]:
# Amount of each claim above the cap: 'Incurred' minus 'Capped Incurred'.
df['Large_Incurred'] = df['Incurred'].sub(df['Capped Incurred'])

In [27]:
# Average above-cap amount among the rows flagged as large losses.
is_large = df['Large_Prop'].eq(1)
large_severity = df.loc[is_large, 'Large_Incurred'].mean()
large_severity

128818.101010101

## Overall Model Performance

In [28]:
# Combined prediction per row: the attritional prediction plus the expected
# large-loss load (predicted propensity x average large-loss severity).
expected_large_load = large_model_preds * large_severity
df['FNOL_Prediction'] = att_model_preds + expected_large_load

In [29]:
# Mean of the combined FNOL prediction across all rows.
df['FNOL_Prediction'].mean()

8898.169940414371

In [30]:
# Mean of the 'Incurred' column (negative values were set to zero earlier);
# presumably shown as a benchmark for the mean prediction — confirm.
df['Incurred'].mean()

9427.851644779612

In [31]:
# Overall mean absolute error of the combined prediction against 'Incurred',
# measured on the same rows the component models were fitted on.
# Arguments follow scikit-learn's (y_true, y_pred) order; MAE is symmetric, so
# the value is unchanged.
mae = mean_absolute_error(df['Incurred'], df['FNOL_Prediction'])

mae

7924.461187881152

In [32]:
# Number of model features. At this point `feature_names` is the list built
# for the large-loss propensity pool, because that cell rebinds the name.
len(feature_names)

37

In [33]:
# The integers 1 through 37 in ascending order.
# NOTE(review): 37 looks like len(feature_names) — confirm, and derive it if so.
[*range(1, 38)]

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37]

In [34]:
# The integers 37 down to 1 in descending order.
# NOTE(review): 37 looks like len(feature_names) — confirm, and derive it if so.
list(range(37, 0, -1))

[37,
 36,
 35,
 34,
 33,
 32,
 31,
 30,
 29,
 28,
 27,
 26,
 25,
 24,
 23,
 22,
 21,
 20,
 19,
 18,
 17,
 16,
 15,
 14,
 13,
 12,
 11,
 10,
 9,
 8,
 7,
 6,
 5,
 4,
 3,
 2,
 1]