<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Load-the-libraries" data-toc-modified-id="Load-the-libraries-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Load the libraries</a></span></li><li><span><a href="#Load-the-data" data-toc-modified-id="Load-the-data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Load the data</a></span></li><li><span><a href="#Train-Test-Split" data-toc-modified-id="Train-Test-Split-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Train Test Split</a></span></li><li><span><a href="#Pure-Premium-Modelling-:-Tweedie-GLM" data-toc-modified-id="Pure-Premium-Modelling-:-Tweedie-GLM-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Pure Premium Modelling : Tweedie GLM</a></span></li><li><span><a href="#Modelling:-Product-of-Frequency-and-Severity-Modelling" data-toc-modified-id="Modelling:-Product-of-Frequency-and-Severity-Modelling-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Modelling: Product of Frequency and Severity Modelling</a></span></li></ul></div>

# Load the libraries

In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import os,sys,time
import sklearn
import scipy
import matplotlib.pyplot as plt
sns.set()

import json
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PoissonRegressor, GammaRegressor, TweedieRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_tweedie_deviance

SEED = 100
pd.set_option('max_columns',100)
pd.set_option('plotting.backend','matplotlib') # matplotlib, bokeh, altair, plotly
%load_ext watermark
%watermark -iv

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
json     2.0.9
autopep8 1.5.2
numpy    1.18.4
sklearn  0.23.1
seaborn  0.11.0
joblib   0.16.0
scipy    1.4.1
pandas   1.1.0



# Load the data

In [4]:
df = pd.read_csv('../data/processed/clean_data.csv.zip', compression='zip')
print(df.shape)
df.head(2).append(df.tail(2))

(100000, 15)


Unnamed: 0,ClaimNb,Exposure,Area,VehPower,VehAge,DrivAge,BonusMalus,VehBrand,VehGas,Density,Region,ClaimAmount,PurePremium,Frequency,AvgClaimAmount
0,0,0.1,D,5,0,55,50,B12,Regular,1217,R82,0.0,0.0,0.0,0.0
1,0,0.77,D,5,0,55,50,B12,Regular,1217,R82,0.0,0.0,0.0,0.0
99998,0,0.9,C,7,9,44,50,B1,Regular,191,R24,0.0,0.0,0.0,0.0
99999,0,0.9,E,4,12,53,50,B1,Regular,4116,R24,0.0,0.0,0.0,0.0


In [5]:
X = scipy.sparse.load_npz("../data/processed/X.npz")

In [6]:
df.head(2)

Unnamed: 0,ClaimNb,Exposure,Area,VehPower,VehAge,DrivAge,BonusMalus,VehBrand,VehGas,Density,Region,ClaimAmount,PurePremium,Frequency,AvgClaimAmount
0,0,0.1,D,5,0,55,50,B12,Regular,1217,R82,0.0,0.0,0.0,0.0
1,0,0.77,D,5,0,55,50,B12,Regular,1217,R82,0.0,0.0,0.0,0.0


In [7]:
np.array(X[0].todense())[0][-5:] # last elements of first row

array([ 0.        ,  1.        ,  0.        ,  0.69864446, 50.        ])

In [8]:
with open("../data/processed/features.json") as fi:
    json_features = json.load(fi)

In [9]:
json_features.keys()

dict_keys(['cols_ohe_before', 'cols_kbin', 'cols_log_scale', 'cols_pass', 'feature_names_before', 'feature_names_after', 'desc'])

# Train Test Split

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=SEED)

target = ['Frequency']

y_train = df_train[target].to_numpy().ravel()
y_test = df_test[target].to_numpy().ravel()

df_train.shape, df_test.shape, X_train.shape, X_test.shape

((75000, 15), (25000, 15), (75000, 71), (25000, 71))

# Pure Premium Modelling : Tweedie GLM

In [12]:
from sklearn.linear_model import PoissonRegressor, GammaRegressor, TweedieRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_tweedie_deviance

In [13]:
# TweedieRegressor?

In [14]:
glm_twd = TweedieRegressor(power=1.9, alpha=.1, max_iter=10_000)

glm_twd.fit(X_train, df_train["PurePremium"],
                     sample_weight=df_train["Exposure"])

TweedieRegressor(alpha=0.1, max_iter=10000, power=1.9)

In [15]:
tr_D2 = glm_twd.score(X_train,
                      df_train['PurePremium'],
                      sample_weight=df_train['Exposure'])

tx_D2 = glm_twd.score(X_test,
                      df_test['PurePremium'],
                      sample_weight=df_test['Exposure'])

tr_preds = glm_twd.predict(X_train)
tx_preds = glm_twd.predict(X_test)

tr_mae = mean_absolute_error(y_train,tr_preds)
tx_mae = mean_absolute_error(y_test,tx_preds)

tr_mse = mean_squared_error(y_train, tr_preds)
tx_mse = mean_squared_error(y_test,tx_preds)

df_eval_twd = pd.DataFrame(
{'train': [tr_D2, tr_mae, tr_mse],
'test': [tx_D2, tx_mae, tx_mse]})

df_eval_twd.index = ['D2','mean_absolute_error','mean_squared_error']
df_eval_twd

Unnamed: 0,train,test
D2,0.020186,0.013533
mean_absolute_error,182.982035,179.520763
mean_squared_error,142995.523138,69103.013696


# Modelling: Product of Frequency and Severity Modelling

In [17]:
# freq model: possion
glm_freq = PoissonRegressor(alpha=1e-3, max_iter=400)
glm_freq.fit(X_train, df_train["Frequency"],
             sample_weight=df_train["Exposure"])

tr_preds_freq = glm_freq.predict(X_train)
tx_preds_freq = glm_freq.predict(X_test)

In [18]:
# severity model: gamma
mask_train = (df_train["ClaimAmount"] > 0).to_numpy().ravel()
mask_test = (df_test["ClaimAmount"] > 0).to_numpy().ravel()

glm_sev = GammaRegressor(alpha=10., max_iter=10_000)

glm_sev.fit(
    X_train[mask_train],
    df_train.loc[mask_train, "AvgClaimAmount"],
    sample_weight=df_train.loc[mask_train, "ClaimNb"],
)

tr_preds_sev = glm_sev.predict(X_train)
tx_preds_sev = glm_sev.predict(X_test)

In [20]:
# product of prediction of freq and severity

tr_preds = tr_preds_freq * tr_preds_sev
tx_preds  = tx_preds_freq * tx_preds_sev

In [22]:
tr_mae = mean_absolute_error(y_train,tr_preds)
tx_mae = mean_absolute_error(y_test,tx_preds)

tr_mse = mean_squared_error(y_train, tr_preds)
tx_mse = mean_squared_error(y_test,tx_preds)

df_eval_product = pd.DataFrame(
{'train': [np.nan, tr_mae, tr_mse],
'test': [np.nan, tx_mae, tx_mse]})

df_eval_product.index = ['D2','mean_absolute_error','mean_squared_error']
df_eval_product

Unnamed: 0,train,test
D2,,
mean_absolute_error,179.254282,177.081152
mean_squared_error,66743.9785,48660.865824


In [23]:
df_eval_twd

Unnamed: 0,train,test
D2,0.020186,0.013533
mean_absolute_error,182.982035,179.520763
mean_squared_error,142995.523138,69103.013696
