In [17]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.optimize as optimize
import scipy.stats
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import ShuffleSplit
from _load_transform import load_transform
from _sample_split import create_sample_split
import statsmodels.api as sm

In [8]:
df = load_transform()
with pd.option_context('display.max_rows', 10):
    display(df)

Unnamed: 0,IDpol,ClaimNb,Exposure,Area,VehPower,VehAge,DrivAge,BonusMalus,VehBrand,VehGas,Density,Region,ClaimAmount,ClaimAmountCut
0,1,0,0.10000,D,5,0,5,50,B12,Regular,1217,R82,0.0,0.0
1,3,0,0.77000,D,5,0,5,50,B12,Regular,1217,R82,0.0,0.0
2,5,0,0.75000,B,6,1,5,50,B12,Diesel,54,R22,0.0,0.0
3,10,0,0.09000,B,7,0,4,50,B12,Diesel,76,R72,0.0,0.0
4,11,0,0.84000,B,7,0,4,50,B12,Diesel,76,R72,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
678008,6114326,0,0.00274,E,4,0,5,50,B12,Regular,3317,R93,0.0,0.0
678009,6114327,0,0.00274,E,4,0,4,95,B12,Regular,9850,R11,0.0,0.0
678010,6114328,0,0.00274,D,6,1,4,50,B12,Diesel,1323,R82,0.0,0.0
678011,6114329,0,0.00274,B,4,0,5,50,B12,Regular,95,R26,0.0,0.0


In [14]:
df = create_sample_split(df, id_column='IDpol', training_frac=0.8)
train_df = df[df['sample'] == 'train']
test_df = df[df['sample'] == 'test']

In [15]:
train_df

Unnamed: 0,IDpol,ClaimNb,Exposure,Area,VehPower,VehAge,DrivAge,BonusMalus,VehBrand,VehGas,Density,Region,ClaimAmount,ClaimAmountCut,sample
0,1,0,0.10,D,5,0,5,50,B12,Regular,1217,R82,0.0,0.0,train
1,3,0,0.77,D,5,0,5,50,B12,Regular,1217,R82,0.0,0.0,train
2,5,0,0.75,B,6,1,5,50,B12,Diesel,54,R22,0.0,0.0,train
3,10,0,0.09,B,7,0,4,50,B12,Diesel,76,R72,0.0,0.0,train
4,11,0,0.84,B,7,0,4,50,B12,Diesel,76,R72,0.0,0.0,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
598014,4189192,0,1.00,B,7,2,4,50,B12,Regular,77,R82,0.0,0.0,train
598015,4189193,0,1.00,A,5,2,5,50,B1,Regular,26,R24,0.0,0.0,train
598016,4189194,0,1.00,A,7,2,6,50,B2,Regular,17,R24,0.0,0.0,train
598017,4189195,1,1.00,E,4,1,3,51,B1,Regular,3866,R24,1204.0,1204.0,train


In [16]:
test_df

Unnamed: 0,IDpol,ClaimNb,Exposure,Area,VehPower,VehAge,DrivAge,BonusMalus,VehBrand,VehGas,Density,Region,ClaimAmount,ClaimAmountCut,sample
598019,5000002,0,0.03000,D,4,1,6,80,B6,Regular,645,R94,0.0,0.0,test
598020,5000004,0,0.19000,D,4,1,6,76,B6,Regular,645,R94,0.0,0.0,test
598021,5000006,0,0.03000,E,9,1,5,76,B12,Regular,5433,R93,0.0,0.0,test
598022,5000008,0,0.46000,E,9,1,5,95,B12,Regular,5433,R93,0.0,0.0,test
598023,5000010,0,0.03000,B,9,1,5,50,B12,Regular,54,R91,0.0,0.0,test
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
678008,6114326,0,0.00274,E,4,0,5,50,B12,Regular,3317,R93,0.0,0.0,test
678009,6114327,0,0.00274,E,4,0,4,95,B12,Regular,9850,R11,0.0,0.0,test
678010,6114328,0,0.00274,D,6,1,4,50,B12,Diesel,1323,R82,0.0,0.0,test
678011,6114329,0,0.00274,B,4,0,5,50,B12,Regular,95,R26,0.0,0.0,test


In [21]:
df['PurePremium'] = df['ClaimAmount'] / df['Exposure']
predictors = ['ClaimNb', 'VehPower', 'VehAge', 'DrivAge', 'BonusMalus']
X = df[predictors]
X = sm.add_constant(X)
y = df['PurePremium']

tweedie_model = sm.GLM(y, X, family=sm.families.Tweedie(var_power=1.5))
tweedie_results = tweedie_model.fit()
print(tweedie_results.summary())


                 Generalized Linear Model Regression Results                  
Dep. Variable:            PurePremium   No. Observations:               678013
Model:                            GLM   Df Residuals:                   678007
Model Family:                 Tweedie   Df Model:                            5
Link Function:                    Log   Scale:                          4936.9
Method:                          IRLS   Log-Likelihood:            -3.9371e+05
Date:                Mon, 18 Nov 2024   Deviance:                   2.8631e+07
Time:                        23:01:32   Pearson chi2:                 3.35e+09
No. Iterations:                    35   Pseudo R-squ. (CS):            0.01962
Covariance Type:            nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          2.2017      0.244      9.022      0.0

#### We divide the claims amount by exposure because 
1. Normalization: Exposure measures the amount of time a policy is active. By dividing the total claims amount by the exposure, we normalize the claim amounts to a standard unit of time (usually per year). 

2. Annualizing Data: The process effectively annualizes the claims data, providing an estimate of what the insurer can expect to pay on an annual basis per policy. This is important for insurers to assess the annual risk and to set premiums accordingly.