In [19]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, accuracy_score, precision_score, recall_score, confusion_matrix, f1_score
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder, KBinsDiscretizer
from sklearn.linear_model import RidgeCV, LassoCV, LinearRegression, LogisticRegression, LogisticRegressionCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
#from ydata_profiling import ProfileReport

import warnings
warnings.filterwarnings('ignore')

In [29]:
df = pd.read_csv('ml_gw_car_insurance.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,policy_id,policy_tenure,age_of_car,age_of_policyholder,area_cluster,population_density,make,segment,model,...,is_brake_assist,is_power_door_locks,is_central_locking,is_power_steering,is_driver_seat_height_adjustable,is_day_night_rear_view_mirror,is_ecw,is_speed_alert,ncap_rating,is_claim
0,0,ID00001,0.515874,0.05,0.644231,C1,4990,1,A,M1,...,No,No,No,Yes,No,No,No,Yes,0,0.0
1,1,ID00002,0.672619,0.02,0.375,C2,27003,1,A,M1,...,No,No,No,Yes,No,No,No,Yes,0,0.0
2,2,ID00003,0.84111,0.02,0.384615,C3,4076,1,A,M1,...,No,No,No,Yes,No,No,No,Yes,0,0.0
3,3,ID00004,0.900277,0.11,0.432692,C4,21622,1,C1,M2,...,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,2,0.0
4,4,ID00005,0.596403,0.11,0.634615,C5,34738,2,A,M3,...,No,Yes,Yes,Yes,No,Yes,Yes,Yes,2,0.0


In [30]:
# Number of car models
df.model.nunique()

11

In [31]:
#Transforming Yes and No values into Booleans
df = df.replace({ "No" : 0 , "Yes" : 1 })

#Removal of policy_id column
df.drop(columns = {"policy_id", 'Unnamed: 0'}, inplace = True)

df.head()

Unnamed: 0,policy_tenure,age_of_car,age_of_policyholder,area_cluster,population_density,make,segment,model,fuel_type,max_torque,...,is_brake_assist,is_power_door_locks,is_central_locking,is_power_steering,is_driver_seat_height_adjustable,is_day_night_rear_view_mirror,is_ecw,is_speed_alert,ncap_rating,is_claim
0,0.515874,0.05,0.644231,C1,4990,1,A,M1,CNG,60Nm@3500rpm,...,0,0,0,1,0,0,0,1,0,0.0
1,0.672619,0.02,0.375,C2,27003,1,A,M1,CNG,60Nm@3500rpm,...,0,0,0,1,0,0,0,1,0,0.0
2,0.84111,0.02,0.384615,C3,4076,1,A,M1,CNG,60Nm@3500rpm,...,0,0,0,1,0,0,0,1,0,0.0
3,0.900277,0.11,0.432692,C4,21622,1,C1,M2,Petrol,113Nm@4400rpm,...,1,1,1,1,1,1,1,1,2,0.0
4,0.596403,0.11,0.634615,C5,34738,2,A,M3,Petrol,91Nm@4250rpm,...,0,1,1,1,0,1,1,1,2,0.0


In [53]:
print(df.age_of_car.max())
print(df.age_of_car.min())

print(df.age_of_policyholder.max())
print(df.age_of_policyholder.min())

1.0
0.0
1.0
0.288461538461538


In [39]:
df.is_claim.value_counts()/df.shape[0] * 100

# Is_Claim is heavily unbalanced -> use class_weight for logistic regression

0.0    56.160975
1.0     3.838001
Name: is_claim, dtype: float64

In [52]:
round(56.160975/3.838001, 0)
# Use this weight for unbalanced classes

15.0

In [41]:
# only considering rows where target variable is not nan
df = df[(df.is_claim == 1) | (df.is_claim == 0)]

# linear correlation
corr = df.corr().abs()
corr_claim = corr['is_claim'].sort_values(ascending=False)
print(f'The variables most correlated with claims are as followed: \n{corr_claim}')

The variables most correlated with claims are as followed: 
is_claim                            1.000000
policy_tenure                       0.078747
age_of_car                          0.028172
age_of_policyholder                 0.022435
population_density                  0.017808
is_adjustable_steering              0.013917
cylinder                            0.013434
is_front_fog_lights                 0.011825
is_brake_assist                     0.010893
is_driver_seat_height_adjustable    0.010686
width                               0.009947
is_parking_sensors                  0.008419
is_day_night_rear_view_mirror       0.007989
displacement                        0.007678
is_speed_alert                      0.007307
is_ecw                              0.006637
is_power_door_locks                 0.006637
is_central_locking                  0.006637
length                              0.006495
gross_weight                        0.003894
ncap_rating                         0.00

In [36]:
df.to_csv('clean_data.csv')

In [12]:
#profile = ProfileReport(df)
#profile

### Logistic Regression

In [49]:
X = df.drop('is_claim', axis=1)
y = df['is_claim']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# create the transformers
num_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

cat_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

age_transformer = Pipeline(steps=[
    ('quartile', KBinsDiscretizer(n_bins=4, encode='onehot-dense', strategy='quantile'))
])

# create the preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, ['policy_tenure', 'population_density', 'is_adjustable_steering']),
        ('cat', cat_transformer, ['area_cluster']),
        ('age', age_transformer, ['age_of_car', 'age_of_policyholder'])
    ])

# build the logistic regression model
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegressionCV(Cs=[0.001, 0.01, 0.1, 1], cv=10, penalty='l1', class_weight={0: 1, 1: 15} , solver='liblinear', max_iter=10000, random_state=42))
])

# train the model on the training set
model.fit(X_train, y_train)

# predict on the training set
y_train_pred = model.predict(X_train)

print('F1 Score:', f1_score(y_train, y_train_pred, pos_label=1))
print("Accuracy:", accuracy_score(y_train, y_train_pred))
print("Precision:", precision_score(y_train, y_train_pred))
print("Recall:", recall_score(y_train, y_train_pred))

F1 Score: 0.14988177774215528
Accuracy: 0.5474366906321336
Precision: 0.0851509494103183
Recall: 0.625


### Ridge & Lasso Regression

In [54]:
# only considering highest 5 correlated variables
X = df[['age_of_car', 'age_of_policyholder', 'population_density', 'is_adjustable_steering', 'cylinder']]
y = df['is_claim']

# create linear ridge regression model
linear_reg = Pipeline([
    ('scaler', StandardScaler()),
    ('linear_reg', RidgeCV(alphas=np.logspace(-3, 3, 7), cv=10)) #specify range of values to test for regularization parameter
])
linear_reg.fit(X, y)

# create polynomial ridge regression model
poly_reg = Pipeline([
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', StandardScaler()),
    ('linear_reg', RidgeCV(alphas=np.logspace(-3, 3, 7), cv=10))
])
poly_reg.fit(X, y)

# create linear lasso regression model
lasso_reg_linear = Pipeline([
    ('scaler', StandardScaler()),
    ('lasso_reg', LassoCV(alphas=np.logspace(-3, 3, 7), cv=10, max_iter=10000))
])
lasso_reg_linear.fit(X, y)

# create polynomial lasso regression model
lasso_reg_poly = Pipeline([
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', StandardScaler()),
    ('lasso_reg', LassoCV(alphas=np.logspace(-3, 3, 7), cv=10, max_iter=10000))
])
lasso_reg_poly.fit(X, y)

# evaluate linear ridge regression model
linear_reg_mse = -cross_val_score(linear_reg, X, y, cv=10, scoring='neg_mean_squared_error')
print('Linear Ridge Regression MSE: ', linear_reg_mse.mean())

# evaluate polynomial ridge regression model
poly_reg_mse = -cross_val_score(poly_reg, X, y, cv=10, scoring='neg_mean_squared_error')
print('Polynomial Ridge Regression MSE: ', poly_reg_mse.mean())

# evaluate linear lasso regression model
lasso_reg_linear_mse = -cross_val_score(lasso_reg_linear, X, y, cv=10, scoring='neg_mean_squared_error')
print('Linear Lasso Regression MSE: ', lasso_reg_linear_mse.mean())

# evaluate polynomial lasso regression model
lasso_reg_poly_mse = -cross_val_score(lasso_reg_poly, X, y, cv=10, scoring='neg_mean_squared_error')
print('Polynomial Lasso Regression MSE: ', lasso_reg_poly_mse.mean())

"""
# get coefficients
linear_reg_coef = linear_reg.named_steps['linear_reg'].coef_
poly_reg_coef = poly_reg.named_steps['linear_reg'].coef_
lasso_reg_linear_coef = lasso_reg_linear.named_steps['lasso_reg'].coef_
lasso_reg_poly_coef = lasso_reg_poly.named_steps['lasso_reg'].coef_

# print coefficients
print('Linear Regression Coefficients:', linear_reg_coef)
print('Polynomial Regression Coefficients:', poly_reg_coef)
print('Linear Lasso Regression Coefficients:', lasso_reg_linear_coef)
print('Polynomial Lasso Regression Coefficients:', lasso_reg_poly_coef)
"""

Linear Ridge Regression MSE:  0.059752218358399
Polynomial Ridge Regression MSE:  0.05973653226957885
Linear Lasso Regression MSE:  0.05975756273396545
Polynomial Lasso Regression MSE:  0.05974351176683681


"\n# get coefficients\nlinear_reg_coef = linear_reg.named_steps['linear_reg'].coef_\npoly_reg_coef = poly_reg.named_steps['linear_reg'].coef_\nlasso_reg_linear_coef = lasso_reg_linear.named_steps['lasso_reg'].coef_\nlasso_reg_poly_coef = lasso_reg_poly.named_steps['lasso_reg'].coef_\n\n# print coefficients\nprint('Linear Regression Coefficients:', linear_reg_coef)\nprint('Polynomial Regression Coefficients:', poly_reg_coef)\nprint('Linear Lasso Regression Coefficients:', lasso_reg_linear_coef)\nprint('Polynomial Lasso Regression Coefficients:', lasso_reg_poly_coef)\n"