## Car Insurance Claim Prediction

Objective:
- create model that can be used to predict if policyholder will file claim within the next 6 months or not.
    

In [2]:
# 1. Import data

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns

In [4]:
# import data we will use to train model
df = pd.read_excel('train2.xls')
df.head()

Unnamed: 0,policy_id,policy_tenure,age_of_car,age_of_policyholder,area_cluster,population_density,make,segment,model,fuel_type,...,is_brake_assist,is_power_door_locks,is_central_locking,is_power_steering,is_driver_seat_height_adjustable,is_day_night_rear_view_mirror,is_ecw,is_speed_alert,ncap_rating,is_claim
0,ID00001,0.515874,0.05,0.644231,C1,4990,1,A,M1,CNG,...,No,No,No,Yes,No,No,No,Yes,0,0
1,ID00002,0.672619,0.02,0.375,C2,27003,1,A,M1,CNG,...,No,No,No,Yes,No,No,No,Yes,0,0
2,ID00003,0.84111,0.02,0.384615,C3,4076,1,A,M1,CNG,...,No,No,No,Yes,No,No,No,Yes,0,0
3,ID00004,0.900277,0.11,0.432692,C4,21622,1,C1,M2,Petrol,...,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,2,0
4,ID00005,0.596403,0.11,0.634615,C5,34738,2,A,M3,Petrol,...,No,Yes,Yes,Yes,No,Yes,Yes,Yes,2,0


In [5]:
df.shape

(10000, 44)

In [6]:
# 2. Preprocessing

In [7]:
#Remove of policy_id column
df2 = df.drop('policy_id', axis=1)

In [8]:
# Yes/No values into True/False values
df2 = df.replace({ "No" : False , "Yes" : True })

In [9]:
df2.head()

Unnamed: 0,policy_id,policy_tenure,age_of_car,age_of_policyholder,area_cluster,population_density,make,segment,model,fuel_type,...,is_brake_assist,is_power_door_locks,is_central_locking,is_power_steering,is_driver_seat_height_adjustable,is_day_night_rear_view_mirror,is_ecw,is_speed_alert,ncap_rating,is_claim
0,ID00001,0.515874,0.05,0.644231,C1,4990,1,A,M1,CNG,...,False,False,False,True,False,False,False,True,0,0
1,ID00002,0.672619,0.02,0.375,C2,27003,1,A,M1,CNG,...,False,False,False,True,False,False,False,True,0,0
2,ID00003,0.84111,0.02,0.384615,C3,4076,1,A,M1,CNG,...,False,False,False,True,False,False,False,True,0,0
3,ID00004,0.900277,0.11,0.432692,C4,21622,1,C1,M2,Petrol,...,True,True,True,True,True,True,True,True,2,0
4,ID00005,0.596403,0.11,0.634615,C5,34738,2,A,M3,Petrol,...,False,True,True,True,False,True,True,True,2,0


In [10]:
# Use get_dummies function to convert catergorical data into numerical data 

In [11]:
# obtain only catergorical columns
cat_cols = df2.select_dtypes(include=['object']).columns

In [12]:
# new df with only numbers and boolean 
df3 = pd.get_dummies(df2, columns=cat_cols)

In [13]:
df3.head()

Unnamed: 0,policy_tenure,age_of_car,age_of_policyholder,population_density,make,airbags,is_esc,is_adjustable_steering,is_tpms,is_parking_sensors,...,engine_type_K Series Dual jet,engine_type_K10C,engine_type_i-DTEC,rear_brakes_type_Disc,rear_brakes_type_Drum,transmission_type_Automatic,transmission_type_Manual,steering_type_Electric,steering_type_Manual,steering_type_Power
0,0.515874,0.05,0.644231,4990,1,2,False,False,False,True,...,0,0,0,0,1,0,1,0,0,1
1,0.672619,0.02,0.375,27003,1,2,False,False,False,True,...,0,0,0,0,1,0,1,0,0,1
2,0.84111,0.02,0.384615,4076,1,2,False,False,False,True,...,0,0,0,0,1,0,1,0,0,1
3,0.900277,0.11,0.432692,21622,1,2,True,True,False,True,...,0,0,0,0,1,1,0,1,0,0
4,0.596403,0.11,0.634615,34738,2,2,False,False,False,False,...,0,0,0,0,1,1,0,1,0,0


In [14]:
# assign input and output variables. 
X = df3.drop(columns = "is_claim")
y = df3["is_claim"]

In [15]:
X.shape , y.shape

((10000, 10110), (10000,))

In [16]:
y.value_counts()

0    9357
1     643
Name: is_claim, dtype: int64

In [17]:
# 3. Address imbalance of 0 and 1 values

In [18]:
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler()
X_resampled, y_resampled = ros.fit_resample(X, y)


In [19]:

X_resampled.shape , y_resampled.shape

((18714, 10110), (18714,))

In [20]:
# we now have an equal numbe of claims vs non-claims 
y_resampled.value_counts()

0    9357
1    9357
Name: is_claim, dtype: int64

In [None]:
# 4. Create Random Forest model

In [30]:
# split data into train and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.20, random_state=37)


In [31]:
# use random forrest model to trian model and make predictions 
from sklearn.ensemble import RandomForestClassifier 

classifier = RandomForestClassifier()
 
classifier.fit(X_train,y_train)

predictions = classifier.predict(X_test)

In [32]:
from sklearn.metrics import accuracy_score, classification_report

accuracy_score(y_test,predictions) 

1.0

In [33]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1897
           1       1.00      1.00      1.00      1846

    accuracy                           1.00      3743
   macro avg       1.00      1.00      1.00      3743
weighted avg       1.00      1.00      1.00      3743

