# Detecting Auto insurance fraud claims

In [48]:
# Import all the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn import preprocessing
from mlxtend.plotting import plot_confusion_matrix
from sklearn.metrics import classification_report 
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, accuracy_score, recall_score, roc_curve, precision_recall_curve, auc
from sklearn.tree import DecisionTreeClassifier
import warnings

In [49]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_Interactivity = 'all'

In [50]:
# Read dataset
data = pd.read_csv("C:\\Users\\Anju\\Desktop\\Data science misc\\Hand on Projects\\Auto Insurance Fraud Prediction - Streamlit\\insurance_claims.csv")

In [51]:
data.shape

(1000, 40)

In [52]:
# fraud_reported is our target column. We will convert it to 1 and 0 and build the target dataframe.
data['fraud_reported'] = data['fraud_reported'].str.replace('Y', '1')
data['fraud_reported'] = data['fraud_reported'].str.replace('N', '0')
data['fraud_reported'] = data['fraud_reported'].astype(int)
data_target = data['fraud_reported']
data_target.shape

(1000,)

In [53]:
# In another notebook 'Auto Insurance Fraud claim detection-background analysis' where I used techniques like correlation matrix and Chi square 
# for feature selection, below are the important features I have selected to use for my app to predict the fraud claims.

# 'insured_hobbies', 'collision_type',  'months_as_customer_groups', 'policy_deductable', 'incident_severity',
# 'vehicle_claim_groups', 'umbrella_limit', 'number_of_vehicles_involved' ,'bodily_injuries','witnesses', 'incident_type', 
# 'authorities_contacted' ,'police_report_available'

# Reading the dataset with all the above columns:
data_app_req = pd.read_csv("C:\\Users\\Anju\\Desktop\\Data science misc\\Hand on Projects\\Auto Insurance Fraud Prediction - Streamlit\\data_req_app.csv")
data_app_req.head()   # It has only the input features.


Unnamed: 0,insured_hobbies,collision_type_new,months_as_customer,policy_deductable,incident_severity,vehicle_claim,umbrella_limit,number_of_vehicles_involved,bodily_injuries,witnesses,incident_type,authorities_contacted,police_report_available_new
0,other,Side Collision,328,1000,Major Damage,52080,0,1,1,2,Single Vehicle Collision,Police,YES
1,other,other,228,2000,Minor Damage,3510,5000000,1,0,0,Vehicle Theft,Police,other
2,other,Rear Collision,134,2000,Minor Damage,23100,5000000,3,2,3,Multi-vehicle Collision,Police,NO
3,other,Front Collision,256,2000,Major Damage,50720,6000000,1,1,2,Single Vehicle Collision,Police,NO
4,other,other,228,1000,Minor Damage,4550,6000000,1,0,1,Vehicle Theft,,NO


In [14]:
data_app_req.shape

(1000, 13)

In [63]:
data_app_req ['authorities_contacted'].value_counts()


Police       292
Fire         223
Other        198
Ambulance    196
None          91
Name: authorities_contacted, dtype: int64

In [65]:
data_app_req ['authorities_contacted'].value_counts()

4    292
1    223
3    198
0    196
2     91
Name: authorities_contacted, dtype: int64

# label encoding

In [15]:
data_app_req.dtypes    

insured_hobbies                object
collision_type_new             object
months_as_customer              int64
policy_deductable               int64
incident_severity              object
vehicle_claim                   int64
umbrella_limit                  int64
number_of_vehicles_involved     int64
bodily_injuries                 int64
witnesses                       int64
incident_type                  object
authorities_contacted          object
police_report_available_new    object
dtype: object

In [64]:
# label endcoding for the object datatypes.

for col in ['insured_hobbies', 'collision_type_new', 'incident_severity', 'incident_type', 'authorities_contacted' ,
            'police_report_available_new']:
    if (data_app_req[col].dtype == 'object'):
        le = preprocessing.LabelEncoder()
        le = le.fit(data_app_req[col])
        data_app_req[col] = le.transform(data_app_req[col])
        print('Completed Label encoding on',col)


Completed Label encoding on insured_hobbies
Completed Label encoding on collision_type_new
Completed Label encoding on incident_severity
Completed Label encoding on incident_type
Completed Label encoding on authorities_contacted
Completed Label encoding on police_report_available_new


In [18]:
data_app_req.head()

Unnamed: 0,insured_hobbies,collision_type_new,months_as_customer,policy_deductable,incident_severity,vehicle_claim,umbrella_limit,number_of_vehicles_involved,bodily_injuries,witnesses,incident_type,authorities_contacted,police_report_available_new
0,2,2,328,1000,0,52080,0,1,1,2,2,4,1
1,2,3,228,2000,1,3510,5000000,1,0,0,3,4,2
2,2,1,134,2000,1,23100,5000000,3,2,3,0,4,0
3,2,0,256,2000,0,50720,6000000,1,1,2,2,4,0
4,2,3,228,1000,1,4550,6000000,1,0,1,3,2,0


In [19]:
# Feature Scaling.

from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
data_scaled = sc.fit_transform(data_app_req)


In [20]:
# Splitting data into train and test

x_train, x_test, y_train, y_test = train_test_split(data_scaled, data_target, random_state = 1)
print('x_train:', x_train.shape, 'x_test:', x_test.shape, 'y_train:', y_train.shape, 'y_test:', y_test.shape )


x_train: (750, 13) x_test: (250, 13) y_train: (750,) y_test: (250,)


# Logistic regression

In [21]:
log = LogisticRegression()

log.fit(x_train, y_train)
prediction = log.predict(x_test)

score = log.score(x_test, y_test)
print(score*100)
print()
print(classification_report(y_test, prediction))


83.6

              precision    recall  f1-score   support

           0       0.83      0.97      0.89       180
           1       0.85      0.50      0.63        70

    accuracy                           0.84       250
   macro avg       0.84      0.73      0.76       250
weighted avg       0.84      0.84      0.82       250



# Decision Tree

In [27]:
dtc = DecisionTreeClassifier()

dtc.fit(x_train, y_train)
preds = dtc.predict(x_test)

score = dtc.score(x_test, y_test)
print(score*100)
print()
print(classification_report(y_test, preds))

76.8

              precision    recall  f1-score   support

           0       0.83      0.85      0.84       180
           1       0.59      0.56      0.57        70

    accuracy                           0.77       250
   macro avg       0.71      0.70      0.71       250
weighted avg       0.76      0.77      0.77       250



# Random Forest

In [28]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state = 1)
rfc.fit(x_train, y_train)
preds = rfc.predict(x_test)

score = rfc.score(x_test, y_test)
print(score*100)
print()
print(classification_report(y_test, preds))

82.8

              precision    recall  f1-score   support

           0       0.86      0.91      0.88       180
           1       0.73      0.61      0.67        70

    accuracy                           0.83       250
   macro avg       0.79      0.76      0.78       250
weighted avg       0.82      0.83      0.82       250



# Random Forest with grid search CV

In [29]:
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestClassifier
parameters = {'n_estimators':[100,300],'n_jobs':[-1], 'max_features': [0.5,0.7,0.9], 'min_samples_split': [2, 5, 10, 15],'max_depth': [3,5,7,15],'min_samples_leaf':[1,2,5,10],'random_state':[14]} 

clf1 = GridSearchCV(RandomForestClassifier(), parameters, cv=5, scoring='roc_auc')
clf1.fit(x_train, y_train)
pred_tuned = clf1.predict(x_test)

score = clf1.score(x_test, y_test)
print(score*100)
print()
print(classification_report(y_test, preds))


91.24603174603175

              precision    recall  f1-score   support

           0       0.86      0.91      0.88       180
           1       0.73      0.61      0.67        70

    accuracy                           0.83       250
   macro avg       0.79      0.76      0.78       250
weighted avg       0.82      0.83      0.82       250



In [33]:
# predictions on test using clf model.
pred_tuned = clf1.predict(x_test)
pred_tuned.shape

(250,)

In [34]:
# Creating dataframe with actual and predicted values to compare.

diff = pd.DataFrame({'Actual': y_test, 'Predicted': pred_tuned})
diff.head()


Unnamed: 0,Actual,Predicted
507,0,0
818,0,0
452,0,0
368,1,0
242,0,0


In [39]:
# Adding new columns 'actual' and 'predictions' to data_app in order to understand what type of records are predicted correctly.

data_app_req['actual'] = diff['Actual']
data_app_req['predictions'] = diff['Predicted']


In [40]:
# In below result, we will see many actual and predictions as NaN because data_app consists of 1000 rows and with my model, I
# predicted on only 250 rows.

data_app_req

Unnamed: 0,insured_hobbies,collision_type_new,months_as_customer,policy_deductable,incident_severity,vehicle_claim,umbrella_limit,number_of_vehicles_involved,bodily_injuries,witnesses,incident_type,authorities_contacted,police_report_available_new,actual,predictions
0,other,Side Collision,328,1000,Major Damage,52080,0,1,1,2,Single Vehicle Collision,Police,YES,,
1,other,other,228,2000,Minor Damage,3510,5000000,1,0,0,Vehicle Theft,Police,other,,
2,other,Rear Collision,134,2000,Minor Damage,23100,5000000,3,2,3,Multi-vehicle Collision,Police,NO,0.0,0.0
3,other,Front Collision,256,2000,Major Damage,50720,6000000,1,1,2,Single Vehicle Collision,Police,NO,1.0,1.0
4,other,other,228,1000,Minor Damage,4550,6000000,1,0,1,Vehicle Theft,,NO,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,other,Front Collision,3,1000,Minor Damage,61040,0,1,0,1,Single Vehicle Collision,Fire,other,0.0,0.0
996,other,Rear Collision,285,1000,Major Damage,72320,0,1,2,3,Single Vehicle Collision,Fire,other,,
997,other,Side Collision,130,500,Minor Damage,52500,3000000,3,2,3,Multi-vehicle Collision,Police,YES,0.0,0.0
998,other,Rear Collision,458,2000,Major Damage,36540,5000000,1,0,1,Single Vehicle Collision,Other,YES,,


In [41]:
# Dropping all the rows with NaN values.
# So that I can have a cleaned view of my predictions.

data_app_prediction = data_app_req.dropna()
print(data_app_prediction.shape)
data_app_prediction



(250, 15)


Unnamed: 0,insured_hobbies,collision_type_new,months_as_customer,policy_deductable,incident_severity,vehicle_claim,umbrella_limit,number_of_vehicles_involved,bodily_injuries,witnesses,incident_type,authorities_contacted,police_report_available_new,actual,predictions
2,other,Rear Collision,134,2000,Minor Damage,23100,5000000,3,2,3,Multi-vehicle Collision,Police,NO,0.0,0.0
3,other,Front Collision,256,2000,Major Damage,50720,6000000,1,1,2,Single Vehicle Collision,Police,NO,1.0,1.0
6,other,Front Collision,137,1000,Minor Damage,50050,0,3,0,0,Multi-vehicle Collision,Police,other,0.0,0.0
8,other,Front Collision,27,500,Total Loss,22160,0,1,1,1,Single Vehicle Collision,Police,YES,0.0,0.0
12,other,Rear Collision,60,500,Total Loss,42390,3000000,1,1,0,Single Vehicle Collision,Ambulance,NO,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
988,other,Rear Collision,295,500,Minor Damage,42490,0,1,1,1,Single Vehicle Collision,Fire,NO,0.0,0.0
992,other,Front Collision,94,500,Major Damage,25690,0,3,1,2,Multi-vehicle Collision,Fire,YES,0.0,0.0
994,other,other,141,1000,Minor Damage,4860,0,1,1,2,Parked Car,,YES,0.0,0.0
995,other,Front Collision,3,1000,Minor Damage,61040,0,1,0,1,Single Vehicle Collision,Fire,other,0.0,0.0


In [42]:
df = data_app_prediction[data_app_prediction['predictions'] == 1.0]
print(df.shape)
df


(78, 15)


Unnamed: 0,insured_hobbies,collision_type_new,months_as_customer,policy_deductable,incident_severity,vehicle_claim,umbrella_limit,number_of_vehicles_involved,bodily_injuries,witnesses,incident_type,authorities_contacted,police_report_available_new,actual,predictions
3,other,Front Collision,256,2000,Major Damage,50720,6000000,1,1,2,Single Vehicle Collision,Police,NO,1.0,1.0
19,other,Side Collision,196,2000,Major Damage,48320,0,3,2,0,Multi-vehicle Collision,Police,NO,0.0,1.0
35,other,Front Collision,147,1000,Major Damage,37170,6000000,1,2,0,Single Vehicle Collision,Other,YES,1.0,1.0
41,chess,Side Collision,116,500,Major Damage,64720,0,1,1,1,Single Vehicle Collision,Police,other,1.0,1.0
47,other,Front Collision,355,2000,Major Damage,50240,4000000,3,2,1,Multi-vehicle Collision,Fire,NO,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
964,cross-fit,other,163,1000,Minor Damage,2730,4000000,1,2,1,Vehicle Theft,Police,YES,1.0,1.0
967,other,Rear Collision,179,1000,Major Damage,37940,6000000,1,1,2,Single Vehicle Collision,Police,NO,1.0,1.0
968,other,Side Collision,372,2000,Major Damage,36260,0,3,2,2,Multi-vehicle Collision,Police,YES,0.0,1.0
974,other,Side Collision,234,500,Major Damage,75600,0,3,2,1,Multi-vehicle Collision,Police,other,1.0,1.0


# With Grid search CV applied on random Forest model, accuracy increased to 91.2%, hence we will be using random forest clf1 as our final model and deploy it to production to predict Auto insurance fraud claim detection.

In [None]:
# Finally save model using pickle and build app uisng SteamLit

In [37]:
import streamlit as st
import pickle

# save the model
filename = 'Streamlit_Autoinsurancefraud1.sav'
pickle.dump(clf1, open(filename,'wb'))

# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(x_test, y_test)
print(result)


0.9124603174603175


In [None]:
# Now we will use another notebook named 'Predict Auto Insurance Fraud(streamlit)' 
# to build the app to predict the whether a reported claim is fraud or genuine using streamlit and again save the notebook in 
# C folder with name 'streamlitpredictautoinsurancefraud1'