In [57]:
# Import the data
import pandas as pd
import numpy as np
from sklearn_pandas import DataFrameMapper
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.svm import SVC 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from imblearn.over_sampling import SMOTE


# from ml_utils import train_test_split_marketing,\
#     fill_missing,\
#     build_encoders,\
#     encode_categorical,\
#     build_target_encoder,\
#     encode_target

df = pd.read_csv('vehicle_insurance_claim_fraud.csv').rename(columns={'FraudFound_P':'y'})
display(df.head())
display(df.info())

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,...,AgeOfVehicle,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange_Claim,NumberOfCars,Year,BasePolicy
0,Dec,5,Wednesday,Honda,Urban,Tuesday,Jan,1,Female,Single,...,3 years,26 to 30,No,No,External,none,1 year,3 to 4,1994,Liability
1,Jan,3,Wednesday,Honda,Urban,Monday,Jan,4,Male,Single,...,6 years,31 to 35,Yes,No,External,none,no change,1 vehicle,1994,Collision
2,Oct,5,Friday,Honda,Urban,Thursday,Nov,2,Male,Married,...,7 years,41 to 50,No,No,External,none,no change,1 vehicle,1994,Collision
3,Jun,2,Saturday,Toyota,Rural,Friday,Jul,1,Male,Married,...,more than 7,51 to 65,Yes,No,External,more than 5,no change,1 vehicle,1994,Liability
4,Jan,5,Monday,Honda,Urban,Tuesday,Feb,2,Female,Single,...,5 years,31 to 35,No,No,External,none,no change,1 vehicle,1994,Collision


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15420 entries, 0 to 15419
Data columns (total 33 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Month                 15420 non-null  object
 1   WeekOfMonth           15420 non-null  int64 
 2   DayOfWeek             15420 non-null  object
 3   Make                  15420 non-null  object
 4   AccidentArea          15420 non-null  object
 5   DayOfWeekClaimed      15420 non-null  object
 6   MonthClaimed          15420 non-null  object
 7   WeekOfMonthClaimed    15420 non-null  int64 
 8   Sex                   15420 non-null  object
 9   MaritalStatus         15420 non-null  object
 10  Age                   15420 non-null  int64 
 11  Fault                 15420 non-null  object
 12  PolicyType            15420 non-null  object
 13  VehicleCategory       15420 non-null  object
 14  VehiclePrice          15420 non-null  object
 15  y                     15420 non-null

None

In [58]:
# Split the data into training and testing sets
X = df.drop(columns='y')
y = df['y'].values.reshape(-1,1)

# Show the X_train dataset
X_train, X_test, y_train, y_test = train_test_split(X,y)
display(X_train.describe())

# How balanced is the data?
values, counts = np.unique(y, return_counts=True)
print(f"There are {counts[0]} non-fraudulent rows, and {counts[1]} fraud rows.")


Unnamed: 0,WeekOfMonth,WeekOfMonthClaimed,Age,PolicyNumber,RepNumber,Deductible,DriverRating,Year
count,11565.0,11565.0,11565.0,11565.0,11565.0,11565.0,11565.0,11565.0
mean,2.778988,2.692002,39.788067,7694.789537,8.483441,407.548638,2.490791,1994.863986
std,1.293864,1.262054,13.45753,4432.352565,4.597629,43.289789,1.119474,0.800437
min,1.0,1.0,0.0,1.0,1.0,300.0,1.0,1994.0
25%,2.0,2.0,31.0,3863.0,5.0,400.0,1.0,1994.0
50%,3.0,3.0,38.0,7698.0,9.0,400.0,2.0,1995.0
75%,4.0,4.0,48.0,11503.0,12.0,400.0,3.0,1996.0
max,5.0,5.0,80.0,15420.0,16.0,700.0,4.0,1996.0


There are 14497 non-fraudulent rows, and 923 fraud rows.


In [59]:
# Test for null values, should be 15420 rows
X.dropna().count().head(1)


Month    15420
dtype: int64

In [60]:
# Encode the object columns, scale the numeric columns

#  0   Month                 15420 non-null  object o
#  1   WeekOfMonth           15420 non-null  int64  s
#  2   DayOfWeek             15420 non-null  object o
#  3   Make                  15420 non-null  object h
#  4   AccidentArea          15420 non-null  object h
#  5   DayOfWeekClaimed      15420 non-null  object o
#  6   MonthClaimed          15420 non-null  object o
#  7   WeekOfMonthClaimed    15420 non-null  int64  s
#  8   Sex                   15420 non-null  object h
#  9   MaritalStatus         15420 non-null  object h
#  10  Age                   15420 non-null  int64  s
#  11  Fault                 15420 non-null  object h
#  12  PolicyType            15420 non-null  object h
#  13  VehicleCategory       15420 non-null  object h
#  14  VehiclePrice          15420 non-null  object o
#  15  y                     15420 non-null  int64  ignore
#  16  PolicyNumber          15420 non-null  int64  delete
#  17  RepNumber             15420 non-null  int64  h
#  18  Deductible            15420 non-null  int64  s
#  19  DriverRating          15420 non-null  int64  s
#  20  Days_Policy_Accident  15420 non-null  object h
#  21  Days_Policy_Claim     15420 non-null  object h
#  22  PastNumberOfClaims    15420 non-null  object h
#  23  AgeOfVehicle          15420 non-null  object o
#  24  AgeOfPolicyHolder     15420 non-null  object o
#  25  PoliceReportFiled     15420 non-null  object h
#  26  WitnessPresent        15420 non-null  object h
#  27  AgentType             15420 non-null  object h
#  28  NumberOfSuppliments   15420 non-null  object o
#  29  AddressChange_Claim   15420 non-null  object o
#  30  NumberOfCars          15420 non-null  object o
#  31  Year                  15420 non-null  int64 s
#  32  BasePolicy            15420 non-null  object h

# Define the mapper
mapper = DataFrameMapper([
    (['WeekOfMonth', 'Deductible','DriverRating','Age','Year',
      'WeekOfMonthClaimed'], StandardScaler()),  # Scale numerical data
    (['Make', 'AccidentArea','Sex','MaritalStatus','Fault','PolicyType',
      'VehicleCategory','RepNumber','Days_Policy_Accident','Days_Policy_Claim',
      'PastNumberOfClaims','PoliceReportFiled','WitnessPresent','AgentType',
      'BasePolicy'], OneHotEncoder(handle_unknown='ignore', sparse_output=False)),  # One-hot encode categorical data
    (['Month','DayOfWeek','DayOfWeekClaimed','MonthClaimed','VehiclePrice',
      'AgeOfVehicle','AgeOfPolicyHolder','NumberOfSuppliments',
      'AddressChange_Claim','NumberOfCars'], OrdinalEncoder()),
    (['PolicyNumber'], None)
])

# Fit and transform the data
X_train_encoded = mapper.fit_transform(X_train)
X_test_encoded = mapper.transform(X_test)
y_train_flattened = np.ravel(y_train)
y_test_flattened = np.ravel(y_test)
print(X_train_encoded)


[[ 1.70822674e-01 -1.74382103e-01 -1.33174611e+00 ...  3.00000000e+00
   0.00000000e+00  1.52490000e+04]
 [ 9.43735123e-01 -1.74382103e-01  1.34819886e+00 ...  3.00000000e+00
   0.00000000e+00  8.91000000e+02]
 [-1.37500222e+00 -1.74382103e-01  4.54883871e-01 ...  3.00000000e+00
   0.00000000e+00  1.08170000e+04]
 ...
 [ 1.70822674e-01 -1.74382103e-01 -4.38431118e-01 ...  3.00000000e+00
   0.00000000e+00  1.01010000e+04]
 [ 9.43735123e-01 -1.74382103e-01 -1.33174611e+00 ...  3.00000000e+00
   0.00000000e+00  7.71900000e+03]
 [ 1.70822674e-01 -1.74382103e-01 -1.33174611e+00 ...  3.00000000e+00
   0.00000000e+00  7.13200000e+03]]


In [61]:
# Balance the data by duplicating fraudulent rows.
# Apply SMOTE to the training data
smote = SMOTE()
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_encoded, y_train_flattened)

# How balanced is the data?
values, counts = np.unique(y_train_balanced, return_counts=True)
print(f"There are {counts[0]} non-fraudulent rows, and {counts[1]} fraud rows.")


There are 10851 non-fraudulent rows, and 10851 fraud rows.


In [62]:

Models = [SVC, KNeighborsClassifier,
            DecisionTreeClassifier,
            RandomForestClassifier,
            ExtraTreesClassifier,
            GradientBoostingClassifier,
            AdaBoostClassifier
        ]

for Model in Models :
    model = Model()
    print(str(Model.__name__))
    model.fit(X_train_balanced, y_train_balanced)
    y_pred = model.predict(X_test_encoded)
    print("=============================")
    print( "balanced_accuracy: ", balanced_accuracy_score(y_test_flattened, y_pred))
    print( "train_accuracy:", model.score(X_train_balanced, y_train_balanced))
    print( "test_accuracy:", model.score(X_test_encoded, y_test_flattened))
    print()


SVC
balanced_accuracy:  0.5255900810221334
train_accuracy: 0.515758916228919
test_accuracy: 0.614526588845655

KNeighborsClassifier
balanced_accuracy:  0.6500674528289507
train_accuracy: 0.8644825361717814
test_accuracy: 0.670817120622568

DecisionTreeClassifier
balanced_accuracy:  0.562171429921235
train_accuracy: 1.0
test_accuracy: 0.8970168612191959

RandomForestClassifier
balanced_accuracy:  0.5045104158191319
train_accuracy: 1.0
test_accuracy: 0.9457846952010376

ExtraTreesClassifier
balanced_accuracy:  0.5040990060550069
train_accuracy: 1.0
test_accuracy: 0.9450064850843061

GradientBoostingClassifier
balanced_accuracy:  0.5
train_accuracy: 0.9670998064694498
test_accuracy: 0.9457846952010376

AdaBoostClassifier




balanced_accuracy:  0.5167162283107659
train_accuracy: 0.955211501244125
test_accuracy: 0.9304798962386511

