In [1]:
# Import the data
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.svm import SVC 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from imblearn.under_sampling import CondensedNearestNeighbour
from imblearn.combine import SMOTETomek
from sklearn.metrics import classification_report

df = pd.read_csv('vehicle_insurance_claim_fraud.csv').rename(columns={'FraudFound_P':'y'})
display(df.head())
display(df.info())

df_columns = df.columns

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,...,AgeOfVehicle,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange_Claim,NumberOfCars,Year,BasePolicy
0,Dec,5,Wednesday,Honda,Urban,Tuesday,Jan,1,Female,Single,...,3 years,26 to 30,No,No,External,none,1 year,3 to 4,1994,Liability
1,Jan,3,Wednesday,Honda,Urban,Monday,Jan,4,Male,Single,...,6 years,31 to 35,Yes,No,External,none,no change,1 vehicle,1994,Collision
2,Oct,5,Friday,Honda,Urban,Thursday,Nov,2,Male,Married,...,7 years,41 to 50,No,No,External,none,no change,1 vehicle,1994,Collision
3,Jun,2,Saturday,Toyota,Rural,Friday,Jul,1,Male,Married,...,more than 7,51 to 65,Yes,No,External,more than 5,no change,1 vehicle,1994,Liability
4,Jan,5,Monday,Honda,Urban,Tuesday,Feb,2,Female,Single,...,5 years,31 to 35,No,No,External,none,no change,1 vehicle,1994,Collision


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15420 entries, 0 to 15419
Data columns (total 33 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Month                 15420 non-null  object
 1   WeekOfMonth           15420 non-null  int64 
 2   DayOfWeek             15420 non-null  object
 3   Make                  15420 non-null  object
 4   AccidentArea          15420 non-null  object
 5   DayOfWeekClaimed      15420 non-null  object
 6   MonthClaimed          15420 non-null  object
 7   WeekOfMonthClaimed    15420 non-null  int64 
 8   Sex                   15420 non-null  object
 9   MaritalStatus         15420 non-null  object
 10  Age                   15420 non-null  int64 
 11  Fault                 15420 non-null  object
 12  PolicyType            15420 non-null  object
 13  VehicleCategory       15420 non-null  object
 14  VehiclePrice          15420 non-null  object
 15  y                     15420 non-null

None

In [2]:
# Split the data into training and testing sets
X = df.drop(columns='y')
y = df['y'].values.reshape(-1,1)

# Show the X_train dataset
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=.7)
display(X_train.describe())

Unnamed: 0,WeekOfMonth,WeekOfMonthClaimed,Age,PolicyNumber,RepNumber,Deductible,DriverRating,Year
count,13878.0,13878.0,13878.0,13878.0,13878.0,13878.0,13878.0,13878.0
mean,2.789883,2.691598,39.89278,7711.645266,8.478887,407.774896,2.485084,1994.866407
std,1.288395,1.258424,13.4924,4455.042985,4.593918,44.118755,1.120421,0.804359
min,1.0,1.0,0.0,1.0,1.0,300.0,1.0,1994.0
25%,2.0,2.0,31.0,3856.25,5.0,400.0,1.0,1994.0
50%,3.0,3.0,38.0,7702.5,8.0,400.0,2.0,1995.0
75%,4.0,4.0,48.0,11584.75,12.0,400.0,3.0,1996.0
max,5.0,5.0,80.0,15420.0,16.0,700.0,4.0,1996.0


In [3]:
# How balanced is the data?
values, counts = np.unique(y, return_counts=True)
print(f"There are {counts[0]} non-fraudulent rows, and {counts[1]} fraud rows.")

There are 14497 non-fraudulent rows, and 923 fraud rows.


In [4]:
# Test for null values, should be 15420 rows
X.dropna().count().head(1)

Month    15420
dtype: int64

## Encoding the Data

In [5]:
# Encode the object columns, scale the numeric columns

def encode_feature(dataframe, feature, Model) :
    return pd.Series(np.ravel(Model.fit_transform(dataframe[feature].values.reshape(-1,1))), name=feature)

def encode_multi(dataframe, feature, Model) :
    encoder = Model.fit(dataframe[feature].values.reshape(-1, 1))
    return pd.DataFrame(encoder.transform(dataframe[feature].values.reshape(-1, 1)), columns=OneHotColumnNames(feature, encoder))

def OneHotColumnNames(feature, column) : 
    column_names = column.get_feature_names_out()
    output = []
    for column_name in column_names : 
        output.append(feature+" "+column_name)
    return output

In [6]:
def encode_fraud_data( unencoded_dataframe ) :

    ORD = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    SSC = StandardScaler()
    OHE = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

    output = pd.DataFrame()

    output = pd.concat([output, encode_feature(unencoded_dataframe, "Month", ORD)], axis="columns")
    output = pd.concat([output, encode_feature(unencoded_dataframe, "WeekOfMonth", SSC)], axis="columns")
    output = pd.concat([output, encode_feature(unencoded_dataframe, "DayOfWeek", ORD)], axis="columns")
    output = pd.concat([output, encode_multi(unencoded_dataframe, "Make", OHE)], axis="columns")
    output = pd.concat([output, encode_multi(unencoded_dataframe, "AccidentArea", OHE)], axis="columns")
    #output = pd.concat([output, encode_feature(unencoded_dataframe, "DayOfWeekClaimed", ORD)], axis="columns")
    output = pd.concat([output, encode_feature(unencoded_dataframe, "MonthClaimed", ORD)], axis="columns")
    #output = pd.concat([output, encode_feature(unencoded_dataframe, "WeekOfMonthClaimed", SSC)], axis="columns")
    output = pd.concat([output, encode_multi(unencoded_dataframe, "Sex", OHE)], axis="columns")
    #output = pd.concat([output, encode_multi(unencoded_dataframe, "MaritalStatus", OHE)], axis="columns")
    output = pd.concat([output, encode_feature(unencoded_dataframe, "Age", SSC)], axis="columns")
    output = pd.concat([output, encode_multi(unencoded_dataframe, "Fault", OHE)], axis="columns")
    output = pd.concat([output, encode_multi(unencoded_dataframe, "PolicyType", OHE)], axis="columns")
    output = pd.concat([output, encode_multi(unencoded_dataframe, "VehicleCategory", OHE)], axis="columns")
    output = pd.concat([output, encode_feature(unencoded_dataframe, "VehiclePrice", ORD)], axis="columns")
    #output = pd.concat([output, encode_multi(unencoded_dataframe, "RepNumber", OHE)], axis="columns")
    output = pd.concat([output, encode_feature(unencoded_dataframe, "Deductible", SSC)], axis="columns")
    #output = pd.concat([output, encode_feature(unencoded_dataframe, "DriverRating", SSC)], axis="columns")
    output = pd.concat([output, encode_multi(unencoded_dataframe, "Days_Policy_Accident", OHE)], axis="columns")
    output = pd.concat([output, encode_multi(unencoded_dataframe, "Days_Policy_Claim", OHE)], axis="columns")
    output = pd.concat([output, encode_multi(unencoded_dataframe, "PastNumberOfClaims", OHE)], axis="columns")
    output = pd.concat([output, encode_feature(unencoded_dataframe, "AgeOfVehicle", ORD)], axis="columns")
    output = pd.concat([output, encode_feature(unencoded_dataframe, "AgeOfPolicyHolder", ORD)], axis="columns")
    output = pd.concat([output, encode_multi(unencoded_dataframe, "PoliceReportFiled", OHE)], axis="columns")
    output = pd.concat([output, encode_multi(unencoded_dataframe, "WitnessPresent", OHE)], axis="columns")
    output = pd.concat([output, encode_multi(unencoded_dataframe, "AgentType", OHE)], axis="columns")
    output = pd.concat([output, encode_feature(unencoded_dataframe, "NumberOfSuppliments", ORD)], axis="columns")
    output = pd.concat([output, encode_feature(unencoded_dataframe, "AddressChange_Claim", ORD)], axis="columns")
    output = pd.concat([output, encode_feature(unencoded_dataframe, "NumberOfCars", ORD)], axis="columns")
    output = pd.concat([output, encode_feature(unencoded_dataframe, "Year", SSC)], axis="columns")
    output = pd.concat([output, encode_multi(unencoded_dataframe, "BasePolicy", OHE)], axis="columns")

    return output



In [7]:
# Encode the data
X_train_encoded = pd.DataFrame()
X_train_encoded = encode_fraud_data(X_train)
X_test_encoded = encode_fraud_data(X_test)

# Sometimes the train and test columns don't quite match up because of the 
# extra columns generated after a OneHotEncoding, so I have to do this to get 
# them to match.
X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns)
X_test_encoded = X_test_encoded.fillna(0)

# Flatten the y series out.  It's already "encoded" because its just 1 or 0.
y_train_flattened = np.ravel(y_train)
y_test_flattened = np.ravel(y_test)

# Write the encoded data to CSV files for later use
X_train_encoded.to_csv("encoded_training_data.csv", index=False, header=False)
X_test_encoded.to_csv("encoded_test_data.csv", index=False, header=False)
pd.DataFrame(y_train).to_csv("encoded_training_fraud_scoring.csv", index=False, header=False)
pd.DataFrame(y_test).to_csv("encoded_test_fraud_scoring.csv", index=False, header=False)






## Examining Correlation to Determine Valuable Features

In [8]:
# Compute the correlation matrix
corr_matrix = encode_fraud_data(df).corrwith(df["y"]).abs().sort_values(ascending=True)
corr_matrix.head(20)

Days_Policy_Accident x0_15 to 30     0.000325
Make x0_Toyota                       0.000554
Make x0_BMW                          0.000896
Days_Policy_Accident x0_1 to 7       0.001470
PolicyType x0_Sport - Liability      0.002032
Days_Policy_Claim x0_none            0.002032
Make x0_Lexus                        0.002032
Make x0_Ferrari                      0.002874
Make x0_Mercury                      0.003855
Make x0_Porche                       0.004544
Make x0_Nisson                       0.004937
Make x0_Jaguar                       0.004978
PastNumberOfClaims x0_1              0.005267
Make x0_Chevrolet                    0.005807
PolicyType x0_Utility - Collision    0.007471
Days_Policy_Accident x0_8 to 15      0.007832
Make x0_Honda                        0.008040
WitnessPresent x0_No                 0.008057
WitnessPresent x0_Yes                0.008057
Month                                0.008670
dtype: float64

## Balancing the Datasets

In [9]:
# Now we need to balance the datasets

# Oversampling: Apply SMOTE to the training data to expand the fraud cases
smote = SMOTE()
X_train_smote, y_train_smote = smote.fit_resample(X_train_encoded, y_train_flattened)

# Undersampling: Condensed Nearest Neighbor (CNN) uses a K-nearest neighbors (KNN) 
# approach to retain samples that are necessary for the classification decision, 
# and removes samples that are correctly classified by their K-nearest neighbors.
CNN = CondensedNearestNeighbour()
X_train_CNN, y_train_CNN = CNN.fit_resample(X_train_encoded, y_train_flattened)

# Hybrid Sampling: SMOTETomek - the combination of both methods above
smote_tomek = SMOTETomek()
X_train_smote_tomek, y_train_smote_tomek = smote_tomek.fit_resample(X_train_encoded, y_train_flattened)

# How balanced is the data?
values, counts = np.unique(y_train_smote, return_counts=True)
print(f"After using SMOTE balancing, there are {counts[0]} non-fraudulent rows, and {counts[1]} fraud rows.")
values, counts = np.unique(y_train_CNN, return_counts=True)
print(f"After using CNN balancing, there are {counts[0]} non-fraudulent rows, and {counts[1]} fraud rows.")
values, counts = np.unique(y_train_smote_tomek, return_counts=True)
print(f"After using SMOTETomek Hybrid balancing, there are {counts[0]} non-fraudulent rows, and {counts[1]} fraud rows.")


After using SMOTE balancing, there are 13038 non-fraudulent rows, and 13038 fraud rows.
After using CNN balancing, there are 2821 non-fraudulent rows, and 840 fraud rows.
After using SMOTETomek Hybrid balancing, there are 13035 non-fraudulent rows, and 13035 fraud rows.


## Running the unbalanced data through the models for performance reference.

In [10]:

Models = [SVC(kernel='poly'), 
          KNeighborsClassifier(),
          DecisionTreeClassifier(),
          RandomForestClassifier(),
          ExtraTreesClassifier(),
          GradientBoostingClassifier(),
          AdaBoostClassifier()]

for Model in Models :
    
    model_unbalanced = Model.fit(X_train_encoded, y_train_flattened)
    y_pred_unbalanced = model_unbalanced.predict(X_test_encoded)
    print(f"{type(Model)} - No Balancing")
    print("==========================================================")
    print()    
    print(classification_report(y_test_flattened, y_pred_unbalanced, labels=[0,1]))
    print()    



<class 'sklearn.svm._classes.SVC'> - No Balancing

              precision    recall  f1-score   support

           0       0.95      1.00      0.97      1459
           1       0.00      0.00      0.00        83

    accuracy                           0.95      1542
   macro avg       0.47      0.50      0.49      1542
weighted avg       0.90      0.95      0.92      1542


<class 'sklearn.neighbors._classification.KNeighborsClassifier'> - No Balancing

              precision    recall  f1-score   support

           0       0.95      0.99      0.97      1459
           1       0.08      0.01      0.02        83

    accuracy                           0.94      1542
   macro avg       0.51      0.50      0.49      1542
weighted avg       0.90      0.94      0.92      1542


<class 'sklearn.tree._classes.DecisionTreeClassifier'> - No Balancing

              precision    recall  f1-score   support

           0       0.96      0.93      0.94      1459
           1       0.17      0.2

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


<class 'sklearn.ensemble._forest.RandomForestClassifier'> - No Balancing

              precision    recall  f1-score   support

           0       0.95      1.00      0.97      1459
           1       0.00      0.00      0.00        83

    accuracy                           0.95      1542
   macro avg       0.47      0.50      0.49      1542
weighted avg       0.90      0.95      0.92      1542


<class 'sklearn.ensemble._forest.ExtraTreesClassifier'> - No Balancing

              precision    recall  f1-score   support

           0       0.95      1.00      0.97      1459
           1       0.17      0.01      0.02        83

    accuracy                           0.94      1542
   macro avg       0.56      0.50      0.50      1542
weighted avg       0.90      0.94      0.92      1542


<class 'sklearn.ensemble._gb.GradientBoostingClassifier'> - No Balancing

              precision    recall  f1-score   support

           0       0.95      1.00      0.97      1459
           1   



<class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'> - No Balancing

              precision    recall  f1-score   support

           0       0.95      0.99      0.97      1459
           1       0.08      0.01      0.02        83

    accuracy                           0.94      1542
   macro avg       0.51      0.50      0.49      1542
weighted avg       0.90      0.94      0.92      1542




## Testing the models with SMOTE Oversampling

In [11]:

for Model in Models :

    model_smote = Model.fit(X_train_smote, y_train_smote)
    y_pred_smote = model_smote.predict(X_test_encoded)
    print(f"{type(Model)} - SMOTE Oversampling")
    print("==========================================================")
    print()    
    print(classification_report(y_test_flattened, y_pred_smote, labels=[0,1]))
    print()    


<class 'sklearn.svm._classes.SVC'> - SMOTE Oversampling

              precision    recall  f1-score   support

           0       0.99      0.63      0.77      1459
           1       0.12      0.90      0.21        83

    accuracy                           0.64      1542
   macro avg       0.56      0.77      0.49      1542
weighted avg       0.94      0.64      0.74      1542


<class 'sklearn.neighbors._classification.KNeighborsClassifier'> - SMOTE Oversampling

              precision    recall  f1-score   support

           0       0.97      0.71      0.82      1459
           1       0.10      0.57      0.17        83

    accuracy                           0.71      1542
   macro avg       0.53      0.64      0.50      1542
weighted avg       0.92      0.71      0.79      1542


<class 'sklearn.tree._classes.DecisionTreeClassifier'> - SMOTE Oversampling

              precision    recall  f1-score   support

           0       0.96      0.94      0.95      1459
           1  



<class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'> - SMOTE Oversampling

              precision    recall  f1-score   support

           0       0.96      0.85      0.90      1459
           1       0.13      0.39      0.19        83

    accuracy                           0.83      1542
   macro avg       0.54      0.62      0.55      1542
weighted avg       0.92      0.83      0.86      1542




## Testing the models with CNN Links Undersampling.

In [12]:

for Model in Models :

    model_CNN = Model.fit(X_train_CNN, y_train_CNN)
    y_pred_CNN = model_CNN.predict(X_test_encoded)
    print(f"{type(Model)} - CNN Undersampling")
    print("==========================================================")
    print()    
    print(classification_report(y_test_flattened, y_pred_CNN, labels=[0,1]))
    print()    


<class 'sklearn.svm._classes.SVC'> - CNN Undersampling

              precision    recall  f1-score   support

           0       0.95      1.00      0.97      1459
           1       0.00      0.00      0.00        83

    accuracy                           0.95      1542
   macro avg       0.47      0.50      0.49      1542
weighted avg       0.90      0.95      0.92      1542


<class 'sklearn.neighbors._classification.KNeighborsClassifier'> - CNN Undersampling

              precision    recall  f1-score   support

           0       0.95      0.95      0.95      1459
           1       0.13      0.14      0.14        83

    accuracy                           0.90      1542
   macro avg       0.54      0.55      0.54      1542
weighted avg       0.91      0.90      0.91      1542


<class 'sklearn.tree._classes.DecisionTreeClassifier'> - CNN Undersampling

              precision    recall  f1-score   support

           0       0.96      0.86      0.91      1459
           1     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


<class 'sklearn.ensemble._forest.RandomForestClassifier'> - CNN Undersampling

              precision    recall  f1-score   support

           0       0.95      0.99      0.97      1459
           1       0.42      0.13      0.20        83

    accuracy                           0.94      1542
   macro avg       0.69      0.56      0.59      1542
weighted avg       0.92      0.94      0.93      1542


<class 'sklearn.ensemble._forest.ExtraTreesClassifier'> - CNN Undersampling

              precision    recall  f1-score   support

           0       0.95      0.98      0.96      1459
           1       0.21      0.10      0.13        83

    accuracy                           0.93      1542
   macro avg       0.58      0.54      0.55      1542
weighted avg       0.91      0.93      0.92      1542


<class 'sklearn.ensemble._gb.GradientBoostingClassifier'> - CNN Undersampling

              precision    recall  f1-score   support

           0       0.95      0.99      0.97      1459




## Testing the models with SMOTETomek Hybrid Balancing

In [13]:

for Model in Models :

    model_smote_tomek = Model.fit(X_train_smote_tomek, y_train_smote_tomek)
    y_pred_smote_tomek = model_smote_tomek.predict(X_test_encoded)

    print(f"{type(Model)} - SMOTETomek Hybrid Balancing")
    print("==========================================================")
    print()    
    print(classification_report(y_test_flattened, y_pred_smote_tomek, labels=[0,1]))
    print()    


<class 'sklearn.svm._classes.SVC'> - SMOTETomek Hybrid Balancing

              precision    recall  f1-score   support

           0       0.99      0.63      0.77      1459
           1       0.12      0.90      0.22        83

    accuracy                           0.65      1542
   macro avg       0.56      0.77      0.49      1542
weighted avg       0.94      0.65      0.74      1542


<class 'sklearn.neighbors._classification.KNeighborsClassifier'> - SMOTETomek Hybrid Balancing

              precision    recall  f1-score   support

           0       0.97      0.72      0.83      1459
           1       0.11      0.60      0.18        83

    accuracy                           0.71      1542
   macro avg       0.54      0.66      0.50      1542
weighted avg       0.92      0.71      0.79      1542


<class 'sklearn.tree._classes.DecisionTreeClassifier'> - SMOTETomek Hybrid Balancing

              precision    recall  f1-score   support

           0       0.96      0.93      0.



<class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'> - SMOTETomek Hybrid Balancing

              precision    recall  f1-score   support

           0       0.96      0.87      0.91      1459
           1       0.13      0.34      0.18        83

    accuracy                           0.84      1542
   macro avg       0.54      0.60      0.55      1542
weighted avg       0.91      0.84      0.87      1542


