In [1]:
from imblearn.over_sampling import ADASYN, SMOTENC
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import (train_test_split, 
    ShuffleSplit, cross_val_score, GridSearchCV)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import (DecisionTreeClassifier, 
    ExtraTreeClassifier)
from sklearn.ensemble import (RandomForestClassifier, 
    GradientBoostingClassifier)
from sklearn.dummy import DummyClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score)

import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sn

pd.set_option('display.max_columns', None)

%matplotlib inline

In [2]:
data_dir_path = "../data/fraud_oracle.csv"
df = pd.read_csv(data_dir_path, dtype=object)
print(df.shape)
df.head()

(15420, 33)


Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,Age,Fault,PolicyType,VehicleCategory,VehiclePrice,FraudFound_P,PolicyNumber,RepNumber,Deductible,DriverRating,Days_Policy_Accident,Days_Policy_Claim,PastNumberOfClaims,AgeOfVehicle,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange_Claim,NumberOfCars,Year,BasePolicy
0,Dec,5,Wednesday,Honda,Urban,Tuesday,Jan,1,Female,Single,21,Policy Holder,Sport - Liability,Sport,more than 69000,0,1,12,300,1,more than 30,more than 30,none,3 years,26 to 30,No,No,External,none,1 year,3 to 4,1994,Liability
1,Jan,3,Wednesday,Honda,Urban,Monday,Jan,4,Male,Single,34,Policy Holder,Sport - Collision,Sport,more than 69000,0,2,15,400,4,more than 30,more than 30,none,6 years,31 to 35,Yes,No,External,none,no change,1 vehicle,1994,Collision
2,Oct,5,Friday,Honda,Urban,Thursday,Nov,2,Male,Married,47,Policy Holder,Sport - Collision,Sport,more than 69000,0,3,7,400,3,more than 30,more than 30,1,7 years,41 to 50,No,No,External,none,no change,1 vehicle,1994,Collision
3,Jun,2,Saturday,Toyota,Rural,Friday,Jul,1,Male,Married,65,Third Party,Sedan - Liability,Sport,20000 to 29000,0,4,4,400,2,more than 30,more than 30,1,more than 7,51 to 65,Yes,No,External,more than 5,no change,1 vehicle,1994,Liability
4,Jan,5,Monday,Honda,Urban,Tuesday,Feb,2,Female,Single,27,Third Party,Sport - Collision,Sport,more than 69000,0,5,3,400,1,more than 30,more than 30,none,5 years,31 to 35,No,No,External,none,no change,1 vehicle,1994,Collision


----
### Data Preprocessing

In [3]:
#convert the the columns to their proper datatypes
df["Month"] = pd.to_datetime(
    df["Month"], format="%b", errors="coerce")\
    .dt.strftime("%m") + df["Month"]
df["MonthClaimed"] = pd.to_datetime(
    df["MonthClaimed"], format="%b", errors="coerce")\
    .dt.strftime("%m") + df["MonthClaimed"]

numeric_cols = [
    "Age",
    "Deductible",
    "DriverRating",
    "RepNumber",
    "FraudFound_P"
    ]

for col in numeric_cols:
    df[col] = pd.to_numeric(df[col])

print(df.shape)
df.head()

(15420, 33)


Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,Age,Fault,PolicyType,VehicleCategory,VehiclePrice,FraudFound_P,PolicyNumber,RepNumber,Deductible,DriverRating,Days_Policy_Accident,Days_Policy_Claim,PastNumberOfClaims,AgeOfVehicle,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange_Claim,NumberOfCars,Year,BasePolicy
0,12Dec,5,Wednesday,Honda,Urban,Tuesday,01Jan,1,Female,Single,21,Policy Holder,Sport - Liability,Sport,more than 69000,0,1,12,300,1,more than 30,more than 30,none,3 years,26 to 30,No,No,External,none,1 year,3 to 4,1994,Liability
1,01Jan,3,Wednesday,Honda,Urban,Monday,01Jan,4,Male,Single,34,Policy Holder,Sport - Collision,Sport,more than 69000,0,2,15,400,4,more than 30,more than 30,none,6 years,31 to 35,Yes,No,External,none,no change,1 vehicle,1994,Collision
2,10Oct,5,Friday,Honda,Urban,Thursday,11Nov,2,Male,Married,47,Policy Holder,Sport - Collision,Sport,more than 69000,0,3,7,400,3,more than 30,more than 30,1,7 years,41 to 50,No,No,External,none,no change,1 vehicle,1994,Collision
3,06Jun,2,Saturday,Toyota,Rural,Friday,07Jul,1,Male,Married,65,Third Party,Sedan - Liability,Sport,20000 to 29000,0,4,4,400,2,more than 30,more than 30,1,more than 7,51 to 65,Yes,No,External,more than 5,no change,1 vehicle,1994,Liability
4,01Jan,5,Monday,Honda,Urban,Tuesday,02Feb,2,Female,Single,27,Third Party,Sport - Collision,Sport,more than 69000,0,5,3,400,1,more than 30,more than 30,none,5 years,31 to 35,No,No,External,none,no change,1 vehicle,1994,Collision


In [4]:
#set the modelling parameters
fraud_col = "FraudFound_P"

#numerical features
num_feats = [
    'Age',
]
#categorical features
cat_feats = [
    "Month",
    "WeekOfMonth",
    "DayOfWeek", 
    "Make",
    "AccidentArea",
    "DayOfWeekClaimed",
    "MonthClaimed",
    "WeekOfMonthClaimed",
    "Sex",
    'MaritalStatus', 
    'Fault', 
    'PolicyType', 
    'VehicleCategory',
    'VehiclePrice',  
    'RepNumber',
    'DriverRating', 
    'Days_Policy_Accident',
    'Days_Policy_Claim', 
    'PastNumberOfClaims', 
    'AgeOfVehicle',
    'AgeOfPolicyHolder', 
    'PoliceReportFiled', 
    'WitnessPresent', 
    'AgentType',
    'NumberOfSuppliments',
    'AddressChange_Claim', 
    'NumberOfCars', 
    'Year',
    'BasePolicy'
]

In [5]:
#check for null values
rows_with_null = df[cat_feats].isnull().any(axis=1)
print(
    "Number of rows with null features", rows_with_null.sum())
df.loc[rows_with_null]

Number of rows with null features 1


Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,Age,Fault,PolicyType,VehicleCategory,VehiclePrice,FraudFound_P,PolicyNumber,RepNumber,Deductible,DriverRating,Days_Policy_Accident,Days_Policy_Claim,PastNumberOfClaims,AgeOfVehicle,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange_Claim,NumberOfCars,Year,BasePolicy
1516,07Jul,2,Monday,Honda,Rural,0,,1,Male,Single,0,Policy Holder,Sedan - All Perils,Sedan,more than 69000,0,1517,15,400,2,more than 30,none,none,new,16 to 17,No,No,External,none,no change,1 vehicle,1994,All Perils


In [6]:
#drop the rows with null feature values
df = df.dropna().reset_index(drop=True)
print(df.shape)
df.head()

(15419, 33)


Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,Age,Fault,PolicyType,VehicleCategory,VehiclePrice,FraudFound_P,PolicyNumber,RepNumber,Deductible,DriverRating,Days_Policy_Accident,Days_Policy_Claim,PastNumberOfClaims,AgeOfVehicle,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange_Claim,NumberOfCars,Year,BasePolicy
0,12Dec,5,Wednesday,Honda,Urban,Tuesday,01Jan,1,Female,Single,21,Policy Holder,Sport - Liability,Sport,more than 69000,0,1,12,300,1,more than 30,more than 30,none,3 years,26 to 30,No,No,External,none,1 year,3 to 4,1994,Liability
1,01Jan,3,Wednesday,Honda,Urban,Monday,01Jan,4,Male,Single,34,Policy Holder,Sport - Collision,Sport,more than 69000,0,2,15,400,4,more than 30,more than 30,none,6 years,31 to 35,Yes,No,External,none,no change,1 vehicle,1994,Collision
2,10Oct,5,Friday,Honda,Urban,Thursday,11Nov,2,Male,Married,47,Policy Holder,Sport - Collision,Sport,more than 69000,0,3,7,400,3,more than 30,more than 30,1,7 years,41 to 50,No,No,External,none,no change,1 vehicle,1994,Collision
3,06Jun,2,Saturday,Toyota,Rural,Friday,07Jul,1,Male,Married,65,Third Party,Sedan - Liability,Sport,20000 to 29000,0,4,4,400,2,more than 30,more than 30,1,more than 7,51 to 65,Yes,No,External,more than 5,no change,1 vehicle,1994,Liability
4,01Jan,5,Monday,Honda,Urban,Tuesday,02Feb,2,Female,Single,27,Third Party,Sport - Collision,Sport,more than 69000,0,5,3,400,1,more than 30,more than 30,none,5 years,31 to 35,No,No,External,none,no change,1 vehicle,1994,Collision


In [7]:
#mapping of categorical feature to label binarizer
cat_feat_2_lab_bin = {}
prep_df = pd.DataFrame([])

for feat_col in cat_feats:
    lb = LabelBinarizer()
    lb.fit(df[feat_col])
    if len(lb.classes_) > 2:
        columns = [feat_col + "_" + str(col_val) 
                   for col_val in lb.classes_]
    else:
        columns = [feat_col + "_" + str(col_val) 
                   for col_val in lb.classes_[:-1]]
    feat_df = pd.DataFrame(
        lb.transform(df[feat_col]),
        columns=columns)
    prep_df = pd.concat([prep_df, feat_df], axis=1)
    cat_feat_2_lab_bin[feat_col] = lb
    
print(prep_df.shape)
prep_df.head()

(15419, 164)


Unnamed: 0,Month_01Jan,Month_02Feb,Month_03Mar,Month_04Apr,Month_05May,Month_06Jun,Month_07Jul,Month_08Aug,Month_09Sep,Month_10Oct,Month_11Nov,Month_12Dec,WeekOfMonth_1,WeekOfMonth_2,WeekOfMonth_3,WeekOfMonth_4,WeekOfMonth_5,DayOfWeek_Friday,DayOfWeek_Monday,DayOfWeek_Saturday,DayOfWeek_Sunday,DayOfWeek_Thursday,DayOfWeek_Tuesday,DayOfWeek_Wednesday,Make_Accura,Make_BMW,Make_Chevrolet,Make_Dodge,Make_Ferrari,Make_Ford,Make_Honda,Make_Jaguar,Make_Lexus,Make_Mazda,Make_Mecedes,Make_Mercury,Make_Nisson,Make_Pontiac,Make_Porche,Make_Saab,Make_Saturn,Make_Toyota,Make_VW,AccidentArea_Rural,DayOfWeekClaimed_Friday,DayOfWeekClaimed_Monday,DayOfWeekClaimed_Saturday,DayOfWeekClaimed_Sunday,DayOfWeekClaimed_Thursday,DayOfWeekClaimed_Tuesday,DayOfWeekClaimed_Wednesday,MonthClaimed_01Jan,MonthClaimed_02Feb,MonthClaimed_03Mar,MonthClaimed_04Apr,MonthClaimed_05May,MonthClaimed_06Jun,MonthClaimed_07Jul,MonthClaimed_08Aug,MonthClaimed_09Sep,MonthClaimed_10Oct,MonthClaimed_11Nov,MonthClaimed_12Dec,WeekOfMonthClaimed_1,WeekOfMonthClaimed_2,WeekOfMonthClaimed_3,WeekOfMonthClaimed_4,WeekOfMonthClaimed_5,Sex_Female,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,MaritalStatus_Widow,Fault_Policy Holder,PolicyType_Sedan - All Perils,PolicyType_Sedan - Collision,PolicyType_Sedan - Liability,PolicyType_Sport - All Perils,PolicyType_Sport - Collision,PolicyType_Sport - Liability,PolicyType_Utility - All Perils,PolicyType_Utility - Collision,PolicyType_Utility - Liability,VehicleCategory_Sedan,VehicleCategory_Sport,VehicleCategory_Utility,VehiclePrice_20000 to 29000,VehiclePrice_30000 to 39000,VehiclePrice_40000 to 59000,VehiclePrice_60000 to 69000,VehiclePrice_less than 20000,VehiclePrice_more than 69000,RepNumber_1,RepNumber_2,RepNumber_3,RepNumber_4,RepNumber_5,RepNumber_6,RepNumber_7,RepNumber_8,RepNumber_9,RepNumber_10,RepNumber_11,RepNumber_12,RepNumber_13,RepNumber_14,RepNumber_15,RepNumber_16,DriverRating_1,DriverRating_2,DriverRating_3,DriverRating_4,Days_Policy_Accident_1 to 7,Days_Policy_Accident_15 to 30,Days_Policy_Accident_8 to 15,Days_Policy_Accident_more than 30,Days_Policy_Accident_none,Days_Policy_Claim_15 to 30,Days_Policy_Claim_8 to 15,Days_Policy_Claim_more than 30,PastNumberOfClaims_1,PastNumberOfClaims_2 to 4,PastNumberOfClaims_more than 4,PastNumberOfClaims_none,AgeOfVehicle_2 years,AgeOfVehicle_3 years,AgeOfVehicle_4 years,AgeOfVehicle_5 years,AgeOfVehicle_6 years,AgeOfVehicle_7 years,AgeOfVehicle_more than 7,AgeOfVehicle_new,AgeOfPolicyHolder_16 to 17,AgeOfPolicyHolder_18 to 20,AgeOfPolicyHolder_21 to 25,AgeOfPolicyHolder_26 to 30,AgeOfPolicyHolder_31 to 35,AgeOfPolicyHolder_36 to 40,AgeOfPolicyHolder_41 to 50,AgeOfPolicyHolder_51 to 65,AgeOfPolicyHolder_over 65,PoliceReportFiled_No,WitnessPresent_No,AgentType_External,NumberOfSuppliments_1 to 2,NumberOfSuppliments_3 to 5,NumberOfSuppliments_more than 5,NumberOfSuppliments_none,AddressChange_Claim_1 year,AddressChange_Claim_2 to 3 years,AddressChange_Claim_4 to 8 years,AddressChange_Claim_no change,AddressChange_Claim_under 6 months,NumberOfCars_1 vehicle,NumberOfCars_2 vehicles,NumberOfCars_3 to 4,NumberOfCars_5 to 8,NumberOfCars_more than 8,Year_1994,Year_1995,Year_1996,BasePolicy_All Perils,BasePolicy_Collision,BasePolicy_Liability
0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1
1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0
3,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,1
4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0


In [8]:
#append the numerical features and the target variable
prep_df = pd.concat(
    [prep_df, df[num_feats], df[fraud_col]], axis=1)
print(prep_df.shape)
prep_df.head()

(15419, 166)


Unnamed: 0,Month_01Jan,Month_02Feb,Month_03Mar,Month_04Apr,Month_05May,Month_06Jun,Month_07Jul,Month_08Aug,Month_09Sep,Month_10Oct,Month_11Nov,Month_12Dec,WeekOfMonth_1,WeekOfMonth_2,WeekOfMonth_3,WeekOfMonth_4,WeekOfMonth_5,DayOfWeek_Friday,DayOfWeek_Monday,DayOfWeek_Saturday,DayOfWeek_Sunday,DayOfWeek_Thursday,DayOfWeek_Tuesday,DayOfWeek_Wednesday,Make_Accura,Make_BMW,Make_Chevrolet,Make_Dodge,Make_Ferrari,Make_Ford,Make_Honda,Make_Jaguar,Make_Lexus,Make_Mazda,Make_Mecedes,Make_Mercury,Make_Nisson,Make_Pontiac,Make_Porche,Make_Saab,Make_Saturn,Make_Toyota,Make_VW,AccidentArea_Rural,DayOfWeekClaimed_Friday,DayOfWeekClaimed_Monday,DayOfWeekClaimed_Saturday,DayOfWeekClaimed_Sunday,DayOfWeekClaimed_Thursday,DayOfWeekClaimed_Tuesday,DayOfWeekClaimed_Wednesday,MonthClaimed_01Jan,MonthClaimed_02Feb,MonthClaimed_03Mar,MonthClaimed_04Apr,MonthClaimed_05May,MonthClaimed_06Jun,MonthClaimed_07Jul,MonthClaimed_08Aug,MonthClaimed_09Sep,MonthClaimed_10Oct,MonthClaimed_11Nov,MonthClaimed_12Dec,WeekOfMonthClaimed_1,WeekOfMonthClaimed_2,WeekOfMonthClaimed_3,WeekOfMonthClaimed_4,WeekOfMonthClaimed_5,Sex_Female,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,MaritalStatus_Widow,Fault_Policy Holder,PolicyType_Sedan - All Perils,PolicyType_Sedan - Collision,PolicyType_Sedan - Liability,PolicyType_Sport - All Perils,PolicyType_Sport - Collision,PolicyType_Sport - Liability,PolicyType_Utility - All Perils,PolicyType_Utility - Collision,PolicyType_Utility - Liability,VehicleCategory_Sedan,VehicleCategory_Sport,VehicleCategory_Utility,VehiclePrice_20000 to 29000,VehiclePrice_30000 to 39000,VehiclePrice_40000 to 59000,VehiclePrice_60000 to 69000,VehiclePrice_less than 20000,VehiclePrice_more than 69000,RepNumber_1,RepNumber_2,RepNumber_3,RepNumber_4,RepNumber_5,RepNumber_6,RepNumber_7,RepNumber_8,RepNumber_9,RepNumber_10,RepNumber_11,RepNumber_12,RepNumber_13,RepNumber_14,RepNumber_15,RepNumber_16,DriverRating_1,DriverRating_2,DriverRating_3,DriverRating_4,Days_Policy_Accident_1 to 7,Days_Policy_Accident_15 to 30,Days_Policy_Accident_8 to 15,Days_Policy_Accident_more than 30,Days_Policy_Accident_none,Days_Policy_Claim_15 to 30,Days_Policy_Claim_8 to 15,Days_Policy_Claim_more than 30,PastNumberOfClaims_1,PastNumberOfClaims_2 to 4,PastNumberOfClaims_more than 4,PastNumberOfClaims_none,AgeOfVehicle_2 years,AgeOfVehicle_3 years,AgeOfVehicle_4 years,AgeOfVehicle_5 years,AgeOfVehicle_6 years,AgeOfVehicle_7 years,AgeOfVehicle_more than 7,AgeOfVehicle_new,AgeOfPolicyHolder_16 to 17,AgeOfPolicyHolder_18 to 20,AgeOfPolicyHolder_21 to 25,AgeOfPolicyHolder_26 to 30,AgeOfPolicyHolder_31 to 35,AgeOfPolicyHolder_36 to 40,AgeOfPolicyHolder_41 to 50,AgeOfPolicyHolder_51 to 65,AgeOfPolicyHolder_over 65,PoliceReportFiled_No,WitnessPresent_No,AgentType_External,NumberOfSuppliments_1 to 2,NumberOfSuppliments_3 to 5,NumberOfSuppliments_more than 5,NumberOfSuppliments_none,AddressChange_Claim_1 year,AddressChange_Claim_2 to 3 years,AddressChange_Claim_4 to 8 years,AddressChange_Claim_no change,AddressChange_Claim_under 6 months,NumberOfCars_1 vehicle,NumberOfCars_2 vehicles,NumberOfCars_3 to 4,NumberOfCars_5 to 8,NumberOfCars_more than 8,Year_1994,Year_1995,Year_1996,BasePolicy_All Perils,BasePolicy_Collision,BasePolicy_Liability,Age,FraudFound_P
0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,21,0
1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0,34,0
2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0,47,0
3,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,1,65,0
4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0,27,0


In [9]:
#show the distribution of frauds and non-frauds
pd.concat([
    df[fraud_col].value_counts().rename("fraud_counts"),
    df[fraud_col].value_counts(
        normalize=True).rename("fraud_counts_prcnt")],
    axis=1)

Unnamed: 0,fraud_counts,fraud_counts_prcnt
0,14496,0.940139
1,923,0.059861


----
### Train and Test Split

In [10]:
train_ind, test_ind = train_test_split(
    prep_df.index, test_size=0.2, 
    random_state=0, stratify=prep_df[fraud_col])

train_df = prep_df.loc[train_ind]
test_df = prep_df.loc[test_ind]

print("Train Set", train_df.shape)
print("Test Set", test_df.shape)

Train Set (12335, 166)
Test Set (3084, 166)


In [11]:
#Training Set
pd.concat([
    train_df[fraud_col].value_counts().rename("counts"),
    train_df[fraud_col].value_counts(
        normalize=True).rename("counts_prcnt")
], axis=1)


Unnamed: 0,counts,counts_prcnt
0,11597,0.94017
1,738,0.05983


In [12]:
#Test Set
pd.concat([
    test_df[fraud_col].value_counts().rename("counts"),
    test_df[fraud_col].value_counts(
        normalize=True).rename("counts_prcnt")
], axis=1)

Unnamed: 0,counts,counts_prcnt
0,2899,0.940013
1,185,0.059987


----
### Apply oversampling to training data to address imbalanced data via SMOTE

In [13]:
def oversample(train_df, target_var, numerical_features, method="SMOTENC"):
    features = train_df.columns.difference([target_var])
    y = train_df[target_var].values
    X = train_df[features].values
    
    cat_feats_indices = np.where(~features.isin(num_feats))[0]
    
    if method == "SMOTENC":
        oversampler = SMOTENC(
            categorical_features=cat_feats_indices, random_state=0)
    elif method == "ADASYN":
        oversampler = ADASYN()
    else:
        raise ValueError("Parameter `method` not supported.")
    X_new, y_new = oversampler.fit_resample(X, y)
    train_df_new = pd.DataFrame(
        np.concatenate([X_new, y_new[:, None]], axis=1), 
        columns=np.concatenate([features, [target_var]]))
    return train_df_new

In [15]:
train_df = oversample(train_df, fraud_col, num_feats)
train_df.shape

(23194, 166)

In [16]:
pd.concat([
    train_df[fraud_col].value_counts().rename("counts"),
    train_df[fraud_col].value_counts(
        normalize=True).rename("counts_prcnt")
], axis=1)

Unnamed: 0,counts,counts_prcnt
1.0,11597,0.5
0.0,11597,0.5


----
### Model Training with Hyperparameter Tuning 

In [18]:
model_name_2_model = {
    DecisionTreeClassifier().__class__.__name__: DecisionTreeClassifier,
    DummyClassifier().__class__.__name__: DummyClassifier,
    ExtraTreeClassifier().__class__.__name__: ExtraTreeClassifier,
    GradientBoostingClassifier().__class__.__name__: GradientBoostingClassifier,
    LogisticRegression().__class__.__name__: LogisticRegression,
    RandomForestClassifier().__class__.__name__: RandomForestClassifier
}

model_name_2_params = {
    DecisionTreeClassifier().__class__.__name__:{
        'max_depth': [2, 3, 5, 10, 20],
        'min_samples_leaf': [5, 10, 20, 50, 100],
        'criterion': ["gini", "entropy"],
        'random_state': [0]
    },
    DummyClassifier().__class__.__name__: {
        'strategy': ['stratified'],
        'random_state': [0]
    },
    ExtraTreeClassifier().__class__.__name__:{
        'max_depth': [10, 50, 100, None],
        'max_features': ['auto', 'sqrt'],
        'min_samples_leaf': [2, 4],
        'min_samples_split': [5, 10],
        'random_state': [0]
    },
    GradientBoostingClassifier().__class__.__name__:{
        "max_depth": [10, 50, 100, None],
        "max_features": ['auto', 'sqrt', None], 
        "min_samples_leaf": [2, 4],
        "min_samples_split": [5, 10],
        "n_estimators": [100, 500],
        "random_state": [0]
    },
    LogisticRegression().__class__.__name__:{
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
        'penalty': ['none', 'l1', 'l2', 'elasticnet'],
        'C': [100, 10, 1.0, 0.1, 0.01]
    },
    RandomForestClassifier().__class__.__name__: {
        'bootstrap': [True, False],
        'max_depth': [50, 100, None],
        'max_features': ['auto'],
        'min_samples_leaf': [2, 4],
        'min_samples_split': [5, 10],
        'n_estimators': [400, 600],
        'random_state': [0]
    }
}

features = train_df.columns.difference([fraud_col])
target_var = fraud_col

test_size = 0.2
random_state = 0
cv_num_folds = 5

metric_2_score_function = {
    "accuracy": accuracy_score,
    "precision": precision_score,
    "recall": recall_score,
    "f1": f1_score}

basis_metric = "f1"

model_dir = "../models/03_model_training_with_smote/"

In [20]:
#some helper functions for the training the model and handling hyperparameters
def compute_metric_scores(X, y, model, metrics, cv=None):
    if cv is not None:
        metric_2_cv_scores = {}
        cv = ShuffleSplit(
            n_splits=cv_num_folds, test_size=test_size, 
            random_state=random_state)
        for metric in metrics:
            scores = cross_val_score(
                model, X, y, scoring=metric, cv=cv)
            metric_2_cv_scores[metric] = scores

        metrics_df = pd.DataFrame(metric_2_cv_scores).T
        metrics_df["mean_score"] = metrics_df.mean(axis=1)
        return metrics_df
    else:
        y_preds = model.predict(X)
        metric_2_score = {}
        for metric in metrics:
            score_func = metric_2_score_function[metric]
            metric_2_score[metric] = score_func(y, y_preds)
        metrics_df = pd.Series(metric_2_score).to_frame()
        return metrics_df
        

def run_model_hyperparameter_tuning(X, y, model, params, 
    basis_metric, cv):
    search_cv = GridSearchCV(model(), param_grid=params, cv=cv,
        scoring=basis_metric)
    search_cv.fit(X, y) 
    return search_cv  

----

In [21]:
%%time
#baseline score from DummyClassifier baseline

model_name = DummyClassifier().__class__.__name__
model = model_name_2_model[model_name]
params = model_name_2_params[model_name]
metrics = list(metric_2_score_function.keys())

cv = ShuffleSplit(
    n_splits=cv_num_folds, test_size=test_size, 
    random_state=random_state)

search_cv = run_model_hyperparameter_tuning(train_df[features],
    train_df[target_var], model, params, basis_metric, cv)

best_model = search_cv.best_estimator_
train_metrics_df = compute_metric_scores(
    train_df[features], train_df[target_var], best_model, 
    metrics, cv)

#store the best model
model_name = best_model.__class__.__name__ + ".pkl"
model_path = os.path.join(model_dir, model_name)
pd.to_pickle(best_model, model_path)

train_metrics_df 

CPU times: user 267 ms, sys: 5.95 ms, total: 273 ms
Wall time: 273 ms


Unnamed: 0,0,1,2,3,4,mean_score
accuracy,0.490623,0.50291,0.495365,0.503988,0.499461,0.498469
precision,0.477053,0.499363,0.499781,0.502986,0.492791,0.494395
recall,0.502662,0.510204,0.488034,0.509287,0.507867,0.503611
f1,0.489523,0.504725,0.493838,0.506117,0.500215,0.498884


In [22]:
best_model

DummyClassifier(random_state=0, strategy='stratified')

In [23]:
#performance on the test set
compute_metric_scores(
    test_df[features], test_df[target_var], best_model, 
    metrics)

Unnamed: 0,0
accuracy,0.498054
precision,0.057755
recall,0.481081
f1,0.103129


In [24]:
%%time
#hyperparameter tuning for DecisionTreeClassifier

model_name = DecisionTreeClassifier().__class__.__name__
model = model_name_2_model[model_name]
params = model_name_2_params[model_name]
metrics = list(metric_2_score_function.keys())

cv = ShuffleSplit(
    n_splits=cv_num_folds, test_size=test_size, 
    random_state=random_state)

search_cv = run_model_hyperparameter_tuning(train_df[features],
    train_df[target_var], model, params, basis_metric, cv)

best_model = search_cv.best_estimator_
train_metrics_df = compute_metric_scores(
    train_df[features], train_df[target_var], best_model, 
    metrics, cv)

#store the best model
model_name = best_model.__class__.__name__ + ".pkl"
model_path = os.path.join(model_dir, model_name)
pd.to_pickle(best_model, model_path)

train_metrics_df 

CPU times: user 23.2 s, sys: 55 ms, total: 23.2 s
Wall time: 23.2 s


Unnamed: 0,0,1,2,3,4,mean_score
accuracy,0.935762,0.91787,0.931235,0.928864,0.924984,0.927743
precision,0.914758,0.894823,0.91499,0.9186,0.908593,0.910353
recall,0.956965,0.945723,0.952137,0.940821,0.942745,0.947678
f1,0.935386,0.919569,0.933194,0.929577,0.925354,0.928616


In [25]:
best_model

DecisionTreeClassifier(criterion='entropy', max_depth=20, min_samples_leaf=10,
                       random_state=0)

In [26]:
# performance on the test set
compute_metric_scores(
    test_df[features], test_df[target_var], best_model, 
    metrics)

Unnamed: 0,0
accuracy,0.878405
precision,0.191558
recall,0.318919
f1,0.239351


In [27]:
%%time
model_name = RandomForestClassifier().__class__.__name__
model = model_name_2_model[model_name]
params = model_name_2_params[model_name]
metrics = list(metric_2_score_function.keys())

cv = ShuffleSplit(
    n_splits=cv_num_folds, test_size=test_size, 
    random_state=random_state)

search_cv = run_model_hyperparameter_tuning(train_df[features],
    train_df[target_var], model, params, basis_metric, cv)

best_model = search_cv.best_estimator_
train_metrics_df = compute_metric_scores(
    train_df[features], train_df[target_var], best_model, 
    metrics, cv)

#store the best model
model_name = best_model.__class__.__name__ + ".pkl"
model_path = os.path.join(model_dir, model_name)
pd.to_pickle(best_model, model_path)

#store the best model
model_name = best_model.__class__.__name__ + ".pkl"
model_path = os.path.join(model_dir, model_name)
pd.to_pickle(best_model, model_path)

train_metrics_df 

CPU times: user 41min 3s, sys: 5.26 s, total: 41min 8s
Wall time: 41min 9s


Unnamed: 0,0,1,2,3,4,mean_score
accuracy,0.97133,0.970468,0.97133,0.967665,0.968528,0.969864
precision,0.999059,0.999539,0.999547,0.999539,1.0,0.999537
recall,0.941881,0.940947,0.94359,0.935637,0.936189,0.939649
f1,0.969628,0.969358,0.970763,0.966533,0.967043,0.968665


In [28]:
best_model

RandomForestClassifier(bootstrap=False, max_depth=100, min_samples_leaf=4,
                       min_samples_split=5, n_estimators=400, random_state=0)

In [29]:
# performance on the test set
compute_metric_scores(
    test_df[features], test_df[target_var], best_model, 
    metrics)

Unnamed: 0,0
accuracy,0.939364
precision,0.25
recall,0.005405
f1,0.010582


----

In [30]:
%%time
model_name = ExtraTreeClassifier().__class__.__name__
model = model_name_2_model[model_name]
params = model_name_2_params[model_name]
metrics = list(metric_2_score_function.keys())

cv = ShuffleSplit(
    n_splits=cv_num_folds, test_size=test_size, 
    random_state=random_state)

search_cv = run_model_hyperparameter_tuning(train_df[features],
    train_df[target_var], model, params, basis_metric, cv)

best_model = search_cv.best_estimator_
train_metrics_df = compute_metric_scores(
    train_df[features], train_df[target_var], best_model, 
    metrics, cv)

#store the best model
model_name = best_model.__class__.__name__ + ".pkl"
model_path = os.path.join(model_dir, model_name)
pd.to_pickle(best_model, model_path)

#store the best model
model_name = best_model.__class__.__name__ + ".pkl"
model_path = os.path.join(model_dir, model_name)
pd.to_pickle(best_model, model_path)

train_metrics_df 

CPU times: user 5.49 s, sys: 17.6 ms, total: 5.51 s
Wall time: 5.51 s


Unnamed: 0,0,1,2,3,4,mean_score
accuracy,0.950851,0.955809,0.951498,0.945247,0.943307,0.949343
precision,0.94823,0.967469,0.952503,0.945525,0.94272,0.951289
recall,0.950754,0.942683,0.951282,0.944708,0.942308,0.946347
f1,0.94949,0.954915,0.951892,0.945117,0.942514,0.948786


In [31]:
best_model

ExtraTreeClassifier(max_depth=100, min_samples_leaf=2, min_samples_split=5,
                    random_state=0)

In [32]:
# performance on the test set
compute_metric_scores(
    test_df[features], test_df[target_var], best_model, 
    metrics)

Unnamed: 0,0
accuracy,0.901102
precision,0.11039
recall,0.091892
f1,0.100295


----

In [None]:
%%time
model_name = GradientBoostingClassifier().__class__.__name__
model = model_name_2_model[model_name]
params = model_name_2_params[model_name]
metrics = list(metric_2_score_function.keys())

cv = ShuffleSplit(
    n_splits=cv_num_folds, test_size=test_size, 
    random_state=random_state)

search_cv = run_model_hyperparameter_tuning(train_df[features],
    train_df[target_var], model, params, basis_metric, cv)

best_model = search_cv.best_estimator_
train_metrics_df = compute_metric_scores(
    train_df[features], train_df[target_var], best_model, 
    metrics, cv)

#store the best model
model_name = best_model.__class__.__name__ + ".pkl"
model_path = os.path.join(model_dir, model_name)
pd.to_pickle(best_model, model_path)

#store the best model
model_name = best_model.__class__.__name__ + ".pkl"
model_path = os.path.join(model_dir, model_name)
pd.to_pickle(best_model, model_path)

train_metrics_df 

In [None]:
best_model

In [None]:
# performance on the test set
compute_metric_scores(
    test_df[features], test_df[target_var], best_model, 
    metrics)