In [1]:
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import (train_test_split, 
    ShuffleSplit, cross_val_score, GridSearchCV)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import (DecisionTreeClassifier, 
    ExtraTreeClassifier)
from sklearn.ensemble import (RandomForestClassifier, 
    GradientBoostingClassifier)
from sklearn.dummy import DummyClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score)

import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sn

pd.set_option('display.max_columns', None)

%matplotlib inline

In [2]:
data_dir_path = "../data/fraud_oracle.csv"
df = pd.read_csv(data_dir_path, dtype=object)
print(df.shape)
df.head()

(15420, 33)


Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,Age,Fault,PolicyType,VehicleCategory,VehiclePrice,FraudFound_P,PolicyNumber,RepNumber,Deductible,DriverRating,Days_Policy_Accident,Days_Policy_Claim,PastNumberOfClaims,AgeOfVehicle,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange_Claim,NumberOfCars,Year,BasePolicy
0,Dec,5,Wednesday,Honda,Urban,Tuesday,Jan,1,Female,Single,21,Policy Holder,Sport - Liability,Sport,more than 69000,0,1,12,300,1,more than 30,more than 30,none,3 years,26 to 30,No,No,External,none,1 year,3 to 4,1994,Liability
1,Jan,3,Wednesday,Honda,Urban,Monday,Jan,4,Male,Single,34,Policy Holder,Sport - Collision,Sport,more than 69000,0,2,15,400,4,more than 30,more than 30,none,6 years,31 to 35,Yes,No,External,none,no change,1 vehicle,1994,Collision
2,Oct,5,Friday,Honda,Urban,Thursday,Nov,2,Male,Married,47,Policy Holder,Sport - Collision,Sport,more than 69000,0,3,7,400,3,more than 30,more than 30,1,7 years,41 to 50,No,No,External,none,no change,1 vehicle,1994,Collision
3,Jun,2,Saturday,Toyota,Rural,Friday,Jul,1,Male,Married,65,Third Party,Sedan - Liability,Sport,20000 to 29000,0,4,4,400,2,more than 30,more than 30,1,more than 7,51 to 65,Yes,No,External,more than 5,no change,1 vehicle,1994,Liability
4,Jan,5,Monday,Honda,Urban,Tuesday,Feb,2,Female,Single,27,Third Party,Sport - Collision,Sport,more than 69000,0,5,3,400,1,more than 30,more than 30,none,5 years,31 to 35,No,No,External,none,no change,1 vehicle,1994,Collision


### Data Preprocessing

In [3]:
#convert the the columns to their proper datatypes
df["Month"] = pd.to_datetime(
    df["Month"], format="%b", errors="coerce")\
    .dt.strftime("%m") + df["Month"]
df["MonthClaimed"] = pd.to_datetime(
    df["MonthClaimed"], format="%b", errors="coerce")\
    .dt.strftime("%m") + df["MonthClaimed"]

numeric_cols = [
    "Age",
    "Deductible",
    "DriverRating",
    "RepNumber",
    "FraudFound_P"
    ]

for col in numeric_cols:
    df[col] = pd.to_numeric(df[col])

print(df.shape)
df.head()

(15420, 33)


Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,Age,Fault,PolicyType,VehicleCategory,VehiclePrice,FraudFound_P,PolicyNumber,RepNumber,Deductible,DriverRating,Days_Policy_Accident,Days_Policy_Claim,PastNumberOfClaims,AgeOfVehicle,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange_Claim,NumberOfCars,Year,BasePolicy
0,12Dec,5,Wednesday,Honda,Urban,Tuesday,01Jan,1,Female,Single,21,Policy Holder,Sport - Liability,Sport,more than 69000,0,1,12,300,1,more than 30,more than 30,none,3 years,26 to 30,No,No,External,none,1 year,3 to 4,1994,Liability
1,01Jan,3,Wednesday,Honda,Urban,Monday,01Jan,4,Male,Single,34,Policy Holder,Sport - Collision,Sport,more than 69000,0,2,15,400,4,more than 30,more than 30,none,6 years,31 to 35,Yes,No,External,none,no change,1 vehicle,1994,Collision
2,10Oct,5,Friday,Honda,Urban,Thursday,11Nov,2,Male,Married,47,Policy Holder,Sport - Collision,Sport,more than 69000,0,3,7,400,3,more than 30,more than 30,1,7 years,41 to 50,No,No,External,none,no change,1 vehicle,1994,Collision
3,06Jun,2,Saturday,Toyota,Rural,Friday,07Jul,1,Male,Married,65,Third Party,Sedan - Liability,Sport,20000 to 29000,0,4,4,400,2,more than 30,more than 30,1,more than 7,51 to 65,Yes,No,External,more than 5,no change,1 vehicle,1994,Liability
4,01Jan,5,Monday,Honda,Urban,Tuesday,02Feb,2,Female,Single,27,Third Party,Sport - Collision,Sport,more than 69000,0,5,3,400,1,more than 30,more than 30,none,5 years,31 to 35,No,No,External,none,no change,1 vehicle,1994,Collision


In [4]:
#set the modelling parameters
fraud_col = "FraudFound_P"

#numerical features
num_feats = [
    'Age',
]
#categorical features
cat_feats = [
    "Month",
    "WeekOfMonth",
    "DayOfWeek", 
    "Make",
    "AccidentArea",
    "DayOfWeekClaimed",
    "MonthClaimed",
    "WeekOfMonthClaimed",
    "Sex",
    'MaritalStatus', 
    'Fault', 
    'PolicyType', 
    'VehicleCategory',
    'VehiclePrice',  
    'RepNumber',
    'DriverRating', 
    'Days_Policy_Accident',
    'Days_Policy_Claim', 
    'PastNumberOfClaims', 
    'AgeOfVehicle',
    'AgeOfPolicyHolder', 
    'PoliceReportFiled', 
    'WitnessPresent', 
    'AgentType',
    'NumberOfSuppliments',
    'AddressChange_Claim', 
    'NumberOfCars', 
    'Year',
    'BasePolicy'
]

In [5]:
#check for null values
rows_with_null = df[cat_feats].isnull().any(axis=1)
print(
    "Number of rows with null features", rows_with_null.sum())
df.loc[rows_with_null]

Number of rows with null features 1


Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,Age,Fault,PolicyType,VehicleCategory,VehiclePrice,FraudFound_P,PolicyNumber,RepNumber,Deductible,DriverRating,Days_Policy_Accident,Days_Policy_Claim,PastNumberOfClaims,AgeOfVehicle,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange_Claim,NumberOfCars,Year,BasePolicy
1516,07Jul,2,Monday,Honda,Rural,0,,1,Male,Single,0,Policy Holder,Sedan - All Perils,Sedan,more than 69000,0,1517,15,400,2,more than 30,none,none,new,16 to 17,No,No,External,none,no change,1 vehicle,1994,All Perils


In [6]:
#drop the rows with null feature values
df = df.dropna().reset_index(drop=True)
print(df.shape)
df.head()

(15419, 33)


Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,Age,Fault,PolicyType,VehicleCategory,VehiclePrice,FraudFound_P,PolicyNumber,RepNumber,Deductible,DriverRating,Days_Policy_Accident,Days_Policy_Claim,PastNumberOfClaims,AgeOfVehicle,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange_Claim,NumberOfCars,Year,BasePolicy
0,12Dec,5,Wednesday,Honda,Urban,Tuesday,01Jan,1,Female,Single,21,Policy Holder,Sport - Liability,Sport,more than 69000,0,1,12,300,1,more than 30,more than 30,none,3 years,26 to 30,No,No,External,none,1 year,3 to 4,1994,Liability
1,01Jan,3,Wednesday,Honda,Urban,Monday,01Jan,4,Male,Single,34,Policy Holder,Sport - Collision,Sport,more than 69000,0,2,15,400,4,more than 30,more than 30,none,6 years,31 to 35,Yes,No,External,none,no change,1 vehicle,1994,Collision
2,10Oct,5,Friday,Honda,Urban,Thursday,11Nov,2,Male,Married,47,Policy Holder,Sport - Collision,Sport,more than 69000,0,3,7,400,3,more than 30,more than 30,1,7 years,41 to 50,No,No,External,none,no change,1 vehicle,1994,Collision
3,06Jun,2,Saturday,Toyota,Rural,Friday,07Jul,1,Male,Married,65,Third Party,Sedan - Liability,Sport,20000 to 29000,0,4,4,400,2,more than 30,more than 30,1,more than 7,51 to 65,Yes,No,External,more than 5,no change,1 vehicle,1994,Liability
4,01Jan,5,Monday,Honda,Urban,Tuesday,02Feb,2,Female,Single,27,Third Party,Sport - Collision,Sport,more than 69000,0,5,3,400,1,more than 30,more than 30,none,5 years,31 to 35,No,No,External,none,no change,1 vehicle,1994,Collision


In [7]:
#mapping of categorical feature to label binarizer
cat_feat_2_lab_bin = {}
prep_df = pd.DataFrame([])

for feat_col in cat_feats:
    lb = LabelBinarizer()
    lb.fit(df[feat_col])
    if len(lb.classes_) > 2:
        columns = [feat_col + "_" + str(col_val) 
                   for col_val in lb.classes_]
    else:
        columns = [feat_col + "_" + str(col_val) 
                   for col_val in lb.classes_[:-1]]
    feat_df = pd.DataFrame(
        lb.transform(df[feat_col]),
        columns=columns)
    prep_df = pd.concat([prep_df, feat_df], axis=1)
    cat_feat_2_lab_bin[feat_col] = lb
    
print(prep_df.shape)
prep_df.head()

(15419, 164)


Unnamed: 0,Month_01Jan,Month_02Feb,Month_03Mar,Month_04Apr,Month_05May,Month_06Jun,Month_07Jul,Month_08Aug,Month_09Sep,Month_10Oct,Month_11Nov,Month_12Dec,WeekOfMonth_1,WeekOfMonth_2,WeekOfMonth_3,WeekOfMonth_4,WeekOfMonth_5,DayOfWeek_Friday,DayOfWeek_Monday,DayOfWeek_Saturday,DayOfWeek_Sunday,DayOfWeek_Thursday,DayOfWeek_Tuesday,DayOfWeek_Wednesday,Make_Accura,Make_BMW,Make_Chevrolet,Make_Dodge,Make_Ferrari,Make_Ford,Make_Honda,Make_Jaguar,Make_Lexus,Make_Mazda,Make_Mecedes,Make_Mercury,Make_Nisson,Make_Pontiac,Make_Porche,Make_Saab,Make_Saturn,Make_Toyota,Make_VW,AccidentArea_Rural,DayOfWeekClaimed_Friday,DayOfWeekClaimed_Monday,DayOfWeekClaimed_Saturday,DayOfWeekClaimed_Sunday,DayOfWeekClaimed_Thursday,DayOfWeekClaimed_Tuesday,DayOfWeekClaimed_Wednesday,MonthClaimed_01Jan,MonthClaimed_02Feb,MonthClaimed_03Mar,MonthClaimed_04Apr,MonthClaimed_05May,MonthClaimed_06Jun,MonthClaimed_07Jul,MonthClaimed_08Aug,MonthClaimed_09Sep,MonthClaimed_10Oct,MonthClaimed_11Nov,MonthClaimed_12Dec,WeekOfMonthClaimed_1,WeekOfMonthClaimed_2,WeekOfMonthClaimed_3,WeekOfMonthClaimed_4,WeekOfMonthClaimed_5,Sex_Female,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,MaritalStatus_Widow,Fault_Policy Holder,PolicyType_Sedan - All Perils,PolicyType_Sedan - Collision,PolicyType_Sedan - Liability,PolicyType_Sport - All Perils,PolicyType_Sport - Collision,PolicyType_Sport - Liability,PolicyType_Utility - All Perils,PolicyType_Utility - Collision,PolicyType_Utility - Liability,VehicleCategory_Sedan,VehicleCategory_Sport,VehicleCategory_Utility,VehiclePrice_20000 to 29000,VehiclePrice_30000 to 39000,VehiclePrice_40000 to 59000,VehiclePrice_60000 to 69000,VehiclePrice_less than 20000,VehiclePrice_more than 69000,RepNumber_1,RepNumber_2,RepNumber_3,RepNumber_4,RepNumber_5,RepNumber_6,RepNumber_7,RepNumber_8,RepNumber_9,RepNumber_10,RepNumber_11,RepNumber_12,RepNumber_13,RepNumber_14,RepNumber_15,RepNumber_16,DriverRating_1,DriverRating_2,DriverRating_3,DriverRating_4,Days_Policy_Accident_1 to 7,Days_Policy_Accident_15 to 30,Days_Policy_Accident_8 to 15,Days_Policy_Accident_more than 30,Days_Policy_Accident_none,Days_Policy_Claim_15 to 30,Days_Policy_Claim_8 to 15,Days_Policy_Claim_more than 30,PastNumberOfClaims_1,PastNumberOfClaims_2 to 4,PastNumberOfClaims_more than 4,PastNumberOfClaims_none,AgeOfVehicle_2 years,AgeOfVehicle_3 years,AgeOfVehicle_4 years,AgeOfVehicle_5 years,AgeOfVehicle_6 years,AgeOfVehicle_7 years,AgeOfVehicle_more than 7,AgeOfVehicle_new,AgeOfPolicyHolder_16 to 17,AgeOfPolicyHolder_18 to 20,AgeOfPolicyHolder_21 to 25,AgeOfPolicyHolder_26 to 30,AgeOfPolicyHolder_31 to 35,AgeOfPolicyHolder_36 to 40,AgeOfPolicyHolder_41 to 50,AgeOfPolicyHolder_51 to 65,AgeOfPolicyHolder_over 65,PoliceReportFiled_No,WitnessPresent_No,AgentType_External,NumberOfSuppliments_1 to 2,NumberOfSuppliments_3 to 5,NumberOfSuppliments_more than 5,NumberOfSuppliments_none,AddressChange_Claim_1 year,AddressChange_Claim_2 to 3 years,AddressChange_Claim_4 to 8 years,AddressChange_Claim_no change,AddressChange_Claim_under 6 months,NumberOfCars_1 vehicle,NumberOfCars_2 vehicles,NumberOfCars_3 to 4,NumberOfCars_5 to 8,NumberOfCars_more than 8,Year_1994,Year_1995,Year_1996,BasePolicy_All Perils,BasePolicy_Collision,BasePolicy_Liability
0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1
1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0
3,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,1
4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0


In [8]:
#append the numerical features and the target variable
prep_df = pd.concat(
    [prep_df, df[num_feats], df[fraud_col]], axis=1)
print(prep_df.shape)
prep_df.head()

(15419, 166)


Unnamed: 0,Month_01Jan,Month_02Feb,Month_03Mar,Month_04Apr,Month_05May,Month_06Jun,Month_07Jul,Month_08Aug,Month_09Sep,Month_10Oct,Month_11Nov,Month_12Dec,WeekOfMonth_1,WeekOfMonth_2,WeekOfMonth_3,WeekOfMonth_4,WeekOfMonth_5,DayOfWeek_Friday,DayOfWeek_Monday,DayOfWeek_Saturday,DayOfWeek_Sunday,DayOfWeek_Thursday,DayOfWeek_Tuesday,DayOfWeek_Wednesday,Make_Accura,Make_BMW,Make_Chevrolet,Make_Dodge,Make_Ferrari,Make_Ford,Make_Honda,Make_Jaguar,Make_Lexus,Make_Mazda,Make_Mecedes,Make_Mercury,Make_Nisson,Make_Pontiac,Make_Porche,Make_Saab,Make_Saturn,Make_Toyota,Make_VW,AccidentArea_Rural,DayOfWeekClaimed_Friday,DayOfWeekClaimed_Monday,DayOfWeekClaimed_Saturday,DayOfWeekClaimed_Sunday,DayOfWeekClaimed_Thursday,DayOfWeekClaimed_Tuesday,DayOfWeekClaimed_Wednesday,MonthClaimed_01Jan,MonthClaimed_02Feb,MonthClaimed_03Mar,MonthClaimed_04Apr,MonthClaimed_05May,MonthClaimed_06Jun,MonthClaimed_07Jul,MonthClaimed_08Aug,MonthClaimed_09Sep,MonthClaimed_10Oct,MonthClaimed_11Nov,MonthClaimed_12Dec,WeekOfMonthClaimed_1,WeekOfMonthClaimed_2,WeekOfMonthClaimed_3,WeekOfMonthClaimed_4,WeekOfMonthClaimed_5,Sex_Female,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,MaritalStatus_Widow,Fault_Policy Holder,PolicyType_Sedan - All Perils,PolicyType_Sedan - Collision,PolicyType_Sedan - Liability,PolicyType_Sport - All Perils,PolicyType_Sport - Collision,PolicyType_Sport - Liability,PolicyType_Utility - All Perils,PolicyType_Utility - Collision,PolicyType_Utility - Liability,VehicleCategory_Sedan,VehicleCategory_Sport,VehicleCategory_Utility,VehiclePrice_20000 to 29000,VehiclePrice_30000 to 39000,VehiclePrice_40000 to 59000,VehiclePrice_60000 to 69000,VehiclePrice_less than 20000,VehiclePrice_more than 69000,RepNumber_1,RepNumber_2,RepNumber_3,RepNumber_4,RepNumber_5,RepNumber_6,RepNumber_7,RepNumber_8,RepNumber_9,RepNumber_10,RepNumber_11,RepNumber_12,RepNumber_13,RepNumber_14,RepNumber_15,RepNumber_16,DriverRating_1,DriverRating_2,DriverRating_3,DriverRating_4,Days_Policy_Accident_1 to 7,Days_Policy_Accident_15 to 30,Days_Policy_Accident_8 to 15,Days_Policy_Accident_more than 30,Days_Policy_Accident_none,Days_Policy_Claim_15 to 30,Days_Policy_Claim_8 to 15,Days_Policy_Claim_more than 30,PastNumberOfClaims_1,PastNumberOfClaims_2 to 4,PastNumberOfClaims_more than 4,PastNumberOfClaims_none,AgeOfVehicle_2 years,AgeOfVehicle_3 years,AgeOfVehicle_4 years,AgeOfVehicle_5 years,AgeOfVehicle_6 years,AgeOfVehicle_7 years,AgeOfVehicle_more than 7,AgeOfVehicle_new,AgeOfPolicyHolder_16 to 17,AgeOfPolicyHolder_18 to 20,AgeOfPolicyHolder_21 to 25,AgeOfPolicyHolder_26 to 30,AgeOfPolicyHolder_31 to 35,AgeOfPolicyHolder_36 to 40,AgeOfPolicyHolder_41 to 50,AgeOfPolicyHolder_51 to 65,AgeOfPolicyHolder_over 65,PoliceReportFiled_No,WitnessPresent_No,AgentType_External,NumberOfSuppliments_1 to 2,NumberOfSuppliments_3 to 5,NumberOfSuppliments_more than 5,NumberOfSuppliments_none,AddressChange_Claim_1 year,AddressChange_Claim_2 to 3 years,AddressChange_Claim_4 to 8 years,AddressChange_Claim_no change,AddressChange_Claim_under 6 months,NumberOfCars_1 vehicle,NumberOfCars_2 vehicles,NumberOfCars_3 to 4,NumberOfCars_5 to 8,NumberOfCars_more than 8,Year_1994,Year_1995,Year_1996,BasePolicy_All Perils,BasePolicy_Collision,BasePolicy_Liability,Age,FraudFound_P
0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,21,0
1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0,34,0
2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0,47,0
3,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,1,65,0
4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0,27,0


In [9]:
#show the distribution of frauds and non-frauds
pd.concat([
    df[fraud_col].value_counts().rename("fraud_counts"),
    df[fraud_col].value_counts(
        normalize=True).rename("fraud_counts_prcnt")],
    axis=1)

Unnamed: 0,fraud_counts,fraud_counts_prcnt
0,14496,0.940139
1,923,0.059861


----
### Train and Test Split

In [10]:
train_ind, test_ind = train_test_split(
    prep_df.index, test_size=0.2, 
    random_state=0, stratify=prep_df[fraud_col])

train_df = prep_df.loc[train_ind]
test_df = prep_df.loc[test_ind]

print("Train Set", train_df.shape)
print("Test Set", test_df.shape)

Train Set (12335, 166)
Test Set (3084, 166)


In [11]:
#Training Set
pd.concat([
    train_df[fraud_col].value_counts().rename("counts"),
    train_df[fraud_col].value_counts(
        normalize=True).rename("counts_prcnt")
], axis=1)


Unnamed: 0,counts,counts_prcnt
0,11597,0.94017
1,738,0.05983


In [12]:
#Test Set
pd.concat([
    test_df[fraud_col].value_counts().rename("counts"),
    test_df[fraud_col].value_counts(
        normalize=True).rename("counts_prcnt")
], axis=1)

Unnamed: 0,counts,counts_prcnt
0,2899,0.940013
1,185,0.059987


----
### Model Training with Hyperparameter Tuning 

In [20]:
model_name_2_model = {
    DecisionTreeClassifier().__class__.__name__: DecisionTreeClassifier,
    DummyClassifier().__class__.__name__: DummyClassifier,
    ExtraTreeClassifier().__class__.__name__: ExtraTreeClassifier,
    GradientBoostingClassifier().__class__.__name__: GradientBoostingClassifier,
    LogisticRegression().__class__.__name__: LogisticRegression,
    RandomForestClassifier().__class__.__name__: RandomForestClassifier
}

model_name_2_params = {
    DecisionTreeClassifier().__class__.__name__:{
        'max_depth': [2, 3, 5, 10, 20],
        'min_samples_leaf': [5, 10, 20, 50, 100],
        'criterion': ["gini", "entropy"],
        'random_state': [0]
    },
    DummyClassifier().__class__.__name__: {
        'strategy': ['stratified'],
        'random_state': [0]
    },
    ExtraTreeClassifier().__class__.__name__:{
        'max_depth': [10, 50, 100, None],
        'max_features': ['auto', 'sqrt'],
        'min_samples_leaf': [2, 4],
        'min_samples_split': [5, 10],
        'random_state': [0]
    },
    GradientBoostingClassifier().__class__.__name__:{
        "max_depth": [10, 50, 100, None],
        "max_features": ['auto', 'sqrt', None], 
        "min_samples_leaf": [2, 4],
        "min_samples_split": [5, 10],
        "n_estimators": [100, 500],
        "random_state": [0]
    },
    LogisticRegression().__class__.__name__:{
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
        'penalty': ['none', 'l1', 'l2', 'elasticnet'],
        'C': [100, 10, 1.0, 0.1, 0.01]
    },
    RandomForestClassifier().__class__.__name__: {
        'bootstrap': [True, False],
        'max_depth': [10, 50, 100, None],
        'max_features': ['auto'],
        'min_samples_leaf': [2, 4],
        'min_samples_split': [5, 10],
        'n_estimators': [400, 600],
        'random_state': [0]
    }
}

features = train_df.columns.difference([fraud_col])
target_var = fraud_col

test_size = 0.2
random_state = 0
cv_num_folds = 5

metric_2_score_function = {
    "accuracy": accuracy_score,
    "precision": precision_score,
    "recall": recall_score,
    "f1": f1_score}

basis_metric = "f1"

model_dir = "../models/02_model_training/"

In [15]:
#some helper functions for the training the model and handling hyperparameters
def compute_metric_scores(X, y, model, metrics, cv=None):
    if cv is not None:
        metric_2_cv_scores = {}
        cv = ShuffleSplit(
            n_splits=cv_num_folds, test_size=test_size, 
            random_state=random_state)
        for metric in metrics:
            scores = cross_val_score(
                model, X, y, scoring=metric, cv=cv)
            metric_2_cv_scores[metric] = scores

        metrics_df = pd.DataFrame(metric_2_cv_scores).T
        metrics_df["mean_score"] = metrics_df.mean(axis=1)
        return metrics_df
    else:
        y_preds = model.predict(X)
        metric_2_score = {}
        for metric in metrics:
            score_func = metric_2_score_function[metric]
            metric_2_score[metric] = score_func(y, y_preds)
        metrics_df = pd.Series(metric_2_score).to_frame()
        return metrics_df
        

def run_model_hyperparameter_tuning(X, y, model, params, 
    basis_metric, cv):
    search_cv = GridSearchCV(model(), param_grid=params, cv=cv,
        scoring=basis_metric)
    search_cv.fit(X, y) 
    return search_cv  

----

In [16]:
%%time
#baseline score from DummyClassifier baseline

model_name = DummyClassifier().__class__.__name__
model = model_name_2_model[model_name]
params = model_name_2_params[model_name]
metrics = list(metric_2_score_function.keys())

cv = ShuffleSplit(
    n_splits=cv_num_folds, test_size=test_size, 
    random_state=random_state)

search_cv = run_model_hyperparameter_tuning(train_df[features],
    train_df[target_var], model, params, basis_metric, cv)

best_model = search_cv.best_estimator_
train_metrics_df = compute_metric_scores(
    train_df[features], train_df[target_var], best_model, 
    metrics, cv)

#store the best model
model_name = best_model.__class__.__name__ + ".pkl"
model_path = os.path.join(model_dir, model_name)
pd.to_pickle(best_model, model_path)

train_metrics_df 

CPU times: user 154 ms, sys: 9.78 ms, total: 164 ms
Wall time: 182 ms


Unnamed: 0,0,1,2,3,4,mean_score
accuracy,0.878395,0.865018,0.874341,0.881232,0.869477,0.873693
precision,0.058511,0.050847,0.031915,0.057592,0.060773,0.051928
recall,0.08209,0.051724,0.044776,0.08871,0.067485,0.066957
f1,0.068323,0.051282,0.037267,0.069841,0.063953,0.058133


In [17]:
#performance on the test set
compute_metric_scores(
    test_df[features], test_df[target_var], best_model, 
    metrics)

Unnamed: 0,0
accuracy,0.876783
precision,0.074236
recall,0.091892
f1,0.082126


In [18]:
%%time
#hyperparameter tuning for DecisionTreeClassifier

model_name = DecisionTreeClassifier().__class__.__name__
model = model_name_2_model[model_name]
params = model_name_2_params[model_name]
metrics = list(metric_2_score_function.keys())

cv = ShuffleSplit(
    n_splits=cv_num_folds, test_size=test_size, 
    random_state=random_state)

search_cv = run_model_hyperparameter_tuning(train_df[features],
    train_df[target_var], model, params, basis_metric, cv)

best_model = search_cv.best_estimator_
train_metrics_df = compute_metric_scores(
    train_df[features], train_df[target_var], best_model, 
    metrics, cv)

#store the best model
model_name = best_model.__class__.__name__ + ".pkl"
model_path = os.path.join(model_dir, model_name)
pd.to_pickle(best_model, model_path)

train_metrics_df 

CPU times: user 10.1 s, sys: 45.8 ms, total: 10.1 s
Wall time: 10.1 s


Unnamed: 0,0,1,2,3,4,mean_score
accuracy,0.916903,0.911634,0.924605,0.928658,0.917714,0.919903
precision,0.191304,0.296296,0.204545,0.245098,0.282609,0.243971
recall,0.164179,0.183908,0.134328,0.201613,0.159509,0.168708
f1,0.176707,0.22695,0.162162,0.221239,0.203922,0.198196


In [19]:
# performance on the test set
compute_metric_scores(
    test_df[features], test_df[target_var], best_model, 
    metrics)

Unnamed: 0,0
accuracy,0.921206
precision,0.266129
recall,0.178378
f1,0.213592


In [21]:
%%time
model_name = RandomForestClassifier().__class__.__name__
model = model_name_2_model[model_name]
params = model_name_2_params[model_name]
metrics = list(metric_2_score_function.keys())

cv = ShuffleSplit(
    n_splits=cv_num_folds, test_size=test_size, 
    random_state=random_state)

search_cv = run_model_hyperparameter_tuning(train_df[features],
    train_df[target_var], model, params, basis_metric, cv)

best_model = search_cv.best_estimator_
train_metrics_df = compute_metric_scores(
    train_df[features], train_df[target_var], best_model, 
    metrics, cv)

#store the best model
model_name = best_model.__class__.__name__ + ".pkl"
model_path = os.path.join(model_dir, model_name)
pd.to_pickle(best_model, model_path)

#store the best model
model_name = best_model.__class__.__name__ + ".pkl"
model_path = os.path.join(model_dir, model_name)
pd.to_pickle(best_model, model_path)

train_metrics_df 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


CPU times: user 19min 23s, sys: 4.41 s, total: 19min 28s
Wall time: 19min 29s


Unnamed: 0,0,1,2,3,4,mean_score
accuracy,0.946494,0.929469,0.946088,0.949737,0.933928,0.941143
precision,0.75,0.0,0.666667,0.0,0.5,0.383333
recall,0.022388,0.0,0.014925,0.0,0.006135,0.00869
f1,0.043478,0.0,0.029197,0.0,0.012121,0.016959


In [22]:
best_model

RandomForestClassifier(bootstrap=False, max_depth=50, min_samples_leaf=2,
                       min_samples_split=5, n_estimators=400, random_state=0)

In [23]:
# performance on the test set
compute_metric_scores(
    test_df[features], test_df[target_var], best_model, 
    metrics)

Unnamed: 0,0
accuracy,0.940013
precision,0.5
recall,0.005405
f1,0.010695


----

In [24]:
%%time
model_name = ExtraTreeClassifier().__class__.__name__
model = model_name_2_model[model_name]
params = model_name_2_params[model_name]
metrics = list(metric_2_score_function.keys())

cv = ShuffleSplit(
    n_splits=cv_num_folds, test_size=test_size, 
    random_state=random_state)

search_cv = run_model_hyperparameter_tuning(train_df[features],
    train_df[target_var], model, params, basis_metric, cv)

best_model = search_cv.best_estimator_
train_metrics_df = compute_metric_scores(
    train_df[features], train_df[target_var], best_model, 
    metrics, cv)

#store the best model
model_name = best_model.__class__.__name__ + ".pkl"
model_path = os.path.join(model_dir, model_name)
pd.to_pickle(best_model, model_path)

#store the best model
model_name = best_model.__class__.__name__ + ".pkl"
model_path = os.path.join(model_dir, model_name)
pd.to_pickle(best_model, model_path)

train_metrics_df 

CPU times: user 2.84 s, sys: 8.13 ms, total: 2.85 s
Wall time: 2.85 s


Unnamed: 0,0,1,2,3,4,mean_score
accuracy,0.931496,0.911228,0.923389,0.930685,0.919741,0.923308
precision,0.238806,0.191781,0.142857,0.15942,0.253521,0.197277
recall,0.119403,0.08046,0.08209,0.08871,0.110429,0.096218
f1,0.159204,0.11336,0.104265,0.11399,0.153846,0.128933


In [25]:
best_model

ExtraTreeClassifier(max_depth=50, min_samples_leaf=2, min_samples_split=5,
                    random_state=0)

In [26]:
# performance on the test set
compute_metric_scores(
    test_df[features], test_df[target_var], best_model, 
    metrics)

Unnamed: 0,0
accuracy,0.9238
precision,0.202381
recall,0.091892
f1,0.126394


----

In [27]:
%%time
model_name = GradientBoostingClassifier().__class__.__name__
model = model_name_2_model[model_name]
params = model_name_2_params[model_name]
metrics = list(metric_2_score_function.keys())

cv = ShuffleSplit(
    n_splits=cv_num_folds, test_size=test_size, 
    random_state=random_state)

search_cv = run_model_hyperparameter_tuning(train_df[features],
    train_df[target_var], model, params, basis_metric, cv)

best_model = search_cv.best_estimator_
train_metrics_df = compute_metric_scores(
    train_df[features], train_df[target_var], best_model, 
    metrics, cv)

#store the best model
model_name = best_model.__class__.__name__ + ".pkl"
model_path = os.path.join(model_dir, model_name)
pd.to_pickle(best_model, model_path)

#store the best model
model_name = best_model.__class__.__name__ + ".pkl"
model_path = os.path.join(model_dir, model_name)
pd.to_pickle(best_model, model_path)

train_metrics_df 

CPU times: user 5h 13min 32s, sys: 22.4 s, total: 5h 13min 54s
Wall time: 5h 45min 6s


Unnamed: 0,0,1,2,3,4,mean_score
accuracy,0.945683,0.930685,0.944467,0.949331,0.935955,0.941224
precision,0.5,0.555556,0.421053,0.48,0.586207,0.508563
recall,0.11194,0.086207,0.059701,0.096774,0.104294,0.091783
f1,0.182927,0.149254,0.104575,0.161074,0.177083,0.154983


In [28]:
best_model

GradientBoostingClassifier(max_depth=10, max_features='auto',
                           min_samples_leaf=2, min_samples_split=10,
                           random_state=0)

In [29]:
# performance on the test set
compute_metric_scores(
    test_df[features], test_df[target_var], best_model, 
    metrics)

Unnamed: 0,0
accuracy,0.94131
precision,0.5625
recall,0.097297
f1,0.165899
