In [21]:
# Import the data
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

from ml_utils import train_test_split_marketing,\
    fill_missing,\
    build_encoders,\
    encode_categorical,\
    build_target_encoder,\
    encode_target

df = pd.read_csv('vehicle_insurance_claim_fraud.csv').rename(columns={'FraudFound_P':'y'})
display(df.head())
display(df.info())

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,...,AgeOfVehicle,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange_Claim,NumberOfCars,Year,BasePolicy
0,Dec,5,Wednesday,Honda,Urban,Tuesday,Jan,1,Female,Single,...,3 years,26 to 30,No,No,External,none,1 year,3 to 4,1994,Liability
1,Jan,3,Wednesday,Honda,Urban,Monday,Jan,4,Male,Single,...,6 years,31 to 35,Yes,No,External,none,no change,1 vehicle,1994,Collision
2,Oct,5,Friday,Honda,Urban,Thursday,Nov,2,Male,Married,...,7 years,41 to 50,No,No,External,none,no change,1 vehicle,1994,Collision
3,Jun,2,Saturday,Toyota,Rural,Friday,Jul,1,Male,Married,...,more than 7,51 to 65,Yes,No,External,more than 5,no change,1 vehicle,1994,Liability
4,Jan,5,Monday,Honda,Urban,Tuesday,Feb,2,Female,Single,...,5 years,31 to 35,No,No,External,none,no change,1 vehicle,1994,Collision


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15420 entries, 0 to 15419
Data columns (total 33 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Month                 15420 non-null  object
 1   WeekOfMonth           15420 non-null  int64 
 2   DayOfWeek             15420 non-null  object
 3   Make                  15420 non-null  object
 4   AccidentArea          15420 non-null  object
 5   DayOfWeekClaimed      15420 non-null  object
 6   MonthClaimed          15420 non-null  object
 7   WeekOfMonthClaimed    15420 non-null  int64 
 8   Sex                   15420 non-null  object
 9   MaritalStatus         15420 non-null  object
 10  Age                   15420 non-null  int64 
 11  Fault                 15420 non-null  object
 12  PolicyType            15420 non-null  object
 13  VehicleCategory       15420 non-null  object
 14  VehiclePrice          15420 non-null  object
 15  y                     15420 non-null

None

In [34]:
# Split the data into training and testing sets
X = df.drop(columns='y')
y = df['y'].values.reshape(-1,1)

# Show the X_train dataset
X_train, X_test, y_train, y_test = train_test_split(X,y)
display(X_train.describe())

# What are the y values we have to work with?
values, counts = np.unique(y, return_counts=True)
print("Values: ", values)
print("Counts: ", counts)

Unnamed: 0,WeekOfMonth,WeekOfMonthClaimed,Age,PolicyNumber,RepNumber,Deductible,DriverRating,Year
count,11565.0,11565.0,11565.0,11565.0,11565.0,11565.0,11565.0,11565.0
mean,2.786079,2.68716,39.865456,7741.290618,8.492693,407.70428,2.497536,1994.872201
std,1.294921,1.262644,13.466732,4460.218239,4.605959,44.015371,1.121439,0.80432
min,1.0,1.0,0.0,1.0,1.0,300.0,1.0,1994.0
25%,2.0,2.0,31.0,3855.0,5.0,400.0,1.0,1994.0
50%,3.0,3.0,38.0,7740.0,8.0,400.0,3.0,1995.0
75%,4.0,4.0,48.0,11612.0,12.0,400.0,3.0,1996.0
max,5.0,5.0,80.0,15420.0,16.0,700.0,4.0,1996.0


Values:  [0 1]
Counts:  [14497   923]


In [38]:
# Test for null values, should be 15420 rows
X.dropna().count().head(1)


Month    15420
dtype: int64

In [4]:
# Create the encoders for categorical variables (use X_train_filled)
encoders = build_encoders(X_train_filled)
encoders

[{'column': 'job',
  'multi_col_output': True,
  'encoder': OneHotEncoder(handle_unknown='infrequent_if_exist', max_categories=5,
                sparse_output=False)},
 {'column': 'marital',
  'multi_col_output': True,
  'encoder': OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)},
 {'column': 'education',
  'multi_col_output': False,
  'encoder': OrdinalEncoder(categories=[['primary', 'secondary', 'tertiary']],
                 handle_unknown='use_encoded_value', unknown_value=-1)},
 {'column': 'default',
  'multi_col_output': False,
  'encoder': OrdinalEncoder(categories=[['no', 'yes']], handle_unknown='use_encoded_value',
                 unknown_value=-1)},
 {'column': 'housing',
  'multi_col_output': False,
  'encoder': OrdinalEncoder(categories=[['no', 'yes']], handle_unknown='use_encoded_value',
                 unknown_value=-1)},
 {'column': 'loan',
  'multi_col_output': False,
  'encoder': OrdinalEncoder(categories=[['no', 'yes']], handle_unknown='us

In [5]:
# Encode X_train_filled and X_test_filled
X_train_encoded = encode_categorical(X_train_filled, encoders)
X_test_encoded = encode_categorical(X_test_filled, encoders)

X_train_encoded.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,x0_admin.,x0_blue-collar,x0_management,...,x0_telephone,x0_unknown,x0_nonexistent,x0_other,x0_success,education,default,housing,loan,month
0,34,328,21,18,7,-1.0,0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,10.0
1,30,484,6,703,1,-1.0,0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,4.0
2,42,31,18,120,1,-1.0,0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,5.0
3,27,3,22,506,1,-1.0,0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,2.0,0.0,1.0,1.0,6.0
4,30,19,26,191,2,-1.0,0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,7.0


In [6]:
# Encode y_train and y_test
y_encoder = build_target_encoder(y_train)
y_train_encoded = encode_target(y_train, y_encoder)
y_test_encoded = encode_target(y_test, y_encoder)

In [7]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100, max_depth=7, random_state=13)
model.fit(X_train_encoded, y_train_encoded)
y_pred = model.predict(X_test_encoded)
print(balanced_accuracy_score(y_test_encoded, y_pred))

0.5753911268989494


In [8]:
# Import new data and test with the model

new_df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m14/lesson_3/datasets/bank_marketing_new_data.csv')

In [9]:
X = new_df.drop(columns='y')
y = new_df['y'].values.reshape(-1, 1)

In [10]:
X_filled = fill_missing(X)
X_encoded = encode_categorical(X_filled, encoders)
y_encoded = encode_target(y, y_encoder)

In [11]:
y_pred = model.predict(X_encoded)
print(balanced_accuracy_score(y_encoded, y_pred))

0.5769138944243607
