## Imports

In [1]:
# from IPython.core.interactiveshell import InteractiveShell

# InteractiveShell.ast_node_interactivity = "all"

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

## Original data

In [4]:
carclaims_synthetic = pd.read_csv('../../data/Angoss Knowledge Seeker - carclaims.txt/carclaims_synthetic_2024-11-03 21:55:14.011908.csv')

In [5]:
carclaims_synthetic.sample(10)

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,...,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange-Claim,NumberOfCars,Year,BasePolicy,FraudFound
629184,Oct,3,Tuesday,VW,Rural,Tuesday,Apr,2,Male,Married,...,36 to 40,No,No,External,1 to 2,no change,1 vehicle,1995,Liability,No
117717,Jun,3,Monday,Mazda,Urban,Monday,Jul,1,Male,Married,...,36 to 40,No,No,External,none,no change,1 vehicle,1995,All Perils,No
139433,Mar,4,Monday,Chevrolet,Urban,Monday,Jul,1,Male,Married,...,21 to 25,No,No,External,none,no change,1 vehicle,1995,Collision,No
915653,Jul,3,Monday,Toyota,Urban,Tuesday,Apr,3,Male,Married,...,41 to 50,No,No,Internal,none,no change,1 vehicle,1995,Collision,No
651394,Nov,4,Tuesday,VW,Rural,Monday,Jun,4,Male,Married,...,41 to 50,No,No,External,more than 5,no change,1 vehicle,1995,Collision,No
273480,Jun,3,Wednesday,Accura,Rural,Monday,Nov,2,Male,Married,...,41 to 50,No,No,External,1 to 2,no change,1 vehicle,1995,All Perils,No
891089,Jul,5,Tuesday,VW,Urban,Wednesday,Apr,4,Male,Married,...,36 to 40,No,No,External,more than 5,no change,1 vehicle,1995,Liability,No
981184,Apr,3,Monday,Mazda,Rural,Wednesday,Nov,2,Male,Married,...,36 to 40,No,No,External,none,no change,1 vehicle,1995,Liability,No
695906,Aug,3,Saturday,Toyota,Urban,Tuesday,Sep,5,Male,Single,...,26 to 30,No,No,External,1 to 2,no change,1 vehicle,1995,All Perils,No
522902,Mar,1,Thursday,Mazda,Urban,Wednesday,Sep,1,Female,Married,...,41 to 50,No,No,External,1 to 2,no change,1 vehicle,1995,All Perils,No


## Clean up

There is only one row with DayOfWeekClaimed and MonthClaimed as 0, for now we can drop it

In [6]:
carclaims_synthetic.drop(carclaims_synthetic[carclaims_synthetic['DayOfWeekClaimed'] == '0'].index, inplace=True)
carclaims_synthetic.drop(carclaims_synthetic[carclaims_synthetic['MonthClaimed'] == '0'].index, inplace=True)

## Encoding

In [7]:
carclaims_synthetic.dtypes

Month                   object
WeekOfMonth              int64
DayOfWeek               object
Make                    object
AccidentArea            object
DayOfWeekClaimed        object
MonthClaimed            object
WeekOfMonthClaimed       int64
Sex                     object
MaritalStatus           object
Age                      int64
Fault                   object
PolicyType              object
VehicleCategory         object
VehiclePrice            object
PolicyNumber             int64
RepNumber                int64
Deductible               int64
DriverRating             int64
Days:Policy-Accident    object
Days:Policy-Claim       object
PastNumberOfClaims      object
AgeOfVehicle            object
AgeOfPolicyHolder       object
PoliceReportFiled       object
WitnessPresent          object
AgentType               object
NumberOfSuppliments     object
AddressChange-Claim     object
NumberOfCars            object
Year                     int64
BasePolicy              object
FraudFou

In [8]:
print(np.sort(carclaims_synthetic['AgeOfVehicle'].unique()))

['2 years' '3 years' '4 years' '5 years' '6 years' '7 years' 'more than 7'
 'new']


### Label encoding

In [9]:
column_labels = {
    'Month': ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
    'DayOfWeek': ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
    'DayOfWeekClaimed': ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
    'MonthClaimed': ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
    'AgeOfPolicyHolder': np.sort(carclaims_synthetic['AgeOfPolicyHolder'].unique()),
    'NumberOfSuppliments': ['none', '1 to 2', '3 to 5', 'more than 5'],
    'AddressChange-Claim': ['no change', 'under 6 months', '1 year', '2 to 3 years', '4 to 8 years'],
    'NumberOfCars': np.sort(carclaims_synthetic['NumberOfCars'].unique()),
    'VehiclePrice': ['less than 20,000', '20,000 to 29,000', '30,000 to 39,000', '40,000 to 59,000', '60,000 to 69,000', 'more than 69,000'],
    'Days:Policy-Accident': ['none', '1 to 7', '15 to 30', '8 to 15', 'more than 30'],
    'Days:Policy-Claim': ['none', '15 to 30', '8 to 15', 'more than 30'],
    'PastNumberOfClaims': ['none', '1', '2 to 4', 'more than 4'],
    'AgeOfVehicle': ['new', '2 years', '3 years', '4 years', '5 years', '6 years', '7 years', 'more than 7']
}

In [10]:
for column, labels  in column_labels.items():
    print(column)
    le = LabelEncoder()
    le.fit(labels)
    carclaims_synthetic[column] = le.transform(carclaims_synthetic[column])

Month
DayOfWeek
DayOfWeekClaimed
MonthClaimed
AgeOfPolicyHolder
NumberOfSuppliments
AddressChange-Claim
NumberOfCars
VehiclePrice
Days:Policy-Accident
Days:Policy-Claim
PastNumberOfClaims
AgeOfVehicle


In [11]:
carclaims_synthetic

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,...,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange-Claim,NumberOfCars,Year,BasePolicy,FraudFound
0,8,3,1,Mazda,Urban,5,5,3,Female,Married,...,4,No,No,External,3,3,0,1995,Liability,No
1,4,5,5,Accura,Urban,6,3,1,Male,Married,...,5,No,No,External,3,3,0,1996,Collision,No
2,3,1,1,Toyota,Urban,5,7,4,Male,Married,...,5,No,No,External,2,3,0,1995,Liability,No
3,0,1,0,Pontiac,Urban,1,7,2,Male,Married,...,8,No,No,External,3,3,0,1994,All Perils,No
4,3,1,1,Toyota,Urban,1,1,2,Male,Married,...,6,No,No,External,2,3,0,1995,Liability,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,5,4,5,Toyota,Urban,1,8,2,Male,Married,...,5,No,No,External,1,3,0,1995,Liability,No
999996,1,2,1,Toyota,Urban,1,9,3,Male,Married,...,8,No,No,External,2,3,0,1995,All Perils,No
999997,7,5,1,Toyota,Urban,1,8,1,Male,Married,...,5,Yes,No,External,3,3,0,1994,Liability,No
999998,3,1,6,Ford,Urban,5,10,2,Male,Married,...,6,No,No,External,2,3,0,1995,Liability,No


### One Hot Encoding

In [12]:
columns_one_hot = ['Make', 'AccidentArea', 'Sex', 'MaritalStatus', 'PoliceReportFiled', 'WitnessPresent', 'AgentType', 'BasePolicy', 'Fault', 'PolicyType', 'VehicleCategory', 'FraudFound']

In [13]:
for column in columns_one_hot:
    ohe = OneHotEncoder(drop='first', sparse_output=False)
    carclaims_synthetic[column] = ohe.fit_transform(carclaims_synthetic[[column]])

In [14]:
carclaims_synthetic

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,...,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange-Claim,NumberOfCars,Year,BasePolicy,FraudFound
0,8,3,1,0.0,1.0,5,5,3,0.0,1.0,...,4,0.0,0.0,0.0,3,3,0,1995,0.0,0.0
1,4,5,5,0.0,1.0,6,3,1,1.0,1.0,...,5,0.0,0.0,0.0,3,3,0,1996,1.0,0.0
2,3,1,1,0.0,1.0,5,7,4,1.0,1.0,...,5,0.0,0.0,0.0,2,3,0,1995,0.0,0.0
3,0,1,0,0.0,1.0,1,7,2,1.0,1.0,...,8,0.0,0.0,0.0,3,3,0,1994,0.0,0.0
4,3,1,1,0.0,1.0,1,1,2,1.0,1.0,...,6,0.0,0.0,0.0,2,3,0,1995,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,5,4,5,0.0,1.0,1,8,2,1.0,1.0,...,5,0.0,0.0,0.0,1,3,0,1995,0.0,0.0
999996,1,2,1,0.0,1.0,1,9,3,1.0,1.0,...,8,0.0,0.0,0.0,2,3,0,1995,0.0,0.0
999997,7,5,1,0.0,1.0,1,8,1,1.0,1.0,...,5,1.0,0.0,0.0,3,3,0,1994,0.0,0.0
999998,3,1,6,0.0,1.0,5,10,2,1.0,1.0,...,6,0.0,0.0,0.0,2,3,0,1995,0.0,0.0


In [15]:
X = carclaims_synthetic.drop('FraudFound', axis=1)
y = carclaims_synthetic['FraudFound']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
xgboost_model = XGBClassifier(eval_metric='logloss', random_state=42)

In [17]:
 param_grid_xgb = {
    'n_estimators': [64, 128, 256, 512, 1024],
    'max_depth': [3, 5, 10, 15, 20],
    'learning_rate': [0.0005, 0.01, 0.05, 0.1, 0.3]
}

In [None]:
grid_search_xgboost = GridSearchCV(estimator=xgboost_model, param_grid=param_grid_xgb, cv=5, scoring='accuracy')
grid_search_xgboost.fit(X_train, y_train)

In [None]:
print("Best Parameters for XGBoost are {}".format(grid_search_xgboost.best_params_))

In [None]:
best_xgboost_model = grid_search_xgboost.best_estimator_
predictions = best_xgboost_model.predict(X_test)

In [None]:
print("XGBoost Accuracy is {}".format(accuracy_score(y_test, predictions)))
# print(classification_report(y_test, predictions))