## Imports

In [25]:
# from IPython.core.interactiveshell import InteractiveShell

# InteractiveShell.ast_node_interactivity = "all"

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

## Original data

In [2]:
carclaims_original = pd.read_csv('../../data/Angoss Knowledge Seeker - carclaims.txt/carclaims_original.csv')

In [3]:
carclaims_original.sample(10)

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,...,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange-Claim,NumberOfCars,Year,BasePolicy,FraudFound
5690,Apr,4,Monday,Honda,Urban,Tuesday,May,2,Male,Single,...,31 to 35,No,No,External,none,no change,1 vehicle,1994,All Perils,No
11879,Jun,1,Friday,Honda,Urban,Wednesday,Jun,2,Male,Married,...,41 to 50,No,No,External,none,no change,1 vehicle,1996,Collision,No
12868,May,3,Friday,Toyota,Urban,Friday,Jun,2,Male,Single,...,31 to 35,No,No,External,none,no change,1 vehicle,1996,Collision,No
355,Mar,2,Tuesday,Pontiac,Urban,Monday,Mar,3,Male,Married,...,41 to 50,No,No,External,none,no change,1 vehicle,1994,All Perils,No
11107,Mar,4,Monday,Mazda,Urban,Friday,Apr,2,Male,Single,...,36 to 40,No,No,External,none,no change,1 vehicle,1995,Collision,No
8551,Oct,2,Sunday,Mazda,Urban,Tuesday,Oct,2,Male,Married,...,36 to 40,No,No,External,none,no change,1 vehicle,1995,Liability,No
4469,Mar,1,Friday,Chevrolet,Urban,Tuesday,Mar,2,Male,Married,...,31 to 35,No,No,External,none,no change,3 to 4,1994,Collision,No
6434,Aug,5,Thursday,Mazda,Urban,Wednesday,Sep,1,Male,Single,...,31 to 35,No,No,External,1 to 2,no change,1 vehicle,1995,Collision,No
12497,Jul,3,Saturday,Pontiac,Urban,Monday,Jul,3,Male,Married,...,31 to 35,No,No,External,3 to 5,no change,1 vehicle,1996,Collision,No
10244,Nov,4,Saturday,Chevrolet,Urban,Monday,Nov,4,Male,Single,...,31 to 35,No,No,External,none,no change,1 vehicle,1995,All Perils,No


In [4]:
carclaims_original[carclaims_original['DayOfWeekClaimed'] == '0']

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,...,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange-Claim,NumberOfCars,Year,BasePolicy,FraudFound
1516,Jul,2,Monday,Honda,Rural,0,0,1,Male,Single,...,16 to 17,No,No,External,none,no change,1 vehicle,1994,All Perils,No


## Clean up

There is only one row with DayOfWeekClaimed and MonthClaimed as 0, for now we can drop it

In [61]:
carclaims_original.drop(carclaims_original[carclaims_original['DayOfWeekClaimed'] == '0'].index, inplace=True)

## Encoding

In [1]:
(carclaims_original.dtypes == 'object')

NameError: name 'carclaims_original' is not defined

In [63]:
print(np.sort(carclaims_original['AgeOfPolicyHolder'].unique()))

['16 to 17' '18 to 20' '21 to 25' '26 to 30' '31 to 35' '36 to 40'
 '41 to 50' '51 to 65' 'over 65']


### Label encoding

In [64]:
column_labels = {
    'Month': ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
    'DayOfWeek': ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
    'DayOfWeekClaimed': ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
    'MonthClaimed': ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
    'AgeOfPolicyHolder': np.sort(carclaims_original['AgeOfPolicyHolder'].unique()),
    'NumberOfSuppliments': ['none', '1 to 2', '3 to 5', 'more than 5'],
    'AddressChange-Claim': ['no change', 'under 6 months', '1 year', '2 to 3 years', '4 to 8 years'],
    'NumberOfCars': np.sort(carclaims_original['NumberOfCars'].unique()),
    'VehiclePrice': ['less than 20,000', '20,000 to 29,000', '30,000 to 39,000', '40,000 to 59,000', '60,000 to 69,000', 'more than 69,000'],
    'Days:Policy-Accident': ['none', '1 to 7', '15 to 30', '8 to 15', 'more than 30'],
    'Days:Policy-Claim': ['15 to 30', '8 to 15', 'more than 30'],
    'PastNumberOfClaims': ['none', '1', '2 to 4', 'more than 4'],
    'AgeOfVehicle': ['new', '2 years', '3 years', '4 years', '5 years', '6 years', '7 years', 'more than 7']
}

In [65]:
for column, labels  in column_labels.items():
    le = LabelEncoder()
    le.fit(labels)
    carclaims_original[column] = le.transform(carclaims_original[column])

In [66]:
carclaims_original

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,...,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange-Claim,NumberOfCars,Year,BasePolicy,FraudFound
0,2,5,6,Honda,Urban,5,4,1,Female,Single,...,3,No,No,External,3,0,2,1994,Liability,No
1,4,3,6,Honda,Urban,1,4,4,Male,Single,...,4,Yes,No,External,3,3,0,1994,Collision,No
2,10,5,0,Honda,Urban,4,9,2,Male,Married,...,6,No,No,External,3,3,0,1994,Collision,No
3,6,2,2,Toyota,Rural,0,5,1,Male,Married,...,7,Yes,No,External,2,3,0,1994,Liability,No
4,4,5,1,Honda,Urban,5,3,2,Female,Single,...,4,No,No,External,3,3,0,1994,Collision,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15415,9,4,0,Toyota,Urban,5,9,5,Male,Married,...,4,No,No,External,3,3,0,1996,Collision,Yes
15416,9,5,4,Pontiac,Urban,0,2,1,Male,Married,...,4,No,No,External,2,3,2,1996,Liability,No
15417,9,5,4,Toyota,Rural,0,2,1,Male,Single,...,3,No,No,External,0,3,0,1996,Collision,Yes
15418,2,1,1,Toyota,Urban,4,2,2,Female,Married,...,4,No,No,External,2,3,0,1996,All Perils,No


### One Hot Encoding

In [67]:
columns_one_hot = ['Make', 'AccidentArea', 'Sex', 'MaritalStatus', 'PoliceReportFiled', 'WitnessPresent', 'AgentType', 'BasePolicy', 'Fault', 'PolicyType', 'VehicleCategory', 'FraudFound']

In [68]:
for column in columns_one_hot:
    ohe = OneHotEncoder(drop='first', sparse_output=False)
    carclaims_original[column] = ohe.fit_transform(carclaims_original[[column]])

In [69]:
carclaims_original

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,...,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange-Claim,NumberOfCars,Year,BasePolicy,FraudFound
0,2,5,6,0.0,1.0,5,4,1,0.0,0.0,...,3,0.0,0.0,0.0,3,0,2,1994,0.0,0.0
1,4,3,6,0.0,1.0,1,4,4,1.0,0.0,...,4,1.0,0.0,0.0,3,3,0,1994,1.0,0.0
2,10,5,0,0.0,1.0,4,9,2,1.0,1.0,...,6,0.0,0.0,0.0,3,3,0,1994,1.0,0.0
3,6,2,2,0.0,0.0,0,5,1,1.0,1.0,...,7,1.0,0.0,0.0,2,3,0,1994,0.0,0.0
4,4,5,1,0.0,1.0,5,3,2,0.0,0.0,...,4,0.0,0.0,0.0,3,3,0,1994,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15415,9,4,0,0.0,1.0,5,9,5,1.0,1.0,...,4,0.0,0.0,0.0,3,3,0,1996,1.0,1.0
15416,9,5,4,0.0,1.0,0,2,1,1.0,1.0,...,4,0.0,0.0,0.0,2,3,2,1996,0.0,0.0
15417,9,5,4,0.0,0.0,0,2,1,1.0,0.0,...,3,0.0,0.0,0.0,0,3,0,1996,1.0,1.0
15418,2,1,1,0.0,1.0,4,2,2,0.0,1.0,...,4,0.0,0.0,0.0,2,3,0,1996,0.0,0.0


In [70]:
X = carclaims_original.drop('FraudFound', axis=1)
y = carclaims_original['FraudFound']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [71]:
xgboost_model = XGBClassifier(eval_metric='logloss', random_state=42)

In [72]:
 param_grid_xgb = {
    'n_estimators': [51, 101, 201],
    'max_depth': [3, 5, 10, 20],
    'learning_rate': [0.0005, 0.01, 0.05, 0.1, 0.3],
}

In [73]:
grid_search_xgboost = GridSearchCV(estimator=xgboost_model, param_grid=param_grid_xgb, cv=5, scoring='accuracy')
grid_search_xgboost.fit(X_train, y_train)

In [76]:
print("Best Parameters for XGBoost are {}".format(grid_search_xgboost.best_params_))

Best Parameters for XGBoost are {'learning_rate': 0.3, 'max_depth': 5, 'n_estimators': 201}


In [78]:
best_xgboost_model = grid_search_xgboost.best_estimator_
predictions = best_xgboost_model.predict(X_test)

In [80]:
print("XGBoost Accuracy is {}".format(accuracy_score(y_test, predictions)))
# print(classification_report(y_test, predictions))

XGBoost Accuracy is 0.9507133592736705
