## Imports

In [1]:
# from IPython.core.interactiveshell import InteractiveShell

# InteractiveShell.ast_node_interactivity = "all"

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt

## Original data

In [3]:
carclaims_original = pd.read_csv('../../data/Angoss Knowledge Seeker - carclaims.txt/carclaims_original.csv')

In [4]:
carclaims_original.sample(10)

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,...,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange-Claim,NumberOfCars,Year,BasePolicy,FraudFound
5764,Sep,3,Friday,Toyota,Urban,Monday,Sep,3,Male,Married,...,41 to 50,No,No,External,more than 5,no change,1 vehicle,1994,All Perils,No
8950,Jul,1,Monday,Toyota,Urban,Thursday,Jun,5,Male,Married,...,36 to 40,No,No,External,1 to 2,no change,1 vehicle,1995,Collision,No
13874,Jul,5,Monday,Honda,Urban,Wednesday,Aug,1,Male,Single,...,31 to 35,No,No,External,none,no change,1 vehicle,1996,All Perils,No
4153,Mar,4,Friday,Honda,Urban,Monday,Mar,4,Female,Married,...,36 to 40,No,No,External,none,no change,1 vehicle,1994,Collision,No
6107,Jun,2,Monday,Chevrolet,Urban,Thursday,Jun,3,Female,Single,...,31 to 35,No,No,External,more than 5,no change,1 vehicle,1994,Collision,No
7465,Oct,4,Monday,Honda,Urban,Monday,Oct,4,Male,Married,...,31 to 35,No,No,External,none,no change,1 vehicle,1995,Collision,Yes
13912,Nov,1,Friday,Mazda,Urban,Monday,Nov,2,Male,Married,...,36 to 40,No,No,External,more than 5,no change,1 vehicle,1996,All Perils,No
9176,Jun,3,Friday,Toyota,Urban,Tuesday,Jun,4,Male,Married,...,over 65,No,No,External,none,no change,1 vehicle,1995,All Perils,No
12435,Feb,1,Tuesday,Pontiac,Urban,Tuesday,Feb,1,Male,Single,...,41 to 50,Yes,Yes,External,none,no change,1 vehicle,1996,All Perils,No
533,Jan,2,Monday,Pontiac,Urban,Friday,Jan,3,Male,Married,...,31 to 35,No,No,External,1 to 2,no change,3 to 4,1994,Collision,Yes


## Clean up

There is only one row with DayOfWeekClaimed and MonthClaimed as 0, for now we can drop it

In [5]:
carclaims_original = carclaims_original.drop(carclaims_original[carclaims_original['DayOfWeekClaimed'] == '0'].index).reset_index()
# carclaims_original.drop(carclaims_original[carclaims_original['Age'] == '0'].index, inplace=True)

In [6]:
carclaims_original[[]] == 0

0
1
2
3
4
...
15414
15415
15416
15417
15418


## Encoding

In [7]:
carclaims_original.dtypes

index                    int64
Month                   object
WeekOfMonth              int64
DayOfWeek               object
Make                    object
AccidentArea            object
DayOfWeekClaimed        object
MonthClaimed            object
WeekOfMonthClaimed       int64
Sex                     object
MaritalStatus           object
Age                      int64
Fault                   object
PolicyType              object
VehicleCategory         object
VehiclePrice            object
PolicyNumber             int64
RepNumber                int64
Deductible               int64
DriverRating             int64
Days:Policy-Accident    object
Days:Policy-Claim       object
PastNumberOfClaims      object
AgeOfVehicle            object
AgeOfPolicyHolder       object
PoliceReportFiled       object
WitnessPresent          object
AgentType               object
NumberOfSuppliments     object
AddressChange-Claim     object
NumberOfCars            object
Year                     int64
BasePoli

In [8]:
print(np.sort(carclaims_original['AgeOfPolicyHolder'].unique()))

['16 to 17' '18 to 20' '21 to 25' '26 to 30' '31 to 35' '36 to 40'
 '41 to 50' '51 to 65' 'over 65']


### Label encoding

In [9]:
column_labels = {
    'Month': ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
    'DayOfWeek': ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
    'DayOfWeekClaimed': ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
    'MonthClaimed': ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
    'AgeOfPolicyHolder': np.sort(carclaims_original['AgeOfPolicyHolder'].unique()),
    'NumberOfSuppliments': ['none', '1 to 2', '3 to 5', 'more than 5'],
    'AddressChange-Claim': ['no change', 'under 6 months', '1 year', '2 to 3 years', '4 to 8 years'],
    'NumberOfCars': np.sort(carclaims_original['NumberOfCars'].unique()),
    'VehiclePrice': ['less than 20,000', '20,000 to 29,000', '30,000 to 39,000', '40,000 to 59,000', '60,000 to 69,000', 'more than 69,000'],
    'Days:Policy-Accident': ['none', '1 to 7', '15 to 30', '8 to 15', 'more than 30'],
    'Days:Policy-Claim': ['15 to 30', '8 to 15', 'more than 30'],
    'PastNumberOfClaims': ['none', '1', '2 to 4', 'more than 4'],
    'AgeOfVehicle': ['new', '2 years', '3 years', '4 years', '5 years', '6 years', '7 years', 'more than 7'],
    'Make': np.sort(carclaims_original['Make'].unique())
}

In [10]:
for column, labels  in column_labels.items():
    le = LabelEncoder()
    le.fit(labels)
    carclaims_original[column] = le.transform(carclaims_original[column])

In [11]:
carclaims_original

Unnamed: 0,index,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,...,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange-Claim,NumberOfCars,Year,BasePolicy,FraudFound
0,0,2,5,6,6,Urban,5,4,1,Female,...,3,No,No,External,3,0,2,1994,Liability,No
1,1,4,3,6,6,Urban,1,4,4,Male,...,4,Yes,No,External,3,3,0,1994,Collision,No
2,2,10,5,0,6,Urban,4,9,2,Male,...,6,No,No,External,3,3,0,1994,Collision,No
3,3,6,2,2,17,Rural,0,5,1,Male,...,7,Yes,No,External,2,3,0,1994,Liability,No
4,4,4,5,1,6,Urban,5,3,2,Female,...,4,No,No,External,3,3,0,1994,Collision,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15414,15415,9,4,0,17,Urban,5,9,5,Male,...,4,No,No,External,3,3,0,1996,Collision,Yes
15415,15416,9,5,4,13,Urban,0,2,1,Male,...,4,No,No,External,2,3,2,1996,Liability,No
15416,15417,9,5,4,17,Rural,0,2,1,Male,...,3,No,No,External,0,3,0,1996,Collision,Yes
15417,15418,2,1,1,17,Urban,4,2,2,Female,...,4,No,No,External,2,3,0,1996,All Perils,No


### One Hot Encoding

In [12]:
columns_one_hot = ['AccidentArea', 'Sex', 'MaritalStatus', 'PoliceReportFiled', 'WitnessPresent', 'AgentType', 'BasePolicy', 'Fault', 'PolicyType', 'VehicleCategory']

In [13]:
ohe = OneHotEncoder(sparse_output=False, drop='first')
encoded_nominal = ohe.fit_transform(carclaims_original[columns_one_hot])
carclaims_original.drop(columns=columns_one_hot, axis=1, inplace=True)
carclaims_original = pd.concat([carclaims_original, pd.DataFrame(encoded_nominal, columns=ohe.get_feature_names_out(columns_one_hot))], axis=1)

In [14]:
X = carclaims_original.drop('FraudFound', axis=1)
y = carclaims_original['FraudFound']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
X

Unnamed: 0,index,Month,WeekOfMonth,DayOfWeek,Make,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Age,VehiclePrice,...,PolicyType_Sedan - Collision,PolicyType_Sedan - Liability,PolicyType_Sport - All Perils,PolicyType_Sport - Collision,PolicyType_Sport - Liability,PolicyType_Utility - All Perils,PolicyType_Utility - Collision,PolicyType_Utility - Liability,VehicleCategory_Sport,VehicleCategory_Utility
0,0,2,5,6,6,5,4,1,21,5,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,1,4,3,6,6,1,4,4,34,5,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,2,10,5,0,6,4,9,2,47,5,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,3,6,2,2,17,0,5,1,65,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,4,4,5,1,6,5,3,2,27,5,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15414,15415,9,4,0,17,5,9,5,35,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15415,15416,9,5,4,13,0,2,1,30,1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
15416,15417,9,5,4,17,0,2,1,24,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15417,15418,2,1,1,17,4,2,2,34,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [17]:
sc = dbscan(eps=0.5, min_samples=88)

TypeError: missing a required argument: 'X'