In [109]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [110]:
data = pd.read_csv('../../data/vehicle_claims_labeled.csv')

In [111]:
numerical_cols = ['Price', 'Seat_num', 'Door_num', 'repair_cost', 'repair_hours']

In [112]:
som_train_data = data[numerical_cols]

In [113]:
from sklearn.preprocessing import MinMaxScaler
from minisom import MiniSom

In [114]:
som_train_data

Unnamed: 0,Price,Seat_num,Door_num,repair_cost,repair_hours
0,21500.0,5.0,4.0,395.0000,9.0
1,28750.0,5.0,4.0,695.0000,6.0
2,29999.0,5.0,4.0,89.9990,3.0
3,34948.0,5.0,4.0,224.8440,6.0
4,26555.0,5.0,4.0,75.9330,3.0
...,...,...,...,...,...
268250,8750.0,2.0,0.0,207.5000,6.0
268251,7995.0,11.0,0.0,127.9950,6.0
268252,27950.0,2.0,0.0,56.7700,2.0
268253,34950.0,2.0,0.0,60.9700,2.0


In [115]:
train = som_train_data[:200000]
test = som_train_data[200000:]

In [116]:
train_y = data['Label'][:200000]
test_y = data['Label'][200000:]

**Isolation Forest**

In [117]:
from sklearn.ensemble import IsolationForest

In [118]:
n_estimators = 400

In [119]:
clf = IsolationForest(random_state=0, n_estimators=n_estimators).fit(train)

In [120]:
result = clf.predict(test)

In [121]:
result = pd.DataFrame(result)
result.replace(to_replace=1, value=0, inplace=True)
result.replace(to_replace=-1, value=1, inplace=True)

In [122]:
from sklearn.metrics import precision_score, recall_score, f1_score

In [123]:
precision = precision_score(test_y, result, average='binary')
recall = recall_score(test_y, result, average='binary')
f1 = f1_score(test_y, result, average='binary')

In [124]:
precision, recall, f1

(0.917679431323091, 0.2714140386571719, 0.4189259918350256)

In [149]:
log_train = np.log(train)
log_label = train_y
log_test = np.log(test)
log_test_label = test_y

In [150]:
log_train.replace([np.inf, -np.inf], np.nan, inplace=True)
log_test.replace([np.inf, -np.inf], np.nan, inplace=True)

In [151]:
log_train.isna().sum()

Price           1010
Seat_num           0
Door_num        3322
repair_cost     1127
repair_hours      54
dtype: int64

In [152]:
idx = np.unique(np.where(log_train.isna()))

In [153]:
idx_test = np.unique(np.where(log_test.isna()))

In [154]:
log_test.reset_index(inplace=True)

In [155]:
log_test.drop(idx_test, axis=0, inplace=True)
log_test_label.reset_index().drop(idx_test, axis=0, inplace=True)

In [156]:
log_train.drop(idx, axis=0, inplace=True)
log_label.drop(idx, axis=0, inplace=True)

KeyError: '[     0      2      3 ... 199996 199997 199999] not found in axis'

In [133]:
clf_log = IsolationForest(random_state=0, n_estimators=n_estimators).fit(log_train)

In [157]:
log_label

1         0
5         0
6         0
7         0
8         0
         ..
199989    1
199990    0
199991    0
199994    0
199998    0
Name: Label, Length: 195611, dtype: int64

In [135]:
result_log = clf_log.predict(log_test)

Feature names unseen at fit time:
- index
Feature names must be in the same order as they were in fit.



ValueError: X has 6 features, but IsolationForest is expecting 5 features as input.

**One Class SVM**

In [17]:
from sklearn.svm import OneClassSVM

In [18]:
clf = OneClassSVM(kernel='rbf', gamma='auto', verbose=True).fit(train)

[LibSVM]

In [19]:
result = clf.predict(test)
result = pd.DataFrame(result)
result.replace(to_replace=1, value=0, inplace=True)
result.replace(to_replace=-1, value=1, inplace=True)

In [21]:
precision = precision_score(test_y, result, average='binary')
recall = recall_score(test_y, result, average='binary')
f1 = f1_score(test_y, result, average='binary')

In [22]:
precision, recall, f1

(0.2929358392741413, 0.8583248558833503, 0.4367978739926488)

**Local Outlier factor**

In [38]:
from sklearn.neighbors import LocalOutlierFactor

In [44]:
clf = LocalOutlierFactor(n_neighbors=100)

In [45]:
result = clf.fit_predict(som_train_data)
result = pd.DataFrame(result)
result.replace(to_replace=1, value=0, inplace=True)
result.replace(to_replace=-1, value=1, inplace=True)

In [46]:
y = data['Label']

In [47]:
precision = precision_score(y, result, average='binary')
recall = recall_score(y, result, average='binary')
f1 = f1_score(y, result, average='binary')

In [48]:
precision, recall, f1

(0.31482977901134807, 0.157905866182664, 0.21032248979017037)

**Gradient Boosting**

In [23]:
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
            max_depth=1, random_state=0).fit(train, train_y)
result = clf.predict(test)
result = pd.DataFrame(result)
precision = precision_score(test_y, result, average='binary')
recall = recall_score(test_y, result, average='binary')
f1 = f1_score(test_y, result, average='binary')
precision, recall, f1

(0.9915380047505938, 0.45296710749406577, 0.6218518690936176)

**Random forest**

In [24]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(train, train_y)
result = clf.predict(test)
result = pd.DataFrame(result)
precision = precision_score(test_y, result, average='binary')
recall = recall_score(test_y, result, average='binary')
f1 = f1_score(test_y, result, average='binary')
precision, recall, f1

(1.0, 0.3813496100373008, 0.5521406127258445)