In [1]:
from preprocessing import preprocess
import pandas as pd
import sklearn as sk
from sklearn.linear_model import LogisticRegression

In [2]:
train_file = "data/FRISS_ClaimHistory_training.csv"
test_file = "data/FRISS_ClaimHistory_test.csv"
fraud_file = "data/fraud_cases.csv"

In [22]:
train, test = preprocess(train_file, test_file, fraud_file)

Finished preprocessing.
Dropped 182 Train entries.
Dropped 43 Test entries.
Preprocessing took 1.20 seconds


In [31]:
# Balance train
fraud_entries = train[train["sys_fraud"] == 1]
non_fraud_entries = train[train["sys_fraud"] == 0].sample(len(fraud_entries))
train = fraud_entries.append(non_fraud_entries).sample(frac=1.0)

In [23]:
drop_cols = ['sys_claimid', 'claim_amount_claimed_total', 'occurred_year', 'occurred_month', 'occurred_day', 'reported_year', 'reported_month', 'reported_day']
test = test.drop(columns=drop_cols)
train = train.drop(columns=drop_cols)

In [24]:
train.columns

Index(['claim_location_urban_area', 'object_year_construction',
       'policy_fleet_flag', 'policy_insured_amount', 'policy_profitability',
       'sys_fraud', 'claim_time_interval', 'cause_Animals', 'cause_Collision',
       'cause_Other', 'cause_Theft', 'cause_Weather', 'make_AUDI', 'make_BMW',
       'make_CITROEN', 'make_OPEL', 'make_OTHER', 'make_RENAULT',
       'make_VOLKSWAGEN'],
      dtype='object')

In [32]:
trainX = train.drop(columns=["sys_fraud"])
trainY = train["sys_fraud"]

testX = test.drop(columns=["sys_fraud"])
testY = test["sys_fraud"]

In [33]:
# Logistic Regression

LR = LogisticRegression(random_state=0, solver='lbfgs', multi_class='ovr').fit(trainX, trainY)

preds = LR.predict(trainX)
print("LR train metrics")
print(f"accuracy:{sk.metrics.accuracy_score(trainY, preds)}")
print(f"recall:{sk.metrics.recall_score(trainY, preds)}")
print(f"precision:{sk.metrics.precision_score(trainY, preds)}")
print(f"f1:{sk.metrics.f1_score(trainY, preds)}")
print()
preds = LR.predict(testX)
print("LR test metrics")
print(f"accuracy:{sk.metrics.accuracy_score(testY, preds)}")
print(f"recall:{sk.metrics.recall_score(testY, preds)}")
print(f"precision:{sk.metrics.precision_score(testY, preds)}")
print(f"f1:{sk.metrics.f1_score(testY, preds)}")

LR train metrics
accuracy:0.5925324675324676
recall:0.36363636363636365
precision:0.6706586826347305
f1:0.471578947368421

LR test metrics
accuracy:0.5056872275392092
recall:0.45588235294117646
precision:0.0062924997462701715
f1:0.012413655020522574


In [34]:
# Support vector machines

SVM = sk.svm.LinearSVC()
SVM.fit(trainX, trainY)

preds = SVM.predict(trainX)
print("LR train metrics")
print(f"accuracy:{sk.metrics.accuracy_score(trainY, preds)}")
print(f"recall:{sk.metrics.recall_score(trainY, preds)}")
print(f"precision:{sk.metrics.precision_score(trainY, preds)}")
print(f"f1:{sk.metrics.f1_score(trainY, preds)}")
print()
preds = SVM.predict(testX)
print("LR test metrics")
print(f"accuracy:{sk.metrics.accuracy_score(testY, preds)}")
print(f"recall:{sk.metrics.recall_score(testY, preds)}")
print(f"precision:{sk.metrics.precision_score(testY, preds)}")
print(f"f1:{sk.metrics.f1_score(testY, preds)}")

LR train metrics
accuracy:0.5
recall:0.0
precision:0.0
f1:0.0

LR test metrics
accuracy:0.829232850628852
recall:0.15441176470588236
precision:0.006336753168376584
f1:0.012173913043478259


  _warn_prf(average, modifier, msg_start, len(result))


In [39]:
# Random forests
from sklearn.ensemble import RandomForestClassifier

RF = RandomForestClassifier(n_estimators=1000, max_depth=6, random_state=0)
RF.fit(trainX, trainY)

preds = RF.predict(trainX)
print("RF train metrics")
print(f"accuracy:{sk.metrics.accuracy_score(trainY, preds)}")
print(f"recall:{sk.metrics.recall_score(trainY, preds)}")
print(f"precision:{sk.metrics.precision_score(trainY, preds)}")
print(f"f1:{sk.metrics.f1_score(trainY, preds)}")
print()
preds = RF.predict(testX)
print("RF test metrics")
print(f"accuracy:{sk.metrics.accuracy_score(testY, preds)}")
print(f"recall:{sk.metrics.recall_score(testY, preds)}")
print(f"precision:{sk.metrics.precision_score(testY, preds)}")
print(f"f1:{sk.metrics.f1_score(testY, preds)}")

RF train metrics
accuracy:0.7077922077922078
recall:0.577922077922078
precision:0.7807017543859649
f1:0.664179104477612

RF test metrics
accuracy:0.431177030615824
recall:0.6470588235294118
precision:0.007724719101123595
f1:0.015267175572519083


In [36]:
# Multi level perceptron
from sklearn.neural_network import MLPClassifier

NN = MLPClassifier(solver='lbfgs', alpha=1e-4, hidden_layer_sizes=(16, 8, 2), random_state=1, max_iter=10000)
NN.fit(trainX, trainY)

preds = NN.predict(trainX)
print("NN train metrics")
print(f"accuracy:{sk.metrics.accuracy_score(trainY, preds)}")
print(f"recall:{sk.metrics.recall_score(trainY, preds)}")
print(f"precision:{sk.metrics.precision_score(trainY, preds)}")
print(f"f1:{sk.metrics.f1_score(trainY, preds)}")
print()
preds = NN.predict(testX)
print("NN test metrics")
print(f"accuracy:{sk.metrics.accuracy_score(testY, preds)}")
print(f"recall:{sk.metrics.recall_score(testY, preds)}")
print(f"precision:{sk.metrics.precision_score(testY, preds)}")
print(f"f1:{sk.metrics.f1_score(testY, preds)}")

NN train metrics
accuracy:0.5
recall:1.0
precision:0.5
f1:0.6666666666666666

NN test metrics
accuracy:0.006914866963972541
recall:1.0
precision:0.00681533450263092
f1:0.013538400278731771


In [37]:
preds

array([1., 1., 1., ..., 1., 1., 1.])