In [1]:
from preprocessing import preprocess
import pandas as pd
import sklearn as sk
from sklearn.linear_model import LogisticRegression

In [2]:
train_file = "data/FRISS_ClaimHistory_training.csv"
test_file = "data/FRISS_ClaimHistory_test.csv"
fraud_file = "data/fraud_cases.csv"

In [3]:
train, test = preprocess(train_file, test_file, fraud_file)

Finished preprocessing.
Dropped 182 Train entries.
Dropped 43 Test entries.
Preprocessing took 1.13 seconds


In [17]:
# Balance train
fraud_entries = train[train["sys_fraud"] == 1]
non_fraud_entries = train[train["sys_fraud"] == 0].sample(len(fraud_entries))
train = fraud_entries.append(non_fraud_entries).sample(frac=1.0)

In [18]:
trainX = train.drop(columns=["sys_fraud"])
trainY = train["sys_fraud"]

testX = test.drop(columns=["sys_fraud"])
testY = test["sys_fraud"]

In [19]:
# Logistic Regression

LR = LogisticRegression(random_state=0, solver='lbfgs', multi_class='ovr').fit(trainX, trainY)

preds = LR.predict(trainX)
print("LR train metrics")
print(f"accuracy:{sk.metrics.accuracy_score(trainY, preds)}")
print(f"recall:{sk.metrics.recall_score(trainY, preds)}")
print(f"precision:{sk.metrics.precision_score(trainY, preds)}")
print(f"f1:{sk.metrics.f1_score(trainY, preds)}")
print()
preds = LR.predict(testX)
print("LR test metrics")
print(f"accuracy:{sk.metrics.accuracy_score(testY, preds)}")
print(f"recall:{sk.metrics.recall_score(testY, preds)}")
print(f"precision:{sk.metrics.precision_score(testY, preds)}")
print(f"f1:{sk.metrics.f1_score(testY, preds)}")

LR train metrics
accuracy:0.5
recall:1.0
precision:0.5
f1:0.6666666666666666

LR test metrics
accuracy:0.006814651500726562
recall:1.0
precision:0.006814651500726562
f1:0.013537052704922113


In [20]:
# Support vector machines

SVM = sk.svm.LinearSVC()
SVM.fit(trainX, trainY)

preds = SVM.predict(trainX)
print("LR train metrics")
print(f"accuracy:{sk.metrics.accuracy_score(trainY, preds)}")
print(f"recall:{sk.metrics.recall_score(trainY, preds)}")
print(f"precision:{sk.metrics.precision_score(trainY, preds)}")
print(f"f1:{sk.metrics.f1_score(trainY, preds)}")
print()
preds = SVM.predict(testX)
print("LR test metrics")
print(f"accuracy:{sk.metrics.accuracy_score(testY, preds)}")
print(f"recall:{sk.metrics.recall_score(testY, preds)}")
print(f"precision:{sk.metrics.precision_score(testY, preds)}")
print(f"f1:{sk.metrics.f1_score(testY, preds)}")

LR train metrics
accuracy:0.5
recall:0.0
precision:0.0
f1:0.0

LR test metrics
accuracy:0.9931853484992734
recall:0.0
precision:0.0
f1:0.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
# Random forests
from sklearn.ensemble import RandomForestClassifier

RF = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
RF.fit(trainX, trainY)

preds = RF.predict(trainX)
print("RF train metrics")
print(f"accuracy:{sk.metrics.accuracy_score(trainY, preds)}")
print(f"recall:{sk.metrics.recall_score(trainY, preds)}")
print(f"precision:{sk.metrics.precision_score(trainY, preds)}")
print(f"f1:{sk.metrics.f1_score(trainY, preds)}")
print()
preds = RF.predict(testX)
print("RF test metrics")
print(f"accuracy:{sk.metrics.accuracy_score(testY, preds)}")
print(f"recall:{sk.metrics.recall_score(testY, preds)}")
print(f"precision:{sk.metrics.precision_score(testY, preds)}")
print(f"f1:{sk.metrics.f1_score(testY, preds)}")

RF train metrics
accuracy:0.6363636363636364
recall:0.5227272727272727
precision:0.6764705882352942
f1:0.5897435897435898

RF test metrics
accuracy:0.4748709725910708
recall:0.6470588235294118
precision:0.008365019011406844
f1:0.016516516516516516


In [22]:
# Multi level perceptron
from sklearn.neural_network import MLPClassifier

NN = MLPClassifier(solver='lbfgs', alpha=1e-4, hidden_layer_sizes=(16, 8, 2), random_state=1, max_iter=10000)
NN.fit(trainX, trainY)

preds = NN.predict(trainX)
print("NN train metrics")
print(f"accuracy:{sk.metrics.accuracy_score(trainY, preds)}")
print(f"recall:{sk.metrics.recall_score(trainY, preds)}")
print(f"precision:{sk.metrics.precision_score(trainY, preds)}")
print(f"f1:{sk.metrics.f1_score(trainY, preds)}")
print()
preds = NN.predict(testX)
print("NN test metrics")
print(f"accuracy:{sk.metrics.accuracy_score(testY, preds)}")
print(f"recall:{sk.metrics.recall_score(testY, preds)}")
print(f"precision:{sk.metrics.precision_score(testY, preds)}")
print(f"f1:{sk.metrics.f1_score(testY, preds)}")

NN train metrics
accuracy:0.5
recall:1.0
precision:0.5
f1:0.6666666666666666

NN test metrics
accuracy:0.006814651500726562
recall:1.0
precision:0.006814651500726562
f1:0.013537052704922113


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [23]:
preds

array([1., 1., 1., ..., 1., 1., 1.])