# Fraud Detection with Graph databases and Machine Learning

## Importing the required Python libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
# pip install imblearn
from imblearn.over_sampling import SMOTE 
from collections import Counter #for Smote

## Loading and exploring the banksim dataset 

In [2]:
banksim_df = pd.read_csv("../data/bs140513_032310.csv")
banksim_df.head(10)

Unnamed: 0,step,customer,age,gender,zipcodeOri,merchant,zipMerchant,category,amount,fraud
0,0,'C1093826151','4','M','28007','M348934600','28007','es_transportation',4.55,0
1,0,'C352968107','2','M','28007','M348934600','28007','es_transportation',39.68,0
2,0,'C2054744914','4','F','28007','M1823072687','28007','es_transportation',26.89,0
3,0,'C1760612790','3','M','28007','M348934600','28007','es_transportation',17.25,0
4,0,'C757503768','5','M','28007','M348934600','28007','es_transportation',35.72,0
5,0,'C1315400589','3','F','28007','M348934600','28007','es_transportation',25.81,0
6,0,'C765155274','1','F','28007','M348934600','28007','es_transportation',9.1,0
7,0,'C202531238','4','F','28007','M348934600','28007','es_transportation',21.17,0
8,0,'C105845174','3','M','28007','M348934600','28007','es_transportation',32.4,0
9,0,'C39858251','5','F','28007','M348934600','28007','es_transportation',35.4,0


In [3]:
banksim_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 594643 entries, 0 to 594642
Data columns (total 10 columns):
step           594643 non-null int64
customer       594643 non-null object
age            594643 non-null object
gender         594643 non-null object
zipcodeOri     594643 non-null object
merchant       594643 non-null object
zipMerchant    594643 non-null object
category       594643 non-null object
amount         594643 non-null float64
fraud          594643 non-null int64
dtypes: float64(1), int64(2), object(7)
memory usage: 45.4+ MB


Viewing the split of the output classes - fraudulent and genuine transactions

In [4]:
banksim_df['fraud'].value_counts()

0    587443
1      7200
Name: fraud, dtype: int64

In [5]:
# Number of unique values per column in the banksim dataset
banksim_df.nunique()

step             180
customer        4112
age                8
gender             4
zipcodeOri         1
merchant          50
zipMerchant        1
category          15
amount         23767
fraud              2
dtype: int64

## Preprocessing the data 

In [6]:
# Obtaining the number of null values in each column
banksim_df.isna().sum()

step           0
customer       0
age            0
gender         0
zipcodeOri     0
merchant       0
zipMerchant    0
category       0
amount         0
fraud          0
dtype: int64

Looks like there are no null values in the dataset.

In [7]:
# Retrieving the class attribute from the dataframe
Y_before_smote = banksim_df['fraud']
Y_before_smote.head()

0    0
1    0
2    0
3    0
4    0
Name: fraud, dtype: int64

In [8]:
'''
Removing unwanted columns
Since zipcodeOri and zipMerchant have the same value for all the rows, these columns are redundant
'''

feature_df = banksim_df.drop(['step', 'customer', 'zipcodeOri', 'zipMerchant', 'fraud'], axis=1)

In [9]:
feature_df.head()

Unnamed: 0,age,gender,merchant,category,amount
0,'4','M','M348934600','es_transportation',4.55
1,'2','M','M348934600','es_transportation',39.68
2,'4','F','M1823072687','es_transportation',26.89
3,'3','M','M348934600','es_transportation',17.25
4,'5','M','M348934600','es_transportation',35.72


In [10]:
# One hot encoding the categorical variables
feature_df = pd.get_dummies(feature_df, columns=['age', 'gender', 'category', 'merchant'])
feature_df.head()

Unnamed: 0,amount,age_'0',age_'1',age_'2',age_'3',age_'4',age_'5',age_'6',age_'U',gender_'E',...,merchant_'M732195782',merchant_'M78078399',merchant_'M840466850',merchant_'M855959430',merchant_'M857378720',merchant_'M85975013',merchant_'M923029380',merchant_'M933210764',merchant_'M97925176',merchant_'M980657600'
0,4.55,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,39.68,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,26.89,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,17.25,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,35.72,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
print('Original dataset shape %s' % Counter(Y_before_smote))

X_before_smote = feature_df
sm = SMOTE(random_state=1)
# Applying smote to dataset, result is nparray
X_after_smote, Y_after_smote = sm.fit_resample(X_before_smote, Y_before_smote)
                                               
# Converting back tp data frame from nparray
X_after_smote_df = pd.DataFrame(X_after_smote, columns=X_before_smote.columns)
Y_after_smote_df = pd.DataFrame(Y_after_smote, columns = ["fraud"])
                                               
print('dataset shape after smote %s' % Counter(Y_after_smote))

Original dataset shape Counter({0: 587443, 1: 7200})
dataset shape after smote Counter({0: 587443, 1: 587443})


In [12]:
# Standardizing the features
standard_scaler = StandardScaler()
scaled_df = pd.DataFrame(standard_scaler.fit_transform(X_after_smote_df), columns = X_after_smote_df.columns)

scaled_df.head()

Unnamed: 0,amount,age_'0',age_'1',age_'2',age_'3',age_'4',age_'5',age_'6',age_'U',gender_'E',...,merchant_'M732195782',merchant_'M78078399',merchant_'M840466850',merchant_'M855959430',merchant_'M857378720',merchant_'M85975013',merchant_'M923029380',merchant_'M933210764',merchant_'M97925176',merchant_'M980657600'
0,-0.431067,-0.046381,-0.256201,-0.589301,-0.479974,2.53867,-0.264541,-0.158117,-0.031681,-0.031681,...,-0.155582,-0.038942,-0.052562,-0.119198,-0.025003,-0.151185,-0.023052,-0.007664,-0.022585,-0.296406
1,-0.376545,-0.046381,-0.256201,1.696927,-0.479974,-0.393907,-0.264541,-0.158117,-0.031681,-0.031681,...,-0.155582,-0.038942,-0.052562,-0.119198,-0.025003,-0.151185,-0.023052,-0.007664,-0.022585,-0.296406
2,-0.396395,-0.046381,-0.256201,-0.589301,-0.479974,2.53867,-0.264541,-0.158117,-0.031681,-0.031681,...,-0.155582,-0.038942,-0.052562,-0.119198,-0.025003,-0.151185,-0.023052,-0.007664,-0.022585,-0.296406
3,-0.411357,-0.046381,-0.256201,-0.589301,2.083448,-0.393907,-0.264541,-0.158117,-0.031681,-0.031681,...,-0.155582,-0.038942,-0.052562,-0.119198,-0.025003,-0.151185,-0.023052,-0.007664,-0.022585,-0.296406
4,-0.382691,-0.046381,-0.256201,-0.589301,-0.479974,-0.393907,3.780137,-0.158117,-0.031681,-0.031681,...,-0.155582,-0.038942,-0.052562,-0.119198,-0.025003,-0.151185,-0.023052,-0.007664,-0.022585,-0.296406


## Training supervised learning models using intrinsic features from the dataset

In [13]:
k_fold = KFold(n_splits=5, random_state=None, shuffle=False)


random_forest = RandomForestClassifier(max_depth=4, n_estimators=150)
svm = SVC(gamma="auto")
logistic_regression = LogisticRegression(solver='lbfgs')
scaled_df = scaled_df.values
Y_after_smote_df = Y_after_smote_df.values

In [14]:
# Random Forest Classifier

for train_index, test_index in k_fold.split(scaled_df):
    X_train, X_test = scaled_df[train_index], scaled_df[test_index]
    y_train, y_test = Y_after_smote_df[train_index], Y_after_smote_df[test_index]
    
    clf = random_forest.fit(X_train, y_train)

    predictions = clf.predict(X_test)
    accuracy = accuracy_score(y_test, predictions) * 100

    metrics = precision_recall_fscore_support(y_test, predictions, average='macro')
    precision, recall = metrics[0], metrics[1]
    f1_score = 2 * (precision * recall) / (precision + recall)

    
    print("Accuracy = {0:.4f}%".format(accuracy))
    print("Precision = {0:.4f}".format(precision))
    print("Recall = {0:.4f}".format(recall))
    print("F1 Score = {0:.4f}".format(f1_score))
    print("---------------------------------------------")

  import sys


Accuracy = 89.4475%
Precision = 0.5569
Recall = 0.9465
F1 Score = 0.7012
---------------------------------------------


  import sys


Accuracy = 91.1834%
Precision = 0.5573
Recall = 0.9550
F1 Score = 0.7039
---------------------------------------------


  import sys


Accuracy = 96.0290%
Precision = 0.9614
Recall = 0.9622
F1 Score = 0.9618
---------------------------------------------


  import sys
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy = 93.1206%
Precision = 0.5000
Recall = 0.4656
F1 Score = 0.4822
---------------------------------------------


  import sys


Accuracy = 92.8402%
Precision = 0.5000
Recall = 0.4642
F1 Score = 0.4814
---------------------------------------------


  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
# SVM Classifier

# for train_index, test_index in k_fold.split(scaled_df):
#     X_train, X_test = scaled_df[train_index], scaled_df[test_index]
#     y_train, y_test = Y_after_smote_df[train_index], Y_after_smote_df[test_index]
    
#     clf = svm.fit(X_train, y_train)
#     predictions = clf.predict(X_test)
#     accuracy = accuracy_score(y_test, predictions) * 100

#     metrics = precision_recall_fscore_support(y_test, predictions, average='macro')
#     precision, recall = metrics[0], metrics[1]
#     f1_score = 2 * (precision * recall) / (precision + recall)

    
#     print("Accuracy = {0:.4f}%".format(accuracy))
#     print("Precision = {0:.4f}".format(precision))
#     print("Recall = {0:.4f}".format(recall))
#     print("F1 Score = {0:.4f}".format(f1_score))
#     print("---------------------------------------------")

In [16]:
# Logistic Regression Classifier
for train_index, test_index in k_fold.split(scaled_df):
    X_train, X_test = scaled_df[train_index], scaled_df[test_index]
    y_train, y_test = Y_after_smote_df[train_index], Y_after_smote_df[test_index]
    
    clf = logistic_regression.fit(X_train, y_train)

    predictions = clf.predict(X_test)
    accuracy = accuracy_score(y_test, predictions) * 100

    metrics = precision_recall_fscore_support(y_test, predictions, average='macro')
    precision, recall = metrics[0], metrics[1]
    f1_score = 2 * (precision * recall) / (precision + recall)

    
    print("Accuracy = {0:.4f}%".format(accuracy))
    print("Precision = {0:.4f}".format(precision))
    print("Recall = {0:.4f}".format(recall))
    print("F1 Score = {0:.4f}".format(f1_score))
    print("---------------------------------------------")

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy = 98.4960%
Precision = 0.7350
Recall = 0.9477
F1 Score = 0.8279
---------------------------------------------


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy = 99.3574%
Precision = 0.8303
Recall = 0.9445
F1 Score = 0.8837
---------------------------------------------


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy = 99.5417%
Precision = 0.9955
Recall = 0.9953
F1 Score = 0.9954
---------------------------------------------


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy = 99.3595%
Precision = 0.5000
Recall = 0.4968
F1 Score = 0.4984
---------------------------------------------


  y = column_or_1d(y, warn=True)


Accuracy = 99.4225%
Precision = 0.5000
Recall = 0.4971
F1 Score = 0.4986
---------------------------------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  _warn_prf(average, modifier, msg_start, len(result))
