# Fraud Detection with Graph databases and Machine Learning

## Importing the required Python libraries

In [102]:
import numpy as np
import pandas as pd
from py2neo import Graph
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import KFold, train_test_split, StratifiedKFold
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from imblearn.over_sampling import SMOTE 
from collections import Counter
from sklearn.decomposition import PCA

## Loading and exploring the banksim dataset 

In [2]:
banksim_df = pd.read_csv("../data/bs140513_032310.csv")
banksim_df.head()

Unnamed: 0,step,customer,age,gender,zipcodeOri,merchant,zipMerchant,category,amount,fraud
0,0,'C1093826151','4','M','28007','M348934600','28007','es_transportation',4.55,0
1,0,'C352968107','2','M','28007','M348934600','28007','es_transportation',39.68,0
2,0,'C2054744914','4','F','28007','M1823072687','28007','es_transportation',26.89,0
3,0,'C1760612790','3','M','28007','M348934600','28007','es_transportation',17.25,0
4,0,'C757503768','5','M','28007','M348934600','28007','es_transportation',35.72,0


In [3]:
banksim_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 594643 entries, 0 to 594642
Data columns (total 10 columns):
step           594643 non-null int64
customer       594643 non-null object
age            594643 non-null object
gender         594643 non-null object
zipcodeOri     594643 non-null object
merchant       594643 non-null object
zipMerchant    594643 non-null object
category       594643 non-null object
amount         594643 non-null float64
fraud          594643 non-null int64
dtypes: float64(1), int64(2), object(7)
memory usage: 45.4+ MB


Viewing the split of the output classes - fraudulent and genuine transactions

In [4]:
banksim_df['fraud'].value_counts()

0    587443
1      7200
Name: fraud, dtype: int64

In [5]:
# Number of unique values per column in the banksim dataset
banksim_df.nunique()

step             180
customer        4112
age                8
gender             4
zipcodeOri         1
merchant          50
zipMerchant        1
category          15
amount         23767
fraud              2
dtype: int64

## Preprocessing the data 

In [6]:
# Obtaining the number of null values in each column
banksim_df.isna().sum()

step           0
customer       0
age            0
gender         0
zipcodeOri     0
merchant       0
zipMerchant    0
category       0
amount         0
fraud          0
dtype: int64

Looks like there are no null values in the dataset.

In [7]:
# Retrieving the class attribute from the dataframe
Y_before_smote = banksim_df['fraud']
Y_before_smote.head()

0    0
1    0
2    0
3    0
4    0
Name: fraud, dtype: int64

In [8]:
'''
Removing unwanted columns
Since zipcodeOri and zipMerchant have the same value for all the rows, these columns are redundant
'''

feature_df = banksim_df.drop(['step', 'customer', 'zipcodeOri', 'zipMerchant', 'fraud'], axis=1)




In [9]:
feature_df.head()

Unnamed: 0,age,gender,merchant,category,amount
0,'4','M','M348934600','es_transportation',4.55
1,'2','M','M348934600','es_transportation',39.68
2,'4','F','M1823072687','es_transportation',26.89
3,'3','M','M348934600','es_transportation',17.25
4,'5','M','M348934600','es_transportation',35.72


In [11]:
# Converting the encoded merchant values back to the original string values
#X_after_smote_df['merchant'] = label_encoder.inverse_transform(X_after_smote_df['merchant'])

# One hot encoding the categorical variables
#feature_df = pd.get_dummies(X_after_smote_df, columns=['age', 'gender', 'category', 'merchant'])
feature_df = pd.get_dummies(feature_df, columns=['age', 'gender', 'category', 'merchant'])
#feature_df = X_after_smote_df
feature_df.head()

Unnamed: 0,amount,age_'0',age_'1',age_'2',age_'3',age_'4',age_'5',age_'6',age_'U',gender_'E',...,merchant_'M732195782',merchant_'M78078399',merchant_'M840466850',merchant_'M855959430',merchant_'M857378720',merchant_'M85975013',merchant_'M923029380',merchant_'M933210764',merchant_'M97925176',merchant_'M980657600'
0,4.55,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,39.68,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,26.89,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,17.25,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,35.72,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Standardizing the features
standard_scaler = StandardScaler()
scaled_df = pd.DataFrame(standard_scaler.fit_transform(feature_df), columns = feature_df.columns)

scaled_df.head()

Unnamed: 0,amount,age_'0',age_'1',age_'2',age_'3',age_'4',age_'5',age_'6',age_'U',gender_'E',...,merchant_'M732195782',merchant_'M78078399',merchant_'M840466850',merchant_'M855959430',merchant_'M857378720',merchant_'M85975013',merchant_'M923029380',merchant_'M933210764',merchant_'M97925176',merchant_'M980657600'
0,-0.299276,-0.064347,-0.329165,-0.678119,-0.57339,2.110495,-0.343144,-0.217136,-0.044553,-0.044553,...,-0.031992,-0.052072,-0.048562,-0.10179,-0.014325,-0.214919,-0.023313,-0.010773,-0.031754,-0.054624
1,0.016067,-0.064347,-0.329165,1.474668,-0.57339,-0.473822,-0.343144,-0.217136,-0.044553,-0.044553,...,-0.031992,-0.052072,-0.048562,-0.10179,-0.014325,-0.214919,-0.023313,-0.010773,-0.031754,-0.054624
2,-0.098742,-0.064347,-0.329165,-0.678119,-0.57339,2.110495,-0.343144,-0.217136,-0.044553,-0.044553,...,-0.031992,-0.052072,-0.048562,-0.10179,-0.014325,-0.214919,-0.023313,-0.010773,-0.031754,-0.054624
3,-0.185275,-0.064347,-0.329165,-0.678119,1.744015,-0.473822,-0.343144,-0.217136,-0.044553,-0.044553,...,-0.031992,-0.052072,-0.048562,-0.10179,-0.014325,-0.214919,-0.023313,-0.010773,-0.031754,-0.054624
4,-0.01948,-0.064347,-0.329165,-0.678119,-0.57339,-0.473822,2.914227,-0.217136,-0.044553,-0.044553,...,-0.031992,-0.052072,-0.048562,-0.10179,-0.014325,-0.214919,-0.023313,-0.010773,-0.031754,-0.054624


In [13]:
# Performing dimensionality reduction using PCA

# Limiting the number of components such that 95% of the variance is explained
pca = PCA(0.95, svd_solver='full')
scaled_df = pca.fit_transform(scaled_df)

scaled_df.shape


(594643, 55)

## Training supervised learning models using intrinsic features from the dataset

In [56]:
k_fold = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)

random_forest = RandomForestClassifier(max_depth=20, n_estimators=150)
svm = SVC(gamma="auto")
logistic_regression = LogisticRegression(solver='lbfgs', max_iter=5000)
#scaled_df = scaled_df.values
#labels = Y_after_smote
labels = Y_before_smote

In [29]:
# Logistic Regression Classifier

for train_index, test_index in k_fold.split(scaled_df, labels):

    X_train, X_test = scaled_df[train_index], scaled_df[test_index]
    y_train, y_test = labels[train_index], labels[test_index]

    
    clf = logistic_regression.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    
    print(classification_report(y_test, predictions))
    

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    117489
           1       0.77      0.74      0.76      1440

    accuracy                           0.99    118929
   macro avg       0.89      0.87      0.88    118929
weighted avg       0.99      0.99      0.99    118929

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    117489
           1       0.86      0.74      0.80      1440

    accuracy                           1.00    118929
   macro avg       0.93      0.87      0.90    118929
weighted avg       1.00      1.00      1.00    118929

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    117489
           1       0.89      0.75      0.82      1440

    accuracy                           1.00    118929
   macro avg       0.95      0.87      0.91    118929
weighted avg       1.00      1.00      1.00    118929

              preci

In [30]:
# Testing the logistic regression classifier after performing oversampling on the training data using SMOTE 

X_train, X_test, y_train, y_test = train_test_split(scaled_df, labels, test_size=0.20)#, random_state=42, stratify='array-like')


# Handling the imbalance in the dataset using SMOTE

# Encoding the categorical variables because SMOTE cannot work with string values
'''label_encoder = LabelEncoder()
feature_df['gender'] = label_encoder.fit_transform(feature_df['gender'])
feature_df['age'] = label_encoder.fit_transform(feature_df['age'])
feature_df['category'] = label_encoder.fit_transform(feature_df['category'])
feature_df['merchant'] = label_encoder.fit_transform(feature_df['merchant'])'''

print('Original dataset shape %s' % Counter(Y_before_smote))

#X_before_smote = X_train
sm = SMOTE()#random_state=1)

# Applying smote to dataset, result is nparray
X_after_smote, Y_after_smote = sm.fit_resample(X_train, y_train)
                                            
                                               
print('dataset shape after smote %s' % Counter(Y_after_smote))

clf = logistic_regression.fit(X_after_smote, Y_after_smote)

predictions = clf.predict(X_test)
print(classification_report(y_test, predictions))

Original dataset shape Counter({0: 587443, 1: 7200})
dataset shape after smote Counter({0: 469968, 1: 469968})
              precision    recall  f1-score   support

           0       1.00      0.97      0.99    117475
           1       0.30      0.98      0.45      1454

    accuracy                           0.97    118929
   macro avg       0.65      0.97      0.72    118929
weighted avg       0.99      0.97      0.98    118929



In [34]:
# Random Forest Classifier

for train_index, test_index in k_fold.split(scaled_df, labels):

    X_train, X_test = scaled_df[train_index], scaled_df[test_index]
    y_train, y_test = labels[train_index], labels[test_index]

    clf = random_forest.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    
    print(classification_report(y_test, predictions))
    

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    117489
           1       0.79      0.74      0.76      1440

    accuracy                           0.99    118929
   macro avg       0.89      0.87      0.88    118929
weighted avg       0.99      0.99      0.99    118929

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    117489
           1       0.86      0.75      0.80      1440

    accuracy                           1.00    118929
   macro avg       0.93      0.88      0.90    118929
weighted avg       1.00      1.00      1.00    118929

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    117489
           1       0.90      0.75      0.82      1440

    accuracy                           1.00    118929
   macro avg       0.95      0.87      0.91    118929
weighted avg       1.00      1.00      1.00    118929

              preci

In [35]:

# Testing the random forest classifier after performing oversampling on the training data using SMOTE 

X_train, X_test, y_train, y_test = train_test_split(scaled_df, labels, test_size=0.20)#, random_state=42, stratify='array-like')


# Handling the imbalance in the dataset using SMOTE

# Encoding the categorical variables because SMOTE cannot work with string values
'''label_encoder = LabelEncoder()
feature_df['gender'] = label_encoder.fit_transform(feature_df['gender'])
feature_df['age'] = label_encoder.fit_transform(feature_df['age'])
feature_df['category'] = label_encoder.fit_transform(feature_df['category'])
feature_df['merchant'] = label_encoder.fit_transform(feature_df['merchant'])'''

print('Original dataset shape %s' % Counter(Y_before_smote))

#X_before_smote = X_train
sm = SMOTE()#random_state=1)

# Applying smote to dataset, result is nparray
X_after_smote, Y_after_smote = sm.fit_resample(X_train, y_train)
                                            
                                               
print('dataset shape after smote %s' % Counter(Y_after_smote))

clf = random_forest.fit(X_after_smote, Y_after_smote)

predictions = clf.predict(X_test)
print(classification_report(y_test, predictions))

Original dataset shape Counter({0: 587443, 1: 7200})
dataset shape after smote Counter({0: 469991, 1: 469991})
              precision    recall  f1-score   support

           0       1.00      0.98      0.99    117452
           1       0.34      0.97      0.50      1477

    accuracy                           0.98    118929
   macro avg       0.67      0.97      0.74    118929
weighted avg       0.99      0.98      0.98    118929



In [36]:
# SVM Classifier
    
X_train, X_test, y_train, y_test = train_test_split(scaled_df, labels, test_size=0.20)

clf = svm.fit(X_train, y_train)
predictions = clf.predict(X_test)
    
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    117498
           1       0.87      0.69      0.77      1431

    accuracy                           0.99    118929
   macro avg       0.93      0.84      0.88    118929
weighted avg       0.99      0.99      0.99    118929



## Extracting network features

In [88]:
# Connecting to the Neo4j database
graph = Graph(password="root")

# Query to fetch the network features from Neo4j
query = """
MATCH (p:Placeholder)
RETURN p.id AS id, p.degree AS degree, p.pagerank as pagerank, p.community AS community 
"""

data = graph.run(query)

records = {}

for record in data:
    records[record['id']] = {'degree': record['degree'], 'pagerank': record['pagerank'], 'community': record['community']}


In [89]:
banksim_df = pd.read_csv("../data/bs140513_032310.csv")


In [90]:
def load_degree(record):
    return records[record.split("'")[1]]['degree']
def load_community(record):
    return str(records[record.split("'")[1]]['community'])
def load_pagerank(record):
    return records[record.split("'")[1]]['pagerank']

In [91]:
banksim_df['merchant_degree'] = banksim_df['merchant'].apply(load_degree)
banksim_df['customer_degree'] = banksim_df['customer'].apply(load_degree)
banksim_df['merchant_pagerank'] = banksim_df['merchant'].apply(load_pagerank)
banksim_df['customer_pagerank'] = banksim_df['customer'].apply(load_pagerank)
banksim_df['merchant_community'] = banksim_df['merchant'].apply(load_community)
banksim_df['customer_community'] = banksim_df['customer'].apply(load_community)

In [92]:
banksim_df.head()

Unnamed: 0,step,customer,age,gender,zipcodeOri,merchant,zipMerchant,category,amount,fraud,merchant_degree,customer_degree,merchant_pagerank,customer_pagerank,merchant_community,customer_community
0,0,'C1093826151','4','M','28007','M348934600','28007','es_transportation',4.55,0,3929,7,46.541354,0.15,598811,598811
1,0,'C352968107','2','M','28007','M348934600','28007','es_transportation',39.68,0,3929,12,46.541354,0.15,598811,598846
2,0,'C2054744914','4','F','28007','M1823072687','28007','es_transportation',26.89,0,3573,14,41.391096,0.15,598846,598811
3,0,'C1760612790','3','M','28007','M348934600','28007','es_transportation',17.25,0,3929,11,46.541354,0.15,598811,598846
4,0,'C757503768','5','M','28007','M348934600','28007','es_transportation',35.72,0,3929,14,46.541354,0.15,598811,598811


In [93]:
banksim_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 594643 entries, 0 to 594642
Data columns (total 16 columns):
step                  594643 non-null int64
customer              594643 non-null object
age                   594643 non-null object
gender                594643 non-null object
zipcodeOri            594643 non-null object
merchant              594643 non-null object
zipMerchant           594643 non-null object
category              594643 non-null object
amount                594643 non-null float64
fraud                 594643 non-null int64
merchant_degree       594643 non-null int64
customer_degree       594643 non-null int64
merchant_pagerank     594643 non-null float64
customer_pagerank     594643 non-null float64
merchant_community    594643 non-null object
customer_community    594643 non-null object
dtypes: float64(3), int64(4), object(9)
memory usage: 72.6+ MB


In [94]:
labels = banksim_df['fraud']

# Dropping the unnecessary columns
feature_df = banksim_df.drop(['step', 'age', 'gender', 'customer', 'zipcodeOri', 'zipMerchant', 'fraud'], axis=1)

# One hot encoding the categorical variables
feature_df = pd.get_dummies(feature_df, columns=['category', 'merchant', 'merchant_community', 'customer_community'])


In [95]:
# Standardizing the features
standard_scaler = StandardScaler()
scaled_df = pd.DataFrame(standard_scaler.fit_transform(feature_df), columns = feature_df.columns)

scaled_df.head()

Unnamed: 0,amount,merchant_degree,customer_degree,merchant_pagerank,customer_pagerank,category_'es_barsandrestaurants',category_'es_contents',category_'es_fashion',category_'es_food',category_'es_health',...,customer_community_598811,customer_community_598812,customer_community_598817,customer_community_598822,customer_community_598824,customer_community_598827,customer_community_598835,customer_community_598844,customer_community_598846,customer_community_598850
0,-0.299276,0.627143,-1.451389,0.686704,-1.0,-0.104084,-0.038607,-0.10475,-0.214919,-0.166994,...,1.410835,-0.027733,-0.010694,-0.010045,-0.004676,-0.008304,-0.016609,-0.059061,-1.393912,-0.005187
1,0.016067,0.627143,0.203358,0.686704,-1.0,-0.104084,-0.038607,-0.10475,-0.214919,-0.166994,...,-0.7088,-0.027733,-0.010694,-0.010045,-0.004676,-0.008304,-0.016609,-0.059061,0.717405,-0.005187
2,-0.098742,0.102754,0.865257,0.060981,-1.0,-0.104084,-0.038607,-0.10475,-0.214919,-0.166994,...,1.410835,-0.027733,-0.010694,-0.010045,-0.004676,-0.008304,-0.016609,-0.059061,-1.393912,-0.005187
3,-0.185275,0.627143,-0.127592,0.686704,-1.0,-0.104084,-0.038607,-0.10475,-0.214919,-0.166994,...,-0.7088,-0.027733,-0.010694,-0.010045,-0.004676,-0.008304,-0.016609,-0.059061,0.717405,-0.005187
4,-0.01948,0.627143,0.865257,0.686704,-1.0,-0.104084,-0.038607,-0.10475,-0.214919,-0.166994,...,1.410835,-0.027733,-0.010694,-0.010045,-0.004676,-0.008304,-0.016609,-0.059061,-1.393912,-0.005187


In [96]:
scaled_df = scaled_df.values
labels = labels.values

## Training supervised learning models using intrinsic features as well as graph based features

In [99]:
k_fold = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)

random_forest = RandomForestClassifier(max_depth=20, n_estimators=150)
svm = SVC(gamma="auto")
logistic_regression = LogisticRegression(solver='lbfgs', max_iter=5000)


In [97]:
# Logistic Regression Classifier

for train_index, test_index in k_fold.split(scaled_df, labels):

    X_train, X_test = scaled_df[train_index], scaled_df[test_index]
    y_train, y_test = labels[train_index], labels[test_index]

    clf = logistic_regression.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    
    print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    117489
           1       0.81      0.79      0.80      1440

    accuracy                           1.00    118929
   macro avg       0.90      0.89      0.90    118929
weighted avg       1.00      1.00      1.00    118929

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    117489
           1       0.87      0.80      0.84      1440

    accuracy                           1.00    118929
   macro avg       0.94      0.90      0.92    118929
weighted avg       1.00      1.00      1.00    118929

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    117489
           1       0.90      0.79      0.84      1440

    accuracy                           1.00    118929
   macro avg       0.95      0.89      0.92    118929
weighted avg       1.00      1.00      1.00    118929

              preci

In [100]:
# Random Forest Classifier
    
X_train, X_test, y_train, y_test = train_test_split(scaled_df, labels, test_size=0.20)

clf = random_forest.fit(X_train, y_train)
predictions = clf.predict(X_test)
    
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    117432
           1       0.90      0.80      0.85      1497

    accuracy                           1.00    118929
   macro avg       0.95      0.90      0.92    118929
weighted avg       1.00      1.00      1.00    118929



In [101]:
# SVM Classifier
    
X_train, X_test, y_train, y_test = train_test_split(scaled_df, labels, test_size=0.20)

clf = svm.fit(X_train, y_train)
predictions = clf.predict(X_test)
    
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    117452
           1       0.90      0.77      0.83      1477

    accuracy                           1.00    118929
   macro avg       0.95      0.88      0.91    118929
weighted avg       1.00      1.00      1.00    118929

