# Fraud Detection with Graph databases and Machine Learning

## Importing the required Python libraries

In [23]:
import numpy as np
import pandas as pd

## Loading and exploring the banksim dataset 

In [24]:
# Load the BankSim transactions; the path is relative to the notebook's working directory.
banksim_df = pd.read_csv(filepath_or_buffer="../data/bs140513_032310.csv")
# Show the first five rows as a quick sanity check of the parsed columns.
banksim_df.head(n=5)

Unnamed: 0,step,customer,age,gender,zipcodeOri,merchant,zipMerchant,category,amount,fraud
0,0,'C1093826151','4','M','28007','M348934600','28007','es_transportation',4.55,0
1,0,'C352968107','2','M','28007','M348934600','28007','es_transportation',39.68,0
2,0,'C2054744914','4','F','28007','M1823072687','28007','es_transportation',26.89,0
3,0,'C1760612790','3','M','28007','M348934600','28007','es_transportation',17.25,0
4,0,'C757503768','5','M','28007','M348934600','28007','es_transportation',35.72,0


In [25]:
# Summarize the frame: per-column non-null counts and dtypes, plus memory usage
banksim_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 594643 entries, 0 to 594642
Data columns (total 10 columns):
step           594643 non-null int64
customer       594643 non-null object
age            594643 non-null object
gender         594643 non-null object
zipcodeOri     594643 non-null object
merchant       594643 non-null object
zipMerchant    594643 non-null object
category       594643 non-null object
amount         594643 non-null float64
fraud          594643 non-null int64
dtypes: float64(1), int64(2), object(7)
memory usage: 45.4+ MB


View the class balance of the target variable: the number of fraudulent versus genuine transactions.

In [26]:
# Count how many transactions fall into each value of the `fraud` column
banksim_df['fraud'].value_counts()

0    587443
1      7200
Name: fraud, dtype: int64

In [27]:
# Number of unique values per column in the banksim dataset.
# A column with exactly one unique value is constant across all rows.
banksim_df.nunique()

step             180
customer        4112
age                8
gender             4
zipcodeOri         1
merchant          50
zipMerchant        1
category          15
amount         23767
fraud              2
dtype: int64

## Preprocessing the data 

In [28]:
# Count the null values in each column: isna() flags each null cell,
# and sum() totals those flags per column
banksim_df.isna().sum()

step           0
customer       0
age            0
gender         0
zipcodeOri     0
merchant       0
zipMerchant    0
category       0
amount         0
fraud          0
dtype: int64

No column contains null values. Note that this check only detects NaN/None; missing information encoded as a placeholder string would not be counted.

In [29]:
# Keep the `fraud` column as the class labels.
# This must run before the next cell, which drops `fraud` from banksim_df.
label = banksim_df['fraud']

In [30]:
# Drop the columns that are not used as features:
#   - zipcodeOri, zipMerchant: the same value in every row, so they are redundant
#   - fraud: the class label, already stored separately in `label`
#   - step: NOTE(review): dropped without a stated reason -- presumably a
#     time-step index; confirm it is not needed downstream
#
# banksim_df is reassigned here, so re-running this cell would otherwise raise
# KeyError (the columns are already gone). errors="ignore" keeps it re-runnable.
banksim_df = banksim_df.drop(
    ['step', 'zipcodeOri', 'zipMerchant', 'fraud'],
    axis=1,
    errors="ignore",
)

In [31]:
# Preview the frame again to confirm which columns remain after the drop
banksim_df.head()

Unnamed: 0,customer,age,gender,merchant,category,amount
0,'C1093826151','4','M','M348934600','es_transportation',4.55
1,'C352968107','2','M','M348934600','es_transportation',39.68
2,'C2054744914','4','F','M1823072687','es_transportation',26.89
3,'C1760612790','3','M','M348934600','es_transportation',17.25
4,'C757503768','5','M','M348934600','es_transportation',35.72
