# Credit Card Fraud Detection

The dataset contains credit card transactions that occurred over two days: Tuesday, October 13 and Wednesday, October 14, 2020.

## Part 1 - Read and Pre-process Data

In [1]:
# Import Dependencies
import pandas as pd
import numpy as numpy
import matplotlib.pyplot as plt
import scipy.stats as st

In [2]:
# Load the raw transaction records from the CSV file and preview the first five rows
csv_path = "Resources/CreditCardData.csv"
data_df = pd.read_csv(csv_path)
data_df.head(5)

Unnamed: 0,Transaction ID,Date,Day of Week,Time,Type of Card,Entry Mode,Amount,Type of Transaction,Merchant Group,Country of Transaction,Shipping Address,Country of Residence,Gender,Age,Bank,Fraud
0,#3577 209,14-Oct-20,Wednesday,19,Visa,Tap,£5,POS,Entertainment,United Kingdom,United Kingdom,United Kingdom,M,25.2,RBS,0
1,#3039 221,14-Oct-20,Wednesday,17,MasterCard,PIN,£288,POS,Services,USA,USA,USA,F,49.6,Lloyds,0
2,#2694 780,14-Oct-20,Wednesday,14,Visa,Tap,£5,POS,Restaurant,India,India,India,F,42.2,Barclays,0
3,#2640 960,13-Oct-20,Tuesday,14,Visa,Tap,£28,POS,Entertainment,United Kingdom,India,United Kingdom,F,51.0,Barclays,0
4,#2771 031,13-Oct-20,Tuesday,23,Visa,CVC,£91,Online,Electronics,USA,USA,United Kingdom,M,38.0,Halifax,1


In [3]:
# Check number of rows and columns; shape is a (rows, columns) tuple
data_df.shape

(100000, 16)

In [4]:
# List the column labels of the loaded DataFrame
data_df.columns

Index(['Transaction ID', 'Date', 'Day of Week', 'Time', 'Type of Card',
       'Entry Mode', 'Amount', 'Type of Transaction', 'Merchant Group',
       'Country of Transaction', 'Shipping Address', 'Country of Residence',
       'Gender', 'Age', 'Bank', 'Fraud'],
      dtype='object')

In [5]:
# Remove the Date and Day of Week columns, then preview the remaining columns
data_df = data_df.drop(columns=["Date", "Day of Week"])
data_df.head(2)

Unnamed: 0,Transaction ID,Time,Type of Card,Entry Mode,Amount,Type of Transaction,Merchant Group,Country of Transaction,Shipping Address,Country of Residence,Gender,Age,Bank,Fraud
0,#3577 209,19,Visa,Tap,£5,POS,Entertainment,United Kingdom,United Kingdom,United Kingdom,M,25.2,RBS,0
1,#3039 221,17,MasterCard,PIN,£288,POS,Services,USA,USA,USA,F,49.6,Lloyds,0


In [6]:
# Identify incomplete rows: count() reports the number of non-null entries per
# column, so any column below the total row count contains missing values
data_df.count()

Transaction ID            100000
Time                      100000
Type of Card              100000
Entry Mode                100000
Amount                     99994
Type of Transaction       100000
Merchant Group             99990
Country of Transaction    100000
Shipping Address           99995
Country of Residence      100000
Gender                     99996
Age                       100000
Bank                      100000
Fraud                     100000
dtype: int64

In [7]:
# Only a few rows are incomplete, so discard every row that has at least one missing value
data_df = data_df.dropna(axis="index", how="any")

In [8]:
# Verify dropped rows: every column should now report the same non-null count
data_df.count()

Transaction ID            99977
Time                      99977
Type of Card              99977
Entry Mode                99977
Amount                    99977
Type of Transaction       99977
Merchant Group            99977
Country of Transaction    99977
Shipping Address          99977
Country of Residence      99977
Gender                    99977
Age                       99977
Bank                      99977
Fraud                     99977
dtype: int64

In [9]:
# Check the data type of each column to find columns that need conversion
data_df.dtypes

Transaction ID             object
Time                        int64
Type of Card               object
Entry Mode                 object
Amount                     object
Type of Transaction        object
Merchant Group             object
Country of Transaction     object
Shipping Address           object
Country of Residence       object
Gender                     object
Age                       float64
Bank                       object
Fraud                       int64
dtype: object

In [10]:
# Convert Amount from currency strings such as "£288" to float64.
# - astype(str) keeps the cell safe to re-run: once Amount is already numeric,
#   the .str accessor would otherwise raise AttributeError.
# - regex=False treats "£" as a literal string, independent of the default
#   value of `regex` in the installed pandas version.
data_df["Amount"] = (
    data_df["Amount"]
    .astype(str)
    .str.replace("£", "", regex=False)
    .astype("float64")
)
data_df.head(2)

Unnamed: 0,Transaction ID,Time,Type of Card,Entry Mode,Amount,Type of Transaction,Merchant Group,Country of Transaction,Shipping Address,Country of Residence,Gender,Age,Bank,Fraud
0,#3577 209,19,Visa,Tap,5.0,POS,Entertainment,United Kingdom,United Kingdom,United Kingdom,M,25.2,RBS,0
1,#3039 221,17,MasterCard,PIN,288.0,POS,Services,USA,USA,USA,F,49.6,Lloyds,0


In [11]:
# Verify that data types have been converted (Amount should now be float64)
data_df.dtypes

Transaction ID             object
Time                        int64
Type of Card               object
Entry Mode                 object
Amount                    float64
Type of Transaction        object
Merchant Group             object
Country of Transaction     object
Shipping Address           object
Country of Residence       object
Gender                     object
Age                       float64
Bank                       object
Fraud                       int64
dtype: object

In [12]:
# Reset index so row labels are consecutive again after rows were dropped;
# drop=True discards the old index instead of keeping it as a new column
data_df = data_df.reset_index(drop=True)
data_df.head(2)

Unnamed: 0,Transaction ID,Time,Type of Card,Entry Mode,Amount,Type of Transaction,Merchant Group,Country of Transaction,Shipping Address,Country of Residence,Gender,Age,Bank,Fraud
0,#3577 209,19,Visa,Tap,5.0,POS,Entertainment,United Kingdom,United Kingdom,United Kingdom,M,25.2,RBS,0
1,#3039 221,17,MasterCard,PIN,288.0,POS,Services,USA,USA,USA,F,49.6,Lloyds,0


In [13]:
# Display a statistical overview of the numerical columns
# (for a mixed-type frame, describe() summarizes only the numeric columns by default)
# NOTE(review): Time spans 0 to 24 in the output, i.e. 25 distinct hour values --
# confirm whether 24 is a duplicate encoding of hour 0
data_df.describe()

Unnamed: 0,Time,Amount,Age,Fraud
count,99977.0,99977.0,99977.0,99977.0
mean,14.5631,112.579933,44.993595,0.071937
std,5.308202,123.435613,9.948121,0.258384
min,0.0,5.0,15.0,0.0
25%,10.0,17.0,38.2,0.0
50%,15.0,30.0,44.9,0.0
75%,19.0,208.0,51.7,0.0
max,24.0,400.0,86.1,1.0


In [14]:
# Put the currency into the Amount column label and preview the last two rows
amount_label = {"Amount": "Amount (in £)"}
data_df = data_df.rename(columns=amount_label)
data_df.tail(2)

Unnamed: 0,Transaction ID,Time,Type of Card,Entry Mode,Amount (in £),Type of Transaction,Merchant Group,Country of Transaction,Shipping Address,Country of Residence,Gender,Age,Bank,Fraud
99975,#3107 092,22,Visa,Tap,25.0,POS,Products,United Kingdom,United Kingdom,United Kingdom,M,48.2,Barclays,0
99976,#3400 711,16,Visa,PIN,226.0,POS,Restaurant,United Kingdom,United Kingdom,United Kingdom,M,31.7,Monzo,0


In [None]:
# Strip the '#' prefix from each transaction ID, e.g. "#3577 209" -> "3577 209".
# regex=False makes '#' a literal match regardless of the pandas version's default.
# The result is written back to the existing "Transaction ID" column.
# Amount needs no conversion here: it was already cast to float64 and renamed
# to "Amount (in £)" in earlier cells.
data_df["Transaction ID"] = data_df["Transaction ID"].str.replace("#", "", regex=False)
data_df.head(2)