In [None]:
# Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning packages
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

# import warnings
# warnings.filterwarnings("ignore")

# Turn on pandas' copy-on-write mode so chained/derived-frame assignments
# do not raise SettingWithCopyWarning.
pd.options.mode.copy_on_write = True

## Exploratory Data Analysis

In [None]:
# Load in the data
credit_card_data = pd.read_csv("./data/creditcard.csv")

# Inspect a random sample of rows; the fixed random_state keeps the preview
# identical across "Restart Kernel -> Run All" executions.
display(credit_card_data.sample(10, random_state=42))

print(f"Credit card dataset dimensions: {credit_card_data.shape}")

# Information about each variable
print("")

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() would render a stray "None" in the output.
credit_card_data.info()

# Check for missing values (count of NaNs per column)
display(credit_card_data.isna().sum())

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
271576,164638.0,2.098201,-0.109596,-1.814724,-0.221776,0.637626,-0.312991,0.202749,-0.137806,0.445808,...,-0.140211,-0.291202,0.136328,0.100753,0.103721,-0.092195,-0.059628,-0.072881,1.0,0
135285,81185.0,0.621321,-1.432427,0.584147,0.454856,-0.634604,2.025801,-0.889955,0.664764,-0.702656,...,0.064625,0.408042,-0.137224,-1.035378,0.085924,-0.134462,0.100312,0.048666,239.0,0
145120,86648.0,-1.597781,-0.080151,0.259743,-1.556867,1.420462,5.205158,1.718131,-0.748291,0.530779,...,0.263189,-0.608515,0.164685,0.683522,0.366033,-0.466016,-0.441905,-0.617693,346.1,0
89765,62713.0,1.258219,-1.402439,0.341214,-1.432218,-1.54773,-0.455403,-0.927502,-0.057314,-2.030018,...,-0.35593,-1.009853,0.058955,-0.069517,0.076163,-0.488614,0.006404,0.034758,133.0,0
118222,74999.0,1.266662,0.348043,0.274193,0.995065,-0.106423,-0.605768,0.125457,-0.178202,0.032818,...,-0.081897,-0.188772,-0.08666,-0.123207,0.62065,-0.408813,0.024135,0.021443,4.99,0
187759,127678.0,-4.090825,-10.557549,-6.462266,1.775752,-2.502521,-0.465696,5.149696,-1.4036,0.208279,...,2.19812,-0.963186,-3.030751,0.842049,-0.570463,0.704777,-0.779869,0.469297,3268.19,0
188855,128161.0,1.140012,-1.806578,-1.069601,0.199537,-0.214082,1.487659,-0.395228,0.376896,0.748607,...,0.506526,0.819046,-0.158409,-1.548256,-0.640477,0.721402,-0.060888,-0.007719,393.0,0
224382,143766.0,-2.001868,-1.047706,-1.435791,0.741947,2.21657,-1.675562,1.002258,0.192773,-1.317147,...,0.66169,0.847852,-0.272457,-0.444555,1.182714,-0.198864,-0.207095,-0.42636,158.61,0
12146,21154.0,1.210991,0.753194,0.127593,2.502477,0.43407,-0.297114,0.315559,-0.154075,0.101255,...,-0.14339,-0.256663,-0.137216,-0.065928,0.696261,0.034811,-0.075553,-0.0112,3.65,0
105721,69660.0,-1.126495,0.21584,1.389023,0.25347,0.616742,0.374974,1.048017,0.203144,-0.600183,...,0.119743,0.146524,0.052021,-0.310409,0.392876,-0.466371,0.044593,0.102784,141.0,0


Credit card dataset dimensions: (284807, 31)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V2

None

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

After loading in the dataset we observe the following:

- There are 284,807 observations and 31 variables.
- All variables are float types except for `Class`, which is an integer.
- There are no missing values