In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import networkx as nx
import torch
import torch.nn as nn
import torch.nn.functional as F

In [3]:
# Load the raw IBM credit-card transaction table from the local Dataset folder.
df = pd.read_csv(filepath_or_buffer='Dataset/credit_card_transactions-ibm_v2.csv')

In [4]:
# Print a structural summary of the frame: row range, column names, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24386900 entries, 0 to 24386899
Data columns (total 15 columns):
 #   Column          Dtype  
---  ------          -----  
 0   User            int64  
 1   Card            int64  
 2   Year            int64  
 3   Month           int64  
 4   Day             int64  
 5   Time            object 
 6   Amount          object 
 7   Use Chip        object 
 8   Merchant Name   int64  
 9   Merchant City   object 
 10  Merchant State  object 
 11  Zip             float64
 12  MCC             int64  
 13  Errors?         object 
 14  Is Fraud?       object 
dtypes: float64(1), int64(7), object(7)
memory usage: 2.7+ GB


In [8]:
# Show the frequency table of every column, one column at a time.
# Iterating a DataFrame directly yields its column labels.
for i in df:
    counts = df[i].value_counts()
    print(f"Value counts for column {i} : \n {counts}\n")

Value counts for column User : 
 User
486     82355
396     80749
332     70010
262     68089
1249    65644
        ...  
457        25
231        21
1367       20
1767       16
1817       15
Name: count, Length: 2000, dtype: int64

Value counts for column Card : 
 Card
0    8696411
1    6493597
2    4305594
3    2790785
4    1309120
5     563097
6     176729
7      46383
8       5184
Name: count, dtype: int64

Value counts for column Year : 
 Year
2019    1723938
2017    1723360
2018    1721615
2016    1708924
2015    1701371
2014    1672343
2013    1650917
2012    1610829
2011    1570551
2010    1491225
2009    1355434
2008    1223460
2007    1064483
2006     908793
2005     746653
2004     597003
2003     466408
2002     350732
2020     336500
2001     257998
2000     177729
1999     118250
1998      78345
1997      49753
1996      29945
1995      20928
1994      14316
1993       8378
1992       5134
1991       1585
Name: count, dtype: int64

Value counts for column Month : 
 Month


In [9]:
# Number of missing values in each column.
df.isna().sum()

User                     0
Card                     0
Year                     0
Month                    0
Day                      0
Time                     0
Amount                   0
Use Chip                 0
Merchant Name            0
Merchant City            0
Merchant State     2720821
Zip                2878135
MCC                      0
Errors?           23998469
Is Fraud?                0
dtype: int64

### Data Preprocessing

In [10]:
# Build a per-card key by joining the User and Card numbers as text, with no
# separator. The Card values listed in the value counts are single digits
# (0-8), so the joined string still maps to exactly one (User, Card) pair.
df['Card_id'] = df['User'].astype(str).str.cat(df['Card'].astype(str))


In [13]:
# Clean Amount: strip the currency sign and convert the column from string to float.
# The dtype guard keeps this cell re-runnable: once Amount is numeric the `.str`
# accessor is unavailable and a second execution would raise AttributeError.
if not pd.api.types.is_numeric_dtype(df['Amount']):
    # regex=False makes "$" a literal character on every pandas version; read as
    # a regular expression it is the end-of-string anchor and removes nothing.
    df['Amount'] = df['Amount'].str.replace("$", "", regex=False).astype(float)

In [15]:
# Split the Time text into two new columns by character position:
# characters 0-1 become Hour and characters 3-4 become Minutes.
# Both new columns remain strings; no numeric conversion happens here.
# NOTE(review): presumably Time is formatted "HH:MM" - confirm against the data.
df["Hour"] = df["Time"].str.slice(0, 2)
df["Minutes"] = df["Time"].str.slice(3, 5)

In [16]:
# A missing value in "Errors?" is treated as "nothing went wrong": keep the
# recorded error text where one exists and label every other row "No error".
df["Errors?"] = df["Errors?"].where(df["Errors?"].notna(), "No error")

In [17]:
# Remove columns that are no longer needed: User and Card are already folded
# into Card_id, and Merchant State / Zip contain missing values.
df = df.drop(columns=["User", "Card", "Merchant State", "Zip"])

In [18]:
# Encode the fraud label as an integer: 'Yes' -> 1, anything else -> 0.
# The dtype guard makes the cell safe to re-run: without it a second execution
# would compare the already-encoded integers with 'Yes' and silently reset
# every label to 0.
if not pd.api.types.is_numeric_dtype(df["Is Fraud?"]):
    # Vectorised comparison instead of a row-by-row Python lambda.
    df["Is Fraud?"] = df["Is Fraud?"].eq("Yes").astype("int64")

In [19]:
le = LabelEncoder()
df['Merchant City']=le.fit_transform(df['Merchant City'])
df['Use Chip'] = le.fit