In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import networkx as nx
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
df = pd.read_csv('Dataset/credit_card_transactions-ibm_v2.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24386900 entries, 0 to 24386899
Data columns (total 15 columns):
 #   Column          Dtype  
---  ------          -----  
 0   User            int64  
 1   Card            int64  
 2   Year            int64  
 3   Month           int64  
 4   Day             int64  
 5   Time            object 
 6   Amount          object 
 7   Use Chip        object 
 8   Merchant Name   int64  
 9   Merchant City   object 
 10  Merchant State  object 
 11  Zip             float64
 12  MCC             int64  
 13  Errors?         object 
 14  Is Fraud?       object 
dtypes: float64(1), int64(7), object(7)
memory usage: 2.7+ GB


In [4]:
for i in df.columns:
    counts = df[i].value_counts()
    
    print(f"Value counts for column {i} : \n {counts}\n")

Value counts for column User : 
 User
486     82355
396     80749
332     70010
262     68089
1249    65644
        ...  
457        25
231        21
1367       20
1767       16
1817       15
Name: count, Length: 2000, dtype: int64

Value counts for column Card : 
 Card
0    8696411
1    6493597
2    4305594
3    2790785
4    1309120
5     563097
6     176729
7      46383
8       5184
Name: count, dtype: int64

Value counts for column Year : 
 Year
2019    1723938
2017    1723360
2018    1721615
2016    1708924
2015    1701371
2014    1672343
2013    1650917
2012    1610829
2011    1570551
2010    1491225
2009    1355434
2008    1223460
2007    1064483
2006     908793
2005     746653
2004     597003
2003     466408
2002     350732
2020     336500
2001     257998
2000     177729
1999     118250
1998      78345
1997      49753
1996      29945
1995      20928
1994      14316
1993       8378
1992       5134
1991       1585
Name: count, dtype: int64

Value counts for column Month : 
 Month


In [5]:
df.isnull().sum()

User                     0
Card                     0
Year                     0
Month                    0
Day                      0
Time                     0
Amount                   0
Use Chip                 0
Merchant Name            0
Merchant City            0
Merchant State     2720821
Zip                2878135
MCC                      0
Errors?           23998469
Is Fraud?                0
dtype: int64

### Data Preprocessing

In [6]:
#assuming that each customer has only one credit card
df['Card_id'] = df['User'].astype(str) + "" + df['Card'].astype(str) 


In [7]:
#Clean Amount, remove the currecncy sign and convert the datatype from string to  float
df['Amount'] = df['Amount'].str.replace("$","").astype(float)

In [8]:
#Convert time to hour and minute because it can't be int

df["Hour"] = df['Time'].str[0:2]
df["Minutes"] = df["Time"].str[3:5]

In [9]:
#Fill missing errors with no error because no error is recorded 

df["Errors?"] = df["Errors?"].fillna("No error")

In [10]:
#Drop unwanted columns- Duplicate columns and columns with missing values

df = df.drop(["User","Card","Merchant State","Zip"], axis =1)

In [11]:
#Convert strings to numeric

df["Is Fraud?"] =df["Is Fraud?"].apply(lambda x:1 if x =='Yes' else 0)

In [12]:
le = LabelEncoder()
df['Merchant City']=le.fit_transform(df['Merchant City'])
df['Use Chip'] = le.fit_transform(df['Use Chip'])
df['Errors?'] = le.fit_transform(df['Errors?'])

In [13]:
df['Errors?'].unique()

array([22, 23, 20, 14, 15, 10, 16,  3,  0, 17, 21,  8,  4,  1,  5, 11, 12,
       13,  7,  2,  9, 18, 19,  6])

### Graph Preprocessing for Fraud Detection

In [14]:
# Create an empty Graph
#We are using multigraph because a user can buy from multiple merchants

G = nx.MultiGraph()

#Add nodes to the graph for each unique card_id, merchant_name
G.add_nodes_from(df['Card_id'].unique(), type="card_id")
G.add_nodes_from(df['Merchant Name'].unique(), type='merchant_name')

In [15]:
#add edges and properties to edge

for i, row in df.iterrows():
    year = row ["Year"],
    month = row["Month"],
    day = row["Day"],
    hour = row["Hour"],
    minute = row["Minutes"],
    amount = row["Amount"],
    use_chip = row["Use Chip"],
    merchant_city = row["Merchant City"],
    errors = row["Errors?"],
    mcc = row["MCC"]
    G.add_edge(row['Card_id'], row['Merchant Name'], year =year, month = month, day = day, hour=hour, minute = minute, amount=amount, use_chip=use_chip, merchant_city = merchant_city, errors=errors, mcc=mcc)
    

In [16]:
#Get the number of number of nodes and edges in the graph

number_nodes = G.number_of_nodes()
number_edges = G.number_of_edges()


print(f"Number of nodes: {number_nodes}")
print(f"Number of edges: {number_edges}")

Number of nodes: 106482
Number of edges: 24386900


In [17]:
#convert the graph to an adjancency matrix to see which nodes are connected to each other

adjacent_matrix = nx.adjacency_matrix(G).todense()

In [18]:
adjacent_matrix.shape

(106482, 106482)

In [19]:
#To retrieve some sample of nodes in the graph(G) and print their principles

#to get a small sample

sample_nodes = list(G.nodes())[:50]

#To retrieve the properties of the sample nodes
node_properties = nx.get_node_attributes(G, 'type')


#print the properties of the sample nodes
for i in sample_nodes:
    print(f" Node:{i}, properties: {node_properties[i]}")

 Node:00, properties: card_id
 Node:01, properties: card_id
 Node:02, properties: card_id
 Node:03, properties: card_id
 Node:04, properties: card_id
 Node:10, properties: card_id
 Node:11, properties: card_id
 Node:12, properties: card_id
 Node:13, properties: card_id
 Node:14, properties: card_id
 Node:20, properties: card_id
 Node:21, properties: card_id
 Node:22, properties: card_id
 Node:23, properties: card_id
 Node:24, properties: card_id
 Node:30, properties: card_id
 Node:31, properties: card_id
 Node:32, properties: card_id
 Node:33, properties: card_id
 Node:40, properties: card_id
 Node:50, properties: card_id
 Node:51, properties: card_id
 Node:52, properties: card_id
 Node:60, properties: card_id
 Node:61, properties: card_id
 Node:62, properties: card_id
 Node:70, properties: card_id
 Node:80, properties: card_id
 Node:81, properties: card_id
 Node:82, properties: card_id
 Node:83, properties: card_id
 Node:84, properties: card_id
 Node:90, properties: card_id
 Node:100,

In [20]:
ss = 10  # Setting the variable sample size to 10

for j, edge in enumerate(G.edges()):  # Loop through the edges of graph G
    print(G.get_edge_data(*edge))  # Print the data associated with the current edge
    if j >= ss - 1:  # Check if the loop index 'j' is greater than or equal to sample size - 1
        break  # If the condition is met, exit the loop


{0: {'year': (2002,), 'month': (9,), 'day': (1,), 'hour': ('06',), 'minute': ('21',), 'amount': (134.09,), 'use_chip': (2,), 'merchant_city': (6274,), 'errors': (22,), 'mcc': 5300}, 1: {'year': (2002,), 'month': (9,), 'day': (10,), 'hour': ('06',), 'minute': ('22',), 'amount': (102.18,), 'use_chip': (2,), 'merchant_city': (6274,), 'errors': (22,), 'mcc': 5300}, 2: {'year': (2002,), 'month': (9,), 'day': (16,), 'hour': ('06',), 'minute': ('00',), 'amount': (115.34,), 'use_chip': (2,), 'merchant_city': (6274,), 'errors': (22,), 'mcc': 5300}, 3: {'year': (2002,), 'month': (9,), 'day': (18,), 'hour': ('06',), 'minute': ('19',), 'amount': (128.85,), 'use_chip': (2,), 'merchant_city': (6274,), 'errors': (22,), 'mcc': 5300}, 4: {'year': (2002,), 'month': (9,), 'day': (23,), 'hour': ('06',), 'minute': ('01',), 'amount': (134.89,), 'use_chip': (2,), 'merchant_city': (6274,), 'errors': (22,), 'mcc': 5300}, 5: {'year': (2002,), 'month': (9,), 'day': (24,), 'hour': ('06',), 'minute': ('07',), 'amo

In [22]:
#To retrieve properties of a edges and print

#Retries the properties errors of all the edges
edge_properties = nx.get_edge_attributes(G, 'errors')

#count the number of edges by property value
edge_count_by_property = Counter(edge_properties.values())

#print the count of edges by property value

for property_value, count in edge_count_by_property.items():
    print(f"Property value: {property_value}, Count:{count}")

Property value: (22,), Count:23998469
Property value: (20,), Count:242783
Property value: (14,), Count:58918
Property value: (23,), Count:48157
Property value: (15,), Count:581
Property value: (10,), Count:10716
Property value: (16,), Count:128
Property value: (3,), Count:13321
Property value: (0,), Count:10740
Property value: (21,), Count:457
Property value: (17,), Count:2079
Property value: (8,), Count:122
Property value: (4,), Count:60
Property value: (1,), Count:89
Property value: (5,), Count:54
Property value: (11,), Count:47
Property value: (12,), Count:78
Property value: (13,), Count:32
Property value: (7,), Count:1
Property value: (2,), Count:21
Property value: (9,), Count:25
Property value: (18,), Count:13
Property value: (19,), Count:7
Property value: (6,), Count:2


In [23]:
#prepare the data for input into the model
edge_list = list(G.edges(data=True))

In [26]:
edge_list

[('00',
  3527213246127876953,
  {'year': (2002,),
   'month': (9,),
   'day': (1,),
   'hour': ('06',),
   'minute': ('21',),
   'amount': (134.09,),
   'use_chip': (2,),
   'merchant_city': (6274,),
   'errors': (22,),
   'mcc': 5300}),
 ('00',
  3527213246127876953,
  {'year': (2002,),
   'month': (9,),
   'day': (10,),
   'hour': ('06',),
   'minute': ('22',),
   'amount': (102.18,),
   'use_chip': (2,),
   'merchant_city': (6274,),
   'errors': (22,),
   'mcc': 5300}),
 ('00',
  3527213246127876953,
  {'year': (2002,),
   'month': (9,),
   'day': (16,),
   'hour': ('06',),
   'minute': ('00',),
   'amount': (115.34,),
   'use_chip': (2,),
   'merchant_city': (6274,),
   'errors': (22,),
   'mcc': 5300}),
 ('00',
  3527213246127876953,
  {'year': (2002,),
   'month': (9,),
   'day': (18,),
   'hour': ('06',),
   'minute': ('19',),
   'amount': (128.85,),
   'use_chip': (2,),
   'merchant_city': (6274,),
   'errors': (22,),
   'mcc': 5300}),
 ('00',
  3527213246127876953,
  {'year':

In [27]:
i = 0
list(edge_list[i][2].values())

[(2002,), (9,), (1,), ('06',), ('21',), (134.09,), (2,), (6274,), (22,), 5300]

### Data Splitting

In [29]:
#prepare the data for input into the model

# Get a list of edges along with their attributes from the graph G
edge_list = list(G.edges(data=True))

# Initialize an empty list to store processed edge attributes
x = []

# Iterate through each edge in the edge_list
for edge in edge_list:
    # Extract values of attributes for the current edge
    edge_values = list(edge[2].values())

    # Process each attribute value in edge_values
    edge_values = [
        float(i[0]) if type(i) == tuple and type(i[0]) == str else i[0] if type(i) == tuple else i 
        for i in edge_values
    ]

    # Append the processed values to the list x
    x.append(edge_values)

# Convert the list x to a PyTorch tensor with dtype float
x = torch.tensor(x, dtype=torch.float)

    

### Pytorch Neural Network - Model Training

### simple feedforward neural network



#### Network X

In [30]:
class FraudGNN(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(FraudGNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 1)
    def forward(self,x):
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        return x.squeeze(-1)

In [31]:
y = torch.tensor(df['Is Fraud?'].values, dtype=torch.float)