In [9]:
import pandas as pd
import numpy as np
import networkx as nx
from node2vec import Node2Vec
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
import kagglehub

In [10]:
# load the transaction data
def load_transaction_data(filepath):
    df = pd.read_csv(filepath)
    return df

In [14]:
 # create the graphs for the transaction

def create_transaction_graph(df):
  G = nx.DiGraph() # initiallized empty directed graph


  # node labels --> bankName_AccountNumber  
  df['source_node'] = df['From Bank'].astype(str) + '_' + df['Account'].astype(str)
  df['target_node'] = df['To Bank'].astype(str) + '_' + df['Account.1'].astype(str)

  # iterate through the csv dataset
  for idx, row in df.iterrows():
    source = row['source_node']
    target = row['target_node']

  if G.has_edge(source, target):
    G[source][target]['weight'] += 1
    G[source][target]['total_amount'] += row['amount_paid']
  else:
    G.add_edge(source, target, 
                weight=1, # can mark it as overlapping transactions
                total_amount=row['Amount Received'],
                currency=row['Receiving Currency'],
                payment_format=row['Payment Format'])
      
  return G, df

In [19]:
'''
dimensions = size of vector representation for each node
walk_length = how long each random walk will be 
num_walks = how many walks to start from each node
p = return probability --> how likely is the walk going back to the previous node
q = in-out probability --> how likely is the walk exploring inward / outward.
workers = CPU-based (number of threads)
quiet -> false will show progress output.
'''
def generate_embeddings(G, dimensions=64, walk_length=30, num_walks=200, 
                                 p=1, q=1, workers=4):

    node2vec = Node2Vec(
        G,
        dimensions=dimensions,
        walk_length=walk_length,
        num_walks=num_walks,
        p=p,
        q=q,
        workers=workers,
        quiet=False
    )

    model = node2vec.fit(window=10, min_count=1, batch_words=4)
    return model
    

In [17]:
path = kagglehub.dataset_download("ealtman2019/ibm-transactions-for-anti-money-laundering-aml")
print(path)
csv_file = path + "/HI-Small_Trans.csv"

/home/koichi/.cache/kagglehub/datasets/ealtman2019/ibm-transactions-for-anti-money-laundering-aml/versions/8


In [18]:
# load csv
df = load_transaction_data(csv_file)
print("transaction loaded.")
# create graph
G, df = create_transaction_graph(df)
print("graph created.")

done


In [20]:
print("training model....")
training_model = generate_embeddings(G, dimensions=64)
print("training model done..")

training model....


Computing transition probabilities: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 3374.34it/s]


training model done..


Generating walks (CPU: 1): 100%|██████████| 50/50 [00:00<00:00, 193464.21it/s]
Generating walks (CPU: 2): 100%|██████████| 50/50 [00:00<00:00, 186082.70it/s]
Generating walks (CPU: 3): 100%|██████████| 50/50 [00:00<00:00, 159237.05it/s]
Generating walks (CPU: 4): 100%|██████████| 50/50 [00:00<00:00, 204201.75it/s]
