In [None]:
import json
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

import torch
import torch.nn as nn
from torch_geometric.data import Data
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv, BatchNorm
from tqdm import tqdm

In [None]:
# Read the shared run configuration: it supplies the dataset name and the
# threshold stored under the "if_model" section.
with open("../config.json") as config_file:
    config = json.load(config_file)

THRESHOLD = config["if_model"]["threshold"]
DATASET = config["dataset"]

# Input locations, all parameterised by the dataset name.
embeddings_location = f"../data/01-ibm-transactions-for-aml/gnn_account/{DATASET}_account_embbedings.csv"
nodes_location = f"../data/01-ibm-transactions-for-aml/temporal_graph/{DATASET}_nodes"
edges_location = f"../data/01-ibm-transactions-for-aml/temporal_graph/{DATASET}_edges"

In [None]:
# Temporal graph: one parquet dataset for the nodes and one for the edges.
nodes_pd, edges_pd = (pd.read_parquet(path) for path in (nodes_location, edges_location))
print(f"#nodes={len(nodes_pd)}\n#edges={len(edges_pd)}")

# Embeddings CSV; its first column becomes the DataFrame index.
embeddings = pd.read_csv(embeddings_location, index_col=0)
print("embeddings shape: ", embeddings.shape)

In [None]:
# Expose the embeddings index as a regular column so it can be used as a merge key.
embeddings = embeddings.assign(emb_id=embeddings.index)

In [None]:
# Attach the embeddings twice: first matched on the `source` column, then on
# `target`. Both are left joins, and the helper key `emb_id` is dropped after each.
for endpoint_column in ('source', 'target'):
    nodes_pd = nodes_pd.merge(
        embeddings, left_on=endpoint_column, right_on='emb_id', how='left'
    ).drop(columns='emb_id')

In [None]:
# Edges: discard the two date columns and rename the endpoints to source/target.
edges_pd = edges_pd.drop(columns=["src_date", "dst_date"]).rename(
    columns={"src": "source", "dst": "target"}
)

# Nodes: the join keys and the date column are no longer needed as features.
nodes_pd = nodes_pd.drop(columns=['source', 'target', 'transaction_date'])
timestamps = nodes_pd['transaction_timestamp'].astype(int)

# Shift the timestamps so that the earliest one becomes 0.
min_timestamp = timestamps.min()
nodes_pd['transaction_timestamp'] = timestamps - min_timestamp

In [None]:
# Load the preprocessed transactions and keep only the id plus the three
# categorical columns that are encoded further down.
transactions = pd.read_parquet(f"../data/01-ibm-transactions-for-aml/preprocessed/{DATASET}-transactions")

relevant_columns = ['transaction_id', 'source_currency', 'target_currency', 'format']
# .copy() makes the subset an independent frame, so column assignments made on it
# later do not raise SettingWithCopyWarning or depend on the full frame.
transactions = transactions[relevant_columns].copy()

In [None]:
# Frequency-encode each categorical column: every value is replaced by the
# fraction of rows in `transactions` that carry that value.
for column in ('source_currency', 'target_currency', 'format'):
    freq_enc = transactions[column].value_counts() / len(transactions)
    transactions.loc[:, column] = transactions[column].map(freq_enc)

In [None]:
# Bring the encoded transaction columns onto the nodes (left join on the node id);
# the duplicate key column from the right-hand side is dropped afterwards.
nodes_pd = nodes_pd.merge(
    transactions, left_on='id', right_on='transaction_id', how='left'
).drop(columns='transaction_id')

# Cast the encoded columns to explicit float dtypes.
encoded_dtypes = {
    'source_currency': np.float32,
    'target_currency': np.float32,
    'format': np.float64,
}
for column, dtype in encoded_dtypes.items():
    nodes_pd[column] = nodes_pd[column].astype(dtype)

In [None]:
# Fit the encoder on the node ids, then reuse the fitted encoder for both edge
# endpoints so that nodes and edges share one integer id space.
node_encoder = LabelEncoder().fit(nodes_pd['id'])
nodes_pd['node_id'] = node_encoder.transform(nodes_pd['id'])
for endpoint in ('source', 'target'):
    edges_pd[f'{endpoint}_id'] = node_encoder.transform(edges_pd[endpoint])

In [None]:
# Min-max scale the amount and the (already shifted) timestamp, each with its
# own scaler instance so both fitted scalers stay available afterwards.
scaler_amount = MinMaxScaler()
scaler_timestamp = MinMaxScaler()
for column, column_scaler in (
    ("amount", scaler_amount),
    ("transaction_timestamp", scaler_timestamp),
):
    nodes_pd[column] = column_scaler.fit_transform(nodes_pd[[column]])

In [None]:
def _edge_index_from_ids(edges, id_to_index):
    """Translate the source/target ids of `edges` into a (2, num_edges) array of row positions."""
    pairs = [
        (id_to_index[src], id_to_index[dst])
        for src, dst in zip(edges['source'].values, edges['target'].values)
    ]
    # reshape(-1, 2) keeps the array two-dimensional when `edges` is empty, so the
    # transpose is always (2, num_edges); int64 matches the torch.long tensor built from it.
    return np.array(pairs, dtype=np.int64).reshape(-1, 2).T


def _select_subgraph(nodes, edges, wanted_ids):
    """Return the nodes whose id is in `wanted_ids`, their id -> row mapping,
    the edges with both endpoints among those nodes, and the positional edge index."""
    sub_nodes = nodes[nodes["id"].isin(wanted_ids)]
    id_to_index = {node_id: idx for idx, node_id in enumerate(sub_nodes['id'].values)}
    kept_ids = sub_nodes["id"]
    sub_edges = edges[edges["source"].isin(kept_ids) & edges["target"].isin(kept_ids)]
    return sub_nodes, id_to_index, sub_edges, _edge_index_from_ids(sub_edges, id_to_index)


def _edge_weights(edges):
    """Edge weights as a float tensor with one column."""
    return torch.tensor(edges['weight'].values, dtype=torch.float).unsqueeze(1)


# Transaction ids of the two filtered sets produced for this dataset/threshold.
normal = pd.read_csv(f"../data/01-ibm-transactions-for-aml/filtered_output/normal_{DATASET}_{THRESHOLD}.csv")
anomalous = pd.read_csv(f"../data/01-ibm-transactions-for-aml/filtered_output/non_normal_{DATASET}_{THRESHOLD}.csv")
normal_ids = list(set(normal["transaction_id"].values))
anomalous_ids = list(set(anomalous["transaction_id"].values))

# Split the graph into the sub-graph of "normal" nodes and the one of "non-normal" nodes.
normal_nodes, normal_node_id_to_index, normal_edges, normal_edge_index_np = _select_subgraph(
    nodes_pd, edges_pd, normal_ids
)
anomalous_nodes, anomalous_node_id_to_index, anomalous_edges, anomalous_edge_index_np = _select_subgraph(
    nodes_pd, edges_pd, anomalous_ids
)

normal_edge_index = torch.tensor(normal_edge_index_np, dtype=torch.long)
anomalous_edge_index = torch.tensor(anomalous_edge_index_np, dtype=torch.long)

normal_edge_attr = _edge_weights(normal_edges)
anomalous_edge_attr = _edge_weights(anomalous_edges)

# Every remaining column except the two identifiers is used as a node feature.
normal_node_features = normal_nodes.drop(columns=['id', 'node_id']).values
anomalous_node_features = anomalous_nodes.drop(columns=['id', 'node_id']).values

# The scaler is fitted on the normal nodes only and then applied to the anomalous ones.
scaler = MinMaxScaler()
normal_node_features = scaler.fit_transform(normal_node_features)
anomalous_node_features = scaler.transform(anomalous_node_features)

normal_node_features = torch.tensor(normal_node_features, dtype=torch.float)
anomalous_node_features = torch.tensor(anomalous_node_features, dtype=torch.float)

# Training graph = normal sub-graph, test graph = non-normal sub-graph.
train_data = Data(x=normal_node_features, edge_index=normal_edge_index, edge_attr=normal_edge_attr)
test_data = Data(x=anomalous_node_features, edge_index=anomalous_edge_index, edge_attr=anomalous_edge_attr)

In [None]:
class GraphSAGEAutoencoder(torch.nn.Module):
    """Autoencoder with a stack of SAGEConv layers as encoder and a linear decoder.

    The encoder always contains a first layer (in -> hidden) and a last layer
    (hidden -> out); `num_layers - 2` additional hidden -> hidden layers are
    inserted between them. Every layer except the last is followed by
    BatchNorm1d, ReLU and dropout.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, num_layers=2, dropout=0.5):
        """Build the encoder layers, their batch norms and the linear decoder.

        Args:
            in_channels: number of input features per node.
            hidden_channels: width of the intermediate representations.
            out_channels: width of the embedding returned by the encoder.
            num_layers: values above 2 add `num_layers - 2` hidden layers.
            dropout: dropout probability applied after each non-final layer.
        """
        super().__init__()

        # (input width, output width) of every encoder layer that is followed by a batch norm.
        normed_layer_dims = [(in_channels, hidden_channels)]
        normed_layer_dims += [(hidden_channels, hidden_channels)] * (num_layers - 2)

        conv_layers = [SAGEConv(width_in, width_out) for width_in, width_out in normed_layer_dims]
        conv_layers.append(SAGEConv(hidden_channels, out_channels))

        self.encoder = torch.nn.ModuleList(conv_layers)
        self.bns = torch.nn.ModuleList(
            [torch.nn.BatchNorm1d(width_out) for _, width_out in normed_layer_dims]
        )
        self.dropout = dropout

        self.decoder = torch.nn.Linear(out_channels, in_channels)

    def forward(self, data):
        """Encode `data.x` over `data.edge_index` and decode it back.

        Returns:
            Tuple `(reconstructed, embeddings)`: the decoder output and the
            output of the last encoder layer.
        """
        hidden = data.x
        graph_edges = data.edge_index

        # There is exactly one batch norm per non-final encoder layer.
        for layer_pos, norm in enumerate(self.bns):
            hidden = self.encoder[layer_pos](hidden, graph_edges)
            hidden = F.relu(norm(hidden))
            hidden = F.dropout(hidden, p=self.dropout, training=self.training)

        embeddings = self.encoder[-1](hidden, graph_edges)
        return self.decoder(embeddings), embeddings


# Seed PyTorch before any weights are created so that the weight initialisation
# and the dropout masks used during training are repeatable across runs.
SEED = 42
torch.manual_seed(SEED)

# Two-layer encoder (input -> 32 -> 16) with a linear decoder back to the input width.
model = GraphSAGEAutoencoder(
    in_channels=train_data.num_node_features, 
    hidden_channels=32, 
    out_channels=16, 
    num_layers=2, 
    dropout=0.2
)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.01, weight_decay=1e-4)

def reconstruction_loss(reconstructed, original):
    """Mean squared error between the reconstructed and the original tensor."""
    return F.mse_loss(input=reconstructed, target=original, reduction="mean")

def train_autoencoder(data, model, num_epochs=200):
    """Train `model` to reconstruct the node features of `data`.

    Each epoch runs one forward pass over `data`, back-propagates the
    reconstruction loss against `data.x`, clips the gradient norm to 1.0 and
    takes one optimizer step. The loss is printed every 10 epochs.

    Note: this function uses the module-level `optimizer` (it is not a
    parameter) and wraps it in a StepLR schedule that halves the learning
    rate every 50 epochs.

    Args:
        data: graph object passed unchanged to `model`; must expose `x`.
        model: module returning `(reconstructed, embeddings)` when called on `data`.
        num_epochs: number of training epochs.

    Returns:
        Tuple `(model, embeddings)`. `embeddings` is the second output of the
        last training forward pass (computed before that epoch's optimizer
        step), or None when no epoch was run.
    """
    model.train()
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.5)

    # Bound up front so the return statement is valid even when num_epochs <= 0.
    embeddings = None

    for epoch in tqdm(range(num_epochs), desc="Training Epochs"):
        optimizer.zero_grad()

        reconstructed, embeddings = model(data)
        loss = reconstruction_loss(reconstructed, data.x)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        if epoch % 10 == 0:
            print(f'Epoch {epoch} -> Loss: {loss.item()}')
        scheduler.step()

    return model, embeddings


# The returned embeddings get their own name: `embeddings` already holds the
# DataFrame that earlier cells merge into the nodes, and rebinding it to a
# tensor would break those cells when they are re-run.
model, train_embeddings = train_autoencoder(train_data, model, num_epochs=40)

In [None]:
# Run the trained model on the test graph in eval mode, without gradients.
model.eval()
with torch.no_grad():
    reconstructed, test_embeddings = model(test_data)

# Per-node reconstruction error: squared difference averaged over the feature axis.
squared_diff = (reconstructed - test_data.x).pow(2)
reconstruction_errors = squared_diff.mean(dim=1).cpu().numpy()

# Node positions ordered from the largest to the smallest error.
ranked_indices = np.argsort(reconstruction_errors)[::-1]

# Keep the `top_k` highest-error nodes and look up their ids by position.
top_k = 5000
top_k_anomalous_nodes = ranked_indices[:top_k]
top_k_anomalous_node_ids = anomalous_nodes['id'].iloc[top_k_anomalous_nodes]

In [None]:
# Reload the full transactions table and collect the ids whose
# `is_laundering` flag equals 1.
transactions = pd.read_parquet(f"../data/01-ibm-transactions-for-aml/preprocessed/{DATASET}-transactions")
is_laundering = transactions['is_laundering'] == 1
real_laundering_ids = transactions.loc[is_laundering, 'transaction_id'].values

In [None]:
# Compare the top-k ranked transaction ids with the known laundering ids.
flagged_ids = set(top_k_anomalous_node_ids)
true_positives = len(flagged_ids & set(real_laundering_ids))
# Count false positives against the number of ids actually flagged: the ranking
# holds fewer than `top_k` entries when the test graph has fewer nodes, and
# subtracting from `top_k` would then overstate the false positives.
false_positives = len(flagged_ids) - true_positives

print(f"True Positives: {true_positives}")
print(f"False Positives: {false_positives}")

In [None]:
# Flag every test node whose reconstruction error lies above the 90th percentile.
threshold = np.percentile(reconstruction_errors, 90)
above_threshold = reconstruction_errors > threshold
anomalous_transactions = anomalous_nodes.loc[above_threshold]
anomalous_transactions_ids = anomalous_transactions['id'].values

print(f"Threshold: {threshold}")
print(f"Anomalous Transactions: {len(anomalous_transactions_ids)}")

In [None]:
# Number of thresholded transactions that are known laundering transactions.
flagged_by_threshold = set(anomalous_transactions_ids)
known_laundering = set(real_laundering_ids)
print("TP:", len(flagged_by_threshold & known_laundering))