In [None]:
import json
import networkx as nx
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, MinMaxScaler
import torch
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv
from torch_geometric.data import Data
from torch_geometric.loader import NeighborLoader
from torch_geometric.utils import negative_sampling

In [None]:
# Read the shared JSON configuration that sits one directory above the notebook.
with open("../config.json") as config_file:
    config = json.load(config_file)

# DATASET and THRESHOLD are interpolated into the data file paths used below;
# THRESHOLD comes from the `if_model` section of the config.
DATASET, THRESHOLD = config["dataset"], config["if_model"]["threshold"]

In [None]:
# Load the preprocessed transaction table for the configured dataset.
transactions_location = f"../data/01-ibm-transactions-for-aml/preprocessed/{DATASET}-transactions"
transactions = pd.read_parquet(transactions_location)

In [None]:
# Collapse the transaction list into one row per directed (source, target) pair.
# Both named aggregations sum the same `amount` column, so `sent_amount` and
# `received_amount` carry the same per-pair total under two names.
data_aggregated_all = transactions.groupby(["source", "target"]).agg(
    sent_amount=pd.NamedAgg(column="amount", aggfunc="sum"),
    received_amount=pd.NamedAgg(column="amount", aggfunc="sum"),
).reset_index()

# Per-account totals: everything a source sent / everything a target received.
# These dictionaries are reused later for the node-level features.
mapping_source = data_aggregated_all.groupby("source")["sent_amount"].sum().to_dict()
mapping_target = data_aggregated_all.groupby("target")["received_amount"].sum().to_dict()

data_aggregated_all["total_sent_by_source"] = data_aggregated_all["source"].map(mapping_source)
data_aggregated_all["total_received_by_target"] = data_aggregated_all["target"].map(mapping_target)

# Edge weight = share of the source's total outflow carried by this edge
#             + share of the target's total inflow carried by this edge.
# Computed with vectorised column arithmetic instead of a row-wise `apply`:
# same values, no per-row Python call, and it also works on an empty frame.
data_aggregated_all["weight"] = (
    data_aggregated_all["sent_amount"] / data_aggregated_all["total_sent_by_source"]
    + data_aggregated_all["received_amount"] / data_aggregated_all["total_received_by_target"]
)

# Edge table used for graph construction and as GNN edge input.
edges = data_aggregated_all.loc[:, ["source", "target", "weight", "sent_amount"]]

In [None]:
# Transactions previously split into "normal" and "non normal" sets for this
# dataset/threshold combination.
normal_location = f"../data/01-ibm-transactions-for-aml/filtered_output/normal_{DATASET}_{THRESHOLD}.csv"
anomalous_location = f"../data/01-ibm-transactions-for-aml/filtered_output/non_normal_{DATASET}_{THRESHOLD}.csv"
normal, anomalous = pd.read_csv(normal_location), pd.read_csv(anomalous_location)

# Sets give constant-time membership checks on the transaction ids.
normal_ids, anomalous_ids = (
    set(frame["transaction_id"].values) for frame in (normal, anomalous)
)

In [None]:
# Node table: every distinct account that appears as a source or a target.
# np.unique both de-duplicates and sorts the combined ids.
endpoint_ids = np.concatenate(
    [transactions["source"].unique(), transactions["target"].unique()]
)
nodes = pd.DataFrame(np.unique(endpoint_ids), columns=["node"])

In [None]:
# Flag each transaction whose id is in the anomalous set (1) or not (0).
transactions["if_anomaly"] = np.where(transactions["transaction_id"].isin(anomalous_ids), 1, 0)

# Number of flagged transactions per account, as sender and as receiver.
suspicious_by_source = transactions.groupby("source")["if_anomaly"].sum().reset_index()
suspicious_by_target = transactions.groupby("target")["if_anomaly"].sum().reset_index()

# Number of transactions per account, as sender and as receiver.
total_by_source = transactions.groupby("source")["transaction_id"].count().reset_index()
total_by_target = transactions.groupby("target")["transaction_id"].count().reset_index()

# Share of flagged transactions per account. The numerator and denominator
# frames come from the same groupby key, so their rows line up one-to-one.
suspicious_by_source = pd.DataFrame({
    "node": suspicious_by_source["source"],
    "suspicious_by_source": suspicious_by_source["if_anomaly"] / total_by_source["transaction_id"],
})
suspicious_by_target = pd.DataFrame({
    "node": suspicious_by_target["target"],
    "suspicious_by_target": suspicious_by_target["if_anomaly"] / total_by_target["transaction_id"],
})

# Attach both shares to the node table; accounts that never appear in a given
# role get 0 for that role's share.
nodes = (
    nodes
    .merge(suspicious_by_source, on="node", how="left").fillna(0)
    .merge(suspicious_by_target, on="node", how="left").fillna(0)
)

In [None]:
# Total amount sent / received per account, looked up from the per-account
# totals; accounts missing from a mapping get 0.
nodes = nodes.assign(
    total_sent=nodes["node"].map(mapping_source).fillna(0),
    total_received=nodes["node"].map(mapping_target).fillna(0),
)

In [None]:
# Directed account graph; every column of `edges` other than source/target is
# stored as an edge attribute.
G = nx.from_pandas_edgelist(edges, source='source', target='target', edge_attr=True, create_using=nx.DiGraph())

# Structural metrics, each returned by networkx as a {node: value} mapping.
in_degree = dict(G.in_degree())
out_degree = dict(G.out_degree())
pagerank = nx.pagerank(G, weight='weight')
# Clustering is computed on the undirected view of the graph.
clustering_coefficient = nx.clustering(G.to_undirected(), weight='weight')
hits_hubs, hits_authorities = nx.hits(G, max_iter=100, tol=1.0e-8, nstart=None, normalized=True)

# Build the metric table from label-indexed Series only and take the node id
# from the resulting index. Mixing a positional `list(G.nodes())` column with
# label-aligned Series would silently misalign ids and metrics whenever the
# Series indexes are not in identical order (pandas then unions the indexes).
node_metrics = (
    pd.DataFrame({
        'in_degree': pd.Series(in_degree),
        'out_degree': pd.Series(out_degree),
        'pagerank': pd.Series(pagerank),
        'clust_coefficient': pd.Series(clustering_coefficient),
        'hits_hubs': pd.Series(hits_hubs),
        'hits_auth': pd.Series(hits_authorities),
    })
    .rename_axis('node')
    .reset_index()
)

nodes = pd.merge(nodes, node_metrics, on='node', how='left')

# Fraction of an account's total flow that is incoming / outgoing.
# A zero total flow yields NaN here (0 / 0).
total_flow = nodes['total_sent'] + nodes['total_received']
nodes['inflow_ratio'] = nodes['total_received'] / total_flow
nodes['outflow_ratio'] = nodes['total_sent'] / total_flow

In [None]:
# Mean transaction amount per account when acting as sender.
avg_sent_by_node = (
    transactions.groupby('source')['amount']
    .mean()
    .rename('avg_sent')
    .rename_axis('node')
    .reset_index()
)
nodes = nodes.merge(avg_sent_by_node, on='node', how='left')

# Mean transaction amount per account when acting as receiver.
avg_received_by_node = (
    transactions.groupby('target')['amount']
    .mean()
    .rename('avg_received')
    .rename_axis('node')
    .reset_index()
)
nodes = nodes.merge(avg_received_by_node, on='node', how='left')

In [None]:
# Number of individual transactions behind each (source, target) edge.
count = transactions.groupby(['source', 'target']).size().rename('count').reset_index()
edges = edges.merge(count, on=['source', 'target'], how='left')

In [None]:
# Replace every remaining missing value (left-join gaps, 0/0 ratios) with 0.
nodes, edges = nodes.fillna(0), edges.fillna(0)

In [None]:
# Columns that get log1p applied before scaling, and columns that are only
# min-max scaled.
node_columns_log_transform = ['total_sent', 'total_received', 'avg_sent', 'avg_received', 'in_degree', 'out_degree']
node_columns_min_max = ['pagerank', 'clust_coefficient', 'hits_hubs', 'hits_auth']

edge_columns_log_transform = ['sent_amount']
edge_columns_min_max = ['weight', 'count']

# One scaler object is reused; each fit_transform call refits it from scratch.
scaler = MinMaxScaler()

# Nodes: log1p on the selected columns, then min-max scale both groups together.
node_scaled_columns = node_columns_log_transform + node_columns_min_max
nodes[node_columns_log_transform] = np.log1p(nodes[node_columns_log_transform])
nodes[node_scaled_columns] = scaler.fit_transform(nodes[node_scaled_columns])

# Edges: same two-step treatment.
edge_scaled_columns = edge_columns_log_transform + edge_columns_min_max
edges[edge_columns_log_transform] = np.log1p(edges[edge_columns_log_transform])
edges[edge_scaled_columns] = scaler.fit_transform(edges[edge_scaled_columns])

In [None]:
# Persist the node and edge feature tables, then read them back so the rest of
# the notebook works from the files on disk.
nodes_location = f"../data/01-ibm-transactions-for-aml/gnn_account/{DATASET}_nodes.csv"
edges_location = f"../data/01-ibm-transactions-for-aml/gnn_account/{DATASET}_edges.csv"

for frame, location in ((nodes, nodes_location), (edges, edges_location)):
    frame.to_csv(location, index=False)

nodes, edges = pd.read_csv(nodes_location), pd.read_csv(edges_location)

In [None]:
# Map account identifiers to contiguous integer ids, fitted on the node table
# and then applied to both edge endpoints.
node_encoder = LabelEncoder()
nodes['node_id'] = node_encoder.fit_transform(nodes['node'])
for endpoint in ('source', 'target'):
    edges[f'{endpoint}_id'] = node_encoder.transform(edges[endpoint])

In [None]:
# `edge_index` addresses nodes by their encoded `node_id`, so row i of the
# feature matrix must belong to node_id i. Do not rely on the incoming row
# order of `nodes` (the CSV round trip can change the dtype of `node`, and
# with it the encoder's sort order): order the rows explicitly and verify the
# ids are exactly 0..N-1.
nodes_by_id = nodes.sort_values('node_id')
assert np.array_equal(
    nodes_by_id['node_id'].to_numpy(), np.arange(len(nodes_by_id))
), "node_id must be a unique, contiguous 0..N-1 range"

# Every column except the raw identifier and the encoded id is a node feature.
node_features = nodes_by_id.drop(columns=['node', 'node_id']).to_numpy(dtype=np.float32)
node_features = torch.tensor(node_features, dtype=torch.float)

# Shape (2, num_edges): first row holds source ids, second row target ids.
edge_index_np = np.stack(
    [edges['source_id'].to_numpy(), edges['target_id'].to_numpy()]
).astype(np.int64)
edge_index = torch.tensor(edge_index_np, dtype=torch.long)

# One scalar attribute per edge (the scaled `weight` column), shape (num_edges, 1).
edge_attr = torch.tensor(edges['weight'].values, dtype=torch.float).unsqueeze(1)

data = Data(x=node_features, edge_index=edge_index, edge_attr=edge_attr)

In [None]:
class GraphSAGE(torch.nn.Module):
    """Two-layer SAGEConv encoder that returns L2-normalised node embeddings."""

    def __init__(self, in_channels, hidden_channels, out_channels):
        """Create both convolution layers and re-initialise their parameters.

        Args:
            in_channels: size of each input node feature vector.
            hidden_channels: output size of the first convolution.
            out_channels: size of the final embedding.
        """
        super().__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)
        self.reset_parameters()

    def reset_parameters(self):
        """Reset the parameters of both convolution layers, first layer first."""
        for layer in (self.conv1, self.conv2):
            layer.reset_parameters()

    def forward(self, x, edge_index):
        """Encode nodes into embeddings.

        Applies conv1 -> ReLU -> L2 normalisation -> conv2 -> L2 normalisation,
        with both normalisations taken along the last dimension.

        Args:
            x: node feature tensor passed to the first convolution.
            edge_index: edge index tensor passed to both convolutions.

        Returns:
            The normalised output of the second convolution.
        """
        hidden = F.normalize(F.relu(self.conv1(x, edge_index)), p=2, dim=-1)
        return F.normalize(self.conv2(hidden, edge_index), p=2, dim=-1)


# Encoder: input width taken from the graph's feature count, 64 hidden units,
# 16-dimensional embeddings.
model = GraphSAGE(
    in_channels=data.num_features,
    hidden_channels=64,
    out_channels=16,
)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Mini-batches of 256 seed nodes, sampling up to 25 neighbours at the first hop
# and up to 10 at the second.
loader = NeighborLoader(
    data,
    num_neighbors=[25, 10],
    batch_size=256,
)

# Small constant added inside the logarithms of the loss.
epsilon = 1e-15

def unsupervised_loss(z, pos_edge_index, neg_edge_index):
    """Link-prediction loss on embedding dot products.

    Observed edges are pushed towards a high sigmoid score and sampled
    non-edges towards a low one:

        loss = mean(-log sigmoid(s_pos)) + mean(-log(1 - sigmoid(s_neg)))

    where ``s`` is the dot product of the two endpoint embeddings.

    The log-sigmoid terms are evaluated with ``F.logsigmoid`` rather than
    ``log(sigmoid(x) + eps)``: the latter saturates in float32, which clamps the
    loss and zeroes its gradient for large-magnitude scores. An empty edge set
    contributes 0 instead of the NaN that ``mean()`` of an empty tensor yields.

    Args:
        z: node embedding tensor indexed along dim 0 by the edge indices.
        pos_edge_index: index tensor whose rows 0 and 1 hold the endpoints of
            the positive edges.
        neg_edge_index: same layout, for the negative (sampled) edges.

    Returns:
        Scalar tensor: positive-edge loss plus negative-edge loss.
    """
    def _mean_or_zero(values):
        # Keep the zero attached to `z` so backward() still works when a batch
        # contains no edges at all.
        return values.mean() if values.numel() > 0 else z.sum() * 0.0

    pos_scores = (z[pos_edge_index[0]] * z[pos_edge_index[1]]).sum(dim=-1)
    neg_scores = (z[neg_edge_index[0]] * z[neg_edge_index[1]]).sum(dim=-1)

    pos_loss = _mean_or_zero(-F.logsigmoid(pos_scores))
    # log(1 - sigmoid(x)) == logsigmoid(-x)
    neg_loss = _mean_or_zero(-F.logsigmoid(-neg_scores))
    return pos_loss + neg_loss


# Train for 50 epochs over the neighbour-sampled mini-batches.
model.train()
for epoch in range(50):
    running_loss = 0
    for batch in loader:
        optimizer.zero_grad()

        # Embeddings for every node of the sampled subgraph.
        batch_z = model(batch.x, batch.edge_index)

        # Positives: the edges of the sampled subgraph.
        # Negatives: the same number of sampled node pairs from that subgraph.
        positives = batch.edge_index
        negatives = negative_sampling(
            edge_index=batch.edge_index,
            num_nodes=batch.num_nodes,
            num_neg_samples=positives.size(1),
        )

        batch_loss = unsupervised_loss(batch_z, positives, negatives)
        batch_loss.backward()
        # Cap the global gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        running_loss += batch_loss.item()
    print(f'Epoch {epoch+1}, Loss: {running_loss/len(loader)}')

In [None]:
# Full-graph inference pass with gradients disabled.
model.eval()
with torch.no_grad():
    embeddings = model(data.x, data.edge_index)

embeddings = embeddings.detach().cpu().numpy()
num_embedded, embedding_dim = embeddings.shape

# Row i of the embedding matrix belongs to encoded id i; translate the ids back
# to the original account identifiers for the index.
original_node_ids = node_encoder.inverse_transform(np.arange(num_embedded))
embedding_columns = [f'emb_{i}' for i in range(embedding_dim)]
embeddings_df = pd.DataFrame(embeddings, index=original_node_ids, columns=embedding_columns)

embeddings_location = f"../data/01-ibm-transactions-for-aml/gnn_account/{DATASET}_account_embbedings.csv"
embeddings_df.to_csv(embeddings_location, index=True)
print("embeddings saved")