In [2]:
import json
import pandas as pd
import numpy as np
import os
import logging
import sys
import random
import torch
import itertools
from torch_geometric.data import Data
from torch_geometric.typing import OptTensor
from torch_geometric.loader import LinkNeighborLoader

In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [4]:
class Arg:
  def __init__(self):
    self.seed = 1
    self.n_epochs = 100
    self.batch_size = 512
    self.num_neighs = [100, 100]
    self.tqdm = True
    self.data = "Small_HI"
    self.model = "pna"
    self.testing = False
    self.save_model = True
    self.unique_name = 'pna1'
    self.finetune = False
    self.inference = False
    self.ports = False
    self.tds = False
    self.ego = False
    self.reverse_mp = False
    self.emlps = False

args = Arg()

In [5]:
with open('data_config.json', 'r') as config_file:
    data_config = json.load(config_file)

In [6]:
data_config

{'paths': {'aml_data': '/data',
  'model_to_load': '/models',
  'model_to_save': '/models'}}

In [7]:
def logger_setup():
    # Setup logging
    log_directory = "logs"
    if not os.path.exists(log_directory):
        os.makedirs(log_directory)
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)-5.5s] %(message)s",
        handlers=[
            logging.FileHandler(os.path.join(log_directory, "logs.log")),     ## log to local log file
            logging.StreamHandler(sys.stdout)          ## log also to stdout (i.e., print to screen)
        ]
    )

In [8]:
def set_seed(seed: int = 0) -> None:
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed
    os.environ["PYTHONHASHSEED"] = str(seed)
    logging.info(f"Random seed set as {seed}")

In [9]:
transaction_file = f"./{data_config['paths']['aml_data']}/{args.data}/formatted_transactions.csv" #replace this with your path to the respective AML data objects
df_edges = pd.read_csv(transaction_file)
df_edges.head()

Unnamed: 0,EdgeID,from_id,to_id,Timestamp,Amount Sent,Sent Currency,Amount Received,Received Currency,Payment Format,Is Laundering
0,316720,256803,256803,10,47.64,13,47.64,13,0,0
1,261688,212164,212164,10,3917.42,10,3917.42,10,0,0
2,261696,212165,208177,10,97.49,10,97.49,10,2,0
3,126680,102447,102448,10,13939.05,2,13939.05,2,5,0
4,126564,102351,102351,10,10.37,2,10.37,2,0,0


In [50]:
df_edges.columns

Index(['EdgeID', 'from_id', 'to_id', 'Timestamp', 'Amount Sent',
       'Sent Currency', 'Amount Received', 'Received Currency',
       'Payment Format', 'Is Laundering'],
      dtype='object')

## Start timestamp to 0

In [51]:
df_edges['Timestamp'] = df_edges['Timestamp'] - df_edges['Timestamp'].min()
df_edges.head()

Unnamed: 0,EdgeID,from_id,to_id,Timestamp,Amount Sent,Sent Currency,Amount Received,Received Currency,Payment Format,Is Laundering
0,316720,256803,256803,0,47.64,13,47.64,13,0,0
1,261688,212164,212164,0,3917.42,10,3917.42,10,0,0
2,261696,212165,208177,0,97.49,10,97.49,10,2,0
3,126680,102447,102448,0,13939.05,2,13939.05,2,5,0
4,126564,102351,102351,0,10.37,2,10.37,2,0,0


In [52]:
df_edges[df_edges["from_id"] == 2]

Unnamed: 0,EdgeID,from_id,to_id,Timestamp,Amount Sent,Sent Currency,Amount Received,Received Currency,Payment Format,Is Laundering
981220,982879,2,353098,72060,18.69,0,18.69,0,3,0
994070,982878,2,353098,73440,77.4,0,77.4,0,1,0
1050672,1047784,2,2,79560,64.81,0,64.81,0,0,0
1830377,1825725,2,353098,168180,18.69,0,18.69,0,3,0
1834037,1825724,2,353098,168600,77.4,0,77.4,0,1,0
2354474,2349866,2,353098,355800,77.4,0,77.4,0,1,0
2355474,2349867,2,353098,355980,18.69,0,18.69,0,3,0
2728433,2728718,2,4,424920,829.87,0,829.87,0,3,0
2834084,2832122,2,353098,441720,77.4,0,77.4,0,1,0
2835105,2832123,2,353098,441900,18.69,0,18.69,0,3,0


## Total accounts

In [53]:
max_n_id = df_edges.loc[:, ['from_id', 'to_id']].to_numpy().max() + 1
max_n_id

515088

In [54]:
df_nodes = pd.DataFrame({'NodeID': np.arange(max_n_id), 'Feature': np.ones(max_n_id)})
df_nodes.head()

Unnamed: 0,NodeID,Feature
0,0,1.0
1,1,1.0
2,2,1.0
3,3,1.0
4,4,1.0


In [78]:
timestamps = torch.Tensor(df_edges["Timestamp"])
timestamps

tensor([      0.,       0.,       0.,  ..., 1509480., 1515480., 1527480.])

In [60]:
y = torch.LongTensor(df_edges["Is Laundering"].to_numpy())
y

tensor([0, 0, 0,  ..., 1, 1, 1])

### Illicit transactions, total transactions

In [84]:
torch.sum(y).item()

5177

In [83]:
total_transactions = y.shape[0]
total_transactions

5078345

### Illicit ratio

In [64]:
f"{torch.sum(y).item() / y.shape[0] * 100:.2f} %"

'0.10 %'

In [65]:
edge_features = ['Timestamp', 'Amount Received', 'Received Currency', 'Payment Format']
node_features = ['Feature']

In [66]:
x = torch.tensor(df_nodes.loc[:, node_features].to_numpy())
x

tensor([[1.],
        [1.],
        [1.],
        ...,
        [1.],
        [1.],
        [1.]], dtype=torch.float64)

### edge from (node1, node2)

In [91]:
edge_index = torch.LongTensor(df_edges.loc[:, ['from_id', 'to_id']].to_numpy().T)
edge_index

tensor([[256803, 212164, 212165,  ...,  71717,  71717,  71717],
        [256803, 212164, 208177,  ..., 131619, 273443,  49001]])

### Edge features

In [92]:
edge_attr = torch.tensor(df_edges.loc[:, edge_features].to_numpy()).float()
edge_attr

tensor([[0.0000e+00, 4.7640e+01, 1.3000e+01, 0.0000e+00],
        [0.0000e+00, 3.9174e+03, 1.0000e+01, 0.0000e+00],
        [0.0000e+00, 9.7490e+01, 1.0000e+01, 2.0000e+00],
        ...,
        [1.5095e+06, 1.7853e+03, 2.0000e+00, 3.0000e+00],
        [1.5155e+06, 2.1545e+03, 0.0000e+00, 3.0000e+00],
        [1.5275e+06, 5.4485e+03, 0.0000e+00, 3.0000e+00]])

In [93]:
n_days = int(timestamps.max() / (3600 * 24) + 1)
n_days

18

In [96]:
daily_irs, weighted_daily_irs, daily_inds, daily_trans = [], [], [], [] #irs = illicit ratios, inds = indices, trans = transactions
for day in range(n_days):
    l = day * 24 * 3600
    r = (day + 1) * 24 * 3600
    day_inds = torch.where((timestamps >= l) & (timestamps < r))[0]
    daily_irs.append(y[day_inds].float().mean())
    weighted_daily_irs.append(y[day_inds].float().mean() * day_inds.shape[0] / total_transactions)
    daily_inds.append(day_inds)
    daily_trans.append(day_inds.shape[0])

split_per = [0.6, 0.2, 0.2]
daily_totals = np.array(daily_trans)
d_ts = daily_totals
I = list(range(len(d_ts)))
split_scores = dict()
for i,j in itertools.combinations(I, 2):
    if j >= i:
        split_totals = [d_ts[:i].sum(), d_ts[i:j].sum(), d_ts[j:].sum()]
        split_totals_sum = np.sum(split_totals)
        split_props = [v/split_totals_sum for v in split_totals]
        split_error = [abs(v-t)/t for v,t in zip(split_props, split_per)]
        score = max(split_error) #- (split_totals_sum/total) + 1
        split_scores[(i,j)] = score

i,j = min(split_scores, key=split_scores.get)
split = [list(range(i)), list(range(i, j)), list(range(j, len(daily_totals)))]
split

[[0, 1, 2, 3, 4, 5], [6, 7], [8, 9, 10, 11, 12, 13, 14, 15, 16, 17]]

In [97]:

#Now, we seperate the transactions based on their indices in the timestamp array
split_inds = {k: [] for k in range(3)}
for i in range(3):
    for day in split[i]:
        split_inds[i].append(daily_inds[day]) #split_inds contains a list for each split (tr,val,te) which contains the indices of each day seperately

tr_inds = torch.cat(split_inds[0])
val_inds = torch.cat(split_inds[1])
te_inds = torch.cat(split_inds[2])

tr_inds.shape, val_inds.shape, te_inds.shape

(torch.Size([3248921]), torch.Size([965524]), torch.Size([863900]))

In [101]:
f"Total train samples: {tr_inds.shape[0] / y.shape[0] * 100 :.2f}% || IR: \
{y[tr_inds].float().mean() * 100 :.2f}% || Train days: {split[0][:5]}"

'Total train samples: 63.98% || IR: 0.08% || Train days: [0, 1, 2, 3, 4]'

In [103]:
f"Total val samples: {val_inds.shape[0] / y.shape[0] * 100 :.2f}% || IR: \
{y[val_inds].float().mean() * 100:.2f}% || Val days: {split[1][:5]}"

'Total val samples: 19.01% || IR: 0.11% || Val days: [6, 7]'

In [104]:

f"Total test samples: {te_inds.shape[0] / y.shape[0] * 100 :.2f}% || IR: \
{y[te_inds].float().mean() * 100:.2f}% || Test days: {split[2][:5]}"


'Total test samples: 17.01% || IR: 0.19% || Test days: [8, 9, 10, 11, 12]'

In [105]:
tr_x, val_x, te_x = x, x, x
e_tr = tr_inds.numpy()
e_val = np.concatenate([tr_inds, val_inds])

tr_edge_index,  tr_edge_attr,  tr_y,  tr_edge_times  = edge_index[:,e_tr],  edge_attr[e_tr],  y[e_tr],  timestamps[e_tr]
val_edge_index, val_edge_attr, val_y, val_edge_times = edge_index[:,e_val], edge_attr[e_val], y[e_val], timestamps[e_val]
te_edge_index,  te_edge_attr,  te_y,  te_edge_times  = edge_index,          edge_attr,        y,        timestamps

In [107]:
class GraphData(Data):
    '''This is the homogenous graph object we use for GNN training if reverse MP is not enabled'''
    def __init__(
        self,
        x: OptTensor = None,
        edge_index: OptTensor = None,
        edge_attr: OptTensor = None,
        y: OptTensor = None,
        pos: OptTensor = None,
        readout: str = 'edge',
        num_nodes: int = None,
        timestamps: OptTensor = None,
        node_timestamps: OptTensor = None,
        **kwargs
    ):
        super().__init__(x, edge_index, edge_attr, y, pos, **kwargs)
        self.readout = readout
        self.loss_fn = 'ce'
        self.num_nodes = int(self.x.shape[0])
        self.node_timestamps = node_timestamps
        if timestamps is not None:
            self.timestamps = timestamps
        elif edge_attr is not None:
            self.timestamps = edge_attr[:,0].clone()
        else:
            self.timestamps = None

In [108]:
def z_norm(data):
    std = data.std(0).unsqueeze(0)
    std = torch.where(std == 0, torch.tensor(1, dtype=torch.float32).cpu(), std)
    return (data - data.mean(0).unsqueeze(0)) / std

In [109]:
tr_data = GraphData (x=tr_x,  y=tr_y,  edge_index=tr_edge_index,  edge_attr=tr_edge_attr,  timestamps=tr_edge_times )
val_data = GraphData(x=val_x, y=val_y, edge_index=val_edge_index, edge_attr=val_edge_attr, timestamps=val_edge_times)
te_data = GraphData (x=te_x,  y=te_y,  edge_index=te_edge_index,  edge_attr=te_edge_attr,  timestamps=te_edge_times )

    #Normalize data
tr_data.x = val_data.x = te_data.x = z_norm(tr_data.x)
tr_data.edge_attr, val_data.edge_attr, te_data.edge_attr = z_norm(tr_data.edge_attr), z_norm(val_data.edge_attr), z_norm(te_data.edge_attr)

In [111]:
tr_data.edge_attr = torch.cat([torch.arange(tr_data.edge_attr.shape[0]).view(-1, 1), tr_data.edge_attr], dim=1)
val_data.edge_attr = torch.cat([torch.arange(val_data.edge_attr.shape[0]).view(-1, 1), val_data.edge_attr], dim=1)
te_data.edge_attr = torch.cat([torch.arange(te_data.edge_attr.shape[0]).view(-1, 1), te_data.edge_attr], dim=1)

In [112]:
transform = None

In [114]:
tr_loader =  LinkNeighborLoader(tr_data, num_neighbors=args.num_neighs, batch_size=args.batch_size, shuffle=True, transform=transform)
val_loader = LinkNeighborLoader(val_data,num_neighbors=args.num_neighs, edge_label_index=val_data.edge_index[:, val_inds],
                                        edge_label=val_data.y[val_inds], batch_size=args.batch_size, shuffle=False, transform=transform)
te_loader =  LinkNeighborLoader(te_data,num_neighbors=args.num_neighs, edge_label_index=te_data.edge_index[:, te_inds],
                                edge_label=te_data.y[te_inds], batch_size=args.batch_size, shuffle=False, transform=transform)

In [116]:
tr_data

GraphData(x=[515088, 1], edge_index=[2, 3248921], edge_attr=[3248921, 5], y=[3248921], readout='edge', loss_fn='ce', num_nodes=515088, timestamps=[3248921])