Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.

SPDX-License-Identifier: Apache-2.0

# Notebook for dataloader, model training and inference 
## This notebook consists of steps to 
1. load the processed graph data (edge lists, labels and node/edge counts) and assemble it into a data dict for the data loader used in model training 
2. load the model training configurations 
3. pass the data dict in step (1) to the model 
4. train the model 
5. apply model inference on the specific snapshot

In [1]:
import sys 
import os

In [2]:
sys.path.append('../../')
sys.path.append('../../src/')

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
import pickle
import numpy as np
import torch

  from .autonotebook import tqdm as notebook_tqdm


# Load processed data 

In [5]:
# Load the preprocessed training data. The context manager closes the file
# handle even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open("../../data/03_primary/financial_fraud/training_data.pkl", 'rb') as file:
    data = pickle.load(file)

In [6]:
rows, cols, labels, weights, headtail, train_size, test_size, nb_nodes, nb_edges = data

In [7]:
# the first 5 snapshots are used for training and the 5 later snapshots for testing
train_size, test_size, nb_nodes, nb_edges 

(5, 5, 4162, 47132)

In [8]:
type(headtail)

numpy.ndarray

In [9]:
headtail # object ndarray of Python lists of node indexes (one list per entry), not a scipy sparse matrix; presumably an adjacency-list view of the edges -- confirm against the preprocessing step

array([list([0, 4119, 4127, 4134, 4136, 4146, 4154]),
       list([1, 4121, 4122, 4133, 4140, 4146, 4149, 4151, 4152, 4155]),
       list([2, 4112, 4130, 4134, 4136, 4142, 4155, 4157]), ...,
       list([46, 147, 769, 1100, 1453, 1478, 1539, 1554, 1632, 1654, 1660, 1782, 1917, 1993, 2187, 2387, 2413, 2473, 2594, 2624, 2626, 2772, 2816, 2844, 2903, 2981, 3036, 3104, 3303, 3326, 3412, 3438, 3442, 3488, 3685, 3871, 4159]),
       list([25, 48, 50, 61, 94, 108, 111, 123, 126, 137, 144, 149, 173, 188, 210, 224, 275, 289, 291, 295, 297, 302, 323, 330, 355, 365, 409, 415, 416, 497, 498, 509, 510, 529, 530, 552, 556, 571, 588, 593, 595, 613, 617, 629, 676, 706, 707, 751, 773, 793, 801, 808, 818, 821, 832, 848, 867, 868, 870, 889, 890, 893, 919, 961, 964, 997, 1000, 1007, 1017, 1050, 1064, 1076, 1079, 1080, 1086, 1138, 1141, 1143, 1151, 1156, 1241, 1270, 1281, 1287, 1374, 1385, 1398, 1413, 1416, 1425, 1452, 1507, 1521, 1530, 1534, 1548, 1553, 1557, 1584, 1586, 1607, 1611, 1616, 1617, 1654, 1661

In [10]:
rows #source nodes of edges stored as row indexes

[array([3317, 2363, 3396, ..., 1738, 2754, 2754], dtype=int32),
 array([2431, 2431, 2350, ..., 3564, 2144, 4055], dtype=int32),
 array([ 642, 3586,   67, ..., 2095, 3604, 2196], dtype=int32),
 array([3989, 3989, 1160, ...,  320,  782,  782], dtype=int32),
 array([2515,  255,  985, ..., 2817, 1885, 2495], dtype=int32),
 array([2495, 2495, 1447, ..., 1162, 3497, 3497], dtype=int32),
 array([ 308, 1099, 1099, ...,  225, 2671, 1855], dtype=int32),
 array([1948, 1747, 1381, ..., 1106, 3293, 3859], dtype=int32),
 array([2286, 2291, 2493, ..., 1003, 2395, 2186], dtype=int32),
 array([1384, 1259,  444, ...,  529, 1083, 3304], dtype=int32)]

In [11]:
cols # target nodes of edges, one array per snapshot; used as the column indexes when the adjacency matrix is built

[array([4148, 4154, 4127, ..., 4132, 4146, 4148], dtype=int32),
 array([4122, 4114, 4146, ..., 4131, 4150, 4142], dtype=int32),
 array([4158, 4131, 4146, ..., 4116, 4136, 4142], dtype=int32),
 array([4116, 4150, 4134, ..., 4117, 4143, 4123], dtype=int32),
 array([4149, 4149, 4127, ..., 4143, 4116, 4139], dtype=int32),
 array([4153, 4160, 4143, ..., 4157, 4112, 4135], dtype=int32),
 array([4152, 4154, 4120, ..., 4130, 4157, 4122], dtype=int32),
 array([4121, 4157, 4116, ..., 4157, 4114, 4157], dtype=int32),
 array([4157, 4157, 4142, ..., 4142, 4130, 4142], dtype=int32),
 array([4130, 4130, 4130, ..., 4143, 4130, 4130], dtype=int32)]

In [12]:
# Number of node indexes stored in each `headtail` entry.
degrees = np.array(list(map(len, headtail)))

# Snapshots are numbered 0..num_snap-1: the first `train_size` are used for
# training, the remaining ones for testing.
num_snap = test_size + train_size
snap_ids = list(range(num_snap))
snap_train = snap_ids[:train_size]
snap_test = snap_ids[train_size:]

# One LongTensor of labels per snapshot.
labels = [torch.LongTensor(snapshot_labels) for snapshot_labels in labels]

In [13]:
snap_train

[0, 1, 2, 3, 4]

In [14]:
snap_test

[5, 6, 7, 8, 9]

In [15]:
labels, len(labels), labels[0].shape

([tensor([0, 0, 0,  ..., 0, 0, 0]),
  tensor([0, 0, 0,  ..., 0, 0, 0]),
  tensor([0, 0, 1,  ..., 0, 0, 0]),
  tensor([0, 0, 0,  ..., 0, 0, 0]),
  tensor([1, 1, 0,  ..., 0, 0, 0]),
  tensor([0, 0, 0,  ..., 0, 0, 0]),
  tensor([1, 0, 0,  ..., 0, 0, 0]),
  tensor([0, 0, 0,  ..., 0, 0, 0]),
  tensor([0, 0, 0,  ..., 0, 0, 0]),
  tensor([0, 0, 0,  ..., 0, 0, 0])],
 10,
 torch.Size([5000]))

In [16]:
# Node ids are already contiguous integers, so the index -> id map is the identity.
node_ids = range(nb_nodes)
index_id_map = dict(zip(node_ids, node_ids))
idx = np.array(list(node_ids))

# Define data dictionary

In [17]:
from anomaly_detection_spatial_temporal_data.model.model_config import TaddyConfig

In [18]:
import yaml

In [19]:
train_config_file = '../../conf/base/parameters/taddy.yml'

In [20]:
# Parse the training configuration. A malformed file is re-raised with the
# file path attached: merely printing the error would leave `train_config`
# undefined and make a later cell fail with an unrelated NameError.
with open(train_config_file, "r") as stream:
    try:
        train_config = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        raise ValueError(f"Could not parse training config: {train_config_file}") from exc
print(train_config)

{'data_load_options': {'c': 0.15, 'eps': 0.001, 'random_state': 3, 'batch_size': 256, 'load_all_tag': False, 'neighbor_num': 5, 'window_size': 2, 'compute_s': True, 'eigen_file_name': 'data/05_model_input/eigen.pkl'}, 'model_options': {'neighbor_num': 5, 'window_size': 2, 'batch_size': 256, 'embedding_dim': 32, 'num_hidden_layers': 2, 'num_attention_heads': 2, 'seed': 1, 'print_feq': 10, 'lr': 0.001, 'weight_decay': '5e-4', 'max_epoch': 10, 'spy_tag': True, 'max_hop_dis_index': 100, 'max_inti_pos_index': 100, 'hidden_act': 'gelu', 'hidden_dropout_prob': 0.5, 'attention_probs_dropout_prob': 0.3, 'initializer_range': 0.02, 'layer_norm_eps': 1e-12, 'is_decoder': False, 'save_directory': 'data/07_model_output/'}, 'infer_options': {'snap_num': 6}}


In [21]:
eigen_file_name = "../../data/05_model_input/financial_fraud/eigen_tmp.pkl"
data_loader_config = train_config['data_load_options']

In [22]:
from anomaly_detection_spatial_temporal_data.utils import ensure_directory

In [23]:
data_loader_config

{'c': 0.15,
 'eps': 0.001,
 'random_state': 3,
 'batch_size': 256,
 'load_all_tag': False,
 'neighbor_num': 5,
 'window_size': 2,
 'compute_s': True,
 'eigen_file_name': 'data/05_model_input/eigen.pkl'}

In [24]:
type(data_loader_config['compute_s'])

bool

In [25]:
import scipy.sparse as sp
from numpy.linalg import inv

def normalize(mx):
    """Row-normalize a matrix so that every non-empty row sums to 1.

    Each row is multiplied by the reciprocal of its row sum. Rows whose sum is
    zero get a scaling factor of 0 and therefore stay all-zero.

    Args:
        mx: 2-D matrix, either a dense ``numpy.ndarray`` or a scipy sparse
            matrix.

    Returns:
        The row-scaled matrix as produced by ``sp.diags(...).dot(mx)``.
    """
    rowsum = np.asarray(mx.sum(1)).flatten()
    # np.power raises ValueError for integer arrays with a negative exponent,
    # so integer-valued matrices must be promoted to float first.
    if not np.issubdtype(rowsum.dtype, np.inexact):
        rowsum = rowsum.astype(np.float64)
    # Zero row sums produce inf here; that is expected and reset to 0 below,
    # so the divide-by-zero warning is silenced.
    with np.errstate(divide='ignore'):
        r_inv = np.power(rowsum, -1)
    r_inv[np.isinf(r_inv)] = 0.
    r_mat_inv = sp.diags(r_inv)
    return r_mat_inv.dot(mx)

def normalize_adj(adj):
    """Apply symmetric degree scaling to an adjacency matrix.

    With D the diagonal matrix of row sums of ``adj``, the result is
    ``(adj . D^-1/2)^T . D^-1/2`` returned in COO format. Rows that sum to
    zero receive a scaling factor of 0.
    """
    coo = sp.coo_matrix(adj)
    degree = np.array(coo.sum(1))
    scale = np.power(degree, -0.5).flatten()
    scale[np.isinf(scale)] = 0.
    scale_mat = sp.diags(scale)
    col_scaled = coo.dot(scale_mat)
    return col_scaled.transpose().dot(scale_mat).tocoo()

def adj_normalize(mx):
    """Symmetrically scale a sparse matrix: D^-1/2 . mx . D^-1/2.

    D is the diagonal matrix of row sums of ``mx``. Rows that sum to zero
    receive a scaling factor of 0.
    """
    degree = np.array(mx.sum(1))
    inv_sqrt_degree = np.power(degree, -0.5).flatten()
    inv_sqrt_degree[np.isinf(inv_sqrt_degree)] = 0.
    scaling = sp.diags(inv_sqrt_degree)
    left_scaled = scaling.dot(mx)
    return left_scaled.dot(scaling)

def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    """Convert a scipy sparse matrix to a float32 torch sparse COO tensor.

    Args:
        sparse_mx: scipy sparse matrix in any format; it is converted to COO
            and cast to float32 first.

    Returns:
        A torch sparse COO tensor with the same shape and non-zero entries.
    """
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    # Index tensor of shape (2, nnz): first row holds row ids, second column ids.
    indices = torch.from_numpy(
        np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
    values = torch.from_numpy(sparse_mx.data)
    shape = torch.Size(sparse_mx.shape)
    # torch.sparse.FloatTensor(...) is a deprecated legacy constructor;
    # torch.sparse_coo_tensor is its documented replacement.
    return torch.sparse_coo_tensor(indices, values, shape)

def preprocess_adj(adj):
    """Symmetrize an adjacency matrix, add self-loops, normalize it and return a torch sparse tensor.

    The symmetrization keeps, for every pair (i, j), the larger of
    ``adj[i, j]`` and ``adj[j, i]``. The identity matrix is then added and the
    result is passed through ``normalize_adj`` before conversion to torch.
    """
    transposed = adj.T
    # True where the transposed entry is larger and should replace the original.
    take_transpose = transposed > adj
    symmetric = adj + transposed.multiply(take_transpose) - adj.multiply(take_transpose)
    with_self_loops = symmetric + sp.eye(symmetric.shape[0])
    return sparse_mx_to_torch_sparse_tensor(normalize_adj(with_self_loops))

def get_adjs(rows, cols, weights, nb_nodes, eigen_file_name, data_loader_config):
    """Build per-snapshot adjacency tensors and, optionally, diffusion matrices for node sampling.

    For every snapshot ``i`` a ``nb_nodes x nb_nodes`` CSR matrix is built from
    ``(weights[i], (rows[i], cols[i]))`` and passed through ``preprocess_adj``.

    When ``data_loader_config['compute_s']`` is truthy, a dense matrix
    ``c * inv(I - (1 - c) * adj_normalize(adj))`` (with ``c`` taken from
    ``data_loader_config['c']``) is produced per snapshot, its diagonal is
    zeroed and it is row-normalized. These matrices (called "eigen" in this
    code) are cached as a pickled list of CSR matrices in ``eigen_file_name``
    and loaded from there when the file already exists.

    When ``compute_s`` is falsy, the cache file is neither read nor written
    and one ``None`` per snapshot is returned instead.

    Args:
        rows: per-snapshot arrays of source node indexes.
        cols: per-snapshot arrays of target node indexes.
        weights: per-snapshot arrays of edge weights.
        nb_nodes: number of nodes (side length of every adjacency matrix).
        eigen_file_name: path of the pickle cache for the diffusion matrices.
        data_loader_config: mapping providing ``'compute_s'`` and ``'c'``.

    Returns:
        ``(adjs, eigen_adjs)``: the list of preprocessed adjacency tensors and
        the list of dense diffusion matrices (or ``None`` placeholders).
    """
    compute_s = data_loader_config['compute_s']
    # Only consult the cache when the matrices are actually requested.
    # Otherwise a stale cache would be returned with None entries appended, or
    # an empty list would be written and poison later compute_s=True runs.
    load_cached = bool(compute_s) and os.path.exists(eigen_file_name)
    generate_eigen = bool(compute_s) and not load_cached

    eigen_adjs = []
    eigen_adjs_sparse = []
    if load_cached:
        print('Loading eigen from: ' + eigen_file_name)
        # NOTE: pickle.load can execute arbitrary code -- only use trusted cache files.
        # NOTE(review): the cache is not checked against the current data
        # (number of snapshots / nodes); delete the file when the data changes.
        with open(eigen_file_name, 'rb') as f:
            eigen_adjs_sparse = pickle.load(f)
        eigen_adjs = [np.array(cached.todense()) for cached in eigen_adjs_sparse]
    elif generate_eigen:
        print('Generating eigen as: ' + eigen_file_name)

    adjs = []
    for i in range(len(rows)):
        adj = sp.csr_matrix((weights[i], (rows[i], cols[i])), shape=(nb_nodes, nb_nodes), dtype=np.float32)
        adjs.append(preprocess_adj(adj))
        if not compute_s:
            eigen_adjs.append(None)
        elif generate_eigen:
            c = data_loader_config['c']
            # Dense inverse of an nb_nodes x nb_nodes matrix: expensive, hence the cache.
            eigen_adj = c * inv((sp.eye(adj.shape[0]) - (1 - c) * adj_normalize(adj)).toarray())
            # Drop self-scores before row-normalizing.
            np.fill_diagonal(eigen_adj, 0.)
            eigen_adj = normalize(eigen_adj)
            eigen_adjs.append(eigen_adj)
            eigen_adjs_sparse.append(sp.csr_matrix(eigen_adj))

    if generate_eigen:
        with open(eigen_file_name, 'wb') as f:
            pickle.dump(eigen_adjs_sparse, f, pickle.HIGHEST_PROTOCOL)

    return adjs, eigen_adjs

In [26]:
# NOTE(review): ensure_directory presumably creates the parent folder of the cache file -- confirm in utils.
ensure_directory(eigen_file_name)
# One (num_edges, 2) array of [source, target] node indexes per snapshot.
edges = [np.vstack((rows[i], cols[i])).T for i in range(num_snap)]
# Builds the preprocessed adjacency tensors and loads (or computes and caches) the matrices used for node sampling.
adjs, eigen_adjs = get_adjs(rows, cols, weights, nb_nodes, eigen_file_name, data_loader_config)

Loading eigen from: ../../data/05_model_input/financial_fraud/eigen_tmp.pkl


In [27]:
# Input bundle handed to the Taddy model constructor.
data_dict = {
    'X': None,  # no node feature matrix is supplied
    'A': adjs,  # per-snapshot preprocessed adjacency tensors from get_adjs
    'S': eigen_adjs,  # per-snapshot matrices for node sampling from get_adjs
    'index_id_map': index_id_map,  # identity mapping node index -> node id
    'edges': edges,  # per-snapshot (num_edges, 2) arrays of [source, target]
    'y': labels,  # per-snapshot LongTensor of labels
    'idx': idx,  # array of node indexes 0..nb_nodes-1
    'snap_train': snap_train,  # snapshot ids used for training
    'degrees': degrees,  # length of each headtail entry
    'snap_test': snap_test,  # snapshot ids used for testing
    'num_snap': num_snap}  # total number of snapshots

# Define model and load model training config 

In [28]:
from anomaly_detection_spatial_temporal_data.model.dynamic_graph import Taddy

In [29]:
train_config

{'data_load_options': {'c': 0.15,
  'eps': 0.001,
  'random_state': 3,
  'batch_size': 256,
  'load_all_tag': False,
  'neighbor_num': 5,
  'window_size': 2,
  'compute_s': True,
  'eigen_file_name': 'data/05_model_input/eigen.pkl'},
 'model_options': {'neighbor_num': 5,
  'window_size': 2,
  'batch_size': 256,
  'embedding_dim': 32,
  'num_hidden_layers': 2,
  'num_attention_heads': 2,
  'seed': 1,
  'print_feq': 10,
  'lr': 0.001,
  'weight_decay': '5e-4',
  'max_epoch': 10,
  'spy_tag': True,
  'max_hop_dis_index': 100,
  'max_inti_pos_index': 100,
  'hidden_act': 'gelu',
  'hidden_dropout_prob': 0.5,
  'attention_probs_dropout_prob': 0.3,
  'initializer_range': 0.02,
  'layer_norm_eps': 1e-12,
  'is_decoder': False,
  'save_directory': 'data/07_model_output/'},
 'infer_options': {'snap_num': 6}}

In [30]:
train_config['model_options']['save_directory'] = '../../data/07_model_output/financial_fraud' #change save path for notebook

In [31]:
# exist_ok=True makes directory creation idempotent and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(train_config['model_options']['save_directory'], exist_ok=True)

In [32]:
model_config = TaddyConfig(config=train_config['model_options'])
model_obj = Taddy(data_dict, model_config)

In [33]:
model_config.save_directory

'../../data/07_model_output/financial_fraud'

# Train model

In [34]:
learned_result,save_model_path = model_obj.run()

Epoch: 1, loss:0.6934, Time: 28.0603s
Epoch: 2, loss:0.6833, Time: 28.0489s
Epoch: 3, loss:0.6767, Time: 31.4435s
Epoch: 4, loss:0.6690, Time: 28.4979s
Epoch: 5, loss:0.6602, Time: 29.2117s
Epoch: 6, loss:0.6230, Time: 28.3041s
Epoch: 7, loss:0.6042, Time: 29.8792s
Epoch: 8, loss:0.5904, Time: 28.3881s
Epoch: 9, loss:0.5320, Time: 28.6048s
Epoch: 10, loss:0.5016, Time: 28.1970s
Snap: 05 | AUC: 0.6736
Snap: 06 | AUC: 0.6444
Snap: 07 | AUC: 0.6407
Snap: 08 | AUC: 0.6331
Snap: 09 | AUC: 0.8296
TOTAL AUC:0.6006


# Model training result 

In [35]:
learned_result

{1: {'train_loss': 0.6933763027191162},
 2: {'train_loss': 0.6833107173442841},
 3: {'train_loss': 0.6766712069511414},
 4: {'train_loss': 0.6690231710672379},
 5: {'train_loss': 0.6601802408695221},
 6: {'train_loss': 0.6229529976844788},
 7: {'train_loss': 0.6041898727416992},
 8: {'train_loss': 0.5904221832752228},
 9: {'train_loss': 0.5320391654968262},
 10: {'train_loss': 0.5016221180558205, 'test_auc': 0.6005636457322073}}

In [36]:
save_model_path

'../../data/07_model_output/financial_fraud/taddy_model_9.pth'

# Run inference on the specific snapshot 

### load trained model 

In [37]:
import torch
import transformers

In [41]:
# Loads the complete pickled model object written by training (not just a state_dict).
# NOTE(review): torch.load unpickles arbitrary objects -- only load checkpoints you trust.
# Newer PyTorch releases default to weights_only=True, which would reject a
# full-model pickle like this one; confirm against the installed version.
model = torch.load(save_model_path)

In [42]:
type(model)

anomaly_detection_spatial_temporal_data.model.dynamic_graph.Taddy

In [43]:
snap_num = 9

In [44]:
pred = model.predict(snap_num)

Generating embeddings...
Embeddings created!


In [45]:
type(pred)

numpy.ndarray

In [46]:
pred

array([0.28766072, 0.28766072, 0.30370262, ..., 0.8222538 , 0.28766072,
       0.30370262], dtype=float32)

In [47]:
# Show the ground-truth labels of the snapshot that was just scored, so they
# line up with `pred` and with the AUC computed from labels[snap_num].
labels[snap_num]

tensor([0, 0, 0,  ..., 0, 0, 0])

In [48]:
from sklearn import metrics

In [49]:
auc = metrics.roc_auc_score(labels[snap_num],pred)

In [50]:
auc

0.8296228020419738

# References

Edgar Alonso Lopez-Rojas and Stefan Axelsson. 2014. BANKSIM: A BANK PAYMENTS SIMULATOR FOR FRAUD DETECTION RESEARCH.

Yixin Liu, Shirui Pan, Yu Guang Wang, Fei Xiong, Liang Wang, Qingfeng Chen, and Vincent CS Lee. 2021. Anomaly Detection in Dynamic Graphs via Transformer.