Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.

SPDX-License-Identifier: Apache-2.0


# Prepare Financial Fraud dataset for dynamic graph model (TADDY)

TADDY is an anomaly detection model that detects anomalous edges in dynamic (changing over time) graphs. It learns edge embeddings that combine the spatial information of the graph (neighboring nodes and edges) with temporal information. A fully connected layer then classifies the embeddings as anomaly/not anomaly.

The model expects graph snapshots with labeled edges, so this notebook prepares the BankSim dataset for TADDY.

## This notebook consists of steps to 
1. Process raw data into edge, node list data for the data loader for model training 
2. Create graph snapshots, with earlier snapshots used for training and later snapshots for testing

In [1]:
import sys 
import os

In [2]:
# Add the repository's src directory (relative to this notebook) to the import path
# so the `anomaly_detection_spatial_temporal_data` import below can be resolved
# (presumably the package lives under ../../src/ — confirm against the repo layout)
sys.path.append('../../src/')

In [3]:
import pandas as pd
import numpy as np
import pickle

from anomaly_detection_spatial_temporal_data.utils import ensure_directory

# Load raw data

In [6]:
# Location of the raw BankSim transaction log, relative to this notebook
raw_data_path = '../../data/01_raw/financial_fraud/bs140513_032310.csv'

# Load all transactions into a DataFrame
raw_trans_data = pd.read_csv(raw_data_path)

# Display (rows, columns) as a quick sanity check
raw_trans_data.shape

(594643, 10)

In [11]:
# Location of the companion "NET" file that ships with the raw transaction log
raw_net_data_path = '../../data/01_raw/financial_fraud/bsNET140513_032310.csv'

# NOTE(review): raw_net_trans_data is loaded and inspected here but is not referenced
# by any later cell visible in this notebook; the edge list is built from raw_trans_data.
raw_net_trans_data = pd.read_csv(raw_net_data_path)

# Display (rows, columns) as a quick sanity check
raw_net_trans_data.shape

(594643, 5)

# Process edge data for dynamic graph model 
## Customer can be treated as source node and merchant can be treated as target node 

In [13]:
# Keep only the columns needed to build time-stamped, labeled edges:
# step (time), customer (source), merchant (target), plus attributes and the fraud flag
edge_columns = ['step','customer','merchant','category','amount','fraud']
edges = raw_trans_data[edge_columns]

In [17]:
# Drop self loops, i.e. transactions whose customer id equals the merchant id
is_self_loop = edges['customer'] == edges['merchant']
edges = edges.loc[~is_self_loop]

# Display (rows, columns) after filtering
edges.shape

(594643, 6)

### check duplicated (customer, merchant) pairs 

In [None]:
# Number of transactions per (customer, merchant) pair; there are 47132 unique pairs
customer_merchant_trans_count = (
    edges.groupby(['customer', 'merchant'])[['step']].count()
)

In [None]:
# Number of fraud-flagged transactions per (customer, merchant) pair
customer_merchant_trans_fraud = (
    edges.groupby(['customer', 'merchant'])[['fraud']].sum()
)

In [20]:
customer_merchant_trans_fraud.columns

Index(['fraud'], dtype='object')

### Observation: 1065 (customer, merchant) pairs were flagged as fraud more than once

In [21]:
customer_merchant_trans_fraud.loc[customer_merchant_trans_fraud.fraud>1]

Unnamed: 0_level_0,Unnamed: 1_level_0,fraud
customer,merchant,Unnamed: 2_level_1
'C1001065306','M17379832',2
'C1001065306','M480139044',3
'C1001065306','M980657600',2
'C1007572087','M732195782',2
'C1013313546','M980657600',2
...,...,...
'C974315171','M980657600',3
'C980181294','M480139044',2
'C980181294','M732195782',2
'C989137613','M732195782',2


### Observation: 1108 (customer, merchant) pairs had changing labels

In [22]:
# Mean fraud flag per (customer, merchant) pair:
# 0.0 = never flagged, 1.0 = always flagged, anything in between = mixed labels
customer_merchant_trans_fraud_consistency = (
    edges.groupby(['customer', 'merchant'])[['fraud']].mean()
)
customer_merchant_trans_fraud_consistency

Unnamed: 0_level_0,Unnamed: 1_level_0,fraud
customer,merchant,Unnamed: 2_level_1
'C1000148617','M1053599405',0.0
'C1000148617','M1400236507',0.0
'C1000148617','M1741626453',0.0
'C1000148617','M1823072687',0.0
'C1000148617','M1842530320',0.0
...,...,...
'C999723254','M348934600',0.0
'C999723254','M349281107',0.0
'C999723254','M480139044',1.0
'C999723254','M855959430',0.0


In [23]:
# Show the pairs whose mean fraud flag is neither exactly 0 nor exactly 1,
# i.e. pairs that carry both fraud and non-fraud transactions
fraud_rate = customer_merchant_trans_fraud_consistency.fraud
customer_merchant_trans_fraud_consistency.loc[~((fraud_rate == 1) | (fraud_rate == 0))]

Unnamed: 0_level_0,Unnamed: 1_level_0,fraud
customer,merchant,Unnamed: 2_level_1
'C100045114','M1198415165',0.250000
'C100045114','M2122776122',0.500000
'C1001065306','M480139044',0.500000
'C1001065306','M50039827',0.200000
'C1001065306','M980657600',0.666667
...,...,...
'C995844287','M1198415165',0.333333
'C995844287','M855959430',0.200000
'C997029022','M480139044',0.333333
'C998690782','M732195782',0.500000


# Dedupe (customer, merchant) pair, only keep the last transaction (the latest)

In [24]:
edges.shape

(594643, 6)

In [25]:
# Keep exactly one row per (customer, merchant) pair: its most recent transaction.
# drop_duplicates(keep='last') keeps the last row in *row order*, so sort by the time
# column first. The sort is stable, which leaves rows already ordered by `step`
# (and the relative order of rows sharing a step) untouched.
edges_deduped = (
    edges
    .sort_values('step', kind='stable')
    .drop_duplicates(subset=['customer', 'merchant'], keep='last')
)

In [26]:
edges_deduped.shape

(47132, 6)

In [29]:
# (num_edges, 2) array of string ids: column 0 = customer, column 1 = merchant
edges_array = edges_deduped[['customer', 'merchant']].to_numpy()

### convert str ids to int indexes 

In [31]:
# vertexs: sorted array of the distinct customer/merchant ids found in edges_array.
# edges_1d: for every entry of edges_array, the position of its id inside vertexs
# (return_inverse=True), i.e. the integer node index that replaces the string id.
vertexs, edges_1d = np.unique(edges_array, return_inverse=True)

In [12]:
# vertexs, len(vertexs)

### save str ids to int indexes mapping

In [33]:
# String id -> integer node index; the index is the id's position in `vertexs`
# (the ids come from np.unique, so each one appears exactly once)
vertex_to_id = {node_name: node_idx for node_idx, node_name in enumerate(vertexs)}

In [35]:
# Two-column lookup table: `name` holds the original string id, `idx` its integer index
vertex_to_id_df = pd.DataFrame(
    {
        'name': list(vertex_to_id.keys()),
        'idx': list(vertex_to_id.values()),
    }
)

#### save id to index mapping


In [39]:
# Destination of the string-id -> integer-index lookup table
vertex_to_id_file_path = "../../data/02_intermediate/financial_fraud/node_id.csv"

# Project helper; presumably creates the parent directory if it is missing
# (confirm in anomaly_detection_spatial_temporal_data.utils)
ensure_directory(vertex_to_id_file_path)

# Write to the same path variable that was prepared above instead of repeating the
# literal, so the ensured directory and the written file cannot drift apart
vertex_to_id_df.to_csv(vertex_to_id_file_path, index=False)

In [41]:
# Fold the inverse indexes back into one (source, target) pair per edge
edges_idx = edges_1d.reshape(-1, 2)

In [42]:
edges_idx, len(edges_idx)

(array([[3317, 4148],
        [2363, 4154],
        [3396, 4127],
        ...,
        [ 529, 4143],
        [1083, 4130],
        [3304, 4130]]),
 47132)

### Check whether the node indexes for the top 3 edge list records are correct 
It's critical that the node indexes stay aligned with both the raw data and the indexes used in the graph (represented as a sparse matrix).

In [43]:
### manually checking the node ids for the node indexes
# (vertexs[3317], vertexs[4148]), (vertexs[2363], vertexs[4154]),(vertexs[3396], vertexs[4127]), (vertexs[3304], vertexs[4130])

(("'C623601481'", "'M50039827'"),
 ("'C2092526272'", "'M840466850'"),
 ("'C661876608'", "'M1741626453'"),
 ("'C616528518'", "'M1823072687'"))

In [44]:
### consistent with the raw data 
# edges_deduped.head(3)

Unnamed: 0,step,customer,merchant,category,amount,fraud
12,0,'C623601481','M50039827','es_health',68.79,0
148,0,'C2092526272','M840466850','es_tech',163.56,0
153,0,'C661876608','M1741626453','es_sportsandtoys',11.83,0


In [45]:
# print('vertex:', len(vertexs), 'edge:', len(edges_idx))

vertex: 4162 edge: 47132


# Find labels for the edge

In [46]:
from tqdm import tqdm

In [47]:
# Build the (num_edges, 3) int32 table [source index, target index, fraud label]
# in one vectorized pass instead of a Python-level iterrows() loop.
# Both id columns are translated through vertex_to_id; every id is present in that
# mapping because it was built from these same deduplicated edges.
# Row order follows edges_deduped, which later cells rely on for the train/test split.
edge_label_arr = np.column_stack(
    (
        edges_deduped['customer'].map(vertex_to_id).to_numpy(),
        edges_deduped['merchant'].map(vertex_to_id).to_numpy(),
        edges_deduped['fraud'].to_numpy(),
    )
).astype(np.int32)

100%|██████████| 47132/47132 [00:02<00:00, 20764.57it/s]


In [48]:
edge_label_arr.shape

(47132, 3)

In [50]:
# DataFrame view of edge_label_arr with named columns, used for inspection below
edge_label_postprocessed_df = pd.DataFrame(edge_label_arr, columns=['source','target','label'])

In [51]:
edge_label_postprocessed_df.head()

Unnamed: 0,source,target,label
0,3317,4148,0
1,2363,4154,0
2,3396,4127,0
3,948,4151,0
4,2086,4155,0
...,...,...,...
47127,1639,4130,0
47128,3369,4130,0
47129,529,4143,0
47130,1083,4130,0


In [53]:
# Output locations for the processed edge data
edge_label_df_file_path = "../../data/02_intermediate/financial_fraud/edge_label.csv"
edge_list_arr_file_path = "../../data/02_intermediate/financial_fraud/edge_list.npz"

# NOTE(review): edge_label_df_file_path is prepared here, but no cell visible in this
# notebook writes edge_label_postprocessed_df to it; only the .npz file is saved.
# Confirm whether a CSV export cell is missing.
ensure_directory(edge_label_df_file_path)
ensure_directory(edge_list_arr_file_path)

In [55]:
# Persist the labeled edge list under the key "data"; the target path already ends
# in .npz, so numpy writes to exactly this file
np.savez(edge_list_arr_file_path, data=edge_label_arr)

### check again the processed data are consistent with the raw data 

In [56]:
# (vertexs[edge_label_arr[0][0]], vertexs[edge_label_arr[0][1]])

("'C623601481'", "'M50039827'")

In [57]:
# edges_deduped.loc[(edges_deduped.customer ==vertexs[edge_label_arr[0][0]] )& (edges_deduped.merchant ==vertexs[edge_label_arr[0][1]])]

Unnamed: 0,step,customer,merchant,category,amount,fraud
12,0,'C623601481','M50039827','es_health',68.79,0


In [58]:
#check fraud ratio
edge_label_postprocessed_df['label'].value_counts(normalize=True)

0    0.912353
1    0.087647
Name: label, dtype: float64

# Split train/test data and generate data for graph dataloader 

In [59]:
edges_deduped.shape

(47132, 6)

In [60]:
edges_deduped.head()

Unnamed: 0,step,customer,merchant,category,amount,fraud
12,0,'C623601481','M50039827','es_health',68.79,0
148,0,'C2092526272','M840466850','es_tech',163.56,0
153,0,'C661876608','M1741626453','es_sportsandtoys',11.83,0
194,0,'C1436756684','M692898500','es_health',187.62,0
218,0,'C1960866892','M855959430','es_hyper',22.65,0
227,0,'C991774315','M349281107','es_fashion',79.71,0
253,0,'C1959067413','M209847108','es_wellnessandbeauty',86.09,0
256,0,'C1465698425','M2122776122','es_home',98.19,0
282,0,'C181787207','M1873032707','es_hotelservices',71.76,0
283,0,'C2063978670','M2011752106','es_hotelservices',194.59,0


In [62]:
len(edge_label_arr), len(vertexs)

(47132, 4162)

In [63]:
vertex_to_id_df.shape, edge_label_arr.shape

((4162, 2), (47132, 3))

In [65]:
# Graph size: n nodes (one per row of the id lookup table), m labeled edges
n = len(vertex_to_id_df)
m = len(edge_label_arr)

print(f"Number of edges: {m}, Number of nodes: {n}")

(47132, 4162)

In [67]:
# Fraction of edges (taken from the top of edge_label_arr) assigned to training
train_per = 0.5

# Index of the first test edge
train_num = int(np.floor(m * train_per))

# Positional split: rows before train_num are training samples, the rest are test samples
train, test = edge_label_arr[:train_num], edge_label_arr[train_num:]

23566

In [69]:
train.shape, test.shape

((23566, 3), (23566, 3))

# Build graph in the format of a sparse matrix with edge list 
Again, it's critical that the node indexes stay aligned with both the raw data and the indexes used in the graph (represented as a sparse matrix).

In [74]:
from scipy.sparse import csr_matrix,coo_matrix,eye

In [75]:
# n x n sparse adjacency matrix of the training edges: one unit entry per edge at
# (source index, target index)
unit_weights = np.ones(train.shape[0], dtype=np.int32)
edge_coords = (train[:, 0], train[:, 1])
train_mat = csr_matrix((unit_weights, edge_coords), shape=(n, n))

In [76]:
train_mat.shape

(4162, 4162)

In [77]:
# Make the adjacency symmetric so every edge is present in both directions
train_mat = train_mat + train_mat.T

#### check edgelist id with the sparse matrix idx

In [78]:
# train_mat[3317,4148], train_mat[4148,3317]

(1, 1)

In [79]:
# train_mat[86,4145], train_mat[4145,86] #being 0 because this edge is in the test set 

(0, 0)

In [80]:
# Symmetrize once more and add the identity so every node also has an entry for itself,
# then convert to List-of-Lists format to get per-row column lists
with_self_loops = train_mat + train_mat.T + eye(n)
train_mat = with_self_loops.tolil()

In [81]:
# For each node (row of the LIL matrix), the list of column indexes holding a nonzero
# entry, i.e. the node's neighbors in the training graph plus the node itself
headtail = train_mat.rows #store the indexes of edges

In [15]:
# headtail

In [83]:
# Length of every node's neighbor list (the list includes the node itself)
degrees = np.array(list(map(len, headtail)))

# Creating snapshots of graphs for the dataloader of TADDY model

In [84]:
# Number of consecutive edges that make up one graph snapshot
snap_size=5000

In [86]:
# Number of snapshots per split, rounded half up. A trailing partial snapshot is kept
# when it holds at least snap_size / 2 edges; a smaller remainder is left out of the
# snapshots built below.
train_size = int(len(train) / snap_size + 0.5) #making slices of snapshots
test_size = int(len(test) / snap_size + 0.5)

In [87]:
train_size, test_size

(5, 5)

In [88]:
# Per-snapshot containers: entry k of each list describes the k-th snapshot
rows, cols, weis, labs = [], [], [], []

# Cut the training edges into consecutive windows of snap_size edges each
for snap_idx in range(train_size):
    window = train[snap_idx * snap_size:(snap_idx + 1) * snap_size]

    rows.append(window[:, 0].astype(np.int32))  # source nodes, stored as row indexes
    cols.append(window[:, 1].astype(np.int32))  # target nodes, stored as column indexes
    weis.append(np.ones(len(window), dtype=np.int32))  # edge weights (all 1 in this experiment)
    labs.append(window[:, 2].astype(np.int32))  # fraud labels

In [89]:
# Append the test snapshots after the training ones, using the same windowing
for offset in range(0, test_size * snap_size, snap_size):
    chunk = test[offset:offset + snap_size]

    # columns of the edge table: 0 = source index, 1 = target index, 2 = label
    src, dst, lbl = (np.array(chunk[:, k], dtype=np.int32) for k in range(3))

    rows.append(src)
    cols.append(dst)
    weis.append(np.ones_like(src, dtype=np.int32))  # unit weight per edge
    labs.append(lbl)

In [90]:
len(rows), rows[0].shape

(10, (5000,))

In [91]:
rows[0]

array([3317, 2363, 3396, ..., 1738, 2754, 2754], dtype=int32)

In [92]:
len(cols), cols[0].shape

(10, (5000,))

In [93]:
cols[0]

array([4148, 4154, 4127, ..., 4132, 4146, 4148], dtype=int32)

In [94]:
len(labs), labs[0].shape

(10, (5000,))

In [95]:
labs[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int32)

### save all intermediate graph data

In [100]:
# Destination of the bundled snapshot data
train_test_data_file_path = '../../data/03_primary/financial_fraud/training_data.pkl'
ensure_directory(train_test_data_file_path)

# Positional bundle; whatever loads this file must unpack it in the same order:
# per-snapshot source indexes, target indexes, labels and weights, the per-node
# neighbor lists, the number of train and test snapshots, then node and edge counts
train_test_data = (rows,cols,labs,weis,headtail,train_size,test_size,n,m)

with open(train_test_data_file_path, 'wb') as f:
    pickle.dump(train_test_data, f)

# References

Edgar Alonso Lopez-Rojas and Stefan Axelsson. 2014. BANKSIM: A BANK PAYMENTS SIMULATOR FOR FRAUD DETECTION RESEARCH.

Yixin Liu, Shirui Pan, Yu Guang Wang, Fei Xiong, Liang Wang, Qingfeng Chen, and Vincent CS Lee. 2021. Anomaly Detection in Dynamic Graphs via Transformer.