In this notebook, we create the variables needed as dashboard inputs to visualize the fraud data.

In [1]:
import pandas as pd
import numpy as np
import torch
from torch_geometric.data import HeteroData
import networkx as nx
import plotly.graph_objs as go
import os
import sys
sys.path.append(os.path.abspath('../model/'))
from utils import create_hetero_data

In [8]:
# --- Dataset configuration and loading ---------------------------------------
# NOTE(review): `path_hetero_data` is never read in the visible cells; it is kept
# only in case a later cell relies on it.
path_hetero_data = ''

# Dataset to visualize. Values listed by the original author:
# 'elliptic', 'dgraph_fin', 'amlsim_mixed'.
data_name = 'amlsim_mixed'
ext_rate = 0.3
data_path = f'../hetero_data/{data_name}/ext_{ext_rate}/'

# Load the account table and build the heterogeneous graph from the same folder.
accounts = pd.read_csv(data_path + 'accounts.csv')
data = create_hetero_data(data_path)

# Attach the saved external-node embeddings when the file exists.
ext_embed_file = os.path.join(data_path, 'ext_embed.pt')
if os.path.exists(ext_embed_file):
    # map_location='cpu' lets a file that was saved from a GPU session load on a
    # CPU-only machine; the cells below convert the features to NumPy anyway.
    data['external'].x = torch.load(ext_embed_file, map_location='cpu')
else:
    # Best-effort: only warn here. Cells that read data['external'].x depend on
    # whatever create_hetero_data provided for the external nodes.
    print('External node embeddings not found, please run the feature balanced algorithm.')

# Node identifiers stored on each node type, converted to NumPy arrays.
internal_ids = data['internal'].id.numpy()
external_ids = data['external'].id.numpy()

# Edge indices of the three relations, converted to NumPy arrays.
# Presumably each has shape (2, num_edges) with sources in row 0 and targets in
# row 1 -- TODO confirm against create_hetero_data.
internal_internal_edges = data['internal', 'internal_txn', 'internal'].edge_index.numpy()
internal_external_edges = data['internal', 'external_withdraw', 'external'].edge_index.numpy()
external_internal_edges = data['external', 'external_deposit', 'internal'].edge_index.numpy()

# Undirected NetworkX graph holding both node sets. Internal-to-internal edges
# are added as well, so the result is not strictly bipartite.
B = nx.Graph()

# Internal nodes are inserted first, then external ones; each node carries the
# `bipartite` and `type` attributes of its group.
for node_ids, part, kind in (
    (internal_ids, 0, 'internal'),
    (external_ids, 1, 'external'),
):
    B.add_nodes_from(node_ids, bipartite=part, type=kind)

# NOTE(review): nodes are keyed by the `.id` values while edges use the values
# stored in `edge_index`. If those are per-type positions rather than ids,
# internal and external endpoints would share one key space -- confirm.
for edge_array in (internal_external_edges, external_internal_edges, internal_internal_edges):
    # Transposing yields one (source, target) pair per row.
    for edge in edge_array.T:
        B.add_edge(*edge)

In [9]:
# Layout coordinates: nx.spring_layout(B) took too long to compute, so the node
# features are projected to 2D with PCA instead.
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE  # imported by the original cell; not used below

# Fixed seed so the saved layout is reproducible when scikit-learn selects a
# randomized SVD solver.
pca = PCA(n_components=2, random_state=0)

# Node features as NumPy arrays (detached and moved to CPU first).
# The misspelled name `internal_emmbedding` is kept because later cells use it.
internal_emmbedding = data['internal'].x.detach().cpu().numpy()
external_embedding = data['external'].x.detach().cpu().numpy()

# Stack internal rows first, then external rows; `pos` has one 2D point per row
# of X in that same order.
X = np.concatenate([internal_emmbedding, external_embedding], axis=0)
pos = pca.fit_transform(X)

In [10]:
# Shapes of the id arrays and feature matrices: internal pair first, then external.
(
    internal_ids.shape,
    internal_emmbedding.shape,
    external_ids.shape,
    external_embedding.shape,
)

((13898,), (13898, 73), (6102,), (6102, 73))

In [11]:
# Save the 2D layout array `pos` to a pickle file in the current working
# directory, named after the dataset and ext_rate.
import pickle

layout_path = f'{data_name}_ext_{ext_rate}_layout.pkl'
with open(layout_path, 'wb') as layout_file:
    pickle.dump(pos, layout_file)

In [7]:
# Total number of embedding rows (internal + external).
internal_emmbedding.shape[0] + external_embedding.shape[0]

203769

In [13]:
# Reload the account table from the dataset folder.
accounts = pd.read_csv(f'{data_path}accounts.csv')

In [15]:
# Number of rows in the account table.
accounts.shape[0]

20000