In [5]:
import pandas as pd
import torch
import torch.nn.functional as F
import torch_geometric.transforms as T

from torch_geometric.data import Data

account_df = pd.read_csv('data/processed/kaggle_transaction_dataset_centrality.csv')
address_mapping = pd.DataFrame(data={
    'address': account_df['address'], 
    'mapped_id': account_df['address'].index
})

transaction_df = pd.read_csv('data/queried/full_transactions.csv')
transaction_mapping = transaction_df \
    .merge(
        address_mapping, 
        left_on='address from', 
        right_on='address'
    ) \
    .merge(
        address_mapping, 
        left_on='address to', 
        right_on='address', 
        suffixes=('_from', '_to')
    )

x = F.normalize(
    torch.tensor(
        account_df.drop(columns=['address', 'flag']).to_numpy(), 
        dtype=torch.float
    )
)
y = torch.tensor(
    account_df['flag'].to_numpy(), 
    dtype=torch.long
)
edge_index = torch.tensor(
    transaction_mapping[['mapped_id_from', 'mapped_id_to']].to_numpy(), 
    dtype=torch.long
)

data = Data(x=x, y=y, edge_index=edge_index)
data = T.ToUndirected()(data)
data = T.RandomNodeSplit(num_val=0, num_test=2000)(data)

In [37]:
classes, counts = torch.unique(y, return_counts=True)

# Calculate class weights based on their frequency
total_samples = torch.sum(counts).float()
class_weights = total_samples / (classes.numel() * counts.float())

print("Class Weights:", class_weights)

Class Weights: tensor([0.6940, 1.7887])


In [54]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GATConv, GCNConv

class GCN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(data.num_node_features, 64)
        self.conv2 = GCNConv(64, 64)
        self.conv3 = GCNConv(64, 2)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv3(x, edge_index)

        return F.log_softmax(x, dim=1)

class GAT(torch.nn.Module):
    def __init__(self, heads=4):
        super().__init__()
        self.conv1 = GATConv(data.num_node_features, 32, heads=heads)
        self.conv2 = GATConv(32 * heads, 32, heads=heads)
        self.conv3 = GATConv(32 * heads, 2, heads=1)  # Last layer typically uses a single head

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv3(x, edge_index)

        return F.log_softmax(x, dim=1)


In [57]:
from sklearn.metrics import f1_score, roc_auc_score

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GAT().to(device)
data = data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)

model.train()
for epoch in range(201):
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask], weight=class_weights)
    loss.backward()
    optimizer.step()
    
    if epoch % 10 == 0:
        pred = model(data).argmax(dim=1)
        correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()
        acc = int(correct) / int(data.test_mask.sum())
        f1 = f1_score(data.y[data.test_mask], pred[data.test_mask], average='weighted')
        auroc = roc_auc_score(data.y[data.test_mask], pred[data.test_mask])
        print(f'Epoch {str(epoch).zfill(3)}: Accuracy {acc:.3f} F1 Score {f1:.3f} AUROC: {auroc:.3f}')

Epoch 000: Accuracy 0.612 F1 Score 0.631 AUROC: 0.668
Epoch 010: Accuracy 0.676 F1 Score 0.693 AUROC: 0.749
Epoch 020: Accuracy 0.683 F1 Score 0.699 AUROC: 0.764
Epoch 030: Accuracy 0.684 F1 Score 0.700 AUROC: 0.769
Epoch 040: Accuracy 0.684 F1 Score 0.699 AUROC: 0.770
Epoch 050: Accuracy 0.688 F1 Score 0.703 AUROC: 0.771
Epoch 060: Accuracy 0.689 F1 Score 0.703 AUROC: 0.778
Epoch 070: Accuracy 0.680 F1 Score 0.695 AUROC: 0.770
Epoch 080: Accuracy 0.684 F1 Score 0.699 AUROC: 0.775
Epoch 090: Accuracy 0.688 F1 Score 0.702 AUROC: 0.774
Epoch 100: Accuracy 0.684 F1 Score 0.699 AUROC: 0.772
Epoch 110: Accuracy 0.685 F1 Score 0.700 AUROC: 0.775
Epoch 120: Accuracy 0.686 F1 Score 0.701 AUROC: 0.774
Epoch 130: Accuracy 0.687 F1 Score 0.702 AUROC: 0.775
Epoch 140: Accuracy 0.684 F1 Score 0.699 AUROC: 0.772
Epoch 150: Accuracy 0.689 F1 Score 0.703 AUROC: 0.776
Epoch 160: Accuracy 0.688 F1 Score 0.703 AUROC: 0.774
Epoch 170: Accuracy 0.689 F1 Score 0.704 AUROC: 0.776
Epoch 180: Accuracy 0.690 F1