In [1]:
import pandas as pd
import torch
import torch.nn.functional as F
import torch_geometric.transforms as T

from torch_geometric.data import Data

account_df = pd.read_csv('data/processed/kaggle_transaction_dataset.csv')
address_mapping = pd.DataFrame(data={
    'address': account_df['address'], 
    'mapped_id': account_df['address'].index
})

transaction_df = pd.read_csv('data/queried/full_transactions.csv')
transaction_mapping = transaction_df \
    .merge(
        address_mapping, 
        left_on='address from', 
        right_on='address'
    ) \
    .merge(
        address_mapping, 
        left_on='address to', 
        right_on='address', 
        suffixes=('_from', '_to')
    )

x = F.normalize(
    torch.tensor(
        account_df.drop(columns=['address', 'flag']).to_numpy(), 
        dtype=torch.float
    )
)
y = torch.tensor(
    account_df['flag'].to_numpy(), 
    dtype=torch.long
)
edge_index = torch.tensor(
    transaction_mapping[['mapped_id_from', 'mapped_id_to']].to_numpy(), 
    dtype=torch.long
)

data = Data(x=x, y=y, edge_index=edge_index)
data = T.ToUndirected()(data)
data = T.RandomNodeSplit()(data)

In [7]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GCN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(data.num_node_features, 32)
        self.conv2 = GCNConv(32, 32)
        self.conv3 = GCNConv(32, 2)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv3(x, edge_index)

        return F.log_softmax(x, dim=1)

In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GCN().to(device)
data = data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

model.train()
for epoch in range(500):
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    
    pred = model(data).argmax(dim=1)
    correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()
    acc = int(correct) / int(data.test_mask.sum())
    print(f'Epoch {epoch}: Accuracy: {acc:.4f}')

Epoch 0: Accuracy: 0.6470
Epoch 1: Accuracy: 0.7170
Epoch 2: Accuracy: 0.7300
Epoch 3: Accuracy: 0.7310
Epoch 4: Accuracy: 0.7300
Epoch 5: Accuracy: 0.7300
Epoch 6: Accuracy: 0.7300
Epoch 7: Accuracy: 0.7300
Epoch 8: Accuracy: 0.7300
Epoch 9: Accuracy: 0.7300
Epoch 10: Accuracy: 0.7300
Epoch 11: Accuracy: 0.7300
Epoch 12: Accuracy: 0.7300
Epoch 13: Accuracy: 0.7310
Epoch 14: Accuracy: 0.7300
Epoch 15: Accuracy: 0.7300
Epoch 16: Accuracy: 0.7300
Epoch 17: Accuracy: 0.7300
Epoch 18: Accuracy: 0.7300
Epoch 19: Accuracy: 0.7300
Epoch 20: Accuracy: 0.7300
Epoch 21: Accuracy: 0.7300
Epoch 22: Accuracy: 0.7300
Epoch 23: Accuracy: 0.7300
Epoch 24: Accuracy: 0.7300
Epoch 25: Accuracy: 0.7300
Epoch 26: Accuracy: 0.7300
Epoch 27: Accuracy: 0.7300
Epoch 28: Accuracy: 0.7300
Epoch 29: Accuracy: 0.7300
Epoch 30: Accuracy: 0.7300
Epoch 31: Accuracy: 0.7300
Epoch 32: Accuracy: 0.7300
Epoch 33: Accuracy: 0.7300
Epoch 34: Accuracy: 0.7300
Epoch 35: Accuracy: 0.7300
Epoch 36: Accuracy: 0.7300
Epoch 37: A