In [23]:
from importlib import reload
import torch
from sklearn.metrics import precision_score, recall_score

import data
reload(data)
from data import AmlsimDataset

import modules
reload(modules)
from modules import GCN
from modules import GraphSAGE
from torch_geometric.data import DataLoader
import torch.optim as optim


In [24]:
# Set device to GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")

Device: cuda


In [25]:
# Load data
traindata = AmlsimDataset(node_file='data/simulation2/swedbank/train/nodes.csv', edge_file='data/simulation2/swedbank/train/edges.csv', node_features=True, node_labels=True).get_data()
testdata = AmlsimDataset(node_file='data/simulation2/swedbank/test/nodes.csv', edge_file='data/simulation2/swedbank/test/edges.csv', node_features=True, node_labels=True).get_data()
traindata = traindata.to(device)
testdata = testdata.to(device)

AttributeError: 'list' object has no attribute 'to'

In [17]:
# Normalize data
mean = traindata.x.mean(dim=0, keepdim=True)
std = traindata.x.std(dim=0, keepdim=True)
traindata.x = (traindata.x - mean) / std
testdata.x = (testdata.x - mean) / std

AttributeError: 'list' object has no attribute 'x'

In [18]:
# Instantiate model
input_dim = 10
hidden_dim = 16
output_dim = 2
n_layers = 3
dropout = 0.3
model = GCN(input_dim, hidden_dim, output_dim, n_layers, dropout)
model.to(device)

GCN(
  (convs): ModuleList(
    (0): GCNConv(10, 16)
    (1): GCNConv(16, 16)
    (2): GCNConv(16, 2)
  )
  (bns): ModuleList(
    (0-1): 2 x BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (softmax): Softmax(dim=1)
)

In [None]:
print(model.output_dim)

2


In [None]:
# optimizer
lr = 0.001
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [None]:
# loss function
weight = torch.tensor([1, 5.75], dtype=torch.float32).to(device)
criterion = torch.nn.CrossEntropyLoss(weight = weight)

In [None]:
for epoch in range(300):
    model.train()
    optimizer.zero_grad()
    out = model(traindata)
    loss = criterion(out, traindata.y)
    loss.backward()
    optimizer.step()
    if (epoch + 1) % 10 == 0:
        model.eval()
        with torch.no_grad():
            out = model(testdata)
            loss = criterion(out, testdata.y)
            precision = precision_score(testdata.y.cpu().numpy(), out.cpu().numpy().argmax(axis=1), zero_division=0)
            recall = recall_score(testdata.y.cpu().numpy(), out.cpu().numpy().argmax(axis=1), zero_division=0)
            print(f'epoch: {epoch + 1}, loss: {loss:.4f}, precision: {precision:.4f}, recall: {recall:.4f}')

epoch: 10, loss: 0.6950, precision: 0.5429, recall: 0.3434
epoch: 20, loss: 0.6907, precision: 0.6061, recall: 0.3614
epoch: 30, loss: 0.6749, precision: 0.5620, recall: 0.4096
epoch: 40, loss: 0.6534, precision: 0.5241, recall: 0.4578
epoch: 50, loss: 0.6354, precision: 0.5205, recall: 0.5361
epoch: 60, loss: 0.6186, precision: 0.4762, recall: 0.6024
epoch: 70, loss: 0.6058, precision: 0.4542, recall: 0.6867
epoch: 80, loss: 0.5936, precision: 0.4342, recall: 0.7349
epoch: 90, loss: 0.5812, precision: 0.3853, recall: 0.8193
epoch: 100, loss: 0.5760, precision: 0.3527, recall: 0.8795
epoch: 110, loss: 0.5704, precision: 0.3182, recall: 0.9277
epoch: 120, loss: 0.5667, precision: 0.2931, recall: 0.9940
epoch: 130, loss: 0.5638, precision: 0.2938, recall: 1.0000
epoch: 140, loss: 0.5603, precision: 0.2933, recall: 1.0000
epoch: 150, loss: 0.5580, precision: 0.2933, recall: 1.0000
epoch: 160, loss: 0.5573, precision: 0.2923, recall: 0.9880
epoch: 170, loss: 0.5552, precision: 0.2941, reca

In [None]:
from sklearn.metrics import confusion_matrix
import numpy as np

model.eval()
with torch.no_grad():
    out = model(testdata)
    y_pred = out.cpu().numpy().argmax(axis=1)
    y_true = testdata.y.cpu().numpy()
    cm = confusion_matrix(y_true, y_pred)
    print(cm)

label_0_count = np.count_nonzero(y_true == 0)
label_1_count = np.count_nonzero(y_true == 1)
print(f"Number ground truth labels equal to 0: {label_0_count}")
print(f"Number ground truth labels equal to 1: {label_1_count}")

print(f"recall = {cm[1, 1] / label_1_count}")
print(f"False positive rate = {cm[0, 1] / label_0_count}")

[[103 297]
 [ 11 155]]
Number ground truth labels equal to 0: 400
Number ground truth labels equal to 1: 166
recall = 0.9337349397590361
False positive rate = 0.7425


## Train GraphSAGE


In [21]:
# set device
device = torch.device('cuda:0')
    
# data
traindata = AmlsimDataset(node_file='data/simulation2/swedbank/train/nodes.csv', edge_file='data/simulation2/swedbank/train/edges.csv', node_features=True, node_labels=True).get_data()
testdata = AmlsimDataset(node_file='data/simulation2/swedbank/test/nodes.csv', edge_file='data/simulation2/swedbank/test/edges.csv', node_features=True, node_labels=True).get_data()
traindata = traindata.to(device)
testdata = testdata.to(device)
    
# normalize features
mean = traindata.x.mean(dim=0, keepdim=True)
std = traindata.x.std(dim=0, keepdim=True)
traindata.x = (traindata.x - mean) / std
testdata.x = (testdata.x - mean) / std
    

AttributeError: 'list' object has no attribute 'to'

In [None]:
print(len(traindata))

# create dataloader
batch_size = 64
trainloader = DataLoader(traindata, batch_size=batch_size, shuffle=True)
testloader = DataLoader(testdata, batch_size=batch_size, shuffle=False)
    
#print train_loader
print(len(trainloader))

# model
input_dim = 10
hidden_dim = 65
output_dim = 2
dropout = 0.07279450042274103
model = GraphSAGE(input_dim, hidden_dim, output_dim, dropout)
model.to(device)
    
# optimizer
lr = 0.010353064733105691
optimizer = optim.Adam(model.parameters(), lr=lr)
    
# loss function
beta = 0.9999999914740594
n_samples_per_classes = [(traindata.y == 0).sum().item(), (traindata.y == 1).sum().item()]

#add weights
criterion = torch.nn.CrossEntropyLoss()    

In [22]:

for epoch in range(100):
    model.train()
    for batch in trainloader:
        optimizer.zero_grad()
        out = model(traindata)
        loss = criterion(out, traindata.y)
        loss.backward()
        optimizer.step()
        if (epoch + 1) % 10 == 0 or epoch == 0:
            model.eval()
            with torch.no_grad():
                out = model(testdata)
                loss = criterion(out, testdata.y)
                accuracy = accuracy_score(testdata.y.cpu().numpy(), out.cpu().numpy().argmax(axis=1))
                balanced_accuracy = balanced_accuracy_score(testdata.y.cpu().numpy(), out.cpu().numpy().argmax(axis=1))
                precision = precision_score(testdata.y.cpu().numpy(), out.cpu().numpy().argmax(axis=1), zero_division=0)
                recall = recall_score(testdata.y.cpu().numpy(), out.cpu().numpy().argmax(axis=1), zero_division=0)
                f1 = f1_score(testdata.y.cpu().numpy(), out.cpu().numpy().argmax(axis=1), zero_division=0)
                print(f'epoch: {epoch + 1}, loss: {loss:.4f}, accuracy: {accuracy:.4f}, balanced_accuracy: {balanced_accuracy:.4f}, precision: {precision:.4f}, recall: {recall:.4f}, f1: {f1:.4f}')


NameError: name 'trainloader' is not defined