In [1]:
from importlib import reload
import torch
import torch.nn.functional as F
from sklearn.metrics import balanced_accuracy_score, precision_score, recall_score

import data
reload(data)
from data import AmlsimDataset

import modules
reload(modules)
from modules import GCN, GCN_GNNExplainer, GCN_GraphSVX
from modules import GraphSAGE
from torch_geometric.data import DataLoader
import torch.optim as optim


In [2]:
# Set device to GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")

Device: cuda


In [3]:
# Load data
traindata = AmlsimDataset(node_file='data/simulation2/swedbank/train/nodes.csv', edge_file='data/simulation2/swedbank/train/edges.csv', node_features=True, node_labels=True).get_data()
testdata = AmlsimDataset(node_file='data/simulation2/swedbank/test/nodes.csv', edge_file='data/simulation2/swedbank/test/edges.csv', node_features=True, node_labels=True).get_data()
traindata = traindata.to(device)
testdata = testdata.to(device)

# # Convert label tensors to one-hot encoded form
# traindata.y = F.one_hot(traindata.y, num_classes=2)
# testdata.y = F.one_hot(testdata.y, num_classes=2)

In [4]:
# Normalize data
mean = traindata.x.mean(dim=0, keepdim=True)
std = traindata.x.std(dim=0, keepdim=True)
traindata.x = (traindata.x - mean) / std
testdata.x = (testdata.x - mean) / std

In [5]:
# Instantiate model
input_dim = 10
hidden_dim = 16
output_dim = 2
n_layers = 3
dropout = 0.3
model = GCN_GraphSVX(input_dim, hidden_dim, output_dim, n_layers, dropout)
model.to(device)

GCN_GraphSVX(
  (convs): ModuleList(
    (0): GCNConv(10, 16)
    (1): GCNConv(16, 16)
    (2): GCNConv(16, 2)
  )
  (bns): ModuleList(
    (0-1): 2 x BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (softmax): Softmax(dim=1)
)

In [6]:
print(model.output_dim)

2


In [7]:
# optimizer
lr = 0.001
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [8]:
# loss function
criterion = torch.nn.NLLLoss()

In [9]:
for epoch in range(200):
    model.train()
    optimizer.zero_grad()
    out = model.forward(traindata.x, traindata.edge_index)
    loss = criterion(out, traindata.y)
    loss.backward()
    optimizer.step()
    if (epoch + 1) % 10 == 0:
        model.eval()
        with torch.no_grad():
            out = model.forward(testdata.x, testdata.edge_index)
            loss = criterion(out, testdata.y)
            precision = precision_score(testdata.y.cpu().numpy(), out.cpu().numpy().argmax(axis=1), zero_division=0)
            recall = recall_score(testdata.y.cpu().numpy(), out.cpu().numpy().argmax(axis=1), zero_division=0)
            print(f'epoch: {epoch + 1}, loss: {loss:.4f}, precision: {precision:.4f}, recall: {recall:.4f}')

epoch: 10, loss: -0.5610, precision: 0.5000, recall: 0.0723
epoch: 20, loss: -0.6158, precision: 0.5600, recall: 0.0843
epoch: 30, loss: -0.6563, precision: 0.7442, recall: 0.1928
epoch: 40, loss: -0.6841, precision: 0.7805, recall: 0.1928
epoch: 50, loss: -0.7004, precision: 0.7714, recall: 0.1627
epoch: 60, loss: -0.7099, precision: 0.7941, recall: 0.1627
epoch: 70, loss: -0.7175, precision: 0.8438, recall: 0.1627
epoch: 80, loss: -0.7230, precision: 0.8438, recall: 0.1627
epoch: 90, loss: -0.7270, precision: 0.8710, recall: 0.1627
epoch: 100, loss: -0.7295, precision: 0.8710, recall: 0.1627
epoch: 110, loss: -0.7325, precision: 0.8485, recall: 0.1687
epoch: 120, loss: -0.7343, precision: 0.8485, recall: 0.1687
epoch: 130, loss: -0.7359, precision: 0.8485, recall: 0.1687
epoch: 140, loss: -0.7372, precision: 0.8485, recall: 0.1687
epoch: 150, loss: -0.7381, precision: 0.7949, recall: 0.1867
epoch: 160, loss: -0.7387, precision: 0.7949, recall: 0.1867
epoch: 170, loss: -0.7395, precis

In [10]:
from sklearn.metrics import confusion_matrix
import numpy as np

model.eval()
with torch.no_grad():
    out = model.forward(testdata.x, testdata.edge_index)
    y_pred = out.cpu().numpy().argmax(axis=1)
    y_true = testdata.y.cpu().numpy()
    cm = confusion_matrix(y_true, y_pred)
    print(cm)





[[389  11]
 [132  34]]


In [11]:
from sklearn.model_selection import train_test_split

def split_function(y, args_train_ratio=0.6, seed=10):
    return _get_train_val_test_masks(y.shape[0], y, (1-args_train_ratio)/2, (1-args_train_ratio), seed=seed)

def _get_train_val_test_masks(total_size, y_true, val_fraction, test_fraction, seed):
    """Performs stratified train/test/val split

    Args:
        total_size (int): dataset total number of instances
        y_true (numpy array): labels
        val_fraction (int): validation/test set proportion
        test_fraction (int): test and val sets proportion
        seed (int): seed value

    Returns:
        [torch.tensors]: train, validation and test masks - boolean values
    """
    # Split into a train, val and test set
    # Store indexes of the nodes belong to train, val and test set
    indexes = range(total_size)
    indexes_train, indexes_test = train_test_split(
        indexes, test_size=test_fraction, stratify=y_true, random_state=seed)
    indexes_train, indexes_val = train_test_split(indexes_train, test_size=val_fraction, stratify=y_true[indexes_train],
                                                  random_state=seed)
    # Init masks
    train_idxs = np.zeros(total_size, dtype=bool)
    val_idxs = np.zeros(total_size, dtype=bool)
    test_idxs = np.zeros(total_size, dtype=bool)

    # Update masks using corresponding indexes
    train_idxs[indexes_train] = True
    val_idxs[indexes_val] = True
    test_idxs[indexes_test] = True

    return torch.from_numpy(train_idxs), torch.from_numpy(val_idxs), torch.from_numpy(test_idxs)

In [12]:
from types import SimpleNamespace

train_ratio = 0.8
testdata.to('cpu')
model.to('cpu')

data = SimpleNamespace()
data.x = testdata.x
data.edge_index  = testdata.edge_index
data.y = testdata.y
data.num_classes = 2
data.num_features = 10
data.num_nodes = testdata.x.shape[0]
data.name = 'test'
data.train_mask, data.val_mask, data.test_mask = split_function(data.y.numpy(), train_ratio)

In [13]:
import GraphSVX_explainers
reload(GraphSVX_explainers)

from GraphSVX_explainers import GraphSVX, GraphLIME


explainer = GraphSVX(data, model, True)

In [22]:
explanations = explainer.explain(node_indexes=[318])
print('Sum explanations: ', [np.sum(explanation) for explanation in explanations])
print('Base value: ', explainer.base_values)

model = model.to('cpu')
print(f'Model class probabilities: ',{model.forward(testdata.x.to('cpu'), testdata.edge_index.to('cpu'))[318]})
print(f'True label = {testdata.y[318]}')

0it [00:00, ?it/s]

10it [00:00, 75.57it/s]


WLS: Matrix not invertible
r2:  0.49901398812245956
weighted r2:  0.9858230765409469
Explanations include 8 node features and 2 neighbours for this node        for 2 classes
Model prediction is class 0 with confidence 2.7114129066467285, while true label is 1
Base value 2.622413944548696 for class  0
Weights for node features:  0.21191291898552578 and neighbours:  -0.12296751529356698
Most influential features:  [(8, -0.538347544557837), (4, 0.4769126772880554), (3, 0.16716720163822174)]
Time:  0.19017815589904785
Sum explanations:  [0.0889454036919588]
Base value:  [2.6224086345959376, 2.622411509629994, 2.6224120983794696, 2.622399734699588, 2.6224035582503227, 2.622413944548696]
Model class probabilities:  {tensor([0.9975, 0.0025], grad_fn=<SelectBackward0>)}
True label = 1
