<a href="https://colab.research.google.com/github/YasminHeimann/Graph_Anomaly/blob/master/gnn_panda.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Basic PANDA tutorial on graphs: training a 2-layer GCN**



In [None]:
!pip install dgl
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F



In [None]:
class Arguments:
    """Notebook-wide configuration, read as plain class attributes."""

    # --- data / task selection ---
    dataset = 'cora'        # name handed to get_graph()
    task = 'node_class'     # pre-training task checked by get_pre_trained_model()

    # --- model ---
    model = 'gcn'           # architecture checked by get_model_architecture()
    h_dim1 = 16             # hidden width given to the GCN constructor
    layers = 1              # NOTE(review): not read by any code visible in this notebook

    # --- optimisation ---
    lr = 0.001              # learning rate used by both optimizers
    epochs = 100            # number of epochs run by train_panda_model()
    ewc = False             # when True, run_epoch() adds ewc_loss(model) to the loss


# **Get a dataset from DGL for a specific task, e.g., node classification**

In [None]:
import dgl.data

def get_graph(dataset_name):
    """Load a DGL citation dataset and return its first graph.

    Args:
        dataset_name: 'cora' or 'citeseer'. Any other value falls back to
            Cora after printing a notice.

    Returns:
        (g, dataset): the graph at index 0 of the dataset, and the dataset
        object itself.
    """
    if dataset_name == 'citeseer':
        dataset = dgl.data.CiteseerGraphDataset()
    else:
        if dataset_name != 'cora':
            print('Default Dataset is cora')
        dataset = dgl.data.CoraGraphDataset()

    # Only the first graph of the dataset is used.
    g = dataset[0]

    # Short summary of what was loaded.
    print('\nNumber of categories:', dataset.num_classes)
    print('Node features')
    print(g.ndata)
    print('Edge features')
    print(g.edata)
    return g, dataset


# **Define GNN models**

In [None]:
from dgl.nn import GraphConv

class GCN(nn.Module):
    """Two stacked graph-convolution layers with a ReLU in between."""

    def __init__(self, in_feats, h_feats, num_classes):
        """Create the layers.

        Args:
            in_feats: width of the input node features.
            h_feats: width of the hidden representation.
            num_classes: width of the output produced by the second layer.
        """
        super().__init__()
        self.conv1 = GraphConv(in_feats, h_feats)
        self.conv2 = GraphConv(h_feats, num_classes)

    def forward(self, g):
        """Run both layers on graph ``g``, reading the input from ``g.ndata['feat']``."""
        hidden = F.relu(self.conv1(g, g.ndata['feat']))
        return self.conv2(g, hidden)


# **Training the pre-task**

In [None]:
def train(g, model, args, epochs=200):
    """Train ``model`` for node classification on graph ``g`` with Adam.

    Args:
        g: graph whose ``ndata`` provides 'label', 'train_mask', 'val_mask'
            and 'test_mask'.
        model: module called as ``model(g)`` that returns per-node logits.
        args: configuration object; only ``args.lr`` is read here.
        epochs: number of optimisation steps. Defaults to 200, the value
            that used to be hard-coded in the loop.

    Returns:
        (best_val_acc, best_test_acc): the highest validation accuracy seen
        and the test accuracy measured at that same epoch, as Python floats.
        Both are 0.0 when ``epochs`` is 0.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    best_val_acc = 0.0
    best_test_acc = 0.0

    labels = g.ndata['label']
    train_mask = g.ndata['train_mask']
    val_mask = g.ndata['val_mask']
    test_mask = g.ndata['test_mask']

    for e in range(epochs):
        # Forward
        logits = model(g)

        # Only the nodes in the training set contribute to the loss.
        loss = F.cross_entropy(logits[train_mask], labels[train_mask])

        # Accuracy is reporting only, so keep it out of the autograd graph.
        with torch.no_grad():
            pred = logits.argmax(1)
            val_acc = (pred[val_mask] == labels[val_mask]).float().mean().item()
            test_acc = (pred[test_mask] == labels[test_mask]).float().mean().item()

        # Keep the best validation accuracy and the test accuracy of that epoch.
        if best_val_acc < val_acc:
            best_val_acc = val_acc
            best_test_acc = test_acc

        # Backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if e % 5 == 0:
            print('In epoch {}, loss: {:.3f}, val acc: {:.3f} (best {:.3f}), test acc: {:.3f} (best {:.3f})'.format(
                e, loss.item(), val_acc, best_val_acc, test_acc, best_test_acc))

    return best_val_acc, best_test_acc


# **Get the pre-trained model for the given arguments**

In [None]:
def get_model_architecture(args, g, dataset):
    """Build an untrained model sized for graph ``g``.

    Only the GCN architecture exists, so any ``args.model`` other than 'gcn'
    prints a notice and gets a GCN as well.

    Args:
        args: configuration; ``args.model`` and ``args.h_dim1`` are read.
        g: graph whose ``ndata['feat']`` second dimension sets the input width.
        dataset: object whose ``num_classes`` sets the output width.
    """
    if args.model != 'gcn':
        print('Default model architecture is gcn')
    input_width = g.ndata['feat'].shape[1]
    return GCN(input_width, args.h_dim1, dataset.num_classes)  # .to('cuda')

def get_trained_node_class_model(args):
    """Load the dataset named in ``args`` and pre-train a node-classification model on it.

    Args:
        args: configuration; ``args.dataset`` is read here and the whole
            object is forwarded to get_model_architecture() and train().

    Returns:
        (model, g, dataset): the trained model, the graph it was trained on
        and the dataset the graph came from.
    """
    g, dataset = get_graph(args.dataset)
    print("features dim: ", g.ndata['feat'].shape[1])

    # Create the model with the dimensions implied by the graph and args.
    model = get_model_architecture(args, g, dataset)
    train(g, model, args)

    print("\nfinished training GCN\n")
    return model, g, dataset

def get_pre_trained_model(args):
    """Return a pre-trained model for the task named in ``args.task``.

    Node classification is the only implemented task, so any other value
    prints a notice and falls back to it. The fallback used to call an
    undefined name and raise NameError; both paths now use
    get_trained_node_class_model().

    Returns:
        Whatever get_trained_node_class_model(args) returns.
    """
    if args.task != 'node_class':
        print('Default model is node classification with gcn')
    return get_trained_node_class_model(args)


# **PANDA training**

In [None]:
!apt install libomp-dev
!python -m pip install --upgrade faiss
# !python -m pip install --upgrade faiss-gpu


Reading package lists... Done
Building dependency tree       
Reading state information... Done
libomp-dev is already the newest version (5.0.1-1).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 40 not upgraded.


**KNN with Faiss**

In [None]:
import numpy as np
import faiss
def knn_score(train_set, test_set, n_neighbours=2):
    """Score each test row by its summed distance to its nearest training rows.

    Args:
        train_set: 2-D array of reference vectors, one per row.
        test_set: 2-D array of query vectors with the same number of columns.
        n_neighbours: how many nearest training rows to sum over.

    Returns:
        1-D array with one score per test row: the sum of the distances that
        ``faiss.IndexFlatL2`` reports for its ``n_neighbours`` nearest
        training rows (squared Euclidean, per the Faiss documentation).
    """
    # Faiss works on C-contiguous float32 matrices; coerce so that float64 or
    # sliced inputs are accepted as well. This is a no-op for conforming input.
    train_set = np.ascontiguousarray(train_set, dtype=np.float32)
    test_set = np.ascontiguousarray(test_set, dtype=np.float32)

    index = faiss.IndexFlatL2(train_set.shape[1])
    index.add(train_set)
    D, _ = index.search(test_set, n_neighbours)
    return np.sum(D, axis=1)


In [None]:
from sklearn.metrics import roc_auc_score

# one big TODO
# one big TODO
def get_score(model, device, g, num_classes=2, multi_class=False,
              n_neighbours=None, anomaly_threshold=3):
    """Compute a kNN-based AUROC on the test nodes of ``g``.

    Test nodes whose label is greater than ``anomaly_threshold`` are the
    positive class (1); all other test nodes are 0. Each test node is scored
    with knn_score() against the features of the train-mask nodes.

    Args:
        model: module called as ``model(g)``; its output rows are used as
            node features.
        device: accepted for interface compatibility; not used here.
        g: graph whose ``ndata`` provides 'train_mask', 'test_mask', 'label'.
        num_classes: historical name for the neighbour count; used as ``k``
            when ``n_neighbours`` is not given.
        multi_class: selects the one-hot AUROC branch (see the note there).
        n_neighbours: explicit neighbour count for knn_score(). ``None``
            keeps the old behaviour of using ``num_classes``.
        anomaly_threshold: labels strictly above this value count as
            positives. Defaults to the previously hard-coded 3.

    Returns:
        (auc, train_np): the AUROC and the train-mask features as a NumPy array.
    """
    train_mask = g.ndata['train_mask']
    test_mask = g.ndata['test_mask']
    test_labels = g.ndata['label'][test_mask]
    labels = (test_labels.cpu().detach().numpy() > anomaly_threshold).astype(int)

    k = num_classes if n_neighbours is None else n_neighbours

    with torch.no_grad():
        features = model(g)
        train_feature_space = features[train_mask]
        test_feature_space = features[test_mask]

    train_np = train_feature_space.cpu().detach().numpy()
    test_np = test_feature_space.cpu().detach().numpy()
    # distances from knn
    distances = knn_score(train_np, test_np, k)

    # auc
    if multi_class:
        # NOTE(review): the target is one-hot with one column per label value,
        # but only a single score column is supplied - confirm this computes
        # the intended metric before relying on this branch.
        one_hot = np.zeros((labels.size, labels.max()+1))
        one_hot[np.arange(labels.size), labels] = 1
        auc = roc_auc_score(one_hot, distances.reshape(-1, 1),  # average='weighted',
                            multi_class='ovr')
    else:
        # todo: is it how to calc auc?
        auc = roc_auc_score(labels, distances)

    return auc, train_np


**KNN with scikit-learn**

In [None]:
from sklearn.neighbors import NearestNeighbors
def sklearn_knn(test_np, train_np, n_neighbors):
    """Find the nearest training rows of every test row with scikit-learn.

    Note the argument order: queries first, reference set second (the
    opposite of knn_score()).

    Args:
        test_np: 2-D array of query vectors.
        train_np: 2-D array of reference vectors the index is fitted on.
        n_neighbors: number of neighbours to retrieve per query.

    Returns:
        (D, I): the distances and indices returned by
        ``NearestNeighbors.kneighbors``. Previously nothing was returned.
    """
    # n_neighbors must be passed by keyword: recent scikit-learn releases
    # reject positional constructor arguments.
    knn = NearestNeighbors(n_neighbors=n_neighbors)
    knn.fit(train_np)
    D, I = knn.kneighbors(test_np)
    return D, I


# **Train loop**

In [None]:
import torch.optim as optim

class CompactnessLoss(nn.Module):
    """Mean squared distance of the embeddings to a fixed centre.

    For every row the squared Euclidean distance to ``center`` is divided by
    the embedding width, and the result is averaged over the rows.
    """

    def __init__(self, center):
        """Store the centre that embeddings are pulled towards.

        Args:
            center: vector with one entry per embedding dimension.
        """
        super().__init__()
        self.center = center

    def forward(self, inputs):
        """Return the scalar loss for a 2-D batch of embeddings ``inputs``."""
        # Put the centre where the embeddings live. The caller may have moved
        # it to another device than the model, which made the subtraction
        # below fail with a device-mismatch error.
        center = torch.as_tensor(self.center, device=inputs.device)
        m = inputs.size(1)
        variances = (inputs - center).norm(dim=1).pow(2) / m
        return variances.mean()


def train_panda_model(model, graph, device, args, ewc_loss, num_classes):
    """Fine-tune ``model`` with the compactness loss and report AUROC per epoch.

    The centre of the loss is the mean of the train-mask features computed
    once, before any fine-tuning step.

    Args:
        model: pre-trained module called as ``model(graph)``.
        graph: graph whose ``ndata['train_mask']`` selects the rows to compact.
        device: forwarded to get_score()/run_epoch(); the centre is moved to it.
        args: configuration; ``lr``, ``epochs`` and ``ewc`` are read.
        ewc_loss: callable forwarded to run_epoch().
        num_classes: forwarded to get_score().
    """
    # Baseline score before fine-tuning.
    model.eval()  # todo
    baseline_auc, baseline_features = get_score(model, device, graph, num_classes)
    print('Epoch: {}, AUROC is: {}'.format(0, baseline_auc))

    sgd = optim.SGD(model.parameters(), lr=args.lr, weight_decay=0.00005, momentum=0.9)

    # todo how many labels? two?
    center = torch.FloatTensor(baseline_features).mean(dim=0)
    compactness = CompactnessLoss(center.to(device))
    mask = graph.ndata['train_mask']
    # todo: is the train mask on two labels? one label? in article?
    # todo: i assume only 2 labels in general.
    for epoch_no in range(1, args.epochs + 1):
        model.train()
        epoch_loss = run_epoch(model, graph, mask, sgd, compactness, device, args.ewc, ewc_loss)
        print('Epoch: {}, Loss: {}'.format(epoch_no, epoch_loss))

        model.eval()
        epoch_auc, _ = get_score(model, device, graph, num_classes)
        print('Epoch: {}, AUROC is: {}'.format(epoch_no, epoch_auc))


def run_epoch(model, graph, train_mask, optimizer, criterion, device, ewc, ewc_loss):
    """Perform one full-graph optimisation step and return its loss value.

    Args:
        model: module called as ``model(graph)``.
        graph: input handed to the model.
        train_mask: index/mask selecting the output rows fed to ``criterion``.
        optimizer: optimizer whose gradients are reset and which is stepped once.
        criterion: callable mapping the selected rows to a scalar loss.
        device: not used in this function.
        ewc: when truthy, ``ewc_loss(model)`` is added to the loss.
        ewc_loss: callable returning the extra penalty; only called if ``ewc``.

    Returns:
        The loss (including the penalty, if any) as a Python float.
    """
    # todo: images = imgs.to(device)
    optimizer.zero_grad()

    embeddings = model(graph)
    total = criterion(embeddings[train_mask])
    if ewc:
        total += ewc_loss(model)

    total.backward()
    # Gradients are rescaled to a total norm of at most 1e-3 before the step.
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1e-3)
    optimizer.step()

    return total.item()

# **Run pipeline**

In [None]:
# Build the default configuration, then load the graph and pre-train the model.
# `args`, `model`, `g` and `dataset` are notebook globals used by the next cell.
args = Arguments()
model, g, dataset  = get_pre_trained_model(args)
# Second dimension of the node feature matrix.
print("features dim: ", g.ndata['feat'].shape[1])


  NumNodes: 2708
  NumEdges: 10556
  NumFeats: 1433
  NumClasses: 7
  NumTrainingSamples: 140
  NumValidationSamples: 500
  NumTestSamples: 1000
Done loading data from cached files.

Number of categories: 7
Node features
{'feat': tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]]), 'label': tensor([3, 4, 4,  ..., 3, 3, 3]), 'test_mask': tensor([False, False, False,  ...,  True,  True,  True]), 'train_mask': tensor([ True,  True,  True,  ..., False, False, False]), 'val_mask': tensor([False, False, False,  ..., False, False, False])}
Edge features
{}
features dim:  1433
In epoch 0, loss: 1.946, val acc: 0.266 (best 0.266), test acc: 0.267 (best 0.267)
In epoch 5, loss: 1.941, val acc: 0.316 (best 0.316), test acc: 0.323 (best 0.323)
In epoch 10, loss: 1.936, val acc: 0.432 (best 0.432), t

In [None]:
# PANDA fine-tuning without EWC: None is passed as `ewc_loss` below.
# NOTE(review): `model` and `g` are not moved to `device` in this cell; only
# the `device` value itself is handed to train_panda_model.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Evaluation is binary: get_score() maps test labels > 3 to 1 and all others to 0.
# (author's sample-count note: 1800, 900 samples)

# todo: better architecture, GAT https://paperswithcode.com/paper/graph-attention-networks
# todo: take only 2 labels (e.g., 0 and 1 or 0 and 3)
# todo: is train only on label?

# author's note - for label 3: 800 samples, others ~200-300
# The final argument (2) is forwarded to get_score() as `num_classes`, which
# passes it to knn_score() as the neighbour count.
train_panda_model(model, g, device, args, None, 2)


# self training (disabled sketch; the calls below do not match the current
# function names/signatures in this notebook):
# g, dataset = get_graph(args.dataset)
# #model = GCN(g.ndata['feat'].shape[1], 16, dataset.num_classes)
# model = get_model(args)
# train(g, model)


Epoch: 0, AUROC is: 0.519149270856466
Epoch: 1, Loss: 0.11761083453893661
Epoch: 1, AUROC is: 0.519149270856466
Epoch: 2, Loss: 0.11761071532964706
Epoch: 2, AUROC is: 0.519149270856466
Epoch: 3, Loss: 0.11761046946048737
Epoch: 3, AUROC is: 0.5191446443112
Epoch: 4, Loss: 0.11761009693145752
Epoch: 4, AUROC is: 0.5191400177659338
Epoch: 5, Loss: 0.11760963499546051
Epoch: 5, AUROC is: 0.5191400177659338
Epoch: 6, Loss: 0.11760909110307693
Epoch: 6, AUROC is: 0.519149270856466
Epoch: 7, Loss: 0.1176084652543068
Epoch: 7, AUROC is: 0.5191538974017321
Epoch: 8, Loss: 0.11760775744915009
Epoch: 8, AUROC is: 0.5191631504922644
Epoch: 9, Loss: 0.117606982588768
Epoch: 9, AUROC is: 0.5191585239469982
Epoch: 10, Loss: 0.11760617792606354
Epoch: 10, AUROC is: 0.5191608372196314
Epoch: 11, Loss: 0.1176052913069725
Epoch: 11, AUROC is: 0.5191677770375305
Epoch: 12, Loss: 0.11760436743497849
Epoch: 12, AUROC is: 0.5191724035827967
Epoch: 13, Loss: 0.11760339885950089
Epoch: 13, AUROC is: 0.519172