In [30]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import json
import random

In [3]:
import torch
import dgl
import networkx as nx
import matplotlib.pyplot as plt

In [157]:
folder = '../data/graphs_spreads'
ratings = {'False': 0, 'True': 1}

def load_graph(filename):
    with open(filename) as file:
        return json.load(file)

dataset = [
    (load_graph(f'{folder}/{rating}/{filename}'), label) 
    for rating, label in ratings.items()
    for filename in os.listdir(f'{folder}/{rating}')[:60] 
    if filename.endswith('json')
]

graphs, labels = zip(*dataset)

In [158]:
def build_graph(graph):
    nxg = nx.DiGraph()
    nxg.add_nodes_from([n for n in graph['nodes']])
    nxg.add_edges_from([(e['source'], e['target']) for e in graph['edges']])
    g = dgl.DGLGraph()
    g.from_networkx(nxg)
    return g

dgl_graphs = [build_graph(g) for g in graphs]

In [159]:
import sklearn
from sklearn.model_selection import train_test_split

In [174]:
X_train, X_val, y_train, y_val = train_test_split(dgl_graphs, labels, test_size=0.2, stratify=labels)

In [161]:
def collate(samples):
    # The input `samples` is a list of pairs
    #  (graph, label).
    graphs, labels = map(list, zip(*samples))
    batched_graph = dgl.batch(graphs)
    return batched_graph, torch.tensor(labels)

In [162]:
import dgl.function as fn
import torch.nn as nn


# Sends a message of node feature h.
msg = fn.copy_src(src='h', out='m')

def reduce(nodes):
    """Take an average over all neighbor node features hu and use it to
    overwrite the original node feature."""
    accum = torch.mean(nodes.mailbox['m'], 1) ## should think about this, even sum didnt solve it, was 'mean'
    return {'h': accum}

class NodeApplyModule(nn.Module):
    """Update the node feature hv with ReLU(Whv+b)."""
    def __init__(self, in_feats, out_feats, activation):
        super(NodeApplyModule, self).__init__()
        self.linear = nn.Linear(in_feats, out_feats)
        self.activation = activation

    def forward(self, node):
        h = self.linear(node.data['h'])
        h = self.activation(h)
        return {'h' : h}

class GCN(nn.Module):
    def __init__(self, in_feats, out_feats, activation):
        super(GCN, self).__init__()
        self.apply_mod = NodeApplyModule(in_feats, out_feats, activation)

    def forward(self, g, feature):
        # Initialize the node features with h.
        g.ndata['h'] = feature
        g.update_all(msg, reduce)
        g.apply_nodes(func=self.apply_mod)
        return g.ndata.pop('h')

In [163]:
def get_origin_indices(graph):
    targets = set(edge.item() for edge in graph.edges()[1])
    return [i for i in range(len(graph)) if i not in targets]

def get_node_types(graph):
    origins = get_origin_indices(graph)
    types = torch.zeros(len(graph))
    types[origins] = 1.
    return types.view(-1, 1).float()

In [175]:
import torch.nn.functional as F


class Classifier(nn.Module):
    def __init__(self, in_dim, hidden_dim):
        super(Classifier, self).__init__()

        self.layers = nn.ModuleList([
            GCN(in_dim, hidden_dim, F.relu),
            GCN(hidden_dim, hidden_dim, F.relu),
        ])
        self.classify = nn.Linear(hidden_dim, 1)

    def forward(self, g):
        # For undirected graphs, in_degree is the same as
        # out_degree. I changed it to out_degrees() because it's a spanning tree
        out_degrees = g.out_degrees().view(-1, 1).float()
        types = get_node_types(g)
        h = torch.cat([out_degrees, types], dim=-1)
        for conv in self.layers:
            h = conv(g, h)
        g.ndata['h'] = h
        hg = dgl.mean_nodes(g, 'h')
        return torch.sigmoid(self.classify(hg))

In [182]:
import torch.optim as optim
from torch.utils.data import DataLoader

data_loader = DataLoader(list(zip(X_train, y_train)), shuffle=True, collate_fn=collate, batch_size=4)

model = Classifier(2, 256)
loss_func = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

epoch_losses = []
for epoch in range(80):
    epoch_loss = 0
    for iter, (bg, label) in enumerate(data_loader):
        prediction = model(bg)
        loss = loss_func(prediction.squeeze(), label.float())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_loss += loss.detach().item()
    epoch_loss /= (iter + 1)
    print('Epoch {}, loss {:.4f}'.format(epoch, epoch_loss))
    epoch_losses.append(epoch_loss)

Epoch 0, loss 0.7543
Epoch 1, loss 0.7110
Epoch 2, loss 0.6974
Epoch 3, loss 0.6977
Epoch 4, loss 0.7255
Epoch 5, loss 0.6698
Epoch 6, loss 0.6695
Epoch 7, loss 0.6690
Epoch 8, loss 0.6590
Epoch 9, loss 0.6658
Epoch 10, loss 0.6714
Epoch 11, loss 0.6565
Epoch 12, loss 0.6488
Epoch 13, loss 0.6530
Epoch 14, loss 0.6560
Epoch 15, loss 0.6522
Epoch 16, loss 0.6680
Epoch 17, loss 0.6525
Epoch 18, loss 0.6684
Epoch 19, loss 0.6521
Epoch 20, loss 0.6503
Epoch 21, loss 0.6658
Epoch 22, loss 0.6559
Epoch 23, loss 0.6487
Epoch 24, loss 0.6535
Epoch 25, loss 0.6467
Epoch 26, loss 0.6457
Epoch 27, loss 0.6474
Epoch 28, loss 0.6421
Epoch 29, loss 0.6433
Epoch 30, loss 0.6579
Epoch 31, loss 0.6448
Epoch 32, loss 0.6444
Epoch 33, loss 0.6435
Epoch 34, loss 0.6441
Epoch 35, loss 0.6537
Epoch 36, loss 0.6570
Epoch 37, loss 0.6463
Epoch 38, loss 0.6521
Epoch 39, loss 0.6385
Epoch 40, loss 0.6442
Epoch 41, loss 0.6523
Epoch 42, loss 0.6395
Epoch 43, loss 0.6396
Epoch 44, loss 0.6632
Epoch 45, loss 0.647

In [183]:
model.eval()
# Convert a list of tuples to two lists
# test_X, test_Y = map(list, zip(*(testset)))
test_bg = dgl.batch(X_val)
test_Y = torch.tensor(y_val).float().view(-1, 1)
probs_Y = model(test_bg) # was torch.softmax(model(test_bg), 1)
pred_Y = torch.round(probs_Y).view(-1, 1)
print('Accuracy of argmax predictions on the test set: {:4f}%'.format(
    (test_Y == pred_Y.float()).sum().item() / len(test_Y) * 100))

Accuracy of argmax predictions on the test set: 54.166667%
