In [120]:
import json
import networkx as nx
import dgl
from matplotlib import pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from dgl.utils import check_eq_shape, expand_as_pair
import dgl.function as fn


In [121]:
json_file = 'elyzee_data/elyzee_train.json'
graph_train_networkx = nx.node_link_graph(json.load(open(json_file)))

json_file = 'elyzee_data/elyzee_test.json'
graph_test_networkx = nx.node_link_graph(json.load(open(json_file)))

In [122]:
num_classes = 7
labels_dict = {
    'lr': 0, 
    'em': 1, 
    'fn': 2, 
    'fi': 3, 
    'ps': 4,
    'multi_affiliations': 5, 
    'indetermined': 6
}

In [123]:
graph_train_dgl = dgl.from_networkx(graph_train_networkx, node_attrs=['features'])
graph_test_dgl = dgl.from_networkx(graph_test_networkx, node_attrs=['features'])
print(graph_train_dgl)
features_train = graph_train_dgl.ndata['features']
num_features = features_train.shape[1]
print(features_train.shape)

print(graph_test_dgl)
features_test = graph_test_dgl.ndata['features']
print(features_test.shape)

Graph(num_nodes=5507, num_edges=19284,
      ndata_schemes={'features': Scheme(shape=(999,), dtype=torch.float32)}
      edata_schemes={})
torch.Size([5507, 999])
Graph(num_nodes=2222, num_edges=6402,
      ndata_schemes={'features': Scheme(shape=(999,), dtype=torch.float32)}
      edata_schemes={})
torch.Size([2222, 999])


In [124]:
labels_train = [labels_dict[node['affiliation']] for node in graph_train_networkx.nodes.values()]
labels_train =  F.one_hot(torch.tensor(labels_train)).float()
labels_train = labels_train
graph_train_dgl.ndata['label'] = labels_train
print(graph_train_dgl)

labels_test = [labels_dict[node['affiliation']] for node in graph_test_networkx.nodes.values()]
labels_test = F.one_hot(torch.tensor(labels_test))
labels_test = labels_test.float()
graph_test_dgl.ndata['label'] = labels_test
print(graph_test_dgl)

Graph(num_nodes=5507, num_edges=19284,
      ndata_schemes={'features': Scheme(shape=(999,), dtype=torch.float32), 'label': Scheme(shape=(7,), dtype=torch.float32)}
      edata_schemes={})
Graph(num_nodes=2222, num_edges=6402,
      ndata_schemes={'features': Scheme(shape=(999,), dtype=torch.float32), 'label': Scheme(shape=(7,), dtype=torch.float32)}
      edata_schemes={})


In [125]:
def gcn_message(edges):
    return {'msg' : edges.src['h']}

def gcn_reduce(nodes):
    return {'h' : torch.sum(nodes.mailbox['msg'], dim=1)}

# Define the GCNLayer module
class GCNLayer(nn.Module):
    def __init__(self, in_feats, out_feats):
        super(GCNLayer, self).__init__()
        self.linear = nn.Linear(in_feats, out_feats)

    def forward(self, g, inputs):
        g.ndata['h'] = inputs
        g.send_and_recv(g.edges(), gcn_message, gcn_reduce)
        h = g.ndata.pop('h')
        return self.linear(h)

In [126]:
# Define a 2-layer GCN model
class GCN(nn.Module):
    def __init__(self, in_feats, hidden_size, num_classes):
        super(GCN, self).__init__()
        self.gcn1 = GCNLayer(in_feats, hidden_size)
        self.gcn2 = GCNLayer(hidden_size, num_classes)

    def forward(self, g, inputs):
        h = self.gcn1(g, inputs)
        h = torch.relu(h)
        h = self.gcn2(g, h)
        h = F.softmax(h, dim=1)
        return h


In [127]:
model = GCN(in_feats=999, hidden_size=16, num_classes=7)

In [128]:
inputs_train = features_train
print(inputs_train.shape)
inputs_test = features_test
print(inputs_test.shape)

torch.Size([5507, 999])
torch.Size([2222, 999])


In [129]:
# optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
# for epoch in range(10):
#     optimizer.zero_grad()
#     logits_train = model(graph_train_dgl, inputs_train)
#     loss = F.cross_entropy(logits_train, labels_train)    
#     loss.backward()
#     optimizer.step()
#     # print(logits_train[0], "=", labels_train[0])
#     
#     with torch.no_grad():
#         logits_test = model(graph_test_dgl, inputs_test)
#         loss_test = F.cross_entropy(logits_test, labels_test)
#         print('Epoch %d | Loss: %.4f | Loss Test: %.4f' % (epoch, loss.item(), loss_test.item()))  
# for i in range(5507):
#     print(logits_train[i], "=", labels_train[i])

In [130]:
import os

os.environ["DGLBACKEND"] = "pytorch"
import dgl
import dgl.data
import torch
import torch.nn as nn
import torch.nn.functional as F

In [131]:
from dgl.nn import GraphConv


class GCN(nn.Module):
    def __init__(self, in_feats, h_feats, num_classes):
        super(GCN, self).__init__()
        self.conv1 = GraphConv(in_feats, h_feats)
        self.conv2 = GraphConv(h_feats, h_feats)
        self.conv3 = GraphConv(h_feats, num_classes)

    def forward(self, g, in_feat):
        h = self.conv1(g, in_feat)
        h = F.relu(h)
        h = self.conv2(g, h)
        h = F.relu(h)
        h = self.conv3(g, h)
        return h


In [132]:
def train(model):
    optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
    for epoch in range(100):
        model.train()
        optimizer.zero_grad()
        
        # Forward
        logits_train = model(graph_train_dgl, inputs_train)
        # Compute loss
        loss = F.cross_entropy(logits_train, labels_train) 
        # Backward
        loss.backward()

        optimizer.step()
        train_acc = (torch.argmax(logits_train, dim=1) == torch.argmax(labels_train, dim=1)).float().mean()
        
        if epoch % 10 == 0:
            with torch.no_grad():
                model.eval()
                logits_test = model(graph_test_dgl, inputs_test)
                loss_test = F.cross_entropy(logits_test, labels_test)
                test_acc = (torch.argmax(logits_test, dim=1) == torch.argmax(labels_test, dim=1)).float().mean()
                print('Epoch %d | Loss: %.4f | Accuracy: %.4f | Loss Test: %.4f | Test Accuracy: %.4f' % (epoch, loss.item(), train_acc, loss_test.item(), test_acc))

    # for a,b in zip(logits_test, labels_test):
    #     print(a, b)

In [133]:
model = GCN(graph_train_dgl.ndata["features"].shape[1], 16, num_classes)
train(model)

Epoch 0 | Loss: 2.0367 | Accuracy: 0.1088 | Loss Test: 7.8392 | Test Accuracy: 0.1436
Epoch 10 | Loss: 1.9655 | Accuracy: 0.1620 | Loss Test: 1.9786 | Test Accuracy: 0.1467
Epoch 20 | Loss: 1.9290 | Accuracy: 0.2103 | Loss Test: 1.9450 | Test Accuracy: 0.1890
Epoch 30 | Loss: 1.9246 | Accuracy: 0.2103 | Loss Test: 1.9472 | Test Accuracy: 0.1890
Epoch 40 | Loss: 1.9226 | Accuracy: 0.2103 | Loss Test: 1.9435 | Test Accuracy: 0.1890
Epoch 50 | Loss: 1.9217 | Accuracy: 0.2103 | Loss Test: 1.9408 | Test Accuracy: 0.1890
Epoch 60 | Loss: 1.9213 | Accuracy: 0.2103 | Loss Test: 1.9406 | Test Accuracy: 0.1890
Epoch 70 | Loss: 1.9213 | Accuracy: 0.2103 | Loss Test: 1.9417 | Test Accuracy: 0.1890
Epoch 80 | Loss: 1.9212 | Accuracy: 0.2103 | Loss Test: 1.9412 | Test Accuracy: 0.1890
Epoch 90 | Loss: 1.9212 | Accuracy: 0.2103 | Loss Test: 1.9409 | Test Accuracy: 0.1890


In [134]:
g = graph_train_dgl
g

Graph(num_nodes=5507, num_edges=19284,
      ndata_schemes={'features': Scheme(shape=(999,), dtype=torch.float32), 'label': Scheme(shape=(7,), dtype=torch.float32)}
      edata_schemes={})

In [135]:

from dgl.nn import GraphConv


class GCN(nn.Module):
    def __init__(self, in_feats, h_feats, num_classes):
        super(GCN, self).__init__()
        self.conv1 = GraphConv(in_feats, h_feats)
        self.conv2 = GraphConv(h_feats, num_classes)

    def forward(self, g, in_feat):
        h = self.conv1(g, in_feat)
        h = F.relu(h)
        h = self.conv2(g, h)
        return h


# Create the model with given dimensions
model = GCN(g.ndata["features"].shape[1], 100, num_classes)


######################################################################
# DGL provides implementation of many popular neighbor aggregation
# modules. You can easily invoke them with one line of code.
#


######################################################################
# Training the GCN
# ----------------
#
# Training this GCN is similar to training other PyTorch neural networks.
#


def train(g, model):
    optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
    best_val_acc = 0
    best_test_acc = 0

    features = g.ndata["features"]
    labels = g.ndata["label"]
    for e in range(10000):
        # Forward
        logits = model(g, features)

        # Compute prediction
        pred = logits.argmax(1)

        # Compute loss
        # Note that you should only compute the losses of the nodes in the training set.
        loss = F.cross_entropy(logits, labels)

        # Compute accuracy on training/validation/test
        train_acc = (pred == labels_train).float().mean()


        # Backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if e % 5 == 0:
            print(
                f"In epoch {e}, loss: {loss:.3f}, train acc: {train_acc:.3f}"
            )


model = GCN(g.ndata["features"].shape[1], 16, num_classes)
train(g, model)

RuntimeError: The size of tensor a (5507) must match the size of tensor b (7) at non-singleton dimension 1

In [136]:
import torch
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv #GATConv

class GCN(torch.nn.Module):
    def __init__(self, hidden_channels):
        super(GCN, self).__init__()
        torch.manual_seed(42)

        # Initialize the layers
        self.conv1 = GCNConv(num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.linear = Linear(hidden_channels, hidden_channels)
        self.out = Linear(hidden_channels, num_classes)

    def forward(self, x, edge_index):
        # First Message Passing Layer (Transformation)
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = F.dropout(x, p=0.5, training=self.training)

        # Second Message Passing Layer
        x = self.conv2(x, edge_index)
        x = x.relu()
        x = F.dropout(x, p=0.5, training=self.training)
        
        x = self.linear(x)
        x = F.dropout(x, p=0.5, training=self.training)

        # Output layer 
        x = F.softmax(self.out(x), dim=1)
        return x

model = GCN(hidden_channels=16)
print(model)

GCN(
  (conv1): GCNConv(999, 16)
  (conv2): GCNConv(16, 16)
  (linear): Linear(in_features=16, out_features=16, bias=True)
  (out): Linear(in_features=16, out_features=7, bias=True)
)


In [137]:
from torch_geometric.utils.convert import from_networkx
g = from_networkx(graph_train_networkx)
g

Data(edge_index=[2, 19284], features=[5507, 999], affiliation=[5507], num_nodes=5507)

In [138]:
g.features = F.normalize(g.features, dim=1)
g.features

tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0486, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]])

In [148]:
# Initialize model
model = GCN(hidden_channels=1000)

# Use GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
g.to(device)
print(g.features, g.edge_index)
labels_train = labels_train.to(device)

# Initialize Optimizer
learning_rate = 0.01
decay = 5e-4
optimizer = torch.optim.Adam(model.parameters(), 
                             lr=learning_rate, 
                             weight_decay=decay)
# Define loss function (CrossEntropyLoss for Classification Problems with 
# probability distributions)
criterion = torch.nn.CrossEntropyLoss()

def train():
      model.train()
      optimizer.zero_grad() 
      # Use all data as input, because all nodes have node features
      out = model(g.features, g.edge_index)  
      # Only use nodes with labels available for loss calculation --> mask
      loss = criterion(out, labels_train)
      loss.backward() 
      optimizer.step()
      return loss

def test():
      model.eval()
      out = model(g.features, g.edge_index)
      # Use the class with highest probability.
      pred = out.argmax(dim=1)  
      # Check against ground-truth labels.
      test_correct = pred == labels_train.argmax(dim=1)
      print(test_correct)
      for i in range(5507):
          print(pred[i], labels_train[i])

      # Derive ratio of correct predictions.
      test_acc = int(test_correct.sum()) / 5507
      return test_acc




tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0486, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],
       device='cuda:0') tensor([[   0,    0,    0,  ..., 5504, 5505, 5506],
        [   1,   63,  589,  ..., 5402, 5402, 5402]], device='cuda:0')


In [149]:
losses = []
for epoch in range(0, 1001):
    loss = train()
    losses.append(loss)
    if epoch % 100 == 0:
      print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')

Epoch: 000, Loss: 1.9456
Epoch: 100, Loss: 1.2685
Epoch: 200, Loss: 1.2429
Epoch: 300, Loss: 1.2161
Epoch: 400, Loss: 1.2148
Epoch: 500, Loss: 1.2458
Epoch: 600, Loss: 1.2122
Epoch: 700, Loss: 1.2232
Epoch: 800, Loss: 1.2248
Epoch: 900, Loss: 1.2233
Epoch: 1000, Loss: 1.2249


In [150]:
test()

tensor([True, True, True,  ..., True, True, True], device='cuda:0')
tensor(3, device='cuda:0') tensor([0., 0., 0., 1., 0., 0., 0.], device='cuda:0')
tensor(3, device='cuda:0') tensor([0., 0., 0., 1., 0., 0., 0.], device='cuda:0')
tensor(3, device='cuda:0') tensor([0., 0., 0., 1., 0., 0., 0.], device='cuda:0')
tensor(3, device='cuda:0') tensor([0., 0., 0., 1., 0., 0., 0.], device='cuda:0')
tensor(3, device='cuda:0') tensor([0., 0., 0., 1., 0., 0., 0.], device='cuda:0')
tensor(3, device='cuda:0') tensor([0., 0., 0., 1., 0., 0., 0.], device='cuda:0')
tensor(3, device='cuda:0') tensor([0., 0., 0., 1., 0., 0., 0.], device='cuda:0')
tensor(3, device='cuda:0') tensor([0., 0., 0., 1., 0., 0., 0.], device='cuda:0')
tensor(3, device='cuda:0') tensor([0., 0., 0., 1., 0., 0., 0.], device='cuda:0')
tensor(3, device='cuda:0') tensor([0., 0., 0., 1., 0., 0., 0.], device='cuda:0')
tensor(3, device='cuda:0') tensor([0., 0., 0., 1., 0., 0., 0.], device='cuda:0')
tensor(3, device='cuda:0') tensor([0., 0.

0.9714908298529145