In [108]:
import json
import networkx as nx
import dgl
from matplotlib import pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from dgl.utils import check_eq_shape, expand_as_pair
import dgl.function as fn


In [109]:
def load_node_link_graph(path):
    """Read a node-link JSON file and return it as a NetworkX graph.

    The file is opened in a ``with`` block so the handle is closed as soon as
    parsing finishes, and it is decoded explicitly as UTF-8 (the JSON default)
    instead of relying on the platform's locale encoding.
    """
    with open(path, encoding='utf-8') as handle:
        return nx.node_link_graph(json.load(handle))


json_file = 'elyzee_data/elyzee_train.json'
graph_train_networkx = load_node_link_graph(json_file)

json_file = 'elyzee_data/elyzee_test.json'
graph_test_networkx = load_node_link_graph(json_file)

In [110]:
# Affiliation code -> integer class id used as the classification target.
labels_dict = dict(
    lr=0,
    em=1,
    fn=2,
    fi=3,
    ps=4,
    multi_affiliations=5,
    indetermined=6,
)
# One output unit per affiliation code.
num_classes = len(labels_dict)

In [111]:
# Convert both NetworkX graphs to DGL graphs, copying the per-node 'features'
# attribute into ndata.
graph_train_dgl = dgl.from_networkx(graph_train_networkx, node_attrs=['features'])
graph_test_dgl = dgl.from_networkx(graph_test_networkx, node_attrs=['features'])

# Show each graph summary followed by the shape of its node-feature matrix.
print(graph_train_dgl)
features_train = graph_train_dgl.ndata['features']
print(features_train.shape)

print(graph_test_dgl)
features_test = graph_test_dgl.ndata['features']
print(features_test.shape)

Graph(num_nodes=5507, num_edges=19284,
      ndata_schemes={'features': Scheme(shape=(999,), dtype=torch.float32)}
      edata_schemes={})
torch.Size([5507, 999])
Graph(num_nodes=2222, num_edges=6402,
      ndata_schemes={'features': Scheme(shape=(999,), dtype=torch.float32)}
      edata_schemes={})
torch.Size([2222, 999])


In [126]:
# Training targets: one integer class id per node (0-dim int64 entries).
labels_train = torch.tensor(
    [labels_dict[node['affiliation']] for node in graph_train_networkx.nodes.values()]
)
graph_train_dgl.ndata['label'] = labels_train
print(graph_train_dgl)

# Test targets: one-hot float rows. num_classes is passed explicitly because
# F.one_hot otherwise sizes the encoding from the largest label present, which
# would yield fewer columns whenever the highest class id is absent from the
# test split.
# NOTE: train labels are class ids while test labels are one-hot rows, so code
# consuming both has to handle the two representations.
labels_test = torch.tensor(
    [labels_dict[node['affiliation']] for node in graph_test_networkx.nodes.values()]
)
labels_test = F.one_hot(labels_test, num_classes=num_classes).float()
graph_test_dgl.ndata['label'] = labels_test
print(graph_test_dgl)

Graph(num_nodes=5507, num_edges=19284,
      ndata_schemes={'features': Scheme(shape=(999,), dtype=torch.float32), 'label': Scheme(shape=(), dtype=torch.int64)}
      edata_schemes={})
Graph(num_nodes=2222, num_edges=6402,
      ndata_schemes={'features': Scheme(shape=(999,), dtype=torch.float32), 'label': Scheme(shape=(7,), dtype=torch.float32)}
      edata_schemes={})


In [127]:
def gcn_message(edges):
    """Message function: forward the source node's 'h' entry under the key 'msg'."""
    source_state = edges.src['h']
    return dict(msg=source_state)

def gcn_reduce(nodes):
    """Reduce function: sum the mailbox entry 'msg' over dim 1 and store it as 'h'."""
    incoming = nodes.mailbox['msg']
    return {'h': incoming.sum(dim=1)}

# Graph-convolution layer built from the user-defined message/reduce functions
class GCNLayer(nn.Module):
    """Aggregate node states over the graph's edges, then apply a linear map.

    Args:
        in_feats: size of each incoming node state.
        out_feats: size of each produced node state.
    """

    def __init__(self, in_feats, out_feats):
        super().__init__()
        self.linear = nn.Linear(in_feats, out_feats)

    def forward(self, g, inputs):
        """Run message passing on ``g`` with ``inputs`` as node states.

        ``inputs`` is stored temporarily in ``g.ndata['h']``, messages are sent
        along every edge returned by ``g.edges()``, and the resulting 'h' field
        is removed from the graph again before the linear layer is applied.
        """
        g.ndata['h'] = inputs
        all_edges = g.edges()
        g.send_and_recv(all_edges, gcn_message, gcn_reduce)
        aggregated = g.ndata.pop('h')
        return self.linear(aggregated)

In [128]:
# Define a 2-layer GCN model
class GCN(nn.Module):
    """Two stacked ``GCNLayer`` modules with a ReLU in between.

    Args:
        in_feats: size of the input node features.
        hidden_size: size of the hidden node representation.
        num_classes: number of output classes (width of the returned scores).
    """

    def __init__(self, in_feats, hidden_size, num_classes):
        super(GCN, self).__init__()
        self.gcn1 = GCNLayer(in_feats, hidden_size)
        self.gcn2 = GCNLayer(hidden_size, num_classes)

    def forward(self, g, inputs):
        """Return unnormalised class scores (logits), one row per node.

        No softmax is applied here: ``F.cross_entropy`` already applies
        log-softmax internally, so normalising the scores first would squash
        them into [0, 1] and flatten the gradients. Call
        ``F.softmax(logits, dim=1)`` on the result when probabilities are needed.
        """
        h = self.gcn1(g, inputs)
        h = torch.relu(h)
        return self.gcn2(g, h)


In [95]:
# Size the model from the data instead of hard-coding the feature width and
# class count, so it stays valid if the feature extraction or label set changes.
model = GCN(in_feats=features_train.shape[1], hidden_size=16, num_classes=num_classes)

In [96]:
# The model inputs are the node-feature matrices themselves (aliases, no copy).
inputs_train, inputs_test = features_train, features_test
print(inputs_train.shape)
print(inputs_test.shape)

torch.Size([5507, 999])
torch.Size([2222, 999])


In [97]:
# optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
# for epoch in range(10):
#     optimizer.zero_grad()
#     logits_train = model(graph_train_dgl, inputs_train)
#     loss = F.cross_entropy(logits_train, labels_train)    
#     loss.backward()
#     optimizer.step()
#     # print(logits_train[0], "=", labels_train[0])
#     
#     with torch.no_grad():
#         logits_test = model(graph_test_dgl, inputs_test)
#         loss_test = F.cross_entropy(logits_test, labels_test)
#         print('Epoch %d | Loss: %.4f | Loss Test: %.4f' % (epoch, loss.item(), loss_test.item()))  
# for i in range(5507):
#     print(logits_train[i], "=", labels_train[i])

In [98]:
import os

# NOTE(review): `dgl` was already imported at the top of this notebook, so
# setting DGLBACKEND at this point presumably cannot influence which backend
# was loaded — confirm, and move this assignment above the first `import dgl`
# if the backend selection is meant to take effect.
os.environ["DGLBACKEND"] = "pytorch"
import dgl
import dgl.data
import torch
import torch.nn as nn
import torch.nn.functional as F

In [99]:
from dgl.nn import GraphConv


class GCN(nn.Module):
    """Three stacked ``GraphConv`` layers with ReLU after the first two.

    Args:
        in_feats: size of the input node features.
        h_feats: size of both hidden representations.
        num_classes: width of the output scores.
    """

    def __init__(self, in_feats, h_feats, num_classes):
        super().__init__()
        self.conv1 = GraphConv(in_feats, h_feats)
        self.conv2 = GraphConv(h_feats, h_feats)
        self.conv3 = GraphConv(h_feats, num_classes)

    def forward(self, g, in_feat):
        """Return the raw output of the last layer (no activation applied)."""
        hidden = F.relu(self.conv1(g, in_feat))
        hidden = F.relu(self.conv2(g, hidden))
        return self.conv3(g, hidden)


In [100]:
def _to_class_indices(labels):
    """Return ``labels`` as a 1-D tensor of class ids.

    A tensor with more than one dimension is treated as one row of per-class
    values per sample (e.g. one-hot) and reduced with arg-max over dim 1; a
    1-D tensor is returned unchanged.
    """
    if labels.dim() > 1:
        return labels.argmax(dim=1)
    return labels


def train(model, epochs=100, lr=0.1, log_every=10):
    """Fit ``model`` on the training graph and periodically report test metrics.

    Reads the notebook-level ``graph_train_dgl`` / ``inputs_train`` /
    ``labels_train`` and ``graph_test_dgl`` / ``inputs_test`` / ``labels_test``.
    Labels may be class ids or one-hot rows; both are normalised to class ids
    up front so the loss and the accuracy use the same representation.

    Args:
        model: module called as ``model(graph, inputs)`` returning class scores.
        epochs: number of full-graph optimisation steps.
        lr: Adam learning rate.
        log_every: print metrics every this many epochs.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    targets_train = _to_class_indices(labels_train)
    targets_test = _to_class_indices(labels_test)

    for epoch in range(epochs):
        optimizer.zero_grad()
        # Forward
        logits_train = model(graph_train_dgl, inputs_train)
        # Compute loss
        loss = F.cross_entropy(logits_train, targets_train)
        # Backward
        loss.backward()
        optimizer.step()

        # Accuracy of the predictions made before this epoch's parameter update.
        train_acc = (logits_train.argmax(dim=1) == targets_train).float().mean()

        if epoch % log_every == 0:
            with torch.no_grad():
                logits_test = model(graph_test_dgl, inputs_test)
                loss_test = F.cross_entropy(logits_test, targets_test)
                test_acc = (logits_test.argmax(dim=1) == targets_test).float().mean()
                print('Epoch %d | Loss: %.4f | Accuracy: %.4f | Loss Test: %.4f | Test Accuracy: %.4f' % (epoch, loss.item(), train_acc.item(), loss_test.item(), test_acc.item()))

    # for a,b in zip(logits_test, labels_test):
    #     print(a, b)

In [101]:
# Input width is taken from the training feature matrix; 16 hidden units.
model = GCN(
    graph_train_dgl.ndata["features"].size(1),
    16,
    num_classes,
)
train(model)

Epoch 0 | Loss: 1.9890 | Accuracy: 0.1558 | Loss Test: 4.8205 | Test Accuracy: 0.1436
Epoch 10 | Loss: 1.9416 | Accuracy: 0.1625 | Loss Test: 1.9440 | Test Accuracy: 0.1463
Epoch 20 | Loss: 1.9242 | Accuracy: 0.2103 | Loss Test: 1.9439 | Test Accuracy: 0.1895
Epoch 30 | Loss: 1.9222 | Accuracy: 0.2103 | Loss Test: 1.9429 | Test Accuracy: 0.1890
Epoch 40 | Loss: 1.9203 | Accuracy: 0.2108 | Loss Test: 1.9363 | Test Accuracy: 0.1904
Epoch 50 | Loss: 1.9200 | Accuracy: 0.2110 | Loss Test: 1.9377 | Test Accuracy: 0.1890
Epoch 60 | Loss: 1.9197 | Accuracy: 0.2110 | Loss Test: 1.9378 | Test Accuracy: 0.1895
Epoch 70 | Loss: 1.9196 | Accuracy: 0.2110 | Loss Test: 1.9368 | Test Accuracy: 0.1890
Epoch 80 | Loss: 1.9196 | Accuracy: 0.2110 | Loss Test: 1.9370 | Test Accuracy: 0.1890
Epoch 90 | Loss: 1.9196 | Accuracy: 0.2106 | Loss Test: 1.9372 | Test Accuracy: 0.1890


In [129]:
# Short alias for the training graph (same object, not a copy); the bare
# expression on the last line displays the graph summary as the cell output.
g = graph_train_dgl
g

Graph(num_nodes=5507, num_edges=19284,
      ndata_schemes={'features': Scheme(shape=(999,), dtype=torch.float32), 'label': Scheme(shape=(), dtype=torch.int64)}
      edata_schemes={})

In [None]:

from dgl.nn import GraphConv


class GCN(nn.Module):
    """Two stacked ``GraphConv`` layers with a ReLU in between.

    Args:
        in_feats: size of the input node features.
        h_feats: size of the hidden representation.
        num_classes: width of the output scores.
    """

    def __init__(self, in_feats, h_feats, num_classes):
        super().__init__()
        self.conv1 = GraphConv(in_feats, h_feats)
        self.conv2 = GraphConv(h_feats, num_classes)

    def forward(self, g, in_feat):
        """Return the raw output of the second layer (no activation applied)."""
        hidden = F.relu(self.conv1(g, in_feat))
        return self.conv2(g, hidden)


# Create the model with given dimensions
# NOTE(review): `model` is reassigned further down in this cell (hidden size 16)
# before train() is called, so this 100-unit instance is never trained.
model = GCN(g.ndata["features"].shape[1], 100, num_classes)


######################################################################
# DGL provides implementations of many popular neighbor aggregation
# modules. You can easily invoke them with one line of code.
#


######################################################################
# Training the GCN
# ----------------
#
# Training this GCN is similar to training other PyTorch neural networks.
#


def train(g, model, epochs=10000, lr=0.1, log_every=5):
    """Fit ``model`` on every node of ``g`` and print the training metrics.

    Features are read from ``g.ndata["features"]`` and targets from
    ``g.ndata["label"]``. Accuracy is measured against those same targets, so
    the function depends only on its arguments and not on notebook globals.

    Args:
        g: graph object exposing ``ndata["features"]`` and ``ndata["label"]``.
        model: module called as ``model(g, features)`` returning class scores.
        epochs: number of full-graph optimisation steps.
        lr: Adam learning rate.
        log_every: print loss and accuracy every this many epochs.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    features = g.ndata["features"]
    labels = g.ndata["label"]
    # Accept one-hot (2-D) labels as well as class ids (1-D).
    if labels.dim() > 1:
        labels = labels.argmax(dim=1)

    for e in range(epochs):
        # Forward
        logits = model(g, features)

        # Compute prediction
        pred = logits.argmax(1)

        # Compute loss over all nodes of g (no train/validation mask is used).
        loss = F.cross_entropy(logits, labels)

        # Accuracy on the same nodes and targets the loss was computed on.
        train_acc = (pred == labels).float().mean()

        # Backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if e % log_every == 0:
            print(
                f"In epoch {e}, loss: {loss.item():.3f}, train acc: {train_acc.item():.3f}"
            )


# Fresh 16-hidden-unit model sized to the graph's feature width, then train it.
model = GCN(
    g.ndata["features"].size(1),
    16,
    num_classes,
)
train(g, model)

In epoch 0, loss: 1.960, train acc: 0.123
In epoch 5, loss: 2.125, train acc: 0.162
In epoch 10, loss: 1.960, train acc: 0.125
In epoch 15, loss: 1.940, train acc: 0.125
In epoch 20, loss: 1.923, train acc: 0.210
In epoch 25, loss: 1.921, train acc: 0.210
In epoch 30, loss: 1.923, train acc: 0.210
In epoch 35, loss: 1.922, train acc: 0.210
In epoch 40, loss: 1.920, train acc: 0.210
In epoch 45, loss: 1.920, train acc: 0.210
In epoch 50, loss: 1.920, train acc: 0.210
In epoch 55, loss: 1.920, train acc: 0.210
In epoch 60, loss: 1.920, train acc: 0.210
In epoch 65, loss: 1.919, train acc: 0.210
In epoch 70, loss: 1.919, train acc: 0.210
In epoch 75, loss: 1.919, train acc: 0.210
In epoch 80, loss: 1.919, train acc: 0.210
In epoch 85, loss: 1.918, train acc: 0.211
In epoch 90, loss: 1.918, train acc: 0.211
In epoch 95, loss: 1.917, train acc: 0.211
In epoch 100, loss: 1.918, train acc: 0.211
In epoch 105, loss: 1.917, train acc: 0.211
In epoch 110, loss: 1.917, train acc: 0.211
In epoch 1