### Load and preprocess data

In [3]:
from collections import defaultdict, OrderedDict

DATA_DIR = "data/PubMed-Diabetes/"
EDGE_PATH = DATA_DIR + "Pubmed-Diabetes.DIRECTED.cites.tab"
NODE_PATH = DATA_DIR + "Pubmed-Diabetes.NODE.paper.tab"
TF_IDF_DIM = 500

# Load and process graph links
print("Loading and processing graph links...")
node_pairs = set()
with open(EDGE_PATH, 'r') as f:
    next(f)  # skip header
    next(f)  # skip header
    for line in f:
        columns = line.split()
        src = int(columns[1][6:])
        dest = int(columns[3].strip()[6:])
        node_pairs.add((src, dest))
        
# Load and process graph nodes
print("Loading and processing graph nodes...")
node2vec = OrderedDict()
node2label = dict()
class_1 = list()
class_2 = list()
class_3 = list()
with open(NODE_PATH, 'r') as f:
    next(f)  # skip header
    vocabs = [e.split(':')[1] for e in next(f).split()[1:]]
    for line in f:
        columns = line.split()
        node = int(columns[0])
        label = int(columns[1][-1])
        tf_idf_vec = [0.0] * TF_IDF_DIM

        for e in columns[2:-1]:
            word, value = e.split('=')
            tf_idf_vec[vocabs.index(word)] = float(value)

        node2vec[node] = tf_idf_vec
        node2label[node] = label - 1
        if label == 1:
            class_1.append(node)
        elif label == 2:
            class_2.append(node)
        elif label == 3:
            class_3.append(node)

# Debug statistics
print("Number of links:", len(node_pairs))
assert len(node2vec) == (len(class_1) + len(class_2) + len(class_3))
print("Number of nodes:", len(node2vec))
print("Number of nodes belong to Class 1", len(class_1))
print("Number of nodes belong to Class 2", len(class_2))
print("Number of nodes belong to Class 3", len(class_3))


Loading and processing graph links...
Loading and processing graph nodes...
Number of links: 44338
Number of nodes: 19717
Number of nodes belong to Class 1 4103
Number of nodes belong to Class 2 7875
Number of nodes belong to Class 3 7739


### Neural Network related parameters

In [156]:
MODEL_DIR = "model/"
TEST_SIZE = 1000
SEED_NODES = 20
NUM_CATEGORIES = 3

ALPHA = 0.2
HIDDEN_1_DIM = 250
HIDDEN_2_DIM = 100

NUM_EPOCH = 12
BATCH_SIZE = 100
LEARNING_RATE = 0.0001

### Split data into train/test set

In [5]:
# Important variables from previous cells: node_pairs, class_1, class_2, class_3
test_nodes = class_1[-TEST_SIZE:] + class_2[-TEST_SIZE:] + class_3[-TEST_SIZE:]
train_node_pairs = []
for src, dest in node_pairs:
    if not (src in test_nodes or dest in test_nodes):
        train_node_pairs.append((src, dest))

seed_nodes = class_1[:SEED_NODES] + class_2[:SEED_NODES] + class_3[:SEED_NODES]

### Model Architecture

In [205]:
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from tqdm import tqdm_notebook as tqdm

class MY_NGM_FFNN(nn.Module):
    def __init__(self, alpha, input_dim, hidden1_dim, hidden2_dim, output_dim, device=torch.device('cpu')):
        super(MY_NGM_FFNN, self).__init__()

        self.alpha = alpha
        self.loss_function = nn.NLLLoss()

        self.hidden1 = nn.Linear(input_dim, hidden1_dim)
        self.hidden2 = nn.Linear(hidden1_dim, hidden2_dim)
        self.output = nn.Linear(hidden2_dim, output_dim)

        self.device = device
        self.to(device)

    def save(self, output_dir, model_name):
        print("Saving model...")
        torch.save(self.state_dict(), output_dir + model_name + ".pt")
        print("Model saved.")

    def load(self, output_dir, model_name):
        print("Loading model...")
        self.load_state_dict(torch.load(output_dir + model_name + ".pt"))
        print("Model loaded.")
        
    def forward(self, tf_idf_vec):
        # First feed-forward layer
        hidden1 = F.relu(self.hidden1(tf_idf_vec))
        # Second feed-forward layer
        hidden2 = F.relu(self.hidden2(hidden1))

        # Output layer
        return F.log_softmax(self.output(hidden2), -1)
    
    def reset_parameters(self):
        self.hidden1.reset_parameters()
        self.hidden2.reset_parameters()
        self.output.reset_parameters()
    
    def aggregate_ce(self, output, targets):
        loss = 0
        self.loss_function(output, targets)
        #for o, t in zip(output, targets):
        ##    print(o.size(), t.size())
    #    loss += self.loss_function(o.view(1, -1), t)
        return loss 

    
    def train_(self, seed_nodes, train_node_pairs, node2vec, node2label, 
               num_epoch, batch_size, learning_rate):
        print("Training...")
        self.train()

        loss_function = nn.NLLLoss()
        optimizer = optim.SGD(self.parameters(), lr=learning_rate)
        
        node2neighbors = defaultdict(list)
        for src, dest in train_node_pairs:
            node2neighbors[src].append(dest)
            node2neighbors[dest].append(src)
            
        labeled_nodes = dict()
        for node in seed_nodes:
            labeled_nodes[node] = node2label[node]

        iteration = 1
        while iteration < 3:
            print("=" * 80)
            print("Generation: {} (with {} labeled nodes)".format(iteration, len(labeled_nodes)))
            iteration += 1

            for e in range(NUM_EPOCH):
                train_node_pairs_cpy =  train_node_pairs[:]
                total_loss = 0
                
                while train_node_pairs_cpy:
                    optimizer.zero_grad()
                    loss = torch.tensor(0, dtype=torch.float32, device=self.device)
                    label_label_loss = defaultdict(list)
                    label_unlabel_loss = defaultdict(list)
                    
                    try:
                        batch = random.sample(train_node_pairs_cpy, batch_size)
                    except ValueError:
                        break
                        
                    
                    #labelled-labelled
                    first_batch = [(src, dest) for (src, dest) in batch if (src in labeled_nodes and dest in labeled_nodes)]
                    src_vectors = torch.tensor([node2vec[src] for (src, dest) in first_batch], device=self.device)
                    dest_vectors = torch.tensor([node2vec[dest] for (src, dest) in first_batch], device=self.device)
                    src_targets = torch.tensor([labeled_nodes[src] for (src, dest) in first_batch], device=self.device)
                    dest_targets = torch.tensor([labeled_nodes[src] for (src, dest) in first_batch], device=self.device)
                    if len(src_vectors) > 1:
                        src_softmax = self.forward(src_vectors)
                        l1_loss = self.aggregate_ce(src_softmax, src_targets)
                        dest_softmax = self.forward(dest_vectors)
                        l2_loss = self.aggregate_ce(dest_softmax, dest_targets)
                        # TO DO: GET INCIDENT EDGES TO SRC AND DEST
                        loss += (l1_loss + l2_loss)
                        for s, d in zip(src_softmax, dest_softmax):
                            total_loss += torch.dist(s, d) # TO DO: WEIGHTS
                        
                        
                    #labelled-unlabelled
                    second_batch = [(src, dest) for (src, dest) in batch if (src in labeled_nodes and (dest not in labeled_nodes))]
                    src_vectors = torch.tensor([node2vec[src] for (src, dest) in second_batch], device=self.device)
                    dest_vectors = torch.tensor([node2vec[dest] for (src, dest) in second_batch], device=self.device)
                    src_targets = torch.tensor([labeled_nodes[src] for (src, dest) in second_batch], device=self.device)
                    if len(src_vectors) > 1:
                        src_softmax = self.forward(src_vectors)
                        #print("vectors", src_vectors, "targets", src_targets)
                        l1_loss = self.aggregate_ce(src_softmax, src_targets)
                        dest_softmax = self.forward(dest_vectors)
                        # TO DO: GET INCIDENT EDGES TO SRC 
                        total_loss += (l1_loss)
                        for s, d in zip(src_softmax, dest_softmax):
                            loss += torch.dist(s, d) # TO DO: WEIGHTS
                        
                        
                    #unlabelled-unlabelled
                    third_batch = [(src, dest) for (src, dest) in batch if ((src not in labeled_nodes) and (dest not in labeled_nodes))]
                    src_vectors = torch.tensor([node2vec[src] for (src, dest) in third_batch], device=self.device)
                    dest_vectors = torch.tensor([node2vec[dest] for (src, dest) in third_batch], device=self.device)
                    if len(src_vectors) > 1:
                        src_softmax = self.forward(src_vectors)
                        dest_softmax = self.forward(dest_vectors)
                        for s, d in zip(src_softmax, dest_softmax):
                            loss += torch.dist(s, d) # TO DO: WEIGHTS
                    
                    train_node_pairs_cpy = [i for i in train_node_pairs_cpy if i not in batch]
                    
                   
                    if loss.item() != 0:
                        assert not torch.isnan(loss)
                        loss.backward()
                        optimizer.step()
                        total_loss += loss.item()
                        del loss

                avg_loss = total_loss / len(labeled_nodes)
                print("Epoch: {} Loss: {} (avg: {})".format(e + 1, total_loss, avg_loss))

         

    def predict(self, tf_idf_vec):
        return torch.argmax(self.forward(tf_idf_vec)).item()
        
    def evaluate(self, test_nodes, node2vec, node2label):
        self.eval()

        correct_count = 0
        for node in test_nodes:
            predicted = self.predict(torch.tensor(node2vec[node], device=self.device))
            if predicted == node2label[node]:
                correct_count += 1

        return float(correct_count) / len(test_nodes)

### Baseline feed-forward neural network

In [206]:
# Important variable from previous cells: node_pairs, node2vec, node2label, seed_nodes, train_node_pairs, test_nodes
from datetime import datetime
baseline_model = MY_NGM_FFNN(0, TF_IDF_DIM, HIDDEN_1_DIM, HIDDEN_2_DIM, NUM_CATEGORIES)
start = datetime.now()
baseline_model.train_(seed_nodes, train_node_pairs, node2vec, node2label, NUM_EPOCH, BATCH_SIZE, LEARNING_RATE)
baseline_time = (datetime.now()-start).total_seconds()

Training...
Generation: 1 (with 60 labeled nodes)
Epoch: 1 Loss: 77.44475434720516 (avg: 1.2907459057867527)
Epoch: 2 Loss: 64.01455010473728 (avg: 1.066909168412288)
Epoch: 3 Loss: 54.67511613667011 (avg: 0.9112519356111686)
Epoch: 4 Loss: 47.92311353236437 (avg: 0.7987185588727395)
Epoch: 5 Loss: 43.2147398814559 (avg: 0.7202456646909317)
Epoch: 6 Loss: 39.25141831487417 (avg: 0.6541903052479029)
Epoch: 7 Loss: 35.82105325907469 (avg: 0.5970175543179115)
Epoch: 8 Loss: 32.8569213822484 (avg: 0.5476153563708067)
Epoch: 9 Loss: 30.23717414587736 (avg: 0.5039529024312893)
Epoch: 10 Loss: 27.92315060645342 (avg: 0.4653858434408903)
Epoch: 11 Loss: 25.87398335337639 (avg: 0.4312330558896065)
Epoch: 12 Loss: 24.059869810938835 (avg: 0.40099783018231394)
Generation: 2 (with 60 labeled nodes)
Epoch: 1 Loss: 22.470564048737288 (avg: 0.3745094008122881)
Epoch: 2 Loss: 21.026044607162476 (avg: 0.35043407678604127)
Epoch: 3 Loss: 19.73231590911746 (avg: 0.32887193181862434)
Epoch: 4 Loss: 18.567

### Neural graph machine feed-forward neural network

In [None]:
# Important variable from previous cells: node_pairs, node2vec, node2label, seed_nodes, train_node_pairs, test_nodes

NGM_model = NGM_FFNN(ALPHA, TF_IDF_DIM, HIDDEN_1_DIM, HIDDEN_2_DIM, NUM_CATEGORIES)
start = datetime.now()
NGM_model.train_(seed_nodes, train_node_pairs, node2vec, node2label, NUM_EPOCH, BATCH_SIZE, LEARNING_RATE)
NGM_time = (datetime.now()-start).total_seconds()

### Time taken

In [207]:
print(baseline_time)

507.903482


NameError: name 'NGM_time' is not defined

### Evaluations

In [210]:
# Important variable from previous cells: node2vec, node2label, test_nodes

print(baseline_model.evaluate(test_nodes, node2vec, node2label))

0.3333333333333333


### Save model

In [9]:
baseline_model.save(MODEL_DIR, "PubMed_baseline")
NGM_model.save(MODEL_DIR, "PubMed_NGM")

Saving model...
Model saved.
Saving model...
Model saved.
