In [None]:
#!pip install -q torch-scatter -f https://pytorch-geometric.com/whl/torch-1.7.0+cu101.html
#!pip install -q torch-sparse -f https://pytorch-geometric.com/whl/torch-1.7.0+cu101.html
#!pip install -q torch-geometric
#!pip install -q git+https://github.com/snap-stanford/deepsnap.git
#!pip install -q ogb

In [None]:
import torch_geometric
import torch_geometric as pyg
torch_geometric.__version__

## Mount drive and load data folder

In [None]:
# Environment switch: 0 means Google Colab (Drive gets mounted below);
# any other value means a local machine and the mount is skipped.
use_localenv = 1
if use_localenv == 0:
    # Colab-only import, kept inside the branch so local runs never need it.
    from google.colab import drive
    drive.mount(mountpoint='/content/drive')

In [None]:
import os

# Per-environment default location of the ogbn_products dataset folder.
if use_localenv == 0:
    _default_data_dir = "/content/drive/MyDrive/CS224w-project/dataset/dataset/ogbn_products/"
else:
    _default_data_dir = "C:\\Users\\shaon\\Desktop\\CS224W\\CS224W_2021\\CS224W_PROJECT\\dataset\\ogbn_products"

# OGBN_PRODUCTS_DIR overrides the default so the notebook can run on another
# machine without editing this cell.
data_dir = os.environ.get("OGBN_PRODUCTS_DIR", _default_data_dir)
if not os.path.isdir(data_dir):
    raise FileNotFoundError(
        "Dataset directory not found: {!r}. Set OGBN_PRODUCTS_DIR to the "
        "ogbn_products folder.".format(data_dir))

# Later cells use paths relative to the dataset folder ("raw/...").
os.chdir(data_dir)
if use_localenv == 0:
    # On Colab, list the folder to confirm the Drive mount is visible.
    print(sorted(os.listdir(data_dir)))


## Generate small graph for baseline

In [None]:
import random

def sample_edges(edge_file_name, output_file_name, num_samples=1000,
                 max_line_index=61859140):
    """Copy a random subset of lines from an edge file into a new file.

    ``num_samples`` line indices are drawn uniformly (with replacement) from
    ``[0, max_line_index]``; every input line whose 0-based index was drawn is
    written to the output, in the original order. Duplicate draws collapse, so
    at most ``num_samples`` lines are written.

    Args:
        edge_file_name: path of the input edge list, one edge per line.
        output_file_name: path of the file to (over)write with the sample.
        num_samples: number of random line indices to draw.
        max_line_index: largest line index that may be drawn (inclusive).
            The default matches the value originally hard-coded here.
    """
    wanted = {random.randint(0, max_line_index) for _ in range(num_samples)}

    # Stream input to output so both handles are always closed and the
    # sampled lines never have to be held in memory.
    with open(edge_file_name) as src, open(output_file_name, 'w') as dst:
        for line_index, line in enumerate(src):
            if line_index in wanted:
                dst.write(line)


def get_node_features(node_file, node_label_file, edge_file,
                      node_feature_output, node_label_output):
    """Extract features and labels for the nodes that appear in an edge file.

    The edge file is read as comma-separated integer node ids. The node
    feature file and node label file are then read in lockstep, line ``c`` of
    each being treated as belonging to node id ``c``. For every node id found
    in the edge file, the matching feature and label lines are written to the
    output files, each prefixed with ``"<node id>,"``.

    Args:
        node_file: path of the feature file (one line per node).
        node_label_file: path of the label file (one line per node).
        edge_file: path of the sampled edge list.
        node_feature_output: path to (over)write with the selected features.
        node_label_output: path to (over)write with the selected labels.
    """
    # Collect the distinct endpoints; keep a running count for the log line
    # instead of materialising every endpoint in a list.
    endpoint_count = 0
    nodes = set()
    with open(edge_file) as edges_in:
        for line in edges_in:
            for token in line.strip().split(','):
                if len(token.strip()) > 0:
                    nodes.add(int(token))
                    endpoint_count += 1

    print("Num nodes before dedup=", endpoint_count)
    print("Num nodes=", len(nodes))

    selected_features = []
    selected_labels = []
    with open(node_file) as nf, open(node_label_file) as nl:
        node_id = 0
        while True:
            features = nf.readline()
            label = nl.readline()

            # The feature file drives the loop; stop at its end.
            if not features:
                break
            if node_id in nodes:
                selected_features.append(str(node_id) + ',' + features)
                selected_labels.append(str(node_id) + ',' + label)
            node_id += 1

    with open(node_feature_output, 'w') as out:
        out.writelines(selected_features)

    with open(node_label_output, 'w') as out:
        out.writelines(selected_labels)
    
import os

# All paths are relative to the dataset folder set as the working directory
# by an earlier cell.
data_dir = "raw/"

edge_input_file = data_dir + "edge.csv"
node_file = data_dir + "node-feat.csv"
node_label_file = data_dir + "node-label.csv"

edge_output_file = data_dir + "out/output.csv"
node_feature_output = data_dir + "out/node-feat.csv"
node_label_output = data_dir + "out/node-label.csv"

# Regenerate the sampled sub-graph whenever any output file is missing.
# Checking the files rather than the directory means a run that was
# interrupted after creating "raw/out" does not leave the cache looking valid.
_generated_files = (edge_output_file, node_feature_output, node_label_output)
if not all(os.path.exists(path) for path in _generated_files):
    os.makedirs(data_dir + "out", exist_ok=True)
    sample_edges(edge_input_file, edge_output_file)
    get_node_features(node_file, node_label_file, edge_output_file,
                      node_feature_output, node_label_output)

## GNN Stack Module

Below is the implementation of a general GNN module that can plug in any layer type, including **GraphSage**, **GAT**, etc. This module is provided for you, and your own **GraphSage** and **GAT** layers will function as components in the GNNStack module.



In [None]:
# GNN Stack Module 
import torch
import torch_scatter
import torch.nn as nn
import torch.nn.functional as F

import torch_geometric.nn as pyg_nn
import torch_geometric.utils as pyg_utils

from torch import Tensor
from typing import Union, Tuple, Optional
from torch_geometric.typing import (OptPairTensor, Adj, Size, NoneType, OptTensor)

from torch.nn import Parameter, Linear
from torch_sparse import SparseTensor, set_diag
from torch_geometric.nn.conv import MessagePassing
from torch_geometric.utils import remove_self_loops, add_self_loops, softmax

class GNNStack(torch.nn.Module):
    """Stack of graph convolution layers followed by a two-layer MLP head.

    Args:
        input_dim: feature size of the input node features.
        hidden_dim: per-head output size of every convolution layer.
        output_dim: number of output classes / embedding size.
        args: object exposing ``model_type``, ``num_layers``, ``heads`` and
            ``dropout`` attributes.
        emb: when true, ``forward`` returns the raw head output instead of
            log-probabilities.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, args, emb=False):
        super(GNNStack, self).__init__()
        # Validate before building anything.
        assert (args.num_layers >= 1), 'Number of layers is not >=1'

        conv_model = self.build_conv_model(args.model_type)
        self.convs = nn.ModuleList()
        self.convs.append(conv_model(input_dim, hidden_dim))
        # Layers after the first consume the concatenated heads of the
        # previous layer, hence the heads * hidden_dim input size.
        for _ in range(args.num_layers - 1):
            self.convs.append(conv_model(args.heads * hidden_dim, hidden_dim))

        # post-message-passing
        self.post_mp = nn.Sequential(
            nn.Linear(args.heads * hidden_dim, hidden_dim),
            nn.Dropout(args.dropout),
            nn.Linear(hidden_dim, output_dim))

        self.dropout = args.dropout
        self.num_layers = args.num_layers

        self.emb = emb

    def build_conv_model(self, model_type):
        """Return the layer class for ``model_type`` ('GraphSage' or 'GAT').

        Raises:
            ValueError: if ``model_type`` is not a supported layer name.
        """
        if model_type == 'GraphSage':
            return GraphSage
        elif model_type == 'GAT':
            return GAT
        raise ValueError(
            "Unknown model_type {!r}; expected 'GraphSage' or 'GAT'".format(model_type))

    def forward(self, data):
        """Run the stack on ``data`` (reads ``data.x`` and ``data.edge_index``).

        Returns log-softmax scores over dim 1, or the raw head output when
        the module was built with ``emb`` set.
        """
        x, edge_index = data.x, data.edge_index

        for i in range(self.num_layers):
            x = self.convs[i](x, edge_index)
            x = F.relu(x)
            # F.dropout defaults to training=True; pass the module flag so
            # dropout is switched off under model.eval().
            x = F.dropout(x, p=self.dropout, training=self.training)

        x = self.post_mp(x)

        if self.emb:
            return x

        return F.log_softmax(x, dim=1)

    def loss(self, pred, label):
        """Negative log-likelihood of ``label`` under log-probabilities ``pred``."""
        return F.nll_loss(pred, label)

## GraphSage Implementation

In [None]:
class GraphSage(MessagePassing):
    """GraphSAGE layer with mean neighbour aggregation.

    Computes ``lin_l(x_v) + lin_r(mean of x_u over neighbours u of v)`` and
    optionally L2-normalises each output row.

    Args:
        in_channels: size of the input node features.
        out_channels: size of the output node features.
        normalize: when true, apply ``F.normalize`` to the output.
        bias: stored on the instance but not used by the layer; both linear
            maps keep ``torch.nn.Linear``'s default bias.
        **kwargs: forwarded to ``MessagePassing.__init__``.
    """

    def __init__(self, in_channels, out_channels, normalize = True,
                 bias = None, **kwargs):
        super(GraphSage, self).__init__(**kwargs)

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.normalize = normalize
        self.bias = bias

        # lin_l transforms the node's own features, lin_r the aggregated
        # neighbour features.
        self.lin_l = torch.nn.Linear(self.in_channels, self.out_channels)
        self.lin_r = torch.nn.Linear(self.in_channels, self.out_channels)

        self.reset_parameters()

    def reset_parameters(self):
        """Re-initialise both linear maps."""
        self.lin_l.reset_parameters()
        self.lin_r.reset_parameters()

    def forward(self, x, edge_index, size = None):
        """Return updated node features for ``x`` given ``edge_index``."""
        # Mean of neighbour features per node (see message / aggregate).
        neighbours = self.propagate(edge_index, x=(x, x), size=size)
        # lin_r must be applied to the aggregated neighbours; applying it to
        # x would discard the graph structure entirely.
        out = self.lin_l(x) + self.lin_r(neighbours)
        if self.normalize:
            out = F.normalize(out)
        return out

    def message(self, x_j):
        """Each edge passes the source node's features through unchanged."""
        return x_j

    def aggregate(self, inputs, index, dim_size = None):
        """Average incoming messages per target node.

        ``dim_size`` is forwarded to the scatter so the result has a row for
        every node, including trailing nodes that receive no message.
        """
        return torch_scatter.scatter(src=inputs, index=index, dim=self.node_dim,
                                     dim_size=dim_size, reduce='mean')


## GAT Implementation

In [None]:
class GAT(MessagePassing):
    """Multi-head graph attention layer; head outputs are concatenated.

    Args:
        in_channels: size of the input node features.
        out_channels: per-head size of the output node features.
        heads: number of attention heads.
        negative_slope: slope of the LeakyReLU applied to attention logits.
        dropout: dropout probability applied to the attention coefficients.
        **kwargs: forwarded to ``MessagePassing.__init__``.
    """

    def __init__(self, in_channels, out_channels, heads = 2,
                 negative_slope = 0.2, dropout = 0., **kwargs):
        super(GAT, self).__init__(node_dim=0, **kwargs)

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.heads = heads
        self.negative_slope = negative_slope
        self.dropout = dropout
        self.lin_l = nn.Linear(in_channels, heads * out_channels)

        # Source and target nodes share one projection (same module object).
        self.lin_r = self.lin_l

        self.att_r = nn.Parameter(torch.zeros([heads, out_channels, 1], dtype=torch.float))
        self.att_l = nn.Parameter(torch.zeros([heads, out_channels, 1], dtype=torch.float))

        self.reset_parameters()

    def reset_parameters(self):
        """Xavier-initialise the projection and both attention vectors."""
        nn.init.xavier_uniform_(self.lin_l.weight)
        # lin_r aliases lin_l, so this re-draws the same weight; the call is
        # kept so seeded runs consume the random stream as before.
        nn.init.xavier_uniform_(self.lin_r.weight)
        nn.init.xavier_uniform_(self.att_l)
        nn.init.xavier_uniform_(self.att_r)

    def forward(self, x, edge_index, size = None):
        """Return node features of width ``heads * out_channels``."""
        H, C = self.heads, self.out_channels
        h_src = self.lin_l(x).reshape([-1, H, C])
        h_dst = self.lin_r(x).reshape([-1, H, C])

        # Per-node, per-head attention terms: dot product of each head's
        # features with that head's attention vector -> shape [N, H].
        alpha_l = (h_src * self.att_l.reshape([1, H, C])).sum(dim=-1)
        alpha_r = (h_dst * self.att_r.reshape([1, H, C])).sum(dim=-1)

        z = self.propagate(edge_index, x=(h_src, h_dst),
                           alpha=(alpha_l, alpha_r), size=size)
        # Concatenate heads: [N, H, C] -> [N, H * C].
        out = z.reshape([z.shape[0], z.shape[1] * z.shape[2]])

        return out

    def message(self, x_j, alpha_j, alpha_i, index, ptr, size_i):
        """Weight each source feature by its softmax-normalised attention."""
        logits = F.leaky_relu(alpha_i + alpha_j, negative_slope=self.negative_slope)
        # Normalise over the incoming edges of each target node.
        attention = pyg_utils.softmax(
            logits,
            index=index, ptr=ptr, num_nodes=size_i)
        # F.dropout defaults to training=True; honour the module's mode so
        # attention dropout is disabled during evaluation.
        attention = F.dropout(attention, p=self.dropout, training=self.training)
        attention = attention.reshape([attention.shape[0], attention.shape[1], 1])
        out = torch.mul(attention, x_j)

        return out

    def aggregate(self, inputs, index, dim_size = None):
        """Sum the weighted messages per target node."""
        out = torch_scatter.scatter(inputs, index, dim=0, dim_size=dim_size, reduce='sum')

        return out

## Building Optimizers

In [None]:
import torch.optim as optim

def build_optimizer(args, params):
    """Build the optimizer and optional LR scheduler described by ``args``.

    Args:
        args: object exposing ``opt`` ('adam', 'sgd', 'rmsprop' or 'adagrad'),
            ``lr``, ``weight_decay`` and ``opt_scheduler`` ('none', 'step' or
            'cos'). 'step' also reads ``opt_decay_step`` and
            ``opt_decay_rate``; 'cos' reads ``opt_restart``.
        params: iterable of parameters; only those with ``requires_grad`` are
            handed to the optimizer.

    Returns:
        ``(scheduler, optimizer)``; ``scheduler`` is ``None`` for 'none'.

    Raises:
        ValueError: if ``args.opt`` or ``args.opt_scheduler`` is not one of
            the supported names.
    """
    weight_decay = args.weight_decay
    trainable = [p for p in params if p.requires_grad]

    if args.opt == 'adam':
        optimizer = optim.Adam(trainable, lr=args.lr, weight_decay=weight_decay)
    elif args.opt == 'sgd':
        optimizer = optim.SGD(trainable, lr=args.lr, momentum=0.95, weight_decay=weight_decay)
    elif args.opt == 'rmsprop':
        optimizer = optim.RMSprop(trainable, lr=args.lr, weight_decay=weight_decay)
    elif args.opt == 'adagrad':
        optimizer = optim.Adagrad(trainable, lr=args.lr, weight_decay=weight_decay)
    else:
        # Previously fell through to an UnboundLocalError on `optimizer`.
        raise ValueError("Unknown optimizer {!r}".format(args.opt))

    if args.opt_scheduler == 'none':
        return None, optimizer
    elif args.opt_scheduler == 'step':
        scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=args.opt_decay_step, gamma=args.opt_decay_rate)
    elif args.opt_scheduler == 'cos':
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=args.opt_restart)
    else:
        # Previously fell through to an UnboundLocalError on `scheduler`.
        raise ValueError("Unknown scheduler {!r}".format(args.opt_scheduler))
    return scheduler, optimizer

## Training and testing

In [None]:
import time

import networkx as nx
import numpy as np
import torch
import torch.optim as optim

from torch_geometric.data import DataLoader

import torch_geometric.nn as pyg_nn

import matplotlib.pyplot as plt


def train(dataset, args):
    """Train a GNNStack on ``dataset`` and track loss and accuracy per epoch.

    Args:
        dataset: indexable collection of graph data objects (e.g. ``[data]``);
            the first element determines the input feature size.
        args: object exposing ``batch_size``, ``hidden_dim``, ``num_classes``,
            ``epochs`` plus everything GNNStack and build_optimizer read.

    Returns:
        ``(test_accs, losses)``: per-epoch accuracy as returned by ``test``
        (evaluated every 10th epoch and carried forward in between) and the
        per-epoch summed training loss.
    """
    test_loader = loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True)

    # Derive the input width from the data instead of hard-coding it.
    input_dim = dataset[0].num_node_features
    model = GNNStack(input_dim, args.hidden_dim, args.num_classes, args)
    scheduler, opt = build_optimizer(args, model.parameters())

    losses = []
    test_accs = []
    for epoch in range(args.epochs):
        total_loss = 0
        # test() switches to eval mode, so re-enable training every epoch.
        model.train()
        for batch in loader:
            opt.zero_grad()
            pred = model(batch)
            label = batch.y.reshape(batch.y.shape[0])

            # Compute the loss on training nodes only; without the mask the
            # model is fitted on the very nodes it is later evaluated on.
            train_mask = getattr(batch, 'train_mask', None)
            if train_mask is not None:
                pred = pred[train_mask]
                label = label[train_mask]

            loss = model.loss(pred, label)
            loss.backward()
            opt.step()
            total_loss += loss.item()
        losses.append(total_loss)

        # build_optimizer may return a scheduler; advance it once per epoch.
        if scheduler is not None:
            scheduler.step()

        # Evaluation is comparatively expensive: run it every 10th epoch and
        # repeat the last value otherwise so both curves have equal length.
        if epoch % 10 == 0:
            test_acc = test(test_loader, model)
            test_accs.append(test_acc)
        else:
            test_accs.append(test_accs[-1])
    return test_accs, losses

def test(loader, model, is_validation=True):
    model.eval()

    correct = 0
    for data in loader:
        with torch.no_grad():
            # max(dim=1) returns values, indices tuple; only need indices
            pred = model(data).max(dim=1)[1]
            label = data.y

        mask = data.val_mask if is_validation else data.test_mask
        # node classification: only evaluate on nodes in test set
        pred = pred[mask]
        label = data.y[mask]
            
        correct += pred.eq(label).sum().item()

    total = 0
    for data in loader.dataset:
        total += torch.sum(data.val_mask if is_validation else data.test_mask).item()
    return correct / total

class objectview:
    """Expose the keys of a dict as attributes of an object.

    The instance uses ``d`` itself as its attribute dictionary, so attribute
    writes are visible in ``d`` and vice versa.
    """

    def __init__(self, d):
        object.__setattr__(self, '__dict__', d)

## Train on ogbn-products

## Load dataset

In [None]:
# from ogb.nodeproppred import NodePropPredDataset
# dataset = NodePropPredDataset(name = 'ogbn-products')
# split_idx = dataset.get_idx_split()
# train_idx, valid_idx, test_idx = split_idx["train"], split_idx["valid"], split_idx["test"]

# from ogb.nodeproppred import PygNodePropPredDataset, Evaluator
# dataset = PygNodePropPredDataset('ogbn-products')
# split_idx = dataset.get_idx_split()
# data = dataset[0]
# train_idx = split_idx['train']

In [None]:
# Data subset

def remap_edges(edge_list, node_map):
    """Translate an edge list from original node ids to compact indices.

    Args:
        edge_list: array-like of shape ``[num_edges, 2]`` holding
            ``(source, target)`` node ids. A single edge given as a flat
            pair is accepted as well.
        node_map: dict mapping original node id -> compact node index.

    Returns:
        ``torch.long`` tensor of shape ``[num_edges, 2]`` with every id
        replaced by ``node_map[id]``.

    Raises:
        KeyError: if an endpoint is missing from ``node_map``.
    """
    # Plain Python ints make the dict lookups independent of the input type.
    pairs = torch.as_tensor(edge_list).reshape(-1, 2).tolist()
    # Map every edge; the earlier loop wrote each result into rows 0 and 1,
    # so all but the last edge were lost.
    remapped = [[node_map[src], node_map[dst]] for src, dst in pairs]
    # The reshape keeps the [0, 2] shape for an empty edge list.
    return torch.tensor(remapped, dtype=torch.long).reshape(-1, 2)

def load_sparse_node_features(node_feature_file):
    """Load per-node rows from a CSV file of ``node_id,value,value,...`` lines.

    Node ids are assigned compact indices in order of first appearance, and
    row ``node_map[node_id]`` of the returned tensor holds that node's values.
    Blank lines are skipped. If a node id appears more than once, only its
    first row is kept so that tensor rows stay aligned with ``node_map``.

    Args:
        node_feature_file: path of the CSV file.

    Returns:
        ``(node_map, features_tensor)``: dict mapping node id -> row index,
        and a float32 tensor with one row per distinct node id.
    """
    node_map = {}
    features_list = []
    with open(node_feature_file) as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # e.g. a trailing empty line; int('') would raise otherwise.
                continue
            values = stripped.split(',')
            node_id = int(values[0])
            if node_id in node_map:
                # A second row for a known id would add a feature row with
                # no map entry and shift every later row out of alignment.
                continue
            node_map[node_id] = len(node_map)
            features_list.append([float(x) for x in values[1:]])
    features_tensor = torch.tensor(features_list, dtype=torch.float)
    return (node_map, features_tensor)

# Load the sampled sub-graph written by the generation cell.
edges = np.genfromtxt(edge_output_file, dtype=int, delimiter=',')
print("zzz1", edges.shape)
node_map, features = load_sparse_node_features(node_feature_output)
print("zzz2", len(node_map), features.shape)
nmap2, labels = load_sparse_node_features(node_label_output)
print("zzz2", len(nmap2), labels.shape, labels.dtype)

## Sanity Check: Each node has a label. Each edge endpoint is in node_map
error_count = 0
for n in node_map:
    if not (n in nmap2):
        error_count = error_count + 1
        if error_count < 10:
            print("Problem with node:", n)

error_count = 0
for e in edges:
    if not ((e[0] in node_map) and (e[1] in node_map)):
        error_count = error_count + 1
        if error_count < 10:
            print("Problem with edge: ", e[0], e[1])

# Feature row i and label row i must describe the same node, which holds
# exactly when both files yield the same id -> row mapping.
assert node_map == nmap2, "feature and label files are not row-aligned"

print("zzz3", torch.max(labels))

###

num_classes = int(torch.round(torch.max(labels)).item()) + 1
num_nodes = features.shape[0]
# Create the train, valid, test indexes (80% / 10% / 10%) from one uniform
# draw per node. A dedicated seeded generator makes the split reproducible
# across kernel restarts.
split_rng = torch.Generator().manual_seed(0)
idx_mask = torch.rand(num_nodes, generator=split_rng)
train_mask = torch.lt(idx_mask, 0.8)
test_mask = torch.logical_and(torch.lt(idx_mask, 0.9), torch.ge(idx_mask, 0.8))
valid_mask = torch.ge(idx_mask, 0.9)
print("zzz3", num_classes, train_mask.shape, test_mask.shape, valid_mask.shape, train_mask.dtype)

# Every node must fall into at least one split ...
num_masked = torch.sum(torch.logical_or(torch.logical_or(train_mask, test_mask), valid_mask).to(torch.int)).item()
assert num_masked == num_nodes, "{} {}".format(num_nodes, num_masked)
# ... and into at most one: check every pair of splits, since a three-way
# intersection would miss a node shared by just two of them.
num_common = (
    torch.sum(torch.logical_and(train_mask, test_mask).to(torch.int)).item()
    + torch.sum(torch.logical_and(train_mask, valid_mask).to(torch.int)).item()
    + torch.sum(torch.logical_and(test_mask, valid_mask).to(torch.int)).item()
)
assert num_common == 0
###

# remap_edges returns [num_edges, 2]; transpose to the [2, num_edges]
# layout passed as edge_index.
edges = remap_edges(edges, node_map).t()
# Labels load as one column of floats; store them as a flat vector of class
# indices so they line up with per-node predictions.
labels = torch.round(labels).to(torch.long).reshape(-1)

data = pyg.data.Data(x=features, edge_index=edges, y=labels, test_mask=test_mask, val_mask=valid_mask, train_mask=train_mask)
print(data)

In [None]:
def main():
    """Train GraphSage and GAT on the global ``data`` and plot both runs.

    Both models share one hyper-parameter set; only ``model_type`` and
    ``heads`` differ. For each model the maximum accuracy and minimum loss
    are printed and the per-epoch curves are added to a shared figure.
    """
    base_config = {
        'num_classes': num_classes,
        'dataset': 'ogbn-products', 'num_layers': 2,
        'batch_size': 32, 'hidden_dim': 32, 'dropout': 0.5,
        'epochs': 50, 'opt': 'adam', 'opt_scheduler': 'none', 'opt_restart': 0,
        'weight_decay': 5e-3, 'lr': 0.01,
    }
    # Match the dimension: GAT concatenates its heads, GraphSage has one.
    heads_per_model = {'GraphSage': 1, 'GAT': 2}

    for model_type, heads in heads_per_model.items():
        args = objectview(dict(base_config, model_type=model_type, heads=heads))

        test_accs, losses = train([data], args)

        print("Maximum accuracy: {0}".format(max(test_accs)))
        print("Minimum loss: {0}".format(min(losses)))

        plt.plot(losses, label="training loss" + " - " + args.model_type)
        # train() evaluates with test()'s default, i.e. on the validation
        # mask, so label the curve accordingly.
        plt.plot(test_accs, label="validation accuracy" + " - " + args.model_type)
    plt.legend()
    plt.show()

if __name__ == '__main__':
    main()

In [None]:
# loader = DataLoader([data], batch_size=32)
# print(loader)
# for batch in loader:
#   print(batch)

In [None]:
# loader = DataLoader(data, batch_size=32, shuffle=True)
# for batch in loader:
#   print(batch)
#   print('sd')

## Archived

## Inductive split using DeepSNAP

In [None]:
# !pip install -q deepsnap
# from deepsnap.graph import Graph
# from deepsnap.batch import Batch
# from deepsnap.dataset import GraphDataset
# graphs =  GraphDataset.pyg_to_graphs(dataset)

# task = 'node'
# dataset = GraphDataset(graphs, task=task)
# dataset_train, dataset_val, dataset_test = dataset.split(
#     transductive=False, split_ratio=[0.2, 0.2, 0.6])