In [1]:
!pip install torch-scatter -f https://data.pyg.org/whl/torch-1.10.0+cu111.html
!pip install torch-sparse -f https://data.pyg.org/whl/torch-1.10.0+cu111.html
!pip install torch-geometric
!pip install -q git+https://github.com/snap-stanford/deepsnap.git
!pip install -U -q PyDrive

Looking in links: https://data.pyg.org/whl/torch-1.10.0+cu111.html
Collecting torch-scatter
  Downloading https://data.pyg.org/whl/torch-1.10.0%2Bcu113/torch_scatter-2.0.9-cp37-cp37m-linux_x86_64.whl (7.9 MB)
[K     |████████████████████████████████| 7.9 MB 2.9 MB/s 
[?25hInstalling collected packages: torch-scatter
Successfully installed torch-scatter-2.0.9
Looking in links: https://data.pyg.org/whl/torch-1.10.0+cu111.html
Collecting torch-sparse
  Downloading https://data.pyg.org/whl/torch-1.10.0%2Bcu113/torch_sparse-0.6.12-cp37-cp37m-linux_x86_64.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 2.7 MB/s 
Installing collected packages: torch-sparse
Successfully installed torch-sparse-0.6.12
Collecting torch-geometric
  Downloading torch_geometric-2.0.2.tar.gz (325 kB)
[K     |████████████████████████████████| 325 kB 5.1 MB/s 
Collecting rdflib
  Downloading rdflib-6.0.2-py3-none-any.whl (407 kB)
[K     |████████████████████████████████| 407 kB 41.9 MB/s 
Collecting 

In [59]:
import json
import pandas as pd
import networkx as nx
from torch_geometric.utils import from_networkx

In [60]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCN
from torch.utils.data import Dataset, DataLoader

In [51]:
nx_graph = nx.read_gml('graph_with_features.gml')
G = from_networkx(nx_graph, group_node_attrs=['out_degree', 'in_degree', 'category_multi_hot'], group_edge_attrs=['tf_idf', 'num_link_clicked'])

path_data = pd.read_csv('data_by_index.tsv', sep='\t', header=None)

In [90]:
# manipulating data to yield paths in form (L, N, H_in)
# pinSAGE like training process - progressive harder multistage training
# train on entire path up to target, then start removing
class CustomPathDataset(Dataset):
    def __init__(self, path_data):
        self.x = path_data[0].apply(json.loads)
        self.labels = path_data[1]
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        x = torch.LongTensor(self.x[idx])
        label = self.labels[idx]
        sample = {"indices": x, "label": label}
        return sample

In [106]:
class Model(torch.nn.Module):
    def __init__(self, graph):
        super().__init__()
        self.gcn = GCN(in_channels=145, 
                       hidden_channels=128, 
                       num_layers=3, 
                       out_channels=64, 
                       dropout=0.1)
        self.node_emb = self.gcn(graph.x, graph.edge_index)
        # self.edge_feat = TODO
        self.lstm = nn.LSTM(input_size=64,
                            hidden_size=32,
                            batch_first=True)
        self.pred_head = nn.Linear(32, 4604)

    def forward(self, indices):
        # paths has shape (N, L, H_in) where
        # - N is the number of paths (i.e. batch size)
        # - L is the length of the paths
        # - H_in is the input dimension (node_emb + edge_features = 66)
        paths = self.node_emb[indices] # TODO: need to append edge features to data before passing into LSTM
        out, _ = self.lstm(paths)
        predictions = self.pred_head(torch.sum(out, dim=0))
        return F.log_softmax(predictions, dim=1)

In [113]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Model(G).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

train_size = int(0.9 * len(indices))
test_size = len(indices) - train_size
dataset = CustomPathDataset(path_data)
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=32,
                                          shuffle=True, num_workers=2)

model.train()
for epoch in range(200):  # loop over the dataset multiple times
    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data['indices'], data['label']

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        loss = F.nll_loss(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        print('Epoch:', epoch)
        print('Loss:', loss.item())

Epoch: 0
Loss: 9.201899528503418


RuntimeError: ignored

In [103]:
test_inputs = None
test_labels = None

testloader = torch.utils.data.DataLoader(test_dataset, batch_size=1,
                                          shuffle=True, num_workers=2)

model.eval()
num_correct = 0
for i, data in enumerate(testloader, 0):
      # get the inputs; data is a list of [inputs, labels]
      inputs, labels = data

      outputs = model(inputs)

      pred = model(outputs).argmax(dim=1)
      correct = (pred == label).sum()
      num_correct += correct

acc = int(correct) / int(len(pred))
print(f'Accuracy: {acc:.4f}')

      # print statistics
      print('Epoch:', epoch)
      print('Loss:', loss)

pred = model(test_inputs).argmax(dim=1)
correct = (pred == torch.zeros_like(pred)).sum()
acc = int(correct) / int(len(pred))
print(f'Accuracy: {acc:.4f}')

IndentationError: ignored