In [3]:
!pip install torch-scatter -f https://data.pyg.org/whl/torch-1.10.0+cu111.html
!pip install torch-sparse -f https://data.pyg.org/whl/torch-1.10.0+cu111.html
!pip install torch-geometric
!pip install -q git+https://github.com/snap-stanford/deepsnap.git
!pip install -U -q PyDrive

Looking in links: https://data.pyg.org/whl/torch-1.10.0+cu111.html
Collecting torch-scatter
  Downloading https://data.pyg.org/whl/torch-1.10.0%2Bcu113/torch_scatter-2.0.9-cp37-cp37m-linux_x86_64.whl (7.9 MB)
[K     |████████████████████████████████| 7.9 MB 2.9 MB/s 
[?25hInstalling collected packages: torch-scatter
Successfully installed torch-scatter-2.0.9
Looking in links: https://data.pyg.org/whl/torch-1.10.0+cu111.html
Collecting torch-sparse
  Downloading https://data.pyg.org/whl/torch-1.10.0%2Bcu113/torch_sparse-0.6.12-cp37-cp37m-linux_x86_64.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 2.6 MB/s 
Installing collected packages: torch-sparse
Successfully installed torch-sparse-0.6.12
Collecting torch-geometric
  Downloading torch_geometric-2.0.2.tar.gz (325 kB)
[K     |████████████████████████████████| 325 kB 5.0 MB/s 
Collecting rdflib
  Downloading rdflib-6.0.2-py3-none-any.whl (407 kB)
[K     |████████████████████████████████| 407 kB 41.5 MB/s 
Collecting 

In [4]:
import pandas as pd
import networkx as nx
from torch_geometric.utils import from_networkx

In [5]:
nx_graph = nx.read_gml('graph_with_features.gml')
G = from_networkx(nx_graph, group_node_attrs=['out_degree', 'in_degree', 'category_multi_hot'], group_edge_attrs=['tf_idf', 'num_link_clicked'])

In [26]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCN
from torch.utils.data import Dataset, DataLoader

In [27]:
class Model(torch.nn.Module):
    def __init__(self, graph):
        super().__init__()
        self.gcn = GCN(in_channels=145, 
                       hidden_channels=128, 
                       num_layers=3, 
                       out_channels=64, 
                       dropout=0.1)
        self.node_emb = self.gcn(graph.x, graph.edge_index)
        # self.edge_feat = 
        self.lstm = nn.LSTM(input_size=66,
                            hidden_size=2,
                            num_layers=1,
                            dropout=0.1)
        self.pred_head = nn.Linear(32, 4604)

    def forward(self, indices):
        # use node indices to get paths tensor
        # paths is (L, N, H_in)
        # where L is the length of the paths
        # N is the number of paths (i.e. batch size)
        # H_in is the input dimension (node_emd + edge_features = 66)
        # Compute path embeddings
        paths = None
        out, _ = self.lstm(paths)
        predictions = self.pred_head(torch.sum(out, dim=0))
        return F.log_softmax(predictions, dim=1)

In [28]:
# manipulating data to yield paths in form (L, N, H_in)
# pinSAGE like training process - progressive harder multistage training
# train on entire path up to target, then start removing
class CustomPathDataset(Dataset):
    def __init__(self, x, labels):
        self.labels = labels
        self.x = x
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        label = self.labels[idx]
        x = self.x[idx]
        sample = {"X": x, "Class": label}
        return sample

In [29]:
path_data = pd.read_csv('data_by_index.tsv', sep='\t', header=None)
indices = path_data[0]
labels = path_data[1]

In [32]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Model(G).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

train_size = int(0.9 * len(indices))
test_size = int(len(indices) - train_size)
dataset = CustomPathDataset(indices, labels)
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=64,
                                          shuffle=True, num_workers=2)

model.train()
for epoch in range(200):  # loop over the dataset multiple times
    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        one_hot_labels = F.one_hot(labels, num_classes=4604)
        print(one_hot_labels)
        loss = F.nll_loss(outputs, one_hot_labels)
        loss.backward()
        optimizer.step()

        # print statistics
        print('Epoch:', epoch)
        print('Loss:', loss)

  "num_layers={}".format(dropout, num_layers))


NameError: ignored

In [None]:
test_inputs = None
test_labels = None

testloader = torch.utils.data.DataLoader(test_dataset, batch_size=1,
                                          shuffle=True, num_workers=2)

model.eval()
num_correct = 0
for i, data in enumerate(testloader, 0):
      # get the inputs; data is a list of [inputs, labels]
      inputs, labels = data

      outputs = model(inputs)

      pred = model(outputs).argmax(dim=1)
      correct = (pred == label).sum()
      num_correct += correct

acc = int(correct) / int(len(pred))
print(f'Accuracy: {acc:.4f}')

      # print statistics
      print('Epoch:', epoch)
      print('Loss:', loss)

pred = model(test_inputs).argmax(dim=1)
correct = (pred == torch.zeros_like(pred)).sum()
acc = int(correct) / int(len(pred))
print(f'Accuracy: {acc:.4f}')