In [1]:
import os.path as osp

import torch
import torch.nn.functional as F
from sklearn.metrics import f1_score
    
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv, GATConv, SAGEConv, NNConv
import matplotlib.pyplot as plt

from sklearn.metrics import roc_auc_score

from torch_geometric.utils import negative_sampling
import torch_geometric.transforms as T
from torch_geometric.utils import train_test_split_edges

In [2]:
# Execute the dataset notebook from this kernel.
# NOTE(review): presumably this is what defines `dataset_ep`, which a later cell
# shuffles — confirm in dataset_ep.ipynb.
%run dataset_ep.ipynb

DATASET LOADED
Data(x=[26, 1], edge_index=[2, 26], edge_attr=[26, 1], y=[26, 19])


Processing...
Done!


In [6]:
# Seed torch's global RNG before shuffling (presumably so the shuffle order is
# reproducible — confirm that `dataset_ep.shuffle()` draws from torch's RNG).
torch.manual_seed(42)
dataset = dataset_ep.shuffle()

# train_dataset = dataset[:int(len(dataset) * 0.65)]
# The training slice currently covers every graph (fraction 1).
train_dataset = dataset[:int(len(dataset) * 1)]
# NOTE(review): this slice starts at 35% of the graphs, so it is fully contained
# in the training slice above (and would still overlap the commented-out 65%
# slice). A disjoint hold-out needs matching cut points — confirm intent.
test_dataset = dataset[int(len(dataset) * 0.35):]

# Report the split sizes and show the first graph.
print(f'Number of graphs total: {len(dataset)}')
print(f'Number of training graphs: {len(train_dataset)}')
print(f'Number of test graphs: {len(test_dataset)}')
print(dataset[0])
print(len(dataset))

Number of graphs total: 111
Number of training graphs: 111
Number of test graphs: 73
Data(x=[59, 1], edge_index=[2, 64], edge_attr=[64, 1], y=[64, 19])
111


In [8]:
# NOTE(review): `DataLoader` is already imported in the first cell; this repeats it.
from torch_geometric.loader import DataLoader
from torch_geometric.data import Dataset, Data

# Batch size equals the number of graphs in `dataset`, so one batch can hold
# every training graph at once.
NUM_GRAPHS_PER_BATCH = len(dataset)
# NOTE(review): not referenced by any other visible cell.
NUM_HIDDEN_CHANNELS = 64

train_loader = DataLoader(train_dataset, batch_size=NUM_GRAPHS_PER_BATCH, shuffle=True)

In [9]:
# Placeholders that are overwritten by the loader loop below.
x = []
edge_index = []
edge_attr = []
y = []
# Each iteration overwrites the previous values, so only the LAST batch yielded
# by `train_loader` is kept. NOTE(review): this is presumably a single batch,
# because the batch size was set to the dataset size — confirm.
for batch in train_loader:
    x = batch.x
    edge_index = batch.edge_index
    edge_attr = batch.edge_attr
    y = batch.y

# Re-wrap the batched tensors as one plain `Data` object.
data = Data(x=x, 
            edge_index=edge_index,
            edge_attr=edge_attr,
            y=y,
            ) 

# Number of rows in the node feature matrix; a later cell uses it to size the
# node embedding table.
num_nodes = data.x.shape[0]
print(data)

Data(x=[3026, 1], edge_index=[2, 3220], edge_attr=[3220, 1], y=[3220, 19])


In [39]:
# `train_test_split_edges` rewrites the Data object it receives: the recorded
# output of this cell has no `edge_index`/`edge_attr`, only the train/val/test
# split attributes. Split a clone so that `data.edge_index` is still available
# to the training cell that reads it later.
split_data = train_test_split_edges(data.clone())
print(split_data)

Data(x=[1006, 1], y=[1053, 16], val_pos_edge_index=[2, 29], val_pos_edge_attr=[29, 1], test_pos_edge_index=[2, 58], test_pos_edge_attr=[58, 1], train_pos_edge_index=[2, 1000], train_pos_edge_attr=[1000, 1], train_neg_adj_mask=[1006, 1006], val_neg_edge_index=[2, 29], test_neg_edge_index=[2, 58])


In [8]:
class GNNStack(torch.nn.Module):
    """Stack of GraphSAGE convolutions followed by a two-layer post-processing MLP.

    Args:
        input_dim: Size of the input node features.
        hidden_dim: Output size of every convolution and of the first MLP layer.
        output_dim: Output size of the final linear layer.
        num_layers: Number of convolution layers (must be >= 1).
        dropout: Dropout probability used after each convolution and inside the MLP.
        emb: If True, ``forward`` returns the raw MLP output; otherwise it
            returns log-softmax scores over ``output_dim``.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers, dropout, emb=False):
        super(GNNStack, self).__init__()
        # Use the names the notebook actually imports: `SAGEConv` comes from
        # `from torch_geometric.nn import ...` and `torch.nn` from `import torch`.
        # The bare `torch_geometric` module and an `nn` alias are never imported
        # in the visible cells, so referencing them fails on a fresh kernel.
        conv_model = SAGEConv

        self.convs = torch.nn.ModuleList()
        self.convs.append(conv_model(input_dim, hidden_dim))
        self.dropout = dropout
        self.num_layers = num_layers
        self.emb = emb

        assert (self.num_layers >= 1), 'Number of layers is not >=1'
        for _ in range(self.num_layers - 1):
            self.convs.append(conv_model(hidden_dim, hidden_dim))

        # post-message-passing
        self.post_mp = torch.nn.Sequential(
            torch.nn.Linear(hidden_dim, hidden_dim), torch.nn.Dropout(self.dropout),
            torch.nn.Linear(hidden_dim, output_dim))

    def forward(self, x, edge_index):
        """Apply every convolution (each followed by ReLU and dropout), then the MLP.

        Returns the MLP output when ``self.emb`` is set, otherwise its
        log-softmax over dimension 1.
        """
        for i in range(self.num_layers):
            x = self.convs[i](x, edge_index)
            x = F.relu(x)
            x = F.dropout(x, p=self.dropout, training=self.training)

        x = self.post_mp(x)

        # Return node embeddings after post-message passing if specified
        if self.emb:
            return x

        # Else return class log-probabilities for each node
        return F.log_softmax(x, dim=1)

    def loss(self, pred, label):
        """Negative log-likelihood loss; ``pred`` is expected to hold log-probabilities."""
        return F.nll_loss(pred, label)

In [10]:
class LinkPredictor(torch.nn.Module):
    """MLP that scores an edge from the element-wise product of its endpoint embeddings.

    Args:
        in_channels: Size of each endpoint embedding.
        hidden_channels: Width of the hidden linear layers.
        out_channels: Size of the output (1 for a single link score).
        num_layers: Total number of linear layers; an input and an output layer
            are always created, so values below 2 still yield two layers.
        dropout: Dropout probability applied after every hidden layer.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, num_layers,
                 dropout):
        super(LinkPredictor, self).__init__()

        # `torch.nn` is reachable through `import torch`; a bare `nn` alias is
        # never imported in the visible cells and fails on a fresh kernel.
        self.lins = torch.nn.ModuleList()
        self.lins.append(torch.nn.Linear(in_channels, hidden_channels))
        for _ in range(num_layers - 2):
            self.lins.append(torch.nn.Linear(hidden_channels, hidden_channels))
        self.lins.append(torch.nn.Linear(hidden_channels, out_channels))

        self.dropout = dropout

    def reset_parameters(self):
        """Re-initialise the parameters of every linear layer."""
        for lin in self.lins:
            lin.reset_parameters()

    def forward(self, x_i, x_j):
        """Return sigmoid scores for the pairs ``(x_i[k], x_j[k])``."""
        x = x_i * x_j
        for lin in self.lins[:-1]:
            x = lin(x)
            x = F.relu(x)
            x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.lins[-1](x)
        return torch.sigmoid(x)

In [11]:
def train(model, link_predictor, emb, edge_index, pos_train_edge, batch_size, optimizer):
    """
    Run one pass over the positive supervision edges and return the mean batch loss.

    For every mini-batch of positive edges:
    1. Recompute the node embeddings from `emb` using the message-passing edges `edge_index`
    2. Score the positive edges of the batch with `link_predictor`
    3. Sample as many negative edges as there are positives in the batch and score them
    4. Minimise -log(p_pos) - log(1 - p_neg) and step the optimizer
    """
    model.train()
    link_predictor.train()

    eps = 1e-15  # keeps the logarithms finite when a prediction saturates
    num_pos_edges = pos_train_edge.shape[0]
    batch_losses = []

    for batch_ids in DataLoader(range(num_pos_edges), batch_size, shuffle=True):
        optimizer.zero_grad()

        # Embeddings are recomputed per batch so gradients reach model and emb.
        node_emb = model(emb, edge_index)

        # Rows of pos_train_edge are (source, target); transpose to index by endpoint.
        pos_edge = pos_train_edge[batch_ids].T
        pos_src = node_emb[pos_edge[0]]
        pos_dst = node_emb[pos_edge[1]]
        pos_pred = link_predictor(pos_src, pos_dst)

        # One sampled negative edge per positive edge in this batch.
        neg_edge = negative_sampling(edge_index, num_nodes=emb.shape[0],
                                     num_neg_samples=batch_ids.shape[0], method='dense')
        neg_src = node_emb[neg_edge[0]]
        neg_dst = node_emb[neg_edge[1]]
        neg_pred = link_predictor(neg_src, neg_dst)

        pos_loss = -torch.log(pos_pred + eps).mean()
        neg_loss = -torch.log(1 - neg_pred + eps).mean()
        loss = pos_loss + neg_loss
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    return sum(batch_losses) / len(batch_losses)

In [12]:
def test(model, predictor, emb, edge_index, split_edge, batch_size, evaluator):
    """
    Evaluates model on positive and negative test edges
    1. Computes the updated node embeddings given the edge index (i.e. the message passing edges)
    2. Computes predictions on the positive and negative edges
    3. Calculates hits @ k given predictions using the ogb evaluator

    Reads the edges from split_edge['test']['edge'] and split_edge['test']['edge_neg']
    and returns a dict with the keys 'Hits@20', 'Hits@50' and 'Hits@100'.
    The whole evaluation runs under torch.no_grad(), so no autograd graph is kept.
    """
    model.eval()
    predictor.eval()

    def _score_edges(node_emb, edges):
        # Score `edges` (one edge per row) in mini-batches; returns a 1-D CPU tensor.
        scores = []
        for perm in DataLoader(range(edges.size(0)), batch_size):
            edge = edges[perm].t()
            batch_scores = predictor(node_emb[edge[0]], node_emb[edge[1]])
            # reshape(-1) rather than squeeze(): a batch holding a single edge
            # must stay 1-D, otherwise torch.cat rejects the 0-dim tensor.
            scores.append(batch_scores.reshape(-1).cpu())
        return torch.cat(scores, dim=0)

    # Evaluation needs no gradients; skip building the autograd graph.
    with torch.no_grad():
        node_emb = model(emb, edge_index)

        pos_test_edge = split_edge['test']['edge'].to(emb.device)
        neg_test_edge = split_edge['test']['edge_neg'].to(emb.device)

        pos_test_pred = _score_edges(node_emb, pos_test_edge)
        neg_test_pred = _score_edges(node_emb, neg_test_edge)

    results = {}
    for K in [20, 50, 100]:
        evaluator.K = K #using the Evaluator function in the ogb.linkproppred package
        test_hits = evaluator.eval({
            'y_pred_pos': pos_test_pred,
            'y_pred_neg': neg_test_pred,
        })[f'hits@{K}']

        results[f'Hits@{K}'] = test_hits

    return results

In [None]:
# Training driver: builds the embedding table, GNN, link predictor and optimizer,
# then trains for `epochs` epochs and evaluates every 10th epoch.
# NOTE(review): `pos_train_edge` and `split_edge` are not defined in this cell; in
# the visible notebook they are assigned in a later cell, which must run first.
import os
from ogb.linkproppred import PygLinkPropPredDataset, Evaluator
# NOTE(review): the imported name `optimizer` is rebound by the
# `optimizer = optim.Adam(...)` assignment further down.
from torch.optim import optimizer
import torch.optim as optim

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyper-parameters.
optim_wd = 0
epochs = 300
hidden_dim = 256
dropout = 0.2
num_layers = 2
lr = 3e-3
node_emb_dim = 256
batch_size = 64 * 1024

# Message-passing edges are taken from `data`.
# NOTE(review): the supervision edges passed to train()/test() below come from
# `pos_train_edge` / `split_edge`, not from `data` — confirm both describe the same graph.
edge_index = data.edge_index.to(device)

evaluator = Evaluator(name='ogbl-ddi')

# Create embedding, model, and optimizer
emb = torch.nn.Embedding(num_nodes, node_emb_dim).to(device)
model = GNNStack(node_emb_dim, hidden_dim, hidden_dim, num_layers, dropout, emb=True).to(device)
link_predictor = LinkPredictor(hidden_dim, hidden_dim, 1, num_layers + 1, dropout).to(device)

# A single Adam optimizer updates the GNN, the link predictor and the embedding table.
optimizer = optim.Adam(
    list(model.parameters()) + list(link_predictor.parameters()) + list(emb.parameters()),
    lr=lr, weight_decay=optim_wd
)

for e in range(epochs):
    loss = train(model, link_predictor, emb.weight, edge_index, pos_train_edge, batch_size, optimizer)
    print(f"Epoch {e + 1}: loss: {round(loss, 5)}")

    # Evaluate on the test edges every 10th epoch.
    if (e + 1) % 10 == 0:
        result = test(model, link_predictor, emb.weight, edge_index, split_edge, batch_size, evaluator)
        print(f"{result}")



In [14]:
from ogb.linkproppred import PygLinkPropPredDataset, Evaluator

# Load ogbl-ddi with './examples/' as the dataset root.
# NOTE(review): this rebinds `dataset`, which earlier cells use for the shuffled
# custom graphs — confirm that the reuse of the name is intended.
dataset = PygLinkPropPredDataset(name="ogbl-ddi", root='./examples/')
split_edge = dataset.get_edge_split()
# Positive training edges; the training cell reads this name.
pos_train_edge = split_edge['train']['edge']

graph = dataset[0]

In [48]:
# Show the first graph of whatever `dataset` is currently bound to.
# NOTE(review): the recorded output looks like one of the custom graphs, which
# suggests this ran while `dataset` still held them — confirm execution order.
print(dataset[0])

Data(x=[48, 1], edge_index=[2, 51], edge_attr=[51, 1], y=[51, 16])


In [46]:
# Shape check: transposing, making contiguous and transposing back yields the
# same shape as the original tensor.
print(pos_train_edge.shape)
print(pos_train_edge.t().contiguous().t().shape)

torch.Size([1067911, 2])
torch.Size([1067911, 2])


- Unternehmensmodelle: Beschreibung, Nutzung, Erstellung (4EM) etc.
- Einführung in neuronale Netze
- Einführung in Graph Neural Networks