In the `pyproject.toml` file, under `[project.optional-dependencies]`, you can find the group `graphs`. To install this dependency, run:

```bash
pdm install -G graphs
```

# Using torch_geometric

In [None]:
import torch
import matplotlib.pyplot as plt
import torch_geometric as tg
from torch_geometric.data import Data
import networkx as nx  # this is already a dependency of torch 2.1.0

We can create graphs

In [None]:

# Node feature matrix: three nodes with one float feature each.
nodes = torch.tensor([[-1], [0], [1]], dtype=torch.float)
# Edge list with one edge per column: the source node is in the first row and
# the target node in the second (torch_geometric convention).
# The columns here are 0->1, 1->0, 1->2 and 2->0.
edges = torch.tensor([
    [0, 1, 1, 2],
    [1, 0, 2, 0]
], dtype=torch.long)
data = Data(x=nodes, edge_index=edges)
# Bare expression: the notebook displays the Data object as the cell output.
data

And cast them to networkx for visualization

In [None]:

from torch_geometric.utils import to_networkx
import matplotlib.pyplot as plt
# Small square canvas for the toy graph.
fig, ax = plt.subplots(figsize=(4, 4))
# Convert the torch_geometric Data object into a networkx graph for drawing.
g = to_networkx(data)
# Draw onto the axes created above, labelling each node with its index.
nx.draw(g, with_labels=True, ax=ax)

- Note that `edge_index`, i.e. the tensor defining the source and target nodes of all edges, is not a list of index tuples: it is a tensor with two rows, where each column describes one edge.
- Note that the elements in `edge_index` must only hold indices in the range `{0, ..., num_nodes - 1}`.

Some basic properties of the graph

In [None]:

# Summary statistics read from properties and methods of the Data object.
print(f"Amount of nodes is {data.num_nodes}")
print(f"Amount of edges is {data.num_edges}")
print(f"Amount of node features is {data.num_node_features}")
print(f"Amount of edge features is {data.num_edge_features}")
print(f"The data is directed: {data.is_directed()}")

And we can cast networkx back to torch_geometric

In [None]:
from torch_geometric.utils import from_networkx
# Convert the networkx graph `g` back; note that this rebinds the name `data`.
data = from_networkx(g)
# Bare expression: the notebook displays the converted Data object.
data

# Cora dataset

The Cora dataset, visualized

![](https://graphsandnetworks.com/wp-content/uploads/2019/09/CoraBalloons.png)

In [None]:
from torch_geometric.datasets import Planetoid
# Use the torch_geometric home directory as the root folder for the dataset.
cache = tg.get_home_dir()

dataset = Planetoid(root=cache, name='Cora')

# Dataset-level statistics.
print(f'Dataset: {dataset}:')
print('======================')
print(f'Number of graphs: {len(dataset)}')
print(f'Number of features: {dataset.num_features}')
print(f'Number of classes: {dataset.num_classes}')

# Take the first graph out of the dataset; this rebinds the name `data`.
data = dataset[0]
# Graph-level statistics.
print('======================')
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of edges: {data.num_edges}')
# Ratio of stored edges to nodes.
print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
# train_mask is summed to count the selected training nodes.
print(f'Number of training nodes: {data.train_mask.sum()}')
print(f'Training node label rate: {int(data.train_mask.sum()) / data.num_nodes:.2f}')
print(f'Has isolated nodes: {data.has_isolated_nodes()}')
print(f'Has self-loops: {data.has_self_loops()}')
print(f'Is undirected: {data.is_undirected()}')

- 2708 scientific publications 
- classified into one of seven classes. 
- The citation network consists of 5429 links. 
- Each publication in the dataset is described by a 0/1-valued word vector indicating the absence/presence of the corresponding word from the dictionary. The dictionary consists of 1433 unique words

For the Cora dataset, the Data object also holds a label for each node, and additional node-level attributes: `train_mask`, `val_mask` and `test_mask`, where
- `train_mask` denotes against which nodes to train (140 nodes),
- `val_mask` denotes which nodes to use for validation, e.g., to perform early stopping (500 nodes),
- `test_mask` denotes against which nodes to test (1000 nodes).

In [None]:
# Topic name for each integer class id; the position in the list is the id.
# NOTE(review): the id -> name assignment cannot be checked from this notebook;
# confirm it against the dataset documentation.
label_dict = dict(enumerate([
    "Theory",
    "Reinforcement_Learning",
    "Genetic_Algorithms",
    "Neural_Networks",
    "Probabilistic_Methods",
    "Case_Based",
    "Rule_Learning",
]))

In [None]:
# Display the label tensor (one class id per node) as the cell output.
data.y

In [None]:
import collections
# Count how often each class id occurs. `tolist()` yields plain Python ints,
# so the printed dictionary is not cluttered with numpy scalar types.
counter = dict(collections.Counter(data.y.tolist()))
print(counter)
# Look the counts up by class id so that every bar lines up with its label,
# even if a class were absent from the data (it then gets a zero-height bar).
count = [counter.get(label_id, 0) for label_id in label_dict]
plt.bar(list(label_dict.values()), count)
plt.xlabel("class", size=20)
# The trailing semicolon suppresses the notebook's echo of the return value.
plt.xticks(rotation=60);

Let's create a helper function to visualize the output of the GCN

In [None]:
from sklearn.manifold import TSNE
def visualize(h, color):
    """Project `h` to two dimensions with t-SNE and show it as a scatter plot.

    Args:
        h: Tensor with one row per point; it is detached and moved to the CPU
            before being handed to t-SNE.
        color: Per-point values passed to the scatter plot's `c` argument and
            mapped through the "Set2" colormap.
    """
    points = h.detach().cpu().numpy()
    embedding = TSNE(n_components=2).fit_transform(points)
    xs, ys = embedding[:, 0], embedding[:, 1]

    # Large square canvas without ticks: the t-SNE axes carry no meaning.
    _, ax = plt.subplots(figsize=(10, 10))
    ax.set_xticks([])
    ax.set_yticks([])

    ax.scatter(xs, ys, s=70, c=color, cmap="Set2", alpha=0.3)
    plt.show()

This is the shape of our input data

In [None]:
# Display the shape of the node feature tensor as the cell output.
data.x.shape

This is a 2D tensor, and hopefully you remember that we can use Linear layers for this.
So let's create a simple network with 2 linear layers and a ReLU activation in between, and see how it performs.

In [None]:
import torch
from torch import nn
class NeuralNetwork(nn.Module):
    """Feed-forward baseline: Linear -> ReLU -> Dropout(0.5) -> Linear."""

    def __init__(self, size_in: int, hidden: int, size_out: int):
        """Create the layers.

        Args:
            size_in: Number of input features per sample.
            hidden: Width of the hidden layer.
            size_out: Number of output values per sample.
        """
        super().__init__()
        self.linear1 = nn.Linear(in_features=size_in, out_features=hidden)
        self.linear2 = nn.Linear(in_features=hidden, out_features=size_out)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, x):
        """Return the output of the second linear layer for input `x`.

        No softmax is applied, so the result is a tensor of raw scores.
        """
        hidden_act = self.dropout(self.relu(self.linear1(x)))
        return self.linear2(hidden_act)

# One input per node feature, 16 hidden units, one output score per class.
model = NeuralNetwork(size_in=dataset.num_features, hidden=16, size_out=dataset.num_classes)
# Bare expression: the notebook displays the module structure.
model

In [None]:
# Cross-entropy loss on the class scores, optimised with Adam plus weight decay.
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
# Put the model in training mode before the loop.
model.train()

for epoch in range(200):
    # Clear the gradients accumulated in the previous iteration.
    optimizer.zero_grad()
    # Forward pass over all nodes; only the training nodes enter the loss.
    out = model(data.x)
    loss = loss_fn(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    # Report every 10th epoch; the value is computed before this epoch's update.
    if epoch % 10 == 0:
        print(f"Epoch {epoch} | Loss {loss.item()}")
    optimizer.step()

In [None]:
# Switch to evaluation mode so that dropout is disabled.
model.eval()
# No gradients are needed for inference; skipping autograd saves memory and time.
with torch.no_grad():
    pred = model(data.x).argmax(dim=1)
# Accuracy is measured on the test nodes only.
correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()
acc = int(correct) / int(data.test_mask.sum())
print(f'Accuracy: {acc:.4f}')

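The accuracy is about 50 percent. Given that random guessing over seven classes would give about 1/7th (roughly 14 percent), and always predicting the majority class would give about 800 (majority class) / 2700 (total) = 30 percent, this is not too bad.
If we visualize it, we can see that there is some ordering, but the classes are not separated very well.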

In [None]:
# Model output for every node, used as the input to the 2D projection.
out = model(data.x)
print(f"The shape of the output is {out.shape}")
# Colour each point by its true class label.
visualize(out, color=data.y)

Now, let's use a basic convolutional layer instead of a linear layer.
We are going to use the additional information from the structure of the data in this way, and we will learn about the neighborhood of each node.
We are using 2 convolutions, so this will be a two-hop neighborhood accumulation of information.

In [None]:

import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GCN(torch.nn.Module):
    """Two stacked GCNConv layers with ReLU and dropout (p=0.5) in between."""

    def __init__(self, hidden: int, size_in: "int | None" = None, size_out: "int | None" = None):
        """Create the layers.

        Args:
            hidden: Number of output channels of the first convolution.
            size_in: Number of input features per node. When omitted, it is
                read from the notebook-level `dataset`.
            size_out: Number of output values per node. When omitted, it is
                read from the notebook-level `dataset`.
        """
        super().__init__()
        # Fall back to the notebook global so that `GCN(hidden=16)` keeps
        # working; passing explicit sizes makes the class reusable elsewhere.
        if size_in is None:
            size_in = dataset.num_node_features
        if size_out is None:
            size_out = dataset.num_classes
        self.conv1 = GCNConv(size_in, hidden)
        self.conv2 = GCNConv(hidden, size_out)
        self.relu = torch.nn.ReLU()
        self.dropout = torch.nn.Dropout(0.5)

    def forward(self, data):
        """Return the output of the second convolution for the graph `data`.

        Args:
            data: Object exposing the node features as `x` and the graph
                connectivity as `edge_index`.

        Returns:
            The raw scores produced by the second convolution; no softmax is
            applied.
        """
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.conv2(x, edge_index)

        return x
# Instantiate the GCN with 16 hidden channels; this rebinds the name `model`.
model = GCN(hidden=16)
# Bare expression: the notebook displays the module structure.
model

In [None]:
# Number of nodes selected by the training mask versus the length of the mask.
data.train_mask.sum() , len(data.train_mask)

In [None]:
# Start from a freshly initialised GCN so that training begins from scratch.
model = GCN(hidden=16)
# Cross-entropy loss on the class scores, optimised with Adam plus weight decay.
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
# Put the model in training mode before the loop.
model.train()

for epoch in range(200):
    # Clear the gradients accumulated in the previous iteration.
    optimizer.zero_grad()
    # Forward pass over the whole graph; only the training nodes enter the loss.
    out = model(data)
    loss = loss_fn(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    # Report every 10th epoch; the value is computed before this epoch's update.
    if epoch % 10 == 0:
        print(f"Epoch {epoch} | Loss {loss.item()}")
    optimizer.step()

In [None]:
# Switch to evaluation mode so that dropout is disabled.
model.eval()
# No gradients are needed for inference; skipping autograd saves memory and time.
with torch.no_grad():
    pred = model(data).argmax(dim=1)
# Accuracy is measured on the test nodes only.
correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()
acc = int(correct) / int(data.test_mask.sum())
print(f'Accuracy: {acc:.4f}')

That has improved a lot!

In [None]:
# Model output for every node, used as the input to the 2D projection.
out = model(data)
print(f"The shape of the output is {out.shape}")
# Colour each point by its true class label.
visualize(out, color=data.y)

In addition, the visualisation shows that the data is much more organized, and the classes are much more separated.