# HW4: Graph Neural Networks

* There are two datasets in the `data` folder: `train.pt`, `test.pt`. You will train a GCNN on the train dataset, then make predictions on the test dataset.
* There are two parts in this notebook. `Part I` provides a custom `Dataset` class and loads the datasets. The `QM_Dataset` class inherits from the PyTorch Geometric `Dataset` class. `Part II` is an example solution.
* This HW is implemented with [Pytorch Geometric (PyG)](https://pytorch-geometric.readthedocs.io/en/latest/index.html). Another popular library for implementing GNNs is [Deep Graph Library (DGL)](https://www.dgl.ai/)

In [None]:
!pip install torch_geometric

In [2]:
import pandas as pd
import torch
import torch.nn.functional as F
from torch.nn import Linear, ReLU, Sequential
from torch_geometric.data import Dataset
from torch_geometric.loader import DataLoader
from sklearn.metrics import mean_absolute_error
import torch_geometric.nn as pyg_nn
from torch_geometric.nn import TransformerConv

In [3]:
# only on google colab

# Mount Google Drive when running on Colab. Outside Colab the google.colab
# package cannot be imported, so skip the mount instead of aborting the notebook.
try:
    from google.colab import drive
except ImportError:
    IN_COLAB = False
else:
    IN_COLAB = True
    drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Part I. The training and testing datasets are provided

- The train and test datasets consist of pre-processed graphs. The train dataset contains 20,000 graphs, while the test dataset contains 2,000 graphs.
- Each graph contains the following components:

    - `x`, the matrix containing node features, `[num_of_nodes, num_node_features=11]`
    - `edge_index`, the matrix containing connection information about different nodes, `[2, num_of_edges]`
    - `y`, the label for the graph, a `scalar`. The value is set to `0` in the test dataset
    - `pos`, the matrix containing the node positions, `[num_of_nodes, 3]`
    - `edge_attr`, the matrix containing the edge information, `[num_edges, 4]`
    - `name`, the identifier of the graph. For example, `gdb_59377`

- Depending on the graph convolutional layer that is used, different components are needed. For the most basic application, `x`, `edge_index` and `y` will be used.


In [4]:
class QM_Dataset(Dataset):
    """Dataset that serves items from one file loaded fully into memory.

    The file at ``path`` is read once with ``torch.load`` at construction time
    and kept in ``self.data``; ``len`` and ``get`` index into that object.
    """

    def __init__(self, path):
        """Initialise the parent dataset with root "." and load ``path``."""
        super().__init__(root=".")
        # NOTE(review): torch.load unpickles the file -- only load trusted data.
        loaded = torch.load(path)
        self.data = loaded

    def len(self):
        """Return the number of items held in ``self.data``."""
        count = len(self.data)
        return count

    def get(self, idx):
        """Return the item stored at position ``idx``."""
        item = self.data[idx]
        return item

train_path = "/content/drive/MyDrive/Colab Notebooks/train_path"
test_path = "/content/drive/MyDrive/Colab Notebooks/test_path"

VAL_SIZE = 1000    # number of training graphs held out for validation
SPLIT_SEED = 0     # fixed seed so the train/validation split is identical on every run

train_data_ = QM_Dataset(train_path)

# train dataset can be split for validation purposes
# The training size is derived from the loaded dataset (19000/1000 for a
# 20,000-graph file) so the split does not break if the file size differs.
train_data, validate_data = torch.utils.data.random_split(
    train_data_,
    [len(train_data_) - VAL_SIZE, VAL_SIZE],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)
test_data = QM_Dataset(test_path)

## Part II. Example solution

In [5]:
# define the network
# many convolutional layers are available in torch_geometric.nn
# here TAGConv is just used as an example

from torch_geometric.nn import NNConv, Set2Set, GCNConv, SGConv, TAGConv

class Net(torch.nn.Module):
    """Graph-level regressor: linear node embedding, TAGConv, Set2Set readout, MLP head.

    Args:
        num_features: size of each input node feature vector.
        dim: hidden width shared by the embedding, convolution and readout.
    """

    def __init__(self, num_features=11, dim=64):
        super().__init__()
        self.lin0 = torch.nn.Linear(num_features, dim)
        self.conv = TAGConv(dim, dim, K=6)      # replace with your own convolutional layers here
        self.set2set = Set2Set(dim, processing_steps=3)    # set2set is used to map from nodes to graphs
        self.lin1 = torch.nn.Linear(2 * dim, dim)
        self.lin2 = torch.nn.Linear(dim, 1)

    def forward(self, data):
        """Return a flat tensor of predictions for the batch in ``data``.

        Reads ``data.x``, ``data.edge_index`` and ``data.batch``.
        """
        out = F.relu(self.lin0(data.x))                    #data.x size [batch_num_nodes, num_node_features]
        # The same conv layer (shared weights) is applied three times in a row.
        for _ in range(3):
            out = F.relu(self.conv(out, data.edge_index))
        out = self.set2set(out, data.batch)                #[batch_num_nodes, dim] ==> [batch_num_graphs, dim*2]
        out = F.relu(self.lin1(out))
        out = self.lin2(out)
        # Flatten the lin2 output to a 1-D tensor.
        return out.view(-1)

In [6]:
# define training and evaluation functions
def train(model, loader, optimizer, criterion, device):
    """Train the model for one full pass (epoch) over ``loader``.

    Args:
        model: module called as ``model(batch)``.
        loader: iterable of batches exposing ``to(device)``, ``y`` and
            ``num_graphs``; ``loader.dataset`` must support ``len``.
        optimizer: optimizer updating the model parameters.
        criterion: loss called as ``criterion(output, target)``.
        device: device each batch is moved to.

    Returns:
        The sum of per-batch losses weighted by ``num_graphs``, divided by
        ``len(loader.dataset)``.
    """
    model.train()
    total_loss = 0
    for data in loader:
        data = data.to(device)
        optimizer.zero_grad()       # Reset gradients
        output = model(data)        # Forward pass
        # data (and so data.y) is already on `device`. Align the target with
        # the output shape when they hold the same number of elements, so
        # e.g. [B, 1] targets cannot silently broadcast against [B] outputs.
        target = data.y
        if target.shape != output.shape and target.numel() == output.numel():
            target = target.reshape(output.shape)
        loss = criterion(output, target)  # Compute the loss
        loss.backward()             # Backpropagate the gradients
        optimizer.step()            # Update the weights
        total_loss += loss.item() * data.num_graphs  # Update total loss
    return total_loss / len(loader.dataset)

def eval(model, loader, criterion, device):
    """Evaluate the model over ``loader`` without computing gradients.

    Args:
        model: module called as ``model(batch)``.
        loader: iterable of batches exposing ``to(device)``, ``y`` and
            ``num_graphs``; ``loader.dataset`` must support ``len``.
        criterion: loss called as ``criterion(output, target)``.
        device: device each batch is moved to.

    Returns:
        The sum of per-batch losses weighted by ``num_graphs``, divided by
        ``len(loader.dataset)`` (the MAE when ``criterion`` is an L1 loss).
    """
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for data in loader:
            data = data.to(device)
            output = model(data)
            # data (and so data.y) is already on `device`. Align the target
            # with the output shape when they hold the same number of
            # elements, so [B, 1] targets cannot broadcast against [B] outputs.
            target = data.y
            if target.shape != output.shape and target.numel() == output.numel():
                target = target.reshape(output.shape)
            loss = criterion(output, target)
            total_loss += loss.item() * data.num_graphs
    return total_loss / len(loader.dataset)

In [7]:
# load the datasets
# Only the training loader is shuffled, so each epoch visits the graphs in a
# new order; validation and test order stay fixed.
train_loader = DataLoader(train_data, batch_size=128, shuffle=True)
validate_loader = DataLoader(validate_data, batch_size=128)
test_loader = DataLoader(test_data, batch_size=8)

# Use the first GPU when available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = Net(num_features=11, dim=64).to(device)
optimizer = torch.optim.Adamax(model.parameters(), lr=0.005)
criterion = torch.nn.L1Loss()

In [8]:
# Show the selected compute device.
print(device)

cuda:0


In [None]:
# training
num_epochs = 200
# range() excludes its stop value, so stop at num_epochs + 1 to run exactly
# num_epochs epochs (numbered 1..num_epochs).
for epoch in range(1, num_epochs + 1):
    # Calculate the training loss and the validation MAE for this epoch
    train_loss = train(model, train_loader, optimizer, criterion, device)
    val_loss = eval(model, validate_loader, criterion, device)
    print(f"Epoch: {epoch:02d}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

In [10]:
# predict
# Collect one prediction and one graph name per test graph, batch by batch.
Idx, y_pred = [], []

model.eval()
with torch.no_grad():
    for data in test_loader:
        data = data.to(device)
        output = model(data)
        Idx.extend(data.name)
        y_pred.extend(output.cpu().numpy())

# Every prediction must be paired with exactly one name.
assert len(y_pred) == len(Idx)
df = pd.DataFrame({"Idx": Idx, "labels": y_pred})

In [11]:
# upload solution
# Set the column names explicitly, then write the frame without its row index.
df.columns = ["Idx", "labels"]
df.to_csv(path_or_buf="template", index=False)