In [1]:
import os
import torch
import pandas as pd
import torch.nn.functional as F

# Colab - Pytorch Geometric installation according to Pytorch documentation
'''
os.environ['TORCH'] = torch.__version__
print(torch.__version__)
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git
'''

from torch_geometric.data import Dataset, Data
from torch_geometric.loader import DataLoader
from torch.nn import Linear
from torch_geometric.nn import GCNConv, global_mean_pool

from IPython.display import Javascript

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Local machine
%cd C:\Users\yuval\Projects\MolecularGraphs

# Colab
'''
from google.colab import drive
drive.mount('/content/drive')
%cd drive/MyDrive/MolecularGraphs/
'''

C:\Users\yuval\Projects\MolecularGraphs


"\nfrom google.colab import drive\ndrive.mount('/content/drive')\n%cd drive/MyDrive/MolecularGraphs/\n"

## Preparing the dataset:

### Defining custom Dataset:

The `Dataset` class is designed to sample batches from storage without loading all of the data into RAM.
To create our own custom dataset we need to create a class that inherits from the `Dataset` class.

In the `init` method, the arguments that are passed to `Dataset` are:
* `root` (str, optional) - The root directory where the data should be saved.
This directory is going to have `raw` directory and `processed` directory.
The `raw` directory is where you keep all the data files, one file per instance.
The `processed` directory is where the class is going to save all processed files.
The processing of files in our case is the conversion of each file into a `Data` object (including node features, edge index, label/s, and optionally edge features).
* `transform` (callable, optional) - not used - a function/transform that takes in an `Data` object and returns a transformed version. The `Data` object will be **transformed before every access**.
* `pre_transform` (callable, optional) - not used – a function/transform that takes in an `Data` object and returns a transformed version. The `Data` object will be **transformed before being saved to disk**. (default: None)
* `pre_filter` (callable, optional) - not used - a function that takes in an `Data` object and returns a boolean value, indicating whether the `Data` object should be included in the final dataset. 
* `log` (bool, optional) - whether to print any console output while downloading and processing the dataset.

Following the `init` method, we have two methods decorated as properties.
The decorator defines the method as a "getter", i.e., a way of getting an attribute of the class.
That means we can treat such a method as an attribute and access it without parentheses.
Those two properties return the file names expected inside the previously mentioned directories - `raw` and `processed`.
They are designed for the class to check whether the raw/processed files exist before performing the downloading/processing.

The `process` method is called from the `Dataset`'s `init` method when the expected processed files are not all present yet.
In this method you iterate over all of the raw files and turn each of them into a `Data` object describing a graph, including the `pre_transform` and `pre_filter` function calls.

The `len` and `get` methods are self-explanatory.


In [26]:
class MyDataset(Dataset):
    """Disk-backed graph dataset: one pickled raw file per graph, one saved `Data` object per kept graph.

    Every file in ``raw_dir`` is expected to unpickle to an ``(x, edge_index, y)``
    triple. `process` converts each triple into a `Data` object and stores it in
    ``processed_dir`` as ``data_<idx>.pt``; `get` loads one of those files back.

    Args:
        root: Root directory holding the ``raw`` and ``processed`` sub-directories.
        transform: Optional callable forwarded to `Dataset`.
        pre_transform: Optional callable applied to each `Data` object before it is saved.
        pre_filter: Optional predicate; graphs for which it returns False are not saved.
    """

    def __init__(self, root, transform=None, pre_transform=None, pre_filter=None):
        super().__init__(root, transform, pre_transform, pre_filter)

    @property
    def raw_file_names(self):
        """All file names found in ``raw_dir``.

        Sorted because `os.listdir` returns entries in arbitrary order, which
        would otherwise make the graph-index -> raw-file mapping unstable.
        """
        return sorted(os.listdir(self.raw_dir))

    @property
    def processed_file_names(self):
        """File names `process` writes when no graph is filtered out.

        These must match the ``data_<idx>.pt`` names used by `process` and `get`,
        otherwise the check for already processed files can never succeed.

        NOTE(review): when a ``pre_filter`` drops graphs, fewer files are written
        than listed here, so the processed-files check will not pass.
        """
        return [f'data_{i}.pt' for i in range(len(self.raw_file_names))]

    def process(self):
        """Convert every raw file into a `Data` object and save it to ``processed_dir``."""
        idx = 0
        for raw_path in self.raw_paths:

            # Load the two arrays and the scalar label from the saved file.
            # NOTE(security): unpickling can execute arbitrary code - only use trusted raw files.
            with open(raw_path, 'rb') as f:
                x, edge_index, y = pd.read_pickle(f)

            data_i = Data(x=torch.tensor(x),
                          edge_index=torch.tensor(edge_index),
                          y=torch.tensor([int(y)]))  # y is stored as a 1D int tensor, not a scalar

            if self.pre_filter is not None and not self.pre_filter(data_i):
                continue

            if self.pre_transform is not None:
                data_i = self.pre_transform(data_i)

            # idx only advances for graphs that are kept, so saved indices are contiguous.
            torch.save(data_i, os.path.join(self.processed_dir, f'data_{idx}.pt'))
            idx += 1

    def len(self):
        """Number of ``data_<idx>.pt`` files currently present in ``processed_dir``.

        Counting the saved files (rather than the raw files) keeps the length
        correct when a ``pre_filter`` dropped some graphs.
        NOTE(review): files left over from an earlier, larger run are counted too.
        """
        return sum(
            1
            for name in os.listdir(self.processed_dir)
            if name.startswith('data_') and name.endswith('.pt') and name[5:-3].isdigit()
        )

    def get(self, idx):
        """Load and return the processed graph saved as ``data_<idx>.pt``."""
        data_i = torch.load(os.path.join(self.processed_dir, f'data_{idx}.pt'))
        return data_i

In [27]:
# Same call for the local machine and Colab: os.path.join inserts the
# platform's separator, and avoids the invalid '\G' escape of 'Zinc\GraphData'.
dataset = MyDataset(os.path.join('Zinc', 'GraphData'))

Processing...
Done!


### Examination:

In [14]:
# Load the processed graph stored at index 10 and display its summary.
data = dataset.get(10)
data

Data(x=[19, 11], edge_index=[2, 40], y=0)

In [15]:
# Node feature matrix of the sampled graph (one row per node).
data.x

tensor([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0.,

In [16]:
# Connectivity of the sampled graph: two rows, one column per edge entry.
data.edge_index

tensor([[ 0,  1,  1,  1,  2,  3,  3,  4,  4,  5,  5,  5,  6,  6,  6,  7,  7,  8,
          8,  9,  9,  9, 10, 11, 11, 12, 12, 13, 13, 13, 14, 14, 15, 15, 16, 16,
         17, 17, 18, 18],
        [ 1,  0,  2,  3,  1,  1,  4,  3,  5,  4,  6, 13,  5,  7, 12,  6,  8,  7,
          9,  8, 10, 11,  9,  9, 12,  6, 11,  5, 14, 18, 13, 15, 14, 16, 15, 17,
         16, 18, 13, 17]])

In [17]:
# Label of the sampled graph.
data.y

tensor(0)

In [10]:
# Loader over the whole dataset: batches of 32 graphs, shuffled.
loader = DataLoader(dataset, batch_size=32, shuffle=True)

To split the data into training, validation and test sets, we use the `index_select` method which creates a subset of the dataset from specified indices idx.

In [None]:
SPLIT_SEED = 12345  # fixed so the train/val/test split is the same on every run

N = dataset.len()

# A dedicated, seeded generator makes the permutation reproducible without
# touching the global RNG state.
split_rng = torch.Generator().manual_seed(SPLIT_SEED)
idx = torch.randperm(N, generator=split_rng)  # Random permutation of integers from 0 to N - 1

# 80% train / 10% validation / 10% test
train_end, val_end = int(0.8 * N), int(0.9 * N)
idx_train, idx_val, idx_test = idx[:train_end], idx[train_end:val_end], idx[val_end:]

train_dataset = dataset.index_select(idx_train)
val_dataset = dataset.index_select(idx_val)
test_dataset = dataset.index_select(idx_test)

In [None]:
# Total number of graphs in the dataset.
dataset.len()

Now we define the `DataLoader`s.
Note that when the `shuffle` parameter is set to True, the data is reshuffled at every epoch.
We do not want that for the validation and test sets.

In [None]:
# One loader per split, 64 graphs per batch; only the training split is shuffled.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [None]:
# Sanity check: walk through the training loader and print every batch.
# NOTE(review): this prints one entry per batch, so the output grows with the dataset size.
for step, data in enumerate(train_loader):
    print(f'Step {step + 1}:')
    print('=======')
    print(f'Number of graphs in the current batch: {data.num_graphs}')
    print(data)
    print()

## Training:

In [29]:
class GCN(torch.nn.Module):
    """Graph classifier: three `GCNConv` layers, mean pooling over nodes, dropout, linear head.

    Args:
        hidden_channels: Width of every graph-convolution layer.
        in_channels: Number of input node features. Defaults to
            ``dataset.num_node_features`` of the notebook-level ``dataset``.
        num_classes: Number of output classes. Defaults to
            ``dataset.num_classes`` of the notebook-level ``dataset``.
    """

    def __init__(self, hidden_channels, in_channels=None, num_classes=None):
        super().__init__()
        # Fixed seed so the layers are initialised identically on every construction.
        torch.manual_seed(12345)

        # Fall back to the notebook-level dataset only when sizes are not given,
        # so the model can also be built without that global.
        if in_channels is None:
            in_channels = dataset.num_node_features
        if num_classes is None:
            num_classes = dataset.num_classes

        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels, num_classes)

    def forward(self, x, edge_index, batch):
        """Return one row of class scores per graph in the batch.

        Args:
            x: Node features of all graphs in the batch.
            edge_index: Connectivity of all graphs in the batch.
            batch: Assignment of each node to its graph, used by the pooling step.
        """
        # 1. Obtain node embeddings (no activation after the last convolution)
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)
        x = x.relu()
        x = self.conv3(x, edge_index)

        # 2. Readout layer
        x = global_mean_pool(x, batch)  # [batch_size, hidden_channels]

        # 3. Apply a final classifier (dropout is active only in training mode)
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.lin(x)

        return x

# Build the network with 64 hidden channels and print its layer summary.
model = GCN(hidden_channels=64)
print(model)

GCN(
  (conv1): GCNConv(11, 64)
  (conv2): GCNConv(64, 64)
  (conv3): GCNConv(64, 64)
  (lin): Linear(in_features=64, out_features=2, bias=True)
)


In [None]:
def train(criterion, optimizer, train_loader, net=None):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        criterion: Loss callable, invoked as ``criterion(out, data.y)``.
        optimizer: Optimizer that updates the network's parameters.
        train_loader: Iterable of batches exposing ``x``, ``edge_index``,
            ``batch``, ``y`` and ``to(device)``.
        net: Network to train. Defaults to the notebook-level ``model``.

    Returns:
        float: Average of the batch losses over the epoch, or 0.0 when the
        loader yields no batch.
    """
    net = model if net is None else net
    net.train()

    # Batches are moved to wherever the network's weights live, so the loader
    # itself can keep serving CPU data.
    first_param = next(net.parameters(), None)
    device = first_param.device if first_param is not None else None

    loss_sum = 0.0
    num_batches = 0
    for data in train_loader:  # Iterate in batches over the training dataset.
        if device is not None:
            data = data.to(device)
        optimizer.zero_grad()  # Clear gradients left from the previous step.
        out = net(data.x, data.edge_index, data.batch)  # Perform a single forward pass.
        loss = criterion(out, data.y)  # Compute the loss.
        loss.backward()  # Derive gradients.
        optimizer.step()  # Update parameters based on gradients.
        loss_sum += loss.item()  # .item() keeps the running sum out of the autograd graph.
        num_batches += 1

    return loss_sum / num_batches if num_batches else 0.0

def eval(criterion, loader, net=None):
    """Evaluate the network on every batch of ``loader``.

    Args:
        criterion: Loss callable, invoked as ``criterion(out, data.y)``.
            Assumed to return the mean loss over the batch (the default
            reduction of ``torch.nn.CrossEntropyLoss``).
        loader: Iterable of batches exposing ``x``, ``edge_index``, ``batch``,
            ``y`` and ``to(device)``.
        net: Network to evaluate. Defaults to the notebook-level ``model``.

    Returns:
        tuple[float, float]: ``(accuracy, mean loss per graph)`` over all
        graphs seen, or ``(0.0, 0.0)`` when the loader yields no graph.
    """
    net = model if net is None else net
    net.eval()

    # Batches are moved to wherever the network's weights live.
    first_param = next(net.parameters(), None)
    device = first_param.device if first_param is not None else None

    loss_sum = 0.0
    correct = 0
    seen = 0

    with torch.no_grad():  # Evaluation needs no gradients; avoids building autograd graphs.
        for data in loader:  # Iterate in batches over the training/test dataset.
            if device is not None:
                data = data.to(device)
            out = net(data.x, data.edge_index, data.batch)

            # One label per graph, so the label count is the batch's graph count.
            n_graphs = int(data.y.size(0))

            # The criterion returns a batch mean; weight it by the batch size so
            # the final division by the number of graphs yields a true mean.
            loss_sum += criterion(out, data.y).item() * n_graphs

            # Compute accuracy
            pred = out.argmax(dim=1)  # Use the class with highest probability.
            correct += int((pred == data.y).sum())  # Check against ground-truth labels.
            seen += n_graphs

    if seen == 0:
        return 0.0, 0.0

    acc = correct / seen  # Derive ratio of correct predictions.
    loss_mean = loss_sum / seen
    return acc, loss_mean

In [None]:
# Cap the height of this cell's output area.
# NOTE(review): relies on the `google.colab` JavaScript API - presumably a no-op outside Colab.
display(Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 300})'''))

NUM_EPOCHS = 170
LEARNING_RATE = 0.01
BATCH_SIZE = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = GCN(hidden_channels=64).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = torch.nn.CrossEntropyLoss()

# The subsets stay disk-backed: `torch_geometric.data.Dataset` offers no `.to()`,
# so each batch has to be moved to `device` at the point where it is consumed.
train_dataset = dataset.index_select(idx_train)
val_dataset = dataset.index_select(idx_val)
test_dataset = dataset.index_select(idx_test)

# Only the training split is reshuffled between epochs.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

for epoch in range(1, NUM_EPOCHS + 1):
    train(criterion, optimizer, train_loader)
    # Re-evaluate on the training split in eval mode (dropout disabled) for a fair comparison.
    train_acc, train_loss = eval(criterion, train_loader)
    val_acc, val_loss = eval(criterion, val_loader)
    print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}')