# Zinc Tutorial 1 - 200 Dalton Dataset

## Preliminaries:

In [14]:
import os
import torch
import pandas as pd
import torch.nn.functional as F

# Colab - Pytorch Geometric installation according to Pytorch documentation

#os.environ['TORCH'] = torch.__version__
#print(torch.__version__)
#!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
#!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
#!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git


from torch_geometric.data import Dataset, Data
from torch_geometric.loader import DataLoader
from torch.nn import Linear
from torch_geometric.nn import GCNConv, global_mean_pool

from IPython.display import Javascript

In [15]:
# Local machine
# NOTE(review): absolute, user-specific path. Relative paths used later in the
# notebook (e.g. 'Zinc_200D/GraphData') are resolved from this directory, so
# point it at your own project folder before running.
%cd C:\Users\yuval\Projects\MolecularGraphs

# Colab

#from google.colab import drive
#drive.mount('/content/drive')
#%cd drive/MyDrive/MolecularGraphs/


C:\Users\yuval\Projects\MolecularGraphs


## Preparing the dataset:

### Defining custom Dataset:

The `Dataset` class is designed to sample batches from storage without loading all of the data into RAM.
To define our own custom dataset we need to create a class that inherits from the `Dataset` class.

In the `__init__` method, the arguments that are passed to `Dataset` are:
* `root` (str, optional) - The root directory where the data should be saved.
This directory is going to have a `raw` directory and a `processed` directory.
The `raw` directory is where you keep all the raw data files, one file per instance.
The `processed` directory is where the class is going to save all processed files.
In our case, processing a file means converting it into a `Data` object (including node features, edge index, label/s, and optionally edge features).
* `transform` (callable, optional) - not used - a function/transform that takes in a `Data` object and returns a transformed version. The `Data` object will be **transformed before every access**.
* `pre_transform` (callable, optional) - not used - a function/transform that takes in a `Data` object and returns a transformed version. The `Data` object will be **transformed before being saved to disk**. (default: None)
* `pre_filter` (callable, optional) - not used - a function that takes in a `Data` object and returns a boolean value, indicating whether the `Data` object should be included in the final dataset.
* `log` (bool, optional) - whether to print any console output while downloading and processing the dataset.

Following the `__init__` method, we have two methods decorated with `@property`.
The decorator defines the method as a "getter", i.e., a way of reading an attribute of the class.
That means we can treat such a method as an attribute and access it without parentheses.
These two properties return the names of all files inside the previously mentioned directories - `raw` and `processed`.
The class uses them to check whether the raw/processed files exist before performing the downloading/processing.

The `process` method is called from the `Dataset`'s `__init__` method whenever the files listed in `processed_file_names` are not all found in the `processed` directory.
In this method you iterate over all of the raw files and turn each of them into a `Data` object representing a graph, including the calls to the `pre_filter` and `pre_transform` functions.

The `len` and `get` methods are self-explanatory.


In [16]:
class MyDataset(Dataset):
    """Graph dataset stored on disk as one processed `Data` file per raw pickle.

    Every file in `raw_dir` is expected to unpickle to a triple
    `(x, edge_index, y)`. `process` converts each triple into a `Data` object
    saved as `processed_dir/data_{idx}.pt`, and `get` loads a single graph back
    on demand.

    Args:
        root: Root directory handed to `Dataset`.
        transform: Optional callable forwarded to `Dataset`.
        pre_transform: Optional callable applied to each `Data` object in
            `process` before it is saved.
        pre_filter: Optional predicate; graphs for which it returns a falsy
            value are skipped in `process`.
    """

    def __init__(self, root, transform=None, pre_transform=None, pre_filter=None):
        super().__init__(root, transform, pre_transform, pre_filter)

    @property
    def raw_file_names(self):
        """Names of all files in `raw_dir`.

        Sorted because `os.listdir` returns entries in arbitrary order; without
        it the raw-file -> `data_{idx}.pt` mapping could change between runs.
        """
        return sorted(os.listdir(self.raw_dir))

    @property
    def processed_file_names(self):
        """Names of the files expected in `processed_dir`.

        The graph files are named exactly as `process` writes them and `get`
        reads them (`data_{idx}.pt`), one per raw file, followed by the two
        bookkeeping files `pre_filter.pt` and `pre_transform.pt`.

        NOTE: if a `pre_filter` drops graphs, fewer files are written than are
        listed here, so this list (and therefore `len`) over-counts.
        """
        data_file_names = [f'data_{i}.pt' for i in range(len(self.raw_file_names))]
        return data_file_names + ['pre_filter.pt', 'pre_transform.pt']

    @property
    def num_classes(self):
        """Number of target classes (fixed at 2)."""
        return 2

    def process(self):
        """Convert every raw pickle into a `Data` object saved as `data_{idx}.pt`."""
        idx = 0
        for raw_path in self.raw_paths:

            # Each raw file holds (node features, edge index, scalar label).
            # NOTE: unpickling can execute arbitrary code - only process raw
            # files that come from a trusted source.
            with open(raw_path, 'rb') as f:
                x, edge_index, y = pd.read_pickle(f)

            data_i = Data(x=torch.tensor(x, dtype=torch.float),
                          edge_index=torch.tensor(edge_index, dtype=torch.long),
                          y=torch.tensor([int(y)], dtype=torch.long))  # y as a 1D int tensor, not a scalar

            if self.pre_filter is not None and not self.pre_filter(data_i):
                continue

            if self.pre_transform is not None:
                data_i = self.pre_transform(data_i)

            # idx only advances for graphs that are kept, so saved files are
            # numbered contiguously from 0.
            torch.save(data_i, os.path.join(self.processed_dir, f'data_{idx}.pt'))
            idx += 1

    def len(self):
        """Number of graphs: the processed file list minus the two bookkeeping files."""
        return len(self.processed_file_names) - 2 # minus the pre_filter and pre_transform

    def get(self, idx):
        """Load and return the graph saved as `data_{idx}.pt`."""
        data_i = torch.load(os.path.join(self.processed_dir, f'data_{idx}.pt'))
        return data_i

In [17]:
# Build the dataset rooted at 'Zinc_200D/GraphData' (relative to the current working directory).
dataset = MyDataset('Zinc_200D/GraphData')

### Examination:

In [5]:
# Load the graph saved as data_10.pt by calling `get` directly, then display its summary.
# NOTE(review): presumably `dataset[10]` is the usual access path and would also apply `transform`; none is set here - confirm.
data = dataset.get(10)
data

Data(x=[19, 11], edge_index=[2, 40], y=[1])

In [6]:
data.x

tensor([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0.,

In [7]:
data.edge_index

tensor([[ 0,  1,  1,  1,  2,  3,  3,  4,  4,  5,  5,  5,  6,  6,  6,  7,  7,  8,
          8,  9,  9,  9, 10, 11, 11, 12, 12, 13, 13, 13, 14, 14, 15, 15, 16, 16,
         17, 17, 18, 18],
        [ 1,  0,  2,  3,  1,  1,  4,  3,  5,  4,  6, 13,  5,  7, 12,  6,  8,  7,
          9,  8, 10, 11,  9,  9, 12,  6, 11,  5, 14, 18, 13, 15, 14, 16, 15, 17,
         16, 18, 13, 17]])

In [8]:
data.y

tensor([0])

In [9]:
# Iterate over the whole dataset in shuffled mini-batches of 32 graphs.
loader = DataLoader(dataset, batch_size=32, shuffle=True)

In [10]:
data = next(iter(loader))

In [12]:
type(data)

torch_geometric.data.batch.DataBatch

In [14]:
# Print the size of every mini-batch produced by one pass over the loader.
# NOTE(review): assumes len() of a batch is its number of graphs (the explicit
# attribute is `data.num_graphs`) - confirm for the installed PyG version.
for data in loader:
    print(len(data))

32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
24


To split the data into training, validation and test sets, we use the `index_select` method, which creates a subset of the dataset from the specified indices `idx`.

In [None]:
N = dataset.len()
# Dedicated, seeded generator: the 80/10/10 split is then identical on every
# run and drawing it does not consume the global RNG stream.
split_generator = torch.Generator().manual_seed(12345)
idx = torch.randperm(N, generator=split_generator) # Random permutation of integers from 0 to N - 1
idx_train, idx_val, idx_test = idx[:int(0.8 * N)], idx[int(0.8 * N): int(0.9 * N)], idx[int(0.9 * N):]

train_dataset = dataset.index_select(idx_train)
val_dataset = dataset.index_select(idx_val)
test_dataset = dataset.index_select(idx_test)

In [None]:
dataset.len()

Now we define the `DataLoader`s.
Note that if the `shuffle` parameter is set to True, the data will be reshuffled at every epoch.
We do not want that for the validation and test sets.

In [None]:
# One loader per split, 64 graphs per batch. Only the training loader shuffles;
# validation and test keep a fixed order.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [None]:
# Walk once through the training loader and print, for each mini-batch, its
# 1-based step number, how many graphs it contains, and its summary.
for step, data in enumerate(train_loader):
    print(f'Step {step + 1}:')
    print('=======')
    print(f'Number of graphs in the current batch: {data.num_graphs}')
    print(data)
    print()

## Training:

In [18]:
class GCN(torch.nn.Module):
    """Graph classifier: four GCN layers, mean-pool readout, dropout, linear head.

    Args:
        hidden_channels: Width of every GCN layer.
        dropout_p: Dropout probability applied to the pooled graph embedding.
        num_node_features: Input feature size per node. Defaults to
            `dataset.num_node_features` of the notebook-level `dataset`.
        num_classes: Number of output logits. Defaults to
            `dataset.num_classes` of the notebook-level `dataset`.
    """

    def __init__(self, hidden_channels, dropout_p, num_node_features=None, num_classes=None):
        super().__init__()
        # Fixed seed so the layer weights are initialised identically each time.
        # Note this reseeds the global torch RNG as a side effect.
        torch.manual_seed(12345)

        # Fall back to the notebook-level dataset only when sizes are not given,
        # so the model can also be built for other datasets.
        if num_node_features is None:
            num_node_features = dataset.num_node_features
        if num_classes is None:
            num_classes = dataset.num_classes

        self.dropout_p = dropout_p
        self.conv1 = GCNConv(num_node_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        self.conv4 = GCNConv(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels, num_classes)

    def forward(self, x, edge_index, batch):
        """Return one row of class logits per graph in the batch.

        Args:
            x: Node feature matrix.
            edge_index: Edge index of the batched graphs.
            batch: Per-node graph assignment passed to `global_mean_pool`.
        """
        # 1. Obtain node embeddings (ReLU after every layer except the last)
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)
        x = x.relu()
        x = self.conv3(x, edge_index)
        x = x.relu()
        x = self.conv4(x, edge_index)

        # 2. Readout layer
        x = global_mean_pool(x, batch)  # [batch_size, hidden_channels]

        # 3. Apply a final classifier (dropout is active only in training mode)
        x = F.dropout(x, p=self.dropout_p, training=self.training)
        x = self.lin(x)

        return x

In [19]:
def progress_bar(iteration, total, size=30):
    """Render a text progress bar such as ' 5/10 [====>.....]'.

    Args:
        iteration: Number of steps completed so far.
        total: Total number of steps.
        size: Number of characters between the square brackets.

    Returns:
        The formatted string, with the iteration counter right-aligned to the
        width of `total`.
    """
    filled = (size - 1) * iteration // total
    # The head is an arrow while running and becomes '=' once finished.
    head = ">" if iteration < total else "="
    bar = "=" * filled + head + "." * (size - filled - 1)
    counter_width = len(str(total))
    return f"{iteration:-{counter_width}d}/{total} [{bar}]"

def print_status_bar(iteration, total, metrics):
    """Print the progress bar followed by the current metrics on one line.

    The line starts with a carriage return so successive calls overwrite each
    other; a newline is emitted only once `iteration` reaches `total`.

    Args:
        iteration: Number of steps completed so far.
        total: Total number of steps.
        metrics: Mapping of metric name to value; each value is shown with
            four decimals.
    """
    formatted = [f"{name}: {value:.4f}" for name, value in metrics.items()]
    line = "\r" + progress_bar(iteration, total) + " - " + " - ".join(formatted)
    if iteration < total:
        print(line, end="")
    else:
        print(line, end="\n")

def train(model, criterion, optimizer, loader, device, frac_batches=1):
    """Run one training pass over (a fraction of) `loader`, updating `model` in place.

    Args:
        model: Module called as `model(data.x, data.edge_index, data.batch)`.
        criterion: Loss function called as `criterion(out, data.y)`.
        optimizer: Optimizer stepping the parameters of `model`.
        loader: Iterable of batches; must support `len()`.
        device: Device every batch is moved to.
        frac_batches: Fraction of the loader's batches to use; the resulting
            batch count is truncated toward zero.

    Returns:
        None. Progress and the running mean loss are printed as a status bar.
    """
    model.train()
    total_batch_num = int(len(loader) * frac_batches) # total number of batches used in this pass

    # Running metrics, kept as plain Python numbers: adding the loss *tensor*
    # would keep its autograd history referenced for every batch of the epoch.
    weighted_loss_sum = 0.0
    total_instances_num = 0

    # Iterate in batches over the training dataset. data is a DataBatch object
    for current_batch_i, data in enumerate(loader):
        if current_batch_i + 1 > total_batch_num:
            break

        data = data.to(device)
        out = model(data.x, data.edge_index, data.batch)  # Perform a single forward pass.
        loss = criterion(out, data.y)  # Compute the loss.
        loss.backward()  # Derive gradients.
        optimizer.step()  # Update parameters based on gradients.
        optimizer.zero_grad()  # Clear gradients.

        # printing progress
        # num_graphs is the explicit graph count; what len() of a batch means
        # has differed between PyG versions.
        batch_graphs = data.num_graphs
        weighted_loss_sum += batch_graphs * loss.item()
        total_instances_num += batch_graphs
        metrics = {'loss': weighted_loss_sum / total_instances_num} # weighted mean with respect to batches
        print_status_bar(current_batch_i + 1, total_batch_num, metrics)
         

@torch.no_grad()
def eval(model, criterion, loader, device):
    """Evaluate `model` on every batch of `loader` without tracking gradients.

    Note: this function shadows Python's built-in `eval` in the notebook.

    Args:
        model: Module called as `model(data.x, data.edge_index, data.batch)`.
        criterion: Loss function called as `criterion(out, data.y)`.
        loader: Iterable of batches.
        device: Device every batch is moved to.

    Returns:
        (acc, loss_mean): accuracy as a Python float in [0, 1], and the mean
        loss weighted by batch size (same type as `criterion` returns).

    Raises:
        ZeroDivisionError: If `loader` yields no graphs.
    """
    model.eval()

    # running metrics
    weighted_loss_sum = 0
    total_instances_num = 0
    correct_sum = 0

    # Iterate in batches over the training/test dataset. Each `data` is one mini-batch of graphs.
    for data in loader:

        data = data.to(device)
        out = model(data.x, data.edge_index, data.batch)

        # num_graphs is the explicit graph count; what len() of a batch means
        # has differed between PyG versions, which would skew both metrics.
        batch_graphs = data.num_graphs

        # Compute the loss sum
        weighted_loss_sum += batch_graphs * criterion(out, data.y)

        # Compute accuracy
        pred = out.argmax(dim=1)  # Use the class with the highest score.
        correct_sum += int((pred == data.y).sum())  # number of correct predictions in this batch
        total_instances_num += batch_graphs

    acc = correct_sum / total_instances_num  # number of correct predictions divided by the number of instances
    loss_mean = weighted_loss_sum / total_instances_num # weighted mean with respect to batches
    return acc, loss_mean

In [20]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyperparameters
lr = 0.008
batch_size = 32
hidden_channels=64
dropout_p = 0.15

# 80/10/10 train/val/test split. The permutation is drawn from a dedicated,
# seeded generator so the split is identical on every run; without it, which
# graphs land in the validation/test sets changes whenever this cell is re-run.
split_seed = 12345
N = dataset.len()
idx = torch.randperm(N, generator=torch.Generator().manual_seed(split_seed)) # Random permutation of integers from 0 to N - 1
idx_train, idx_val, idx_test = idx[:int(0.8 * N)], idx[int(0.8 * N): int(0.9 * N)], idx[int(0.9 * N):]

model = GCN(hidden_channels=hidden_channels, dropout_p=dropout_p).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
criterion = torch.nn.CrossEntropyLoss()

train_dataset = dataset.index_select(idx_train)
val_dataset = dataset.index_select(idx_val)
test_dataset = dataset.index_select(idx_test)

# Only the training loader reshuffles each epoch; evaluation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [73]:
# One row per epoch: [train_acc, train_loss, val_acc, val_loss]
training_metrics = []
for epoch in range(1, 25):
    print(f'Epoch: {epoch}')
    # `model` must be the first argument: train(model, criterion, optimizer, loader, device).
    train(model, criterion, optimizer, train_loader, device)

    # Re-evaluate both splits in eval mode after the epoch's parameter updates.
    train_acc, train_loss = eval(model, criterion, train_loader, device)
    val_acc, val_loss = eval(model, criterion, val_loader, device)
    training_metrics.append([train_acc, train_loss, val_acc, val_loss])
    print(f'Train Acc: {train_acc:.4f}, Train loss: {train_loss:.4f}\n Val Acc: {val_acc:.4f}, Val loss: {val_loss:.4f}')

Epoch: 1
Train Acc: 28.3243, Train loss: 0.2396
 Val Acc: 25.0000 Val loss: 0.2750
Epoch: 2
Train Acc: 28.9595, Train loss: 0.2263
 Val Acc: 25.8000 Val loss: 0.2552
Epoch: 3
Train Acc: 28.9324, Train loss: 0.2329
 Val Acc: 26.1000 Val loss: 0.2608
Epoch: 4
Train Acc: 28.9054, Train loss: 0.2368
 Val Acc: 26.2000 Val loss: 0.2633
Epoch: 5
Train Acc: 29.0270, Train loss: 0.2304
 Val Acc: 26.2000 Val loss: 0.2663
Epoch: 6
Train Acc: 29.1757, Train loss: 0.2144
 Val Acc: 26.5000 Val loss: 0.2395
Epoch: 7
Train Acc: 29.1351, Train loss: 0.2145
 Val Acc: 26.3000 Val loss: 0.2407
Epoch: 8
Train Acc: 29.2568, Train loss: 0.2225
 Val Acc: 26.5000 Val loss: 0.2314
Epoch: 9
Train Acc: 28.8108, Train loss: 0.2309
 Val Acc: 25.8000 Val loss: 0.2636
Epoch: 10
Train Acc: 29.0000, Train loss: 0.2159
 Val Acc: 26.2000 Val loss: 0.2494
Epoch: 11
Train Acc: 29.0811, Train loss: 0.2120
 Val Acc: 26.3000 Val loss: 0.2358
Epoch: 12
Train Acc: 28.9595, Train loss: 0.2272
 Val Acc: 26.4000 Val loss: 0.2560
E