# Benchmarking MolGraph

### Paper

This notebook aims to reproduce, with an up-to-date version of MolGraph, the benchmarking results reported in the paper [MolGraph: a Python package for the implementation of molecular graphs and graph neural networks with TensorFlow and Keras](https://doi.org/10.48550/arXiv.2208.09944), which also serves as an introduction to MolGraph.

### Dataset
The datasets used for benchmarking are mostly datasets from [MoleculeNet](https://moleculenet.org/datasets-1).

### Requirements
The latest benchmarking was performed with:

- Ubuntu 22.04
    - Python 3.10 
        - Keras 2.15
        - TensorFlow 2.15
        - MolGraph 0.7.8

(Python 3.10 and TensorFlow/Keras 2.15 are currently required.)

### Installation

For CPU users, install MolGraph as follows:

```
pip install molgraph
```
For GPU support:
```
pip install molgraph[gpu]
```

(TensorFlow, Keras, and Pandas are installed automatically.)


## Import modules

In [None]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

from pathlib import Path

import tensorflow as tf
from tensorflow import keras
from molgraph import layers
from molgraph import chemistry

## Select GNN layer

In [None]:
# Graph neural network layer class to benchmark. It is instantiated three
# times (with GNN_KWARGS) when the model is built further down.
GNNConv = layers.GATConv

## Select dataset 

In [None]:
# Name of the benchmark dataset; also used as a sub-directory name for the
# tf-record cache.
dataset_name = 'lipophilicity'
# Configuration looked up by dataset name. Later cells read its 'num_tasks',
# 'task_type', 'loss' and 'metric' entries.
dataset_config = getattr(chemistry.benchmark.configs, dataset_name)
# The dataset itself. It is iterated below as (subset name, subset) pairs, and
# the 'train', 'validation' and 'test' subsets are used.
dataset = chemistry.datasets.get(dataset_name)

## Specify hyperparameters

In [None]:
# Root directory; the tf-record cache is created in a 'cache' folder below it.
PATH = Path('./')

# --- Training loop -----------------------------------------------------------
# Upper bound on the number of epochs (early stopping may end training sooner)
# and the number of examples per batch.
NUM_EPOCHS = 300
BATCH_SIZE = 32

# --- Learning-rate schedule --------------------------------------------------
# Initial learning rate, lower bound for the learning rate, number of epochs
# without improvement before the rate is reduced, and the reduction factor.
# Early stopping uses twice LR_PATIENCE as its own patience.
LR_INITIAL = 1e-4
LR_END = 1e-6
LR_PATIENCE = 10
LR_DECAY = 0.1

# --- Layer configuration -----------------------------------------------------
# Keyword arguments shared by every GNN layer of the model.
GNN_KWARGS = {
    'units': 128,
    'normalization': 'batch_norm',
    # Optional: uncomment to override the kernel initializer.
    # 'kernel_initializer': keras.initializers.TruncatedNormal(stddev=0.005),
}

# Keyword arguments shared by the hidden dense layers that follow the readout.
DNN_KWARGS = {'units': 1024, 'activation': 'relu'}

## Specify molecular graph encoder

In [None]:
# Per-atom (node) features, in the order they are handed to the featurizer.
atom_features = [
    chemistry.features.Symbol(),
    chemistry.features.Hybridization(),
    chemistry.features.FormalCharge(),
    chemistry.features.TotalNumHs(),
    chemistry.features.TotalValence(),
    chemistry.features.NumRadicalElectrons(),
    chemistry.features.Degree(),
    chemistry.features.ChiralCenter(),
    chemistry.features.Aromatic(),
    chemistry.features.Ring(),
    chemistry.features.Hetero(),
    chemistry.features.HydrogenDonor(),
    chemistry.features.HydrogenAcceptor(),
    chemistry.features.CIPCode(),
    # NOTE(review): ChiralCenter already appears earlier in this list; the
    # repeat is kept so the feature set is unchanged. Confirm it is intended.
    chemistry.features.ChiralCenter(),
    chemistry.features.RingSize(),
    # NOTE(review): Ring already appears earlier in this list; the repeat is
    # kept so the feature set is unchanged. Confirm it is intended.
    chemistry.features.Ring(),
    chemistry.features.CrippenLogPContribution(),
    chemistry.features.CrippenMolarRefractivityContribution(),
    chemistry.features.TPSAContribution(),
    chemistry.features.LabuteASAContribution(),
    chemistry.features.GasteigerCharge(),
]

# Per-bond (edge) features, in the order they are handed to the featurizer.
bond_features = [
    chemistry.features.BondType(),
    chemistry.features.Conjugated(),
    chemistry.features.Rotatable(),
    chemistry.features.Ring(),
    chemistry.features.Stereo(),
]

# Encoder used below when the dataset subsets are written to tf records.
encoder = chemistry.MolecularGraphEncoder(
    atom_encoder=chemistry.Featurizer(atom_features),
    bond_encoder=chemistry.Featurizer(bond_features),
    positional_encoding_dim=16,
    self_loops=False,
)

# Short aliases for the tf-record writer (used as a context manager below)
# and the matching loader.
record_writer = chemistry.tf_records.writer
record_loader = chemistry.tf_records.load

## Benchmark models on dataset

In [None]:
# Write and load tf records:
# For every subset of the dataset: serialize it to its own cache directory,
# load it back as a dataset of tuples, and batch it. Only the 'train' subset
# is shuffled (both at the tf-record level and with a shuffle buffer).
data = {}
for subset_name, subset in dataset.items():
    is_train = subset_name == 'train'

    # Each subset is written to its own directory below PATH / 'cache'.
    path = PATH / f'cache/tf_records/{dataset_name}/{subset_name}/'
    with record_writer(path) as writer:
        writer.write(subset, encoder)

    # Extract every field of the subset except 'index', keeping the original
    # key order (the order determines the layout of the loaded tuples).
    keys = [key for key in subset.keys() if key != 'index']

    subset_data = record_loader(
        path=path,
        extract_tuple=keys,
        shuffle_tf_records=is_train)

    if is_train:
        subset_data = subset_data.shuffle(4096)

    # tf.data.AUTOTUNE is the named constant for the previous literal -1.
    data[subset_name] = (
        subset_data.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE))

# Build model:
# Keep only the first component of every training element; that component is
# also what defines the model's input spec further down.
train_inputs = data['train'].map(lambda x, *args: x)

# Min-max scaling of node and edge features, fitted on the training inputs.
node_preprocessing = layers.NodeMinMaxScaling(
    feature_range=(0, 1), threshold=True)
node_preprocessing.adapt(train_inputs)

edge_preprocessing = layers.EdgeMinMaxScaling(
    feature_range=(0, 1), threshold=True)
edge_preprocessing.adapt(train_inputs)

# Classification tasks get a sigmoid output, everything else a linear one.
if dataset_config['task_type'] == 'classification':
    output_activation = 'sigmoid'
else:
    output_activation = 'linear'

# Input -> preprocessing -> positional encoding -> three GNN layers ->
# readout -> two hidden dense layers -> one output unit per task.
model = tf.keras.Sequential([
    layers.GNNInput(type_spec=data['train'].element_spec[0]),
    node_preprocessing,
    edge_preprocessing,
    layers.LaplacianPositionalEncoding(),
    *(GNNConv(**GNN_KWARGS) for _ in range(3)),
    layers.Readout(),
    *(keras.layers.Dense(**DNN_KWARGS) for _ in range(2)),
    keras.layers.Dense(
        units=dataset_config['num_tasks'],
        activation=output_activation,
    ),
])

# Train and evaluate model:
# Optimizer, loss and metric; the latter two come from the dataset config.
optimizer = keras.optimizers.Adam(LR_INITIAL)
loss = keras.losses.deserialize(dataset_config['loss'])
metrics = [keras.metrics.deserialize(dataset_config['metric'])]

# Both callbacks watch the validation value of the (single) metric. Metrics
# whose name ends in 'auc' are maximized; all others are minimized.
metric_name = metrics[0].name
monitored_quantity = 'val_' + metric_name
monitor_mode = 'max' if metric_name.endswith('auc') else 'min'

# Reduce the learning rate after LR_PATIENCE epochs without improvement.
reduce_lr = keras.callbacks.ReduceLROnPlateau(
    monitor=monitored_quantity,
    factor=LR_DECAY,
    patience=LR_PATIENCE,
    min_lr=LR_END,
    mode=monitor_mode,
)
# Stop after twice that many epochs without improvement and restore the best
# weights seen so far.
early_stopping = keras.callbacks.EarlyStopping(
    monitor=monitored_quantity,
    patience=LR_PATIENCE * 2,
    mode=monitor_mode,
    restore_best_weights=True,
)
callbacks_list = [reduce_lr, early_stopping]

model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

history = model.fit(
    data['train'],
    validation_data=data['validation'],
    epochs=NUM_EPOCHS,
    callbacks=callbacks_list,
    verbose=2,
)

# Final evaluation on the held-out test subset.
result = model.evaluate(data['test'], verbose=2)

## Run the cell below to remove the cache (tf records)

In [None]:
import shutil

# Directory that holds the tf records written by the benchmark cell.
cache_path = PATH / 'cache'

# Path.is_dir() is False for a path that does not exist, so a single check
# covers both the "missing" and the "not a directory" case.
if not cache_path.is_dir():
    print(f"The folder '{cache_path}' does not exist.")
else:
    # Delete the cache directory together with everything below it.
    shutil.rmtree(cache_path)
    print(f"The folder '{cache_path}' and its subfolders have been removed.")