# In preparation ...

In [1]:
%%capture
import tensorflow as tf

2023-05-11 13:49:40.080958: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-11 13:49:40.168920: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-05-11 13:49:40.171467: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-05-11 13:49:40.171476: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore 

The following cell is only used to download some example data that stands in for your own data.

In [2]:
%%capture
# Instantiate the example dataset once. According to the notes around this cell the call
# is made only to download the ESOL files, so the returned object is deliberately not kept.
# The %%capture magic on the first line hides whatever the call prints.
from kgcnn.data.datasets.ESOLDataset import ESOLDataset
ESOLDataset()

Move the data from "~/.kgcnn.datasets.esol" to your local "esol" folder. The ESOL data only serves as example data for this notebook.

In [3]:
from kgcnn.data.moleculenet import MoleculeNetDataset

In [4]:
# Create the dataset object for the local "esol/" folder and the CSV table inside it.
# All arguments are passed by keyword, so their order carries no meaning.
data = MoleculeNetDataset(
    file_name="delaney-processed.csv",
    dataset_name="esol",
    data_directory="esol/",
)

In [5]:
# One-off preparation step that works from the "smiles" column of the table.
# `overwrite=True` asks for the preparation to be redone even if it ran before.
data.prepare_data(
    smiles_column_name="smiles", overwrite=True,
    # Molecule handling.
    sanitize=True, add_hydrogen=True,
    # Conformer generation and optimization are both switched on.
    make_conformers=True, optimize_conformer=True,
    # No external program and no explicit worker count are configured.
    external_program=None, num_workers=None,
);

In [6]:
%%capture
from kgcnn.molecule.encoder import OneHotEncoder
# Read the prepared molecules into memory and request node, edge and graph attributes.
# NOTE(review): the attribute lists and encoders below presumably determine the width of the
# resulting feature vectors; the model config in this notebook declares 41 node features and
# 11 edge features, so verify those numbers whenever a list or encoder here is changed.
data.read_in_memory(
    # Per-atom properties requested for the node attributes.
    nodes = [
        'Symbol', 'TotalDegree', 'FormalCharge', 'NumRadicalElectrons', 'Hybridization',
        'IsAromatic', 'IsInRing', 'TotalNumHs', 'CIPCode', "ChiralityPossible", "ChiralTag"
    ],
    # Encoders keyed by property name. Properties without an entry here are presumably
    # used without encoding - TODO confirm against the kgcnn documentation.
    encoder_nodes = {
        # No `add_unknown` given for 'Symbol' and 'Hybridization', so the encoder default applies.
        'Symbol': OneHotEncoder(
            ['B', 'C', 'N', 'O', 'F', 'Si', 'P', 'S', 'Cl', 'As', 'Se', 'Br', 'Te', 'I', 'At'],
            dtype="str"
        ),
        'Hybridization': OneHotEncoder([2, 3, 4, 5, 6]),
        'TotalDegree': OneHotEncoder([0, 1, 2, 3, 4, 5], add_unknown=False),
        'TotalNumHs': OneHotEncoder([0, 1, 2, 3, 4], add_unknown=False),
        'CIPCode': OneHotEncoder(['R', 'S'], add_unknown=False, dtype='str'),
        "ChiralityPossible": OneHotEncoder(["1"], add_unknown=False, dtype='str'),
    },
    # Per-bond properties requested for the edge attributes.
    edges = ['BondType', 'IsAromatic', 'IsConjugated', 'IsInRing', 'Stereo'],
    encoder_edges = {
        'BondType': OneHotEncoder([1, 2, 3, 12], add_unknown=False),
        'Stereo': OneHotEncoder([0, 1, 2, 3], add_unknown=False)
    },
    # Molecule-level properties; no encoder is applied to them.
    graph=['ExactMolWt', 'NumAtoms'],
    encoder_graph = {},
    # NOTE(review): hydrogens were added during prepare_data but are disabled here - confirm
    # that this combination is intended.
    add_hydrogen=False,
    make_directed=False,
    has_conformers=True,
    sanitize=False,
    compute_partial_charges=None,
    # Column of the table that holds the target values.
    label_column_name="measured log solubility in mols per litre"
)

In [7]:
from kgcnn.graph.preprocessor import SetRange, SetEdgeIndicesReverse

# Apply the graph preprocessors to the dataset one after the other. Both are configured
# with `in_place=True`; the range preprocessor uses a maximum distance of 5.0.
for graph_preprocessor in (
    SetRange(max_distance=5.0, in_place=True),
    SetEdgeIndicesReverse(in_place=True),
):
    data.map_list(graph_preprocessor)

In [8]:
# Display the property names stored on the first graph of the dataset.
data[0].keys()

dict_keys(['node_symbol', 'node_number', 'edge_indices', 'edge_number', 'graph_size', 'node_coordinates', 'graph_labels', 'node_attributes', 'edge_attributes', 'graph_attributes', 'range_indices', 'range_attributes', 'edge_indices_reverse'])

In [9]:
import numpy as np

# Gather the "graph_labels" property of every graph into one array and make sure it has
# an explicit trailing axis, i.e. scalar-per-graph labels become a column.
labels = np.array(data.obtain_property("graph_labels"))
if labels.ndim <= 1:
    labels = labels[..., np.newaxis]

In [10]:
from sklearn.model_selection import KFold

# Shuffled 5-fold split with a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# X is a zero placeholder with one row per graph; each fold is stored as a
# [train_index, test_index] pair.
train_test_indices = [
    list(fold) for fold in kf.split(X=np.zeros((len(data), 1)), y=labels)
]

In [11]:
# Hyperparameters for the DMPNN model. The "config" entry is what gets unpacked into the
# model factory; "class_name" / "module_name" name that factory and its module.
model_config = {
    "class_name": "make_model",
    "module_name": "kgcnn.literature.DMPNN",
    "config": dict(
        name="DMPNN",
        # One ragged input per graph property; only the name, the size of the last
        # axis and the dtype differ between them.
        inputs=[
            {"shape": [None, width], "name": input_name, "dtype": dtype, "ragged": True}
            for input_name, width, dtype in (
                ("node_attributes", 41, "float32"),
                ("edge_attributes", 11, "float32"),
                ("edge_indices", 2, "int64"),
                ("edge_indices_reverse", 1, "int64"),
            )
        ],
        input_embedding=dict(
            node=dict(input_dim=95, output_dim=64),
            edge=dict(input_dim=5, output_dim=64),
        ),
        pooling_args=dict(pooling_method="sum"),
        # Dense layer settings used inside the network.
        edge_initialize=dict(units=128, use_bias=True, activation="relu"),
        edge_dense=dict(units=128, use_bias=True, activation="linear"),
        edge_activation=dict(activation="relu"),
        node_dense=dict(units=128, use_bias=True, activation="relu"),
        verbose=10,
        depth=5,
        dropout=dict(rate=0.1),
        # Graph-level output produced by a three-layer MLP ending in one linear unit.
        output_embedding="graph",
        output_mlp=dict(
            use_bias=[True, True, False],
            units=[64, 32, 1],
            activation=["relu", "relu", "linear"],
        ),
    ),
}

In [None]:
import time
from datetime import timedelta

from tensorflow.keras.optimizers import Adam

from kgcnn.model.utils import get_model_class
from kgcnn.training.scheduler import LinearLearningRateScheduler
from kgcnn.literature.DMPNN import make_model
from kgcnn.data.transform.scaler.molecule import QMGraphLabelScaler
from kgcnn.data.transform.scaler.standard import StandardLabelScaler, StandardScaler
from kgcnn.metrics.metrics import ScaledMeanAbsoluteError, ScaledRootMeanSquaredError

# Cross-validated training: one freshly built model is trained and validated per fold in
# `train_test_indices`. The training history and the index pair of every fold are collected.
history_list, test_indices_list = [], []
# Declared up front so the objects of the last fold stay available to the cells below.
model, hist, x_test, y_test, scaler, atoms_test = None, None, None, None, None, None
splits_done = 0
for i, (train_index, test_index) in enumerate(train_test_indices):
    print("Running training on fold: %s" % i)

    # Make the model for the current split from the "config" entry of the model config.
    model = make_model(**model_config["config"])

    # First select training and test graphs from indices, then convert them into tensorflow tensor
    # representation. Which property of the dataset and whether the tensor will be ragged is retrieved from the
    # kwargs of the keras `Input` layers ('name' and 'ragged').
    dataset_train, dataset_test = data[train_index], data[test_index]
    x_train, y_train = dataset_train.tensor(model_config["config"]["inputs"]), labels[train_index]
    x_test, y_test = dataset_test.tensor(model_config["config"]["inputs"]), labels[test_index]

    atoms_test = dataset_test.get("node_number")
    atoms_train = dataset_train.get("node_number")

    # Standardize the labels. The scaler is fitted on the training labels only and then
    # applied to both the training and the test labels.
    scaler = StandardScaler(with_std=True, with_mean=True, copy=True)
    scaler.fit(y_train, atomic_number=atoms_train)
    y_train = scaler.transform(y_train, atomic_number=atoms_train)
    y_test = scaler.transform(y_test, atomic_number=atoms_test)

    # Because the labels are scaled, add metrics that take the scaler's scaling into account.
    scaler_scale = scaler.get_scaling()
    mae_metric = ScaledMeanAbsoluteError(scaler_scale.shape, name="scaled_mean_absolute_error")
    rms_metric = ScaledRootMeanSquaredError(scaler_scale.shape, name="scaled_root_mean_squared_error")
    if scaler.scale_ is not None:
        mae_metric.set_scale(scaler_scale)
        rms_metric.set_scale(scaler_scale)
    metrics = [mae_metric, rms_metric]

    # Compile model with optimizer and loss. `learning_rate` is the supported argument name;
    # the old `lr` alias is deprecated and newer Keras releases warn about or reject it.
    # NOTE(review): the scheduler callback below starts at 0.001 and the training log reports
    # lr: 0.0010, so this initial value is presumably overridden - confirm which one is intended.
    model.compile(loss="mean_absolute_error", metrics=metrics, optimizer=Adam(learning_rate=5e-04))
    # `summary()` prints the table itself and returns None, so it must not be wrapped in print().
    model.summary()

    # Start and time training. `perf_counter` measures elapsed wall-clock time, whereas
    # `process_time` would report CPU time of the whole process.
    start = time.perf_counter()
    hist = model.fit(
        x_train, y_train,
        validation_data=(x_test, y_test),
        batch_size=32,
        epochs=300,
        validation_freq=10,
        # verbose=2 logs one line per epoch; set verbose=0 to silence the training output.
        verbose=2,
        callbacks=[
            LinearLearningRateScheduler(
                learning_rate_start=0.001, learning_rate_stop=1e-05, epo_min=100, epo=300)
        ],
    )
    stop = time.perf_counter()
    print("Print Time for training: ", str(timedelta(seconds=stop - start)))

    # Keep the history and the indices of this fold.
    history_list.append(hist)
    test_indices_list.append([train_index, test_index])
    splits_done += 1

Running training on fold: 0


2023-05-11 13:49:53.148831: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2023-05-11 13:49:53.148848: W tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:265] failed call to cuInit: UNKNOWN ERROR (303)
2023-05-11 13:49:53.148863: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (patrick-tuxedoinfinitybookpro14gen6): /proc/driver/nvidia/version does not exist
2023-05-11 13:49:53.148998: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "DMPNN"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 node_attributes (InputLayer)   [(None, None, 41)]   0           []                               
                                                                                                  
 optional_input_embedding (Opti  (None, None, 41)    0           ['node_attributes[0][0]']        
 onalInputEmbedding)                                                                              
                                                                                                  
 edge_indices (InputLayer)      [(None, None, 2)]    0           []                               
                                                                                                  
 edge_attributes (InputLayer)   [(None, None, 11)]   0           []                           

                                                                  'edge_indices[0][0]',           
                                                                  'edge_indices_reverse[0][0]']   
                                                                                                  
 lazy_add_3 (LazyAdd)           (None, None, 128)    0           ['dense_embedding_1[3][0]',      
                                                                  'dense_embedding[0][0]']        
                                                                                                  
 activation_embedding_3 (Activa  (None, None, 128)   0           ['lazy_add_3[0][0]']             
 tionEmbedding)                                                                                   
                                                                                                  
 dropout_embedding_3 (DropoutEm  (None, None, 128)   0           ['activation_embedding_3[0][0]'] 
 bedding) 

Epoch 24/300
29/29 - 1s - loss: 0.3578 - scaled_mean_absolute_error: 0.7425 - scaled_root_mean_squared_error: 0.9773 - lr: 0.0010 - 727ms/epoch - 25ms/step
Epoch 25/300
29/29 - 1s - loss: 0.3207 - scaled_mean_absolute_error: 0.6656 - scaled_root_mean_squared_error: 0.8598 - lr: 0.0010 - 750ms/epoch - 26ms/step
Epoch 26/300
29/29 - 1s - loss: 0.3679 - scaled_mean_absolute_error: 0.7636 - scaled_root_mean_squared_error: 0.9827 - lr: 0.0010 - 730ms/epoch - 25ms/step
Epoch 27/300
29/29 - 1s - loss: 0.3185 - scaled_mean_absolute_error: 0.6610 - scaled_root_mean_squared_error: 0.8709 - lr: 0.0010 - 739ms/epoch - 25ms/step
Epoch 28/300
29/29 - 1s - loss: 0.3034 - scaled_mean_absolute_error: 0.6297 - scaled_root_mean_squared_error: 0.8199 - lr: 0.0010 - 732ms/epoch - 25ms/step
Epoch 29/300
29/29 - 1s - loss: 0.2966 - scaled_mean_absolute_error: 0.6156 - scaled_root_mean_squared_error: 0.8142 - lr: 0.0010 - 730ms/epoch - 25ms/step
Epoch 30/300
29/29 - 1s - loss: 0.2955 - scaled_mean_absolute_er

Epoch 74/300
29/29 - 1s - loss: 0.1857 - scaled_mean_absolute_error: 0.3853 - scaled_root_mean_squared_error: 0.5374 - lr: 0.0010 - 653ms/epoch - 23ms/step
Epoch 75/300
29/29 - 1s - loss: 0.1840 - scaled_mean_absolute_error: 0.3819 - scaled_root_mean_squared_error: 0.5324 - lr: 0.0010 - 656ms/epoch - 23ms/step
Epoch 76/300
29/29 - 1s - loss: 0.1844 - scaled_mean_absolute_error: 0.3826 - scaled_root_mean_squared_error: 0.5309 - lr: 0.0010 - 661ms/epoch - 23ms/step
Epoch 77/300
29/29 - 1s - loss: 0.1811 - scaled_mean_absolute_error: 0.3759 - scaled_root_mean_squared_error: 0.5246 - lr: 0.0010 - 673ms/epoch - 23ms/step
Epoch 78/300
29/29 - 1s - loss: 0.1792 - scaled_mean_absolute_error: 0.3719 - scaled_root_mean_squared_error: 0.5196 - lr: 0.0010 - 668ms/epoch - 23ms/step
Epoch 79/300
29/29 - 1s - loss: 0.1927 - scaled_mean_absolute_error: 0.3999 - scaled_root_mean_squared_error: 0.5468 - lr: 0.0010 - 660ms/epoch - 23ms/step
Epoch 80/300
29/29 - 1s - loss: 0.1804 - scaled_mean_absolute_er

Epoch 123/300
29/29 - 1s - loss: 0.1383 - scaled_mean_absolute_error: 0.2869 - scaled_root_mean_squared_error: 0.4114 - lr: 8.9110e-04 - 704ms/epoch - 24ms/step
Epoch 124/300
29/29 - 1s - loss: 0.1412 - scaled_mean_absolute_error: 0.2930 - scaled_root_mean_squared_error: 0.4240 - lr: 8.8615e-04 - 685ms/epoch - 24ms/step
Epoch 125/300
29/29 - 1s - loss: 0.1394 - scaled_mean_absolute_error: 0.2893 - scaled_root_mean_squared_error: 0.4120 - lr: 8.8120e-04 - 737ms/epoch - 25ms/step
Epoch 126/300
29/29 - 1s - loss: 0.1365 - scaled_mean_absolute_error: 0.2834 - scaled_root_mean_squared_error: 0.4066 - lr: 8.7625e-04 - 804ms/epoch - 28ms/step
Epoch 127/300
29/29 - 1s - loss: 0.1453 - scaled_mean_absolute_error: 0.3015 - scaled_root_mean_squared_error: 0.4300 - lr: 8.7130e-04 - 733ms/epoch - 25ms/step
Epoch 128/300
29/29 - 1s - loss: 0.1413 - scaled_mean_absolute_error: 0.2933 - scaled_root_mean_squared_error: 0.4264 - lr: 8.6635e-04 - 688ms/epoch - 24ms/step
Epoch 129/300


In [None]:
from kgcnn.utils.plots import plot_train_test_loss, plot_predict_true

# Plot the loss curves from the collected per-fold histories. No explicit loss names are
# given, and the file name passed for the figure is "loss.png" with an empty file path.
# NOTE(review): the label column is a log solubility, so "mol/L" may not be the exact unit.
plot_train_test_loss(
    history_list,
    loss_name=None, val_loss_name=None,
    model_name="DMPNN", dataset_name="esol", data_unit="mol/L",
    filepath="", file_name="loss.png",
);

In [None]:
# Evaluate the model of the last split on its test inputs. Predictions and targets are both
# in scaled units, so each is passed through the scaler's inverse transform first.
predicted_y = scaler.inverse_transform(model.predict(x_test))
true_y = scaler.inverse_transform(y_test)

# Predicted vs. true test targets for the last split. For classification this plot is
# produced as well but can be ignored.
plot_predict_true(
    predicted_y, true_y,
    model_name="DMPNN", dataset_name="ESOL", data_unit="mol/L",
    filepath="", file_name="predict.png",
)