In [1]:
import tensorflow as tf

2023-05-11 13:38:22.500311: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-11 13:38:22.590136: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-05-11 13:38:22.592717: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-05-11 13:38:22.592727: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore 

In [2]:
%%capture
from kgcnn.data.datasets.ESOLDataset import ESOLDataset
# The dataset class is instantiated without keeping the instance; anything it prints is
# hidden by the %%capture magic at the top of this cell.
# NOTE(review): presumably this fetches/prepares the ESOL files that later cells read from
# "esol/" -- confirm against the kgcnn documentation.
ESOLDataset()

In [3]:
from kgcnn.data.moleculenet import MoleculeNetDataset

In [4]:
# Location and name of the ESOL CSV file, gathered in one mapping and handed to the loader.
esol_source = dict(
    data_directory="esol/",
    dataset_name="esol",
    file_name="delaney-processed.csv",
)
data = MoleculeNetDataset(**esol_source)

In [5]:
# Keyword options forwarded unchanged to `prepare_data`; kept together so they are easy to tune.
conformer_options = dict(
    overwrite=True,
    smiles_column_name="smiles",
    add_hydrogen=True,
    sanitize=True,
    make_conformers=True,
    optimize_conformer=True,
    external_program=None,
    num_workers=None,
)
# Trailing semicolon suppresses the notebook's display of the return value.
data.prepare_data(**conformer_options);

In [16]:
%%capture
from kgcnn.molecule.encoder import OneHotEncoder
# Atom (node) properties to extract, in the order they are requested.
node_properties = [
    "Symbol", "TotalDegree", "FormalCharge", "NumRadicalElectrons", "Hybridization",
    "IsAromatic", "IsInRing", "TotalNumHs", "CIPCode", "ChiralityPossible", "ChiralTag",
]
# One-hot encoders for the node properties that get an explicit encoder.
node_encoders = {
    "Symbol": OneHotEncoder(
        ["B", "C", "N", "O", "F", "Si", "P", "S", "Cl", "As", "Se", "Br", "Te", "I", "At"],
        dtype="str",
    ),
    "Hybridization": OneHotEncoder([2, 3, 4, 5, 6]),
    "TotalDegree": OneHotEncoder([0, 1, 2, 3, 4, 5], add_unknown=False),
    "TotalNumHs": OneHotEncoder([0, 1, 2, 3, 4], add_unknown=False),
    "CIPCode": OneHotEncoder(["R", "S"], add_unknown=False, dtype="str"),
    "ChiralityPossible": OneHotEncoder(["1"], add_unknown=False, dtype="str"),
}

# Bond (edge) properties and their one-hot encoders.
edge_properties = ["BondType", "IsAromatic", "IsConjugated", "IsInRing", "Stereo"]
edge_encoders = {
    "BondType": OneHotEncoder([1, 2, 3, 12], add_unknown=False),
    "Stereo": OneHotEncoder([0, 1, 2, 3], add_unknown=False),
}

# Read node, edge and graph-level attributes plus the regression label column into memory.
data.read_in_memory(
    nodes=node_properties,
    encoder_nodes=node_encoders,
    edges=edge_properties,
    encoder_edges=edge_encoders,
    graph=["ExactMolWt", "NumAtoms"],
    encoder_graph={},
    add_hydrogen=False,
    make_directed=False,
    has_conformers=True,
    sanitize=False,
    compute_partial_charges=None,
    label_column_name="measured log solubility in mols per litre",
)

In [19]:
from kgcnn.graph.preprocessor import SetRange, SetEdgeIndicesReverse
# Each preprocessor is configured with in_place=True and applied to every graph in turn.
add_range_edges = SetRange(max_distance=5.0, in_place=True)
data.map_list(add_range_edges)
add_reverse_indices = SetEdgeIndicesReverse(in_place=True)
# Trailing semicolon keeps the notebook from echoing the return value.
data.map_list(add_reverse_indices);

In [20]:
# Display the property names stored on the first graph after reading and preprocessing.
data[0].keys()

dict_keys(['node_symbol', 'node_number', 'edge_indices', 'edge_number', 'graph_size', 'node_coordinates', 'graph_labels', 'node_attributes', 'edge_attributes', 'graph_attributes', 'range_indices', 'range_attributes', 'edge_indices_reverse'])

In [21]:
import numpy as np
# Gather the per-graph targets into one array; scalar targets get a trailing axis added.
labels = np.array(data.obtain_property("graph_labels"))
if labels.ndim <= 1:
    labels = labels[..., np.newaxis]

In [22]:
from sklearn.model_selection import KFold
# Five shuffled folds with a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# Dummy feature matrix of zeros with one row per graph, passed as X to the splitter.
fold_placeholder = np.zeros((len(data), 1))
# Store each fold as a two-element list: [train indices, test indices].
train_test_indices = [list(fold) for fold in kf.split(X=fold_placeholder, y=labels)]

In [23]:
# Hyperparameters for the DMPNN model. The "config" entry is unpacked into `make_model`,
# and its "inputs" list also selects which dataset properties are turned into tensors.
model_config = dict(
    class_name="make_model",
    module_name="kgcnn.literature.DMPNN",
    config=dict(
        name="DMPNN",
        # One entry per model input; every input is declared as ragged.
        inputs=[
            dict(shape=[None, 41], name="node_attributes", dtype="float32", ragged=True),
            dict(shape=[None, 11], name="edge_attributes", dtype="float32", ragged=True),
            dict(shape=[None, 2], name="edge_indices", dtype="int64", ragged=True),
            dict(shape=[None, 1], name="edge_indices_reverse", dtype="int64", ragged=True),
        ],
        input_embedding=dict(
            node=dict(input_dim=95, output_dim=64),
            edge=dict(input_dim=5, output_dim=64),
        ),
        pooling_args=dict(pooling_method="sum"),
        edge_initialize=dict(units=128, use_bias=True, activation="relu"),
        edge_dense=dict(units=128, use_bias=True, activation="linear"),
        edge_activation=dict(activation="relu"),
        node_dense=dict(units=128, use_bias=True, activation="relu"),
        verbose=10,
        depth=5,
        dropout=dict(rate=0.1),
        output_embedding="graph",
        # Three-layer output head ending in a single linear unit.
        output_mlp=dict(
            use_bias=[True, True, False],
            units=[64, 32, 1],
            activation=["relu", "relu", "linear"],
        ),
    ),
)

In [None]:
import time
from kgcnn.model.utils import get_model_class
from tensorflow.keras.optimizers import Adam
from kgcnn.training.scheduler import LinearLearningRateScheduler
from kgcnn.literature.DMPNN import make_model
from kgcnn.data.transform.scaler.molecule import QMGraphLabelScaler
from kgcnn.data.transform.scaler.standard import StandardLabelScaler, StandardScaler
from kgcnn.metrics.metrics import ScaledMeanAbsoluteError, ScaledRootMeanSquaredError
from datetime import timedelta

# Per-fold results collected across the cross-validation loop.
history_list, test_indices_list = [], []
# Pre-declared so the objects from the last fold remain available after the loop.
model, hist, x_test, y_test, scaler, atoms_test = None, None, None, None, None, None
splits_done = 0
for i, (train_index, test_index) in enumerate(train_test_indices):
    print("Running training on fold: %s" % i)

    # Build a fresh model for the current split from the "config" entry of `model_config`.
    model = make_model(**model_config["config"])

    # First select training and test graphs from indices, then convert them into tensors.
    # Which dataset property is used and whether the tensor is ragged is taken from the
    # input specifications in the model config ('name' and 'ragged').
    dataset_train, dataset_test = data[train_index], data[test_index]
    x_train, y_train = dataset_train.tensor(model_config["config"]["inputs"]), labels[train_index]
    x_test, y_test = dataset_test.tensor(model_config["config"]["inputs"]), labels[test_index]

    atoms_test = dataset_test.get("node_number")
    atoms_train = dataset_train.get("node_number")

    # Fit the label scaler on the training labels only, then apply it to both splits.
    scaler = StandardScaler(with_std=True, with_mean=True, copy=True)
    scaler.fit(y_train, atomic_number=atoms_train)
    y_train = scaler.transform(y_train, atomic_number=atoms_train)
    y_test = scaler.transform(y_test, atomic_number=atoms_test)

    # Metrics that are given the scaler's scaling so errors are reported on rescaled labels.
    scaler_scale = scaler.get_scaling()
    mae_metric = ScaledMeanAbsoluteError(scaler_scale.shape, name="scaled_mean_absolute_error")
    rms_metric = ScaledRootMeanSquaredError(scaler_scale.shape, name="scaled_root_mean_squared_error")
    if scaler.scale_ is not None:
        mae_metric.set_scale(scaler_scale)
        rms_metric.set_scale(scaler_scale)
    metrics = [mae_metric, rms_metric]

    # Compile model with optimizer and loss. `learning_rate` replaces the deprecated `lr` keyword.
    # NOTE(review): the recorded training log shows lr 0.0010 (the scheduler's start value),
    # not 5e-04 -- confirm which initial learning rate is intended.
    model.compile(loss="mean_absolute_error", metrics=metrics, optimizer=Adam(learning_rate=5e-04))
    # `summary()` prints the table itself; wrapping it in print() only adds a stray "None".
    model.summary()

    # Start and time training. `perf_counter` measures elapsed wall-clock time, whereas
    # `process_time` would report CPU time summed over all threads of the process.
    start = time.perf_counter()
    hist = model.fit(x_train, y_train,
                     validation_data=(x_test, y_test),
                     batch_size=32,
                     epochs=300,
                     validation_freq=10,
                     # One log line per epoch; set verbose=0 to silence training output.
                     verbose=2,
                     callbacks=[
                         LinearLearningRateScheduler(
                             learning_rate_start=0.001, learning_rate_stop=1e-05, epo_min=100, epo=300)
                     ])
    stop = time.perf_counter()
    print("Print Time for training: ", str(timedelta(seconds=stop - start)))

    # Keep the training history and the indices that defined this fold.
    history_list.append(hist)
    test_indices_list.append([train_index, test_index])
    splits_done += 1

Running training on fold: 0
Model: "DMPNN"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 node_attributes (InputLayer)   [(None, None, 41)]   0           []                               
                                                                                                  
 optional_input_embedding_2 (Op  (None, None, 41)    0           ['node_attributes[0][0]']        
 tionalInputEmbedding)                                                                            
                                                                                                  
 edge_indices (InputLayer)      [(None, None, 2)]    0           []                               
                                                                                                  
 edge_attributes (InputLayer)   [(None, None, 11)]   0           [

 bedding)                                                                                         
                                                                                                  
 dmpnnp_pooling_edges_directed_  (None, None, 128)   0           ['optional_input_embedding_2[0][0
 8 (DMPNNPPoolingEdgesDirected)                                  ]',                              
                                                                  'dropout_embedding_7[0][0]',    
                                                                  'edge_indices[0][0]',           
                                                                  'edge_indices_reverse[0][0]']   
                                                                                                  
 lazy_add_8 (LazyAdd)           (None, None, 128)    0           ['dense_embedding_4[3][0]',      
                                                                  'dense_embedding_3[0][0]']      
          

Epoch 21/300
29/29 - 0s - loss: 0.3404 - scaled_mean_absolute_error: 0.7064 - scaled_root_mean_squared_error: 0.9125 - lr: 0.0010 - 464ms/epoch - 16ms/step
Epoch 22/300
29/29 - 0s - loss: 0.2963 - scaled_mean_absolute_error: 0.6150 - scaled_root_mean_squared_error: 0.7983 - lr: 0.0010 - 459ms/epoch - 16ms/step
Epoch 23/300
29/29 - 0s - loss: 0.3288 - scaled_mean_absolute_error: 0.6823 - scaled_root_mean_squared_error: 0.8847 - lr: 0.0010 - 460ms/epoch - 16ms/step
Epoch 24/300
29/29 - 0s - loss: 0.3124 - scaled_mean_absolute_error: 0.6483 - scaled_root_mean_squared_error: 0.8396 - lr: 0.0010 - 470ms/epoch - 16ms/step
Epoch 25/300
29/29 - 1s - loss: 0.3188 - scaled_mean_absolute_error: 0.6617 - scaled_root_mean_squared_error: 0.8588 - lr: 0.0010 - 612ms/epoch - 21ms/step
Epoch 26/300
29/29 - 1s - loss: 0.2867 - scaled_mean_absolute_error: 0.5950 - scaled_root_mean_squared_error: 0.7824 - lr: 0.0010 - 701ms/epoch - 24ms/step
Epoch 27/300
29/29 - 1s - loss: 0.3006 - scaled_mean_absolute_er

In [None]:
from kgcnn.utils.plots import plot_train_test_loss, plot_predict_true

# Plot the loss curves of all folds. The labels describe what this notebook actually trains:
# a DMPNN model on the ESOL dataset, with targets given as log solubility in mol per litre.
plot_train_test_loss(history_list, loss_name=None, val_loss_name=None,
                     model_name="DMPNN", data_unit="log mol/L", dataset_name="ESOL",
                     filepath="", file_name="loss.png");