In [1]:
import os
from pathlib import Path

chemprop_path = Path(".") / "chemprop"
os.chdir(chemprop_path)

import numpy as np
from rdkit import RDLogger
from sklearn.metrics import mean_squared_error, r2_score
from lightning import pytorch as pl
from lightning.pytorch.callbacks import ModelCheckpoint
import pandas as pd

from chemprop import data, featurizers, models, nn

In [3]:
# Reference: https://chemprop.readthedocs.io/en/latest/training.html
input_path = "../data/Assays-pXC50/{target}.csv" # path to your data .csv file
num_workers = 0 # number of workers for dataloader. 0 means using main process for data loading
smiles_column = 'SMILES' # name of the column containing SMILES strings
target_columns = ['pXC50'] # list of names of the columns containing targets
targets = ["AChE", "D2R", "D3R", "_5HT2A", "MAOB"]

TRAIN = True

if TRAIN:
    for target in targets:
        print(f"Training model for {target}")
        df_input = pd.read_csv(input_path.format(target=target))
        smis = df_input.loc[:, smiles_column].values
        ys = df_input.loc[:, target_columns].values
        splits = df_input.loc[:, "split"].values
        all_data = [data.MoleculeDatapoint.from_smi(smi, y) for smi, y in zip(smis, ys)]

        mols = [d.mol for d in all_data]  # RDkit Mol objects are use for structure based splits
        
        # Get indices for train, val, and test from splits column
        train_indices, val_indices, test_indices = (
            np.array(np.where(splits == "train")), 
            np.array(np.where(splits == "val")),
            np.array(np.where(splits == "test"))
        )
        
        train_data, val_data, test_data = data.split_data_by_indices(
            all_data, train_indices, val_indices, test_indices
        )

        featurizer = featurizers.SimpleMoleculeMolGraphFeaturizer()

        train_dset = data.MoleculeDataset(train_data[0], featurizer)
        scaler = train_dset.normalize_targets()

        val_dset = data.MoleculeDataset(val_data[0], featurizer)
        val_dset.normalize_targets(scaler)

        test_dset = data.MoleculeDataset(test_data[0], featurizer)

        train_loader = data.build_dataloader(train_dset, num_workers=num_workers)
        val_loader = data.build_dataloader(val_dset, num_workers=num_workers, shuffle=False)
        test_loader = data.build_dataloader(test_dset, num_workers=num_workers, shuffle=False)

        mp = nn.BondMessagePassing()
        agg = nn.MeanAggregation()
        output_transform = nn.UnscaleTransform.from_standard_scaler(scaler)
        ffn = nn.RegressionFFN(output_transform=output_transform)
        batch_norm = True

        metric_list = [nn.metrics.MSE(), nn.metrics.MAE()] # , nn.metrics.R2Score()] # Only the first metric is used for training and early stopping
        mpnn = models.MPNN(mp, agg, ffn, batch_norm, metric_list)

        # Configure model checkpointing
        checkpointing = ModelCheckpoint(
            f"../checkpoints/{target}",  # Directory where model checkpoints will be saved
            "best-{epoch}-{val_loss:.2f}",  # Filename format for checkpoints, including epoch and validation loss
            "val_loss",  # Metric used to select the best checkpoint (based on validation loss)
            mode="min",  # Save the checkpoint with the lowest validation loss (minimization objective)
            save_last=True,  # Always save the most recent checkpoint, even if it's not the best
            enable_version_counter=False
        )

        trainer = pl.Trainer(
            logger=False,
            enable_checkpointing=True, # Use `True` if you want to save model checkpoints. The checkpoints will be saved in the `checkpoints` folder.
            enable_progress_bar=True,
            accelerator="auto",
            devices=1,
            max_epochs=20, # number of epochs to train for
            callbacks=[checkpointing], # Use the configured checkpoint callback
        )

        trainer.fit(mpnn, train_loader, val_loader)
        results = trainer.test(dataloaders=test_loader)
        print(results)

Training model for AChE


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/arthurcerveira/miniconda3/envs/chemprop/lib/python3.12/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:654: Checkpoint directory /cache/arthurcerveira/MPNN-MT/checkpoints/AChE exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.
/home/arthurcerveira/miniconda3/envs/chemprop/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.

  | Name            | Type               | Params | Mode 
---------------------------------------------------------------
0 | message_passing | BondMessagePassing | 227 K  | train
1 | agg             | MeanAggregation

Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

/home/arthurcerveira/miniconda3/envs/chemprop/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Epoch 19: 100%|██████████| 42/42 [00:01<00:00, 25.47it/s, train_loss_step=0.225, val_loss=0.306, train_loss_epoch=0.189] 

`Trainer.fit` stopped: `max_epochs=20` reached.


Epoch 19: 100%|██████████| 42/42 [00:01<00:00, 25.26it/s, train_loss_step=0.225, val_loss=0.306, train_loss_epoch=0.189]

Restoring states from the checkpoint path at /cache/arthurcerveira/MPNN-MT/checkpoints/AChE/best-epoch=16-val_loss=0.30.ckpt





LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at /cache/arthurcerveira/MPNN-MT/checkpoints/AChE/best-epoch=16-val_loss=0.30.ckpt
/home/arthurcerveira/miniconda3/envs/chemprop/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Testing DataLoader 0: 100%|██████████| 12/12 [00:00<00:00, 36.44it/s]


[{'test/mse': 0.7740098237991333, 'test/mae': 0.6329134106636047}]
Training model for D2R


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/arthurcerveira/miniconda3/envs/chemprop/lib/python3.12/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:654: Checkpoint directory /cache/arthurcerveira/MPNN-MT/checkpoints/D2R exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.
/home/arthurcerveira/miniconda3/envs/chemprop/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.

  | Name            | Type               | Params | Mode 
---------------------------------------------------------------
0 | message_passing | BondMessagePassing | 227 K  | train
1 | agg             | MeanAggregation 

                                                                            

/home/arthurcerveira/miniconda3/envs/chemprop/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Epoch 19: 100%|██████████| 84/84 [00:03<00:00, 25.82it/s, train_loss_step=0.171, val_loss=0.378, train_loss_epoch=0.235]

`Trainer.fit` stopped: `max_epochs=20` reached.


Epoch 19: 100%|██████████| 84/84 [00:03<00:00, 25.59it/s, train_loss_step=0.171, val_loss=0.378, train_loss_epoch=0.235]

Restoring states from the checkpoint path at /cache/arthurcerveira/MPNN-MT/checkpoints/D2R/best-epoch=19-val_loss=0.38.ckpt





LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at /cache/arthurcerveira/MPNN-MT/checkpoints/D2R/best-epoch=19-val_loss=0.38.ckpt
/home/arthurcerveira/miniconda3/envs/chemprop/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Testing DataLoader 0: 100%|██████████| 24/24 [00:00<00:00, 35.22it/s]


[{'test/mse': 0.5661064386367798, 'test/mae': 0.5599697232246399}]
Training model for D3R


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/arthurcerveira/miniconda3/envs/chemprop/lib/python3.12/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:654: Checkpoint directory /cache/arthurcerveira/MPNN-MT/checkpoints/D3R exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.
/home/arthurcerveira/miniconda3/envs/chemprop/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.

  | Name            | Type               | Params | Mode 
---------------------------------------------------------------
0 | message_passing | BondMessagePassing | 227 K  | train
1 | agg             | MeanAggregation 

                                                                            

/home/arthurcerveira/miniconda3/envs/chemprop/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Epoch 19: 100%|██████████| 35/35 [00:01<00:00, 26.55it/s, train_loss_step=0.273, val_loss=0.475, train_loss_epoch=0.198]

`Trainer.fit` stopped: `max_epochs=20` reached.


Epoch 19: 100%|██████████| 35/35 [00:01<00:00, 26.21it/s, train_loss_step=0.273, val_loss=0.475, train_loss_epoch=0.198]

Restoring states from the checkpoint path at /cache/arthurcerveira/MPNN-MT/checkpoints/D3R/best-epoch=14-val_loss=0.45.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at /cache/arthurcerveira/MPNN-MT/checkpoints/D3R/best-epoch=14-val_loss=0.45.ckpt
/home/arthurcerveira/miniconda3/envs/chemprop/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.



Testing DataLoader 0: 100%|██████████| 10/10 [00:00<00:00, 35.83it/s]


[{'test/mse': 0.7118385434150696, 'test/mae': 0.6453957557678223}]
Training model for _5HT2A


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/arthurcerveira/miniconda3/envs/chemprop/lib/python3.12/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:654: Checkpoint directory /cache/arthurcerveira/MPNN-MT/checkpoints/_5HT2A exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.
/home/arthurcerveira/miniconda3/envs/chemprop/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.

  | Name            | Type               | Params | Mode 
---------------------------------------------------------------
0 | message_passing | BondMessagePassing | 227 K  | train
1 | agg             | MeanAggregati

                                                                            

/home/arthurcerveira/miniconda3/envs/chemprop/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Epoch 19: 100%|██████████| 44/44 [00:01<00:00, 28.39it/s, train_loss_step=0.0587, val_loss=0.424, train_loss_epoch=0.275]

`Trainer.fit` stopped: `max_epochs=20` reached.


Epoch 19: 100%|██████████| 44/44 [00:01<00:00, 28.07it/s, train_loss_step=0.0587, val_loss=0.424, train_loss_epoch=0.275]

Restoring states from the checkpoint path at /cache/arthurcerveira/MPNN-MT/checkpoints/_5HT2A/best-epoch=17-val_loss=0.42.ckpt





LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at /cache/arthurcerveira/MPNN-MT/checkpoints/_5HT2A/best-epoch=17-val_loss=0.42.ckpt
/home/arthurcerveira/miniconda3/envs/chemprop/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Testing DataLoader 0: 100%|██████████| 12/12 [00:00<00:00, 37.70it/s]


[{'test/mse': 0.6624202728271484, 'test/mae': 0.6171982288360596}]
Training model for MAOB


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/arthurcerveira/miniconda3/envs/chemprop/lib/python3.12/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:654: Checkpoint directory /cache/arthurcerveira/MPNN-MT/checkpoints/MAOB exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.
/home/arthurcerveira/miniconda3/envs/chemprop/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.

  | Name            | Type               | Params | Mode 
---------------------------------------------------------------
0 | message_passing | BondMessagePassing | 227 K  | train
1 | agg             | MeanAggregation

                                                                            

/home/arthurcerveira/miniconda3/envs/chemprop/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Epoch 19: 100%|██████████| 26/26 [00:00<00:00, 32.57it/s, train_loss_step=0.369, val_loss=0.541, train_loss_epoch=0.275] 

`Trainer.fit` stopped: `max_epochs=20` reached.


Epoch 19: 100%|██████████| 26/26 [00:00<00:00, 32.06it/s, train_loss_step=0.369, val_loss=0.541, train_loss_epoch=0.275]

Restoring states from the checkpoint path at /cache/arthurcerveira/MPNN-MT/checkpoints/MAOB/best-epoch=16-val_loss=0.50.ckpt





LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at /cache/arthurcerveira/MPNN-MT/checkpoints/MAOB/best-epoch=16-val_loss=0.50.ckpt
/home/arthurcerveira/miniconda3/envs/chemprop/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Testing DataLoader 0: 100%|██████████| 7/7 [00:00<00:00, 44.74it/s] 


[{'test/mse': 0.8887014985084534, 'test/mae': 0.7204970717430115}]


In [4]:
# Reference: https://chemprop.readthedocs.io/en/latest/predicting.html
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import torch
from lightning import pytorch as pl
from chemprop import data, featurizers, models


targets = ['AChE', 'D2R', 'D3R', '_5HT2A', 'MAOB']
mpnn_metrics = dict()

def run_mpnn_on_smiles(smiles_input, mpnn):
    test_data = [data.MoleculeDatapoint.from_smi(smi) for smi in smiles_input]

    featurizer = featurizers.SimpleMoleculeMolGraphFeaturizer()
    test_dset = data.MoleculeDataset(test_data, featurizer=featurizer)
    test_loader = data.build_dataloader(test_dset, shuffle=False)

    with torch.inference_mode():
        trainer = pl.Trainer(
            logger=False,
            # enable_progress_bar=True,
            accelerator="cuda",
            devices=1,
        )
        test_preds = trainer.predict(mpnn, test_loader)
        test_preds = np.concatenate(test_preds, axis=0)  # Concatenate batches

    # Check if the model is single or multi-target
    tasks = test_preds.shape[1]
    if tasks == 1:
        # Single target model: return a 1D array (# of samples,)
        return test_preds[:, 0]
    
    # Multi-target model: return a 2D array (# of targets, # of samples)
    # Can be decomposed into multiple 1D arrays
    preds_tasks = np.array([test_preds[:, i] for i in range(tasks)])
    return preds_tasks

for target in targets:
    print(f"Evaluating {target} - pXC50")
    df_test = pd.read_csv(f"../data/Assays-pXC50/{target}.csv").query("split == 'test'")
    mpnn = models.MPNN.load_from_checkpoint(f'../checkpoints/{target}/last.ckpt')
    
    smiles_input = df_test["SMILES"].tolist()
    test_preds = run_mpnn_on_smiles(smiles_input, mpnn)

    # Skip molecules that failed to be processed (prediction is nan)
    df_test["pXC50_pred"] = test_preds
    print(df_test['pXC50_pred'].isna().sum())
    df_test = df_test.dropna(subset=["pXC50_pred"])
    y_true = df_test["pXC50"].values
    test_preds = df_test["pXC50_pred"].values

    mse = mean_squared_error(y_true, test_preds)
    r2 = r2_score(y_true, test_preds)
    mae = mean_absolute_error(y_true, test_preds)

    print(f"Mean Squared Error: {mse}")
    print(f"R2 Score: {r2}")
    print(f"Mean Absolute Error: {mae}\n")
    mpnn_metrics[target] = {"mse": mse, "r2": r2, "mae": mae}

metrics_df = pd.DataFrame(mpnn_metrics).T
metrics_df

Evaluating AChE - pXC50


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/arthurcerveira/miniconda3/envs/chemprop/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Predicting DataLoader 0: 100%|██████████| 12/12 [00:00<00:00, 38.11it/s]
0
Mean Squared Error: 0.7809265835244809
R2 Score: 0.6444298317718669
Mean Absolute Error: 0.6370649826808609

Evaluating D2R - pXC50


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/arthurcerveira/miniconda3/envs/chemprop/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Predicting DataLoader 0: 100%|██████████| 24/24 [00:00<00:00, 36.35it/s]
0
Mean Squared Error: 0.5661063647845489
R2 Score: 0.5604960482204995
Mean Absolute Error: 0.55996967456258

Evaluating D3R - pXC50


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/arthurcerveira/miniconda3/envs/chemprop/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Predicting DataLoader 0: 100%|██████████| 10/10 [00:00<00:00, 39.07it/s]
0
Mean Squared Error: 0.7005197744556219
R2 Score: 0.5589170066868601
Mean Absolute Error: 0.6386870237333011

Evaluating _5HT2A - pXC50


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/arthurcerveira/miniconda3/envs/chemprop/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Predicting DataLoader 0: 100%|██████████| 12/12 [00:00<00:00, 39.54it/s]
0
Mean Squared Error: 0.6631354193855045
R2 Score: 0.5542842896255107
Mean Absolute Error: 0.6188471763604864

Evaluating MAOB - pXC50


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/arthurcerveira/miniconda3/envs/chemprop/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Predicting DataLoader 0: 100%|██████████| 7/7 [00:00<00:00, 47.57it/s] 
0
Mean Squared Error: 0.8941769350421328
R2 Score: 0.5860250536719389
Mean Absolute Error: 0.7292846486737211



Unnamed: 0,mse,r2,mae
AChE,0.780927,0.64443,0.637065
D2R,0.566106,0.560496,0.55997
D3R,0.70052,0.558917,0.638687
_5HT2A,0.663135,0.554284,0.618847
MAOB,0.894177,0.586025,0.729285


In [5]:
"""
The multi-target datasets are created by concatenating the pXC50 values of the two targets.
For each target combination, we will have a CSV file with the following columns:
- SMILES, pXC50_{target1}, pXC50_{target2}, split
The pXC50_{target1} and pXC50_{target2} columns are the pXC50 values of the two targets.
To fill the missing values, we will predict the pXC50 values using the trained MPNN models.
We must maintain the same train/val/test split for the multi-target datasets.
"""
target_combinations = (
    # Alzheimers
    ("AChE", "MAOB"),
    # Schizophrenia
    ("D2R", "_5HT2A"),
    # Parkinsons
    ("D2R", "D3R"),
)

CREATE_MULTITARGET_DATASETS = True

if CREATE_MULTITARGET_DATASETS:
    for targets in target_combinations:
        output_csv = f"../data/Multitarget-pXC50/{targets[0]}-{targets[1]}.csv"
        print(f"Creating {output_csv}")

        # Load the datasets
        df_target0 = pd.read_csv(f"../data/Assays-pXC50/{targets[0]}.csv")
        df_target1 = pd.read_csv(f"../data/Assays-pXC50/{targets[1]}.csv")

        print(f"Number of molecules in {targets[0]}: {len(df_target0)}")
        print(f"Number of molecules in {targets[1]}: {len(df_target1)}")

        print("SMILES intersection:", len(set(df_target0["SMILES"]).intersection(set(df_target1["SMILES"]))))

        # Merge the datasets
        df = pd.merge(df_target0, df_target1, on="SMILES", suffixes=(f"_{targets[0]}", f"_{targets[1]}"), how="outer")

        print(f"Number of molecules in the merged dataset: {len(df)}")

        # Predict pXC50_{target0} using the MPNN model
        mpnn_target0 = models.MPNN.load_from_checkpoint(f'../checkpoints/{targets[0]}/last.ckpt')
        smiles_input = df["SMILES"].tolist()
        test_preds = run_mpnn_on_smiles(smiles_input, mpnn_target0)
        preds_target0 = pd.Series(test_preds)
        df[f"pXC50_{targets[0]}"] = df[f"pXC50_{targets[0]}"].fillna(preds_target0)

        # Predict pXC50_{target1} using the MPNN model
        mpnn_target1 = models.MPNN.load_from_checkpoint(f'../checkpoints/{targets[1]}/last.ckpt')
        smiles_input = df["SMILES"].tolist()
        test_preds = run_mpnn_on_smiles(smiles_input, mpnn_target1)
        preds_target1 = pd.Series(test_preds)
        df[f"pXC50_{targets[1]}"] = df[f"pXC50_{targets[1]}"].fillna(preds_target1)

        # Drop lines where pXC50_{target0} or pXC50_{target1} are missing
        df = df.dropna(subset=[f"pXC50_{targets[0]}", f"pXC50_{targets[1]}"])

        df["split"] = df[f"split_{targets[0]}"].fillna(df[f"split_{targets[1]}"])
        df[["SMILES", f"pXC50_{targets[0]}", f"pXC50_{targets[1]}", "split"]].to_csv(output_csv, index=False)

Creating ../data/Multitarget-pXC50/AChE-MAOB.csv
Number of molecules in AChE: 3688
Number of molecules in MAOB: 2232
SMILES intersection: 65
Number of molecules in the merged dataset: 5855


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/arthurcerveira/miniconda3/envs/chemprop/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Predicting DataLoader 0: 100%|██████████| 92/92 [00:02<00:00, 36.93it/s]


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:02<00:00, 38.11it/s]
Creating ../data/Multitarget-pXC50/D2R-_5HT2A.csv
Number of molecules in D2R: 7427
Number of molecules in _5HT2A: 3826
SMILES intersection: 1198
Number of molecules in the merged dataset: 10055


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/arthurcerveira/miniconda3/envs/chemprop/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Predicting DataLoader 0: 100%|██████████| 158/158 [00:04<00:00, 35.97it/s]


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting DataLoader 0: 100%|██████████| 158/158 [00:04<00:00, 36.00it/s]
Creating ../data/Multitarget-pXC50/D2R-D3R.csv
Number of molecules in D2R: 7427
Number of molecules in D3R: 3060
SMILES intersection: 2615
Number of molecules in the merged dataset: 7872


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/arthurcerveira/miniconda3/envs/chemprop/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Predicting DataLoader 0: 100%|██████████| 123/123 [00:03<00:00, 35.73it/s]


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting DataLoader 0: 100%|██████████| 123/123 [00:03<00:00, 35.08it/s]


In [6]:
# Train the multi-target MPNN models
target_combinations = (
    # Alzheimers
    ("AChE", "MAOB"),
    # Schizophrenia
    ("D2R", "_5HT2A"),
    # Parkinsons
    ("D2R", "D3R"),
)

input_path = "../data/Multitarget-pXC50/{target0}-{target1}.csv" # path to your data .csv file
num_workers = 0 # number of workers for dataloader. 0 means using main process for data loading
smiles_column = 'SMILES' # name of the column containing SMILES strings
target_columns = ['pXC50_{target0}', 'pXC50_{target1}'] # list of names of the columns containing targets

TRAIN = True

if TRAIN:
    for target0, target1 in target_combinations:
        print(f"Training model for {(target0, target1)}")
        df_input = pd.read_csv(input_path.format(target0=target0, target1=target1))
        smis = df_input.loc[:, smiles_column].values

        multitarget_columns = [f"pXC50_{target0}", f"pXC50_{target1}"]
        ys = df_input.loc[:, multitarget_columns].values
        
        splits = df_input.loc[:, "split"].values
        all_data = [data.MoleculeDatapoint.from_smi(smi, y) for smi, y in zip(smis, ys)]
        mols = [d.mol for d in all_data]  # RDkit Mol objects are use for structure based splits
        
        # Get indices for train, val, and test from splits column
        train_indices, val_indices, test_indices = (
            np.array(np.where(splits == "train")), 
            np.array(np.where(splits == "val")),
            np.array(np.where(splits == "test"))
        )
        
        train_data, val_data, test_data = data.split_data_by_indices(
            all_data, train_indices, val_indices, test_indices
        )

        featurizer = featurizers.SimpleMoleculeMolGraphFeaturizer()

        train_dset = data.MoleculeDataset(train_data[0], featurizer)
        scaler = train_dset.normalize_targets()

        val_dset = data.MoleculeDataset(val_data[0], featurizer)
        val_dset.normalize_targets(scaler)

        test_dset = data.MoleculeDataset(test_data[0], featurizer)

        train_loader = data.build_dataloader(train_dset, num_workers=num_workers)
        val_loader = data.build_dataloader(val_dset, num_workers=num_workers, shuffle=False)
        test_loader = data.build_dataloader(test_dset, num_workers=num_workers, shuffle=False)

        mp = nn.BondMessagePassing()
        agg = nn.MeanAggregation()
        output_transform = nn.UnscaleTransform.from_standard_scaler(scaler)
        ffn = nn.RegressionFFN(output_transform=output_transform)
        batch_norm = True

        metric_list = [nn.metrics.MSE(), nn.metrics.MAE()] # , nn.metrics.R2Score()] # Only the first metric is used for training and early stopping
        mpnn = models.MPNN(mp, agg, ffn, batch_norm, metric_list)

        # Configure model checkpointing
        checkpointing = ModelCheckpoint(
            f"../checkpoints/{target0}-{target1}",  # Directory where model checkpoints will be saved
            "best-{epoch}-{val_loss:.2f}",  # Filename format for checkpoints, including epoch and validation loss
            "val_loss",  # Metric used to select the best checkpoint (based on validation loss)
            mode="min",  # Save the checkpoint with the lowest validation loss (minimization objective)
            save_last=True,  # Always save the most recent checkpoint, even if it's not the best
            enable_version_counter=False
        )

        trainer = pl.Trainer(
            logger=False,
            enable_checkpointing=True, # Use `True` if you want to save model checkpoints. The checkpoints will be saved in the `checkpoints` folder.
            enable_progress_bar=True,
            accelerator="auto",
            devices=1,
            max_epochs=20, # number of epochs to train for
            callbacks=[checkpointing], # Use the configured checkpoint callback
        )

        trainer.fit(mpnn, train_loader, val_loader)
        results = trainer.test(dataloaders=test_loader)
        print(results)

Training model for ('AChE', 'MAOB')


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/arthurcerveira/miniconda3/envs/chemprop/lib/python3.12/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:654: Checkpoint directory /cache/arthurcerveira/MPNN-MT/checkpoints/AChE-MAOB exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.
/home/arthurcerveira/miniconda3/envs/chemprop/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.

  | Name            | Type               | Params | Mode 
---------------------------------------------------------------
0 | message_passing | BondMessagePassing | 227 K  | train
1 | agg             | MeanAggreg

                                                                            

/home/arthurcerveira/miniconda3/envs/chemprop/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.
  return F.mse_loss(preds, targets, reduction="none")


Epoch 0: 100%|██████████| 66/66 [00:02<00:00, 30.97it/s, train_loss_step=0.864]

  return F.mse_loss(preds, targets, reduction="none")


Epoch 1:   8%|▊         | 5/66 [00:00<00:01, 31.53it/s, train_loss_step=0.754, val_loss=0.999, train_loss_epoch=0.854] 

  return F.mse_loss(preds, targets, reduction="none")


Epoch 19: 100%|██████████| 66/66 [00:02<00:00, 28.05it/s, train_loss_step=0.459, val_loss=0.677, train_loss_epoch=0.616]

`Trainer.fit` stopped: `max_epochs=20` reached.


Epoch 19: 100%|██████████| 66/66 [00:02<00:00, 27.73it/s, train_loss_step=0.459, val_loss=0.677, train_loss_epoch=0.616]

Restoring states from the checkpoint path at /cache/arthurcerveira/MPNN-MT/checkpoints/AChE-MAOB/best-epoch=19-val_loss=0.68.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at /cache/arthurcerveira/MPNN-MT/checkpoints/AChE-MAOB/best-epoch=19-val_loss=0.68.ckpt
/home/arthurcerveira/miniconda3/envs/chemprop/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.



Testing DataLoader 0: 100%|██████████| 19/19 [00:00<00:00, 37.05it/s]


[{'test/mse': 1.071441411972046, 'test/mae': 0.8170800805091858}]
Training model for ('D2R', '_5HT2A')


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/arthurcerveira/miniconda3/envs/chemprop/lib/python3.12/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:654: Checkpoint directory /cache/arthurcerveira/MPNN-MT/checkpoints/D2R-_5HT2A exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.
/home/arthurcerveira/miniconda3/envs/chemprop/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.

  | Name            | Type               | Params | Mode 
---------------------------------------------------------------
0 | message_passing | BondMessagePassing | 227 K  | train
1 | agg             | MeanAggre

                                                                            

/home/arthurcerveira/miniconda3/envs/chemprop/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.
  return F.mse_loss(preds, targets, reduction="none")


Epoch 0: 100%|██████████| 114/114 [00:03<00:00, 30.03it/s, train_loss_step=1.120]

  return F.mse_loss(preds, targets, reduction="none")


Epoch 1:   4%|▍         | 5/114 [00:00<00:03, 30.31it/s, train_loss_step=0.725, val_loss=1.690, train_loss_epoch=0.870]  

  return F.mse_loss(preds, targets, reduction="none")


Epoch 19: 100%|██████████| 114/114 [00:04<00:00, 27.16it/s, train_loss_step=0.317, val_loss=0.558, train_loss_epoch=0.523]

`Trainer.fit` stopped: `max_epochs=20` reached.


Epoch 19: 100%|██████████| 114/114 [00:04<00:00, 26.99it/s, train_loss_step=0.317, val_loss=0.558, train_loss_epoch=0.523]

Restoring states from the checkpoint path at /cache/arthurcerveira/MPNN-MT/checkpoints/D2R-_5HT2A/best-epoch=19-val_loss=0.56.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at /cache/arthurcerveira/MPNN-MT/checkpoints/D2R-_5HT2A/best-epoch=19-val_loss=0.56.ckpt





/home/arthurcerveira/miniconda3/envs/chemprop/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Testing DataLoader 0: 100%|██████████| 32/32 [00:00<00:00, 35.17it/s]


[{'test/mse': 0.6428846716880798, 'test/mae': 0.6115320920944214}]
Training model for ('D2R', 'D3R')


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/arthurcerveira/miniconda3/envs/chemprop/lib/python3.12/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:654: Checkpoint directory /cache/arthurcerveira/MPNN-MT/checkpoints/D2R-D3R exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.
/home/arthurcerveira/miniconda3/envs/chemprop/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.

  | Name            | Type               | Params | Mode 
---------------------------------------------------------------
0 | message_passing | BondMessagePassing | 227 K  | train
1 | agg             | MeanAggregat

                                                                            

/home/arthurcerveira/miniconda3/envs/chemprop/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.
  return F.mse_loss(preds, targets, reduction="none")


Epoch 0: 100%|██████████| 89/89 [00:02<00:00, 30.12it/s, train_loss_step=0.664]

  return F.mse_loss(preds, targets, reduction="none")


Epoch 1:   4%|▍         | 4/89 [00:00<00:02, 29.81it/s, train_loss_step=0.964, val_loss=1.290, train_loss_epoch=0.825] 

  return F.mse_loss(preds, targets, reduction="none")


Epoch 19: 100%|██████████| 89/89 [00:03<00:00, 26.36it/s, train_loss_step=0.656, val_loss=0.474, train_loss_epoch=0.410]

`Trainer.fit` stopped: `max_epochs=20` reached.


Epoch 19: 100%|██████████| 89/89 [00:03<00:00, 26.13it/s, train_loss_step=0.656, val_loss=0.474, train_loss_epoch=0.410]

Restoring states from the checkpoint path at /cache/arthurcerveira/MPNN-MT/checkpoints/D2R-D3R/best-epoch=19-val_loss=0.47.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at /cache/arthurcerveira/MPNN-MT/checkpoints/D2R-D3R/best-epoch=19-val_loss=0.47.ckpt
/home/arthurcerveira/miniconda3/envs/chemprop/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.



Testing DataLoader 0: 100%|██████████| 25/25 [00:00<00:00, 34.40it/s]


[{'test/mse': 0.5692845582962036, 'test/mae': 0.5805091261863708}]


In [7]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import torch
from lightning import pytorch as pl
from chemprop import data, featurizers, models


# Evaluate the multitarget MPNN models on the target-specific test sets
target_combinations = (
    # Alzheimers
    ("AChE", "MAOB"),
    # Schizophrenia
    ("D2R", "_5HT2A"),
    # Parkinsons
    ("D2R", "D3R"),
)

targets = ['AChE', 'D2R', 'D3R', '_5HT2A', 'MAOB']
multitarget_mpnn_metrics = dict()

for target0, target1 in target_combinations:
    print(f"Evaluating {(target0, target1)} - pXC50")
    mpnn = models.MPNN.load_from_checkpoint(f'../checkpoints/{target0}-{target1}/last.ckpt')

    for target in [target0, target1]:
        print(f"- Evaluating on {target} dataset")
        multitarget_mpnn_metrics.setdefault(target, dict())

        df_test = pd.read_csv(f"../data/Assays-pXC50/{target}.csv").query("split == 'test'")        
        smiles_input = df_test["SMILES"].tolist()
        preds_t0, preds_t1 = run_mpnn_on_smiles(smiles_input, mpnn)

        if target == target0:   test_preds = preds_t0
        elif target == target1: test_preds = preds_t1

        # Skip molecules that failed to be processed (prediction is nan)
        df_test["pXC50_pred"] = test_preds
        nan_count = df_test['pXC50_pred'].isna().sum()
        print(f"Number of NaN predictions for {target}: {nan_count}")
        df_test = df_test.dropna(subset=["pXC50_pred"])
        y_true = df_test["pXC50"].values
        test_preds = df_test["pXC50_pred"].values

        mse = mean_squared_error(y_true, test_preds)
        r2 = r2_score(y_true, test_preds)
        mae = mean_absolute_error(y_true, test_preds)

        print(f"* Mean Squared Error: {mse}")
        print(f"* R2 Score: {r2}")
        print(f"* Mean Absolute Error: {mae}\n")
        multitarget_mpnn_metrics[target][f"{target0}_{target1}"] = {"mse": mse, "r2": r2, "mae": mae}

multitarget_mpnn_metrics

Evaluating ('AChE', 'MAOB') - pXC50
- Evaluating on AChE dataset


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/arthurcerveira/miniconda3/envs/chemprop/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Predicting DataLoader 0: 100%|██████████| 12/12 [00:00<00:00, 38.20it/s]
Number of NaN predictions for AChE: 0
* Mean Squared Error: 1.3606314024012394
* R2 Score: 0.38047961632346816
* Mean Absolute Error: 0.9378276905191107

- Evaluating on MAOB dataset


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/arthurcerveira/miniconda3/envs/chemprop/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Predicting DataLoader 0: 100%|██████████| 7/7 [00:00<00:00, 46.33it/s] 
Number of NaN predictions for MAOB: 0
* Mean Squared Error: 1.429479629437975
* R2 Score: 0.33819725192782035
* Mean Absolute Error: 0.9150499585875782

Evaluating ('D2R', '_5HT2A') - pXC50
- Evaluating on D2R dataset


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/arthurcerveira/miniconda3/envs/chemprop/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Predicting DataLoader 0: 100%|██████████| 24/24 [00:00<00:00, 36.70it/s]
Number of NaN predictions for D2R: 0
* Mean Squared Error: 0.8582009658345083
* R2 Score: 0.33372465075745905
* Mean Absolute Error: 0.7216015881402591

- Evaluating on _5HT2A dataset


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/arthurcerveira/miniconda3/envs/chemprop/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Predicting DataLoader 0: 100%|██████████| 12/12 [00:00<00:00, 39.94it/s]
Number of NaN predictions for _5HT2A: 0
* Mean Squared Error: 0.8211905388031981
* R2 Score: 0.4480501061236515
* Mean Absolute Error: 0.7043620530927773

Evaluating ('D2R', 'D3R') - pXC50
- Evaluating on D2R dataset


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/arthurcerveira/miniconda3/envs/chemprop/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Predicting DataLoader 0: 100%|██████████| 24/24 [00:00<00:00, 37.81it/s]
Number of NaN predictions for D2R: 0
* Mean Squared Error: 0.7173887937507349
* R2 Score: 0.4430459902428875
* Mean Absolute Error: 0.6552236086348152

- Evaluating on D3R dataset


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/arthurcerveira/miniconda3/envs/chemprop/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Predicting DataLoader 0: 100%|██████████| 10/10 [00:00<00:00, 40.11it/s]
Number of NaN predictions for D3R: 0
* Mean Squared Error: 0.6073677154200291
* R2 Score: 0.6175702960456524
* Mean Absolute Error: 0.6080584795385872



{'AChE': {'AChE_MAOB': {'mse': 1.3606314024012394,
   'r2': 0.38047961632346816,
   'mae': 0.9378276905191107}},
 'MAOB': {'AChE_MAOB': {'mse': 1.429479629437975,
   'r2': 0.33819725192782035,
   'mae': 0.9150499585875782}},
 'D2R': {'D2R__5HT2A': {'mse': 0.8582009658345083,
   'r2': 0.33372465075745905,
   'mae': 0.7216015881402591},
  'D2R_D3R': {'mse': 0.7173887937507349,
   'r2': 0.4430459902428875,
   'mae': 0.6552236086348152}},
 '_5HT2A': {'D2R__5HT2A': {'mse': 0.8211905388031981,
   'r2': 0.4480501061236515,
   'mae': 0.7043620530927773}},
 'D3R': {'D2R_D3R': {'mse': 0.6073677154200291,
   'r2': 0.6175702960456524,
   'mae': 0.6080584795385872}}}

In [8]:
# Combine with target-specific MPNN metrics
for target in multitarget_mpnn_metrics:
    multitarget_mpnn_metrics[target]["target-specific"] = mpnn_metrics[target]

# Create multi-index DataFrame
multi_index = pd.MultiIndex.from_tuples(
    [(target, model) for target in multitarget_mpnn_metrics for model in multitarget_mpnn_metrics[target]],
    names=["target", "model"]
)

metrics_df = pd.DataFrame(
    [multitarget_mpnn_metrics[target][model] for target in multitarget_mpnn_metrics for model in multitarget_mpnn_metrics[target]],
    index=multi_index
)
metrics_df.map(lambda x: round(x, 2))

Unnamed: 0_level_0,Unnamed: 1_level_0,mse,r2,mae
target,model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AChE,AChE_MAOB,1.36,0.38,0.94
AChE,target-specific,0.78,0.64,0.64
MAOB,AChE_MAOB,1.43,0.34,0.92
MAOB,target-specific,0.89,0.59,0.73
D2R,D2R__5HT2A,0.86,0.33,0.72
D2R,D2R_D3R,0.72,0.44,0.66
D2R,target-specific,0.57,0.56,0.56
_5HT2A,D2R__5HT2A,0.82,0.45,0.7
_5HT2A,target-specific,0.66,0.55,0.62
D3R,D2R_D3R,0.61,0.62,0.61
