# Training with chemprop

In [1]:
import os
import pandas as pd
from lightning import pytorch as pl
from lightning.pytorch.callbacks import ModelCheckpoint
from chemprop import data, featurizers, models, nn

from chemprop.training import output_transform


In [2]:
path = os.getcwd ()
path

'C:\\Users\\leonz\\PyCharmMiscProject\\chemprop'

In [3]:
filename = "regression.csv"
path_2_file = path + "\\" + filename
path_2_file

'C:\\Users\\leonz\\PyCharmMiscProject\\chemprop\\regression.csv'

In [5]:
## Load the csv file
datos = pd.read_csv (path_2_file)
datos

Unnamed: 0,smiles,logSolubility
0,OCC3OC(OCC2OC(OC(C#N)c1ccccc1)C(O)C(O)C2O)C(O)...,-0.770
1,Cc1occc1C(=O)Nc2ccccc2,-3.300
2,CC(C)=CCCC(C)=CC(=O),-2.060
3,c1ccc2c(c1)ccc3c2ccc4c5ccccc5ccc43,-7.870
4,c1ccsc1,-1.330
...,...,...
495,Nc1cc(nc(N)n1=O)N2CCCCC2,-1.989
496,Nc2cccc3nc1ccccc1cc23,-4.220
497,c1ccc2cc3c4cccc5cccc(c3cc2c1)c45,-8.490
498,OC(c1ccc(Cl)cc1)(c2ccc(Cl)cc2)C(Cl)(Cl)Cl,-5.666


# Get smiles and targets

In [6]:
## Column informations
smiles_column = 'smiles'
target_columns = ['logSolubility']

In [7]:
# Get smiles and targets
smles = datos.loc [:, smiles_column].values
ys = datos.loc [:, target_columns].values

In [8]:
smles

array(['OCC3OC(OCC2OC(OC(C#N)c1ccccc1)C(O)C(O)C2O)C(O)C(O)C3O',
       'Cc1occc1C(=O)Nc2ccccc2', 'CC(C)=CCCC(C)=CC(=O)',
       'c1ccc2c(c1)ccc3c2ccc4c5ccccc5ccc43', 'c1ccsc1', 'c2ccc1scnc1c2',
       'Clc1cc(Cl)c(c(Cl)c1)c2c(Cl)cccc2Cl',
       'CC12CCC3C(CCc4cc(O)ccc34)C2CCC1O',
       'ClC4=C(Cl)C5(Cl)C3C1CC(C2OC12)C3C4(Cl)C5(Cl)Cl',
       'COc5cc4OCC3Oc2c1CC(Oc1ccc2C(=O)C3c4cc5OC)C(C)=C', 'O=C1CCCN1',
       'Clc1ccc2ccccc2c1', 'CCCC=C', 'CCC1(C(=O)NCNC1=O)c2ccccc2',
       'CCCCCCCCCCCCCC', 'CC(C)Cl', 'CCC(C)CO', 'N#Cc1ccccc1',
       'CCOP(=S)(OCC)Oc1cc(C)nc(n1)C(C)C', 'CCCCCCCCCC(C)O',
       'Clc1ccc(c(Cl)c1)c2c(Cl)ccc(Cl)c2Cl',
       'O=c2[nH]c1CCCc1c(=O)n2C3CCCCC3', 'CCOP(=S)(OCC)SCSCC',
       'CCOc1ccc(NC(=O)C)cc1',
       'CCN(CC)c1c(cc(c(N)c1N(=O)=O)C(F)(F)F)N(=O)=O', 'CCCCCCCO',
       'Cn1c(=O)n(C)c2nc[nH]c2c1=O', 'CCCCC1(CC)C(=O)NC(=O)NC1=O',
       'ClC(Cl)=C(c1ccc(Cl)cc1)c2ccc(Cl)cc2', 'CCCCCCCC(=O)OC',
       'CCc1ccc(CC)cc1', 'CCOP(=S)(OCC)SCSC(C)(C)C',
       'C

In [9]:
ys

array([[-0.77 ],
       [-3.3  ],
       [-2.06 ],
       [-7.87 ],
       [-1.33 ],
       [-1.5  ],
       [-7.32 ],
       [-5.03 ],
       [-6.29 ],
       [-4.42 ],
       [ 1.07 ],
       [-4.14 ],
       [-2.68 ],
       [-2.64 ],
       [-7.96 ],
       [-1.41 ],
       [-0.47 ],
       [-1.   ],
       [-3.64 ],
       [-2.94 ],
       [-7.43 ],
       [-4.594],
       [-4.11 ],
       [-2.35 ],
       [-5.47 ],
       [-1.81 ],
       [-1.39 ],
       [-1.661],
       [-6.9  ],
       [-3.17 ],
       [-3.75 ],
       [-4.755],
       [-4.805],
       [-1.64 ],
       [-5.22 ],
       [-0.85 ],
       [-3.504],
       [-3.927],
       [-4.15 ],
       [-4.81 ],
       [-4.88 ],
       [-0.15 ],
       [-1.655],
       [-2.53 ],
       [-0.63 ],
       [-1.55 ],
       [-3.083],
       [-3.127],
       [-4.76 ],
       [-1.94 ],
       [-3.66 ],
       [-2.17 ],
       [-8.057],
       [-2.523],
       [-8.6  ],
       [-0.62 ],
       [-5.05 ],
       [-4.14 ],
       [-6.57 

# Get datapoints

In [10]:
real_data = [ data.MoleculeDatapoint.from_smi (smi, y) for smi, y in zip(smles, ys)]
real_data [0]

MoleculeDatapoint(mol=<rdkit.Chem.rdchem.Mol object at 0x000001BC685A3ED0>, y=array([-0.77]), weight=1.0, gt_mask=None, lt_mask=None, x_d=None, x_phase=None, name='OCC3OC(OCC2OC(OC(C#N)c1ccccc1)C(O)C(O)C2O)C(O)C(O)C3O', V_f=None, E_f=None, V_d=None)

# Train, test, valid split

In [11]:
mols =[ d.mol for d in real_data ] # Create RDkit objects to perform structure based split => Prevents that the sets containing similar molecules
mols

[<rdkit.Chem.rdchem.Mol at 0x1bc685a3ed0>,
 <rdkit.Chem.rdchem.Mol at 0x1bc685a3bc0>,
 <rdkit.Chem.rdchem.Mol at 0x1bc685a3d80>,
 <rdkit.Chem.rdchem.Mol at 0x1bc685a3610>,
 <rdkit.Chem.rdchem.Mol at 0x1bc685a3e60>,
 <rdkit.Chem.rdchem.Mol at 0x1bc685a34c0>,
 <rdkit.Chem.rdchem.Mol at 0x1bc685a0740>,
 <rdkit.Chem.rdchem.Mol at 0x1bc685df0d0>,
 <rdkit.Chem.rdchem.Mol at 0x1bc685df1b0>,
 <rdkit.Chem.rdchem.Mol at 0x1bc684eff40>,
 <rdkit.Chem.rdchem.Mol at 0x1bc684edb60>,
 <rdkit.Chem.rdchem.Mol at 0x1bc684edc40>,
 <rdkit.Chem.rdchem.Mol at 0x1bc684edd20>,
 <rdkit.Chem.rdchem.Mol at 0x1bc684ede00>,
 <rdkit.Chem.rdchem.Mol at 0x1bc684edee0>,
 <rdkit.Chem.rdchem.Mol at 0x1bc684edfc0>,
 <rdkit.Chem.rdchem.Mol at 0x1bc684ee0a0>,
 <rdkit.Chem.rdchem.Mol at 0x1bc684ee180>,
 <rdkit.Chem.rdchem.Mol at 0x1bc684ee2d0>,
 <rdkit.Chem.rdchem.Mol at 0x1bc684ee420>,
 <rdkit.Chem.rdchem.Mol at 0x1bc684ee500>,
 <rdkit.Chem.rdchem.Mol at 0x1bc684ee5e0>,
 <rdkit.Chem.rdchem.Mol at 0x1bc684ed9a0>,
 <rdkit.Che

In [12]:
# Here indices are obteined for each group
train_indices, val_indices, test_indices = data.make_split_indices(mols, "random", (0.8,0.1,0.1))
train_indices

The return type of make_split_indices has changed in v2.1 - see help(make_split_indices)


[[np.int64(254),
  np.int64(283),
  np.int64(445),
  np.int64(461),
  np.int64(15),
  np.int64(316),
  np.int64(489),
  np.int64(159),
  np.int64(153),
  np.int64(241),
  np.int64(250),
  np.int64(390),
  np.int64(289),
  np.int64(171),
  np.int64(329),
  np.int64(468),
  np.int64(355),
  np.int64(154),
  np.int64(37),
  np.int64(205),
  np.int64(366),
  np.int64(240),
  np.int64(108),
  np.int64(45),
  np.int64(438),
  np.int64(21),
  np.int64(367),
  np.int64(96),
  np.int64(233),
  np.int64(428),
  np.int64(118),
  np.int64(124),
  np.int64(191),
  np.int64(374),
  np.int64(492),
  np.int64(311),
  np.int64(451),
  np.int64(353),
  np.int64(238),
  np.int64(322),
  np.int64(46),
  np.int64(403),
  np.int64(221),
  np.int64(76),
  np.int64(1),
  np.int64(213),
  np.int64(325),
  np.int64(418),
  np.int64(102),
  np.int64(363),
  np.int64(170),
  np.int64(343),
  np.int64(144),
  np.int64(132),
  np.int64(12),
  np.int64(327),
  np.int64(173),
  np.int64(224),
  np.int64(342),
  np.in

In [13]:
# Splitting the dataset using this indices
train_data, val_data, test_data = data.split_data_by_indices(
    real_data, train_indices, val_indices, test_indices
)
train_data

[[MoleculeDatapoint(mol=<rdkit.Chem.rdchem.Mol object at 0x000001BC685C52A0>, y=array([-5.35]), weight=1.0, gt_mask=None, lt_mask=None, x_d=None, x_phase=None, name='CC(=O)OC3(CCC4C2C=C(C)C1=CC(=O)CCC1(C)C2CCC34C)C(C)=O', V_f=None, E_f=None, V_d=None),
  MoleculeDatapoint(mol=<rdkit.Chem.rdchem.Mol object at 0x000001BC685C6C00>, y=array([-3.21]), weight=1.0, gt_mask=None, lt_mask=None, x_d=None, x_phase=None, name='CCc1ccccc1C', V_f=None, E_f=None, V_d=None),
  MoleculeDatapoint(mol=<rdkit.Chem.rdchem.Mol object at 0x000001BC685636F0>, y=array([-4.56]), weight=1.0, gt_mask=None, lt_mask=None, x_d=None, x_phase=None, name='Brc1ccc(I)cc1', V_f=None, E_f=None, V_d=None),
  MoleculeDatapoint(mol=<rdkit.Chem.rdchem.Mol object at 0x000001BC6857C580>, y=array([-3.03]), weight=1.0, gt_mask=None, lt_mask=None, x_d=None, x_phase=None, name='CCN(CC)c1ccccc1', V_f=None, E_f=None, V_d=None),
  MoleculeDatapoint(mol=<rdkit.Chem.rdchem.Mol object at 0x000001BC684EDFC0>, y=array([-1.41]), weight=1.0, 

# Get MolecularDatasets

In [1]:
featurizer = featurizers.SimpleMoleculeMolGraphFeaturizer ()

train_dset = data.MoleculeDataset (train_data [0], featurizer)
scaler = train_dset.scaler

val_dset = data.MoleculeDataset(val_data [0], featurizer)
val_dset.normalize_targets(scaler)

test_dset = data.MoleculeDataset(test_data [0], featurizer)


NameError: name 'featurizers' is not defined

# Get DataLoader

In [37]:
train_loader = data.build_dataloader(train_dset, num_workers=15)
val_loader = data.build_dataloader(val_dset, num_workers=15, shuffle=False)
test_loader = data.build_dataloader(test_dset, num_workers=15, shuffle=False)

# Message Passing Neural Networks

In [23]:
# output_transform = nn.UnscaleTransform.from_standard_scaler(scaler)
mp = nn.BondMessagePassing ()

# Aggregation

In [24]:
agg = nn.MeanAggregation ()

# Feed-Forward Networkd (FFN)

In [25]:
#output_transform = nn.UnscaleTransform.from_standard_scaler(scaler)
ffn = nn.RegressionFFN ()

# Batch Normalization


In [32]:
batch_norm = True

# Metrics

In [27]:
print(nn.metrics.MetricRegistry)

ClassRegistry {
    'mse': <class 'chemprop.nn.metrics.MSE'>,
    'mae': <class 'chemprop.nn.metrics.MAE'>,
    'rmse': <class 'chemprop.nn.metrics.RMSE'>,
    'bounded-mse': <class 'chemprop.nn.metrics.BoundedMSE'>,
    'bounded-mae': <class 'chemprop.nn.metrics.BoundedMAE'>,
    'bounded-rmse': <class 'chemprop.nn.metrics.BoundedRMSE'>,
    'r2': <class 'chemprop.nn.metrics.R2Score'>,
    'binary-mcc': <class 'chemprop.nn.metrics.BinaryMCCMetric'>,
    'multiclass-mcc': <class 'chemprop.nn.metrics.MulticlassMCCMetric'>,
    'roc': <class 'chemprop.nn.metrics.BinaryAUROC'>,
    'prc': <class 'chemprop.nn.metrics.BinaryAUPRC'>,
    'accuracy': <class 'chemprop.nn.metrics.BinaryAccuracy'>,
    'f1': <class 'chemprop.nn.metrics.BinaryF1Score'>
}


In [30]:
metrics = [nn.metrics.RMSE (), nn.metrics.MSE ()]

# Constructing the mpnn

In [33]:
mpnn = models.MPNN (mp,agg, ffn, batch_norm, metrics)
mpnn

MPNN(
  (message_passing): BondMessagePassing(
    (W_i): Linear(in_features=86, out_features=300, bias=False)
    (W_h): Linear(in_features=300, out_features=300, bias=False)
    (W_o): Linear(in_features=372, out_features=300, bias=True)
    (dropout): Dropout(p=0.0, inplace=False)
    (tau): ReLU()
    (V_d_transform): Identity()
    (graph_transform): Identity()
  )
  (agg): MeanAggregation()
  (bn): BatchNorm1d(300, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (predictor): RegressionFFN(
    (ffn): MLP(
      (0): Sequential(
        (0): Linear(in_features=300, out_features=300, bias=True)
      )
      (1): Sequential(
        (0): ReLU()
        (1): Dropout(p=0.0, inplace=False)
        (2): Linear(in_features=300, out_features=1, bias=True)
      )
    )
    (criterion): MSE(task_weights=[[1.0]])
    (output_transform): Identity()
  )
  (X_d_transform): Identity()
  (metrics): ModuleList(
    (0): RMSE(task_weights=[[1.0]])
    (1-2): 2 x MSE(task_weights

# Trainer

In [38]:
# Configure model checkpointing
checkpointing = ModelCheckpoint(
    "checkpoints",  # Directory where model checkpoints will be saved
    "best-{epoch}-{val_loss:.2f}",  # Filename format for checkpoints, including epoch and validation loss
    "val_loss",  # Metric used to select the best checkpoint (based on validation loss)
    mode="min",  # Save the checkpoint with the lowest validation loss (minimization objective)
    save_last=True,  # Always save the most recent checkpoint, even if it's not the best
)


trainer = pl.Trainer(
    logger=False,
    enable_checkpointing=True, # Use `True` if you want to save model checkpoints. The checkpoints will be saved in the `checkpoints` folder.
    enable_progress_bar=True,
    accelerator="auto",
    devices=1,
    max_epochs=20, # number of epochs to train for
    callbacks=[checkpointing], # Use the configured checkpoint callback
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores


In [None]:
trainer.fit(mpnn, train_loader, val_loader)

C:\Users\leonz\anaconda3\envs\chemprop_env\Lib\site-packages\lightning\pytorch\callbacks\model_checkpoint.py:881: Checkpoint directory C:\Users\leonz\PyCharmMiscProject\chemprop\checkpoints exists and is not empty.
Loading `train_dataloader` to estimate number of stepping batches.
C:\Users\leonz\anaconda3\envs\chemprop_env\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:429: Consider setting `persistent_workers=True` in 'train_dataloader' to speed up the dataloader worker initialization.
