In [1]:
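# Enable IPython's autoreload extension so that edits to imported modules
# are picked up before each cell runs, without restarting the kernel.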
%load_ext autoreload
%autoreload 2
import os
import sys

# Root of the local cmpnn_revised checkout that provides the `cmpnn` package.
# Set the CMPNN_ROOT environment variable to run this notebook on another
# machine; the default keeps the original hard-coded location.
CMPNN_ROOT = os.environ.get("CMPNN_ROOT", "/home/calvin/code/cmpnn_revised")

# Guard the append so re-running this cell does not pile up duplicate entries.
if CMPNN_ROOT not in sys.path:
    sys.path.append(CMPNN_ROOT)

In [18]:

from cmpnn.featurizer.molecule_dataset import MultiMoleculeDataset
from cmpnn.featurizer.atom_bond import AtomFeaturizer, BondFeaturizer
from cmpnn.featurizer.global_feat import CompositeGlobalFeaturizer, MorganBinaryFeaturizer, RDKit2DNormalizedFeaturizer

In [22]:
# Absolute location of the multi-molecule CSV consumed by MultiMoleculeDataset.
# NOTE(review): machine-specific path; adjust when running elsewhere.
csv_file = (
    "/home/calvin/code/cmpnn_revised"
    "/cmpnn/mol_data/multi_mols.csv"
)

In [23]:
# Featurizers are built up front, in the same order the dataset call needs them.
atom_featurizer = AtomFeaturizer(v2=False)
bond_featurizer = BondFeaturizer(v2=False)

# Two global featurizers combined into one; the recorded sample output shows
# a resulting global_features vector of length 1224.
global_featurizer = CompositeGlobalFeaturizer(
    featurizers=[
        MorganBinaryFeaturizer(radius=2, length=1024),
        RDKit2DNormalizedFeaturizer(),
    ]
)

# Despite its name, `loader` is the dataset object; it is wrapped in a
# torch DataLoader in a later cell.
loader = MultiMoleculeDataset(
    csv_file=csv_file,
    atom_featurizer=atom_featurizer,
    bond_featurizer=bond_featurizer,
    global_featurizer=global_featurizer,
    atom_messages=True,
)

Using all atomic numbers from 1 to 100
Loading cached dataset from /home/calvin/code/cmpnn_revised/cmpnn/mol_data/multi_mols_cache.pt


In [24]:
# Inspect the first dataset entry. The recorded output is a list of two
# MoleculeData objects (presumably one per molecule component of the sample),
# with 133-wide atom features, 14-wide bond features and 1224 global features.
loader[0]

[MoleculeData(f_atoms=[24, 133], f_bonds=[54, 14], a2b=[24], b2a=[54], a_scope=[1], b_scope=[1], global_features=[1224], y=[1], bonds=[27, 2], smiles='Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14', b2revb=[54]),
 MoleculeData(f_atoms=[33, 133], f_bonds=[70, 14], a2b=[33], b2a=[70], a_scope=[1], b_scope=[1], global_features=[1224], y=[1], bonds=[35, 2], smiles='CCN(CCN(C)C)S(=O)(=O)c1ccc(cc1)c2cnc(N)c(n2)C(=O)Nc3cccnc3', b2revb=[70])]

In [25]:
from torch.utils.data import DataLoader

# Batch the dataset in groups of 32 using the project's multi-molecule collate.
# NOTE(review): `multi_collate_fn` is not imported in any visible cell, so this
# line only works if the name already lives in the kernel; add the import
# before relying on Restart & Run All.
dataloaded = DataLoader(
    loader,
    batch_size=32,
    collate_fn=multi_collate_fn,
)

In [26]:
# Pull just the first batch and show it as a quick sanity check.
try:
    batch = next(iter(dataloaded))
except StopIteration:
    # An empty loader yields nothing, so nothing is printed.
    pass
else:
    print(batch)

[from_data_list] Init: 0.0002s | Loop: 0.0148s | Finalize: 0.0023s | Total: 0.0173s
[from_data_list] Init: 0.0001s | Loop: 0.0143s | Finalize: 0.0025s | Total: 0.0169s
MultiMoleculeDataBatch(n_samples=32, n_components=2)


In [32]:
from cmpnn.models.lightning import MultiCMPNNLightningModule

# Feature widths mirror the shapes shown for loader[0] in the recorded output:
# f_atoms=[*, 133], f_bonds=[*, 14], global_features=[1224].
# NOTE(review): these are hard-coded and must be kept in sync with the
# featurizers passed to MultiMoleculeDataset.
model_options = dict(
    atom_fdim=133,
    bond_fdim=14,
    global_fdim=1224,
    shared_encoder=False,
    atom_messages=True,
    booster='attention',
)
multi_model = MultiCMPNNLightningModule(**model_options)

In [33]:
from pytorch_lightning import Trainer

# Ten-epoch run on a single GPU with the progress bar switched on.
trainer_options = dict(
    max_epochs=10,
    accelerator='gpu',
    devices=1,
    enable_progress_bar=True,
)
trainer = Trainer(**trainer_options)

# Only a training DataLoader is supplied; the recorded output shows Lightning
# skipping the validation loop for that reason.
trainer.fit(model=multi_model, train_dataloaders=dataloaded)

You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/calvin/miniforge3/envs/dmpnn_rocm/lib/python3.10/site-packages/pytorch_lightning/trainer/configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type           | Params | Mode 
------------------------------------------------------
0 | encoders   | ModuleList     | 663 K  | train
1 | aggregator | MeanAggregator | 0      | train
2 | bn         | Identity       | 0      | train
3 | ffn        | MLP            | 346 K  | train
4 | metrics    | ModuleDict     | 0      | train
------------------------------------------------------
1.0 M     Trainable params
0         Non-trainable params
1.0 M     Total params
4.040     Total 

Training: |          | 0/? [00:00<?, ?it/s]

[from_data_list] Init: 0.0002s | Loop: 0.0243s | Finalize: 0.0044s | Total: 0.0289s
[from_data_list] Init: 0.0002s | Loop: 0.0167s | Finalize: 0.0031s | Total: 0.0200s
[from_data_list] Init: 0.0001s | Loop: 0.0140s | Finalize: 0.0024s | Total: 0.0165s
[from_data_list] Init: 0.0001s | Loop: 0.0141s | Finalize: 0.0022s | Total: 0.0164s
[from_data_list] Init: 0.0001s | Loop: 0.0142s | Finalize: 0.0023s | Total: 0.0166s
[from_data_list] Init: 0.0001s | Loop: 0.0144s | Finalize: 0.0023s | Total: 0.0169s
[from_data_list] Init: 0.0001s | Loop: 0.0020s | Finalize: 0.0005s | Total: 0.0026s
[from_data_list] Init: 0.0001s | Loop: 0.0017s | Finalize: 0.0003s | Total: 0.0021s
[from_data_list] Init: 0.0002s | Loop: 0.0145s | Finalize: 0.0026s | Total: 0.0173s
[from_data_list] Init: 0.0001s | Loop: 0.0142s | Finalize: 0.0023s | Total: 0.0167s
[from_data_list] Init: 0.0001s | Loop: 0.0141s | Finalize: 0.0023s | Total: 0.0166s
[from_data_list] Init: 0.0001s | Loop: 0.0139s | Finalize: 0.0025s | Total: 

`Trainer.fit` stopped: `max_epochs=10` reached.


[from_data_list] Init: 0.0001s | Loop: 0.0137s | Finalize: 0.0023s | Total: 0.0161s
[from_data_list] Init: 0.0001s | Loop: 0.0138s | Finalize: 0.0022s | Total: 0.0161s
[from_data_list] Init: 0.0001s | Loop: 0.0019s | Finalize: 0.0004s | Total: 0.0025s
[from_data_list] Init: 0.0001s | Loop: 0.0017s | Finalize: 0.0003s | Total: 0.0020s


In [34]:
# Run the test loop and display the returned metrics.
# NOTE(review): this is the same DataLoader that was passed to trainer.fit,
# so the reported RMSE/MAE/R2 are measured on the training data rather than
# on a held-out split.
trainer.test(model=multi_model, dataloaders=dataloaded)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


/home/calvin/miniforge3/envs/dmpnn_rocm/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

[from_data_list] Init: 0.0003s | Loop: 0.0135s | Finalize: 0.0032s | Total: 0.0170s
[from_data_list] Init: 0.0002s | Loop: 0.0165s | Finalize: 0.0029s | Total: 0.0195s
[from_data_list] Init: 0.0005s | Loop: 0.0247s | Finalize: 0.0033s | Total: 0.0285s
[from_data_list] Init: 0.0002s | Loop: 0.0188s | Finalize: 0.0046s | Total: 0.0236s
[from_data_list] Init: 0.0001s | Loop: 0.0131s | Finalize: 0.0025s | Total: 0.0157s
[from_data_list] Init: 0.0001s | Loop: 0.0128s | Finalize: 0.0024s | Total: 0.0153s
[from_data_list] Init: 0.0002s | Loop: 0.0019s | Finalize: 0.0004s | Total: 0.0025s
[from_data_list] Init: 0.0001s | Loop: 0.0016s | Finalize: 0.0003s | Total: 0.0019s


[{'RMSE': 0.32477355003356934,
  'MAE': 0.2376757115125656,
  'R2': 0.9394694566726685}]