In [1]:
!pip install git+https://github.com/bhanum1/chemprop.git
from chemprop import data
import pandas as pd

Collecting git+https://github.com/bhanum1/chemprop.git
  Cloning https://github.com/bhanum1/chemprop.git to /tmp/pip-req-build-aql13dlj
  Running command git clone --filter=blob:none --quiet https://github.com/bhanum1/chemprop.git /tmp/pip-req-build-aql13dlj
  Resolved https://github.com/bhanum1/chemprop.git to commit f1f52e1b011352a8edfeb61efac76c28d3c3e413
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone


In [2]:
from chemprop.cli.utils.parsing import build_data_from_files, make_dataset
from pathlib import Path

# Settings passed to every dataloader built later in this cell.
batch_size, num_workers = 64, 0

# Read the dataset CSV into a DataFrame.
input_path = '/home/bhanu/Documents/GitHub/Thermal_Fluid_Prediction_GNN/Datasets/viscosity_data.csv'
df = pd.read_csv(input_path)

# Columns consumed when building the datapoints: SMILES string, target value,
# temperature and lnA.
smis, targets, temps, lnA_targets = (
    df[column] for column in ('smiles', 'Viscosity', 'temperature', 'lnA')
)

# One datapoint per CSV row; `temp` and `lnA_target` are forwarded as keyword
# arguments to `MoleculeDatapoint.from_smi`.
all_data = [
    data.MoleculeDatapoint.from_smi(smiles, target, temp=temperature, lnA_target=ln_a)
    for smiles, target, temperature, ln_a in zip(smis, targets, temps, lnA_targets)
]

# RDKit Mol objects, intended for structure-based splits (the split performed
# in this cell is driven by the 'splits' column instead).
mols = [datapoint.mol for datapoint in all_data]


# Group the rows by the lower-cased value of the 'splits' column.
grouped = df.groupby(df['splits'].str.lower())
rows_by_split = grouped.groups

# Row labels for each split (an empty list when a split name is absent), each
# wrapped in an outer single-element list before being handed to
# `split_data_by_indices`.
train_indices = [rows_by_split.get("train", pd.Index([])).tolist()]
val_indices = [rows_by_split.get("val", pd.Index([])).tolist()]
test_indices = [rows_by_split.get("test", pd.Index([])).tolist()]

# Keep only the first element of each returned collection, matching the single
# index list supplied per split above.
train_data, val_data, test_data = (
    per_split[0]
    for per_split in data.split_data_by_indices(
        all_data, train_indices, val_indices, test_indices
    )
)

# Wrap each split in a dataset, in train/val/test order.
train_dset, val_dset, test_dset = (
    make_dataset(split_points, reaction_mode='REAC_PROD')
    for split_points in (train_data, val_data, test_data)
)

# Build one dataloader per dataset with the shared batch size and worker count.
# NOTE(review): shuffling is disabled for the training loader as well as for
# val/test — confirm this is intentional.
train_loader, val_loader, test_loader = (
    data.build_dataloader(dset, batch_size=batch_size, num_workers=num_workers, shuffle=False)
    for dset in (train_dset, val_dset, test_dset)
)


In [3]:
class Args():
    """Attribute container passed as the ``args`` argument to ``train_model``.

    Every constructor argument is stored on the instance under the same name.
    ``data_path`` and ``output_dir`` are converted to ``pathlib.Path``.

    Positional parameters (stored unchanged unless noted):
        data_path, output_dir: path-like; stored as ``Path`` objects.
        ensemble_size, message_hidden_dim, depth, dropout, activation,
        aggregation, aggregation_norm, ffn_hidden_dim, ffn_num_layers,
        loss_reg, model_frzn, warmup_epochs: model/training settings.
        target_columns: a single column name (str) or a sequence of column
            names; always stored as a flat list of names.

    Keyword-only parameters (defaults equal the values that were previously
    hard-coded, so existing calls behave the same):
        init_lr, max_lr, final_lr, epochs, accelerator, devices.
    """

    def __init__(self, data_path, output_dir, ensemble_size,
                 message_hidden_dim, depth, dropout, activation, aggregation,
                 aggregation_norm, ffn_hidden_dim, ffn_num_layers, loss_reg,
                 model_frzn, warmup_epochs, target_columns,
                 *, init_lr=0.001, max_lr=0.0001, final_lr=0.001, epochs=5,
                 accelerator='cpu', devices='auto'):

        # Locations.
        self.data_path = Path(data_path)
        self.output_dir = Path(output_dir)

        # Model settings supplied by the caller.
        self.ensemble_size = ensemble_size
        self.message_hidden_dim = message_hidden_dim
        self.depth = depth
        self.dropout = dropout
        self.activation = activation
        self.aggregation = aggregation
        self.aggregation_norm = aggregation_norm
        self.ffn_hidden_dim = ffn_hidden_dim
        self.ffn_num_layers = ffn_num_layers
        self.loss_reg = loss_reg
        self.warmup_epochs = warmup_epochs
        self.model_frzn = model_frzn

        # Training schedule / hardware settings (overridable by keyword).
        # NOTE(review): the default max_lr (0.0001) is smaller than init_lr and
        # final_lr (0.001); the values look swapped — confirm before relying on them.
        self.init_lr = init_lr
        self.max_lr = max_lr
        self.final_lr = final_lr
        self.epochs = epochs
        self.accelerator = accelerator
        self.devices = devices

        # A bare string is one column name; any other sequence is copied as-is
        # so that a list of names does not end up nested inside another list.
        if isinstance(target_columns, str):
            self.target_columns = [target_columns]
        else:
            self.target_columns = list(target_columns)

        # Column layout of the input CSV.
        self.smiles_columns = ['smiles']
        self.reaction_columns = None
        self.splits_column = 'splits'
        self.ignore_columns = None
        self.weight_column = None

        # Fixed settings that this notebook does not vary.
        self.grad_clip = 0
        self.patience = None
        self.no_batch_norm = False
        self.undirected = False
        self.message_bias = False
        self.pytorch_seed = 0
        self.v_kl = 0.0
        self.eps = 1e-08
        self.task_type = 'regression'
        self.loss_function = 'mse'
        self.task_weights = [1]
        self.atom_messages = False
        self.metrics = ['mse']
        self.multiclass_num_classes = 3
        self.no_header_row = False

# Build the settings object for this run; arguments are spelled out by name so
# each value can be matched to the Args parameter it fills.
args = Args(
    data_path='/home/bhanu/Documents/GitHub/Thermal_Fluid_Prediction_GNN/Datasets/viscosity_data.csv',
    output_dir='/home/bhanu/Documents/Chemprop_Models/',
    ensemble_size=1,
    message_hidden_dim=300,
    depth=3,
    dropout=0.0,
    activation='RELU',
    aggregation='mean',
    aggregation_norm=100,
    ffn_hidden_dim=300,
    ffn_num_layers=2,
    loss_reg=0.0,
    model_frzn=None,
    warmup_epochs=2,
    target_columns='Viscosity',
)

In [4]:
from chemprop.cli.train import *
from pathlib import Path

# No input or output transforms are applied in this run.
input_transforms = [[None] for _ in range(3)]
output_transform = None

# Reshape each dataset's target array to a single column, i.e. shape (n, 1).
for _split_loader in (train_loader, val_loader, test_loader):
    _split_loader.dataset.Y = _split_loader.dataset.Y.reshape(-1, 1)

# Train with the settings in `args`; results are written under args.output_dir.
train_model(
    args, train_loader, val_loader, test_loader,
    args.output_dir, output_transform, input_transforms,
)

GPU available: True (cuda), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/home/bhanu/miniconda3/envs/chemprop_git/lib/python3.11/site-packages/lightning/pytorch/trainer/setup.py:187: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
/home/bhanu/miniconda3/envs/chemprop_git/lib/python3.11/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:653: Checkpoint directory /home/bhanu/Documents/Chemprop_Models/model_0/checkpoints exists and is not empty.
Loading `train_dataloader` to estimate number of stepping batches.
/home/bhanu/miniconda3/envs/chemprop_git/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=27` in the `DataLoader` to improve performance.
/home/bhanu/miniconda3/envs/c

                                                                           

/home/bhanu/miniconda3/envs/chemprop_git/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=27` in the `DataLoader` to improve performance.


Epoch 4: 100%|██████████| 35/35 [00:00<00:00, 54.55it/s, v_num=5, train_loss=1.150, val_loss=0.881]

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|██████████| 35/35 [00:00<00:00, 53.60it/s, v_num=5, train_loss=1.150, val_loss=0.881]

Restoring states from the checkpoint path at /home/bhanu/Documents/Chemprop_Models/model_0/checkpoints/best-epoch=1-val_loss=0.63.ckpt
Loaded model weights from the checkpoint at /home/bhanu/Documents/Chemprop_Models/model_0/checkpoints/best-epoch=1-val_loss=0.63.ckpt
/home/bhanu/miniconda3/envs/chemprop_git/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=27` in the `DataLoader` to improve performance.



Predicting DataLoader 0: 100%|██████████| 5/5 [00:00<00:00, 131.82it/s]
No Ea / R targets provided
Entire Test Set results: {'entire_test/mse': 0.6038371408451128}
