<a href="https://colab.research.google.com/github/adithyamauryakr/BANDNN_pytorch/blob/main/hpt-BANDNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
!pip install rdkit
!pip install optuna
!git clone https://github.com/adithyamauryakr/BANDNN_pytorch.git
!git clone https://github.com/isayev/ANI1_dataset.git

Cloning into 'BANDNN_pytorch'...
remote: Enumerating objects: 45, done.[K
remote: Counting objects: 100% (45/45), done.[K
remote: Compressing objects: 100% (43/43), done.[K
remote: Total 45 (delta 22), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (45/45), 261.11 KiB | 3.26 MiB/s, done.
Resolving deltas: 100% (22/22), done.
fatal: destination path 'ANI1_dataset' already exists and is not an empty directory.


In [2]:
import sys

# `!export ...` runs in a short-lived subshell and never changes the import path of
# the running kernel, so the search path is extended in-process instead.
# The `readers` directory makes `import lib.pyanitools` resolvable; `readers/lib` is
# the directory the original export pointed at and is kept for a bare `import pyanitools`.
for ani_reader_path in ('/content/ANI1_dataset/readers', '/content/ANI1_dataset/readers/lib'):
    if ani_reader_path not in sys.path:
        sys.path.append(ani_reader_path)

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import lib.pyanitools as pya
import os

# Device string used by later cells: 'cuda' when a CUDA GPU is visible to PyTorch, otherwise 'cpu'.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [3]:
# Mount Google Drive at /content/drive; later cells read their inputs from
# /content/drive/MyDrive/bandnn_datasets/.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import h5py

# Read every top-level group of the HDF5 archive into a plain dict (one dict per
# molecule), decoding byte-string entries to UTF-8 text.
# NOTE(review): molecules are appended in the order h5py yields the group keys;
# presumably that order must line up with the rows of the energy CSV — verify.
features_list = []
with h5py.File('/content/drive/MyDrive/bandnn_datasets/molecules.h5', 'r') as h5f:
    for mol_key in h5f.keys():
        group = h5f[mol_key]
        mol_data = {}
        for key in group:
            value = group[key][()]  # `[()]` reads the whole dataset into memory
            mol_data[key] = value.decode('utf-8') if isinstance(value, bytes) else value
        features_list.append(mol_data)

print(len(features_list))

In [4]:
# Load the regression targets from CSV as a NumPy array (`.values` of the DataFrame).
# NOTE(review): read_csv treats the first row as a header by default — confirm the
# file has one, otherwise the first value is silently consumed as a column name.
# NOTE(review): presumably row i corresponds to the i-th molecule in features_list — verify.
y = pd.read_csv('/content/drive/MyDrive/bandnn_datasets/energy_list.csv').values

In [None]:
# Hold out 20% of the (features, target) pairs for testing; the fixed random_state
# makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    features_list, y, test_size=0.2, random_state=42
)

In [5]:
class CustomDataset(Dataset):
    """Pairs a sequence of per-sample feature records with float32 targets."""

    def __init__(self, features, targets):
        """Keep `features` as given and convert `targets` to a float32 tensor."""
        target_tensor = torch.tensor(targets, dtype=torch.float32)
        self.features = features
        self.targets = target_tensor

    def __len__(self):
        """Number of samples, defined by the number of targets."""
        return len(self.targets)

    def __getitem__(self, index):
        """Return the `(features, target)` pair stored at `index`."""
        sample_features = self.features[index]
        sample_target = self.targets[index]
        return sample_features, sample_target

In [6]:
def collate_Fn(batch):
    """Collate `(features, target)` samples without stacking the features.

    Returns a list holding each sample's features unchanged and a float32 tensor
    built from the samples' targets.
    """
    columns = tuple(zip(*batch))  # transpose: samples -> (all features, all targets)
    features, targets = columns
    return list(features), torch.tensor(targets, dtype=torch.float32)


In [None]:

# Wrap both splits in datasets; `objective` reads train_dataset / test_dataset as globals.
train_dataset = CustomDataset(X_train, y_train)
test_dataset = CustomDataset(X_test, y_test)

# Default loaders with a fixed batch size of 32. collate_Fn keeps the per-molecule
# feature records as a list instead of trying to stack them. `objective` builds its
# own loaders with the batch size chosen by each trial.
train_loader = DataLoader(train_dataset, batch_size=32,collate_fn=collate_Fn, shuffle=True, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=32, collate_fn=collate_Fn, shuffle=False, pin_memory=True)



In [21]:
# Seed PyTorch's global random number generator for reproducibility.
torch.manual_seed(42)

<torch._C.Generator at 0x7a2ad2d3d190>

In [None]:
# Seed PyTorch's global random number generator again at the top of the model-definition cell.
torch.manual_seed(42)

class BANDNN(nn.Module):
    """Total-energy model built from four independent MLPs.

    One MLP each scores bond, angle, non-bond and dihedral feature rows. Every
    MLP maps a `(n_rows, input_dim)` tensor to `(n_rows, 1)`; the outputs are
    summed over rows and the four sums are added into one scalar.

    Args:
        bonds_input_dim: Width of one bond feature row.
        angles_input_dim: Width of one angle feature row.
        nonbonds_input_dim: Width of one non-bond feature row.
        dihedral_input_dim: Width of one dihedral feature row.
        num_hidden_layers: Number of `Linear + ReLU` hidden layers in each MLP.
        neurons_per_layer_bonds: Hidden width of the bonds MLP.
        neurons_per_layer_angles: Hidden width of the angles MLP.
        neurons_per_layer_nonbonds: Hidden width of the non-bonds MLP.
        neurons_per_layer_dihedrals: Hidden width of the dihedrals MLP.
    """

    def __init__(self, bonds_input_dim, angles_input_dim, nonbonds_input_dim, dihedral_input_dim,
                 num_hidden_layers, neurons_per_layer_bonds, neurons_per_layer_angles,
                 neurons_per_layer_nonbonds, neurons_per_layer_dihedrals):
        super().__init__()

        # Built in the same order as before (bonds, angles, non-bonds, dihedrals) so
        # that weight initialisation draws from the RNG in the same sequence.
        self.bonds_model = self._build_mlp(bonds_input_dim, neurons_per_layer_bonds, num_hidden_layers)
        self.angles_model = self._build_mlp(angles_input_dim, neurons_per_layer_angles, num_hidden_layers)
        self.nonbonds_model = self._build_mlp(nonbonds_input_dim, neurons_per_layer_nonbonds, num_hidden_layers)
        self.dihedrals_model = self._build_mlp(dihedral_input_dim, neurons_per_layer_dihedrals, num_hidden_layers)

    @staticmethod
    def _build_mlp(input_dim, hidden_dim, num_hidden_layers):
        """Return `num_hidden_layers` x (Linear, ReLU) followed by a Linear layer to 1 output."""
        layers = []
        in_features = input_dim
        for _ in range(num_hidden_layers):
            layers.append(nn.Linear(in_features, hidden_dim))
            layers.append(nn.ReLU())
            in_features = hidden_dim
        # Uses the running width so the output layer is also correct with zero hidden layers.
        layers.append(nn.Linear(in_features, 1))
        return nn.Sequential(*layers)

    def forward(self, bonds_input, angles_input, non_bonds_input, dihedrals_input):
        """Return the scalar (0-dim tensor) sum of all four MLPs' row outputs."""
        bonds_energy = self.bonds_model(bonds_input).sum()
        angles_energy = self.angles_model(angles_input).sum()
        nonbonds_energy = self.nonbonds_model(non_bonds_input).sum()
        dihedrals_energy = self.dihedrals_model(dihedrals_input).sum()

        total_energy = bonds_energy + angles_energy + nonbonds_energy + dihedrals_energy
        return total_energy

In [None]:
# objective function:
# Width of one feature row per interaction type, in the argument order of BANDNN.forward.
_FEATURE_DIMS = {'bonds': 17, 'angles': 27, 'nonbonds': 17, 'dihedrals': 38}


def _molecule_inputs(feature_dict, target_device):
    """Turn one molecule's feature record into the four float32 input tensors of BANDNN."""
    inputs = []
    for key, dim in _FEATURE_DIMS.items():
        rows = feature_dict[key]
        if len(rows) == 0:
            # torch.stack rejects an empty list; an empty (0, dim) block contributes 0 energy.
            block = torch.zeros((0, dim), dtype=torch.float32)
        else:
            block = torch.stack([torch.tensor(row, dtype=torch.float32) for row in rows])
        inputs.append(block.to(target_device))
    return inputs


def _predict_energies(model, feature_dicts, target_device):
    """Run the model molecule by molecule and stack the scalar outputs into a (batch,) tensor."""
    return torch.stack([model(*_molecule_inputs(fd, target_device)) for fd in feature_dicts])


def _evaluate(model, loader, criterion, target_device):
    """Return the per-sample mean of `criterion` over `loader`, without tracking gradients."""
    model.eval()
    loss_sum = 0.0
    seen = 0
    with torch.no_grad():
        for feature_dicts, targets in loader:
            targets = targets.reshape(-1).to(target_device)
            predictions = _predict_energies(model, feature_dicts, target_device)
            batch_count = len(feature_dicts)
            # criterion averages over the batch; re-weight so the final mean is per sample.
            loss_sum += criterion(predictions, targets).item() * batch_count
            seen += batch_count
    return loss_sum / max(seen, 1)


def objective(trial):
    """Optuna objective: train a BANDNN with sampled hyperparameters and score it.

    Reads the notebook globals `train_dataset`, `test_dataset`, `collate_Fn`,
    `BANDNN` and `device`.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The evaluation MSE on `test_dataset` after the last epoch (lower is better).
    """
    # Next hyperparameter values from the search space.
    num_hidden_layers = trial.suggest_int('num_hidden_layers', 1, 5)

    # Each sub-network needs its own parameter name: suggesting the same name
    # repeatedly within a trial returns the same value every time.
    neurons_per_layer_bonds = trial.suggest_int('neurons_per_layer_bonds', 8, 512, step=8)
    neurons_per_layer_angles = trial.suggest_int('neurons_per_layer_angles', 8, 512, step=8)
    neurons_per_layer_nonbonds = trial.suggest_int('neurons_per_layer_nonbonds', 8, 512, step=8)
    neurons_per_layer_dihedrals = trial.suggest_int('neurons_per_layer_dihedrals', 8, 512, step=8)

    epochs = trial.suggest_int('epochs', 10, 50, step=10)
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True)
    batch_size = trial.suggest_categorical('batch_size', [16, 32, 64, 128])
    optimizer_name = trial.suggest_categorical('optimizer', ['Adam', 'SGD', 'RMSprop'])
    weight_decay = trial.suggest_float('weight_decay', 1e-5, 1e-3, log=True)

    # collate_Fn is required: the per-molecule feature records cannot be stacked by
    # the default collate function.
    train_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_Fn,
                              shuffle=True, pin_memory=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collate_Fn,
                             shuffle=False, pin_memory=True)

    # Model init: pass the sampled architecture along with the fixed input widths.
    model = BANDNN(
        _FEATURE_DIMS['bonds'], _FEATURE_DIMS['angles'],
        _FEATURE_DIMS['nonbonds'], _FEATURE_DIMS['dihedrals'],
        num_hidden_layers, neurons_per_layer_bonds, neurons_per_layer_angles,
        neurons_per_layer_nonbonds, neurons_per_layer_dihedrals,
    )
    model.to(device)

    criterion = nn.MSELoss()

    # Optimizer selection.
    optimizer_classes = {'Adam': optim.Adam, 'SGD': optim.SGD, 'RMSprop': optim.RMSprop}
    optimizer = optimizer_classes[optimizer_name](
        model.parameters(), lr=learning_rate, weight_decay=weight_decay
    )

    eval_loss = float('inf')
    for epoch in range(epochs):
        print(f'Epoch {epoch + 1}')
        model.train()  # evaluation below switches to eval mode; switch back every epoch
        total_epoch_loss = 0.0
        num_samples = 0

        for feature_dicts, targets in train_loader:
            targets = targets.reshape(-1).to(device)

            # One optimizer step per mini-batch, so the sampled batch_size actually
            # influences training.
            optimizer.zero_grad()
            predictions = _predict_energies(model, feature_dicts, device)
            loss = criterion(predictions, targets)
            loss.backward()
            optimizer.step()

            batch_count = len(feature_dicts)
            total_epoch_loss += loss.item() * batch_count
            num_samples += batch_count

        avg_loss = total_epoch_loss / max(num_samples, 1)
        print(f'Average epoch Loss: {avg_loss:.4f}')

        # Evaluate after every epoch; the value of the final epoch is the trial score.
        eval_loss = _evaluate(model, test_loader, criterion, device)
        print(f"Evaluation MSE Loss: {eval_loss:.4f}")

    return eval_loss

In [None]:
import optuna

# `objective` returns the evaluation MSE, so the study has to minimise it;
# maximising would select the worst hyperparameters.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=10)

In [None]:
# Hyperparameter values of the best trial in the study.
study.best_params

In [None]:
# Objective value reached by the best trial in the study.
study.best_value