In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
import optuna

from rdkit import Chem
from rdkit.Chem import AllChem
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
from tqdm import tqdm
from rdkit.Chem import rdMolDescriptors

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import math
from sklearn.metrics import r2_score

import os 
import sys
from rdkit import Chem
from rdkit.Chem import AllChem, rdMolDescriptors, MACCSkeys
from rdkit.Avalon import pyAvalonTools


In [None]:
import dgl
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from dgllife.model import model_zoo
from dgllife.utils import smiles_to_bigraph
from dgllife.utils import EarlyStopping, Meter
from dgllife.utils import AttentiveFPAtomFeaturizer
from dgllife.utils import AttentiveFPBondFeaturizer
from dgllife.data import MoleculeCSVDataset

In [None]:
# Prefer the second GPU (index 1) when the host has one; otherwise fall back to the
# first GPU, and finally to the CPU, so the notebook also runs on single-GPU and
# CPU-only machines instead of failing on an invalid device ordinal.
if torch.cuda.device_count() > 1:
    device = torch.device("cuda:1")
elif torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
#device = torch.device("cpu")  # uncomment to force CPU execution

In [None]:
def collate_molgraphs(data):
    """Collate a list of dataset samples into one mini-batch.

    Args:
        data: list of tuples, each either ``(smiles, graph, labels)`` or
            ``(smiles, graph, labels, mask)``. The layout is decided from the
            first sample.

    Returns:
        Tuple ``(smiles, bg, labels, masks)``: the list of SMILES entries, the
        graphs merged with ``dgl.batch``, the labels stacked along a new leading
        dimension, and the masks stacked the same way (all ones, shaped like the
        stacked labels, when the samples carry no mask).

    Raises:
        AssertionError: if the first sample has neither 3 nor 4 fields.
    """
    n_fields = len(data[0])
    assert n_fields in [3, 4], \
        'Expect the tuple to be of length 3 or 4, got {:d}'.format(n_fields)

    # Transpose the list of samples into one list per field.
    columns = [list(column) for column in zip(*data)]
    if n_fields == 4:
        smiles, graphs, labels, masks = columns
    else:
        smiles, graphs, labels = columns
        masks = None

    bg = dgl.batch(graphs)
    # Zero-fill node/edge features that are missing on the batched graph.
    bg.set_n_initializer(dgl.init.zero_initializer)
    bg.set_e_initializer(dgl.init.zero_initializer)

    labels = torch.stack(labels, dim=0)
    masks = torch.ones(labels.shape) if masks is None else torch.stack(masks, dim=0)
    return smiles, bg, labels, masks


In [None]:
def compute_loss(model, prediction, labels, masks, loss_criterion):
    """Return the masked loss averaged over every label entry.

    Args:
        model: unused; kept so existing call sites keep working.
        prediction: model output tensor.
        labels: target tensor passed to ``loss_criterion`` together with ``prediction``.
        masks: tensor whose non-zero entries mark the label entries that count.
        loss_criterion: callable returning an element-wise loss
            (the notebook passes ``nn.MSELoss(reduction='none')``).

    Returns:
        Scalar tensor: element-wise loss with masked-out entries zeroed, then
        averaged over all entries (masked ones included in the denominator).
    """
    elementwise = loss_criterion(prediction, labels)
    keep = (masks != 0).float()
    return (elementwise * keep).mean()

In [None]:
def objective(train_loader, valid_loader, test_loader, num_epoch):
    """Build an Optuna objective that trains AttentiveFP and scores it on validation data.

    Args:
        train_loader: iterable of ``(smiles, batched_graph, labels, masks)`` batches
            used for training.
        valid_loader: iterable of batches in the same format, used to compute the
            value returned to Optuna.
        test_loader: accepted for signature compatibility; never read during tuning.
        num_epoch: number of full passes over ``train_loader`` per trial.

    Returns:
        A callable ``objective_inner(trial)`` that returns the mean (over validation
        batches) masked MSE measured once, after the final training epoch.
    """
    def objective_inner(trial):
        """Train one model with hyperparameters sampled from ``trial``."""
        # GPU 1 stays the preferred device; fall back to GPU 0 / CPU so the trial
        # does not crash on hosts with fewer GPUs.
        if torch.cuda.device_count() > 1:
            device = torch.device("cuda:1")
        elif torch.cuda.is_available():
            device = torch.device("cuda:0")
        else:
            device = torch.device("cpu")

        num_layers = trial.suggest_int('num_layers', 1, 5)
        graph_feat_size = trial.suggest_int('graph_feat_size', 100, 500)
        # suggest_float replaces the deprecated suggest_uniform / suggest_loguniform.
        # The sampled values are used unrounded so the model trained here is exactly
        # the configuration Optuna records (and study.best_params later returns).
        dropout_rate = trial.suggest_float('dropout_rate', 0.0, 0.5)
        learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True)
        num_timesteps = trial.suggest_int('num_timesteps', 1, 3)

        atom_featurizer = AttentiveFPAtomFeaturizer(atom_data_field='hv')
        bond_featurizer = AttentiveFPBondFeaturizer(bond_data_field='he')
        node_feat_size = atom_featurizer.feat_size('hv')
        edge_feat_size = bond_featurizer.feat_size('he')

        model = model_zoo.AttentiveFPPredictor(node_feat_size=node_feat_size,
                                               edge_feat_size=edge_feat_size,
                                               num_layers=num_layers,
                                               num_timesteps=num_timesteps,
                                               graph_feat_size=graph_feat_size,
                                               n_tasks=1,
                                               dropout=dropout_rate)
        model = model.to(device)

        # Element-wise loss so compute_loss can zero out masked label entries.
        loss_criterion = nn.MSELoss(reduction='none')
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

        def forward_batch(batch_data):
            """Move one batch to the device and return (prediction, labels, masks)."""
            _, bg, labels, masks = batch_data
            bg = bg.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            node_feats = bg.ndata.pop('hv').to(device)
            edge_feats = bg.edata.pop('he').to(device)
            return model(bg, node_feats, edge_feats), labels, masks

        # Train
        for _ in range(num_epoch):
            model.train()
            for batch_data in train_loader:
                prediction, labels, masks = forward_batch(batch_data)
                loss = compute_loss(model, prediction, labels, masks, loss_criterion)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        # Validate once, after the last epoch; this is the value Optuna optimizes.
        model.eval()
        val_losses = []
        with torch.no_grad():
            for batch_data in valid_loader:
                valid_prediction, labels, masks = forward_batch(batch_data)
                val_loss = compute_loss(model, valid_prediction, labels, masks, loss_criterion)
                val_losses.append(val_loss.item())
        return float(np.mean(val_losses))
    return objective_inner

In [None]:
def get_all_metrics(best_params, num_epoch, train_loader, valid_loader, test_loader, cv_name):
    """Retrain AttentiveFP with the given hyperparameters and report RMSE / R2 per split.

    The model is trained for ``num_epoch`` epochs on ``train_loader``; the mean
    training and validation losses of every epoch are plotted. The trained model
    is then evaluated on the train, validation and test loaders.

    Args:
        best_params: mapping with the keys 'num_layers', 'graph_feat_size',
            'dropout_rate', 'learning_rate' and 'num_timesteps'.
        num_epoch: number of training epochs.
        train_loader: iterable of ``(smiles, batched_graph, labels, masks)`` batches.
        valid_loader: iterable of batches in the same format.
        test_loader: iterable of batches in the same format.
        cv_name: label of the current split; used only in the plot title.

    Returns:
        Tuple ``(train_rmse, valid_rmse, test_rmse, train_r2, valid_r2, test_r2)``.
    """
    # GPU 6 stays the preferred device; fall back to GPU 0 when the host has fewer
    # GPUs and to the CPU when CUDA is unavailable, instead of crashing.
    if torch.cuda.device_count() > 6:
        device = torch.device("cuda:6")
    elif torch.cuda.is_available():
        device = torch.device("cuda:0")
    else:
        device = torch.device("cpu")

    atom_featurizer = AttentiveFPAtomFeaturizer(atom_data_field='hv')
    bond_featurizer = AttentiveFPBondFeaturizer(bond_data_field='he')
    node_feat_size = atom_featurizer.feat_size('hv')
    edge_feat_size = bond_featurizer.feat_size('he')

    model = model_zoo.AttentiveFPPredictor(node_feat_size=node_feat_size,
                                           edge_feat_size=edge_feat_size,
                                           num_layers=best_params['num_layers'],
                                           num_timesteps=best_params['num_timesteps'],
                                           graph_feat_size=best_params['graph_feat_size'],
                                           n_tasks=1,
                                           dropout=best_params['dropout_rate'])
    model = model.to(device)

    # Element-wise loss so compute_loss can zero out masked label entries.
    loss_criterion = nn.MSELoss(reduction='none')
    optimizer = torch.optim.Adam(model.parameters(), lr=best_params['learning_rate'])

    def forward_batch(batch_data):
        """Move one batch to the device and return (prediction, labels, masks)."""
        _, bg, labels, masks = batch_data
        bg = bg.to(device)
        labels = labels.to(device)
        masks = masks.to(device)
        node_feats = bg.ndata.pop('hv').to(device)
        edge_feats = bg.edata.pop('he').to(device)
        return model(bg, node_feats, edge_feats), labels, masks

    def evaluate(loader):
        """Return (rmse, r2) of the current model over every batch of ``loader``."""
        model.eval()
        predictions, targets, keep_flags = [], [], []
        with torch.no_grad():
            for batch_data in loader:
                prediction, labels, masks = forward_batch(batch_data)
                predictions.append(prediction.reshape(-1))
                targets.append(labels.reshape(-1))
                keep_flags.append((masks != 0).reshape(-1))
        # Score only entries whose mask is non-zero, consistent with the masked
        # training loss; with all-ones masks this covers every sample.
        keep = torch.cat(keep_flags)
        y_pred = torch.cat(predictions)[keep]
        y_true = torch.cat(targets)[keep]
        rmse = math.sqrt(torch.mean((y_pred - y_true) ** 2).item())
        r2 = r2_score(y_true.cpu().numpy(), y_pred.cpu().numpy())
        return rmse, r2

    train_loss_history = []
    val_loss_history = []

    for _ in range(num_epoch):
        # Train
        model.train()
        epoch_losses = []
        for batch_data in train_loader:
            prediction, labels, masks = forward_batch(batch_data)
            loss = compute_loss(model, prediction, labels, masks, loss_criterion)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            epoch_losses.append(loss.item())
        train_loss_history.append(np.mean(epoch_losses))

        # Track the validation loss after every epoch for the learning curve.
        model.eval()
        epoch_val_losses = []
        with torch.no_grad():
            for batch_data in valid_loader:
                valid_prediction, labels, masks = forward_batch(batch_data)
                val_loss = compute_loss(model, valid_prediction, labels, masks, loss_criterion)
                epoch_val_losses.append(val_loss.item())
        val_loss_history.append(np.mean(epoch_val_losses))

    epochs = range(1, num_epoch + 1)
    fig, ax = plt.subplots()
    ax.plot(epochs, train_loss_history, label='Train Loss')
    ax.plot(epochs, val_loss_history, label='Validation Loss')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Loss')
    ax.set_title('Loss curves ({})'.format(cv_name))
    ax.legend()
    plt.show()
    # Release the figure; this function is called once per CV split.
    plt.close(fig)

    # Optional: persist the trained weights for this split.
    #model_path = 'model_attentive_fp_' + cv_name + '.pth'
    #torch.save(model.state_dict(), model_path)

    train_rmse, train_r2 = evaluate(train_loader)
    valid_rmse, valid_r2 = evaluate(valid_loader)
    test_rmse, test_r2 = evaluate(test_loader)

    return train_rmse, valid_rmse, test_rmse, train_r2, valid_r2, test_r2


In [None]:
# Module-level featurizers used when building datasets: atom features are written
# to the 'hv' node field and bond features to the 'he' edge field.
atom_featurizer, bond_featurizer = (
    AttentiveFPAtomFeaturizer(atom_data_field='hv'),
    AttentiveFPBondFeaturizer(bond_data_field='he'),
)
# Feature sizes reported by the featurizers for those two fields.
n_feats, e_feats = atom_featurizer.feat_size('hv'), bond_featurizer.feat_size('he')

In [None]:
def dataset_func(data_file):
    """Split one CV sheet into train / validation / test frames without shuffling.

    The 'smiles' and 'ee' columns are split in two order-preserving steps
    (``shuffle=False``): 20% is held out as the test set, then 12.5% of the
    remainder becomes the validation set. The sizes of the three parts are printed.

    Args:
        data_file: table with at least the columns 'smiles' and 'ee'.

    Returns:
        Tuple ``(df_train, df_val, df_test)`` of DataFrames, each holding the
        'smiles' and 'ee' columns of its part.
    """
    smiles_all = data_file['smiles']
    target_all = data_file['ee']

    # Step 1: hold out the test part.
    smiles_rest, smiles_test, target_rest, target_test = train_test_split(
        smiles_all, target_all, test_size=0.20, shuffle=False)
    # Step 2: carve the validation part out of what is left.
    smiles_train, smiles_val, target_train, target_val = train_test_split(
        smiles_rest, target_rest, test_size=0.125, shuffle=False)

    print("Train Dataset: {}".format(smiles_train.shape))
    print("Val Dataset: {}".format(smiles_val.shape))
    print("Test Dataset: {}".format(smiles_test.shape))

    # Re-join inputs and targets column-wise for each part.
    df_train = pd.concat([smiles_train, target_train], axis=1)
    df_val = pd.concat([smiles_val, target_val], axis=1)
    df_test = pd.concat([smiles_test, target_test], axis=1)
    return df_train, df_val, df_test


In [None]:
def load_data(data,name):
    """Build a ``MoleculeCSVDataset`` for the single regression task 'ee'.

    Args:
        data: table passed as the first argument of ``MoleculeCSVDataset``; its
            'smiles' column is used to build the graphs.
        name: prefix of the cache file, which is written to ``<name>_dataset.bin``.

    Returns:
        The constructed dataset, featurized with the module-level
        ``atom_featurizer`` / ``bond_featurizer`` and with masks initialised.
    """
    dataset_options = dict(
        smiles_to_graph=smiles_to_bigraph,
        node_featurizer=atom_featurizer,
        edge_featurizer=bond_featurizer,
        smiles_column='smiles',
        task_names=['ee'],
        init_mask=True,
        n_jobs=8,
        cache_file_path=f"{name}_dataset.bin",
    )
    return MoleculeCSVDataset(data, **dataset_options)

In [None]:
def data_loader_fn(dc_listings1, dc_listings2, dc_listings3, batch_size=32, shuffle_train=False):
    """Create the train / validation / test data loaders for one CV split.

    Args:
        dc_listings1: training table, passed to ``load_data`` with the name 'train'.
        dc_listings2: validation table, passed to ``load_data`` with the name 'valid'.
        dc_listings3: test table, passed to ``load_data`` with the name 'test'.
        batch_size: number of molecules per batch for all three loaders.
            Defaults to 32, the value previously hard-coded.
        shuffle_train: whether to reshuffle the training set every epoch.
            Defaults to False, which keeps the previous fixed batch order.
            The validation and test loaders are never shuffled.

    Returns:
        Tuple ``(train_loader, valid_loader, test_loader)``; every loader collates
        its samples with ``collate_molgraphs``.
    """
    train_datasets = load_data(dc_listings1, 'train')
    valid_datasets = load_data(dc_listings2, 'valid')
    test_datasets = load_data(dc_listings3, 'test')

    train_loader = DataLoader(train_datasets, batch_size=batch_size, shuffle=shuffle_train,
                              collate_fn=collate_molgraphs)
    valid_loader = DataLoader(valid_datasets, batch_size=batch_size, shuffle=False,
                              collate_fn=collate_molgraphs)
    test_loader = DataLoader(test_datasets, batch_size=batch_size, shuffle=False,
                             collate_fn=collate_molgraphs)
    return train_loader, valid_loader, test_loader

In [None]:
# Excel workbook holding one sheet per cross-validation split.
data_file_path = 'ART_30_splits.xlsx'
# Sheet names of the 30 splits: 'fullcv_00' ... 'fullcv_29'.
CV_list_name = ['fullcv_{:02d}'.format(split_idx) for split_idx in range(30)]

In [None]:
# Per-split results, filled in the order of CV_list_name:
# metrics[k] is the tuple returned by get_all_metrics for split k and
# best_params_app[k] the hyperparameters Optuna selected for it.
metrics = []
best_params_app = []

# Training epochs per Optuna trial (and for the final refit) and trials per split.
num_epoch = 300
n_trials = 30

for cv_name in CV_list_name:
    # Each split gets its own study; the objective returns the validation loss,
    # hence direction='minimize'.
    study = optuna.create_study(direction='minimize')
    data_file = pd.read_excel(data_file_path, sheet_name=cv_name)
    dc_listings1, dc_listings2, dc_listings3 = dataset_func(data_file)
    train_loader, valid_loader, test_loader = data_loader_fn(dc_listings1, dc_listings2, dc_listings3)

    study.optimize(objective(train_loader, valid_loader, test_loader, num_epoch), n_trials=n_trials)

    best_params = study.best_params
    # The study value is the validation MSE loss, not an accuracy.
    best_val_loss = study.best_value
    print("Best Hyperparameters:", best_params)
    print("Best Validation Loss (MSE):", best_val_loss)

    # Retrain with the selected hyperparameters and collect RMSE / R2 for all parts.
    split_metrics = get_all_metrics(best_params, num_epoch, train_loader, valid_loader, test_loader, cv_name)
    best_params_app.append(best_params)
    metrics.append(split_metrics)

In [None]:
# Unpack the per-split tuples stored in `metrics`, laid out as
# (train_rmse, valid_rmse, test_rmse, train_r2, valid_r2, test_r2).
# Despite their names, the *_accuracy lists hold RMSE values ...
train_accuracy = [item[0] for item in metrics]
valid_accuracy = [item[1] for item in metrics]
test_accuracy = [item[2] for item in metrics]

# ... and the *_top_k lists hold R2 values.
train_top_k = [item[3] for item in metrics]
valid_top_k = [item[4] for item in metrics]
test_top_k = [item[5] for item in metrics]

# Only tabulate the splits that actually finished, so a partially completed run
# can still be summarised instead of failing on mismatched column lengths.
n_done = len(metrics)

# One column per metric, one row per completed split.
data_rs = {'Split': CV_list_name[:n_done],
           'Train RMSE': train_accuracy,
           'Validation RMSE': valid_accuracy,
           'Test RMSE': test_accuracy,
           'Train_R2': train_top_k,
           'Valid_R2': valid_top_k,
           'Test_R2': test_top_k,
           'Parameters': best_params_app[:n_done]}
# Create a pandas DataFrame
dff_result = pd.DataFrame(data_rs)
dff_result

In [None]:
# Persist the summary table to the working directory, then display it again.
dff_result.to_csv(path_or_buf='Performance_file.csv')
dff_result