In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
import optuna

from rdkit import Chem
from rdkit.Chem import AllChem
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
from tqdm import tqdm
from rdkit.Chem import rdMolDescriptors

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import math
from sklearn.metrics import r2_score

import os 

# Preferred GPU index for this notebook. Fall back to the first GPU when the
# machine exposes fewer devices, and to the CPU when CUDA is unavailable, so
# later `.to(device)` calls do not fail with an invalid device ordinal.
_PREFERRED_GPU = 3
if torch.cuda.is_available():
    _gpu_index = _PREFERRED_GPU if torch.cuda.device_count() > _PREFERRED_GPU else 0
    device = torch.device("cuda:{}".format(_gpu_index))
else:
    device = torch.device("cpu")

# Define the DNN model with dropout and layer normalization
class DNN(nn.Module):
    """Fully connected regressor with a single scalar output.

    The network is `n_layers` repetitions of
    Linear -> ReLU -> LayerNorm -> Dropout (each `n_neurons` wide), followed
    by a final Linear layer mapping to one output value.

    Args:
        input_size: Number of input features.
        n_layers: Number of hidden blocks (0 yields a plain linear model).
        n_neurons: Width of every hidden block.
        dropout_rate: Drop probability used by every Dropout layer.
    """

    def __init__(self, input_size, n_layers, n_neurons, dropout_rate):
        super().__init__()
        # widths[k] is the input width of block k; the last entry feeds the head.
        widths = [input_size] + [n_neurons] * n_layers
        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack += [
                nn.Linear(fan_in, fan_out),
                nn.ReLU(),
                nn.LayerNorm(fan_out),
                nn.Dropout(p=dropout_rate),
            ]
        stack.append(nn.Linear(widths[-1], 1))
        self.model = nn.Sequential(*stack)

    def forward(self, x):
        """Apply the layer stack to `x` and return the result of the final Linear layer."""
        return self.model(x)

def dataset_func(X, y, device, rand_state, test_size=0.2, val_size=0.12):
    """Split features/targets into train, validation and test float tensors.

    The data is first split into (train + validation) and test, then the
    first part is split again into train and validation. Both splits use the
    same `rand_state`, and the shape of each feature split is printed.

    Args:
        X: Array-like feature matrix (one row per sample).
        y: Array-like targets; each split is reshaped to a single column.
        device: Torch device on which the returned tensors are created.
        rand_state: `random_state` forwarded to both `train_test_split` calls.
        test_size: Fraction of all samples held out as the test set.
        val_size: Fraction of the remaining samples held out for validation.

    Returns:
        Tuple `(X_train, y_train, X_val, y_val, X_test, y_test)` of float32
        tensors on `device`; every `y_*` tensor has shape `(n, 1)`.
    """
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        X, y, test_size=test_size, random_state=rand_state)
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val, test_size=val_size, random_state=rand_state)

    print("Train Dataset: {}".format(X_train.shape))
    print("Val Dataset: {}".format(X_val.shape))
    print("Test Dataset: {}".format(X_test.shape))

    def as_float_tensor(values):
        # Build float32 directly on the target device (no float64 intermediate
        # copy); np.asarray lets pandas objects be passed as well as arrays.
        return torch.tensor(np.asarray(values), dtype=torch.float32, device=device)

    def as_column_tensor(values):
        return as_float_tensor(np.asarray(values).reshape(-1, 1))

    return (
        as_float_tensor(X_train), as_column_tensor(y_train),
        as_float_tensor(X_val), as_column_tensor(y_val),
        as_float_tensor(X_test), as_column_tensor(y_test),
    )

def objective(X, y, rand_state, num_epoch):
    """Build an Optuna objective that tunes the DNN on one data split.

    The train/validation split depends only on `rand_state`, so it is created
    once here (on the module-level `device`) instead of once per trial.

    Args:
        X: Feature matrix passed to `dataset_func`.
        y: Targets passed to `dataset_func`.
        rand_state: Seed for the data split; when it is an integer it also
            seeds torch before each model is built, so a trial with given
            hyperparameters is repeatable.
        num_epoch: Number of full-batch training epochs per trial.

    Returns:
        A function `objective_inner(trial)` returning the validation MSE of a
        model trained with the hyperparameters suggested by `trial`.
    """
    X_train, y_train, X_val, y_val, _, _ = dataset_func(X, y, device, rand_state)
    input_size = X_train.size(1)
    criterion = nn.MSELoss()

    def objective_inner(trial):
        n_layers = trial.suggest_int('n_layers', 1, 5)
        n_neurons = trial.suggest_int('n_neurons', 1, 100)
        # suggest_float replaces the deprecated suggest_uniform/suggest_loguniform.
        # The values are used unrounded so that the model trained here matches
        # exactly what Optuna records in `study.best_params`.
        dropout_rate = trial.suggest_float('dropout_rate', 0.0, 0.9)
        learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True)

        # Fix weight initialisation and dropout masks for this split.
        if isinstance(rand_state, (int, np.integer)):
            torch.manual_seed(int(rand_state))

        model = DNN(input_size, n_layers, n_neurons, dropout_rate).to(device)
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)

        model.train()
        for _ in range(num_epoch):
            optimizer.zero_grad()
            train_loss = criterion(model(X_train), y_train)
            train_loss.backward()
            optimizer.step()

        model.eval()
        with torch.no_grad():
            val_loss = criterion(model(X_val), y_val)

        return val_loss.item()

    return objective_inner

def _rmse_and_r2(model, criterion, inputs, targets):
    """Return (RMSE, R2) of `model` on one data split, evaluated without gradients."""
    model.eval()
    with torch.no_grad():
        predictions = model(inputs)
        # `criterion` is the MSE loss, so its square root is the RMSE.
        rmse = math.sqrt(criterion(predictions, targets).item())
        r2 = r2_score(targets.cpu().numpy(), predictions.cpu().numpy())
    return rmse, r2


def get_all_metrics(best_params, rand_state, num_epoch):
    """Retrain the DNN with the given hyperparameters and report RMSE / R2.

    NOTE: this function reads the module-level `X`, `y` and `device`; they
    must be defined before it is called.

    Args:
        best_params: Dict with keys 'n_layers', 'n_neurons', 'dropout_rate'
            and 'learning_rate'.
        rand_state: Seed for the data split; when it is an integer it also
            seeds torch before the model is built, so the reported metrics
            are repeatable.
        num_epoch: Number of full-batch training epochs.

    Returns:
        Tuple `(train_rmse, val_rmse, test_rmse, train_r2, val_r2, test_r2)`.

    Side effects:
        Prints the split shapes (via `dataset_func`) and shows a plot of the
        training and validation loss per epoch.
    """
    X_train, y_train, X_val, y_val, X_test, y_test = dataset_func(X, y, device, rand_state)

    # Fix weight initialisation and dropout masks for this split.
    if isinstance(rand_state, (int, np.integer)):
        torch.manual_seed(int(rand_state))

    model = DNN(
        X_train.size(1),
        best_params['n_layers'],
        best_params['n_neurons'],
        best_params['dropout_rate'],
    ).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=best_params['learning_rate'])

    train_loss_history = []
    val_loss_history = []

    for _ in range(num_epoch):
        model.train()
        optimizer.zero_grad()
        loss = criterion(model(X_train), y_train)
        loss.backward()
        optimizer.step()
        # Training loss is the value computed before this epoch's update,
        # with dropout active.
        train_loss_history.append(loss.item())

        model.eval()
        with torch.no_grad():
            val_loss_history.append(criterion(model(X_val), y_val).item())

    epochs = range(1, num_epoch + 1)
    fig, ax = plt.subplots()
    ax.plot(epochs, train_loss_history, label='Train Loss')
    ax.plot(epochs, val_loss_history, label='Validation Loss')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Loss')
    ax.set_title('Loss curves (split {})'.format(rand_state))
    ax.legend()
    plt.show()
    # Release the figure; this function is called once per split in a loop.
    plt.close(fig)

    train_rmse, train_r2 = _rmse_and_r2(model, criterion, X_train, y_train)
    val_rmse, val_r2 = _rmse_and_r2(model, criterion, X_val, y_val)
    test_rmse, test_r2 = _rmse_and_r2(model, criterion, X_test, y_test)

    return train_rmse, val_rmse, test_rmse, train_r2, val_r2, test_r2



In [None]:
df = pd.read_csv('ATT_ind.csv', encoding='ISO-8859-1')

# One-hot encode the categorical columns (first level dropped as the baseline).
df_ohe = pd.get_dummies(df, columns=['alkene', 'ligand', 'substrate'], drop_first=True)

# Cast boolean columns to 0/1 integers explicitly rather than relying on
# `replace({True: 1, False: 0})`, which depends on replace() downcasting.
bool_columns = df_ohe.select_dtypes(include='bool').columns
df_ohe = df_ohe.astype({name: int for name in bool_columns})

# Features are every column after the first; the target is the 'ee' column.
feature_frame = df_ohe.iloc[:, 1:]
assert 'ee' not in feature_frame.columns, (
    "target column 'ee' is among the features; select the feature columns explicitly"
)

X = np.array(feature_frame)
y = np.array(df_ohe['ee']).reshape(-1, 1)

In [None]:
metrics = []          # one (train/val/test RMSE, train/val/test R2) tuple per split
best_params_app = []  # best hyperparameters found for each split

# One hyperparameter search per train/validation/test split seed.
rand_state_list = list(range(30))
num_epoch = 300  # full-batch epochs per trial and for the final refit
n_trials = 50    # Optuna trials per split

for rand_state in rand_state_list:
    # Seed the sampler so each split's search is repeatable.
    study = optuna.create_study(
        direction='minimize',
        sampler=optuna.samplers.TPESampler(seed=rand_state),
    )
    study.optimize(objective(X, y, rand_state, num_epoch), n_trials=n_trials)

    best_params = study.best_params
    print("Best Hyperparameters:", best_params)
    # The objective returns the validation MSE (lower is better), not an accuracy.
    print("Best Validation MSE:", study.best_value)

    split_metrics = get_all_metrics(best_params, rand_state, num_epoch)
    best_params_app.append(best_params)
    metrics.append(split_metrics)

In [None]:
# Each entry of `metrics` is
# (train_rmse, val_rmse, test_rmse, train_r2, val_r2, test_r2) for one split.
metric_columns = ['Train RMSE', 'Validation RMSE', 'Test RMSE',
                  'Train_R2', 'Valid_R2', 'Test_R2']

# Fail with a clear message if the search loop did not finish for every split.
assert len(metrics) == len(rand_state_list) == len(best_params_app), (
    "metrics, rand_state_list and best_params_app must have one entry per split"
)

# Build the summary table: split seed, the six metrics, then the hyperparameters.
dff_result = pd.DataFrame(metrics, columns=metric_columns)
dff_result.insert(0, 'Split', rand_state_list)
dff_result['Parameters'] = best_params_app
dff_result

In [None]:
# Write the summary table without the row index (`index` is documented as a bool).
dff_result.to_csv('performance_file.csv', index=False)