In [8]:
# Data processing and visualization libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches
# Tensorflow and keras library
import tensorflow as tf
from tensorflow import keras
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, TensorBoard
from keras.models import load_model
# ScikitLearn library
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
# Misc
import sys
import os
import random
from unicodedata import name
from datetime import datetime

# Print the reference environment first, then the versions actually installed
# in this kernel, so that any mismatch can be spotted by comparing the two lists.
print("To ensure repeatability, use the following setup:")
for lib_label, lib_version in (
    ("TensorFlow version:", "2.10.0"),
    ("Numpy version:", '1.23.4'),
    ("Pandas version:", '1.5.2'),
    ("SKLearn version:", "1.1.3"),
    ("Python version:", "3.8.6 (tags/v3.8.6:db45529, Sep 23 2020, 15:52:53) [MSC v.1927 64 bit (AMD64)]"),
):
    print(lib_label, lib_version)

print("==============================")

print("Your current library version:")
for lib_label, lib_version in (
    ("TensorFlow version:", tf.__version__),
    ("Numpy version:", np.__version__),
    ("Pandas version:", pd.__version__),
    ("SKLearn version:", sklearn.__version__),
    ("Python version:", sys.version),
):
    print(lib_label, lib_version)

To ensure repeatability, use the following setup:
TensorFlow version: 2.10.0
Numpy version: 1.23.4
Pandas version: 1.5.2
SKLearn version: 1.1.3
Python version: 3.8.6 (tags/v3.8.6:db45529, Sep 23 2020, 15:52:53) [MSC v.1927 64 bit (AMD64)]
Your current library version:
TensorFlow version: 2.10.0
Numpy version: 1.23.4
Pandas version: 1.5.2
SKLearn version: 1.1.3
Python version: 3.8.6 (tags/v3.8.6:db45529, Sep 23 2020, 15:52:53) [MSC v.1927 64 bit (AMD64)]


In [9]:
# Every dataset path is anchored at the directory the kernel is running in
current_directory = os.getcwd()

# Dataset files, given relative to the working directory
csv_filename1, csv_filename2, csv_filename3, csv_filename4 = (
    'datasetscsv/trainval_set.csv',    # training-validation data
    'datasetscsv/test_set.csv',        # unseen test set
    'datasetscsv/master_dataset.csv',  # combined train-validation-test set
    'datasetscsv/allMolecule.csv',     # molecule-only set
)

# Full paths for the four files above, in the same order
csv_path1, csv_path2, csv_path3, csv_path4 = (
    os.path.join(current_directory, relative_name)
    for relative_name in (csv_filename1, csv_filename2, csv_filename3, csv_filename4)
)

# Fixed seed, applied here to Python's built-in random module
seed = 21
random.seed(seed)

In [10]:
# Importing dataset
# All three CSV files share one column layout, so the names are declared once.
# `names` is passed without `header`, so pandas treats the first row of each
# file as data rather than as a header row.
column_names = ["Molecule", "Formula","SMILES","Type","Cyclicity",
                "Primary counts","Secondary counts","Tertiary counts",
                "Hydroxyl counts","Carboxyl counts","Oxyl counts",
                "M0(nhb)", "M0(oh)", "M0(nh)", "M0(op)",
                "M1(nhb_donor)", "M1(nhb_weak)", "M1(nhb_acceptor)",
                "M1(oh_donor)", "M1(oh_weak)","M1(oh_acceptor)",
                "M1(nh_donor)", "M1(nh_weak)", "M1(nh_acceptor)",
                "M1(op_donor)", "M1(op_weak)", "M1(op_acceptor)",
                "M2(nhb)", "M2(oh)", "M2(nh)","M2(op)",
                "MW","Partial Pressure","Temperature",
                "Amine Concentration", "Absorption Capacity", "References","Rounded Concentration", "Abbreviation"]
# Column the models predict
target_column = "Absorption Capacity"
# Columns kept for labelling/visualization but never fed to the models
non_feature_columns = ["Molecule", "Formula", "SMILES", "Type", "Cyclicity",
                       "References", "Rounded Concentration", "Abbreviation"]
# Feature whose scaling statistics are extracted at the end of this cell
pressure_column = "Partial Pressure"

trainval_df = pd.read_csv(csv_path1, names=column_names)
test_df     = pd.read_csv(csv_path2, names=column_names)
master_df   = pd.read_csv(csv_path3, names=column_names)

# PREPROCESSING: separate the target from the inputs
X_test_label = test_df.drop(columns=[target_column])
y_test = test_df[target_column]

X_trainval_label = trainval_df.drop(columns=[target_column])
y_trainval_label = trainval_df[target_column]

# DATA SPLITTING: 80 % training / 20 % validation, reproducible through `seed`
X_train, X_val, y_train, y_val = train_test_split(X_trainval_label, y_trainval_label, test_size=0.20, random_state=seed)

# Reserve the dataset for visualization (still contains the non-feature columns)
X_train_1 = X_train.copy()
X_val_1 = X_val.copy()
X_reserved = pd.concat([X_train_1, X_val_1, X_test_label], axis=0)

# DROPPING NON-FEATURE COLUMNS AND DATA PREPROCESSING
X_train = X_train.drop(columns=non_feature_columns)
X_test = X_test_label.drop(columns=non_feature_columns)
X_val = X_val.drop(columns=non_feature_columns)

feature_names = X_train.columns.tolist()
preprocessor = Pipeline(steps=[('step1', StandardScaler())])
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# The scaler is fitted on the training split only and then re-used unchanged
# for validation and test. Wrapping the transformed arrays in new DataFrames
# gives X_* a fresh 0..n-1 index, while y_* keep their original row labels.
X_train = pd.DataFrame(pipeline.fit_transform(X_train), columns=feature_names)
X_val = pd.DataFrame(pipeline.transform(X_val), columns=feature_names)
X_test = pd.DataFrame(pipeline.transform(X_test), columns=feature_names)

# Training-set mean and standard deviation of the partial-pressure feature.
# The position is looked up by name so it stays correct if the feature
# columns are ever added to, removed or reordered.
scaler = preprocessor.named_steps['step1']
pressure_index = feature_names.index(pressure_column)
mean_values = scaler.mean_[pressure_index]
std_values = scaler.scale_[pressure_index]

In [11]:
#MODEL IMPORT
# Saved models to compare. The order matters: the summary table built further
# down labels its rows by position, so keep both lists aligned.
model_path = [
    os.path.join(current_directory, relative_path)
    for relative_path in (
        'Model/SWISH_WnD.h5',
        'Model/SWISH_SQ.h5',
        'Model/ReLU_WnD.h5',
        'Model/ReLU_SQ.h5',
        'Model/sigmoid_WnD.h5',
        'Model/sigmoid_SQ.h5',
        'Model/tanh_WnD.h5',
        'Model/tanh_SQ.h5',
    )
]


def evaluate_split(model, X, y_true):
    """Score `model` on one data split.

    Parameters
    ----------
    model : object exposing ``predict(X)``
    X : scaled feature table passed to ``model.predict``
    y_true : target values matching the rows of ``X``

    Returns
    -------
    tuple
        ``(rmse, mae, r2_percent, y_pred)`` where ``r2_percent`` is the R2
        score multiplied by 100 and ``y_pred`` is the raw prediction array.
    """
    y_pred = model.predict(X)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2_percent = r2_score(y_true, y_pred)*100
    return rmse, mae, r2_percent, y_pred


# One [train, validation, test] row per model, in `model_path` order
RMSE = []
MAE  = []
R2 = []
for path in model_path:
    model = load_model(path)

    RMSE_train, MAE_train, R2_train, y_train_pred = evaluate_split(model, X_train, y_train)
    RMSE_val, MAE_val, R2_val, y_val_pred = evaluate_split(model, X_val, y_val)
    RMSE_test, MAE_test, R2_test, y_test_pred = evaluate_split(model, X_test, y_test)

    RMSE.append([RMSE_train, RMSE_val, RMSE_test])
    MAE.append([MAE_train, MAE_val, MAE_test])
    R2.append([R2_train, R2_val, R2_test])




In [12]:
# Turn each per-model metric collection into a table with one column per split
# (train / validation / test); the three names are rebound to the new tables.
RMSE, MAE, R2 = (
    pd.DataFrame(data=scores, columns=split_columns)
    for scores, split_columns in (
        (RMSE, ['RMSE Train', 'RMSE Validation', 'RMSE Test']),
        (MAE, ['MAE Train', 'MAE Validation', 'MAE Test']),
        (R2, ['R2 Train', 'R2 Validation', 'R2 Test']),
    )
)

In [13]:
# Row labels for the summary table, one per evaluated model
mod_name = pd.DataFrame({
    'Model': [
        'Swish: WnD',
        'Swish: SQ',
        'ReLU: WnD',
        'ReLU: SQ',
        'sigm: WnD',
        'sigm: SQ',
        'tanh: WnD',
        'tanh: SQ',
    ],
})

# Place labels and metric tables side by side, then derive two R2 summaries:
#   Avg  - mean of the validation and training R2
#   Cons - Avg minus the test R2
result = pd.concat([mod_name, RMSE, MAE, R2], axis=1).assign(
    Avg=lambda table: (table['R2 Validation'] + table['R2 Train'])/2,
    Cons=lambda table: table['Avg'] - table['R2 Test'],
)
result

Unnamed: 0,Model,RMSE Train,RMSE Validation,RMSE Test,MAE Train,MAE Validation,MAE Test,R2 Train,R2 Validation,R2 Test,Avg,Cons
0,Swish: WnD,0.073325,0.092935,0.102713,0.047792,0.05893,0.075215,96.459997,94.811873,86.014808,95.635935,9.621127
1,Swish: SQ,0.090979,0.112676,0.135044,0.062866,0.074739,0.099751,94.550188,92.373708,75.825281,93.461948,17.636667
2,ReLU: WnD,0.068157,0.084614,0.110713,0.045698,0.054855,0.085026,96.941432,95.699279,83.751516,96.320355,12.568839
3,ReLU: SQ,0.079593,0.100312,0.127111,0.053124,0.067046,0.103524,95.828901,93.95548,78.581829,94.892191,16.310361
4,sigm: WnD,0.101605,0.116907,0.14771,0.07031,0.079824,0.112807,93.202818,91.790184,71.077731,92.496501,21.41877
5,sigm: SQ,0.08909,0.111478,0.175648,0.058871,0.070928,0.13977,94.774111,92.534928,59.102349,93.65452,34.552171
6,tanh: WnD,0.070953,0.088539,0.136924,0.046433,0.055204,0.107139,96.6853,95.29109,75.147366,95.988195,20.840829
7,tanh: SQ,0.086778,0.104768,0.138598,0.05575,0.067229,0.100927,95.041832,93.406557,74.536046,94.224195,19.688149
