In [None]:
#Installing the required library
!pip install tensorflow==2.10.0
!pip install numpy==1.23.4
!pip install pandas==1.5.2
!pip install scikit-learn==1.1.3

In [None]:
# Data processing and visualization libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches
# Tensorflow and keras library
import tensorflow as tf
from tensorflow import keras
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, TensorBoard
# ScikitLearn library
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
# Misc
import sys
import os
import random
from unicodedata import name
from datetime import datetime

# Report the reference environment next to the versions loaded in this kernel
# so that any mismatch is visible before training.
_reference_setup = (
    ("TensorFlow version:", "2.10.0"),
    ("Numpy version:", '1.23.4'),
    ("Pandas version:", '1.5.2'),
    ("SKLearn version:", "1.1.3"),
    ("Python version:", "3.8.6 (tags/v3.8.6:db45529, Sep 23 2020, 15:52:53) [MSC v.1927 64 bit (AMD64)]"),
)
_current_setup = (
    ("TensorFlow version:", tf.__version__),
    ("Numpy version:", np.__version__),
    ("Pandas version:", pd.__version__),
    ("SKLearn version:", sklearn.__version__),
    ("Python version:", sys.version),
)

print("To ensure repeatability, use the following setup:")
for _label, _version in _reference_setup:
    print(_label, _version)
print("==============================")
print("Your current library version:")
for _label, _version in _current_setup:
    print(_label, _version)

In [None]:
# Remote CSV locations, in order:
#   csv_path1 - training-validation dataset
#   csv_path2 - unseen test set
#   csv_path3 - combined train-validation-test set
#   csv_path4 - molecule-only set
csv_path1, csv_path2, csv_path3, csv_path4 = (
    'https://raw.githubusercontent.com/afriwahyudi/Supplementary/main/Notebook_for_LocalMachine/datasetscsv/trainval_set.csv',
    'https://raw.githubusercontent.com/afriwahyudi/Supplementary/main/Notebook_for_LocalMachine/datasetscsv/test_set.csv',
    'https://raw.githubusercontent.com/afriwahyudi/Supplementary/main/Notebook_for_LocalMachine/datasetscsv/master_dataset.csv',
    'https://raw.githubusercontent.com/afriwahyudi/Supplementary/main/Notebook_for_LocalMachine/datasetscsv/allMolecule.csv',
)

# Fixed seeds for reproducibility: one for Python's `random` module (also
# reused as the train/validation split seed) and one for TensorFlow/Keras.
seed = keras_seed = 21
random.seed(seed)
tf.keras.utils.set_random_seed(keras_seed)

In [None]:
# Importing dataset
# Both CSV files share one column layout, so the schema is declared once and
# reused for each read instead of being repeated per file.
_DATASET_COLUMNS = ["Molecule", "Formula","SMILES","Type","Cyclicity",
                    "Primary counts","Secondary counts","Tertiary counts",
                    "Hydroxyl counts","Carboxyl counts","Oxyl counts",
                    "M0(nhb)", "M0(oh)", "M0(nh)", "M0(op)",
                    "M1(nhb_donor)", "M1(nhb_weak)", "M1(nhb_acceptor)",
                    "M1(oh_donor)", "M1(oh_weak)","M1(oh_acceptor)",
                    "M1(nh_donor)", "M1(nh_weak)", "M1(nh_acceptor)",
                    "M1(op_donor)", "M1(op_weak)", "M1(op_acceptor)",
                    "M2(nhb)", "M2(oh)", "M2(nh)","M2(op)",
                    "MW","Partial Pressure","Temperature",
                    "Amine Concentration", "Absorption Capacity", "References","Rounded Concentration", "Abbreviation"]

# Passing `names` without `header` makes pandas treat every row of the file as
# data. NOTE(review): this assumes the CSV files have no header row - confirm.
trainval_df = pd.read_csv(csv_path1, names=_DATASET_COLUMNS)  # training + validation rows
test_df     = pd.read_csv(csv_path2, names=_DATASET_COLUMNS)  # held-out test rows
# PREPROCESSING
# Separate the regression target ("Absorption Capacity") from the remaining
# columns for the held-out test set and for the train/validation pool.
y_test = test_df['Absorption Capacity']
X_test_label = test_df.drop(columns=["Absorption Capacity"])

y_trainval_label = trainval_df['Absorption Capacity']
X_trainval_label = trainval_df.drop(columns=["Absorption Capacity"])

# DATA SPLITTING
# 80/20 train/validation split, seeded so the partition is repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval_label,
    y_trainval_label,
    test_size=0.20,
    random_state=seed,
)

# Keep unscaled copies (text columns included) for later visualization.
X_train_1, X_val_1 = X_train.copy(), X_val.copy()
X_reserved = pd.concat([X_train_1, X_val_1, X_test_label], axis=0)

# DROPPING STRINGS AND DATA PREPROCESSING
# Columns removed before scaling; the rest are fed to the model as features.
_dropped_columns = ["Molecule", "Formula", "SMILES", "Type", "Cyclicity", "References", "Rounded Concentration", "Abbreviation"]

X_train = X_train.drop(columns=_dropped_columns)
X_test = X_test_label.drop(columns=_dropped_columns)
X_val = X_val.drop(columns=_dropped_columns)

feature_names = list(X_train.columns)

# Standardisation pipeline (a StandardScaler wrapped in two Pipeline layers).
preprocessor = Pipeline(steps=[('step1', StandardScaler())])
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit the scaler on the training split only, then apply the fitted transform to
# the validation and test splits. Each result is wrapped back into a DataFrame
# so the feature names are retained.
X_train = pd.DataFrame(pipeline.fit_transform(X_train), columns=feature_names)
X_val = pd.DataFrame(pipeline.transform(X_val), columns=feature_names)
X_test = pd.DataFrame(pipeline.transform(X_test), columns=feature_names)

In [None]:
#-------------------------------------CREATE MODEL------------------------------------------#
# Build, compile and train a small feed-forward regressor on the scaled
# training features, validating on the validation split.
tf.keras.backend.clear_session()
# Neural Network Architecture
# `shape` is given as a tuple (n_features,), the documented form for Input.
input_features = keras.layers.Input(shape=(X_train.shape[1],), name='Input_Layer')
hidden_layer_0 = keras.layers.Dense(units=100, activation='swish', name='Hidden_Layer_0', kernel_initializer = 'glorot_normal')(input_features)
hidden_layer_0_bn = keras.layers.BatchNormalization()(hidden_layer_0)
hidden_layer_1 = keras.layers.Dense(units=50, activation='swish', name='Hidden_Layer_1', kernel_initializer = 'glorot_normal')(hidden_layer_0_bn)
hidden_layer_1_bn = keras.layers.BatchNormalization()(hidden_layer_1)
# Skip connection: the raw inputs are concatenated with the last hidden
# representation before the output layer.
concat = keras.layers.concatenate([input_features,hidden_layer_1_bn])
# ReLU on the single output unit constrains predictions to be non-negative.
output_layer = keras.layers.Dense(units=1,  activation='relu', name='Predicted_Loading', kernel_initializer = 'glorot_normal')(concat)
model = keras.models.Model(inputs=[input_features], outputs=[output_layer])
# create optimizer with custom learning rate
optimizer = Adam(learning_rate=0.001, decay=0.00001)
# Trained on mean absolute error; mean squared error is tracked as a metric.
model.compile(loss='mae', optimizer=optimizer, metrics=['mse'])
# Define TensorBoard log directory (one timestamped sub-directory per run)
log_dir =   "Model_" + "logs/" + datetime.now().strftime("%Y%m%d-%H%M%S") + "ModelName"
# Create a TensorBoard callback
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)
# Stop once val_loss has not improved for 20 epochs and roll back to the best
# weights, so the large `epochs` value below acts only as an upper bound.
early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)
# fit the model to the data
cc_model = model.fit(X_train, y_train, validation_data=(X_val, y_val),
                     epochs=5000, batch_size=32, verbose=0, callbacks=[tensorboard_callback, early_stopping])
#-----------------------------------END CREATE MODEL----------------------------------------#
# Write a diagram of the architecture to "Architecture.png" and print a summary.
graph = keras.utils.plot_model(model, "Architecture"+".png", show_shapes=True)
model.summary()

In [None]:
# Persist the trained model to an HDF5 file; rename the file as desired.
model.save(filepath="ModelName.h5")

In [None]:
# Evaluate the model on the train, validation and test splits, then draw a
# parity plot (true vs. predicted loading) for each split, colored by `Type`.

# Color used for points whose `Type` value is missing from the color mapping.
_FALLBACK_COLOR = 'gray'


def _report_metrics(split_name, y_true, y_pred):
    """Print RMSE, MAE and R^2 of `y_pred` against `y_true` for one split.

    Args:
        split_name: Label that prefixes each printed line (e.g. "Train").
        y_true: True target values.
        y_pred: Model predictions for the same samples.
    """
    print(f"{split_name} data RMSE: ", np.sqrt(mean_squared_error(y_true, y_pred)))
    print(f"{split_name} data MAE: ", mean_absolute_error(y_true, y_pred))
    print(f"{split_name} data R^2: ", r2_score(y_true, y_pred))


def _point_colors(type_labels, color_by_type):
    """Map `Type` labels to colors.

    Labels missing from `color_by_type` map to NaN, which matplotlib rejects as
    a color, so they are replaced with `_FALLBACK_COLOR`.
    """
    return type_labels.map(color_by_type).fillna(_FALLBACK_COLOR)


def _plot_parity(title, y_true, y_pred, type_frame, color_by_type, line_vals):
    """Scatter true vs. predicted values with a dashed y = x reference line.

    Args:
        title: Figure title.
        y_true: True target values (x axis).
        y_pred: Predicted values (y axis).
        type_frame: DataFrame with a 'Type' column and a per-point 'color' column.
        color_by_type: Mapping of type label to color, used to build the legend.
        line_vals: Values along which the y = x reference line is drawn.

    Returns:
        The list of legend handles that was drawn.
    """
    plt.figure(figsize=(10, 6))
    plt.scatter(y_true, y_pred, c=type_frame['color'])
    plt.plot(line_vals, line_vals, color='black', linestyle='--')
    plt.title(title)
    plt.xlabel(' True CO$_{2}$ Loading (mol$_{CO2}$/mol$_{A}$)')
    plt.ylabel(' Prediction CO$_{2}$ Loading (mol$_{CO2}$/mol$_{A}$)')
    handles = [
        matplotlib.patches.Patch(facecolor=color, label=label)
        for label, color in color_by_type.items()
    ]
    # Only add an "Other" entry when some point actually used the fallback.
    if not type_frame['Type'].isin(list(color_by_type)).all():
        handles.append(matplotlib.patches.Patch(facecolor=_FALLBACK_COLOR, label='Other'))
    plt.legend(handles=handles, bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
    plt.show()
    return handles


# --- Metrics -----------------------------------------------------------------
y_train_pred = model.predict(X_train)
_report_metrics("Train", y_train, y_train_pred)

y_val_pred = model.predict(X_val)
_report_metrics("Validation", y_val, y_val_pred)

y_test_pred = model.predict(X_test)
_report_metrics("Test", y_test, y_test_pred)

# --- Per-point colors ----------------------------------------------------------
# `Type` labels come from the unscaled copies, because the text columns were
# dropped from the frames that are fed to the model.
dz_1 = X_train_1['Type']
df_1 = pd.DataFrame(dz_1)

dz_2 = X_test_label['Type']
df_2 = pd.DataFrame(dz_2)

dz_3 = X_val_1['Type']
df_3 = pd.DataFrame(dz_3)

# The y = x reference line spans the range of the training targets in all plots.
x_vals = np.linspace(min(y_train), max(y_train), 100)

# One color per categorical value; the same mapping applies to train (1),
# test (2) and validation (3).
colors_1 = {'Primary': 'red', 'Secondary': 'green', 'Tertiary': 'blue', 'Polyamine': 'purple'}
colors_2 = dict(colors_1)
colors_3 = dict(colors_1)

df_1['color'] = _point_colors(df_1['Type'], colors_1)
df_2['color'] = _point_colors(df_2['Type'], colors_2)
df_3['color'] = _point_colors(df_3['Type'], colors_3)

# --- Parity plots: train, test, validation --------------------------------------
for _title, _y_true, _y_pred, _type_frame in (
    ('Train Set Predictions', y_train, y_train_pred, df_1),
    ('Test Set Predictions', y_test, y_test_pred, df_2),
    ('Validation Set Predictions', y_val, y_val_pred, df_3),
):
    legend_elements = _plot_parity(_title, _y_true, _y_pred, _type_frame, colors_1, x_vals)

In [None]:
%load_ext tensorboard
%tensorboard --logdir=<log_dir>