In [1]:
import os
import sys
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import gc
import time
import subprocess
from concurrent.futures import ProcessPoolExecutor, as_completed

In [2]:
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs, Draw
from rdkit import RDConfig
from rdkit.Chem import Descriptors, rdMolDescriptors, Lipinski, rdDistGeom, rdPartialCharges
from rdkit.Chem.AllChem import GetMorganGenerator
from rdkit.DataStructs.cDataStructs import ConvertToNumpyArray
from rdkit.Avalon.pyAvalonTools import GetAvalonFP

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, root_mean_squared_error

In [None]:
# Reset any Keras state left in the kernel, then request memory growth on
# every GPU that TensorFlow lists. With no GPU the loop simply does nothing.
tf.keras.backend.clear_session()
gpus = tf.config.experimental.list_physical_devices('GPU')
try:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as e:
    # Best effort: a RuntimeError raised by TensorFlow is reported, not fatal.
    print(e)

In [6]:
# Output directory handed to process_dataset further down; created up front
# (exist_ok=True makes re-running this cell harmless).
target_path = "result/3_solubility_descriptor_deeplearning"
os.makedirs(target_path, exist_ok=True)

In [7]:
# Load the four datasets. In each file the SMILES column is read with the
# pandas 'string' dtype and the target column is selected by position
# (presumably the logS value -- confirm against the CSV headers).

# ws496: target is the third column.
data_ws = pd.read_csv('./data/ws496_logS.csv', dtype={'SMILES': 'string'})
smiles_ws, y_ws = data_ws['SMILES'], data_ws.iloc[:, 2]

# Delaney: target is the second column.
data_delaney = pd.read_csv('./data/delaney-processed.csv', dtype={'smiles': 'string'})
smiles_de, y_de = data_delaney['smiles'], data_delaney.iloc[:, 1]

# Lovric 2020: target is the second column.
data_lovric2020 = pd.read_csv('./data/Lovric2020_logS0.csv', dtype={'isomeric_smiles': 'string'})
smiles_lo, y_lo = data_lovric2020['isomeric_smiles'], data_lovric2020.iloc[:, 1]

# Huuskonen: target is the last column and is cast to float explicitly.
data_huuskonen = pd.read_csv('./data/huusk.csv', dtype={'SMILES': 'string'})
smiles_hu, y_hu = data_huuskonen['SMILES'], data_huuskonen.iloc[:, -1].astype('float')

In [8]:
def mol3d(mol):
    """Return a hydrogen-added copy of ``mol`` that carries a conformer, or None.

    The molecule is passed through ``Chem.AddHs`` and then each entry of
    ``optimization_methods`` is tried in order; as soon as the molecule reports
    at least one conformer it is returned.

    Args:
        mol: RDKit molecule to embed.

    Returns:
        The hydrogen-added molecule with at least one conformer, or ``None``
        when no step produced one (a message with the SMILES is printed).
    """
    mol = Chem.AddHs(mol)
    # NOTE(review): the function returns right after the first step that leaves
    # a conformer on the molecule, so the UFF/MMFF entries are only reached when
    # embedding produced none. They presumably cannot create a conformer
    # themselves -- confirm whether an optimisation after a successful embed
    # was intended.
    optimization_methods = [
        (AllChem.EmbedMolecule, (mol, AllChem.ETKDGv3()), {}),
        (AllChem.UFFOptimizeMolecule, (mol,), {'maxIters': 200}),
        (AllChem.MMFFOptimizeMolecule, (mol,), {'maxIters': 200})
    ]

    for method, args, kwargs in optimization_methods:
        try:
            method(*args, **kwargs)
            if mol.GetNumConformers() > 0:
                return mol
        except ValueError as e:
            print(f"Error: {e} - Trying next optimization method [{method}]")

    # The ANSI colour codes live in plain variables: a backslash inside an
    # f-string replacement field is a SyntaxError before Python 3.12.
    blue, reset = '\033[94m', '\033[0m'
    print(f"Invalid mol for 3d {blue}{Chem.MolToSmiles(mol)}{reset} - No conformer generated")
    return None

In [9]:
def _save_failed_mol_image(mol, fail_folder, index):
    """Best effort: draw ``mol`` to ``<fail_folder>/mol_<index>.png``.

    Does nothing when no folder/index was supplied or there is no molecule to
    draw. Any drawing or writing error is printed instead of raised so that a
    diagnostics failure never hides the original conversion error.
    """
    if not fail_folder or index is None or mol is None:
        return
    try:
        img_path = os.path.join(fail_folder, f"mol_{index}.png")
        Draw.MolToImage(mol).save(img_path)
    except Exception as e:
        print(f"[convert_smiles_to_mol] could not save failure image for index {index}: {e}")


def convert_smiles_to_mol(smiles, fail_folder=None, index=None, yvalue=None):
    """Parse a SMILES string into a re-canonicalised, sanitised RDKit molecule.

    Steps: parse, kekulise (clearing aromatic flags), write the isomeric SMILES,
    re-parse that SMILES, then call ``Chem.SanitizeMol``.

    Args:
        smiles: SMILES text. Anything that is not a ``str`` (``None``, NaN,
            ``pd.NA`` ...) is reported as an invalid SMILES instead of being
            handed to RDKit.
        fail_folder: Optional folder that receives a PNG of a molecule that
            failed after the initial parse.
        index: Row index used in the PNG file name; no image is written when
            it is ``None``.
        yvalue: Target value copied into the error record.

    Returns:
        ``(mol, None)`` on success, or ``(None, error)`` where ``error`` is a
        dict with the keys ``"smiles"``, ``"y_value"`` and ``"error"``.
    """
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        print(f"[convert_smiles_to_mol] Cannot convert {smiles} to Mols")
        return None, {"smiles": smiles, "y_value": yvalue, "error": "Invalid SMILES"}

    try:
        Chem.Kekulize(mol, clearAromaticFlags=True)
        isomeric_smiles = Chem.MolToSmiles(mol, isomericSmiles=True)
        reparsed = Chem.MolFromSmiles(isomeric_smiles)
        # MolFromSmiles signals failure by returning None rather than raising;
        # turn that into an error here so it is reported as a round-trip problem.
        if reparsed is None:
            raise ValueError(f"RDKit could not re-parse {isomeric_smiles}")
    except Exception as e:
        print(f"[convert_smiles_to_mol] failed {smiles} isomeric_smiles by {e}")
        _save_failed_mol_image(mol, fail_folder, index)
        return None, {"smiles": smiles, "y_value": yvalue, "error": f"Isomeric SMILES error: {e}"}
    mol = reparsed

    try:
        Chem.SanitizeMol(mol)
    except Exception as e:
        print(f"[convert_smiles_to_mol] failed {smiles} SanitizeMol by {e}")
        _save_failed_mol_image(mol, fail_folder, index)
        return None, {"smiles": smiles, "y_value": yvalue, "error": f"SanitizeMol error: {e}"}

    return mol, None

In [10]:
def process_smiles(smiles, yvalue, fail_folder, index):
    """Validate one SMILES/target pair: it must parse and yield a conformer.

    Args:
        smiles: SMILES text of the molecule.
        yvalue: Target value belonging to the molecule.
        fail_folder: Folder that receives a PNG of a rejected molecule.
        index: Row index, used in the PNG file name.

    Returns:
        ``(smiles, yvalue, None)`` when the molecule is accepted, otherwise
        ``(None, None, error)`` where ``error`` is a dict with the keys
        ``"smiles"``, ``"y_value"`` and ``"error"``.
    """
    mol, error = convert_smiles_to_mol(smiles, fail_folder, index, yvalue)
    if error:
        return None, None, error

    if mol3d(mol) is not None:
        return smiles, yvalue, None

    # The image is only a diagnostic aid; failing to write it must not abort
    # the worker and lose the rejection record.
    try:
        img_path = os.path.join(fail_folder, f"mol_{index}.png")
        Draw.MolToImage(mol).save(img_path)
    except Exception as e:
        print(f"[process_smiles] could not save failure image for {smiles}: {e}")
    # Carry an "error" entry like every other rejection record so the failure
    # CSV has a reason for each row.
    return None, None, {"smiles": smiles, "y_value": yvalue, "error": "3D conformer generation failed"}

def process_dataset(smiles_list, y_values, dataset_name, target_path="result", max_workers=None):
    """Filter a dataset down to the molecules accepted by ``process_smiles``.

    Every (SMILES, target) pair is checked in a process pool. Rejected pairs
    are written to ``<target_path>/failed/<dataset_name>/failed_smiles.csv``.

    Args:
        smiles_list: Iterable of SMILES strings.
        y_values: Iterable of target values, paired with ``smiles_list`` by
            position.
        dataset_name: Label used for the failure folder and the log line.
        target_path: Root folder for the failure output.
        max_workers: Passed to ``ProcessPoolExecutor``.

    Returns:
        ``(valid_smiles, valid_y)``: two lists, in the same relative order as
        the input.
    """
    start = time.time()
    valid_smiles, valid_y = [], []
    error_smiles_list = []
    fail_folder = f"{target_path}/failed/{dataset_name}"
    os.makedirs(fail_folder, exist_ok=True)

    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        submitted = [
            (smiles, yvalue, executor.submit(process_smiles, smiles, yvalue, fail_folder, i))
            for i, (smiles, yvalue) in enumerate(zip(smiles_list, y_values))
        ]
        # Collect in submission order, not completion order: the output order
        # must be deterministic, otherwise a seeded train/test split further
        # down selects different rows on every run.
        for smiles_in, y_in, future in submitted:
            try:
                smiles, yvalue, error = future.result()
            except Exception as e:
                # A crashed worker rejects this molecule instead of aborting
                # the whole dataset.
                smiles, yvalue = None, None
                error = {"smiles": smiles_in, "y_value": y_in, "error": f"Worker error: {e}"}
            if error:
                error_smiles_list.append(error)
            elif smiles is not None and yvalue is not None:
                valid_smiles.append(smiles)
                valid_y.append(yvalue)

    if error_smiles_list:
        error_df = pd.DataFrame(error_smiles_list)
        error_df.to_csv(os.path.join(fail_folder, "failed_smiles.csv"), index=False)
        print(f" [{dataset_name:<10}] : {len(error_smiles_list)} molecules rejected")
    print(f" [{dataset_name:<10}] : {time.time()-start:.4f} sec")
    return valid_smiles, valid_y

In [None]:
# Keep only the molecules process_dataset accepts for each dataset.
# Note that the names are rebound here: they go in as the pandas Series loaded
# above and come out as the plain lists that process_dataset returns, so
# re-running this cell feeds those lists back in.
smiles_ws, y_ws = process_dataset(smiles_ws, y_ws, "ws496", target_path)
smiles_de, y_de = process_dataset(smiles_de, y_de, "delaney", target_path)
smiles_lo, y_lo = process_dataset(smiles_lo, y_lo, "Lovric2020_logS0", target_path)
smiles_hu, y_hu = process_dataset(smiles_hu, y_hu, "huusk", target_path)

In [12]:
# Fingerprint vector lengths used by get_fingerprints / fp_converter.
LEN_OF_FF = 2048  # Morgan fingerprint size (fpSize) and ECFP array length
LEN_OF_MA = 167   # column count of the MACCS matrix
LEN_OF_AV = 512   # length of the Avalon fingerprint array

In [13]:
def get_fingerprints(mol):
    """Compute the Morgan, MACCS and Avalon fingerprints of one molecule.

    Args:
        mol: RDKit molecule, or ``None``.

    Returns:
        ``(ecfp_array, maccs, avalon_array)``: the Morgan fingerprint as an
        integer numpy array of length ``LEN_OF_FF``, the MACCS fingerprint as
        returned by RDKit (not converted to numpy here), and the Avalon
        fingerprint as an integer numpy array of length ``LEN_OF_AV``.
        ``(None, None, None)`` when ``mol`` is ``None``.
    """
    if mol is None:
        return None, None, None

    morgan_generator = GetMorganGenerator(radius=2, fpSize=LEN_OF_FF)
    ecfp_array = np.zeros((LEN_OF_FF,), dtype=int)
    DataStructs.ConvertToNumpyArray(morgan_generator.GetFingerprint(mol), ecfp_array)

    maccs = Chem.rdMolDescriptors.GetMACCSKeysFingerprint(mol)

    # Request the Avalon length explicitly so the fingerprint always matches
    # the LEN_OF_AV buffer instead of relying on the library default.
    avalon_fp = GetAvalonFP(mol, nBits=LEN_OF_AV)
    avalon_array = np.zeros((LEN_OF_AV,), dtype=int)
    DataStructs.ConvertToNumpyArray(avalon_fp, avalon_array)

    return ecfp_array, maccs, avalon_array

def fp_converter(data, use_parallel=True):
    """Convert SMILES strings into molecules and three fingerprint matrices.

    Args:
        data: Iterable of SMILES strings.
        use_parallel: Compute the fingerprints in a process pool; on any pool
            error the work is redone sequentially.

    Returns:
        ``(mols, ECFP_container, MACCS_container, AvalonFP_container)``.
        ``mols`` has one entry per input SMILES and every matrix has one row
        per entry of ``mols``, with ``LEN_OF_FF``, ``LEN_OF_MA`` and
        ``LEN_OF_AV`` columns respectively. A molecule that could not be
        parsed keeps an all-zero row in every matrix, so row ``i`` always
        belongs to input ``i``.
    """
    mols = [Chem.MolFromSmiles(smi) for smi in data]

    results = None
    if use_parallel and mols:
        try:
            with ProcessPoolExecutor() as executor:
                results = list(executor.map(get_fingerprints, mols))
        except Exception as e:
            print(f"Parallel processing failed due to: {e}. Falling back to sequential processing.")
    if results is None:
        results = [get_fingerprints(mol) for mol in mols]

    # Pre-allocate all three matrices with one row per molecule. Filtering
    # failed molecules out of only some matrices would shift their rows
    # relative to the others and to the target values.
    n_mols = len(mols)
    ECFP_container = np.zeros((n_mols, LEN_OF_FF), dtype=int)
    MACCS_container = np.zeros((n_mols, LEN_OF_MA), dtype=int)
    AvalonFP_container = np.zeros((n_mols, LEN_OF_AV), dtype=int)

    n_failed = 0
    for i, (ecfp, maccs, avalon) in enumerate(results):
        if ecfp is None or maccs is None or avalon is None:
            n_failed += 1
        if ecfp is not None:
            ECFP_container[i] = ecfp
        if maccs is not None:
            DataStructs.ConvertToNumpyArray(maccs, MACCS_container[i])
        if avalon is not None:
            AvalonFP_container[i] = avalon

    if n_failed:
        print(f"[fp_converter] {n_failed} molecules have no fingerprint; their rows are left as zeros")

    return mols, ECFP_container, MACCS_container, AvalonFP_container

In [14]:
# Featurize each cleaned dataset. fp_converter returns, in order, the RDKit
# molecules, the Morgan (ECFP) matrix, the MACCS matrix and the Avalon matrix,
# so the x_* names hold the Morgan fingerprints.
mol_ws, x_ws, MACCS_ws, AvalonFP_ws = fp_converter(smiles_ws)
mol_de, x_de, MACCS_de, AvalonFP_de = fp_converter(smiles_de)
mol_lo, x_lo, MACCS_lo, AvalonFP_lo = fp_converter(smiles_lo)
mol_hu, x_hu, MACCS_hu, AvalonFP_hu = fp_converter(smiles_hu)

In [15]:
# Training hyper-parameters. BATCHSIZE, EPOCHS and lr are forwarded on the
# command line to learning_process.py by evaluate_descriptor; lr is also the
# Adam learning rate in new_model, and decay is the L2 factor of the hidden
# Dense layers.
BATCHSIZE = 32
EPOCHS = 100
lr = 0.01  # alternative value noted by the author: 0.0001
decay = 1e-5  # alternative value noted by the author: 1e-4

In [16]:
def new_model():
    """Build and compile the regression network used for in-kernel training.

    Two ReLU Dense layers (1024 and 469 units, glorot-uniform initialisation,
    L2 regularisation with factor ``decay``), each followed by Dropout(0.2),
    and a single-unit output layer. No input layer is declared. The model is
    compiled with Adam at learning rate ``lr`` and a mean-squared-error loss.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    stack = []
    for width in (1024, 469):
        stack.append(
            tf.keras.layers.Dense(
                units=width,
                activation='relu',
                kernel_initializer='glorot_uniform',
                kernel_regularizer=regularizers.l2(decay)))
        stack.append(tf.keras.layers.Dropout(0.2))
    stack.append(tf.keras.layers.Dense(units=1))

    model = tf.keras.Sequential(stack)
    tracked_metrics = [
        tf.keras.losses.MeanSquaredError(),
        tf.keras.losses.MeanAbsoluteError(),
        tf.keras.metrics.RootMeanSquaredError(),
    ]
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
        loss=tf.keras.losses.MeanSquaredError(),
        metrics=tracked_metrics)
    return model

def new_inference_model(input_dim):
    """Build the same layer stack as ``new_model`` with an explicit input shape.

    Args:
        input_dim: Number of input features; used as ``Input(shape=(input_dim,))``.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model.
    """
    stack = [layers.Input(shape=(input_dim,))]
    for width in (1024, 469):
        stack.append(
            layers.Dense(
                units=width,
                activation='relu',
                kernel_initializer='glorot_uniform',
                kernel_regularizer=regularizers.l2(decay)))
        stack.append(layers.Dropout(0.2))
    stack.append(layers.Dense(units=1))
    return tf.keras.Sequential(stack)

def save_model(x_data):
    """Write a fresh, untrained model sized for ``x_data`` to disk.

    Any existing ``save_model/full_model.keras`` is removed first, then a new
    model with ``x_data.shape[1]`` inputs is built and saved to that path.
    A failure while building or saving is printed, not raised.

    Args:
        x_data: 2-D feature array; only its column count is used.
    """
    model_path = "save_model/full_model.keras"
    # Always start from a clean slate: drop the previous file before saving.
    if os.path.exists(model_path):
        os.remove(model_path)
    try:
        fresh_model = new_inference_model(x_data.shape[1])
        os.makedirs("save_model", exist_ok=True)
        fresh_model.save(model_path)
    except Exception as e:
        print(f"Error saving model: {e}")

In [17]:
import subprocess
import logging
# Environment settings
# These variables are written to os.environ after tensorflow was already
# imported in an earlier cell. Child processes started later without an
# explicit env= (such as the subprocess.run call in evaluate_descriptor)
# inherit them.
# NOTE(review): TensorFlow presumably reads at least some of these only at
# import/initialisation time, so they may have no effect on this kernel
# itself -- confirm, and move this cell above the tensorflow import if the
# kernel is meant to be affected.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['TF_GPU_ALLOCATOR'] = 'cuda_malloc_async'
os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
os.environ['TF_XLA_FLAGS'] = '--tf_xla_auto_jit=2 --tf_xla_enable_xla_devices'
# NOTE(review): hard-coded CUDA location -- verify it matches the machine.
os.environ['XLA_FLAGS'] = '--xla_gpu_cuda_data_dir=/usr/local/cuda --xla_gpu_force_compilation_parallelism=1'

# Suppress TensorFlow logging
# Only records of level ERROR and above pass the 'tensorflow' Python logger.
logging.getLogger('tensorflow').setLevel(logging.ERROR)

In [18]:
def learning_result(xdata, ydata, batch_size=32, epochs=100):
    """Train a fresh ``new_model`` in this kernel and return its test-set R2.

    The data is split 80/20 with a fixed ``random_state=42``; the model is
    fitted on the training part (with a further 20% validation split) and
    scored on the held-out part.

    Args:
        xdata: Feature matrix.
        ydata: Target values.
        batch_size: Batch size passed to ``fit``.
        epochs: Number of epochs passed to ``fit``.

    Returns:
        The R2 score on the test split, or ``0.0`` if anything failed (the
        error is printed).
    """
    record = 0.0      # returned unchanged when training fails
    model_new = None  # bound up front so the cleanup below is always valid
    try:
        tf.keras.backend.clear_session()
        xtr, xte, ytr, yte = train_test_split(xdata, ydata, test_size=0.2, random_state=42)
        model_new = new_model()
        model_new.fit(xtr, ytr, epochs=epochs, batch_size=batch_size, validation_split=0.2, verbose=0)
        ypred = model_new.predict(xte, verbose=0)
        # r2_score is not symmetric: the ground truth must be the first argument.
        record = r2_score(yte, ypred)
    except Exception as e:
        print(f"[learning_result]Error occureed: {e}")
    finally:
        # Drop the model reference before collecting so it can be freed.
        model_new = None
        tf.keras.backend.clear_session()
        gc.collect()
    return record


In [19]:
def _combine_features(fps, descriptor):
    """Return a copy of ``fps`` with ``descriptor`` appended as extra columns.

    Supported descriptor forms: ``None`` (nothing appended), an ndarray with
    two or more dimensions, a list of ndarrays (all 1-D arrays are appended as
    columns first, then the 2-D arrays), a list of lists, or anything that
    converts to a 1-D float array (appended as one column). Shape or
    conversion errors propagate to the caller.
    """
    combined = fps.copy()
    if descriptor is None:
        return combined
    if isinstance(descriptor, np.ndarray) and descriptor.ndim >= 2:
        return np.concatenate([combined, descriptor], axis=1)
    if isinstance(descriptor, list) and isinstance(descriptor[0], np.ndarray):
        columns_1d = [arr[:, None] for arr in descriptor if arr.ndim == 1]
        blocks_2d = [arr for arr in descriptor if arr.ndim == 2]
        return np.concatenate([combined] + columns_1d + blocks_2d, axis=1)
    if isinstance(descriptor, list) and isinstance(descriptor[0], list):
        return np.concatenate([combined, np.asarray(descriptor).astype('float')], axis=1)
    column = np.asarray(descriptor).astype('float')
    return np.concatenate([combined, column[:, None]], axis=1)


def _parse_r2_score(stdout):
    """Return the number after the first line starting with ``R2:``, else None."""
    for line in stdout.strip().splitlines():
        if line.startswith("R2:"):
            return float(line.split("R2:")[1].strip().split()[0])
    return None


def evaluate_descriptor(fps, descriptor, descriptor_name, y_true, r2_list, esti_time=None, cpu_calc=False):
    """Train on ``fps`` plus one descriptor in a child process and record R2.

    The combined features and targets are saved to ``new_fps.npy`` and
    ``y_true.npy`` in the working directory, a fresh model file is written via
    ``save_model``, and ``./extra_code/learning_process.py`` is run with the
    current interpreter. The score is read from the first ``R2:`` line of the
    child's stdout.

    Args:
        fps: Base feature matrix (2-D array); it is not modified.
        descriptor: Extra features to append; see ``_combine_features``.
        descriptor_name: Key under which the results are stored.
        y_true: Target values.
        r2_list: Dict updated in place with ``descriptor_name -> R2``.
            ``0.0`` is stored whenever the evaluation fails or no score is
            reported.
        esti_time: Optional dict updated in place with the elapsed seconds.
        cpu_calc: Accepted for backward compatibility; not used here.

    Returns:
        None. Results are delivered through ``r2_list`` and ``esti_time``.
    """
    # Start the clock before anything can fail so every handler can use it.
    start = time.time()

    try:
        new_fps = _combine_features(fps, descriptor)
    except Exception as e:
        # A descriptor that cannot be appended is a failed evaluation. Training
        # on the bare fingerprints instead would store the baseline score
        # under this descriptor's name.
        print(f"[{descriptor_name}] Error in evaluate_descriptor: {e}")
        r2_list[descriptor_name] = 0.0
        if esti_time is not None:
            esti_time[descriptor_name] = time.time() - start
        return

    r2_result = 0.0
    succeeded = False
    try:
        new_fps = np.nan_to_num(new_fps, nan=0.0, posinf=0.0, neginf=0.0).astype('float')
        y_true = np.asarray(y_true).astype('float')

        # Hand the data to the child process through files in the working directory.
        np.save('new_fps.npy', new_fps)
        np.save('y_true.npy', y_true)

        save_model(new_fps)

        # sys.executable runs the child with the kernel's own interpreter
        # rather than whichever python3 happens to be first on PATH.
        result = subprocess.run([sys.executable, './extra_code/learning_process.py',
                                 str(BATCHSIZE), str(EPOCHS),
                                 str(lr),
                                 'new_fps.npy', 'y_true.npy'],
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE,
                                text=True,
                                encoding='utf-8')

        if result.stderr:
            print(f"[{descriptor_name}] stderr output:", result.stderr)
        if result.returncode != 0:
            print(f"[{descriptor_name}] learning_process.py exited with code {result.returncode}")

        parsed = _parse_r2_score(result.stdout)
        if parsed is None:
            print(f"[{descriptor_name}] no 'R2:' line found in the learning process output; recording 0.0")
        else:
            r2_result = parsed
        succeeded = True
    except Exception as inner_e:
        print(f"[{descriptor_name}] Error during learning process: {inner_e}")
        r2_result = 0.0
    finally:
        # Clean up memory
        del new_fps
        gc.collect()

    r2_list[descriptor_name] = r2_result
    duration_time = time.time() - start
    if esti_time is not None:
        esti_time[descriptor_name] = duration_time

    if succeeded:
        print(f'=========================={descriptor_name:<20} R2 score: {r2_result:.4f} ({duration_time:.3f} sec)')


In [20]:
# def evaluate_descriptor(fps, descriptor, descriptor_name, y_true, r2_list, cpu_calc=False, esti_time=None):
#     # try:
#     new_fps = fps.copy()
#     start = time.time()
#     if descriptor is None:
#         pass
#     elif isinstance(descriptor, np.ndarray) and descriptor.ndim >= 2:
#         # print("-1-")
#         # try:
#         #     print("numpy",descriptor.shape)
#         # except:
#         #     print("list",len(descriptor))
#         # print(descriptor)
#         try:
#             new_fps = np.concatenate([new_fps, descriptor], axis=1)
#             del descriptor
#         except Exception as e:
#             print(f"[-1-] Error occured: {e}")
#     elif isinstance(descriptor, list) and isinstance(descriptor[0], np.ndarray):
#         # print("-2-")
#         # try:
#         #     print("numpy",descriptor.shape)
#         # except:
#         #     print("list",len(descriptor))
#         # print(descriptor)
#         try:
#             arrays_1d = [arr[:, None] for arr in descriptor if arr.ndim == 1]
#             arrays_2d = [arr for arr in descriptor if arr.ndim == 2]
#             combined_1d = np.concatenate(arrays_1d, axis=1) if arrays_1d else None
#             combined_2d = np.concatenate(arrays_2d, axis=1) if arrays_2d else None
#             to_concat = [new_fps] + [arr for arr in [combined_1d, combined_2d] if arr is not None]
#             new_fps = np.concatenate(to_concat, axis=1)
#             del descriptor, arrays_1d, arrays_2d
#             if combined_1d is not None: del combined_1d
#             if combined_2d is not None: del combined_2d
#         except Exception as e:
#             print(f"[-2-] Error occured: {e}")
#     elif isinstance(descriptor, list) and isinstance(descriptor[0], list):
#         # print("-3-")
#         # try:
#         #     print("numpy",descriptor.shape)
#         # except:
#         #     print("list",len(descriptor))
#         # print(descriptor)
#         try:
#             descriptor = np.asarray(descriptor).astype('float')
#             new_fps = np.concatenate([new_fps, descriptor], axis=1)
#             del descriptor
#         except Exception as e:
#             print(f"[-3-] Error occured: {e}")
#     else:
#         # print("-4-")
#         # try:
#         #     print("numpy",descriptor.shape)
#         # except:
#         #     print("list",len(descriptor))
#         # print(descriptor)
#         descriptor = np.asarray(descriptor).astype('float')
#         new_fps = np.concatenate([new_fps, descriptor[:,None]], axis=1)
#         del descriptor
            
#         #######################################################
#         #######################################################
#         #######################################################
#         #######################################################
#     try:
#         new_fps = np.nan_to_num(new_fps, nan=0.0, posinf=0.0, neginf=0.0).astype('float')
#         y_true = np.asarray(y_true).astype('float')
#         if cpu_calc:
#             r2_list[descriptor_name] = learning_result(new_fps, y_true)
#         else:
#             np.save('new_fps.npy', new_fps)
#             np.save('y_true.npy', y_true)
            
#             save_model(new_fps)  
            
#             result = subprocess.run(['python3', './extra_code/learning_process.py', 
#                                     str(BATCHSIZE), str(EPOCHS), 
#                                     str(lr),
#                                     'new_fps.npy', 'y_true.npy'],
#                                 stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
            
#             if result.stderr:
#                 filtered_stderr = '\n'.join([line for line in result.stderr.split('\n') if "could not open file to read NUMA node" not in line and "Your kernel may have been built without NUMA support" not in line])
#                 if filtered_stderr:
#                     print(f"Error in subprocess: {filtered_stderr}", file=sys.stderr)

#             for line in result.stdout.splitlines():
#                 if "R2" in line:
#                     if "(prune)" in line:
#                         print(f"Pruning trial due to poor R2: {line}")
#                         r2_result = 0.0
#                     else:
#                         r2_result = float(line.split(":")[1].strip())
#                         print(f"R2 score: {r2_result}")
#             r2_list[descriptor_name] = r2_result

        
#     except Exception as e:
#         print(f"Error during learning result process: {e}", file=sys.stderr)
#         r2_list[descriptor_name] = 0.0

#     duration_time = time.time() - start
#     if esti_time is not None:
#         esti_time[descriptor_name] = duration_time

#     print(f'=========================={descriptor_name:<20} R2 score: {r2_result:.4f} ({duration_time:.3f} sec)')

#     del new_fps
#     gc.collect()
        
#     # except Exception as e:
#     #     print(f"Error in evaluate_descriptor: {e}, Descriptor: {descriptor_name}", file=sys.stderr)

In [21]:
# 241023
# def evaluate_descriptor(fps, descriptor, descriptor_name, y_true, r2_list, esti_time=None):
#     try:
#         new_fps = fps.copy()
#         start = time.time()
#         if descriptor is None:
#             pass
#         elif isinstance(descriptor, np.ndarray) and len(descriptor.shape) == 2:
#             if new_fps.shape[0] != descriptor.shape[0]:
#                 raise ValueError(f"-2- Shape mismatch: fps has {new_fps.shape[0]} rows, but descriptor has {descriptor.shape[0]} rows")
#             new_fps = np.concatenate([new_fps, descriptor], axis=1)
#             del descriptor
#         elif isinstance(descriptor, list) and isinstance(descriptor[0], np.ndarray):
#             descriptor_2d = [arr.reshape(-1, 1) if arr.ndim == 1 else arr for arr in descriptor]
#             descriptor_df = np.concatenate(descriptor_2d, axis=1)
#             new_fps = np.concatenate([new_fps, descriptor_df], axis=1)
#             del descriptor, descriptor_df, descriptor_2d
#         elif isinstance(descriptor, list) and isinstance(descriptor[0], list):
#             descriptor = np.asarray(descriptor)
#             new_fps = np.concatenate((new_fps, descriptor), axis=1)
#             del descriptor
#         else:
#             descriptor = np.asarray(descriptor).reshape(-1, 1)
#             if new_fps.shape[0] != descriptor.shape[0]:
#                 raise ValueError(f"-3- Shape mismatch: fps has {new_fps.shape[0]} rows, but descriptor has {descriptor.shape[0]} rows")
#             new_fps = np.concatenate((new_fps, descriptor), axis=1)
#             del descriptor
            
#         #######################################################
#         #######################################################
#         #######################################################
#         #######################################################
#         try:
#             new_fps = np.nan_to_num(new_fps, nan=0.0, posinf=0.0, neginf=0.0).astype('float')
#             y_true = np.asarray(y_true).astype('float')
#             np.save('new_fps.npy', new_fps)
#             np.save('y_true.npy', y_true)
            
#             save_model(new_fps)  
            
#             result = subprocess.run(['python3', './extra_code/learning_process.py', 
#                                  str(BATCHSIZE), str(EPOCHS), 
#                                  str(lr),
#                                  'new_fps.npy', 'y_true.npy'],
#                                 stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
            
#             if result.stderr:
#                 filtered_stderr = '\n'.join([line for line in result.stderr.split('\n') if "could not open file to read NUMA node" not in line and "Your kernel may have been built without NUMA support" not in line])
#                 if filtered_stderr:
#                     print(f"Error in subprocess: {filtered_stderr}", file=sys.stderr)

#             for line in result.stdout.splitlines():
#                 if "R2" in line:
#                     if "(prune)" in line:
#                         print(f"Pruning trial due to poor R2: {line}")
#                         r2_result = 0.0
#                     else:
#                         r2_result = float(line.split(":")[1].strip())
#                         print(f"R2 score: {r2_result}")

#             # if result.returncode != 0:
#             #     raise ValueError(f"[{descriptor_name}] Error during learning result process: {result.stderr}")
#             # try:
#             #     r2_result = float(result.stdout.strip())
#             # except ValueError:
#             #     raise ValueError(f"[{descriptor_name}] Unable to parse R² score from output: {result.stdout}")

#             # if r2_result is None:
#             #     raise ValueError(f"[{descriptor_name}] R² score is None")
#             r2_list[descriptor_name] = r2_result
            
#         except Exception as e:
#             print(f"Error during learning result process: {e}", file=sys.stderr)
#             r2_list[descriptor_name] = 0.0

#         duration_time = time.time() - start
#         if esti_time is not None:
#             esti_time[descriptor_name] = duration_time

#         print(f'=========================={descriptor_name:<20} R2 score: {r2_result:.4f} ({duration_time:.3f} sec)')

#         del new_fps
#         gc.collect()
        
#     except Exception as e:
#         print(f"Error in evaluate_descriptor: {e}, Descriptor: {descriptor_name}", file=sys.stderr)

In [22]:
# def evaluate_descriptor(fps, descriptor, descriptor_name, y_true, r2_list, esti_time=None):
#     try:
#         new_fps = fps.copy()
#         start = time.time()
#         if descriptor is None:
#             pass
#         elif isinstance(descriptor, np.ndarray) and len(descriptor.shape) == 2:
#             if new_fps.shape[0] != descriptor.shape[0]:
#                 raise ValueError(f"-2- Shape mismatch: fps has {new_fps.shape[0]} rows, but descriptor has {descriptor.shape[0]} rows")
#             new_fps = np.concatenate([new_fps, descriptor], axis=1)
#             del descriptor
#         elif isinstance(descriptor, list) and isinstance(descriptor[0], np.ndarray):
#             descriptor_2d = [arr.reshape(-1, 1) if arr.ndim == 1 else arr for arr in descriptor]
#             descriptor_df = np.concatenate(descriptor_2d, axis=1)
#             new_fps = np.concatenate([new_fps, descriptor_df], axis=1)
#             del descriptor, descriptor_df, descriptor_2d
#         elif isinstance(descriptor, list) and isinstance(descriptor[0], list):
#             descriptor = np.asarray(descriptor)
#             new_fps = np.concatenate((new_fps, descriptor), axis=1)
#             del descriptor
#         else:
#             descriptor = np.asarray(descriptor).reshape(-1, 1)
#             if new_fps.shape[0] != descriptor.shape[0]:
#                 raise ValueError(f"-3- Shape mismatch: fps has {new_fps.shape[0]} rows, but descriptor has {descriptor.shape[0]} rows")
#             new_fps = np.concatenate((new_fps, descriptor), axis=1)
#             del descriptor

#         try:
#             new_fps = np.nan_to_num(new_fps, nan=0.0, posinf=0.0, neginf=0.0).astype('float')
#             y_true = np.asarray(y_true).astype('float')
#             np.save('new_fps.npy', new_fps)
#             np.save('y_true.npy', y_true)
            
#             save_model(new_fps)  
            
#             result = subprocess.run(['python3', './extra_code/learning_process.py', 
#                                  str(BATCHSIZE), str(EPOCHS), 
#                                  str(lr),
#                                  'new_fps.npy', 'y_true.npy'],
#                                 stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
            
#             if result.stderr:
#                 sys.stderr.write(result.stderr)

#             if result.returncode != 0:
#                 raise ValueError(f"[{descriptor_name}] Error during learning result process: {result.stderr}")

#             try:
#                 r2_result = float(result.stdout.strip())
#             except ValueError:
#                 raise ValueError(f"[{descriptor_name}] Unable to parse R² score from output: {result.stdout}")

#             if r2_result is None:
#                 raise ValueError(f"[{descriptor_name}] R² score is None")
#             r2_list[descriptor_name] = r2_result
            
#         except Exception as inner_e:
#             print(f"Error during learning result process: {inner_e}", file=sys.stderr)
#             r2_list[descriptor_name] = 0

#         duration_time = time.time() - start
#         if esti_time is not None:
#             esti_time[descriptor_name] = duration_time

#         print(f'=========================={descriptor_name:<20} R2 score: {r2_result:.4f} ({duration_time:.3f} sec)')

#         del new_fps
#         gc.collect()
        
#     except Exception as e:
#         print(f"Error in evaluate_descriptor: {e}, Descriptor: {descriptor_name}", file=sys.stderr)

In [23]:
def Normalization(descriptor, epsilon=1e-10, max_value=1e15):
    """Compress descriptor values with a sign-preserving ``log1p`` transform.

    Steps, applied element-wise:

    1. Clip to ``[-max_value, max_value]`` (so +/-inf become +/-max_value).
    2. Replace every entry with ``|x| < epsilon`` by ``+epsilon``. Note that
       this also turns tiny *negative* values into ``+epsilon``.
    3. Return ``sign(x) * log1p(|x|)``.
    4. Replace any remaining NaN/inf in the result by ``0.0``.

    Parameters
    ----------
    descriptor : array-like
        Numeric descriptor values; converted with ``np.asarray``.
    epsilon : float, optional
        Magnitude floor used in step 2. Defaults to ``1e-10``.
    max_value : float, optional
        Magnitude ceiling used in step 1. Defaults to ``1e15``.

    Returns
    -------
    numpy.ndarray
        Transformed values with the same shape as the input.
    """
    values = np.asarray(descriptor)
    # Bound the magnitude before taking the log so huge/inf entries stay finite.
    values = np.clip(values, -max_value, max_value)
    # NaN fails the `<` comparison, so it passes through here and is zeroed below.
    floored = np.where(np.abs(values) < epsilon, epsilon, values)
    compressed = np.sign(floored) * np.log1p(np.abs(floored))
    # No explicit `del`/`gc.collect()` here: locals are released on return and a
    # forced full collection per call only costs time.
    return np.nan_to_num(compressed, nan=0.0, posinf=0.0, neginf=0.0)

# def Normalization(descriptor, logOnly=True):
#     if logOnly:
#         descriptor = np.asarray(descriptor)
#         descriptor = np.log1p(descriptor+0.0001)
#         descriptor = np.nan_to_num(descriptor, nan=0.0, posinf=0.0, neginf=0.0)
#         gc.collect()
#         return descriptor
#     else:
#         epsilon = 1e-10
#         descriptor_adjusted = np.where(np.abs(descriptor) < epsilon, epsilon, descriptor)
#         descriptor_log_transformed = np.sign(descriptor_adjusted) * np.log1p(np.abs(descriptor_adjusted))
#         del epsilon
#         gc.collect()
#         return descriptor_log_transformed
    

# # Original R2 score: 0.7853 (9.152 sec)
# # Ipc R2 score: 0.8647 (8.066 sec)
# # PMI1 R2 score: 0.8014 (8.812 sec)
# # PMI2 R2 score: 0.7945 (8.344 sec)
# # PMI3 R2 score: 0.7944 (8.918 sec)
# # PMI_ALL_sum R2 score: 0.7639 (8.805 sec)
# # PMI_ALL_ind R2 score: 0.7968 (9.276 sec)
# # MORSE R2 score: 0.7654 (9.119 sec)
# # GETAWAY R2 score: 0.7837 (8.395 sec)
# # <class 'dict'> {'Original': 0.7853323221206665, 'Ipc': 0.8647328019142151, 'PMI1': 0.8014324903488159, 'PMI2': 0.7944905161857605, 'PMI3': 0.7943621873855591, 'PMI_ALL_sum': 0.763858437538147, 'PMI_ALL_ind': 0.7968299388885498, 'MORSE': 0.7653760313987732, 'GETAWAY': 0.7837035059928894

# # 1. Min-Max scaling
# def Normalization1(descriptor):
#     min_val = np.min(descriptor)
#     max_val = np.max(descriptor)
#     return (descriptor - min_val) / (max_val - min_val)
#         # Maps every value into the range between 0 and 1.
#         # Preserves the relative differences of the original data.
#         # Can also handle negative values.
#         # Drawback: can be sensitive to outliers.

# # Original R2 score: 0.7897 (9.397 sec)
# # Ipc R2 score: 0.7891 (8.114 sec)
# # PMI1 R2 score: 0.7897 (8.418 sec)
# # PMI2 R2 score: 0.7937 (8.643 sec)
# # PMI3 R2 score: 0.7960 (8.784 sec)
# # PMI_ALL_sum R2 score: 0.8045 (8.100 sec)
# # PMI_ALL_ind R2 score: 0.7887 (8.632 sec)
# # MORSE R2 score: 0.8002 (8.797 sec)
# # GETAWAY R2 score: 0.7991 (8.633 sec)
# # <class 'dict'> {'Original': 0.7897382974624634, 'Ipc': 0.7890591621398926, 'PMI1': 0.7897088527679443, 'PMI2': 0.7937353849411011, 'PMI3': 0.795976459980011, 'PMI_ALL_sum': 0.8044917583465576, 'PMI_ALL_ind': 0.7886590957641602, 'MORSE': 0.8001644611358643, 'GETAWAY': 0.7991337776184082        


# # 2. Robust scaling
# def Normalization2(descriptor):
#     median = np.median(descriptor)
#     q1, q3 = np.percentile(descriptor, [25, 75])
#     iqr = q3 - q1
#     return (descriptor - median) / iqr
#         # Sets the median to 0 and the interquartile range to 1.
#         # Reduces the influence of outliers.
#         # Keeps both negative and positive values.

# # Original R2 score: 0.7555 (8.747 sec)
# # Ipc R2 score: 0.7010 (8.247 sec)
# # PMI1 R2 score: 0.8046 (8.603 sec)
# # PMI2 R2 score: 0.7892 (8.903 sec)
# # PMI3 R2 score: 0.7925 (8.180 sec)
# # PMI_ALL_sum R2 score: 0.8078 (8.460 sec)
# # PMI_ALL_ind R2 score: 0.8207 (8.786 sec)
# # MORSE R2 score: -0.4633 (8.368 sec)
# # GETAWAY R2 score: 0.7914 (8.156 sec)
# # <class 'dict'> {'Original': 0.7555391192436218, 'Ipc': 0.7009564638137817, 'PMI1': 0.804631233215332, 'PMI2': 0.7892019748687744, 'PMI3': 0.7925474643707275, 'PMI_ALL_sum': 0.8078181147575378, 'PMI_ALL_ind': 0.820658266544342, 'MORSE': -0.46331870555877686, 'GETAWAY': 0.7913508415222168


# # 3. Standardization (Z-score normalization):
# def Normalization3(descriptor):
#     mean = np.mean(descriptor)
#     std = np.std(descriptor)
#     return (descriptor - mean) / std
#         # Sets the mean to 0 and the standard deviation to 1.
#         # Keeps both negative and positive values.
#         # Suitable for data that follows a normal distribution.

# Original R2 score: 0.7848 (8.133 sec)
# WARNING:tensorflow:5 out of the last 9 calls to <function TensorFlowTrainer.make_predict_function.<locals>.one_step_on_data_distributed at 0x7f23178bfe20> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings could be due to (1) creating @tf.function repeatedly in a loop, (2) passing tensors with different shapes, (3) passing Python objects instead of tensors. For (1), please define your @tf.function outside of the loop. For (2), @tf.function has reduce_retracing=True option that can avoid unnecessary retracing. For (3), please refer to https://www.tensorflow.org/guide/function#controlling_retracing and https://www.tensorflow.org/api_docs/python/tf/function for  more details.
# WARNING:tensorflow:6 out of the last 12 calls to <function TensorFlowTrainer.make_predict_function.<locals>.one_step_on_data_distributed at 0x7f23178bfe20> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings could be due to (1) creating @tf.function repeatedly in a loop, (2) passing tensors with different shapes, (3) passing Python objects instead of tensors. For (1), please define your @tf.function outside of the loop. For (2), @tf.function has reduce_retracing=True option that can avoid unnecessary retracing. For (3), please refer to https://www.tensorflow.org/guide/function#controlling_retracing and https://www.tensorflow.org/api_docs/python/tf/function for  more details.
# Ipc R2 score: 0.7911 (8.218 sec)
# PMI1 R2 score: 0.7916 (7.400 sec)
# PMI2 R2 score: 0.8127 (8.217 sec)
# PMI3 R2 score: 0.7969 (8.257 sec)
# PMI_ALL_sum R2 score: 0.8016 (8.246 sec)
# PMI_ALL_ind R2 score: 0.8103 (7.652 sec)
# MORSE R2 score: 0.8166 (8.539 sec)
# GETAWAY R2 score: 0.8039 (8.539 sec)
# <class 'dict'> {'Original': 0.784812867641449, 'Ipc': 0.7910993099212646, 'PMI1': 0.7916101813316345, 'PMI2': 0.8127365708351135, 'PMI3': 0.7969383001327515, 'PMI_ALL_sum': 0.8016161918640137, 'PMI_ALL_ind': 0.8103429079055786, 'MORSE': 0.8166193962097168, 'GETAWAY': 0.8039106130599976}

# # 4. Median and absolute deviation (MAD) normalization:
# def Normalization4(descriptor):
#     median = np.median(descriptor)
#     mad = np.median(np.abs(descriptor - median))
#     return (descriptor - median) / mad
#         # Sets the median to 0 and the MAD to 1.
#         # Very robust to outliers.
#         # Keeps both negative and positive values.

# # Original R2 score: 0.7931 (10.758 sec)
# # Ipc R2 score: -1150.3533 (8.209 sec)
# # PMI1 R2 score: 0.7592 (8.801 sec)
# # PMI2 R2 score: 0.7604 (9.580 sec)
# # PMI3 R2 score: 0.7664 (8.281 sec)
# # PMI_ALL_sum R2 score: 0.7016 (9.358 sec)
# # PMI_ALL_ind R2 score: 0.7478 (9.451 sec)
# # MORSE R2 score: -1.5698 (8.654 sec)
# # GETAWAY R2 score: 0.7888 (8.044 sec)
# # <class 'dict'> {'Original': 0.7931280732154846, 'Ipc': -1150.353271484375, 'PMI1': 0.7591773271560669, 'PMI2': 0.7603589296340942, 'PMI3': 0.7663605213165283, 'PMI_ALL_sum': 0.7015960812568665, 'PMI_ALL_ind': 0.7478456497192383, 'MORSE': -1.56978440284729, 'GETAWAY': 0.7888303995132446

# # 5. Max-absolute-value scaling
# def Normalization5(descriptor):
#     max_abs = np.max(np.abs(descriptor))
#     return descriptor / max_abs
#         # Maps every value into the range between -1 and 1.
#         # Useful for data centered on 0.
#         # Preserves the sign of the original data.
        
# # Original R2 score: 0.7959 (9.297 sec)
# # Ipc R2 score: 0.7932 (8.758 sec)
# # PMI1 R2 score: 0.7897 (8.806 sec)
# # PMI2 R2 score: 0.8081 (8.967 sec)
# # PMI3 R2 score: 0.8110 (9.134 sec)
# # PMI_ALL_sum R2 score: 0.7716 (8.160 sec)
# # PMI_ALL_ind R2 score: 0.7780 (9.306 sec)
# # MORSE R2 score: 0.7969 (8.748 sec)
# # GETAWAY R2 score: 0.8007 (8.906 sec)
# # <class 'dict'> {'Original': 0.7958791255950928, 'Ipc': 0.7931562066078186, 'PMI1': 0.7896729111671448, 'PMI2': 0.8081350922584534, 'PMI3': 0.8109619617462158, 'PMI_ALL_sum': 0.7715592384338379, 'PMI_ALL_ind': 0.7780115008354187, 'MORSE': 0.7968926429748535, 'GETAWAY': 0.8007307052612305}

# # 6. Outlier
# def Normalization6(descriptor, clip_threshold=3.0):
#     # 1. Remove outliers
#     mean = np.mean(descriptor, axis=0)
#     std = np.std(descriptor, axis=0)
#     z_scores = np.abs((descriptor - mean) / std)
#     descriptor_clipped = np.clip(descriptor, 
#                                  mean - clip_threshold * std, 
#                                  mean + clip_threshold * std)
#     median = np.median(descriptor_clipped, axis=0)
#     q1 = np.percentile(descriptor_clipped, 25, axis=0)
#     q3 = np.percentile(descriptor_clipped, 75, axis=0)
#     iqr = q3 - q1
#     iqr = np.where(iqr == 0, 1e-6, iqr)    
#     scaled_descriptor = (descriptor_clipped - median) / iqr
#     final_descriptor = np.clip(scaled_descriptor, -5, 5)
    
#     return final_descriptor

# # Original R2 score: 0.7818 (9.309 sec)
# # Ipc R2 score: 0.8058 (8.277 sec)
# # PMI1 R2 score: 0.8024 (9.047 sec)
# # PMI2 R2 score: 0.7887 (8.503 sec)
# # PMI3 R2 score: 0.7969 (9.133 sec)
# # PMI_ALL_sum R2 score: 0.8020 (9.083 sec)
# # PMI_ALL_ind R2 score: 0.8004 (9.157 sec)
# # MORSE R2 score: 0.6951 (8.468 sec)
# # GETAWAY R2 score: 0.6884 (8.690 sec)
# # <class 'dict'> {'Original': 0.7818286418914795, 'Ipc': 0.8058270812034607, 'PMI1': 0.8023815155029297, 'PMI2': 0.7886528372764587, 'PMI3': 0.7969330549240112, 'PMI_ALL_sum': 0.8020067811012268, 'PMI_ALL_ind': 0.8003607392311096, 'MORSE': 0.6950668096542358, 'GETAWAY': 0.6883505582809448}

In [24]:
def values_chi(mol, chi_type):
    """Collect the leading non-zero ChiN values of a molecule.

    Orders 0, 1, 2, ... are evaluated with ``Chem.GraphDescriptors.ChiNn_``
    when ``chi_type == 'n'`` and with ``Chem.GraphDescriptors.ChiNv_`` for any
    other ``chi_type``. Evaluation stops at the first order whose value equals
    ``0.0``; that zero is not included in the result.

    Parameters
    ----------
    mol : object
        Molecule passed through unchanged to the RDKit chi function.
    chi_type : str
        ``'n'`` selects ``ChiNn_``; anything else selects ``ChiNv_``.

    Returns
    -------
    numpy.ndarray
        1-D array of the values for orders ``0 .. k-1`` (empty if order 0 is 0).
    """
    chi_func = Chem.GraphDescriptors.ChiNn_ if chi_type == 'n' else Chem.GraphDescriptors.ChiNv_
    collected = []
    order = 0
    # Single pass: each order is evaluated once and kept, instead of probing
    # for the cut-off first and then recomputing every value.
    while True:
        value = chi_func(mol, order)
        if value == 0.0:
            break
        collected.append(value)
        order += 1
    return np.array(collected)

def generate_chi(mols, chi_type, n_jobs=None):
    """Compute ``values_chi`` for every molecule in parallel and zero-pad the rows.

    Parameters
    ----------
    mols : iterable
        Molecules to process; materialised into a list so any iterable works.
    chi_type : str
        Forwarded to ``values_chi`` for each molecule.
    n_jobs : int or None, optional
        ``max_workers`` for the ``ProcessPoolExecutor``.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per molecule, in input order. Rows shorter than
        the longest one are right-padded with zeros. An empty input yields an
        array of shape ``(0, 0)``.
    """
    mols = list(mols)
    if not mols:
        # Without this guard, max() below raises ValueError on an empty sequence.
        return np.empty((0, 0), dtype=float)

    with ProcessPoolExecutor(max_workers=n_jobs) as executor:
        futures = [executor.submit(values_chi, mol, chi_type) for mol in mols]
        # Futures are read in submission order, so rows stay aligned with `mols`.
        per_mol = [future.result() for future in futures]

    width = max(len(row) for row in per_mol)
    return np.array([np.pad(row, (0, width - len(row)), 'constant') for row in per_mol])

def sanitize_and_compute_descriptor(mol):
    """Return an 8-element descriptor for ``mol``, never raising.

    The molecule is passed through ``Chem.RemoveHs`` and ``Chem.SanitizeMol``
    and then ``Chem.rdMolDescriptors.BCUT2D`` is returned. If BCUT2D raises, a
    message is printed and ``[MolWt] * 8`` is returned instead. Any other
    failure (including a failure of the MolWt fallback itself) yields
    ``[0] * 8`` without any message.
    """
    try:
        stripped = Chem.RemoveHs(mol)
        Chem.SanitizeMol(stripped)
        try:
            values = Chem.rdMolDescriptors.BCUT2D(stripped)
        except Exception as bcut_error:
            print(f"BCUT2D calculation failed: {bcut_error}")
            # Fallback to a simpler descriptor: the molecular weight repeated 8 times.
            weight = Descriptors.MolWt(stripped)
            values = [weight] * 8
        return values
    except Exception:
        # Deliberately silent best-effort path: the caller gets an all-zero row.
        return [0] * 8

def compute_descriptors_parallel(mols, n_jobs=None):
    """Run ``sanitize_and_compute_descriptor`` over ``mols`` in a process pool.

    ``n_jobs`` is used as the pool's ``max_workers``. Entries that are ``None``
    are skipped, so the returned array can have fewer rows than ``mols`` has
    items. Results are gathered in submission order and wrapped in ``np.array``.
    """
    rows = []
    with ProcessPoolExecutor(max_workers=n_jobs) as pool:
        pending = []
        for candidate in mols:
            if candidate is None:
                continue
            pending.append(pool.submit(sanitize_and_compute_descriptor, candidate))
        for task in pending:
            rows.append(task.result())
    return np.array(rows)

In [25]:
def process_molecules_parallel(mols, max_workers=4, chunk_size=100):
    """Apply ``mol3d`` to every molecule, chunk by chunk, in a process pool.

    Parameters
    ----------
    mols : sequence
        Molecules to process; must support ``len()`` and slicing.
    max_workers : int, optional
        ``max_workers`` for each ``ProcessPoolExecutor``.
    chunk_size : int, optional
        Number of molecules submitted per pool.

    Returns
    -------
    list
        The non-``None`` results of ``mol3d``, in the same relative order as
        the input molecules. ``None`` results are dropped, so the list may be
        shorter than ``mols``.
    """
    results = []
    for start in range(0, len(mols), chunk_size):
        chunk = mols[start:start + chunk_size]
        # A fresh pool is created for every chunk and gc.collect() runs after
        # it, presumably to bound worker memory — TODO confirm before merging
        # into a single pool.
        with ProcessPoolExecutor(max_workers=max_workers) as executor:
            futures = [executor.submit(mol3d, mol) for mol in chunk]
            # Read futures in submission order (not completion order) so the
            # output order is deterministic and follows the input order.
            for future in futures:
                result = future.result()
                if result is not None:
                    results.append(result)
        gc.collect()
    return results

In [26]:
def cleanup_resources(local_vars):
    """Strip a variable mapping down to its result entries and free memory.

    Every key of ``local_vars`` except ``'r2_list'`` and ``'esti_time'`` is
    deleted from the mapping in place. If TensorFlow reports at least one
    logical GPU, the Keras session is cleared under the first GPU's device
    scope. A garbage collection is always run at the end.

    NOTE(review): if callers pass ``locals()`` from inside a function, removing
    keys from that dict may not unbind the caller's actual variables — confirm
    the intended usage.
    """
    preserved = ('r2_list', 'esti_time')  # Keep the return value
    doomed = [name for name, _ in local_vars.items() if name not in preserved]
    for name in doomed:
        del local_vars[name]

    gpu_devices = tf.config.list_logical_devices('GPU')
    if gpu_devices:
        first_gpu = gpu_devices[0].name
        with tf.device(first_gpu):
            tf.keras.backend.clear_session()
            gc.collect()
    gc.collect()

In [27]:
def descriptors_list(fps, mols, y_true, target_name=None, target_path="", cpu_calc=False, read_logs=False):
    r2_list={}
    esti_time = {}
    #########################################
    try:
        evaluate_descriptor(fps, None, 'Original', y_true, r2_list, esti_time, cpu_calc)
        gc.collect()
        descriptor = [Chem.Descriptors.ExactMolWt(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor, 'ExactMolWt', y_true, r2_list, esti_time, cpu_calc)
        del descriptor
        gc.collect()
        descriptor = [Chem.Descriptors.MolWt(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor, 'MolWt', y_true, r2_list, esti_time, cpu_calc)
        del descriptor
        gc.collect()
        descriptor = [Chem.Crippen.MolLogP(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor, 'MolLogP', y_true, r2_list, esti_time, cpu_calc)
        del descriptor
        gc.collect()
        descriptor = [Chem.Crippen.MolMR(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor, 'MolMR', y_true, r2_list, esti_time, cpu_calc) 
        del descriptor
        gc.collect()
        descriptor = [Chem.Descriptors.TPSA(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor, 'TPSA', y_true, r2_list, esti_time, cpu_calc)
        del descriptor
        gc.collect()
        descriptor = [Chem.Lipinski.NumRotatableBonds(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor, 'NumRotatableBonds', y_true, r2_list, esti_time, cpu_calc)
        del descriptor
        gc.collect()
        descriptor = [Chem.Lipinski.HeavyAtomCount(alpha)     for alpha in mols]
        evaluate_descriptor(fps, descriptor, 'HeavyAtomCount', y_true, r2_list, esti_time, cpu_calc)
        del descriptor
        gc.collect()
        descriptor = [Chem.Lipinski.NumHAcceptors(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor, 'NumHAcceptors', y_true, r2_list, esti_time, cpu_calc)
        del descriptor
        gc.collect()
        descriptor = [Chem.Lipinski.NumHDonors(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor, 'NumHDonors', y_true, r2_list, esti_time, cpu_calc)
        del descriptor
        gc.collect()
        descriptor = [Chem.Lipinski.NumHeteroatoms(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor, 'NumHeteroatoms', y_true, r2_list, esti_time, cpu_calc)
        del descriptor
        gc.collect()
        descriptor = [Chem.Lipinski.NHOHCount(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor, 'NHOHCount', y_true, r2_list, esti_time, cpu_calc)
        del descriptor
        gc.collect()
        descriptor = [Chem.Lipinski.NOCount(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor, 'NOCount', y_true, r2_list, esti_time, cpu_calc)
        del descriptor
        gc.collect()
        descriptor = [Chem.Lipinski.RingCount(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor, 'RingCount', y_true, r2_list, esti_time, cpu_calc)
        del descriptor
        gc.collect()
        descriptor = [Chem.Lipinski.NumAromaticRings(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor, 'NumAromaticRings', y_true, r2_list, esti_time, cpu_calc)
        del descriptor
        gc.collect()
        descriptor = [Chem.Lipinski.NumSaturatedRings(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor, 'NumSaturatedRings', y_true, r2_list, esti_time, cpu_calc)
        del descriptor
        gc.collect()
        descriptor = [Chem.Lipinski.NumAliphaticRings(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor, 'NumAliphaticRings', y_true, r2_list, esti_time, cpu_calc)
        del descriptor
        gc.collect()
        descriptor = [Chem.rdMolDescriptors.CalcLabuteASA(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor, 'CalcLabuteASA', y_true, r2_list, esti_time, cpu_calc)
        del descriptor
        gc.collect()
        descriptor = [Chem.Descriptors.NumValenceElectrons(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor, 'NumValenceElectrons', y_true, r2_list, esti_time, cpu_calc)
        del descriptor
        gc.collect()
        descriptor = [Chem.GraphDescriptors.BalabanJ(alpha) for alpha in mols]
        # descriptor = Normalization(descriptor)
        evaluate_descriptor(fps, descriptor, 'BalabanJ', y_true, r2_list, esti_time, cpu_calc)
        del descriptor
        gc.collect()
        descriptor = [Chem.GraphDescriptors.BertzCT(alpha) for alpha in mols]
        # descriptor = Normalization(descriptor)
        evaluate_descriptor(fps, descriptor, 'BertzCT', y_true, r2_list, esti_time, cpu_calc)
        del descriptor
        gc.collect()
        descriptor = [Chem.GraphDescriptors.Ipc(alpha) for alpha in mols]
        descriptor = Normalization(descriptor)
        evaluate_descriptor(fps, descriptor, 'Ipc', y_true, r2_list, esti_time, cpu_calc)
        del descriptor
        gc.collect()
        descriptor1 = [Chem.GraphDescriptors.Kappa1(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor1, 'Kappa1', y_true, r2_list, esti_time, cpu_calc)
        descriptor2 = [Chem.GraphDescriptors.Kappa2(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor2, 'Kappa2', y_true, r2_list, esti_time, cpu_calc)
        descriptor3 = [Chem.GraphDescriptors.Kappa3(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor3, 'Kappa3', y_true, r2_list, esti_time, cpu_calc)
        d1 = np.asarray(descriptor1).astype('float')
        d2 = np.asarray(descriptor2).astype('float')
        d3 = np.asarray(descriptor3).astype('float')
        del descriptor1,descriptor2,descriptor3
        gc.collect()
        evaluate_descriptor(fps, d1+d2+d3, 'Kappa_all_sum', y_true, r2_list, esti_time, cpu_calc)
        dataset = [d1, d2, d3]
        evaluate_descriptor(fps, dataset, 'Kappa_all_ind', y_true, r2_list, esti_time, cpu_calc)
        del dataset,d1, d2, d3
        #########################################
        if read_logs:
            print("# 1 Finished")
        #########################################
        descriptor1 = [Chem.GraphDescriptors.Chi0(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor1, 'Chi0', y_true, r2_list, esti_time, cpu_calc)
        descriptor2 = [Chem.GraphDescriptors.Chi0n(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor2, 'Chi0n', y_true, r2_list, esti_time, cpu_calc)
        descriptor3 = [Chem.GraphDescriptors.Chi0v(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor3, 'Chi0v', y_true, r2_list, esti_time, cpu_calc)
        descriptor4 = [Chem.GraphDescriptors.Chi1(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor4, 'Chi1', y_true, r2_list, esti_time, cpu_calc)
        descriptor5 = [Chem.GraphDescriptors.Chi1n(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor5, 'Chi1n', y_true, r2_list, esti_time, cpu_calc)
        descriptor6 = [Chem.GraphDescriptors.Chi1v(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor6, 'Chi1v', y_true, r2_list, esti_time, cpu_calc)
        descriptor7 = [Chem.GraphDescriptors.Chi2n(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor7, 'Chi2n', y_true, r2_list, esti_time, cpu_calc)
        descriptor8 = [Chem.GraphDescriptors.Chi2v(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor8, 'Chi2v', y_true, r2_list, esti_time, cpu_calc)
        descriptor9 = [Chem.GraphDescriptors.Chi3n(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor9, 'Chi3n', y_true, r2_list, esti_time, cpu_calc)
        descriptor10 = [Chem.GraphDescriptors.Chi3v(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor10, 'Chi3v', y_true, r2_list, esti_time, cpu_calc)
        descriptor11 = [Chem.GraphDescriptors.Chi4n(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor11, 'Chi4n', y_true, r2_list, esti_time, cpu_calc)
        descriptor12 = [Chem.GraphDescriptors.Chi4v(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor12, 'Chi4v', y_true, r2_list, esti_time, cpu_calc)
        #########################################
        descriptor13 = generate_chi(mols, 'n')
        evaluate_descriptor(fps, descriptor13, 'ChiN_', y_true, r2_list, esti_time, cpu_calc)
        #########################################
        descriptor14 = generate_chi(mols, 'v')
        evaluate_descriptor(fps, descriptor14, 'ChiNv_', y_true, r2_list, esti_time, cpu_calc)
        #########################################
        d1  = np.asarray(descriptor1).astype('float')
        d2  = np.asarray(descriptor2).astype('float')
        d3  = np.asarray(descriptor3).astype('float')
        d4  = np.asarray(descriptor4).astype('float')
        d5  = np.asarray(descriptor5).astype('float')
        d6  = np.asarray(descriptor6).astype('float')
        d7  = np.asarray(descriptor7).astype('float')
        d8  = np.asarray(descriptor8).astype('float')
        d9  = np.asarray(descriptor9).astype('float')
        d10 = np.asarray(descriptor10).astype('float')
        d11 = np.asarray(descriptor11).astype('float')
        d12 = np.asarray(descriptor12).astype('float')
        d13 = np.asarray(descriptor13).astype('float')
        d14 = np.asarray(descriptor14).astype('float')
        td13 = d13.mean(axis=1)
        td14 = d14.mean(axis=1)
        evaluate_descriptor(fps, d1+d2+d3+d4+d5+d6+d7+d8+d9+d10+d11+d12+td13+td14, 'Chi_ALL_sum', y_true, r2_list, esti_time, cpu_calc)
        del descriptor1,descriptor2,descriptor3,descriptor4,descriptor5,descriptor6,descriptor7,descriptor8,descriptor9,descriptor10,descriptor11,descriptor12, descriptor13, descriptor14
        gc.collect()
        dataset = [d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,d11,d12,d13,d14]
        evaluate_descriptor(fps, dataset, 'Chi_ALL_ind', y_true, r2_list, esti_time, cpu_calc)
        del dataset,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,d11,d12,d13,d14
        ########################################
        if read_logs:
            print("# 2 - Chi_series Finished")
        #########################################
        descriptor = [Chem.rdMolDescriptors.CalcPhi(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor, 'CalcPhi', y_true, r2_list, esti_time, cpu_calc)
        del descriptor
        gc.collect()
        descriptor = [Chem.GraphDescriptors.HallKierAlpha(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor, 'HallKierAlpha', y_true, r2_list, esti_time, cpu_calc)
        del descriptor
        gc.collect()
        descriptor = [Chem.rdMolDescriptors.CalcNumAmideBonds(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor, 'CalcNumAmideBonds', y_true, r2_list, esti_time, cpu_calc)
        del descriptor
        gc.collect()
        descriptor = [Chem.Lipinski.FractionCSP3(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor, 'FractionCSP3', y_true, r2_list, esti_time, cpu_calc)
        del descriptor
        gc.collect()
        descriptor = [Chem.rdMolDescriptors.CalcNumSpiroAtoms(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor, 'CalcNumSpiroAtoms', y_true, r2_list, esti_time, cpu_calc)
        del descriptor
        gc.collect()
        descriptor = [Chem.rdMolDescriptors.CalcNumBridgeheadAtoms(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor, 'CalcNumBridgeheadAtoms', y_true, r2_list, esti_time, cpu_calc)
        del descriptor
        gc.collect()
        #########################################
        if read_logs:
            print("# 3 Finished")
        #########################################
        descriptor1 = [Chem.MolSurf.PEOE_VSA1(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor1, 'PEOE_VSA1', y_true, r2_list, esti_time, cpu_calc)
        descriptor2 = [Chem.MolSurf.PEOE_VSA2(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor2, 'PEOE_VSA2', y_true, r2_list, esti_time, cpu_calc)
        descriptor3 = [Chem.MolSurf.PEOE_VSA3(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor3, 'PEOE_VSA3', y_true, r2_list, esti_time, cpu_calc)
        descriptor4 = [Chem.MolSurf.PEOE_VSA4(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor4, 'PEOE_VSA4', y_true, r2_list, esti_time, cpu_calc)
        descriptor5 = [Chem.MolSurf.PEOE_VSA5(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor5, 'PEOE_VSA5', y_true, r2_list, esti_time, cpu_calc)
        descriptor6 = [Chem.MolSurf.PEOE_VSA6(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor6, 'PEOE_VSA6', y_true, r2_list, esti_time, cpu_calc)
        descriptor7 = [Chem.MolSurf.PEOE_VSA7(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor7, 'PEOE_VSA7', y_true, r2_list, esti_time, cpu_calc)
        descriptor8 = [Chem.MolSurf.PEOE_VSA8(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor8, 'PEOE_VSA8', y_true, r2_list, esti_time, cpu_calc)
        descriptor9 = [Chem.MolSurf.PEOE_VSA9(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor9, 'PEOE_VSA9', y_true, r2_list, esti_time, cpu_calc)
        descriptor10 = [Chem.MolSurf.PEOE_VSA10(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor10, 'PEOE_VSA10', y_true, r2_list, esti_time, cpu_calc)
        descriptor11 = [Chem.MolSurf.PEOE_VSA11(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor11, 'PEOE_VSA11', y_true, r2_list, esti_time, cpu_calc)
        descriptor12 = [Chem.MolSurf.PEOE_VSA12(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor12, 'PEOE_VSA12', y_true, r2_list, esti_time, cpu_calc)
        descriptor13 = [Chem.MolSurf.PEOE_VSA13(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor13, 'PEOE_VSA13', y_true, r2_list, esti_time, cpu_calc)
        descriptor14 = [Chem.MolSurf.PEOE_VSA14(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor14, 'PEOE_VSA14', y_true, r2_list, esti_time, cpu_calc)
        d1  = np.asarray(descriptor1 ).astype('float')
        d2  = np.asarray(descriptor2 ).astype('float')
        d3  = np.asarray(descriptor3 ).astype('float')
        d4  = np.asarray(descriptor4 ).astype('float')
        d5  = np.asarray(descriptor5 ).astype('float')
        d6  = np.asarray(descriptor6 ).astype('float')
        d7  = np.asarray(descriptor7 ).astype('float')
        d8  = np.asarray(descriptor8 ).astype('float')
        d9  = np.asarray(descriptor9 ).astype('float')
        d10 = np.asarray(descriptor10).astype('float')
        d11 = np.asarray(descriptor11).astype('float')
        d12 = np.asarray(descriptor12).astype('float')
        d13 = np.asarray(descriptor13).astype('float')
        d14 = np.asarray(descriptor14).astype('float')
        evaluate_descriptor(fps, d1+d2+d3+d4+d5+d6+d7+d8+d9+d10+d11+d12+d13+d14, 'PEOE_VSA_ALL_sum', y_true, r2_list, esti_time, cpu_calc)
        del descriptor1,descriptor2,descriptor3,descriptor4,descriptor5,descriptor6,descriptor7,descriptor8,descriptor9,descriptor10,descriptor11,descriptor12,descriptor13,descriptor14
        gc.collect() 
        dataset = [d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,d11,d12,d13,d14]
        evaluate_descriptor(fps, dataset, 'PEOE_VSA_ALL_ind', y_true, r2_list, esti_time, cpu_calc)
        del dataset,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,d11,d12,d13,d14
        #########################################
        if read_logs:
            print("# 4 - PEOE_VSA series Finished")
        #########################################
        descriptor1 = [Chem.MolSurf.SMR_VSA1(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor1, 'SMR_VSA1', y_true, r2_list, esti_time, cpu_calc)
        descriptor2 = [Chem.MolSurf.SMR_VSA2(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor2, 'SMR_VSA2', y_true, r2_list, esti_time, cpu_calc)
        descriptor3 = [Chem.MolSurf.SMR_VSA3(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor3, 'SMR_VSA3', y_true, r2_list, esti_time, cpu_calc)
        descriptor4 = [Chem.MolSurf.SMR_VSA4(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor4, 'SMR_VSA4', y_true, r2_list, esti_time, cpu_calc)
        descriptor5 = [Chem.MolSurf.SMR_VSA5(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor5, 'SMR_VSA5', y_true, r2_list, esti_time, cpu_calc)
        descriptor6 = [Chem.MolSurf.SMR_VSA6(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor6, 'SMR_VSA6', y_true, r2_list, esti_time, cpu_calc)
        descriptor7 = [Chem.MolSurf.SMR_VSA7(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor7, 'SMR_VSA7', y_true, r2_list, esti_time, cpu_calc)
        descriptor8 = [Chem.MolSurf.SMR_VSA8(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor8, 'SMR_VSA8', y_true, r2_list, esti_time, cpu_calc)
        descriptor9 = [Chem.MolSurf.SMR_VSA9(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor9, 'SMR_VSA9', y_true, r2_list, esti_time, cpu_calc)
        descriptor10 = [Chem.MolSurf.SMR_VSA10(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor10, 'SMR_VSA10', y_true, r2_list, esti_time, cpu_calc)
        d1  = np.asarray(descriptor1 ).astype('float')
        d2  = np.asarray(descriptor2 ).astype('float')
        d3  = np.asarray(descriptor3 ).astype('float')
        d4  = np.asarray(descriptor4 ).astype('float')
        d5  = np.asarray(descriptor5 ).astype('float')
        d6  = np.asarray(descriptor6 ).astype('float')
        d7  = np.asarray(descriptor7 ).astype('float')
        d8  = np.asarray(descriptor8 ).astype('float')
        d9  = np.asarray(descriptor9 ).astype('float')
        d10 = np.asarray(descriptor10).astype('float')
        evaluate_descriptor(fps, d1+d2+d3+d4+d5+d6+d7+d8+d9+d10, 'SMR_VSA_ALL_SUM', y_true, r2_list, esti_time, cpu_calc)
        del descriptor1,descriptor2,descriptor3,descriptor4,descriptor5,descriptor6,descriptor7,descriptor8,descriptor9,descriptor10
        gc.collect()
        dataset = [d1,d2,d3,d4,d5,d6,d7,d8,d9,d10]
        evaluate_descriptor(fps, dataset, 'SMR_VSA_ALL_IND', y_true, r2_list, esti_time, cpu_calc)
        del dataset,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10
        #########################################
        if read_logs:
            print("# 5 - SMR_VSA series Finished")
        #########################################
        descriptor1 = [Chem.MolSurf.SlogP_VSA1(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor1, 'SlogP_VSA1', y_true, r2_list, esti_time, cpu_calc)
        descriptor2 = [Chem.MolSurf.SlogP_VSA2(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor2, 'SlogP_VSA2', y_true, r2_list, esti_time, cpu_calc)
        descriptor3 = [Chem.MolSurf.SlogP_VSA3(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor3, 'SlogP_VSA3', y_true, r2_list, esti_time, cpu_calc)
        descriptor4 = [Chem.MolSurf.SlogP_VSA4(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor4, 'SlogP_VSA4', y_true, r2_list, esti_time, cpu_calc)
        descriptor5 = [Chem.MolSurf.SlogP_VSA5(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor5, 'SlogP_VSA5', y_true, r2_list, esti_time, cpu_calc)
        descriptor6 = [Chem.MolSurf.SlogP_VSA6(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor6, 'SlogP_VSA6', y_true, r2_list, esti_time, cpu_calc)
        descriptor7 = [Chem.MolSurf.SlogP_VSA7(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor7, 'SlogP_VSA7', y_true, r2_list, esti_time, cpu_calc)
        descriptor8 = [Chem.MolSurf.SlogP_VSA8(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor8, 'SlogP_VSA8', y_true, r2_list, esti_time, cpu_calc)
        descriptor9 = [Chem.MolSurf.SlogP_VSA9(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor9, 'SlogP_VSA9', y_true, r2_list, esti_time, cpu_calc)
        descriptor10 = [Chem.MolSurf.SlogP_VSA10(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor10, 'SlogP_VSA10', y_true, r2_list, esti_time, cpu_calc)
        descriptor11 = [Chem.MolSurf.SlogP_VSA11(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor11, 'SlogP_VSA11', y_true, r2_list, esti_time, cpu_calc)
        descriptor12 = [Chem.MolSurf.SlogP_VSA12(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor12, 'SlogP_VSA12', y_true, r2_list, esti_time, cpu_calc)
        d1  = np.asarray(descriptor1 ).astype('float')
        d2  = np.asarray(descriptor2 ).astype('float')
        d3  = np.asarray(descriptor3 ).astype('float')
        d4  = np.asarray(descriptor4 ).astype('float')
        d5  = np.asarray(descriptor5 ).astype('float')
        d6  = np.asarray(descriptor6 ).astype('float')
        d7  = np.asarray(descriptor7 ).astype('float')
        d8  = np.asarray(descriptor8 ).astype('float')
        d9  = np.asarray(descriptor9 ).astype('float')
        d10 = np.asarray(descriptor10).astype('float')
        d11 = np.asarray(descriptor11).astype('float')
        d12 = np.asarray(descriptor12).astype('float')
        evaluate_descriptor(fps, d1+d2+d3+d4+d5+d6+d7+d8+d9+d10+d11+d12, 'SlogP_VSA_ALL_sum', y_true, r2_list, esti_time, cpu_calc)
        del descriptor1,descriptor2,descriptor3,descriptor4,descriptor5,descriptor6,descriptor7,descriptor8,descriptor9,descriptor10,descriptor11,descriptor12
        gc.collect()
        dataset = [d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,d11,d12]
        evaluate_descriptor(fps, dataset, 'SlogP_VSA_ALL_ind', y_true, r2_list, esti_time, cpu_calc)
        del dataset,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,d11,d12
        #########################################
        if read_logs:
            print("# 6 - SlogP_VSA series Finished")
        #########################################
        descriptor1 = [Chem.EState.EState_VSA.VSA_EState1(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor1, 'EState_VSA_EState1', y_true, r2_list, esti_time, cpu_calc)
        descriptor2 = [Chem.EState.EState_VSA.VSA_EState2(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor2, 'EState_VSA_EState2', y_true, r2_list, esti_time, cpu_calc)
        descriptor3 = [Chem.EState.EState_VSA.VSA_EState3(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor3, 'EState_VSA_EState3', y_true, r2_list, esti_time, cpu_calc)
        descriptor4 = [Chem.EState.EState_VSA.VSA_EState4(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor4, 'EState_VSA_EState4', y_true, r2_list, esti_time, cpu_calc)
        descriptor5 = [Chem.EState.EState_VSA.VSA_EState5(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor5, 'EState_VSA_EState5', y_true, r2_list, esti_time, cpu_calc)
        descriptor6 = [Chem.EState.EState_VSA.VSA_EState6(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor6, 'EState_VSA_EState6', y_true, r2_list, esti_time, cpu_calc)
        descriptor7 = [Chem.EState.EState_VSA.VSA_EState7(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor7, 'EState_VSA_EState7', y_true, r2_list, esti_time, cpu_calc)
        descriptor8 = [Chem.EState.EState_VSA.VSA_EState8(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor8, 'EState_VSA_EState8', y_true, r2_list, esti_time, cpu_calc)
        descriptor9 = [Chem.EState.EState_VSA.VSA_EState9(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor9, 'EState_VSA_EState9', y_true, r2_list, esti_time, cpu_calc)
        descriptor10 = [Chem.EState.EState_VSA.VSA_EState10(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor10, 'EState_VSA_EState10', y_true, r2_list, esti_time, cpu_calc)
        d1  = np.asarray(descriptor1 ).astype('float')
        d2  = np.asarray(descriptor2 ).astype('float')
        d3  = np.asarray(descriptor3 ).astype('float')
        d4  = np.asarray(descriptor4 ).astype('float')
        d5  = np.asarray(descriptor5 ).astype('float')
        d6  = np.asarray(descriptor6 ).astype('float')
        d7  = np.asarray(descriptor7 ).astype('float')
        d8  = np.asarray(descriptor8 ).astype('float')
        d9  = np.asarray(descriptor9 ).astype('float')
        d10 = np.asarray(descriptor10).astype('float')
        del descriptor1,descriptor2,descriptor3,descriptor4,descriptor5,descriptor6,descriptor7,descriptor8,descriptor9,descriptor10
        gc.collect()
        evaluate_descriptor(fps, d1+d2+d3+d4+d5+d6+d7+d8+d9+d10, 'VSA_EState_ALL_sum', y_true, r2_list, esti_time, cpu_calc)
        dataset = [d1,d2,d3,d4,d5,d6,d7,d8,d9,d10]
        evaluate_descriptor(fps, dataset, 'VSA_EState_ALL_ind', y_true, r2_list, esti_time, cpu_calc)
        del dataset,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10
        #########################################
        if read_logs:
            print("# 7 - VSA_EState series Finished")
            print("# 2D descriptors finished")
        ######################################### 3D Descriptors
        start = time.time()
        # mols2=[mol3d(mols) for mols in mols]
        mols2 = process_molecules_parallel(mols, max_workers=8)
        print(f"Converted to mols2: {time.time()-start:.2f} sec")
        gc.collect()
        #########################################
        descriptor = [Chem.rdMolDescriptors.CalcAsphericity(alpha) for alpha in mols2]
        evaluate_descriptor(fps, descriptor, 'Asphericity', y_true, r2_list, esti_time, cpu_calc)
        del descriptor
        gc.collect()
        descriptor = [Chem.rdMolDescriptors.CalcPBF(alpha) for alpha in mols2]
        evaluate_descriptor(fps, descriptor, 'PBF', y_true, r2_list, esti_time, cpu_calc)
        del descriptor
        gc.collect()
        descriptor1 = [Chem.rdMolDescriptors.CalcPMI1(alpha) for alpha in mols2]
        descriptor1 = Normalization(descriptor1)
        evaluate_descriptor(fps, descriptor1, 'PMI1', y_true, r2_list, esti_time, cpu_calc)
        descriptor2 = [Chem.rdMolDescriptors.CalcPMI2(alpha) for alpha in mols2]    
        descriptor2 = Normalization(descriptor2)
        evaluate_descriptor(fps, descriptor2, 'PMI2', y_true, r2_list, esti_time, cpu_calc)
        descriptor3 = [Chem.rdMolDescriptors.CalcPMI3(alpha) for alpha in mols2]
        descriptor3 = Normalization(descriptor3)
        evaluate_descriptor(fps, descriptor3, 'PMI3', y_true, r2_list, esti_time, cpu_calc)
        d1  = np.asarray(descriptor1).astype('float')
        d2  = np.asarray(descriptor2).astype('float')
        d3  = np.asarray(descriptor3).astype('float')
        del descriptor1,descriptor2,descriptor3
        gc.collect()
        evaluate_descriptor(fps, d1+d2+d3, 'PMI_ALL_sum', y_true, r2_list, esti_time, cpu_calc)
        dataset = [d1,d2,d3]
        evaluate_descriptor(fps, dataset, 'PMI_ALL_ind', y_true, r2_list, esti_time, cpu_calc)
        del dataset,d1,d2,d3
        # #########################################
        descriptor1 = [Chem.rdMolDescriptors.CalcNPR1(alpha) for alpha in mols2]
        evaluate_descriptor(fps, descriptor1, 'NPR1', y_true, r2_list, esti_time, cpu_calc)
        descriptor2 = [Chem.rdMolDescriptors.CalcNPR2(alpha) for alpha in mols2]
        evaluate_descriptor(fps, descriptor2, 'NPR2', y_true, r2_list, esti_time, cpu_calc)
        d1  = np.asarray(descriptor1).astype('float')
        d2  = np.asarray(descriptor2).astype('float')
        del descriptor1, descriptor2
        gc.collect()
        evaluate_descriptor(fps, d1+d2, 'NPR_ALL_sum', y_true, r2_list, esti_time, cpu_calc)
        dataset = [d1,d2]
        evaluate_descriptor(fps, dataset, 'NPR_ALL_ind', y_true, r2_list, esti_time, cpu_calc)
        del dataset, d1,d2
        #########################################
        descriptor = [Chem.rdMolDescriptors.CalcRadiusOfGyration(alpha) for alpha in mols2]
        evaluate_descriptor(fps, descriptor, 'RadiusOfGyration', y_true, r2_list, esti_time, cpu_calc)
        del descriptor
        gc.collect()
        descriptor = [Chem.rdMolDescriptors.CalcInertialShapeFactor(alpha) for alpha in mols2]
        evaluate_descriptor(fps, descriptor, 'InertialShapeFactor', y_true, r2_list, esti_time, cpu_calc)
        del descriptor
        gc.collect()
        descriptor = [Chem.rdMolDescriptors.CalcEccentricity(alpha) for alpha in mols2]
        evaluate_descriptor(fps, descriptor, 'Eccentricity', y_true, r2_list, esti_time, cpu_calc)
        del descriptor
        gc.collect()
        descriptor = [Chem.rdMolDescriptors.CalcSpherocityIndex(alpha) for alpha in mols2]
        evaluate_descriptor(fps, descriptor, 'SpherocityIndex', y_true, r2_list, esti_time, cpu_calc)
        del descriptor
        gc.collect()
        descriptor = [Chem.rdMolDescriptors.MQNs_(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor, 'MQNs', y_true, r2_list, esti_time, cpu_calc)
        del descriptor
        gc.collect()
        descriptor = [Chem.rdMolDescriptors.CalcAUTOCORR2D(alpha) for alpha in mols]
        evaluate_descriptor(fps, descriptor, 'AUTOCORR2D', y_true, r2_list, esti_time, cpu_calc)
        del descriptor
        gc.collect()
        descriptor = [Chem.rdMolDescriptors.CalcAUTOCORR3D(mols) for mols in mols2]
        evaluate_descriptor(fps, descriptor, 'AUTOCORR3D', y_true, r2_list, esti_time, cpu_calc)
        del descriptor
        gc.collect()
        descriptor = [Chem.rdMolDescriptors.CalcRDF(mols) for mols in mols2]
        descriptor = Normalization(descriptor)
        evaluate_descriptor(fps, descriptor, 'RDF', y_true, r2_list, esti_time, cpu_calc)
        del descriptor
        gc.collect()
        descriptor = compute_descriptors_parallel(mols)
        evaluate_descriptor(fps, descriptor, 'BCUT2D', y_true, r2_list, esti_time, cpu_calc)
        del descriptor
        gc.collect()
        descriptor = [Chem.rdMolDescriptors.CalcMORSE(mols) for mols in mols2]
        descriptor = Normalization(descriptor)
        evaluate_descriptor(fps, descriptor, 'MORSE', y_true, r2_list, esti_time, cpu_calc)
        del descriptor
        gc.collect()
        descriptor = [Chem.rdMolDescriptors.CalcWHIM(mols) for mols in mols2]
        descriptor = Normalization(descriptor)
        evaluate_descriptor(fps, descriptor, 'WHIM', y_true, r2_list, esti_time, cpu_calc)
        del descriptor
        gc.collect()
        descriptor = [Chem.rdMolDescriptors.CalcGETAWAY(mols) for mols in mols2]
        descriptor = Normalization(descriptor)
        evaluate_descriptor(fps, descriptor, 'GETAWAY', y_true, r2_list, esti_time, cpu_calc)
        del descriptor
        gc.collect()
        if read_logs:
            print(f"-------Complete-------")
        #########################################
    except Exception as e:
        print(f"Error in descriptors_list: {e}")
    finally:
        print(r2_list)
        # Ensure the target directory exists
        def ensure_directory_exists(directory):
            if not os.path.exists(directory):
                try:
                    os.makedirs(directory, exist_ok=True)  # Create directory if it doesn't exist
                except Exception as e:
                    print(f"Error creating directory {directory}: {e}")
                    return False
            return True

        # Check if target_path is valid and directory exists
        if ensure_directory_exists(target_path):
            try:
                # Save R2 scores
                df = pd.DataFrame(list(r2_list.items()), columns=['descriptor', 'R2_score'])
                df.to_csv(f"{target_path}/{target_name}_fps_descriptor_individual_learning_results.csv", index=False)

                # Save estimation time if available
                if esti_time:
                    df2 = pd.DataFrame(list(esti_time.items()), columns=['descriptor', 'time_estimation'])
                    df2.to_csv(f"{target_path}/{target_name}_fps_descriptor_individual_learning_results_time.csv", index=False)
            except Exception as e:
                print(f"Error saving CSV: {e}")
        else:
            print(f"Failed to ensure the directory {target_path} exists.")
        # cleanup_resources(locals())        
    return r2_list

In [28]:
def plot_maker(fps2, name, dataset_name):
    """Plot per-descriptor R2 scores as a bar chart against the 'Original' baseline.

    The scores are written to ``record_fps2_r2_score_{name}.csv`` and the figure
    to ``r2_score_all_minus_inc_{name}.png``, both inside the module-level
    ``target_path`` directory (created if missing).

    Parameters
    ----------
    fps2 : dict
        Mapping of descriptor name -> R2 score. It must contain the key
        'Original'; the keys 'Original_tmp1' and 'Original_tmp2' are dropped
        before saving and plotting.
    name : str
        Suffix used in both output file names.
    dataset_name : str
        Dataset label shown in the figure title.

    Raises
    ------
    ValueError
        If ``fps2`` has no 'Original' entry to use as the baseline.
    """
    # One colour per bar category; the same mapping feeds the legend so the two
    # can never drift apart.
    negative_color = '#ff9999'
    improved_color = '#66b3ff'
    not_improved_color = '#ffcc99'

    os.makedirs(target_path, exist_ok=True)
    record_fps_tmp2 = pd.DataFrame({
        'Descriptor_Name': list(fps2.keys()),
        'R2_Score': list(fps2.values()),
    })
    record_fps_tmp2 = record_fps_tmp2[~record_fps_tmp2['Descriptor_Name'].isin(['Original_tmp1', 'Original_tmp2'])]
    record_fps_tmp2.to_csv(f"{target_path}/record_fps2_r2_score_{name}.csv", index=False)

    baseline_rows = record_fps_tmp2.loc[record_fps_tmp2['Descriptor_Name'] == 'Original', 'R2_Score']
    if baseline_rows.empty:
        raise ValueError("plot_maker needs an 'Original' entry in fps2 to draw the baseline")
    original_r2 = baseline_rows.iloc[0]

    colors = []
    for score in record_fps_tmp2['R2_Score']:
        if score < 0:
            colors.append(negative_color)
        elif score > original_r2:
            colors.append(improved_color)
        else:
            colors.append(not_improved_color)

    fig, ax = plt.subplots(figsize=(25, 7))
    bars = ax.bar(record_fps_tmp2['Descriptor_Name'], record_fps_tmp2['R2_Score'], color=colors, edgecolor='black', linewidth=1)
    ax.axhline(original_r2, color='red', linestyle='--', linewidth=2)
    ax.set_facecolor('#f7f7f7')

    # x is in axes coordinates (just right of the plot), y in data coordinates.
    ax.text(1.01, original_r2, 'Original', color='red', fontsize=12, ha='left', va='center', weight='bold', transform=ax.get_yaxis_transform())

    ax.set_title(f"The Improvement of additional Descriptors[{dataset_name}]", fontsize=18, weight='bold')
    ax.set_xlabel("Descriptor Name", fontsize=14)
    ax.set_ylabel("R2 Score", fontsize=14)

    # The numeric value of every bar is written at a fixed height below zero.
    for bar in bars:
        ax.annotate(f"{bar.get_height():.3f}",
                    (bar.get_x() + bar.get_width() / 2, - 0.05),
                    ha='center', va='top', fontsize=10, rotation=90, color='black')

    # Legend covers all three bar colours, including negative scores.
    legend_handles = [
        mpatches.Patch(color=improved_color, label='R2 Improved'),
        mpatches.Patch(color=not_improved_color, label='R2 Not Improved'),
        mpatches.Patch(color=negative_color, label='R2 Negative'),
    ]
    ax.legend(handles=legend_handles, loc='lower right', fontsize=12)

    ax.set_ylim(-0.5, 1.0)
    plt.xticks(rotation=90, ha='right', fontsize=12)
    ax.grid(axis='y', linestyle='--', linewidth=0.7, alpha=0.7)
    fig.tight_layout()
    fig.savefig(f"{target_path}/r2_score_all_minus_inc_{name}.png", dpi=300)
    plt.show()
    plt.close(fig)

In [None]:
# List every GPU device TensorFlow reports, one line per device.
gpus = tf.config.list_physical_devices('GPU')
for device in gpus:
    print("Name:", device.name, "  Type:", device.device_type)

In [30]:
def concatenate_to_numpy(*dataframes):
    """Join several tables column-wise into a single NumPy array.

    Parameters
    ----------
    *dataframes : pandas.DataFrame or numpy.ndarray
        Inputs to join along axis 1. DataFrames are converted with
        ``to_numpy()``; arrays are used as they are.

    Returns
    -------
    numpy.ndarray
        Result of ``np.concatenate`` over the converted inputs with ``axis=1``.

    Raises
    ------
    ValueError
        If any input is neither a DataFrame nor an ndarray (errors raised by
        ``np.concatenate`` itself propagate unchanged).
    """
    converted = []
    for item in dataframes:
        if isinstance(item, pd.DataFrame):
            converted.append(item.to_numpy())
        else:
            converted.append(item)

    # Validate only after every input has been converted.
    for candidate in converted:
        if not isinstance(candidate, np.ndarray):
            raise ValueError("All inputs must be either pandas DataFrame or numpy array")

    return np.concatenate(converted, axis=1)

In [31]:
# Build one combined feature matrix per dataset, then drop the source arrays
# and run the garbage collector to release their memory.
try:
    group_nws = concatenate_to_numpy(x_ws, MACCS_ws, AvalonFP_ws)
    group_nde = concatenate_to_numpy(x_de, MACCS_de, AvalonFP_de)
    group_nlo = concatenate_to_numpy(x_lo, MACCS_lo, AvalonFP_lo)
    group_nhu = concatenate_to_numpy(x_hu, MACCS_hu, AvalonFP_hu)
    del (
        x_ws, MACCS_ws, AvalonFP_ws,
        x_de, MACCS_de, AvalonFP_de,
        x_lo, MACCS_lo, AvalonFP_lo,
        x_hu, MACCS_hu, AvalonFP_hu,
    )
    gc.collect()
except Exception as err:
    print(f"Error occured: {err}")

In [32]:
# Cache switch read by the dataset cells below: while it is False they reload an
# existing dnn_feature_r2_score[...] CSV instead of calling descriptors_list again.
makenew = False

In [None]:
# ws496: reuse the cached per-descriptor R2 scores when present, otherwise run
# the long descriptor sweep and cache its result.
file_path = f"{target_path}/dnn_feature_r2_score[ws].csv"
try:
    if os.path.exists(file_path) and not makenew:
        # usecols discards the unnamed index column found in caches that were
        # saved together with the DataFrame index, so pd_ws always has exactly
        # the columns ['descriptor_name', 'R2score'] - same as a fresh run.
        pd_ws = pd.read_csv(file_path, usecols=['descriptor_name', 'R2score'])
        fps_learning_ws = pd_ws.set_index('descriptor_name')['R2score'].to_dict()
    else:
        fps_learning_ws = descriptors_list(group_nws, mol_ws, y_ws, 'ws496', target_path)
        pd_ws = pd.DataFrame(list(fps_learning_ws.items()), columns=['descriptor_name', 'R2score'])
        # index=False keeps the cache identical in layout to the in-memory frame.
        pd_ws.to_csv(file_path, index=False)
except Exception as e:
    print(f"Error : {e}")
else:
    # Plot only when the scores were loaded or computed in this run, so a
    # failure cannot be followed by a NameError or by a plot of stale data.
    plot_maker(fps_learning_ws, f'ws_{BATCHSIZE}batch_{EPOCHS}epoch_{lr}lr', 'ws496')
#57m 13.4s <- 100 epochs
#48m 36.1s
#46m 2.9s

In [None]:
# delaney: reuse the cached per-descriptor R2 scores when present, otherwise run
# the long descriptor sweep and cache its result.
file_path = f"{target_path}/dnn_feature_r2_score[de].csv"
try:
    if os.path.exists(file_path) and not makenew:
        # usecols discards the unnamed index column found in caches that were
        # saved together with the DataFrame index, so pd_de always has exactly
        # the columns ['descriptor_name', 'R2score'] - same as a fresh run.
        pd_de = pd.read_csv(file_path, usecols=['descriptor_name', 'R2score'])
        fps_learning_de = pd_de.set_index('descriptor_name')['R2score'].to_dict()
    else:
        fps_learning_de = descriptors_list(group_nde, mol_de, y_de, 'delaney', target_path)
        pd_de = pd.DataFrame(list(fps_learning_de.items()), columns=['descriptor_name', 'R2score'])
        # index=False keeps the cache identical in layout to the in-memory frame.
        pd_de.to_csv(file_path, index=False)
except Exception as e:
    print(f"Error : {e}")
else:
    # Plot only when the scores were loaded or computed in this run, so a
    # failure cannot be followed by a NameError or by a plot of stale data.
    plot_maker(fps_learning_de, f'de_{BATCHSIZE}batch_{EPOCHS}epoch_{lr}lr', 'delaney')
#75m 47.4s
#50m 32.0s
#50m 10.6s

In [None]:
# lovric: reuse the cached per-descriptor R2 scores when present, otherwise run
# the long descriptor sweep and cache its result.
file_path = f"{target_path}/dnn_feature_r2_score[lo].csv"
try:
    if os.path.exists(file_path) and not makenew:
        # usecols discards the unnamed index column found in caches that were
        # saved together with the DataFrame index, so pd_lo always has exactly
        # the columns ['descriptor_name', 'R2score'] - same as a fresh run.
        pd_lo = pd.read_csv(file_path, usecols=['descriptor_name', 'R2score'])
        fps_learning_lo = pd_lo.set_index('descriptor_name')['R2score'].to_dict()
    else:
        fps_learning_lo = descriptors_list(group_nlo, mol_lo, y_lo, 'lovric', target_path)
        pd_lo = pd.DataFrame(list(fps_learning_lo.items()), columns=['descriptor_name', 'R2score'])
        # index=False keeps the cache identical in layout to the in-memory frame.
        pd_lo.to_csv(file_path, index=False)
except Exception as e:
    print(f"Error : {e}")
else:
    # Plot only when the scores were loaded or computed in this run, so a
    # failure cannot be followed by a NameError or by a plot of stale data.
    plot_maker(fps_learning_lo, f'lo_{BATCHSIZE}batch_{EPOCHS}epoch_{lr}lr', 'lovric')
## 77m 31.7s
#50m 1.0s
#50m 1.0s

In [None]:
# hussk: reuse the cached per-descriptor R2 scores when present, otherwise run
# the long descriptor sweep and cache its result.
file_path = f"{target_path}/dnn_feature_r2_score[hu].csv"
try:
    if os.path.exists(file_path) and not makenew:
        # usecols discards the unnamed index column found in caches that were
        # saved together with the DataFrame index, so pd_hu always has exactly
        # the columns ['descriptor_name', 'R2score'] - same as a fresh run.
        pd_hu = pd.read_csv(file_path, usecols=['descriptor_name', 'R2score'])
        fps_learning_hu = pd_hu.set_index('descriptor_name')['R2score'].to_dict()
    else:
        fps_learning_hu = descriptors_list(group_nhu, mol_hu, y_hu, 'hussk', target_path)
        pd_hu = pd.DataFrame(list(fps_learning_hu.items()), columns=['descriptor_name', 'R2score'])
        # index=False keeps the cache identical in layout to the in-memory frame.
        pd_hu.to_csv(file_path, index=False)
except Exception as e:
    print(f"Error : {e}")
else:
    # Plot only when the scores were loaded or computed in this run, so a
    # failure cannot be followed by a NameError or by a plot of stale data.
    plot_maker(fps_learning_hu, f'hu_{BATCHSIZE}batch_{EPOCHS}epoch_{lr}lr', 'hussk')
#83m 13.5s
#56m 35.4

In [None]:
# NOTE(review): `test` is not read anywhere in the visible part of this notebook;
# this looks like a leftover scratch cell - confirm before relying on or removing it.
test = True
test

In [None]:
# ws496 with cpu_calc=True: reuse the cached per-descriptor R2 scores when
# present, otherwise run the long descriptor sweep and cache its result.
# NOTE(review): this cell uses the same cache file as the earlier ws496 cell that
# calls descriptors_list without cpu_calc, so while makenew is False it reloads
# whatever that file holds - confirm whether a separate file name is intended.
file_path = f"{target_path}/dnn_feature_r2_score[ws].csv"
try:
    if os.path.exists(file_path) and not makenew:
        # usecols discards the unnamed index column found in caches that were
        # saved together with the DataFrame index, so pd_ws always has exactly
        # the columns ['descriptor_name', 'R2score'] - same as a fresh run.
        pd_ws = pd.read_csv(file_path, usecols=['descriptor_name', 'R2score'])
        fps_learning_ws = pd_ws.set_index('descriptor_name')['R2score'].to_dict()
    else:
        fps_learning_ws = descriptors_list(group_nws, mol_ws, y_ws, 'ws496', target_path, cpu_calc=True)
        pd_ws = pd.DataFrame(list(fps_learning_ws.items()), columns=['descriptor_name', 'R2score'])
        # index=False keeps the cache identical in layout to the in-memory frame.
        pd_ws.to_csv(file_path, index=False)
except Exception as e:
    print(f"Error : {e}")
else:
    # Plot only when the scores were loaded or computed in this run, so a
    # failure cannot be followed by a NameError or by a plot of stale data.
    plot_maker(fps_learning_ws, f'ws_{BATCHSIZE}batch_{EPOCHS}epoch_{lr}lr', 'ws496')
#57m 13.4s <- 100 epochs
#48m 36.1s
#46m 2.9s

In [None]:
# delaney with cpu_calc=True: reuse the cached per-descriptor R2 scores when
# present, otherwise run the long descriptor sweep and cache its result.
# NOTE(review): this cell uses the same cache file as the earlier delaney cell that
# calls descriptors_list without cpu_calc, so while makenew is False it reloads
# whatever that file holds - confirm whether a separate file name is intended.
file_path = f"{target_path}/dnn_feature_r2_score[de].csv"
try:
    if os.path.exists(file_path) and not makenew:
        # usecols discards the unnamed index column found in caches that were
        # saved together with the DataFrame index, so pd_de always has exactly
        # the columns ['descriptor_name', 'R2score'] - same as a fresh run.
        pd_de = pd.read_csv(file_path, usecols=['descriptor_name', 'R2score'])
        fps_learning_de = pd_de.set_index('descriptor_name')['R2score'].to_dict()
    else:
        fps_learning_de = descriptors_list(group_nde, mol_de, y_de, 'delaney', target_path, cpu_calc=True)
        pd_de = pd.DataFrame(list(fps_learning_de.items()), columns=['descriptor_name', 'R2score'])
        # index=False keeps the cache identical in layout to the in-memory frame.
        pd_de.to_csv(file_path, index=False)
except Exception as e:
    print(f"Error : {e}")
else:
    # Plot only when the scores were loaded or computed in this run, so a
    # failure cannot be followed by a NameError or by a plot of stale data.
    plot_maker(fps_learning_de, f'de_{BATCHSIZE}batch_{EPOCHS}epoch_{lr}lr', 'delaney')
#75m 47.4s
#50m 32.0s
#50m 10.6s

In [None]:
# lovric with cpu_calc=True: reuse the cached per-descriptor R2 scores when
# present, otherwise run the long descriptor sweep and cache its result.
# NOTE(review): this cell uses the same cache file as the earlier lovric cell that
# calls descriptors_list without cpu_calc, so while makenew is False it reloads
# whatever that file holds - confirm whether a separate file name is intended.
file_path = f"{target_path}/dnn_feature_r2_score[lo].csv"
try:
    if os.path.exists(file_path) and not makenew:
        # usecols discards the unnamed index column found in caches that were
        # saved together with the DataFrame index, so pd_lo always has exactly
        # the columns ['descriptor_name', 'R2score'] - same as a fresh run.
        pd_lo = pd.read_csv(file_path, usecols=['descriptor_name', 'R2score'])
        fps_learning_lo = pd_lo.set_index('descriptor_name')['R2score'].to_dict()
    else:
        fps_learning_lo = descriptors_list(group_nlo, mol_lo, y_lo, 'lovric', target_path, cpu_calc=True)
        pd_lo = pd.DataFrame(list(fps_learning_lo.items()), columns=['descriptor_name', 'R2score'])
        # index=False keeps the cache identical in layout to the in-memory frame.
        pd_lo.to_csv(file_path, index=False)
except Exception as e:
    print(f"Error : {e}")
else:
    # Plot only when the scores were loaded or computed in this run, so a
    # failure cannot be followed by a NameError or by a plot of stale data.
    plot_maker(fps_learning_lo, f'lo_{BATCHSIZE}batch_{EPOCHS}epoch_{lr}lr', 'lovric')
## 77m 31.7s
#50m 1.0s
#50m 1.0s

In [None]:
# hussk with cpu_calc=True: reuse the cached per-descriptor R2 scores when
# present, otherwise run the long descriptor sweep and cache its result.
# NOTE(review): this cell uses the same cache file as the earlier hussk cell that
# calls descriptors_list without cpu_calc, so while makenew is False it reloads
# whatever that file holds - confirm whether a separate file name is intended.
file_path = f"{target_path}/dnn_feature_r2_score[hu].csv"
try:
    if os.path.exists(file_path) and not makenew:
        # usecols discards the unnamed index column found in caches that were
        # saved together with the DataFrame index, so pd_hu always has exactly
        # the columns ['descriptor_name', 'R2score'] - same as a fresh run.
        pd_hu = pd.read_csv(file_path, usecols=['descriptor_name', 'R2score'])
        fps_learning_hu = pd_hu.set_index('descriptor_name')['R2score'].to_dict()
    else:
        fps_learning_hu = descriptors_list(group_nhu, mol_hu, y_hu, 'hussk', target_path, cpu_calc=True)
        pd_hu = pd.DataFrame(list(fps_learning_hu.items()), columns=['descriptor_name', 'R2score'])
        # index=False keeps the cache identical in layout to the in-memory frame.
        pd_hu.to_csv(file_path, index=False)
except Exception as e:
    print(f"Error : {e}")
else:
    # Plot only when the scores were loaded or computed in this run, so a
    # failure cannot be followed by a NameError or by a plot of stale data.
    plot_maker(fps_learning_hu, f'hu_{BATCHSIZE}batch_{EPOCHS}epoch_{lr}lr', 'hussk')
#83m 13.5s
#56m 35.4

In [None]:
def plot_r2_score(data, dataset_name, target_path):
    """Bar-plot the R2 score of every descriptor and mark the baseline score.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with the columns 'descriptor_name' and 'R2score'. The score in the
        row labelled 0 is drawn as the red dashed reference line.
    dataset_name : str
        Label used in the title and in the output file name.
    target_path : str
        Directory that receives ``r2_score_each_descriptors_{dataset_name}.png``.
    """
    ax = data.plot.bar(x='descriptor_name', y='R2score', figsize=(30, 10), ylabel='R2score', ylim=(-0.5, 1))

    # Look the baseline up by column name: chained positional access such as
    # data.loc[0][1] is deprecated on label-indexed Series and returns the wrong
    # field whenever the frame carries an extra leading column.
    baseline_r2 = data.loc[0, 'R2score']
    ax.axhline(baseline_r2, color='red', linestyle='--', linewidth=3)
    ax.set_title(f"The prediction improvement with additional chemical descriptors [{dataset_name}]", fontsize=20)

    # Value label per bar; scores at or below the lower y-limit are written
    # slightly under the bar end instead of above it.
    for index, value in enumerate(data['R2score']):
        if value <= -0.5:
            ax.text(index, value - 0.05, f'{value:.2f}', ha='center', va='bottom', fontsize=12, color='black')
        else:
            ax.text(index, value + 0.01, f'{value:.2f}', ha='center', va='bottom', fontsize=12, color='black')

    plt.savefig(f"{target_path}/r2_score_each_descriptors_{dataset_name}.png", dpi=300, bbox_inches="tight")
    plt.show()

plot_r2_score(pd_ws, 'ws', target_path)
plot_r2_score(pd_de, 'de', target_path)
plot_r2_score(pd_lo, 'lo', target_path)
plot_r2_score(pd_hu, 'hu', target_path)

In [None]:
# Keep only the descriptors whose R2 score beats the score in the row labelled 0
# of the same table. The baseline is addressed by column name because the
# positional form pd_x.loc[0][1] is deprecated and picks the wrong field when a
# frame has an extra leading column.
res_ws = pd_ws.loc[pd_ws['R2score'] > pd_ws.loc[0, 'R2score']]
res_de = pd_de.loc[pd_de['R2score'] > pd_de.loc[0, 'R2score']]
res_lo = pd_lo.loc[pd_lo['R2score'] > pd_lo.loc[0, 'R2score']]
res_hu = pd_hu.loc[pd_hu['R2score'] > pd_hu.loc[0, 'R2score']]

# Best score first.
re_arrange_ws = res_ws.sort_values(by='R2score',ascending=False)
re_arrange_de = res_de.sort_values(by='R2score',ascending=False)
re_arrange_lo = res_lo.sort_values(by='R2score',ascending=False)
re_arrange_hu = res_hu.sort_values(by='R2score',ascending=False)

re_arrange_ws.to_csv(f"{target_path}/[3]_individual_r2_score_higher_than_original_prediction[ws].csv", index=False)
re_arrange_de.to_csv(f"{target_path}/[3]_individual_r2_score_higher_than_original_prediction[de].csv", index=False)
re_arrange_lo.to_csv(f"{target_path}/[3]_individual_r2_score_higher_than_original_prediction[lo].csv", index=False)
re_arrange_hu.to_csv(f"{target_path}/[3]_individual_r2_score_higher_than_original_prediction[hu].csv", index=False)

In [None]:
# Column-wise maxima of each results table. Every column is maximised on its own,
# so the descriptor_name shown is not necessarily the row holding the top R2score.
pd_ws.max(), pd_de.max(), pd_lo.max(), pd_hu.max()

In [None]:
# Put the four per-dataset score tables side by side (rows are aligned on the
# DataFrame index). Columns are picked by name rather than by position so the
# result always has exactly five columns, whatever extra columns a table carries.
res = pd.concat(
    [
        pd_ws[['descriptor_name', 'R2score']],
        pd_de['R2score'],
        pd_lo['R2score'],
        pd_hu['R2score'],
    ],
    axis=1,
)
res.columns = ['descriptor_name', 'ws496', 'delaney', 'lovrics', 'huusk']
res

In [None]:
# Stacked vertical bars: one bar per descriptor, one segment per dataset.
ax = res.plot(
    x='descriptor_name',
    y=['ws496', 'delaney', 'lovrics', 'huusk'],
    kind='bar',
    stacked=True,
    figsize=(35, 15),
    ylim=(-0.5, None),
)
plt.legend(loc=1, fontsize=12)

# Add labels: write each segment's height at the centre of the segment, skipping
# segments whose height lies in [-0.5, 0].
for segment in ax.patches:
    seg_height = segment.get_height()
    if not (seg_height > 0 or seg_height < -0.5):
        continue
    centre_x = segment.get_x() + segment.get_width() / 2
    centre_y = segment.get_y() + seg_height / 2
    ax.text(centre_x, centre_y, f'{seg_height:0.4f}', ha='center', size=9, va='center', rotation='vertical', fontweight='bold')

# Axis labels and title.
plt.xlabel("Descriptor Name", fontsize=14)
plt.ylabel("R2 Score", fontsize=14)
plt.title("DNN Improvement with Additional Chemical Descriptors", fontsize=18, fontweight='bold')

# Save the figure, then display it.
plt.savefig(f"{target_path}/chem_descriptor_r2score_result_ALL_vertical.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Grouped vertical bars: the pinned descriptors plus the highest-scoring ones on
# 'delaney', with one bar per dataset inside every group.
target_print_count = 20
selected_descriptors = ['Original','ExactMolWt','MolWt', 'MolLogP', 'MolMR']
top_descriptors = res.nlargest(target_print_count-len(selected_descriptors), 'delaney')

pinned_rows = res[res['descriptor_name'].isin(selected_descriptors)]
filtered_res = pd.concat([pinned_rows, top_descriptors])
filtered_res = filtered_res.drop_duplicates(subset='descriptor_name', keep='first')

best_dataset = 'delaney'

colors = ['#0000ff', '#ff7f0e', '#2ca02c', '#d62728']

font = {'size': 18, 'weight': 'bold'}
plt.rc('font', **font)

fig, ax = plt.subplots(figsize=(18, 10))

bar_width = 0.2
index = np.arange(len(filtered_res['descriptor_name']))

# Dataset columns and the slot (in bar widths) each one takes inside a group.
dataset_columns = ['ws496', 'delaney', 'lovrics', 'huusk']
slot_offsets = [-1.5, -0.5, 0.5, 1.5]

max_scores = filtered_res[dataset_columns].max(axis=1)

# Bars: the dataset holding the best score of a row is opaque, the others faded.
# Only the first row contributes legend labels.
for i in range(len(filtered_res)):
    row_best = max_scores.iloc[i]
    for column, offset, colour in zip(dataset_columns, slot_offsets, colors):
        score = filtered_res[column].iloc[i]
        ax.bar(index[i] + offset * bar_width, score, bar_width,
               label=column if i == 0 else "",
               color=colour, alpha=1.0 if score == row_best else 0.3)

# Numeric value of every bar, written at a fixed height below zero.
for i in range(len(filtered_res)):
    for column, offset in zip(dataset_columns, slot_offsets):
        ax.text(i + offset * bar_width, -0.1, f'{filtered_res[column].iloc[i]:.4f}',
                ha='center', va='top', fontsize=12, fontweight='bold', rotation=90)

ax.set_xlabel('Descriptor Name', fontsize=18, labelpad=10)
ax.set_ylabel('R2 Score', fontsize=18, labelpad=10)
ax.set_xticks(index)
ax.set_xticklabels(filtered_res['descriptor_name'], rotation=45, ha='right', fontsize=12)

ax.set_ylim(-0.5, 1.0)

ax.legend(loc='upper left', fontsize=14, title='Datasets', title_fontsize=14)

ax.grid(True, which='major', axis='y', linestyle='--', linewidth=0.5, color='gray', alpha=0.7)

plt.tight_layout()
plt.savefig(f"{target_path}/chem_descriptor_r2score.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:

# Stacked horizontal bars: one bar per descriptor, one segment per dataset.
font = {'size': 20, 'weight': 'bold'}
plt.rc('font', **font)

ax = res.plot(
    x='descriptor_name',
    y=['ws496', 'delaney', 'lovrics', 'huusk'],
    kind='barh',
    stacked=True,
    figsize=(30, 42),
    xlim=(-0.5, None),
)

# Flip the y axis so the first row of `res` is drawn at the top.
ax.invert_yaxis()

plt.legend(loc=1, fontsize=25)

# Write each segment's width at the centre of the segment, skipping segments
# whose width lies in [-0.5, 0].
for segment in ax.patches:
    seg_width = segment.get_width()
    if not (seg_width > 0 or seg_width < -0.5):
        continue
    centre = (segment.get_x() + seg_width / 2, segment.get_y() + segment.get_height() / 2)
    ax.annotate(text=f'{seg_width:0.4f}', xy=centre, ha='center', va='center', size=18, fontweight='bold')

plt.ylabel("Chemical Descriptors", fontsize=25, fontweight='bold')
plt.xlabel("R2 Score", fontsize=25, fontweight='bold')
plt.title("DNN Improvement with Additional Chemical Descriptors", fontsize=30, fontweight='bold', pad=20)

plt.savefig(f"{target_path}/chem_descriptor_r2score_result_ALL_horizontal.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Grouped horizontal bars: the pinned descriptors plus the highest-scoring ones
# on 'delaney', with one bar per dataset inside every group.
target_print_count = 20
selected_descriptors = ['Original', 'ExactMolWt', 'MolWt', 'MolLogP', 'MolMR']
top_descriptors = res.nlargest(target_print_count - len(selected_descriptors), 'delaney')

filtered_res = pd.concat([res[res['descriptor_name'].isin(selected_descriptors)], top_descriptors])
filtered_res = filtered_res.drop_duplicates(subset='descriptor_name', keep='first')

best_dataset = 'delaney'

colors = ['#0000ff', '#ff7f0e', '#2ca02c', '#d62728']

font = {'size': 18, 'weight': 'bold'}
plt.rc('font', **font)

fig, ax = plt.subplots(figsize=(10, 18))

bar_width = 0.2
index = np.arange(len(filtered_res['descriptor_name']))

# Dataset columns and the slot (in bar widths) each one takes inside a group.
dataset_columns = ['ws496', 'delaney', 'lovrics', 'huusk']
slot_offsets = [-1.5, -0.5, 0.5, 1.5]

max_scores = filtered_res[dataset_columns].max(axis=1)

# Bars: the dataset holding the best score of a row is opaque, the others faded.
# Only the first row contributes legend labels.
for i in range(len(filtered_res)):
    row_best = max_scores.iloc[i]
    for column, offset, colour in zip(dataset_columns, slot_offsets, colors):
        score = filtered_res[column].iloc[i]
        ax.barh(index[i] + offset * bar_width, score, bar_width,
                label=column if i == 0 else "",
                color=colour, alpha=1.0 if score == row_best else 0.3)

# Numeric value of every bar, right-aligned at a fixed x position left of zero.
for i in range(len(filtered_res)):
    for column, offset in zip(dataset_columns, slot_offsets):
        ax.text(-0.1, i + offset * bar_width, f'{filtered_res[column].iloc[i]:.4f}',
                ha='right', va='center', fontsize=12, fontweight='bold')

ax.set_ylabel('Descriptor Name', fontsize=18, labelpad=10)
ax.set_xlabel('R2 Score', fontsize=18, labelpad=10)
ax.set_yticks(index)
# Tick labels follow the row order used for the bars, and the axis is inverted
# so the first row sits at the top. Reversing only the labels would attach every
# descriptor name to the wrong bar group.
ax.set_yticklabels(filtered_res['descriptor_name'], rotation=0, ha='right', fontsize=12)
ax.invert_yaxis()

ax.set_xlim(-0.5, 1.0)

ax.legend(loc='upper right', fontsize=14, title='Datasets', title_fontsize=14)

ax.grid(True, which='major', axis='x', linestyle='--', linewidth=0.5, color='gray', alpha=0.7)

plt.title('Comparison of Chemical Descriptors by R2 Score', fontsize=20, fontweight='bold', pad=20)

plt.tight_layout()
plt.savefig(f"{target_path}/chem_descriptor_r2score_horizontal_title.png", dpi=300, bbox_inches="tight")
plt.show()

In [50]:
def plot_r2_scores(res, target_print_count=20, selected_descriptors=['Original', 'ExactMolWt', 'MolWt', 'MolLogP', 'MolMR'], orientation='vertical', target_path=".", title='Comparison of Chemical Descriptors by DNN'):
    """Draw and save a grouped bar chart of per-dataset R2 scores for selected descriptors.

    The plotted rows are the descriptors listed in ``selected_descriptors`` followed
    by the ``target_print_count - len(selected_descriptors)`` rows of ``res`` with the
    largest ``'delaney'`` score, de-duplicated on ``'descriptor_name'`` (first
    occurrence wins). For every descriptor, the dataset(s) holding the row-wise
    maximum score are drawn fully opaque with a starred, bold value label; all other
    bars are faded and labelled in normal weight.

    Parameters
    ----------
    res : pandas.DataFrame
        Table with the columns ``'descriptor_name'``, ``'ws496'``, ``'delaney'``,
        ``'lovrics'`` and ``'huusk'``.
    target_print_count : int
        Row budget used to decide how many top ``'delaney'`` rows are added to the
        selected descriptors.
    selected_descriptors : list of str
        Descriptor names that are always included. The list is only read, never
        mutated.
    orientation : str
        ``'vertical'`` draws vertical bars; any other value draws horizontal bars.
        The value is also embedded in the output file name.
    target_path : str
        Directory the PNG is written to.
    title : str
        Title placed above the axes.

    Returns
    -------
    None
        The figure is written to
        ``{target_path}/DNN_descriptor_r2score_{orientation}.png``, shown, and closed.

    Notes
    -----
    ``plt.rc('font', ...)`` changes matplotlib's global font defaults, so plots
    drawn after this call inherit the size-18 bold font.
    """
    datasets = ['ws496', 'delaney', 'lovrics', 'huusk']
    colors = ['#0000ff', '#ff7f0e', '#2ca02c', '#d62728']
    bar_width = 0.2
    # Position of each dataset's bar relative to the centre of its descriptor group.
    offsets = [-1.5 * bar_width, -0.5 * bar_width, 0.5 * bar_width, 1.5 * bar_width]
    # The star marks the row-wise maximum, i.e. the best dataset for a descriptor.
    footnote = "* indicates the best score in the dataset"
    is_vertical = orientation == 'vertical'

    top_descriptors = res.nlargest(target_print_count - len(selected_descriptors), 'delaney')
    filtered_res = pd.concat([res[res['descriptor_name'].isin(selected_descriptors)], top_descriptors])
    filtered_res = filtered_res.drop_duplicates(subset='descriptor_name', keep='first')

    plt.rc('font', size=18, weight='bold')

    fig, ax = plt.subplots(figsize=(18, 10) if is_vertical else (10, 18))
    index = np.arange(len(filtered_res))
    max_scores = filtered_res[datasets].max(axis=1)
    draw_bar = ax.bar if is_vertical else ax.barh

    for i in range(len(filtered_res)):
        for dataset, color, offset in zip(datasets, colors, offsets):
            score = filtered_res[dataset].iloc[i]
            is_best = score == max_scores.iloc[i]
            position = index[i] + offset

            # Only the first row carries labels so each dataset gets one legend entry.
            draw_bar(position, score, bar_width, label=dataset if i == 0 else "",
                     color=color, alpha=1.0 if is_best else 0.3)

            value_label = f'{"*" if is_best else ""}{score:.4f}'
            # The global font weight is bold (see plt.rc above), so the non-best
            # labels must ask for 'normal' explicitly or the emphasis is lost.
            text_style = dict(fontsize=12, fontweight='bold' if is_best else 'normal')
            if is_vertical:
                ax.text(position, -0.1, value_label, ha='center', va='top', rotation=90, **text_style)
            else:
                ax.text(-0.1, position, value_label, ha='right', va='center', **text_style)

    if is_vertical:
        ax.set_xlabel('Descriptor Name', fontsize=18, labelpad=10)
        ax.set_ylabel('R2 Score', fontsize=18, labelpad=10)
        ax.set_xticks(index)
        ax.set_xticklabels(filtered_res['descriptor_name'], rotation=45, ha='right', fontsize=12)
        fig.text(0.75, 0.23, footnote, ha="center", fontsize=12, color='gray')
        ax.set_ylim(-0.5, 1.0)
    else:
        ax.set_ylabel('Descriptor Name', fontsize=18, labelpad=10)
        ax.set_xlabel('R2 Score', fontsize=18, labelpad=10)
        ax.set_yticks(index)
        ax.set_yticklabels(filtered_res['descriptor_name'], rotation=0, ha='right', fontsize=12)
        fig.text(0.47, 0.055, footnote, ha="center", fontsize=12, color='gray')
        ax.set_xlim(-0.5, 1.0)

    legend = ax.legend(loc='upper left', fontsize=14, title='Datasets', title_fontsize=14, bbox_to_anchor=(1.05, 1))
    # Legend swatches copy the alpha of the first row's bars, which is 0.3 for every
    # dataset that is not that row's best; show all swatches at full colour instead.
    for swatch in legend.get_patches():
        swatch.set_alpha(1.0)

    ax.grid(True, which='major', axis='y' if is_vertical else 'x', linestyle='--', linewidth=0.5, color='gray', alpha=0.7)

    ax.set_title(title, fontsize=20, fontweight='bold', pad=20)

    fig.tight_layout()
    fig.savefig(f"{target_path}/DNN_descriptor_r2score_{orientation}.png", dpi=300, bbox_inches="tight")
    plt.show()
    # Release the figure so repeated calls do not pile up open figures in memory.
    plt.close(fig)

In [None]:
# Render the descriptor comparison in both layouts; each call writes its own PNG
# (the orientation is part of the output file name) into target_path.
plot_r2_scores(
    res,
    target_print_count=20,
    orientation='vertical',
    target_path=target_path,
)
plot_r2_scores(
    res,
    target_print_count=20,
    orientation='horizontal',
    target_path=target_path,
)