In [1]:
import os
import sys
import time
import subprocess
import logging
import warnings
import gc
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from concurrent.futures import ProcessPoolExecutor, as_completed

In [2]:
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs, Draw
from rdkit import RDConfig
from rdkit.Chem import Descriptors, rdMolDescriptors, Lipinski, rdDistGeom, rdPartialCharges
from rdkit.Chem.AllChem import GetMorganGenerator
from rdkit.DataStructs.cDataStructs import ConvertToNumpyArray
from rdkit.Avalon.pyAvalonTools import GetAvalonFP

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, root_mean_squared_error

In [5]:
import optuna
from optuna.trial import TrialState
from optuna.integration import TFKerasPruningCallback

In [6]:
from extra_code.feature_search import search_data_descriptor_compress

In [None]:
# Clear any Keras state left in this kernel, then ask TensorFlow to allocate GPU
# memory on demand for every visible GPU instead of reserving it all up front.
tf.keras.backend.clear_session()
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        # NOTE(review): presumably raised when the GPUs were already initialised
        # (e.g. the cell is re-run in a live kernel) — the error is only printed.
        print(e)

In [8]:
# Root output directory for this notebook; created if it does not exist yet.
target_path = "result/4_ANO_feature"
os.makedirs(target_path, exist_ok=True)

In [9]:
# Load the four datasets. For each one the SMILES column is read as a pandas
# 'string' dtype and the target column is selected by position.
# NOTE(review): the positional targets are presumably the logS values — confirm
# against the CSV headers, since a column reorder would silently change them.

# ws496: target is the third column.
data_ws = pd.read_csv('./data/ws496_logS.csv', dtype={'SMILES': 'string'})
smiles_ws = data_ws['SMILES']
y_ws = data_ws.iloc[:, 2]

# Delaney: target is the second column.
data_delaney = pd.read_csv('./data/delaney-processed.csv', dtype={'smiles': 'string'})
smiles_de = data_delaney['smiles']
y_de = data_delaney.iloc[:, 1]

# Lovric 2020: target is the second column.
data_lovric2020 = pd.read_csv('./data/Lovric2020_logS0.csv', dtype={'isomeric_smiles': 'string'})
smiles_lo = data_lovric2020['isomeric_smiles']
y_lo = data_lovric2020.iloc[:, 1]

# Huuskonen: target is the last column, cast to float explicitly.
data_huuskonen = pd.read_csv('./data/huusk.csv', dtype={'SMILES': 'string'})
smiles_hu = data_huuskonen['SMILES']
y_hu = data_huuskonen.iloc[:, -1].astype('float')

In [10]:
def mol3d(mol):
    """Try to give ``mol`` (with explicit hydrogens) at least one 3D conformer.

    The steps are attempted in order: ETKDGv3 embedding, then UFF optimisation,
    then MMFF optimisation. After each step that does not raise ``ValueError``
    the molecule is returned as soon as it owns a conformer, so a successful
    embedding returns immediately without running the optimisers.

    Args:
        mol: RDKit molecule; it is not modified, ``Chem.AddHs`` returns a copy.

    Returns:
        The hydrogen-added molecule carrying a conformer, or ``None`` when no
        step produced one.
    """
    mol = Chem.AddHs(mol)
    optimization_methods = [
        (AllChem.EmbedMolecule, (mol, AllChem.ETKDGv3()), {}),
        (AllChem.UFFOptimizeMolecule, (mol,), {'maxIters': 200}),
        (AllChem.MMFFOptimizeMolecule, (mol,), {'maxIters': 200})
    ]

    for method, args, kwargs in optimization_methods:
        try:
            method(*args, **kwargs)
            if mol.GetNumConformers() > 0:
                return mol
        except ValueError as e:
            print(f"Error: {e} - Trying next optimization method [{method}]")

    # ANSI colour codes are kept in plain variables: a backslash inside an
    # f-string replacement field is a SyntaxError before Python 3.12.
    blue, reset = '\033[94m', '\033[0m'
    print(f"Invalid mol for 3d {blue}{Chem.MolToSmiles(mol)}{reset} - No conformer generated")
    return None

In [11]:
def _save_failed_mol_image(mol, fail_folder, index):
    """Best-effort: draw ``mol`` to ``<fail_folder>/mol_<index>.png`` for inspection."""
    if not fail_folder or index is None or mol is None:
        return
    try:
        Draw.MolToImage(mol).save(os.path.join(fail_folder, f"mol_{index}.png"))
    except Exception as draw_error:
        # Drawing is diagnostics only; it must never hide the original failure
        # or escape from the worker that is processing this molecule.
        print(f"[convert_smiles_to_mol] could not save image for mol_{index}: {draw_error}")


def convert_smiles_to_mol(smiles, fail_folder=None, index=None, yvalue=None):
    """Parse ``smiles`` into a sanitized RDKit molecule.

    The molecule is parsed, kekulized, written back out as an isomeric SMILES,
    re-parsed from that string and finally sanitized.

    Args:
        smiles: SMILES string to convert.
        fail_folder: optional directory that receives a PNG of molecules which
            fail after the initial parse (only used together with ``index``).
        index: optional row index used in the PNG file name.
        yvalue: target value, copied into the error record.

    Returns:
        ``(mol, None)`` on success, otherwise ``(None, error)`` where ``error``
        is a dict with the keys ``"smiles"``, ``"y_value"`` and ``"error"``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        print(f"[convert_smiles_to_mol] Cannot convert {smiles} to Mols")
        return None, {"smiles": smiles, "y_value": yvalue, "error": "Invalid SMILES"}

    try:
        Chem.Kekulize(mol, clearAromaticFlags=True)
        isomeric_smiles = Chem.MolToSmiles(mol, isomericSmiles=True)
        reparsed = Chem.MolFromSmiles(isomeric_smiles)
    except Exception as e:
        print(f"[convert_smiles_to_mol] failed {smiles} isomeric_smiles by {e}")
        _save_failed_mol_image(mol, fail_folder, index)
        return None, {"smiles": smiles, "y_value": yvalue, "error": f"Isomeric SMILES error: {e}"}

    if reparsed is None:
        # MolFromSmiles signals failure by returning None, not by raising, so
        # this case has to be checked explicitly before sanitizing.
        reason = f"cannot re-parse {isomeric_smiles}"
        print(f"[convert_smiles_to_mol] failed {smiles} isomeric_smiles by {reason}")
        _save_failed_mol_image(mol, fail_folder, index)
        return None, {"smiles": smiles, "y_value": yvalue, "error": f"Isomeric SMILES error: {reason}"}
    mol = reparsed

    try:
        Chem.SanitizeMol(mol)
    except Exception as e:
        print(f"[convert_smiles_to_mol] failed {smiles} SanitizeMol by {e}")
        _save_failed_mol_image(mol, fail_folder, index)
        return None, {"smiles": smiles, "y_value": yvalue, "error": f"SanitizeMol error: {e}"}

    return mol, None

In [12]:
def process_smiles(smiles, yvalue, fail_folder, index):
    """Validate one SMILES: it must parse and accept a 3D conformer.

    Args:
        smiles: SMILES string to check.
        yvalue: target value that belongs to ``smiles``.
        fail_folder: directory that receives a PNG of molecules that fail.
        index: row index used in the PNG file name.

    Returns:
        ``(smiles, yvalue, None)`` when the molecule is usable, otherwise
        ``(None, None, error)`` where ``error`` is a dict with the keys
        ``"smiles"``, ``"y_value"`` and ``"error"``.
    """
    mol, error = convert_smiles_to_mol(smiles, fail_folder, index, yvalue)
    if error:
        return None, None, error

    # Compare against None explicitly instead of relying on Mol truthiness.
    if mol3d(mol) is not None:
        return smiles, yvalue, None

    if fail_folder:
        try:
            img_path = os.path.join(fail_folder, f"mol_{index}.png")
            Draw.MolToImage(mol).save(img_path)
        except Exception as draw_error:
            # The image is diagnostics only; keep reporting the real failure.
            print(f"[process_smiles] could not save image for mol_{index}: {draw_error}")

    # Carry an "error" key like every other failure record so that
    # failed_smiles.csv has a reason in each row.
    return None, None, {
        "smiles": smiles,
        "y_value": yvalue,
        "error": "3D conformer generation failed",
    }

def process_dataset(smiles_list, y_values, dataset_name, target_path="result", max_workers=None):
    """Filter a dataset down to the molecules that pass ``process_smiles``.

    Every (SMILES, target) pair is checked in a process pool. Failure records
    are written to ``<target_path>/failed/<dataset_name>/failed_smiles.csv``.

    Args:
        smiles_list: iterable of SMILES strings.
        y_values: iterable of targets, paired with ``smiles_list`` by position.
        dataset_name: label used for the failure folder and the timing line.
        target_path: root directory for the failure folder.
        max_workers: forwarded to ``ProcessPoolExecutor``.

    Returns:
        ``(valid_smiles, valid_y)`` — two lists in the same relative order as
        the input.
    """
    start = time.time()
    valid_smiles, valid_y = [], []
    error_smiles_list = []
    fail_folder = f"{target_path}/failed/{dataset_name}"
    os.makedirs(fail_folder, exist_ok=True)

    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(process_smiles, smiles, yvalue, fail_folder, i)
            for i, (smiles, yvalue) in enumerate(zip(smiles_list, y_values))
        ]
        # Collect in submission order rather than completion order: the row
        # order must be deterministic, otherwise a later seeded
        # train_test_split yields a different split on every run.
        for future in futures:
            smiles, yvalue, error = future.result()
            if error:
                error_smiles_list.append(error)
            elif smiles is not None and yvalue is not None:
                valid_smiles.append(smiles)
                valid_y.append(yvalue)

    if error_smiles_list:
        error_df = pd.DataFrame(error_smiles_list)
        error_df.to_csv(os.path.join(fail_folder, "failed_smiles.csv"), index=False)
    print(f" [{dataset_name:<10}] : {time.time()-start:.4f} sec")
    return valid_smiles, valid_y

In [None]:
# Keep only the molecules that parse and accept a 3D conformer. Each pair of
# names is rebound from the loaded pandas columns to the plain lists returned
# by process_dataset; failures are logged under <target_path>/failed/<name>/.
smiles_ws, y_ws = process_dataset(smiles_ws, y_ws, "ws496", target_path)
smiles_de, y_de = process_dataset(smiles_de, y_de, "delaney", target_path)
smiles_lo, y_lo = process_dataset(smiles_lo, y_lo, "Lovric2020_logS0", target_path)
smiles_hu, y_hu = process_dataset(smiles_hu, y_hu, "huusk", target_path)

In [14]:
# Bit-vector lengths used when allocating the fingerprint arrays.
LEN_OF_FF = 2048  # Morgan (ECFP) fingerprint size
LEN_OF_MA = 167   # MACCS keys fingerprint size
LEN_OF_AV = 512   # Avalon fingerprint size

In [15]:
def get_fingerprints(mol):
    """Compute the Morgan, MACCS and Avalon fingerprints of one molecule.

    Args:
        mol: RDKit molecule, or ``None``.

    Returns:
        ``(ecfp_array, maccs, avalon_array)`` where ``ecfp_array`` and
        ``avalon_array`` are integer numpy arrays of length ``LEN_OF_FF`` and
        ``LEN_OF_AV`` and ``maccs`` is the unconverted RDKit bit vector.
        ``(None, None, None)`` when ``mol`` is ``None``.
    """
    if mol is None:
        return None, None, None

    morgan_generator = GetMorganGenerator(radius=2, fpSize=LEN_OF_FF)
    ecfp = morgan_generator.GetFingerprint(mol)
    ecfp_array = np.zeros((LEN_OF_FF,), dtype=int)
    DataStructs.ConvertToNumpyArray(ecfp, ecfp_array)

    # MACCS is returned as a bit vector; the caller converts it into its matrix.
    maccs = rdMolDescriptors.GetMACCSKeysFingerprint(mol)

    # Pass the size explicitly so LEN_OF_AV really controls the fingerprint
    # length instead of merely matching the library default.
    avalon_fp = GetAvalonFP(mol, nBits=LEN_OF_AV)
    avalon_array = np.zeros((LEN_OF_AV,), dtype=int)
    DataStructs.ConvertToNumpyArray(avalon_fp, avalon_array)

    return ecfp_array, maccs, avalon_array

def fp_converter(data, use_parallel=True):
    """Convert SMILES strings into molecules and three fingerprint matrices.

    Args:
        data: iterable of SMILES strings.
        use_parallel: compute fingerprints in a process pool; any failure of
            the pool falls back to a sequential loop.

    Returns:
        ``(mols, ECFP_container, MACCS_container, AvalonFP_container)``. Every
        matrix has exactly one row per input SMILES, in input order, so the
        rows stay aligned with ``mols`` and with each other. A SMILES that
        cannot be parsed leaves ``None`` in ``mols`` and an all-zero row in
        each matrix.
    """
    mols = [Chem.MolFromSmiles(smi) for smi in data]

    results = None
    if use_parallel:
        try:
            with ProcessPoolExecutor() as executor:
                results = list(executor.map(get_fingerprints, mols))
        except Exception as e:
            print(f"Parallel processing failed due to: {e}. Falling back to sequential processing.")

    if results is None:
        results = [get_fingerprints(mol) for mol in mols]

    # Pre-allocate all three matrices with one row per molecule. Dropping the
    # None rows from only some of them would misalign the rows, and building
    # from an empty result list would fail; this handles both cases.
    n_mols = len(mols)
    ECFP_container = np.zeros((n_mols, LEN_OF_FF), dtype=int)
    MACCS_container = np.zeros((n_mols, LEN_OF_MA), dtype=int)
    AvalonFP_container = np.zeros((n_mols, LEN_OF_AV), dtype=int)

    n_missing = 0
    for i, (ecfp, maccs, avalon) in enumerate(results):
        if ecfp is None or maccs is None or avalon is None:
            n_missing += 1
        if ecfp is not None:
            ECFP_container[i] = ecfp
        if maccs is not None:
            DataStructs.ConvertToNumpyArray(maccs, MACCS_container[i])
        if avalon is not None:
            AvalonFP_container[i] = avalon

    if n_missing:
        print(f"[fp_converter] {n_missing} molecule(s) have no fingerprint; their rows are all zeros")

    return mols, ECFP_container, MACCS_container, AvalonFP_container

In [16]:
# Build molecules and the three fingerprint matrices for each dataset.
# fp_converter's second parameter is `use_parallel`, not an output path, so
# `target_path` is no longer passed; the default (parallel) is what the
# truthy path string selected before.
mol_ws, x_ws, MACCS_ws, AvalonFP_ws = fp_converter(smiles_ws)
mol_de, x_de, MACCS_de, AvalonFP_de = fp_converter(smiles_de)
mol_lo, x_lo, MACCS_lo, AvalonFP_lo = fp_converter(smiles_lo)
mol_hu, x_hu, MACCS_hu, AvalonFP_hu = fp_converter(smiles_hu)

In [17]:
def concatenate_to_numpy(*dataframes):
    """Join DataFrames and/or numpy arrays column-wise into one numpy array.

    Args:
        *dataframes: pandas DataFrames (converted with ``to_numpy``) or numpy
            arrays, all with the same number of rows.

    Returns:
        The inputs concatenated along axis 1.

    Raises:
        ValueError: if any input is neither a DataFrame nor a numpy array.
    """
    blocks = []
    for item in dataframes:
        if isinstance(item, pd.DataFrame):
            item = item.to_numpy()
        blocks.append(item)

    for block in blocks:
        if not isinstance(block, np.ndarray):
            raise ValueError("All inputs must be either pandas DataFrame or numpy array")

    return np.concatenate(blocks, axis=1)

In [None]:
# Stack the Morgan, MACCS and Avalon matrices column-wise, one array per dataset.
group_nws = concatenate_to_numpy(x_ws, MACCS_ws, AvalonFP_ws)
group_nde = concatenate_to_numpy(x_de, MACCS_de, AvalonFP_de)
group_nlo = concatenate_to_numpy(x_lo, MACCS_lo, AvalonFP_lo)
group_nhu = concatenate_to_numpy(x_hu, MACCS_hu, AvalonFP_hu)
# Drop the per-fingerprint arrays now that they are merged. Because of these
# `del`s the cell cannot be re-run without first re-running the fingerprint cell.
del x_ws, MACCS_ws, AvalonFP_ws
del x_de, MACCS_de, AvalonFP_de
del x_lo, MACCS_lo, AvalonFP_lo
del x_hu, MACCS_hu, AvalonFP_hu
gc.collect()

In [19]:
import logging
import warnings

# TensorFlow / CUDA / XLA environment configuration.
# NOTE(review): TensorFlow is imported and the GPUs are configured in earlier
# cells; several of these variables are presumably only read when TensorFlow
# or CUDA initialises, so in this kernel they may have no effect. They are
# inherited by child processes started later via subprocess — confirm whether
# that is the intent or whether the cell should run before `import tensorflow`.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
os.environ['TF_GPU_ALLOCATOR'] = 'cuda_malloc_async'
os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
os.environ['TF_XLA_FLAGS'] = '--tf_xla_auto_jit=2 --tf_xla_enable_xla_devices'
os.environ['XLA_FLAGS'] = '--xla_gpu_cuda_data_dir=/usr/local/cuda --xla_gpu_force_compilation_parallelism=1'
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
os.environ['TF_NUMA_NODES'] = '1'

# Silence every Python warning for the rest of the session; this also hides
# deprecation notices from the libraries used here.
warnings.filterwarnings('ignore')

warnings.simplefilter(action='ignore', category=FutureWarning)

# Only show errors from the TensorFlow Python logger.
logging.getLogger('tensorflow').setLevel(logging.ERROR)

tf.get_logger().setLevel('ERROR')
tf.autograph.set_verbosity(0)

def suppress_warnings(condition=True):
    """Switch TensorFlow log verbosity between quiet and verbose.

    Args:
        condition: when truthy, the ``tensorflow`` logger is set to ERROR and
            ``TF_CPP_MIN_LOG_LEVEL`` to ``'3'``; otherwise the logger is set to
            WARNING and the variable to ``'0'``.
    """
    python_level, cpp_level = (
        (logging.ERROR, '3') if condition else (logging.WARNING, '0')
    )
    logging.getLogger('tensorflow').setLevel(python_level)
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = cpp_level


# Start the session in quiet mode.
suppress_warnings(condition=True)

In [20]:
# Training hyperparameters. BATCHSIZE, EPOCHS and lr are passed to the training
# subprocess on its command line; lr and decay are also read by
# new_inference_model when it builds and compiles the network.
BATCHSIZE = 32
EPOCHS = 1000
lr = 0.001       # Adam learning rate
decay = 0.00001  # L2 kernel-regularisation factor of the hidden Dense layers

In [21]:

# Colab
# def new_model():
#     model = tf.keras.Sequential([
#         tf.keras.layers.Dense(
#             units=1024,
#             activation='relu',
#             kernel_initializer='glorot_uniform',
#             kernel_regularizer=regularizers.l2(decay)),
#         tf.keras.layers.Dropout(0.2),
#         tf.keras.layers.Dense(
#             units=469,
#             activation='relu',
#             kernel_initializer='glorot_uniform',
#             kernel_regularizer=regularizers.l2(decay)),
#         tf.keras.layers.Dropout(0.2),
#         tf.keras.layers.Dense(units=1)
#         ])
#     model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
#                       loss=tf.keras.losses.MeanSquaredError(),
#                       metrics=[tf.keras.losses.MeanSquaredError(),
#                                tf.keras.losses.MeanAbsoluteError(),
#                                tf.keras.metrics.RootMeanSquaredError()])
#     return model

def new_inference_model(input_dim):
    """Build and compile the fully connected regression network.

    Architecture: input -> Dense(1024, relu) -> Dropout(0.2) ->
    Dense(469, relu) -> Dropout(0.2) -> Dense(1). Both hidden layers use
    Glorot-uniform initialisation and an L2 kernel regulariser whose factor is
    the notebook-level ``decay``; the optimizer is Adam with the notebook-level
    ``lr``. Both globals are read when the function is called.

    Args:
        input_dim: number of input features.

    Returns:
        The compiled ``tf.keras.Sequential`` model (MSE loss; MSE, MAE and RMSE
        are tracked as metrics).
    """
    def _hidden_block(units):
        # One regularised ReLU layer; layers are created in stack order so the
        # auto-generated layer names stay the same.
        return layers.Dense(
            units=units,
            activation='relu',
            kernel_initializer='glorot_uniform',
            kernel_regularizer=regularizers.l2(decay))

    stack = [layers.Input(shape=(input_dim,))]
    stack.append(_hidden_block(1024))
    stack.append(layers.Dropout(0.2))
    stack.append(_hidden_block(469))
    stack.append(layers.Dropout(0.2))
    stack.append(layers.Dense(units=1))
    network = tf.keras.Sequential(stack)

    tracked_metrics = [
        tf.keras.losses.MeanSquaredError(),
        tf.keras.losses.MeanAbsoluteError(),
        tf.keras.metrics.RootMeanSquaredError(),
    ]
    network.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
        loss=tf.keras.losses.MeanSquaredError(),
        metrics=tracked_metrics)
    return network

def save_model(x_data, model_path="save_model/full_model.keras"):
    """Write a freshly initialised model sized for ``x_data`` to ``model_path``.

    An existing file at ``model_path`` is removed first, so every call leaves a
    new untrained model on disk whose input width is ``x_data.shape[1]``.

    Args:
        x_data: 2-D feature matrix; only ``x_data.shape[1]`` is used.
        model_path: destination file; the default is the path used so far.

    Failures while building or saving the model are printed, not raised.
    """
    if os.path.exists(model_path):
        # Overwrite in a straight line instead of deleting and recursing.
        print(f"Model already exists at {model_path}")
        os.remove(model_path)

    try:
        model = new_inference_model(x_data.shape[1])
        model_dir = os.path.dirname(model_path)
        if model_dir:  # os.makedirs('') raises, so skip it for a bare file name
            os.makedirs(model_dir, exist_ok=True)
        model.save(model_path)
        print(f"Model successfully saved to {model_path}")
    except Exception as e:
        print(f"Error saving model: {e}")

In [22]:
# # Colab
# def preprocess_data(xtr, ytr):
#     dataset = tf.data.Dataset.from_tensor_slices((xtr, ytr))
#     dataset = dataset.shuffle(buffer_size=len(xtr)).batch(BATCHSIZE).prefetch(tf.data.AUTOTUNE)
#     return dataset

# cb = tf.keras.callbacks.EarlyStopping(
#         monitor='loss',  
#         patience=10,
#         restore_best_weights=True,
#         # min_delta=0.001,
#         mode='min',
#         verbose=1
#     )

In [23]:
# # Colab
# def objective_ws_fea(trial):
#     r2_result = 0.0
#     new_x = search_data_descriptor_compress(trial, group_nws, mol_ws, 'ws496')
#     new_x = np.nan_to_num(new_x, nan=0.0, posinf=0.0, neginf=0.0).astype('float')
#     y_true = np.asarray(y_ws).astype('float')
#     model = new_model()
#     xtr, xte, ytr, yte = train_test_split(new_x, y_true, test_size=0.2, random_state=42)
#     train_dataset = preprocess_data(xtr, ytr)
#     model.fit(train_dataset, epochs=EPOCHS, callbacks=[cb,TFKerasPruningCallback(trial,'loss')], verbose=0)
#     ypred = model.predict(xte, verbose=0)
#     try:
#         r2_result = r2_score(yte, ypred)
#         print(f"r2 score: {r2_result:.4f}")
#     except Exception as e:
#         print(f"Error occured: {e}")
#     return r2_result
    

In [24]:
# # Colab
# def objective_de_fea(trial):
#     r2_result = 0.0
#     new_x = search_data_descriptor_compress(trial, group_nde, mol_de, 'delaney')
#     new_x = np.nan_to_num(new_x, nan=0.0, posinf=0.0, neginf=0.0).astype('float')
#     y_true = np.asarray(y_de).astype('float')
#     model = new_model()
#     xtr, xte, ytr, yte = train_test_split(new_x, y_true, test_size=0.2, random_state=42)
#     train_dataset = preprocess_data(xtr, ytr)
#     model.fit(train_dataset, epochs=EPOCHS, callbacks=[cb,TFKerasPruningCallback(trial,'loss')], verbose=0)
#     ypred = model.predict(xte, verbose=0)
#     try:
#         r2_result = r2_score(yte, ypred)
#         print(f"r2 score: {r2_result:.4f}")
#     except Exception as e:
#         print(f"Error occured: {e}")
#     return r2_result
    

In [25]:
# # Colab
# def objective_lo_fea(trial):
#     r2_result = 0.0
#     new_x = search_data_descriptor_compress(trial, group_nlo, mol_lo, 'lovric')
#     new_x = np.nan_to_num(new_x, nan=0.0, posinf=0.0, neginf=0.0).astype('float')
#     y_true = np.asarray(y_lo).astype('float')
#     model = new_model()
#     xtr, xte, ytr, yte = train_test_split(new_x, y_true, test_size=0.2, random_state=42)
#     train_dataset = preprocess_data(xtr, ytr)
#     model.fit(train_dataset, epochs=EPOCHS, callbacks=[cb,TFKerasPruningCallback(trial,'loss')], verbose=0)
#     ypred = model.predict(xte, verbose=0)
#     try:
#         r2_result = r2_score(yte, ypred)
#         print(f"r2 score: {r2_result:.4f}")
#     except Exception as e:
#         print(f"Error occured: {e}")
#     return r2_result
    

In [26]:
# # Colab
# def objective_hu_fea(trial):
#     r2_result = 0.0
#     new_x = search_data_descriptor_compress(trial, group_nhu, mol_hu, 'huusk')
#     new_x = np.nan_to_num(new_x, nan=0.0, posinf=0.0, neginf=0.0).astype('float')
#     y_true = np.asarray(y_hu).astype('float')
#     model = new_model()
#     xtr, xte, ytr, yte = train_test_split(new_x, y_true, test_size=0.2, random_state=42)
#     train_dataset = preprocess_data(xtr, ytr)
#     model.fit(train_dataset, epochs=EPOCHS, callbacks=[cb,TFKerasPruningCallback(trial,'loss')], verbose=0)
#     ypred = model.predict(xte, verbose=0)
#     try:
#         r2_result = r2_score(yte, ypred)
#         print(f"r2 score: {r2_result:.4f}")
#     except Exception as e:
#         print(f"Error occured: {e}")
#     return r2_result
    

In [27]:
def objective_ws_fea(trial):
    """Optuna objective for the ws496 dataset.

    Builds a candidate feature matrix with ``search_data_descriptor_compress``,
    writes it and the targets to ``new_fps.npy`` / ``y_true.npy``, saves a fresh
    model with ``save_model`` and runs ``./extra_code/learning_process.py`` in a
    subprocess. The score is parsed from the subprocess's stdout: a line
    containing ``R2`` carries the score after its first ``:``, and a line that
    also contains ``(prune)`` requests pruning.

    Args:
        trial: the Optuna trial being evaluated.

    Returns:
        float: the last R2 parsed from stdout; 0.0 when no R2 line was printed
        or any other error occurred.

    Raises:
        optuna.exceptions.TrialPruned: when the subprocess asks for pruning or
            the study's pruner decides to stop the trial.
    """
    # Defined up front so the return below is always bound, even when the
    # subprocess prints no R2 line.
    r2_result = 0.0
    try:
        new_x = search_data_descriptor_compress(trial, group_nws, mol_ws, 'ws496')
        new_x = np.nan_to_num(new_x, nan=0.0, posinf=0.0, neginf=0.0).astype('float')
        y_true = np.asarray(y_ws).astype('float')
        np.save('new_fps.npy', new_x)
        np.save('y_true.npy', y_true)

        save_model(new_x)

        result = subprocess.run(['python3', './extra_code/learning_process.py',
                                 str(BATCHSIZE), str(EPOCHS),
                                 str(lr),
                                 'new_fps.npy', 'y_true.npy'],
                                stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

        if result.stderr:
            # Known NUMA notices are dropped from stderr before it is echoed.
            ignorable = ("could not open file to read NUMA node",
                         "Your kernel may have been built without NUMA support")
            filtered_stderr = '\n'.join(
                line for line in result.stderr.split('\n')
                if not any(marker in line for marker in ignorable)
            )
            if filtered_stderr:
                print(f"Error in subprocess: {filtered_stderr}", file=sys.stderr)

        for line in result.stdout.splitlines():
            if "R2" in line:
                if "(prune)" in line:
                    print(f"Pruning trial due to poor R2: {line}")
                    r2_result = 0.0
                    trial.report(r2_result, step=0)
                    raise optuna.exceptions.TrialPruned()
                else:
                    r2_result = float(line.split(":")[1].strip())
                    print(f"R2 score: {r2_result}")
                    trial.report(r2_result, step=0)

                    if trial.should_prune():
                        raise optuna.exceptions.TrialPruned()

    except optuna.exceptions.TrialPruned:
        # TrialPruned derives from Exception: re-raise it before the generic
        # handler, otherwise the trial is stored as COMPLETE with value 0.0
        # and is never counted as PRUNED.
        raise
    except Exception as e:
        print(f"Exception occurred: {e}", file=sys.stderr)
        r2_result = 0.0
    finally:
        gc.collect()

    return r2_result

In [28]:
def objective_de_fea(trial):
    """Optuna objective for the Delaney dataset.

    Builds a candidate feature matrix with ``search_data_descriptor_compress``,
    writes it and the targets to ``new_fps.npy`` / ``y_true.npy``, saves a fresh
    model with ``save_model`` and runs ``./extra_code/learning_process.py`` in a
    subprocess. The score is parsed from the subprocess's stdout: a line
    containing ``R2`` carries the score after its first ``:``, and a line that
    also contains ``(prune)`` requests pruning.

    Args:
        trial: the Optuna trial being evaluated.

    Returns:
        float: the last R2 parsed from stdout; 0.0 when no R2 line was printed
        or any other error occurred.

    Raises:
        optuna.exceptions.TrialPruned: when the subprocess asks for pruning or
            the study's pruner decides to stop the trial.
    """
    # Fallback score, bound before anything can fail.
    r2_result = 0.0
    try:
        new_x = search_data_descriptor_compress(trial, group_nde, mol_de, 'delaney')
        new_x = np.nan_to_num(new_x, nan=0.0, posinf=0.0, neginf=0.0).astype('float')
        y_true = np.asarray(y_de).astype('float')
        np.save('new_fps.npy', new_x)
        np.save('y_true.npy', y_true)

        save_model(new_x)

        result = subprocess.run(['python3', './extra_code/learning_process.py',
                                 str(BATCHSIZE), str(EPOCHS),
                                 str(lr),
                                 'new_fps.npy', 'y_true.npy'],
                                stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

        if result.stderr:
            # Known NUMA notices are dropped from stderr before it is echoed.
            ignorable = ("could not open file to read NUMA node",
                         "Your kernel may have been built without NUMA support")
            filtered_stderr = '\n'.join(
                line for line in result.stderr.split('\n')
                if not any(marker in line for marker in ignorable)
            )
            if filtered_stderr:
                print(f"Error in subprocess: {filtered_stderr}", file=sys.stderr)

        for line in result.stdout.splitlines():
            if "R2" in line:
                if "(prune)" in line:
                    print(f"Pruning trial due to poor R2: {line}")
                    r2_result = 0.0
                    trial.report(r2_result, step=0)
                    raise optuna.exceptions.TrialPruned()
                else:
                    r2_result = float(line.split(":")[1].strip())
                    print(f"R2 score: {r2_result}")
                    trial.report(r2_result, step=0)

                    if trial.should_prune():
                        raise optuna.exceptions.TrialPruned()

    except optuna.exceptions.TrialPruned:
        # TrialPruned derives from Exception: re-raise it before the generic
        # handler, otherwise the trial is stored as COMPLETE with value 0.0
        # and is never counted as PRUNED.
        raise
    except Exception as e:
        print(f"Exception occurred: {e}", file=sys.stderr)
        r2_result = 0.0
    finally:
        gc.collect()

    return r2_result

In [29]:
def objective_lo_fea(trial):
    """Optuna objective for the Lovric 2020 dataset.

    Builds a candidate feature matrix with ``search_data_descriptor_compress``,
    writes it and the targets to ``new_fps.npy`` / ``y_true.npy``, saves a fresh
    model with ``save_model`` and runs ``./extra_code/learning_process.py`` in a
    subprocess. The score is parsed from the subprocess's stdout: a line
    containing ``R2`` carries the score after its first ``:``, and a line that
    also contains ``(prune)`` requests pruning.

    Args:
        trial: the Optuna trial being evaluated.

    Returns:
        float: the last R2 parsed from stdout; 0.0 when no R2 line was printed
        or any other error occurred.

    Raises:
        optuna.exceptions.TrialPruned: when the subprocess asks for pruning or
            the study's pruner decides to stop the trial.
    """
    # Fallback score, bound before anything can fail.
    r2_result = 0.0
    try:
        # NOTE(review): the commented-out Colab variant of this objective passes
        # 'lovric' here; confirm which label search_data_descriptor_compress expects.
        new_x = search_data_descriptor_compress(trial, group_nlo, mol_lo, 'lovrics')
        new_x = np.nan_to_num(new_x, nan=0.0, posinf=0.0, neginf=0.0).astype('float')
        y_true = np.asarray(y_lo).astype('float')
        np.save('new_fps.npy', new_x)
        np.save('y_true.npy', y_true)

        save_model(new_x)

        result = subprocess.run(['python3', './extra_code/learning_process.py',
                                 str(BATCHSIZE), str(EPOCHS),
                                 str(lr),
                                 'new_fps.npy', 'y_true.npy'],
                                stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

        if result.stderr:
            # Known NUMA notices are dropped from stderr before it is echoed.
            ignorable = ("could not open file to read NUMA node",
                         "Your kernel may have been built without NUMA support")
            filtered_stderr = '\n'.join(
                line for line in result.stderr.split('\n')
                if not any(marker in line for marker in ignorable)
            )
            if filtered_stderr:
                print(f"Error in subprocess: {filtered_stderr}", file=sys.stderr)

        for line in result.stdout.splitlines():
            if "R2" in line:
                if "(prune)" in line:
                    print(f"Pruning trial due to poor R2: {line}")
                    r2_result = 0.0
                    trial.report(r2_result, step=0)
                    raise optuna.exceptions.TrialPruned()
                else:
                    r2_result = float(line.split(":")[1].strip())
                    print(f"R2 score: {r2_result}")
                    trial.report(r2_result, step=0)

                    if trial.should_prune():
                        raise optuna.exceptions.TrialPruned()

    except optuna.exceptions.TrialPruned:
        # TrialPruned derives from Exception: re-raise it before the generic
        # handler, otherwise the trial is stored as COMPLETE with value 0.0
        # and is never counted as PRUNED.
        raise
    except Exception as e:
        print(f"Exception occurred: {e}", file=sys.stderr)
        r2_result = 0.0
    finally:
        gc.collect()

    return r2_result

In [30]:
def objective_hu_fea(trial):
    """Optuna objective for the Huuskonen dataset.

    Builds a candidate feature matrix with ``search_data_descriptor_compress``,
    writes it and the targets to ``new_fps.npy`` / ``y_true.npy``, saves a fresh
    model with ``save_model`` and runs ``./extra_code/learning_process.py`` in a
    subprocess. The score is parsed from the subprocess's stdout: a line
    containing ``R2`` carries the score after its first ``:``, and a line that
    also contains ``(prune)`` requests pruning.

    Args:
        trial: the Optuna trial being evaluated.

    Returns:
        float: the last R2 parsed from stdout; 0.0 when no R2 line was printed
        or any other error occurred.

    Raises:
        optuna.exceptions.TrialPruned: when the subprocess asks for pruning or
            the study's pruner decides to stop the trial.
    """
    # Fallback score, bound before anything can fail.
    r2_result = 0.0
    try:
        # NOTE(review): the commented-out Colab variant of this objective passes
        # 'huusk' here; confirm which label search_data_descriptor_compress expects.
        new_x = search_data_descriptor_compress(trial, group_nhu, mol_hu, 'huusken')
        new_x = np.nan_to_num(new_x, nan=0.0, posinf=0.0, neginf=0.0).astype('float')
        y_true = np.asarray(y_hu).astype('float')
        np.save('new_fps.npy', new_x)
        np.save('y_true.npy', y_true)
        save_model(new_x)

        result = subprocess.run(['python3', './extra_code/learning_process.py',
                                 str(BATCHSIZE), str(EPOCHS),
                                 str(lr),
                                 'new_fps.npy', 'y_true.npy'],
                                stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

        if result.stderr:
            # Known NUMA notices are dropped from stderr before it is echoed.
            ignorable = ("could not open file to read NUMA node",
                         "Your kernel may have been built without NUMA support")
            filtered_stderr = '\n'.join(
                line for line in result.stderr.split('\n')
                if not any(marker in line for marker in ignorable)
            )
            if filtered_stderr:
                print(f"Error in subprocess: {filtered_stderr}", file=sys.stderr)

        for line in result.stdout.splitlines():
            if "R2" in line:
                if "(prune)" in line:
                    print(f"Pruning trial due to poor R2: {line}")
                    r2_result = 0.0
                    trial.report(r2_result, step=0)
                    raise optuna.exceptions.TrialPruned()
                else:
                    r2_result = float(line.split(":")[1].strip())
                    print(f"R2 score: {r2_result}")
                    trial.report(r2_result, step=0)

                    if trial.should_prune():
                        raise optuna.exceptions.TrialPruned()

    except optuna.exceptions.TrialPruned:
        # TrialPruned derives from Exception: re-raise it before the generic
        # handler, otherwise the trial is stored as COMPLETE with value 0.0
        # and is never counted as PRUNED.
        raise
    except Exception as e:
        print(f"Exception occurred: {e}", file=sys.stderr)
        r2_result = 0.0
    finally:
        gc.collect()

    return r2_result

In [31]:
# Optuna studies are persisted in a local SQLite file; the timeout is passed
# through to the database connection via `connect_args`.
storage = optuna.storages.RDBStorage(url="sqlite:///ano_analysis.db", engine_kwargs={"connect_args": {"timeout": 10000}})
# storage_urls = "postgresql+psycopg2://postgres:{pwd}@localhost:{num}"
# storage = optuna.storages.RDBStorage(url=storage_urls)

In [32]:
# Number of Optuna trials requested by the study cells that follow.
TRIALS=100

In [33]:
# try:
#     optuna.delete_study(study_name="ANO_ws_feature", storage=storage)
#     optuna.delete_study(study_name="ANO_de_feature", storage=storage)
#     optuna.delete_study(study_name="ANO_lo_feature", storage=storage)
#     optuna.delete_study(study_name="ANO_hu_feature", storage=storage)
# except:
#     pass

# try:
#     optuna.delete_study(study_name="ANO_ws_feature_fixed", storage=storage)
#     optuna.delete_study(study_name="ANO_de_feature_fixed", storage=storage)
#     optuna.delete_study(study_name="ANO_lo_feature_fixed", storage=storage)
#     optuna.delete_study(study_name="ANO_hu_feature_fixed", storage=storage)
# except:
#     pass

In [None]:
# Feature-search study for ws496. `load_if_exists=True` resumes the study stored
# under this name, so re-running the cell adds TRIALS more trials to it.
# NOTE(review): the objective reports its score only at step=0 — confirm that
# this pruner's min_resource/max_resource settings can ever trigger a prune.
# study_ws_fea = optuna.create_study(study_name='ANO_ws_feature', storage=storage, direction="maximize", pruner=optuna.pruners.SuccessiveHalvingPruner(reduction_factor=64, min_early_stopping_rate=10),load_if_exists=True)     
study_ws_fea = optuna.create_study(study_name='ANO_ws_feature', storage=storage, direction="maximize", pruner=optuna.pruners.HyperbandPruner(min_resource=100,max_resource=1000,reduction_factor=3), load_if_exists=True)
study_ws_fea.optimize(objective_ws_fea, n_trials=TRIALS)
# Snapshot the pruned and completed trials for the statistics cell.
pruned_trials_ws_fea = study_ws_fea.get_trials(deepcopy=False, states=[TrialState.PRUNED])
complete_trials_ws_fea = study_ws_fea.get_trials(deepcopy=False, states=[TrialState.COMPLETE])
# 124m 43.4s - Trial 100

In [None]:
# Feature-search study for Delaney. `load_if_exists=True` resumes the study
# stored under this name, so re-running the cell adds TRIALS more trials to it.
# NOTE(review): the objective reports its score only at step=0 — confirm that
# this pruner configuration can ever trigger a prune.
study_de_fea = optuna.create_study(study_name='ANO_de_feature', storage=storage, direction="maximize", pruner=optuna.pruners.SuccessiveHalvingPruner(reduction_factor=64, min_early_stopping_rate=10),load_if_exists=True)     
# study_de_fea = optuna.create_study(study_name='ANO_de_feature', storage=storage, direction="maximize", pruner=optuna.pruners.HyperbandPruner(min_resource=100,max_resource=1000,reduction_factor=3), load_if_exists=True)
study_de_fea.optimize(objective_de_fea, n_trials=TRIALS)
# Snapshot the pruned and completed trials for the statistics cell.
pruned_trials_de_fea = study_de_fea.get_trials(deepcopy=False, states=[TrialState.PRUNED])
complete_trials_de_fea = study_de_fea.get_trials(deepcopy=False, states=[TrialState.COMPLETE])
# 148m 31.6s - Trial 100

In [37]:
# Raise the trial budget for the study cells that follow this point.
TRIALS=150

In [None]:
# Feature-search study for Lovric 2020. `load_if_exists=True` resumes the study
# stored under this name, so re-running the cell adds TRIALS more trials to it.
# NOTE(review): the objective reports its score only at step=0 — confirm that
# this pruner configuration can ever trigger a prune.
study_lo_fea = optuna.create_study(study_name='ANO_lo_feature', storage=storage, direction="maximize", pruner=optuna.pruners.SuccessiveHalvingPruner(reduction_factor=64, min_early_stopping_rate=10),load_if_exists=True)     
# study_lo_fea = optuna.create_study(study_name='ANO_lo_feature', storage=storage, direction="maximize", pruner=optuna.pruners.HyperbandPruner(min_resource=100,max_resource=1000,reduction_factor=3), load_if_exists=True)
study_lo_fea.optimize(objective_lo_fea, n_trials=TRIALS)
# Snapshot the pruned and completed trials for the statistics cell.
pruned_trials_lo_fea = study_lo_fea.get_trials(deepcopy=False, states=[TrialState.PRUNED])
complete_trials_lo_fea = study_lo_fea.get_trials(deepcopy=False, states=[TrialState.COMPLETE])
# 137m 25.3s - Trial 100

In [None]:
# Feature-search study for Huuskonen. `load_if_exists=True` resumes the study
# stored under this name, so re-running the cell adds TRIALS more trials to it.
# NOTE(review): the objective reports its score only at step=0 — confirm that
# this pruner configuration can ever trigger a prune.
study_hu_fea = optuna.create_study(study_name='ANO_hu_feature', storage=storage, direction="maximize", pruner=optuna.pruners.SuccessiveHalvingPruner(reduction_factor=64, min_early_stopping_rate=10),load_if_exists=True)     
# study_hu_fea = optuna.create_study(study_name='ANO_hu_feature', storage=storage, direction="maximize", pruner=optuna.pruners.HyperbandPruner(min_resource=100,max_resource=1000,reduction_factor=3), load_if_exists=True)
study_hu_fea.optimize(objective_hu_fea, n_trials=TRIALS)
# Snapshot the pruned and completed trials for the statistics cell.
pruned_trials_hu_fea = study_hu_fea.get_trials(deepcopy=False, states=[TrialState.PRUNED])
complete_trials_hu_fea = study_hu_fea.get_trials(deepcopy=False, states=[TrialState.COMPLETE])
# 308m 15.1s - Trial 100

In [None]:
# Summary of the ws496 study: trial counts, then the best trial's value and
# the parameters that produced it.
print("Study statistics: [ws_feature] ")
print("  Number of finished trials: ", len(study_ws_fea.trials))
print("  Number of pruned trials: ", len(pruned_trials_ws_fea))
print("  Number of complete trials: ", len(complete_trials_ws_fea))
print("Best trial:")
trial_ws_fea = study_ws_fea.best_trial
print("  Value: ", trial_ws_fea.value)
print("  Params: ")
for key, value in trial_ws_fea.params.items():
    print("    {}: {}".format(key, value))

In [None]:
# Summary of the Delaney study: trial counts, then the best trial's value and
# the parameters that produced it.
print("Study statistics: [de_feature] ")
print("  Number of finished trials: ", len(study_de_fea.trials))
print("  Number of pruned trials: ", len(pruned_trials_de_fea))
print("  Number of complete trials: ", len(complete_trials_de_fea))
print("Best trial:")
trial_de_fea = study_de_fea.best_trial
print("  Value: ", trial_de_fea.value)
print("  Params: ")
for key, value in trial_de_fea.params.items():
    print("    {}: {}".format(key, value))

In [None]:
# Summary of the Lovric 2020 study: trial counts, then the best trial's value
# and the parameters that produced it.
print("Study statistics: [lo_feature] ")
print("  Number of finished trials: ", len(study_lo_fea.trials))
print("  Number of pruned trials: ", len(pruned_trials_lo_fea))
print("  Number of complete trials: ", len(complete_trials_lo_fea))
print("Best trial:")
trial_lo_fea = study_lo_fea.best_trial
print("  Value: ", trial_lo_fea.value)
print("  Params: ")
for key, value in trial_lo_fea.params.items():
    print("    {}: {}".format(key, value))

In [None]:
# Summary of the Huuskonen study: total trial count, then the best trial's
# value and parameters. The pruned/complete counts are commented out here,
# unlike in the other three summary cells.
print("Study statistics: [hu_feature] ")
print("  Number of finished trials: ", len(study_hu_fea.trials))
# print("  Number of pruned trials: ", len(pruned_trials_hu_fea))
# print("  Number of complete trials: ", len(complete_trials_hu_fea))
print("Best trial:")
trial_hu_fea = study_hu_fea.best_trial
print("  Value: ", trial_hu_fea.value)
print("  Params: ")
for key, value in trial_hu_fea.params.items():
    print("    {}: {}".format(key, value))