In [14]:
import pandas as pd
import numpy as np
from optuna.visualization import (
    plot_param_importances,
    plot_optimization_history,
    plot_parallel_coordinate,
    plot_slice,
    plot_intermediate_values,
    plot_edf,
    plot_contour,
)
import plotly.io as pio
import optuna
from optuna.importance import get_param_importances
from pathlib import Path as pt
from loguru import logger

In [6]:
# SQLite database file that holds the Optuna studies inspected in this notebook.
optuna_storage_file = pt(
    '/Users/aravindhnivas/Library/CloudStorage/OneDrive-MassachusettsInstituteofTechnology/ML properties/Analysed/[PHYSICAL CONSTANTS OF ORGANIC COMPOUNDS]/tmp_C_processed_data/optuna/optuna_tmp_C.db'
)
# Storage URL handed to Optuna: the "sqlite:///" scheme prefix followed by the file path.
storage = 'sqlite:///' + str(optuna_storage_file)
# Cell output: whether the database file is present, plus the resulting URL.
(optuna_storage_file.exists(), storage)

(True,
 'sqlite:////Users/aravindhnivas/Library/CloudStorage/OneDrive-MassachusettsInstituteofTechnology/ML properties/Analysed/[PHYSICAL CONSTANTS OF ORGANIC COMPOUNDS]/tmp_C_processed_data/optuna/optuna_tmp_C.db')

In [11]:
# Fetch the summaries of every study stored in the database and print one
# study name per line.
existing_studies = optuna.study.get_all_study_summaries(storage=storage)
study_names = [summary.study_name for summary in existing_studies]
for name in study_names:
    print(name)

tmpC_topelements_lgbm_mol2vec_embeddings_pretrained_model
tmpC_topelements_lgbm_mol2vec_embeddings_pretrained_model_1


In [13]:
# Open one named study from the storage; the rest of the notebook analyses
# this `study` object.
study_name = 'tmpC_topelements_lgbm_mol2vec_embeddings_pretrained_model'
study = optuna.load_study(storage=storage, study_name=study_name)

# Cell output: the parameter set of the best trial.
study.best_params

{'n_estimators': 120,
 'learning_rate': 0.04007607434405443,
 'subsample': 0.5136060355997997,
 'colsample_bytree': 0.6623465944665253,
 'num_leaves': 255,
 'min_child_samples': 97}

In [15]:
# Two importance estimates for the same study: the library default
# (the "fanova" method) and the mean-decrease-impurity (MDI) evaluator.
importances_fanova = get_param_importances(study)
mdi_evaluator = optuna.importance.MeanDecreaseImpurityImportanceEvaluator()
importances_mdi = get_param_importances(study, evaluator=mdi_evaluator)

# Log each parameter with its score, one record per parameter.
logger.info("Importances from get_param_importances (fanova):")
for param in importances_fanova:
    importance = importances_fanova[param]
    logger.info(f"{param}: {importance}")

logger.info("\nImportances from get_param_importances (MDI):")
for param in importances_mdi:
    importance = importances_mdi[param]
    logger.info(f"{param}: {importance}")

# Combine both importance estimates into one table, one row per parameter.
# Rows follow the fANOVA parameter list; a parameter absent from the MDI result
# gets 0. The dict views are materialised with list() so every column is a
# plain, equally long sequence.
parameter_names = list(importances_fanova.keys())
df_importance = pd.DataFrame(
    {
        "Parameter": parameter_names,
        "Importance (fanova)": list(importances_fanova.values()),
        "Importance (MDI)": [
            importances_mdi.get(param, 0) for param in parameter_names
        ],
    }
).sort_values("Importance (fanova)", ascending=False)

pre_trained_loc = pt('/Users/aravindhnivas/Documents/test-codes/umda')
# Take the file prefix from the loaded study instead of repeating the name as a
# second string literal, so the CSV name always matches the study being analysed.
grid_search_name = study.study_name

# Save the hyperparameter importance to a CSV file
savefile = pre_trained_loc / f"{grid_search_name}.hyperparameter_importance.csv"
logger.info(f"Saving importance to {savefile.name}")
df_importance.to_csv(savefile, index=False)
logger.success(f"hyperparameter_importance saved to {savefile.name}")

# Folder that receives every exported Optuna figure.
# exist_ok=True makes the call idempotent, so re-running the cell is safe and
# no separate exists() check (with its check-then-create race) is needed.
optuna_figures_folder = pre_trained_loc / "optuna_figures"
optuna_figures_folder.mkdir(parents=True, exist_ok=True)

def save_figure(fig, filename, formats=("html", "png", "svg")):
    """Write one figure into ``optuna_figures_folder`` once per requested format.

    Args:
        fig: Figure object passed unchanged to ``pio.write_html`` or
            ``pio.write_image``.
        filename: Base file name without extension; ``.<fmt>`` is appended for
            each entry of ``formats``.
        formats: Iterable of file extensions. ``"html"`` is written with
            ``pio.write_html``; every other entry is written with
            ``pio.write_image``. The default is a tuple rather than a list so
            the shared default value cannot be mutated between calls.

    Returns:
        None. Each written path is logged at INFO level.

    Exceptions raised by the Plotly writers are not caught here.
    """
    for fmt in formats:
        full_filename = optuna_figures_folder / f"{filename}.{fmt}"
        if fmt == "html":
            pio.write_html(fig, file=full_filename)
        else:
            # No explicit format is passed; the target file name carries the
            # extension for the image type.
            pio.write_image(fig, file=full_filename)
        logger.info(f"Saved: {full_filename}")

# Build each Optuna visualisation for `study` and export it immediately with
# save_figure, so a figure is already on disk before the next one is built.

# Plot 1: hyperparameter importances
fig = plot_param_importances(study)
save_figure(fig=fig, filename="hyperparameter_importance")

# Plot 2: optimization history
fig_history = plot_optimization_history(study)
save_figure(fig=fig_history, filename="optimization_history")

# Plot 3: parallel coordinate
fig_parallel = plot_parallel_coordinate(study)
save_figure(fig=fig_parallel, filename="parallel_coordinate")

# Plot 4: slice plot
fig_slice = plot_slice(study)
save_figure(fig=fig_slice, filename="slice_plot")

# Plot 5: intermediate values
fig_intermediate = plot_intermediate_values(study)
save_figure(fig=fig_intermediate, filename="intermediate_values")

# Plot 6: empirical distribution function (EDF)
fig_edf = plot_edf(study)
save_figure(fig=fig_edf, filename="empirical_distribution")

# Plot 7: contour plot. This is the only step whose failure (while building
# or saving) is logged instead of aborting the cell.
try:
    fig_contour = plot_contour(study)
    save_figure(fig=fig_contour, filename="contour_plot")
except Exception as err:
    logger.error(f"Could not generate contour plot: {str(err)}")

logger.success("All figures have been saved in the 'optuna_figures' directory.")

[32m2024-10-17 17:01:05.023[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mImportances from get_param_importances (fanova):[0m
[32m2024-10-17 17:01:05.024[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mlearning_rate: 0.4377272929449897[0m
[32m2024-10-17 17:01:05.024[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mcolsample_bytree: 0.28215893274168524[0m
[32m2024-10-17 17:01:05.024[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mn_estimators: 0.1517495406555196[0m
[32m2024-10-17 17:01:05.024[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mnum_leaves: 0.062487301549861415[0m
[32m2024-10-17 17:01:05.025[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1msubsample: 0.03888912473913061[0m
[32m2024-10-17 17:01:05.025[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mmin_child