In [None]:
import optuna
import matplotlib.pyplot as plt
from collections import defaultdict
import pandas as pd
import synergy_dataset as sd
from IPython.display import display

# Optuna storage URL pointing at the SQLite3 database with the studies
db_path = "sqlite:///svm_db.sqlite3"  # Replace with your database path

# Fetch a summary of every study stored in the database and list their names
study_summaries = optuna.get_all_study_summaries(storage=db_path)

for summary in study_summaries:
    print("-", summary.study_name)

In [None]:
# Name of the study to analyse; it is looked up in the storage at db_path
study_name = "ASReview2 2024-12-20 at 14.49.22"
study = optuna.load_study(storage=db_path, study_name=study_name)
# Show the hyperparameters of the first trial as a quick sanity check
print(study.trials[0].params)

# Alphabetically sorted names of the datasets yielded by synergy_dataset,
# with "Chou_2004" left out
dataset_names = sorted(
    dataset.name
    for dataset in sd.iter_datasets()
    if dataset.name != "Chou_2004"
)

In [6]:
# Build (a) a long-format table with one row per (trial, dataset) loss and
# (b) the best trial number, loss and parameters for every dataset.
#
# NOTE(review): a dataset is identified by the *position* of its value inside
# trial.intermediate_values. This presumes every trial reports the datasets in
# the same order -- confirm against the objective function.

# Read the trial list once: Optuna documents Study.trials as a short form of
# get_trials(deepcopy=True), so every access repeats the copy.
all_trials = study.trials

# Size the per-dataset result lists from the trial that reported the most
# values. Relying on the first trial alone breaks when that trial failed or
# was pruned before it reported every dataset (IndexError further down).
num_datasets = max(
    (len(trial.intermediate_values) for trial in all_trials),
    default=0,
)
best_trials_per_dataset = [None] * num_datasets  # Best trial number per dataset
best_losses_per_dataset = [float("inf")] * num_datasets  # Lowest loss per dataset
best_params_per_dataset = [None] * num_datasets  # Parameters of that best trial

# Prepare data for visualization
data = []

for trial in all_trials:
    if not trial.intermediate_values:
        continue  # Nothing was reported for this trial

    params = trial.params  # Extract trial parameters
    for dataset_id, loss in enumerate(trial.intermediate_values.values()):
        # Record dataset_id, loss (intermediate value), and parameters
        data.append({
            "dataset_id": dataset_id,
            "loss": loss,
            "ratio": params.get("ratio", None),
            "c": params.get("log__C", None)
        })

        # Strict "<" keeps the earliest trial when two trials tie on a dataset
        if loss < best_losses_per_dataset[dataset_id]:
            best_losses_per_dataset[dataset_id] = loss
            best_trials_per_dataset[dataset_id] = trial.number
            best_params_per_dataset[dataset_id] = params

# Convert to pandas DataFrame
df = pd.DataFrame(data)

In [None]:
# Losses that the study's best trial reported, one value per dataset
# (in the order stored in its intermediate_values).
best_trial_losses = list(study.best_trial.intermediate_values.values())

# One row per dataset, labelled with the dataset name. The labels are sized
# from this table's own data rather than from a variable computed in another
# cell, so the index always matches the number of rows.
df = pd.DataFrame(
    {"Mean Loss": best_trial_losses},
    index=dataset_names[: len(best_trial_losses)],
)

display(df)

# Plot the values (optional)
fig, ax = plt.subplots(figsize=(10, 6))
df.plot(kind="bar", ax=ax, legend=False)
ax.set_title("Mean Losses per Dataset")
ax.set_xlabel("Dataset")
ax.set_ylabel("Mean Loss")
fig.tight_layout()
plt.show()

In [None]:

# Label each row of the best-parameter table with its dataset name; row k
# belongs to the k-th entry of dataset_names.
row_labels = [dataset_names[pos] for pos in range(len(best_params_per_dataset))]

# One row per dataset, one column per hyperparameter
df = pd.DataFrame(best_params_per_dataset, index=row_labels)

display(df)

In [None]:
# Line plot of the best value of every hyperparameter across datasets:
# one subplot per parameter, stacked vertically, saved as a PDF.

# Create a pandas DataFrame (one row per dataset, one column per parameter)
df = pd.DataFrame(best_params_per_dataset)

# Plot each parameter separately
num_params = len(df.columns)
# squeeze=False makes subplots() always return a 2-D array of Axes. Without
# it a study with a single parameter yields a bare Axes object, and the
# indexing below (axes[idx], axes[-1]) fails.
fig, axes_grid = plt.subplots(
    num_params, 1, figsize=(8, num_params * 2.5), sharex=False, squeeze=False
)
axes = axes_grid[:, 0]  # The single column of subplots as a 1-D array

for ax, param in zip(axes, df.columns):
    ax.plot(dataset_names, df[param], marker='o', linestyle='-', color='b', alpha=0.8, label=param)
    ax.set_title(param, fontsize=10)
    ax.set_ylabel("Value", fontsize=8)
    ax.grid(axis="y", linestyle="--", alpha=0.6)
    ax.tick_params(axis="y", labelsize=8)
    ax.legend(fontsize=8, loc="upper left")

    # Set dataset names as x-tick labels for each plot
    ax.set_xticks(dataset_names)  # Setting positions explicitly
    ax.set_xticklabels(dataset_names, fontsize=8, rotation=90)  # Setting labels

# Add x-axis label only to the bottom subplot
axes[-1].set_xlabel("Datasets", fontsize=10)

# Adjust layout for better spacing
plt.tight_layout()

# Save or show the plot
plt.savefig("parameter_comparison_lineplots_all_xticks_fixed.pdf", bbox_inches="tight", dpi=300)
plt.show()

In [None]:
# Boxplot of the losses of all trials, one box per dataset, saved as a PDF.

# Extract intermediate values grouped by dataset_id
dataset_intermediate_values = defaultdict(list)

for trial in study.trials:
    if trial.intermediate_values:
        # Distribute intermediate values by dataset_id (position in the values)
        for dataset_id, value in enumerate(trial.intermediate_values.values()):
            dataset_intermediate_values[dataset_id].append(value)

# Prepare data for boxplots; ids are positions 0..k-1, so sorting keeps the
# same order in which they were first seen.
datasets = sorted(dataset_intermediate_values)
boxplot_data = [dataset_intermediate_values[dataset_id] for dataset_id in datasets]
# Label each box through its dataset id so labels and boxes always line up,
# even when fewer datasets have values than there are dataset names.
box_labels = [dataset_names[dataset_id] for dataset_id in datasets]

# Plot boxplots
fig, ax = plt.subplots(figsize=(12, 6))
ax.boxplot(boxplot_data, showmeans=True, patch_artist=True)
# Tick labels are set explicitly instead of through boxplot(labels=...): that
# keyword is deprecated since Matplotlib 3.9 (renamed to tick_labels), while
# set_xticks/set_xticklabels work on every version. Boxes sit at 1..N.
ax.set_xticks(list(range(1, len(box_labels) + 1)))
ax.set_xticklabels(box_labels, rotation=90)  # Rotated for readability
ax.set_xlabel("Dataset")
ax.set_ylabel("Loss")
ax.set_title(f"Boxplot of Losses for Each Dataset {study_name}")
ax.grid(axis="y", linestyle="--", alpha=0.7)
ax.set_ylim(0, 0.3)  # Fixed y-range; anything above 0.3 is cut off

# Save and show the plot
fig.tight_layout()
fig.savefig(f"boxplot_per_dataset_{study_name}.pdf")
plt.show()
