In [1]:
import os
import pathlib

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from topollm.analysis.compare_sampling_methods.make_plots import add_subtitle

In [None]:
# Result folders to choose from; the folder names encode the dataset split
# (train / validation / test) together with the number of sampled dataset entries.
data_folder_list = [
    "data-multiwoz21_split-train_ctxt-dataset_entry_samples-10000_feat-col-ner_tags/",
    "data-multiwoz21_split-validation_ctxt-dataset_entry_samples-3000_feat-col-ner_tags/",
    "data-multiwoz21_split-test_ctxt-dataset_entry_samples-3000_feat-col-ner_tags/",
]

# Index into `data_folder_list`: 0 = train, 1 = validation, 2 = test.
selected_data_folder = data_folder_list[2]

# Root of the repository checkout.
# The default is the original development location; set the environment variable
# TOPO_LLM_REPOSITORY_BASE_PATH to run this notebook against a checkout somewhere else
# without editing the cell.
repository_base_path = pathlib.Path(
    os.environ.get(
        "TOPO_LLM_REPOSITORY_BASE_PATH",
        "/Users/ruppik/git-source/Topo_LLM",
    ),
)

file_path = pathlib.Path(
    repository_base_path,
    "data/analysis/sample_sizes/run_general_comparisons/analysis/twonn/",
    selected_data_folder,
    "lvl-token/add-prefix-space-True_max-len-512/model-roberta-base_task-masked_lm/layer--1_agg-mean/norm-None/full_local_estimates_df.csv",
)

# Plots derived from this file are saved next to it.
results_base_directory_path: pathlib.Path = file_path.parent

local_estimates_df: pd.DataFrame = pd.read_csv(
    filepath_or_buffer=file_path,
)

local_estimates_df

In [None]:
# Keep only the rows that share one parameter configuration.
# Within this subset, the runs can be compared over different seeds.

n_neighbors_fixed = 128
data_prep_sampling_samples_fixed = 50000

# Note: "local_estimates_samples" is intentionally NOT part of the filter,
# because the results should be compared for different sample sizes.
configuration_mask = (
    (local_estimates_df["data_prep_sampling_method"] == "random")
    & (local_estimates_df["deduplication"] == "array_deduplicator")
    & (local_estimates_df["data_prep_sampling_samples"] == data_prep_sampling_samples_fixed)
    & (local_estimates_df["n_neighbors"] == n_neighbors_fixed)
)

# Boolean-mask indexing returns a new frame; `local_estimates_df` itself is left untouched.
subset_local_estimates_df: pd.DataFrame = local_estimates_df[configuration_mask]

subset_local_estimates_df

In [None]:
# Display pandas' summary statistics (`DataFrame.describe`) for the filtered subset.
subset_local_estimates_df.describe()

In [None]:
from matplotlib.ticker import AutoLocator, MultipleLocator

# Output location of the boxplot PDF: a "different_sampling_seeds" subfolder of the results
# directory. The values of the fixed filter parameters are embedded in the file name.
plot_file_name = (
    f"array_data_truncated_mean_boxplot_{n_neighbors_fixed=}_{data_prep_sampling_samples_fixed=}.pdf"
)
plot_save_path = results_base_directory_path / "different_sampling_seeds" / plot_file_name


def create_boxplot_of_mean_over_different_sampling_seeds(
    subset_local_estimates_df: pd.DataFrame,
    plot_save_path: pathlib.Path | None = None,
    *,
    show_plot: bool = True,
    y_min: float = 7.5,
    y_max: float = 10.0,
    n_neighbors: int | None = None,
    data_prep_sampling_samples: int | None = None,
    subtitle: str | None = None,
) -> None:
    """Draw a boxplot of 'array_data_truncated_mean' per 'local_estimates_samples' value.

    The individual data points are overlaid as red markers on top of the boxes,
    and a text box inside the axes lists the parameters that were held fixed.

    Args:
        subset_local_estimates_df:
            Data to plot; needs the columns 'local_estimates_samples' (x-axis)
            and 'array_data_truncated_mean' (y-axis).
        plot_save_path:
            If given, the figure is written to this path
            (missing parent directories are created). If None, nothing is saved.
        show_plot:
            If True, call `plt.show()`. If False, the figure is closed after the
            optional save so that it neither leaks nor gets rendered later.
        y_min:
            Lower limit of the y-axis.
        y_max:
            Upper limit of the y-axis.
        n_neighbors:
            Value reported as 'n_neighbors_fixed' in the text box.
            If None, the notebook-level variable `n_neighbors_fixed` is used.
        data_prep_sampling_samples:
            Value reported as 'data_prep_sampling_samples_fixed' in the text box.
            If None, the notebook-level variable `data_prep_sampling_samples_fixed` is used.
        subtitle:
            Text handed to `add_subtitle`.
            If None, the notebook-level `file_path` (converted to str) is used.

    """
    # Backward compatibility: callers that do not pass the new keyword arguments
    # get the values from the notebook namespace, exactly as before.
    if n_neighbors is None:
        n_neighbors = n_neighbors_fixed
    if data_prep_sampling_samples is None:
        data_prep_sampling_samples = data_prep_sampling_samples_fixed
    if subtitle is None:
        subtitle = str(file_path)

    fig, ax = plt.subplots(figsize=(10, 6))

    # Fixed y-axis limits, so that plots for different configurations are directly comparable.
    ax.set_ylim(y_min, y_max)

    # Automatic major ticks; minor ticks every 0.1 for a finer grid.
    ax.yaxis.set_major_locator(AutoLocator())
    ax.yaxis.set_minor_locator(MultipleLocator(0.1))

    # Horizontal grid lines only, with the minor lines drawn fainter than the major ones.
    ax.grid(which="major", axis="y", color="gray", linestyle="-", linewidth=0.6, alpha=0.5)
    ax.grid(which="minor", axis="y", color="gray", linestyle="--", linewidth=0.3, alpha=0.3)

    sns.boxplot(
        x="local_estimates_samples",
        y="array_data_truncated_mean",
        data=subset_local_estimates_df,
        ax=ax,
    )
    # Overlay the raw values, so that every single run is visible next to the box summary.
    sns.stripplot(
        x="local_estimates_samples",
        y="array_data_truncated_mean",
        data=subset_local_estimates_df,
        color="red",
        jitter=False,
        dodge=True,
        marker="o",
        alpha=0.5,
        ax=ax,
    )

    # Text box with the parameters that were held fixed.
    # `!r` reproduces the formatting of the f-string `=` specifier.
    fixed_params_text: str = (
        f"n_neighbors_fixed = {n_neighbors!r}"
        "\n"
        f"data_prep_sampling_samples_fixed = {data_prep_sampling_samples!r}"
    )
    ax.text(
        x=0.55,
        y=0.15,
        s=f"Fixed Parameters:\n{fixed_params_text}",
        transform=ax.transAxes,
        fontsize=10,
        verticalalignment="top",
        bbox={
            "boxstyle": "round",
            "facecolor": "wheat",
            "alpha": 0.5,
        },
    )

    # NOTE(review): `add_subtitle` receives no figure or axes argument, so it presumably
    # acts on the current pyplot figure. `fig` is the current figure here - confirm.
    add_subtitle(
        additional_title=subtitle,
    )

    if plot_save_path is not None:
        plot_save_path.parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(plot_save_path, bbox_inches="tight")

    if show_plot:
        plt.show()
    else:
        # Without this, the figure would stay registered in pyplot.
        plt.close(fig)


# Plot the filtered subset, write the figure to `plot_save_path` and show it in the notebook.
create_boxplot_of_mean_over_different_sampling_seeds(
    subset_local_estimates_df,
    plot_save_path,
    show_plot=True,
)