## Notebook for analyzing a tool sandbox run

In [None]:
# Enable IPython's autoreload extension. Mode 2 reloads imported modules before
# each cell executes, so edits to the `tool_sandbox` sources are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import html
import pathlib

import pandas as pd
import polars as pl
import seaborn as sns

from tool_sandbox.analysis.analysis import extract_meta_stats, extract_aggregated_stats
from tool_sandbox.analysis.data_loading import (
    extract_scenario_results,
    get_scenario_pretty_print_path,
    load_result_summary,
)
from tool_sandbox.common.execution_context import ScenarioCategories

In [None]:
# Path to the `result_summary.json` of the run to analyze. The commented-out
# entry below points at a different run directory; swap the two lines to analyze
# that run instead.
result_summary_path = pathlib.Path(
    # "/Users/tholleis/code/ToolSandbox/data/agent_gemini-1.5-pro-001_user_gpt-4o-2024-05-13_06_14_2024_19_56_00_no_tool_assert/result_summary.json"
    "/Users/tholleis/code/ToolSandbox/data/agent_gemini-1.5-pro-001_user_gpt-4o-2024-05-13_06_14_2024_20_20_49/result_summary.json"
)

# Load the summary once; `results` is reused by the cells below.
results = load_result_summary(result_summary_path)

In [None]:
# Extract the scenario results from the loaded summary into a DataFrame and
# preview its first three rows as the cell output.
scenario_df = extract_scenario_results(results)
scenario_df.head(3)

### Extract high-level metadata

In [None]:
# Compute the high-level metadata statistics from the scenario DataFrame; the
# bare expression renders them as the cell output.
meta_stats = extract_meta_stats(scenario_df)
meta_stats

In [None]:
# Keep only the scenarios for which an exception type was recorded.
has_exception = pl.col("exception_type").is_not_null()
exceptions_df = scenario_df.filter(has_exception)

# Tally how often each exception type occurred, most frequent first.
exception_types = exceptions_df.get_column("exception_type")
exception_count_df = exception_types.value_counts().sort("count", descending=True)
exception_count_df

### Analyze the different types of exceptions

In [None]:
def create_link_to_pretty_print_file(scenario_name: str) -> str:
    """Return an HTML anchor linking a scenario name to its pretty-print file.

    Used as a pandas ``Styler.format`` formatter for the ``name`` column so that
    the rendered table cell becomes a clickable link.

    Args:
        scenario_name: Name of the scenario to link to.

    Returns:
        An ``<a>`` element whose ``href`` is the path returned by
        ``get_scenario_pretty_print_path`` for the notebook-level
        ``result_summary_path`` and whose text is the scenario name.
    """
    pretty_print_path = get_scenario_pretty_print_path(
        result_summary_path=result_summary_path, scenario_name=scenario_name
    )
    # Escape both parts before interpolating them into markup: a quote or `&` in
    # the path would otherwise end the attribute early or be parsed as an entity,
    # and a `<` in the name would open a tag.
    href = html.escape(str(pretty_print_path), quote=True)
    label = html.escape(str(scenario_name), quote=False)
    return f'<a href="{href}">{label}</a>'


# Columns (and their order) shown in every per-exception table. The list does not
# depend on the exception type, so it is built once up front.
desired_column_order = [
    "exception_type",
    "name",
    "traceback",
    "milestone_similarity",
    "minefield_similarity",
    "similarity",
    "turn_count",
    "categories",
    "milestone_mapping",
    "minefield_mapping",
]

# Render one table per exception type, in the order of `exception_count_df`.
for exception_type in exception_count_df["exception_type"].to_list():
    df = exceptions_df.filter(pl.col("exception_type") == exception_type)
    pd_df = df.to_pandas()[desired_column_order]

    caption = f"{exception_type} | {len(df)} / {len(exceptions_df)} exceptions"
    # Scenario names become links to their pretty-print files.
    styled = pd_df.style.format({"name": create_link_to_pretty_print_file})
    styled = styled.set_caption(caption)
    styled = styled.set_properties(**{"text-align": "left"})
    display(styled)

### Extract aggregated statistics per scenario category

In [None]:
# Aggregate the statistics from the loaded results.
agg_stats_df = extract_aggregated_stats(results)
# A negative `tbl_rows` lifts polars' row limit for the duration of the `with`
# block, so the full table is displayed instead of a truncated preview.
with pl.Config(tbl_rows=-1):
    display(agg_stats_df)

### Analyze the top/bottom N samples for each category.

In [None]:
# Give every (scenario, category) pair its own row: explode the `categories` list
# column and rename it to the singular `category` to match its new contents.
scenario_exploded_categories_df = scenario_df.explode("categories").rename(
    {"categories": "category"}
)
scenario_exploded_categories_df.head(3)

In [None]:
# Let's ignore the augmentation categories for now.
# `ScenarioCategories` is provided by the notebook's import cell (from
# `tool_sandbox.common.execution_context`); without that import this cell fails
# with a `NameError` on a fresh kernel.
CATEGORIES_WITH_AUGMENTATION = {
    ScenarioCategories.NO_DISTRACTION_TOOLS,
    ScenarioCategories.THREE_DISTRACTION_TOOLS,
    ScenarioCategories.TEN_DISTRACTION_TOOLS,
    ScenarioCategories.ALL_TOOLS_AVAILABLE,
    ScenarioCategories.TOOL_NAME_SCRAMBLED,
    ScenarioCategories.TOOL_DESCRIPTION_SCRAMBLED,
    ScenarioCategories.ARG_DESCRIPTION_SCRAMBLED,
    ScenarioCategories.ARG_NAME_SCRAMBLED,
    ScenarioCategories.ARG_TYPE_SCRAMBLED,
}
# Every remaining enum member, i.e. all categories that are not augmentations.
CATEGORIES_WITHOUT_AUGMENTATION = set(ScenarioCategories) - CATEGORIES_WITH_AUGMENTATION

# The `category` column is matched against the string form of the enum members.
category_strs = {str(c) for c in CATEGORIES_WITHOUT_AUGMENTATION}

# We want to look at scenarios with high/low similarity scores, but not ones where an
# exception occurred.
scenario_exploded_categories_pd_df = scenario_exploded_categories_df.filter(
    pl.col("exception_type").is_null()
).to_pandas()
# Restrict the rows to the non-augmentation categories. The unfiltered frame above
# is kept under its own name because it is used again further down.
pd_df = scenario_exploded_categories_pd_df[
    scenario_exploded_categories_pd_df["category"].isin(category_strs)
]

# Show the worst N samples with the lowest similarity for each scenario category.
# Sorting ascending by similarity within each category puts the worst rows first,
# so `head(N)` per group selects them.
N = 3
pd_df = pd_df.sort_values(["category", "similarity"], ascending=[True, True])
bottom_n_per_category = pd_df.groupby("category").head(N)
with pd.option_context("display.max_colwidth", None):
    display(
        bottom_n_per_category.style.format({"name": create_link_to_pretty_print_file})
    )

In [None]:
# Show the best N samples with the highest similarity for each scenario category.
N = 1
# Descending similarity within each category puts the best rows first, so the
# per-group `head(N)` picks them.
pd_df = pd_df.sort_values(by=["category", "similarity"], ascending=[True, False])
top_n_per_category = pd_df.groupby("category").head(N)
with pd.option_context("display.max_colwidth", None):
    # From https://stackoverflow.com/a/51855581 : we want to make the scenario name a
    # clickable link to the `pretty_print.txt` file.
    name_as_link = {"name": create_link_to_pretty_print_file}
    display(top_n_per_category.style.format(name_as_link))

In [None]:
# Strip plot of the similarity scores with one row per category. The plotted frame
# excludes scenarios that raised an exception but still contains all categories.
g = sns.catplot(
    kind="strip",
    data=scenario_exploded_categories_pd_df,
    y="category",
    x="similarity",
    height=10,
)
# Label the x axis only; the category names on the y axis speak for themselves.
g.set_axis_labels(x_var="Similarity", y_var="")