## Notebook for comparing tool sandbox runs

In [None]:
# Enable IPython's autoreload extension in mode 2 so that imported modules are
# reloaded automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

In [None]:
import pathlib

import polars as pl
import seaborn as sns

from tool_sandbox.analysis.analysis import extract_aggregated_stats, extract_meta_stats
from tool_sandbox.analysis.data_loading import (
    extract_scenario_results,
    load_result_summary,
)

In [None]:
# Map a human-readable experiment name to the result summary JSON file of that run.
# Replace the placeholder paths below with the summaries you want to compare.
name_to_results_summary_path = {
    "baseline": pathlib.Path("/path/to/your/result_summary.json"),
    "with_some_change": pathlib.Path("/path/to/your/other/result_summary.json"),
}

# Load every result summary, keyed by the same experiment name.
name_to_results = {
    experiment: load_result_summary(summary_path)
    for experiment, summary_path in name_to_results_summary_path.items()
}
print("Loaded results for these experiments:\n", sorted(name_to_results))

### Extract per scenario results

In [None]:
# Per-scenario results for each experiment, keyed by experiment name.
name_to_scenario_results = {
    experiment: extract_scenario_results(summary)
    for experiment, summary in name_to_results.items()
}

# Tag every data frame with its experiment name and stack them into a single frame so
# that the experiments can be compared side by side in tables or plots. `select` is
# used rather than `with_columns` because `with_columns` would append the new column
# at the end, whereas we want the experiment name to be the first column.
all_experiments_scenario_results_all_columns_df = pl.concat(
    [
        scenario_df.select([pl.lit(experiment).alias("experiment"), pl.all()])
        for experiment, scenario_df in name_to_scenario_results.items()
    ],
    # Relaxed concatenation handles the mix of null and string values for exceptions.
    how="vertical_relaxed",
)
all_experiments_scenario_results_all_columns_df

In [None]:
# Keep every column except the categories, the milestone and minefield mappings, and
# the traceback. The categories and the two mappings cannot be converted to a pandas
# dataframe; the traceback column is dropped here as well.
all_experiments_scenario_results_df = (
    all_experiments_scenario_results_all_columns_df.select(
        pl.all().exclude(
            ["categories", "traceback", "milestone_mapping", "minefield_mapping"]
        )
    )
)

In [None]:
# Histogram of the per-scenario turn count, with one color per experiment.
sns.histplot(
    data=all_experiments_scenario_results_df,
    x="turn_count",
    hue="experiment",
    bins=30,
)

In [None]:
# Histogram of the per-scenario similarity, with one color per experiment. This reuses
# the data frame from which the non-plottable columns have already been excluded.
sns.histplot(
    data=all_experiments_scenario_results_df,
    x="similarity",
    hue="experiment",
)

### Extract high-level metadata from the scenario results

In [None]:
# High-level metadata statistics for each experiment, computed from its per-scenario
# results.
name_to_meta_stats = {
    experiment: extract_meta_stats(scenario_df)
    for experiment, scenario_df in name_to_scenario_results.items()
}

# Tag every data frame with its experiment name and stack them into a single frame so
# that the experiments can be compared side by side in tables or plots. `select` is
# used rather than `with_columns` because `with_columns` would append the new column
# at the end, whereas we want the experiment name to be the first column.
all_experiments_meta_stats_df = pl.concat(
    [
        meta_df.select([pl.lit(experiment).alias("experiment"), pl.all()])
        for experiment, meta_df in name_to_meta_stats.items()
    ]
)
all_experiments_meta_stats_df

In [None]:
# Also print the table as markdown so that it can be copied into a GitHub comment.
# The formatting change is made inside a `pl.Config` context so that the previous
# table settings are restored once the block exits.
with pl.Config():
    pl.Config.set_tbl_formatting("ASCII_MARKDOWN")
    print(all_experiments_meta_stats_df)

### Extract aggregated statistics per scenario category

In [None]:
# Aggregated statistics for each experiment, keyed by experiment name.
name_to_agg_results = {
    experiment: extract_aggregated_stats(summary)
    for experiment, summary in name_to_results.items()
}

# Tag every data frame with its experiment name and stack them into a single frame.
# This makes it easy to create a bar chart comparing the individual categories for
# each experiment.
all_experiments_df = pl.concat(
    [
        agg_df.with_columns(pl.lit(experiment).alias("experiment"))
        for experiment, agg_df in name_to_agg_results.items()
    ]
)

In [None]:
# Horizontal bar chart of the similarity per category, with one bar per experiment.
ax = sns.barplot(
    data=all_experiments_df,
    x="similarity",
    y="category",
    hue="experiment",
)
# Move the legend above the plot.
sns.move_legend(
    ax,
    loc="lower center",
    bbox_to_anchor=(0.5, 1),
    ncol=3,
    title=None,
    frameon=False,
)