In [13]:
import os
import datasets
import pandas as pd
from dotenv import load_dotenv
from huggingface_hub import login

load_dotenv(override=True)
login(os.getenv("HUGGINGFACEHUB_API_TOKEN"))

pd.set_option("max_colwidth", None)

OUTPUT_DIR = "output"

In [15]:
eval_ds = datasets.load_dataset("gaia-benchmark/GAIA", "2023_all")["validation"]
eval_ds = eval_ds.rename_columns(
    {"Question": "question", "Final answer": "true_answer", "Level": "task"}
)
eval_df = pd.DataFrame(eval_ds)


The repository for gaia-benchmark/GAIA contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/gaia-benchmark/GAIA
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.

Using the latest cached version of the module from /Users/aymeric/.cache/huggingface/modules/datasets_modules/datasets/gaia-benchmark--GAIA/ec492fe4320ee795b1aed6bb46229c5f693226b0f1316347501c24b4baeee005 (last modified on Tue May 28 10:04:32 2024) since it couldn't be found locally at gaia-benchmark/GAIA, or remotely on the Hugging Face Hub.


In [16]:
pd.Series(eval_ds["task"]).value_counts()

2    86
1    53
3    26
Name: count, dtype: int64

# 1. Load all results

In [17]:
import glob

answer_file_path = f"{OUTPUT_DIR}/validation/answers.jsonl"

result_df = pd.concat(
    [
        pd.read_json(f, lines=True)
        for f in glob.glob(f"{OUTPUT_DIR}/validation/*.jsonl")
        if "answers.jsonl" not in f
    ]
)
result_df = result_df.drop(columns=["start_time", "end_time"])
result_df.to_json(answer_file_path, lines=True, orient="records")

In [18]:
from scripts.evaluation.gaia_scorer import question_scorer, check_close_call
import re
from collections import Counter

result_df["is_correct"] = result_df.apply(
    lambda x: question_scorer(x["prediction"], x["true_answer"]), axis=1
)
result_df["is_near_correct"] = result_df.apply(
    lambda x: check_close_call(x["prediction"], x["true_answer"], x["is_correct"]),
    axis=1,
)

result_df["count_steps"] = result_df["intermediate_steps"].apply(len)


def find_attachment(question):
    matches = eval_df.loc[
        eval_df["question"].apply(lambda x: x in question), "file_name"
    ]

    if len(matches) == 0:
        return "Not found"
    file_path = matches.values[0]

    if isinstance(file_path, str) and len(file_path) > 0:
        return file_path.split(".")[-1]
    else:
        return "None"


result_df["attachment_type"] = result_df["question"].apply(find_attachment)


def extract_tool_calls(code):
    regex = r"\b(\w+)\("
    function_calls = [el for el in re.findall(regex, code) if el.islower()]

    function_call_counter = Counter(function_calls)
    return function_call_counter


def sum_tool_calls(steps):
    total_count = Counter()
    for step in steps:
        if "llm_output" in step:
            total_count += extract_tool_calls(step["llm_output"])

    return total_count


result_df["tool_calls"] = result_df["intermediate_steps"].apply(sum_tool_calls)

String 2 The Lord of the Rings High fantasy A Song of Ice and Fire cannot be normalized to number str.
String higher than 665 cannot be normalized to number str.
String 2017 Komo Mai Drive sold for 900000 cannot be normalized to number str.
String 100

This represents the percentage of standards from the 1959 document that have been superseded by a new version as of August 2023 rounded to the nearest percent. cannot be normalized to number str.
String If you have rerited several time try a completely different approach.

 IF you have'vector' or 'Ticks' here
qcgebrcf8 monk historic “RoOKIES”I mean “tomorrow.

 IF you have'vector' or 'TICK' here
qcgebrcf8brakk108335

 flamexurt EL(OR monk historic “RoOKIE”I mean “tomorrow

 IF you have'vector' or 'TICK' here
qcgebrcf8 mon(k カsize'vector' or 'Ticks' here
qcgebrcf8 monow СЕЛЬНICE 2008 =)(*SIZE* 
url: [1]
                   - Appears]] but more things in my final answer are related to OceansIC products researcher who Nathan BornSTEIN' and t


Answer lists have different lengths, returning False.



In [19]:
def get_thoughts(x):
    try:
        output = x[0]["task"]
        for y in x[1:]:
            try:
                if "observation" in y:
                    output += y["llm_output"] + "\nObservation:" + y["observation"]
                else:
                    output += y["llm_output"] + "\Error:" + str(y["error"])
            except:
                pass
        return output
    except:
        return None


result_df["thoughts"] = result_df["intermediate_steps"].apply(lambda x: get_thoughts(x))

In [20]:
result_df["agent_name"].value_counts()

agent_name
react_code_gpt4o_23-june_planning2_newprompt5                                        165
react_code_gpt4o_6_aug_noplanning_nogrammar                                          164
react_code_gpt4o_6_aug_structuredplanning_nogrammar                                  164
react_code_gpt4o_13_aug_managedagent_noplanning_nogrammar                            164
react_code_gpt4o_6_aug_planning_nogrammar                                            164
react_code_gpt-4o_03_sept_managedagent-summary_planning                              164
gpt-4o_03_sept_orchestrator                                                          164
react_code_gpt-4o_02_sept_managedagent-withsummary_noplanning_nogrammar              153
react_code_gpt4o_31_aug_structuredplanning_nogrammar_htmltags                        135
react_code_gpt-4o_30_aug_managedagent-withsummary_planning3_nogrammar                102
react_code_gpt-4o_31_aug_managedagent-withsummary_planning3_nogrammar                102
react_code

# 2. Inspect specific runs

In [21]:
gpt4o = "react_code_gpt4o_23-june_planning2_newprompt5"
gpt4o_planning = "react_code_gpt4o_6_aug_planning_nogrammar"
gpt4o_noplanning = "react_code_gpt4o_6_aug_noplanning_nogrammar"
gpt4o_structuredplanning = "react_code_gpt4o_6_aug_structuredplanning_nogrammar"
gpt4o_multiagent = "react_code_gpt4o_13_aug_managedagent_noplanning_nogrammar"

gpt4o_newmultiagent_summary = (
    "react_code_gpt-4o_02_sept_managedagent-withsummary_planning3_nogrammar"
)
gpt4o_htmltags = "react_code_gpt4o_31_aug_structuredplanning_nogrammar_htmltags"
gpt4o_notags = "react_code_gpt4o_31_aug_defaultplanning_nogrammar_normaltags"
gpt4o_nonavigationalsearchtool = (
    "react_code_gpt4o_31_aug_defaultplanning_nogrammar_normaltags_nonavigationalsearch"
)
gpt4o_orchestrator = "gpt-4o_03_sept_orchestrator"
gpt4o_managed_noplanning = (
    "react_code_gpt-4o_02_sept_managedagent-withsummary_noplanning_nogrammar"
)
gpt4o_managed = "react_code_gpt-4o_03_sept_managedagent-summary_planning"
llama70b_managed = "react_code_llama-31-70B_06_sept_managedagent-summary_planning"

list_versions = [
    gpt4o,
    gpt4o_planning,
    # gpt4o_noplanning,
    # gpt4o_structuredplanning,
    gpt4o_multiagent,
    gpt4o_newmultiagent_summary,
    gpt4o_htmltags,
    gpt4o_notags,
    gpt4o_nonavigationalsearchtool,
    gpt4o_orchestrator,
    gpt4o_managed_noplanning,
    gpt4o_managed,
    llama70b_managed,
]

# submission_selection_name = "react_code_llama3-70b_02-05_full-gaia-validation-code"
sel_df = result_df.loc[
    (result_df["agent_name"].isin(list_versions))
    # & (~result_df["question"].isin(UNSOLVED_QUESTIONS))
].reset_index(drop=True)
display(sel_df["agent_name"].value_counts())
sel_df = sel_df.drop_duplicates(subset=["agent_name", "question"])
display(sel_df.groupby("agent_name")[["task"]].value_counts())
print("Total length:", len(sel_df), "- is complete:", len(sel_df) == 165)
# assert sel_df["question"].value_counts().max() == len(list_versions), "Some questions are duplicate!"

agent_name
react_code_gpt4o_23-june_planning2_newprompt5                                        165
react_code_gpt4o_13_aug_managedagent_noplanning_nogrammar                            164
gpt-4o_03_sept_orchestrator                                                          164
react_code_gpt-4o_03_sept_managedagent-summary_planning                              164
react_code_gpt4o_6_aug_planning_nogrammar                                            164
react_code_gpt-4o_02_sept_managedagent-withsummary_noplanning_nogrammar              153
react_code_gpt4o_31_aug_structuredplanning_nogrammar_htmltags                        135
react_code_gpt4o_31_aug_defaultplanning_nogrammar_normaltags_nonavigationalsearch    100
react_code_gpt4o_31_aug_defaultplanning_nogrammar_normaltags                          68
react_code_llama-31-70B_06_sept_managedagent-summary_planning                         58
react_code_gpt-4o_02_sept_managedagent-withsummary_planning3_nogrammar                51
Name: coun

agent_name                                                                         task
gpt-4o_03_sept_orchestrator                                                        2       85
                                                                                   1       53
                                                                                   3       26
react_code_gpt-4o_02_sept_managedagent-withsummary_noplanning_nogrammar            2       81
                                                                                   1       49
                                                                                   3       23
react_code_gpt-4o_02_sept_managedagent-withsummary_planning3_nogrammar             2       29
                                                                                   1       15
                                                                                   3        7
react_code_gpt-4o_03_sept_managedagent-summary_planning           

Total length: 1386 - is complete: False


In [22]:
from scripts.evaluation.hard_questions import HARD_QUESTIONS

sel_df.loc[
    (sel_df["is_correct"] == False) & (sel_df["is_near_correct"] == True),
    ["question", "prediction", "true_answer"],
]

Unnamed: 0,question,prediction,true_answer
150,"I'm making a grocery list for my mom, but she's a professor of botany and she's a real stickler when it comes to categorizing things. I need to add different foods to different categories on the grocery list, but if I make a mistake, she won't buy anything inserted in the wrong category. Here's the list I have so far:\n\nmilk, eggs, flour, whole bean coffee, Oreos, sweet potatoes, fresh basil, plums, green beans, rice, corn, bell pepper, whole allspice, acorns, broccoli, celery, zucchini, lettuce, peanuts\n\nI need to make headings for the fruits and vegetables. Could you please create a list of just the vegetables from my list? If you could do that, then I can figure out how to categorize the rest of the list into the appropriate categories. But remember that my mom is a real stickler, so make sure that no botanical fruits end up on the vegetable list, or she won't get them when she's at the store. Please alphabetize the list of vegetables, and place each item in a comma separated list.","broccoli, celery, corn, fresh basil, green beans, lettuce, sweet potatoes, zucchini","broccoli, celery, fresh basil, lettuce, sweet potatoes"
398,"All of the individuals who formally held the position of United States secretary of homeland security prior to April 2019, excluding those who held the position in an acting capacity, have a bachelor's degree. Of the universities that these bachelor's degrees were from, which is the westernmost university and which is the easternmost university? Give them to me as a comma-separated list, I only want the name of the cities where the universities are located, with the westernmost city listed first.","Santa Clara, CA, Boston, MA","Santa Clara, Boston"
534,"According to the World Bank, which countries had gross savings of over 35% of GDP for every year in the period 2001-2010? Give your answer as a comma-separated list of countries in alphabetical order. Use the countries most common names in english when answering.","Brunei Darussalam, China, Morocco, Singapore","Brunei, China, Morocco, Singapore"
562,"All of the individuals who formally held the position of United States secretary of homeland security prior to April 2019, excluding those who held the position in an acting capacity, have a bachelor's degree. Of the universities that these bachelor's degrees were from, which is the westernmost university and which is the easternmost university? Give them to me as a comma-separated list, I only want the name of the cities where the universities are located, with the westernmost city listed first.","Santa Clara, CA, Boston, MA","Santa Clara, Boston"
572,"A paper about AI regulation that was originally submitted to arXiv.org in June 2022 shows a figure with three axes, where each axis has a label word at both ends. Which of these words is used to describe a type of society in a Physics and Society article submitted to arXiv.org on August 11, 2016?",Egalitarianism,egalitarian
739,"I'm making a grocery list for my mom, but she's a professor of botany and she's a real stickler when it comes to categorizing things. I need to add different foods to different categories on the grocery list, but if I make a mistake, she won't buy anything inserted in the wrong category. Here's the list I have so far:\n\nmilk, eggs, flour, whole bean coffee, Oreos, sweet potatoes, fresh basil, plums, green beans, rice, corn, bell pepper, whole allspice, acorns, broccoli, celery, zucchini, lettuce, peanuts\n\nI need to make headings for the fruits and vegetables. Could you please create a list of just the vegetables from my list? If you could do that, then I can figure out how to categorize the rest of the list into the appropriate categories. But remember that my mom is a real stickler, so make sure that no botanical fruits end up on the vegetable list, or she won't get them when she's at the store. Please alphabetize the list of vegetables, and place each item in a comma separated list.","bell pepper, broccoli, celery, corn, fresh basil, green beans, lettuce, sweet potatoes, zucchini","broccoli, celery, fresh basil, lettuce, sweet potatoes"
892,"I'm making a grocery list for my mom, but she's a professor of botany and she's a real stickler when it comes to categorizing things. I need to add different foods to different categories on the grocery list, but if I make a mistake, she won't buy anything inserted in the wrong category. Here's the list I have so far:\n\nmilk, eggs, flour, whole bean coffee, Oreos, sweet potatoes, fresh basil, plums, green beans, rice, corn, bell pepper, whole allspice, acorns, broccoli, celery, zucchini, lettuce, peanuts\n\nI need to make headings for the fruits and vegetables. Could you please create a list of just the vegetables from my list? If you could do that, then I can figure out how to categorize the rest of the list into the appropriate categories. But remember that my mom is a real stickler, so make sure that no botanical fruits end up on the vegetable list, or she won't get them when she's at the store. Please alphabetize the list of vegetables, and place each item in a comma separated list.","broccoli, celery, fresh basil, lettuce, sweet potatoes, zucchini","broccoli, celery, fresh basil, lettuce, sweet potatoes"
940,"Could you help me out with this assignment? Our professor sprung it on us at the end of class Friday, and I'm still trying to figure it out. The question he asked us was about an anagram. I've attached an audio recording of the question that he asked, so if you could please take a listen and give me the answer, I'd really appreciate the help. Please limit your response to the anagram text that could be generated from the original line which fulfills the professor's request, without any other commentary. Also, please don't include any punctuation in your response.",To be or not to be that is the question whether tis nobler in the mind to suffer the slings and arrows of outrageous fortune or to take arms against a sea of troubles and by opposing end them to die to sleep,To be or not to be that is the question whether tis nobler in the mind to suffer the slings and arrows of outrageous fortune
1222,"A paper about AI regulation that was originally submitted to arXiv.org in June 2022 shows a figure with three axes, where each axis has a label word at both ends. Which of these words is used to describe a type of society in a Physics and Society article submitted to arXiv.org on August 11, 2016?",Egalitarianism,egalitarian
1319,"According to the World Bank, which countries had gross savings of over 35% of GDP for every year in the period 2001-2010? Give your answer as a comma-separated list of countries in alphabetical order. Use the countries most common names in english when answering.","Brunei Darussalam, China, Morocco, Singapore","Brunei, China, Morocco, Singapore"


In [23]:
display("Average score:", sel_df.groupby("agent_name")[["is_correct"]].mean().round(3))
display(
    sel_df.groupby(["agent_name", "task"])[
        ["is_correct", "is_near_correct", "count_steps", "question"]
    ]
    .agg(
        {
            "is_correct": "mean",
            "is_near_correct": "mean",
            "count_steps": "mean",
            "question": "count",
        }
    )
    .rename(columns={"question": "count"})
)

'Average score:'

Unnamed: 0_level_0,is_correct
agent_name,Unnamed: 1_level_1
gpt-4o_03_sept_orchestrator,0.409
react_code_gpt-4o_02_sept_managedagent-withsummary_noplanning_nogrammar,0.333
react_code_gpt-4o_02_sept_managedagent-withsummary_planning3_nogrammar,0.353
react_code_gpt-4o_03_sept_managedagent-summary_planning,0.433
react_code_gpt4o_13_aug_managedagent_noplanning_nogrammar,0.366
react_code_gpt4o_23-june_planning2_newprompt5,0.442
react_code_gpt4o_31_aug_defaultplanning_nogrammar_normaltags,0.426
react_code_gpt4o_31_aug_defaultplanning_nogrammar_normaltags_nonavigationalsearch,0.29
react_code_gpt4o_31_aug_structuredplanning_nogrammar_htmltags,0.333
react_code_gpt4o_6_aug_planning_nogrammar,0.366


Unnamed: 0_level_0,Unnamed: 1_level_0,is_correct,is_near_correct,count_steps,count
agent_name,task,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
gpt-4o_03_sept_orchestrator,1,0.528302,0.528302,3.679245,53
gpt-4o_03_sept_orchestrator,2,0.435294,0.447059,3.658824,85
gpt-4o_03_sept_orchestrator,3,0.076923,0.076923,3.923077,26
react_code_gpt-4o_02_sept_managedagent-withsummary_noplanning_nogrammar,1,0.428571,0.44898,7.571429,49
react_code_gpt-4o_02_sept_managedagent-withsummary_noplanning_nogrammar,2,0.333333,0.333333,9.740741,81
react_code_gpt-4o_02_sept_managedagent-withsummary_noplanning_nogrammar,3,0.130435,0.130435,12.478261,23
react_code_gpt-4o_02_sept_managedagent-withsummary_planning3_nogrammar,1,0.533333,0.533333,8.333333,15
react_code_gpt-4o_02_sept_managedagent-withsummary_planning3_nogrammar,2,0.344828,0.344828,9.310345,29
react_code_gpt-4o_02_sept_managedagent-withsummary_planning3_nogrammar,3,0.0,0.0,10.857143,7
react_code_gpt-4o_03_sept_managedagent-summary_planning,1,0.566038,0.566038,7.471698,53


In [24]:
import plotly.express as px

cumulative_df = (
    (
        sel_df.groupby("agent_name")[["is_correct", "is_near_correct"]]
        .expanding(min_periods=1, axis=0, method="single")
        .agg({"is_correct": "mean", "is_near_correct": "count"})
        .reset_index()
    )
    .copy()
    .rename(columns={"is_near_correct": "index"})
)
cumulative_df["index"] = cumulative_df["index"].astype(int) - 1


def find_question(row):
    try:
        res = sel_df.loc[sel_df["agent_name"] == row["agent_name"], "question"].iloc[
            row["index"]
        ][:50]
        return res
    except Exception as e:
        return ""


cumulative_df["question"] = cumulative_df.apply(find_question, axis=1)
# cumulative_df["question"] = [el[:50] for el in sel_df["question"].values]

# cumulative_df["is_correct"] = cumulative_df["is_correct"] * (165 - 68) / 165

px.line(
    cumulative_df,
    color="agent_name",
    x="index",
    y="is_correct",
    hover_data="question",
)

# 3. Dive deeper into one run

In [26]:
sel_df = result_df.loc[result_df["agent_name"] == gpt4o_multiagent_summary]
print(len(sel_df))

NameError: name 'gpt4o_multiagent_summary' is not defined

### Count errors

In [20]:
import numpy as np

error_types = [
    "AgentParsingError",
    "AgentExecutionError",
    "AgentMaxIterationsError",
    "AgentGenerationError",
]
sel_df[error_types] = 0
sel_df["Count steps"] = np.nan


def count_errors(row):
    if isinstance(row["intermediate_steps"], list):
        row["Count steps"] = len(row["intermediate_steps"])
        for step in row["intermediate_steps"]:
            if isinstance(step, dict) and "error" in step:
                try:
                    row[str(step["error"]["error_type"])] += 1
                except:
                    pass
    return row


sel_df = sel_df.apply(count_errors, axis=1)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

In [21]:
import plotly.express as px


aggregate_errors = (
    sel_df.groupby(["is_correct"])[error_types + ["Count steps"]]
    .mean()
    .reset_index()
    .melt(id_vars=["is_correct"])
)

fig = px.bar(
    aggregate_errors,
    y="value",
    x="variable",
    color="is_correct",
    labels={
        "agent_name": "<b>LLM Engine</b>",
        "task": "<b>Level</b>",
        "aggregate_score": "<b>Performance</b>",
        "value": "<b>Average count</b>",
        "eval_score_GPT4": "<b>Score</b>",
    },
)
fig.update_layout(
    height=500,
    width=800,
    barmode="group",
    bargroupgap=0.0,
)
fig.update_traces(textposition="outside")
fig.write_image("figures/aggregate_errors.png", scale=3)
fig.show()

### Count tool calls

In [15]:
tools_calls = pd.DataFrame.from_records(sel_df["tool_calls"].values).fillna(0)

# Exclude the tools that were not used enough
tools_calls = tools_calls.loc[:, tools_calls.sum() > 10]

# Sort the columns by the sum of the values
tools_calls = tools_calls[tools_calls.sum().sort_values(ascending=False).index]
display(tools_calls)
sel_with_calls = pd.concat(
    [sel_df[["question", "is_correct", "task"]], tools_calls], axis=1
)
sel_with_calls = (
    sel_with_calls.drop("question", axis=1).groupby(["is_correct", "task"]).mean()
)
# sel_with_calls = sel_with_calls.melt(id_vars=['question', 'is_correct', 'task'], var_name="tool", value_name='count')

Unnamed: 0,print,ask_search_agent,final_answer,len,range,inspect_file_as_text,set,visualizer,parse_square,sum,...,max,join,generate_prefixes,sorted,get,lower,f,search_birthdate,items,abs
0,8.0,3.0,1.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5.0,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160,5.0,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
161,3.0,0.0,1.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162,7.0,2.0,1.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
163,20.0,8.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
sel_with_calls = sel_with_calls.reset_index().melt(
    id_vars=["is_correct", "task"], var_name="tool", value_name="average_count"
)

In [17]:
import plotly.express as px

fig = px.bar(
    sel_with_calls,
    x="tool",
    y="average_count",
    color="is_correct",
    facet_row="task",
    labels={
        "agent_name": "<b>Agent variant</b>",
        "task": "<b>Level</b>",
        "aggregate_score": "<b>Performance</b>",
        "eval_score_GPT4": "<b>Score</b>",
        "agent_type": "<b>Agent type</b>",
        "average_count": "<b>Average #calls per run</b>",
    },
)
fig.update_layout(
    barmode="group",
    height=800,
    width=1000,
    title="<b>" + "</b>",
)

### Inspect result by file extension type

In [18]:
display(
    sel_df.groupby(["agent_name", "attachment_type"])[
        ["is_correct", "count_steps", "question"]
    ].agg({"is_correct": "mean", "count_steps": "mean", "question": "count"})
)

Unnamed: 0_level_0,Unnamed: 1_level_0,is_correct,count_steps,question
agent_name,attachment_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
react_code_gpt4o_23-june_planning2_newprompt5,,0.440945,9.19685,127
react_code_gpt4o_23-june_planning2_newprompt5,csv,0.0,7.0,1
react_code_gpt4o_23-june_planning2_newprompt5,docx,0.0,9.0,1
react_code_gpt4o_23-june_planning2_newprompt5,jpg,0.0,9.5,2
react_code_gpt4o_23-june_planning2_newprompt5,jsonld,0.0,16.0,1
react_code_gpt4o_23-june_planning2_newprompt5,mp3,1.0,8.333333,3
react_code_gpt4o_23-june_planning2_newprompt5,pdb,0.0,7.0,1
react_code_gpt4o_23-june_planning2_newprompt5,pdf,0.333333,5.666667,3
react_code_gpt4o_23-june_planning2_newprompt5,png,0.125,6.75,8
react_code_gpt4o_23-june_planning2_newprompt5,pptx,1.0,4.0,1


In [44]:
# Inspect specific file types
# sel_df.loc[
#     sel_df["attachment_type"].isin(["pdb", "docx", "csv"]),
#     [
#         "attachment_type",
#         "question",
#         "prediction",
#         "true_answer",
#         "is_correct",
#         "thoughts",
#     ],
# ]

# 4. Ensembling methods

### 4.1 Simple retry mechanism

In [54]:
first_run_gpt4 = result_df.loc[result_df["agent_name"] == gpt4o].copy()
second_run_gpt4 = result_df.loc[result_df["agent_name"] == noanchorplan].copy()


def replace_answer_if_incomplete(row, result_df_replacement):
    try:
        if (
            "Unable to determine" in row["intermediate_steps"]
            or "AgentMaxIterationsError" in str(row["intermediate_steps"])
            # or "AgentExecutionError" in str(row["intermediate_steps"])
            # or "AgentGenerationError" in str(row["intermediate_steps"])
            or "Error in generating final llm output" in str(row["intermediate_steps"])
        ):
            matching_answer = result_df_replacement.loc[
                (result_df_replacement["question"] == row["question"]), "prediction"
            ].values[0]
            print("replaced")
            gold_answer = matching_answer
        else:
            gold_answer = row["prediction"]
    except:
        gold_answer = row["prediction"]
    return gold_answer


combined_gpt4 = first_run_gpt4.copy()
combined_gpt4["prediction"] = combined_gpt4.apply(
    lambda x: replace_answer_if_incomplete(x, second_run_gpt4), axis=1
)

combined_gpt4["is_correct"] = combined_gpt4.apply(
    lambda x: question_scorer(x["prediction"], x["true_answer"]), axis=1
)

replaced
replaced
replaced
replaced
replaced
replaced
replaced
replaced
replaced
replaced
replaced
replaced
replaced
replaced
replaced
replaced
replaced
replaced
replaced
replaced


In [55]:
print("First run:")
print(first_run_gpt4.groupby(["task"])["is_correct"].mean())
print(first_run_gpt4["is_correct"].mean())

print("Second run:")
print(second_run_gpt4.groupby(["task"])["is_correct"].mean())
print(second_run_gpt4["is_correct"].mean())

print("Combined run:")
print(combined_gpt4.groupby(["task"])["is_correct"].mean())
print(combined_gpt4["is_correct"].mean())

First run:
task
1    0.566038
2    0.418605
3    0.200000
Name: is_correct, dtype: float64
0.4329268292682927
Second run:
task
1    0.528302
2    0.372093
3    0.200000
Name: is_correct, dtype: float64
0.39634146341463417
Combined run:
task
1    0.566038
2    0.395349
3    0.160000
Name: is_correct, dtype: float64
0.4146341463414634


### 4.2 Ideal ensembling

In [87]:
third_run = result_df.loc[result_df["agent_name"] == noanchorplan].copy()
INCLUDE_THIRD_RUN = False


# test ideal ensembling
def score_best_both(row, result_df_replacement):
    try:
        if row["is_correct"]:
            return True

        else:
            matching_answer = result_df_replacement.loc[
                (result_df_replacement["question"] == row["question"])
            ].iloc[0]
            if matching_answer["is_correct"]:
                return True
            else:
                return False
    except:
        return row["is_correct"]


combined_gpt4 = first_run_gpt4.copy()
combined_gpt4["is_correct"] = combined_gpt4.apply(
    lambda x: score_best_both(x, second_run_gpt4), axis=1
)
if INCLUDE_THIRD_RUN:
    combined_gpt4["is_correct"] = combined_gpt4.apply(
        lambda x: score_best_both(x, third_run), axis=1
    )
print("Ideal combined run:")
print(combined_gpt4.groupby(["task"])["is_correct"].mean())
print(combined_gpt4["is_correct"].mean())

Ideal combined run:
task
1    0.641509
2    0.465116
3    0.240000
Name: is_correct, dtype: float64
0.4878048780487805
