In [1]:
!pip install -q langchain sentencepiece plotly minijinja pyautogen markdownify pathvalidate puremagic mammoth python-pptx torchaudio pandas datasets google-search-results pydub easyocr pdfminer.six youtube_transcript_api openpyxl

In [2]:
# (Re)load the autoreload extension and select mode 2, so that edited modules
# (such as the local `scripts` package imported below) are reloaded before each cell runs.
%reload_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
from dotenv import load_dotenv
import datasets

# Load variables from a local .env file; override=True lets them replace values already set in the environment.
load_dotenv(override=True)
# Do not truncate long cell contents when DataFrames are displayed.
pd.set_option("max_colwidth", None)

# Folder holding the run outputs: passed to answer_questions and globbed in the evaluation section.
OUTPUT_DIR = "output_gaia"

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from huggingface_hub import login
import os

# Log in to the Hugging Face Hub with the token read from the HUGGINGFACEHUB_API_TOKEN environment variable.
login(os.getenv("HUGGINGFACEHUB_API_TOKEN"))

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /Users/aymeric/.cache/huggingface/token
Login successful


In [5]:
from scripts.web_surfer import (
    browser,
    SearchInformationTool,
    NavigationalSearchTool,
    VisitTool,
    DownloadTool,
    PageUpTool,
    PageDownTool,
    FinderTool,
    FindNextTool,
)

# One instance of each browsing tool imported from scripts.web_surfer;
# this list is handed to the web-surfing agent (`surfer_agent`) as its toolbox.
WEB_TOOLS = [
    SearchInformationTool(),
    NavigationalSearchTool(),
    VisitTool(),
    DownloadTool(),
    PageUpTool(),
    PageDownTool(),
    FinderTool(),
    FindNextTool(),
]

In [6]:
from openai import OpenAI
from transformers.agents.llm_engine import MessageRole, get_clean_message_list


# Role remapping passed to get_clean_message_list in OpenAIModel.__call__:
# tool-response messages are re-labelled with the user role.
role_conversions = {
    MessageRole.TOOL_RESPONSE: MessageRole.USER,
}


class OpenAIModel:
    """Callable wrapper around the OpenAI chat-completions client.

    An instance is called with a list of messages and returns the text
    content of the first completion choice.
    """

    def __init__(self, model_name="gpt-4o-2024-05-13"):
        """Create the client.

        Args:
            model_name: Name of the OpenAI chat model to query.

        The API key is read from the OPENAI_API_KEY environment variable.
        """
        self.model_name = model_name
        self.client = OpenAI(
            api_key=os.getenv("OPENAI_API_KEY"),
        )

    def __call__(self, messages, stop_sequences=None):
        """Send `messages` to the chat model and return the reply text.

        Args:
            messages: Conversation messages; they are normalised with
                get_clean_message_list using the module-level `role_conversions`.
            stop_sequences: Optional list of stop strings. Defaults to an
                empty list.

        Returns:
            The `content` of the first choice in the response.
        """
        # Build the default per call instead of sharing one mutable list
        # object across every invocation (mutable-default-argument pitfall).
        if stop_sequences is None:
            stop_sequences = []

        messages = get_clean_message_list(messages, role_conversions=role_conversions)

        response = self.client.chat.completions.create(
            model=self.model_name,
            messages=messages,
            stop=stop_sequences,
        )
        return response.choices[0].message.content


# Shared OpenAI engine (default model); later assigned to `websurfer_llm_engine`.
oai_llm_engine = OpenAIModel()

In [7]:
from transformers.agents import HfEngine

# Hugging Face engine for the task-solving agent.
# NOTE: the same assignment is repeated in the cell that builds `react_agent`.
hf_llm_engine = HfEngine(model="meta-llama/Meta-Llama-3-70B-Instruct")

In [8]:
# Load the validation split of the GAIA "2023_all" configuration and rename
# its columns to the names used throughout the rest of this notebook.
eval_ds = datasets.load_dataset("gaia-benchmark/GAIA", "2023_all")["validation"]
eval_ds = eval_ds.rename_columns(
    {"Question": "question", "Final answer": "true_answer", "Level": "task"}
)


def preprocess_file_paths(row):
    """Prefix a non-empty attachment name with the local validation folder.

    The row is updated in place and returned; rows whose "file_name" is
    empty are returned untouched.
    """
    attachment = row["file_name"]
    if len(attachment) == 0:
        return row
    row["file_name"] = "data/gaia/validation/" + attachment
    return row


# Rewrite attachment names into local paths, row by row.
eval_ds = eval_ds.map(preprocess_file_paths)

# pandas copy of the dataset; used later to look up the attachment of a question.
eval_df = pd.DataFrame(eval_ds)

In [9]:
pd.Series(eval_ds["task"]).value_counts()

2    86
1    53
3    26
Name: count, dtype: int64

### Define tools

In [10]:
from transformers.agents import ReactJsonAgent, HfEngine

# NOTE: this first engine is overwritten a few lines below, so the agent
# actually runs on the OpenAI engine; the HfEngine instance is discarded.
websurfer_llm_engine = HfEngine(
    model="CohereForAI/c4ai-command-r-plus"
)  # chosen for its high context length

# Replace with OAI
websurfer_llm_engine = oai_llm_engine


# Web-browsing agent: uses the tools in WEB_TOOLS and is invoked by SearchTool.forward.
surfer_agent = ReactJsonAgent(
    llm_engine=websurfer_llm_engine,
    tools=WEB_TOOLS,
    max_iterations=12,
    verbose=1,
)

In [11]:
def _has_no_infra_error(steps):
    """Return True when none of the known failure messages appears in `steps`."""
    dump = str(steps)
    failure_markers = (
        "Chat template must be a string",
        "Model is overloaded.",
        "in 'if/else' conditions",
        # "Stream has ended unexpectedly",
        "Payload Too Large",
        "Cannot read an empty file",
    )
    return not any(marker in dump for marker in failure_markers)


# Load one earlier run and report its size before and after dropping the
# rows whose intermediate steps contain one of the failure messages.
cleaned_df = pd.read_json(
    "output_gaia/react_code_llama3-70b_02-05_full-gaia-validation-code.jsonl",
    lines=True,
)
print(len(cleaned_df))
cleaned_df = cleaned_df.loc[cleaned_df["intermediate_steps"].apply(_has_no_infra_error)]
print(len(cleaned_df))

# cleaned_df.to_json(
#     "output_gaia/react_code_llama3-70b_02-05_full-gaia-validation-code.jsonl",
#     orient="records",
#     lines=True,
# )

165
165


In [12]:
from transformers.agents.default_tools import Tool
from pypdf import PdfReader
from markdownify import markdownify as md
from scripts.visual_qa import VisualQATool, VisualQAGPT4Tool
from transformers.agents import SpeechToTextTool
import shutil

# NOTE(review): this dict is not referenced anywhere else in the visible notebook;
# it looks like search-API parameters (engine / country / language) — confirm before removing.
params = {
    "engine": "bing",
    "gl": "us",
    "hl": "en",
}


class SearchTool(Tool):
    """Tool that forwards a natural-language query to the module-level `surfer_agent`."""

    # `name`, `description`, `inputs` and `output_type` are the tool metadata
    # declared on the Tool subclass; the strings are kept exactly as written.
    name = "ask_search_agent"
    description = "A search agent that will browse the internet to answer a query. Use it to gather informations, not for problem-solving."

    inputs = {
        "query": {
            "description": "Your query, as a natural language sentence. You are talking to an human, so provide them with as much context as possible!",
            "type": "text",
        }
    }
    output_type = "text"

    def forward(self, query: str) -> str:
        """Run the web-surfing agent on `query` and return whatever it returns."""
        return surfer_agent.run(query)


def extract_text_from_pdf(pdf_path):
    """Concatenate the text of every page of a PDF and run it through markdownify.

    Args:
        pdf_path: Location of the PDF, handed directly to PdfReader.

    Returns:
        The result of `md(...)` applied to the joined page texts.
    """
    reader = PdfReader(pdf_path)
    page_texts = (page.extract_text() for page in reader.pages)
    return md("".join(page_texts))


from scripts.mdconvert import MarkdownConverter


class ZipInspectorTool(Tool):
    """Tool that unpacks an archive into a sibling directory and lists the extracted files."""

    name = "extract_inspect_zip_folder"
    description = "Use this to extract and inspect the contents of a zip folder."
    inputs = {
        "folder": {
            "description": "The path to the zip folder you want to inspect.",
            "type": "text",
        }
    }
    output_type = "text"

    def forward(self, folder: str) -> str:
        """Extract `folder` and return a text listing of the files it contained.

        Args:
            folder: Path to the archive to unpack.

        Returns:
            A message followed by one "- <path>" line per extracted file.

        Raises:
            Whatever os.makedirs / shutil.unpack_archive raise (missing file,
            unsupported archive format, ...).
        """
        suffix = ".zip"
        if folder.endswith(suffix):
            # Strip only the trailing extension: str.replace would also remove
            # ".zip" from directory names earlier in the path.
            folder_name = folder[: -len(suffix)]
        else:
            # Without a ".zip" suffix the target would be the archive path
            # itself, and makedirs would fail on the existing file.
            folder_name = folder + "_extracted"
        os.makedirs(folder_name, exist_ok=True)
        shutil.unpack_archive(folder, folder_name)

        # List every extracted file, walking sub-directories as well.
        result = "We extracted all files from the zip into a directory: find the extracted files at the following paths:\n"
        for root, _dirs, files in os.walk(folder_name):
            for file in files:
                result += f"- {os.path.join(root, file)}\n"

        return result


class TextInspectorTool(Tool):
    """Tool that converts a file to text and asks the LLM engine a question about it."""

    name = "inspect_file_as_text"
    description = "You cannot load files yourself: instead call this tool to read a file as markdown text and ask questions about it."

    inputs = {
        "question": {
            "description": "Your question, as a natural language sentence. Provide as much context as possible.",
            "type": "text",
        },
        "file_path": {
            "description": "The path to the file you want to read as text. Must be a '.something' file, like '.pdf'. If it is an image, use the visualizer tool instead! DO NOT USE THIS TOOL FOR A WEBPAGE: use the search tool instead!",
            "type": "text",
        },
    }
    output_type = "text"
    # Single converter instance, created when the class body is executed.
    md_converter = MarkdownConverter()

    def forward(self, question: str, file_path) -> str:
        """Convert `file_path` to text, then caption it and answer `question`.

        For paths containing ".zip" the converted text is returned directly,
        without calling the LLM engine. Otherwise three user messages are sent
        to `websurfer_llm_engine`: the instruction, the file content (title
        plus at most the first 70000 characters), and the instruction again.
        """
        converted = self.md_converter.convert(file_path)

        if ".zip" in file_path:
            return converted.text_content

        opening = (
            "You will have to write a short caption for this file, then answer this question:"
            + question
        )
        document = (
            "Here is the complete file:\n### "
            + str(converted.title)
            + "\n\n"
            + converted.text_content[:70000]
        )
        closing = (
            "Now write a short caption for the file, then answer this question:"
            + question
        )

        messages = [
            {"role": "user", "content": text}
            for text in (opening, document, closing)
        ]
        return websurfer_llm_engine(messages)


# Stand-alone tool instances for manual testing in the cells below.
visualizer = VisualQATool()
visualizer_gpt4v = VisualQAGPT4Tool()
stt_tool = SpeechToTextTool()
titool = TextInspectorTool()
zip_tool = ZipInspectorTool()

# Let's test our tools!
# Sample image from the local GAIA validation folder, used by the commented-out checks below.
image_path = "./data/gaia/validation/32102e3e-d12a-4209-9163-7b3a104efe5d.png"
# print(visualizer(question="What is the oldest Blu-Ray sold?", image_path=image_path))
# print(
#     visualizer_gpt4v(question="What is the oldest Blu-Ray sold?", image_path=image_path)
# )
# print(
#     titool(
#         question="Could you cite the content at the beginning of page 4 of this document?",
#         file_path="https://arxiv.org/pdf/1608.06411",
#     )
# )

# print(
#     titool(
#         question="Could you tell what's happening at the end of this video? Descibre in detail",
#         file_path="https://www.youtube.com/watch?v=PsIwRGYMjTA",
#     )
# )

# print(
#     titool(
#         question="What's happening in there?",
#         file_path="data/gaia/validation/7dd30055-0198-452e-8c25-f73dbe27dcb8.pdb",
#     )
# )

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
# Manual check of the zip tool on one validation attachment: prints the list of extracted paths.
file_path = "data/gaia/validation/9b54f9d9-35ee-4a14-b62f-d130ea00317f.zip"
content = zip_tool(folder=file_path)
print(content)

We extracted all files from the zip into a directory: find the extracted files at the following paths:
- data/gaia/validation/9b54f9d9-35ee-4a14-b62f-d130ea00317f/food_duplicates.xls
- data/gaia/validation/9b54f9d9-35ee-4a14-b62f-d130ea00317f/CATEGORIES.xml



In [14]:
from transformers.agents import PythonInterpreterTool

# Tools given to the task-solving agent (`react_agent`). Fresh instances are
# created here; the stand-alone instances defined earlier are not reused.
TASK_SOLVING_TOOLBOX = [
    SearchTool(),
    VisualQAGPT4Tool(),  # VisualQATool(),
    SpeechToTextTool(),
    TextInspectorTool(),
    ZipInspectorTool(),
    # PythonInterpreterTool(),
]

# Define agents

In [15]:
from transformers.agents import ReactCodeAgent, HfEngine
from transformers.agents.prompts import DEFAULT_REACT_CODE_SYSTEM_PROMPT

# System prompt: the default ReAct code prompt followed by answer-formatting rules for GAIA.
GAIA_PROMPT = (
    DEFAULT_REACT_CODE_SYSTEM_PROMPT
    + """
Remember: Your $FINAL_ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
ADDITIONALLY, your $FINAL_ANSWER MUST adhere to any formatting instructions specified in the original question (e.g., alphabetization, sequencing, units, rounding, decimal places, etc.)
If you are asked for a number, express it numerically (i.e., with digits rather than words), don't use commas, and don't include units such as $ or percent signs unless specified otherwise.
If you are asked for a string, don't use articles or abbreviations (e.g. for cities), unless specified otherwise. Don't output any final sentence punctuation such as '.', '!', or '?'.
If you are asked for a comma separated list, apply the above rules depending on whether the elements are numbers or strings.
If you are unable to determine the final answer, use 'final_answer("Unable to determine")'
Never try to guess file paths for files that do not exist.
"""
)

# NOTE: same engine definition as in the earlier HfEngine cell; repeated here.
hf_llm_engine = HfEngine(model="meta-llama/Meta-Llama-3-70B-Instruct")

# Task-solving agent: writes Python code, may call the tools in TASK_SOLVING_TOOLBOX,
# and is allowed to import the extra modules listed below.
react_agent = ReactCodeAgent(
    llm_engine=hf_llm_engine,
    tools=TASK_SOLVING_TOOLBOX,
    max_iterations=10,
    verbose=0,
    memory_verbose=True,
    system_prompt=GAIA_PROMPT,
    additional_authorized_imports=["requests", "zipfile", "os", "pandas"],
)

In [16]:
async def call_transformers(agent, question: str, **kwargs) -> dict:
    """Run `agent` on `question` and package the outcome for logging.

    Args:
        agent: Object exposing `run(question, **kwargs)` and, afterwards, a
            `logs` iterable of dict-like step records.
        question: The task given to the agent.
        **kwargs: Forwarded unchanged to `agent.run`.

    Returns:
        A dict with two keys:
        - "output": the agent's result converted with `str`.
        - "intermediate_steps": one dict per entry of `agent.logs`, with the
          "agent_memory" key removed.
    """
    result = agent.run(question, **kwargs)
    output = str(result)
    # Drop the "agent_memory" entry of every step before returning the logs.
    steps = [
        {key: value for key, value in log.items() if key != "agent_memory"}
        for log in agent.logs
    ]
    return {"output": output, "intermediate_steps": steps}

In [17]:
# Stop here when running the notebook
assert False

AssertionError: 

In [None]:
res = await call_transformers(
    react_agent,
    "What is the age of the current prime minister of Guatemala, raised to the power 0.36?",
)
res

[37;1mWhat is the age of the current prime minister of Guatemala, raised to the power 0.36?[0m
[33;1m==== Agent is executing the code below:[0m
[0m[38;5;60;03m# Let's find the current prime minister or head of government of Guatemala and their age.[39;00m
[38;5;7mcurrent_head_of_guatemala[39m[38;5;7m [39m[38;5;109;01m=[39;00m[38;5;7m [39m[38;5;7mask_search_agent[39m[38;5;7m([39m[38;5;7mquery[39m[38;5;109;01m=[39;00m[38;5;144m"[39m[38;5;144mcurrent prime minister of Guatemala age[39m[38;5;144m"[39m[38;5;7m)[39m
[38;5;109mprint[39m[38;5;7m([39m[38;5;7mcurrent_head_of_guatemala[39m[38;5;7m)[39m[0m
[33;1m====[0m
[37;1mcurrent prime minister of Guatemala age[0m
[33;1mCalling tool: 'informational_web_search' with arguments: {'query': 'current Prime Minister of Guatemala 2023'}[0m
[33;1mCalling tool: 'informational_web_search' with arguments: {'query': 'Bernardo Arévalo birthdate'}[0m
[33;1mCalling tool: 'final_answer' with arguments: {'answer'

{'output': '4.4941628807880765',
 'intermediate_steps': [{'system_prompt': 'You will be given a task to solve as best you can.\nTo do so, you have been given access to *tools*: these tools are basically Python functions which you can call with code.\nTo solve the task, you must plan forward to proceed in a series of steps, in a cycle of \'Thought:\', \'Code:\', and \'Observation:\' sequences.\n\nAt each step, in the \'Thought:\' sequence, you should first explain your reasoning towards solving the task and the tools that you want to use.\nThen in the \'Code:\' sequence, you should write the code in simple Python. The code sequence must end with \'<end_action>\' sequence.\nDuring each intermediate step, you can use \'print()\' to save whatever important information you will then need.\nThese print outputs will then appear in the \'Observation:\' field, which will be available as input for the next step.\nIn the end you have to return a final answer using the `final_answer` tool.\n\nHere

In [None]:
assert False

In [None]:
from scripts.run_agents import answer_questions

# Run the agent over the whole evaluation set. The third argument is the run
# name; outputs go to OUTPUT_DIR and each question is answered through call_transformers.
# NOTE(review): judging by the captured output, answer_questions appears to resume
# from an existing results file when one is found — confirm in scripts.run_agents.
results = await answer_questions(
    eval_ds,
    react_agent,
    "react_code_llama3_30-may_with_gpt4o_vision",
    output_folder=OUTPUT_DIR,
    agent_call_function=call_transformers,
    add_optional_visualizer_tool=True,
)

  results = pd.read_json(output_path, lines=True).to_dict(orient="records")


Expected object or value
Found no usable records! 🤔 Starting new.


[37;1mA paper about AI regulation that was originally submitted to arXiv.org in June 2022 shows a figure with three axes, where each axis has a label word at both ends. Which of these words is used to describe a type of society in a Physics and Society article submitted to arXiv.org on August 11, 2016?[0m
[33;1m==== Agent is executing the code below:[0m
[0m[38;5;7mai_paper[39m[38;5;7m [39m[38;5;109;01m=[39;00m[38;5;7m [39m[38;5;7mask_search_agent[39m[38;5;7m([39m[38;5;7mquery[39m[38;5;109;01m=[39;00m[38;5;144m"[39m[38;5;144mAI regulation paper submitted to arXiv.org in June 2022[39m[38;5;144m"[39m[38;5;7m)[39m
[38;5;109mprint[39m[38;5;7m([39m[38;5;144m"[39m[38;5;144mAI Paper:[39m[38;5;144m"[39m[38;5;7m,[39m[38;5;7m [39m[38;5;7mai_paper[39m[38;5;7m)[39m[0m
[33;1m====[0m
[37;1mAI regulation paper submitted to arXiv.org in June 2022[0m
[33;1mCalling tool: 'informational_web_search' with arguments: {'query': 'AI regulation paper submitte

# Evaluate

In [155]:
import glob

# The aggregated file is written into OUTPUT_DIR itself, hence the exclusion in the glob filter below.
answer_file_path = f"{OUTPUT_DIR}/answers.jsonl"

# Stack every run file found in OUTPUT_DIR. concat is called without
# ignore_index, so each file keeps its own row labels and the combined
# index is not unique.
result_df = pd.concat(
    [
        pd.read_json(f, lines=True)
        for f in glob.glob(f"{OUTPUT_DIR}/*.jsonl")
        if "answers.jsonl" not in f
    ]
)
# Timing columns are dropped before the aggregate is saved.
result_df = result_df.drop(columns=["start_time", "end_time"])
result_df.to_json(answer_file_path, lines=True, orient="records")

In [156]:
# display(result_df.loc[result_df["agent_name"] == "react_chat_zephyr"].head(5))

# display(result_df.loc[result_df["agent_name"] == "react_code_llama3-70b_29-04_reallybetterprompt", ["question", "prediction", "true_answer", "is_correct", "agent_memory"]])

In [157]:
from scripts.gaia_scorer import question_scorer
import re
from collections import Counter

# Score every prediction against its reference answer with the project's question_scorer.
result_df["is_correct"] = result_df.apply(
    lambda x: question_scorer(x["prediction"], x["true_answer"]), axis=1
)

# Number of recorded intermediate steps per question.
result_df["count_steps"] = result_df["intermediate_steps"].apply(len)


def find_attachment(question):
    """Return the text after the last "." of the attachment tied to `question`.

    The lookup is done in the module-level `eval_df` by exact question match.
    None is returned when no row matches or when the stored file name is not
    a string.
    """
    candidates = eval_df.loc[eval_df["question"] == question, "file_name"]
    if len(candidates) == 0:
        return None

    first_match = candidates.values[0]
    if not isinstance(first_match, str):
        return None
    return first_match.rsplit(".", 1)[-1]


result_df["attachment_type"] = result_df["question"].apply(find_attachment)


def extract_tool_calls(code):
    """Count the all-lowercase identifiers that are directly followed by "(" in `code`.

    Returns a Counter mapping each such identifier to its number of occurrences.
    """
    call_pattern = re.compile(r"\b(\w+)\(")
    return Counter(
        name for name in call_pattern.findall(code) if name.islower()
    )


def sum_tool_calls(steps):
    """Add up the call counts found in the "llm_output" of every step.

    Steps without an "llm_output" key are ignored. Returns a Counter.
    """
    tally = Counter()
    llm_outputs = (step["llm_output"] for step in steps if "llm_output" in step)
    for text in llm_outputs:
        tally += extract_tool_calls(text)
    return tally


result_df["tool_calls"] = result_df["intermediate_steps"].apply(sum_tool_calls)

Evaluating Failed by reaching max iterations. as a string.
Evaluating Let's try to answer the question manually.

According to Wikipedia, the Moon's perigee (closest approach) is approximately 356400 kilometers.

Eliud Kipchoge's record-making marathon pace is approximately 2.82 meters per second (source: Wikipedia).

Let's calculate the time it would take him to run the distance between the Earth and the Moon at its closest approach:

distance_to_moon = 356400 km = 356400000 m

speed = 2.82 m/s

time = distance / speed
= 356400000 m / (2.82 m/s)
= 126341818 s

 Convert seconds to hours:
= 126341818 s / 3600
= 35100 hours

Rounded to the nearest 1000 hours:
= 35100 hours ≈ 35000 hours

So, it would take Eliud Kipchoge approximately 35000 hours to run the distance between the Earth and the Moon at its closest approach, if he could maintain his record-making marathon pace indefinitely. as a number.
String Let's try to answer the question manually.

According to Wikipedia the Moon's perig


Answer lists have different lengths, returning False.



In [158]:
def get_elements(x):
    """Flatten a run's intermediate steps into one readable transcript.

    Args:
        x: Sequence of step dicts. The first one must hold the "task"; each
            following one is expected to hold "llm_output" plus either
            "observation" or "error".

    Returns:
        The task text followed, for each usable step, by the LLM output and
        its observation or error. Steps that lack the expected keys (or hold
        non-string values) are skipped, as before.
    """
    parts = [x[0]["task"]]
    for step in x[1:]:
        try:
            if "observation" in step:
                parts.append(
                    step["llm_output"] + "\nObservation:" + step["observation"]
                )
            else:
                # "\Error:" was an invalid escape that produced a literal
                # backslash; a newline separator mirrors the observation branch.
                parts.append(step["llm_output"] + "\nError:" + str(step["error"]))
        except (KeyError, TypeError):
            # Best-effort: a malformed step is left out of the transcript
            # rather than aborting the whole run's summary.
            continue
    return "".join(parts)


result_df["thoughts"] = result_df["intermediate_steps"].apply(lambda x: get_elements(x))

In [159]:
result_df["agent_name"].value_counts()

agent_name
react_code_llama3-70b_29-04_full-gaia-test                             474
react_code_llama3-70b_02-05_full-gaia-validation-code                  165
react_code_llama3-70b_06-05_full-gaia-validation-idefics               165
react_code_gpt4o_28-may_full-gaia-validation                           164
react_code_llama3-70b_29-04_full-gaia-validation-cleaned               155
react_code_llama3-70b_06-05_full-gaia-validation-idefics-gaiaprompt    154
react_code_llama3-70b_29-04_full-gaia-validation-3                     100
react_code_llama3-70b_29-04_full-gaia-validation-cleaned-2              82
react_code_gpt4o_29-may_full-gaia-validation                            81
react_code_llama3-70b_29-04_full-gaia-validation                        71
react_code_llama3_26-04-final_answer                                    60
react_code_llama3-70b_28-may_full-gaia-validation                       60
react_code_llama3_24-04-betterinterpreter                               60
react_code_lla

# Inspect one specific run

In [160]:
# Run to inspect in the cells below.
selection_name = "react_code_gpt4o_29-may_full-gaia-validation"
# NOTE(review): not referenced again in the visible part of the notebook.
submission_selection_name = "react_code_llama3-70b_02-05_full-gaia-validation-code"
# NOTE: this overwrites result_df with the selected run only; later cells that
# filter on a different agent name operate on this reduced frame.
result_df = result_df.loc[
    result_df["agent_name"].isin(
        [
            selection_name,
        ]
    )
]
# Number of questions per level in the selected run.
result_df["task"].value_counts()

task
2    44
1    24
3    13
Name: count, dtype: int64

In [164]:
# One row per question and one column per called function name; a function
# that a question never called counts as 0.
tools_calls = pd.DataFrame.from_records(result_df["tool_calls"].values).fillna(0)

# Exclude the tools that were not used enough
tools_calls = tools_calls.loc[:, tools_calls.sum() > 10]

# Sort the columns by the sum of the values
tools_calls = tools_calls[tools_calls.sum().sort_values(ascending=False).index]

# tools_calls was built from raw values and carries a fresh 0..n-1 index, while
# result_df keeps the labels it had before filtering. concat(axis=1) aligns on
# labels, so the result_df slice is re-indexed positionally to pair the rows.
result_with_calls = pd.concat(
    [
        result_df[["question", "is_correct", "task"]].reset_index(drop=True),
        tools_calls,
    ],
    axis=1,
)
# Average number of calls per tool for each (is_correct, task) group.
result_with_calls = (
    result_with_calls.drop("question", axis=1).groupby(["is_correct", "task"]).mean()
)
# result_with_calls = result_with_calls.melt(id_vars=['question', 'is_correct', 'task'], var_name="tool", value_name='count')

In [165]:
# Reshape to long format: one row per (is_correct, task, tool) with its average call count.
# NOTE: this reassigns result_with_calls, so the cell cannot be re-run without re-running the previous one.
result_with_calls = result_with_calls.reset_index().melt(
    id_vars=["is_correct", "task"], var_name="tool", value_name="average_count"
)

In [166]:
import plotly.express as px

# Bar chart of the average number of calls per tool, coloured by correctness,
# with one facet row per level.
fig = px.bar(
    result_with_calls,
    x="tool",
    y="average_count",
    color="is_correct",
    facet_row="task",
    # Of these label keys, only "task" and "average_count" name columns plotted here.
    labels={
        "agent_name": "<b>Agent variant</b>",
        "task": "<b>Level</b>",
        "aggregate_score": "<b>Performance</b>",
        "eval_score_GPT4": "<b>Score</b>",
        "agent_type": "<b>Agent type</b>",
        "average_count": "<b>Average count of calls</b>",
    },
)
# NOTE: the title expression evaluates to an empty bold tag, i.e. the figure has no visible title.
fig.update_layout(
    barmode="group",
    height=800,
    title="<b>" + "</b>",
)

In [167]:
print(result_df.groupby(["agent_name", "task"])[["is_correct", "count_steps"]].mean())
print(result_df["is_correct"].mean())

                                                   is_correct  count_steps
agent_name                                   task                         
react_code_gpt4o_29-may_full-gaia-validation 1       0.333333     4.875000
                                             2       0.272727     5.363636
                                             3       0.000000     7.307692
0.24691358024691357


# Test naive ensembling method

In [None]:
# def replace_answer_if_incomplete(row, result_df_replacement):
#     if row["prediction"] == "Unable to determine" or "MaxIterationsError" in str(row['intermediate_steps']):
#         matching_answer = result_df_replacement.loc[
#             (result_df_replacement["question"] == row["question"]), "prediction"
#         ].values[0]
#         print('replaced')
#         gold_answer = matching_answer
#     else:
#         gold_answer = row["prediction"]
#     return gold_answer

# result_df["gold_answer"] = result_df.apply(lambda x: replace_answer_if_incomplete(x, result_df_replacement), axis=1)

In [None]:
result_df.groupby(["agent_name"])["is_correct"].mean()

agent_name
react_code_gpt4o_28-may_full-gaia-validation    0.29697
Name: is_correct, dtype: float64

In [None]:
# Rows of one specific run, for log inspection.
# NOTE(review): if the run-selection cell above was executed, result_df only holds
# rows of `selection_name`, so this filter on another agent name yields an empty frame.
df_inspect = result_df.loc[
    result_df["agent_name"] == "react_code_llama3_26-04-final_answer"
]

In [None]:
logs = list(df_inspect["intermediate_steps"].values)
# The memory snapshot is read from the second-to-last step of each run. Runs
# with fewer than two steps have no such step and are skipped instead of
# raising IndexError.
memories = [
    log[-2]["agent_memory"]
    for log in logs
    if len(log) >= 2 and "agent_memory" in log[-2]
]

# Keep only the memories whose text mentions "error" (case-insensitive).
error_memories = [memory for memory in memories if "error" in str(memory).lower()]
error_memories

In [None]:
result_df.keys()