In [4]:
!pip install -q langchain numexpr sentencepiece plotly


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [1]:
# (Re)load the autoreload extension and use mode 2, which reloads modules before
# each cell is executed (presumably for the local `scripts` package imported below).
%reload_ext autoreload
%autoreload 2

In [2]:
from langchain_community.llms import HuggingFaceEndpoint
from langchain.chat_models import ChatHuggingFace
import pandas as pd
from dotenv import load_dotenv
from scripts.evaluation import load_benchmark
import datasets
from huggingface_hub import login
import os

# Load variables from a .env file; override=True lets them replace values that are
# already set in the process environment.
load_dotenv(override=True)
# Show full cell contents when DataFrames are displayed (no column-width truncation).
pd.set_option("max_colwidth", None)

In [3]:
# Authenticate with the Hugging Face Hub using the token read from the environment.
# NOTE(review): os.getenv returns None when HUGGINGFACEHUB_API_TOKEN is unset —
# confirm how login() should behave in that case (fail fast vs. interactive prompt).
login(os.getenv("HUGGINGFACEHUB_API_TOKEN"))

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /Users/aymeric/.cache/huggingface/token
Login successful


In [4]:
from langchain.chat_models import ChatOpenAI
from transformers.agents import HfEngine
from transformers.agents.llm_engine import (
    get_clean_message_list,
    llama_role_conversions,
)

# Hugging Face Hub repository ids of the candidate open models.
command_r_id = "CohereForAI/c4ai-command-r-plus"
zephyr_id = "HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1"
mixtral_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
llama3_8b_id = "meta-llama/Meta-Llama-3-8B-Instruct"
llama3_70b_id = "meta-llama/Meta-Llama-3-70B-Instruct"

# llm = HuggingFaceEndpoint(
#     repo_id=zephyr_id,
#     task="text-generation",
#     max_new_tokens=1024,
#     do_sample=False,
#     repetition_penalty=1.03,
# )

# llm_engine_hf = ChatHuggingFace(llm=llm)


class OAIEngine:
    """LLM engine that wraps a LangChain `ChatOpenAI` client behind a plain callable.

    Instances are called with a list of messages (plus optional stop sequences)
    and return the text of the model's reply.
    """

    def __init__(self, model: str = "gpt-4-1106-preview"):
        """Create the underlying chat client.

        Args:
            model: OpenAI model name passed to `ChatOpenAI`. The default is the
                name that used to be hard-coded here.
        """
        self.client = ChatOpenAI(model=model)

    def __call__(self, messages, stop=None) -> str:
        """Send `messages` to the chat model and return the reply text.

        Args:
            messages: Conversation to send. It is first normalised with
                `get_clean_message_list` using `llama_role_conversions`.
            stop: Optional list of stop sequences. `None` (the default) is
                treated as an empty list.

        Returns:
            The `.content` attribute of the object returned by `self.client.invoke`.
        """
        # A None sentinel replaces the former mutable `stop=[]` default, which is
        # a single list object shared by every call; the client still gets a list.
        stop_sequences = [] if stop is None else stop

        clean_messages = get_clean_message_list(
            messages, role_conversions=llama_role_conversions
        )
        return self.client.invoke(clean_messages, stop=stop_sequences).content


# Chat model later passed to evaluate_answers as the LLM judge (temperature 0).
eval_chat_model = ChatOpenAI(model="gpt-4-1106-preview", temperature=0)
# Label for the judge model; presumably what the "eval_score_GPT4" column name is
# derived from — confirm in scripts.evaluation.
eval_model_name = "GPT4"


# def call_llm_hf(input: str, stop=["Observation", "Final Answer"]) -> str:
#     return llm_engine_hf.invoke(input, stop=stop).content


# call_llm_hf("Please output 'Observation'")

  warn_deprecated(


In [5]:
# NOTE(review): OPENAI_MODEL_ID is not referenced in the visible cells (OAIEngine
# uses "gpt-4-1106-preview" instead) — confirm whether it is still needed.
OPENAI_MODEL_ID = "gpt-4-0125-preview"
# Folder passed to run_full_tests as output_folder and read back by the evaluation cells.
OUTPUT_DIR = "output_reasoning"

In [6]:
# Load the benchmark, cast the reference answers to strings, and wrap the
# resulting frame in a `datasets.Dataset`.
eval_df = load_benchmark().assign(
    true_answer=lambda frame: frame["true_answer"].astype(str)
)
eval_ds = datasets.Dataset.from_pandas(eval_df)

### Define tools

In [7]:
from transformers.agents.default_tools import CalculatorTool, Tool
from langchain_community.utilities import SerpAPIWrapper

# Query parameters handed to the SerpAPI wrapper: "engine" selects the search
# backend (Bing here); "gl" and "hl" are presumably the country and language
# codes — confirm against the SerpAPI documentation.
params = {
    "engine": "bing",
    "gl": "us",
    "hl": "en",
}
# Shared search client; SearchTool.forward calls its run() method.
langchain_serpapi = SerpAPIWrapper(params=params)


class SearchTool(Tool):
    """Agent tool that forwards a text query to the module-level `langchain_serpapi` client."""

    name = "search"
    description = "A search engine. Useful for when you need to answer questions about current events. Input should be a search query."
    inputs = {"query": {"description": "your search query", "type": "text"}}
    output_type = "text"

    def forward(self, query: str) -> str:
        """Run `query` through `langchain_serpapi` and hand back what it returns."""
        search_result = langchain_serpapi.run(query)
        return search_result

In [8]:
# Tools given to the ReactCodeAgent instances: search only.
TOOLBOX_CODE = [SearchTool()]

# Tools given to the ReactJSONAgent instances: search plus a calculator.
TOOLBOX_JSON = [SearchTool(), CalculatorTool()]

# Define agents

In [9]:
from transformers import CodeAgent, ReactCodeAgent, ReactJSONAgent

from scripts.agents import build_hf_agent_with_tools, build_openai_agent_with_tools


# Maximum number of iterations given to every agent below (previously the
# literal 7 repeated in each constructor call).
MAX_ITERATIONS = 7

# GPT-4 backed code agent.
react_agent_openai = ReactCodeAgent(
    llm_engine=OAIEngine(), tools=TOOLBOX_CODE, max_iterations=MAX_ITERATIONS
)

# langchain_agent = build_hf_agent_with_tools(
#     repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1"
# )
# langchain_openai_agent = build_openai_agent_with_tools()

# Open-model agents: one engine instance per agent, JSON agents get TOOLBOX_JSON
# and code agents get TOOLBOX_CODE.
react_json_agent_llama3_70 = ReactJSONAgent(
    llm_engine=HfEngine(model=llama3_70b_id),
    tools=TOOLBOX_JSON,
    max_iterations=MAX_ITERATIONS,
)

react_code_agent_llama3_70 = ReactCodeAgent(
    llm_engine=HfEngine(model=llama3_70b_id),
    tools=TOOLBOX_CODE,
    max_iterations=MAX_ITERATIONS,
)

react_code_agent_llama3_8 = ReactCodeAgent(
    llm_engine=HfEngine(model=llama3_8b_id),
    tools=TOOLBOX_CODE,
    max_iterations=MAX_ITERATIONS,
)

react_code_agent_mixtral_8x7 = ReactCodeAgent(
    llm_engine=HfEngine(model=mixtral_id),
    tools=TOOLBOX_CODE,
    max_iterations=MAX_ITERATIONS,
)

react_json_agent_mixtral_8x7 = ReactJSONAgent(
    llm_engine=HfEngine(model=mixtral_id),
    tools=TOOLBOX_JSON,
    max_iterations=MAX_ITERATIONS,
)

# Agents selected for this run, keyed by the name under which their results are
# reported. Commented-out entries are built above but not run.
agents = {
    "react_code_llama3-70b_25-04": react_code_agent_llama3_70,
    # "react_code_mixtral_8x7_23-04": react_code_agent_mixtral_8x7,
    # "react_json_mixtral_8x7_24-04": react_json_agent_mixtral_8x7,
    "react_code_gpt4_25-04": react_agent_openai,
    # "react_json_llama3-70b_23-04": react_json_agent_llama3_70,
    # "react_code_llama3-8b_24-04": react_code_agent_llama3_8,
}


No tool description template is defined for this tokenizer - using a default tool description template that implements the ChatML format (without BOS/EOS tokens!). If the default is not appropriate for your model, please set `tokenizer.tool_description_template` to an appropriate template. 


In [10]:
async def call_transformers_agent(agent, question: str) -> dict:
    """Run a transformers agent on `question` and package its answer with its logs.

    `agent.run` is called synchronously, so this coroutine does not yield to the
    event loop while the agent is working.

    Args:
        agent: Object exposing `run(question)` and a `logs` attribute that has
            a `copy()` method.
        question: Task or question passed to `agent.run`.

    Returns:
        A dict with "output" (the result converted with `str`) and
        "intermediate_steps" (a shallow copy of `agent.logs` taken after the run).
    """
    result = agent.run(question)
    return {
        "output": str(result),
        "intermediate_steps": agent.logs.copy(),
    }


async def call_langchain(agent, question: str) -> dict:
    """Invoke a LangChain agent executor on `question` and normalise its result.

    `agent.invoke` is called synchronously, so this coroutine does not yield to
    the event loop while the agent is working.

    Args:
        agent: Object exposing `invoke(inputs)` that returns a dict with an
            "output" entry and, optionally, an "intermediate_steps" entry.
        question: Text sent to the agent under the "input" key.

    Returns:
        A dict with "output" (taken from the agent result) and
        "intermediate_steps" (the `.log` attribute of the first element of each
        recorded step; empty when the agent returned no steps).
    """
    output = agent.invoke({"input": question})
    # The result may not contain "intermediate_steps" (presumably when the
    # executor is not configured to return them); treat that as "no steps"
    # instead of raising KeyError.
    steps = output.get("intermediate_steps", [])
    return {
        "output": output["output"],
        "intermediate_steps": [step[0].log for step in steps],
    }


async def call_vanilla_llm(agent, question: str) -> dict:
    """Call a plain LLM callable on `question` and wrap the reply like an agent result.

    `agent` is called synchronously, so this coroutine does not yield to the
    event loop while the model is working.

    Args:
        agent: Callable taking the question and returning the model's reply.
        question: Text passed to `agent`.

    Returns:
        A dict with "output" (the reply converted with `str`) and an empty
        "intermediate_steps" list, matching the shape returned by the other
        `call_*` helpers.
    """
    result = agent(question)
    return {
        "output": str(result),
        "intermediate_steps": [],
    }


# await call_langchain(langchain_agent, "Please provide a final answer of '7'")

In [11]:
# Smoke test: run the OpenAI-backed code agent on a single arithmetic question.
res = await call_transformers_agent(react_agent_openai, "How much is 3 power 0.57?")

1.8705060412286219


In [12]:
# Display the agents registered for the benchmark run.
agents

{'react_code_llama3-70b_25-04': <transformers.agents.agents.ReactCodeAgent at 0x2cfb6e9d0>,
 'react_code_gpt4_25-04': <transformers.agents.agents.ReactCodeAgent at 0x2cec565d0>}

In [14]:
# NOTE(review): this always raises AssertionError. Presumably a deliberate tripwire
# so that "Run All" stops before the benchmark run in the next cell — confirm, and
# consider an explicit flag instead so the notebook can run top to bottom.
assert False

AssertionError: 

In [17]:
from scripts.run_agents import run_full_tests

# Run every agent in `agents` over the benchmark dataset, calling each one through
# call_transformers_agent and comparing against the "true_answer" column.
# Presumably results are written per agent under OUTPUT_DIR — see scripts.run_agents.
results = await run_full_tests(
    eval_ds,
    agents,
    output_folder=OUTPUT_DIR,
    agent_call_function=call_transformers_agent,
    key_for_answer="true_answer",
)

  results = pd.read_json(output_path, lines=True).to_dict(orient="records")


Expected object or value
Found no usable records! 🤔 Starting new.


  0%|          | 0/90 [00:00<?, ?it/s]16.0
  1%|          | 1/90 [00:06<10:12,  6.88s/it]19
  2%|▏         | 2/90 [00:19<15:14, 10.39s/it]Olaf has 196 toy cars in total.
  3%|▎         | 3/90 [00:28<13:44,  9.48s/it]The remaining amount in Emma's account is 4 dollars.
  4%|▍         | 4/90 [00:36<12:46,  8.91s/it]15 kilometers
  6%|▌         | 5/90 [00:40<10:28,  7.39s/it]50.0
  7%|▋         | 6/90 [00:49<10:53,  7.78s/it]The final score is 75.0
  8%|▊         | 7/90 [00:53<09:13,  6.67s/it]The total width of the splashes is 7.0 meters.
  9%|▉         | 8/90 [00:59<08:36,  6.30s/it]Each person gets 3 omelets.
 10%|█         | 9/90 [01:05<08:17,  6.14s/it]55
 11%|█         | 10/90 [01:16<10:19,  7.74s/it]2.0
 12%|█▏        | 11/90 [01:20<08:52,  6.74s/it]The total number of books in the library is 160.
 13%|█▎        | 12/90 [01:27<08:47,  6.76s/it]Nicky makes a profit of 5 dollars.
 14%|█▍        | 13/90 [01:31<07:34,  5.90s/it]2.0
 16%|█▌        | 14/90 [01:36<07:10,  5.67s/it]4.0
 17

Expected object or value
Found no usable records! 🤔 Starting new.


  0%|          | 0/90 [00:00<?, ?it/s]16.0
  1%|          | 1/90 [00:12<17:59, 12.13s/it]19
  2%|▏         | 2/90 [00:29<22:28, 15.32s/it]196
  3%|▎         | 3/90 [00:47<23:58, 16.54s/it]4
  4%|▍         | 4/90 [01:03<23:12, 16.19s/it]15.0
  6%|▌         | 5/90 [01:12<19:08, 13.51s/it]50.0
  7%|▋         | 6/90 [01:35<23:43, 16.95s/it]75.0
  8%|▊         | 7/90 [01:43<19:13, 13.89s/it]7.0
  9%|▉         | 8/90 [01:55<18:14, 13.35s/it]Each person gets 3 omelets.
 10%|█         | 9/90 [02:13<19:58, 14.79s/it]55
 11%|█         | 10/90 [02:27<19:17, 14.47s/it]2.0
 12%|█▏        | 11/90 [02:36<17:07, 13.00s/it]160.0
 13%|█▎        | 12/90 [02:48<16:12, 12.46s/it]5
 14%|█▍        | 13/90 [02:59<15:32, 12.11s/it]2.0
 16%|█▌        | 14/90 [03:09<14:27, 11.42s/it]4.0
 17%|█▋        | 15/90 [03:23<15:16, 12.22s/it]3.0
 18%|█▊        | 16/90 [03:39<16:34, 13.44s/it]150.0
 19%|█▉        | 17/90 [03:48<14:46, 12.14s/it]12
 20%|██        | 18/90 [04:00<14:28, 12.06s/it]22.0
 21%|██        | 19/90 

# Evaluate

In [13]:
import glob

answer_file_path = f"{OUTPUT_DIR}/answers.jsonl"

# Files in OUTPUT_DIR that this notebook writes itself: the merged answers
# below and the judge output of the evaluation step. They must not be read back
# as agent answers, otherwise a re-run re-imports rows from the previous run
# and every result is counted more than once.
DERIVED_FILE_NAMES = {"answers.jsonl", "evaluation.jsonl"}

# Sorted so the row order of the merged frame does not depend on directory order.
agent_answer_files = sorted(
    path
    for path in glob.glob(f"{OUTPUT_DIR}/*.jsonl")
    if os.path.basename(path) not in DERIVED_FILE_NAMES
)

result_df = pd.concat(
    [pd.read_json(path, lines=True) for path in agent_answer_files]
)
# Timing columns are not needed for scoring.
result_df = result_df.drop(columns=["start_time", "end_time"])
result_df.to_json(answer_file_path, lines=True, orient="records")

### Exact match for GSM8K

In [14]:
from scripts.evaluation import score_any_match

# Keep only the GSM8K rows; .copy() so the new column is written to an
# independent frame rather than to a slice of result_df.
results_math = result_df.loc[result_df["task"] == "gsm8k"].copy()

# Score each prediction against the numeric reference answer. The scores are
# assigned positionally as a list: this removes the former `-1` placeholder
# (it was overwritten immediately) and also works when no GSM8K rows are present.
results_math["exact_match"] = [
    score_any_match(prediction, float(true_answer))
    for prediction, true_answer in zip(
        results_math["prediction"], results_math["true_answer"]
    )
]

Error when extracting string: could not convert string to float: '64G'
Error when extracting string: could not convert string to float: '19t'
Error when extracting string: expected string or bytes-like object, got 'NoneType'
Error when extracting string: expected string or bytes-like object, got 'NoneType'
Error when extracting string: expected string or bytes-like object, got 'NoneType'
Error when extracting string: expected string or bytes-like object, got 'NoneType'
Error when extracting string: expected string or bytes-like object, got 'NoneType'
Error when extracting string: expected string or bytes-like object, got 'NoneType'
Error when extracting string: expected string or bytes-like object, got 'NoneType'
Error when extracting string: expected string or bytes-like object, got 'NoneType'
Error when extracting string: expected string or bytes-like object, got 'NoneType'
Error when extracting string: expected string or bytes-like object, got 'NoneType'
Error when extracting string

In [15]:
# Mean of the "exact_match" column per agent on the GSM8K rows.
results_math.groupby(["agent_name", "task"])["exact_match"].mean()

agent_name                    task 
react_code_gpt4_23-04         gsm8k    0.900000
react_code_gpt4_25-04         gsm8k    0.925000
react_code_llama3-70b_23-04   gsm8k    0.950000
react_code_llama3-70b_24-04   gsm8k    0.900000
react_code_llama3-70b_25-04   gsm8k    0.975000
react_code_llama3-8b_24-04    gsm8k    0.374101
react_code_mixtral_8x7_23-04  gsm8k    0.600000
react_json_llama3-70b_23-04   gsm8k    0.925000
react_json_mixtral_8x7_24-04  gsm8k    0.555000
Name: exact_match, dtype: float64

### LLM judge for others

In [16]:
# All rows that are not GSM8K, as a list of per-row dicts; these are the answers
# handed to evaluate_answers for grading by the judge model.
answers_nonmath = result_df.loc[result_df["task"] != "gsm8k"].to_dict(orient="records")

In [17]:
from langchain.chat_models import ChatOpenAI

# Judge model used to grade the non-GSM8K answers.
# NOTE(review): these two assignments repeat the ones made in the cell that
# defines OAIEngine near the top of the notebook.
eval_chat_model = ChatOpenAI(model="gpt-4-1106-preview", temperature=0)
eval_model_name = "GPT4"

In [18]:
# evaluations = pd.read_json('output_reasoning/evaluation.jsonl', lines=True)
# print(len(evaluations))
# evaluations['agent_name'].unique()

# evaluations = evaluations.loc[~evaluations["agent_name"].str.contains("8b")]
# print(len(evaluations))

# evaluations.to_json('output_reasoning/evaluation.jsonl', lines=True, orient='records')

In [19]:
from scripts.evaluation import evaluate_answers
from scripts.prompts import EVALUATION_PROMPT_TEMPLATE

output_file_path = f"{OUTPUT_DIR}/evaluation.jsonl"

run_evaluation = True
if run_evaluation:
    evaluated_answers_nonmath = await evaluate_answers(
        answers_nonmath,
        eval_chat_model,
        "GPT4",
        EVALUATION_PROMPT_TEMPLATE,
        output_file_path=output_file_path,
    )
    print("Evaluation is complete!")

Previous evaluations:
                       agent_name  \
0           react_code_gpt4_23-04   
1     react_json_llama3-70b_23-04   
2     react_code_llama3-70b_23-04   
3     react_code_llama3-70b_23-04   
4     react_code_llama3-70b_23-04   
..                            ...   
406  react_json_mixtral_8x7_24-04   
407  react_json_mixtral_8x7_24-04   
408    react_code_llama3-8b_24-04   
409    react_code_llama3-8b_24-04   
410  react_json_mixtral_8x7_24-04   

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     

  0%|          | 0/135 [00:00<?, ?it/s]

Evaluating example
Evaluating example
Evaluating example
Evaluating example
Evaluating example
Evaluating example
Evaluating example
Evaluating example
Evaluating example
Evaluating example
Evaluating example
Evaluating example
Evaluating example
Evaluating example
Evaluating example
Evaluating example
Evaluating example
Evaluating example
Evaluating example
Evaluating example
Evaluating example
Evaluating example
Evaluating example
Evaluating example
Evaluating example
Evaluating example
Evaluating example
Evaluating example
Evaluating example
Evaluating example
Evaluating example
Evaluating example
Evaluating example
Evaluating example
Evaluating example
Evaluating example
Evaluating example
Evaluating example
Evaluating example
Evaluating example
Evaluating example
Evaluating example
Evaluating example
Evaluating example
Evaluating example
Evaluating example
Evaluating example
Evaluating example
Evaluating example
Evaluating example
Evaluating example
Evaluating example
Evaluating e

100%|██████████| 135/135 [00:50<00:00,  2.69it/s]

Evaluation is complete!



Exception in thread Thread-5 (write_line):
Traceback (most recent call last):
  File "/opt/homebrew/Cellar/python@3.11/3.11.6_1/Frameworks/Python.framework/Versions/3.11/lib/python3.11/threading.py", line 1045, in _bootstrap_inner


    self.run()
  File "/Users/aymeric/venvs/agents/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 766, in run_closure
    _threading_Thread_run(self)
  File "/opt/homebrew/Cellar/python@3.11/3.11.6_1/Frameworks/Python.framework/Versions/3.11/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/aymeric/Documents/Code/agent_reasoning_benchmark/scripts/evaluation.py", line 133, in write_line
    json.dump(annotated_example, output_file)
  File "/opt/homebrew/Cellar/python@3.11/3.11.6_1/Frameworks/Python.framework/Versions/3.11/lib/python3.11/json/__init__.py", line 180, in dump
    fp.write(chunk)
ValueError: write to closed file


In [20]:
# df = pd.DataFrame(evaluated_answers_nonmath)
# df.loc[(df["agent_name"] == "react_code_llama3-8b_24-04")].head(10)

In [21]:
# Build a DataFrame from the judged answers returned by evaluate_answers.
results_nonmath = pd.DataFrame.from_dict(evaluated_answers_nonmath)


def interpret_score(eval_score):
    """Rescale a judge rating linearly so that 1 maps to 0.0 and 5 maps to 1.0.

    Args:
        eval_score: Rating as a number or a numeric string (presumably on a
            1-5 scale — confirm against the evaluation prompt).

    Returns:
        `(float(eval_score) - 1) / 4`, or 0.0 when the value cannot be
        converted to a float.
    """
    try:
        return (float(eval_score) - 1) / 4
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures count as a zero score; a bare `except:` would
        # also have swallowed KeyboardInterrupt and unrelated bugs.
        return 0.0


# Replace the raw judge ratings with their rescaled values.
normalized_scores = results_nonmath["eval_score_GPT4"].apply(interpret_score)
results_nonmath["eval_score_GPT4"] = normalized_scores

# Mean rescaled judge score per agent on the GAIA rows.
gaia_rows = results_nonmath.loc[results_nonmath["task"] == "GAIA"]
gaia_rows.groupby("agent_name")["eval_score_GPT4"].mean()

agent_name
react_code_gpt4_23-04           0.512500
react_code_gpt4_25-04           0.750000
react_code_llama3-70b_23-04     0.323529
react_code_llama3-70b_24-04     0.350000
react_code_llama3-70b_25-04     0.312500
react_code_llama3-8b_24-04      0.037500
react_code_mixtral_8x7_23-04    0.000000
react_json_llama3-70b_23-04     0.409091
react_json_mixtral_8x7_24-04    0.150000
Name: eval_score_GPT4, dtype: float64

### Aggregate evaluations

In [22]:
# Stack the GSM8K rows and the judged rows into a single frame.
combined_results = pd.concat([results_math, results_nonmath])

# Each row carries only one of the two metrics; the missing one is filled with 0
# so their sum is that row's score, expressed as a percentage.
exact_match_part = combined_results["exact_match"].fillna(0)
judge_score_part = combined_results["eval_score_GPT4"].fillna(0)
combined_results["aggregate_score"] = (exact_match_part + judge_score_part) * 100

result_df = combined_results
result_df.groupby(["agent_name", "task"])["aggregate_score"].mean()

agent_name                    task    
react_code_gpt4_23-04         GAIA            51.25
                              HotpotQA    60.833333
                              gsm8k            90.0
react_code_gpt4_25-04         GAIA             75.0
                              HotpotQA    73.333333
                              gsm8k            92.5
react_code_llama3-70b_23-04   GAIA        32.352941
                              HotpotQA    65.833333
                              gsm8k            95.0
react_code_llama3-70b_24-04   GAIA             35.0
                              HotpotQA    59.166667
                              gsm8k            90.0
react_code_llama3-70b_25-04   GAIA            31.25
                              HotpotQA    58.333333
                              gsm8k            97.5
react_code_llama3-8b_24-04    GAIA             3.75
                              HotpotQA    24.166667
                              gsm8k       37.410072
react_code_mixtral_8x7_23

In [None]:
# NOTE(review): neither "langchain_gpt-4" nor "react_text_gpt-4" appears among the
# agent names used in the visible cells, so both selections may be empty —
# confirm the names before relying on this comparison.

# GAIA questions on which the "langchain_gpt-4" agent scored at least 50.
list_correct_langchain = result_df.loc[
    (result_df["agent_name"] == "langchain_gpt-4")
    & (result_df["task"] == "GAIA")
    & (result_df["aggregate_score"] >= 50),
    "question",
].unique()
# GAIA questions on which the "react_text_gpt-4" agent scored below 70.
list_wrong_transformers = result_df.loc[
    (result_df["agent_name"] == "react_text_gpt-4")
    & (result_df["task"] == "GAIA")
    & (result_df["aggregate_score"] < 70),
    "question",
].unique()

# Questions present in both selections. A set gives constant-time membership
# tests instead of scanning the array once per question; the order of
# `intersection` still follows list_correct_langchain.
wrong_transformers_questions = set(list_wrong_transformers)
intersection = [
    el for el in list_correct_langchain if el in wrong_transformers_questions
]

### Display

In [None]:
# Inspect a single logged step: element 5 of the "intermediate_steps" value of the
# row at position 1.
result_df["intermediate_steps"].values[1][5]

{'llm_output': ' Thought: The calculator result shows that Frankie has 14 pets in total. To complete the task, I need to provide the final answer as the total number of pets Frankie has.\n\nAction:\n{\n  "action": "final_answer",\n  "action_input": {"answer": "14"}\n}',
 'rationale': ' Thought: The calculator result shows that Frankie has 14 pets in total. To complete the task, I need to provide the final answer as the total number of pets Frankie has.\n\n',
 'tool_call': {'tool_name': 'final_answer',
  'tool_arguments': {'answer': '14'}}}

In [None]:
# result_df.loc[result_df["intermediate_steps"].apply(lambda x: "error" in str(x))]

In [None]:
import plotly.express as px

# Mean aggregate score per (agent, task) pair, with the group keys turned back
# into regular columns for plotting.
aggregate = (
    result_df.groupby(["agent_name", "task"])[["aggregate_score"]].mean().reset_index()
)
# aggregate["agent_name"] = aggregate["agent_name"].map(
#     {
#         # "vanilla_llm": "LLM",
#         # "react": "ReactAgent (Mixtral - messages)",
#         # "code": "CodeAgent",
#         # "react_prev": "ReactAgent (Mixtral - text)",
#         # "react_messages_chat_model": "ReactAgent (Nous - messages)",
#         # "react_text_chat_model": "ReactAgent (Nous - text)",
#         # "react_messages_openai": "ReactAgent (OpenAI - messages)",
#         "react_text_openai": "OpenAI: GPT3.5",
#         # "react_text_mixtral2": "ReactAgent (Mixtral2 - text)",
#         # "react_text_mixtral3": "ReactAgent (Mixtral3 - text)",
#         # "react_text_mixtral4": "Mixtral (Ours)",
#         # "react_text_mixtral-15-04": "Mixtral-8x7b",
#         "react_messages_llama3-19-04": "OS: Llama3-70B-Instruct",
#         "react_text_llama3-8b-19-04": "OS: Llama3-8B-Instruct",
#         # "react_text_mixtral_nojson": "ReactAgent (Mixtral4 - nojson)",
#         # "langchain_agent": "Langchain",
#         "react_text_command-r-plus": "OS: Command-R-Plus",
#         "react_text_llama3-70b-19-04-2": "OS: Llama3-70B-Instruct Code",
#         # "react_text_mixtral_huge": "Mixtral-8x22B",
#         "react_text_gpt-4": "OpenAI: GPT4",
#         # "langchain_gpt-4": "GPT4 (LangChain)",
#     }
# )
# Order the bars alphabetically by agent name.
aggregate = aggregate.sort_values("agent_name", ascending=True)

# Display names for the columns used in the figure.
axis_labels = {
    "agent_name": "<b>Agent</b>",
    "task": "<b>Task</b>",
    "aggregate_score": "Performance",
    "eval_score_GPT4": "<b>Score</b>",
}
fig = px.bar(
    aggregate,
    x="agent_name",
    y="aggregate_score",
    color="task",
    labels=axis_labels,
)

# Figure width grows with the number of distinct agents: 100 px each plus 200 px.
agent_count = len(aggregate["agent_name"].unique())
layout_options = dict(
    width=agent_count * 100 + 200,
    height=600,
    barmode="group",
    bargap=0.35,
    bargroupgap=0.0,
    yaxis_range=[0, 105],
)
fig.update_layout(**layout_options)

# Print the rounded value above each bar and show the y axis as percentages.
fig.update_traces(texttemplate="%{y:.0f}", textposition="outside")
fig.layout.yaxis.ticksuffix = "%"
fig.show()

  sf: grouped.get_group(s if len(s) > 1 else s[0])
