In [13]:
!pip install -q langchain numexpr sentencepiece plotly openai python-dotenv

In [14]:
%reload_ext autoreload
%autoreload 2

In [15]:
from langchain_community.llms import HuggingFaceEndpoint
from langchain.chat_models import ChatHuggingFace
import pandas as pd
from dotenv import load_dotenv
from scripts.evaluation import load_benchmark
import datasets
from huggingface_hub import login
import os

# Read variables from a local .env file (override=True).
load_dotenv(override=True)

# Relax pandas' display limits: no column-width cap, up to 500 rows/columns.
pd.options.display.max_colwidth = None
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500

In [16]:
# Log in to the Hugging Face Hub with the token stored in the
# HUGGINGFACEHUB_API_TOKEN environment variable.
login(os.getenv("HUGGINGFACEHUB_API_TOKEN"))

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /Users/aymeric/.cache/huggingface/token
Login successful


In [17]:
from langchain.chat_models import ChatOpenAI
from transformers.agents import HfEngine
from transformers.agents.llm_engine import (
    get_clean_message_list,
    llama_role_conversions,
)

# Model ids used by the agent definitions further down.
# (mixtral_large_id is only referenced from commented-out code.)
command_r_id = "CohereForAI/c4ai-command-r-plus"
zephyr_id = "HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1"
mixtral_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
llama3_8b_id = "meta-llama/Meta-Llama-3-8B-Instruct"
llama3_70b_id = "meta-llama/Meta-Llama-3-70B-Instruct"
mixtral_large_id = "mistralai/Mixtral-8x22B-Instruct-v0.1"

# llm = HuggingFaceEndpoint(
#     repo_id=zephyr_id,
#     task="text-generation",
#     max_new_tokens=1024,
#     do_sample=False,
#     repetition_penalty=1.03,
# )

# llm_engine_hf = ChatHuggingFace(llm=llm)


class OAIEngine:
    """Callable LLM engine backed by an OpenAI chat model.

    Instances are passed as ``llm_engine`` to the transformers agents: they
    receive the agent's message list and return the reply text.
    """

    def __init__(self):
        self.client = ChatOpenAI(model="gpt-4-1106-preview")

    def __call__(self, messages, stop=None) -> str:
        """Send ``messages`` to the chat model and return the reply text.

        Args:
            messages: Message list; it is first passed through
                ``get_clean_message_list`` with ``llama_role_conversions``.
            stop: Optional list of stop sequences forwarded to the client.
                ``None`` (the default) is treated as an empty list.

        Returns:
            The ``content`` attribute of the client's response.
        """
        # Build a fresh list per call instead of sharing one mutable default
        # object between every invocation.
        stop_sequences = [] if stop is None else stop

        cleaned_messages = get_clean_message_list(
            messages, role_conversions=llama_role_conversions
        )

        return self.client.invoke(cleaned_messages, stop=stop_sequences).content


# Chat model handed to evaluate_answers() in the "LLM judge" section (temperature=0).
# These two names are assigned again, with the same values, in that section.
eval_chat_model = ChatOpenAI(model="gpt-4-1106-preview", temperature=0)
# NOTE(review): the evaluation cell passes the literal "GPT4" rather than this variable.
eval_model_name = "GPT4"


# def call_llm_hf(input: str, stop=["Observation", "Final Answer"]) -> str:
#     return llm_engine_hf.invoke(input, stop=stop).content


# call_llm_hf("Please output 'Observation'")

In [18]:
# NOTE(review): OPENAI_MODEL_ID is not referenced by any cell visible in this notebook
# (OAIEngine hard-codes "gpt-4-1106-preview") — confirm whether it is still needed.
OPENAI_MODEL_ID = "gpt-4-0125-preview"
# Folder used for the agent run files, the merged answers file and the evaluation file.
OUTPUT_DIR = "output_reasoning"

In [19]:
# Load the benchmark questions (project helper from scripts.evaluation).
eval_df = load_benchmark()
# Cast every reference answer to str — presumably so the column converts cleanly
# to a Dataset; confirm.
eval_df["true_answer"] = eval_df["true_answer"].astype(str)
# Dataset form of the benchmark; this is what run_full_tests() receives below.
eval_ds = datasets.Dataset.from_pandas(eval_df)

### Define tools
To run the LangChain SerpAPI tool, you need a [SerpAPI](https://serpapi.com/dashboard) API key, which requires a paid account.

In [20]:
from transformers.agents.default_tools import PythonInterpreterTool, Tool
from scripts.agents import CalculatorTool
from langchain_community.utilities import SerpAPIWrapper

# Settings forwarded to SerpAPI with every request: engine plus the gl/hl values.
params = dict(engine="bing", gl="us", hl="en")
# Shared SerpAPI client; SearchTool.forward() sends its queries through it.
langchain_serpapi = SerpAPIWrapper(params=params)


class SearchTool(Tool):
    """Agent tool that forwards a text query to the shared ``langchain_serpapi`` client."""

    name = "search"
    output_type = "text"
    description = "A search engine. Useful for when you need to answer questions about current events. Input should be a search query."
    inputs = {
        "query": {
            "description": "your search query",
            "type": "text",
        }
    }

    def forward(self, query: str) -> str:
        """Run ``query`` through ``langchain_serpapi`` and return what it gives back."""
        search_result = langchain_serpapi.run(query)
        return search_result

In [21]:
# Tool list given to the ReactCodeAgent instances: search only.
TOOLBOX_CODE = [SearchTool()]

# Tool list given to the ReactJsonAgent instances: search plus PythonInterpreterTool.
TOOLBOX_JSON = [SearchTool(), PythonInterpreterTool()]

# Search plus the project's CalculatorTool.
# NOTE(review): not referenced by any agent definition visible in this notebook.
TOOLBOX_CALC = [SearchTool(), CalculatorTool()]

# Define agents

In [25]:
from transformers import CodeAgent, ReactCodeAgent, ReactJsonAgent

from scripts.agents import build_hf_agent_with_tools, build_openai_agent_with_tools


def _react_agent(agent_class, llm_engine, toolbox):
    """Instantiate ``agent_class`` with the given engine and tools, capped at 7 iterations."""
    return agent_class(llm_engine=llm_engine, tools=toolbox, max_iterations=7)


def _hf_react_agent(agent_class, model_id, toolbox):
    """Like ``_react_agent``, with a fresh ``HfEngine`` created for ``model_id``."""
    return _react_agent(agent_class, HfEngine(model=model_id), toolbox)


# Agents driven by the OpenAI engine.
react_code_agent_openai = _react_agent(ReactCodeAgent, OAIEngine(), TOOLBOX_CODE)
react_json_agent_openai = _react_agent(ReactJsonAgent, OAIEngine(), TOOLBOX_JSON)

# LangChain agents built by the project helpers.
langchain_agent = build_hf_agent_with_tools(repo_id=llama3_70b_id)
langchain_openai_agent = build_openai_agent_with_tools()

# Llama 3 agents.
react_json_agent_llama3_70 = _hf_react_agent(ReactJsonAgent, llama3_70b_id, TOOLBOX_JSON)
react_code_agent_llama3_70 = _hf_react_agent(ReactCodeAgent, llama3_70b_id, TOOLBOX_CODE)
react_code_agent_llama3_8 = _hf_react_agent(ReactCodeAgent, llama3_8b_id, TOOLBOX_CODE)

# Mixtral-8x7B agents.
react_code_agent_mixtral_8x7 = _hf_react_agent(ReactCodeAgent, mixtral_id, TOOLBOX_CODE)
react_json_agent_mixtral_8x7 = _hf_react_agent(ReactJsonAgent, mixtral_id, TOOLBOX_JSON)

# Zephyr agents.
react_code_agent_zephyr_large = _hf_react_agent(ReactCodeAgent, zephyr_id, TOOLBOX_CODE)
react_json_agent_zephyr_large = _hf_react_agent(ReactJsonAgent, zephyr_id, TOOLBOX_JSON)

# Mixtral-8x22B agents are currently disabled:
# react_code_agent_mixtral_large = _hf_react_agent(ReactCodeAgent, mixtral_large_id, TOOLBOX_CODE)
# react_json_agent_mixtral_large = _hf_react_agent(ReactJsonAgent, mixtral_large_id, TOOLBOX_JSON)

# Command R+ agents.
react_code_agent_command_r = _hf_react_agent(ReactCodeAgent, command_r_id, TOOLBOX_CODE)
react_json_agent_command_r = _hf_react_agent(ReactJsonAgent, command_r_id, TOOLBOX_JSON)


# Registry of the agents to benchmark, keyed by run name; this dict is what
# run_full_tests() receives. Only the uncommented entries are part of the run —
# toggle comments to select other agents.
agents = {
    "react_code_llama3-70b_17-may": react_code_agent_llama3_70,
    # "react_json_llama3-70b_17-may": react_json_agent_llama3_70,
    # "react_code_mixtral_8x7_06-may2": react_code_agent_mixtral_8x7,
    # "react_json_mixtral_8x7_06-may2": react_json_agent_mixtral_8x7,
    # "react_code_zephyr_large_06-may2": react_code_agent_zephyr_large,
    # "react_json_zephyr_large_06-may2": react_json_agent_zephyr_large,
    # "react_code_command_r_06-may": react_code_agent_command_r,
    # "react_json_command_r_06-may": react_json_agent_command_r,
    # "react_json_gpt4_06-may": react_json_agent_openai,
    # "react_code_gpt4_06-may": react_code_agent_openai,
    # "langchain_GPT-4_06-may2": langchain_openai_agent,
    # "langchain-llama3-70B_06-may2": langchain_agent,
    # "react_code_llama3-8b_24-04": react_code_agent_llama3_8,
}

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /Users/aymeric/.cache/huggingface/token
Login successful


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Last message must be a HumanMessage!
System prompt not supported


In [23]:
import asyncio


async def call_transformers_agent(agent, question: str) -> dict:
    """Run a transformers agent on ``question`` without blocking the event loop.

    ``agent.run`` is a synchronous call, so it is handed to the loop's default
    executor and awaited.

    Args:
        agent: Object exposing ``run(question)`` and a ``logs`` attribute that
            supports ``.copy()``.
        question: Task passed to ``agent.run``.

    Returns:
        A dict with two keys: ``"output"`` (``str()`` of the run result) and
        ``"intermediate_steps"`` (a copy of ``agent.logs`` taken after the run).
    """
    # Inside a coroutine the running loop is always the right one;
    # get_running_loop() is the non-deprecated way to fetch it.
    loop = asyncio.get_running_loop()
    result = await loop.run_in_executor(None, agent.run, question)
    return {
        "output": str(result),
        "intermediate_steps": agent.logs.copy(),
    }


async def call_langchain(agent, question: str) -> dict:
    """Run a LangChain agent on ``question`` and normalise its result.

    Args:
        agent: Object exposing an async ``ainvoke`` method; it is called with
            ``{"input": question}``.
        question: Task sent to the agent.

    Returns:
        A dict with ``"output"`` (the ``"output"`` entry of the agent result)
        and ``"intermediate_steps"`` (the ``.log`` of the first element of
        every intermediate step).
    """
    output = await agent.ainvoke({"input": question})
    return {
        "output": output["output"],
        # Each step is indexed at [0] to reach the object carrying `.log`.
        "intermediate_steps": [step[0].log for step in output["intermediate_steps"]],
    }


async def call_vanilla_llm(agent, question: str) -> dict:
    """Call a plain callable with ``question`` and wrap the result like the agent helpers.

    Args:
        agent: Any callable taking the question as its single argument.
            It is called synchronously inside this coroutine.
        question: Input passed to ``agent``.

    Returns:
        A dict with ``"output"`` (``str()`` of the call result) and an empty
        ``"intermediate_steps"`` list.
    """
    result = agent(question)
    return {
        "output": str(result),
        "intermediate_steps": [],
    }


# await call_langchain(langchain_agent, "Please provide a final answer of '7'")

In [24]:
# NOTE(review): this always raises — presumably a deliberate stop so that "Run All"
# halts before the benchmark run in the next cell. Confirm, and consider an explicit
# flag instead of a failing cell.
assert False

AssertionError: 

In [26]:
from scripts.run_agents import run_full_tests

# Run every agent registered in `agents` on the benchmark dataset, with OUTPUT_DIR
# as the output folder.
# NOTE(review): call_transformers_agent is used for every entry, so the langchain
# agents (commented out in `agents`) would need call_langchain — confirm before
# enabling them.
results = await run_full_tests(
    eval_ds,
    agents,
    output_folder=OUTPUT_DIR,
    agent_call_function=call_transformers_agent,
)

  results = pd.read_json(output_path, lines=True).to_dict(orient="records")


Expected object or value
Found no usable records! 🤔 Starting new.


[37;1mMimi picked up 2 dozen seashells on the beach.  Kyle found twice as many shells as Mimi and put them in his pocket. Leigh grabbed one-third of the shells that Kyle found.  How many seashells did Leigh have?[0m
[33;1m==== Agent is executing the code below:[0m
[0m[38;5;7mmimi_seashells[39m[38;5;7m [39m[38;5;109;01m=[39;00m[38;5;7m [39m[38;5;139m2[39m[38;5;7m [39m[38;5;109;01m*[39;00m[38;5;7m [39m[38;5;139m12[39m[38;5;7m  [39m[38;5;60;03m# 1 dozen = 12[39;00m
[38;5;109mprint[39m[38;5;7m([39m[38;5;144m"[39m[38;5;144mMimi[39m[38;5;144m'[39m[38;5;144ms seashells:[39m[38;5;144m"[39m[38;5;7m,[39m[38;5;7m [39m[38;5;7mmimi_seashells[39m[38;5;7m)[39m[0m
[33;1m====[0m
[33;1mPrint outputs:[0m
[32;20mMimi's seashells: 24
[0m
[33;1m==== Agent is executing the code below:[0m
[0m[38;5;7mkyle_seashells[39m[38;5;7m [39m[38;5;109;01m=[39;00m[38;5;7m [39m[38;5;139m2[39m[38;5;7m [39m[38;5;109;01m*[39;00m[38;5;7m [39m[38;5;7mmi

# Evaluate

In [None]:
import glob

# Merged answers are written back into OUTPUT_DIR under this name.
answer_file_path = f"{OUTPUT_DIR}/answers.jsonl"

# Collect the run files found in OUTPUT_DIR, skipping the derived files
# (merged answers, judge evaluations) that live in the same folder.
res = []
# sorted() makes the concatenation order reproducible; glob order is arbitrary.
for f in sorted(glob.glob(f"{OUTPUT_DIR}/*.jsonl")):
    print(f)
    # Test the file name only, so a folder name containing "answers" or
    # "evaluation" cannot exclude every run file.
    file_name = os.path.basename(f)
    if "answers" not in file_name and "evaluation" not in file_name:
        res.append(pd.read_json(f, lines=True))
if not res:
    # pd.concat([]) would raise a less helpful "No objects to concatenate".
    raise ValueError(f"No agent run files (*.jsonl) found in {OUTPUT_DIR!r}")
# ignore_index gives the merged frame unique row labels instead of repeating
# each file's 0..n-1 index.
result_df = pd.concat(res, ignore_index=True)
result_df = result_df.drop(columns=["start_time", "end_time"])
result_df.to_json(answer_file_path, lines=True, orient="records")

### Exact match for GSM8K

In [None]:
from scripts.evaluation import score_any_match

# Keep the GSM8K rows only; .copy() so the new column lands on an independent frame.
results_math = result_df.loc[result_df["task"] == "gsm8k"].copy()
# Placeholder value; it is overwritten by the assignment right below.
results_math["exact_match"] = -1
# Score each prediction with score_any_match against the reference answer cast to float.
results_math["exact_match"] = results_math.apply(
    lambda row: score_any_match(row["prediction"], float(row["true_answer"])), axis=1
)

In [None]:
# Mean exact-match score per (agent, task) pair.
results_math.groupby(["agent_name", "task"])["exact_match"].mean()

agent_name                       task 
langchain-llama3-70B_06-may      gsm8k    0.825
langchain-llama3-70B_06-may2     gsm8k    0.900
langchain-llama3-70B_29-04       gsm8k    0.750
langchain-mixtral-8x7b_29-04     gsm8k    0.475
langchain_GPT-4_06-may           gsm8k    0.875
langchain_GPT-4_06-may2          gsm8k    0.850
langchain_GPT-4_29-04            gsm8k    0.825
react_code_command_r_06-may      gsm8k    0.725
react_code_command_r_29-04       gsm8k    0.775
react_code_gpt4_06-may           gsm8k    0.975
react_code_gpt4_29-04            gsm8k    0.975
react_code_llama3-70b_06-may     gsm8k    0.950
react_code_llama3-70b_06-may2    gsm8k    0.975
react_code_llama3-70b_15-may     gsm8k    0.900
react_code_llama3-70b_29-04      gsm8k    0.925
react_code_mixtral_8x7_06-may    gsm8k    0.675
react_code_mixtral_8x7_06-may2   gsm8k    0.750
react_code_mixtral_8x7_29-04     gsm8k    0.625
react_code_zephyr_large_06-may   gsm8k    0.725
react_code_zephyr_large_06-may2  gsm8k    0.675
r

In [None]:
# Number of scored rows per (agent, task) pair.
results_math[["agent_name", "task"]].value_counts()

agent_name                       task 
langchain-llama3-70B_06-may      gsm8k    40
react_json_llama3-70b_06-may2    gsm8k    40
react_code_zephyr_large_29-04    gsm8k    40
react_json_command_r_06-may      gsm8k    40
react_json_command_r_29-04       gsm8k    40
react_json_gpt4_06-may           gsm8k    40
react_json_gpt4_29-04            gsm8k    40
react_json_llama3-70b_06-may     gsm8k    40
react_json_llama3-70b_15-may     gsm8k    40
react_code_zephyr_large_06-may   gsm8k    40
react_json_llama3-70b_29-04      gsm8k    40
react_json_mixtral_8x7_06-may    gsm8k    40
react_json_mixtral_8x7_06-may2   gsm8k    40
react_json_mixtral_8x7_29-04     gsm8k    40
react_json_zephyr_large_06-may   gsm8k    40
react_json_zephyr_large_06-may2  gsm8k    40
react_code_zephyr_large_06-may2  gsm8k    40
react_code_mixtral_8x7_29-04     gsm8k    40
langchain-llama3-70B_06-may2     gsm8k    40
react_code_command_r_29-04       gsm8k    40
langchain-llama3-70B_29-04       gsm8k    40
langchain-mixtra

### LLM judge for others

In [None]:
# Every non-GSM8K answer as a list of row dicts; this is the input of evaluate_answers() below.
answers_nonmath = result_df.loc[result_df["task"] != "gsm8k"].to_dict(orient="records")

In [None]:
from langchain.chat_models import ChatOpenAI

# Judge model for the evaluation cell below (temperature=0).
# Same assignments as in the engine-setup cell near the top of the notebook.
eval_chat_model = ChatOpenAI(model="gpt-4-1106-preview", temperature=0)
eval_model_name = "GPT4"

In [None]:
# evaluations = pd.read_json('output_reasoning/evaluation.jsonl', lines=True)
# print(len(evaluations))
# evaluations['agent_name'].unique()

# evaluations = evaluations.loc[~evaluations["agent_name"].str.contains("8b")]
# print(len(evaluations))

# evaluations.to_json('output_reasoning/evaluation.jsonl', lines=True, orient='records')

In [None]:
from scripts.evaluation import evaluate_answers
from scripts.prompts import EVALUATION_PROMPT_TEMPLATE

# File handed to evaluate_answers() as its output path.
output_file_path = f"{OUTPUT_DIR}/evaluation.jsonl"

# Set to False to skip the judge calls.
# NOTE(review): when skipped, `evaluated_answers_nonmath` is never assigned and the
# cells below that read it fail.
run_evaluation = True
if run_evaluation:
    evaluated_answers_nonmath = await evaluate_answers(
        answers_nonmath,
        eval_chat_model,
        "GPT4",  # judge label; the score column read further down is "eval_score_GPT4"
        EVALUATION_PROMPT_TEMPLATE,
        output_file_path=output_file_path,
    )
    print("Evaluation is complete!")

100%|██████████| 100/100 [00:38<00:00,  2.60it/s]
Exception in thread Thread-7 (write_line):
Traceback (most recent call last):
  File "/opt/homebrew/Cellar/python@3.10/3.10.14/Frameworks/Python.framework/Versions/3.10/lib/python3.10/threading.py", line 1016, in _bootstrap_inner


    self.run()
  File "/Users/aymeric/venvs/disposable/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 766, in run_closure
    _threading_Thread_run(self)
  File "/opt/homebrew/Cellar/python@3.10/3.10.14/Frameworks/Python.framework/Versions/3.10/lib/python3.10/threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/aymeric/Documents/Code/agent_reasoning_benchmark/scripts/evaluation.py", line 130, in write_line
    json.dump(annotated_example, output_file)
  File "/opt/homebrew/Cellar/python@3.10/3.10.14/Frameworks/Python.framework/Versions/3.10/lib/python3.10/json/__init__.py", line 180, in dump
    fp.write(chunk)
ValueError: write to closed file


In [None]:
# df = pd.DataFrame(evaluated_answers_nonmath)
# df.loc[(df["agent_name"] == "react_code_llama3-8b_24-04")].head(10)

In [None]:
# Tabular form of the judged answers returned by evaluate_answers().
results_nonmath = pd.DataFrame.from_dict(evaluated_answers_nonmath)


def interpret_score(eval_score):
    """Rescale a raw judge score with ``(score - 1) / 4``.

    With this formula a score of 1 maps to 0.0 and a score of 5 maps to 1.0.

    Args:
        eval_score: Raw score; anything ``float()`` accepts.

    Returns:
        The rescaled score, or ``0`` when ``eval_score`` cannot be converted
        to a float.
    """
    try:
        return (float(eval_score) - 1) / 4
    # Only conversion failures count as "no score"; a bare `except:` would also
    # swallow KeyboardInterrupt and SystemExit.
    except (TypeError, ValueError, OverflowError):
        return 0


# Replace the raw judge scores with their rescaled value (see interpret_score).
results_nonmath["eval_score_GPT4"] = results_nonmath["eval_score_GPT4"].apply(
    interpret_score
)
# Mean rescaled judge score per agent, GAIA rows only.
results_nonmath.loc[results_nonmath["task"] == "GAIA"].groupby("agent_name")[
    "eval_score_GPT4"
].mean()

agent_name
langchain-llama3-70B_06-may        0.437500
langchain-llama3-70B_06-may2       0.300000
langchain-llama3-70B_29-04         0.262500
langchain-mixtral-8x7b_29-04       0.175000
langchain_GPT-4_06-may             0.453125
langchain_GPT-4_06-may2            0.537500
langchain_GPT-4_29-04              0.412500
react_code_command_r_06-may        0.000000
react_code_command_r_29-04         0.225000
react_code_gpt4_06-may             0.425000
react_code_gpt4_23-04              0.512500
react_code_gpt4_25-04              0.600000
react_code_gpt4_26-04              0.387500
react_code_gpt4_29-04              0.350000
react_code_llama3-70b_06-may       0.412500
react_code_llama3-70b_06-may2      0.400000
react_code_llama3-70b_15-may       0.412500
react_code_llama3-70b_23-04        0.323529
react_code_llama3-70b_24-04        0.325000
react_code_llama3-70b_25-04        0.300000
react_code_llama3-70b_26-04        0.362500
react_code_llama3-70b_29-04        0.262500
react_code_llama3-8b_

### Aggregate evaluations

In [None]:
# Row count per agent whose name contains "langchain" (quick completeness check of the runs).
result_df.loc[result_df["agent_name"].str.contains("langchain")][
    ["agent_name"]
].value_counts()

agent_name                  
langchain-llama3-70B_06-may     90
langchain-llama3-70B_06-may2    90
langchain-llama3-70B_29-04      90
langchain-mixtral-8x7b_29-04    90
langchain_GPT-4_06-may2         90
langchain_GPT-4_29-04           90
langchain_GPT-4_06-may          86
Name: count, dtype: int64

In [None]:
# Stack the GSM8K rows (scored in "exact_match") and the judged rows (scored in
# "eval_score_GPT4") into one frame.
# NOTE(review): this rebinds `result_df`, so the earlier cells that read the merged
# answers see a different frame if they are re-run after this one.
result_df = pd.concat([results_math, results_nonmath])
# Each row carries only one of the two scores; the missing one is counted as 0,
# and the sum is expressed on a 0-100 scale.
result_df["aggregate_score"] = (
    result_df["exact_match"].fillna(0) + result_df["eval_score_GPT4"].fillna(0)
) * 100

# Fold the "06-may2" runs into their "06-may" counterparts so they are averaged together.
# regex=False: plain substring replacement, independent of the pandas version's default.
result_df["agent_name"] = result_df["agent_name"].str.replace(
    "06-may2", "06-may", regex=False
)
display(result_df.groupby(["agent_name", "task"])["aggregate_score"].mean())

agent_name                      task    
langchain-llama3-70B_06-may     GAIA           36.875
                                HotpotQA    60.416667
                                gsm8k           86.25
langchain-llama3-70B_29-04      GAIA            26.25
                                HotpotQA    45.833333
                                gsm8k            75.0
langchain-mixtral-8x7b_29-04    GAIA             17.5
                                HotpotQA         60.0
                                gsm8k            47.5
langchain_GPT-4_06-may          GAIA             50.0
                                HotpotQA    74.166667
                                gsm8k           86.25
langchain_GPT-4_29-04           GAIA            41.25
                                HotpotQA    71.666667
                                gsm8k            82.5
react_code_command_r_06-may     GAIA              0.0
                                HotpotQA    63.333333
                                gsm8k    

### Display

In [None]:
# Spot check: element 5 of the intermediate steps recorded for row 1 (hard-coded positions).
result_df["intermediate_steps"].values[1][5]

{'llm_output': 'Thought: I will use python code to calculate the total number of pets.\nCode:\n```\nsnakes = cats + 6\nparrots = cats - 1\ntotal_pets = cats + snakes + parrots + dogs\nprint("Total pets:", total_pets)\n```',
 'rationale': 'Thought: I will use python code to calculate the total number of pets.\n',
 'tool_call': {'tool_name': 'code interpreter',
  'tool_arguments': 'snakes = cats + 6\nparrots = cats - 1\ntotal_pets = cats + snakes + parrots + dogs\nprint("Total pets:", total_pets)'},
 'observation': '\nTotal pets: 7',
 'agent_memory': [{'role': 'system',
   'content': 'Solve the following task as best you can. You have access to the following tools:\n\nTo solve the task, you must plan forward to proceed in a series of steps, in a cycle of \'Thought:\', \'Code:\', and \'Observation:\' sequences.\n\nAt each step, in the \'Thought:\' sequence, you should first explain which tool you will use and for what reason, then in the \'Code:\' sequence, you shold write the code in sim

In [None]:
# result_df.loc[result_df["intermediate_steps"].apply(lambda x: "error" in str(x))]

In [None]:
import plotly.express as px

# Mean aggregate score per (agent, task) pair.
aggregate = (
    result_df.groupby(["agent_name", "task"])[["aggregate_score"]].mean().reset_index()
)
# Run name -> label shown on the chart. Runs that are absent from this mapping
# (or commented out) get NaN here and are left out of the figure.
aggregate["agent_name"] = aggregate["agent_name"].map(
    {
        "react_code_llama3-70b_06-may": "Llama3-70B-Instruct Code",
        "react_json_llama3-70b_06-may": "Llama3-70B-Instruct JSON",
        "react_code_llama3-70b_15-may": "Llama3-70B-Instruct Code 2",
        "react_json_llama3-70b_15-may": "Llama3-70B-Instruct JSON 2",
        "react_code_gpt4_06-may": "GPT-4-Turbo Code",
        "react_json_gpt4_06-may": "GPT-4-Turbo JSON",
        # "langchain_GPT-4_06-may": "Langchain: GPT4",
        # "langchain-llama3-70B_06-may": "Langchain: Llama3-70B-Instruct",
        # "react_json_command_r_06-may": "Command R+ JSON",
        # "react_code_command_r_06-may": "Command R+ Code",
        # "react_json_zephyr_large_06-may": "Zephyr-141B JSON",
        # "react_code_zephyr_large_06-may": "Zephyr-141B Code",
        "react_code_mixtral_8x7_06-may": "Mixtral-8x7B Code",
        "react_json_mixtral_8x7_06-may": "Mixtral-8x7B JSON",
    }
)
# Drop the unlabeled runs explicitly: otherwise NaN rows reach plotly and NaN is
# counted as an extra agent when the figure width is computed below.
aggregate = aggregate.dropna(subset=["agent_name"])
aggregate = aggregate.sort_values(["agent_name", "task"], ascending=False)
fig = px.bar(
    aggregate,
    x="agent_name",
    y="aggregate_score",
    color="task",
    labels={
        "agent_name": "<b>LLM Engine</b>",
        "task": "<b>Task</b>",
        "aggregate_score": "<b>Performance</b>",
        "eval_score_GPT4": "<b>Score</b>",
        "OS: Mixtral-8x7B Code": "<b>Mixtral-8x7B Code</b>",
    },
)
fig.update_layout(
    # 100 px per plotted agent plus a fixed 200 px margin.
    width=len(aggregate["agent_name"].unique()) * 100 + 200,
    height=600,
    barmode="group",
    bargap=0.35,
    bargroupgap=0.0,
    yaxis_range=[0, 105],
)
# Print each bar's value, rounded to an integer, above the bar.
fig.update_traces(texttemplate="%{y:.0f}", textposition="outside")
fig.layout.yaxis.ticksuffix = "%"
# NOTE(review): static image export needs an export backend such as kaleido, which
# the pip cell at the top does not install — confirm it is available.
fig.write_image("aggregate_score.png", scale=3)
fig.show()