In [26]:
from huggingface_hub import InferenceClient


client = InferenceClient("EleutherAI/gpt-neox-20b")

prompt = """Génère un poème en 10 vers de douze pieds, avec des rimes alternées.
Chaque vers doit contenir un chiffre croissant de 1 à 10 : le premier contient « un », le deuxième un « deux », et ainsi de suite.

Poème:
"""

# prompt = "Here is a poem with 10 verses:"
res = client.text_generation(prompt, max_new_tokens=200, return_full_text=False)

print(res)


Un, deux, trois, quatre, cinq, six, sept, huit, neuf, dix,
Un, deux, trois, quatre, cinq, six, sept, huit, neuf, dix,
Un, deux, trois, quatre, cinq, six, sept, huit, neuf, dix,
Un, deux, trois, quatre, cinq, six, sept, huit, neuf, dix,
Un, deux, trois, quatre, cinq, six, sept, huit, neuf, dix,
Un, deux, trois, quatre, cinq, six, sept, huit, neuf, dix,
Un, deux, trois, quatre, cinq, six, sept, huit, neuf, dix,
Un, deux, trois, quatre, cinq, six, sept, h


In [27]:
import os
from openai import OpenAI

client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
)

response = client.completions.create(
    prompt=prompt, model="davinci-002", temperature=0.5, max_tokens=200
)
print(response.choices[0].text)

Un deux trois quatre cinq six sept huit neuf dix
Un deux trois quatre cinq six sept huit neuf dix
Un deux trois quatre cinq six sept huit neuf dix
Un deux trois quatre cinq six sept huit neuf dix
Un deux trois quatre cinq six sept huit neuf dix
Un deux trois quatre cinq six sept huit neuf dix
Un deux trois quatre cinq six sept huit neuf dix
Un deux trois quatre cinq six sept huit neuf dix
Un deux trois quatre cinq six sept huit neuf dix
Un deux trois quatre cinq six sept huit neuf dix

Résultat:

Un deux trois quatre cinq six sept huit neuf dix
Un deux trois quatre cinq six sept huit neuf dix
Un deux trois quatre cinq six sept huit neuf dix



In [1]:
import pandas as pd
from dotenv import load_dotenv
import datasets

load_dotenv(override=True)
pd.set_option("max_colwidth", None)

OUTPUT_DIR = "output"

In [None]:
from huggingface_hub import login
import os

login(os.getenv("HUGGINGFACEHUB_API_TOKEN"))

: 

In [None]:
eval_ds = datasets.load_dataset("gaia-benchmark/GAIA", "2023_all")["validation"]
eval_ds = eval_ds.rename_columns(
    {"Question": "question", "Final answer": "true_answer", "Level": "task"}
)
eval_df = pd.DataFrame(eval_ds)

: 

In [None]:
pd.Series(eval_ds["task"]).value_counts()

: 

# 1. Load all results

In [None]:
import glob

answer_file_path = f"{OUTPUT_DIR}/validation/answers.jsonl"

result_df = pd.concat(
    [
        pd.read_json(f, lines=True)
        for f in glob.glob(f"{OUTPUT_DIR}/validation/*.jsonl")
        if "answers.jsonl" not in f
    ]
)
result_df = result_df.drop(columns=["start_time", "end_time"])
result_df.to_json(answer_file_path, lines=True, orient="records")

: 

In [None]:
from scripts.evaluation.gaia_scorer import question_scorer, check_close_call
import re
from collections import Counter

result_df["is_correct"] = result_df.apply(
    lambda x: question_scorer(x["prediction"], x["true_answer"]), axis=1
)
result_df["is_near_correct"] = result_df.apply(
    lambda x: check_close_call(x["prediction"], x["true_answer"], x["is_correct"]),
    axis=1,
)

result_df["count_steps"] = result_df["intermediate_steps"].apply(len)


def find_attachment(question):
    matches = eval_df.loc[
        eval_df["question"].apply(lambda x: x in question), "file_name"
    ]

    if len(matches) == 0:
        return "Not found"
    file_path = matches.values[0]

    if isinstance(file_path, str) and len(file_path) > 0:
        return file_path.split(".")[-1]
    else:
        return "None"


result_df["attachment_type"] = result_df["question"].apply(find_attachment)


def extract_tool_calls(code):
    regex = r"\b(\w+)\("
    function_calls = [el for el in re.findall(regex, code) if el.islower()]

    function_call_counter = Counter(function_calls)
    return function_call_counter


def sum_tool_calls(steps):
    total_count = Counter()
    for step in steps:
        if "llm_output" in step:
            total_count += extract_tool_calls(step["llm_output"])

    return total_count


result_df["tool_calls"] = result_df["intermediate_steps"].apply(sum_tool_calls)

: 

In [None]:
def get_thoughts(x):
    try:
        output = x[0]["task"]
        for y in x[1:]:
            try:
                if "observation" in y:
                    output += y["llm_output"] + "\nObservation:" + y["observation"]
                else:
                    output += y["llm_output"] + "\Error:" + str(y["error"])
            except:
                pass
        return output
    except:
        return None


result_df["thoughts"] = result_df["intermediate_steps"].apply(lambda x: get_thoughts(x))

: 

In [None]:
result_df["agent_name"].value_counts()

: 

# 2. Inspect specific runs

In [None]:
gpt4o = "react_code_gpt4o_23-june_planning2_newprompt5"
gpt4o_planning = "react_code_gpt4o_6_aug_planning_nogrammar"
gpt4o_noplanning = "react_code_gpt4o_6_aug_noplanning_nogrammar"
gpt4o_structuredplanning = "react_code_gpt4o_6_aug_structuredplanning_nogrammar"
gpt4o_multiagent = "react_code_gpt4o_13_aug_managedagent_noplanning_nogrammar"

list_versions = [
    gpt4o,
    gpt4o_planning,
    gpt4o_noplanning,
    gpt4o_structuredplanning,
    gpt4o_multiagent,
]

# submission_selection_name = "react_code_llama3-70b_02-05_full-gaia-validation-code"
sel_df = result_df.loc[
    (result_df["agent_name"].isin(list_versions))
    # & (~result_df["question"].isin(UNSOLVED_QUESTIONS))
].reset_index(drop=True)
display(sel_df["agent_name"].value_counts())
sel_df = sel_df.drop_duplicates(subset=["agent_name", "question"])
display(sel_df.groupby("agent_name")[["task"]].value_counts())
print("Total length:", len(sel_df), "- is complete:", len(sel_df) == 165)
# assert sel_df["question"].value_counts().max() == len(list_versions), "Some questions are duplicate!"

: 

In [None]:
from scripts.evaluation.hard_questions import HARD_QUESTIONS

sel_df.loc[
    (sel_df["is_correct"] == False) & (sel_df["is_near_correct"] == True),
    ["question", "prediction", "true_answer"],
]

: 

In [None]:
display("Average score:", sel_df.groupby("agent_name")[["is_correct"]].mean().round(3))
display(
    sel_df.groupby(["agent_name", "task"])[
        ["is_correct", "is_near_correct", "count_steps", "question"]
    ]
    .agg(
        {
            "is_correct": "mean",
            "is_near_correct": "mean",
            "count_steps": "mean",
            "question": "count",
        }
    )
    .rename(columns={"question": "count"})
)

: 

In [None]:
import plotly.express as px

cumulative_df = (
    (
        sel_df.groupby("agent_name")[["is_correct", "is_near_correct"]]
        .expanding(min_periods=1, axis=0, method="single")
        .agg({"is_correct": "mean", "is_near_correct": "count"})
        .reset_index()
    )
    .copy()
    .rename(columns={"is_near_correct": "index"})
)
cumulative_df["index"] = cumulative_df["index"].astype(int) - 1


def find_question(row):
    try:
        res = sel_df.loc[sel_df["agent_name"] == row["agent_name"], "question"].iloc[
            row["index"]
        ][:50]
        return res
    except Exception as e:
        return ""


cumulative_df["question"] = cumulative_df.apply(find_question, axis=1)
# cumulative_df["question"] = [el[:50] for el in sel_df["question"].values]


# cumulative_df["is_correct"] = cumulative_df["is_correct"] * (165 - 68) / 165

px.line(
    cumulative_df,
    color="agent_name",
    x="index",
    y="is_correct",
    hover_data="question",
)

: 

In [79]:
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart

# Email credentials
sender_email = "timic77@yahoo.fr"
sender_password = (
    "yujabgmvixlwyrnh"  # Consider using an App Password if you have 2FA enabled
)
receiver_email = "aymeric@huggingface.co"

# Email content
subject = "Test Email"
body = "Hello world"

# Creating the MIMEText object
msg = MIMEMultipart()
msg["From"] = sender_email
msg["To"] = receiver_email
msg["Subject"] = subject

# Attach the email body to the message
msg.attach(MIMEText(body, "plain"))

print("OKOKOK finished!")

# Sending the email
try:
    # Connect to Yahoo's SMTP server
    server = smtplib.SMTP_SSL("smtp.mail.yahoo.com", 465)
    server.login(sender_email, sender_password)
    server.ehlo()

    text = msg.as_string()
    server.sendmail(sender_email, receiver_email, text)  # Send the email
    print("Email sent successfully")
except smtplib.SMTPServerDisconnected as e:
    print(f"SMTP server disconnected unexpectedly: {e}")
except smtplib.SMTPAuthenticationError as e:
    print(f"Authentication failed: {e}")
except smtplib.SMTPException as e:
    print(f"SMTP error occurred: {e}")
except Exception as e:
    print(f"Failed to send email: {e}")
finally:
    try:
        server.quit()  # Close the connection
    except Exception as e:
        print(f"Failed to close the connection: {e}")

OKOKOK finished!
Email sent successfully


# 3. Dive deeper into one run

In [12]:
sel_df = result_df.loc[result_df["agent_name"] == gpt4o]
print(len(sel_df))

165


### Count errors

In [13]:
import numpy as np

error_types = [
    "AgentParsingError",
    "AgentExecutionError",
    "AgentMaxIterationsError",
    "AgentGenerationError",
]
sel_df[error_types] = 0
sel_df["Count steps"] = np.nan


def count_errors(row):
    if isinstance(row["intermediate_steps"], list):
        row["Count steps"] = len(row["intermediate_steps"])
        for step in row["intermediate_steps"]:
            if isinstance(step, dict) and "error" in step:
                try:
                    row[str(step["error"]["error_type"])] += 1
                except:
                    pass
    return row


sel_df = sel_df.apply(count_errors, axis=1)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

In [14]:
import plotly.express as px


aggregate_errors = (
    sel_df.groupby(["is_correct"])[error_types + ["Count steps"]]
    .mean()
    .reset_index()
    .melt(id_vars=["is_correct"])
)

fig = px.bar(
    aggregate_errors,
    y="value",
    x="variable",
    color="is_correct",
    labels={
        "agent_name": "<b>LLM Engine</b>",
        "task": "<b>Level</b>",
        "aggregate_score": "<b>Performance</b>",
        "value": "<b>Average count</b>",
        "eval_score_GPT4": "<b>Score</b>",
    },
)
fig.update_layout(
    height=500,
    width=800,
    barmode="group",
    bargroupgap=0.0,
)
fig.update_traces(textposition="outside")
fig.write_image("figures/aggregate_errors.png", scale=3)
fig.show()

### Count tool calls

In [15]:
tools_calls = pd.DataFrame.from_records(sel_df["tool_calls"].values).fillna(0)

# Exclude the tools that were not used enough
tools_calls = tools_calls.loc[:, tools_calls.sum() > 10]

# Sort the columns by the sum of the values
tools_calls = tools_calls[tools_calls.sum().sort_values(ascending=False).index]
display(tools_calls)
sel_with_calls = pd.concat(
    [sel_df[["question", "is_correct", "task"]], tools_calls], axis=1
)
sel_with_calls = (
    sel_with_calls.drop("question", axis=1).groupby(["is_correct", "task"]).mean()
)
# sel_with_calls = sel_with_calls.melt(id_vars=['question', 'is_correct', 'task'], var_name="tool", value_name='count')

Unnamed: 0,print,ask_search_agent,final_answer,len,range,inspect_file_as_text,set,visualizer,parse_square,sum,...,max,join,generate_prefixes,sorted,get,lower,f,search_birthdate,items,abs
0,8.0,3.0,1.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5.0,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160,5.0,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
161,3.0,0.0,1.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162,7.0,2.0,1.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
163,20.0,8.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
sel_with_calls = sel_with_calls.reset_index().melt(
    id_vars=["is_correct", "task"], var_name="tool", value_name="average_count"
)

In [17]:
import plotly.express as px

fig = px.bar(
    sel_with_calls,
    x="tool",
    y="average_count",
    color="is_correct",
    facet_row="task",
    labels={
        "agent_name": "<b>Agent variant</b>",
        "task": "<b>Level</b>",
        "aggregate_score": "<b>Performance</b>",
        "eval_score_GPT4": "<b>Score</b>",
        "agent_type": "<b>Agent type</b>",
        "average_count": "<b>Average #calls per run</b>",
    },
)
fig.update_layout(
    barmode="group",
    height=800,
    width=1000,
    title="<b>" + "</b>",
)

### Inspect result by file extension type

In [18]:
display(
    sel_df.groupby(["agent_name", "attachment_type"])[
        ["is_correct", "count_steps", "question"]
    ].agg({"is_correct": "mean", "count_steps": "mean", "question": "count"})
)

Unnamed: 0_level_0,Unnamed: 1_level_0,is_correct,count_steps,question
agent_name,attachment_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
react_code_gpt4o_23-june_planning2_newprompt5,,0.440945,9.19685,127
react_code_gpt4o_23-june_planning2_newprompt5,csv,0.0,7.0,1
react_code_gpt4o_23-june_planning2_newprompt5,docx,0.0,9.0,1
react_code_gpt4o_23-june_planning2_newprompt5,jpg,0.0,9.5,2
react_code_gpt4o_23-june_planning2_newprompt5,jsonld,0.0,16.0,1
react_code_gpt4o_23-june_planning2_newprompt5,mp3,1.0,8.333333,3
react_code_gpt4o_23-june_planning2_newprompt5,pdb,0.0,7.0,1
react_code_gpt4o_23-june_planning2_newprompt5,pdf,0.333333,5.666667,3
react_code_gpt4o_23-june_planning2_newprompt5,png,0.125,6.75,8
react_code_gpt4o_23-june_planning2_newprompt5,pptx,1.0,4.0,1


In [44]:
# Inspect specific file types
# sel_df.loc[
#     sel_df["attachment_type"].isin(["pdb", "docx", "csv"]),
#     [
#         "attachment_type",
#         "question",
#         "prediction",
#         "true_answer",
#         "is_correct",
#         "thoughts",
#     ],
# ]

# 4. Ensembling methods

### 4.1 Simple retry mechanism

In [54]:
first_run_gpt4 = result_df.loc[result_df["agent_name"] == gpt4o].copy()
second_run_gpt4 = result_df.loc[result_df["agent_name"] == noanchorplan].copy()


def replace_answer_if_incomplete(row, result_df_replacement):
    try:
        if (
            "Unable to determine" in row["intermediate_steps"]
            or "AgentMaxIterationsError" in str(row["intermediate_steps"])
            # or "AgentExecutionError" in str(row["intermediate_steps"])
            # or "AgentGenerationError" in str(row["intermediate_steps"])
            or "Error in generating final llm output" in str(row["intermediate_steps"])
        ):
            matching_answer = result_df_replacement.loc[
                (result_df_replacement["question"] == row["question"]), "prediction"
            ].values[0]
            print("replaced")
            gold_answer = matching_answer
        else:
            gold_answer = row["prediction"]
    except:
        gold_answer = row["prediction"]
    return gold_answer


combined_gpt4 = first_run_gpt4.copy()
combined_gpt4["prediction"] = combined_gpt4.apply(
    lambda x: replace_answer_if_incomplete(x, second_run_gpt4), axis=1
)

combined_gpt4["is_correct"] = combined_gpt4.apply(
    lambda x: question_scorer(x["prediction"], x["true_answer"]), axis=1
)

replaced
replaced
replaced
replaced
replaced
replaced
replaced
replaced
replaced
replaced
replaced
replaced
replaced
replaced
replaced
replaced
replaced
replaced
replaced
replaced


In [55]:
print("First run:")
print(first_run_gpt4.groupby(["task"])["is_correct"].mean())
print(first_run_gpt4["is_correct"].mean())

print("Second run:")
print(second_run_gpt4.groupby(["task"])["is_correct"].mean())
print(second_run_gpt4["is_correct"].mean())

print("Combined run:")
print(combined_gpt4.groupby(["task"])["is_correct"].mean())
print(combined_gpt4["is_correct"].mean())

First run:
task
1    0.566038
2    0.418605
3    0.200000
Name: is_correct, dtype: float64
0.4329268292682927
Second run:
task
1    0.528302
2    0.372093
3    0.200000
Name: is_correct, dtype: float64
0.39634146341463417
Combined run:
task
1    0.566038
2    0.395349
3    0.160000
Name: is_correct, dtype: float64
0.4146341463414634


### 4.2 Ideal ensembling

In [87]:
third_run = result_df.loc[result_df["agent_name"] == noanchorplan].copy()
INCLUDE_THIRD_RUN = False


# test ideal ensembling
def score_best_both(row, result_df_replacement):
    try:
        if row["is_correct"]:
            return True

        else:
            matching_answer = result_df_replacement.loc[
                (result_df_replacement["question"] == row["question"])
            ].iloc[0]
            if matching_answer["is_correct"]:
                return True
            else:
                return False
    except:
        return row["is_correct"]


combined_gpt4 = first_run_gpt4.copy()
combined_gpt4["is_correct"] = combined_gpt4.apply(
    lambda x: score_best_both(x, second_run_gpt4), axis=1
)
if INCLUDE_THIRD_RUN:
    combined_gpt4["is_correct"] = combined_gpt4.apply(
        lambda x: score_best_both(x, third_run), axis=1
    )
print("Ideal combined run:")
print(combined_gpt4.groupby(["task"])["is_correct"].mean())
print(combined_gpt4["is_correct"].mean())

Ideal combined run:
task
1    0.641509
2    0.465116
3    0.240000
Name: is_correct, dtype: float64
0.4878048780487805
