In [None]:
from datasets import Dataset, load_dataset, concatenate_datasets
import pandas as pd
import numpy as np

### HotpotQA

In [None]:
hotpotqa_dataset = load_dataset("hotpot_qa", "distractor")
hotpotqa_train = hotpotqa_dataset["train"]

# Stratified sampling: draw a few examples from each difficulty level and
# question type (comparison or bridge).
group_columns = ["level", "type"]
dataset_df = pd.DataFrame(hotpotqa_train)
# `dataset_df` has a default RangeIndex, so the sampled index labels are also
# row positions in the train split and can be passed straight to `.select`.
sample_indicies = dataset_df.groupby(group_columns).sample(4, random_state=10).index.values
hotpotqa_dataset.reset_format()

# Second, disjoint sample: remove the rows picked above, then draw 6 more per
# group. `groupby(...).sample` draws by position inside each group, so dropping
# rows from the frame we already have is equivalent to first materialising the
# left-out rows as a separate Dataset -- without a per-row membership test
# against a NumPy array or a second full Dataset -> DataFrame conversion.
hotpotqa_dataset_leftout_df = (
    dataset_df.drop(index=sample_indicies)
    .groupby(group_columns)
    .sample(6, random_state=42)
    .reset_index(drop=True)
)
hotpotqa_dataset_leftout = Dataset.from_pandas(hotpotqa_dataset_leftout_df)
hotpotqa_dataset = hotpotqa_train.select(sample_indicies)


def _to_benchmark_format(dataset):
    """Add a "task" column of the form "HotpotQA-<level>" and keep only the
    question / answer / task columns."""
    task_column = [f"HotpotQA-{level}" for level in dataset["level"]]
    return dataset.add_column("task", task_column).select_columns(
        ["question", "answer", "task"]
    )


hotpotqa_dataset = _to_benchmark_format(hotpotqa_dataset)
hotpotqa_dataset_leftout = _to_benchmark_format(hotpotqa_dataset_leftout)

In [None]:
# Merge the two HotpotQA samples into a single dataset.
# NOTE: this rebinds `hotpotqa_dataset`, so running this cell again without
# re-running the sampling cell first appends the second sample once more.
hotpotqa_dataset = concatenate_datasets([hotpotqa_dataset, hotpotqa_dataset_leftout])

# Sanity check: the two printed numbers match when no question is repeated.
n_rows = len(hotpotqa_dataset)
n_unique_questions = len(pd.Series(hotpotqa_dataset["question"]).unique())
print(n_rows, n_unique_questions)

### GSM8K

In [None]:
np.random.seed(42)

math_dataset = load_dataset("gsm8k", "main")["train"]

# Number of GSM8K examples to keep in the benchmark.
n_math_examples = 20

# Two independent draws of 15 row indices each. `randint` samples WITH
# replacement, so each draw may contain repeats and the two draws may overlap.
# The RNG calls are left exactly as they were so that the drawn indices for
# this seed do not change.
first_selection = np.random.randint(0, len(math_dataset), 15)
second_selection = np.random.randint(0, len(math_dataset), 15)

# Indices from the second draw that were not already in the first one
# (membership is tested against a set of plain ints rather than an ndarray).
first_selection_set = set(first_selection.tolist())
second_selection_first_excluded = [
    i for i in second_selection.tolist() if i not in first_selection_set
]

# Take the first draw, top it up from the second, and de-duplicate while
# preserving order so that no row can appear twice in the benchmark. When the
# draws contain no repeats this is the first 15 indices plus the first 5 of the
# remainder.
selected_math_indices = list(
    dict.fromkeys(first_selection.tolist() + second_selection_first_excluded)
)[:n_math_examples]
assert len(selected_math_indices) == n_math_examples, (
    f"expected {n_math_examples} unique GSM8K rows, got {len(selected_math_indices)}"
)

math_dataset = math_dataset.select(selected_math_indices)

task_column = ["GSM8K"] * len(math_dataset)
math_dataset = math_dataset.add_column("task", task_column).select_columns(
    ["question", "answer", "task"]
)

### GAIA

In [None]:
# Load the GAIA level-1 validation split and view it as a pandas DataFrame.
gaia_dataset = load_dataset("gaia-benchmark/GAIA", "2023_level1")["validation"]
gaia_dataset.set_format("pandas")
gaia_dataset_df = gaia_dataset[:]

# Pull two fields out of the per-row "Annotator Metadata" mapping into their
# own columns.
annotator_metadata = gaia_dataset_df["Annotator Metadata"]
gaia_dataset_df["number_of_steps"] = annotator_metadata.map(
    lambda metadata: int(metadata["Number of steps"])
)
gaia_dataset_df["tools_used"] = annotator_metadata.map(lambda metadata: metadata["Tools"])

# Keep only the rows whose lower-cased tool description matches none of these
# keywords. The original index labels are preserved by the filter.
excluded_tools_pattern = (
    "pdf|excel|image|video|parsing|audio|word|file|speech|viewer|markdown|python|editor|model"
)
uses_excluded_tool = gaia_dataset_df["tools_used"].str.lower().str.contains(excluded_tools_pattern)
gaia_dataset_df = gaia_dataset_df.loc[~uses_excluded_tool]

In [None]:
# Show full cell contents (no column-width truncation) so the filtered GAIA
# questions can be read in their entirety.
pd.options.display.max_colwidth = None
gaia_dataset_df

In [None]:
# Hand-picked row positions in the GAIA validation split.
selected_indicies = [0, 2, 11, 12, 23, 28, 25, 29, 32, 35, 36, 37, 39, 40, 41, 42, 43, 47, 49, 52]
print(len(selected_indicies))

# Rename to the shared benchmark schema and keep only those two columns.
column_renames = {"Question": "question", "Final answer": "answer"}
gaia_qa = gaia_dataset.rename_columns(column_renames)
gaia_qa = gaia_qa.select_columns(["question", "answer"])

# Undo the pandas output format set earlier before picking the rows.
gaia_qa.reset_format()
gaia_dataset = gaia_qa.select(selected_indicies)

# Tag every selected row with the task name.
task_column = ["GAIA"] * len(gaia_dataset)
gaia_dataset = gaia_dataset.add_column("task", task_column)

In [None]:
# Stack the three per-benchmark samples into the final evaluation set.
benchmark_parts = [math_dataset, hotpotqa_dataset, gaia_dataset]
full_eval_dataset = concatenate_datasets(benchmark_parts)

# Number of examples per task, shown as the cell output.
task_counts = pd.Series(full_eval_dataset["task"]).value_counts()
task_counts

### Export

In [None]:
# Upload the assembled benchmark to the Hugging Face Hub repository below.
hub_repo_id = "ciremya/agents_small_benchmark"
full_eval_dataset.push_to_hub(hub_repo_id)