In [1]:
import json
import os
from copy import deepcopy
from pathlib import Path

import dspy
import pandas as pd
import typer
from bellem.musique.eval import (
    aggregate_scores,
    compute_scores,
    compute_scores_dataframe,
)
from bellem.utils import set_seed
from datasets import load_dataset
from dotenv import load_dotenv
from dspy.evaluate import Evaluate
from dspy.teleprompt.ensemble import Ensemble
from rich.console import Console

# Shadow the builtin print with the print method of a rich Console created
# with stderr=True, so every later print(...) call in this notebook goes
# through that console.
print = Console(stderr=True).print

# NOTE(review): presumably populates os.environ from a .env file so that
# configure_lm can read OPENAI_BASE_URL / OPENAI_API_KEY — confirm.
load_dotenv()

# NOTE(review): presumably seeds the random number generators for
# reproducibility; the exact libraries seeded depend on bellem.utils.set_seed.
set_seed(89)

* 'fields' has been removed


In [2]:
# import weave
# weave.init(project_name="mhqa-dspy")

In [3]:
# import mlflow

# mlflow.set_tracking_uri("http://127.0.0.1:5000/")
# mlflow.set_experiment("mhqa-dspy")
# mlflow.dspy.autolog()

In [4]:
def configure_lm(model, temperature):
    """Create a `dspy.LM` for `model` and register it through `dspy.configure`.

    Args:
        model: Model name; it is prefixed with "openai/" before being handed
            to `dspy.LM`.
        temperature: Sampling temperature forwarded to `dspy.LM`.

    The API base URL and key are read from the OPENAI_BASE_URL and
    OPENAI_API_KEY environment variables, and the LM is created with
    cache=False.
    """
    model_id = "openai/" + model
    lm_options = {
        "temperature": temperature,
        "cache": False,
        "api_base": os.getenv("OPENAI_BASE_URL"),
        "api_key": os.getenv("OPENAI_API_KEY"),
    }
    dspy.configure(lm=dspy.LM(model_id, **lm_options))


In [5]:
from mhqa.react import ReAct
from mhqa.search import make_search_tool


def format_paragraph(paragraph):
    """Render a paragraph mapping as a markdown-style heading plus body.

    Args:
        paragraph: Mapping with "paragraph_text" and "title" entries.

    Returns:
        A string of the form "# <title>" followed by a newline and the
        paragraph text.
    """
    body, heading = paragraph["paragraph_text"], paragraph["title"]
    return "# {}\n{}".format(heading, body)


def make_example(record):
    """Convert one dataset record into a `dspy.Example`.

    Args:
        record: Mapping with "id", "question", "paragraphs",
            "question_decomposition", "answer" and "answer_aliases" entries.
            Each paragraph needs "idx" plus the keys `format_paragraph` reads.

    Returns:
        A `dspy.Example` whose inputs are "question" and "docs". `docs` is a
        list of {"text", "idx"} dicts and `answers` is the reference answer
        followed by its aliases.
    """
    docs = []
    for paragraph in record["paragraphs"]:
        docs.append({"text": format_paragraph(paragraph), "idx": paragraph["idx"]})

    fields = {
        "id": record["id"],
        "question": record["question"],
        "docs": docs,
        "question_decomposition": record["question_decomposition"],
        "answers": [record["answer"], *record["answer_aliases"]],
    }
    return dspy.Example(**fields).with_inputs("question", "docs")


def make_program():
    """Build the ReAct program ("question -> answer") with one search tool.

    Returns:
        A `ReAct` instance (from `mhqa.react`) whose only tool is the one
        produced by `make_search_tool()`.
    """
    tools = [make_search_tool()]
    return ReAct("question -> answer", tools=tools)


def evaluate_answer(example, pred, trace=None):
    """Metric function: the "f1" score of `pred.answer` against `example.answers`.

    Args:
        example: Object exposing the reference answers as `example.answers`.
        pred: Object exposing the predicted answer as `pred.answer`.
        trace: Accepted but not used by this function.

    Returns:
        The "f1" entry of the dict returned by `compute_scores`.
    """
    return compute_scores(pred.answer, example.answers)["f1"]


def dynamic_import(module, name):
    """Import `module` by its dotted path and return its attribute `name`.

    Args:
        module: Dotted module path, e.g. "dspy.teleprompt".
        name: Attribute to fetch from the imported module.

    Raises:
        ImportError: If the module cannot be imported.
        AttributeError: If the module has no attribute called `name`.
    """
    from importlib import import_module

    loaded_module = import_module(module)
    return getattr(loaded_module, name)


def make_optimizer(optimizer_config: dict):
    """Instantiate the `dspy.teleprompt` optimizer described by `optimizer_config`.

    Args:
        optimizer_config: Dict with three required keys:
            "class" is the attribute name looked up in `dspy.teleprompt`;
            "params" holds the constructor keyword arguments (deep-copied,
            so the config itself is never mutated);
            "with_metric", when truthy, passes `evaluate_answer` as the
            `metric` keyword argument.

    Returns:
        The constructed optimizer instance.
    """
    optimizer_cls = dynamic_import("dspy.teleprompt", optimizer_config["class"])
    init_kwargs = deepcopy(optimizer_config["params"])
    wants_metric = optimizer_config["with_metric"]
    if wants_metric:
        init_kwargs.update(metric=evaluate_answer)
    return optimizer_cls(**init_kwargs)


def preprocess_result(result):
    """Flatten one `(example, pred, score)` triple into a single dict.

    Args:
        result: A 3-item sequence `(example, pred, score)`; `example` and
            `pred` must be convertible with `dict(...)`.

    Returns:
        A dict containing the example's fields, the prediction's fields with
        each key prefixed by "predicted_", and "score" as a float. Later
        groups overwrite earlier ones on key collisions.
    """
    example, pred, score = result
    predictions = {}
    for field, value in dict(pred).items():
        predictions["predicted_{}".format(field)] = value

    row = dict(example)
    row.update(predictions)
    row["score"] = float(score)
    return row


def make_results_dataframe(results):
    """Build the per-example results table and add the scoring columns.

    Args:
        results: Iterable of `(example, pred, score)` triples, as accepted by
            `preprocess_result`.

    Returns:
        Whatever `compute_scores_dataframe` returns for the normalized frame,
        which carries an `n_hops` column (length of `question_decomposition`)
        and a `predicted_answer` column with missing values replaced by
        "No Answer".
    """
    dataf = pd.json_normalize([preprocess_result(result) for result in results])
    dataf["n_hops"] = dataf["question_decomposition"].apply(len)
    # preprocess_result only emits "predicted_answer" when the prediction has
    # an "answer" field. If no prediction had one, the column is absent and
    # indexing it would raise KeyError, so create it with the placeholder.
    if "predicted_answer" in dataf.columns:
        dataf["predicted_answer"] = dataf["predicted_answer"].fillna("No Answer")
    else:
        dataf["predicted_answer"] = "No Answer"
    return compute_scores_dataframe(dataf)


def train_main(
    dataset_path: str = typer.Option(..., help="Path to the dataset"),
    dataset_name: str = typer.Option(..., help="Name of the dataset"),
    dataset_split: str = typer.Option(..., help="Dataset split to use (e.g., 'train', 'validation')"),
    model: str = typer.Option(..., help="Name of the model to use"),
    temperature: float = typer.Option(..., help="Temperature parameter for the model"),
    load_from: str = typer.Option(default="UNSET", help="Path to a saved model to load"),
    optimizer_path: Path = typer.Option(..., help="Path to the optimizer config"),
    ensemble: str = typer.Option("no", help="Whether to use an ensemble of models"),
    out: Path = typer.Option(..., help="Output file for trained program"),
):
    """Compile (train) the ReAct program on a dataset split and save it.

    Steps: configure the LM, load the dataset and convert each record with
    `make_example`, build the program (optionally loading saved state), run
    the optimizer described by the JSON file at `optimizer_path`, optionally
    wrap the optimizer's candidate programs in an `Ensemble`, and save the
    result to `out`.

    Args:
        dataset_path: First positional argument to `load_dataset`.
        dataset_name: Second positional argument to `load_dataset`.
        dataset_split: Passed as `split=` to `load_dataset`.
        model: Model name forwarded to `configure_lm`.
        temperature: Temperature forwarded to `configure_lm`.
        load_from: Path of a saved program to load; "UNSET" or empty means
            start from a fresh program.
        optimizer_path: JSON file with the optimizer config. An empty config
            skips optimization. An optional "compile_params" entry is
            forwarded to `optimizer.compile`.
        ensemble: "yes" to build a majority-vote ensemble from the optimized
            program's `candidate_programs`.
        out: File the trained program is saved to; parent directories are
            created.

    Returns:
        The trained (or untouched, if no optimizer config) program.

    Raises:
        ValueError: If `ensemble == "yes"` but the program has no
            `candidate_programs` attribute to build the ensemble from.
    """
    # When this function is called directly instead of through typer's CLI,
    # `out` may be a plain string and omitted arguments keep their
    # typer.Option placeholder rather than a real value.
    out = Path(out)
    out.parent.mkdir(parents=True, exist_ok=True)

    # Set up LLM
    configure_lm(model, temperature)

    # Load and preprocess datasets
    ds = load_dataset(dataset_path, dataset_name, split=dataset_split)
    examples = [make_example(record) for record in ds]
    print(f"Loaded {len(examples)} examples")

    # Create the program
    program = make_program()
    # Only real path-like values count as a checkpoint; this filters out the
    # "UNSET" sentinel, empty strings, None and the typer.Option placeholder.
    has_checkpoint = isinstance(load_from, (str, os.PathLike)) and str(load_from) not in ("", "UNSET")
    if has_checkpoint:
        print(f"Loading model from {load_from}")
        program.load(load_from)

    # Train the program
    with open(optimizer_path, encoding="utf-8") as f:
        optimizer_config = json.load(f)

    if optimizer_config:
        optimizer = make_optimizer(optimizer_config)
        compile_params = optimizer_config.get("compile_params", {})
        trained_program = optimizer.compile(program, trainset=examples, **compile_params)
    else:
        trained_program = program

    # An omitted `ensemble` (placeholder object) compares unequal to "yes",
    # so the ensemble step is skipped in that case.
    if ensemble == "yes":
        candidates = getattr(trained_program, "candidate_programs", None)
        if candidates is None:
            raise ValueError(
                "ensemble='yes' requires an optimizer that attaches `candidate_programs` "
                "to the compiled program"
            )
        ensemble_optimizer = Ensemble(reduce_fn=dspy.majority)
        # The program is the last item of each candidate entry.
        candidate_programs = [x[-1] for x in candidates]
        trained_program = ensemble_optimizer.compile(candidate_programs)

    # Save the trained program
    trained_program.save(out)

    return trained_program

def evaluate_main(
    dataset_path: str = typer.Option(..., help="Path to the dataset"),
    dataset_name: str = typer.Option(..., help="Name of the dataset"),
    dataset_split: str = typer.Option(..., help="Dataset split to use (e.g., 'train', 'validation')"),
    model: str = typer.Option(..., help="Name of the model to use"),
    temperature: float = typer.Option(..., help="Temperature parameter for the model"),
    load_from: str = typer.Option(default="UNSET", help="Path to a saved model to load"),
    out: Path = typer.Option(..., help="Output directory for generated results"),
):
    """Evaluate the ReAct program on a dataset split and write results to `out`.

    Writes two files into the `out` directory:
    `results.jsonl` holds one JSON record per example, taken from the frame
    built by `make_results_dataframe`;
    `scores.json` holds the aggregate scores for the whole split plus one
    "<n>hops" entry per distinct `n_hops` value.

    Args:
        dataset_path: First positional argument to `load_dataset`.
        dataset_name: Second positional argument to `load_dataset`.
        dataset_split: Passed as `split=` to `load_dataset`.
        model: Model name forwarded to `configure_lm`.
        temperature: Temperature forwarded to `configure_lm`.
        load_from: Path of a saved program to load; "UNSET" or empty means
            evaluate a fresh program.
        out: Output directory; created (with parents) if missing.
    """
    # When this function is called directly instead of through typer's CLI,
    # `out` may be a plain string and omitted arguments keep their
    # typer.Option placeholder rather than a real value.
    out = Path(out)
    out.mkdir(parents=True, exist_ok=True)

    # Set up LLM
    configure_lm(model, temperature)

    # Load and preprocess datasets
    ds = load_dataset(dataset_path, dataset_name, split=dataset_split)
    examples = [make_example(record) for record in ds]
    print(f"Loaded {len(examples)} examples")

    # Create the program
    program = make_program()
    # Only real path-like values count as a checkpoint; this filters out the
    # "UNSET" sentinel, empty strings, None and the typer.Option placeholder.
    has_checkpoint = isinstance(load_from, (str, os.PathLike)) and str(load_from) not in ("", "UNSET")
    if has_checkpoint:
        print(f"Loading model from {load_from}")
        program.load(load_from)

    # Evaluate the program
    evaluate_program = Evaluate(
        metric=evaluate_answer,
        devset=examples,
        num_threads=4,
        display_progress=True,
        return_outputs=True,
    )
    # With return_outputs=True the call returns a pair; only the second item
    # (the per-example results) is used here.
    _, results = evaluate_program(program)

    # Save the results
    result_df = make_results_dataframe(results)
    result_df.to_json(out / "results.jsonl", orient="records", lines=True)

    # Save the scores: overall first, then one breakdown per hop count
    scores = aggregate_scores(result_df)
    for n_hops in result_df["n_hops"].unique():
        scores[f"{n_hops}hops"] = aggregate_scores(result_df[result_df["n_hops"] == n_hops])

    with open(out / "scores.json", "w") as f:
        json.dump(scores, f, indent=2)


In [6]:
# Model name passed to train_main in the next cell (configure_lm prefixes it
# with "openai/"). The commented lines are alternative model names kept for
# quick switching.
model='llama-3.3-70b-tgi'
# model='meta-llama/Llama-3.3-70B-Instruct-Turbo'
# model='llama3.1:8b-instruct-q8_0'
# model='llama-3.1-8b-instant'
# model='gemini-2.0-flash-lite-preview-02-05'

In [7]:
# File the compiled program is saved to (passed to train_main as `out`).
trained_program_filepath = Path('trained-program.json')

# Run the training pipeline on the 'train[:50]' slice of the dataset.
# load_from='UNSET' is the sentinel train_main treats as "no saved program to
# load". `ensemble` is not passed, so train_main's ensemble step is not
# triggered by this call.
trained_program = train_main(
    dataset_path='bdsaglam/musique-mini',
    dataset_name='answerable',
    dataset_split='train[:50]',
    model=model,
    temperature=0.1,
    load_from='UNSET',
    optimizer_path='../data/raw/optimizer-configs/bfsrs-light.json',
    out=trained_program_filepath,
)

Loading default t5 model for language en
Default Model: unicamp-dl/InRanker-base
Loading T5Ranker model unicamp-dl/InRanker-base (this message can be suppressed by setting verbose=0)
No device set
Using device cuda
No dtype set
Using dtype torch.float32
Loading model unicamp-dl/InRanker-base, this might take a while...
Using device cuda.
Using dtype torch.float32.


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


T5 true token set to ▁true
T5 false token set to ▁false
Returning normalised scores...
Inputs template set to Query: {query} Document: {text} Relevant:
Going to sample between 1 and 8 traces per predictor.
Will attempt to bootstrap 4 candidate sets.
  0%|          | 0/50 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 1.27 / 3 (42.4%):   6%|▌         | 3/50 [00:13<02:35,  3.32s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 2.43 / 6 (40.4%):  12%|█▏        | 6/50 [00:26<02:17,  3.11s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 2.43 / 7 (34.7%):  14%|█▍        | 7/50 [00:36<04:01,  5.62s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 3.43 / 8 (42.8%):  16%|█▌        | 8/50 [00:39<03:12,  4.59s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 4.43 / 9 (49.2%):  18%|█▊        | 9/50 [00:44<03:10,  4.64s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 4.43 / 10 (44.3%):  20%|██        | 10/50 [00:52<03:53,  5.84s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 4.89 / 11 (44.4%):  22%|██▏       | 11/50 [01:03<04:46,  7.35s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 5.55 / 14 (39.6%):  28%|██▊       | 14/50 [01:14<02:47,  4.65s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 6.48 / 15 (43.2%):  30%|███       | 15/50 [01:17<02:29,  4.27s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 6.62 / 16 (41.4%):  32%|███▏      | 16/50 [01:23<02:43,  4.80s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 7.12 / 17 (41.9%):  34%|███▍      | 17/50 [01:28<02:40,  4.86s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 7.59 / 19 (39.9%):  38%|███▊      | 19/50 [01:38<02:24,  4.65s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 8.59 / 21 (40.9%):  42%|████▏     | 21/50 [01:40<01:22,  2.85s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 8.80 / 23 (38.3%):  46%|████▌     | 23/50 [01:57<02:17,  5.10s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 9.80 / 25 (39.2%):  50%|█████     | 25/50 [02:11<02:23,  5.75s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 9.80 / 26 (37.7%):  52%|█████▏    | 26/50 [02:12<01:39,  4.17s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 11.30 / 28 (40.4%):  56%|█████▌    | 28/50 [02:19<01:22,  3.75s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 12.30 / 29 (42.4%):  58%|█████▊    | 29/50 [02:24<01:24,  4.03s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 13.70 / 32 (42.8%):  64%|██████▍   | 32/50 [02:33<00:55,  3.08s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 14.70 / 33 (44.5%):  66%|██████▌   | 33/50 [02:35<00:48,  2.88s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 15.85 / 35 (45.3%):  70%|███████   | 35/50 [02:45<00:53,  3.57s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 15.85 / 36 (44.0%):  72%|███████▏  | 36/50 [02:46<00:40,  2.91s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 15.85 / 37 (42.8%):  74%|███████▍  | 37/50 [02:56<01:03,  4.86s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 16.75 / 39 (43.0%):  78%|███████▊  | 39/50 [03:00<00:36,  3.34s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 16.95 / 42 (40.4%):  84%|████████▍ | 42/50 [03:15<00:27,  3.47s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 17.95 / 43 (41.8%):  86%|████████▌ | 43/50 [03:20<00:28,  4.02s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 19.95 / 45 (44.3%):  90%|█████████ | 45/50 [03:28<00:18,  3.73s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 21.08 / 47 (44.8%):  94%|█████████▍| 47/50 [03:34<00:09,  3.20s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 22.18 / 50 (44.4%): 100%|██████████| 50/50 [03:47<00:00,  4.54s/it]

2025/02/11 14:39:25 INFO dspy.evaluate.evaluate: Average Metric: 22.17803028551273 / 50 (44.4%)



New best score: 44.36 for seed -3
Scores so far: [44.36]
Best score so far: 44.36
  0%|          | 0/50 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 0.27 / 2 (13.6%):   4%|▍         | 2/50 [00:13<04:35,  5.74s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 1.27 / 3 (42.4%):   6%|▌         | 3/50 [00:16<03:28,  4.43s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 1.43 / 4 (35.6%):   8%|▊         | 4/50 [00:26<05:10,  6.75s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 1.43 / 5 (28.5%):  10%|█         | 5/50 [00:28<03:46,  5.04s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 1.43 / 6 (23.8%):  12%|█▏        | 6/50 [00:34<04:00,  5.47s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 2.43 / 7 (34.7%):  14%|█▍        | 7/50 [00:37<03:09,  4.41s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 3.43 / 8 (42.8%):  16%|█▌        | 8/50 [00:41<03:02,  4.35s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 3.93 / 9 (43.6%):  18%|█▊        | 9/50 [00:51<04:10,  6.12s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 4.93 / 11 (44.8%):  22%|██▏       | 11/50 [00:54<02:23,  3.67s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 7.29 / 14 (52.1%):  28%|██▊       | 14/50 [01:10<02:25,  4.04s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 7.43 / 15 (49.5%):  30%|███       | 15/50 [01:11<01:51,  3.19s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 7.43 / 16 (46.4%):  32%|███▏      | 16/50 [01:19<02:31,  4.45s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 7.93 / 17 (46.7%):  34%|███▍      | 17/50 [01:22<02:17,  4.15s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 9.33 / 20 (46.7%):  40%|████      | 20/50 [01:33<01:40,  3.37s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 9.33 / 21 (44.5%):  42%|████▏     | 21/50 [01:42<02:20,  4.83s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 9.55 / 24 (39.8%):  48%|████▊     | 24/50 [01:50<01:17,  2.97s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 10.05 / 25 (40.2%):  50%|█████     | 25/50 [01:59<02:02,  4.91s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 11.05 / 26 (42.5%):  52%|█████▏    | 26/50 [02:06<02:11,  5.46s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 12.05 / 27 (44.6%):  54%|█████▍    | 27/50 [02:14<02:24,  6.30s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 14.05 / 29 (48.4%):  58%|█████▊    | 29/50 [02:20<01:29,  4.28s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 14.19 / 30 (47.3%):  60%|██████    | 30/50 [02:23<01:21,  4.09s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 14.59 / 31 (47.1%):  62%|██████▏   | 31/50 [02:25<01:06,  3.50s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 14.59 / 32 (45.6%):  64%|██████▍   | 32/50 [02:32<01:17,  4.32s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 14.74 / 33 (44.7%):  66%|██████▌   | 33/50 [02:39<01:27,  5.15s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 15.74 / 34 (46.3%):  68%|██████▊   | 34/50 [02:43<01:18,  4.89s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 16.74 / 35 (47.8%):  70%|███████   | 35/50 [02:47<01:10,  4.72s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 16.74 / 36 (46.5%):  72%|███████▏  | 36/50 [02:50<00:59,  4.24s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 17.24 / 37 (46.6%):  74%|███████▍  | 37/50 [02:51<00:41,  3.22s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 18.14 / 39 (46.5%):  78%|███████▊  | 39/50 [03:06<00:51,  4.64s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 18.14 / 40 (45.4%):  80%|████████  | 40/50 [03:11<00:47,  4.75s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 18.14 / 41 (44.2%):  82%|████████▏ | 41/50 [03:17<00:46,  5.18s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 18.41 / 42 (43.8%):  84%|████████▍ | 42/50 [03:18<00:32,  4.05s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 19.41 / 43 (45.1%):  86%|████████▌ | 43/50 [03:25<00:33,  4.80s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 20.41 / 44 (46.4%):  88%|████████▊ | 44/50 [03:29<00:27,  4.62s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 21.41 / 45 (47.6%):  90%|█████████ | 45/50 [03:31<00:19,  3.93s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 23.41 / 47 (49.8%):  94%|█████████▍| 47/50 [03:39<00:10,  3.66s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 24.41 / 48 (50.9%):  96%|█████████▌| 48/50 [03:44<00:07,  3.78s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Average Metric: 24.61 / 50 (49.2%): 100%|██████████| 50/50 [03:52<00:00,  4.64s/it]

2025/02/11 14:43:18 INFO dspy.evaluate.evaluate: Average Metric: 24.608474566369303 / 50 (49.2%)



New best score: 49.22 for seed -2
Scores so far: [44.36, 49.22]
Best score so far: 49.22


  0%|          | 0/50 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

  2%|▏         | 1/50 [00:09<07:27,  9.12s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

  4%|▍         | 2/50 [00:18<07:11,  9.00s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

  6%|▌         | 3/50 [02:19<47:20, 60.43s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

  8%|▊         | 4/50 [02:36<33:16, 43.40s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

 10%|█         | 5/50 [02:52<24:58, 33.31s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

 12%|█▏        | 6/50 [03:25<24:16, 33.11s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

 14%|█▍        | 7/50 [08:06<1:21:57, 114.36s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

 16%|█▌        | 8/50 [08:22<58:08, 83.07s/it]   

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

 18%|█▊        | 9/50 [13:19<1:42:26, 149.91s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

 20%|██        | 10/50 [13:37<54:28, 81.72s/it]   


Bootstrapped 8 full traces after 10 examples for up to 10 rounds, amounting to 31 attempts.
  0%|          | 0/50 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

2025/02/11 14:57:14 ERROR dspy.utils.parallelizer: Error processing item Example({'id': '2hop__97238_154727', 'question': 'What year did the war Jameel Sayhood fought in start?', 'docs': [{'text': "# Muslim Atayev\nMuslim Atayev (June 24, 1973 – January 27, 2005), also known as Emir Sayfullah, was the founder of the militant organization Yarmuk Jamaat, which later became part of the Caucasus Front's Kabardino-Balkarian Sector in the Russian-held Caucasian Muslim state Kabardino-Balkaria of the Second Chechen War. Atayev was an ethnic Balkar and started his military career as a volunteer fighting in Chechnya.", 'idx': 0}, {'text': '# Warsaw Pact\nFor 36 years, NATO and the Warsaw Pact never directly waged war against each other in Europe; the United States and the Soviet Union and their respective allies implemented strategic policies aimed at the containment of each other in Europe, while working and fighting for influence within the wider Cold War on the international stage.', 'idx': 

Average Metric: 0.00 / 0 (0%):   2%|▏         | 1/50 [00:19<15:40, 19.20s/it]



Average Metric: 0.00 / 0 (0%):   4%|▍         | 2/50 [00:20<06:50,  8.56s/it]

2025/02/11 14:57:16 ERROR dspy.utils.parallelizer: Error processing item Example({'id': '2hop__71611_90450', 'question': 'Who was president when the place where the majority of sweet corn is grown became a state?', 'docs': [{'text': "# Politics of Texas\nIn a reversal of alignments, since the late 1960s the Republican Party has grown more prominent within the state based on an influx of primarily white voters (the majority in the state) from the Democratic Party. By the mid-1990s, it became the state's dominant political party.", 'idx': 0}, {'text': '# Corn production in the United States\nIowa, the largest producer of corn in the US, grows three times as much corn as Mexico. Iowa harvested 3,548 acres (1,436 ha) of sweet corn in 2007. In 2011, the state had 92,300 corn farms on 30,700,000 acres (12,400,000 ha), the average size being 333 acres (135 ha), and the average dollar value per acre being US $6,708. In the same year, there were 13.7 million harvested acres of corn for grain, p

Average Metric: 0.00 / 0 (0%):   6%|▌         | 3/50 [00:20<03:51,  4.94s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

2025/02/11 14:57:20 ERROR dspy.utils.parallelizer: Error processing item Example({'id': '2hop__819850_32467', 'question': 'How old are some of the private schools in the city where Basilica of the Co-Cathedral of the Sacred Heart is located?', 'docs': [{'text': '# Barbara Boggs Sigmund\nA graduate of Stone Ridge School of the Sacred Heart and Manhattanville College, she taught at the Stuart Country Day School of the Sacred Heart (Princeton, N.J.), which, in honor of her life, now annually awards the Barbara Boggs Sigmund Alumnae Award.', 'idx': 0}, {'text': '# Mosaic\nSometimes not only church interiors but façades were also decorated with mosaics in Italy like in the case of the St Mark\'s Basilica in Venice (mainly from the 17th–19th centuries, but the oldest one from 1270–75, "The burial of St Mark in the first basilica"), the Cathedral of Orvieto (golden Gothic mosaics from the 14th century, many times redone) and the Basilica di San Frediano in Lucca (huge, striking golden mosaic 

Average Metric: 0.00 / 0 (0%):   8%|▊         | 4/50 [00:24<03:26,  4.49s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

2025/02/11 14:57:29 ERROR dspy.utils.parallelizer: Error processing item Example({'id': '2hop__612535_47295', 'question': 'What is the area code for the state where Thomas H. Makiyama was born?', 'docs': [{'text': '# Hamilton City, California\nHamilton City (formerly, Hamilton) is a census-designated place (CDP) in Glenn County, California, United States. The population was 1,759 at the 2010 census, down from 1,903 at the 2000 census. Hamilton City is located east of Orland, and 10 miles west of Chico at an elevation of 151 feet (46 m). The community is inside area code 530. The default prefix used for wired telephones in the Hamilton City area is 826. The postal ZIP Code is 95951.', 'idx': 0}, {'text': '# Alhambra, Montana\nAlhambra is a populated place in Jefferson County, Montana, United States. It is a subdivision about a mile south of Clancy and shares a postal code (59634) with that town. Alhambra is part of the Helena Micropolitan Area, and its population is counted within the C

Average Metric: 0.00 / 0 (0%):  10%|█         | 5/50 [00:33<04:35,  6.12s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

2025/02/11 14:57:33 ERROR dspy.utils.parallelizer: Error processing item Example({'id': '2hop__829722_55566', 'question': "Who did Don Arlich's team play in the playoffs?", 'docs': [{'text': '# Don Arlich\nDonald Louis "Don" Arlich (born February 15, 1943) is an American retired professional baseball player whose career spanned nine seasons, including parts of two in Major League Baseball with the Houston Astros in 1965 and 1966. During his major league career, Arlich compiled a record of 0–1 with an 8.10 earned run average (ERA) in eight games, one start. He also played in the minor leagues with the Class-A Jacksonville Jets; the Class-B, and later Class-A Durham Bulls; the Double-A San Antonio Bullets; the Double-A Amarillo Sonics; the Triple-A Oklahoma City 89ers; the Double-A Austin Braves; and the Triple-A Richmond Braves. While in the minors, Arlich compiled a record of 45–48 with a 3.86 ERA in 221 games, 107 starts.', 'idx': 0}, {'text': '# New York Jets\nThe team was founded in

Average Metric: 0.00 / 0 (0%):  12%|█▏        | 6/50 [00:37<03:56,  5.38s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

2025/02/11 14:57:33 ERROR dspy.utils.parallelizer: Error processing item Example({'id': '2hop__281903_421645', 'question': 'Who is the spouse of the performer of Mistletoe and Holly?', 'docs': [{'text': "# Blue Suede Shoes\n``Blue Suede Shoes ''is a rock - and - roll standard written and first recorded by Carl Perkins in 1955. It is considered one of the first rockabilly (rock - and - roll) records, incorporating elements of blues, country and pop music of the time. Perkins' original version of the song was on the Cashbox Best Selling Singles list for 16 weeks and spent two weeks in the number two position. Elvis Presley performed his version of the song three different times on national television. It was also recorded by Buddy Holly and Eddie Cochran, among many others.", 'idx': 0}, {'text': "# The Day the Music Died\nAt the time, Holly and his band, consisting of Waylon Jennings, Tommy Allsup, and Carl Bunch, were playing on the ``Winter Dance Party ''tour across the Midwest. Rising

Average Metric: 0.00 / 0 (0%):  14%|█▍        | 7/50 [00:38<02:42,  3.78s/it]

2025/02/11 14:57:34 ERROR dspy.utils.parallelizer: Error processing item Example({'id': '2hop__497910_351187', 'question': 'What county is the city where Kristen Graczyk was born located in?', 'docs': [{'text': '# Ap Lo Chun\nAp Lo Chun () is a small island in the New Territories of Hong Kong. It is located in Ap Chau Bay () between Ap Chau in the east and Sai Ap Chau in the west, with the islet of Ap Tan Pai nearby in the northeast. It is under the administration of North District.', 'idx': 0}, {'text': "# Biysky District\nBiysky District () is an administrative and municipal district (raion), one of the fifty-nine in Altai Krai, Russia. It is located in the east of the krai and borders with Zonalny, Tselinny, Soltonsky, Krasnogorsky, Sovetsky, and Smolensky Districts, as well as with the territory of the City of Biysk. The area of the district is . Its administrative center is the city of Biysk (which is not administratively a part of the district). District's population:", 'idx': 1}

Average Metric: 0.00 / 0 (0%):  16%|█▌        | 8/50 [00:39<02:00,  2.87s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

2025/02/11 14:57:46 ERROR dspy.utils.parallelizer: Error processing item Example({'id': '2hop__130718_62851', 'question': 'Who helped resolve the dispute between Virginia and the state where Jacob Highbarger House is located?', 'docs': [{'text': '# Science Museum of Virginia\nThe Science Museum of Virginia is a science museum located in Richmond, Virginia. Established in 1970, it is an agency of the Commonwealth of Virginia. It is housed in the former Broad Street Station, built in 1917.', 'idx': 0}, {'text': '# Fort Seybert, West Virginia\nFort Seybert is an unincorporated community located in Pendleton County, West Virginia, United States. This town was named for Captain Jacob Seybert who built an early stockade here. It was captured by Native Americans in 1758, who spared only eleven lives (see Bemino). Fort Seybert is the only place in the United States with this name.', 'idx': 1}, {'text': '# Jacobs Fork, West Virginia\nJacobs Fork is an unincorporated community in McDowell County

Average Metric: 0.00 / 0 (0%):  18%|█▊        | 9/50 [00:50<03:52,  5.66s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

BadRequestError: litellm.BadRequestError: OpenAIException - Error code: 400 - {'error': {'message': "litellm.BadRequestError: OpenAIException - Error code: 422 - {'error': 'Input validation error: `inputs` tokens + `max_new_tokens` must be <= 8192. Given: 12583 `inputs` tokens and 1000 `max_new_tokens`', 'error_type': 'validation'}\nReceived Model Group=llama-3.3-70b-tgi\nAvailable Model Group Fallbacks=None", 'type': None, 'param': None, 'code': '400'}}

In [None]:
# Path object handed to evaluate_main through its `out` keyword.
out = Path("out")

# Gather the evaluation settings in one mapping so they can be inspected
# before the (potentially long-running) call is launched.
# NOTE(review): `model` and `trained_program_filepath` must already exist in the
# kernel from earlier cells -- confirm they survive a Restart & Run All.
# NOTE(review): "train[100:200]" presumably selects rows 100-199 of the train
# split -- confirm against how evaluate_main loads the dataset.
evaluation_config = {
    "dataset_path": "bdsaglam/musique",
    "dataset_name": "answerable",
    "dataset_split": "train[100:200]",
    "model": model,
    "temperature": 0.1,
    "load_from": trained_program_filepath,
    "out": out,
}

evaluate_main(**evaluation_config)