In [2]:
import json
import os
from copy import deepcopy
from pathlib import Path

import dspy
import pandas as pd
import typer
import weave
from dotenv import load_dotenv
from dspy.evaluate import Evaluate
from dspy.teleprompt.ensemble import Ensemble
from rich.console import Console

from mhqa.agent import make_decomposing_agent, make_simple_agent
from mhqa.evaluation import (
    aggregate_scores,
    compute_scores,
    compute_scores_dataframe,
)
from mhqa.multihop import make_multihop_program
from mhqa.qa import make_qa_program
from mhqa.search import make_retriever
from mhqa.utils import configure_lm, dynamic_import, set_seed
import weave

# Shadow the built-in print with rich's Console.print bound to stderr, so every
# later print(...) call in this notebook is rendered by rich on stderr.
print = Console(stderr=True).print

# Called with no arguments; presumably loads API keys/settings from a local .env
# file into the environment — TODO confirm which variables the project needs.
load_dotenv()

# Fixed seed (89) via the project helper; presumably seeds the RNGs the project
# uses so runs are repeatable — verify against mhqa.utils.set_seed.
set_seed(89)

* 'fields' has been removed


In [3]:
# Initialise Weave for the "mhqa-dspy-debug" project; functions decorated with
# @weave.op() below are associated with this project.
weave.init(project_name="mhqa-dspy-debug")


def preprocess_examples(examples: list[dspy.Example], technique: str):
    """Mark which fields of each example are inputs for the chosen technique.

    Techniques whose name contains "agent" or "multihop" receive
    ("docs", "question") as inputs; every other technique receives
    ("context", "question").

    Returns:
        A new list holding the result of ``example.with_inputs(...)`` for
        each element of ``examples``, in the same order.
    """
    uses_docs = any(marker in technique for marker in ("agent", "multihop"))
    input_keys = ("docs", "question") if uses_docs else ("context", "question")
    return [example.with_inputs(*input_keys) for example in examples]


def make_program(technique: str, retriever_name: str, top_k: int):
    """Build the program object for the requested technique.

    A retriever is always created first from ``retriever_name`` and ``top_k``.
    The three retrieval-based techniques ("agent-simple", "agent-decompose",
    "multihop-decompose") are built around that retriever; any other technique
    name is handed to ``make_qa_program`` unchanged.
    """
    retriever = make_retriever(retriever_name, top_k=top_k)
    retrieval_factories = (
        ("agent-simple", make_simple_agent),
        ("agent-decompose", make_decomposing_agent),
        ("multihop-decompose", make_multihop_program),
    )
    for name, factory in retrieval_factories:
        if technique == name:
            return factory(retriever)
    # No retrieval-based technique matched: fall back to a plain QA program.
    return make_qa_program(technique)


@weave.op()
def evaluate_answer(example, pred, trace=None):
    """Metric: the "f1" entry of the scores for one prediction.

    ``pred.answer`` is scored against ``example.answers`` with
    ``compute_scores``. ``trace`` is accepted but not used here.
    """
    return compute_scores(pred.answer, example.answers)["f1"]


def make_optimizer(optimizer_config: dict):
    """Instantiate an optimizer described by a config mapping.

    Keys read from ``optimizer_config``:
        "class":       name resolved inside "dspy.teleprompt" via ``dynamic_import``.
        "params":      keyword arguments for the class; deep-copied so the
                       caller's config is never modified.
        "with_metric": when truthy, ``metric=evaluate_answer`` is added to
                       (or overrides) the keyword arguments.
    """
    optimizer_cls = dynamic_import("dspy.teleprompt", optimizer_config["class"])
    params = deepcopy(optimizer_config["params"])
    metric_kwargs = {"metric": evaluate_answer} if optimizer_config["with_metric"] else {}
    return optimizer_cls(**{**params, **metric_kwargs})


def preprocess_result(result):
    """Flatten one ``(example, pred, score)`` triple into a single record.

    The record contains every example field, every prediction field under a
    "predicted_" prefix, and the score converted to ``float`` under "score".
    On a key clash, prediction fields override example fields and "score"
    overrides both.
    """
    example, pred, score = result
    prefixed = {}
    for key, value in dict(pred).items():
        prefixed[f"predicted_{key}"] = value
    record = dict(example)
    record.update(prefixed)
    record["score"] = float(score)
    return record


def make_results_dataframe(results):
    """Assemble evaluation results into a DataFrame and score it.

    Args:
        results: iterable of ``(example, pred, score)`` triples, each in the
            form accepted by ``preprocess_result``.

    Returns:
        Whatever ``compute_scores_dataframe`` returns for the assembled frame.

    Raises:
        KeyError: if the records carry no "question_decomposition" field.
    """
    dataf = pd.json_normalize([preprocess_result(result) for result in results])
    # Hop count is the length of each example's question decomposition.
    dataf["n_hops"] = dataf["question_decomposition"].apply(len)
    # Rows without a predicted answer are labelled "No Answer". When no
    # prediction at all carries an "answer" field (e.g. every item failed),
    # json_normalize produces no "predicted_answer" column, so create it
    # instead of raising KeyError.
    if "predicted_answer" in dataf.columns:
        dataf["predicted_answer"] = dataf["predicted_answer"].fillna("No Answer")
    else:
        dataf["predicted_answer"] = "No Answer"
    return compute_scores_dataframe(dataf)




Logged in as Weights & Biases user: bdsaglam.
View Weave data at https://wandb.ai/bdsaglam/mhqa-dspy-debug/weave


In [6]:
# Run configuration, hardcoded from params.yaml so the notebook can be run
# without the pipeline.

# Dataset loaded by the evaluation cell (path, configuration name, split).
dataset_path = "bdsaglam/musique-sweep"
dataset_name = "answerable"
dataset_split = "train"

# NOTE(review): `optimizer` and `ensemble` are not referenced by any cell
# visible in this notebook — confirm whether they are still needed.
optimizer = "noop"
ensemble = "no"

# NOTE(review): the evaluation_* settings are also not referenced by any
# visible cell; the evaluation cell reads dataset_* above — confirm intent.
evaluation_dataset_path = "bdsaglam/musique-sweep"
evaluation_dataset_name = "answerable"
evaluation_dataset_split = "validation"

# Passed to make_program as retriever_name / top_k.
retriever = "t5"
top_k = 3

# `model` and `temperature` are passed to configure_lm; `technique` selects
# both the example input fields and the program that is built.
model = "llama-3.3-70b"
temperature = 0.1
technique = "multihop-decompose"

# "UNSET" is a sentinel: the evaluation cell only calls program.load(...) when
# load_from is non-empty and different from "UNSET".
load_from = "UNSET"
out = Path("output_directory")  # Placeholder for output directory

In [5]:
# Configure the language model from the notebook settings.
configure_lm(model, temperature)

# Only dataset paths containing "musique" are supported; reject anything else
# before doing any work.
if "musique" not in dataset_path:
    raise ValueError(f"Unknown dataset: {dataset_path}")

from mhqa.musique import load_examples

# Mark the input fields for the chosen technique, then keep the first 10
# examples for this debug run.
# NOTE(review): this reads dataset_split ("train"); the evaluation_dataset_*
# settings defined in the configuration cell are not used here — confirm.
examples = preprocess_examples(
    load_examples(dataset_path, dataset_name, dataset_split),
    technique,
)[:10]
print(f"Loaded {len(examples)} examples")

# Build the program and, when a path is configured, load it from `load_from`.
program = make_program(
    technique=technique,
    retriever_name=retriever,
    top_k=top_k,
)
if load_from and load_from != "UNSET":
    print(f"Loading model from {load_from}")
    program.load(load_from)

# Run the evaluation on a single thread; the second element of the returned
# pair is kept as `results` for the saving cell.
evaluate_program = Evaluate(
    devset=examples,
    metric=evaluate_answer,
    num_threads=1,
    display_progress=True,
    return_outputs=True,
)
_, results = evaluate_program(program)

Loading default t5 model for language en
Default Model: unicamp-dl/InRanker-base
Loading T5Ranker model unicamp-dl/InRanker-base (this message can be suppressed by setting verbose=0)
No device set
Using device cpu
No dtype set
Using dtype torch.float32
Loading model unicamp-dl/InRanker-base, this might take a while...
Using device cpu.
Using dtype torch.float32.


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


T5 true token set to ▁true
T5 false token set to ▁false
Returning normalised scores...
Inputs template set to Query: {query} Document: {text} Relevant:
  0%|          | 0/10 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

🍩 https://wandb.ai/bdsaglam/mhqa-dspy-debug/r/call/0194fde1-498f-7132-8adb-1dd58c437154
🍩 https://wandb.ai/bdsaglam/mhqa-dspy-debug/r/call/0194fde1-60ec-7ed3-a525-7bbf45936cc6
Average Metric: 1.00 / 10 (10.0%):  10%|█         | 1/10 [00:05<00:53,  5.99s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

🍩 https://wandb.ai/bdsaglam/mhqa-dspy-debug/r/call/0194fde1-60fa-7403-9a57-356ca48a80af
🍩 https://wandb.ai/bdsaglam/mhqa-dspy-debug/r/call/0194fde1-8f69-7f83-905f-9d21178cbaf8
Average Metric: 1.00 / 10 (10.0%):  20%|██        | 2/10 [00:17<01:15,  9.47s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

🍩 https://wandb.ai/bdsaglam/mhqa-dspy-debug/r/call/0194fde1-8f71-7200-b6c5-7a8ae94ef3a0
🍩 https://wandb.ai/bdsaglam/mhqa-dspy-debug/r/call/0194fde1-b6fd-7eb3-8272-c12c38824c2f
Average Metric: 1.00 / 10 (10.0%):  30%|███       | 3/10 [00:28<01:08,  9.77s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

🍩 https://wandb.ai/bdsaglam/mhqa-dspy-debug/r/call/0194fde1-b707-78b0-9b4f-e5be40e3c4b5
🍩 https://wandb.ai/bdsaglam/mhqa-dspy-debug/r/call/0194fde1-ced4-7212-a3ba-4f6a9166fba1
Average Metric: 1.00 / 10 (10.0%):  40%|████      | 4/10 [00:34<00:49,  8.32s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

🍩 https://wandb.ai/bdsaglam/mhqa-dspy-debug/r/call/0194fde1-ced9-7601-a2d6-7243210119c9
🍩 https://wandb.ai/bdsaglam/mhqa-dspy-debug/r/call/0194fde1-f65c-7372-8a47-90e77c1b9f62
Average Metric: 1.18 / 10 (11.8%):  50%|█████     | 5/10 [00:44<00:44,  8.97s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

🍩 https://wandb.ai/bdsaglam/mhqa-dspy-debug/r/call/0194fde1-f665-7823-bb65-8e3eee606e76
🍩 https://wandb.ai/bdsaglam/mhqa-dspy-debug/r/call/0194fde2-2680-7411-bf47-3ef5195796da
Average Metric: 2.18 / 10 (21.8%):  60%|██████    | 6/10 [00:56<00:40, 10.11s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

🍩 https://wandb.ai/bdsaglam/mhqa-dspy-debug/r/call/0194fde2-2688-7e51-889a-ee8bb85f6058
🍩 https://wandb.ai/bdsaglam/mhqa-dspy-debug/r/call/0194fde2-58a6-70a0-b78d-b9d429da0810
Average Metric: 3.18 / 10 (31.8%):  70%|███████   | 7/10 [01:09<00:33, 11.00s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

🍩 https://wandb.ai/bdsaglam/mhqa-dspy-debug/r/call/0194fde2-58aa-7071-8a07-9bde62d75a31
🍩 https://wandb.ai/bdsaglam/mhqa-dspy-debug/r/call/0194fde2-72e9-7af2-8d2a-c80e01857744
Average Metric: 4.18 / 10 (41.8%):  80%|████████  | 8/10 [01:16<00:19,  9.64s/it]

2025/02/13 08:57:44 ERROR dspy.utils.parallelizer: Error processing item Example({'id': '2hop__142965_178655', 'question': 'What is the record label of the composer whose fifth studio album was titled Nine Million Bicycles?', 'context': '# Nine Million Bicycles\n"Nine Million Bicycles" is a song written and produced by Mike Batt for the singer Katie Melua\'s second album, "Piece by Piece". It was released as the album\'s first single in September 2005 and reached number five on the UK Singles Chart, becoming Melua\'s first top five hit as a solo artist. It was a finalist for The Record of the Year prize, losing to "You Raise Me Up" by Westlife.\n\n# Ketevan (album)\nKetevan is the sixth studio album by Georgian-British singer Katie Melua, released in the United Kingdom on 16 September 2013 through Dramatico. At birth Melua was given the name Ketevan, but later she adopted the name Katie.', 'docs': [{'idx': 0, 'text': '# El Turista\nEl Turista is the eighth studio album by the singer-so

🍩 https://wandb.ai/bdsaglam/mhqa-dspy-debug/r/call/0194fde2-72f2-7ef0-a54a-936c0b2156a1
Average Metric: 4.18 / 10 (41.8%):  90%|█████████ | 9/10 [01:17<00:06,  6.94s/it]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

Scoring...:   0%|          | 0/1 [00:00<?, ?it/s]

🍩 https://wandb.ai/bdsaglam/mhqa-dspy-debug/r/call/0194fde2-76d2-7ab3-ae9b-ad6c1bd781be
🍩 https://wandb.ai/bdsaglam/mhqa-dspy-debug/r/call/0194fde2-b35c-7540-86f7-a90cd0a8c889
Average Metric: 4.52 / 10 (45.2%): 100%|██████████| 10/10 [01:32<00:00,  9.26s/it]

2025/02/13 08:58:00 INFO dspy.evaluate.evaluate: Average Metric: 4.515151515151515 / 10 (45.2%)





TypeError: unsupported operand type(s) for /: 'str' and 'str'

In [None]:
# Make sure the output directory exists before anything is written into it.
out.mkdir(parents=True, exist_ok=True)

# Per-example results, written as JSON Lines (one record per line).
result_df = make_results_dataframe(results)
result_df.to_json(out / "results.jsonl", orient="records", lines=True)

# Aggregate over all rows, then add one "<n>hops" entry per distinct hop count.
scores = aggregate_scores(result_df)
for n_hops in result_df["n_hops"].unique():
    scores[f"{n_hops}hops"] = aggregate_scores(
        result_df.loc[result_df["n_hops"] == n_hops]
    )

# Write the aggregated scores as indented JSON.
with (out / "scores.json").open("w") as f:
    json.dump(scores, f, indent=2)
