In [1]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

In [2]:
from rerankers import Reranker
# Instantiate the default "t5" reranker. Per the recorded cell output this resolves
# to the unicamp-dl/InRanker-base checkpoint on CPU with float32.
# NOTE(review): `reranker` is not referenced by any later cell visible here — confirm
# it is still needed.
reranker = Reranker("t5")

* 'fields' has been removed


Loading default t5 model for language en
Default Model: unicamp-dl/InRanker-base
Loading T5Ranker model unicamp-dl/InRanker-base (this message can be suppressed by setting verbose=0)
No device set
Using device cpu
No dtype set
Using dtype torch.float32
Loading model unicamp-dl/InRanker-base, this might take a while...
Using device cpu.
Using dtype torch.float32.


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


T5 true token set to ▁true
T5 false token set to ▁false
Returning normalised scores...
Inputs template set to Query: {query} Document: {text} Relevant:


In [3]:
import json
import os
from copy import deepcopy
from pathlib import Path

import dspy
import pandas as pd
import typer
from bellem.musique.eval import (
    aggregate_scores,
    compute_scores,
    compute_scores_dataframe,
)
from bellem.utils import set_seed
from datasets import load_dataset
from dotenv import load_dotenv
from dspy.evaluate import Evaluate
from dspy.teleprompt.ensemble import Ensemble
from rich.console import Console

# Rebind `print` to a rich Console created with stderr=True. This shadows the builtin
# for every later cell, so the print(...) calls in train_main / evaluate_main go
# through rich rather than the builtin.
print = Console(stderr=True).print

# Load environment variables; configure_lm later reads OPENAI_BASE_URL and
# OPENAI_API_KEY via os.getenv. Presumably these come from a local .env file — verify.
load_dotenv()

# Fix the seed for the run (bellem helper).
# NOTE(review): which RNGs this seeds is not visible from here — confirm.
set_seed(89)

ModuleNotFoundError: No module named 'bellem'

In [2]:
# import weave
# weave.init(project_name="mhqa-dspy")

: 

In [3]:
# import mlflow

# mlflow.set_tracking_uri("http://127.0.0.1:5000/")
# mlflow.set_experiment("mhqa-dspy")
# mlflow.dspy.autolog()

: 

In [4]:
def configure_lm(model, temperature):
    """Create a DSPy language model client and install it as the global default.

    Args:
        model: Model name; it is prefixed with ``"openai/"`` before being passed
            to ``dspy.LM``.
        temperature: Sampling temperature forwarded to ``dspy.LM``.

    The API base URL and key are read from the ``OPENAI_BASE_URL`` and
    ``OPENAI_API_KEY`` environment variables. Caching is disabled.
    """
    model_id = "openai/" + model
    endpoint = os.getenv("OPENAI_BASE_URL")
    secret = os.getenv("OPENAI_API_KEY")
    client = dspy.LM(
        model_id,
        temperature=temperature,
        cache=False,
        api_base=endpoint,
        api_key=secret,
    )
    dspy.configure(lm=client)


: 

In [5]:
from mhqa.react import ReAct
from mhqa.search import make_search_tool


def format_paragraph(paragraph):
    """Render a paragraph record as a markdown-style snippet.

    Args:
        paragraph: Mapping with ``"paragraph_text"`` and ``"title"`` keys.

    Returns:
        A string of the form ``"# <title>\\n<paragraph_text>"``.
    """
    body, heading = paragraph["paragraph_text"], paragraph["title"]
    return "# {}\n{}".format(heading, body)


def make_example(record):
    """Convert one dataset record into a ``dspy.Example``.

    Args:
        record: Mapping with ``"id"``, ``"question"``, ``"paragraphs"``,
            ``"question_decomposition"``, ``"answer"`` and ``"answer_aliases"``.

    Returns:
        A ``dspy.Example`` whose inputs are ``question`` and ``docs``. ``docs`` is a
        list of ``{"text", "idx"}`` dicts built from the record's paragraphs, and
        ``answers`` is the reference answer followed by its aliases.
    """
    documents = []
    for para in record["paragraphs"]:
        documents.append({"text": format_paragraph(para), "idx": para["idx"]})

    example = dspy.Example(
        id=record["id"],
        question=record["question"],
        docs=documents,
        question_decomposition=record["question_decomposition"],
        answers=[record["answer"]] + list(record["answer_aliases"]),
    )
    return example.with_inputs("question", "docs")


def make_program():
    """Build the ReAct program: a ``question -> answer`` signature with one search tool."""
    tools = [make_search_tool()]
    return ReAct("question -> answer", tools=tools)


def evaluate_answer(example, pred, trace=None):
    """Metric used for optimization and evaluation: F1 of the predicted answer.

    Args:
        example: Object exposing the reference answers as ``example.answers``.
        pred: Object exposing the predicted answer as ``pred.answer``.
        trace: Accepted for the metric call signature; not used.

    Returns:
        The ``"f1"`` entry of ``compute_scores(pred.answer, example.answers)``.
    """
    return compute_scores(pred.answer, example.answers)["f1"]


def dynamic_import(module, name):
    """Import ``module`` by its dotted path and return its attribute ``name``.

    Raises:
        ImportError: If the module cannot be imported.
        AttributeError: If the module has no attribute ``name``.
    """
    from importlib import import_module

    loaded = import_module(module)
    return getattr(loaded, name)


def make_optimizer(optimizer_config: dict):
    """Instantiate a ``dspy.teleprompt`` optimizer from a config dict.

    Args:
        optimizer_config: Mapping with the keys
            ``"class"`` (name of a class in ``dspy.teleprompt``),
            ``"params"`` (constructor keyword arguments; deep-copied so the
            config itself is not mutated) and
            ``"with_metric"`` (when truthy, ``evaluate_answer`` is passed as
            the ``metric`` keyword argument).

    Returns:
        The constructed optimizer instance.
    """
    optimizer_cls = dynamic_import("dspy.teleprompt", optimizer_config["class"])
    init_kwargs = deepcopy(optimizer_config["params"])
    if optimizer_config["with_metric"]:
        init_kwargs.update(metric=evaluate_answer)
    return optimizer_cls(**init_kwargs)


def preprocess_result(result):
    """Flatten one ``(example, prediction, score)`` triple into a single dict.

    The example's fields are kept as-is, every prediction field is added under a
    ``predicted_`` prefix, and the score is stored as a float under ``"score"``.
    On key collisions, prediction fields override example fields and ``"score"``
    overrides both.
    """
    example, pred, score = result

    prefixed = {}
    for key, value in dict(pred).items():
        prefixed[f"predicted_{key}"] = value

    row = dict(example)
    row.update(prefixed)
    row["score"] = float(score)
    return row


def make_results_dataframe(results):
    """Build the per-example results table from evaluation outputs.

    Args:
        results: Iterable of ``(example, prediction, score)`` triples, as accepted
            by ``preprocess_result``.

    Returns:
        The DataFrame returned by ``compute_scores_dataframe`` for the flattened
        results, with an added ``n_hops`` column (length of each example's
        ``question_decomposition``) and missing ``predicted_answer`` values
        replaced by ``"No Answer"``.
    """
    dataf = pd.json_normalize([preprocess_result(result) for result in results])
    dataf["n_hops"] = dataf["question_decomposition"].apply(len)
    if "predicted_answer" in dataf.columns:
        dataf["predicted_answer"] = dataf["predicted_answer"].fillna("No Answer")
    else:
        # No prediction carried an `answer` field at all (e.g. every example failed),
        # so json_normalize never created the column; fill it with the same
        # placeholder instead of raising KeyError and losing the whole run.
        dataf["predicted_answer"] = "No Answer"
    return compute_scores_dataframe(dataf)


def train_main(
    dataset_path: str = typer.Option(..., help="Path to the dataset"),
    dataset_name: str = typer.Option(..., help="Name of the dataset"),
    dataset_split: str = typer.Option(..., help="Dataset split to use (e.g., 'train', 'validation')"),
    model: str = typer.Option(..., help="Name of the model to use"),
    temperature: float = typer.Option(..., help="Temperature parameter for the model"),
    load_from: str = typer.Option(default="UNSET", help="Path to a saved model to load"),
    optimizer_path: Path = typer.Option(..., help="Path to the optimizer config"),
    ensemble: str = typer.Option("no", help="Whether to use an ensemble of models"),
    out: Path = typer.Option(..., help="Output file for trained program"),
):
    """Compile the ReAct program with the configured optimizer and save it.

    Args:
        dataset_path: First argument to ``datasets.load_dataset``.
        dataset_name: Dataset configuration name passed to ``load_dataset``.
        dataset_split: Split expression passed as ``split=``.
        model: Model name forwarded to ``configure_lm``.
        temperature: Sampling temperature forwarded to ``configure_lm``.
        load_from: Path of a saved program to load before training; ``"UNSET"``
            or an empty value skips loading.
        optimizer_path: JSON file with the optimizer config. An empty config
            skips optimization; otherwise it is passed to ``make_optimizer`` and
            its optional ``"compile_params"`` entry is forwarded to ``compile``.
        ensemble: ``"yes"`` to combine the optimizer's candidate programs with a
            majority-vote ``Ensemble``; any other value leaves the program as-is.
        out: File the trained program is saved to (parent directories are created).

    Returns:
        The trained (or unchanged, when the config is empty) program.
    """
    # When this function is called directly (e.g. from a notebook cell) instead of
    # through the Typer CLI, omitted optional arguments are bound to the
    # typer.Option(...) placeholder objects; unwrap them to their declared defaults.
    if isinstance(load_from, typer.models.OptionInfo):
        load_from = load_from.default
    if isinstance(ensemble, typer.models.OptionInfo):
        ensemble = ensemble.default

    # Direct callers may pass plain strings for the filesystem arguments.
    out = Path(out)
    optimizer_path = Path(optimizer_path)

    out.parent.mkdir(parents=True, exist_ok=True)

    # Set up LLM
    configure_lm(model, temperature)

    # Load and preprocess datasets
    ds = load_dataset(dataset_path, dataset_name, split=dataset_split)
    examples = [make_example(record) for record in ds]
    print(f"Loaded {len(examples)} examples")

    # Create the program
    program = make_program()
    if load_from and load_from != "UNSET":
        print(f"Loading model from {load_from}")
        program.load(load_from)

    # Train the program
    with open(optimizer_path, encoding="utf-8") as f:
        optimizer_config = json.load(f)

    if optimizer_config:
        optimizer = make_optimizer(optimizer_config)
        compile_params = optimizer_config.get("compile_params", {})
        trained_program = optimizer.compile(program, trainset=examples, **compile_params)
    else:
        # An empty config means "no optimization": keep the program unchanged.
        trained_program = program

    if ensemble == "yes":
        ensemble_optimizer = Ensemble(reduce_fn=dspy.majority)
        # The last element of each candidate entry is taken to be the program itself.
        candidate_programs = [x[-1] for x in trained_program.candidate_programs]
        trained_program = ensemble_optimizer.compile(candidate_programs)

    # Save the trained program
    trained_program.save(out)

    return trained_program

def evaluate_main(
    dataset_path: str = typer.Option(..., help="Path to the dataset"),
    dataset_name: str = typer.Option(..., help="Name of the dataset"),
    dataset_split: str = typer.Option(..., help="Dataset split to use (e.g., 'train', 'validation')"),
    model: str = typer.Option(..., help="Name of the model to use"),
    temperature: float = typer.Option(..., help="Temperature parameter for the model"),
    load_from: str = typer.Option(default="UNSET", help="Path to a saved model to load"),
    out: Path = typer.Option(..., help="Output directory for generated results"),
):
    """Evaluate the ReAct program on a dataset split and write results to disk.

    Args:
        dataset_path: First argument to ``datasets.load_dataset``.
        dataset_name: Dataset configuration name passed to ``load_dataset``.
        dataset_split: Split expression passed as ``split=``.
        model: Model name forwarded to ``configure_lm``.
        temperature: Sampling temperature forwarded to ``configure_lm``.
        load_from: Path of a saved program to load before evaluating; ``"UNSET"``
            or an empty value evaluates the freshly built program.
        out: Output directory (created if missing). Receives ``results.jsonl``
            (one record per example) and ``scores.json`` (aggregate scores plus
            one ``"<n>hops"`` entry per distinct hop count).
    """
    # When this function is called directly (e.g. from a notebook cell) instead of
    # through the Typer CLI, an omitted `load_from` is bound to the
    # typer.Option(...) placeholder object; unwrap it to its declared default.
    if isinstance(load_from, typer.models.OptionInfo):
        load_from = load_from.default

    # Direct callers may pass a plain string; `out / "..."` below needs a Path.
    out = Path(out)
    out.mkdir(parents=True, exist_ok=True)

    # Set up LLM
    configure_lm(model, temperature)

    # Load and preprocess datasets
    ds = load_dataset(dataset_path, dataset_name, split=dataset_split)
    examples = [make_example(record) for record in ds]
    print(f"Loaded {len(examples)} examples")

    # Create the program
    program = make_program()
    if load_from and load_from != "UNSET":
        print(f"Loading model from {load_from}")
        program.load(load_from)

    # Evaluate the program
    evaluate_program = Evaluate(
        metric=evaluate_answer,
        devset=examples,
        num_threads=4,
        display_progress=True,
        return_outputs=True,
    )
    # With return_outputs=True the call is unpacked as (overall score, per-example
    # results); only the per-example results are used here.
    _, results = evaluate_program(program)

    # Save the results
    result_df = make_results_dataframe(results)
    result_df.to_json(out / "results.jsonl", orient="records", lines=True)

    # Save the scores: overall aggregate plus one nested aggregate per hop count
    scores = aggregate_scores(result_df)
    for n_hops in result_df["n_hops"].unique():
        scores[f"{n_hops}hops"] = aggregate_scores(result_df[result_df["n_hops"] == n_hops])

    with open(out / "scores.json", "w", encoding="utf-8") as f:
        json.dump(scores, f, indent=2)


: 

In [6]:
# Model name handed to train_main / evaluate_main below. configure_lm prefixes it
# with "openai/" and uses the endpoint from the OPENAI_BASE_URL environment variable.
model='llama-3.3-70b-tgi'
# Alternative model names (uncomment exactly one to switch):
# model='meta-llama/Llama-3.3-70B-Instruct-Turbo'
# model='llama3.1:8b-instruct-q8_0'
# model='llama-3.1-8b-instant'
# model='gemini-2.0-flash-lite-preview-02-05'

: 

In [None]:
# Compile the program on a small training slice and save it; the saved file is
# loaded again by the evaluation cell via `trained_program_filepath`.
trained_program_filepath = Path('trained-program.json')

trained_program = train_main(
    dataset_path='bdsaglam/musique-mini',
    dataset_name='answerable',
    dataset_split='train[:50]',
    model=model,
    temperature=0.1,
    load_from='UNSET',
    optimizer_path=Path('../data/raw/optimizer-configs/bfsrs-light.json'),
    # Passed explicitly: train_main is called here as a plain function, so an
    # omitted argument would be bound to its typer.Option(...) placeholder object
    # rather than to the string "no".
    ensemble='no',
    out=trained_program_filepath,
)

: 

In [None]:
# Directory that evaluate_main writes results.jsonl and scores.json into.
out = Path('out')

# Evaluate the program saved by the training cell (requires that cell to have run,
# since `trained_program_filepath` and `model` are defined in earlier cells).
# NOTE(review): training reads 'bdsaglam/musique-mini' train[:50] while evaluation
# reads 'bdsaglam/musique' train[100:200]; confirm the two slices do not share
# examples, otherwise the reported scores are contaminated.
evaluate_main(
    dataset_path='bdsaglam/musique',
    dataset_name='answerable',
    dataset_split='train[100:200]',
    model=model,
    temperature=0.1,
    load_from=trained_program_filepath,
    out=out,
)

: 