In [13]:
#|export
import dspy
from dspy.primitives.program import Module
from dspy.signatures.signature import ensure_signature


def validate_triples_format(triples: str, sep: str = ";") -> bool:
    """Check if the triples are in the correct format."""
    return any(len(triple.split(sep)) != 3 for triple in triples.splitlines())

_prefix = """
Let's identify the relevant entity-relation-entity triples in the format of 'subj;relation;obj'. For example,
London;capital of;United Kingdom
Barrack Obama;birth place;Hawaii
""".strip()

class ConnectTheEntities(Module):
    def __init__(self, signature, rationale_type=None, activated=True, **config):
        super().__init__()

        self.activated = activated

        self.signature = signature = ensure_signature(signature)

        desc = "${triples}"
        rationale_type = rationale_type or dspy.OutputField(prefix=_prefix, desc=desc)

        # Add "triples" field to the output signature.
        extended_signature = signature.prepend("triples", rationale_type, type_=str)

        self._predict = dspy.Predict(extended_signature, **config)
        self._predict.extended_signature = extended_signature

    def forward(self, **kwargs):
        assert self.activated in [True, False]

        signature = kwargs.pop("new_signature", self._predict.extended_signature if self.activated else self.signature)
        pred = self._predict(signature=signature, **kwargs)
        dspy.Suggest(
            validate_triples_format(pred.triples),
            "Triples must be in the format of 'subj;relation;obj' and each triple must be in a new line.",
            target_module=self._predict,
        )
        return pred

    @property
    def demos(self):
        return self._predict.demos

    @property
    def extended_signature(self):
        return self._predict.extended_signature

In [14]:
from copy import deepcopy
import json
import os
import pandas as pd
import typer
from pathlib import Path

import dspy
from dspy.evaluate import Evaluate
from datasets import load_dataset
from bellek.utils import set_seed
from bellek.musique.eval import (
    aggregate_scores,
    compute_scores,
    compute_scores_dataframe,
)
from dotenv import load_dotenv
from rich.console import Console

print = Console(stderr=True).print

load_dotenv()

set_seed(89)


def configure_lm(model, temperature):
    lm = dspy.LM(
        "openai/" + model,
        temperature=temperature,
        cache=False,
        api_base=os.getenv("OPENAI_BASE_URL"),
        api_key=os.getenv("OPENAI_API_KEY"),
    )
    dspy.configure(lm=lm)


def format_paragraph(paragraph):
    text = paragraph["paragraph_text"]
    title = paragraph["title"]
    return f"# {title}\n{text}"


def make_example(record):
    supporting_paragraphs = [p for p in record["paragraphs"] if p["is_supporting"]]
    context = "\n\n".join([format_paragraph(p) for p in supporting_paragraphs])
    return dspy.Example(
        id=record["id"],
        question=record["question"],
        question_decomposition=record["question_decomposition"],
        context=context,
        answer=record["answer"],
        answers=[record["answer"], *record["answer_aliases"]],
    ).with_inputs("question", "context")


class GenerateAnswer(dspy.Signature):
    """Answer the question based on the given context."""

    context = dspy.InputField(desc="may contain relevant facts")
    question = dspy.InputField()
    answer = dspy.OutputField(desc="often between 1 and 5 words")


class QAModule(dspy.Module):
    def __init__(self, predict_cls=dspy.Predict):
        super().__init__()
        self.generate_answer = predict_cls(GenerateAnswer)

    def forward(self, context, question):
        return self.generate_answer(context=context, question=question)


def get_predict_cls(technique):
    if technique == "standard":
        return dspy.Predict
    elif technique == "cot":
        return dspy.ChainOfThought
    elif technique == "cte":

        return ConnectTheEntities
    else:
        raise ValueError(f"Unknown technique: {technique}")


def evaluate_answer(example, pred, trace=None):
    scores = compute_scores(pred.answer, example.answers)
    return scores["f1"]


def dynamic_import(module, name):
    import importlib

    return getattr(importlib.import_module(module), name)


def make_optimizer(optimizer_config: dict):
    cls = dynamic_import("dspy.teleprompt", optimizer_config["class"])
    kwargs = deepcopy(optimizer_config["params"])
    if optimizer_config["with_metric"]:
        kwargs["metric"] = evaluate_answer
    return cls(**kwargs)


def preprocess_result(result):
    example, pred, score = result
    predictions = {f"predicted_{k}": v for k, v in dict(pred).items()}
    return {**dict(example), **predictions, "score": float(score)}


def make_results_dataframe(results):
    dataf = pd.json_normalize([preprocess_result(result) for result in results])
    dataf["n_hops"] = dataf["question_decomposition"].apply(len)
    dataf['predicted_answer'] = dataf['predicted_answer'].fillna("No Answer")
    return compute_scores_dataframe(dataf)


def train_main(
    dataset_path: str = typer.Option(..., help="Path to the dataset"),
    dataset_name: str = typer.Option(..., help="Name of the dataset"),
    dataset_split: str = typer.Option(..., help="Dataset split to use (e.g., 'train', 'validation')"),
    model: str = typer.Option(..., help="Name of the model to use"),
    temperature: float = typer.Option(..., help="Temperature parameter for the model"),
    technique: str = typer.Option(..., help="Prompting technique to use"),
    load_from: str = typer.Option(default="UNSET", help="Path to a saved model to load"),
    optimizer_path: Path = typer.Option(..., help="Path to the optimizer config"),
):
    # Set up LLM
    configure_lm(model, temperature)

    # Load and preprocess datasets
    ds = load_dataset(dataset_path, dataset_name, split=dataset_split)
    examples = [make_example(record) for record in ds]
    print(f"Loaded {len(examples)} examples")

    # Create the program
    program = QAModule(predict_cls=get_predict_cls(technique))
    if load_from and load_from != "UNSET":
        print(f"Loading model from {load_from}")
        program.load(load_from)

    # Train the program
    with open(optimizer_path) as f:
        optimizer_config = json.load(f)

    if optimizer_config:
        optimizer = make_optimizer(optimizer_config)
        compile_params = optimizer_config.get("compile_params", {})
        trained_program = optimizer.compile(program, trainset=examples, **compile_params)
    else:
        trained_program = program

    # Save the trained program
    return trained_program


def evaluate_main(
    dataset_path: str = typer.Option(..., help="Path to the dataset"),
    dataset_name: str = typer.Option(..., help="Name of the dataset"),
    dataset_split: str = typer.Option(..., help="Dataset split to use (e.g., 'train', 'validation')"),
    model: str = typer.Option(..., help="Name of the model to use"),
    temperature: float = typer.Option(..., help="Temperature parameter for the model"),
    technique: str = typer.Option(..., help="Prompting technique to use"),
    program = None,
):
    # Set up LLM
    configure_lm(model, temperature)

    # Load and preprocess datasets
    ds = load_dataset(dataset_path, dataset_name, split=dataset_split)
    examples = [make_example(record) for record in ds]
    print(f"Loaded {len(examples)} examples")

    # Create the program
    if program is None:
        program = QAModule(predict_cls=get_predict_cls(technique))

    # Evaluate the program
    evaluate_program = Evaluate(
        metric=evaluate_answer,
        devset=examples,
        num_threads=16,
        display_progress=True,
        return_outputs=True,
    )
    _, results = evaluate_program(program)

    # Save the results
    result_df = make_results_dataframe(results)

    # Save the scores
    scores = aggregate_scores(result_df)
    for n_hops in result_df["n_hops"].unique():
        scores[f"{n_hops}hops"] = aggregate_scores(result_df[result_df["n_hops"] == n_hops])


    return result_df, scores

In [15]:
trained_program = train_main(
    dataset_path="bdsaglam/musique-mini",
    dataset_name="answerable",
    dataset_split="train",
    model="llama-3-70b-tgi",
    temperature=0.1,
    technique="cte",
    optimizer_path=Path("bfsrs-medium.json"),
    load_from="UNSET",
)

Going to sample between 1 and 4 traces per predictor.
Will attempt to bootstrap 16 candidate sets.


Average Metric: 19.933333333333334 / 23  (86.7):   8%|▊         | 23/300 [00:13<02:06,  2.19it/s][2m2024-10-26T07:47:13.834872Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 Triples must be in the format of 'subj;relation;obj' and each triple must be in a new line.. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m200[0m
Average Metric: 19.933333333333334 / 25  (79.7):   8%|▊         | 24/300 [00:15<02:57,  1.56it/s][2m2024-10-26T07:47:13.966459Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 Triples must be in the format of 'subj;relation;obj' and each triple must be in a new line.. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m200[0m
Average Metric: 93.99737274220033 / 116  (81.0):  38%|███▊      | 115/300 [00:54<0

DSPySuggestionError: Triples must be in the format of 'subj;relation;obj' and each triple must be in a new line.

In [16]:
dataset_path="bdsaglam/musique-mini"
dataset_name="answerable"
dataset_split="train"
model="llama-3-70b-tgi"
temperature=0.1
technique="cte"
optimizer_path=Path("bfsrs-medium.json")
load_from="UNSET"

# Set up LLM
configure_lm(model, temperature)

# Load and preprocess datasets
ds = load_dataset(dataset_path, dataset_name, split=dataset_split)
examples = [make_example(record) for record in ds]
print(f"Loaded {len(examples)} examples")

# Create the program
program = QAModule(predict_cls=ConnectTheEntities)

# Train the program
# optimizer_config = json.load(optimizer_path.open())
# optimizer = make_optimizer(optimizer_config)
# compile_params = optimizer_config.get("compile_params", {})
# trained_program = optimizer.compile(program, trainset=examples, **compile_params)


Prediction(
    triples='Centerpoint Medical Center is located in Missouri. Missouri has a mean temperature of 24 ° C (75 ° F) in the summer.',
    answer='75 ° F'
)

In [17]:
example = examples[0]
print(example.context)
print(example.question)
pred = program.forward(context=example.context, question=example.question)
pred

Prediction(
    triples='Centerpoint Medical Center is located in Missouri. Missouri has a mean temperature of 24 ° C (75 ° F) in the summer.',
    answer='75 ° F'
)