In [1]:
from copy import deepcopy
import json
import os
import pandas as pd
from pathlib import Path

import dspy
from dspy.evaluate import Evaluate
from datasets import load_dataset
from bellem.utils import set_seed
from bellem.musique.eval import (
    aggregate_scores,
    compute_scores,
    compute_scores_dataframe,
)
from dotenv import load_dotenv


# Load variables from a .env file into the process environment before anything
# reads them; configure_lm() looks up OPENAI_BASE_URL and OPENAI_API_KEY.
load_dotenv()

# Fixed seed for reproducibility.
# NOTE(review): which RNGs bellem's set_seed covers is not visible here — confirm.
set_seed(89)


def configure_lm(model, temperature):
    """Build a ``dspy.LM`` for an OpenAI-compatible endpoint and make it the dspy default.

    Args:
        model: Model name; ``"openai/"`` is prepended before it is passed to ``dspy.LM``.
        temperature: Sampling temperature forwarded to ``dspy.LM``.

    The endpoint URL and API key come from the ``OPENAI_BASE_URL`` and
    ``OPENAI_API_KEY`` environment variables; ``cache=False`` is passed to the LM.
    Nothing is returned: the LM is installed through ``dspy.configure``.
    """
    qualified_name = "openai/" + model
    lm_options = dict(
        temperature=temperature,
        cache=False,
        api_base=os.getenv("OPENAI_BASE_URL"),
        api_key=os.getenv("OPENAI_API_KEY"),
    )
    dspy.configure(lm=dspy.LM(qualified_name, **lm_options))


def format_paragraph(paragraph):
    """Render one paragraph record as a ``# title`` heading line followed by its text.

    Args:
        paragraph: Mapping with ``"paragraph_text"`` and ``"title"`` keys.

    Returns:
        The string ``"# <title>\\n<paragraph_text>"``.
    """
    body, heading = paragraph["paragraph_text"], paragraph["title"]
    return "# {}\n{}".format(heading, body)


def make_example(record):
    """Convert a dataset record into a ``dspy.Example`` with gold supporting context.

    Only paragraphs flagged ``is_supporting`` are kept; each is rendered with
    ``format_paragraph`` and the pieces are joined by a blank line.

    Args:
        record: Mapping with ``id``, ``question``, ``question_decomposition``,
            ``paragraphs``, ``answer`` and ``answer_aliases`` keys.

    Returns:
        A ``dspy.Example`` whose input fields are ``question`` and ``context``.
        ``answers`` holds the reference answer followed by its aliases.
    """
    sections = []
    for paragraph in record["paragraphs"]:
        if paragraph["is_supporting"]:
            sections.append(format_paragraph(paragraph))

    fields = {
        "id": record["id"],
        "question": record["question"],
        "question_decomposition": record["question_decomposition"],
        "context": "\n\n".join(sections),
        "answer": record["answer"],
        "answers": [record["answer"], *record["answer_aliases"]],
    }
    return dspy.Example(**fields).with_inputs("question", "context")


class GenerateAnswer(dspy.Signature):
    """Answer the question based on the given context."""

    # NOTE(review): DSPy appears to use a Signature's docstring and the field
    # `desc` strings as prompt text sent to the LM, so treat them as runtime
    # strings rather than documentation — confirm before rewording any of them.

    # Inputs supplied by the caller (QAModule.forward passes both by keyword).
    context = dspy.InputField(desc="may contain relevant facts")
    question = dspy.InputField()
    # Output produced by the predictor; read downstream as `pred.answer`.
    answer = dspy.OutputField(desc="often between 1 and 5 words")


class QAModule(dspy.Module):
    """Question-answering program that wraps a single ``GenerateAnswer`` predictor."""

    def __init__(self, predict_cls=dspy.Predict):
        """Instantiate the predictor.

        Args:
            predict_cls: Callable that takes a signature class and returns a
                predictor; defaults to ``dspy.Predict``.
        """
        super().__init__()
        predictor = predict_cls(GenerateAnswer)
        self.generate_answer = predictor

    def forward(self, context, question):
        """Run the predictor on ``context`` and ``question`` and return its prediction."""
        inputs = {"context": context, "question": question}
        return self.generate_answer(**inputs)


def get_predict_cls(technique):
    """Map a prompting-technique name to the predictor class implementing it.

    Args:
        technique: One of ``"standard"``, ``"cot"``, ``"cot-fixed"`` or ``"cte"``.

    Returns:
        The predictor class for the technique.

    Raises:
        ValueError: If ``technique`` is not one of the supported names.
    """
    if technique == "standard":
        return dspy.Predict

    if technique == "cot":
        return dspy.ChainOfThought

    # The bellem predictors are imported only when actually requested.
    if technique == "cot-fixed":
        from bellem.dspy.predict.cot import ChainOfThought as FixedChainOfThought

        return FixedChainOfThought

    if technique == "cte":
        from bellem.dspy.predict.cte import ConnectTheEntities

        return ConnectTheEntities

    raise ValueError(f"Unknown technique: {technique}")


def evaluate_answer(example, pred, trace=None):
    """Metric: the ``"f1"`` score of the predicted answer against the reference answers.

    Args:
        example: Object exposing the acceptable answers as ``example.answers``.
        pred: Object exposing the predicted answer as ``pred.answer``.
        trace: Accepted but not used by this function.

    Returns:
        The ``"f1"`` entry of ``compute_scores(pred.answer, example.answers)``.
    """
    return compute_scores(pred.answer, example.answers)["f1"]


def dynamic_import(module, name):
    """Import ``module`` by its dotted path and return its attribute ``name``.

    Args:
        module: Dotted module path, e.g. ``"dspy.teleprompt"``.
        name: Attribute to fetch from the imported module.

    Returns:
        The attribute object.

    Raises:
        ImportError: If the module cannot be imported.
        AttributeError: If the module has no attribute called ``name``.
    """
    from importlib import import_module

    loaded_module = import_module(module)
    return getattr(loaded_module, name)


def make_optimizer(optimizer_config: dict):
    """Instantiate a ``dspy.teleprompt`` optimizer from a config mapping.

    Args:
        optimizer_config: Mapping with
            ``"class"``: name of the class to look up in ``dspy.teleprompt``;
            ``"params"``: keyword arguments for its constructor (deep-copied, so
            the config is never mutated);
            ``"with_metric"``: when truthy, ``evaluate_answer`` is passed as the
            ``metric`` keyword argument.

    Returns:
        The constructed optimizer instance.
    """
    optimizer_cls = dynamic_import("dspy.teleprompt", optimizer_config["class"])
    init_kwargs = deepcopy(optimizer_config["params"])
    metric_kwargs = {"metric": evaluate_answer} if optimizer_config["with_metric"] else {}
    return optimizer_cls(**{**init_kwargs, **metric_kwargs})


def preprocess_result(result):
    """Flatten one ``(example, prediction, score)`` triple into a single dict.

    Args:
        result: A 3-item sequence ``(example, pred, score)``; ``example`` and
            ``pred`` must be convertible with ``dict(...)``.

    Returns:
        A dict holding the example's fields, the prediction's fields with a
        ``predicted_`` prefix on each key, and ``"score"`` cast to ``float``.
    """
    example, pred, score = result
    row = dict(example)
    for field, value in dict(pred).items():
        row[f"predicted_{field}"] = value
    row["score"] = float(score)
    return row


def make_results_dataframe(results):
    """Turn evaluation results into a scored DataFrame.

    Each result is flattened with ``preprocess_result`` and normalized into a
    frame, then two columns are set before scoring:

    - ``n_hops``: length of each row's ``question_decomposition``;
    - ``predicted_answer``: missing values replaced by ``"No Answer"``.

    Args:
        results: Iterable of ``(example, pred, score)`` triples.

    Returns:
        Whatever ``compute_scores_dataframe`` returns for the prepared frame.
    """
    rows = [preprocess_result(item) for item in results]
    frame = pd.json_normalize(rows)
    frame = frame.assign(
        n_hops=frame["question_decomposition"].apply(len),
        predicted_answer=frame["predicted_answer"].fillna("No Answer"),
    )
    return compute_scores_dataframe(frame)


In [2]:
# Notebook parameters: everything a reader might tune lives in this cell.

# Dataset identifier, config name and split passed to `load_dataset`.
dataset_path: str = "bdsaglam/hotpotqa-distractor-mini"
dataset_name: str = "default"
dataset_split: str = "train"
# Model name and sampling temperature passed to `configure_lm`.
model: str = "llama-3-70b-tgi"
temperature: float = 0.1
# Prompting technique; must be a name accepted by `get_predict_cls`.
technique: str = "cot"
# Saved program to load before training; "UNSET" (or empty) means do not load anything.
load_from: str = "UNSET"
# Name of the optimizer config; selects the JSON file below.
optimizer: str = "bfs-medium"
optimizer_path: Path = Path(f"../data/raw/optimizer-configs/{optimizer}.json")
# Where the trained program is saved.
out: Path = Path("compiled-program.json")

In [3]:
# Set up LLM
configure_lm(model, temperature)

# Load and preprocess datasets
ds = load_dataset(dataset_path, dataset_name, split=dataset_split)
examples = [make_example(record) for record in ds]
print(f"Loaded {len(examples)} examples")

# Create the program
program = QAModule(predict_cls=get_predict_cls(technique))
if load_from and load_from != "UNSET":
    print(f"Loading model from {load_from}")
    program.load(load_from)

# Train the program
with open(optimizer_path, encoding="utf-8") as f:
    optimizer_config = json.load(f)

if optimizer_config:
    # Bind the optimizer object to its own name: `optimizer` is the config-name
    # string from the parameters cell and must not be replaced by an object of
    # a different kind.
    teleprompter = make_optimizer(optimizer_config)
    compile_params = optimizer_config.get("compile_params", {})
    trained_program = teleprompter.compile(program, trainset=examples, **compile_params)
else:
    # An empty config means "no optimization": keep the program as created/loaded.
    trained_program = program

# Save the trained program
trained_program.save(out)

Loaded 300 examples


  3%|▎         | 10/300 [10:40<5:09:37, 64.06s/it]
  0%|          | 0/300 [00:00<?, ?it/s]
  0%|          | 0/300 [00:00<?, ?it/s]
  0%|          | 0/300 [00:00<?, ?it/s]
  0%|          | 0/300 [00:00<?, ?it/s]
  0%|          | 0/300 [00:00<?, ?it/s]
  0%|          | 0/300 [00:00<?, ?it/s]
  0%|          | 0/300 [00:00<?, ?it/s]
  0%|          | 0/300 [00:00<?, ?it/s]
  0%|          | 0/300 [00:00<?, ?it/s]

Bootstrapped 8 full traces after 1 examples in round 9.





In [4]:
# Show the demos attached to the first predictor of the trained program.
first_predictor = trained_program.named_predictors()[0][1]
first_predictor.demos

[Example({'augmented': True, 'context': "# Last Child (band)\nLast Child is an Indonesian pop punk/ alternative rock band formed in 2006.  The band's current members are Virgoun Teguh (vocals and guitar), Rachmad Firdaus (guitar), Dimas Rangga (vocals and bass).  It has released one mini album and two studio albums.\n\n# The Frames\nThe Frames are an Irish band based in Dublin.  Founded in 1990 by Glen Hansard, the band has been influential in the Dublin rock music scene.  The group has released six albums.  In addition to Hansard, the band's current lineup includes original member Colm Mac Con Iomaire, Joe Doyle, Rob Bochnik and Graham Hopkins.", 'question': 'What band as been influential in the Dublin rock music scene, The Frames or  Last Child?', 'reasoning': 'The context states that The Frames have been influential in the Dublin rock music scene, but there is no mention of Last Child being influential in the Dublin rock music scene.', 'answer': 'The Frames'}) (input_keys=None),
 Ex

In [None]:
# Evaluation settings use their own names so the notebook parameters
# (`dataset_split`, `load_from`, `out`) keep their values; overwriting them here
# would silently change what the training cell does if it is re-run.
eval_split = "validation"
compiled_program_path = "compiled-program.json"
results_dir = Path("results")

results_dir.mkdir(parents=True, exist_ok=True)

# Load and preprocess datasets
ds = load_dataset(dataset_path, dataset_name, split=eval_split)
examples = [make_example(record) for record in ds]
print(f"Loaded {len(examples)} examples")

# Create the program and restore the compiled state saved by the training step
program = QAModule(predict_cls=get_predict_cls(technique))
print(f"Loading model from {compiled_program_path}")
program.load(compiled_program_path)

# Evaluate the program
evaluate_program = Evaluate(
    metric=evaluate_answer,
    devset=examples,
    num_threads=16,
    display_progress=True,
    return_outputs=True,
)
_, results = evaluate_program(program)

# Save the results
result_df = make_results_dataframe(results)
result_df.to_json(results_dir / "results.jsonl", orient="records", lines=True)

# Save the scores: overall first, then one nested entry per hop count
scores = aggregate_scores(result_df)
for n_hops in result_df["n_hops"].unique():
    scores[f"{n_hops}hops"] = aggregate_scores(result_df[result_df["n_hops"] == n_hops])

with open(results_dir / "scores.json", "w") as f:
    json.dump(scores, f, indent=2)


Loaded 100 examples
Loading model from compiled-program.json


Average Metric: 52.91904761904761 / 64  (82.7):  64%|██████▍   | 64/100 [06:01<03:49,  6.39s/it] 