In [1]:
# Load the local .env file first: the LM setup further down reads
# OPENAI_BASE_URL and OPENAI_API_KEY through os.getenv.
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
import os
import random

import dspy
from dspy.evaluate import Evaluate

In [3]:
# Build the LM client used by every DSPy module in this notebook. Endpoint and
# key are read from the environment; os.getenv yields None for an unset
# variable, so a missing entry is not reported at this point.
lm = dspy.LM(
    "openai/llama-3-70b-tgi",
    api_base=os.getenv("OPENAI_BASE_URL"),
    api_key=os.getenv("OPENAI_API_KEY"),
)
# Register the client as the default LM for dspy.
dspy.configure(lm=lm)

In [4]:
from datasets import load_dataset

# Load the 'answerable' configuration of the bdsaglam/musique dataset and
# display the resulting DatasetDict (splits, columns, row counts).
dsd = load_dataset('bdsaglam/musique', 'answerable')
dsd

DatasetDict({
    train: Dataset({
        features: ['id', 'paragraphs', 'question', 'question_decomposition', 'answer', 'answer_aliases', 'answerable'],
        num_rows: 19938
    })
    validation: Dataset({
        features: ['id', 'paragraphs', 'question', 'question_decomposition', 'answer', 'answer_aliases', 'answerable'],
        num_rows: 2417
    })
})

In [5]:
def make_example(record):
    """Convert one dataset record into a DSPy example for the decomposition task.

    The record's ``question_decomposition`` entries are rendered as a numbered
    list ("1. ...", "2. ...", one per line) built from each entry's
    ``question`` field. Only ``question`` is marked as an input; the
    ``decomposition`` string is the label.
    """
    numbered_steps = []
    for position, step in enumerate(record["question_decomposition"], start=1):
        numbered_steps.append(f"{position}. {step['question']}")

    example = dspy.Example(
        question=record["question"],
        decomposition="\n".join(numbered_steps),
    )
    return example.with_inputs("question")

In [6]:
# Sanity check: convert the first training record and inspect the result.
make_example(dsd['train'][0])

Example({'question': 'When was the institute that owned The Collegian founded?', 'decomposition': '1. The Collegian >> owned by\n2. When was #1 founded?'}) (input_keys={'question'})

In [7]:
from itertools import islice

# Keep the experiment small: only the first 100 records of each split are
# converted into examples.
trainset = list(map(make_example, islice(dsd['train'], 100)))
evalset = list(map(make_example, islice(dsd['validation'], 100)))

In [8]:
# Show the first training example as a plain dict of its fields.
dict(trainset[0])

{'question': 'When was the institute that owned The Collegian founded?',
 'decomposition': '1. The Collegian >> owned by\n2. When was #1 founded?'}

In [9]:
def print_example(example):
    """Print an example's question, a "# Decomposition" header, then its decomposition."""
    print(f"Question: {example.question}", "# Decomposition", sep="\n")
    print(example.decomposition)

In [10]:
# Spot-check five random training examples. The sample is not seeded, so a
# re-run prints a different selection.
for example in random.sample(trainset,5):
    print_example(example)
    print('-'*80)

Question: When did the country formerly known as Zaire become independent?
# Decomposition
1. zaire is the former name of which african nation
2. when did #1 become independent
--------------------------------------------------------------------------------
Question: What is the record label for the person who sang Beauty and the Beast with Celine Dion?
# Decomposition
1. who did celine dion sing beauty and the beast with
2. #1 >> record label
--------------------------------------------------------------------------------
Question: What year was the end of the company that built the White armored car?
# Decomposition
1. What company built White armored car?
2. What year was the end of #1 ?
--------------------------------------------------------------------------------
Question: When did the institute that gives out the Elliott Cresson Medal award open?
# Decomposition
1. Who gives out the Elliott Cresson Medal award?
2. When did #1 open?
----------------------------------------------

In [11]:
# Signature for an LLM judge that compares a predicted decomposition with the
# gold one.
# NOTE(review): the class docstring and the field `desc` / `prefix` strings are
# presumably sent to the model as part of the prompt (confirm against the DSPy
# Signature docs). Treat them as runtime text and edit them only as a
# deliberate prompt change.
class DecompositionJudge(dspy.Signature):
    """Judge whether the predicted decomposition matches the ground truth.

    Instructions:
    - Given a ground-truth decomposition and a predicted decomposition, assess whether they are equivalent in meaning.
    - Consider whether the steps correspond logically, even if worded differently.
    - Output 'Yes' if they are equivalent, 'No' otherwise.
    """

    # Inputs: both decompositions, passed as plain strings.
    ground_truth: str = dspy.InputField(desc="The ground-truth decomposition")
    prediction: str = dspy.InputField(desc="The predicted decomposition")
    # Output: the judge's verdict as free text; evaluate_decomposition_llm turns it into 0/1.
    equivalent: str = dspy.OutputField(desc="Are the decompositions equivalent? [Yes/No]", prefix="Equivalent[Yes/No]:")


In [12]:
# Update the Signature
# Signature for the decomposition task: one question in, one enumerated list
# of sub-questions out.
# NOTE(review): the docstring and `desc` text are presumably part of the prompt
# DSPy sends to the model (confirm against the DSPy Signature docs); treat
# them as runtime strings rather than documentation.
class DecomposeQuestion(dspy.Signature):
    """Decompose a complex question into simpler sub-questions."""

    question: str = dspy.InputField()
    # The metrics in this notebook split this output line by line before comparing it with the gold list.
    decomposition: str = dspy.OutputField(
        desc="Enumerated list of sub-questions, using '#n >>' notation for dependent questions"
    )


# Update the Module
class QuestionDecompositionModule(dspy.Module):
    """DSPy module that turns a complex question into an enumerated list of sub-questions."""

    def __init__(self):
        """Create the chain-of-thought predictor over the DecomposeQuestion signature."""
        super().__init__()
        self.decompose = dspy.ChainOfThought(DecomposeQuestion)

    def forward(self, question):
        """Decompose ``question`` and return a Prediction carrying only ``decomposition``."""
        cot_output = self.decompose(question=question)
        decomposition_text = cot_output.decomposition
        return dspy.Prediction(decomposition=decomposition_text)


def split_subquestions(decomposition_str):
    """Yield the sub-question text from each non-blank line of a decomposition.

    Each line is expected to look like ``"<n>. <sub-question>"``. Everything up
    to and including the first ``". "`` is treated as the enumeration prefix
    and dropped; the remainder is yielded with surrounding whitespace removed.
    A line that contains no ``". "`` has no recognisable prefix and is yielded
    whole (stripped), so an un-numbered line no longer cuts the list short.

    Args:
        decomposition_str: Newline-separated decomposition text. ``None`` or an
            empty string yields nothing.

    Yields:
        str: One sub-question per non-blank input line, in order.
    """
    for line in (decomposition_str or "").split("\n"):
        if not line.strip():
            continue  # blank lines carry no sub-question
        _, separator, remainder = line.partition(". ")
        # The old code did `return parts[0].strip` here, which silently ended
        # the generator (dropping this line and every later one) as soon as a
        # line had no ". " separator.
        yield remainder.strip() if separator else line.strip()


# Update the evaluation function
def evaluate_decomposition_exact_match(example, pred, trace=None):
    """Return the fraction of gold sub-questions reproduced verbatim, position by position.

    Gold and predicted decompositions are both split with ``split_subquestions``
    and compared pairwise in order. The count of identical pairs is divided by
    the number of gold sub-questions, so missing predicted steps lower the score
    while predicted steps beyond the gold length are not looked at.

    ``trace`` is accepted but not used. An ``AssertionError`` is raised when the
    gold decomposition contains no sub-questions.
    """
    expected_steps = list(split_subquestions(example.decomposition))
    predicted_steps = list(split_subquestions(pred.decomposition))

    assert len(expected_steps), "Gold decomposition is empty."

    matching = sum(
        1
        for expected, predicted in zip(expected_steps, predicted_steps)
        if expected == predicted
    )
    return matching / len(expected_steps)


# Judge predictor built from the DecompositionJudge signature; called by
# evaluate_decomposition_llm with ground_truth / prediction keyword arguments.
qdecomp_judge = dspy.Predict(DecompositionJudge)


# Updated evaluation function using the judge
def evaluate_decomposition_llm(example, pred, trace=None):
    """Score a prediction with the LLM judge: 1 if judged equivalent, else 0.

    Args:
        example: Gold example; its ``decomposition`` is sent as ``ground_truth``.
        pred: Prediction; its ``decomposition`` is sent as ``prediction``.
        trace: Not used; kept so the function has the same
            ``(example, pred, trace)`` shape as the other metrics here.

    Returns:
        int: 1 when the first word of the judge's ``equivalent`` output is
        "yes" (case-insensitive, surrounding punctuation ignored), otherwise 0.
        A missing or empty verdict counts as 0.
    """
    result = qdecomp_judge(
        ground_truth=example.decomposition,
        prediction=pred.decomposition,
    )
    # A strict `== "yes"` comparison scores answers such as "Yes.", "**Yes**"
    # or "Yes, the steps correspond" as 0, so only the first word is inspected
    # after trimming punctuation. `or ""` guards against a None verdict.
    verdict_words = str(result.equivalent or "").lower().split()
    first_word = verdict_words[0].strip(".,;:!?\"'*`()[]") if verdict_words else ""
    return int(first_word == "yes")


def evaluate_decomposition(example, pred, trace=None):
    """Combined metric: exact match first, LLM judge as the fallback.

    When the positional exact-match score is at least 0.8 it is returned as is
    (a float). Below that threshold the LLM judge decides, and its 0/1 verdict
    is returned instead.
    """
    exact_score = evaluate_decomposition_exact_match(example, pred, trace)
    if exact_score < 0.8:
        # Wording differs too much for string comparison; ask the judge.
        return evaluate_decomposition_llm(example, pred, trace)
    return exact_score

In [13]:
from dspy.teleprompt import BootstrapFewShot, LabeledFewShot, BootstrapFewShotWithRandomSearch

# ## 6. Implement the optimization process
# Initialize the uncompiled question decomposition module
uncompiled_qd = QuestionDecompositionModule()

# Set up the teleprompter
# NOTE(review): LabeledFewShot(k=8) presumably attaches up to 8 labeled
# trainset examples as demos rather than searching over prompts — confirm
# against the DSPy docs before reading "optimized" below literally.
teleprompter = LabeledFewShot(k=8)

# Compile and optimize the question decomposition module
compiled_qd = teleprompter.compile(uncompiled_qd, trainset=trainset)

print("Question Decomposition module compiled and optimized")

Question Decomposition module compiled and optimized


In [14]:
# Set up the evaluation function
# The metric falls back to an LLM judge call whenever exact match is below
# 0.8, so each example can cost two LM calls. With num_threads=1 the recorded
# progress line below shows roughly 45 s per example.
evaluate_qd = Evaluate(devset=evalset, metric=evaluate_decomposition, num_threads=1, display_progress=True)

# Evaluate the uncompiled question decomposition module
uncompiled_score = evaluate_qd(uncompiled_qd)
print(f"Uncompiled Question Decomposition Module Score: {uncompiled_score}")

# Evaluate the compiled question decomposition module
compiled_score = evaluate_qd(compiled_qd)
print(f"Compiled Question Decomposition Module Score: {compiled_score}")

Average Metric: 2 / 2  (100.0):   2%|▏         | 2/100 [01:28<1:13:31, 45.01s/it]

In [None]:
# ## 8. (Optional) Error Analysis


def perform_error_analysis(qd_module, dataset, num_samples=5):
    """Run ``qd_module`` over ``dataset`` and print the examples the metric rejects.

    Every example is predicted and scored with ``evaluate_decomposition``; an
    example counts as an error when that score is falsy (0). The total number
    of errors is printed, followed by the question, gold decomposition and
    predicted decomposition of the first ``num_samples`` errors.
    """
    # Lazy pairing keeps the predict-then-score order per example.
    predicted_pairs = ((sample, qd_module(question=sample.question)) for sample in dataset)
    failures = [
        (sample, prediction)
        for sample, prediction in predicted_pairs
        if not evaluate_decomposition(sample, prediction)
    ]

    print(f"Total errors: {len(failures)}")
    print("\nSample of errors:")
    for sample, prediction in failures[:num_samples]:
        print(f"Original Question: {sample.question}")
        print(f"# True Decomposition\n{sample.decomposition}")
        print(f"# Predicted Decomposition\n{prediction.decomposition}")
        print()

# Each call below re-runs the module and the metric over the whole evalset.
print("Error analysis for uncompiled Question Decomposition module:")
perform_error_analysis(uncompiled_qd, evalset)

print("\nError analysis for compiled Question Decomposition module:")
perform_error_analysis(compiled_qd, evalset)


# Inspect the first predictor of the compiled module.
compiled_qd.predictors()[0]


# Save the compiled module to compiled-qd.json.
compiled_qd.save('compiled-qd.json')