In [1]:
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
import os
import random
import numpy as np
import pandas as pd

import dspy
from dspy.evaluate import Evaluate

In [3]:
lm = dspy.LM(
    "openai/llama-3-70b-tgi",
    temperature=0.7,
    cache=False,
    api_base=os.getenv("OPENAI_BASE_URL"),
    api_key=os.getenv("OPENAI_API_KEY"),
)
dspy.configure(lm=lm)

In [4]:
from datasets import load_dataset

dsd = load_dataset('bdsaglam/musique', 'answerable')
dsd

DatasetDict({
    train: Dataset({
        features: ['id', 'paragraphs', 'question', 'question_decomposition', 'answer', 'answer_aliases', 'answerable'],
        num_rows: 19938
    })
    validation: Dataset({
        features: ['id', 'paragraphs', 'question', 'question_decomposition', 'answer', 'answer_aliases', 'answerable'],
        num_rows: 2417
    })
})

In [12]:
# a function that samples from the dataset with equal distribution of n_hops
def sample_evenly(dataset, n_samples):
    n_hops = np.unique(dataset['n_hops'])
    samples_per_hop = n_samples // len(n_hops)
    for hop in n_hops:
        hop_samples = dataset.filter(lambda x: len(x['question_decomposition']) == hop).shuffle().select(range(samples_per_hop))
        yield from hop_samples

In [13]:
train_samples = list(sample_evenly(dsd['train'], 100))
val_samples = list(sample_evenly(dsd['validation'], 100))

In [14]:
def make_example(record):
    decomposition = '\n'.join([f"{i+1}. {item['question']}" for i, item in enumerate(record["question_decomposition"])])
    return dspy.Example(
        question=record["question"],
        decomposition=decomposition,
    ).with_inputs("question")

In [15]:
make_example(train_samples[0])

Example({'question': "Who does the actor who played Schindler in the movie Schindler's List play in Star Wars 1?", 'decomposition': "1. who played schindler in the movie schindler's list\n2. who does #1 play in star wars 1"}) (input_keys={'question'})

In [16]:
trainset = [make_example(record) for record in train_samples]
valset = [make_example(record) for record in val_samples]

In [17]:
dict(trainset[0])

{'question': "Who does the actor who played Schindler in the movie Schindler's List play in Star Wars 1?",
 'decomposition': "1. who played schindler in the movie schindler's list\n2. who does #1 play in star wars 1"}

In [23]:
def print_example(example):
    print(example.question)
    print(example.decomposition)

In [24]:
for example in random.sample(trainset,5):
    print_example(example)
    print()

When did the Soviet Union seal off the city where the author of Hotel Savoy worked during the Weimar Republic?
1. Hotel Savoy >> author
2. #1 >> work location
3. when the soviet union sealed off the city of #2

In 2014, what was the unemployment rate in the country separated by the Rhine from the country where Aschenbrodel's composer was a citizen?
1. Aschenbrödel >> composer
2. #1 >> country of citizenship
3. The Rhine forms the border between #2 and what other country?
4. What was the unemployment rate in #3 in 2014?

There was a proposal to connect rural Alaska to the continent where Nguyen Van Nghi was born. What year was this project announced?
1. Nguyen Van Nghi >> place of birth
2. What continent is #1 found on?
3. In what year was a project to connect #2 and rural Alaska announced?

What government followed the rule of the monarch who re-translated the Reflections into French, in the country that holds the Canton of Coussey?
1. Who re-translated the Reflections into French?
2. 

In [26]:
# Metrics

## Exact Match
def split_subquestions(decomposition_str):
    for line in decomposition_str.split("\n"):
        if line.strip():
            parts = line.split(". ", 1)
            if len(parts) == 1:
                return parts[0].strip
            elif len(parts) == 2:
                yield parts[1].strip()
            else:
                raise ValueError(f"Invalid decomposition line: {line}")


# Update the evaluation function
def evaluate_decomposition_exact_match(example, pred, trace=None):
    gold_sub_questions = list(split_subquestions(example.decomposition))
    pred_sub_questions = list(split_subquestions(pred.decomposition))

    assert len(gold_sub_questions), "Gold decomposition is empty."

    exact_matches = len([1 for gold, pred in zip(gold_sub_questions, pred_sub_questions) if gold == pred])
    accuracy = exact_matches / len(gold_sub_questions)
    return accuracy

## LLM as Judge
class DecompositionJudge(dspy.Signature):
    """Judge whether the predicted decomposition matches the ground truth.

    Instructions:
    - Given a ground-truth decomposition and a predicted decomposition, assess whether they are equivalent in meaning.
    - Consider whether the steps correspond logically, even if worded differently.
    - Output 'Yes' if they are equivalent, 'No' otherwise.
    """

    ground_truth: str = dspy.InputField(desc="The ground-truth decomposition")
    prediction: str = dspy.InputField(desc="The predicted decomposition")
    equivalent: str = dspy.OutputField(desc="Are the decompositions equivalent? [Yes/No]", prefix="Equivalent[Yes/No]:")

qdecomp_judge = dspy.Predict(DecompositionJudge)

# Updated evaluation function using the judge
def evaluate_decomposition_llm(example, pred, trace=None):
    result = qdecomp_judge(
        ground_truth=example.decomposition,
        prediction=pred.decomposition,
    )
    is_equivalent = result.equivalent.strip().lower()
    return int(is_equivalent == "yes")


## Combined

def evaluate_decomposition(example, pred, trace=None):
    accuracy = evaluate_decomposition_exact_match(example, pred, trace)
    if accuracy >= 0.8:
        return accuracy
    return evaluate_decomposition_llm(example, pred, trace)


# Set up the evaluation function
evaluate_qd = Evaluate(devset=valset, metric=evaluate_decomposition, num_threads=4, display_progress=True)

In [27]:
class DecomposeQuestion(dspy.Signature):
    """Decompose a complex question into simpler sub-questions."""

    question: str = dspy.InputField()
    decomposition: str = dspy.OutputField(
        desc="Enumerated list of sub-questions, using '#n >>' notation for dependent questions"
    )

class QuestionDecompositionModule(dspy.Module):
    def __init__(self):
        super().__init__()
        self.decompose = dspy.Predict(DecomposeQuestion)

    def forward(self, question):
        pred = self.decompose(question=question)
        return dspy.Prediction(decomposition=pred.decomposition)

In [None]:
# Initialize the uncompiled question decomposition module
uncompiled_qd = QuestionDecompositionModule()

# Evaluate the uncompiled question decomposition module
uncompiled_score = evaluate_qd(uncompiled_qd)
print(f"Uncompiled Question Decomposition Score: {uncompiled_score}")

In [28]:
from dspy.teleprompt import BootstrapFewShot, LabeledFewShot, BootstrapFewShotWithRandomSearch

# Set up the teleprompter
teleprompter = BootstrapFewShotWithRandomSearch(
    metric=evaluate_decomposition, 
    max_bootstrapped_demos=8, 
    max_labeled_demos=8,
)

# Compile and optimize the question decomposition module
compiled_qd = teleprompter.compile(uncompiled_qd, trainset=trainset, valset=valset)
compiled_qd.save('compiled-qd.json')
print("Question Decomposition module compiled and optimized")

Going to sample between 1 and 8 traces per predictor.
Will attempt to bootstrap 16 candidate sets.


  0%|          | 0/99 [00:00<?, ?it/s]

Average Metric: 86 / 99  (86.9): 100%|██████████| 99/99 [12:36<00:00,  7.64s/it] 


New best score: 86.87 for seed -3
Scores so far: [86.87]
Best score so far: 86.87


Average Metric: 80.0 / 99  (80.8): 100%|██████████| 99/99 [14:28<00:00,  8.77s/it]


Scores so far: [86.87, 80.81]
Best score so far: 86.87


 10%|█         | 10/99 [07:16<1:04:44, 43.65s/it]


Bootstrapped 8 full traces after 11 examples in round 0.


Average Metric: 87.0 / 99  (87.9): 100%|██████████| 99/99 [14:57<00:00,  9.06s/it]


New best score: 87.88 for seed -1
Scores so far: [86.87, 80.81, 87.88]
Best score so far: 87.88


  7%|▋         | 7/99 [04:48<1:03:05, 41.15s/it]


Bootstrapped 7 full traces after 8 examples in round 0.


Average Metric: 84 / 99  (84.8): 100%|██████████| 99/99 [14:03<00:00,  8.52s/it]


Scores so far: [86.87, 80.81, 87.88, 84.85]
Best score so far: 87.88


  3%|▎         | 3/99 [02:10<1:09:50, 43.65s/it]


Bootstrapped 3 full traces after 4 examples in round 0.


Average Metric: 84 / 99  (84.8): 100%|██████████| 99/99 [13:54<00:00,  8.43s/it]


Scores so far: [86.87, 80.81, 87.88, 84.85, 84.85]
Best score so far: 87.88


  1%|          | 1/99 [00:48<1:18:43, 48.20s/it]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 88 / 99  (88.9): 100%|██████████| 99/99 [14:03<00:00,  8.52s/it]


New best score: 88.89 for seed 2
Scores so far: [86.87, 80.81, 87.88, 84.85, 84.85, 88.89]
Best score so far: 88.89


  4%|▍         | 4/99 [02:55<1:09:29, 43.89s/it]


Bootstrapped 4 full traces after 5 examples in round 0.


Average Metric: 81 / 99  (81.8): 100%|██████████| 99/99 [11:07<00:00,  6.74s/it]


Scores so far: [86.87, 80.81, 87.88, 84.85, 84.85, 88.89, 81.82]
Best score so far: 88.89


  5%|▌         | 5/99 [01:25<26:42, 17.04s/it]


Bootstrapped 4 full traces after 6 examples in round 0.


Average Metric: 84.0 / 99  (84.8): 100%|██████████| 99/99 [08:14<00:00,  5.00s/it]


Scores so far: [86.87, 80.81, 87.88, 84.85, 84.85, 88.89, 81.82, 84.85]
Best score so far: 88.89


  6%|▌         | 6/99 [01:40<26:00, 16.78s/it]


Bootstrapped 5 full traces after 7 examples in round 0.


Average Metric: 89 / 99  (89.9): 100%|██████████| 99/99 [08:32<00:00,  5.17s/it] 


New best score: 89.9 for seed 5
Scores so far: [86.87, 80.81, 87.88, 84.85, 84.85, 88.89, 81.82, 84.85, 89.9]
Best score so far: 89.9


  2%|▏         | 2/99 [00:41<33:56, 20.99s/it]


Bootstrapped 2 full traces after 3 examples in round 0.


Average Metric: 87 / 99  (87.9): 100%|██████████| 99/99 [08:49<00:00,  5.35s/it] 


Scores so far: [86.87, 80.81, 87.88, 84.85, 84.85, 88.89, 81.82, 84.85, 89.9, 87.88]
Best score so far: 89.9


  8%|▊         | 8/99 [03:36<41:05, 27.09s/it]


Bootstrapped 6 full traces after 9 examples in round 0.


Average Metric: 81 / 99  (81.8): 100%|██████████| 99/99 [08:30<00:00,  5.16s/it] 


Scores so far: [86.87, 80.81, 87.88, 84.85, 84.85, 88.89, 81.82, 84.85, 89.9, 87.88, 81.82]
Best score so far: 89.9


  5%|▌         | 5/99 [01:40<31:34, 20.15s/it]


Bootstrapped 4 full traces after 6 examples in round 0.


Average Metric: 86 / 99  (86.9): 100%|██████████| 99/99 [08:20<00:00,  5.05s/it] 


Scores so far: [86.87, 80.81, 87.88, 84.85, 84.85, 88.89, 81.82, 84.85, 89.9, 87.88, 81.82, 86.87]
Best score so far: 89.9


  9%|▉         | 9/99 [02:49<28:16, 18.85s/it]


Bootstrapped 8 full traces after 10 examples in round 0.


Average Metric: 88 / 99  (88.9): 100%|██████████| 99/99 [08:11<00:00,  4.96s/it] 


Scores so far: [86.87, 80.81, 87.88, 84.85, 84.85, 88.89, 81.82, 84.85, 89.9, 87.88, 81.82, 86.87, 88.89]
Best score so far: 89.9


  1%|          | 1/99 [00:15<24:46, 15.17s/it]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 80.0 / 99  (80.8): 100%|██████████| 99/99 [06:20<00:00,  3.85s/it]


Scores so far: [86.87, 80.81, 87.88, 84.85, 84.85, 88.89, 81.82, 84.85, 89.9, 87.88, 81.82, 86.87, 88.89, 80.81]
Best score so far: 89.9


  8%|▊         | 8/99 [01:35<18:08, 11.96s/it]


Bootstrapped 8 full traces after 9 examples in round 0.


Average Metric: 88.0 / 99  (88.9): 100%|██████████| 99/99 [04:43<00:00,  2.87s/it]


Scores so far: [86.87, 80.81, 87.88, 84.85, 84.85, 88.89, 81.82, 84.85, 89.9, 87.88, 81.82, 86.87, 88.89, 80.81, 88.89]
Best score so far: 89.9


  9%|▉         | 9/99 [01:40<16:45, 11.17s/it]


Bootstrapped 8 full traces after 10 examples in round 0.


Average Metric: 92.0 / 99  (92.9): 100%|██████████| 99/99 [02:10<00:00,  1.32s/it] 


New best score: 92.93 for seed 12
Scores so far: [86.87, 80.81, 87.88, 84.85, 84.85, 88.89, 81.82, 84.85, 89.9, 87.88, 81.82, 86.87, 88.89, 80.81, 88.89, 92.93]
Best score so far: 92.93


  5%|▌         | 5/99 [00:17<05:26,  3.48s/it]


Bootstrapped 5 full traces after 6 examples in round 0.


Average Metric: 86 / 99  (86.9): 100%|██████████| 99/99 [02:10<00:00,  1.32s/it] 


Scores so far: [86.87, 80.81, 87.88, 84.85, 84.85, 88.89, 81.82, 84.85, 89.9, 87.88, 81.82, 86.87, 88.89, 80.81, 88.89, 92.93, 86.87]
Best score so far: 92.93


  2%|▏         | 2/99 [00:07<06:01,  3.72s/it]


Bootstrapped 2 full traces after 3 examples in round 0.


Average Metric: 84 / 99  (84.8): 100%|██████████| 99/99 [02:10<00:00,  1.32s/it]


Scores so far: [86.87, 80.81, 87.88, 84.85, 84.85, 88.89, 81.82, 84.85, 89.9, 87.88, 81.82, 86.87, 88.89, 80.81, 88.89, 92.93, 86.87, 84.85]
Best score so far: 92.93


  4%|▍         | 4/99 [00:15<06:01,  3.81s/it]


Bootstrapped 4 full traces after 5 examples in round 0.


Average Metric: 89 / 99  (89.9): 100%|██████████| 99/99 [02:09<00:00,  1.31s/it] 

Scores so far: [86.87, 80.81, 87.88, 84.85, 84.85, 88.89, 81.82, 84.85, 89.9, 87.88, 81.82, 86.87, 88.89, 80.81, 88.89, 92.93, 86.87, 84.85, 89.9]
Best score so far: 92.93
19 candidate programs found.
Question Decomposition module compiled and optimized





In [32]:
# Evaluate the uncompiled question decomposition module
uncompiled_score = evaluate_qd(uncompiled_qd)
print(f"Uncompiled Question Decomposition Score: {uncompiled_score}")

Average Metric: 82 / 99  (82.8): 100%|██████████| 99/99 [16:02<00:00,  9.73s/it]

Uncompiled Question Decomposition Score: 82.83





In [31]:
# Evaluate the compiled question decomposition module
compiled_score = evaluate_qd(compiled_qd)
print(f"Compiled Question Decomposition Score: {compiled_score}")

Average Metric: 87 / 99  (87.9): 100%|██████████| 99/99 [02:35<00:00,  1.57s/it]

Compiled Question Decomposition Module Score: 87.88





In [33]:
compiled_qd.save('compiled-qd.json')

[('decompose', Predict(DecomposeQuestion(question -> decomposition
    instructions='Decompose a complex question into simpler sub-questions.'
    question = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Question:', 'desc': '${question}'})
    decomposition = Field(annotation=str required=True json_schema_extra={'desc': "Enumerated list of sub-questions, using '#n >>' notation for dependent questions", '__dspy_field_type': 'output', 'prefix': 'Decomposition:'})
)))]


In [35]:
# Error Analysis
def perform_error_analysis(qd_module, dataset, num_samples=5):
    errors = []
    for example in dataset:
        pred = qd_module(question=example.question)
        if not evaluate_decomposition(example, pred):
            errors.append((example, pred))
    
    print(f"Total errors: {len(errors)}")
    print("\nSample of errors:")
    for example, pred in errors[:num_samples]:
        print(f"Original Question: {example.question}")
        print(f"# True Decomposition\n{example.decomposition}")
        print(f"# Predicted Decomposition\n{pred.decomposition}")
        print()

print("\nError analysis for compiled Question Decomposition module:")
perform_error_analysis(compiled_qd, valset)


Error analysis for compiled Question Decomposition module:


In [None]:
compiled_qd.predictors()[0]