In [1]:
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
import os
import random
import numpy as np
import pandas as pd
from tqdm import tqdm

import dspy
from dspy.evaluate import Evaluate

In [3]:
# Build the language-model client and register it in DSPy's global configuration.
# The endpoint URL and API key are read from environment variables
# (presumably populated by the load_dotenv() call in the first cell — confirm).
lm = dspy.LM(
    "openai/llama-3-70b-tgi",
    temperature=0.7,  # NOTE(review): non-zero temperature, so outputs presumably vary between runs
    cache=False,  # NOTE(review): presumably disables response caching, so every run re-queries the endpoint — confirm
    api_base=os.getenv("OPENAI_BASE_URL"),  # None when the variable is unset
    api_key=os.getenv("OPENAI_API_KEY"),  # None when the variable is unset
)
dspy.configure(lm=lm)

In [4]:
from datasets import load_dataset

# Load the 'answerable' configuration of 'bdsaglam/musique'.
# The repr shown below lists the available splits and their columns.
dsd = load_dataset('bdsaglam/musique', 'answerable')
dsd

DatasetDict({
    train: Dataset({
        features: ['id', 'paragraphs', 'question', 'question_decomposition', 'answer', 'answer_aliases', 'answerable'],
        num_rows: 19938
    })
    validation: Dataset({
        features: ['id', 'paragraphs', 'question', 'question_decomposition', 'answer', 'answer_aliases', 'answerable'],
        num_rows: 2417
    })
})

In [5]:
# Sample from the dataset so that every distinct n_hops value gets an equal share.
def sample_evenly(dataset, n_samples, seed=None):
    """Yield records from ``dataset`` with an equal quota per hop count.

    The hop count of a record is ``len(record['question_decomposition'])``; it is
    added to every yielded record under the ``'n_hops'`` key.

    Args:
        dataset: Object offering ``map``, ``filter``, ``shuffle``, ``select``,
            column access via ``dataset['n_hops']`` and ``len()``
            (presumably a Hugging Face ``Dataset`` — confirm against callers).
        n_samples: Total number of records requested. Each hop count receives
            ``n_samples // number_of_distinct_hop_counts`` records, so fewer
            than ``n_samples`` records may be produced.
        seed: Optional seed forwarded to ``shuffle`` so the draw can be
            reproduced. ``None`` (the default) keeps the unseeded behaviour.

    Yields:
        Records grouped by ascending hop count.
    """
    dataset = dataset.map(lambda x: {'n_hops': len(x['question_decomposition'])})
    hop_counts = np.unique(dataset['n_hops'])
    if len(hop_counts) == 0:
        # Empty dataset: nothing to sample, and the division below would fail.
        return
    samples_per_hop = n_samples // len(hop_counts)
    for hop in hop_counts:
        hop_group = dataset.filter(lambda x: x['n_hops'] == hop).shuffle(seed=seed)
        # Clamp so that a hop count with few records does not make select() fail.
        n_take = min(samples_per_hop, len(hop_group))
        yield from hop_group.select(range(n_take))

In [6]:
# Request 30 records from each split, balanced across hop counts by sample_evenly.
# NOTE(review): no seed is supplied, so the drawn records may differ on every run — confirm whether that is acceptable.
train_samples = list(sample_evenly(dsd['train'], 30))
val_samples = list(sample_evenly(dsd['validation'], 30))

In [7]:
def make_example(record):
    """Turn one dataset record into a ``dspy.Example``.

    The sub-questions in ``record["question_decomposition"]`` are rendered as a
    numbered list ("1. ...", "2. ..."), one per line, and stored under
    ``decomposition``. Only ``question`` is marked as an input field.
    """
    numbered_steps = []
    for position, step in enumerate(record["question_decomposition"], start=1):
        numbered_steps.append(f"{position}. {step['question']}")

    example = dspy.Example(
        question=record["question"],
        decomposition="\n".join(numbered_steps),
    )
    return example.with_inputs("question")

In [8]:
# Sanity check: display the Example built from the first sampled training record.
make_example(train_samples[0])

Example({'question': 'Why did the Battle of the state where Me, Myself and Irene takes place happen?', 'decomposition': '1. where did me myself and irene take place\n2. why did the battle of #1 happen'}) (input_keys={'question'})

In [9]:
# Convert the sampled records of both splits into dspy examples.
trainset = list(map(make_example, train_samples))
valset = list(map(make_example, val_samples))

In [10]:
# Show the first training example as a plain dict.
dict(trainset[0])

{'question': 'Why did the Battle of the state where Me, Myself and Irene takes place happen?',
 'decomposition': '1. where did me myself and irene take place\n2. why did the battle of #1 happen'}

In [11]:
def print_example(example):
    """Print the example's question, then its decomposition, each via print()."""
    for field_name in ("question", "decomposition"):
        print(getattr(example, field_name))

In [12]:
# Eyeball five randomly chosen training examples, separated by blank lines.
# NOTE(review): `random` is not seeded in the visible cells, so the selection may change between runs.
for example in random.sample(trainset,5):
    print_example(example)
    print()

What social activities have Muslim's contributed to in The Mystic Masseur film's producer's birth country?
1. The Mystic Masseur >> producer
2. #1 >> place of birth
3. in which country is the city of #2
4. What are the roles of Muslims across #3 ?

Who failed to start an English colony off the coast of the state that borders the east of the state where Hello Love's performer lived in when he died?
1. Hello Love >> performer
2. What city did #1 live when he died?
3. Which state borders #2 to the east?
4. who failed in his attempt to start an english colony off the coast of #3

The county where Tiffany Scott was born shares a border with what county?
1. Tiffany Scott >> place of birth
2. #1 >> located in the administrative territorial entity
3. #2 >> shares border with

In which episode of Doctor Who did a character based on the creator of Trees and Undergrowth appear?
1. Trees and Undergrowth >> creator
2. episode of doctor who with #1

What is the highest point where the Green-breasted

In [13]:
# Metrics

## Exact Match
def split_subquestions(decomposition_str):
    """Yield the sub-question text of every non-blank line of a decomposition.

    A line of the form "<prefix>. <text>" yields ``<text>`` (split on the first
    ". " only, surrounding whitespace removed). A line without ". " — for
    example a model output such as "#1 >> Who developed X?" — yields the whole
    stripped line instead of ending the iteration.

    Args:
        decomposition_str: Newline-separated decomposition. ``None`` or an
            empty string yields nothing.

    Yields:
        str: One entry per non-blank line, in order.
    """
    if not decomposition_str:
        return
    for line in decomposition_str.split("\n"):
        if not line.strip():
            continue
        # partition() splits on the first ". " only, like split(". ", 1).
        _, separator, remainder = line.partition(". ")
        if separator:
            yield remainder.strip()
        else:
            # No enumeration prefix: keep the line rather than stopping the
            # generator, which silently dropped this and every later line.
            yield line.strip()


# Update the evaluation function
def evaluate_decomposition_exact_match(example, pred, trace=None):
    """Score a prediction by position-wise exact match of sub-questions.

    Both decompositions are split with ``split_subquestions``. Gold and
    predicted sub-questions are paired by position (pairing stops at the
    shorter list) and the number of identical pairs is divided by the number
    of gold sub-questions.

    Args:
        example: Object whose ``decomposition`` holds the gold decomposition.
        pred: Object whose ``decomposition`` holds the predicted decomposition.
        trace: Unused; part of the metric signature.

    Returns:
        float: Matching pairs divided by the number of gold sub-questions.

    Raises:
        AssertionError: If the gold decomposition has no sub-questions.
    """
    expected = list(split_subquestions(example.decomposition))
    predicted = list(split_subquestions(pred.decomposition))

    assert len(expected), "Gold decomposition is empty."

    n_matching = sum(1 for want, got in zip(expected, predicted) if want == got)
    return n_matching / len(expected)

## LLM as Judge
# Signature for the LLM judge: two string inputs (gold and predicted
# decomposition) and one string output carrying the verdict.
# NOTE(review): DSPy presumably uses the docstring below as the prompt
# instructions, so treat it as runtime text — confirm before rewording it.
class DecompositionJudge(dspy.Signature):
    """Judge whether the predicted decomposition matches the ground truth.

    Instructions:
    - Given a ground-truth decomposition and a predicted decomposition, assess whether they are equivalent in meaning.
    - Consider whether the steps correspond logically, even if worded differently.
    - Output 'Yes' if they are equivalent, 'No' otherwise.
    """

    # Gold decomposition text, passed as `ground_truth=` by the caller.
    ground_truth: str = dspy.InputField(desc="The ground-truth decomposition")
    # Model-produced decomposition text, passed as `prediction=` by the caller.
    prediction: str = dspy.InputField(desc="The predicted decomposition")
    # Verdict read back as `result.equivalent`; the prompt asks for 'Yes' or 'No'.
    equivalent: str = dspy.OutputField(desc="Are the decompositions equivalent? [Yes/No]", prefix="Equivalent[Yes/No]:")

# Module-level predictor over the DecompositionJudge signature; called by evaluate_decomposition_llm.
qdecomp_judge = dspy.Predict(DecompositionJudge)

# Updated evaluation function using the judge
def evaluate_decomposition_llm(example, pred, trace=None):
    """Score a prediction by asking the LLM judge whether it matches the gold.

    The judge's ``equivalent`` output is treated as affirmative when its first
    word is "yes" (case-insensitive, ignoring leading punctuation or markup),
    so replies such as "Yes." or "Yes, they are equivalent" count as a match.
    A missing or empty verdict counts as "no".

    Args:
        example: Object whose ``decomposition`` holds the gold decomposition.
        pred: Object whose ``decomposition`` holds the predicted decomposition.
        trace: Unused; part of the metric signature.

    Returns:
        int: 1 if the judge answered yes, otherwise 0.
    """
    import re  # local import: keeps the metric cell self-contained

    result = qdecomp_judge(
        ground_truth=example.decomposition,
        prediction=pred.decomposition,
    )
    verdict = (result.equivalent or "").strip().lower()
    # \b stops words that merely start with "yes" (e.g. "yesterday") from matching.
    return int(re.match(r"\W*yes\b", verdict) is not None)


## Combined

def evaluate_decomposition(example, pred, trace=None):
    """Combined metric: exact match first, LLM judge as the fallback.

    Returns the exact-match accuracy when it is at least 0.8; otherwise
    returns the result of ``evaluate_decomposition_llm`` for the same
    arguments.
    """
    exact_score = evaluate_decomposition_exact_match(example, pred, trace)
    return exact_score if exact_score >= 0.8 else evaluate_decomposition_llm(example, pred, trace)


# Build the evaluator once: it runs the combined metric over the validation
# examples and is reused for every program evaluated below.
evaluate_qd = Evaluate(
    devset=valset,
    metric=evaluate_decomposition,
    num_threads=8,
    display_progress=True,
    return_outputs=True,
)

In [14]:
def present_errors(results):
    """Print every result whose score is below 1.0.

    Args:
        results: Iterable of ``(example, pred, score)`` triples. ``example``
            must expose ``question`` and ``decomposition``; ``pred`` must
            expose ``decomposition``.

    For each selected triple the question, the gold decomposition and the
    predicted decomposition are printed, followed by a blank line.
    """
    # Collect first, then print, so the whole input is consumed before any output.
    failing = []
    for example, pred, score in results:
        if score < 1.0:
            failing.append((example, pred))

    for example, pred in failing:
        print(f"Original Question: {example.question}")
        print(f"# Groundtruth Decomposition\n{example.decomposition}")
        print(f"# Predicted Decomposition\n{pred.decomposition}")
        print()

In [15]:
# Signature for the task itself: one string input (`question`) and one string
# output (`decomposition`).
# NOTE(review): the docstring and the `desc` text are presumably sent to the
# model as part of the prompt — treat both as runtime text.
# NOTE(review): `desc` asks for '#n >>' notation, whereas make_example builds
# gold decompositions as "1. ...", "2. ..." lines; this mismatch may lower
# exact-match scores — confirm the intended output format.
class DecomposeQuestion(dspy.Signature):
    """Decompose a complex question into simpler sub-questions."""

    # The multi-hop question to decompose.
    question: str = dspy.InputField()
    # The model's decomposition, read back as `pred.decomposition`.
    decomposition: str = dspy.OutputField(
        desc="Enumerated list of sub-questions, using '#n >>' notation for dependent questions"
    )

# Module holding a single predictor over the DecomposeQuestion signature.
class QuestionDecompositionModule(dspy.Module):
    def __init__(self):
        super().__init__()
        # Keep the attribute name `decompose`; other cells and saved programs may refer to it.
        self.decompose = dspy.Predict(DecomposeQuestion)

    def forward(self, question):
        """Run the predictor on ``question`` and return only its decomposition.

        Returns:
            dspy.Prediction with a single ``decomposition`` field.
        """
        raw_output = self.decompose(question=question)
        decomposition_text = raw_output.decomposition
        return dspy.Prediction(decomposition=decomposition_text)

In [16]:
# Initialize the uncompiled question decomposition module
# (baseline: no optimisation has been applied to it yet).
uncompiled_qd = QuestionDecompositionModule()

# Evaluate the uncompiled question decomposition module
# The call is unpacked into (score, results); results are kept for the error analysis below.
# return_outputs=True is also set where evaluate_qd is constructed.
uncompiled_score, uncompiled_results = evaluate_qd(uncompiled_qd, return_outputs=True)
print(f"Uncompiled Question Decomposition Score: {uncompiled_score}")

  0%|          | 0/30 [00:00<?, ?it/s]

Average Metric: 26 / 30  (86.7): 100%|██████████| 30/30 [00:36<00:00,  1.23s/it]

Uncompiled Question Decomposition Score: 86.67





In [17]:
from dspy.teleprompt import BootstrapFewShot, LabeledFewShot, BootstrapFewShotWithRandomSearch

# Set up the teleprompter
# Uses the combined metric and allows up to 8 bootstrapped and 8 labeled demos.
teleprompter = BootstrapFewShotWithRandomSearch(
    metric=evaluate_decomposition, 
    max_bootstrapped_demos=8, 
    max_labeled_demos=8,
)

# Compile and optimize the question decomposition module
# NOTE(review): `valset` is passed here and is also the devset of evaluate_qd, so the
# compiled score reported later is presumably measured on the same examples used to
# pick the best candidate — consider a separate held-out set.
# NOTE(review): this cell is long-running (see the timings in the log below).
compiled_qd = teleprompter.compile(uncompiled_qd, trainset=trainset, valset=valset)
# Persist the compiled program to a JSON file in the working directory.
compiled_qd.save('qdecomp-program-compiled.json')
print("Question Decomposition module compiled and optimized")

Going to sample between 1 and 8 traces per predictor.
Will attempt to bootstrap 16 candidate sets.


  0%|          | 0/30 [00:00<?, ?it/s]

Average Metric: 25 / 30  (83.3): 100%|██████████| 30/30 [00:41<00:00,  1.39s/it]


New best score: 83.33 for seed -3
Scores so far: [83.33]
Best score so far: 83.33


Average Metric: 26 / 30  (86.7): 100%|██████████| 30/30 [00:51<00:00,  1.73s/it] 


New best score: 86.67 for seed -2
Scores so far: [83.33, 86.67]
Best score so far: 86.67


 27%|██▋       | 8/30 [00:37<01:43,  4.72s/it]


Bootstrapped 8 full traces after 9 examples in round 0.


Average Metric: 27 / 30  (90.0): 100%|██████████| 30/30 [00:46<00:00,  1.54s/it] 


New best score: 90.0 for seed -1
Scores so far: [83.33, 86.67, 90.0]
Best score so far: 90.0


 40%|████      | 12/30 [01:03<01:35,  5.32s/it]


Bootstrapped 7 full traces after 13 examples in round 0.


Average Metric: 24 / 30  (80.0): 100%|██████████| 30/30 [00:49<00:00,  1.65s/it] 


Scores so far: [83.33, 86.67, 90.0, 80.0]
Best score so far: 90.0


 13%|█▎        | 4/30 [00:23<02:33,  5.90s/it]


Bootstrapped 3 full traces after 5 examples in round 0.


Average Metric: 24 / 30  (80.0): 100%|██████████| 30/30 [00:49<00:00,  1.66s/it] 


Scores so far: [83.33, 86.67, 90.0, 80.0, 80.0]
Best score so far: 90.0


  3%|▎         | 1/30 [00:04<02:13,  4.59s/it]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 26 / 30  (86.7): 100%|██████████| 30/30 [00:48<00:00,  1.62s/it] 


Scores so far: [83.33, 86.67, 90.0, 80.0, 80.0, 86.67]
Best score so far: 90.0


 13%|█▎        | 4/30 [00:22<02:25,  5.62s/it]


Bootstrapped 4 full traces after 5 examples in round 0.


Average Metric: 28 / 30  (93.3): 100%|██████████| 30/30 [00:53<00:00,  1.80s/it] 


New best score: 93.33 for seed 3
Scores so far: [83.33, 86.67, 90.0, 80.0, 80.0, 86.67, 93.33]
Best score so far: 93.33


 17%|█▋        | 5/30 [00:28<02:23,  5.73s/it]


Bootstrapped 4 full traces after 6 examples in round 0.


Average Metric: 27 / 30  (90.0): 100%|██████████| 30/30 [00:49<00:00,  1.66s/it] 


Scores so far: [83.33, 86.67, 90.0, 80.0, 80.0, 86.67, 93.33, 90.0]
Best score so far: 93.33


 20%|██        | 6/30 [00:30<02:01,  5.08s/it]


Bootstrapped 5 full traces after 7 examples in round 0.


Average Metric: 27 / 30  (90.0): 100%|██████████| 30/30 [00:50<00:00,  1.67s/it] 


Scores so far: [83.33, 86.67, 90.0, 80.0, 80.0, 86.67, 93.33, 90.0, 90.0]
Best score so far: 93.33


 13%|█▎        | 4/30 [00:25<02:48,  6.48s/it]


Bootstrapped 2 full traces after 5 examples in round 0.


Average Metric: 26 / 30  (86.7): 100%|██████████| 30/30 [00:51<00:00,  1.73s/it] 


Scores so far: [83.33, 86.67, 90.0, 80.0, 80.0, 86.67, 93.33, 90.0, 90.0, 86.67]
Best score so far: 93.33


 23%|██▎       | 7/30 [00:38<02:07,  5.53s/it]


Bootstrapped 6 full traces after 8 examples in round 0.


Average Metric: 26 / 30  (86.7): 100%|██████████| 30/30 [00:48<00:00,  1.61s/it] 


Scores so far: [83.33, 86.67, 90.0, 80.0, 80.0, 86.67, 93.33, 90.0, 90.0, 86.67, 86.67]
Best score so far: 93.33


 23%|██▎       | 7/30 [00:43<02:21,  6.16s/it]


Bootstrapped 4 full traces after 8 examples in round 0.


Average Metric: 26 / 30  (86.7): 100%|██████████| 30/30 [00:50<00:00,  1.69s/it] 


Scores so far: [83.33, 86.67, 90.0, 80.0, 80.0, 86.67, 93.33, 90.0, 90.0, 86.67, 86.67, 86.67]
Best score so far: 93.33


 33%|███▎      | 10/30 [00:41<01:23,  4.19s/it]


Bootstrapped 8 full traces after 11 examples in round 0.


Average Metric: 25 / 30  (83.3): 100%|██████████| 30/30 [01:10<00:00,  2.34s/it] 


Scores so far: [83.33, 86.67, 90.0, 80.0, 80.0, 86.67, 93.33, 90.0, 90.0, 86.67, 86.67, 86.67, 83.33]
Best score so far: 93.33


  3%|▎         | 1/30 [00:20<09:59, 20.68s/it]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 26 / 30  (86.7): 100%|██████████| 30/30 [02:55<00:00,  5.85s/it] 


Scores so far: [83.33, 86.67, 90.0, 80.0, 80.0, 86.67, 93.33, 90.0, 90.0, 86.67, 86.67, 86.67, 83.33, 86.67]
Best score so far: 93.33


 33%|███▎      | 10/30 [03:58<07:57, 23.85s/it]


Bootstrapped 8 full traces after 11 examples in round 0.


Average Metric: 26 / 30  (86.7): 100%|██████████| 30/30 [05:07<00:00, 10.25s/it] 


Scores so far: [83.33, 86.67, 90.0, 80.0, 80.0, 86.67, 93.33, 90.0, 90.0, 86.67, 86.67, 86.67, 83.33, 86.67, 86.67]
Best score so far: 93.33


 40%|████      | 12/30 [11:19<16:59, 56.65s/it]


Bootstrapped 8 full traces after 13 examples in round 0.


Average Metric: 26 / 30  (86.7): 100%|██████████| 30/30 [06:30<00:00, 13.03s/it] 


Scores so far: [83.33, 86.67, 90.0, 80.0, 80.0, 86.67, 93.33, 90.0, 90.0, 86.67, 86.67, 86.67, 83.33, 86.67, 86.67, 86.67]
Best score so far: 93.33


 20%|██        | 6/30 [08:03<32:15, 80.66s/it]


Bootstrapped 5 full traces after 7 examples in round 0.


Average Metric: 26 / 30  (86.7): 100%|██████████| 30/30 [07:36<00:00, 15.21s/it] 


Scores so far: [83.33, 86.67, 90.0, 80.0, 80.0, 86.67, 93.33, 90.0, 90.0, 86.67, 86.67, 86.67, 83.33, 86.67, 86.67, 86.67, 86.67]
Best score so far: 93.33


  7%|▋         | 2/30 [02:42<37:57, 81.34s/it]


Bootstrapped 2 full traces after 3 examples in round 0.


Average Metric: 25 / 30  (83.3): 100%|██████████| 30/30 [07:30<00:00, 15.01s/it] 


Scores so far: [83.33, 86.67, 90.0, 80.0, 80.0, 86.67, 93.33, 90.0, 90.0, 86.67, 86.67, 86.67, 83.33, 86.67, 86.67, 86.67, 86.67, 83.33]
Best score so far: 93.33


 17%|█▋        | 5/30 [06:48<34:00, 81.62s/it]


Bootstrapped 4 full traces after 6 examples in round 0.


Average Metric: 27 / 30  (90.0): 100%|██████████| 30/30 [07:14<00:00, 14.48s/it] 

Scores so far: [83.33, 86.67, 90.0, 80.0, 80.0, 86.67, 93.33, 90.0, 90.0, 86.67, 86.67, 86.67, 83.33, 86.67, 86.67, 86.67, 86.67, 83.33, 90.0]
Best score so far: 93.33
19 candidate programs found.
[('decompose', Predict(DecomposeQuestion(question -> decomposition
    instructions='Decompose a complex question into simpler sub-questions.'
    question = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Question:', 'desc': '${question}'})
    decomposition = Field(annotation=str required=True json_schema_extra={'desc': "Enumerated list of sub-questions, using '#n >>' notation for dependent questions", '__dspy_field_type': 'output', 'prefix': 'Decomposition:'})
)))]
Question Decomposition module compiled and optimized





In [18]:
# Evaluate the compiled question decomposition module
# Same evaluator as for the uncompiled module, so the two scores use the same examples and metric.
compiled_score, compiled_results = evaluate_qd(compiled_qd, return_outputs=True)
print(f"Compiled Question Decomposition Score: {compiled_score}")

  0%|          | 0/30 [00:00<?, ?it/s]

Average Metric: 28 / 30  (93.3): 100%|██████████| 30/30 [05:59<00:00, 11.99s/it] 

Compiled Question Decomposition Score: 93.33





In [19]:
# Print the below-1.0 results of each program for manual inspection.
print("Error analysis for uncompiled question decomposition:")
present_errors(uncompiled_results)

print("Error analysis for compiled question decomposition:")
present_errors(compiled_results)

Error analysis for uncompiled question decomposition:
Original Question: The developer of Mozilla Sunbird created which browser?
# Groundtruth Decomposition
1. Mozilla Sunbird >> developer
2. What was the resulting browser for #1 ?
# Predicted Decomposition
#1 >> What is Mozilla Sunbird?
#2 >> Who developed Mozilla Sunbird?
#3 >> What browser did the developer of Mozilla Sunbird create?

Original Question: Where did the player who scored the most points in a NBA season go in the NBA Draft?
# Groundtruth Decomposition
1. who has the most points in a nba season
2. where did #1 go in the nba draft
# Predicted Decomposition
#1 >> Who scored the most points in a NBA season?
#2 >> In what year did this player score the most points?
#3 >> Where did this player go in the NBA Draft?

Original Question: What is the meaning of the word that is also a majority religion in what became India when the country that disavowed the Taliban was created in the Arabic dictionary?
# Groundtruth Decomposition