In [1]:
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
import os
import random
import numpy as np
import pandas as pd
from tqdm import tqdm

import dspy
from dspy.evaluate import Evaluate

In [3]:
lm = dspy.LM(
    "openai/llama-3-70b-tgi",
    temperature=0.7,
    cache=False,
    api_base=os.getenv("OPENAI_BASE_URL"),
    api_key=os.getenv("OPENAI_API_KEY"),
)
dspy.configure(lm=lm)

In [4]:
from datasets import load_dataset

dsd = load_dataset('bdsaglam/musique', 'answerable')
dsd

DatasetDict({
    train: Dataset({
        features: ['id', 'paragraphs', 'question', 'question_decomposition', 'answer', 'answer_aliases', 'answerable'],
        num_rows: 19938
    })
    validation: Dataset({
        features: ['id', 'paragraphs', 'question', 'question_decomposition', 'answer', 'answer_aliases', 'answerable'],
        num_rows: 2417
    })
})

In [5]:
# a function that samples from the dataset with equal distribution of n_hops
def sample_evenly(dataset, n_samples):
    dataset = dataset.map(lambda x: {'n_hops': len(x['question_decomposition'])})
    n_hops = np.unique(dataset['n_hops'])
    samples_per_hop = n_samples // len(n_hops)
    for hop in n_hops:
        hop_samples = dataset.filter(lambda x: x['n_hops'] == hop).shuffle().select(range(samples_per_hop))
        yield from hop_samples

In [6]:
train_samples = list(sample_evenly(dsd['train'], 30))
val_samples = list(sample_evenly(dsd['validation'], 30))

In [7]:
def make_example(record):
    decomposition = '\n'.join([f"{i+1}. {item['question']}" for i, item in enumerate(record["question_decomposition"])])
    return dspy.Example(
        question=record["question"],
        decomposition=decomposition,
    ).with_inputs("question")

In [8]:
make_example(train_samples[0])

Example({'question': 'Who was the ninth governor-general in the country where Victory is located?', 'decomposition': '1. Victory >> country\n2. Who was the ninth governor-general in #1 ?'}) (input_keys={'question'})

In [9]:
trainset = [make_example(record) for record in train_samples]
valset = [make_example(record) for record in val_samples]

In [10]:
dict(trainset[0])

{'question': 'Who was the ninth governor-general in the country where Victory is located?',
 'decomposition': '1. Victory >> country\n2. Who was the ninth governor-general in #1 ?'}

In [11]:
def print_example(example):
    print(example.question)
    print(example.decomposition)

In [12]:
for example in random.sample(trainset,5):
    print_example(example)
    print()

Who was manager when Aziz Deen-Conteh's team won the Champions League?
1. Aziz Deen-Conteh >> member of sports team
2. who was manager when #1 won champions league

What do they call the local government of the city where Spectre filming moved after the city where the author of Cum nimis absurdum died?
1. Cum nimis absurdum >> author
2. #1 >> place of death
3. Where did Spectre filming take place after #2 ?
4. What is the local government of #3 called?

There is a group of islands among which is one that received COM status in 2007 alongside St Barts. When did the people who received support from Posen in the Franco-Prussian War come to those islands?
1. What island besides St. Barts was granted COM status by France in 2007?
2. #1 (French part) >> located on terrain feature
3. What was there strong support of in Posen?
4. when did the #3 come to the #2

1994 Tour of the country whose official name is sometimes known as the country having Boesingheliede is a type of what?
1. Boesingheli

In [13]:
# Metrics

## Exact Match
def split_subquestions(decomposition_str):
    for line in decomposition_str.split("\n"):
        if line.strip():
            parts = line.split(". ", 1)
            if len(parts) == 1:
                return parts[0].strip
            elif len(parts) == 2:
                yield parts[1].strip()
            else:
                raise ValueError(f"Invalid decomposition line: {line}")


# Update the evaluation function
def evaluate_decomposition_exact_match(example, pred, trace=None):
    gold_sub_questions = list(split_subquestions(example.decomposition))
    pred_sub_questions = list(split_subquestions(pred.decomposition))

    assert len(gold_sub_questions), "Gold decomposition is empty."

    exact_matches = len([1 for gold, pred in zip(gold_sub_questions, pred_sub_questions) if gold == pred])
    accuracy = exact_matches / len(gold_sub_questions)
    return accuracy

## LLM as Judge
class DecompositionJudge(dspy.Signature):
    """Judge whether the predicted decomposition matches the ground truth.

    Instructions:
    - Given a ground-truth decomposition and a predicted decomposition, assess whether they are equivalent in meaning.
    - Consider whether the steps correspond logically, even if worded differently.
    - Output 'Yes' if they are equivalent, 'No' otherwise.
    """

    ground_truth: str = dspy.InputField(desc="The ground-truth decomposition")
    prediction: str = dspy.InputField(desc="The predicted decomposition")
    equivalent: str = dspy.OutputField(desc="Are the decompositions equivalent? [Yes/No]", prefix="Equivalent[Yes/No]:")

qdecomp_judge = dspy.Predict(DecompositionJudge)

# Updated evaluation function using the judge
def evaluate_decomposition_llm(example, pred, trace=None):
    result = qdecomp_judge(
        ground_truth=example.decomposition,
        prediction=pred.decomposition,
    )
    is_equivalent = result.equivalent.strip().lower()
    return int(is_equivalent == "yes")


## Combined

def evaluate_decomposition(example, pred, trace=None):
    accuracy = evaluate_decomposition_exact_match(example, pred, trace)
    if accuracy >= 0.8:
        return accuracy
    return evaluate_decomposition_llm(example, pred, trace)


# Set up the evaluation function
evaluate_qd = Evaluate(devset=valset, metric=evaluate_decomposition, num_threads=8, display_progress=True)

In [14]:
def present_errors(results):
    errors = [(example, pred) for example, pred, score in results if score < 1.0] 
    for example, pred in errors:
        print(f"Original Question: {example.question}")
        print(f"# Groundtruth Decomposition\n{example.decomposition}")
        print(f"# Predicted Decomposition\n{pred.decomposition}")
        print()

In [15]:
class DecomposeQuestion(dspy.Signature):
    """Decompose a complex question into simpler sub-questions."""

    question: str = dspy.InputField()
    decomposition: str = dspy.OutputField(
        desc="Enumerated list of sub-questions, using '#n >>' notation for dependent questions"
    )

class QuestionDecompositionModule(dspy.Module):
    def __init__(self):
        super().__init__()
        self.decompose = dspy.Predict(DecomposeQuestion)

    def forward(self, question):
        pred = self.decompose(question=question)
        return dspy.Prediction(decomposition=pred.decomposition)

In [16]:
# Initialize the uncompiled question decomposition module
uncompiled_qd = QuestionDecompositionModule()

# Evaluate the uncompiled question decomposition module
uncompiled_score, uncompiled_results = evaluate_qd(uncompiled_qd, return_outputs=True)
print(f"Uncompiled Question Decomposition Score: {uncompiled_score}")

  0%|          | 0/30 [00:00<?, ?it/s]

Average Metric: 25 / 30  (83.3): 100%|██████████| 30/30 [00:28<00:00,  1.04it/s]

Uncompiled Question Decomposition Score: 83.33





In [17]:
from dspy.teleprompt import BootstrapFewShot, LabeledFewShot, BootstrapFewShotWithRandomSearch

# Set up the teleprompter
teleprompter = BootstrapFewShotWithRandomSearch(
    metric=evaluate_decomposition, 
    max_bootstrapped_demos=8, 
    max_labeled_demos=8,
)

# Compile and optimize the question decomposition module
compiled_qd = teleprompter.compile(uncompiled_qd, trainset=trainset, valset=valset)
compiled_qd.save('qdecomp-program-compiled.json')
print("Question Decomposition module compiled and optimized")

Going to sample between 1 and 8 traces per predictor.
Will attempt to bootstrap 16 candidate sets.


Average Metric: 23 / 30  (76.7): 100%|██████████| 30/30 [00:33<00:00,  1.12s/it]


New best score: 76.67 for seed -3
Scores so far: [76.67]
Best score so far: 76.67


Average Metric: 23 / 30  (76.7): 100%|██████████| 30/30 [00:40<00:00,  1.35s/it]


Scores so far: [76.67, 76.67]
Best score so far: 76.67


 27%|██▋       | 8/30 [00:24<01:08,  3.11s/it]


Bootstrapped 8 full traces after 9 examples in round 0.


Average Metric: 26 / 30  (86.7): 100%|██████████| 30/30 [00:36<00:00,  1.23s/it]


New best score: 86.67 for seed -1
Scores so far: [76.67, 76.67, 86.67]
Best score so far: 86.67


 27%|██▋       | 8/30 [00:27<01:16,  3.46s/it]


Bootstrapped 7 full traces after 9 examples in round 0.


Average Metric: 9 / 12  (75.0):  37%|███▋      | 11/30 [00:16<00:18,  1.05it/s][2m2024-10-07T16:00:19.312750Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 litellm.APIError: APIError: OpenAIException - Connection error.. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m198[0m
Average Metric: 9.0 / 13  (69.2):  43%|████▎     | 13/30 [07:25<28:31, 100.66s/it][2m2024-10-07T16:00:19.337143Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 litellm.APIError: APIError: OpenAIException - Connection error.. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m198[0m
Average Metric: 9.0 / 14  (64.3):  43%|████▎     | 13/30 [07:25<28:31, 100.66s/it][2m2024-10-07T16:00:19.373370Z[0m [[31m[1merror    [0m] [1mError for example in dev


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.



[2m2024-10-07T16:00:19.514583Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 litellm.APIError: APIError: OpenAIException - Connection error.. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m198[0m
Average Metric: 9.0 / 18  (50.0):  57%|█████▋    | 17/30 [07:25<10:54, 50.36s/it]


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.



[2m2024-10-07T16:00:20.660472Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 litellm.APIError: APIError: OpenAIException - Connection error.. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m198[0m
Average Metric: 9.0 / 19  (47.4):  63%|██████▎   | 19/30 [07:26<05:25, 29.55s/it][2m2024-10-07T16:00:20.706557Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 litellm.APIError: APIError: OpenAIException - Connection error.. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m198[0m
Average Metric: 9.0 / 20  (45.0):  63%|██████▎   | 19/30 [07:26<05:25, 29.55s/it][2m2024-10-07T16:00:20.741464Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 litellm.APIError: APIError: OpenAIException - Connection error.. Set `pr


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.


[1;31mGive Fee

APIError: litellm.APIError: APIError: OpenAIException - Connection error.

In [19]:
# Evaluate the compiled question decomposition module
compiled_score, compiled_results = evaluate_qd(compiled_qd, return_outputs=True)
print(f"Compiled Question Decomposition Score: {compiled_score}")

Average Metric: 8 / 8  (100.0):  27%|██▋       | 8/30 [00:49<01:05,  2.99s/it]

Average Metric: 28 / 30  (93.3): 100%|██████████| 30/30 [03:06<00:00,  6.22s/it] 

Compiled Question Decomposition Score: 93.33





In [3]:
print("Error analysis for uncompiled question decomposition:")
present_errors(uncompiled_results)

print("Error analysis for compiled question decomposition:")
present_errors(compiled_results)

Error analysis for uncompiled question decomposition:


NameError: name 'uncompiled_results' is not defined