# DSPy Question Answering Pipeline

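This notebook builds a DSPy pipeline for multi-hop question answering on MuSiQue: each question is decomposed into sub-questions, the sub-questions are answered in sequence against their supporting paragraphs, and the prompts are then optimized with `BootstrapFewShotWithRandomSearch`.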

## 1. Set up the environment

In [1]:
import os
import random
import re

import numpy as np
import pandas as pd
from tqdm import tqdm

import dspy
from dspy.evaluate import Evaluate

In [2]:
# Language model used by every DSPy module in this notebook.
# The endpoint and key come from the OPENAI_BASE_URL / OPENAI_API_KEY environment
# variables; os.getenv returns None when a variable is unset.
# NOTE(review): temperature=0.7 together with cache=False presumably makes repeated
# runs non-reproducible — confirm this is intended for the reported scores.
lm = dspy.LM(
    "openai/llama-3-70b-tgi",
    temperature=0.7,
    cache=False,
    api_base=os.getenv("OPENAI_BASE_URL"),
    api_key=os.getenv("OPENAI_API_KEY"),
)
# Register the model as the default LM for subsequent DSPy calls.
dspy.configure(lm=lm)

## 2. Load and preprocess the datasets

In [3]:
from datasets import load_dataset

# Load the 'answerable' configuration of both datasets.
# NOTE(review): the training split comes from 'bdsaglam/musique-2hop' while the
# validation split comes from 'bdsaglam/musique-thesis' — confirm that mixing the
# two repositories is intended.
train_ds = load_dataset('bdsaglam/musique-2hop', 'answerable', split='train')
val_ds = load_dataset('bdsaglam/musique-thesis', 'answerable', split='validation')
# Display both dataset summaries (features and row counts).
train_ds, val_ds

(Dataset({
     features: ['id', 'paragraphs', 'question', 'question_decomposition', 'answer', 'answer_aliases', 'answerable'],
     num_rows: 14376
 }),
 Dataset({
     features: ['id', 'paragraphs', 'question', 'question_decomposition', 'answer', 'answer_aliases', 'answerable'],
     num_rows: 100
 }))

In [4]:
# Peek at one raw record to see the fields that make_example reads.
train_ds[0]

{'id': '2hop__482757_12019',
 'paragraphs': [{'idx': 0,
   'title': 'Pakistan Super League',
   'paragraph_text': 'Pakistan Super League (Urdu: پاکستان سپر لیگ \u202c \u200e; PSL) is a Twenty20 cricket league, founded in Lahore on 9 September 2015 with five teams and now comprises six teams. Instead of operating as an association of independently owned teams, the league is a single entity in which each franchise is owned and controlled by investors.',
   'is_supporting': False},
  {'idx': 1,
   'title': 'Serena Wilson',
   'paragraph_text': 'Serena Wilson (August 8, 1933 – June 17, 2007), often known just as "Serena", was a well-known dancer, choreographer, and teacher who helped popularize belly dance in the United States. Serena\'s work also helped legitimize the dance form and helped it to be perceived as more than burlesque or stripping. Serena danced in clubs in her younger years, opened her own studio, hosted her own television show, founded her own dance troupe, and was the auth

In [5]:
def make_example(record):
    """Turn one raw dataset record into a `dspy.Example`.

    Only the paragraphs referenced by the record's question decomposition are
    kept, in the order of the decomposition steps.

    Args:
        record: mapping with the keys 'paragraphs', 'question_decomposition',
            'question', 'answer' and 'answer_aliases'.

    Returns:
        A `dspy.Example` with fields `question`, `paragraphs` (list of
        supporting paragraph texts) and `answers` (the answer followed by its
        aliases); `question` and `paragraphs` are marked as inputs.

    Raises:
        KeyError: if a decomposition step points at a paragraph index that is
            not present in the record.
    """
    # Index paragraphs by their 'idx' so each decomposition step can look its
    # supporting paragraph up directly.
    paragraph_by_idx = {}
    for paragraph in record['paragraphs']:
        paragraph_by_idx[paragraph['idx']] = paragraph

    supporting_texts = []
    for step in record['question_decomposition']:
        supporting = paragraph_by_idx[step['paragraph_support_idx']]
        supporting_texts.append(supporting['paragraph_text'])

    gold_answers = [record['answer']] + list(record['answer_aliases'])

    example = dspy.Example(
        question=record['question'],
        paragraphs=supporting_texts,
        answers=gold_answers,
    )
    return example.with_inputs('question', 'paragraphs')

In [6]:
# Number of training records used for prompt optimization.
TRAIN_SIZE = 100

# Convert only the first TRAIN_SIZE training records instead of converting the
# whole training split and discarding everything after the slice.
trainset = [make_example(train_ds[i]) for i in range(min(TRAIN_SIZE, len(train_ds)))]
valset = [make_example(record) for record in val_ds]
# Show the first converted example as a plain dict.
dict(trainset[0])

{'question': 'When was the institute that owned The Collegian founded?',
 'paragraphs': ['The Collegian is the bi-weekly official student publication of Houston Baptist University in Houston, Texas. It was founded in 1963 as a newsletter, and adopted the newspaper format in 1990.',
  "Several private institutions of higher learning—ranging from liberal arts colleges, such as The University of St. Thomas, Houston's only Catholic university, to Rice University, the nationally recognized research university—are located within the city. Rice, with a total enrollment of slightly more than 6,000 students, has a number of distinguished graduate programs and research institutes, such as the James A. Baker Institute for Public Policy. Houston Baptist University, affiliated with the Baptist General Convention of Texas, offers bachelor's and graduate degrees. It was founded in 1960 and is located in the Sharpstown area in Southwest Houston."],
 'answers': ['1960']}

## 4. Build the Pipeline

In [7]:
# Signature for the decomposition step: takes a question and produces an
# enumerated list of sub-questions as a single string.
# The class docstring and the field description below are left untouched because
# they are part of the signature definition, not plain documentation.
# NOTE(review): the output description mentions "'#n >>' notation" while the
# docstring example and SimplifiedBaleen.forward use plain '#n' references —
# confirm which notation is intended.
class DecomposeQuestion(dspy.Signature):
    """Decompose a complex question into simpler (usually 2, 3 or 4) sub-questions . Example:
Where did the player who scored the most points in a NBA season go in the NBA Draft?
1) Who has the most points in a NBA season?
2) Where did #1 go in the NBA draft?
"""

    # The complex question to decompose.
    question: str = dspy.InputField()
    # Raw decomposition text; parsed into individual sub-questions by split_subquestions.
    decomposition: str = dspy.OutputField(
        desc="Enumerated list of sub-questions, using '#n >>' notation for dependent questions"
    )

In [8]:
# Matches a leading enumeration marker such as "1) ", "(2) " or "3. ".
# The marker must be followed by whitespace or the end of the line so that text
# like "1.5 million" is not mistaken for a numbered item.
_ENUMERATION_PREFIX = re.compile(r"^\(?\d+[).](?:\s+|$)")


def split_subquestions(decomposition_str):
    """Yield the sub-questions contained in an enumerated decomposition.

    Each non-blank line is treated as one sub-question. A leading enumeration
    marker ("1) ", "(1) " or "1. ") is removed; everything else on the line is
    kept verbatim, so parentheses inside the question text are preserved.

    Args:
        decomposition_str: newline-separated decomposition text; may be empty
            or None.

    Yields:
        The stripped text of each sub-question, in order. Lines that are blank
        or that contain only an enumeration marker are skipped.
    """
    if not decomposition_str:
        return

    for raw_line in decomposition_str.split("\n"):
        line = raw_line.strip()
        if not line:
            continue
        # Strip the marker only at the start of the line; splitting on the
        # first ") " anywhere would cut questions such as "Who wrote X (1965) ...".
        sub_question = _ENUMERATION_PREFIX.sub("", line, count=1).strip()
        if sub_question:
            yield sub_question

In [9]:
# Sanity check: the enumeration markers should be removed and the '#1'
# reference to the first answer should be left in place.
list(split_subquestions("""
1) Who has the most points in a NBA season?
2) Where did #1 go in the NBA draft?
""".strip()))

['Who has the most points in a NBA season?',
 'Where did #1 go in the NBA draft?']

In [10]:
# Signature for a single answering step. In SimplifiedBaleen.forward, `context`
# receives one paragraph and `question` receives one sub-question.
# The docstring and field descriptions are left untouched because they are part
# of the signature definition.
class GenerateAnswer(dspy.Signature):
    """Answer question based on the given context."""
    context = dspy.InputField(desc="The context to use for answering the question.")
    question = dspy.InputField()
    answer = dspy.OutputField(desc="The factual answer to the question.")

In [11]:
class SimplifiedBaleen(dspy.Module):
    """Multi-hop QA module: decompose the question, then answer it hop by hop.

    The question is decomposed into sub-questions with `DecomposeQuestion`.
    Sub-question k is answered against paragraph k with `GenerateAnswer`, after
    every '#n' reference to an earlier step has been replaced by that step's
    answer. The answer to the last processed sub-question is the final answer.
    """

    def __init__(self):
        super().__init__()
        self.qdecomp = dspy.Predict(DecomposeQuestion)
        self.generate_answer = dspy.ChainOfThought(GenerateAnswer)

    @staticmethod
    def _resolve_references(sub_question, sub_answers):
        """Replace each '#n' that refers to an already answered step with its answer.

        References to steps that have not been answered yet are left unchanged.
        Matching the full number keeps '#1' from being substituted inside '#10'.
        """
        def substitute(match):
            step = int(match.group(1))
            if 1 <= step <= len(sub_answers):
                return str(sub_answers[step - 1])
            return match.group(0)

        return re.sub(r"#(\d+)", substitute, sub_question)

    def forward(self, question, paragraphs):
        """Answer `question` using one supporting paragraph per hop.

        Args:
            question: the multi-hop question.
            paragraphs: supporting paragraph texts, one per expected sub-question.

        Returns:
            `dspy.Prediction` with `answer` (answer to the last processed
            sub-question, or "" if none was processed), `sub_questions` (the
            parsed decomposition) and `sub_answers` (one answer per processed
            sub-question).
        """
        # Decompose the question into sub-questions
        decomposition = self.qdecomp(question=question).decomposition
        sub_questions = list(split_subquestions(decomposition))
        # Flag decompositions whose length does not match the number of paragraphs.
        # NOTE(review): how a failed Suggest is handled depends on the DSPy
        # version and on whether assertions are activated — confirm.
        dspy.Suggest(
            len(sub_questions) == len(paragraphs),
            f"This question is composed of {len(paragraphs)} sub-questions, but you generated {len(sub_questions)} sub-questions.",
        )

        # Answer each sub-question in sequence. zip stops at the shorter of the
        # two lists, so surplus sub-questions or paragraphs are ignored.
        sub_answers = []
        for paragraph, sub_q in zip(paragraphs, sub_questions):
            # Later hops may refer to any earlier answer, not only the previous one.
            resolved_q = self._resolve_references(sub_q, sub_answers)
            pred = self.generate_answer(context=paragraph, question=resolved_q)
            sub_answers.append(pred.answer)

        # Avoid an IndexError when nothing could be parsed or answered.
        final_answer = sub_answers[-1] if sub_answers else ""
        return dspy.Prediction(answer=final_answer, sub_questions=sub_questions, sub_answers=sub_answers)

In [12]:
# Initialize the uncompiled QA module; it serves as the baseline and as the
# starting program for the optimization step.
uncompiled_qa = SimplifiedBaleen()

## 5. Define the optimization metric

In [13]:
from dspy.evaluate import answer_exact_match_str

def evaluate_answer(example, pred, trace=None):
    """Metric: compare the predicted answer with the example's accepted answers.

    Args:
        example: example exposing `answers`, the list of accepted answers.
        pred: prediction exposing `answer`.
        trace: accepted for compatibility with the metric call signature; unused.

    Returns:
        The result of `answer_exact_match_str` called with `frac=0.8`.
        NOTE(review): with frac below 1.0 this is presumably a thresholded
        overlap match rather than a strict exact match — confirm against the
        DSPy version in use.
    """
    predicted = pred.answer
    gold_answers = example.answers
    return answer_exact_match_str(predicted, gold_answers, frac=0.8)

In [14]:
# Reusable evaluator over the validation examples. return_outputs=True makes each
# call return the per-example results in addition to the aggregate score.
evaluate_qa = Evaluate(devset=valset, metric=evaluate_answer, num_threads=8, display_progress=True, return_outputs=True)

In [15]:
# Evaluate the uncompiled QA pipeline (decomposition + answering) to get a baseline.
# uncompiled_results is kept for the error analysis at the end of the notebook.
uncompiled_score, uncompiled_results = evaluate_qa(uncompiled_qa)
print(f"Uncompiled Question Answering Score: {uncompiled_score}")

  0%|          | 0/100 [00:00<?, ?it/s]

Average Metric: 60 / 100  (60.0): 100%|██████████| 100/100 [03:56<00:00,  2.36s/it]

Uncompiled Question Answering Score: 60.0





## 6. Implement the optimization process

In [None]:
from dspy.teleprompt import BootstrapFewShot, LabeledFewShot, BootstrapFewShotWithRandomSearch

# Set up the teleprompter: random search over bootstrapped few-shot demo sets,
# scored with evaluate_answer, using at most 8 bootstrapped demos.
teleprompter = BootstrapFewShotWithRandomSearch(
    metric=evaluate_answer, 
    max_bootstrapped_demos=8, 
)

# Compile and optimize the QA module
# NOTE(review): no valset is passed to compile(); the logged runs score 100
# examples per candidate, which matches len(trainset) — presumably candidates are
# selected on the training examples. Confirm this is intended.
compiled_qa = teleprompter.compile(uncompiled_qa, trainset=trainset)
# Persist the optimized program so it can be reused without recompiling.
compiled_qa.save('compiled-qa.json')

print("QA module compiled and optimized")

Going to sample between 1 and 8 traces per predictor.
Will attempt to bootstrap 16 candidate sets.


  0%|          | 0/100 [00:00<?, ?it/s]

Average Metric: 21 / 49  (42.9):  49%|████▉     | 49/100 [02:15<01:33,  1.84s/it][2m2024-10-09T19:03:13.639809Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 This question is composed of 2 sub-questions, but only 4 sub-questions were generated.. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m198[0m
Average Metric: 48.0 / 100  (48.0): 100%|██████████| 100/100 [04:02<00:00,  2.42s/it]


New best score: 48.0 for seed -3
Scores so far: [48.0]
Best score so far: 48.0


Average Metric: 22 / 47  (46.8):  47%|████▋     | 47/100 [01:43<02:22,  2.68s/it][2m2024-10-09T19:06:46.779294Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 This question is composed of 2 sub-questions, but only 3 sub-questions were generated.. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m198[0m
Average Metric: 51.0 / 100  (51.0): 100%|██████████| 100/100 [03:34<00:00,  2.15s/it]


New best score: 51.0 for seed -2
Scores so far: [48.0, 51.0]
Best score so far: 51.0


 15%|█▌        | 15/100 [02:14<12:40,  8.95s/it]


Bootstrapped 8 full traces after 16 examples in round 0.


Average Metric: 27 / 49  (55.1):  49%|████▉     | 49/100 [03:26<03:16,  3.85s/it][2m2024-10-09T19:14:16.354612Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 This question is composed of 2 sub-questions, but only 3 sub-questions were generated.. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m198[0m
Average Metric: 52.0 / 96  (54.2):  96%|█████████▌| 96/100 [06:33<00:10,  2.63s/it][2m2024-10-09T19:17:24.454317Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 This question is composed of 2 sub-questions, but only 3 sub-questions were generated.. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m198[0m
Average Metric: 54.0 / 100  (54.0): 100%|██████████| 100/100 [06:48<00:00,  4.08s/it]


New best score: 54.0 for seed -1
Scores so far: [48.0, 51.0, 54.0]
Best score so far: 54.0


 12%|█▏        | 12/100 [02:34<14:42, 10.03s/it][2m2024-10-09T19:20:15.194335Z[0m [[31m[1merror    [0m] [1mFailed to run or to evaluate example Example({'question': 'What is the literacy rate in the main city near where Guerra was assassinated?', 'paragraphs': ['But the peace in the state did not last long, the elections of 1875 caused new hostilities. Ángel Trías led a new movement against the government in June 1875 and maintained control over the government until September 18, 1875 when Donato Guerra the orchestrator of the Revolution of the North was captured. Donato Guerra was assassinated in a suburb of Chihuahua City where he was incarcerated for conspiring with Ángel Trías. During October 1875 several locations were controlled by rebel forces, but the government finally regained control on November 25, 1875.', 'The state has one city with a population exceeding one million: Ciudad Juárez. Ciudad Juárez is ranked eighth most populous city in the country and Chihuahua City 

Bootstrapped 7 full traces after 16 examples in round 0.


Average Metric: 26 / 48  (54.2):  48%|████▊     | 48/100 [03:13<02:55,  3.38s/it][2m2024-10-09T19:23:53.630471Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 This question is composed of 2 sub-questions, but only 3 sub-questions were generated.. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m198[0m
Average Metric: 54.0 / 100  (54.0): 100%|██████████| 100/100 [06:56<00:00,  4.16s/it]


Scores so far: [48.0, 51.0, 54.0, 54.0]
Best score so far: 54.0


  3%|▎         | 3/100 [05:23<2:54:20, 107.84s/it]


Bootstrapped 3 full traces after 4 examples in round 0.


Average Metric: 17 / 28  (60.7):  28%|██▊       | 28/100 [01:35<03:26,  2.87s/it][2m2024-10-09T19:34:31.992928Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 This question is composed of 2 sub-questions, but only 3 sub-questions were generated.. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m198[0m
Average Metric: 55.0 / 100  (55.0): 100%|██████████| 100/100 [04:56<00:00,  2.96s/it]


New best score: 55.0 for seed 1
Scores so far: [48.0, 51.0, 54.0, 54.0, 55.0]
Best score so far: 55.0


  1%|          | 1/100 [00:05<09:32,  5.78s/it]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 15 / 28  (53.6):  28%|██▊       | 28/100 [00:59<02:00,  1.67s/it][2m2024-10-09T19:38:57.702205Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 This question is composed of 2 sub-questions, but only 3 sub-questions were generated.. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m198[0m
Average Metric: 55.0 / 100  (55.0): 100%|██████████| 100/100 [03:28<00:00,  2.09s/it]


Scores so far: [48.0, 51.0, 54.0, 54.0, 55.0, 55.0]
Best score so far: 55.0


  9%|▉         | 9/100 [03:40<37:12, 24.54s/it]  


Bootstrapped 4 full traces after 10 examples in round 0.


Average Metric: 5 / 6  (83.3):   6%|▌         | 6/100 [01:52<15:42, 10.02s/it]   [2m2024-10-09T19:47:34.218870Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 This question is composed of 2 sub-questions, but only 3 sub-questions were generated.. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m198[0m
Average Metric: 55.0 / 100  (55.0): 100%|██████████| 100/100 [08:59<00:00,  5.40s/it]


Scores so far: [48.0, 51.0, 54.0, 54.0, 55.0, 55.0, 55.0]
Best score so far: 55.0


 11%|█         | 11/100 [08:00<1:04:48, 43.69s/it]


Bootstrapped 4 full traces after 12 examples in round 0.


Average Metric: 26 / 49  (53.1):  49%|████▉     | 49/100 [02:25<01:53,  2.23s/it][2m2024-10-09T20:04:34.723420Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 This question is composed of 2 sub-questions, but only 3 sub-questions were generated.. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m198[0m
Average Metric: 54.0 / 100  (54.0): 100%|██████████| 100/100 [04:53<00:00,  2.94s/it]


Scores so far: [48.0, 51.0, 54.0, 54.0, 55.0, 55.0, 55.0, 54.0]
Best score so far: 55.0


 11%|█         | 11/100 [01:48<14:41,  9.91s/it]


Bootstrapped 5 full traces after 12 examples in round 0.


Average Metric: 29 / 48  (60.4):  48%|████▊     | 48/100 [02:53<03:11,  3.69s/it][2m2024-10-09T20:11:45.570165Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 This question is composed of 2 sub-questions, but only 3 sub-questions were generated.. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m198[0m
Average Metric: 59.0 / 100  (59.0): 100%|██████████| 100/100 [05:52<00:00,  3.52s/it]


New best score: 59.0 for seed 5
Scores so far: [48.0, 51.0, 54.0, 54.0, 55.0, 55.0, 55.0, 54.0, 59.0]
Best score so far: 59.0


  5%|▌         | 5/100 [00:49<15:37,  9.87s/it]


Bootstrapped 2 full traces after 6 examples in round 0.


Average Metric: 50 / 100  (50.0): 100%|██████████| 100/100 [32:20<00:00, 19.41s/it]


Scores so far: [48.0, 51.0, 54.0, 54.0, 55.0, 55.0, 55.0, 54.0, 59.0, 50.0]
Best score so far: 59.0


 13%|█▎        | 13/100 [46:37<5:12:01, 215.18s/it]


Bootstrapped 6 full traces after 14 examples in round 0.


Average Metric: 14 / 28  (50.0):  28%|██▊       | 28/100 [01:42<03:19,  2.77s/it][2m2024-10-09T21:36:16.324726Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 This question is composed of 2 sub-questions, but only 3 sub-questions were generated.. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m198[0m
Average Metric: 25.0 / 48  (52.1):  48%|████▊     | 48/100 [02:54<02:51,  3.31s/it][2m2024-10-09T21:37:26.722864Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 This question is composed of 2 sub-questions, but only 3 sub-questions were generated.. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m198[0m
Average Metric: 56.0 / 100  (56.0): 100%|██████████| 100/100 [05:48<00:00,  3.48s/it]


Scores so far: [48.0, 51.0, 54.0, 54.0, 55.0, 55.0, 55.0, 54.0, 59.0, 50.0, 56.0]
Best score so far: 59.0


  9%|▉         | 9/100 [01:17<13:06,  8.65s/it]


Bootstrapped 4 full traces after 10 examples in round 0.


Average Metric: 5 / 6  (83.3):   6%|▌         | 6/100 [00:20<03:33,  2.27s/it] [2m2024-10-09T21:42:02.037658Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 This question is composed of 2 sub-questions, but only 3 sub-questions were generated.. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m198[0m
Average Metric: 21.0 / 47  (44.7):  47%|████▋     | 47/100 [02:13<02:15,  2.56s/it][2m2024-10-09T21:43:55.650013Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 This question is composed of 2 sub-questions, but only 3 sub-questions were generated.. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m198[0m
Average Metric: 52.0 / 100  (52.0): 100%|██████████| 100/100 [04:35<00:00,  2.76s/it]


Scores so far: [48.0, 51.0, 54.0, 54.0, 55.0, 55.0, 55.0, 54.0, 59.0, 50.0, 56.0, 52.0]
Best score so far: 59.0


 11%|█         | 11/100 [01:40<13:29,  9.10s/it]


Bootstrapped 8 full traces after 12 examples in round 0.


Average Metric: 5 / 6  (83.3):   6%|▌         | 6/100 [00:34<07:34,  4.83s/it] [2m2024-10-09T21:48:34.294404Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 This question is composed of 2 sub-questions, but only 3 sub-questions were generated.. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m198[0m
Average Metric: 13.0 / 28  (46.4):  28%|██▊       | 28/100 [01:56<03:27,  2.88s/it][2m2024-10-09T21:49:55.797493Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 This question is composed of 2 sub-questions, but only 3 sub-questions were generated.. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m198[0m


In [None]:
# NOTE(review): this repeats the save already performed at the end of the
# compile cell; presumably redundant — confirm and remove if so.
compiled_qa.save('compiled-qa.json')

## 7. Evaluate the pipeline

In [25]:
# Evaluate the compiled QA module
# NOTE(review): the stored output of this cell reports 10 examples and also prints
# an "Uncompiled QA Module Score" line that this code does not produce, so it
# appears to come from an earlier version of the cell — re-run before citing it.
compiled_score, compiled_results = evaluate_qa(compiled_qa)
print(f"Compiled QA Module Score: {compiled_score}")

Average Metric: 1 / 10  (10.0): 100%|██████████| 10/10 [00:12<00:00,  1.21s/it]
Uncompiled QA Module Score: 10.0
Average Metric: 3 / 10  (30.0): 100%|██████████| 10/10 [00:12<00:00,  1.25s/it]
Compiled QA Module Score: 30.0


## 8. (Optional) Error Analysis

In [None]:
def present_errors(results):
    """Print every evaluation result whose score is below 1.0.

    Args:
        results: iterable of `(example, pred, score)` triples. `example` must
            expose `question` and `answers`. Its context is read from
            `paragraphs` (the field set by make_example), falling back to
            `context` if that is absent. `pred` may lack an `answer`
            attribute, in which case None is shown.

    Returns:
        None. The report is written to standard output, one block per error
        followed by a blank line.
    """
    for example, pred, score in results:
        if not score < 1.0:
            continue

        # Examples in this notebook store their context under `paragraphs`;
        # reading `example.context` would raise AttributeError.
        context = getattr(example, "paragraphs", None)
        if context is None:
            context = getattr(example, "context", None)

        # A result row may carry a placeholder instead of a real prediction.
        predicted = getattr(pred, "answer", None)

        print(f"Question: {example.question}")
        print(f"Context: {context}")
        print(f"Groundtruth Answers: {example.answers}")
        print(f"Predicted Answer: {predicted}")
        print()

In [None]:
# List the validation examples that the baseline program got wrong.
print("Error analysis for uncompiled program:")
present_errors(uncompiled_results)

# List the validation examples that the optimized program got wrong.
print("Error analysis for compiled program:")
present_errors(compiled_results)