# DSPy Question Answering Pipeline

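This notebook builds a DSPy chain-of-thought question-answering program, measures its baseline accuracy on the `bdsaglam/musique-mini` validation split, and then optimizes its few-shot prompt with `BootstrapFewShotWithRandomSearch`.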

## 1. Set up the environment

In [1]:
import os
import random
import numpy as np
import pandas as pd
from tqdm import tqdm

import dspy
from dspy.evaluate import Evaluate

In [2]:
from bellem.utils import set_seed
# Seed once, before any data loading or LM calls.
# NOTE(review): `set_seed` comes from bellem.utils; which RNGs it seeds is not visible here — confirm.
set_seed(89)

In [3]:
# Language model client handed to dspy.configure below.
# The endpoint URL and API key are read from the environment (OPENAI_BASE_URL /
# OPENAI_API_KEY) rather than hard-coded; os.getenv returns None when a variable is unset.
lm = dspy.LM(
    "openai/llama-3-70b-tgi",
    temperature=0.1,
    cache=False,  # NOTE(review): presumably disables response caching so every call reaches the endpoint — confirm
    api_base=os.getenv("OPENAI_BASE_URL"),
    api_key=os.getenv("OPENAI_API_KEY"),
)
# Make `lm` the LM in DSPy's global configuration.
dspy.configure(lm=lm)

## 2. Load and preprocess the datasets

In [4]:
from datasets import load_dataset

# Load the 'answerable' configuration of musique-mini, one split after the other
# (train first, then validation), and display both dataset summaries.
train_ds, val_ds = (
    load_dataset('bdsaglam/musique-mini', 'answerable', split=split_name)
    for split_name in ('train', 'validation')
)
train_ds, val_ds

(Dataset({
     features: ['id', 'paragraphs', 'question', 'question_decomposition', 'answer', 'answer_aliases', 'answerable', 'n_hops'],
     num_rows: 300
 }),
 Dataset({
     features: ['id', 'paragraphs', 'question', 'question_decomposition', 'answer', 'answer_aliases', 'answerable', 'n_hops'],
     num_rows: 300
 }))

In [5]:
def format_paragraph(paragraph):
    """Render one paragraph record as a heading line followed by its body text.

    Args:
        paragraph: Mapping with 'paragraph_text' and 'title' entries.

    Returns:
        A string of the form "# <title>", a newline, then "<paragraph_text>".
    """
    body = paragraph['paragraph_text']
    heading = paragraph['title']
    heading_line = f"# {heading}"
    return f"{heading_line}\n{body}"

def make_example(record):
    """Turn one dataset record into a dspy.Example for question answering.

    Only paragraphs flagged 'is_supporting' are kept; each is rendered with
    format_paragraph and the results are joined with blank lines to form the
    context. The example carries the primary answer plus a list holding the
    primary answer followed by every alias, and marks 'question' and
    'context' as its input fields.
    """
    supporting_paragraphs = []
    for candidate in record['paragraphs']:
        if candidate['is_supporting']:
            supporting_paragraphs.append(candidate)

    rendered = map(format_paragraph, supporting_paragraphs)
    example = dspy.Example(
        question=record['question'],
        context="\n\n".join(rendered),
        answer=record['answer'],
        answers=[record['answer'], *record['answer_aliases']],
    )
    return example.with_inputs('question', 'context')

In [6]:
# Convert every record of both splits into a dspy.Example, then show the sizes.
trainset = list(map(make_example, train_ds))
valset = list(map(make_example, val_ds))
len(trainset), len(valset)

(300, 300)

## 3. Define Signatures

In [1]:
# Signature for the QA step: inputs `context` and `question`, output `answer`.
# The class docstring is prompt text, not documentation: the compiled-program dump
# in the optimization output shows it as the signature's `instructions`. Edit it
# (and the `desc` strings) only when you intend to change the prompt.
# This cell needs the setup cell (`import dspy`) to have run first; the NameError
# recorded in its output comes from executing it before `dspy` was imported.
class GenerateAnswer(dspy.Signature):
    """Answer the question based on the given context."""
    context = dspy.InputField(desc="may contain relevant facts")
    question = dspy.InputField()
    answer = dspy.OutputField(desc="often between 1 and 5 words")

NameError: name 'dspy' is not defined

## 4. Build the Pipeline

In [8]:
# Baseline (uncompiled) QA program: the GenerateAnswer signature wrapped in
# dspy.ChainOfThought. The optimizer output further down shows the resulting
# signature as `context, question -> reasoning, answer`.
uncompiled_program = dspy.ChainOfThought(GenerateAnswer)

## 5. Define the optimization metric

In [9]:
from bellem.musique.eval import calculate_metrics

def compute_scores(results):
    """Aggregate evaluation results into the metrics from calculate_metrics.

    Args:
        results: Iterable of (example, pred, score) triples. The per-example
            score is ignored here.

    Returns:
        Whatever calculate_metrics returns for a DataFrame holding one row per
        example: the example's own fields plus a 'predicted_answer' column
        taken from pred.answer.
    """
    rows = []
    for example, pred, _score in results:
        row = dict(example)
        row["predicted_answer"] = pred.answer
        rows.append(row)
    frame = pd.DataFrame(rows)
    return calculate_metrics(frame)

Using the latest cached version of the module from /home/pc/.cache/huggingface/modules/evaluate_modules/metrics/bdsaglam--musique/9f409241d4cc6ea7853124e79cf44954a75900a0a2c0b9d20b909c2396f6b071 (last modified on Tue Jul 23 21:54:03 2024) since it couldn't be found locally at bdsaglam--musique, or remotely on the Hugging Face Hub.


In [10]:
from dspy.evaluate import answer_exact_match_str

def evaluate_answer(example, pred, trace=None):
    """Metric: compare the predicted answer with the example's accepted answers.

    Args:
        example: Object exposing `answers`, the list of accepted answers.
        pred: Object exposing `answer`, the predicted string.
        trace: Accepted for the metric call signature; not used.

    Returns:
        The result of answer_exact_match_str(pred.answer, example.answers).
    """
    predicted = pred.answer
    accepted = example.answers
    return answer_exact_match_str(predicted, accepted)

In [11]:
# Shared evaluator: runs a program over the validation examples and scores each
# prediction with evaluate_answer. Reused for both the baseline and the compiled program.
evaluate_program = Evaluate(
    metric=evaluate_answer,
    devset=valset,
    num_threads=32,  # worker threads requested for evaluation
    display_progress=True,
    return_outputs=True,  # later cells unpack the call result as (score, results)
)

In [12]:
# Evaluate the uncompiled question answering program on the validation set (baseline).
# The call result is unpacked as (aggregate score, per-example results); the
# per-example results feed compute_scores here and the error analysis later.
uncompiled_score, uncompiled_results = evaluate_program(uncompiled_program)
print("Uncompiled Question Answering Scores")
compute_scores(uncompiled_results)

{'exact_match': 0.47,
 'f1': 0.5911166818682684,
 'fuzzy_match': 0.5266666666666666}

## 6. Implement the optimization process

In [14]:
from dspy.teleprompt import BootstrapFewShotWithRandomSearch

# Keep the optimizer class in its own variable: its name is reused for the saved
# filename below and for the score heading printed after optimization.
teleprompter_cls = BootstrapFewShotWithRandomSearch
teleprompter = teleprompter_cls(
    metric=evaluate_answer,  # same exact-match metric used by evaluate_program
    max_labeled_demos=4,
    max_bootstrapped_demos=4,
)

# Long-running step: the recorded log reports 19 candidate programs, each scored
# on 300 examples at roughly 6-28 minutes per pass.
# NOTE(review): no valset is passed, so candidates are presumably scored on
# `trainset` itself — confirm against the DSPy optimizer documentation.
compiled_program = teleprompter.compile(uncompiled_program, trainset=trainset)
# Save the optimized program to a JSON file in the current working directory;
# with this optimizer the name is
# "compiled-qa-cot-bootstrapfewshotwithrandomsearch.json".
compiled_program_filename = f"compiled-qa-cot-{teleprompter_cls.__name__.lower()}.json"
compiled_program.save(compiled_program_filename)

Going to sample between 1 and 4 traces per predictor.
Will attempt to bootstrap 16 candidate sets.


Average Metric: 153 / 300  (51.0): 100%|██████████| 300/300 [08:12<00:00,  1.64s/it]


New best score: 51.0 for seed -3
Scores so far: [51.0]
Best score so far: 51.0


Average Metric: 180 / 300  (60.0): 100%|██████████| 300/300 [07:29<00:00,  1.50s/it]


New best score: 60.0 for seed -2
Scores so far: [51.0, 60.0]
Best score so far: 60.0


  3%|▎         | 8/300 [00:25<15:47,  3.24s/it]


Bootstrapped 4 full traces after 9 examples in round 0.


Average Metric: 193 / 300  (64.3): 100%|██████████| 300/300 [06:38<00:00,  1.33s/it]


New best score: 64.33 for seed -1
Scores so far: [51.0, 60.0, 64.33]
Best score so far: 64.33


  3%|▎         | 10/300 [00:30<14:47,  3.06s/it]


Bootstrapped 4 full traces after 11 examples in round 0.


Average Metric: 179 / 300  (59.7): 100%|██████████| 300/300 [06:41<00:00,  1.34s/it]


Scores so far: [51.0, 60.0, 64.33, 59.67]
Best score so far: 64.33


  1%|          | 3/300 [00:08<14:21,  2.90s/it]


Bootstrapped 2 full traces after 4 examples in round 0.


Average Metric: 174 / 300  (58.0): 100%|██████████| 300/300 [06:44<00:00,  1.35s/it]


Scores so far: [51.0, 60.0, 64.33, 59.67, 58.0]
Best score so far: 64.33


  1%|          | 2/300 [00:05<14:33,  2.93s/it]


Bootstrapped 1 full traces after 3 examples in round 0.


Average Metric: 182 / 300  (60.7): 100%|██████████| 300/300 [08:49<00:00,  1.77s/it]


Scores so far: [51.0, 60.0, 64.33, 59.67, 58.0, 60.67]
Best score so far: 64.33


  2%|▏         | 5/300 [00:13<13:00,  2.65s/it]


Bootstrapped 2 full traces after 6 examples in round 0.


Average Metric: 187 / 300  (62.3): 100%|██████████| 300/300 [06:24<00:00,  1.28s/it]


Scores so far: [51.0, 60.0, 64.33, 59.67, 58.0, 60.67, 62.33]
Best score so far: 64.33


  1%|          | 2/300 [00:04<11:31,  2.32s/it]


Bootstrapped 2 full traces after 3 examples in round 0.


Average Metric: 178 / 300  (59.3): 100%|██████████| 300/300 [06:34<00:00,  1.32s/it]


Scores so far: [51.0, 60.0, 64.33, 59.67, 58.0, 60.67, 62.33, 59.33]
Best score so far: 64.33


  1%|          | 3/300 [00:08<14:33,  2.94s/it]


Bootstrapped 3 full traces after 4 examples in round 0.


Average Metric: 188 / 300  (62.7): 100%|██████████| 300/300 [07:48<00:00,  1.56s/it]


Scores so far: [51.0, 60.0, 64.33, 59.67, 58.0, 60.67, 62.33, 59.33, 62.67]
Best score so far: 64.33


  0%|          | 1/300 [00:02<12:53,  2.59s/it]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 175 / 300  (58.3): 100%|██████████| 300/300 [07:06<00:00,  1.42s/it]


Scores so far: [51.0, 60.0, 64.33, 59.67, 58.0, 60.67, 62.33, 59.33, 62.67, 58.33]
Best score so far: 64.33


  1%|▏         | 4/300 [00:11<13:38,  2.77s/it]


Bootstrapped 3 full traces after 5 examples in round 0.


Average Metric: 184 / 300  (61.3): 100%|██████████| 300/300 [07:03<00:00,  1.41s/it]


Scores so far: [51.0, 60.0, 64.33, 59.67, 58.0, 60.67, 62.33, 59.33, 62.67, 58.33, 61.33]
Best score so far: 64.33


  2%|▏         | 6/300 [00:14<12:07,  2.47s/it]


Bootstrapped 2 full traces after 7 examples in round 0.


Average Metric: 184 / 300  (61.3): 100%|██████████| 300/300 [06:49<00:00,  1.37s/it]


Scores so far: [51.0, 60.0, 64.33, 59.67, 58.0, 60.67, 62.33, 59.33, 62.67, 58.33, 61.33, 61.33]
Best score so far: 64.33


  3%|▎         | 10/300 [00:28<13:32,  2.80s/it]


Bootstrapped 4 full traces after 11 examples in round 0.


Average Metric: 184 / 300  (61.3): 100%|██████████| 300/300 [08:01<00:00,  1.60s/it]


Scores so far: [51.0, 60.0, 64.33, 59.67, 58.0, 60.67, 62.33, 59.33, 62.67, 58.33, 61.33, 61.33, 61.33]
Best score so far: 64.33


  0%|          | 1/300 [00:02<13:46,  2.76s/it]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 175 / 300  (58.3): 100%|██████████| 300/300 [08:50<00:00,  1.77s/it]


Scores so far: [51.0, 60.0, 64.33, 59.67, 58.0, 60.67, 62.33, 59.33, 62.67, 58.33, 61.33, 61.33, 61.33, 58.33]
Best score so far: 64.33


  2%|▏         | 6/300 [00:19<16:06,  3.29s/it]


Bootstrapped 4 full traces after 7 examples in round 0.


Average Metric: 185 / 300  (61.7): 100%|██████████| 300/300 [07:09<00:00,  1.43s/it]


Scores so far: [51.0, 60.0, 64.33, 59.67, 58.0, 60.67, 62.33, 59.33, 62.67, 58.33, 61.33, 61.33, 61.33, 58.33, 61.67]
Best score so far: 64.33


  1%|▏         | 4/300 [00:10<13:12,  2.68s/it]


Bootstrapped 4 full traces after 5 examples in round 0.


Average Metric: 181 / 300  (60.3): 100%|██████████| 300/300 [08:19<00:00,  1.66s/it]


Scores so far: [51.0, 60.0, 64.33, 59.67, 58.0, 60.67, 62.33, 59.33, 62.67, 58.33, 61.33, 61.33, 61.33, 58.33, 61.67, 60.33]
Best score so far: 64.33


  1%|▏         | 4/300 [00:13<16:38,  3.37s/it]


Bootstrapped 3 full traces after 5 examples in round 0.


Average Metric: 181 / 300  (60.3): 100%|██████████| 300/300 [28:17<00:00,  5.66s/it]


Scores so far: [51.0, 60.0, 64.33, 59.67, 58.0, 60.67, 62.33, 59.33, 62.67, 58.33, 61.33, 61.33, 61.33, 58.33, 61.67, 60.33, 60.33]
Best score so far: 64.33


  1%|▏         | 4/300 [00:16<20:36,  4.18s/it]


Bootstrapped 1 full traces after 5 examples in round 0.


Average Metric: 186 / 300  (62.0): 100%|██████████| 300/300 [12:39<00:00,  2.53s/it]


Scores so far: [51.0, 60.0, 64.33, 59.67, 58.0, 60.67, 62.33, 59.33, 62.67, 58.33, 61.33, 61.33, 61.33, 58.33, 61.67, 60.33, 60.33, 62.0]
Best score so far: 64.33


  1%|▏         | 4/300 [00:29<36:06,  7.32s/it]


Bootstrapped 2 full traces after 5 examples in round 0.


Average Metric: 186 / 300  (62.0): 100%|██████████| 300/300 [12:02<00:00,  2.41s/it]


Scores so far: [51.0, 60.0, 64.33, 59.67, 58.0, 60.67, 62.33, 59.33, 62.67, 58.33, 61.33, 61.33, 61.33, 58.33, 61.67, 60.33, 60.33, 62.0, 62.0]
Best score so far: 64.33
19 candidate programs found.
[('self', Predict(StringSignature(context, question -> reasoning, answer
    instructions='Answer the question based on the given context.'
    context = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Context:', 'desc': '${context}'})
    question = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Question:', 'desc': '${question}'})
    reasoning = Field(annotation=str required=True json_schema_extra={'prefix': "Reasoning: Let's think step by step in order to", 'desc': '${reasoning}', '__dspy_field_type': 'output'})
    answer = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'output', 'prefix': 'Answer:', 'desc': '${answer}'})
)))]


Average Metric: 178 / 300  (59.3): 100%|██████████| 300/300 [06:45<00:00,  1.35s/it]


BootstrapFewShotWithRandomSearch Compiled Question Answering Scores
{'exact_match': 0.5933333333333334, 'f1': 0.7032057090512973, 'fuzzy_match': 0.6566666666666666}


In [None]:
# Score the optimized program with the same evaluator (validation set, exact-match
# metric) that was used for the baseline, so the two score dicts are comparable.
# The aggregate score is bound to a named variable, mirroring `uncompiled_score`,
# instead of `_`: assigning to `_` in IPython shadows the "last output" variable.
compiled_score, compiled_results = evaluate_program(compiled_program)
print(f"{teleprompter_cls.__name__} Compiled Question Answering Scores")
print(compute_scores(compiled_results))

## 7. Error Analysis

In [27]:
def present_errors(results, threshold=0.9):
    """Print every evaluated example whose score falls below ``threshold``.

    Args:
        results: Iterable of ``(example, pred, score)`` triples. ``example``
            must expose ``question``, ``context`` and ``answers``; ``pred``
            must expose ``answer``; ``score`` must be convertible with
            ``float`` (booleans work: False -> 0.0, True -> 1.0).
        threshold: Scores strictly below this value count as errors. Defaults
            to 0.9, the cut-off that used to be hard-coded, so existing calls
            behave exactly as before.

    Returns:
        None. For each error, the question, context, ground-truth answers and
        predicted answer are printed, followed by a line of 80 '=' characters.
    """
    # Collect first, print second: a score that cannot be converted to float
    # raises before any output is produced.
    errors = [(example, pred) for example, pred, score in results if float(score) < threshold]
    for example, pred in errors:
        print(f"Question: {example.question}")
        print(f"Context: {example.context}")
        print(f"Groundtruth Answers: {example.answers}")
        print(f"Predicted Answer: {pred.answer}")
        print('='*80)

In [None]:
# Print every validation example the baseline program scored below the error
# threshold, using the per-example results from the baseline evaluation.
print("Error analysis for uncompiled program\n\n")
present_errors(uncompiled_results)

In [None]:
# Same listing for the optimized program; requires the compiled-program
# evaluation cell to have run so that `compiled_results` exists.
print("Error analysis for compiled program:")
present_errors(compiled_results)

## Inspect a single prediction and the last LM call

In [None]:
# Spot-check one example: run the compiled program on it and display the accepted
# answers next to the prediction.
# Note that the example is taken from `trainset`, the same data the optimizer
# was compiled on, not from the validation set.
i = 1
example = trainset[i]
pred = compiled_program(context=example.context, question=example.question)
example.answers, pred.answer

In [None]:
# Show the most recent entry of the LM's call history — presumably the call made
# for the prediction above when the cells are run in order.
lm.history[-1]