# DSPy Question Answering Pipeline

This notebook implements a DSPy pipeline for optimizing question answering prompts.

## 1. Set up the environment

In [1]:
import os
import random
import numpy as np
import pandas as pd
from tqdm import tqdm

import dspy
from dspy.evaluate import Evaluate

In [2]:
from bellek.utils import set_seed
set_seed(89)

In [3]:
# Build the language-model client for this notebook.
# The endpoint URL and API key are read from environment variables, so no
# credentials live in the notebook; os.getenv yields None if a variable is unset.
# NOTE(review): the "openai/" prefix presumably selects an OpenAI-compatible
# endpoint serving the model — confirm against the deployment.
lm = dspy.LM(
    "openai/llama-3-70b-tgi",
    temperature=0.1,
    cache=False,  # response caching is switched off for this client
    api_base=os.getenv("OPENAI_BASE_URL"),
    api_key=os.getenv("OPENAI_API_KEY"),
)
# Register the client in DSPy's global configuration.
dspy.configure(lm=lm)

## 2. Load and preprocess the datasets

In [4]:
from datasets import load_dataset

# Load the train and validation splits of the 'answerable' configuration of
# the bdsaglam/musique-mini dataset.
train_ds = load_dataset('bdsaglam/musique-mini', 'answerable', split='train')
val_ds = load_dataset('bdsaglam/musique-mini', 'answerable', split='validation')
# Show both Dataset objects as the cell output.
train_ds, val_ds

(Dataset({
     features: ['id', 'paragraphs', 'question', 'question_decomposition', 'answer', 'answer_aliases', 'answerable', 'n_hops'],
     num_rows: 300
 }),
 Dataset({
     features: ['id', 'paragraphs', 'question', 'question_decomposition', 'answer', 'answer_aliases', 'answerable', 'n_hops'],
     num_rows: 300
 }))

In [5]:
def format_paragraph(paragraph):
    """Render one paragraph record as a titled text section.

    Args:
        paragraph: mapping that provides the keys 'paragraph_text' and 'title'.

    Returns:
        A string made of a '# <title>' heading line followed by the
        paragraph text on the next line.
    """
    body = paragraph['paragraph_text']
    heading = paragraph['title']
    return "# {}\n{}".format(heading, body)

def make_example(record):
    """Convert one dataset record into a DSPy example.

    Only paragraphs flagged with 'is_supporting' contribute to the context;
    each is rendered with format_paragraph and the pieces are separated by a
    blank line.

    Args:
        record: mapping with the keys 'paragraphs', 'question', 'answer' and
            'answer_aliases'.

    Returns:
        A dspy.Example carrying question, context, answer and answers (the
        answer followed by its aliases), with 'question' and 'context'
        declared as the input fields.
    """
    formatted_paragraphs = [
        format_paragraph(paragraph)
        for paragraph in record['paragraphs']
        if paragraph['is_supporting']
    ]
    gold_answer = record['answer']
    example = dspy.Example(
        question=record['question'],
        context="\n\n".join(formatted_paragraphs),
        answer=gold_answer,
        answers=[gold_answer, *record['answer_aliases']],
    )
    return example.with_inputs('question', 'context')

In [6]:
# Turn every dataset record into a DSPy example, one list per split.
trainset = list(map(make_example, train_ds))
valset = list(map(make_example, val_ds))
# Display the size of each split as the cell output.
len(trainset), len(valset)

(300, 300)

## 3. Define Signatures

In [7]:
# Signature for the single-step QA task: two input fields (context, question)
# and one output field (answer).
# NOTE(review): DSPy presumably feeds the class docstring and the `desc`
# strings into the prompt, so treat them as runtime text rather than plain
# documentation — confirm against the DSPy signature docs before editing them.
class GenerateAnswer(dspy.Signature):
    """Answer the question based on the given context."""
    context = dspy.InputField(desc="may contain relevant facts")
    question = dspy.InputField()
    answer = dspy.OutputField(desc="often between 1 and 5 words")

## 4. Build the Pipeline

In [8]:
class QAModule(dspy.Module):
    """Single-step question answering program built on GenerateAnswer."""

    def __init__(self):
        super().__init__()
        # One predictor bound to the GenerateAnswer signature.
        self.generate_answer = dspy.Predict(GenerateAnswer)

    def forward(self, context, question):
        """Answer `question` from `context`.

        Returns:
            A dspy.Prediction holding the context that was passed in and the
            answer produced by the predictor.
        """
        answer = self.generate_answer(context=context, question=question).answer
        return dspy.Prediction(context=context, answer=answer)

In [9]:
# Instantiate the baseline (uncompiled) QA module; it is evaluated as-is
# below and later handed to the optimizer's compile() call.
uncompiled_program = QAModule()

## 5. Define the optimization metric

In [10]:
from bellek.musique.eval import calculate_metrics

def compute_scores(results):
    """Aggregate answer metrics over a list of evaluation outputs.

    Args:
        results: iterable of (example, pred, score) triples. The per-example
            score is ignored here; only the example fields and pred.answer
            are used.

    Returns:
        Whatever calculate_metrics returns for a DataFrame with one row per
        example: the example's fields plus a 'predicted_answer' column.
    """
    rows = []
    for example, pred, _score in results:
        row = dict(example)
        row["predicted_answer"] = pred.answer
        rows.append(row)
    return calculate_metrics(pd.DataFrame(rows))

Using the latest cached version of the module from /home/pc/.cache/huggingface/modules/evaluate_modules/metrics/bdsaglam--musique/9f409241d4cc6ea7853124e79cf44954a75900a0a2c0b9d20b909c2396f6b071 (last modified on Tue Jul 23 21:54:03 2024) since it couldn't be found locally at bdsaglam--musique, or remotely on the Hugging Face Hub.


In [11]:
from dspy.evaluate import answer_exact_match_str

def evaluate_answer(example, pred, trace=None):
    """Metric used for both evaluation and optimization.

    Args:
        example: object exposing `answers`, the list of accepted answers.
        pred: object exposing `answer`, the predicted answer.
        trace: accepted for signature compatibility; not used.

    Returns:
        The result of answer_exact_match_str for the predicted answer
        against the accepted answers.
    """
    predicted = pred.answer
    accepted = example.answers
    return answer_exact_match_str(predicted, accepted)

In [12]:
# Evaluator reused for the uncompiled and the compiled program: it scores a
# program on `valset` with the `evaluate_answer` metric.
evaluate_program = Evaluate(
    metric=evaluate_answer,
    devset=valset,
    num_threads=32,
    display_progress=True,
    return_outputs=True,  # callers below unpack the result as (score, outputs)
)

In [13]:
# Evaluate the uncompiled question answering module on the validation set
uncompiled_score, uncompiled_results = evaluate_program(uncompiled_program)
print("Uncompiled Question Answering Scores")
# Last expression: the metrics dict is shown as the cell output.
compute_scores(uncompiled_results)

Average Metric: 164 / 300  (54.7): 100%|██████████| 300/300 [02:34<00:00,  1.95it/s]


Uncompiled Question Answering Scores


{'exact_match': 0.5466666666666666,
 'f1': 0.6624913974913975,
 'fuzzy_match': 0.6033333333333334}

## 6. Implement the optimization process

In [14]:
from dspy.teleprompt import LabeledFewShot, BootstrapFewShot, BootstrapFewShotWithRandomSearch

In [None]:
# Optimizer class to use; LabeledFewShot and BootstrapFewShot are imported
# alongside it as alternatives.
# NOTE(review): the keyword arguments below may not all be accepted by the
# other optimizer classes — confirm before switching teleprompter_cls.
teleprompter_cls = BootstrapFewShotWithRandomSearch

teleprompter = teleprompter_cls(
    metric=evaluate_answer,
    max_bootstrapped_demos=8, 
    max_labeled_demos=8,
    num_threads=4,
    num_candidate_programs=10
)

# Only the training set is passed to compile(); `valset` is reserved for the
# evaluate_program runs. The recorded output shows each candidate evaluation
# taking several minutes, so this cell is slow to re-run.
compiled_program = teleprompter.compile(uncompiled_program, trainset=trainset)
# The file name embeds the lower-cased optimizer class name.
compiled_program_filename = f"compiled-qa-standard-{teleprompter_cls.__name__.lower()}.json"
compiled_program.save(compiled_program_filename)

Going to sample between 1 and 8 traces per predictor.
Will attempt to bootstrap 10 candidate sets.


  0%|          | 0/300 [00:00<?, ?it/s]

Average Metric: 176 / 300  (58.7): 100%|██████████| 300/300 [09:50<00:00,  1.97s/it]


New best score: 58.67 for seed -3
Scores so far: [58.67]
Best score so far: 58.67


Average Metric: 188 / 300  (62.7): 100%|██████████| 300/300 [19:59<00:00,  4.00s/it]


New best score: 62.67 for seed -2
Scores so far: [58.67, 62.67]
Best score so far: 62.67


  4%|▎         | 11/300 [00:56<24:55,  5.17s/it]


Bootstrapped 8 full traces after 12 examples in round 0.


Average Metric: 184 / 300  (61.3): 100%|██████████| 300/300 [12:57<00:00,  2.59s/it]


Scores so far: [58.67, 62.67, 61.33]
Best score so far: 62.67


  4%|▍         | 12/300 [01:04<25:58,  5.41s/it]


Bootstrapped 7 full traces after 13 examples in round 0.


Average Metric: 185 / 300  (61.7): 100%|██████████| 300/300 [15:09<00:00,  3.03s/it]


Scores so far: [58.67, 62.67, 61.33, 61.67]
Best score so far: 62.67


  1%|▏         | 4/300 [00:22<27:39,  5.60s/it]


Bootstrapped 3 full traces after 5 examples in round 0.


Average Metric: 187 / 300  (62.3): 100%|██████████| 300/300 [18:37<00:00,  3.72s/it]


Scores so far: [58.67, 62.67, 61.33, 61.67, 62.33]
Best score so far: 62.67


  1%|          | 2/300 [00:10<25:54,  5.22s/it]


Bootstrapped 1 full traces after 3 examples in round 0.


Average Metric: 108 / 163  (66.3):  54%|█████▍    | 163/300 [13:27<23:24, 10.25s/it]

In [19]:
# Score the optimized program on the validation set.
# The aggregate score is bound to a real name instead of `_`: assigning `_`
# in IPython shadows the built-in "last output" variable for the rest of the
# session, and a named score mirrors the `uncompiled_score` cell above.
compiled_score, compiled_results = evaluate_program(compiled_program)
print(f"{teleprompter_cls.__name__} Compiled Question Answering Scores")
print(compute_scores(compiled_results))

Average Metric: 187 / 300  (62.3): 100%|██████████| 300/300 [10:45<00:00,  2.15s/it]


BootstrapFewShotWithRandomSearch Compiled Question Answering Scores
{'exact_match': 0.6233333333333333, 'f1': 0.7193650645650645, 'fuzzy_match': 0.6766666666666666}


## 7. Error Analysis

In [27]:
def present_errors(results, threshold=0.9):
    """Print every evaluated example whose score is below `threshold`.

    Args:
        results: iterable of (example, pred, score) triples. `example` must
            expose `question`, `context` and `answers`; `pred` must expose
            `answer`; `score` must be convertible with float() (booleans work).
        threshold: scores strictly below this value are reported as errors.
            The default of 0.9 keeps the original behavior; lower it for
            graded metrics where partial credit should not count as an error.

    Returns:
        None. The report is written to stdout, one block per error, each
        terminated by a line of 80 '=' characters.
    """
    # Collect first so that a malformed score fails before anything is printed.
    errors = [
        (example, pred)
        for example, pred, score in results
        if float(score) < threshold
    ]
    separator = '=' * 80
    for example, pred in errors:
        print(f"Question: {example.question}")
        print(f"Context: {example.context}")
        print(f"Groundtruth Answers: {example.answers}")
        print(f"Predicted Answer: {pred.answer}")
        print(separator)

In [None]:
# Print a header, then every validation example that present_errors flags
# as an error for the uncompiled program.
print("Error analysis for uncompiled program\n\n")
present_errors(uncompiled_results)

In [None]:
# Print a header, then every validation example that present_errors flags
# as an error for the compiled program.
print("Error analysis for compiled program:")
present_errors(compiled_results)

## Inspect

In [None]:
# Spot-check the compiled program on a single training example.
i = 1  # index into trainset of the example to inspect
example = trainset[i]
pred = compiled_program(context=example.context, question=example.question)
# Show the accepted answers next to the predicted answer as the cell output.
example.answers, pred.answer

In [None]:
# Show the last entry of the LM's history.
# NOTE(review): presumably the call made by the prediction cell above — confirm.
lm.history[-1]