# DSPy Question Answering Pipeline

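This notebook builds a DSPy question-answering program for the `bdsaglam/musique-mini` dataset, evaluates it on the validation split, and then optimizes its few-shot demonstrations with `BootstrapFewShotWithRandomSearch`.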

## 1. Set up the environment

In [1]:
import os
import random
import numpy as np
import pandas as pd
from tqdm import tqdm

import dspy
from dspy.evaluate import Evaluate

In [2]:
# Seed the run with a fixed value (89) before any model or data work happens.
# NOTE(review): which random number generators `set_seed` covers is defined in
# bellek.utils, not in this notebook — confirm it seeds everything this run uses.
from bellek.utils import set_seed
set_seed(89)

In [3]:
# Keyword settings for the language-model client. The endpoint and API key are
# read from the environment, so no credentials are written into the notebook.
lm_options = dict(
    temperature=0.1,
    cache=False,
    api_base=os.getenv("OPENAI_BASE_URL"),
    api_key=os.getenv("OPENAI_API_KEY"),
)

# Build the client and register it as the LM that DSPy is configured with.
lm = dspy.LM("openai/llama-3-70b-tgi", **lm_options)
dspy.configure(lm=lm)

## 2. Load and preprocess the datasets

In [4]:
from datasets import load_dataset

# Load the 'answerable' configuration once per split; the two calls differ only
# in the split name, so both are produced by the same expression.
train_ds, val_ds = (
    load_dataset('bdsaglam/musique-mini', 'answerable', split=split_name)
    for split_name in ('train', 'validation')
)
train_ds, val_ds

(Dataset({
     features: ['id', 'paragraphs', 'question', 'question_decomposition', 'answer', 'answer_aliases', 'answerable', 'n_hops'],
     num_rows: 300
 }),
 Dataset({
     features: ['id', 'paragraphs', 'question', 'question_decomposition', 'answer', 'answer_aliases', 'answerable', 'n_hops'],
     num_rows: 300
 }))

In [5]:
def format_paragraph(paragraph):
    """Render one paragraph record as a '# <title>' heading line followed by its text.

    Args:
        paragraph: mapping with the keys 'paragraph_text' and 'title'.

    Returns:
        A string of the form "# {title}\\n{text}".
    """
    return "# {title}\n{text}".format(
        text=paragraph['paragraph_text'],
        title=paragraph['title'],
    )

def make_example(record):
    """Turn one dataset record into a DSPy example with 'question' and 'context' as inputs.

    Only paragraphs flagged with a truthy 'is_supporting' value are kept; they
    are formatted with `format_paragraph` and joined with blank lines to form
    the context.

    Args:
        record: mapping with the keys 'paragraphs', 'question', 'answer' and
            'answer_aliases'.

    Returns:
        A `dspy.Example` carrying question, context, answer, and `answers`
        (the answer followed by its aliases).
    """
    formatted = [
        format_paragraph(paragraph)
        for paragraph in record['paragraphs']
        if paragraph['is_supporting']
    ]
    context = "\n\n".join(formatted)

    example = dspy.Example(
        question=record['question'],
        context=context,
        answer=record['answer'],
        answers=[record['answer']] + list(record['answer_aliases']),
    )
    return example.with_inputs('question', 'context')

In [6]:
# Convert every record of each split into a DSPy example.
trainset = list(map(make_example, train_ds))
valset = list(map(make_example, val_ds))
(len(trainset), len(valset))

(300, 300)

## 3. Build the module

In [7]:
class GenerateAnswer(dspy.Signature):
    """Answer the question based on the given context."""
    # NOTE(review): DSPy signatures typically feed the class docstring and each
    # field's `desc` into the prompt, so treat those strings as behavior rather
    # than plain documentation — confirm against the installed DSPy version.

    # Inputs: the context text and the question asked about it.
    context = dspy.InputField(desc="may contain relevant facts")
    question = dspy.InputField()
    # Output: the answer text.
    answer = dspy.OutputField(desc="often between 1 and 5 words")

In [8]:
import dspy
from dspy.primitives.program import Module
from dspy.signatures.signature import ensure_signature


class ConnectTheEntities(Module):
    """Predictor that asks for entity-relation triples before the signature's own outputs.

    A "triples" output field is prepended to the given signature, and the
    resulting extended signature is stored on an inner `dspy.Predict`. When
    `activated` is false, `forward` uses the plain signature instead.
    """

    def __init__(self, signature, rationale_type=None, activated=True, **config):
        """Wrap `signature` with a prepended "triples" output field.

        Args:
            signature: anything accepted by `ensure_signature`.
            rationale_type: field to use for "triples"; when falsy, a default
                `dspy.OutputField` with the triples prefix is created.
            activated: whether `forward` uses the extended signature by default.
            **config: forwarded to `dspy.Predict`.
        """
        super().__init__()

        self.activated = activated

        base_signature = ensure_signature(signature)
        self.signature = base_signature

        # Create the default "triples" field only when the caller supplied none.
        if not rationale_type:
            rationale_type = dspy.OutputField(
                prefix="Let's identify the relevant entity-relation-entity triples in the format of 'subj;relation;obj'\n",
                desc="${triples}",
            )

        # Prepend the "triples" field to the base signature.
        with_triples = base_signature.prepend("triples", rationale_type, type_=str)

        self._predict = dspy.Predict(with_triples, **config)
        self._predict.extended_signature = with_triples

    def forward(self, **kwargs):
        """Run the inner predictor.

        A "new_signature" keyword, if present, overrides the signature;
        otherwise the extended signature is used when `activated` is true and
        the plain one when it is false. Remaining keywords are passed through.
        """
        assert self.activated in (True, False)

        if self.activated:
            default_signature = self._predict.extended_signature
        else:
            default_signature = self.signature

        chosen_signature = kwargs.pop("new_signature", default_signature)
        return self._predict(signature=chosen_signature, **kwargs)

    @property
    def demos(self):
        """Demos held by the inner predictor."""
        return self._predict.demos

    @property
    def extended_signature(self):
        """The signature including the prepended "triples" field."""
        return self._predict.extended_signature

In [9]:
class QAModule(dspy.Module):
    """Question-answering program built on a single `ConnectTheEntities` predictor."""

    def __init__(self):
        """Create the predictor over the `GenerateAnswer` signature."""
        super().__init__()
        self.generate_answer = ConnectTheEntities(GenerateAnswer)

    def forward(self, context, question):
        """Answer `question` from `context`.

        Returns:
            A `dspy.Prediction` holding the input context and the generated
            answer. No other fields of the inner prediction are copied over.
        """
        inner = self.generate_answer(context=context, question=question)
        generated_answer = inner.answer
        return dspy.Prediction(context=context, answer=generated_answer)

In [10]:
# Instantiate the QA module as-is, before any optimizer has run on it.
# It is evaluated below as the baseline and later passed to the teleprompter.
uncompiled_program = QAModule()

## 5. Define the optimization metric

In [11]:
from bellek.musique.eval import calculate_metrics

def compute_scores(results):
    """Compute metrics for a list of evaluation results.

    Args:
        results: iterable of `(example, pred, score)` triples. The score is
            not used here.

    Returns:
        Whatever `calculate_metrics` returns for a DataFrame with one row per
        result: the example's fields plus a "predicted_answer" column.
    """
    rows = []
    for example, pred, _score in results:
        row = dict(example)
        row["predicted_answer"] = pred.answer
        rows.append(row)
    return calculate_metrics(pd.DataFrame(rows))

Using the latest cached version of the module from /home/pc/.cache/huggingface/modules/evaluate_modules/metrics/bdsaglam--musique/9f409241d4cc6ea7853124e79cf44954a75900a0a2c0b9d20b909c2396f6b071 (last modified on Tue Jul 23 21:54:03 2024) since it couldn't be found locally at bdsaglam--musique, or remotely on the Hugging Face Hub.


In [12]:
from dspy.evaluate import answer_exact_match_str

def evaluate_answer(example, pred, trace=None):
    """Metric: compare the predicted answer with the example's accepted answers.

    Args:
        example: object exposing `answers`.
        pred: object exposing `answer`.
        trace: accepted but not used.

    Returns:
        The result of `answer_exact_match_str(pred.answer, example.answers)`.
    """
    predicted, references = pred.answer, example.answers
    return answer_exact_match_str(predicted, references)

In [13]:
# Shared evaluator: scores any program on the validation examples with the
# `evaluate_answer` metric. It is reused for both the uncompiled and the
# compiled program so their scores are computed the same way.
evaluate_program = Evaluate(
    metric=evaluate_answer,
    devset=valset,
    num_threads=32,
    display_progress=True,
    return_outputs=True,  # callers below unpack the result as (score, per-example results)
)

In [14]:
# Evaluate the uncompiled question answering module to get a baseline.
# `uncompiled_results` is kept for the error analysis further down.
uncompiled_score, uncompiled_results = evaluate_program(uncompiled_program)
print("Uncompiled Question Answering Scores")
compute_scores(uncompiled_results)

Average Metric: 179 / 300  (59.7): 100%|██████████| 300/300 [02:33<00:00,  1.96it/s]


Uncompiled Question Answering Scores


{'exact_match': 0.5966666666666667,
 'f1': 0.6866563414470185,
 'fuzzy_match': 0.6433333333333333}

## 6. Implement the optimization process

In [None]:
from dspy.teleprompt import BootstrapFewShotWithRandomSearch

# The optimizer class is kept in a variable because its name is reused for the
# output filename here and in the report printed after compilation.
teleprompter_cls = BootstrapFewShotWithRandomSearch

teleprompter = teleprompter_cls(
    metric=evaluate_answer,
    num_candidate_programs=10,
    max_bootstrapped_demos=4,
    max_labeled_demos=0,
    num_threads=4,
)

# Compile against the training examples, then save the result next to the notebook.
compiled_program = teleprompter.compile(uncompiled_program, trainset=trainset)
compiled_program_filename = "compiled-qa-cte-" + teleprompter_cls.__name__.lower() + ".json"
compiled_program.save(compiled_program_filename)

Going to sample between 1 and 4 traces per predictor.
Will attempt to bootstrap 10 candidate sets.


  0%|          | 0/300 [00:00<?, ?it/s]

Average Metric: 193 / 300  (64.3): 100%|██████████| 300/300 [04:59<00:00,  1.00it/s]


New best score: 64.33 for seed -3
Scores so far: [64.33]
Best score so far: 64.33


Average Metric: 193 / 300  (64.3): 100%|██████████| 300/300 [05:02<00:00,  1.01s/it]


Scores so far: [64.33, 64.33]
Best score so far: 64.33


  2%|▏         | 5/300 [00:13<13:36,  2.77s/it]


Bootstrapped 4 full traces after 6 examples in round 0.


Average Metric: 193 / 300  (64.3): 100%|██████████| 300/300 [08:30<00:00,  1.70s/it]


Scores so far: [64.33, 64.33, 64.33]
Best score so far: 64.33


  2%|▏         | 6/300 [00:14<11:32,  2.36s/it]


Bootstrapped 4 full traces after 7 examples in round 0.


Average Metric: 187 / 300  (62.3): 100%|██████████| 300/300 [08:32<00:00,  1.71s/it]


Scores so far: [64.33, 64.33, 64.33, 62.33]
Best score so far: 64.33


  1%|          | 3/300 [00:07<12:46,  2.58s/it]


Bootstrapped 2 full traces after 4 examples in round 0.


Average Metric: 183 / 300  (61.0): 100%|██████████| 300/300 [07:46<00:00,  1.56s/it]


Scores so far: [64.33, 64.33, 64.33, 62.33, 61.0]
Best score so far: 64.33


  1%|          | 2/300 [00:05<13:22,  2.69s/it]


Bootstrapped 1 full traces after 3 examples in round 0.


Average Metric: 195 / 300  (65.0): 100%|██████████| 300/300 [07:44<00:00,  1.55s/it]


New best score: 65.0 for seed 2
Scores so far: [64.33, 64.33, 64.33, 62.33, 61.0, 65.0]
Best score so far: 65.0


  2%|▏         | 5/300 [00:13<13:42,  2.79s/it]


Bootstrapped 2 full traces after 6 examples in round 0.


Average Metric: 186 / 300  (62.0): 100%|██████████| 300/300 [07:04<00:00,  1.42s/it]


Scores so far: [64.33, 64.33, 64.33, 62.33, 61.0, 65.0, 62.0]
Best score so far: 65.0


  1%|          | 2/300 [00:06<16:17,  3.28s/it]


Bootstrapped 2 full traces after 3 examples in round 0.


Average Metric: 187 / 300  (62.3): 100%|██████████| 300/300 [07:49<00:00,  1.57s/it]


Scores so far: [64.33, 64.33, 64.33, 62.33, 61.0, 65.0, 62.0, 62.33]
Best score so far: 65.0


  1%|          | 3/300 [00:07<12:53,  2.60s/it]


Bootstrapped 3 full traces after 4 examples in round 0.


Average Metric: 193 / 300  (64.3): 100%|██████████| 300/300 [27:10<00:00,  5.43s/it]


Scores so far: [64.33, 64.33, 64.33, 62.33, 61.0, 65.0, 62.0, 62.33, 64.33]
Best score so far: 65.0


  0%|          | 1/300 [00:23<1:56:43, 23.42s/it]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 182 / 300  (60.7): 100%|██████████| 300/300 [29:31<00:00,  5.90s/it]


Scores so far: [64.33, 64.33, 64.33, 62.33, 61.0, 65.0, 62.0, 62.33, 64.33, 60.67]
Best score so far: 65.0


  1%|▏         | 4/300 [00:56<1:09:34, 14.10s/it]


Bootstrapped 3 full traces after 5 examples in round 0.


Average Metric: 187 / 297  (63.0):  99%|█████████▉| 297/300 [39:22<00:36, 12.33s/it]

In [18]:
# Score the compiled program with the same evaluator used for the uncompiled baseline.
# The aggregate score is bound to a real name rather than `_`: assigning to `_`
# in IPython shadows the last-output variable, and this mirrors the
# `uncompiled_score, uncompiled_results` pair from the baseline run.
compiled_score, compiled_results = evaluate_program(compiled_program)
print(f"{teleprompter_cls.__name__} Compiled Question Answering Scores")
print(compute_scores(compiled_results))

Average Metric: 182 / 300  (60.7): 100%|██████████| 300/300 [06:12<00:00,  1.24s/it]


BootstrapFewShotWithRandomSearch Compiled Question Answering Scores
{'exact_match': 0.6066666666666667, 'f1': 0.7130639399586769, 'fuzzy_match': 0.6766666666666666}


## 8. Error Analysis

In [27]:
def present_errors(results, threshold=0.9):
    """Print every evaluation result whose score falls below `threshold`.

    Args:
        results: iterable of `(example, pred, score)` triples. `example` must
            expose `question`, `context` and `answers`; `pred` must expose
            `answer`; `score` must be convertible with `float`.
        threshold: scores strictly below this value are reported as errors.
            Defaults to 0.9, the cut-off that was previously hard-coded.

    Returns:
        None. The report is written to standard output, one block per error,
        each followed by a line of 80 '=' characters.
    """
    # Collect first so that a bad score raises before anything is printed.
    errors = [(example, pred) for example, pred, score in results if float(score) < threshold]
    for example, pred in errors:
        print(f"Question: {example.question}")
        print(f"Context: {example.context}")
        print(f"Groundtruth Answers: {example.answers}")
        print(f"Predicted Answer: {pred.answer}")
        print('='*80)

In [None]:
# List the validation examples the uncompiled (baseline) program got wrong.
print("Error analysis for uncompiled program\n\n")
present_errors(uncompiled_results)

In [None]:
# List the validation examples the compiled program got wrong.
print("Error analysis for compiled program:")
present_errors(compiled_results)

## 9. Inspect a single prediction

In [None]:
# Run the compiled program on one training example (index `i`) and show the
# accepted answers next to the predicted answer.
i = 1
example = trainset[i]
pred = compiled_program(context=example.context, question=example.question)
example.answers, pred.answer

In [None]:
# Show the last entry in the language model's history.
# NOTE(review): presumably this is the call made by the cell above — confirm
# nothing else has called the LM in between.
lm.history[-1]