# DSPy Question Answering Pipeline

This notebook implements a DSPy pipeline for optimizing question answering prompts.

## 1. Set up the environment

In [1]:
import os
import random
import numpy as np
import pandas as pd
from tqdm import tqdm

import dspy
from dspy.evaluate import Evaluate

In [2]:
from bellek.utils import set_seed
set_seed(89)

In [3]:
lm = dspy.LM(
    "openai/llama-3-70b-tgi",
    temperature=0.1,
    cache=False,
    api_base=os.getenv("OPENAI_BASE_URL"),
    api_key=os.getenv("OPENAI_API_KEY"),
)
dspy.configure(lm=lm)

## 2. Load and preprocess the datasets

In [4]:
from datasets import load_dataset

train_ds = load_dataset('bdsaglam/musique-mini', 'answerable', split='train')
val_ds = load_dataset('bdsaglam/musique-mini', 'answerable', split='validation')
train_ds, val_ds

(Dataset({
     features: ['id', 'paragraphs', 'question', 'question_decomposition', 'answer', 'answer_aliases', 'answerable', 'n_hops'],
     num_rows: 300
 }),
 Dataset({
     features: ['id', 'paragraphs', 'question', 'question_decomposition', 'answer', 'answer_aliases', 'answerable', 'n_hops'],
     num_rows: 300
 }))

In [5]:
def format_paragraph(paragraph):
    text = paragraph['paragraph_text']
    title = paragraph['title']
    return f"# {title}\n{text}"

def make_example(record):
    supporting_paragraphs = [p for p in record['paragraphs'] if p['is_supporting']]
    context = "\n\n".join([format_paragraph(p) for p in supporting_paragraphs])
    return dspy.Example(
        question=record['question'],
        context=context,
        answer=record['answer'],
        answers=[record['answer'], *record['answer_aliases']],
    ).with_inputs('question', 'context')

In [6]:
trainset = [make_example(record) for record in train_ds]
valset = [make_example(record) for record in val_ds]
len(trainset), len(valset)

(300, 300)

## 3. Define Signatures

In [7]:
class GenerateAnswer(dspy.Signature):
    """Answer the question based on the given context."""
    context = dspy.InputField(desc="may contain relevant facts")
    question = dspy.InputField()
    answer = dspy.OutputField(desc="often between 1 and 5 words")

## 4. Build the Pipeline

In [8]:
class QAModule(dspy.Module):
    def __init__(self):
        super().__init__()
        self.generate_answer = dspy.ChainOfThought(GenerateAnswer)

    def forward(self, context, question):
        return self.generate_answer(context=context, question=question)

In [9]:
# Initialize the uncompiled QA module
uncompiled_program = QAModule()

## 5. Define the optimization metric

In [10]:
from bellek.musique.eval import calculate_metrics

def compute_scores(results):
    df = pd.DataFrame([{**dict(example), "predicted_answer": pred.answer} for example, pred, score in results])
    return calculate_metrics(df)

Using the latest cached version of the module from /home/pc/.cache/huggingface/modules/evaluate_modules/metrics/bdsaglam--musique/9f409241d4cc6ea7853124e79cf44954a75900a0a2c0b9d20b909c2396f6b071 (last modified on Tue Jul 23 21:54:03 2024) since it couldn't be found locally at bdsaglam--musique, or remotely on the Hugging Face Hub.


In [11]:
from dspy.evaluate import answer_exact_match_str

def evaluate_answer(example, pred, trace=None):
    return answer_exact_match_str(pred.answer, example.answers)

In [12]:
evaluate_program = Evaluate(
    metric=evaluate_answer,
    devset=valset,
    num_threads=32,
    display_progress=True,
    return_outputs=True,
)

In [None]:
# Evaluate the uncompiled question decomposition module
uncompiled_score, uncompiled_results = evaluate_program(uncompiled_program)
print("Uncompiled Question Answering Scores")
compute_scores(uncompiled_results)



In [None]:
uncompiled_results[0]
raise Exception("Stop here")

## 6. Implement the optimization process

In [14]:
from dspy.teleprompt import BootstrapFewShotWithRandomSearch

teleprompter_cls = BootstrapFewShotWithRandomSearch
teleprompter = teleprompter_cls(
    metric=evaluate_answer,
    max_bootstrapped_demos=4,
    num_threads=4,
    num_candidate_programs=10
)

compiled_program = teleprompter.compile(uncompiled_program, trainset=trainset)
compiled_program_filename = f"compiled-qa-cot-{teleprompter_cls.__name__.lower()}.json"
compiled_program.save(compiled_program_filename)

Going to sample between 1 and 4 traces per predictor.
Will attempt to bootstrap 10 candidate sets.


  0%|          | 0/300 [00:00<?, ?it/s]

Average Metric: 170 / 300  (56.7): 100%|██████████| 300/300 [46:49<00:00,  9.36s/it]


New best score: 56.67 for seed -3
Scores so far: [56.67]
Best score so far: 56.67


  0%|          | 0/300 [00:00<?, ?it/s][2m2024-10-23T19:12:23.745494Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 litellm.BadRequestError: OpenAIException - Error code: 400 - {'error': {'message': "litellm.BadRequestError: OpenAIException - Error code: 422 - {'error': 'Input validation error: `inputs` tokens + `max_new_tokens` must be <= 8192. Given: 9004 `inputs` tokens and 1000 `max_new_tokens`', 'error_type': 'validation'}\nReceived Model Group=llama-3-70b-tgi\nAvailable Model Group Fallbacks=None", 'type': None, 'param': None, 'code': '400'}}. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m200[0m
Average Metric: 0.0 / 1  (0.0):   0%|          | 1/300 [00:00<02:16,  2.20it/s][2m2024-10-23T19:12:23.844230Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 litellm.BadRequestError: OpenAIException - Error code: 400 - {'error': {'me


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.



Average Metric: 0.0 / 3  (0.0):   1%|          | 3/300 [00:00<00:56,  5.25it/s][2m2024-10-23T19:12:23.960472Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 litellm.BadRequestError: OpenAIException - Error code: 400 - {'error': {'message': "litellm.BadRequestError: OpenAIException - Error code: 422 - {'error': 'Input validation error: `inputs` tokens + `max_new_tokens` must be <= 8192. Given: 9201 `inputs` tokens and 1000 `max_new_tokens`', 'error_type': 'validation'}\nReceived Model Group=llama-3-70b-tgi\nAvailable Model Group Fallbacks=None", 'type': None, 'param': None, 'code': '400'}}. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m200[0m
Average Metric: 0.0 / 4  (0.0):   1%|          | 3/300 [00:00<00:56,  5.25it/s][2m2024-10-23T19:12:24.231177Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 litellm.BadRequestError: OpenAIExce


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.



Average Metric: 0.0 / 6  (0.0):   2%|▏         | 6/300 [00:01<00:51,  5.66it/s][2m2024-10-23T19:12:24.507235Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 litellm.BadRequestError: OpenAIException - Error code: 400 - {'error': {'message': "litellm.BadRequestError: OpenAIException - Error code: 422 - {'error': 'Input validation error: `inputs` tokens + `max_new_tokens` must be <= 8192. Given: 9293 `inputs` tokens and 1000 `max_new_tokens`', 'error_type': 'validation'}\nReceived Model Group=llama-3-70b-tgi\nAvailable Model Group Fallbacks=None", 'type': None, 'param': None, 'code': '400'}}. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m200[0m
Average Metric: 0.0 / 7  (0.0):   2%|▏         | 7/300 [00:01<00:45,  6.40it/s][2m2024-10-23T19:12:24.596656Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 litellm.BadRequestError: OpenAIExce


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.



[2m2024-10-23T19:12:24.809987Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 litellm.BadRequestError: OpenAIException - Error code: 400 - {'error': {'message': "litellm.BadRequestError: OpenAIException - Error code: 422 - {'error': 'Input validation error: `inputs` tokens + `max_new_tokens` must be <= 8192. Given: 9015 `inputs` tokens and 1000 `max_new_tokens`', 'error_type': 'validation'}\nReceived Model Group=llama-3-70b-tgi\nAvailable Model Group Fallbacks=None", 'type': None, 'param': None, 'code': '400'}}. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m200[0m
Average Metric: 0.0 / 9  (0.0):   3%|▎         | 9/300 [00:01<00:45,  6.40it/s]


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.


[1;31mGive Fee

BadRequestError: litellm.BadRequestError: OpenAIException - Error code: 400 - {'error': {'message': "litellm.BadRequestError: OpenAIException - Error code: 422 - {'error': 'Input validation error: `inputs` tokens + `max_new_tokens` must be <= 8192. Given: 9389 `inputs` tokens and 1000 `max_new_tokens`', 'error_type': 'validation'}\nReceived Model Group=llama-3-70b-tgi\nAvailable Model Group Fallbacks=None", 'type': None, 'param': None, 'code': '400'}}

In [30]:
uncompiled_results[0][1]

Prediction(
    context="# The Good Shepherd (film)\nThe Good Shepherd is a 2006 spy film produced and directed by Robert De Niro and starring Matt Damon, Angelina Jolie and De Niro, with an extensive supporting cast.\n\n# The Godfather Part II\nThe Godfather Part II is a 1974 American crime film produced and directed by Francis Ford Coppola from a screenplay co-written with Mario Puzo, starring Al Pacino and Robert De Niro. Partially based on Puzo's 1969 novel The Godfather, the film is both sequel and prequel to The Godfather, presenting parallel dramas: one picks up the 1958 story of Michael Corleone (Pacino), the new Don of the Corleone crime family, protecting the family business in the aftermath of an attempt on his life; the prequel covers the journey of his father, Vito Corleone (De Niro), from his Sicilian childhood to the founding of his family enterprise in New York City.",
    answer='Vito Corleone'
)

In [15]:
_, compiled_results = evaluate_program(compiled_program)
print(f"{teleprompter_cls.__name__} Compiled Question Answering Scores")
print(compute_scores(compiled_results))

Average Metric: 180 / 300  (60.0): 100%|██████████| 300/300 [06:19<00:00,  1.27s/it]


BootstrapFewShotWithRandomSearch Compiled Question Answering Scores
{'exact_match': 0.6, 'f1': 0.6991829875777245, 'fuzzy_match': 0.66}


## 8. Error Analysis

In [27]:
def present_errors(results):
    errors = [(example, pred) for example, pred, score in results if float(score) < 0.9] 
    for example, pred in errors:
        print(f"Question: {example.question}")
        print(f"Context: {example.context}")
        print(f"Groundtruth Answers: {example.answers}")
        print(f"Predicted Answer: {pred.answer}")
        print('='*80)

In [None]:
print("Error analysis for uncompiled program\n\n")
present_errors(uncompiled_results)

In [None]:
print("Error analysis for compiled program:")
present_errors(compiled_results)

## Inspect

In [None]:
i = 1
example = trainset[i]
pred = compiled_program(context=example.context, question=example.question)
example.answers, pred.answer

In [None]:
lm.history[-1]