In [1]:
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
import os
from pathlib import Path
import datetime

import dspy
from vf_musique.data import prepare_dataset
from vf_musique.metrics import exact_match, f1
from vf_musique.rewards import extract_all_retrieved_doc_ids

In [3]:
def setup_mlflow():
    """Point MLflow at the configured tracking server and enable DSPy autologging.

    The tracking URI is read from the ``MLFLOW_TRACKING_URI`` environment
    variable, the ``dspy-gepa-musique`` experiment is selected, and autologging
    is switched on for compiles, evals, and traces produced during compilation.
    A confirmation line is printed at the end.
    """
    # Function-scope imports: mlflow is only loaded when this function runs.
    import mlflow
    import mlflow.dspy

    uri_env_var = "MLFLOW_TRACKING_URI"
    autolog_options = dict(
        log_compiles=True,
        log_evals=True,
        log_traces_from_compile=True,
    )

    mlflow.set_tracking_uri(os.getenv(uri_env_var))
    mlflow.set_experiment("dspy-gepa-musique")
    mlflow.dspy.autolog(**autolog_options)
    # The variable is read again here, exactly as the message is built.
    print(f"✅ MLflow tracking enabled at {os.getenv(uri_env_var)}")

setup_mlflow()

✅ MLflow tracking enabled at http://localhost:5005


In [4]:
# Timestamp-based run ID (YYYYMMDD_HHMMSS), so each execution of this cell
# gets its own output directory.
EXP_ID = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
EXP_DIR = Path(f"../outputs/dspy-gepa-musique/{EXP_ID}")
# Create the directory (and any missing parents); no error if it already exists.
EXP_DIR.mkdir(parents=True, exist_ok=True)
EXP_DIR

PosixPath('../outputs/dspy-gepa-musique/20251002_131311')

In [5]:
# Task LM: the model that runs the multi-hop QA program itself.
# NOTE(review): api_base/api_key look like a locally hosted OpenAI-compatible
# endpoint with a placeholder key — confirm the server is up before running.
lm = dspy.LM(
    "openai/Qwen/Qwen3-8B",
    temperature=0.6,
    max_tokens=8192,
    api_key="local",
    api_base="http://0.0.0.0:8000/v1",
)
# Make it the default LM for every DSPy module in this notebook.
dspy.configure(lm=lm)

# Reflection LM: handed to the GEPA optimizer later in the notebook.
# The API key is read from the environment; os.getenv yields None if it is unset.
reflection_lm = dspy.LM("gemini/gemini-2.5-pro", api_key=os.getenv("GEMINI_API_KEY"), max_tokens=16384)
# Alternative reflection LM served locally (disabled):
# reflection_lm = dspy.LM(
#     "openai/Qwen/Qwen3-32B",
#     temperature=0.6,
#     max_tokens=16384,
#     api_key="local",
#     api_base="http://0.0.0.0:8001/v1",
# )

In [6]:
lm(messages=[{"role": "user", "content": "Hello"}])

['\n\nHello! 😊 How can I assist you today?']

In [7]:
reflection_lm(messages=[{"role": "user", "content": "What is largest prime number below 10?"}])

['The largest prime number below 10 is **7**.\n\nThe prime numbers below 10 are 2, 3, 5, and 7.']

In [8]:
import random


def prepare_musique_dataset(datasets_str: str = "bdsaglam/musique,answerable,train", noise_rate: float = 1.0):
    """Build DSPy examples from a MuSiQue dataset loaded via ``prepare_dataset``.

    Args:
        datasets_str: Dataset specifier, forwarded unchanged to ``prepare_dataset``.
        noise_rate: Forwarded unchanged to ``prepare_dataset``.

    Returns:
        A list of ``dspy.Example`` objects. Each carries the question, the
        reference answer and its accepted variants, every document, the IDs of
        the documents flagged ``is_supporting``, and the hop count. Only
        ``question`` and ``docs`` are marked as inputs.
    """

    def to_example(row):
        # Everything except question/answer lives under the row's "info" entry.
        info = row["info"]
        docs = info["docs"]
        return dspy.Example(
            question=row["question"],
            answer=row["answer"],
            answers=info["answers"],
            docs=docs,
            supporting_ids=[d["id"] for d in docs if d.get("is_supporting")],
            n_hops=info["n_hops"],
        ).with_inputs("question", "docs")

    rows = prepare_dataset(datasets_str, noise_rate=noise_rate)
    return [to_example(row) for row in rows]

# Load the mini training split and shuffle it in place with a fixed seed (89)
# so the train/val partition is the same on every run.
ds = prepare_musique_dataset(datasets_str="bdsaglam/musique-mini,answerable,train", noise_rate=1.0)
random.Random(89).shuffle(ds)
# 60% of the shuffled examples go to training, the remaining 40% to validation.
train_size = int(len(ds)*0.60)
train_ds, val_ds = ds[:train_size], ds[train_size:]
# Test set: the first 50 examples of the dataset's validation split.
test_ds = prepare_musique_dataset(datasets_str="bdsaglam/musique-mini,answerable,validation[:50]", noise_rate=1.0)

# Keep at most 30 examples in each of train and val
# (presumably to bound the optimization budget — adjust for a full run).
train_ds = train_ds[:30]
val_ds = val_ds[:30]

Map: 100%|##########| 300/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map: 100%|##########| 50/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [9]:
from pydantic import BaseModel
from agents import RunContextWrapper
from vf_musique.tools import make_retrieve_tool, ToolContext


# Signature for writing the search query of every hop after the first.
# NOTE: on a dspy.Signature the docstring and each `desc` are prompt text sent
# to the LM, so editing them changes model behavior, not just documentation.
class GenerateSearchQuery(dspy.Signature):
    """Given a multi-hop question and information collected so far, generate a search query
    to find the next piece of information needed to answer the question.
    Focus on entities, dates, or facts that need to be resolved step by step."""

    question: str = dspy.InputField(desc="The multi-hop question to answer")
    collected_info: str = dspy.InputField(desc="Information collected from previous retrieval steps")
    search_query: str = dspy.OutputField(desc="Search query for the next retrieval step")
    # The range in `desc` is only a request to the LM; MultiHopQA.forward clamps the value itself.
    top_n: int = dspy.OutputField(desc="Number of documents to retrieve. 1 <= top_n <= 3")

# One extracted fact together with the ID of the document it was taken from.
# (Kept as a `#` comment rather than a class docstring so the model's schema
# description is unchanged.)
class KeyInformation(BaseModel):
    info: str
    source_doc_id: str

    def format(self):
        """Render the fact immediately followed by its source ID in square brackets."""
        return "{}[{}]".format(self.info, self.source_doc_id)

# Signature for turning one hop's retrieved text into structured facts.
# NOTE: the docstring and each `desc` are prompt text sent to the LM; editing
# them changes model behavior.
class ExtractInformation(dspy.Signature):
    """Given a question and retrieved documents, extract the key information
    that helps answer the question or leads to the next retrieval step.
    Focus on entities, relationships, dates, and facts."""

    question: str = dspy.InputField(desc="The multi-hop question to answer")
    documents: str = dspy.InputField(desc="Retrieved documents from search")
    # Typed output: each item pairs a fact with the ID of its source document.
    key_informations: list[KeyInformation] = dspy.OutputField(desc="Key information(s) extracted from retrieved document(s)")

# Signature for the stop/continue decision made after every hop.
# NOTE(review): unlike the sibling signatures this one has no docstring, so the
# LM gets no task-specific instructions beyond the field descriptions — confirm
# that is intended before adding one (a docstring would change the prompt).
class DecideInfoCollection(dspy.Signature):
    question: str = dspy.InputField(desc="The multi-hop question to answer")
    all_information: str = dspy.InputField(desc="All information collected during retrieval")
    # A true value ends the retrieval loop in MultiHopQA.forward.
    has_collected_enough_info: bool = dspy.OutputField(desc="Has enough information been collected to answer question?")

# Signature for the final step: answer the question from the collected facts
# and list the document IDs that back the answer.
# NOTE: the docstring and each `desc` are prompt text sent to the LM; editing
# them changes model behavior.
class GenerateAnswer(dspy.Signature):
    """Given a multi-hop question and all collected information, provide a concise answer.
    The answer should directly address what the question asks for.
    Be specific and use the exact entities/dates/facts from the documents."""

    question: str = dspy.InputField(desc="The multi-hop question to answer")
    all_information: str = dspy.InputField(desc="All information collected during retrieval")
    answer: str = dspy.OutputField(desc="Final answer to the question")
    # The citation metrics convert every entry with str() before comparing to gold IDs.
    citations: list[str] = dspy.OutputField(desc="List of document IDs cited for the answer, e.g. `[4,9]`")


class MultiHopQA(dspy.Module):
    """Multi-hop question answering module for MuSiQue.

    Each hop retrieves documents, extracts key facts from them, and asks a
    decision predictor whether enough has been collected. The loop ends on a
    positive decision or after ``max_iter`` hops, after which a final predictor
    writes the answer and its citations from everything collected.
    """

    # Range the search-query signature asks the LM to respect for ``top_n``.
    MIN_TOP_N = 1
    MAX_TOP_N = 3
    # Used for the first hop and whenever the predicted ``top_n`` is unusable.
    DEFAULT_TOP_N = 2

    def __init__(self, retriever_name: str = "hybrid", max_iter: int = 5):
        """
        Args:
            retriever_name: Retriever identifier passed to ``make_retrieve_tool``.
            max_iter: Maximum number of retrieval hops per question.
        """
        # Explicit base initialisation so the module does not rely on DSPy
        # doing it implicitly.
        super().__init__()

        self.retriever_name = retriever_name
        self.max_iter = max_iter

        # Create the retrieve tool
        self.retrieve_tool = make_retrieve_tool(retriever_name, default_top_n=self.DEFAULT_TOP_N)

        # Create modules with typed signatures
        self.generate_query = dspy.ChainOfThought(GenerateSearchQuery)
        self.extract_info = dspy.ChainOfThought(ExtractInformation)
        self.decide_info_collect = dspy.ChainOfThought(DecideInfoCollection)
        self.generate_answer = dspy.ChainOfThought(GenerateAnswer)

    @staticmethod
    def _format_info(collected_info) -> str:
        """Join the collected facts, one formatted entry per line."""
        return "\n".join(item.format() for item in collected_info)

    @classmethod
    def _clamp_top_n(cls, value) -> int:
        """Coerce an LM-produced ``top_n`` to an int within the allowed range."""
        try:
            top_n = int(value)
        except (TypeError, ValueError):
            # Missing or non-numeric prediction: fall back instead of crashing the hop.
            top_n = cls.DEFAULT_TOP_N
        return max(cls.MIN_TOP_N, min(top_n, cls.MAX_TOP_N))

    def forward(self, question: str, docs: list, **kwargs) -> dspy.Prediction:
        """
        Forward pass for multi-hop QA.

        Args:
            question: The multi-hop question to answer
            docs: List of documents available for retrieval

        Returns:
            A ``dspy.Prediction`` with ``answer``, ``collected_info`` (extracted
            facts), ``retrieved_doc_ids`` (unique, in first-seen order),
            ``citations``, and ``n_turns`` (number of hops actually run).
        """
        collected_info = []
        retrieved_doc_ids = []
        # Tracked explicitly so it is defined even when max_iter is 0.
        n_turns = 0

        # Create a context object that mimics the verifiers tool environment
        run_context_wrapper = RunContextWrapper[ToolContext](context=ToolContext(info=dict(docs=docs)))

        for hop_idx in range(self.max_iter):
            n_turns = hop_idx + 1

            if hop_idx == 0:
                # First hop: use the original question
                query = question
                top_n = self.DEFAULT_TOP_N
            else:
                # Subsequent hops: generate query based on collected info
                query_pred = self.generate_query(
                    question=question,
                    collected_info=self._format_info(collected_info)
                    if collected_info
                    else "No information collected yet",
                )
                query = query_pred.search_query
                top_n = self._clamp_top_n(query_pred.top_n)

            # Retrieve documents using the MuSiQue retrieve tool
            retrieved_text = self.retrieve_tool(run_context_wrapper, query=query, top_n=top_n)

            # Record newly seen document IDs, preserving first-seen order
            for doc_id in extract_all_retrieved_doc_ids(retrieved_text):
                if doc_id not in retrieved_doc_ids:
                    retrieved_doc_ids.append(doc_id)

            # Extract key information from retrieved documents
            info_pred = self.extract_info(question=question, documents=retrieved_text)
            # `or []` tolerates a prediction whose list field came back empty/None.
            collected_info.extend(info_pred.key_informations or [])

            decision_pred = self.decide_info_collect(
                question=question,
                all_information=self._format_info(collected_info),
            )
            if decision_pred.has_collected_enough_info:
                break

        # Generate final answer based on all collected information
        answer_pred = self.generate_answer(
            question=question,
            all_information=self._format_info(collected_info),
        )

        return dspy.Prediction(
            answer=answer_pred.answer,
            collected_info=collected_info,
            retrieved_doc_ids=retrieved_doc_ids,
            # Normalised to a list so downstream citation metrics can iterate it.
            citations=answer_pred.citations or [],
            n_turns=n_turns,
        )

# Baseline (unoptimized) program with the constructor defaults:
# retriever "hybrid", at most 5 hops.
program = MultiHopQA()

In [10]:
example = train_ds[0]
example

Example({'question': "What county contains the work location of the president making father's day a national holiday?", 'answer': 'Washington County', 'answers': ['washington county', 'Washington County'], 'docs': [{'body': 'Thanksgiving, or Thanksgiving Day, is a public holiday celebrated on the fourth Thursday of November in the United States. It originated as a harvest festival. Thanksgiving has been celebrated nationally on and off since 1789, after Congress requested a proclamation by George Washington. It has been celebrated as a federal holiday every year since 1863, when, during the American Civil War, President Abraham Lincoln proclaimed a national day of "Thanksgiving and Praise to our beneficent Father who dwelleth in the Heavens,"to be celebrated on the last Thursday in November. Together with Christmas and the New Year, Thanksgiving is a part of the broader fall / winter holiday season in the U.S.', 'id': '0', 'is_supporting': False, 'text': '# Thanksgiving (United States)

In [11]:
pred = program(example.question, example.docs)
pred

Prediction(
    answer='Washington County, Kansas',
    collected_info=[KeyInformation(info="President Richard Nixon signed the bill making Father's Day a permanent national holiday in 1972.", source_doc_id='8'), KeyInformation(info="President Richard Nixon signed the bill making Father's Day a permanent national holiday in 1972.", source_doc_id='14'), KeyInformation(info='Washington is the county seat of Washington County, Kansas.', source_doc_id='7')],
    retrieved_doc_ids=['8', '14', '7', '5', '3'],
    citations=['7'],
    n_turns=5
)

In [12]:
def metric_retrieval_recall(example, pred, trace=None):
    """Share of the example's supporting documents that the prediction retrieved.

    Returns 1.0 when the example lists no supporting documents; otherwise
    ``|gold ∩ retrieved| / |gold|``. ``trace`` is accepted for DSPy metric
    compatibility and not used.
    """
    supporting = example.supporting_ids
    if not supporting:
        # Nothing to find, so nothing can be missed.
        return 1.0

    gold = set(supporting)
    hits = gold & set(pred.retrieved_doc_ids)
    return len(hits) / len(gold) if gold else 1.0


def metric_retrieval_precision(example, pred, trace=None):
    """Retrieval precision metric - fraction of retrieved documents that are supporting.

    Args:
        example: Object exposing ``supporting_ids`` (gold document IDs).
        pred: Object exposing ``retrieved_doc_ids``.
        trace: Accepted for DSPy metric compatibility; unused.

    Returns:
        1.0 when the example has no supporting documents, 0.0 when nothing was
        retrieved, otherwise ``|gold ∩ retrieved| / |retrieved|``.
    """
    if not example.supporting_ids:
        return 1.0

    retrieved_ids = set(pred.retrieved_doc_ids or [])
    if not retrieved_ids:
        # Avoid dividing by zero; scoring an empty retrieval as 0.0 matches
        # feedback_retrieval_precision.
        return 0.0

    gold_ids = set(example.supporting_ids)
    return len(gold_ids & retrieved_ids) / len(retrieved_ids)


def metric_answer_exact_match(example, pred, trace=None):
    """Score ``pred.answer`` against ``example.answers`` with the imported ``exact_match``.

    ``trace`` is accepted for DSPy metric compatibility and not used.
    """
    predicted, references = pred.answer, example.answers
    return exact_match(predicted, references)


def metric_answer_f1_score(example, pred, trace=None):
    """Score ``pred.answer`` against ``example.answers`` with the imported ``f1``.

    ``trace`` is accepted for DSPy metric compatibility and not used.
    """
    predicted, references = pred.answer, example.answers
    return f1(predicted, references)


def metric_citation_f1(example, pred, trace=None):
    """F1 between the document IDs the prediction cites and the gold supporting IDs.

    Args:
        example: Object exposing ``supporting_ids`` (gold document IDs).
        pred: Object exposing ``citations``; entries are converted with
            ``str()`` before comparison, and ``None`` is treated as no citations.
        trace: Accepted for DSPy metric compatibility; unused.

    Returns:
        0.0 when nothing is cited or no citation is correct, otherwise the
        harmonic mean of citation precision and recall.

    Raises:
        ValueError: If the example has no supporting document IDs.
    """
    gold_ids = set(example.supporting_ids) if example.supporting_ids else set()
    # `or []` guards against a prediction whose citations field is None.
    cited_ids = {str(doc_id) for doc_id in (pred.citations or [])}

    if not gold_ids:
        raise ValueError("Supporting docs must be provided for citation metric")

    if not cited_ids:
        # No citations given but some needed
        return 0.0

    correct_citations = cited_ids & gold_ids
    precision = len(correct_citations) / len(cited_ids)
    recall = len(correct_citations) / len(gold_ids)
    if precision + recall <= 0:
        return 0.0
    # Named citation_f1 so the imported answer-level `f1` scorer is not shadowed.
    citation_f1 = (2 * precision * recall) / (precision + recall)
    return citation_f1


def metric_n_hops_penalty(example, pred, trace=None):
    """Efficiency score that decays when the agent uses more turns than the reference.

    Returns 1.0 when ``pred.n_turns`` does not exceed ``example.n_hops``;
    otherwise ``0.8`` raised to the number of surplus turns, floored at 0.1.
    ``trace`` is accepted for DSPy metric compatibility and not used.
    """
    surplus_turns = pred.n_turns - example.n_hops
    if surplus_turns <= 0:
        # At or under the reference hop count: no penalty.
        return 1.0
    return max(0.8 ** surplus_turns, 0.1)


def metric(example, pred, trace=None):
    """Weighted average of the five MuSiQue sub-metrics for one prediction.

    Weights: retrieval recall 0.9, retrieval precision 0.5, answer F1 1.0,
    citation F1 0.7, hop-efficiency penalty 0.6. The result is the weighted sum
    divided by the total weight.
    """
    # (sub-metric, weight) in the order they are evaluated and summed.
    weighted_metrics = (
        (metric_retrieval_recall, 0.9),
        (metric_retrieval_precision, 0.5),
        (metric_answer_f1_score, 1.0),
        (metric_citation_f1, 0.7),
        (metric_n_hops_penalty, 0.6),
    )

    # Evaluate every sub-metric first, then combine.
    scores = [sub_metric(example, pred, trace) for sub_metric, _ in weighted_metrics]
    weights = [weight for _, weight in weighted_metrics]

    weighted_sum = sum(score * weight for score, weight in zip(scores, weights))
    return weighted_sum / sum(weight for weight in weights)

In [13]:
metric(example, pred)

0.7389189189189189

In [14]:
# Evaluate original program
# Baseline: score the unoptimized program on the held-out test set with the
# combined metric, using 8 threads and a progress bar (no per-example table).
print("📊 Evaluating ORIGINAL program...")
original_evaluate = dspy.Evaluate(
    devset=test_ds,
    metric=metric,
    num_threads=8,
    display_table=False,
    display_progress=True
)
original_eval_result = original_evaluate(program)

📊 Evaluating ORIGINAL program...
Average Metric: 37.04 / 50 (74.1%): 100%|█████████████████████████████████████████████████████| 50/50 [00:44<00:00,  1.12it/s]

2025/10/02 13:14:15 INFO dspy.evaluate.evaluate: Average Metric: 37.03619060614726 / 50 (74.1%)



🏃 View run eval at: http://localhost:5005/#/experiments/1/runs/4d9e3c3089aa43859734784e23680112
🧪 View experiment at: http://localhost:5005/#/experiments/1


## GEPA Optimization

GEPA is a reflective prompt optimizer that uses textual feedback to improve performance. We'll create feedback functions for each evaluation aspect and optimize our multi-hop QA program.


In [15]:
def feedback_retrieval_recall(example, pred):
    """Generate feedback for retrieval recall evaluation.

    Args:
        example: Object exposing ``supporting_ids`` (gold document IDs).
        pred: Object exposing ``retrieved_doc_ids``.

    Returns:
        ``(recall_score, feedback)`` where the score is
        ``|gold ∩ retrieved| / |gold|`` and the feedback names any missing
        documents. An example without supporting documents scores 1.0.
    """
    gold_ids = set(example.supporting_ids or [])
    if not gold_ids:
        # Avoid dividing by zero; metric_retrieval_recall also scores this case as 1.0.
        return 1.0, "No supporting documents to retrieve."

    retrieved_ids = set(pred.retrieved_doc_ids or [])
    found = gold_ids & retrieved_ids
    missing_ids = gold_ids - found
    recall_score = len(found) / len(gold_ids)

    if recall_score == 1.0:
        feedback = f"Perfect retrieval! You found all {len(gold_ids)} supporting documents: {sorted(found)}"
    elif recall_score >= 0.5:
        feedback = (
            f"Good retrieval (recall: {recall_score:.2f}). Found {len(found)} out of {len(gold_ids)} "
            f"supporting documents. Missing: {sorted(missing_ids)}. Consider refining your search queries "
            f"to find the remaining relevant documents."
        )
    else:
        feedback = (
            f"Poor retrieval (recall: {recall_score:.2f}). Only found {len(found)} out of {len(gold_ids)} "
            f"supporting documents. Missing critical documents: {sorted(missing_ids)}. "
            f"Your search queries need to be more comprehensive and targeted."
        )

    return recall_score, feedback


def feedback_retrieval_precision(example, pred):
    """Score retrieval precision and describe it in words.

    Returns ``(precision_score, feedback)``. The score is
    ``|gold ∩ retrieved| / |retrieved|``, or 0.0 with a fixed message when
    nothing was retrieved. The feedback tier is chosen by the score
    (1.0, >= 0.7, >= 0.3, lower) and lists the irrelevant document IDs.
    """
    gold = set(example.supporting_ids)
    retrieved = set(pred.retrieved_doc_ids)

    if len(retrieved) == 0:
        return 0.0, "No documents were retrieved. Your search queries need to find relevant documents."

    relevant = gold & retrieved
    noise = retrieved - gold
    n_relevant, n_retrieved = len(relevant), len(retrieved)
    precision_score = n_relevant / n_retrieved

    if precision_score == 1.0:
        return (
            precision_score,
            f"Perfect precision! All {n_retrieved} retrieved documents are supporting documents: {sorted(relevant)}",
        )

    # Every remaining tier is assembled from three pieces: headline, detail, advice.
    if precision_score >= 0.7:
        headline = f"Good precision (precision: {precision_score:.2f}). {n_relevant} out of {n_retrieved} "
        detail = f"retrieved documents are relevant. Irrelevant docs: {sorted(noise)}. "
        advice = "Consider making your search queries more specific to avoid irrelevant documents."
    elif precision_score >= 0.3:
        headline = f"Moderate precision (precision: {precision_score:.2f}). Only {n_relevant} out of {n_retrieved} "
        detail = f"retrieved documents are relevant. Many irrelevant docs retrieved: {sorted(noise)}. "
        advice = "Your search queries are too broad - focus on more specific terms and entities."
    else:
        headline = f"Poor precision (precision: {precision_score:.2f}). Only {n_relevant} out of {n_retrieved} "
        detail = f"retrieved documents are relevant. Most retrieved docs are irrelevant: {sorted(noise)}. "
        advice = "Your search queries are retrieving too many irrelevant documents. Be much more specific and targeted."

    return precision_score, headline + detail + advice


def feedback_answer_exact_match(example, pred):
    """Score the answer with ``exact_match`` and explain the outcome.

    Returns ``(em_score, feedback)``. On a miss, the feedback quotes the first
    entry of ``example.answers`` (or "N/A" if there is none) as the expected answer.
    """
    em_score = exact_match(pred.answer, example.answers)

    if em_score != 1.0:
        # Quote the first accepted answer as the reference in the message.
        best_answer = example.answers[0] if example.answers else "N/A"
        mismatch = f"Your answer '{pred.answer}' doesn't exactly match the expected answer '{best_answer}'. "
        advice = "Consider being more precise with entity names, dates, and specific facts."
        return em_score, mismatch + advice

    praise = f"Perfect! You provided the exact correct answer: '{pred.answer}'. This matches the expected answer exactly."
    return em_score, praise


def feedback_answer_f1_score(example, pred):
    """Score the answer with the imported ``f1`` and explain the outcome.

    Returns ``(f1_score, feedback)``. The feedback tier is chosen by the score:
    >= 0.9, >= 0.5, or lower. The lowest tier quotes the first entry of
    ``example.answers`` (or "N/A" if there is none).
    """
    f1_score = f1(pred.answer, example.answers)

    if f1_score >= 0.9:
        return (
            f1_score,
            f"Excellent! Your answer has high overlap (F1: {f1_score:.2f}) with the expected answer. Good token-level accuracy.",
        )

    if f1_score >= 0.5:
        partial = f"Good partial match (F1: {f1_score:.2f}). Your answer contains relevant information but "
        advice = "could be more complete or precise. Consider including more specific details from the retrieved documents."
        return f1_score, partial + advice

    # Lowest tier: show both the given and the expected answer.
    best_answer = example.answers[0] if example.answers else "N/A"
    low = f"Low overlap (F1: {f1_score:.2f}) with expected answer. Your answer '{pred.answer}' "
    contrast = f"differs significantly from '{best_answer}'. Focus on extracting the specific information "
    closing = "requested in the question."
    return f1_score, low + contrast + closing


def feedback_citation_f1(example, pred):
    """Generate feedback for citation F1 evaluation.

    Args:
        example: Object exposing ``supporting_ids`` (gold document IDs).
        pred: Object exposing ``citations``; entries are converted with
            ``str()`` before comparison, and ``None`` is treated as no citations.

    Returns:
        ``(citation_f1, feedback)``. The score is 1.0 when the example has no
        supporting documents, 0.0 when nothing is cited, otherwise the harmonic
        mean of citation precision and recall.
    """
    gold_ids = set(example.supporting_ids) if example.supporting_ids else set()
    # `or []` guards against a prediction whose citations field is None.
    cited_ids = {str(doc_id) for doc_id in (pred.citations or [])}

    if not gold_ids:
        return 1.0, "No supporting documents to cite."

    if not cited_ids:
        feedback = f"You didn't cite any documents, but should have cited: {sorted(gold_ids)}. Always cite the documents that support your answer."
        return 0.0, feedback

    correct_citations = cited_ids & gold_ids
    precision = len(correct_citations) / len(cited_ids)
    recall = len(correct_citations) / len(gold_ids)
    citation_f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

    if citation_f1 >= 0.9:
        feedback = f"Excellent citations (F1: {citation_f1:.2f})! You properly cited the supporting documents: {sorted(correct_citations)}"
    elif citation_f1 >= 0.5:
        incorrect_citations = cited_ids - gold_ids
        missing_citations = gold_ids - cited_ids
        feedback = f"Good citations (F1: {citation_f1:.2f}). Correct: {sorted(correct_citations)}. "
        if incorrect_citations:
            feedback += f"Unnecessary: {sorted(incorrect_citations)}. "
        if missing_citations:
            feedback += f"Missing: {sorted(missing_citations)}. "
        feedback += "Be more precise about which documents actually support your answer."
    else:
        # The message lists both full sets, so no per-ID diff is needed here.
        feedback = (
            f"Poor citations (F1: {citation_f1:.2f}). You cited {sorted(cited_ids)} but should cite {sorted(gold_ids)}. "
            f"Focus on identifying which documents directly support your answer claims."
        )

    return citation_f1, feedback


def feedback_n_hops_penalty(example, pred):
    """Score hop efficiency via ``metric_n_hops_penalty`` and explain the result.

    Returns ``(penalty_score, feedback)``. The feedback tier depends on how many
    turns the agent used beyond ``example.n_hops`` (none, one, up to two, more),
    and each tier has a variant for when ``pred.retrieved_doc_ids`` contains
    repeated IDs.
    """
    agent_turns = pred.n_turns
    reference_hops = example.n_hops
    penalty_score = metric_n_hops_penalty(example, pred)

    # Duplicate detection: compare the raw ID count with the distinct-ID count.
    retrieved_docs = pred.retrieved_doc_ids
    total_retrievals = len(retrieved_docs)
    unique_docs = len(set(retrieved_docs))
    redundant_retrievals = total_retrievals - unique_docs
    has_duplicates = total_retrievals > unique_docs

    extra_turns = agent_turns - reference_hops

    if extra_turns <= 0:
        return (
            penalty_score,
            f"Perfect efficiency! You completed the task in {agent_turns} turns, same as or fewer than the reference ({reference_hops} hops). This shows excellent retrieval strategy and reasoning efficiency.",
        )

    if extra_turns == 1:
        opening = f"Good efficiency (penalty: {penalty_score:.2f}). You took {agent_turns} turns vs reference {reference_hops} hops. "
        if has_duplicates:
            advice = f"Only 1 extra turn - but check if you retrieved duplicate documents ({total_retrievals} retrievals, {unique_docs} unique). Focus on more targeted initial queries."
        else:
            advice = "Only 1 extra turn - consider if your initial retrieval query could have been more comprehensive to get the needed information upfront."
    elif extra_turns <= 2:
        opening = f"Moderate efficiency (penalty: {penalty_score:.2f}). You took {agent_turns} turns vs reference {reference_hops} hops. "
        if has_duplicates:
            advice = f"You made {redundant_retrievals} redundant retrieval(s) - avoid retrieving the same documents multiple times. Plan your queries more strategically."
        else:
            advice = "Your retrieval queries may be too specific or missing key entities. Try broader, more comprehensive initial searches."
    else:
        opening = f"Poor efficiency (penalty: {penalty_score:.2f}). You took {extra_turns} extra turns ({agent_turns} vs {reference_hops} reference). "
        if has_duplicates:
            advice = f"Major issue: {redundant_retrievals} redundant retrieval(s) - you're wasting turns retrieving documents you already have. Focus on tracking what you've retrieved and crafting better initial queries."
        else:
            advice = "Your retrieval strategy is inefficient - queries may be too narrow or poorly targeted. Plan what information you need and retrieve it strategically in fewer, more comprehensive searches."

    return penalty_score, opening + advice


def metric_with_feedback(example, pred, trace=None, pred_name=None, pred_trace=None):
    """Combined MuSiQue score plus textual feedback, for use as a GEPA metric.

    Computes five (score, feedback) pairs — answer F1, retrieval recall,
    retrieval precision, citation F1, hop efficiency — and averages the scores
    with weights 1.0, 0.9, 0.5, 0.7 and 0.6.

    The feedback text depends on ``pred_name``: the query, extraction and answer
    predictors each get the sub-feedback relevant to them plus a closing hint;
    any other value (including ``None``) gets a breakdown of all five aspects.

    Returns:
        ``dspy.Prediction`` with ``score`` (float) and ``feedback`` (str).
    """
    fb_pairs = (
        feedback_answer_f1_score(example, pred),
        feedback_retrieval_recall(example, pred),
        feedback_retrieval_precision(example, pred),
        feedback_citation_f1(example, pred),
        feedback_n_hops_penalty(example, pred),
    )
    (
        (score_answer_f1, fb_answer_f1),
        (score_retrieval_recall, fb_retrieval_recall),
        (score_retrieval_precision, fb_retrieval_precision),
        (score_citation_f1, fb_citation_f1),
        (score_n_hops_penalty, fb_n_hops_penalty),
    ) = fb_pairs

    # Weighted average; the pairs are listed in the order they are summed.
    score_weight_pairs = [
        (score_answer_f1, 1.0),
        (score_retrieval_recall, 0.9),
        (score_retrieval_precision, 0.5),
        (score_citation_f1, 0.7),
        (score_n_hops_penalty, 0.6),
    ]
    weighted_sum = sum(score * weight for score, weight in score_weight_pairs)
    weight_sum = sum(weight for _, weight in score_weight_pairs)
    total_score = weighted_sum / weight_sum

    # Space-joined feedback pieces for each predictor that gets a targeted message.
    targeted_parts = {
        "generate_query.predict": [
            fb_retrieval_recall,
            fb_retrieval_precision,
            fb_n_hops_penalty,
            "Your search queries should be both comprehensive (high recall) and specific (high precision). "
            "Consider what entities, relationships, or facts are needed for each hop of reasoning. "
            "Plan your queries strategically to avoid unnecessary turns.",
        ],
        "extract_info.predict": [
            fb_answer_f1,
            "Focus on extracting the most relevant facts, entities, and relationships from the retrieved documents. "
            "Make sure to capture information that directly helps answer the question or leads to the next reasoning step.",
        ],
        "generate_answer.predict": [
            fb_answer_f1,
            fb_citation_f1,
            "Provide precise, complete answers using the exact information from the retrieved documents. "
            "Always cite the document IDs that support your answer claims.",
        ],
    }

    parts = targeted_parts.get(pred_name)
    if parts is not None:
        feedback = " ".join(parts)
    else:
        # No targeted message for this predictor: report every aspect.
        breakdown = [
            "Overall performance breakdown:",
            f"- Answer F1 Score: {fb_answer_f1}",
            f"- Retrieval Recall: {fb_retrieval_recall}",
            f"- Retrieval Precision: {fb_retrieval_precision}",
            f"- Citations F1 Score: {fb_citation_f1}",
            f"- Hop Efficiency: {fb_n_hops_penalty}",
        ]
        feedback = "\n".join(breakdown)

    return dspy.Prediction(score=total_score, feedback=feedback)


In [16]:
# Test the feedback metric on our example
feedback_result = metric_with_feedback(example, pred)
print(f"Score: {feedback_result.score:.3f}")
print(f"Feedback: {feedback_result.feedback}")


Score: 0.739
Feedback: Overall performance breakdown:
- Answer F1 Score: Good partial match (F1: 0.80). Your answer contains relevant information but could be more complete or precise. Consider including more specific details from the retrieved documents.
- Retrieval Recall: Perfect retrieval! You found all 3 supporting documents: ['14', '5', '7']
- Retrieval Precision: Moderate precision (precision: 0.60). Only 3 out of 5 retrieved documents are relevant. Many irrelevant docs retrieved: ['3', '8']. Your search queries are too broad - focus on more specific terms and entities.
- Citations F1 Score: Good citations (F1: 0.50). Correct: ['7']. Missing: ['14', '5']. Be more precise about which documents actually support your answer.
- Hop Efficiency: Moderate efficiency (penalty: 0.64). You took 5 turns vs reference 3 hops. Your retrieval queries may be too specific or missing key entities. Try broader, more comprehensive initial searches.


In [17]:
from dspy import GEPA

# Build the GEPA optimizer. The feedback-returning metric is what GEPA scores
# candidates with, and `reflection_lm` is the separate LM handed to it.
optimizer = GEPA(
    reflection_lm=reflection_lm,
    metric=metric_with_feedback,
    # "light" keeps the budget small for faster experimentation; use "heavy" for best performance.
    auto="light",
    # The analysis cell further down reads `detailed_results` and asks for track_stats=True.
    track_stats=True,
    use_merge=False,
    num_threads=8,
)

print("✅ GEPA optimizer configured")

✅ GEPA optimizer configured


In [None]:
# Launch the GEPA search over `program`, passing the train and validation splits.
# This is the long-running step of the notebook; progress is reported in the run log.
print("🚀 Starting GEPA optimization...")

optimized_program = optimizer.compile(program, trainset=train_ds, valset=val_ds)

print("✅ GEPA optimization completed!")


2025/10/02 13:14:15 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '4f2fb7f1c3d54cb5b55f6319bd78b1f6', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current dspy workflow


🚀 Starting GEPA optimization...


2025/10/02 13:14:15 INFO dspy.teleprompt.gepa.gepa: Running GEPA for approx 1765 metric calls of the program. This amounts to 29.42 full evals on the train+val set.
2025/10/02 13:14:15 INFO dspy.teleprompt.gepa.gepa: Using 30 examples for tracking Pareto scores. You can consider using a smaller sample of the valset to allow GEPA to explore more diverse solutions within the same budget.
GEPA Optimization:   0%|                                                                       | 0/1765 [00:00<?, ?rollouts/s]2025/10/02 13:14:54 INFO dspy.evaluate.evaluate: Average Metric: 19.10404514404515 / 30 (63.7%)


🏃 View run eval_0 at: http://localhost:5005/#/experiments/1/runs/5438de8e37a641ac88f70b0147a63053
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/02 13:14:54 INFO dspy.teleprompt.gepa.gepa: Iteration 0: Base program full valset score: 0.6368015048015047
GEPA Optimization:   2%|█                                                             | 30/1765 [00:39<37:38,  1.30s/rollouts]2025/10/02 13:14:54 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Selected program 0 score: 0.6368015048015047


Average Metric: 2.50 / 3 (83.2%): 100%|█████████████████████████████████████████████████████████| 3/3 [00:00<00:00,  3.76it/s]

2025/10/02 13:14:56 INFO dspy.evaluate.evaluate: Average Metric: 2.496846846846847 / 3 (83.2%)



🏃 View run eval_1 at: http://localhost:5005/#/experiments/1/runs/32915fb1cbff4c07aaa92a053948c3a1
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/02 13:15:21 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Proposed new text for generate_query.predict: You are an expert reasoning agent designed for a multi-hop question-answering system. Your primary task is to generate the next optimal search query to find a missing piece of information needed to answer a complex question.

### Task
Given a multi-hop question and a set of facts that have already been collected, you must generate a search query to find the next piece of information.

### Inputs
1.  `question`: The original multi-hop question you are trying to answer.
2.  `collected_info`: A list of facts and statements retrieved from previous search steps.

### Outputs
You must generate the following three outputs:
1.  `reasoning`: A clear, step-by-step explanation of your thought process. This should:
    *   Summarize the information you already have from `collected_info`.
    *   Identify the next logical sub-question or the specific entity/fact that needs to be resolved.


🏃 View run eval_2 at: http://localhost:5005/#/experiments/1/runs/8601008d44dd4568b59e3092d5d89ae2
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/02 13:17:27 INFO dspy.teleprompt.gepa.gepa: Iteration 1: New subsample score is not better, skipping
GEPA Optimization:   2%|█▏                                                          | 36/1765 [03:11<3:11:47,  6.66s/rollouts]2025/10/02 13:17:27 INFO dspy.teleprompt.gepa.gepa: Iteration 2: Selected program 0 score: 0.6368015048015047


Average Metric: 2.02 / 3 (67.4%): 100%|█████████████████████████████████████████████████████████| 3/3 [00:01<00:00,  2.02it/s]

2025/10/02 13:17:29 INFO dspy.evaluate.evaluate: Average Metric: 2.0206306306306305 / 3 (67.4%)



🏃 View run eval_3 at: http://localhost:5005/#/experiments/1/runs/12df9e7cd52b4d8aaca4d5f75dba68bc
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/02 13:17:52 INFO dspy.teleprompt.gepa.gepa: Iteration 2: Proposed new text for extract_info.predict: Your task is to act as a highly focused information extractor. Given a question and a set of retrieved documents, you must identify and extract key pieces of information that are essential for answering the question.

### Core Instructions:
1.  **Analyze the Question:** First, carefully deconstruct the question to understand its core components and the specific information required (e.g., a person, a location, a date, a relationship between entities).
2.  **Scan for Relevance:** Read through the provided documents, specifically looking for text that directly addresses the components of the question.
3.  **Extract Key Information:**
    *   Focus on extracting concrete facts, entities (people, places, organizations), relationships, and dates.
    *   The extracted information does not need to be the final answer. It can be an intermediate fact that is a crucial step in the reason

🏃 View run eval_4 at: http://localhost:5005/#/experiments/1/runs/c468e7ea5733428eadd158e64a2b9bd1
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/02 13:18:44 INFO dspy.teleprompt.gepa.gepa: Iteration 2: New subsample score is not better, skipping
GEPA Optimization:   2%|█▍                                                          | 42/1765 [04:29<3:54:35,  8.17s/rollouts]2025/10/02 13:18:44 INFO dspy.teleprompt.gepa.gepa: Iteration 3: Selected program 0 score: 0.6368015048015047


Average Metric: 2.26 / 3 (75.4%): 100%|█████████████████████████████████████████████████████████| 3/3 [00:21<00:00,  7.29s/it]

2025/10/02 13:19:07 INFO dspy.evaluate.evaluate: Average Metric: 2.2623063063063062 / 3 (75.4%)



🏃 View run eval_5 at: http://localhost:5005/#/experiments/1/runs/6adbfd52160444d8b2be7367e0f6a37d
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/02 13:19:33 INFO dspy.teleprompt.gepa.gepa: Iteration 3: Proposed new text for decide_info_collect.predict: Your task is to act as a verifier. Given a `question` and a collection of text snippets in `all_information`, you must determine if the provided information is sufficient to construct a complete and accurate answer. Your output should be a single boolean field: `has_collected_enough_info`.

To do this, follow these steps:

1.  **Deconstruct the Question:** Break down the `question` into its core components. Identify all the entities, the relationships between them, and any specific constraints such as dates, locations, or other conditions.

2.  **Verify Each Component:** Systematically check if every essential component and constraint from the question can be addressed by the text in `all_information`. You must be able to form a complete logical chain that connects all the parts of the question to a final answer.

3.  **Assess Sufficiency:**
    *   **Set `has_collected_e

🏃 View run eval_6 at: http://localhost:5005/#/experiments/1/runs/3fa89e2b4d8c41808fda502a3e40653f
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/02 13:19:55 INFO dspy.teleprompt.gepa.gepa: Iteration 3: New subsample score is not better, skipping
GEPA Optimization:   3%|█▋                                                          | 48/1765 [05:39<4:19:34,  9.07s/rollouts]2025/10/02 13:19:55 INFO dspy.teleprompt.gepa.gepa: Iteration 4: Selected program 0 score: 0.6368015048015047


Average Metric: 2.27 / 3 (75.8%): 100%|█████████████████████████████████████████████████████████| 3/3 [00:49<00:00, 16.56s/it]

2025/10/02 13:20:45 INFO dspy.evaluate.evaluate: Average Metric: 2.274954954954955 / 3 (75.8%)



🏃 View run eval_7 at: http://localhost:5005/#/experiments/1/runs/d271c1faf7a04484a453db3866fd5bcc
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/02 13:21:06 INFO dspy.teleprompt.gepa.gepa: Iteration 4: Proposed new text for generate_answer.predict: You are an expert at answering multi-hop questions based on a provided set of documents. Your task is to synthesize information from multiple sources to construct a concise and accurate answer.

Follow these rules carefully:

1.  **Analyze the Question:** First, break down the multi-hop question to understand the chain of information you need to find. Identify the entities and the relationships between them that you must establish.

2.  **Synthesize Information:** Your answer will require connecting facts from different documents. Trace the logical path from one piece of information to the next using the provided documents. You must base your answer *exclusively* on the information given in the `all_information` section. Do not use any external knowledge.

3.  **Provide a Concise Answer:** The `answer` should be direct and to the point, containing only the specific informatio

🏃 View run eval_8 at: http://localhost:5005/#/experiments/1/runs/73fa1cebaa6740ee86863cf1ed906a8e
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/02 13:23:28 INFO dspy.evaluate.evaluate: Average Metric: 19.86307235982911 / 30 (66.2%)


🏃 View run eval_9 at: http://localhost:5005/#/experiments/1/runs/061bfef200684c0789238719dfc4b45a
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/02 13:23:29 INFO dspy.teleprompt.gepa.gepa: Iteration 4: New program is on the linear pareto front
2025/10/02 13:23:29 INFO dspy.teleprompt.gepa.gepa: Iteration 4: Full valset score for new program: 0.6621024119943039
2025/10/02 13:23:29 INFO dspy.teleprompt.gepa.gepa: Iteration 4: Full train_val score for new program: 0.6621024119943039
2025/10/02 13:23:29 INFO dspy.teleprompt.gepa.gepa: Iteration 4: Individual valset scores for new program: [1.0, 0.3281081081081081, 0.9369369369369369, 0.9283783783783783, 0.8918918918918918, 0.8918918918918918, 0.8198198198198198, 0.3335135135135135, 0.6503287070854638, 0.45063063063063064, 0.5554054054054053, 0.4317117117117117, 0.545045045045045, 0.9549549549549549, 1.0, 0.2299099099099099, 0.8108108108108107, 0.7162162162162161, 0.42027027027027025, 0.4945945945945946, 0.5997297297297297, 0.6149058149058149, 0.8783783783783783, 0.6238738738738738, 0.6846846846846846, 0.4091891891891892, 0.5745045045045044, 0.7459459459459459, 1.0, 0.341441

Average Metric: 1.56 / 3 (52.0%): 100%|█████████████████████████████████████████████████████████| 3/3 [02:04<00:00, 41.51s/it]

2025/10/02 13:25:33 INFO dspy.evaluate.evaluate: Average Metric: 1.5605045045045045 / 3 (52.0%)



🏃 View run eval_10 at: http://localhost:5005/#/experiments/1/runs/aeb102ec7a1d4c749a049adcf581cff7
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/02 13:26:05 INFO dspy.teleprompt.gepa.gepa: Iteration 5: Proposed new text for generate_query.predict: You are an expert search strategist for a multi-hop question-answering system. Your task is to analyze a complex question and the information gathered so far, then generate the single best search query to find the next piece of information needed to answer the question.

Your goal is to devise a search plan that answers the question in the fewest steps possible. This requires creating queries that are both highly precise (to avoid irrelevant results) and efficient (to resolve the next step completely).

Follow these steps to generate your response:

1.  **Analyze the Current State:**
    *   **Deconstruct the `question`:** Identify all the entities (people, places, organizations), relationships, and constraints within the question.
    *   **Synthesize `collected_info`:** Review the facts that have already been found. Understand what parts of the original question have been re

🏃 View run eval_11 at: http://localhost:5005/#/experiments/1/runs/b524379a91004a648aaca031a2af21bb
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/02 13:33:41 INFO dspy.evaluate.evaluate: Average Metric: 19.837278745976413 / 30 (66.1%)


🏃 View run eval_12 at: http://localhost:5005/#/experiments/1/runs/90253f6f4eeb4b9d96951c92e623cdf1
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/02 13:33:42 INFO dspy.teleprompt.gepa.gepa: Iteration 5: Full valset score for new program: 0.6612426248658807
2025/10/02 13:33:42 INFO dspy.teleprompt.gepa.gepa: Iteration 5: Full train_val score for new program: 0.6612426248658807
2025/10/02 13:33:42 INFO dspy.teleprompt.gepa.gepa: Iteration 5: Individual valset scores for new program: [1.0, 0.2882882882882883, 0.9369369369369369, 0.5657657657657658, 0.8918918918918918, 0.4614054054054054, 0.8648648648648649, 0.7792792792792792, 0.5612377959354703, 0.45063063063063064, 0.42702702702702705, 0.3281081081081081, 0.545045045045045, 0.9324324324324325, 1.0, 0.9078378378378378, 0.8108108108108107, 0.7162162162162161, 0.42027027027027025, 0.4945945945945946, 0.6335135135135135, 0.6149058149058149, 0.3662162162162162, 0.6238738738738738, 0.6846846846846846, 0.4317117117117117, 0.5745045045045044, 0.7459459459459459, 1.0, 0.7792792792792792]
2025/10/02 13:33:42 INFO dspy.teleprompt.gepa.gepa: Iteration 5: New valset pareto front score

Average Metric: 2.03 / 3 (67.7%): 100%|█████████████████████████████████████████████████████████| 3/3 [01:54<00:00, 38.14s/it]

2025/10/02 13:35:37 INFO dspy.evaluate.evaluate: Average Metric: 2.031981981981982 / 3 (67.7%)



🏃 View run eval_13 at: http://localhost:5005/#/experiments/1/runs/fd2733c504b24d0ea7ddcb292a60462b
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/02 13:36:06 INFO dspy.teleprompt.gepa.gepa: Iteration 6: Proposed new text for extract_info.predict: Your task is to act as a specialized information extractor. Given a question and a set of retrieved documents, you must meticulously extract all key pieces of information that are essential for answering the question or for taking the next step in a multi-step reasoning process.

### Guiding Principles:

1.  **Deconstruct the Question:** First, analyze the question to identify the core entities (e.g., people, places, organizations), relationships, and specific details (e.g., dates, quantities, locations) it is asking about. Break the question down into the individual facts needed to construct a complete answer.

2.  **Extract Atomic and Relevant Facts:**
    *   Scan the documents for any information that directly addresses the components of the deconstructed question.
    *   Extract this information as concise, self-contained statements. For example, instead of a long paragrap

🏃 View run eval_14 at: http://localhost:5005/#/experiments/1/runs/3c8c475660f24ccba939f69ad192d503
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/02 13:38:15 INFO dspy.teleprompt.gepa.gepa: Iteration 6: New subsample score is not better, skipping
GEPA Optimization:   7%|████▏                                                      | 126/1765 [23:59<7:02:31, 15.47s/rollouts]2025/10/02 13:38:15 INFO dspy.teleprompt.gepa.gepa: Iteration 7: Selected program 2 score: 0.6612426248658807


Average Metric: 2.03 / 3 (67.7%): 100%|█████████████████████████████████████████████████████████| 3/3 [02:35<00:00, 51.82s/it]

2025/10/02 13:40:50 INFO dspy.evaluate.evaluate: Average Metric: 2.0306306306306303 / 3 (67.7%)



🏃 View run eval_15 at: http://localhost:5005/#/experiments/1/runs/4e3e8527ec694c0189b601de00e4c1de
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/02 13:41:18 INFO dspy.teleprompt.gepa.gepa: Iteration 7: Proposed new text for decide_info_collect.predict: You are an expert at determining information sufficiency. Your task is to evaluate if a collection of text snippets (`all_information`) contains all the necessary facts to definitively answer a given `question`.

Your goal is to produce a single boolean field: `has_collected_enough_info`.

**Instructions:**

1.  **Deconstruct the Question:** First, carefully break down the `question` into a logical chain of required facts. Identify all the entities, their attributes, and the relationships between them that you need to establish to reach the final answer.

2.  **Strictly Verify Each Fact:** For every single link in your logical chain, you must find explicit support within the provided `all_information`. **Do not use any external knowledge or make assumptions.** If a piece of information, even a seemingly obvious one, is not present in `all_information`, you must consider i

🏃 View run eval_16 at: http://localhost:5005/#/experiments/1/runs/c330ea55c524499f8c1e030d331e6112
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/02 13:52:04 INFO dspy.evaluate.evaluate: Average Metric: 19.20684139953907 / 30 (64.0%)


🏃 View run eval_17 at: http://localhost:5005/#/experiments/1/runs/e4311d1087234065bec1e04729c741c3
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/02 13:52:04 INFO dspy.teleprompt.gepa.gepa: Iteration 7: Full valset score for new program: 0.6402280466513024
2025/10/02 13:52:04 INFO dspy.teleprompt.gepa.gepa: Iteration 7: Full train_val score for new program: 0.6402280466513024
2025/10/02 13:52:04 INFO dspy.teleprompt.gepa.gepa: Iteration 7: Individual valset scores for new program: [1.0, 0.292972972972973, 0.9369369369369369, 0.41441441441441434, 0.6055495495495495, 0.4614054054054054, 0.8648648648648649, 0.35603603603603606, 0.5612377959354703, 0.45063063063063064, 0.42702702702702705, 0.3281081081081081, 0.545045045045045, 0.9324324324324325, 1.0, 0.9078378378378378, 0.8108108108108107, 0.7162162162162161, 0.42027027027027025, 0.4945945945945946, 0.6335135135135135, 0.8699999999999999, 0.8423423423423423, 0.6238738738738738, 0.6846846846846846, 0.4317117117117117, 0.5745045045045044, 0.7009009009009008, 1.0, 0.3189189189189189]
2025/10/02 13:52:04 INFO dspy.teleprompt.gepa.gepa: Iteration 7: New valset pareto front scor

Average Metric: 2.14 / 3 (71.4%): : 4it [04:27, 66.89s/it]                                                                    

2025/10/02 13:56:32 INFO dspy.evaluate.evaluate: Average Metric: 2.142342342342342 / 3 (71.4%)



🏃 View run eval_18 at: http://localhost:5005/#/experiments/1/runs/aa22fbd3ba14432c94b16e2e9140c229
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/02 13:57:07 INFO dspy.teleprompt.gepa.gepa: Iteration 8: Proposed new text for generate_answer.predict: You are an expert at answering multi-hop questions based on a provided set of documents. Your task is to synthesize information from multiple sources to construct a concise and accurate answer.

Follow these rules carefully:

1.  **Analyze the Question:** First, break down the multi-hop question to understand the chain of information you need to find. Identify the starting entity, the target information, and the intermediate or "bridge" entities required to connect them.

2.  **Synthesize Information:** Your answer will require connecting facts from different documents. Trace the logical path from one piece of information to the next.
    *   **Strict Grounding:** Your reasoning must be based *exclusively* on the provided documents. Do not make assumptions or logical leaps. For example, if a document states a person was "associated with" a city, you cannot conclude they were 

🏃 View run eval_19 at: http://localhost:5005/#/experiments/1/runs/213d5bb90d1c4622a33c716ccf2d6297
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/02 13:58:13 INFO dspy.teleprompt.gepa.gepa: Iteration 8: New subsample score is not better, skipping
GEPA Optimization:  10%|█████▌                                                    | 168/1765 [43:57<10:21:59, 23.37s/rollouts]2025/10/02 13:58:13 INFO dspy.teleprompt.gepa.gepa: Iteration 9: Selected program 0 score: 0.6368015048015047


Average Metric: 1.98 / 3 (66.2%): 100%|█████████████████████████████████████████████████████████| 3/3 [01:07<00:00, 22.64s/it]

2025/10/02 13:59:21 INFO dspy.evaluate.evaluate: Average Metric: 1.9849789789789791 / 3 (66.2%)



🏃 View run eval_20 at: http://localhost:5005/#/experiments/1/runs/9227af300e084085aa732bba0774c3ac
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/02 13:59:52 INFO dspy.teleprompt.gepa.gepa: Iteration 9: Proposed new text for generate_query.predict: You are an expert reasoning agent designed to solve complex, multi-hop questions. Your task is to generate the next search query needed to find a missing piece of information. You will be given the original question and a collection of information found so far.

Your process should be as follows:

1.  **Decompose the Question:** First, analyze the main question and break it down into a logical chain of smaller, dependent sub-questions or entities that need to be resolved.
2.  **Synthesize Known Information:** Review the `collected_info`. Identify which parts of your decomposed question have already been answered by this information.
3.  **Identify the Next Unresolved Step:** Pinpoint the immediate next piece of information required to move forward in the chain. Do not skip steps or make assumptions about unresolved entities. If a location, person, or date in the question is st

🏃 View run eval_21 at: http://localhost:5005/#/experiments/1/runs/64bbe47fc5b340c3bc042226e331236c
🧪 View experiment at: http://localhost:5005/#/experiments/1


In [None]:
# Persist the optimized program inside this experiment's output directory.
optimized_program.save(
    str(EXP_DIR.joinpath("optimized-program")),
    save_program=True,
)

### Examine Optimized Prompts

Let's look at how GEPA improved the prompts for each predictor:


In [None]:
# Print the instructions attached to every predictor of the optimized program.
# The loop variables are deliberately not called `name`/`pred`: `pred` holds the
# program's Prediction in other cells, and rebinding it here would break those
# cells when they are re-run after this one.
for predictor_name, predictor in optimized_program.named_predictors():
    print("=" * 60)
    print(f"Predictor: {predictor_name}")
    print("=" * 60)
    print("Optimized Instructions:")
    print(predictor.signature.instructions)
    print("*" * 60)


### Evaluate Optimized Program

Compare the performance before and after GEPA optimization:


In [None]:
# Evaluate the optimized program on `test_ds` with the plain (score-only) metric.
# The header uses a real newline escape; the previous "\\n" printed a literal
# backslash-n instead of a blank line.
print("\n📊 Evaluating OPTIMIZED program...")
optimized_evaluate = dspy.Evaluate(
    devset=test_ds,
    metric=metric,
    num_threads=8,
    display_table=False,
    display_progress=True,
)
optimized_eval_result = optimized_evaluate(optimized_program)

In [None]:
# Side-by-side summary of the original and optimized evaluation scores.
# "\n" (not "\\n") so the banner is preceded by a blank line rather than a
# literal backslash-n.
print("\n" + "=" * 50)
print("🏆 PERFORMANCE COMPARISON")
print("=" * 50)
print(f"Original Program Score:  {original_eval_result.score:.3f}")
print(f"Optimized Program Score: {optimized_eval_result.score:.3f}")
print(f"Improvement:            {optimized_eval_result.score - original_eval_result.score:+.3f}")
# A zero baseline makes the ratio undefined; report that instead of raising ZeroDivisionError.
if original_eval_result.score:
    print(f"Relative Improvement:   {((optimized_eval_result.score / original_eval_result.score) - 1) * 100:+.1f}%")
else:
    print("Relative Improvement:   n/a (original score is 0)")

### GEPA Optimization Analysis

Analyze the detailed optimization results:


In [None]:
# Analyze GEPA optimization trajectory.
# `detailed_results` is only read when the attribute exists; otherwise the
# fallback message points at track_stats=True in the GEPA constructor.
if hasattr(optimized_program, 'detailed_results'):
    results = optimized_program.detailed_results
    max_candidates_shown = 10  # cap the per-candidate listing to keep the output short

    print("🔍 GEPA Optimization Details:")
    print(f"- Total candidates explored: {len(results.candidates)}")
    print(f"- Best candidate index: {results.best_idx}")
    print(f"- Best validation score: {results.val_aggregate_scores[results.best_idx]:.3f}")
    # NOTE(review): this sums the per-candidate counts; if each entry is already a
    # cumulative budget the total is overstated — confirm against the GEPA result docs.
    print(f"- Discovery evaluations used: {sum(results.discovery_eval_counts)}")

    # Show score progression ("\n", not "\\n", so a blank line is printed
    # instead of a literal backslash-n).
    print("\n📈 Score progression:")
    for i, score in enumerate(results.val_aggregate_scores[:max_candidates_shown]):
        print(f"Candidate {i}: {score:.3f}")

    if len(results.val_aggregate_scores) > max_candidates_shown:
        print(f"... and {len(results.val_aggregate_scores) - max_candidates_shown} more candidates")
else:
    print("Detailed results not available (set track_stats=True in GEPA constructor)")


In [None]:
# Qualitative check: run the original and the optimized program on one test
# example and compare answers, retrieved/cited documents and metric feedback.
example = test_ds[3]

print(
    "🧪 Testing optimized program on example:",
    f"Question: {example.question}",
    f"Expected Answer: {example.answer}",
    f"Supporting Docs: {example.supporting_ids}",
    "",
    sep="\n",
)

# Both programs get the identical question and document pool.
pred = program(example.question, example.docs)
optimized_pred = optimized_program(example.question, example.docs)

print(
    "📋 ORIGINAL vs OPTIMIZED Results:",
    "-" * 50,
    "ORIGINAL:",
    f"  Answer: {pred.answer}",
    f"  Retrieved docs: {pred.retrieved_doc_ids}",
    f"  Cited docs: {pred.citations}",
    "OPTIMIZED:",
    f"  Answer: {optimized_pred.answer}",
    f"  Retrieved docs: {optimized_pred.retrieved_doc_ids}",
    f"  Cited docs: {optimized_pred.citations}",
    sep="\n",
)

# Score each prediction with the feedback metric and show score plus feedback text.
print("🎯 Metric Comparison:")
original_metric_result = metric_with_feedback(example, pred)
optimized_metric_result = metric_with_feedback(example, optimized_pred)
print(
    f"Original score: {original_metric_result.score:.3f}",
    f"Original feedback: {original_metric_result.feedback}",
    "",
    f"Optimized score: {optimized_metric_result.score:.3f}",
    f"Optimized feedback: {optimized_metric_result.feedback}",
    sep="\n",
)


**Open question:** Can we measure the quality of the optimized instructions by running them with a larger model and checking whether it answers more questions correctly?