In [1]:
from dotenv import load_dotenv
# Load a local .env file into the process environment before any os.getenv call
# (setup_mlflow reads MLFLOW_TRACKING_URI later in this notebook).
load_dotenv()

True

In [2]:
import os
from pathlib import Path
import datetime

import dspy
from vf_musique.data import prepare_dataset
from vf_musique.metrics import exact_match, f1
from vf_musique.rewards import extract_all_retrieved_doc_ids

In [3]:
def setup_mlflow():
    """Point MLflow at the configured tracking server and turn on DSPy autologging.

    The tracking URI is taken from the ``MLFLOW_TRACKING_URI`` environment
    variable and runs are grouped under the ``dspy-gepa-musique`` experiment.
    A confirmation line with the URI is printed at the end.
    """
    # Function-local imports: mlflow is only referenced inside this helper.
    import mlflow
    import mlflow.dspy

    tracking_uri = os.getenv("MLFLOW_TRACKING_URI")
    mlflow.set_tracking_uri(tracking_uri)
    mlflow.set_experiment("dspy-gepa-musique")

    # Log optimizer compilations, evaluations, and the traces produced while compiling.
    autolog_options = dict(
        log_compiles=True,
        log_evals=True,
        log_traces_from_compile=True,
    )
    mlflow.dspy.autolog(**autolog_options)

    print(f"✅ MLflow tracking enabled at {tracking_uri}")

setup_mlflow()

✅ MLflow tracking enabled at http://localhost:5005


In [4]:
# Run identifier: current local time at second resolution, used as the output folder name.
EXP_ID = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
# Relative path, so it resolves against the kernel's working directory.
EXP_DIR = Path(f"../outputs/dspy/gepa-musique/{EXP_ID}")
# parents/exist_ok make this cell safe to re-run.
EXP_DIR.mkdir(parents=True, exist_ok=True)
EXP_DIR

PosixPath('../outputs/dspy/gepa-musique/20250930_171202')

In [5]:
# Task LM: Qwen3-8B reached through an OpenAI-compatible endpoint on local port 8000.
# The api_key is a placeholder value ("local"), not a real credential.
lm = dspy.LM(
    "openai/Qwen/Qwen3-8B",
    temperature=0.6,
    max_tokens=8192,
    api_key="local",
    api_base="http://0.0.0.0:8000/v1",
)
# Make it the default LM for every DSPy module in this notebook.
dspy.configure(lm=lm)

# Reflection LM: a separate, larger model (Qwen3-32B on port 8001) that is handed to the
# GEPA optimizer further down; it is not installed as the default LM.
# reflection_lm = dspy.LM("gemini/gemini-2.5-pro", api_key=os.getenv("GEMINI_API_KEY"), max_tokens=16384,)
reflection_lm = dspy.LM(
    "openai/Qwen/Qwen3-32B",
    temperature=0.6,
    max_tokens=16384,
    api_key="local",
    api_base="http://0.0.0.0:8001/v1",
)

In [6]:
lm(messages=[{"role": "user", "content": "Hello"}])

['\n\nHello! 😊 How can I assist you today?']

In [7]:
reflection_lm(messages=[{"role": "user", "content": "What is largest prime number below 10?"}])

['\n\nThe largest prime number below 10 is **7**. \n\nTo determine this, we first identify all prime numbers less than 10. A prime number is a natural number greater than 1 that has no positive divisors other than 1 and itself. Evaluating the numbers from 2 to 9:\n\n- **2** is prime (divisible only by 1 and 2).\n- **3** is prime (divisible only by 1 and 3).\n- **4** is not prime (divisible by 2).\n- **5** is prime (divisible only by 1 and 5).\n- **6** is not prime (divisible by 2 and 3).\n- **7** is prime (divisible only by 1 and 7).\n- **8** is not prime (divisible by 2 and 4).\n- **9** is not prime (divisible by 3).\n\nAmong the prime numbers 2, 3, 5, and 7, the largest is **7**.\n\n---\n\n$$\n\\boxed{7}\n$$']

In [8]:
import random


def prepare_musique_dataset(datasets_str: str = "bdsaglam/musique,answerable,train", noise_rate: float = 1.0):
    """Turn a MuSiQue dataset spec into a list of ``dspy.Example`` objects.

    Args:
        datasets_str: Dataset specification, forwarded unchanged to ``prepare_dataset``.
        noise_rate: Forwarded unchanged to ``prepare_dataset``.

    Returns:
        One ``dspy.Example`` per dataset row, carrying the question, the reference
        answer(s), all documents, the IDs of the supporting documents and the hop
        count. ``question`` and ``docs`` are marked as the example's inputs.
    """

    def _row_to_example(row):
        # Everything except question/answer lives under the row's "info" mapping.
        info = row["info"]
        all_docs = info["docs"]
        return dspy.Example(
            question=row["question"],
            answer=row["answer"],
            answers=info["answers"],
            docs=all_docs,
            # A document counts as supporting when its "is_supporting" entry is truthy.
            supporting_ids=[d["id"] for d in all_docs if d.get("is_supporting")],
            n_hops=info["n_hops"],
        ).with_inputs("question", "docs")

    rows = prepare_dataset(datasets_str, noise_rate=noise_rate)
    return [_row_to_example(row) for row in rows]

# Train split of musique-mini; shuffled in place with a fixed seed so the partition below is repeatable.
ds = prepare_musique_dataset(datasets_str="bdsaglam/musique-mini,answerable,train", noise_rate=1.0)
random.Random(89).shuffle(ds)
# First 60% of the shuffled examples -> train, remaining 40% -> validation.
train_size = int(len(ds)*0.60)
train_ds, val_ds = ds[:train_size], ds[train_size:]
# Test set comes from a different split: the first 50 rows of "validation".
test_ds = prepare_musique_dataset(datasets_str="bdsaglam/musique-mini,answerable,validation[:50]", noise_rate=1.0)

# Keep only 30 examples of each; the rest of the 60/40 partition is discarded here.
train_ds = train_ds[:30]
val_ds = val_ds[:30]

Map: 100%|##########| 300/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map: 100%|##########| 50/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [9]:
from pydantic import BaseModel
from agents import RunContextWrapper
from vf_musique.tools import make_retrieve_tool, ToolContext


# Signature for the query-writing step used by MultiHopQA on every hop after the first.
# NOTE: DSPy uses a signature's docstring and the field `desc` strings as instructions for the
# LM, so the text below is runtime behaviour, not documentation; change it deliberately.
# The "1 <= top_n <= 3" range is only stated in the description; MultiHopQA.forward clamps the
# returned value itself.
class GenerateSearchQuery(dspy.Signature):
    """Given a multi-hop question and information collected so far, generate a search query
    to find the next piece of information needed to answer the question.
    Focus on entities, dates, or facts that need to be resolved step by step."""

    question: str = dspy.InputField(desc="The multi-hop question to answer")
    collected_info: str = dspy.InputField(desc="Information collected from previous retrieval steps")
    search_query: str = dspy.OutputField(desc="Search query for the next retrieval step")
    top_n: int = dspy.OutputField(desc="Number of documents to retrieve. 1 <= top_n <= 3")

# One extracted fact together with the ID of the document it was taken from.
# Used as the element type of ExtractInformation.key_informations.
# NOTE(review): no class docstring is added on purpose, since a pydantic model docstring may end
# up in the schema shown to the LM; confirm before adding one.
class KeyInformation(BaseModel):
    info: str  # the extracted statement
    source_doc_id: str  # ID of the document the statement came from

    def format(self) -> str:
        """Render the fact as ``<info>[<source_doc_id>]``."""
        return f"{self.info}[{self.source_doc_id}]"

# Signature for the extraction step: MultiHopQA calls it once per hop on the text returned by
# the retrieve tool and appends the resulting KeyInformation items to its running list.
# NOTE: the docstring and `desc` strings are instructions for the LM (runtime text).
class ExtractInformation(dspy.Signature):
    """Given a question and retrieved documents, extract the key information
    that helps answer the question or leads to the next retrieval step.
    Focus on entities, relationships, dates, and facts."""

    question: str = dspy.InputField(desc="The multi-hop question to answer")
    documents: str = dspy.InputField(desc="Retrieved documents from search")
    key_informations: list[KeyInformation] = dspy.OutputField(desc="Key information(s) extracted from retrieved document(s)")

# Signature for the stop/continue decision: MultiHopQA ends its retrieval loop as soon as
# has_collected_enough_info comes back truthy.
# NOTE(review): unlike the other signatures this one has no docstring, so it carries no
# task-specific instruction text; confirm that relying on DSPy's default is intended.
class DecideInfoCollection(dspy.Signature):
    question: str = dspy.InputField(desc="The multi-hop question to answer")
    all_information: str = dspy.InputField(desc="All information collected during retrieval")
    has_collected_enough_info: bool = dspy.OutputField(desc="Has enough information been collected to answer question?")

# Signature for the final answering step, run once after the retrieval loop.
# NOTE: the docstring and `desc` strings are instructions for the LM (runtime text).
# `citations` is typed list[str] although the example in its description shows bare numbers;
# the citation metrics convert every cited ID with str() before comparing.
class GenerateAnswer(dspy.Signature):
    """Given a multi-hop question and all collected information, provide a concise answer.
    The answer should directly address what the question asks for.
    Be specific and use the exact entities/dates/facts from the documents."""

    question: str = dspy.InputField(desc="The multi-hop question to answer")
    all_information: str = dspy.InputField(desc="All information collected during retrieval")
    answer: str = dspy.OutputField(desc="Final answer to the question")
    citations: list[str] = dspy.OutputField(desc="List of document IDs cited for the answer, e.g. `[4,9]`")


class MultiHopQA(dspy.Module):
    """Multi-hop question answering module for MuSiQue.

    Each hop retrieves documents, extracts key facts from them, and asks a
    decision predictor whether enough information has been gathered. When it
    has (or after ``max_iter`` hops) a final predictor writes the answer and
    its citations from everything collected.
    """

    def __init__(self, retriever_name: str = "hybrid", max_iter: int = 10):
        """
        Args:
            retriever_name: Retriever identifier passed to ``make_retrieve_tool``.
            max_iter: Upper bound on the number of retrieval hops per question.
        """
        super().__init__()
        self.retriever_name = retriever_name
        # Honour the caller's limit (it used to be pinned to 10 regardless of the argument).
        self.max_iter = max_iter

        # Create the retrieve tool
        self.retrieve_tool = make_retrieve_tool(retriever_name, default_top_n=2)

        # Create modules with typed signatures
        self.generate_query = dspy.ChainOfThought(GenerateSearchQuery)
        self.extract_info = dspy.ChainOfThought(ExtractInformation)
        self.decide_info_collect = dspy.ChainOfThought(DecideInfoCollection)
        self.generate_answer = dspy.ChainOfThought(GenerateAnswer)

    @staticmethod
    def _format_collected(collected_info: list) -> str:
        """Join the collected facts into one newline-separated string."""
        return "\n".join(item.format() for item in collected_info)

    def forward(self, question: str, docs: list, **kwargs) -> dspy.Prediction:
        """
        Forward pass for multi-hop QA.

        Args:
            question: The multi-hop question to answer
            docs: List of documents available for retrieval
            **kwargs: Accepted for call compatibility; not used.

        Returns:
            A ``dspy.Prediction`` with ``answer``, ``collected_info`` (the extracted
            facts), ``retrieved_doc_ids`` (unique, in first-seen order) and ``citations``.
        """
        collected_info = []
        retrieved_doc_ids = []

        # Create a context object that mimics the verifiers tool environment
        run_context_wrapper = RunContextWrapper[ToolContext](context=ToolContext(info=dict(docs=docs)))

        for hop_idx in range(self.max_iter):
            # Generate search query
            if hop_idx == 0:
                # First hop: use the original question
                query = question
                top_n = 2
            else:
                # Subsequent hops: generate query based on collected info
                query_pred = self.generate_query(
                    question=question,
                    collected_info=(
                        self._format_collected(collected_info)
                        if collected_info
                        else "No information collected yet"
                    ),
                )
                query = query_pred.search_query
                # Clamp the LM's requested count into the documented 1..3 range.
                top_n = max(min(query_pred.top_n, 3), 1)

            # Retrieve documents using the MuSiQue retrieve tool
            retrieved_text = self.retrieve_tool(run_context_wrapper, query=query, top_n=top_n)

            # Extract document IDs from retrieved text using the official function
            for doc_id in extract_all_retrieved_doc_ids(retrieved_text):
                if doc_id not in retrieved_doc_ids:
                    retrieved_doc_ids.append(doc_id)

            # Extract key information from retrieved documents
            info_pred = self.extract_info(question=question, documents=retrieved_text)
            # Tolerate a missing list so one empty extraction does not abort the whole run.
            collected_info.extend(info_pred.key_informations or [])

            decision_pred = self.decide_info_collect(
                question=question,
                all_information=self._format_collected(collected_info),
            )
            if decision_pred.has_collected_enough_info:
                break

        # Generate final answer based on all collected information
        answer_pred: dspy.Prediction = self.generate_answer(
            question=question,
            all_information=self._format_collected(collected_info),
        )

        return dspy.Prediction(
            answer=answer_pred.answer,
            collected_info=collected_info,
            retrieved_doc_ids=retrieved_doc_ids,
            citations=answer_pred.citations,
        )

program = MultiHopQA()

In [10]:
example = train_ds[3]
example

Example({'question': 'The mosaic in the church in the city where Maria Tsiartsiani was born, is known as what?', 'answer': "Christ in majesty (or Ezekiel's Vision)", 'answers': ["Christ in majesty (or Ezekiel's Vision)", "christ in majesty (or ezekiel's vision)"], 'docs': [{'body': 'In the Iconoclastic era, figural mosaics were also condemned as idolatry. The Iconoclastic churches were embellished with plain gold mosaics with only one great cross in the apse like the Hagia Irene in Constantinople (after 740). There were similar crosses in the apses of the Hagia Sophia Church in Thessaloniki and in the Church of the Dormition in Nicaea. The crosses were substituted with the image of the Theotokos in both churches after the victory of the Iconodules (787–797 and in 8th–9th centuries respectively, the Dormition church was totally destroyed in 1922).', 'id': '0', 'is_supporting': False, 'text': '# Mosaic\nIn the Iconoclastic era, figural mosaics were also condemned as idolatry. The Iconocl

In [11]:
pred = program(example.question, example.docs)
pred

Prediction(
    answer="Christ in majesty (or Ezekiel's Vision)",
    collected_info=[KeyInformation(info='Maria Tsiartsiani was born in Thessaloniki', source_doc_id='1'), KeyInformation(info="The mosaic in the Church of Hosios David in Thessaloniki is known as the 'Christ in majesty (or Ezekiel's Vision)' mosaic.", source_doc_id='5')],
    retrieved_doc_ids=['1', '2', '12', '5'],
    citations=['5']
)

In [12]:
def metric_retrieval_recall(example, pred, trace=None):
    """Share of the gold supporting documents that show up among the retrieved ones.

    Returns 1.0 when the example lists no supporting documents, otherwise
    ``|gold ∩ retrieved| / |gold|``. ``trace`` is accepted but not used.
    """
    gold_ids = set(example.supporting_ids or ())
    if not gold_ids:
        # Nothing to find, so the retrieval cannot have missed anything.
        return 1.0

    hits = gold_ids & set(pred.retrieved_doc_ids)
    return len(hits) / len(gold_ids)


def metric_retrieval_precision(example, pred, trace=None):
    """Retrieval precision metric - fraction of retrieved documents that are supporting.

    Args:
        example: Object with ``supporting_ids`` (gold document IDs).
        pred: Object with ``retrieved_doc_ids`` (IDs the program retrieved).
        trace: Accepted for metric-signature compatibility; not used.

    Returns:
        1.0 when the example has no supporting documents, 0.0 when nothing was
        retrieved, otherwise ``|gold ∩ retrieved| / |retrieved|``.
    """
    if not example.supporting_ids:
        return 1.0

    retrieved_ids = set(pred.retrieved_doc_ids)
    if not retrieved_ids:
        # Avoid dividing by zero; an empty retrieval scores 0.0, the same value
        # feedback_retrieval_precision reports for this case.
        return 0.0

    gold_ids = set(example.supporting_ids)
    found = gold_ids.intersection(retrieved_ids)
    return len(found) / len(retrieved_ids)


def metric_answer_exact_match(example, pred, trace=None):
    """Score ``pred.answer`` against ``example.answers`` with the imported ``exact_match`` helper.

    ``trace`` is accepted but not used.
    """
    predicted, references = pred.answer, example.answers
    return exact_match(predicted, references)


def metric_answer_f1_score(example, pred, trace=None):
    """Score ``pred.answer`` against ``example.answers`` with the imported ``f1`` helper.

    ``trace`` is accepted but not used.
    """
    predicted, references = pred.answer, example.answers
    return f1(predicted, references)


def metric_citation_f1(example, pred, trace=None):
    """Citation F1: harmonic mean of citation precision and recall over document IDs.

    Args:
        example: Object with ``supporting_ids`` (gold document IDs).
        pred: Object with ``citations`` (cited IDs; each is compared as ``str``).
            ``None`` is treated as "no citations".
        trace: Accepted for metric-signature compatibility; not used.

    Returns:
        F1 in [0, 1]; 0.0 when nothing was cited or no citation is correct.

    Raises:
        ValueError: If the example has no supporting document IDs.
    """
    gold_ids = set(example.supporting_ids) if example.supporting_ids else set()
    if not gold_ids:
        raise ValueError("Supporting docs must be provided for citation metric")

    # Normalise to strings so numeric IDs still match the string gold IDs.
    cited_ids = {str(doc_id) for doc_id in (pred.citations or [])}
    if not cited_ids:
        # No citations given but some needed
        return 0.0

    correct_citations = cited_ids & gold_ids
    precision = len(correct_citations) / len(cited_ids)
    recall = len(correct_citations) / len(gold_ids)
    if precision + recall == 0:
        return 0.0

    # Named citation_f1 (not f1) so the imported f1 helper is not shadowed.
    citation_f1 = (2 * precision * recall) / (precision + recall)
    return citation_f1


def metric(example, pred, trace=None):
    """Combined MuSiQue score: weighted average of four sub-metrics.

    Weights: retrieval recall 0.9, retrieval precision 0.5, answer F1 1.0,
    citation F1 0.7. The result is the weighted sum divided by the total weight.
    """
    # (sub-metric, weight) pairs, evaluated in this order.
    weighted_parts = (
        (metric_retrieval_recall, 0.9),  # completeness of retrieval
        (metric_retrieval_precision, 0.5),  # how much retrieved material was relevant
        (metric_answer_f1_score, 1.0),  # answer quality
        (metric_citation_f1, 0.7),  # attribution quality
    )

    scores = [sub_metric(example, pred, trace) for sub_metric, _ in weighted_parts]

    weighted_total = 0
    weight_total = 0
    for score, (_, weight) in zip(scores, weighted_parts):
        weighted_total += score * weight
        weight_total += weight

    return weighted_total / weight_total

In [13]:
metric(example, pred)

0.8440860215053764

In [14]:
# Evaluate original program
# Baseline: score the unoptimized `program` on the test set with the combined `metric`,
# using 8 worker threads and a progress bar but no per-example table.
print("📊 Evaluating ORIGINAL program...")
original_evaluate = dspy.Evaluate(
    devset=test_ds,
    metric=metric,
    num_threads=8,
    display_table=False,
    display_progress=True
)
original_eval_result = original_evaluate(program)

📊 Evaluating ORIGINAL program...
Average Metric: 34.81 / 50 (69.6%): : 51it [12:36, 14.84s/it]                                                                                                                                         

2025/09/30 17:26:37 INFO dspy.evaluate.evaluate: Average Metric: 34.80956083099297 / 50 (69.6%)



🏃 View run eval at: http://localhost:5005/#/experiments/1/runs/4b8d61076ae649a09876c1bb09cf8a32
🧪 View experiment at: http://localhost:5005/#/experiments/1


## GEPA Optimization

GEPA is a reflective prompt optimizer that uses textual feedback to improve performance. We'll create feedback functions for each evaluation aspect and optimize our multi-hop QA program.


In [15]:
def feedback_retrieval_recall(example, pred):
    """Generate feedback for retrieval recall evaluation.

    Args:
        example: Object with ``supporting_ids`` (gold document IDs).
        pred: Object with ``retrieved_doc_ids`` (IDs the program retrieved).

    Returns:
        ``(recall_score, feedback)``: recall is ``|gold ∩ retrieved| / |gold|``
        and feedback is a sentence describing it. When the example has no
        supporting documents the score is 1.0, the same convention as
        ``metric_retrieval_recall``, instead of dividing by zero.
    """
    gold_ids = set(example.supporting_ids or ())
    if not gold_ids:
        return 1.0, "No supporting documents to evaluate."

    retrieved_ids = set(pred.retrieved_doc_ids)
    found = gold_ids.intersection(retrieved_ids)
    recall_score = len(found) / len(gold_ids)

    if recall_score == 1.0:
        feedback = f"Perfect retrieval! You found all {len(gold_ids)} supporting documents: {sorted(found)}"
        return recall_score, feedback

    # Both remaining tiers report which gold documents were missed.
    missing_ids = gold_ids - found
    if recall_score >= 0.5:
        feedback = (
            f"Good retrieval (recall: {recall_score:.2f}). Found {len(found)} out of {len(gold_ids)} "
            f"supporting documents. Missing: {sorted(missing_ids)}. Consider refining your search queries "
            f"to find the remaining relevant documents."
        )
    else:
        feedback = (
            f"Poor retrieval (recall: {recall_score:.2f}). Only found {len(found)} out of {len(gold_ids)} "
            f"supporting documents. Missing critical documents: {sorted(missing_ids)}. "
            f"Your search queries need to be more comprehensive and targeted."
        )

    return recall_score, feedback


def feedback_retrieval_precision(example, pred):
    """Score retrieval precision and describe it in one feedback message.

    Returns ``(precision_score, feedback)`` where precision is
    ``|gold ∩ retrieved| / |retrieved|``. An empty retrieval scores 0.0. The
    message wording depends on the tier: perfect (1.0), good (>= 0.7),
    moderate (>= 0.3) or poor.
    """
    gold_ids = set(example.supporting_ids)
    retrieved_ids = set(pred.retrieved_doc_ids)
    n_retrieved = len(retrieved_ids)

    if n_retrieved == 0:
        return 0.0, "No documents were retrieved. Your search queries need to find relevant documents."

    relevant = gold_ids & retrieved_ids
    n_relevant = len(relevant)
    precision_score = n_relevant / n_retrieved

    if precision_score == 1.0:
        return precision_score, (
            f"Perfect precision! All {n_retrieved} retrieved documents are supporting documents: {sorted(relevant)}"
        )

    # Every non-perfect tier lists the retrieved documents that are not gold.
    noise = sorted(retrieved_ids - gold_ids)

    if precision_score >= 0.7:
        parts = [
            f"Good precision (precision: {precision_score:.2f}).",
            f"{n_relevant} out of {n_retrieved} retrieved documents are relevant.",
            f"Irrelevant docs: {noise}.",
            "Consider making your search queries more specific to avoid irrelevant documents.",
        ]
    elif precision_score >= 0.3:
        parts = [
            f"Moderate precision (precision: {precision_score:.2f}).",
            f"Only {n_relevant} out of {n_retrieved} retrieved documents are relevant.",
            f"Many irrelevant docs retrieved: {noise}.",
            "Your search queries are too broad - focus on more specific terms and entities.",
        ]
    else:
        parts = [
            f"Poor precision (precision: {precision_score:.2f}).",
            f"Only {n_relevant} out of {n_retrieved} retrieved documents are relevant.",
            f"Most retrieved docs are irrelevant: {noise}.",
            "Your search queries are retrieving too many irrelevant documents. Be much more specific and targeted.",
        ]

    return precision_score, " ".join(parts)


def feedback_answer_exact_match(example, pred):
    """Score the answer with ``exact_match`` and explain the outcome.

    Returns ``(em_score, feedback)``. On a miss, the message quotes the first
    entry of ``example.answers`` (or "N/A" when that list is empty).
    """
    em_score = exact_match(pred.answer, example.answers)

    if em_score == 1.0:
        return em_score, (
            f"Perfect! You provided the exact correct answer: '{pred.answer}'. This matches the expected answer exactly."
        )

    # Quote the first reference answer so the message is concrete.
    reference = example.answers[0] if example.answers else "N/A"
    message = (
        f"Your answer '{pred.answer}' doesn't exactly match the expected answer '{reference}'. "
        "Consider being more precise with entity names, dates, and specific facts."
    )
    return em_score, message


def feedback_answer_f1_score(example, pred):
    """Score the answer with the imported ``f1`` helper and explain the outcome.

    Returns ``(f1_score, feedback)``. Tiers: >= 0.9 excellent, >= 0.5 partial
    match, otherwise low overlap (that message quotes the prediction and the
    first reference answer, or "N/A" when there is none).
    """
    f1_score = f1(pred.answer, example.answers)

    if f1_score >= 0.9:
        return f1_score, (
            f"Excellent! Your answer has high overlap (F1: {f1_score:.2f}) with the expected answer. Good token-level accuracy."
        )

    if f1_score >= 0.5:
        return f1_score, (
            f"Good partial match (F1: {f1_score:.2f}). Your answer contains relevant information but "
            "could be more complete or precise. Consider including more specific details from the retrieved documents."
        )

    reference = example.answers[0] if example.answers else "N/A"
    message = (
        f"Low overlap (F1: {f1_score:.2f}) with expected answer. Your answer '{pred.answer}' "
        f"differs significantly from '{reference}'. Focus on extracting the specific information "
        "requested in the question."
    )
    return f1_score, message


def feedback_citation_f1(example, pred):
    """Generate feedback for citation F1 evaluation.

    Args:
        example: Object with ``supporting_ids`` (gold document IDs).
        pred: Object with ``citations`` (cited IDs; each is compared as ``str``).
            ``None`` is treated as "no citations".

    Returns:
        ``(citation_f1, feedback)``. Score is 1.0 when the example has no
        supporting documents and 0.0 when nothing was cited; otherwise the
        harmonic mean of citation precision and recall. The feedback names the
        correct, unnecessary and missing citations as appropriate for the tier.
    """
    gold_ids = set(example.supporting_ids) if example.supporting_ids else set()
    if not gold_ids:
        return 1.0, "No supporting documents to cite."

    # Normalise to strings so numeric IDs still match the string gold IDs.
    cited_ids = {str(doc_id) for doc_id in (pred.citations or [])}
    if not cited_ids:
        feedback = f"You didn't cite any documents, but should have cited: {sorted(gold_ids)}. Always cite the documents that support your answer."
        return 0.0, feedback

    correct_citations = cited_ids & gold_ids
    precision = len(correct_citations) / len(cited_ids)
    recall = len(correct_citations) / len(gold_ids)
    citation_f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

    if citation_f1 >= 0.9:
        feedback = f"Excellent citations (F1: {citation_f1:.2f})! You properly cited the supporting documents: {sorted(correct_citations)}"
    elif citation_f1 >= 0.5:
        # Only this tier itemises what was wrong, so the set differences are computed here.
        incorrect_citations = cited_ids - gold_ids
        missing_citations = gold_ids - cited_ids
        feedback = f"Good citations (F1: {citation_f1:.2f}). Correct: {sorted(correct_citations)}. "
        if incorrect_citations:
            feedback += f"Unnecessary: {sorted(incorrect_citations)}. "
        if missing_citations:
            feedback += f"Missing: {sorted(missing_citations)}. "
        feedback += "Be more precise about which documents actually support your answer."
    else:
        feedback = (
            f"Poor citations (F1: {citation_f1:.2f}). You cited {sorted(cited_ids)} but should cite {sorted(gold_ids)}. "
            f"Focus on identifying which documents directly support your answer claims."
        )

    return citation_f1, feedback


def metric_with_feedback(example, pred, trace=None, pred_name=None, pred_trace=None):
    """
    Feedback-producing variant of the combined MuSiQue metric, for GEPA.

    Computes answer F1, retrieval recall, retrieval precision and citation F1
    (weights 1.0 / 0.9 / 0.5 / 0.7) and returns a ``dspy.Prediction`` with
    ``score`` (their weighted average) and ``feedback`` (text).

    The feedback text depends on ``pred_name``: the query, extraction and
    answer predictors each get the sub-metric messages relevant to them plus a
    short piece of advice; any other value gets a breakdown of all four.
    ``trace`` and ``pred_trace`` are accepted but not used.
    """
    # Sub-metrics are evaluated in this fixed order.
    answer_score, answer_fb = feedback_answer_f1_score(example, pred)
    recall_score, recall_fb = feedback_retrieval_recall(example, pred)
    precision_score, precision_fb = feedback_retrieval_precision(example, pred)
    citation_score, citation_fb = feedback_citation_f1(example, pred)

    # Weighted average of the four scores.
    weighted_scores = (
        (answer_score, 1.0),
        (recall_score, 0.9),
        (precision_score, 0.5),
        (citation_score, 0.7),
    )
    weighted_total = 0
    weight_total = 0
    for score, weight in weighted_scores:
        weighted_total += score * weight
        weight_total += weight
    total_score = weighted_total / weight_total

    # Feedback tailored to the individual predictors GEPA may be optimizing.
    targeted_feedback = {
        "generate_query.predict": " ".join([
            recall_fb,
            precision_fb,
            "Your search queries should be both comprehensive (high recall) and specific (high precision). "
            "Consider what entities, relationships, or facts are needed for each hop of reasoning.",
        ]),
        "extract_info.predict": " ".join([
            answer_fb,
            "Focus on extracting the most relevant facts, entities, and relationships from the retrieved documents. "
            "Make sure to capture information that directly helps answer the question or leads to the next reasoning step.",
        ]),
        "generate_answer.predict": " ".join([
            answer_fb,
            citation_fb,
            "Provide precise, complete answers using the exact information from the retrieved documents. "
            "Always cite the document IDs that support your answer claims.",
        ]),
    }

    # Fallback when no (or an unrecognised) predictor name is given.
    overall_feedback = "\n".join([
        "Overall performance breakdown:",
        f"- Answer F1 Score: {answer_fb}",
        f"- Retrieval Recall: {recall_fb}",
        f"- Retrieval Precision: {precision_fb}",
        f"- Citations F1 Score: {citation_fb}",
    ])

    feedback = targeted_feedback.get(pred_name, overall_feedback)
    return dspy.Prediction(score=total_score, feedback=feedback)


In [16]:
# Test the feedback metric on our example
feedback_result = metric_with_feedback(example, pred)
print(f"Score: {feedback_result.score:.3f}")
print(f"Feedback: {feedback_result.feedback}")


Score: 0.844
Feedback: Overall performance breakdown:
- Answer F1 Score: Excellent! Your answer has high overlap (F1: 1.00) with the expected answer. Good token-level accuracy.
- Retrieval Recall: Perfect retrieval! You found all 2 supporting documents: ['1', '5']
- Retrieval Precision: Moderate precision (precision: 0.50). Only 2 out of 4 retrieved documents are relevant. Many irrelevant docs retrieved: ['12', '2']. Your search queries are too broad - focus on more specific terms and entities.
- Citations F1 Score: Good citations (F1: 0.67). Correct: ['5']. Missing: ['1']. Be more precise about which documents actually support your answer.


In [17]:
from dspy import GEPA

# Build the GEPA optimizer. `metric_with_feedback` returns a dspy.Prediction
# carrying both a numeric `score` and a textual `feedback`; `reflection_lm` is
# the separate (larger) model configured earlier in the notebook.
optimizer = GEPA(
    metric=metric_with_feedback,
    auto="light",  # Use light budget for faster experimentation. Use "heavy" for best performance
    num_threads=8,
    track_stats=True,
    use_merge=False,
    reflection_lm=reflection_lm,
    # The optimization run takes many hours, so persist GEPA's logs and
    # candidate programs in this experiment's output directory instead of
    # keeping them only in kernel memory. Per the GEPA docs, calling compile
    # again with the same log_dir resumes from the last checkpoint.
    log_dir=str(EXP_DIR),
)

print("✅ GEPA optimizer configured")

✅ GEPA optimizer configured


In [None]:
# Launch the GEPA optimization of `program`: candidates are evolved on
# `train_ds` and scored on `val_ds`. This is the long-running cell of the notebook.
print("🚀 Starting GEPA optimization...")

optimized_program = optimizer.compile(program, trainset=train_ds, valset=val_ds)

print("✅ GEPA optimization completed!")


2025/09/30 17:26:38 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '72261987ca3342a2a49dd381825e0dce', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current dspy workflow
2025/09/30 17:26:38 INFO dspy.teleprompt.gepa.gepa: Running GEPA for approx 1765 metric calls of the program. This amounts to 29.42 full evals on the train+val set.
2025/09/30 17:26:38 INFO dspy.teleprompt.gepa.gepa: Using 30 examples for tracking Pareto scores. You can consider using a smaller sample of the valset to allow GEPA to explore more diverse solutions within the same budget.


🚀 Starting GEPA optimization...


GEPA Optimization:   0%|                                                                                                                                               | 0/1765 [00:00<?, ?rollouts/s]2025/09/30 17:42:02 INFO dspy.evaluate.evaluate: Average Metric: 17.871924849344207 / 30 (59.6%)


🏃 View run eval_0 at: http://localhost:5005/#/experiments/1/runs/73771306fb7240a7a3f0a3338e436b5f
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/09/30 17:42:02 INFO dspy.teleprompt.gepa.gepa: Iteration 0: Base program full valset score: 0.5957308283114735
GEPA Optimization:   2%|██▏                                                                                                                                | 30/1765 [15:24<14:51:08, 30.82s/rollouts]2025/09/30 17:42:02 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Selected program 0 score: 0.5957308283114735


Average Metric: 2.40 / 3 (80.0%): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [02:17<00:00, 45.93s/it]

2025/09/30 17:44:21 INFO dspy.evaluate.evaluate: Average Metric: 2.399462365591398 / 3 (80.0%)



🏃 View run eval_1 at: http://localhost:5005/#/experiments/1/runs/8d638260df954da28d57a576c538c532
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/09/30 17:45:03 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Proposed new text for generate_query.predict: Given a multi-hop question and collected information, generate a search query to find the next piece of information needed to answer the question. Focus on **specific entities, relationships, or facts** that require step-by-step resolution. Prioritize **precision and recall** by:  

1. **Breaking down the question into logical hops**: Identify intermediate entities, dates, or facts that must be resolved (e.g., "Church of St. Demetrios in Thessaloniki" instead of generic terms like "church in Thessaloniki").  
2. **Leveraging collected info**: Use explicitly stated entities (e.g., "UN Command," "Khabarovsk") and their relationships to narrow queries.  
3. **Anticipating ambiguity**: If terms like "regions" or "mosaics" are vague, consider standard classifications (e.g., "sovereign states in Asia" or "famous mosaics in Church of St. Demetrios").  
4. **Refining queries for speci

🏃 View run eval_2 at: http://localhost:5005/#/experiments/1/runs/a907004d769c43a5898c71d9cd35d484
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/09/30 17:49:40 INFO dspy.teleprompt.gepa.gepa: Iteration 1: New subsample score is not better, skipping
GEPA Optimization:   2%|██▋                                                                                                                                | 36/1765 [23:01<19:38:46, 40.91s/rollouts]2025/09/30 17:49:40 INFO dspy.teleprompt.gepa.gepa: Iteration 2: Selected program 0 score: 0.5957308283114735


Average Metric: 1.97 / 3 (65.7%): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [02:19<00:00, 46.53s/it]

2025/09/30 17:52:00 INFO dspy.evaluate.evaluate: Average Metric: 1.9704301075268817 / 3 (65.7%)



🏃 View run eval_3 at: http://localhost:5005/#/experiments/1/runs/65f047c5c35b48459bc5263bd575cb2c
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/09/30 17:52:57 INFO dspy.teleprompt.gepa.gepa: Iteration 2: Proposed new text for extract_info.predict: markdown
### New Instruction for the Assistant

**Task Description:**  
You are to extract **precise and actionable key information** from provided documents that either directly answers a question or provides entities/relationships needed for the next retrieval step. Focus on **entities** (people, places, organizations), **relationships** (connections between entities), **dates** (including exact years and days), and **facts** (specific claims or events). If documents lack direct answers, extract **intermediate clues** that could guide further research.

---

**Critical Guidelines:**  
1. **Extract All Relevant Entities**  
   - Even if a document does not directly answer the question, extract **any entity** (e.g., locations, names, organizations) that could be connected to the question.  
   - Example: If a question involves a president’s work location, extract all locations m

🏃 View run eval_4 at: http://localhost:5005/#/experiments/1/runs/9d2301f4f4fd4b2d80254ade125a7700
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/09/30 18:17:41 INFO dspy.evaluate.evaluate: Average Metric: 17.00138248847926 / 30 (56.7%)


🏃 View run eval_5 at: http://localhost:5005/#/experiments/1/runs/c8198af2a1164ff28139fe078d967b3a
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/09/30 18:17:41 INFO dspy.teleprompt.gepa.gepa: Iteration 2: Full valset score for new program: 0.5667127496159754
2025/09/30 18:17:41 INFO dspy.teleprompt.gepa.gepa: Iteration 2: Full train_val score for new program: 0.5667127496159754
2025/09/30 18:17:41 INFO dspy.teleprompt.gepa.gepa: Iteration 2: Individual valset scores for new program: [0.8870967741935484, 0.26774193548387093, 0.9247311827956989, 0.15053763440860216, 0.521505376344086, 0.8709677419354839, 0.8387096774193549, 0.7365591397849461, 0.46159754224270355, 0.3010752688172043, 0.4569892473118279, 0.3010752688172043, 0.39677419354838706, 0.2258064516129032, 0.9354838709677419, 0.8225806451612903, 0.7365591397849461, 0.6612903225806451, 0.30806451612903224, 0.39677419354838706, 0.564516129032258, 0.8225806451612903, 0.5903225806451613, 0.5510752688172043, 0.39784946236559143, 0.3010752688172043, 0.1774193548387097, 0.643010752688172, 1.0, 0.7516129032258064]
2025/09/30 18:17:41 INFO dspy.teleprompt.gepa.gepa: Iteration 

Average Metric: 2.12 / 3 (70.8%): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [01:54<00:00, 38.05s/it]

2025/09/30 18:19:36 INFO dspy.evaluate.evaluate: Average Metric: 2.1236559139784945 / 3 (70.8%)



🏃 View run eval_6 at: http://localhost:5005/#/experiments/1/runs/273b1735ba2f42daa7a0983c644f8eb4
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/09/30 18:20:35 INFO dspy.teleprompt.gepa.gepa: Iteration 3: Proposed new text for decide_info_collect.predict: text
**Instruction for the Assistant:**

Given a `question` and `all_information`, your task is to determine whether the provided information contains **sufficient, relevant, and accurate details** to answer the question. Follow these steps to ensure correctness:

1. **Analyze the Question:**
   - Identify key entities, dates, and relationships (e.g., "father of the artist," "Cameroon's main ally in 1306," "largest peacekeeping troop contributor").
   - Note any historical, geopolitical, or contextual nuances (e.g., Cameroon as a modern nation did not exist in 1306; Ethiopia's role as a peacekeeping contributor).

2. **Extract and Validate Information:**
   - Use `all_information` to locate **directly relevant facts**. For example:
     - If the question asks about a person’s parent, ensure the provided information explicitly states the parent-child relationship (e.g., "J

🏃 View run eval_7 at: http://localhost:5005/#/experiments/1/runs/ee4fc67d8d5a4e17bbfa1063963cf8f5
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/09/30 18:20:53 INFO dspy.teleprompt.gepa.gepa: Iteration 3: New subsample score is not better, skipping
GEPA Optimization:   4%|█████▊                                                                                                                             | 78/1765 [54:14<20:08:20, 42.98s/rollouts]2025/09/30 18:20:53 INFO dspy.teleprompt.gepa.gepa: Iteration 4: Selected program 1 score: 0.5667127496159754


Average Metric: 2.22 / 3 (74.1%): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [02:25<00:00, 48.35s/it]

2025/09/30 18:23:18 INFO dspy.evaluate.evaluate: Average Metric: 2.2236559139784946 / 3 (74.1%)



🏃 View run eval_8 at: http://localhost:5005/#/experiments/1/runs/cbbe88529fd94b9e896d68b729f7e669
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/09/30 18:24:13 INFO dspy.teleprompt.gepa.gepa: Iteration 4: Proposed new text for generate_answer.predict: Given a multi-hop question and a set of retrieved documents, provide a concise, precise answer that directly addresses the question using exact entities, dates, or facts from the documents. Follow these steps:  

1. **Answer the question explicitly**: Focus solely on the specific information requested (e.g., a number, name, or entity) without adding extraneous context or speculation.  
2. **Chain reasoning with citations**: Use a logical multi-hop approach to connect facts from the documents. For *every* claim or step in your reasoning, cite the exact document ID(s) that support it.  
3. **Include *all* supporting citations**: If a document contributes to the reasoning (even indirectly), cite it. For example:  
   - If the question requires connecting "X" and "Y," and documents [A] and [B] are used to establish the link, cite both.  
   - If a document provides context for a 

🏃 View run eval_9 at: http://localhost:5005/#/experiments/1/runs/fb4a0ee0149f4c3c8f07968c6666725a
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/09/30 18:26:15 INFO dspy.evaluate.evaluate: Average Metric: 17.526497695852534 / 30 (58.4%)


🏃 View run eval_10 at: http://localhost:5005/#/experiments/1/runs/7aefc217db0442f0964c1690a3406881
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/09/30 18:26:16 INFO dspy.teleprompt.gepa.gepa: Iteration 4: Full valset score for new program: 0.5842165898617512
2025/09/30 18:26:16 INFO dspy.teleprompt.gepa.gepa: Iteration 4: Full train_val score for new program: 0.5842165898617512
2025/09/30 18:26:16 INFO dspy.teleprompt.gepa.gepa: Iteration 4: Individual valset scores for new program: [0.9548387096774194, 0.26774193548387093, 0.9247311827956989, 0.2408602150537634, 0.3709677419354838, 0.39784946236559143, 0.7849462365591396, 0.7365591397849461, 0.46774193548387094, 0.4139784946236559, 0.4569892473118279, 0.3913978494623656, 0.4569892473118279, 0.2258064516129032, 1.0, 0.8903225806451612, 0.7741935483870968, 0.6612903225806451, 0.30806451612903224, 0.39677419354838706, 0.6370967741935484, 0.8903225806451612, 0.48924731182795694, 0.5510752688172043, 0.510752688172043, 0.3913978494623656, 0.26774193548387093, 0.8119815668202764, 1.0, 0.8548387096774193]
2025/09/30 18:26:16 INFO dspy.teleprompt.gepa.gepa: Iteration 4: New valset

Average Metric: 2.36 / 3 (78.6%): : 4it [07:08, 107.21s/it]                                                                                                                                           

2025/09/30 18:33:25 INFO dspy.evaluate.evaluate: Average Metric: 2.358064516129032 / 3 (78.6%)



🏃 View run eval_11 at: http://localhost:5005/#/experiments/1/runs/60c53d5c2a7c465bb2df1ab352b10611
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/09/30 18:34:31 INFO dspy.teleprompt.gepa.gepa: Iteration 5: Proposed new text for generate_query.predict: Given a multi-hop question and partial collected information, generate a precise and focused search query to retrieve the next critical piece of information required to answer the question. Follow these guidelines:  

1. **Entity Disambiguation**:  
   - Explicitly distinguish between similar-sounding or related entities (e.g., "Democratic Republic of the Congo" vs. "Republic of the Congo").  
   - Verify the exact name of countries, people, or concepts mentioned in the question (e.g., clarify "Purmerbuurt" as part of the Netherlands).  

2. **Multi-Hop Reasoning**:  
   - Identify the next logical step in the reasoning chain. For example:  
     - If the question requires a leader of a country, ensure the query specifies the **exact country name** and **timeframe** (e.g., "First president of the Democratic Republic of the Congo after independence").  
     - If the question i

🏃 View run eval_12 at: http://localhost:5005/#/experiments/1/runs/6256e40b8020477cb2952070b5dea768
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/09/30 18:39:57 INFO dspy.teleprompt.gepa.gepa: Iteration 5: New subsample score is not better, skipping
GEPA Optimization:   7%|████████▋                                                                                                                       | 120/1765 [1:13:19<17:06:26, 37.44s/rollouts]2025/09/30 18:39:57 INFO dspy.teleprompt.gepa.gepa: Iteration 6: Selected program 1 score: 0.5667127496159754


  0%|                                                                                                                                                                           | 0/3 [00:00<?, ?it/s]



Average Metric: 1.48 / 2 (74.1%):  67%|██████████████████████████████████████████████████████████████████████████████████████                                           | 2/3 [01:53<00:58, 58.79s/it]



Average Metric: 2.17 / 3 (72.3%): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [03:36<00:00, 72.14s/it]

2025/09/30 18:43:34 INFO dspy.evaluate.evaluate: Average Metric: 2.168817204301075 / 3 (72.3%)



🏃 View run eval_13 at: http://localhost:5005/#/experiments/1/runs/8d331aa8dd3440ababb174ba357d04e7
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/09/30 18:44:42 INFO dspy.teleprompt.gepa.gepa: Iteration 6: Proposed new text for extract_info.predict: markdown
### Revised Instruction for the Assistant

**Task Description:**  
You are to extract **precise, actionable, and unambiguous key information** from documents to either directly answer a question or provide **entities/relationships** critical for subsequent retrieval steps. Focus on **entities** (people, places, organizations), **relationships** (explicit or implied connections), **dates** (exact day, month, year), and **facts** (specific claims or events). If documents lack direct answers, extract **intermediate clues** (e.g., partial entities, contextual relationships) that could guide further research. Avoid speculative reasoning but document all relevant leads.

---

**Critical Guidelines:**  
1. **Extract All Entities with Context**  
   - Extract **all entities** (names, locations, organizations) even if their connection to the question is indirect.  
   - Example:

🏃 View run eval_14 at: http://localhost:5005/#/experiments/1/runs/13f587cb91c24cb183616d8279a4a968
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/09/30 18:47:27 INFO dspy.teleprompt.gepa.gepa: Iteration 6: New subsample score is not better, skipping
GEPA Optimization:   7%|█████████▏                                                                                                                      | 126/1765 [1:20:49<19:25:53, 42.68s/rollouts]2025/09/30 18:47:27 INFO dspy.teleprompt.gepa.gepa: Iteration 7: Selected program 0 score: 0.5957308283114735


Average Metric: 2.70 / 3 (89.9%): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [04:59<00:00, 99.92s/it]

2025/09/30 18:52:28 INFO dspy.evaluate.evaluate: Average Metric: 2.696236559139785 / 3 (89.9%)



🏃 View run eval_15 at: http://localhost:5005/#/experiments/1/runs/c4158cae690d4a25b71cbec445d2182f
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/09/30 18:53:27 INFO dspy.teleprompt.gepa.gepa: Iteration 7: Proposed new text for decide_info_collect.predict: You are to determine whether the `all_information` provided contains sufficient details to answer the `question`. Your task is to:  

1. **Parse the Question**:  
   - Break the question into logical components (e.g., identifying entities, relationships, or specific data points).  
   - Map each component to the corresponding information in `all_information`.  

2. **Analyze `all_information`**:  
   - Check for **explicit mentions** of entities, locations, dates, or relationships required to answer the question.  
   - Resolve **conflicts** (e.g., conflicting headquarters locations in Example 1) by prioritizing the most directly relevant or unambiguous information.  
   - Ensure **citations** are used precisely:  
     - Only cite documents that **directly support** a specific claim in your reasoning.  
     - Avoid citing irrelevant documents (e.g., [16] in Example 3). 

🏃 View run eval_16 at: http://localhost:5005/#/experiments/1/runs/0ef2084129c844c3851daf02b0922bb3
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/09/30 19:07:54 INFO dspy.evaluate.evaluate: Average Metric: 18.298268935365712 / 30 (61.0%)


🏃 View run eval_17 at: http://localhost:5005/#/experiments/1/runs/3bc18299bd284239b5271353ca6e5f82
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/09/30 19:07:55 INFO dspy.teleprompt.gepa.gepa: Iteration 7: New program is on the linear pareto front
2025/09/30 19:07:55 INFO dspy.teleprompt.gepa.gepa: Iteration 7: Full valset score for new program: 0.6099422978455237
2025/09/30 19:07:55 INFO dspy.teleprompt.gepa.gepa: Iteration 7: Full train_val score for new program: 0.6099422978455237
2025/09/30 19:07:55 INFO dspy.teleprompt.gepa.gepa: Iteration 7: Individual valset scores for new program: [0.9548387096774194, 0.26774193548387093, 0.9247311827956989, 0.8467741935483871, 0.521505376344086, 0.8709677419354839, 0.7311827956989246, 0.8225806451612903, 0.6069124423963134, 0.4139784946236559, 0.5096774193548387, 0.3913978494623656, 0.4693548387096774, 0.9462365591397849, 0.9354838709677419, 0.15053763440860216, 0.7365591397849461, 0.5483870967741935, 0.3, 0.4032258064516129, 0.5919354838709677, 0.7666666666666666, 0.8118279569892473, 0.5510752688172043, 0.5483870967741935, 0.4032258064516129, 0.41129032258064513, 0.684367245657568

Average Metric: 1.29 / 3 (42.9%): : 4it [08:31, 127.95s/it]                                                                                                                                           

2025/09/30 19:16:27 INFO dspy.evaluate.evaluate: Average Metric: 1.2881720430107526 / 3 (42.9%)



🏃 View run eval_18 at: http://localhost:5005/#/experiments/1/runs/98a74a2cf0f040f9b6f6faee11fb6546
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/09/30 19:17:23 INFO dspy.teleprompt.gepa.gepa: Iteration 8: Proposed new text for decide_info_collect.predict: You are tasked with determining whether the provided `all_information` contains sufficient details to answer the given `question`. Your output must include two fields:  
1. **`reasoning`**: A detailed explanation of whether the information is sufficient. Specifically, you must:  
   - Identify *all required components* of the question (e.g., specific entities, locations, dates, relationships, or numerical thresholds).  
   - Check if each component is explicitly stated in `all_information`, including:  
     - The identity of ambiguous terms (e.g., "Starting Time performer" in Example 1).  
     - Geographical or historical context (e.g., linking "Jacobinism" to Europe in Example 3).  
     - Numerical or comparative data (e.g., distances, population figures).  
   - Avoid assumptions or inferences beyond the explicitly stated information.  
   - Explicitly note *missing 

🏃 View run eval_19 at: http://localhost:5005/#/experiments/1/runs/eae8690d14354fd8855d06dd2d884c1a
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/09/30 19:20:39 INFO dspy.teleprompt.gepa.gepa: Iteration 8: New subsample score is not better, skipping
GEPA Optimization:  10%|████████████▏                                                                                                                   | 168/1765 [1:54:00<21:24:18, 48.25s/rollouts]2025/09/30 19:20:39 INFO dspy.teleprompt.gepa.gepa: Iteration 9: Selected program 3 score: 0.6099422978455237


Average Metric: 2.11 / 3 (70.2%): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [03:38<00:00, 72.81s/it]

2025/09/30 19:24:17 INFO dspy.evaluate.evaluate: Average Metric: 2.106989247311828 / 3 (70.2%)



🏃 View run eval_20 at: http://localhost:5005/#/experiments/1/runs/dfeff92084e24345a960d0beecb95ddc
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/09/30 19:25:21 INFO dspy.teleprompt.gepa.gepa: Iteration 9: Proposed new text for generate_answer.predict: Given a multi-hop question and a set of retrieved documents, provide a concise answer that directly addresses the question using **exact entities, dates, and facts from the documents**. Follow these steps:  

1. **Chain reasoning**: Use all relevant documents to logically connect entities, locations, and events.  
   - Example: If the question links two entities (e.g., "Al-Mu'tamid's successor" and "Al-Qanjarah"), explicitly reference documents that establish their relationships and locations.  

2. **Cite comprehensively**: Include **all document IDs** that directly or indirectly support your answer, even if they are redundant or repeated.  
   - Example: If a document confirms a location (e.g., [10] for Al-Qanjarah in Syria) and another confirms an event (e.g., [3] for the 634 CE invasion), cite both.  
   - Example: If a document is used in reasoning (e.g., [2] for Al-Mu't

🏃 View run eval_21 at: http://localhost:5005/#/experiments/1/runs/59df1079b80a49a180dbfd76c50332f0
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/09/30 19:26:37 INFO dspy.teleprompt.gepa.gepa: Iteration 9: New subsample score is not better, skipping
GEPA Optimization:  10%|████████████▌                                                                                                                   | 174/1765 [1:59:58<22:02:08, 49.86s/rollouts]2025/09/30 19:26:37 INFO dspy.teleprompt.gepa.gepa: Iteration 10: Selected program 3 score: 0.6099422978455237


Average Metric: 1.99 / 3 (66.3%): : 4it [03:51, 57.75s/it]                                                                                                                                            

2025/09/30 19:30:28 INFO dspy.evaluate.evaluate: Average Metric: 1.9882488479262672 / 3 (66.3%)



🏃 View run eval_22 at: http://localhost:5005/#/experiments/1/runs/9126a2ba4f4a4de4a9845fc948f750f2
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/09/30 19:31:35 INFO dspy.teleprompt.gepa.gepa: Iteration 10: Proposed new text for generate_query.predict: Given a multi-hop question and partial information collected so far, generate a **specific and comprehensive search query** to resolve the next unresolved entity, date, or factual relationship required to answer the question.  

**Key Requirements:**  
1. **Break down the question into reasoning steps**: Identify the exact entities, relationships, or facts that must be resolved in sequence. For example:  
   - If the question involves a chain of events (e.g., "X did Y to Z"), ensure each link in the chain is addressed step-by-step.  
   - Prioritize resolving ambiguous or missing entities (e.g., "country X" → specify the exact name once partially known).  

2. **Use precise terms and entities from collected info**:  
   - Incorporate **exact names** of countries, historical periods, or specific events (e.g., "Portuguese colonial period in Brazil" instead of "colonial era").  

🏃 View run eval_23 at: http://localhost:5005/#/experiments/1/runs/04c56dd87b504fcaa31ddb2c421add03
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/09/30 19:42:21 INFO dspy.teleprompt.gepa.gepa: Iteration 10: New subsample score is not better, skipping
GEPA Optimization:  10%|█████████████                                                                                                                   | 180/1765 [2:15:43<29:52:46, 67.87s/rollouts]2025/09/30 19:42:21 INFO dspy.teleprompt.gepa.gepa: Iteration 11: Selected program 3 score: 0.6099422978455237


Average Metric: 1.57 / 3 (52.4%): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [02:29<00:00, 49.80s/it]

2025/09/30 19:44:51 INFO dspy.evaluate.evaluate: Average Metric: 1.5709677419354837 / 3 (52.4%)



🏃 View run eval_24 at: http://localhost:5005/#/experiments/1/runs/db72dc7dabae43048a80bf54e3432397
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/09/30 19:46:04 INFO dspy.teleprompt.gepa.gepa: Iteration 11: Proposed new text for extract_info.predict: text
Given a question and retrieved documents, extract the most relevant and precise information that directly answers the question or provides critical context for the next retrieval step. Focus on the following:

1. **Entities**: Identify all named entities (people, places, organizations, events) explicitly mentioned in the documents. If the question references an entity not directly mentioned in the documents, note this explicitly but do not infer or assume connections.

2. **Relationships**: Extract relationships between entities (e.g., "X was the successor of Y," "Z occurred in Location A"). Ensure these relationships are explicitly stated in the documents.

3. **Dates and Events**: Capture exact dates, timeframes, and event names (e.g., "634 CE," "October 29, 2012"). If a document mentions a date that could answer the question, extract it verbatim, even if additional cont

🏃 View run eval_25 at: http://localhost:5005/#/experiments/1/runs/eb99e082c11c48e4bd31ef57f5ee7c53
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/09/30 19:48:31 INFO dspy.teleprompt.gepa.gepa: Iteration 11: New subsample score is not better, skipping
GEPA Optimization:  11%|█████████████▍                                                                                                                  | 186/1765 [2:21:53<29:14:36, 66.67s/rollouts]2025/09/30 19:48:31 INFO dspy.teleprompt.gepa.gepa: Iteration 12: Selected program 3 score: 0.6099422978455237


Average Metric: 1.28 / 3 (42.8%): : 5it [07:39, 91.84s/it]                                                                                                                                            

2025/09/30 19:56:11 INFO dspy.evaluate.evaluate: Average Metric: 1.2838709677419353 / 3 (42.8%)



🏃 View run eval_26 at: http://localhost:5005/#/experiments/1/runs/e31ac37910cf45b6a2b7bc4e2709528f
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/09/30 19:57:42 INFO dspy.teleprompt.gepa.gepa: Iteration 12: Proposed new text for decide_info_collect.predict: **Revised Instructions for the Assistant**  

You are to determine whether the `all_information` provided contains sufficient details to answer the `question`. Follow these steps **rigorously**, ensuring precision in citations and logical connections:  

---

### **1. Parse the Question with Explicit Mapping**  
- **Break the question into discrete components**:  
  - Identify **entities** (e.g., organizations, people, locations), **relationships** (e.g., "authorized troops," "place of birth"), and **data points** (e.g., "square miles," "number of regions").  
  - For example, if the question is *"How many regions in Asia does the UN Command recognize?"*, the components are:  
    - **Entity 1**: UN Command (action: authorized troops).  
    - **Entity 2**: Nikifor Popov’s birthplace (location: Khabarovsk, Asia).  
    - **Data Point**: Number of regions recognized by th

🏃 View run eval_27 at: http://localhost:5005/#/experiments/1/runs/05654b92c7704b89b4ef46fb885a55c4
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/09/30 20:09:50 INFO dspy.evaluate.evaluate: Average Metric: 18.05010240655402 / 30 (60.2%)


🏃 View run eval_28 at: http://localhost:5005/#/experiments/1/runs/4d99e98678c7466290804b71612e9f78
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/09/30 20:09:51 INFO dspy.teleprompt.gepa.gepa: Iteration 12: Full valset score for new program: 0.6016700802184672
2025/09/30 20:09:51 INFO dspy.teleprompt.gepa.gepa: Iteration 12: Full train_val score for new program: 0.6016700802184672
2025/09/30 20:09:51 INFO dspy.teleprompt.gepa.gepa: Iteration 12: Individual valset scores for new program: [0.9548387096774194, 0.15053763440860216, 0.9247311827956989, 0.8467741935483871, 0.521505376344086, 0.8709677419354839, 0.7311827956989246, 0.8225806451612903, 0.6069124423963134, 0.4139784946236559, 0.5096774193548387, 0.3913978494623656, 0.4693548387096774, 0.9462365591397849, 0.9354838709677419, 0.15053763440860216, 0.7365591397849461, 0.5483870967741935, 0.3, 0.4032258064516129, 0.5919354838709677, 0.7666666666666666, 0.8118279569892473, 0.5510752688172043, 0.5483870967741935, 0.4032258064516129, 0.41129032258064513, 0.5534050179211469, 0.9247311827956989, 0.25268817204301075]
2025/09/30 20:09:51 INFO dspy.teleprompt.gepa.gepa: Iteratio

Average Metric: 2.33 / 3 (77.7%): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [02:40<00:00, 53.47s/it]

2025/09/30 20:12:31 INFO dspy.evaluate.evaluate: Average Metric: 2.3311827956989246 / 3 (77.7%)



🏃 View run eval_29 at: http://localhost:5005/#/experiments/1/runs/acd104be4ce541338a1563a498562ee0
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/09/30 20:13:26 INFO dspy.teleprompt.gepa.gepa: Iteration 13: Proposed new text for generate_answer.predict: You are to answer multi-hop questions by synthesizing information from provided documents. Follow these steps rigorously:  

1. **Break down the question into subcomponents**:  
   - Identify all entities, relationships, and required connections (e.g., "torch visit" → "city" → "World Bank regional office").  
   - Map each component to the exact entities/dates/facts in the documents.  

2. **Prioritize document accuracy and completeness**:  
   - Use **only the exact entities/dates/facts** from the provided `all_information`.  
   - If documents conflict (e.g., R&R Partners headquarters in Las Vegas [3] vs. San Francisco [8]), select the document most directly tied to the answer (e.g., [8] in Example 3).  
   - **Do not infer or assume** relationships not explicitly stated in the documents (e.g., avoid conflating Turkey with Ottoman dynasty unless explicitly linked in the do

🏃 View run eval_30 at: http://localhost:5005/#/experiments/1/runs/31d7439fc0334c8c9431b9ad09823976
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/09/30 20:19:38 INFO dspy.evaluate.evaluate: Average Metric: 18.9194234728895 / 30 (63.1%)


🏃 View run eval_31 at: http://localhost:5005/#/experiments/1/runs/ad87ffc6268b4fd58b834b116d8b8ad2
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/09/30 20:19:38 INFO dspy.teleprompt.gepa.gepa: Iteration 13: New program is on the linear pareto front
2025/09/30 20:19:38 INFO dspy.teleprompt.gepa.gepa: Iteration 13: Full valset score for new program: 0.6306474490963166
2025/09/30 20:19:38 INFO dspy.teleprompt.gepa.gepa: Iteration 13: Full train_val score for new program: 0.6306474490963166
2025/09/30 20:19:38 INFO dspy.teleprompt.gepa.gepa: Iteration 13: Individual valset scores for new program: [1.0, 0.26774193548387093, 0.9247311827956989, 0.914516129032258, 0.5967741935483871, 0.8709677419354839, 0.7849462365591396, 0.8225806451612903, 0.6065202470830473, 0.3010752688172043, 0.4946236559139785, 0.4139784946236559, 0.4693548387096774, 0.9462365591397849, 1.0, 0.15053763440860216, 0.7741935483870968, 0.6612903225806451, 0.30806451612903224, 0.343010752688172, 0.5919354838709677, 0.8741935483870967, 0.8118279569892473, 0.5510752688172043, 0.6236559139784946, 0.27419354838709675, 0.5919354838709677, 0.6967741935483871, 1.0, 0.2

  0%|                                                                                                                                                                           | 0/3 [00:00<?, ?it/s]



Average Metric: 2.75 / 3 (91.8%): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [03:51<00:00, 77.20s/it]

2025/09/30 20:23:30 INFO dspy.evaluate.evaluate: Average Metric: 2.7526881720430105 / 3 (91.8%)



🏃 View run eval_32 at: http://localhost:5005/#/experiments/1/runs/11d9ac43e976465eb190c56b493f5839
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/09/30 20:24:23 INFO dspy.teleprompt.gepa.gepa: Iteration 14: Proposed new text for generate_query.predict: **Instruction for Generating Search Queries for Multi-Hop Questions**  

1. **Analyze the Question Structure**:  
   - Break down the question into sequential reasoning steps (hops). Identify the **primary entities** (e.g., people, locations, organizations) and **relationships** (e.g., "record label of," "birthplace of," "mosaic in") that need resolution at each step.  

2. **Leverage Collected Information**:  
   - Use the provided `collected_info` to determine what is already known and what remains unresolved. For example:  
     - If the performer of a song is known, the next hop is to resolve their record label.  
     - If a person’s city of association is known but their birthplace is not, prioritize resolving the birthplace.  

3. **Generate Specific Search Queries**:  
   - Focus on **precise entities** and **domain-specific terms** to avoid irrelevant results. Exampl

🏃 View run eval_33 at: http://localhost:5005/#/experiments/1/runs/9a7f09239e6c4ac5a753953117dca037
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/09/30 20:28:33 INFO dspy.teleprompt.gepa.gepa: Iteration 14: New subsample score is not better, skipping
GEPA Optimization:  15%|███████████████████▏                                                                                                            | 264/1765 [3:01:54<15:59:59, 38.37s/rollouts]2025/09/30 20:28:33 INFO dspy.teleprompt.gepa.gepa: Iteration 15: Selected program 1 score: 0.5667127496159754


Average Metric: 2.83 / 3 (94.2%): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [02:47<00:00, 55.83s/it]

2025/09/30 20:31:20 INFO dspy.evaluate.evaluate: Average Metric: 2.825268817204301 / 3 (94.2%)



🏃 View run eval_34 at: http://localhost:5005/#/experiments/1/runs/c8ffeb43217545b3842ca5c54db89833
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/09/30 20:32:24 INFO dspy.teleprompt.gepa.gepa: Iteration 15: Proposed new text for generate_answer.predict: **Instructions for the Assistant:**  

You are to answer **multi-hop questions** by synthesizing information from provided documents. Follow these guidelines:  

1. **Answer Structure**:  
   - Provide a **concise, direct answer** to the question using **exact entities/dates/facts** from the documents.  
   - Include **citations** (document IDs in brackets, e.g., [13]) for **every factual claim** in your answer.  

2. **Reasoning Process**:  
   - Break the question into parts (e.g., identify key entities, relationships, or events).  
   - Map each part to the **specific document(s)** that provide the necessary information.  
   - Combine information logically to form the answer, ensuring **all supporting documents are cited**.  

3. **Citation Rules**:  
   - **Cite all documents** that directly support **any step of your reasoning**, even if they do not directly state the 

🏃 View run eval_35 at: http://localhost:5005/#/experiments/1/runs/26a29d158e7d4ead9f6d612ab3755bcc
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/09/30 20:32:40 INFO dspy.teleprompt.gepa.gepa: Iteration 15: New subsample score is not better, skipping
GEPA Optimization:  15%|███████████████████▌                                                                                                            | 270/1765 [3:06:02<16:05:17, 38.74s/rollouts]2025/09/30 20:32:40 INFO dspy.teleprompt.gepa.gepa: Iteration 16: Selected program 2 score: 0.5842165898617512


Average Metric: 2.32 / 3 (77.2%): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [02:43<00:00, 54.61s/it]

2025/09/30 20:35:24 INFO dspy.evaluate.evaluate: Average Metric: 2.3172043010752685 / 3 (77.2%)



🏃 View run eval_36 at: http://localhost:5005/#/experiments/1/runs/6fd9ae0468ac49e491260ea112b07170
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/09/30 20:36:15 INFO dspy.teleprompt.gepa.gepa: Iteration 16: Proposed new text for generate_query.predict: Given a multi-hop question and partial collected information, generate a **specific and precise search query** to resolve the next unresolved entity, relationship, or fact in the reasoning chain. Follow these guidelines:  

1. **Break down the question into sequential hops**:  
   - Identify the exact entities, relationships, or facts that must be resolved step-by-step (e.g., person → university → publication, or artist → work → parent).  
   - Prioritize resolving ambiguous or unknown entities first (e.g., disambiguate "Five Treasure Island" if it is a fictional or non-standard term).  

2. **Use specific, unambiguous terms**:  
   - Incorporate **exact entity names** (e.g., "John Kerry" instead of "Kerry") and **relationships** (e.g., "father of", "attended university", "publication of").  
   - Avoid overly broad queries (e.g., "Kerry attended university" → refine to "John

🏃 View run eval_37 at: http://localhost:5005/#/experiments/1/runs/7ce14ed488a34145893be672b572959b
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/09/30 20:52:07 INFO dspy.evaluate.evaluate: Average Metric: 17.27488479262673 / 30 (57.6%)


🏃 View run eval_38 at: http://localhost:5005/#/experiments/1/runs/9a1968ac0f77431e88dcde1fe4409e9a
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/09/30 20:52:08 INFO dspy.teleprompt.gepa.gepa: Iteration 16: Full valset score for new program: 0.5758294930875576
2025/09/30 20:52:08 INFO dspy.teleprompt.gepa.gepa: Iteration 16: Full train_val score for new program: 0.5758294930875576
2025/09/30 20:52:08 INFO dspy.teleprompt.gepa.gepa: Iteration 16: Individual valset scores for new program: [0.914516129032258, 0.26774193548387093, 0.9247311827956989, 0.2408602150537634, 0.5483870967741935, 0.39784946236559143, 0.7849462365591396, 0.7096774193548387, 0.46774193548387094, 0.27419354838709675, 0.4569892473118279, 0.26774193548387093, 0.4569892473118279, 0.2258064516129032, 1.0, 0.914516129032258, 0.7741935483870968, 0.6612903225806451, 0.30806451612903224, 0.30645161290322576, 0.5241935483870968, 0.914516129032258, 0.47580645161290325, 0.5510752688172043, 0.6236559139784946, 0.3913978494623656, 0.26774193548387093, 0.8119815668202764, 1.0, 0.8118279569892473]
2025/09/30 20:52:08 INFO dspy.teleprompt.gepa.gepa: Iteration 16: New va

Average Metric: 1.82 / 3 (60.7%): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:25<00:00,  8.56s/it]

2025/09/30 20:52:34 INFO dspy.evaluate.evaluate: Average Metric: 1.8204301075268816 / 3 (60.7%)



🏃 View run eval_39 at: http://localhost:5005/#/experiments/1/runs/de5e42bef134427a92f3d7a7ec0a6ba5
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/09/30 20:54:52 INFO dspy.teleprompt.gepa.gepa: Iteration 17: Proposed new text for extract_info.predict: markdown
### Revised Instruction for the Assistant

**Task Description:**  
You are to extract **precise and actionable key information** from provided documents that either directly answers a question or provides entities/relationships needed for the next retrieval step. Focus on **entities** (people, places, organizations), **relationships** (connections between entities), **dates** (including exact years and days), and **facts** (specific claims or events). If documents lack direct answers, extract **intermediate clues** that could guide further research. **Pay special attention to historical periods, their origins, and associated geographic locations, as these often form the crux of the question.**

---

**Critical Guidelines:**  
1. **Extract All Relevant Entities, Including Historical Periods and Their Origins**  
   - If a document mentions a historical period (e.g., "Re

🏃 View run eval_40 at: http://localhost:5005/#/experiments/1/runs/a41b0e69badd44088776b59fbf090d0b
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/09/30 21:14:26 INFO dspy.evaluate.evaluate: Average Metric: 17.097644649257557 / 30 (57.0%)


🏃 View run eval_41 at: http://localhost:5005/#/experiments/1/runs/6026e5eb07e6493b94e57a5b49a080cf
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/09/30 21:14:27 INFO dspy.teleprompt.gepa.gepa: Iteration 17: Full valset score for new program: 0.5699214883085851
2025/09/30 21:14:27 INFO dspy.teleprompt.gepa.gepa: Iteration 17: Full train_val score for new program: 0.5699214883085851
2025/09/30 21:14:27 INFO dspy.teleprompt.gepa.gepa: Iteration 17: Individual valset scores for new program: [1.0, 0.26774193548387093, 0.9247311827956989, 0.364516129032258, 0.39784946236559143, 0.39784946236559143, 0.7634408602150538, 0.4247311827956989, 0.4103942652329749, 0.1774193548387097, 0.31451612903225806, 0.3913978494623656, 0.5295698924731183, 0.9193548387096774, 1.0, 0.26774193548387093, 0.7741935483870968, 0.6612903225806451, 0.3709677419354838, 0.4569892473118279, 0.632258064516129, 0.7774193548387097, 0.7258064516129032, 0.5510752688172043, 0.5483870967741935, 0.3913978494623656, 0.5919354838709677, 0.8119815668202764, 1.0, 0.25268817204301075]
2025/09/30 21:14:27 INFO dspy.teleprompt.gepa.gepa: Iteration 17: New valset pareto front

Average Metric: 2.33 / 3 (77.5%): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [02:11<00:00, 43.78s/it]

2025/09/30 21:16:39 INFO dspy.evaluate.evaluate: Average Metric: 2.325268817204301 / 3 (77.5%)



🏃 View run eval_42 at: http://localhost:5005/#/experiments/1/runs/936c90d20e8d4a1d956d0abd161792b7
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/09/30 21:17:24 INFO dspy.teleprompt.gepa.gepa: Iteration 18: Proposed new text for extract_info.predict: You are to act as an information extraction assistant. Given a complex, multi-step question and a set of retrieved documents, your task is to:  

1. **Identify and extract precise entities, relationships, dates, and factual claims** from the documents that directly address the question or enable the next logical step in answering it.  
   - **Entities**: Names of people, organizations, locations, events, or concepts explicitly mentioned (e.g., "Oklahoma City," "Barack Obama," "Slavic migration").  
   - **Relationships**: Explicit connections between entities (e.g., "Angelical Tears formed in Oklahoma City," "North Carolina shifted to Democratic voters in 2008").  
   - **Dates**: Specific timeframes tied to events (e.g., "5th-6th centuries CE," "1976," "2008").  
   - **Facts**: Statistical claims (if present), political shifts, or historical events (e.g., "North Carolina beca

🏃 View run eval_43 at: http://localhost:5005/#/experiments/1/runs/ccc8fb1a75684761b2d2cfc1ba0daaa8
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/09/30 21:19:08 INFO dspy.teleprompt.gepa.gepa: Iteration 18: New subsample score is not better, skipping
GEPA Optimization:  20%|█████████████████████████▏                                                                                                      | 348/1765 [3:52:30<14:38:07, 37.18s/rollouts]2025/09/30 21:19:08 INFO dspy.teleprompt.gepa.gepa: Iteration 19: Selected program 2 score: 0.5842165898617512


Average Metric: 2.20 / 3 (73.5%): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:56<00:00, 18.86s/it]

2025/09/30 21:20:05 INFO dspy.evaluate.evaluate: Average Metric: 2.2043010752688175 / 3 (73.5%)



🏃 View run eval_44 at: http://localhost:5005/#/experiments/1/runs/bb418565dd7f4a5789e01a9c5a6fe51c
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/09/30 21:20:51 INFO dspy.teleprompt.gepa.gepa: Iteration 19: Proposed new text for decide_info_collect.predict: You are to determine whether the provided `all_information` contains sufficient details to answer the `question` accurately. Follow these guidelines:

1. **Exact Entity Matching**:
   - Identify the specific entity in the question (e.g., "Democratic Republic of Congo" vs. "Republic of the Congo"). Use the exact name or abbreviation provided in the question.
   - If the question references a location (e.g., "county," "city," "country"), ensure the answer matches the geographic scope (e.g., "Washington, D.C." is a federal district, not a county; "Washington County, Kansas" is a distinct entity).

2. **Political/Historical Precision**:
   - For questions about leadership or historical events, verify the timeline and jurisdiction (e.g., independence dates, roles like "first president" or "first leader"). Avoid conflating similar-sounding entities (e.g., DRC and Republic of C

🏃 View run eval_45 at: http://localhost:5005/#/experiments/1/runs/c0e415e36b1e4749842b15e1a08b21cf
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/09/30 21:22:34 INFO dspy.teleprompt.gepa.gepa: Iteration 19: New subsample score is not better, skipping
GEPA Optimization:  20%|█████████████████████████▋                                                                                                      | 354/1765 [3:55:55<14:26:24, 36.84s/rollouts]2025/09/30 21:22:34 INFO dspy.teleprompt.gepa.gepa: Iteration 20: Selected program 6 score: 0.5758294930875576


Average Metric: 1.41 / 2 (70.4%):  67%|██████████████████████████████████████████████████████████████████████████████████████                                           | 2/3 [00:49<00:25, 25.14s/it]

2025/09/30 21:30:35 INFO dspy.evaluate.evaluate: Average Metric: 2.327956989247312 / 3 (77.6%)


🏃 View run eval_47 at: http://localhost:5005/#/experiments/1/runs/de0adf05ce8e406f9776c2c530acd9e3
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/09/30 21:47:46 INFO dspy.evaluate.evaluate: Average Metric: 17.873195084485403 / 30 (59.6%)


🏃 View run eval_48 at: http://localhost:5005/#/experiments/1/runs/3dd5cc540c004879a23e12abc7b552b9
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/09/30 21:47:47 INFO dspy.teleprompt.gepa.gepa: Iteration 20: Full valset score for new program: 0.5957731694828469
2025/09/30 21:47:47 INFO dspy.teleprompt.gepa.gepa: Iteration 20: Full train_val score for new program: 0.5957731694828469
2025/09/30 21:47:47 INFO dspy.teleprompt.gepa.gepa: Iteration 20: Individual valset scores for new program: [1.0, 0.26774193548387093, 0.9247311827956989, 0.4774193548387097, 0.39784946236559143, 1.0, 0.7849462365591396, 0.7365591397849461, 0.44193548387096776, 0.364516129032258, 0.3548387096774194, 0.3913978494623656, 0.4569892473118279, 0.9462365591397849, 1.0, 0.7741935483870968, 0.7741935483870968, 0.6612903225806451, 0.5124423963133641, 0.4569892473118279, 0.6370967741935484, 0.3870967741935484, 0.343010752688172, 0.5510752688172043, 0.6236559139784946, 0.3913978494623656, 0.5618279569892473, 0.6967741935483871, 0.6021505376344086, 0.3548387096774194]
2025/09/30 21:47:47 INFO dspy.teleprompt.gepa.gepa: Iteration 20: New valset pareto front sc

Average Metric: 2.29 / 3 (76.3%): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [01:58<00:00, 39.37s/it]

2025/09/30 21:49:45 INFO dspy.evaluate.evaluate: Average Metric: 2.290322580645161 / 3 (76.3%)



🏃 View run eval_49 at: http://localhost:5005/#/experiments/1/runs/18e475d97f7943918af01bb43503fbb1
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/09/30 21:51:04 INFO dspy.teleprompt.gepa.gepa: Iteration 21: Proposed new text for decide_info_collect.predict: text
You are to determine whether the `all_information` provided contains sufficient details to answer the `question`. Your task is to:  

1. **Parse the Question**:  
   - Break the question into **specific entities, relationships, or data points** (e.g., "city where X was born" → identify "city," "X," and the relationship "birthplace").  
   - Map each component to **explicit mentions** in `all_information`. If a component is not directly addressed, flag it as missing.  

2. **Analyze `all_information`**:  
   - **Check for explicit matches**: Only use information **directly stated** in `all_information` (e.g., if the question asks about "Italy" but `all_information` only mentions "Europe," do not infer "Italy").  
   - **Resolve conflicts**: Prioritize the **most specific or unambiguous source** (e.g., if a location is mentioned in two documents, prefer the one with h

🏃 View run eval_50 at: http://localhost:5005/#/experiments/1/runs/34495c421b304ca79ea949f146c727d9
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/09/30 21:53:21 INFO dspy.teleprompt.gepa.gepa: Iteration 21: New subsample score is not better, skipping
GEPA Optimization:  22%|████████████████████████████▋                                                                                                   | 396/1765 [4:26:43<15:39:17, 41.17s/rollouts]2025/09/30 21:53:21 INFO dspy.teleprompt.gepa.gepa: Iteration 22: Selected program 6 score: 0.5758294930875576


Average Metric: 2.16 / 3 (72.0%): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [01:59<00:00, 39.96s/it]

2025/09/30 21:55:22 INFO dspy.evaluate.evaluate: Average Metric: 2.1586021505376345 / 3 (72.0%)



🏃 View run eval_51 at: http://localhost:5005/#/experiments/1/runs/89fc9e0ff9ac49ab955ff92aad73d84d
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/09/30 21:56:10 INFO dspy.teleprompt.gepa.gepa: Iteration 22: Proposed new text for decide_info_collect.predict: You are tasked with determining whether the provided `all_information` contains sufficient and directly relevant data to answer the given `question`. Your output must include two components:  

1. **Reasoning**:  
   - Identify the key entities, relationships, and temporal/geographic links in the question (e.g., "city where RSA Conference was held" → "San Francisco").  
   - Map these entities to the `all_information` to verify if **all required data points** (e.g., dates, locations, historical events) are explicitly stated.  
   - Check if **supporting documents** (e.g., [11], [13]) directly address the question’s components. Avoid citing irrelevant documents.  
   - If the information is insufficient, explain which critical details are missing (e.g., "Ladakh's integration into Qing China" in Example 2).  

2. **has_collected_enough_info**:  
   - Output `True` **only i

In [None]:
# Persist the optimized program under this run's experiment directory (EXP_DIR).
# save_program=True is forwarded to DSPy's save(); the target is passed as a plain string.
optimized_program.save(
    str(EXP_DIR.joinpath("optimized-program")),
    save_program=True,
)

### Examine Optimized Prompts

Let's look at how GEPA improved the prompts for each predictor:


In [None]:
# Dump the instruction text currently attached to each predictor of the
# optimized program, one banner-delimited section per predictor.
for predictor_name, predictor in optimized_program.named_predictors():
    # A single print with sep="\n" emits exactly one line per item.
    print(
        "=" * 60,
        f"Predictor: {predictor_name}",
        "=" * 60,
        "Optimized Instructions:",
        predictor.signature.instructions,
        "*" * 60,
        sep="\n",
    )


### Evaluate Optimized Program

Compare the performance before and after GEPA optimization:


In [None]:
# Score the optimized program on `test_ds` with the notebook's `metric`.
# The result is kept in `optimized_eval_result` for the comparison cell below.
# NOTE: "\n" (a real newline) is intentional here; the previous "\\n" printed
# a literal backslash followed by "n" instead of a blank line.
print("\n📊 Evaluating OPTIMIZED program...")
optimized_evaluate = dspy.Evaluate(
    devset=test_ds,
    metric=metric,
    num_threads=8,
    display_table=False,
    display_progress=True,
)
optimized_eval_result = optimized_evaluate(optimized_program)

In [None]:
# Side-by-side summary of the evaluation scores before and after optimization.
# Reads `.score` from the two evaluation results produced earlier in the notebook.
original_score = original_eval_result.score
optimized_score = optimized_eval_result.score

# "\n" (a real newline) replaces the previous "\\n", which printed a literal
# backslash-n in front of the banner.
print("\n" + "=" * 50)
print("🏆 PERFORMANCE COMPARISON")
print("=" * 50)
print(f"Original Program Score:  {original_score:.3f}")
print(f"Optimized Program Score: {optimized_score:.3f}")
print(f"Improvement:            {optimized_score - original_score:+.3f}")
if original_score:
    print(f"Relative Improvement:   {((optimized_score / original_score) - 1) * 100:+.1f}%")
else:
    # A zero baseline makes the ratio undefined; report that instead of
    # raising ZeroDivisionError at the end of a long run.
    print("Relative Improvement:   n/a (original score is 0)")

### GEPA Optimization Analysis

Analyze the detailed optimization results:


In [None]:
# Analyze GEPA optimization trajectory.
# Relies on `optimized_program.detailed_results`; when the attribute is absent
# the cell only prints a hint about track_stats and does nothing else.
if hasattr(optimized_program, 'detailed_results'):
    results = optimized_program.detailed_results
    val_scores = results.val_aggregate_scores
    max_shown = 10  # cap on how many candidate scores are listed individually

    print("🔍 GEPA Optimization Details:")
    print(f"- Total candidates explored: {len(results.candidates)}")
    print(f"- Best candidate index: {results.best_idx}")
    print(f"- Best validation score: {val_scores[results.best_idx]:.3f}")
    # NOTE(review): if discovery_eval_counts holds cumulative budgets per
    # candidate, summing them overcounts — confirm against the GEPA docs.
    print(f"- Discovery evaluations used: {sum(results.discovery_eval_counts)}")

    # Show score progression. "\n" (a real newline) replaces the previous
    # "\\n", which printed a literal backslash-n before the heading.
    print("\n📈 Score progression:")
    for i, score in enumerate(val_scores[:max_shown]):
        print(f"Candidate {i}: {score:.3f}")

    if len(val_scores) > max_shown:
        print(f"... and {len(val_scores) - max_shown} more candidates")
else:
    print("Detailed results not available (set track_stats=True in GEPA constructor)")


In [None]:
# Qualitative spot check: run both the original and the optimized program on
# the first item of `test_ds` and print their outputs and metric feedback.
example = test_ds[0]

print("🧪 Testing optimized program on example:")
for caption, field_name in (
    ("Question", "question"),
    ("Expected Answer", "answer"),
    ("Supporting Docs", "supporting_ids"),
):
    print(f"{caption}: {getattr(example, field_name)}")
print()

# Original program first, then the optimized one (same inputs for both).
pred = program(example.question, example.docs)
optimized_pred = optimized_program(example.question, example.docs)

print("📋 ORIGINAL vs OPTIMIZED Results:")
print("-" * 50)
for heading, prediction in (("ORIGINAL:", pred), ("OPTIMIZED:", optimized_pred)):
    print(heading)
    print(f"  Answer: {prediction.answer}")
    print(f"  Retrieved docs: {prediction.retrieved_doc_ids}")
    print(f"  Cited docs: {prediction.citations}")

print("🎯 Metric Comparison:")
# Both metric calls happen before any score is printed.
original_metric_result = metric_with_feedback(example, pred)
optimized_metric_result = metric_with_feedback(example, optimized_pred)
scored_runs = (
    ("Original", original_metric_result),
    ("Optimized", optimized_metric_result),
)
for position, (tag, outcome) in enumerate(scored_runs):
    if position:
        print()  # blank line between the two reports
    print(f"{tag} score: {outcome.score:.3f}")
    print(f"{tag} feedback: {outcome.feedback}")


**Open question:** Can we measure the quality of the optimized instructions by running them with a larger model and checking whether it answers the questions correctly?