In [1]:
from dotenv import load_dotenv
# Populate the process environment from a dotenv file (python-dotenv's default
# lookup) so the os.getenv(...) calls in later cells -- MLFLOW_TRACKING_URI and
# GEMINI_API_KEY -- can find their values.
load_dotenv()

True

In [2]:
import os
from pathlib import Path
import datetime
import random

import dspy

In [3]:
def setup_mlflow():
    """Configure MLflow tracking and DSPy autologging for this notebook.

    Sets the tracking URI from the ``MLFLOW_TRACKING_URI`` environment
    variable, selects the ``dspy-gepa-musique`` experiment, and enables
    ``mlflow.dspy`` autologging for compiles, evals, and traces produced
    during compile.

    Returns:
        Whatever ``mlflow.set_experiment`` returns for the selected experiment.
    """
    # MLflow is imported here rather than in the notebook's import cell.
    import mlflow
    import mlflow.dspy

    mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URI"))
    active_experiment = mlflow.set_experiment("dspy-gepa-musique")

    autolog_options = {
        "log_compiles": True,
        "log_evals": True,
        "log_traces_from_compile": True,
    }
    mlflow.dspy.autolog(**autolog_options)

    print(f"✅ MLflow tracking enabled at {os.getenv('MLFLOW_TRACKING_URI')}")
    return active_experiment

mlflow_exp = setup_mlflow()

✅ MLflow tracking enabled at http://localhost:5005


In [4]:
# Use the start time as the run id so each execution gets its own
# timestamped output directory.
_started_at = datetime.datetime.now()
EXP_ID = _started_at.strftime("%Y%m%d_%H%M%S")

# Create the run directory (and any missing parents) up front.
EXP_DIR = Path(f"../outputs/dspy-gepa-musique/{EXP_ID}")
EXP_DIR.mkdir(exist_ok=True, parents=True)
EXP_DIR

PosixPath('../outputs/dspy-gepa-musique/20251003_174554')

In [5]:
# mlflow is only imported inside setup_mlflow(), so bring it into scope for this cell.
import mlflow

# Alternative student model, kept for reference:
# lm = dspy.LM(
#     "openai/Qwen/Qwen2.5-7B-Instruct",
#     temperature=0.6,
#     max_tokens=8192,
#     api_key="local",
#     api_base="http://0.0.0.0:8000/v1",
#     cache=False,
# )

# Student LM: the default LM for every DSPy module call (set via dspy.configure).
lm = dspy.LM(
    "openai/Qwen/Qwen3-8B",
    temperature=0.6,
    max_tokens=16000,
    api_key="local",
    api_base="http://0.0.0.0:8001/v1",
    cache=False,
)
dspy.configure(lm=lm)

# Reflection LM: handed to the GEPA optimizer further down as `reflection_lm`.
reflection_lm = dspy.LM(
    "gemini/gemini-2.5-pro",
    api_key=os.getenv("GEMINI_API_KEY"),
    max_tokens=16384,
    thinking={"type": "enabled"},
    cache=False,
)
# Alternative local reflection model, kept for reference:
# reflection_lm = dspy.LM(
#     "openai/Qwen/Qwen3-32B",
#     temperature=0.6,
#     max_tokens=16384,
#     api_key="local",
#     api_base="http://0.0.0.0:8001/v1",
#     cache=False,
# )

# Record which models this run used as tags on the MLflow experiment.
# Writing into `mlflow_exp._tags` only mutates the local Experiment object and
# never reaches the tracking server, so go through the client API instead.
# Tag values are stored as strings, hence the model name rather than a dict.
_mlflow_client = mlflow.MlflowClient()
_mlflow_client.set_experiment_tag(mlflow_exp.experiment_id, "student_lm", lm.model)
_mlflow_client.set_experiment_tag(mlflow_exp.experiment_id, "reflection_lm", reflection_lm.model)

In [6]:
# Smoke test: one round-trip to the student LM before running anything expensive.
student_probe_messages = [{"role": "user", "content": "Hello"}]
lm(messages=student_probe_messages)

['Hello! How can I assist you today? 😊']

In [None]:
# Smoke test: one round-trip to the reflection LM to confirm the API key works.
reflection_probe_messages = [{"role": "user", "content": "What is largest prime number below 10?"}]
reflection_lm(messages=reflection_probe_messages)

In [7]:
from rlvr.dspy.mhqa.data import prepare_musique_dataset

# Train/val: shuffle the train split in place with a fixed seed, then cut it 80/20.
ds = prepare_musique_dataset(datasets_str="bdsaglam/musique-mini,answerable,train")
split_rng = random.Random(89)
split_rng.shuffle(ds)
train_size = int(len(ds) * 0.80)
train_ds = ds[:train_size]
val_ds = ds[train_size:]

# Test: a 50-example slice of the validation split, kept separate from train/val.
test_ds = prepare_musique_dataset(datasets_str="bdsaglam/musique-mini,answerable,validation[:50]")

Map: 100%|##########| 300/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map: 100%|##########| 50/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [8]:
from rlvr.dspy.mhqa.baleen import MultiHopQA

# Baseline (unoptimized) multi-hop QA program. This is the object that is later
# evaluated on the test set and handed to GEPA as the program to optimize.
# NOTE(review): "cot" presumably selects chain-of-thought predictors (the repr
# shows a `reasoning` output field) -- confirm against MultiHopQA.
program = MultiHopQA(prompt_technique="cot")
program

generate_query.predict = Predict(StringSignature(question, collected_info -> reasoning, search_query, top_n
    instructions='Given a multi-hop question and information collected so far, generate a search query\nto find the next piece of information needed to answer the question.\nFocus on entities, dates, or facts that need to be resolved step by step.'
    question = Field(annotation=str required=True json_schema_extra={'desc': 'The multi-hop question to answer', '__dspy_field_type': 'input', 'prefix': 'Question:'})
    collected_info = Field(annotation=str required=True json_schema_extra={'desc': 'Information collected from previous retrieval steps.', '__dspy_field_type': 'input', 'prefix': 'Collected Info:'})
    reasoning = Field(annotation=str required=True json_schema_extra={'prefix': "Reasoning: Let's think step by step in order to", 'desc': '${reasoning}', '__dspy_field_type': 'output'})
    search_query = Field(annotation=str required=True json_schema_extra={'desc': 'Search q

In [9]:
# Pick one training example to sanity-check the program and the metrics by hand.
example = train_ds[0]
example

Example({'question': "What county contains the work location of the president making father's day a national holiday?", 'answer': 'Washington County', 'answers': ['Washington County', 'washington county'], 'docs': [{'body': 'Thanksgiving, or Thanksgiving Day, is a public holiday celebrated on the fourth Thursday of November in the United States. It originated as a harvest festival. Thanksgiving has been celebrated nationally on and off since 1789, after Congress requested a proclamation by George Washington. It has been celebrated as a federal holiday every year since 1863, when, during the American Civil War, President Abraham Lincoln proclaimed a national day of "Thanksgiving and Praise to our beneficent Father who dwelleth in the Heavens,"to be celebrated on the last Thursday in November. Together with Christmas and the New Year, Thanksgiving is a part of the broader fall / winter holiday season in the U.S.', 'id': '0', 'is_supporting': False, 'text': '# Thanksgiving (United States)

In [10]:
# Run the baseline program on the example's question and its candidate documents.
pred = program(example.question, example.docs)
pred

Prediction(
    answer='Washington County, Kansas.',
    collected_info=[KeyInformation(info="President Richard Nixon signed the bill making Father's Day a permanent national holiday in 1972.", source_doc_id='14'), KeyInformation(info="President Richard Nixon signed the bill making Father's Day a permanent national holiday in 1972.", source_doc_id='8'), KeyInformation(info='Washington is the county seat of Washington County, Kansas.', source_doc_id='7')],
    retrieved_doc_ids=['14', '8', '7', '3', '5'],
    citations=['7'],
    n_turns=5
)

In [11]:
from rlvr.dspy.mhqa.metrics import metric

In [12]:
# Score the single prediction `pred` against `example` with the plain (score-only) metric.
metric(example, pred)

0.7389189189189189

In [13]:
# Baseline: score the unoptimized program on the test set before any optimization.
print("📊 Evaluating ORIGINAL program...")

baseline_eval_settings = dict(
    devset=test_ds,
    metric=metric,
    num_threads=16,
    display_table=False,
    display_progress=True,
)
original_evaluate = dspy.Evaluate(**baseline_eval_settings)
original_eval_result = original_evaluate(program)

📊 Evaluating ORIGINAL program...
Average Metric: 32.62 / 50 (65.2%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [01:37<00:00,  1.95s/it]

2025/10/03 17:48:41 INFO dspy.evaluate.evaluate: Average Metric: 32.61968893468664 / 50 (65.2%)



🏃 View run eval at: http://localhost:5005/#/experiments/1/runs/a899584fa46e497c90e69a89e7afbed5
🧪 View experiment at: http://localhost:5005/#/experiments/1


In [14]:
# raise ValueError()

## GEPA Optimization

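GEPA is a reflective prompt optimizer that uses textual feedback to improve performance. We'll use a feedback metric that returns, alongside the score, a written explanation for each evaluation aspect (answer F1, retrieval recall and precision, citation F1, hop efficiency), and use it to optimize our multi-hop QA program.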


In [15]:
from rlvr.dspy.mhqa.feedback_metrics import metric_with_feedback

In [16]:
# Try the feedback metric on the same example/prediction pair used above.
feedback_result = metric_with_feedback(example, pred)

score_line = f"Score: {feedback_result.score:.3f}"
feedback_line = f"Feedback: {feedback_result.feedback}"
print(score_line)
print(feedback_line)


Score: 0.739
Feedback: Overall performance breakdown:
- Answer F1 Score: Good partial match (F1: 0.80). Your answer contains relevant information but could be more complete or precise. Consider including more specific details from the retrieved documents.
- Retrieval Recall: Perfect retrieval! You found all 3 supporting documents: ['14', '5', '7']
- Retrieval Precision: Moderate precision (precision: 0.60). Only 3 out of 5 retrieved documents are relevant. Many irrelevant docs retrieved: ['3', '8']. Your search queries are too broad - focus on more specific terms and entities.
- Citations F1 Score: Good citations (F1: 0.50). Correct: ['7']. Missing: ['14', '5']. Be more precise about which documents actually support your answer.
- Hop Efficiency: Moderate efficiency (penalty: 0.64). You took 5 turns vs reference 3 hops. Your retrieval queries may be too specific or missing key entities. Try broader, more comprehensive initial searches.


In [17]:
from dspy import GEPA

# GEPA settings. auto="light" keeps the budget small for faster experimentation;
# use "heavy" for best performance. The reflection LM is the one configured earlier.
gepa_settings = dict(
    metric=metric_with_feedback,
    reflection_lm=reflection_lm,
    auto="light",
    num_threads=16,
    track_stats=True,
    use_merge=False,
)
optimizer = GEPA(**gepa_settings)

print("✅ GEPA optimizer configured")

✅ GEPA optimizer configured


In [None]:
# Optimize the baseline program with GEPA using the train/val split built earlier.
print("🚀 Starting GEPA optimization...")

compile_datasets = {"trainset": train_ds, "valset": val_ds}
optimized_program = optimizer.compile(program, **compile_datasets)

print("✅ GEPA optimization completed!")

2025/10/03 17:48:42 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'cc25b43752d045cc89b5dfdce780ed02', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current dspy workflow
2025/10/03 17:48:42 INFO dspy.teleprompt.gepa.gepa: Running GEPA for approx 2065 metric calls of the program. This amounts to 6.88 full evals on the train+val set.
2025/10/03 17:48:42 INFO dspy.teleprompt.gepa.gepa: Using 60 examples for tracking Pareto scores. You can consider using a smaller sample of the valset to allow GEPA to explore more diverse solutions within the same budget.


🚀 Starting GEPA optimization...


GEPA Optimization:   0%|                                                                                                                              | 0/2065 [00:00<?, ?rollouts/s]2025/10/03 17:51:24 INFO dspy.evaluate.evaluate: Average Metric: 36.27238962477294 / 60 (60.5%)


🏃 View run eval_0 at: http://localhost:5005/#/experiments/1/runs/c98283f27394440dba15b7184f9cdf9c
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/03 17:51:24 INFO dspy.teleprompt.gepa.gepa: Iteration 0: Base program full valset score: 0.6045398270795489
GEPA Optimization:   3%|███▎                                                                                                               | 60/2065 [02:42<1:30:35,  2.71s/rollouts]2025/10/03 17:51:24 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Selected program 0 score: 0.6045398270795489


Average Metric: 1.56 / 3 (51.9%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:23<00:00,  8.00s/it]

2025/10/03 17:51:49 INFO dspy.evaluate.evaluate: Average Metric: 1.5559353471118178 / 3 (51.9%)



🏃 View run eval_1 at: http://localhost:5005/#/experiments/1/runs/9d5fe4ef39554e8ebeaf37c275657891
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/03 17:52:18 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Proposed new text for generate_query.predict: You are an expert at decomposing complex, multi-hop questions and formulating precise search queries to answer them step-by-step.

Your task is to generate the next search query to find a missing piece of information needed to answer a given question, based on the information collected so far.

Follow these steps:
1.  **Decompose the Question:** First, carefully analyze the multi-hop question to understand the chain of entities and relationships. Break down the full question into a logical sequence of smaller, answerable sub-questions.
2.  **Formulate a Reasoning Plan:** In the `reasoning` field, articulate your step-by-step plan. Clearly state what information is needed first and how it connects to the subsequent steps. If information has already been collected, use it to determine the next logical sub-question to resolve.
3.  **Craft a Precise and Unambiguous Search Query:**

🏃 View run eval_2 at: http://localhost:5005/#/experiments/1/runs/68f0c0ecbb954f84b1789925ecf499dc
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/03 17:55:41 INFO dspy.evaluate.evaluate: Average Metric: 34.48856141540009 / 60 (57.5%)


🏃 View run eval_3 at: http://localhost:5005/#/experiments/1/runs/393e950d615b4e759402bb97e691d785
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/03 17:55:41 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Full valset score for new program: 0.5748093569233345
2025/10/03 17:55:41 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Full train_val score for new program: 0.5748093569233345
2025/10/03 17:55:41 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Individual valset scores for new program: [0.6505945945945946, 0.5632016632016632, 0.5822822822822823, 0.5837837837837838, 0.41366366366366364, 0.4905692926969522, 0.7401719901719902, 0.6801801801801801, 0.39472178060413354, 0.7788697788697788, 0.45063063063063064, 0.5942134442134442, 0.6297297297297297, 0.2581081081081081, 0.16216216216216214, 0.6027027027027027, 0.9405405405405405, 0.3256756756756757, 0.9369369369369369, 0.8472385428907168, 0.7972972972972973, 0.7683397683397684, 0.5554054054054054, 0.7198198198198197, 0.6890159390159389, 0.6216216216216215, 0.6426073131955484, 0.617117117117117, 0.6342342342342342, 0.7033616473616474, 0.5875315315315315, 0.8310810810810811, 0.447297

Average Metric: 1.92 / 3 (64.0%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [01:01<00:00, 20.44s/it]

2025/10/03 17:56:43 INFO dspy.evaluate.evaluate: Average Metric: 1.9196138996138994 / 3 (64.0%)



🏃 View run eval_4 at: http://localhost:5005/#/experiments/1/runs/c5858faca4824d15991805006b04a6a4
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/03 17:57:21 INFO dspy.teleprompt.gepa.gepa: Iteration 2: Proposed new text for extract_info.predict: You are a specialized information extraction component in a larger, multi-step question-answering system. Your task is to analyze a complex question and a single retrieved document, and then extract the key pieces of information from that document that are directly relevant to answering the question.

Your primary goal is to find the "next link in the chain" of reasoning. The information you extract will be used to perform another search or to assemble the final answer.

### Instructions:

1.  **Analyze the Question**: First, break down the question to understand the core entities (people, places, things), relationships (e.g., "director of", "spouse of", "born in"), and specific facts (like dates) it is asking about.

2.  **Scan the Document for Connections**: Read the provided document and identify any information that directly matches or relates to the components you identifie

🏃 View run eval_5 at: http://localhost:5005/#/experiments/1/runs/2b693c4f92a04151a1d10cf119e83f24
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/03 17:58:17 INFO dspy.teleprompt.gepa.gepa: Iteration 2: New subsample score is not better, skipping
GEPA Optimization:   6%|███████▎                                                                                                          | 132/2065 [09:34<2:43:49,  5.09s/rollouts]2025/10/03 17:58:17 INFO dspy.teleprompt.gepa.gepa: Iteration 3: Selected program 0 score: 0.6045398270795489


Average Metric: 2.12 / 3 (70.7%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:36<00:00, 12.07s/it]

2025/10/03 17:58:53 INFO dspy.evaluate.evaluate: Average Metric: 2.12040887040887 / 3 (70.7%)



🏃 View run eval_6 at: http://localhost:5005/#/experiments/1/runs/40abc2f2dccd4c1181682bf94d69af7b
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/03 17:59:33 INFO dspy.teleprompt.gepa.gepa: Iteration 3: Proposed new text for extract_info.predict: You are an expert information extractor in a multi-step question-answering system. Your task is to meticulously analyze a given question and a set of retrieved documents to extract key pieces of information. These extracted facts are crucial, as they will either directly answer a part of the question or serve as the foundation for a subsequent retrieval or reasoning step.

### Core Principles:

1.  **Deconstruct the Question:** Before reading the documents, break down the user's question into its fundamental components. Identify all the entities (people, places, organizations), the relationships between them, and the specific data points being requested (e.g., a date, a location, a name, a numerical value).

2.  **Extract Atomic and Precise Facts:** Your primary goal is to extract information as specific, concise data points, not as full sentences.
    *   **Good Example:** If t

🏃 View run eval_7 at: http://localhost:5005/#/experiments/1/runs/bf1d701cd1a4405fa1280ae0f13dce16
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/03 18:00:23 INFO dspy.teleprompt.gepa.gepa: Iteration 3: New subsample score is not better, skipping
GEPA Optimization:   7%|███████▌                                                                                                          | 138/2065 [11:40<3:31:54,  6.60s/rollouts]2025/10/03 18:00:23 INFO dspy.teleprompt.gepa.gepa: Iteration 4: Selected program 1 score: 0.5748093569233345


Average Metric: 1.48 / 3 (49.4%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:36<00:00, 12.13s/it]

2025/10/03 18:00:59 INFO dspy.evaluate.evaluate: Average Metric: 1.4833818433818433 / 3 (49.4%)



🏃 View run eval_8 at: http://localhost:5005/#/experiments/1/runs/bb79339c281041edaedc4d1c74e4d302
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/03 18:01:26 INFO dspy.teleprompt.gepa.gepa: Iteration 4: Proposed new text for decide_info_collect.predict: You are an expert component in a multi-step question-answering system. Your specific role is to act as a gatekeeper. You must determine if the information collected so far is sufficient to answer the user's question completely and accurately.

**Task:**
Given a `question` and a body of text called `all_information`, you must produce a boolean field `has_collected_enough_info`.

**Instructions:**

1.  **Deconstruct the Question:** Carefully analyze the `question` to identify every single piece of information required to formulate a final answer. This often involves breaking the question down into a chain of entities and relationships.
    *   For example, for the question "What county neighbors the county that contains the birth city of Anthony Joseph Zerilli?", you must identify the following required facts:
        1.  The birth city of Anthony Joseph Zerilli.
        2.

🏃 View run eval_9 at: http://localhost:5005/#/experiments/1/runs/ea8f8d1e70464a95ac81dffbaa80fa82
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/03 18:02:03 INFO dspy.teleprompt.gepa.gepa: Iteration 4: New subsample score is not better, skipping
GEPA Optimization:   7%|███████▉                                                                                                          | 144/2065 [13:21<4:09:59,  7.81s/rollouts]2025/10/03 18:02:03 INFO dspy.teleprompt.gepa.gepa: Iteration 5: Selected program 1 score: 0.5748093569233345


Average Metric: 1.72 / 3 (57.3%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:43<00:00, 14.50s/it]

2025/10/03 18:02:47 INFO dspy.evaluate.evaluate: Average Metric: 1.7177477477477476 / 3 (57.3%)



🏃 View run eval_10 at: http://localhost:5005/#/experiments/1/runs/1aae5f1cd65b49fdb09284fecb9c5f4a
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/03 18:03:25 INFO dspy.teleprompt.gepa.gepa: Iteration 5: Proposed new text for generate_answer.predict: You are an expert question-answering assistant. Your task is to answer a multi-hop question based *only* on a provided collection of text snippets (documents). You must provide a concise answer and cite all supporting documents.

Follow these steps carefully:

1.  **Deconstruct the Question:** First, analyze the multi-hop question to understand the chain of information required. Break it down into a series of smaller, simpler questions or steps. For example, for the question "When was the brand opened in the city where the Central Naval Museum was located?", the steps are:
    *   Step A: Find the city where the Central Naval Museum is located.
    *   Step B: Find when the brand opened in the city identified in Step A.

2.  **Chain the Information:** Go through the provided documents to find the answers for each step of your deconstructed question. You will often need to con

🏃 View run eval_11 at: http://localhost:5005/#/experiments/1/runs/52ff29e783ba433999cf450967421100
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/03 18:07:08 INFO dspy.evaluate.evaluate: Average Metric: 35.239508373836955 / 60 (58.7%)


🏃 View run eval_12 at: http://localhost:5005/#/experiments/1/runs/a37bb08cb01c47bfa69260b647e993a6
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/03 18:07:09 INFO dspy.teleprompt.gepa.gepa: Iteration 5: Full valset score for new program: 0.5873251395639492
2025/10/03 18:07:09 INFO dspy.teleprompt.gepa.gepa: Iteration 5: Full train_val score for new program: 0.5873251395639492
2025/10/03 18:07:09 INFO dspy.teleprompt.gepa.gepa: Iteration 5: Individual valset scores for new program: [0.6505945945945946, 0.6108108108108107, 0.3256756756756757, 0.6432432432432432, 0.49699699699699695, 0.5112003895787679, 0.7041359541359541, 0.7747747747747747, 0.6606177606177606, 0.7897897897897898, 0.47297297297297297, 0.6432432432432432, 0.545045045045045, 0.5126126126126126, 0.4135135135135135, 0.4945945945945946, 0.5614250614250613, 0.3256756756756757, 1.0, 0.8472385428907168, 0.7972972972972973, 0.49567567567567566, 0.634029484029484, 0.5524324324324325, 0.5596161378770074, 0.5180180180180181, 0.6594594594594594, 0.3922136422136422, 0.6418918918918919, 0.33978378378378377, 0.4614054054054054, 0.8310810810810811, 0.6342342342342342, 0.73

Average Metric: 1.89 / 3 (63.0%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:50<00:00, 16.99s/it]

2025/10/03 18:08:00 INFO dspy.evaluate.evaluate: Average Metric: 1.8911911911911912 / 3 (63.0%)



🏃 View run eval_13 at: http://localhost:5005/#/experiments/1/runs/cdbaccec2eda4af99a024f4d41c1741f
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/03 18:08:32 INFO dspy.teleprompt.gepa.gepa: Iteration 6: Proposed new text for decide_info_collect.predict: Your task is to act as a verifier. You will be given a `question` and a collection of text snippets called `all_information`. Your goal is to determine if the `all_information` contains every single piece of data required to construct a definitive and complete answer to the `question`.

You must produce one output field:
- `has_collected_enough_info`: A boolean (`True` or `False`).

To accomplish this, follow this process:

1.  **Deconstruct the Question:** Break down the `question` into a chain of smaller, essential facts that must be known to arrive at the final answer. Identify all the entities (people, places, things) and the relationships between them.

2.  **Verify Each Fact:** For each essential fact identified in step 1, meticulously check if it is explicitly stated in the `all_information` snippets.

3.  **Assess for Completeness:**
    *   If **every single fact

🏃 View run eval_14 at: http://localhost:5005/#/experiments/1/runs/02499d324be74c93a8c6f70293244d71
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/03 18:09:23 INFO dspy.teleprompt.gepa.gepa: Iteration 6: New subsample score is not better, skipping
GEPA Optimization:  10%|███████████▉                                                                                                      | 216/2065 [20:41<3:36:56,  7.04s/rollouts]2025/10/03 18:09:23 INFO dspy.teleprompt.gepa.gepa: Iteration 7: Selected program 1 score: 0.5748093569233345


Average Metric: 1.96 / 3 (65.4%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [01:13<00:00, 24.55s/it]

2025/10/03 18:10:37 INFO dspy.evaluate.evaluate: Average Metric: 1.9614718614718614 / 3 (65.4%)



🏃 View run eval_15 at: http://localhost:5005/#/experiments/1/runs/f859d1e93a84483d8ae3109a99a81225
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/03 18:11:11 INFO dspy.teleprompt.gepa.gepa: Iteration 7: Proposed new text for generate_query.predict: You are an expert search query strategist for a multi-hop question-answering system. Your task is to generate the single *next* search query required to continue the process of answering a complex question, based on the information gathered so far.

Your primary goal is to reach the final answer in the fewest steps possible. This requires a balance between creating highly precise queries and, when possible, combining multiple logical steps into a single, efficient query.

Follow these steps to generate your response:

1.  **Decompose the Question:** First, analyze the full multi-hop question. Identify the chain of entities and the relationships between them. Mentally break down the question into a sequence of smaller, dependent sub-questions. For example, for the question "What is the population of the city where the director of film X was born?", the sub-questions are:
    * 

🏃 View run eval_16 at: http://localhost:5005/#/experiments/1/runs/e7d142ae97254394b720e5ea16eb8141
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/03 18:14:46 INFO dspy.evaluate.evaluate: Average Metric: 35.423241362517004 / 60 (59.0%)


🏃 View run eval_17 at: http://localhost:5005/#/experiments/1/runs/769d6063fdc64a46a51cf301dae0d6e0
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/03 18:14:47 INFO dspy.teleprompt.gepa.gepa: Iteration 7: Full valset score for new program: 0.59038735604195
2025/10/03 18:14:47 INFO dspy.teleprompt.gepa.gepa: Iteration 7: Full train_val score for new program: 0.59038735604195
2025/10/03 18:14:47 INFO dspy.teleprompt.gepa.gepa: Iteration 7: Individual valset scores for new program: [0.7297297297297297, 0.6109530583214794, 0.5822822822822823, 0.4891891891891892, 0.5713790713790714, 0.5939990838295923, 0.7401719901719902, 0.5900900900900901, 0.5236486486486486, 0.7837837837837838, 0.47297297297297297, 0.6081081081081081, 0.7648648648648648, 0.3414414414414414, 0.1297297297297297, 0.5643500643500644, 0.7648648648648648, 0.3256756756756757, 0.9369369369369369, 0.7888601645123384, 0.7972972972972973, 0.6216216216216215, 0.7147340889276372, 0.7297297297297297, 0.6599099099099098, 0.4234234234234234, 0.7189189189189189, 0.43378378378378374, 0.4013513513513513, 0.33978378378378377, 0.5875315315315315, 0.8108108108108107, 0.3864864864

Average Metric: 1.62 / 3 (54.2%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:44<00:00, 14.87s/it]

2025/10/03 18:15:32 INFO dspy.evaluate.evaluate: Average Metric: 1.624918918918919 / 3 (54.2%)



🏃 View run eval_18 at: http://localhost:5005/#/experiments/1/runs/477a27db27f44356b60f21f1cc5a02ac
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/03 18:16:08 INFO dspy.teleprompt.gepa.gepa: Iteration 8: Proposed new text for extract_info.predict: You are an expert information extractor. Your task is to meticulously analyze a given question and a set of retrieved documents, and then extract all key pieces of information from the documents that are relevant to answering the question.

**Core Principles:**

1.  **Strictly Grounded in Documents:** Every piece of information you extract **must** come directly and explicitly from the provided documents. Do not use any external knowledge, make assumptions, or infer information that is not stated in the text. For example, if a document mentions "Riverbank State Park is 21m over the Hudson River", you can extract that. You cannot add external facts like "The Hudson is the largest river in New York".

2.  **Extract All Relevant Clues, Not Just the Final Answer:** The documents may not contain the complete answer. Your goal is to find all the relevant puzzle pieces that help constr

🏃 View run eval_19 at: http://localhost:5005/#/experiments/1/runs/741c9f1784244d7d8e02ad5f3cc82d2c
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/03 18:20:13 INFO dspy.evaluate.evaluate: Average Metric: 28.162047009303222 / 60 (46.9%)


🏃 View run eval_20 at: http://localhost:5005/#/experiments/1/runs/96415ca649184c2291faec4edfb24c2d
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/03 18:20:13 INFO dspy.teleprompt.gepa.gepa: Iteration 8: Full valset score for new program: 0.46936745015505377
2025/10/03 18:20:13 INFO dspy.teleprompt.gepa.gepa: Iteration 8: Full train_val score for new program: 0.46936745015505377
2025/10/03 18:20:13 INFO dspy.teleprompt.gepa.gepa: Iteration 8: Individual valset scores for new program: [0.6505945945945946, 0.6047297297297296, 0.3256756756756757, 0.3864864864864864, 0.49930699930699923, 0.3031531531531531, 0.5604422604422604, 0.5317117117117117, 0.32, 0.7204914004914004, 0.40108108108108104, 0.3864864864864864, 0.7648648648648648, 0.3211711711711711, 0.3414414414414414, 0.32477477477477473, 0.4945945945945946, 0.3256756756756757, 0.38482882882882885, 0.32, 0.7972972972972973, 0.40108108108108104, 0.6558558558558558, 0.5524324324324325, 0.3256756756756757, 0.33978378378378377, 0.5524324324324325, 0.2905405405405405, 0.5837837837837838, 0.33978378378378377, 0.41636036036036034, 0.33978378378378377, 0.4135135135135135, 0.401081

Average Metric: 1.54 / 3 (51.2%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:44<00:00, 14.94s/it]

2025/10/03 18:20:59 INFO dspy.evaluate.evaluate: Average Metric: 1.5357013139621833 / 3 (51.2%)



🏃 View run eval_21 at: http://localhost:5005/#/experiments/1/runs/eb4f6a07c85d45d7a8b5c6085e111b48
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/03 18:21:28 INFO dspy.teleprompt.gepa.gepa: Iteration 9: Proposed new text for decide_info_collect.predict: You are an expert AI assistant that acts as a gatekeeper in a question-answering system. Your task is to determine if the information collected so far is sufficient to fully and accurately answer a given question.

**Your Goal:**
Given a `question` and a body of text called `all_information`, you must produce a boolean field `has_collected_enough_info`.

**Instructions:**

1.  **Deconstruct the Question:** First, carefully analyze the `question` and break it down into all its essential components and sub-questions. Identify every piece of information that is required to construct a complete answer. This includes entities, their attributes, and the relationships between them.

2.  **Verify Information Availability:** For each component identified in step 1, you must verify if a corresponding piece of information exists within the provided `all_information`. Your evaluation

🏃 View run eval_22 at: http://localhost:5005/#/experiments/1/runs/392e4144ac1b4e6ea7348e1cb354c99b
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/03 18:25:48 INFO dspy.evaluate.evaluate: Average Metric: 34.93723275441926 / 60 (58.2%)


🏃 View run eval_23 at: http://localhost:5005/#/experiments/1/runs/df4c4ad84bd34468b24d550eadf9b892
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/03 18:25:48 INFO dspy.teleprompt.gepa.gepa: Iteration 9: Full valset score for new program: 0.5822872125736542
2025/10/03 18:25:48 INFO dspy.teleprompt.gepa.gepa: Iteration 9: Full train_val score for new program: 0.5822872125736542
2025/10/03 18:25:48 INFO dspy.teleprompt.gepa.gepa: Iteration 9: Individual valset scores for new program: [0.6055495495495495, 0.62, 0.5822822822822823, 0.5567567567567567, 0.4339339339339339, 0.3237900691389063, 0.6536855036855036, 0.5900900900900901, 0.6627627627627628, 0.7837837837837838, 0.47297297297297297, 0.6052199258081611, 0.7648648648648648, 0.2581081081081081, 0.3414414414414414, 0.545123384253819, 0.7047395221308264, 0.3256756756756757, 0.9369369369369369, 0.8472385428907168, 0.7972972972972973, 0.49567567567567566, 0.594015444015444, 0.6605405405405405, 0.606133056133056, 0.5180180180180181, 0.7485714285714284, 0.43378378378378374, 0.7297297297297297, 0.75006435006435, 0.2722162162162162, 0.8310810810810811, 0.4472972972972973, 0.66684

Average Metric: 1.21 / 3 (40.5%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:35<00:00, 11.74s/it]

2025/10/03 18:26:24 INFO dspy.evaluate.evaluate: Average Metric: 1.2138378378378378 / 3 (40.5%)



🏃 View run eval_24 at: http://localhost:5005/#/experiments/1/runs/d531566c175c4e6ab8130d6555eb23e1
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/03 18:26:50 INFO dspy.teleprompt.gepa.gepa: Iteration 10: Proposed new text for decide_info_collect.predict: Your task is to act as a gatekeeper in a multi-step question-answering process. Given a `question` and the `all_information` collected so far, you must determine if the information is sufficient to formulate a complete and final answer. You will produce a boolean field `has_collected_enough_info`.

**Instructions:**

1.  **Deconstruct the Question:** First, break down the `question` into the essential pieces of information required to answer it. Identify all the entities and the relationships between them that need to be resolved.
    *   For example, for the question "What is the equivalent of organization X in the country that helped establish entity Y?", you need to know:
        1.  The country that helped establish entity Y.
        2.  The equivalent of organization X in that specific country.

2.  **Evaluate the Collected Information:** Carefully examine the text 

🏃 View run eval_25 at: http://localhost:5005/#/experiments/1/runs/6e467f4845184a76835959f2651facb9
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/03 18:31:11 INFO dspy.evaluate.evaluate: Average Metric: 28.574612298429198 / 60 (47.6%)


🏃 View run eval_26 at: http://localhost:5005/#/experiments/1/runs/ea4757029beb485b850ce67c64b2ef34
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/03 18:31:12 INFO dspy.teleprompt.gepa.gepa: Iteration 10: Full valset score for new program: 0.4762435383071534
2025/10/03 18:31:12 INFO dspy.teleprompt.gepa.gepa: Iteration 10: Full train_val score for new program: 0.4762435383071534
2025/10/03 18:31:12 INFO dspy.teleprompt.gepa.gepa: Iteration 10: Individual valset scores for new program: [0.6505945945945946, 0.49567567567567566, 0.3256756756756757, 0.3864864864864864, 0.4387387387387387, 0.4340403694834074, 0.6212530712530713, 0.6235675675675675, 0.4281081081081081, 0.7747747747747747, 0.49567567567567566, 0.6565835065835066, 0.545045045045045, 0.4621621621621621, 0.1297297297297297, 0.5252252252252252, 0.5351351351351351, 0.3256756756756757, 0.3172612612612613, 0.32, 0.4659099099099099, 0.32, 0.7088803088803088, 0.5524324324324325, 0.3256756756756757, 0.33978378378378377, 0.7416216216216216, 0.4755469755469755, 0.4621621621621621, 0.33978378378378377, 0.4189189189189189, 0.33978378378378377, 0.3864864864864864, 0.4010810810

Average Metric: 2.00 / 3 (66.6%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:29<00:00,  9.95s/it]

2025/10/03 18:31:42 INFO dspy.evaluate.evaluate: Average Metric: 1.9966224406224404 / 3 (66.6%)



🏃 View run eval_27 at: http://localhost:5005/#/experiments/1/runs/2c9cad8099bd48babc94fd7be5fd7d9c
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/03 18:32:07 INFO dspy.teleprompt.gepa.gepa: Iteration 11: Proposed new text for generate_answer.predict: You are an expert question-answering assistant. Your task is to answer a multi-hop question based on a collection of provided information snippets. You must provide a concise answer and a list of citations supporting it.

Follow these instructions carefully:

### Answer Generation Rules

1.  **Be Concise and Direct:** The answer must directly address what the question asks for and be as brief as possible. Do not use full sentences or rephrase the question in your answer.
    *   **Example:** For a question like "Who is the president of Anthony Njokuani's country?", the correct answer is "Goodluck Jonathan", not "The president of Anthony Njokuani's country, Nigeria, is Goodluck Jonathan."

2.  **Use Exact Information:** Extract the specific entities, dates, and facts directly from the provided documents. Do not paraphrase, generalize, or alter the information.
    *   **Examp

🏃 View run eval_28 at: http://localhost:5005/#/experiments/1/runs/073e9d7887b041a28ddc2dfcbc931fa2
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/03 18:34:46 INFO dspy.evaluate.evaluate: Average Metric: 40.45789360789361 / 60 (67.4%)


🏃 View run eval_29 at: http://localhost:5005/#/experiments/1/runs/ec4ec686477b435096207860d6e1ec82
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/03 18:34:47 INFO dspy.teleprompt.gepa.gepa: Iteration 11: New program is on the linear pareto front
2025/10/03 18:34:47 INFO dspy.teleprompt.gepa.gepa: Iteration 11: Full valset score for new program: 0.6742982267982267
2025/10/03 18:34:47 INFO dspy.teleprompt.gepa.gepa: Iteration 11: Full train_val score for new program: 0.6742982267982267
2025/10/03 18:34:47 INFO dspy.teleprompt.gepa.gepa: Iteration 11: Individual valset scores for new program: [0.7297297297297297, 0.658108108108108, 0.5972972972972973, 0.49999999999999994, 0.8243243243243242, 0.45945945945945943, 0.7522522522522522, 0.7477477477477477, 0.7189189189189189, 0.7702702702702703, 0.6108108108108107, 0.5396396396396396, 0.6418918918918919, 0.7084942084942084, 0.8783783783783783, 0.5900900900900902, 0.8256756756756757, 0.704054054054054, 0.9549549549549549, 0.9662162162162162, 0.9189189189189189, 0.6756756756756757, 0.9405405405405405, 0.658108108108108, 0.9121621621621622, 0.7162162162162161, 0.4821621621621621, 0

Average Metric: 1.35 / 3 (45.0%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:36<00:00, 12.24s/it]

2025/10/03 18:35:24 INFO dspy.evaluate.evaluate: Average Metric: 1.3492792792792792 / 3 (45.0%)



🏃 View run eval_30 at: http://localhost:5005/#/experiments/1/runs/843dc6996aea4ed0bc800039fe64c2e8
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/03 18:35:50 INFO dspy.teleprompt.gepa.gepa: Iteration 12: Proposed new text for generate_query.predict: You are an expert search query generator for a multi-hop question-answering system. Your task is to analyze a complex question and the information gathered so far, then generate the single best search query to find the next piece of information required to answer the question.

### Instructions:

1.  **Deconstruct the Question:** First, carefully analyze the user's `question`. Break it down into a logical sequence of smaller, answerable sub-questions. This is your reasoning chain. Your `reasoning` output should clearly articulate this plan.

2.  **Identify the Next Necessary Information:** Based on your reasoning chain and the `collected_info`, determine the immediate next piece of information you need.
    *   If `collected_info` is empty, your query should target the first sub-question in your chain.
    *   If `collected_info` contains information, use it to formulate a qu

🏃 View run eval_31 at: http://localhost:5005/#/experiments/1/runs/173ae7572fda4d96bb2ec3feda9b810e
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/03 18:38:56 INFO dspy.evaluate.evaluate: Average Metric: 35.12468013264433 / 60 (58.5%)


🏃 View run eval_32 at: http://localhost:5005/#/experiments/1/runs/b6a23eecd8db498ebe6be1d0d7a2a081
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/03 18:38:56 INFO dspy.teleprompt.gepa.gepa: Iteration 12: Full valset score for new program: 0.585411335544072
2025/10/03 18:38:56 INFO dspy.teleprompt.gepa.gepa: Iteration 12: Full train_val score for new program: 0.585411335544072
2025/10/03 18:38:56 INFO dspy.teleprompt.gepa.gepa: Iteration 12: Individual valset scores for new program: [0.5576576576576576, 0.8513513513513513, 0.5755255255255255, 0.45630630630630625, 0.46171171171171166, 0.42411234764175937, 0.7041359541359541, 0.6126126126126126, 0.6429313929313929, 0.7387387387387387, 0.509009009009009, 0.5972972972972973, 0.9121621621621622, 0.5785714285714285, 0.28996138996139, 0.5246246246246246, 0.30540540540540534, 0.5898746572659616, 0.9369369369369369, 0.8472385428907168, 0.7972972972972973, 0.658108108108108, 0.5756756756756756, 0.7175675675675675, 0.5200970200970201, 0.5180180180180181, 0.777027027027027, 0.6061776061776062, 0.7027027027027027, 0.777992277992278, 0.5424864864864865, 0.786036036036036, 0.30540540540

Average Metric: 1.52 / 3 (50.8%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [01:09<00:00, 23.26s/it]

2025/10/03 18:40:06 INFO dspy.evaluate.evaluate: Average Metric: 1.5238738738738737 / 3 (50.8%)



🏃 View run eval_33 at: http://localhost:5005/#/experiments/1/runs/d84fc9a5e7e148959f046d4b62e2c1a8
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/03 18:40:42 INFO dspy.teleprompt.gepa.gepa: Iteration 13: Proposed new text for generate_query.predict: You are an expert at multi-hop question answering. Your task is to act as the "planner" and "query generator" component of this system.

Given a complex, multi-hop question and a list of information snippets already collected, your goal is to generate the next search query to find the subsequent piece of information required to answer the question.

Follow these steps carefully:

1.  **Deconstruct the Main Question:** First, analyze the user's `question`. Break it down into a chain of smaller, dependent sub-questions. Identify the entities, relationships, and facts that need to be found in sequence.

2.  **Assess Collected Information:** Review the `collected_info`. This represents your current knowledge base. Identify which parts of the question you can already answer and, most importantly, what the very next logical piece of missing information is.

3.  **Formulate Your Rea

🏃 View run eval_34 at: http://localhost:5005/#/experiments/1/runs/3adeb3383bc34eed9ad80f33339de001
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/03 18:44:35 INFO dspy.evaluate.evaluate: Average Metric: 38.17518404118404 / 60 (63.6%)


🏃 View run eval_35 at: http://localhost:5005/#/experiments/1/runs/8030604237284754993179212e7923cc
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/03 18:44:35 INFO dspy.teleprompt.gepa.gepa: Iteration 13: Full valset score for new program: 0.6362530673530673
2025/10/03 18:44:35 INFO dspy.teleprompt.gepa.gepa: Iteration 13: Full train_val score for new program: 0.6362530673530673
2025/10/03 18:44:35 INFO dspy.teleprompt.gepa.gepa: Iteration 13: Individual valset scores for new program: [0.4459459459459459, 0.658108108108108, 0.3189189189189189, 0.5013513513513513, 0.39999999999999997, 0.5914414414414414, 0.7702702702702703, 0.874054054054054, 0.7608108108108108, 0.7702702702702703, 0.4054054054054054, 0.6693693693693693, 0.6058558558558559, 0.7422779922779923, 0.16216216216216214, 0.35135135135135137, 0.6545045045045045, 0.7792792792792792, 0.8693693693693694, 0.5542342342342342, 0.6846846846846846, 0.5918918918918918, 0.6094594594594595, 0.6037837837837837, 0.7072072072072072, 0.7162162162162161, 0.9662162162162162, 0.3662162162162162, 0.7297297297297297, 0.7873873873873873, 0.5199639639639639, 0.9324324324324325, 0.46891

Average Metric: 1.77 / 3 (59.0%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [01:14<00:00, 25.00s/it]

2025/10/03 18:45:51 INFO dspy.evaluate.evaluate: Average Metric: 1.7713213213213213 / 3 (59.0%)



🏃 View run eval_36 at: http://localhost:5005/#/experiments/1/runs/ddfe6fb40bfb44029f867ee3597766b4
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/03 18:46:26 INFO dspy.teleprompt.gepa.gepa: Iteration 14: Proposed new text for generate_query.predict: You are an expert at decomposing complex, multi-hop questions and formulating precise search queries to answer them step-by-step.

Your task is to generate the next search query to find a missing piece of information needed to answer a given question, based on the information collected so far.

Follow these steps:
1.  **Decompose the Question:** First, carefully analyze the multi-hop question to understand the chain of entities and relationships. Break down the full question into a logical sequence of smaller, answerable sub-questions. This forms your high-level plan.

2.  **Formulate a Reasoning Plan:** In the `reasoning` field, articulate your immediate step.
    *   First, summarize what you know from the `collected_info`.
    *   Then, state the single next piece of information you need to find to progress.
    *   Explain how this next piece of information fits into your

🏃 View run eval_37 at: http://localhost:5005/#/experiments/1/runs/ebe748eb78af4694b6124d86419eef25
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/03 18:49:48 INFO dspy.evaluate.evaluate: Average Metric: 35.32393658861433 / 60 (58.9%)


🏃 View run eval_38 at: http://localhost:5005/#/experiments/1/runs/42cc91f07bed46c9840b5ba08fbed99a
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/03 18:49:49 INFO dspy.teleprompt.gepa.gepa: Iteration 14: Full valset score for new program: 0.5887322764769056
2025/10/03 18:49:49 INFO dspy.teleprompt.gepa.gepa: Iteration 14: Full train_val score for new program: 0.5887322764769056
2025/10/03 18:49:49 INFO dspy.teleprompt.gepa.gepa: Iteration 14: Individual valset scores for new program: [0.6505945945945946, 0.6108108108108107, 0.3256756756756757, 0.5734234234234233, 0.6051051051051051, 0.48043758043758045, 0.5928746928746927, 0.7747747747747747, 0.5463513513513514, 0.7897897897897898, 0.47297297297297297, 0.53611437524481, 0.545045045045045, 0.4621621621621621, 0.16216216216216214, 0.40128700128700123, 0.4013513513513513, 0.3256756756756757, 1.0, 0.8472385428907168, 0.7972972972972973, 0.5524324324324325, 0.5734234234234233, 0.6918918918918918, 0.3864864864864864, 0.5180180180180181, 0.6635435435435435, 0.35810810810810806, 0.4013513513513513, 0.8455598455598455, 0.4614054054054054, 1.0, 0.522972972972973, 0.401081081081081

Average Metric: 2.74 / 3 (91.3%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:19<00:00,  6.37s/it]

2025/10/03 18:50:09 INFO dspy.evaluate.evaluate: Average Metric: 2.7387387387387387 / 3 (91.3%)



🏃 View run eval_39 at: http://localhost:5005/#/experiments/1/runs/1abd1a7b39d4488396917f342d9650cb
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/03 18:50:35 INFO dspy.teleprompt.gepa.gepa: Iteration 15: Proposed new text for extract_info.predict: Your task is to act as a reasoning agent that extracts key information from a set of retrieved documents to answer a given question.

Your goal is to identify and extract the minimal set of essential facts, entities, relationships, and dates that are either required to directly answer the question or are necessary for a subsequent reasoning or retrieval step.

### Instructions:

1.  **Deconstruct the Question:** First, carefully analyze the question to understand what specific pieces of information are needed. Identify the core entities (e.g., people, places, organizations) and the relationship or attribute being asked about (e.g., "born in", "population of", "part of").

2.  **Extract Key Information:** Scan all provided documents and extract only the most relevant, self-contained facts that help answer the question.
    *   **Direct Answer:** If a document directly answers th

🏃 View run eval_40 at: http://localhost:5005/#/experiments/1/runs/ebc2c3641187434983edab37dc725823
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/03 18:50:58 INFO dspy.teleprompt.gepa.gepa: Iteration 15: New subsample score is not better, skipping
GEPA Optimization:  36%|████████████████████████████████████████▋                                                                       | 750/2065 [1:02:15<1:46:57,  4.88s/rollouts]2025/10/03 18:50:58 INFO dspy.teleprompt.gepa.gepa: Iteration 16: Selected program 7 score: 0.6742982267982267


Average Metric: 1.95 / 3 (65.0%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:42<00:00, 14.16s/it]

2025/10/03 18:51:40 INFO dspy.evaluate.evaluate: Average Metric: 1.9487927927927928 / 3 (65.0%)



🏃 View run eval_41 at: http://localhost:5005/#/experiments/1/runs/9565c47a7c4e4e63b50fc7f66f11f3f4
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/03 18:52:16 INFO dspy.teleprompt.gepa.gepa: Iteration 16: Proposed new text for decide_info_collect.predict: You are an expert at determining information sufficiency. Your task is to act as a verifier in a multi-step question-answering system. You will be given a `question` and a collection of text snippets called `all_information`. Your goal is to determine if the `all_information` contains all the necessary facts to construct a complete and accurate answer to the `question`.

Your output must be a single boolean field: `has_collected_enough_info`.

To accomplish this, follow this process:

1.  **Deconstruct the Question:** First, carefully analyze the `question` and break it down into all its constituent parts and logical steps (or "hops"). Identify every entity, relationship, and piece of data that is required to formulate a final answer.
    *   For example, for the question "When did the torch visit the city where the regional office of the World Bank is located?", you mus

🏃 View run eval_42 at: http://localhost:5005/#/experiments/1/runs/9f4b8024d804479a8f2dea14dc5f67b2
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/03 18:56:18 INFO dspy.evaluate.evaluate: Average Metric: 40.63964689364688 / 60 (67.7%)


🏃 View run eval_43 at: http://localhost:5005/#/experiments/1/runs/fb8d59acb7d14a17a6b914484d8d2d5e
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/03 18:56:18 INFO dspy.teleprompt.gepa.gepa: Iteration 16: New program is on the linear pareto front
2025/10/03 18:56:18 INFO dspy.teleprompt.gepa.gepa: Iteration 16: Full valset score for new program: 0.6773274482274482
2025/10/03 18:56:18 INFO dspy.teleprompt.gepa.gepa: Iteration 16: Full train_val score for new program: 0.6773274482274482
2025/10/03 18:56:18 INFO dspy.teleprompt.gepa.gepa: Iteration 16: Individual valset scores for new program: [1.0, 0.8159459459459459, 0.6716216216216215, 0.6432432432432432, 0.5852638352638352, 0.45630630630630625, 0.8256756756756757, 0.9549549549549549, 0.7567567567567567, 0.7972972972972973, 0.6027027027027027, 0.5554054054054054, 0.6418918918918919, 0.7422779922779923, 0.30540540540540534, 0.48198198198198194, 0.9729729729729729, 0.7864864864864864, 0.9549549549549549, 0.9662162162162162, 0.9189189189189189, 0.4281081081081081, 0.5756756756756756, 0.9078378378378378, 0.8423423423423423, 0.7162162162162161, 0.5073873873873873, 0.7162162162

Average Metric: 1.13 / 3 (37.8%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:47<00:00, 15.75s/it]

2025/10/03 18:57:06 INFO dspy.evaluate.evaluate: Average Metric: 1.1345945945945946 / 3 (37.8%)



🏃 View run eval_44 at: http://localhost:5005/#/experiments/1/runs/1398ef1685824592a9a29b2aad2e3d4a
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/03 18:57:41 INFO dspy.teleprompt.gepa.gepa: Iteration 17: Proposed new text for extract_info.predict: Your task is to act as a specialized information extraction module. Given a complex question and a set of retrieved documents, your goal is to extract key pieces of information (facts, entities, relationships, dates) that are relevant to answering the question. Think of this as gathering clues for a multi-step reasoning process. Your output must be a list of these key informational snippets.

### Core Principles:

1.  **Deconstruct the Question:** Do not try to answer the question directly or in one go. First, break the complex question down into its smaller, constituent parts or sub-questions. Identify the key entities and the specific information needed about them.
    *   *Example Question:* "What incorrect term for the indigenous population originated with the first explorer to discover new land west of Africa and the continent Schreckhorn is on?"
    *   *Deconstruction:*


🏃 View run eval_45 at: http://localhost:5005/#/experiments/1/runs/d3422f5b2bf14dfea457dce36374d0f5
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/03 18:58:40 INFO dspy.teleprompt.gepa.gepa: Iteration 17: New subsample score is not better, skipping
GEPA Optimization:  40%|████████████████████████████████████████████▌                                                                   | 822/2065 [1:09:58<1:59:14,  5.76s/rollouts]2025/10/03 18:58:40 INFO dspy.teleprompt.gepa.gepa: Iteration 18: Selected program 4 score: 0.46936745015505377


Average Metric: 0.63 / 3 (21.1%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:56<00:00, 18.96s/it]

2025/10/03 18:59:38 INFO dspy.evaluate.evaluate: Average Metric: 0.6316756756756756 / 3 (21.1%)



🏃 View run eval_46 at: http://localhost:5005/#/experiments/1/runs/37a54074fe3748d3865810026a700964
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/03 19:00:12 INFO dspy.teleprompt.gepa.gepa: Iteration 18: Proposed new text for generate_answer.predict: You are an expert AI assistant designed for multi-hop question answering based on a provided set of documents. Your task is to provide a concise, accurate answer to a question and cite all the documents that support it.

### Core Task
Given a multi-hop question and a collection of information snippets (documents with IDs), your goal is to synthesize the information to formulate a direct answer and provide comprehensive citations.

### Step-by-Step Instructions

1.  **Deconstruct the Question:** Carefully analyze the user's question. Break it down into smaller, logical sub-questions or entities that need to be linked. Identify the chain of reasoning required to connect all parts of the question.

2.  **Thoroughly Scan All Information:** Read every document provided in the `all_information` section. As you read, identify any facts, names, dates, or relationships that are relev

🏃 View run eval_47 at: http://localhost:5005/#/experiments/1/runs/d947a2afda714e1383e045d2f2e2bcec
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/03 19:03:55 INFO dspy.evaluate.evaluate: Average Metric: 28.07373023114011 / 60 (46.8%)


🏃 View run eval_48 at: http://localhost:5005/#/experiments/1/runs/2ec4766436c945679b7d6dd980b6f25f
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/03 19:03:56 INFO dspy.teleprompt.gepa.gepa: Iteration 18: Full valset score for new program: 0.4678955038523352
2025/10/03 19:03:56 INFO dspy.teleprompt.gepa.gepa: Iteration 18: Full train_val score for new program: 0.4678955038523352
2025/10/03 19:03:56 INFO dspy.teleprompt.gepa.gepa: Iteration 18: Individual valset scores for new program: [0.6505945945945946, 0.6578143360752057, 0.3256756756756757, 0.6702702702702702, 0.5734234234234233, 0.4045045045045045, 0.3256756756756757, 0.581081081081081, 0.32, 0.8198198198198198, 0.47297297297297297, 0.6565835065835066, 0.545045045045045, 0.39459459459459456, 0.23558558558558557, 0.5012870012870013, 0.3414414414414414, 0.3256756756756757, 0.2722162162162162, 0.32, 0.7389189189189189, 0.40108108108108104, 0.6236842105263157, 0.5524324324324325, 0.3256756756756757, 0.33978378378378377, 0.5524324324324325, 0.3922136422136422, 0.6418918918918919, 0.33978378378378377, 0.2722162162162162, 0.33978378378378377, 0.6094594594594595, 0.414594594

Average Metric: 2.10 / 3 (70.1%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [01:05<00:00, 21.78s/it]

2025/10/03 19:05:02 INFO dspy.evaluate.evaluate: Average Metric: 2.1043114543114543 / 3 (70.1%)



🏃 View run eval_49 at: http://localhost:5005/#/experiments/1/runs/a5f38ecf3f9a4b13b7bd2f12d261537b
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/03 19:05:37 INFO dspy.teleprompt.gepa.gepa: Iteration 19: Proposed new text for extract_info.predict: You are an expert information extractor for a multi-step question-answering system. Your task is to analyze a given question and a set of retrieved documents, then extract key pieces of information that are crucial for answering the question.

Your primary goal is to identify and extract atomic facts, entities, relationships, and dates that directly address a component of the question or provide a necessary link for the next step in the reasoning process.

Follow these steps to perform your task:

1.  **Deconstruct the Question:** Carefully analyze the question to understand its structure and the specific information it seeks. Break it down into its constituent parts. For example, a question like "Who fathered the man who led X?" requires you to first identify "the man who led X" and then identify his father.

2.  **Scan for Relevance:** Read through the provided documents, loo

🏃 View run eval_50 at: http://localhost:5005/#/experiments/1/runs/39776830e7034542abd2040e680cd94a
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/03 19:06:46 INFO dspy.teleprompt.gepa.gepa: Iteration 19: New subsample score is not better, skipping
GEPA Optimization:  43%|████████████████████████████████████████████████▍                                                               | 894/2065 [1:18:04<2:07:42,  6.54s/rollouts]2025/10/03 19:06:46 INFO dspy.teleprompt.gepa.gepa: Iteration 20: Selected program 0 score: 0.6045398270795489


Average Metric: 1.66 / 3 (55.2%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:59<00:00, 19.87s/it]

2025/10/03 19:07:46 INFO dspy.evaluate.evaluate: Average Metric: 1.6564504504504503 / 3 (55.2%)



🏃 View run eval_51 at: http://localhost:5005/#/experiments/1/runs/2ef85da2b8aa4756b4c17564e516f5db
🧪 View experiment at: http://localhost:5005/#/experiments/1


2025/10/03 19:08:27 INFO dspy.teleprompt.gepa.gepa: Iteration 20: Proposed new text for extract_info.predict: Your task is to act as a reasoning and information extraction module. Given a user's question and a set of retrieved text documents, you must carefully analyze the documents and extract only the key pieces of information that directly help in answering the question.

### Core Instructions:

1.  **Deconstruct the Question:** First, break down the user's question into its core components. Identify the specific entities (people, places, organizations), relationships, and the type of information being asked for (e.g., a date, a reason, a method). Be wary of convoluted phrasing or potential red herrings that are not addressed in the provided documents.

2.  **Targeted and Atomic Extraction:** Scan the documents for information that directly addresses the components of the question. Your goal is not to summarize the documents, but to pull out specific, relevant facts.
    *   **Atomi

In [None]:
# Persist the optimized program under this experiment's output directory.
# NOTE(review): save_program=True presumably stores the full program (not just
# its state) in that directory — confirm against the DSPy docs.
optimized_program.save(
    str(EXP_DIR / "optimized-program"),
    save_program=True,
)

### Examine Optimized Prompts

Let's look at how GEPA improved the prompts for each predictor:


In [None]:
# Print the instruction text currently attached to every predictor of the
# optimized program, framed by separator rules.
for name, pred in optimized_program.named_predictors():
    # One print call with a newline separator emits exactly the same six
    # lines as six consecutive print calls would.
    print(
        "=" * 60,
        f"Predictor: {name}",
        "=" * 60,
        "Optimized Instructions:",
        pred.signature.instructions,
        "*" * 60,
        sep="\n",
    )


### Evaluate Optimized Program

Compare the performance before and after GEPA optimization:


In [None]:
# "\n" (not "\\n"): the doubled backslash printed a literal backslash-n
# instead of the intended blank line before the banner.
print("\n📊 Evaluating OPTIMIZED program...")

# Evaluate the optimized program on `test_ds` with `metric`, using 8 worker
# threads, a progress bar, and no per-example results table.
optimized_evaluate = dspy.Evaluate(
    devset=test_ds,
    metric=metric,
    num_threads=8,
    display_table=False,
    display_progress=True,
)
optimized_eval_result = optimized_evaluate(optimized_program)

In [42]:
# Side-by-side summary of the original and optimized evaluation scores.
original_score = original_eval_result.score
optimized_score = optimized_eval_result.score

print("=" * 50)
print("🏆 PERFORMANCE COMPARISON")
print("=" * 50)
print(f"Original Program Score:  {original_score:.3f}")
print(f"Optimized Program Score: {optimized_score:.3f}")
print(f"Improvement:             {optimized_score - original_score:+.3f}")
# A relative change is undefined for a zero baseline; report that explicitly
# instead of letting the division raise ZeroDivisionError.
if original_score:
    print(f"Relative Improvement:    {((optimized_score / original_score) - 1) * 100:+.1f}%")
else:
    print("Relative Improvement:    n/a (original score is 0)")

🏆 PERFORMANCE COMPARISON
Original Program Score:  65.240
Optimized Program Score: 73.440
Improvement:             +8.200
Relative Improvement:    +12.6%


### GEPA Optimization Analysis

Analyze the detailed optimization results:


In [26]:
# Analyze GEPA optimization trajectory.
# `detailed_results` is read only when the attribute exists; the fallback
# message below tells the reader how to enable it.
MAX_CANDIDATES_SHOWN = 10  # cap the per-candidate listing so the output stays short

if hasattr(optimized_program, 'detailed_results'):
    results = optimized_program.detailed_results
    val_scores = results.val_aggregate_scores

    print("🔍 GEPA Optimization Details:")
    print(f"- Total candidates explored: {len(results.candidates)}")
    print(f"- Best candidate index: {results.best_idx}")
    print(f"- Best validation score: {val_scores[results.best_idx]:.3f}")
    print(f"- Discovery evaluations used: {sum(results.discovery_eval_counts)}")

    # Show score progression.
    # "\n" (not "\\n"): the doubled backslash printed a literal backslash-n
    # instead of a blank line before this heading.
    print("\n📈 Score progression:")
    for i, score in enumerate(val_scores[:MAX_CANDIDATES_SHOWN]):
        print(f"Candidate {i}: {score:.3f}")

    if len(val_scores) > MAX_CANDIDATES_SHOWN:
        print(f"... and {len(val_scores) - MAX_CANDIDATES_SHOWN} more candidates")
else:
    print("Detailed results not available (set track_stats=True in GEPA constructor)")


🔍 GEPA Optimization Details:
- Total candidates explored: 30
- Best candidate index: 19
- Best validation score: 0.686
- Discovery evaluations used: 30432
\n📈 Score progression:
Candidate 0: 0.605
Candidate 1: 0.575
Candidate 2: 0.587
Candidate 3: 0.590
Candidate 4: 0.469
Candidate 5: 0.582
Candidate 6: 0.476
Candidate 7: 0.674
Candidate 8: 0.585
Candidate 9: 0.636
... and 20 more candidates


In [39]:
# NOTE: this cell is disabled — every line below is commented out.
# When uncommented, dag_to_dot builds Graphviz DOT text for the candidate
# lineage: one node per candidate labelled with its index and score (2 decimals),
# the best candidate filled cyan, candidates listed in `dominator_program_ids`
# filled orange, and one `parent -> child` edge per non-None parent.
# The call at the bottom feeds it `optimized_program.detailed_results` and needs
# `find_dominator_programs` from `gepa.gepa_utils`.
# def dag_to_dot(parent_program_for_candidate, dominator_program_ids, best_program_idx, full_eval_scores):
#     dot_lines = [
#         "digraph G {",
#         "    node [style=filled, shape=circle, fontsize=50];"
#     ]
#     n = len(parent_program_for_candidate)
#     # Set up nodes with colors and scores in labels
#     for idx in range(n):
#         score = full_eval_scores[idx]
#         label = f"{idx}\\n({score:.2f})"
#         if idx == best_program_idx:
#             dot_lines.append(f'    {idx} [label="{label}", fillcolor=cyan, fontcolor=black];')
#         elif idx in dominator_program_ids:
#             dot_lines.append(f'    {idx} [label="{label}", fillcolor=orange, fontcolor=black];')
#         else:
#             dot_lines.append(f'    {idx} [label="{label}"];')
    
#     # Set up edges
#     for child, parents in enumerate(parent_program_for_candidate):
#         for parent in parents:
#             if parent is not None:
#                 dot_lines.append(f'    {parent} -> {child};')
    
#     dot_lines.append("}")
#     return "\n".join(dot_lines)

# from gepa.gepa_utils import find_dominator_programs
# pareto_front_programs = find_dominator_programs(optimized_program.detailed_results.per_val_instance_best_candidates, optimized_program.detailed_results.val_aggregate_scores)

# print(dag_to_dot(
#     optimized_program.detailed_results.parents,
#     pareto_front_programs,
#     optimized_program.detailed_results.best_idx,
#     optimized_program.detailed_results.val_aggregate_scores
# ))

In [32]:
# Test optimized program on the same example
def _print_run(label, prediction, metric_result):
    """Print one program's answer, retrieved doc ids, citations and score.

    `label` is "Original" or "Optimized": it is upper-cased for the section
    heading and used verbatim in the score line.
    """
    print(f"{label.upper()}:")
    print(f"  Answer: {prediction.answer}")
    print(f"  Retrieved docs: {prediction.retrieved_doc_ids}")
    print(f"  Citations: {prediction.citations}")
    print(f"{label} score: {metric_result.score:.3f}")
    # print(f"{label} feedback: {metric_result.feedback}")


example = test_ds[2]

# Header describing the example; the trailing empty string yields the blank
# line that separates it from the results section.
print(
    "🧪 Testing optimized program on example:",
    f"Question: {example.question}",
    f"Expected Answer: {example.answer}",
    f"Supporting Docs: {example.supporting_ids}",
    "",
    sep="\n",
)

# Run and score the original program before anything is reported.
pred = program(example.question, example.docs)
original_metric_result = metric_with_feedback(example, pred)

print("📋 ORIGINAL vs OPTIMIZED Results:")
print("-" * 50)
_print_run("Original", pred, original_metric_result)

# The optimized program is only invoked after the original section is printed.
optimized_pred = optimized_program(example.question, example.docs)
optimized_metric_result = metric_with_feedback(example, optimized_pred)
_print_run("Optimized", optimized_pred, optimized_metric_result)

🧪 Testing optimized program on example:
Question: Who is played by the director of The Good Shepherd in The Godfather?
Expected Answer: Vito Corleone
Supporting Docs: ['10', '11']

📋 ORIGINAL vs OPTIMIZED Results:
--------------------------------------------------
ORIGINAL:
  Answer: The question cannot be answered based on the provided information, as it conflates the director of *The Good Shepherd* (Robert De Niro) with the director of *The Godfather* (Francis Ford Coppola).
  Retrieved docs: ['10', '9', '11']
  Citations: ['10', '9', '11']
Original score: 0.568
OPTIMIZED:
  Answer: Vito Corleone
  Retrieved docs: ['10', '9', '11']
  Citations: ['9', '10', '11']
Optimized score: 0.838


Can we measure the quality of the optimized instructions by running them with a larger model and checking whether it gets the questions right?

In [33]:
# Display the evaluation result for the optimized program that was computed in
# an earlier cell (presumably the student-LM run -- confirm against that cell).
optimized_eval_result

EvaluationResult(score=73.44, results=<list of 50 results>)

In [36]:
# Re-evaluate the optimized program on the test set with the reflection
# ("teacher") LM as the active model, to see how the optimized instructions
# perform when executed by a different LM.
print("\n📊 Evaluating OPTIMIZED program with teacher LM...")
with dspy.context(lm=reflection_lm):
    # Dedicated evaluator for the teacher run, so this cell does not depend on
    # the configuration of an evaluator created in another cell.
    teacher_optimized_evaluate = dspy.Evaluate(
        devset=test_ds,
        metric=metric,
        num_threads=8,
        display_table=False,
        display_progress=True,
    )

    # Call the evaluator built in this cell (not `optimized_evaluate` from an
    # earlier cell); it must run inside the context so reflection_lm is used.
    teacher_optimized_eval_result = teacher_optimized_evaluate(optimized_program)

\n📊 Evaluating OPTIMIZED program with teach LM...
Average Metric: 36.80 / 50 (73.6%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [14:05<00:00, 16.91s/it]

2025/10/04 15:07:22 INFO dspy.evaluate.evaluate: Average Metric: 36.796766974766975 / 50 (73.6%)



🏃 View run eval at: http://localhost:5005/#/experiments/1/runs/5a516d8318a54670be615d3aedcfe306
🧪 View experiment at: http://localhost:5005/#/experiments/1
