In [1]:
import mlflow

mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("DSPy")

<Experiment: artifact_location='mlflow-artifacts:/374362034103955121', creation_time=1741686562632, experiment_id='374362034103955121', last_update_time=1741686562632, lifecycle_stage='active', name='DSPy', tags={}>

In [2]:
mlflow.dspy.autolog()

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import dspy

lm = dspy.LM('fireworks_ai/accounts/fireworks/models/llama-v3p1-8b-instruct', max_tokens=3000)
gpt4o = dspy.LM('openai/gpt-4o', max_tokens=3000)

dspy.configure(lm=lm)

In [None]:
%pip install -U bm25s PyStemmer "jax[cpu]"

In [2]:
from dspy.utils import download

download("https://huggingface.co/dspy/cache/resolve/main/wiki.abstracts.2017.tar.gz")
!tar -xzvf wiki.abstracts.2017.tar.gz

Downloading 'wiki.abstracts.2017.tar.gz'...
x wiki.abstracts.2017.jsonl


In [4]:
import ujson
corpus = []

with open("wiki.abstracts.2017.jsonl") as f:
    for line in f:
        line = ujson.loads(line)
        corpus.append(f"{line['title']} | {' '.join(line['text'])}")

len(corpus)

5233330

In [5]:
import bm25s
import Stemmer

stemmer = Stemmer.Stemmer("english")
corpus_tokens = bm25s.tokenize(corpus, stopwords="en", stemmer=stemmer)

retriever = bm25s.BM25(k1=0.9, b=0.4)
retriever.index(corpus_tokens)

                                                                                   

In [6]:
import random
from dspy.datasets import DataLoader

kwargs = dict(fields=("claim", "supporting_facts", "hpqa_id", "num_hops"), input_keys=("claim",))
hover = DataLoader().from_huggingface(dataset_name="hover-nlp/hover", split="train", trust_remote_code=True, **kwargs)

hpqa_ids = set()
hover = [
    dspy.Example(claim=x.claim, titles=list(set([y["key"] for y in x.supporting_facts]))).with_inputs("claim")
    for x in hover
    if x["num_hops"] == 3 and x["hpqa_id"] not in hpqa_ids and not hpqa_ids.add(x["hpqa_id"])
]

random.Random(0).shuffle(hover)
trainset, devset, testset = hover[:200], hover[200:500], hover[650:]

In [7]:
example = trainset[0]

print("Claim:", example.claim)
print("Pages that must be retrieved:", example.titles)

Claim: This director is known for his work on Miss Potter. The Academy of Motion Picture Arts and Sciences presents the award in which he was nominated for his work in "Babe".
Pages that must be retrieved: ['Chris Noonan', 'Academy Award for Best Director', 'Miss Potter']


In [8]:
def search(query: str, k: int) -> list[str]:
    tokens = bm25s.tokenize(query, stopwords="en", stemmer=stemmer, show_progress=False)
    results, scores = retriever.retrieve(tokens, k=k, n_threads=1, show_progress=False)
    run = {corpus[doc]: float(score) for doc, score in zip(results[0], scores[0])}
    return run

In [9]:
class Hop(dspy.Module):
    def __init__(self, num_docs=10, num_hops=4):
        self.num_docs, self.num_hops = num_docs, num_hops
        self.generate_query = dspy.ChainOfThought('claim, notes -> query')
        self.append_notes = dspy.ChainOfThought('claim, notes, context -> new_notes: list[str], titles: list[str]')

    def forward(self, claim: str) -> list[str]:
        notes = []
        titles = []

        for _ in range(self.num_hops):
            query = self.generate_query(claim=claim, notes=notes).query
            context = search(query, k=self.num_docs)
            prediction = self.append_notes(claim=claim, notes=notes, context=context)
            notes.extend(prediction.new_notes)
            titles.extend(prediction.titles)
        
        return dspy.Prediction(notes=notes, titles=list(set(titles)))

In [10]:
def top5_recall(example, pred, trace=None):
    gold_titles = example.titles
    recall = sum(x in pred.titles[:5] for x in gold_titles) / len(gold_titles)

    # If we're "bootstrapping" for optimization, return True if and only if the recall is perfect.
    if trace is not None:
        return recall >= 1.0
    
    # If we're just doing inference, just measure the recall.
    return recall

evaluate = dspy.Evaluate(devset=devset, metric=top5_recall, num_threads=16, display_progress=True, display_table=5)

In [11]:
evaluate(Hop())

Average Metric: 16.67 / 71 (23.5%):  23%|██▎       | 70/300 [02:13<09:15,  2.42s/it]

2025/03/13 20:04:48 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The band that collaborated with Neva Dinova on the album "One Jug of Wine, Two Vessels", and Disturbed are both rock bands from America.', 'titles': ['Bright Eyes (band)', 'Disturbed (band)', 'One Jug of Wine, Two Vessels']}) (input_keys={'claim'}): Expected dict_keys(['reasoning', 'new_notes', 'titles']) but got dict_keys(['reasoning', 'new_notes']). Set `provide_traceback=True` for traceback.


Average Metric: 82.33 / 299 (27.5%): 100%|██████████| 300/300 [08:32<00:00,  1.71s/it]

2025/03/13 20:11:05 INFO dspy.evaluate.evaluate: Average Metric: 82.33333333333334 / 300 (27.4%)





Unnamed: 0,claim,example_titles,notes,pred_titles,top5_recall,titles
0,Nike football team has had a player endorse the football boot Nike...,"[Nike Total 90, Marcus Rashford, Nike Hypervenom]",['The Nike Total 90 range has now been replaced with Nike Hyperven...,['2011 AFC Cup | The 2011 AFC Cup was the 8th edition of the AFC C...,✔️ [0.333],
1,Bill Boyd is the chairman of the appliance company that operates t...,"[Suncoast Hotel and Casino, Thomas Eje, Boyd Gaming]",['The claim does not match any of the context information provided...,"['Portrait of an Unknown Woman', 'The Unknown Warrior', 'Frederick...",,
2,The president of South Korea was born 24 January 1953. The group t...,"[Moon Jae-in, Euh Yoon-dae, Presidential Council on Nation Brandin...","['The president of South Korea was born in 1924, not 1953.', 'The ...","['Euh Yoon-Dae', '2000 inter-Korean summit', 'President of South K...",,
3,The movie Khan Kluay was released 2 months before the 2009 movie t...,"[Fantastic Mr. Fox (film), Khan Kluay, Jason Schwartzman]","['The movie Khan Kluay was released in 2006', 'Jason Schwartzman c...","['Fantastic Mr Fox (opera)', 'Khan Kluay', 'List of Khan Kluay cha...",✔️ [0.333],
4,The director of Finding Dory co-directed the film A Bug's Life.,"[Andrew Stanton, Finding Dory, A Bug's Life]","['The director of Finding Dory is Andrew Stanton.', ""Andrew Stanto...","[Finding Dory, Co-director of A Bug's Life, Incorrect Claim, Andre...",✔️ [0.667],


27.44

In [12]:
models = dict(prompt_model=gpt4o, teacher_settings=dict(lm=gpt4o))
tp = dspy.MIPROv2(metric=top5_recall, auto="medium", num_threads=16, **models)

kwargs = dict(minibatch_size=40, minibatch_full_eval_steps=4, requires_permission_to_run=False)
optimized = tp.compile(Hop(), trainset=trainset, max_bootstrapped_demos=4, max_labeled_demos=4, **kwargs)

2025/03/13 20:11:19 INFO dspy.teleprompt.mipro_optimizer_v2: 
RUNNING WITH THE FOLLOWING MEDIUM AUTO RUN SETTINGS:
num_trials: 25
minibatch: True
num_candidates: 9
valset size: 160

2025/03/13 20:11:19 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==
2025/03/13 20:11:19 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.

2025/03/13 20:11:19 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=9 sets of demonstrations...


Bootstrapping set 1/9
Bootstrapping set 2/9
Bootstrapping set 3/9


 40%|████      | 16/40 [09:15<13:53, 34.73s/it]


Bootstrapped 4 full traces after 16 examples for up to 1 rounds, amounting to 16 attempts.
Bootstrapping set 4/9


  2%|▎         | 1/40 [00:38<25:08, 38.68s/it]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 5/9


 15%|█▌        | 6/40 [02:27<13:55, 24.56s/it]


Bootstrapped 2 full traces after 6 examples for up to 1 rounds, amounting to 6 attempts.
Bootstrapping set 6/9


  8%|▊         | 3/40 [01:16<15:38, 25.38s/it]


Bootstrapped 1 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.
Bootstrapping set 7/9


 10%|█         | 4/40 [02:22<21:21, 35.61s/it]


Bootstrapped 2 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 8/9


  8%|▊         | 3/40 [02:16<28:06, 45.58s/it]


Bootstrapped 2 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.
Bootstrapping set 9/9


 10%|█         | 4/40 [01:34<14:06, 23.52s/it]
2025/03/13 20:31:10 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==
2025/03/13 20:31:10 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.


Bootstrapped 3 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.


2025/03/13 20:31:28 INFO dspy.teleprompt.mipro_optimizer_v2: 
Proposing instructions...

2025/03/13 20:35:57 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:

2025/03/13 20:35:57 INFO dspy.teleprompt.mipro_optimizer_v2: 0: Given the fields `claim`, `notes`, produce the fields `query`.

2025/03/13 20:35:57 INFO dspy.teleprompt.mipro_optimizer_v2: 1: Given an initial `claim` and its associated `notes`, systematically reason through the notes using a "Chain of Thought" approach to produce a logical reasoning pathway. This reasoning should clarify the connections and insights derived from the notes, ultimately leading to the formulation of a coherent and precise `query`. The `query` should be designed to guide subsequent searches for additional information or evidence that can further validate or expand upon the initial claim. Ensure that the reasoning is clear and step-by-step, justifying each part of the query to facilitate effective information retrieval i

Average Metric: 47.33 / 160 (29.6%): 100%|██████████| 160/160 [04:43<00:00,  1.77s/it]

2025/03/13 20:40:41 INFO dspy.evaluate.evaluate: Average Metric: 47.33333333333335 / 160 (29.6%)
2025/03/13 20:40:41 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 29.58

2025/03/13 20:40:41 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 2 / 32 - Minibatch ==



Average Metric: 25.67 / 40 (64.2%): 100%|██████████| 40/40 [01:27<00:00,  2.18s/it]

2025/03/13 20:42:08 INFO dspy.evaluate.evaluate: Average Metric: 25.66666666666666 / 40 (64.2%)
2025/03/13 20:42:08 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 64.17 on minibatch of size 40 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 3', 'Predictor 1: Instruction 8', 'Predictor 1: Few-Shot Set 5'].
2025/03/13 20:42:08 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [64.17]
2025/03/13 20:42:08 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [29.58]
2025/03/13 20:42:08 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 29.58


2025/03/13 20:42:08 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 3 / 32 - Minibatch ==



Average Metric: 21.67 / 40 (54.2%): 100%|██████████| 40/40 [01:42<00:00,  2.56s/it]

2025/03/13 20:43:50 INFO dspy.evaluate.evaluate: Average Metric: 21.666666666666668 / 40 (54.2%)
2025/03/13 20:43:50 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 54.17 on minibatch of size 40 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 3', 'Predictor 1: Instruction 8', 'Predictor 1: Few-Shot Set 1'].
2025/03/13 20:43:50 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [64.17, 54.17]
2025/03/13 20:43:50 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [29.58]
2025/03/13 20:43:50 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 29.58


2025/03/13 20:43:50 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 4 / 32 - Minibatch ==



Average Metric: 26.33 / 40 (65.8%): 100%|██████████| 40/40 [01:32<00:00,  2.30s/it]

2025/03/13 20:45:22 INFO dspy.evaluate.evaluate: Average Metric: 26.333333333333332 / 40 (65.8%)
2025/03/13 20:45:22 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 65.83 on minibatch of size 40 with parameters ['Predictor 0: Instruction 8', 'Predictor 0: Few-Shot Set 2', 'Predictor 1: Instruction 8', 'Predictor 1: Few-Shot Set 3'].
2025/03/13 20:45:22 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [64.17, 54.17, 65.83]
2025/03/13 20:45:22 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [29.58]
2025/03/13 20:45:22 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 29.58


2025/03/13 20:45:22 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 5 / 32 - Full Evaluation =====
2025/03/13 20:45:22 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 65.83) from minibatch trials...



Average Metric: 100.00 / 160 (62.5%): 100%|██████████| 160/160 [02:58<00:00,  1.11s/it]

2025/03/13 20:48:21 INFO dspy.evaluate.evaluate: Average Metric: 100.00000000000003 / 160 (62.5%)
2025/03/13 20:48:21 INFO dspy.teleprompt.mipro_optimizer_v2: [92mNew best full eval score![0m Score: 62.5
2025/03/13 20:48:21 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [29.58, 62.5]
2025/03/13 20:48:21 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 62.5
2025/03/13 20:48:21 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/03/13 20:48:21 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 6 / 32 - Minibatch ==



Average Metric: 20.67 / 40 (51.7%): 100%|██████████| 40/40 [01:23<00:00,  2.09s/it]

2025/03/13 20:49:44 INFO dspy.evaluate.evaluate: Average Metric: 20.666666666666668 / 40 (51.7%)
2025/03/13 20:49:44 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 51.67 on minibatch of size 40 with parameters ['Predictor 0: Instruction 5', 'Predictor 0: Few-Shot Set 4', 'Predictor 1: Instruction 3', 'Predictor 1: Few-Shot Set 0'].
2025/03/13 20:49:44 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [64.17, 54.17, 65.83, 51.67]
2025/03/13 20:49:44 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [29.58, 62.5]
2025/03/13 20:49:44 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 62.5


2025/03/13 20:49:44 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 7 / 32 - Minibatch ==



Average Metric: 20.33 / 40 (50.8%): 100%|██████████| 40/40 [00:58<00:00,  1.45s/it]

2025/03/13 20:50:42 INFO dspy.evaluate.evaluate: Average Metric: 20.33333333333333 / 40 (50.8%)
2025/03/13 20:50:42 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 50.83 on minibatch of size 40 with parameters ['Predictor 0: Instruction 3', 'Predictor 0: Few-Shot Set 7', 'Predictor 1: Instruction 0', 'Predictor 1: Few-Shot Set 6'].
2025/03/13 20:50:42 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [64.17, 54.17, 65.83, 51.67, 50.83]
2025/03/13 20:50:42 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [29.58, 62.5]
2025/03/13 20:50:42 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 62.5


2025/03/13 20:50:42 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 8 / 32 - Minibatch ==



Average Metric: 27.00 / 40 (67.5%): 100%|██████████| 40/40 [01:27<00:00,  2.18s/it]

2025/03/13 20:52:10 INFO dspy.evaluate.evaluate: Average Metric: 27.000000000000004 / 40 (67.5%)
2025/03/13 20:52:10 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 67.5 on minibatch of size 40 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 0', 'Predictor 1: Instruction 7', 'Predictor 1: Few-Shot Set 3'].
2025/03/13 20:52:10 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [64.17, 54.17, 65.83, 51.67, 50.83, 67.5]
2025/03/13 20:52:10 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [29.58, 62.5]
2025/03/13 20:52:10 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 62.5


2025/03/13 20:52:10 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 9 / 32 - Full Evaluation =====
2025/03/13 20:52:10 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 67.5) from minibatch trials...



Average Metric: 88.67 / 143 (62.0%):  89%|████████▉ | 142/160 [02:48<00:17,  1.02it/s]

2025/03/13 20:55:10 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'Fredric March costarred in 1935 Australian drama film that Frances Drake is best know for, with the husband of Elsa Lanchester.', 'titles': ['Les Misérables (1935 film)', 'Frances Drake', 'Charles Laughton']}) (input_keys={'claim'}): Expected dict_keys(['reasoning', 'new_notes', 'titles']) but got dict_keys(['reasoning']). Set `provide_traceback=True` for traceback.


Average Metric: 99.67 / 159 (62.7%): 100%|██████████| 160/160 [03:47<00:00,  1.42s/it]

2025/03/13 20:55:57 INFO dspy.evaluate.evaluate: Average Metric: 99.66666666666673 / 160 (62.3%)
2025/03/13 20:55:57 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [29.58, 62.5, 62.29]
2025/03/13 20:55:57 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 62.5
2025/03/13 20:55:57 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/03/13 20:55:57 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 10 / 32 - Minibatch ==



Average Metric: 25.00 / 40 (62.5%): 100%|██████████| 40/40 [01:29<00:00,  2.25s/it]

2025/03/13 20:57:27 INFO dspy.evaluate.evaluate: Average Metric: 25.000000000000004 / 40 (62.5%)
2025/03/13 20:57:27 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 62.5 on minibatch of size 40 with parameters ['Predictor 0: Instruction 8', 'Predictor 0: Few-Shot Set 7', 'Predictor 1: Instruction 1', 'Predictor 1: Few-Shot Set 3'].
2025/03/13 20:57:27 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [64.17, 54.17, 65.83, 51.67, 50.83, 67.5, 62.5]
2025/03/13 20:57:27 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [29.58, 62.5, 62.29]
2025/03/13 20:57:27 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 62.5


2025/03/13 20:57:27 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 11 / 32 - Minibatch ==



Average Metric: 27.33 / 40 (68.3%): 100%|██████████| 40/40 [02:11<00:00,  3.28s/it]

2025/03/13 20:59:38 INFO dspy.evaluate.evaluate: Average Metric: 27.333333333333343 / 40 (68.3%)
2025/03/13 20:59:38 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 68.33 on minibatch of size 40 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 0', 'Predictor 1: Instruction 7', 'Predictor 1: Few-Shot Set 3'].
2025/03/13 20:59:38 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [64.17, 54.17, 65.83, 51.67, 50.83, 67.5, 62.5, 68.33]
2025/03/13 20:59:38 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [29.58, 62.5, 62.29]
2025/03/13 20:59:38 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 62.5


2025/03/13 20:59:38 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 12 / 32 - Minibatch ==



Average Metric: 27.00 / 40 (67.5%): 100%|██████████| 40/40 [01:24<00:00,  2.12s/it]

2025/03/13 21:01:03 INFO dspy.evaluate.evaluate: Average Metric: 27.0 / 40 (67.5%)
2025/03/13 21:01:03 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 67.5 on minibatch of size 40 with parameters ['Predictor 0: Instruction 5', 'Predictor 0: Few-Shot Set 5', 'Predictor 1: Instruction 7', 'Predictor 1: Few-Shot Set 3'].
2025/03/13 21:01:03 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [64.17, 54.17, 65.83, 51.67, 50.83, 67.5, 62.5, 68.33, 67.5]
2025/03/13 21:01:03 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [29.58, 62.5, 62.29]
2025/03/13 21:01:03 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 62.5


2025/03/13 21:01:03 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 13 / 32 - Full Evaluation =====
2025/03/13 21:01:03 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 68.33) from minibatch trials...



Average Metric: 103.67 / 160 (64.8%): 100%|██████████| 160/160 [03:52<00:00,  1.45s/it]

2025/03/13 21:04:55 INFO dspy.evaluate.evaluate: Average Metric: 103.66666666666671 / 160 (64.8%)
2025/03/13 21:04:55 INFO dspy.teleprompt.mipro_optimizer_v2: [92mNew best full eval score![0m Score: 64.79
2025/03/13 21:04:55 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [29.58, 62.5, 62.29, 64.79]
2025/03/13 21:04:55 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 64.79
2025/03/13 21:04:55 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/03/13 21:04:55 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 14 / 32 - Minibatch ==



Average Metric: 23.67 / 40 (59.2%): 100%|██████████| 40/40 [01:02<00:00,  1.56s/it]

2025/03/13 21:05:58 INFO dspy.evaluate.evaluate: Average Metric: 23.666666666666664 / 40 (59.2%)
2025/03/13 21:05:58 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 59.17 on minibatch of size 40 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 0', 'Predictor 1: Instruction 2', 'Predictor 1: Few-Shot Set 3'].
2025/03/13 21:05:58 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [64.17, 54.17, 65.83, 51.67, 50.83, 67.5, 62.5, 68.33, 67.5, 59.17]
2025/03/13 21:05:58 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [29.58, 62.5, 62.29, 64.79]
2025/03/13 21:05:58 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 64.79


2025/03/13 21:05:58 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 15 / 32 - Minibatch ==



Average Metric: 25.67 / 40 (64.2%): 100%|██████████| 40/40 [01:13<00:00,  1.85s/it]

2025/03/13 21:07:12 INFO dspy.evaluate.evaluate: Average Metric: 25.666666666666668 / 40 (64.2%)
2025/03/13 21:07:12 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 64.17 on minibatch of size 40 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 0', 'Predictor 1: Instruction 3', 'Predictor 1: Few-Shot Set 4'].
2025/03/13 21:07:12 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [64.17, 54.17, 65.83, 51.67, 50.83, 67.5, 62.5, 68.33, 67.5, 59.17, 64.17]
2025/03/13 21:07:12 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [29.58, 62.5, 62.29, 64.79]
2025/03/13 21:07:12 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 64.79


2025/03/13 21:07:12 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 16 / 32 - Minibatch ==



Average Metric: 21.67 / 40 (54.2%): 100%|██████████| 40/40 [01:34<00:00,  2.36s/it]

2025/03/13 21:08:46 INFO dspy.evaluate.evaluate: Average Metric: 21.666666666666664 / 40 (54.2%)
2025/03/13 21:08:46 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 54.17 on minibatch of size 40 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 0', 'Predictor 1: Instruction 7', 'Predictor 1: Few-Shot Set 6'].
2025/03/13 21:08:46 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [64.17, 54.17, 65.83, 51.67, 50.83, 67.5, 62.5, 68.33, 67.5, 59.17, 64.17, 54.17]
2025/03/13 21:08:46 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [29.58, 62.5, 62.29, 64.79]
2025/03/13 21:08:46 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 64.79


2025/03/13 21:08:46 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 17 / 32 - Full Evaluation =====
2025/03/13 21:08:46 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 67.5) from minibatch trials...



Average Metric: 38.00 / 57 (66.7%):  36%|███▌      | 57/160 [01:11<02:02,  1.19s/it]

2025/03/13 21:09:58 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The actress that starred as a parent in Sky High (2005 film) is married to John Travolta. She also starred in a summer 1994 film about a pregnant woman who is stranded at a trading post during the American Civil War.', 'titles': ['Kelly Preston', 'Cheyenne Warrior', 'Sky High (2005 film)']}) (input_keys={'claim'}): Expected dict_keys(['reasoning', 'query']) but got dict_keys(['reasoning']). Set `provide_traceback=True` for traceback.


Average Metric: 104.00 / 159 (65.4%): 100%|██████████| 160/160 [03:42<00:00,  1.39s/it]

2025/03/13 21:12:29 INFO dspy.evaluate.evaluate: Average Metric: 104.00000000000006 / 160 (65.0%)
2025/03/13 21:12:29 INFO dspy.teleprompt.mipro_optimizer_v2: [92mNew best full eval score![0m Score: 65.0
2025/03/13 21:12:29 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [29.58, 62.5, 62.29, 64.79, 65.0]
2025/03/13 21:12:29 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 65.0
2025/03/13 21:12:29 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/03/13 21:12:29 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 18 / 32 - Minibatch ==



Average Metric: 23.00 / 40 (57.5%): 100%|██████████| 40/40 [01:15<00:00,  1.88s/it]

2025/03/13 21:13:44 INFO dspy.evaluate.evaluate: Average Metric: 23.000000000000004 / 40 (57.5%)
2025/03/13 21:13:44 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 57.5 on minibatch of size 40 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 6', 'Predictor 1: Instruction 5', 'Predictor 1: Few-Shot Set 3'].
2025/03/13 21:13:44 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [64.17, 54.17, 65.83, 51.67, 50.83, 67.5, 62.5, 68.33, 67.5, 59.17, 64.17, 54.17, 57.5]
2025/03/13 21:13:44 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [29.58, 62.5, 62.29, 64.79, 65.0]
2025/03/13 21:13:44 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 65.0


2025/03/13 21:13:44 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 19 / 32 - Minibatch ==



Average Metric: 21.67 / 40 (54.2%): 100%|██████████| 40/40 [01:14<00:00,  1.86s/it]

2025/03/13 21:14:59 INFO dspy.evaluate.evaluate: Average Metric: 21.666666666666668 / 40 (54.2%)
2025/03/13 21:14:59 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 54.17 on minibatch of size 40 with parameters ['Predictor 0: Instruction 6', 'Predictor 0: Few-Shot Set 1', 'Predictor 1: Instruction 3', 'Predictor 1: Few-Shot Set 3'].
2025/03/13 21:14:59 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [64.17, 54.17, 65.83, 51.67, 50.83, 67.5, 62.5, 68.33, 67.5, 59.17, 64.17, 54.17, 57.5, 54.17]
2025/03/13 21:14:59 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [29.58, 62.5, 62.29, 64.79, 65.0]
2025/03/13 21:14:59 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 65.0


2025/03/13 21:14:59 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 20 / 32 - Minibatch ==



Average Metric: 25.00 / 40 (62.5%): 100%|██████████| 40/40 [01:20<00:00,  2.01s/it]

2025/03/13 21:16:19 INFO dspy.evaluate.evaluate: Average Metric: 24.999999999999996 / 40 (62.5%)
2025/03/13 21:16:19 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 62.5 on minibatch of size 40 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 3', 'Predictor 1: Instruction 7', 'Predictor 1: Few-Shot Set 3'].
2025/03/13 21:16:19 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [64.17, 54.17, 65.83, 51.67, 50.83, 67.5, 62.5, 68.33, 67.5, 59.17, 64.17, 54.17, 57.5, 54.17, 62.5]
2025/03/13 21:16:19 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [29.58, 62.5, 62.29, 64.79, 65.0]
2025/03/13 21:16:19 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 65.0


2025/03/13 21:16:19 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 21 / 32 - Full Evaluation =====
2025/03/13 21:16:19 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 64.17) from minibatch trials...



Average Metric: 99.33 / 160 (62.1%): 100%|██████████| 160/160 [03:45<00:00,  1.41s/it]

2025/03/13 21:20:05 INFO dspy.evaluate.evaluate: Average Metric: 99.33333333333339 / 160 (62.1%)
2025/03/13 21:20:05 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [29.58, 62.5, 62.29, 64.79, 65.0, 62.08]
2025/03/13 21:20:05 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 65.0
2025/03/13 21:20:05 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/03/13 21:20:05 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 22 / 32 - Minibatch ==



Average Metric: 22.33 / 40 (55.8%): 100%|██████████| 40/40 [01:30<00:00,  2.26s/it]

2025/03/13 21:21:35 INFO dspy.evaluate.evaluate: Average Metric: 22.333333333333336 / 40 (55.8%)
2025/03/13 21:21:35 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 55.83 on minibatch of size 40 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 0', 'Predictor 1: Instruction 7', 'Predictor 1: Few-Shot Set 1'].
2025/03/13 21:21:35 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [64.17, 54.17, 65.83, 51.67, 50.83, 67.5, 62.5, 68.33, 67.5, 59.17, 64.17, 54.17, 57.5, 54.17, 62.5, 55.83]
2025/03/13 21:21:35 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [29.58, 62.5, 62.29, 64.79, 65.0, 62.08]
2025/03/13 21:21:35 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 65.0


2025/03/13 21:21:35 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 23 / 32 - Minibatch ==



Average Metric: 24.00 / 40 (60.0%): 100%|██████████| 40/40 [01:43<00:00,  2.58s/it]

2025/03/13 21:23:19 INFO dspy.evaluate.evaluate: Average Metric: 24.000000000000007 / 40 (60.0%)
2025/03/13 21:23:19 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 60.0 on minibatch of size 40 with parameters ['Predictor 0: Instruction 5', 'Predictor 0: Few-Shot Set 7', 'Predictor 1: Instruction 7', 'Predictor 1: Few-Shot Set 3'].
2025/03/13 21:23:19 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [64.17, 54.17, 65.83, 51.67, 50.83, 67.5, 62.5, 68.33, 67.5, 59.17, 64.17, 54.17, 57.5, 54.17, 62.5, 55.83, 60.0]
2025/03/13 21:23:19 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [29.58, 62.5, 62.29, 64.79, 65.0, 62.08]
2025/03/13 21:23:19 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 65.0


2025/03/13 21:23:19 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 24 / 32 - Minibatch ==



Average Metric: 26.00 / 40 (65.0%): 100%|██████████| 40/40 [01:41<00:00,  2.54s/it]

2025/03/13 21:25:00 INFO dspy.evaluate.evaluate: Average Metric: 26.000000000000007 / 40 (65.0%)
2025/03/13 21:25:00 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 65.0 on minibatch of size 40 with parameters ['Predictor 0: Instruction 7', 'Predictor 0: Few-Shot Set 5', 'Predictor 1: Instruction 5', 'Predictor 1: Few-Shot Set 3'].
2025/03/13 21:25:00 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [64.17, 54.17, 65.83, 51.67, 50.83, 67.5, 62.5, 68.33, 67.5, 59.17, 64.17, 54.17, 57.5, 54.17, 62.5, 55.83, 60.0, 65.0]
2025/03/13 21:25:00 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [29.58, 62.5, 62.29, 64.79, 65.0, 62.08]
2025/03/13 21:25:00 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 65.0


2025/03/13 21:25:00 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 25 / 32 - Full Evaluation =====
2025/03/13 21:25:00 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 65.0) from miniba


Average Metric: 95.67 / 160 (59.8%): 100%|██████████| 160/160 [03:34<00:00,  1.34s/it]

2025/03/13 21:28:35 INFO dspy.evaluate.evaluate: Average Metric: 95.66666666666674 / 160 (59.8%)
2025/03/13 21:28:35 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [29.58, 62.5, 62.29, 64.79, 65.0, 62.08, 59.79]
2025/03/13 21:28:35 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 65.0
2025/03/13 21:28:35 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/03/13 21:28:35 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 26 / 32 - Minibatch ==



Average Metric: 24.67 / 40 (61.7%): 100%|██████████| 40/40 [01:00<00:00,  1.52s/it]

2025/03/13 21:29:36 INFO dspy.evaluate.evaluate: Average Metric: 24.666666666666664 / 40 (61.7%)
2025/03/13 21:29:36 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 61.67 on minibatch of size 40 with parameters ['Predictor 0: Instruction 3', 'Predictor 0: Few-Shot Set 4', 'Predictor 1: Instruction 6', 'Predictor 1: Few-Shot Set 3'].
2025/03/13 21:29:36 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [64.17, 54.17, 65.83, 51.67, 50.83, 67.5, 62.5, 68.33, 67.5, 59.17, 64.17, 54.17, 57.5, 54.17, 62.5, 55.83, 60.0, 65.0, 61.67]
2025/03/13 21:29:36 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [29.58, 62.5, 62.29, 64.79, 65.0, 62.08, 59.79]
2025/03/13 21:29:36 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 65.0


2025/03/13 21:29:36 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 27 / 32 - Minibatch ==



Average Metric: 25.67 / 40 (64.2%): 100%|██████████| 40/40 [01:19<00:00,  1.98s/it]

2025/03/13 21:30:55 INFO dspy.evaluate.evaluate: Average Metric: 25.666666666666668 / 40 (64.2%)
2025/03/13 21:30:55 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 64.17 on minibatch of size 40 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 0', 'Predictor 1: Instruction 7', 'Predictor 1: Few-Shot Set 5'].
2025/03/13 21:30:55 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [64.17, 54.17, 65.83, 51.67, 50.83, 67.5, 62.5, 68.33, 67.5, 59.17, 64.17, 54.17, 57.5, 54.17, 62.5, 55.83, 60.0, 65.0, 61.67, 64.17]
2025/03/13 21:30:55 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [29.58, 62.5, 62.29, 64.79, 65.0, 62.08, 59.79]
2025/03/13 21:30:55 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 65.0


2025/03/13 21:30:55 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 28 / 32 - Minibatch ==



Average Metric: 23.00 / 40 (57.5%): 100%|██████████| 40/40 [00:01<00:00, 36.09it/s]

2025/03/13 21:30:57 INFO dspy.evaluate.evaluate: Average Metric: 23.000000000000007 / 40 (57.5%)
2025/03/13 21:30:57 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 57.5 on minibatch of size 40 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 0', 'Predictor 1: Instruction 7', 'Predictor 1: Few-Shot Set 3'].
2025/03/13 21:30:57 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [64.17, 54.17, 65.83, 51.67, 50.83, 67.5, 62.5, 68.33, 67.5, 59.17, 64.17, 54.17, 57.5, 54.17, 62.5, 55.83, 60.0, 65.0, 61.67, 64.17, 57.5]
2025/03/13 21:30:57 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [29.58, 62.5, 62.29, 64.79, 65.0, 62.08, 59.79]
2025/03/13 21:30:57 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 65.0


2025/03/13 21:30:57 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 29 / 32 - Full Evaluation =====
2025/03/13 21:30:57 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (A


Average Metric: 101.67 / 160 (63.5%): 100%|██████████| 160/160 [02:16<00:00,  1.18it/s]

2025/03/13 21:33:13 INFO dspy.evaluate.evaluate: Average Metric: 101.66666666666674 / 160 (63.5%)
2025/03/13 21:33:13 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [29.58, 62.5, 62.29, 64.79, 65.0, 62.08, 59.79, 63.54]
2025/03/13 21:33:13 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 65.0
2025/03/13 21:33:13 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/03/13 21:33:13 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 30 / 32 - Minibatch ==



Average Metric: 11.33 / 40 (28.3%): 100%|██████████| 40/40 [01:14<00:00,  1.86s/it]

2025/03/13 21:34:27 INFO dspy.evaluate.evaluate: Average Metric: 11.333333333333336 / 40 (28.3%)
2025/03/13 21:34:27 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 28.33 on minibatch of size 40 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 5', 'Predictor 1: Instruction 2', 'Predictor 1: Few-Shot Set 0'].
2025/03/13 21:34:27 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [64.17, 54.17, 65.83, 51.67, 50.83, 67.5, 62.5, 68.33, 67.5, 59.17, 64.17, 54.17, 57.5, 54.17, 62.5, 55.83, 60.0, 65.0, 61.67, 64.17, 57.5, 28.33]
2025/03/13 21:34:27 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [29.58, 62.5, 62.29, 64.79, 65.0, 62.08, 59.79, 63.54]
2025/03/13 21:34:27 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 65.0


2025/03/13 21:34:27 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 31 / 32 - Minibatch ==



Average Metric: 16.67 / 26 (64.1%):  65%|██████▌   | 26/40 [00:59<00:21,  1.57s/it]

2025/03/13 21:35:27 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'A science fiction Western television show stars an Canadian , director, producer, writer, singer, musician, voice artist and stand-up comedian. Laura Jane Laughlin appeared on this show.', 'titles': ['Legend (TV series)', 'John de Lancie', 'Laura Jane Laughlin']}) (input_keys={'claim'}): Expected dict_keys(['reasoning', 'query']) but got dict_keys(['reasoning']). Set `provide_traceback=True` for traceback.


Average Metric: 25.00 / 39 (64.1%): 100%|██████████| 40/40 [01:27<00:00,  2.18s/it]

2025/03/13 21:35:54 INFO dspy.evaluate.evaluate: Average Metric: 25.000000000000004 / 40 (62.5%)
2025/03/13 21:35:54 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 62.5 on minibatch of size 40 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 6', 'Predictor 1: Instruction 7', 'Predictor 1: Few-Shot Set 3'].
2025/03/13 21:35:54 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [64.17, 54.17, 65.83, 51.67, 50.83, 67.5, 62.5, 68.33, 67.5, 59.17, 64.17, 54.17, 57.5, 54.17, 62.5, 55.83, 60.0, 65.0, 61.67, 64.17, 57.5, 28.33, 62.5]
2025/03/13 21:35:54 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [29.58, 62.5, 62.29, 64.79, 65.0, 62.08, 59.79, 63.54]
2025/03/13 21:35:54 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 65.0


2025/03/13 21:35:54 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 32 / 32 - Full Evaluation =====
2025/03/13 21:35:54 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top 


Average Metric: 97.00 / 160 (60.6%): 100%|██████████| 160/160 [03:24<00:00,  1.28s/it]

2025/03/13 21:39:19 INFO dspy.evaluate.evaluate: Average Metric: 97.00000000000001 / 160 (60.6%)
2025/03/13 21:39:19 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [29.58, 62.5, 62.29, 64.79, 65.0, 62.08, 59.79, 63.54, 60.63]
2025/03/13 21:39:19 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 65.0
2025/03/13 21:39:19 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/03/13 21:39:19 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 33 / 32 - Minibatch ==



Average Metric: 25.00 / 40 (62.5%): 100%|██████████| 40/40 [00:42<00:00,  1.06s/it]

2025/03/13 21:40:01 INFO dspy.evaluate.evaluate: Average Metric: 24.999999999999996 / 40 (62.5%)
2025/03/13 21:40:01 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 62.5 on minibatch of size 40 with parameters ['Predictor 0: Instruction 8', 'Predictor 0: Few-Shot Set 2', 'Predictor 1: Instruction 5', 'Predictor 1: Few-Shot Set 2'].
2025/03/13 21:40:01 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [64.17, 54.17, 65.83, 51.67, 50.83, 67.5, 62.5, 68.33, 67.5, 59.17, 64.17, 54.17, 57.5, 54.17, 62.5, 55.83, 60.0, 65.0, 61.67, 64.17, 57.5, 28.33, 62.5, 62.5]
2025/03/13 21:40:01 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [29.58, 62.5, 62.29, 64.79, 65.0, 62.08, 59.79, 63.54, 60.63]
2025/03/13 21:40:01 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 65.0


2025/03/13 21:40:01 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 65.0!





In [14]:
optimized.save("optimized_multihop.json")

In [15]:
evaluate(optimized)

Average Metric: 201.67 / 300 (67.2%): 100%|██████████| 300/300 [08:34<00:00,  1.71s/it]

2025/03/13 21:51:32 INFO dspy.evaluate.evaluate: Average Metric: 201.66666666666657 / 300 (67.2%)





Unnamed: 0,claim,example_titles,notes,pred_titles,top5_recall
0,Nike football team has had a player endorse the football boot Nike...,"[Nike Total 90, Marcus Rashford, Nike Hypervenom]","['Nike Total 90 was replaced by Nike Hypervenom.', 'Nike Hyperveno...","[Robert Lewandowski, Kylian Mbappé, Nike Total 90, Marcus Rashford...",✔️ [1.000]
1,Bill Boyd is the chairman of the appliance company that operates t...,"[Suncoast Hotel and Casino, Thomas Eje, Boyd Gaming]",['Bill Boyd is the executive chairman of Boyd Gaming Corporation.'...,"[Boyd Gaming Corporation, Thomas Eje, Suncoast Hotel and Casino, B...",✔️ [0.667]
2,The president of South Korea was born 24 January 1953. The group t...,"[Moon Jae-in, Euh Yoon-dae, Presidential Council on Nation Brandin...","['The president of South Korea was born on January 24, 1953.', 'Eu...","[Euh Yoon-Dae, President of South Korea, Moon Jae-in, KB Financial...",✔️ [0.667]
3,The movie Khan Kluay was released 2 months before the 2009 movie t...,"[Fantastic Mr. Fox (film), Khan Kluay, Jason Schwartzman]","['Khan Kluay was released in 2006.', 'Jason Schwartzman collaborat...","[Fantastic Mr. Fox, Khan Kluay, Jason Schwartzman]",✔️ [0.667]
4,The director of Finding Dory co-directed the film A Bug's Life.,"[Andrew Stanton, Finding Dory, A Bug's Life]","['Andrew Stanton directed Finding Dory.', ""Andrew Stanton co-direc...","[Andrew Stanton, Lee Unkrich, Finding Dory, A Bug's Life]",✔️ [1.000]


67.22

In [19]:
Hop()(claim="The author of the 1960s unproduced script written for The Beatles, Up Against It, and Bernard-Marie Koltès are both playwrights.").titles

['Tabataba suivi de pawana',
 'Up Against It!',
 'The Alien',
 'The Illusionist',
 'Bernard-Marie Koltès',
 'In the Solitude of Cotton Fields',
 'Faultless disagreement',
 'Up Against It',
 'The Hook',
 "The Kilroys' List"]

In [20]:
optimized(claim="The author of the 1960s unproduced script written for The Beatles, Up Against It, and Bernard-Marie Koltès are both playwrights.").titles

['Joe Orton',
 'Bernard-Marie Koltès',
 'What the Butler Saw (play)',
 'Up Against It']

In [21]:
dspy.inspect_history(n=1)





[34m[2025-03-13T21:59:46.192830][0m

[31mSystem message:[0m

Your input fields are:
1. `claim` (str)
2. `notes` (str)
3. `context` (str)

Your output fields are:
1. `reasoning` (str)
2. `new_notes` (list[str])
3. `titles` (list[str])

All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## claim ## ]]
{claim}

[[ ## notes ## ]]
{notes}

[[ ## context ## ]]
{context}

[[ ## reasoning ## ]]
{reasoning}

[[ ## new_notes ## ]]
{new_notes}        # note: the value you produce must adhere to the JSON schema: {"type": "array", "items": {"type": "string"}}

[[ ## titles ## ]]
{titles}        # note: the value you produce must adhere to the JSON schema: {"type": "array", "items": {"type": "string"}}

[[ ## completed ## ]]

In adhering to this structure, your objective is: 
        Using the provided `claim`, `notes`, and `context`, synthesize the information to generate `new_notes` and `titles`. The `new_notes` should include refined and r