In [30]:
import dspy
from dotenv import load_dotenv

# Load environment variables from a local .env file (presumably the OpenAI API key — confirm).
load_dotenv()

# Default LM for the notebook; registered with dspy.configure below.
lm = dspy.LM('openai/gpt-4o-mini', max_tokens=3000)

# Larger model; used later as the MIPROv2 prompt model and as the teacher LM.
gpt4o = dspy.LM('openai/gpt-4o', max_tokens=3000)

# LM handle assigned to the student program before fine-tuning.
# NOTE(review): unlike the two LMs above, this name has no 'openai/' prefix and no max_tokens — confirm this is intentional.
gpt4o_mini = dspy.LM('gpt-4o-mini-2024-07-18')

dspy.configure(lm=lm)

# Download the data for retrieval

In [3]:
from dspy.utils import download

# Fetch the wiki.abstracts.2017 archive, then unpack it with a shell command (IPython `!` syntax).
# The corpus-loading cell reads wiki.abstracts.2017.jsonl, presumably extracted from this archive.
download("https://huggingface.co/dspy/cache/resolve/main/wiki.abstracts.2017.tar.gz")
!tar -xzvf wiki.abstracts.2017.tar.gz

Downloading 'wiki.abstracts.2017.tar.gz'...


## Load the corpus

In [6]:
import ujson
# One "title | text" string per line of the JSONL file.
corpus = []

# Read explicitly as UTF-8 so decoding does not depend on the platform's default encoding.
with open("wiki.abstracts.2017.jsonl", encoding="utf-8") as f:
    for raw_line in f:
        record = ujson.loads(raw_line)
        # 'text' is joined with single spaces, so it is treated as a sequence of strings.
        corpus.append(f"{record['title']} | {' '.join(record['text'])}")

len(corpus)

5233330

## Index the corpus for retrieval with BM25


In [7]:
import bm25s
import Stemmer

# Stemmer reused for corpus tokenization here and for query tokenization in search().
stemmer = Stemmer.Stemmer("english")
# Tokenize every corpus string with the "en" stopword list and the stemmer applied.
corpus_tokens = bm25s.tokenize(corpus, stopwords="en", stemmer=stemmer)

# Build the BM25 index (k1=0.9, b=0.4) over the tokenized corpus.
retriever = bm25s.BM25(k1=0.9, b=0.4)
retriever.index(corpus_tokens)

                                                                                   

# Load the HoVer dataset as an example task

In [11]:
import random
from dspy.datasets import DataLoader

# Fields pulled from the Hugging Face dataset; only "claim" is marked as an input key.
kwargs = dict(fields=("claim", "supporting_facts", "hpqa_id", "num_hops"), input_keys=("claim",))
hover = DataLoader().from_huggingface(dataset_name="hover-nlp/hover", split="train", trust_remote_code=True, **kwargs)

# Keep only 3-hop claims, at most one per hpqa_id. `set.add` returns None, so
# `not hpqa_ids.add(...)` is always True and exists only to record the id as seen.
hpqa_ids = set()
hover = [
    # Gold titles are the de-duplicated "key" values of the supporting facts.
    dspy.Example(claim=x.claim, titles=list(set([y["key"] for y in x.supporting_facts]))).with_inputs("claim")
    for x in hover
    if x["num_hops"] == 3 and x["hpqa_id"] not in hpqa_ids and not hpqa_ids.add(x["hpqa_id"])
]

# Fixed seed so the shuffle is reproducible for a given input order.
random.Random(0).shuffle(hover)
# NOTE(review): items 500-649 fall in none of the three splits — confirm the gap is intentional.
trainset, devset, testset = hover[:200], hover[200:500], hover[650:]

Downloading data: 100%|██████████| 9.21M/9.21M [00:01<00:00, 7.41MB/s]
Downloading data: 100%|██████████| 2.15M/2.15M [00:00<00:00, 4.16MB/s]
Downloading data: 100%|██████████| 899k/899k [00:00<00:00, 6.28MB/s]
Generating train split: 100%|██████████| 18171/18171 [00:00<00:00, 46363.82 examples/s]
Generating validation split: 100%|██████████| 4000/4000 [00:00<00:00, 46954.49 examples/s]
Generating test split: 100%|██████████| 4000/4000 [00:00<00:00, 76419.16 examples/s]


In [12]:
# Inspect the first training example: its claim and the gold page titles.
example = trainset[0]

print("Claim:", example.claim)
print("Pages that must be retrieved:", example.titles)

Claim: This director is known for his work on Miss Potter. The Academy of Motion Picture Arts and Sciences presents the award in which he was nominated for his work in "Babe".
Pages that must be retrieved: ['Academy Award for Best Director', 'Chris Noonan', 'Miss Potter']


# Define the multi-hop agent

In [13]:
def search(query: str, k: int) -> dict[str, float]:
    """Retrieve the top-k corpus passages for a query from the BM25 index.

    Args:
        query: Free-text search query; tokenized with the same stopword setting
            and stemmer that were used for the corpus.
        k: Number of passages to retrieve.

    Returns:
        A dict mapping each retrieved passage string (as stored in ``corpus``)
        to its score as a float, in the order returned by the retriever.
    """
    tokens = bm25s.tokenize(query, stopwords="en", stemmer=stemmer, show_progress=False)
    results, scores = retriever.retrieve(tokens, k=k, n_threads=1, show_progress=False)
    # A single query is submitted, so row 0 holds its document indices and scores.
    return {corpus[doc]: float(score) for doc, score in zip(results[0], scores[0])}

In [14]:
class Hop(dspy.Module):
    """Multi-hop retrieval program.

    On every hop the program writes a search query from the claim and the notes
    gathered so far, retrieves passages with ``search``, and asks the LM for new
    notes and page titles based on the retrieved context.
    """

    def __init__(self, num_docs=10, num_hops=4):
        """Create the program.

        Args:
            num_docs: Number of passages requested from ``search`` on each hop.
            num_hops: Number of query/retrieve/note-taking rounds to run.
        """
        # Let dspy.Module set up its own state before predictors are attached.
        super().__init__()
        self.num_docs, self.num_hops = num_docs, num_hops
        self.generate_query = dspy.ChainOfThought('claim, notes -> query')
        self.append_notes = dspy.ChainOfThought('claim, notes, context -> new_notes: list[str], titles: list[str]')

    def forward(self, claim: str) -> dspy.Prediction:
        """Run all hops for one claim.

        Args:
            claim: The claim to gather supporting pages for.

        Returns:
            A ``dspy.Prediction`` with ``notes`` (all notes accumulated across
            hops) and ``titles`` (unique titles in first-seen order).
        """
        notes = []
        titles = []

        for _ in range(self.num_hops):
            query = self.generate_query(claim=claim, notes=notes).query
            context = search(query, k=self.num_docs)
            prediction = self.append_notes(claim=claim, notes=notes, context=context)
            notes.extend(prediction.new_notes)
            titles.extend(prediction.titles)

        # De-duplicate while keeping first-seen order. The metric only looks at
        # titles[:5], so an arbitrary set ordering would make scores vary between runs.
        return dspy.Prediction(notes=notes, titles=list(dict.fromkeys(titles)))

# Evaluation metric

In [18]:
def top5_recall(example, pred, trace=None):
    """Score a prediction by the fraction of gold titles among its first five titles.

    Args:
        example: Object whose ``titles`` attribute holds the gold page titles.
        pred: Object whose ``titles`` attribute holds the predicted page titles.
        trace: ``None`` during plain evaluation; any other value signals that the
            metric is being used to bootstrap demonstrations.

    Returns:
        When ``trace`` is ``None``, the recall as a float. Otherwise a bool that
        is True only if every gold title was found.
    """
    expected = example.titles
    found = [title for title in expected if title in pred.titles[:5]]
    score = len(found) / len(expected)

    if trace is None:
        # Plain inference: report the raw recall.
        return score

    # Bootstrapping: accept the trace only when recall is perfect.
    return score >= 1.0

# Evaluator over the first 5 dev examples, scored with top5_recall; shows a progress bar and a 5-row results table.
evaluate = dspy.Evaluate(devset=devset[:5], metric=top5_recall, num_threads=16, display_progress=True, display_table=5)

# Evaluate the agent before training

In [19]:
# Baseline: score an unoptimized Hop program with the evaluator.
evaluate(Hop())

Average Metric: 0.00 / 5 (0.0%): 100%|██████████| 5/5 [00:09<00:00,  1.96s/it]

2025/03/26 14:50:43 INFO dspy.evaluate.evaluate: Average Metric: 0.0 / 5 (0.0%)





Unnamed: 0,claim,example_titles,notes,pred_titles,top5_recall
0,Nike football team has had a player endorse the football boot Nike...,"[Nike Hypervenom, Nike Total 90, Marcus Rashford]",[],[],
1,Bill Boyd is the chairman of the appliance company that operates t...,"[Boyd Gaming, Thomas Eje, Suncoast Hotel and Casino]",[],"[Thomas Eje's Debut at Suncoast Casino, Bill Boyd's Role in Boyd G...",
2,The president of South Korea was born 24 January 1953. The group t...,"[Euh Yoon-dae, Moon Jae-in, Presidential Council on Nation Brandin...",[],"[Euh Yoon-Dae's Role in South Korean Government, Presidential Auth...",
3,The movie Khan Kluay was released 2 months before the 2009 movie t...,"[Jason Schwartzman, Khan Kluay, Fantastic Mr. Fox (film)]",[],[],
4,The director of Finding Dory co-directed the film A Bug's Life.,"[A Bug's Life, Finding Dory, Andrew Stanton]",[],[],


0.0

# Optimize with MIPROv2 (prompt optimization)

In [22]:
# gpt-4o is passed both as the prompt model and as the LM in the teacher settings.
models = dict(prompt_model=gpt4o, teacher_settings=dict(lm=gpt4o))
tp = dspy.MIPROv2(metric=top5_recall, auto="medium", num_threads=16, **models)

# NOTE(review): this rebinds `kwargs`, which the dataset-loading cell also uses for its DataLoader arguments.
# NOTE(review): minibatch_size=40 exceeds the 10 training examples passed below — confirm the intended value.
kwargs = dict(minibatch_size=40, minibatch_full_eval_steps=4, requires_permission_to_run=False)
optimized = tp.compile(Hop(), trainset=trainset[:10], max_bootstrapped_demos=4, max_labeled_demos=4, **kwargs)


2025/03/26 14:53:27 INFO dspy.teleprompt.mipro_optimizer_v2: 
RUNNING WITH THE FOLLOWING MEDIUM AUTO RUN SETTINGS:
num_trials: 25
minibatch: False
num_candidates: 9
valset size: 8

2025/03/26 14:53:27 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==
2025/03/26 14:53:27 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.

2025/03/26 14:53:27 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=9 sets of demonstrations...


Bootstrapping set 1/9
Bootstrapping set 2/9
Bootstrapping set 3/9


100%|██████████| 2/2 [00:44<00:00, 22.20s/it]


Bootstrapped 0 full traces after 1 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 4/9


100%|██████████| 2/2 [00:00<00:00, 15.13it/s]


Bootstrapped 0 full traces after 1 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 5/9


100%|██████████| 2/2 [00:00<00:00, 16.59it/s]


Bootstrapped 0 full traces after 1 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 6/9


100%|██████████| 2/2 [00:00<00:00, 14.45it/s]


Bootstrapped 0 full traces after 1 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 7/9


100%|██████████| 2/2 [00:00<00:00, 16.53it/s]


Bootstrapped 0 full traces after 1 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 8/9


100%|██████████| 2/2 [00:00<00:00, 16.75it/s]


Bootstrapped 0 full traces after 1 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 9/9


100%|██████████| 2/2 [00:00<00:00, 16.73it/s]
2025/03/26 14:54:12 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==
2025/03/26 14:54:12 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.


Bootstrapped 0 full traces after 1 examples for up to 1 rounds, amounting to 2 attempts.


2025/03/26 14:54:15 INFO dspy.teleprompt.mipro_optimizer_v2: 
Proposing instructions...

2025/03/26 14:56:11 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:

2025/03/26 14:56:11 INFO dspy.teleprompt.mipro_optimizer_v2: 0: Given the fields `claim`, `notes`, produce the fields `query`.

2025/03/26 14:56:11 INFO dspy.teleprompt.mipro_optimizer_v2: 1: Imagine you are a film historian tasked with uncovering the hidden connections between acclaimed films and their creators. Given the fields `claim` and `notes`, generate a precise query that will help you delve deeper into the cinematic achievements associated with the claim. This query should be designed to extract significant insights that can enhance our understanding of the film's impact and its creators' contributions.

2025/03/26 14:56:11 INFO dspy.teleprompt.mipro_optimizer_v2: 2: You are an information retrieval expert. Given the fields `claim` and `notes`, analyze the information and generate a well-st

Average Metric: 0.67 / 8 (8.3%): 100%|██████████| 8/8 [00:05<00:00,  1.50it/s] 

2025/03/26 14:56:17 INFO dspy.evaluate.evaluate: Average Metric: 0.6666666666666666 / 8 (8.3%)
2025/03/26 14:56:17 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 8.33

2025/03/26 14:56:17 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 2 / 25 =====



Average Metric: 1.67 / 8 (20.8%): 100%|██████████| 8/8 [00:46<00:00,  5.78s/it]

2025/03/26 14:57:03 INFO dspy.evaluate.evaluate: Average Metric: 1.6666666666666665 / 8 (20.8%)
2025/03/26 14:57:03 INFO dspy.teleprompt.mipro_optimizer_v2: [92mBest full score so far![0m Score: 20.83
2025/03/26 14:57:03 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 20.83 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 3', 'Predictor 1: Instruction 8', 'Predictor 1: Few-Shot Set 5'].
2025/03/26 14:57:03 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [8.33, 20.83]
2025/03/26 14:57:03 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 20.83


2025/03/26 14:57:03 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 3 / 25 =====



Average Metric: 1.33 / 8 (16.7%): 100%|██████████| 8/8 [00:32<00:00,  4.05s/it]

2025/03/26 14:57:35 INFO dspy.evaluate.evaluate: Average Metric: 1.3333333333333333 / 8 (16.7%)
2025/03/26 14:57:35 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 16.67 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 3', 'Predictor 1: Instruction 8', 'Predictor 1: Few-Shot Set 1'].
2025/03/26 14:57:35 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [8.33, 20.83, 16.67]
2025/03/26 14:57:35 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 20.83


2025/03/26 14:57:35 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 4 / 25 =====



Average Metric: 2.33 / 8 (29.2%): 100%|██████████| 8/8 [00:33<00:00,  4.21s/it]

2025/03/26 14:58:09 INFO dspy.evaluate.evaluate: Average Metric: 2.333333333333333 / 8 (29.2%)
2025/03/26 14:58:09 INFO dspy.teleprompt.mipro_optimizer_v2: [92mBest full score so far![0m Score: 29.17
2025/03/26 14:58:09 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 29.17 with parameters ['Predictor 0: Instruction 8', 'Predictor 0: Few-Shot Set 2', 'Predictor 1: Instruction 8', 'Predictor 1: Few-Shot Set 3'].
2025/03/26 14:58:09 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [8.33, 20.83, 16.67, 29.17]
2025/03/26 14:58:09 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 29.17


2025/03/26 14:58:09 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 5 / 25 =====



Average Metric: 0.00 / 8 (0.0%): 100%|██████████| 8/8 [00:46<00:00,  5.80s/it]

2025/03/26 14:58:56 INFO dspy.evaluate.evaluate: Average Metric: 0.0 / 8 (0.0%)
2025/03/26 14:58:56 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 0.0 with parameters ['Predictor 0: Instruction 5', 'Predictor 0: Few-Shot Set 4', 'Predictor 1: Instruction 3', 'Predictor 1: Few-Shot Set 0'].
2025/03/26 14:58:56 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [8.33, 20.83, 16.67, 29.17, 0.0]
2025/03/26 14:58:56 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 29.17


2025/03/26 14:58:56 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 6 / 25 =====



Average Metric: 4.33 / 8 (54.2%): 100%|██████████| 8/8 [00:27<00:00,  3.47s/it]

2025/03/26 14:59:23 INFO dspy.evaluate.evaluate: Average Metric: 4.333333333333333 / 8 (54.2%)
2025/03/26 14:59:23 INFO dspy.teleprompt.mipro_optimizer_v2: [92mBest full score so far![0m Score: 54.17
2025/03/26 14:59:23 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 54.17 with parameters ['Predictor 0: Instruction 3', 'Predictor 0: Few-Shot Set 7', 'Predictor 1: Instruction 0', 'Predictor 1: Few-Shot Set 6'].
2025/03/26 14:59:23 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [8.33, 20.83, 16.67, 29.17, 0.0, 54.17]
2025/03/26 14:59:23 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 54.17


2025/03/26 14:59:23 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 7 / 25 =====



Average Metric: 0.67 / 8 (8.3%): 100%|██████████| 8/8 [00:33<00:00,  4.25s/it] 

2025/03/26 14:59:57 INFO dspy.evaluate.evaluate: Average Metric: 0.6666666666666666 / 8 (8.3%)
2025/03/26 14:59:57 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 8.33 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 0', 'Predictor 1: Instruction 7', 'Predictor 1: Few-Shot Set 3'].
2025/03/26 14:59:57 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [8.33, 20.83, 16.67, 29.17, 0.0, 54.17, 8.33]
2025/03/26 14:59:57 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 54.17


2025/03/26 14:59:57 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 8 / 25 =====



Average Metric: 2.33 / 8 (29.2%): 100%|██████████| 8/8 [00:35<00:00,  4.38s/it]

2025/03/26 15:00:32 INFO dspy.evaluate.evaluate: Average Metric: 2.3333333333333335 / 8 (29.2%)
2025/03/26 15:00:32 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 29.17 with parameters ['Predictor 0: Instruction 8', 'Predictor 0: Few-Shot Set 7', 'Predictor 1: Instruction 1', 'Predictor 1: Few-Shot Set 3'].
2025/03/26 15:00:32 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [8.33, 20.83, 16.67, 29.17, 0.0, 54.17, 8.33, 29.17]
2025/03/26 15:00:32 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 54.17


2025/03/26 15:00:32 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 9 / 25 =====



Average Metric: 0.00 / 8 (0.0%): 100%|██████████| 8/8 [00:41<00:00,  5.15s/it]

2025/03/26 15:01:14 INFO dspy.evaluate.evaluate: Average Metric: 0.0 / 8 (0.0%)
2025/03/26 15:01:14 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 0.0 with parameters ['Predictor 0: Instruction 7', 'Predictor 0: Few-Shot Set 3', 'Predictor 1: Instruction 7', 'Predictor 1: Few-Shot Set 0'].
2025/03/26 15:01:14 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [8.33, 20.83, 16.67, 29.17, 0.0, 54.17, 8.33, 29.17, 0.0]
2025/03/26 15:01:14 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 54.17


2025/03/26 15:01:14 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 10 / 25 =====



Average Metric: 3.00 / 8 (37.5%): 100%|██████████| 8/8 [00:31<00:00,  3.99s/it]

2025/03/26 15:01:46 INFO dspy.evaluate.evaluate: Average Metric: 3.0 / 8 (37.5%)
2025/03/26 15:01:46 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 37.5 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 1', 'Predictor 1: Instruction 3', 'Predictor 1: Few-Shot Set 7'].
2025/03/26 15:01:46 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [8.33, 20.83, 16.67, 29.17, 0.0, 54.17, 8.33, 29.17, 0.0, 37.5]
2025/03/26 15:01:46 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 54.17


2025/03/26 15:01:46 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 11 / 25 =====



Average Metric: 4.67 / 8 (58.3%): 100%|██████████| 8/8 [00:24<00:00,  3.00s/it]

2025/03/26 15:02:10 INFO dspy.evaluate.evaluate: Average Metric: 4.666666666666666 / 8 (58.3%)
2025/03/26 15:02:10 INFO dspy.teleprompt.mipro_optimizer_v2: [92mBest full score so far![0m Score: 58.33
2025/03/26 15:02:10 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 58.33 with parameters ['Predictor 0: Instruction 3', 'Predictor 0: Few-Shot Set 7', 'Predictor 1: Instruction 0', 'Predictor 1: Few-Shot Set 1'].
2025/03/26 15:02:10 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [8.33, 20.83, 16.67, 29.17, 0.0, 54.17, 8.33, 29.17, 0.0, 37.5, 58.33]
2025/03/26 15:02:10 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 58.33


2025/03/26 15:02:10 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 12 / 25 =====



Average Metric: 4.33 / 8 (54.2%): 100%|██████████| 8/8 [00:00<00:00, 26.84it/s]

2025/03/26 15:02:10 INFO dspy.evaluate.evaluate: Average Metric: 4.333333333333333 / 8 (54.2%)
2025/03/26 15:02:10 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 54.17 with parameters ['Predictor 0: Instruction 3', 'Predictor 0: Few-Shot Set 6', 'Predictor 1: Instruction 0', 'Predictor 1: Few-Shot Set 6'].
2025/03/26 15:02:10 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [8.33, 20.83, 16.67, 29.17, 0.0, 54.17, 8.33, 29.17, 0.0, 37.5, 58.33, 54.17]
2025/03/26 15:02:10 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 58.33


2025/03/26 15:02:10 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 13 / 25 =====



Average Metric: 4.33 / 8 (54.2%): 100%|██████████| 8/8 [00:00<00:00, 26.80it/s]

2025/03/26 15:02:10 INFO dspy.evaluate.evaluate: Average Metric: 4.333333333333333 / 8 (54.2%)
2025/03/26 15:02:10 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 54.17 with parameters ['Predictor 0: Instruction 3', 'Predictor 0: Few-Shot Set 7', 'Predictor 1: Instruction 0', 'Predictor 1: Few-Shot Set 4'].
2025/03/26 15:02:10 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [8.33, 20.83, 16.67, 29.17, 0.0, 54.17, 8.33, 29.17, 0.0, 37.5, 58.33, 54.17, 54.17]
2025/03/26 15:02:10 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 58.33


2025/03/26 15:02:10 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 14 / 25 =====



Average Metric: 2.33 / 8 (29.2%): 100%|██████████| 8/8 [00:36<00:00,  4.56s/it]

2025/03/26 15:02:47 INFO dspy.evaluate.evaluate: Average Metric: 2.3333333333333335 / 8 (29.2%)
2025/03/26 15:02:47 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 29.17 with parameters ['Predictor 0: Instruction 3', 'Predictor 0: Few-Shot Set 0', 'Predictor 1: Instruction 1', 'Predictor 1: Few-Shot Set 1'].
2025/03/26 15:02:47 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [8.33, 20.83, 16.67, 29.17, 0.0, 54.17, 8.33, 29.17, 0.0, 37.5, 58.33, 54.17, 54.17, 29.17]
2025/03/26 15:02:47 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 58.33


2025/03/26 15:02:47 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 15 / 25 =====



Average Metric: 2.33 / 8 (29.2%): 100%|██████████| 8/8 [00:31<00:00,  3.93s/it]

2025/03/26 15:03:18 INFO dspy.evaluate.evaluate: Average Metric: 2.333333333333333 / 8 (29.2%)
2025/03/26 15:03:18 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 29.17 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 7', 'Predictor 1: Instruction 5', 'Predictor 1: Few-Shot Set 6'].
2025/03/26 15:03:18 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [8.33, 20.83, 16.67, 29.17, 0.0, 54.17, 8.33, 29.17, 0.0, 37.5, 58.33, 54.17, 54.17, 29.17, 29.17]
2025/03/26 15:03:18 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 58.33


2025/03/26 15:03:18 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 16 / 25 =====



Average Metric: 4.33 / 8 (54.2%): 100%|██████████| 8/8 [00:25<00:00,  3.16s/it]

2025/03/26 15:03:44 INFO dspy.evaluate.evaluate: Average Metric: 4.333333333333333 / 8 (54.2%)
2025/03/26 15:03:44 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 54.17 with parameters ['Predictor 0: Instruction 8', 'Predictor 0: Few-Shot Set 5', 'Predictor 1: Instruction 0', 'Predictor 1: Few-Shot Set 1'].
2025/03/26 15:03:44 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [8.33, 20.83, 16.67, 29.17, 0.0, 54.17, 8.33, 29.17, 0.0, 37.5, 58.33, 54.17, 54.17, 29.17, 29.17, 54.17]
2025/03/26 15:03:44 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 58.33


2025/03/26 15:03:44 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 17 / 25 =====



Average Metric: 4.67 / 8 (58.3%): 100%|██████████| 8/8 [00:00<00:00, 28.56it/s]

2025/03/26 15:03:44 INFO dspy.evaluate.evaluate: Average Metric: 4.666666666666666 / 8 (58.3%)
2025/03/26 15:03:44 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 58.33 with parameters ['Predictor 0: Instruction 3', 'Predictor 0: Few-Shot Set 4', 'Predictor 1: Instruction 0', 'Predictor 1: Few-Shot Set 1'].
2025/03/26 15:03:44 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [8.33, 20.83, 16.67, 29.17, 0.0, 54.17, 8.33, 29.17, 0.0, 37.5, 58.33, 54.17, 54.17, 29.17, 29.17, 54.17, 58.33]
2025/03/26 15:03:44 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 58.33


2025/03/26 15:03:44 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 18 / 25 =====



Average Metric: 1.67 / 8 (20.8%): 100%|██████████| 8/8 [00:28<00:00,  3.55s/it]

2025/03/26 15:04:12 INFO dspy.evaluate.evaluate: Average Metric: 1.6666666666666665 / 8 (20.8%)
2025/03/26 15:04:12 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 20.83 with parameters ['Predictor 0: Instruction 7', 'Predictor 0: Few-Shot Set 4', 'Predictor 1: Instruction 6', 'Predictor 1: Few-Shot Set 1'].
2025/03/26 15:04:12 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [8.33, 20.83, 16.67, 29.17, 0.0, 54.17, 8.33, 29.17, 0.0, 37.5, 58.33, 54.17, 54.17, 29.17, 29.17, 54.17, 58.33, 20.83]
2025/03/26 15:04:12 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 58.33


2025/03/26 15:04:12 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 19 / 25 =====



Average Metric: 3.00 / 8 (37.5%): 100%|██████████| 8/8 [00:32<00:00,  4.07s/it]

2025/03/26 15:04:45 INFO dspy.evaluate.evaluate: Average Metric: 2.9999999999999996 / 8 (37.5%)
2025/03/26 15:04:45 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 37.5 with parameters ['Predictor 0: Instruction 3', 'Predictor 0: Few-Shot Set 4', 'Predictor 1: Instruction 3', 'Predictor 1: Few-Shot Set 1'].
2025/03/26 15:04:45 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [8.33, 20.83, 16.67, 29.17, 0.0, 54.17, 8.33, 29.17, 0.0, 37.5, 58.33, 54.17, 54.17, 29.17, 29.17, 54.17, 58.33, 20.83, 37.5]
2025/03/26 15:04:45 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 58.33


2025/03/26 15:04:45 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 20 / 25 =====



Average Metric: 5.00 / 8 (62.5%): 100%|██████████| 8/8 [00:28<00:00,  3.62s/it]

2025/03/26 15:05:14 INFO dspy.evaluate.evaluate: Average Metric: 5.0 / 8 (62.5%)
2025/03/26 15:05:14 INFO dspy.teleprompt.mipro_optimizer_v2: [92mBest full score so far![0m Score: 62.5
2025/03/26 15:05:14 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 62.5 with parameters ['Predictor 0: Instruction 5', 'Predictor 0: Few-Shot Set 7', 'Predictor 1: Instruction 0', 'Predictor 1: Few-Shot Set 1'].
2025/03/26 15:05:14 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [8.33, 20.83, 16.67, 29.17, 0.0, 54.17, 8.33, 29.17, 0.0, 37.5, 58.33, 54.17, 54.17, 29.17, 29.17, 54.17, 58.33, 20.83, 37.5, 62.5]
2025/03/26 15:05:14 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 62.5


2025/03/26 15:05:14 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 21 / 25 =====



Average Metric: 5.00 / 8 (62.5%): 100%|██████████| 8/8 [00:00<00:00, 41.53it/s]

2025/03/26 15:05:14 INFO dspy.evaluate.evaluate: Average Metric: 5.0 / 8 (62.5%)
2025/03/26 15:05:14 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 62.5 with parameters ['Predictor 0: Instruction 5', 'Predictor 0: Few-Shot Set 2', 'Predictor 1: Instruction 0', 'Predictor 1: Few-Shot Set 8'].
2025/03/26 15:05:14 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [8.33, 20.83, 16.67, 29.17, 0.0, 54.17, 8.33, 29.17, 0.0, 37.5, 58.33, 54.17, 54.17, 29.17, 29.17, 54.17, 58.33, 20.83, 37.5, 62.5, 62.5]
2025/03/26 15:05:14 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 62.5


2025/03/26 15:05:14 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 22 / 25 =====



Average Metric: 0.00 / 8 (0.0%): 100%|██████████| 8/8 [00:43<00:00,  5.43s/it]

2025/03/26 15:05:58 INFO dspy.evaluate.evaluate: Average Metric: 0.0 / 8 (0.0%)
2025/03/26 15:05:58 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 0.0 with parameters ['Predictor 0: Instruction 5', 'Predictor 0: Few-Shot Set 1', 'Predictor 1: Instruction 4', 'Predictor 1: Few-Shot Set 8'].
2025/03/26 15:05:58 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [8.33, 20.83, 16.67, 29.17, 0.0, 54.17, 8.33, 29.17, 0.0, 37.5, 58.33, 54.17, 54.17, 29.17, 29.17, 54.17, 58.33, 20.83, 37.5, 62.5, 62.5, 0.0]
2025/03/26 15:05:58 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 62.5


2025/03/26 15:05:58 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 23 / 25 =====



Average Metric: 5.00 / 8 (62.5%): 100%|██████████| 8/8 [00:27<00:00,  3.47s/it]

2025/03/26 15:06:25 INFO dspy.evaluate.evaluate: Average Metric: 5.0 / 8 (62.5%)
2025/03/26 15:06:25 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 62.5 with parameters ['Predictor 0: Instruction 7', 'Predictor 0: Few-Shot Set 2', 'Predictor 1: Instruction 0', 'Predictor 1: Few-Shot Set 8'].
2025/03/26 15:06:25 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [8.33, 20.83, 16.67, 29.17, 0.0, 54.17, 8.33, 29.17, 0.0, 37.5, 58.33, 54.17, 54.17, 29.17, 29.17, 54.17, 58.33, 20.83, 37.5, 62.5, 62.5, 0.0, 62.5]
2025/03/26 15:06:25 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 62.5


2025/03/26 15:06:25 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 24 / 25 =====



Average Metric: 5.00 / 8 (62.5%): 100%|██████████| 8/8 [00:00<00:00, 41.73it/s]

2025/03/26 15:06:26 INFO dspy.evaluate.evaluate: Average Metric: 5.0 / 8 (62.5%)
2025/03/26 15:06:26 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 62.5 with parameters ['Predictor 0: Instruction 5', 'Predictor 0: Few-Shot Set 2', 'Predictor 1: Instruction 0', 'Predictor 1: Few-Shot Set 8'].
2025/03/26 15:06:26 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [8.33, 20.83, 16.67, 29.17, 0.0, 54.17, 8.33, 29.17, 0.0, 37.5, 58.33, 54.17, 54.17, 29.17, 29.17, 54.17, 58.33, 20.83, 37.5, 62.5, 62.5, 0.0, 62.5, 62.5]
2025/03/26 15:06:26 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 62.5


2025/03/26 15:06:26 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 25 / 25 =====



Average Metric: 4.67 / 8 (58.3%): 100%|██████████| 8/8 [00:24<00:00,  3.07s/it]

2025/03/26 15:06:50 INFO dspy.evaluate.evaluate: Average Metric: 4.666666666666666 / 8 (58.3%)
2025/03/26 15:06:50 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 58.33 with parameters ['Predictor 0: Instruction 7', 'Predictor 0: Few-Shot Set 2', 'Predictor 1: Instruction 0', 'Predictor 1: Few-Shot Set 2'].
2025/03/26 15:06:50 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [8.33, 20.83, 16.67, 29.17, 0.0, 54.17, 8.33, 29.17, 0.0, 37.5, 58.33, 54.17, 54.17, 29.17, 29.17, 54.17, 58.33, 20.83, 37.5, 62.5, 62.5, 0.0, 62.5, 62.5, 58.33]
2025/03/26 15:06:50 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 62.5


2025/03/26 15:06:50 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 62.5!





## Evaluate the optimized agent

In [None]:
# Score the MIPROv2-optimized program with the same evaluator used for the baseline.
evaluate(optimized)

In [23]:
# Persist the optimized program to a JSON file.
optimized.save("optimized_hop_4o-mini.json")

# Fine-tune gpt-4o-mini from the teacher program

In [27]:
# Enable experimental features
# NOTE(review): presumably required by BootstrapFinetune in a later cell — confirm against the installed dspy version.
dspy.settings.experimental = True

In [31]:
# Copy the optimized program and point the copy at the gpt4o_mini LM; this copy is the student passed to the fine-tuning step.
student_4o_mini = optimized.deepcopy()
student_4o_mini.set_lm(gpt4o_mini)

In [28]:
# Fine-tune the student with the optimized program as teacher, using the first 10 training examples.
optimizer = dspy.BootstrapFinetune(metric=top5_recall, num_threads=16)
finetuned_4o_mini = optimizer.compile(student_4o_mini, teacher=optimized, trainset=trainset[:10])

2025/03/26 15:12:55 INFO dspy.teleprompt.bootstrap_finetune: Preparing the student and teacher programs...
2025/03/26 15:12:55 INFO dspy.teleprompt.bootstrap_finetune: Bootstrapping data...


Average Metric: 7.00 / 10 (70.0%): 100%|██████████| 10/10 [00:00<00:00, 45.22it/s]

2025/03/26 15:12:55 INFO dspy.evaluate.evaluate: Average Metric: 7.0 / 10 (70.0%)
2025/03/26 15:12:55 INFO dspy.teleprompt.bootstrap_finetune: Preparing the train data...
2025/03/26 15:12:55 INFO dspy.teleprompt.bootstrap_finetune: Collected data for 10 examples
2025/03/26 15:12:55 INFO dspy.teleprompt.bootstrap_finetune: After filtering with the metric, 10 examples remain
2025/03/26 15:12:55 INFO dspy.teleprompt.bootstrap_finetune: Using 80 data points for fine-tuning the model: gpt-4o-mini-2024-07-18
2025/03/26 15:12:55 INFO dspy.teleprompt.bootstrap_finetune: Starting LM fine-tuning...
2025/03/26 15:12:55 INFO dspy.teleprompt.bootstrap_finetune: 1 fine-tuning job(s) to start
2025/03/26 15:12:55 INFO dspy.teleprompt.bootstrap_finetune: Starting 1 fine-tuning job(s)...
2025/03/26 15:12:55 INFO dspy.teleprompt.bootstrap_finetune: Calling lm.kill() on the LM to be fine-tuned to free up resources. This won't have any effect if the LM is not running.



[OpenAI Provider] Validating the data format
[OpenAI Provider] Saving the data to a file
[OpenAI Provider] Data saved to /Users/benediktstroebl/.dspy_cache/finetune/30276e0e4ead02e1.jsonl
[OpenAI Provider] Uploading the data to the provider
[OpenAI Provider] Starting remote training
[OpenAI Provider] Job started with the OpenAI Job ID ftjob-BnWwwRUFUWPx1bSv4OWIU3aR
[OpenAI Provider] Waiting for training to complete
[OpenAI Provider] 2025-03-26 15:12:58 Validating training file: file-XvpoWpeKzciVQnofQzyzXy


## Evaluate the finetuned agent

In [None]:
# Score the fine-tuned program with the same evaluator used for the earlier programs.
evaluate(finetuned_4o_mini)

In [None]:
# Save the fine-tuned program.
# NOTE(review): this uses a .pkl path, whereas the optimized program was saved as .json — confirm the format is intended.
finetuned_4o_mini.save('finetuned_4o_mini_001.pkl')

## Example: Load the finetuned agent from file

In [None]:
# Build a fresh Hop program, then restore the saved fine-tuned state into it.
# NOTE(review): the .pkl suffix suggests pickle-based loading — only load files from a trusted source.
loaded = Hop()
loaded.load('finetuned_4o_mini_001.pkl')