In [1]:
import mlflow

# Point the MLflow client at the tracking server on localhost:5000
# (assumes a server is already listening there — TODO confirm for your setup).
mlflow.set_tracking_uri("http://localhost:5000")
# Make "DSPy" the active experiment; as the cell's last expression its return value is displayed.
mlflow.set_experiment("DSPy")

<Experiment: artifact_location='mlflow-artifacts:/374362034103955121', creation_time=1741686562632, experiment_id='374362034103955121', last_update_time=1741686562632, lifecycle_stage='active', name='DSPy', tags={}>

In [2]:
# Enable MLflow's autologging integration for DSPy.
mlflow.dspy.autolog()

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
from dotenv import load_dotenv

# Load variables from a .env file into the environment; presumably this supplies the
# provider API keys used by the LMs configured later — TODO confirm.
load_dotenv()


True

In [23]:
import dspy

# Two LM handles, both sampled at temperature 0.7: a Llama 3.2 3B instruct model addressed
# through the fireworks_ai provider prefix, and OpenAI's gpt-4o.
llama3b = dspy.LM('fireworks_ai/accounts/fireworks/models/llama-v3p2-3b-instruct', temperature=0.7)
gpt4o = dspy.LM('openai/gpt-4o', temperature=0.7)

# Use the small Llama model as the default LM for DSPy modules.
dspy.configure(lm=llama3b)

In [5]:
import random
from dspy.datasets import DataLoader

# Load only the fields used below from the HoVer train split; `claim` is the sole input field.
kwargs = dict(fields=("claim", "supporting_facts", "hpqa_id", "num_hops"), input_keys=("claim",))
hover = DataLoader().from_huggingface(dataset_name="hover-nlp/hover", split="train", trust_remote_code=True, **kwargs)

# Keep 3-hop claims only, and at most one claim per hpqa_id. `set.add` returns None, so
# `not hpqa_ids.add(...)` is always True and exists purely to record the id as seen.
hpqa_ids = set()
hover = [
    # dict.fromkeys de-duplicates the titles while preserving first-seen order. The previous
    # list(set(...)) produced an order that depends on string hashing, which can differ
    # between kernel restarts and made `example.titles` non-reproducible.
    dspy.Example(claim=x.claim, titles=list(dict.fromkeys(y["key"] for y in x.supporting_facts))).with_inputs("claim")
    for x in hover
    if x["num_hops"] == 3 and x["hpqa_id"] not in hpqa_ids and not hpqa_ids.add(x["hpqa_id"])
]

# Fixed-seed shuffle so the splits are reproducible. Note that rows 200-649 are not
# assigned to any split.
random.Random(0).shuffle(hover)
trainset, devset, testset = hover[:100], hover[100:200], hover[650:]

Downloading builder script: 100%|██████████| 4.48k/4.48k [00:00<00:00, 15.9kB/s]
Downloading readme: 100%|██████████| 4.13k/4.13k [00:00<00:00, 17.7kB/s]
Downloading data: 9.21MB [00:00, 77.4MB/s]                   
Downloading data: 2.15MB [00:00, 44.9MB/s]                  
Downloading data: 899kB [00:00, 35.5MB/s]                   
Generating train split: 100%|██████████| 18171/18171 [00:00<00:00, 36946.44 examples/s]
Generating validation split: 100%|██████████| 4000/4000 [00:00<00:00, 28205.96 examples/s]
Generating test split: 100%|██████████| 4000/4000 [00:00<00:00, 68392.45 examples/s]


In [6]:
# Peek at the first training example: the claim and the gold page titles attached to it.
example = trainset[0]

print("Claim:", example.claim)
print("Pages that must be retrieved:", example.titles)

Claim: This director is known for his work on Miss Potter. The Academy of Motion Picture Arts and Sciences presents the award in which he was nominated for his work in "Babe".
Pages that must be retrieved: ['Miss Potter', 'Chris Noonan', 'Academy Award for Best Director']


In [7]:
# Cache of passages retrieved so far, keyed by page title; populated as a side effect of `search`.
DOCS = {}

def search(query: str, k: int) -> list[str]:
    """Retrieve `k` passages for `query` from the ColBERTv2 wiki17_abstracts endpoint.

    Each passage shaped like "<title> | <text>" is also cached as `DOCS[title] = text`.

    Args:
        query: Free-text search query.
        k: Number of passages to request from the retriever.

    Returns:
        The passage strings, in the order the retriever returned them.
    """
    results = dspy.ColBERTv2(url='http://20.102.90.50:2017/wiki17_abstracts')(query, k=k)
    results = [x['text'] for x in results]

    for result in results:
        # partition() never raises; the earlier two-way unpack of split(" | ", 1) crashed with
        # ValueError on any passage that lacks the " | " separator.
        title, sep, text = result.partition(" | ")
        if sep:
            DOCS[title] = text

    return results

In [8]:
def search_wikipedia(query: str) -> list[str]:
    """Returns top-5 results and then the titles of the top-5 to top-30 results."""
    # The docstring above is what the LM sees as this tool's description, so its wording is
    # deliberately left untouched.

    passages = search(query, 30)
    head, tail = passages[:5], passages[5:30]

    # Only the (backtick-quoted) titles of the remaining hits are reported, not their text.
    quoted_titles = []
    for passage in tail:
        quoted_titles.append(f"`{passage.split(' | ')[0]}`")

    return head + [f"Other retrieved pages have titles: {', '.join(quoted_titles)}."]

def lookup_wikipedia(title: str) -> str:
    """Returns the text of the Wikipedia page, if it exists."""
    # The docstring above is what the LM sees as this tool's description, so its wording is
    # deliberately left untouched.

    # Fast path: the page was already seen by an earlier search.
    if title in DOCS:
        return DOCS[title]

    # Not cached: query the retriever and accept only a passage whose title matches exactly.
    prefix = title + " | "
    for passage in search(title, 10):
        if passage.startswith(prefix):
            # Return just the page text, like the cached branch does. This branch used to
            # return the whole "<title> | <text>" passage, so the same title produced
            # different results on its first and on later lookups.
            return passage[len(prefix):]

    return f"No Wikipedia page found for title: {title}"

In [9]:
# Task description attached to the signature: one `claim` in, a list of page `titles` out.
instructions = "Find all Wikipedia titles relevant to verifying (or refuting) the claim."
signature = dspy.Signature("claim -> titles: list[str]", instructions)

# ReAct agent that may call the two Wikipedia tools, capped at 20 iterations per claim.
react = dspy.ReAct(
    signature,
    tools=[search_wikipedia, lookup_wikipedia],
    max_iters=20,
)

In [19]:
# Smoke-test the unoptimized agent on one claim and show at most its first three titles.
react(claim="David Gregory was born in 1625.").titles[:3]

[]

In [20]:
# Print the most recent LM interaction (n=1) to see the prompt and trajectory behind the result above.
dspy.inspect_history(n=1)





[34m[2025-03-12T18:38:23.559367][0m

[31mSystem message:[0m

Your input fields are:
1. `claim` (str)
2. `trajectory` (str)

Your output fields are:
1. `reasoning` (str)
2. `titles` (list[str])

All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## claim ## ]]
{claim}

[[ ## trajectory ## ]]
{trajectory}

[[ ## reasoning ## ]]
{reasoning}

[[ ## titles ## ]]
{titles}        # note: the value you produce must adhere to the JSON schema: {"type": "array", "items": {"type": "string"}}

[[ ## completed ## ]]

In adhering to this structure, your objective is: 
        Find all Wikipedia titles relevant to verifying (or refuting) the claim.


[31mUser message:[0m

[[ ## claim ## ]]
David Gregory was born in 1625.

[[ ## trajectory ## ]]
[[ ## thought_0 ## ]]
Searching for information about David Gregory's birth.

[[ ## tool_name_0 ## ]]
search_wikipedia

[[ ## tool_args_0 ## ]]
{"query": {"type": "string"}}

[[ ## observation_0 ## ]]


In [21]:
def top5_recall(example, pred, trace=None):
    """Fraction of the gold titles that appear among the first five predicted titles.

    Args:
        example: Object whose `titles` attribute holds the gold page titles.
        pred: Object whose `titles` attribute holds the predicted titles; only the
            first five are considered. A missing/None list is treated as empty.
        trace: None during plain evaluation; any other value means the metric is being
            used for bootstrapping and a strict pass/fail answer is required.

    Returns:
        When `trace` is None, the recall as a float in [0, 1]. Otherwise a bool that is
        True only if every gold title was found. An example without gold titles scores
        0.0 (or False) instead of raising ZeroDivisionError.
    """
    gold_titles = example.titles
    if not gold_titles:
        # Nothing to recall: avoid dividing by zero and never treat this as a success.
        return 0.0 if trace is None else False

    # Slice once instead of once per gold title.
    top5 = (pred.titles or [])[:5]
    recall = sum(x in top5 for x in gold_titles) / len(gold_titles)

    # If we're "bootstrapping" for optimization, return True if and only if the recall is perfect.
    if trace is not None:
        return recall >= 1.0

    # If we're just doing inference, just measure the recall.
    return recall

# Evaluation harness: scores a program on the dev set with top5_recall across 16 threads,
# showing a progress bar and a 5-row results table.
evaluate = dspy.Evaluate(
    devset=devset,
    metric=top5_recall,
    num_threads=16,
    display_progress=True,
    display_table=5,
)

In [24]:
def safe_react(claim: str):
    """Run the `react` agent on `claim`, falling back to an empty prediction on failure.

    Args:
        claim: The claim to find relevant Wikipedia titles for.

    Returns:
        The agent's prediction, or `dspy.Prediction(titles=[])` if the agent raised.
    """
    import logging

    try:
        return react(claim=claim)
    except Exception as e:
        # Best-effort on purpose so one bad example cannot abort an evaluation run, but the
        # failure is reported rather than silently swallowed.
        logging.getLogger(__name__).warning("react failed for claim %r: %r", claim, e)
        return dspy.Prediction(titles=[])

# Baseline: score the unoptimized agent (wrapped so that failures count as empty predictions).
evaluate(safe_react)

Average Metric: 25.00 / 100 (25.0%): 100%|██████████| 100/100 [04:23<00:00,  2.63s/it]

2025/03/12 18:43:58 INFO dspy.evaluate.evaluate: Average Metric: 24.99999999999999 / 100 (25.0%)





Unnamed: 0,claim,example_titles,trajectory,reasoning,pred_titles,top5_recall
0,The Church of England's movement that inspired the Trinity Episcop...,"[Oxford Movement, Trinity Episcopal Church (Houghton, Michigan), S...","{'thought_0': ""The claim mentions the Church of England and the Tr...",The claim mentions the Church of England and the Trinity Episcopal...,"['John Lettice', 'John Kettlewell', 'John Purchas', 'Joachim Lütke...",
1,"Red, White & Crüe and this athlete both fight. The french fighter ...","[Mike Tyson, Bobby Stewart, Red, White &amp; Crüe]","{'thought_0': 'The french fighter is likely Patrick Surtain', 'too...",The reasoning is that the tool was unable to find reliable informa...,"['Billy Stewart', 'Jean-Patrick Lescarboura', 'Simon Patrick Stewa...",
2,The writer/director/actor from Glen or Glenda and Fernand Rivers s...,"[Fernand Rivers, Ed Wood, Glen or Glenda]",{'thought_0': 'The writer/director/actor from Glen or Glenda and F...,The writer/director/actor from Glen or Glenda and Fernand Rivers i...,['The Incredibly Strange Creatures Who Stopped Living and Became M...,
3,The film by Sandi Sissel was released before The End of Suburbia.,"[Sandi Sissel, Chicken Ranch (film), The End of Suburbia]",{'thought_0': 'Search for the director of The End of Suburbia to s...,The director of The End of Suburbia may have also directed the fil...,"['Sandi Sissel', 'Escape from Suburbia', 'Suburb (1951 film)', 'Su...",✔️ [0.333]
4,The actor who played captain hook in the live production with Tayl...,"[Taylor Louderman, Christopher Walken, Peter Pan Live!]",{'thought_0': 'The actor who played captain hook in the live produ...,The reasoning behind this response is that none of the provided to...,"['Cyril Ritchard', 'Captain James Hook', 'E. Holman Clark', 'Peter...",


25.0

In [25]:
# Print the most recent LM interaction (n=1) from the evaluation run above.
dspy.inspect_history(n=1)





[34m[2025-03-12T18:43:58.296078][0m

[31mSystem message:[0m

Your input fields are:
1. `claim` (str)
2. `trajectory` (str)

Your output fields are:
1. `reasoning` (str)
2. `titles` (list[str])

All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## claim ## ]]
{claim}

[[ ## trajectory ## ]]
{trajectory}

[[ ## reasoning ## ]]
{reasoning}

[[ ## titles ## ]]
{titles}        # note: the value you produce must adhere to the JSON schema: {"type": "array", "items": {"type": "string"}}

[[ ## completed ## ]]

In adhering to this structure, your objective is: 
        Find all Wikipedia titles relevant to verifying (or refuting) the claim.


[31mUser message:[0m

[[ ## claim ## ]]
The director of Daughters of Mara's first album is the son of the producer who worked for the group Badfinger.

[[ ## trajectory ## ]]
[[ ## thought_0 ## ]]
The director of Daughters of Mara's first album is the son of the producer who worked for the group 

In [29]:
# gpt-4o is passed both as the LM in the teacher settings and as the prompt model;
# max_errors is set high so that individual failures do not stop the optimization.
kwargs = {
    "teacher_settings": {"lm": gpt4o},
    "prompt_model": gpt4o,
    "max_errors": 999,
}

tp = dspy.MIPROv2(metric=top5_recall, auto="medium", num_threads=16, **kwargs)

# Optimize the agent on the training split using bootstrapped demos only (no labeled demos).
optimized_react = tp.compile(
    react,
    trainset=trainset,
    max_bootstrapped_demos=3,
    max_labeled_demos=0,
)

2025/03/12 18:47:12 INFO dspy.teleprompt.mipro_optimizer_v2: 
RUNNING WITH THE FOLLOWING MEDIUM AUTO RUN SETTINGS:
num_trials: 25
minibatch: True
num_candidates: 9
valset size: 80

2025/03/12 18:47:43 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==
2025/03/12 18:47:43 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.

2025/03/12 18:47:43 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=9 sets of demonstrations...


Bootstrapping set 1/9
Bootstrapping set 2/9


 35%|███▌      | 7/20 [01:45<03:15, 15.03s/it]


Bootstrapped 3 full traces after 7 examples for up to 1 rounds, amounting to 7 attempts.
Bootstrapping set 3/9


 35%|███▌      | 7/20 [01:16<02:21, 10.90s/it]


Bootstrapped 3 full traces after 7 examples for up to 1 rounds, amounting to 7 attempts.
Bootstrapping set 4/9


 15%|█▌        | 3/20 [00:22<02:05,  7.39s/it]


Bootstrapped 1 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.
Bootstrapping set 5/9


 40%|████      | 8/20 [00:30<00:46,  3.86s/it]


Bootstrapped 3 full traces after 8 examples for up to 1 rounds, amounting to 8 attempts.
Bootstrapping set 6/9


 25%|██▌       | 5/20 [00:09<00:28,  1.87s/it]


Bootstrapped 3 full traces after 5 examples for up to 1 rounds, amounting to 5 attempts.
Bootstrapping set 7/9


 20%|██        | 4/20 [00:31<02:05,  7.85s/it]


Bootstrapped 3 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 8/9


 15%|█▌        | 3/20 [00:00<00:00, 82.19it/s]


Bootstrapped 1 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.
Bootstrapping set 9/9


 25%|██▌       | 5/20 [00:24<01:12,  4.84s/it]
2025/03/12 18:52:43 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==
2025/03/12 18:52:43 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.


Bootstrapped 3 full traces after 5 examples for up to 1 rounds, amounting to 5 attempts.


2025/03/12 18:52:56 INFO dspy.teleprompt.mipro_optimizer_v2: 
Proposing instructions...

2025/03/12 18:57:12 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:

2025/03/12 18:57:12 INFO dspy.teleprompt.mipro_optimizer_v2: 0: Find all Wikipedia titles relevant to verifying (or refuting) the claim.

You will be given `claim` and your goal is to finish with `titles`.

To do this, you will interleave Thought, Tool Name, and Tool Args, and receive a resulting Observation.

Thought can reason about the current situation, and Tool Name can be the following types:

(1) search_wikipedia, whose description is <desc>Returns top-5 results and then the titles of the top-5 to top-30 results.</desc>. It takes arguments {'query': {'type': 'string'}} in JSON format.
(2) lookup_wikipedia, whose description is <desc>Returns the text of the Wikipedia page, if it exists.</desc>. It takes arguments {'title': {'type': 'string'}} in JSON format.
(3) finish, whose description is <d

Average Metric: 22.33 / 80 (27.9%): 100%|██████████| 80/80 [03:32<00:00,  2.65s/it]

2025/03/12 19:00:44 INFO dspy.evaluate.evaluate: Average Metric: 22.333333333333325 / 80 (27.9%)
2025/03/12 19:00:44 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 27.92

2025/03/12 19:00:44 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 2 / 28 - Minibatch ==



Average Metric: 10.67 / 25 (42.7%): 100%|██████████| 25/25 [00:19<00:00,  1.25it/s]

2025/03/12 19:01:04 INFO dspy.evaluate.evaluate: Average Metric: 10.666666666666666 / 25 (42.7%)
2025/03/12 19:01:04 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 42.67 on minibatch of size 25 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 3', 'Predictor 1: Instruction 8', 'Predictor 1: Few-Shot Set 5'].
2025/03/12 19:01:04 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [42.67]
2025/03/12 19:01:04 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [27.92]
2025/03/12 19:01:04 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 27.92


2025/03/12 19:01:04 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 3 / 28 - Minibatch ==



Average Metric: 11.33 / 25 (45.3%): 100%|██████████| 25/25 [00:20<00:00,  1.20it/s]

2025/03/12 19:01:25 INFO dspy.evaluate.evaluate: Average Metric: 11.333333333333332 / 25 (45.3%)
2025/03/12 19:01:25 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 45.33 on minibatch of size 25 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 3', 'Predictor 1: Instruction 8', 'Predictor 1: Few-Shot Set 1'].
2025/03/12 19:01:25 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [42.67, 45.33]
2025/03/12 19:01:25 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [27.92]
2025/03/12 19:01:25 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 27.92


2025/03/12 19:01:25 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 4 / 28 - Minibatch ==



Average Metric: 7.67 / 25 (30.7%): 100%|██████████| 25/25 [00:16<00:00,  1.54it/s]

2025/03/12 19:01:41 INFO dspy.evaluate.evaluate: Average Metric: 7.666666666666666 / 25 (30.7%)
2025/03/12 19:01:41 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 30.67 on minibatch of size 25 with parameters ['Predictor 0: Instruction 8', 'Predictor 0: Few-Shot Set 2', 'Predictor 1: Instruction 8', 'Predictor 1: Few-Shot Set 3'].
2025/03/12 19:01:41 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [42.67, 45.33, 30.67]
2025/03/12 19:01:41 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [27.92]
2025/03/12 19:01:41 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 27.92


2025/03/12 19:01:41 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 5 / 28 - Minibatch ==



Average Metric: 11.00 / 25 (44.0%): 100%|██████████| 25/25 [00:16<00:00,  1.49it/s]

2025/03/12 19:01:58 INFO dspy.evaluate.evaluate: Average Metric: 10.999999999999998 / 25 (44.0%)
2025/03/12 19:01:58 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 44.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 5', 'Predictor 0: Few-Shot Set 4', 'Predictor 1: Instruction 3', 'Predictor 1: Few-Shot Set 0'].
2025/03/12 19:01:58 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [42.67, 45.33, 30.67, 44.0]
2025/03/12 19:01:58 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [27.92]
2025/03/12 19:01:58 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 27.92


2025/03/12 19:01:58 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 6 / 28 - Minibatch ==



Average Metric: 11.00 / 25 (44.0%): 100%|██████████| 25/25 [00:18<00:00,  1.36it/s]

2025/03/12 19:02:16 INFO dspy.evaluate.evaluate: Average Metric: 11.0 / 25 (44.0%)
2025/03/12 19:02:16 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 44.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 3', 'Predictor 0: Few-Shot Set 7', 'Predictor 1: Instruction 0', 'Predictor 1: Few-Shot Set 6'].
2025/03/12 19:02:16 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [42.67, 45.33, 30.67, 44.0, 44.0]
2025/03/12 19:02:16 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [27.92]
2025/03/12 19:02:16 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 27.92


2025/03/12 19:02:16 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 7 / 28 - Minibatch ==



Average Metric: 8.33 / 25 (33.3%): 100%|██████████| 25/25 [01:15<00:00,  3.03s/it]

2025/03/12 19:03:32 INFO dspy.evaluate.evaluate: Average Metric: 8.333333333333334 / 25 (33.3%)
2025/03/12 19:03:32 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 33.33 on minibatch of size 25 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 0', 'Predictor 1: Instruction 7', 'Predictor 1: Few-Shot Set 3'].
2025/03/12 19:03:32 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [42.67, 45.33, 30.67, 44.0, 44.0, 33.33]
2025/03/12 19:03:32 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [27.92]
2025/03/12 19:03:32 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 27.92


2025/03/12 19:03:32 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 8 / 28 - Minibatch ==



Average Metric: 7.67 / 25 (30.7%): 100%|██████████| 25/25 [00:37<00:00,  1.52s/it]

2025/03/12 19:04:10 INFO dspy.evaluate.evaluate: Average Metric: 7.666666666666666 / 25 (30.7%)
2025/03/12 19:04:10 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 30.67 on minibatch of size 25 with parameters ['Predictor 0: Instruction 8', 'Predictor 0: Few-Shot Set 7', 'Predictor 1: Instruction 1', 'Predictor 1: Few-Shot Set 3'].
2025/03/12 19:04:10 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [42.67, 45.33, 30.67, 44.0, 44.0, 33.33, 30.67]
2025/03/12 19:04:10 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [27.92]
2025/03/12 19:04:10 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 27.92


2025/03/12 19:04:10 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 9 / 28 - Minibatch ==



Average Metric: 10.67 / 25 (42.7%): 100%|██████████| 25/25 [00:18<00:00,  1.33it/s]

2025/03/12 19:04:29 INFO dspy.evaluate.evaluate: Average Metric: 10.666666666666668 / 25 (42.7%)
2025/03/12 19:04:29 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 42.67 on minibatch of size 25 with parameters ['Predictor 0: Instruction 7', 'Predictor 0: Few-Shot Set 3', 'Predictor 1: Instruction 7', 'Predictor 1: Few-Shot Set 0'].
2025/03/12 19:04:29 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [42.67, 45.33, 30.67, 44.0, 44.0, 33.33, 30.67, 42.67]
2025/03/12 19:04:29 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [27.92]
2025/03/12 19:04:29 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 27.92


2025/03/12 19:04:29 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 10 / 28 - Minibatch ==



Average Metric: 8.67 / 25 (34.7%): 100%|██████████| 25/25 [00:18<00:00,  1.35it/s]

2025/03/12 19:04:48 INFO dspy.evaluate.evaluate: Average Metric: 8.666666666666666 / 25 (34.7%)
2025/03/12 19:04:48 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 34.67 on minibatch of size 25 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 1', 'Predictor 1: Instruction 3', 'Predictor 1: Few-Shot Set 7'].
2025/03/12 19:04:48 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [42.67, 45.33, 30.67, 44.0, 44.0, 33.33, 30.67, 42.67, 34.67]
2025/03/12 19:04:48 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [27.92]
2025/03/12 19:04:48 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 27.92


2025/03/12 19:04:48 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 11 / 28 - Full Evaluation =====
2025/03/12 19:04:48 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 45.33) from minibatch trials...



Average Metric: 32.67 / 80 (40.8%): 100%|██████████| 80/80 [00:39<00:00,  2.04it/s]

2025/03/12 19:05:27 INFO dspy.evaluate.evaluate: Average Metric: 32.66666666666667 / 80 (40.8%)
2025/03/12 19:05:27 INFO dspy.teleprompt.mipro_optimizer_v2: [92mNew best full eval score![0m Score: 40.83
2025/03/12 19:05:27 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [27.92, 40.83]
2025/03/12 19:05:27 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 40.83
2025/03/12 19:05:27 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/03/12 19:05:27 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 12 / 28 - Minibatch ==



Average Metric: 8.67 / 25 (34.7%): 100%|██████████| 25/25 [00:21<00:00,  1.17it/s]

2025/03/12 19:05:49 INFO dspy.evaluate.evaluate: Average Metric: 8.666666666666668 / 25 (34.7%)
2025/03/12 19:05:49 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 34.67 on minibatch of size 25 with parameters ['Predictor 0: Instruction 5', 'Predictor 0: Few-Shot Set 8', 'Predictor 1: Instruction 6', 'Predictor 1: Few-Shot Set 0'].
2025/03/12 19:05:49 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [42.67, 45.33, 30.67, 44.0, 44.0, 33.33, 30.67, 42.67, 34.67, 34.67]
2025/03/12 19:05:49 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [27.92, 40.83]
2025/03/12 19:05:49 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 40.83


2025/03/12 19:05:49 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 13 / 28 - Minibatch ==



Average Metric: 7.33 / 25 (29.3%): 100%|██████████| 25/25 [00:19<00:00,  1.30it/s]

2025/03/12 19:06:08 INFO dspy.evaluate.evaluate: Average Metric: 7.33333333333333 / 25 (29.3%)
2025/03/12 19:06:08 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 29.33 on minibatch of size 25 with parameters ['Predictor 0: Instruction 5', 'Predictor 0: Few-Shot Set 7', 'Predictor 1: Instruction 3', 'Predictor 1: Few-Shot Set 0'].
2025/03/12 19:06:08 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [42.67, 45.33, 30.67, 44.0, 44.0, 33.33, 30.67, 42.67, 34.67, 34.67, 29.33]
2025/03/12 19:06:08 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [27.92, 40.83]
2025/03/12 19:06:08 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 40.83


2025/03/12 19:06:08 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 14 / 28 - Minibatch ==



Average Metric: 10.67 / 25 (42.7%): 100%|██████████| 25/25 [00:18<00:00,  1.37it/s]

2025/03/12 19:06:26 INFO dspy.evaluate.evaluate: Average Metric: 10.666666666666666 / 25 (42.7%)
2025/03/12 19:06:26 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 42.67 on minibatch of size 25 with parameters ['Predictor 0: Instruction 5', 'Predictor 0: Few-Shot Set 4', 'Predictor 1: Instruction 4', 'Predictor 1: Few-Shot Set 2'].
2025/03/12 19:06:26 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [42.67, 45.33, 30.67, 44.0, 44.0, 33.33, 30.67, 42.67, 34.67, 34.67, 29.33, 42.67]
2025/03/12 19:06:26 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [27.92, 40.83]
2025/03/12 19:06:26 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 40.83


2025/03/12 19:06:26 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 15 / 28 - Minibatch ==



Average Metric: 10.67 / 25 (42.7%): 100%|██████████| 25/25 [00:22<00:00,  1.11it/s]

2025/03/12 19:06:49 INFO dspy.evaluate.evaluate: Average Metric: 10.666666666666668 / 25 (42.7%)
2025/03/12 19:06:49 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 42.67 on minibatch of size 25 with parameters ['Predictor 0: Instruction 6', 'Predictor 0: Few-Shot Set 5', 'Predictor 1: Instruction 8', 'Predictor 1: Few-Shot Set 1'].
2025/03/12 19:06:49 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [42.67, 45.33, 30.67, 44.0, 44.0, 33.33, 30.67, 42.67, 34.67, 34.67, 29.33, 42.67, 42.67]
2025/03/12 19:06:49 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [27.92, 40.83]
2025/03/12 19:06:49 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 40.83


2025/03/12 19:06:49 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 16 / 28 - Minibatch ==



Average Metric: 9.67 / 25 (38.7%): 100%|██████████| 25/25 [00:23<00:00,  1.07it/s]

2025/03/12 19:07:12 INFO dspy.evaluate.evaluate: Average Metric: 9.666666666666668 / 25 (38.7%)
2025/03/12 19:07:12 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 38.67 on minibatch of size 25 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 6', 'Predictor 1: Instruction 7', 'Predictor 1: Few-Shot Set 1'].
2025/03/12 19:07:12 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [42.67, 45.33, 30.67, 44.0, 44.0, 33.33, 30.67, 42.67, 34.67, 34.67, 29.33, 42.67, 42.67, 38.67]
2025/03/12 19:07:12 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [27.92, 40.83]
2025/03/12 19:07:12 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 40.83


2025/03/12 19:07:12 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 17 / 28 - Minibatch ==



Average Metric: 8.67 / 25 (34.7%): 100%|██████████| 25/25 [00:07<00:00,  3.37it/s]

2025/03/12 19:07:20 INFO dspy.evaluate.evaluate: Average Metric: 8.666666666666668 / 25 (34.7%)
2025/03/12 19:07:20 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 34.67 on minibatch of size 25 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 3', 'Predictor 1: Instruction 5', 'Predictor 1: Few-Shot Set 1'].
2025/03/12 19:07:20 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [42.67, 45.33, 30.67, 44.0, 44.0, 33.33, 30.67, 42.67, 34.67, 34.67, 29.33, 42.67, 42.67, 38.67, 34.67]
2025/03/12 19:07:20 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [27.92, 40.83]
2025/03/12 19:07:20 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 40.83


2025/03/12 19:07:20 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 18 / 28 - Minibatch ==



Average Metric: 7.33 / 25 (29.3%): 100%|██████████| 25/25 [00:19<00:00,  1.29it/s]

2025/03/12 19:07:39 INFO dspy.evaluate.evaluate: Average Metric: 7.333333333333331 / 25 (29.3%)
2025/03/12 19:07:39 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 29.33 on minibatch of size 25 with parameters ['Predictor 0: Instruction 6', 'Predictor 0: Few-Shot Set 4', 'Predictor 1: Instruction 3', 'Predictor 1: Few-Shot Set 0'].
2025/03/12 19:07:39 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [42.67, 45.33, 30.67, 44.0, 44.0, 33.33, 30.67, 42.67, 34.67, 34.67, 29.33, 42.67, 42.67, 38.67, 34.67, 29.33]
2025/03/12 19:07:39 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [27.92, 40.83]
2025/03/12 19:07:39 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 40.83


2025/03/12 19:07:39 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 19 / 28 - Minibatch ==



Average Metric: 11.00 / 25 (44.0%): 100%|██████████| 25/25 [00:21<00:00,  1.15it/s]

2025/03/12 19:08:01 INFO dspy.evaluate.evaluate: Average Metric: 11.000000000000002 / 25 (44.0%)
2025/03/12 19:08:01 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 44.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 3', 'Predictor 1: Instruction 8', 'Predictor 1: Few-Shot Set 1'].
2025/03/12 19:08:01 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [42.67, 45.33, 30.67, 44.0, 44.0, 33.33, 30.67, 42.67, 34.67, 34.67, 29.33, 42.67, 42.67, 38.67, 34.67, 29.33, 44.0]
2025/03/12 19:08:01 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [27.92, 40.83]
2025/03/12 19:08:01 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 40.83


2025/03/12 19:08:01 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 20 / 28 - Minibatch ==



Average Metric: 10.33 / 25 (41.3%): 100%|██████████| 25/25 [00:16<00:00,  1.55it/s]

2025/03/12 19:08:18 INFO dspy.evaluate.evaluate: Average Metric: 10.333333333333332 / 25 (41.3%)
2025/03/12 19:08:18 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 41.33 on minibatch of size 25 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 1', 'Predictor 1: Instruction 8', 'Predictor 1: Few-Shot Set 1'].
2025/03/12 19:08:18 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [42.67, 45.33, 30.67, 44.0, 44.0, 33.33, 30.67, 42.67, 34.67, 34.67, 29.33, 42.67, 42.67, 38.67, 34.67, 29.33, 44.0, 41.33]
2025/03/12 19:08:18 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [27.92, 40.83]
2025/03/12 19:08:18 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 40.83


2025/03/12 19:08:18 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 21 / 28 - Full Evaluation =====
2025/03/12 19:08:18 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 44.0) from minibatch trials...



Average Metric: 26.00 / 80 (32.5%): 100%|██████████| 80/80 [00:27<00:00,  2.93it/s]

2025/03/12 19:08:45 INFO dspy.evaluate.evaluate: Average Metric: 25.999999999999993 / 80 (32.5%)
2025/03/12 19:08:45 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [27.92, 40.83, 32.5]
2025/03/12 19:08:45 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 40.83
2025/03/12 19:08:45 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/03/12 19:08:45 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 22 / 28 - Minibatch ==



Average Metric: 11.33 / 25 (45.3%): 100%|██████████| 25/25 [00:24<00:00,  1.04it/s]

2025/03/12 19:09:09 INFO dspy.evaluate.evaluate: Average Metric: 11.333333333333334 / 25 (45.3%)
2025/03/12 19:09:09 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 45.33 on minibatch of size 25 with parameters ['Predictor 0: Instruction 3', 'Predictor 0: Few-Shot Set 7', 'Predictor 1: Instruction 7', 'Predictor 1: Few-Shot Set 6'].
2025/03/12 19:09:09 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [42.67, 45.33, 30.67, 44.0, 44.0, 33.33, 30.67, 42.67, 34.67, 34.67, 29.33, 42.67, 42.67, 38.67, 34.67, 29.33, 44.0, 41.33, 45.33]
2025/03/12 19:09:09 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [27.92, 40.83, 32.5]
2025/03/12 19:09:09 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 40.83


2025/03/12 19:09:09 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 23 / 28 - Minibatch ==



Average Metric: 10.00 / 25 (40.0%): 100%|██████████| 25/25 [00:15<00:00,  1.64it/s]

2025/03/12 19:09:25 INFO dspy.evaluate.evaluate: Average Metric: 10.0 / 25 (40.0%)
2025/03/12 19:09:25 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 40.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 3', 'Predictor 0: Few-Shot Set 7', 'Predictor 1: Instruction 7', 'Predictor 1: Few-Shot Set 7'].
2025/03/12 19:09:25 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [42.67, 45.33, 30.67, 44.0, 44.0, 33.33, 30.67, 42.67, 34.67, 34.67, 29.33, 42.67, 42.67, 38.67, 34.67, 29.33, 44.0, 41.33, 45.33, 40.0]
2025/03/12 19:09:25 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [27.92, 40.83, 32.5]
2025/03/12 19:09:25 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 40.83


2025/03/12 19:09:25 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 24 / 28 - Minibatch ==



Average Metric: 10.67 / 25 (42.7%): 100%|██████████| 25/25 [00:16<00:00,  1.51it/s]

2025/03/12 19:09:41 INFO dspy.evaluate.evaluate: Average Metric: 10.666666666666668 / 25 (42.7%)
2025/03/12 19:09:41 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 42.67 on minibatch of size 25 with parameters ['Predictor 0: Instruction 5', 'Predictor 0: Few-Shot Set 1', 'Predictor 1: Instruction 7', 'Predictor 1: Few-Shot Set 6'].
2025/03/12 19:09:41 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [42.67, 45.33, 30.67, 44.0, 44.0, 33.33, 30.67, 42.67, 34.67, 34.67, 29.33, 42.67, 42.67, 38.67, 34.67, 29.33, 44.0, 41.33, 45.33, 40.0, 42.67]
2025/03/12 19:09:41 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [27.92, 40.83, 32.5]
2025/03/12 19:09:41 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 40.83


2025/03/12 19:09:41 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 25 / 28 - Minibatch ==



Average Metric: 10.00 / 25 (40.0%): 100%|██████████| 25/25 [00:17<00:00,  1.42it/s]

2025/03/12 19:09:59 INFO dspy.evaluate.evaluate: Average Metric: 9.999999999999998 / 25 (40.0%)
2025/03/12 19:09:59 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 40.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 7', 'Predictor 0: Few-Shot Set 2', 'Predictor 1: Instruction 3', 'Predictor 1: Few-Shot Set 6'].
2025/03/12 19:09:59 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [42.67, 45.33, 30.67, 44.0, 44.0, 33.33, 30.67, 42.67, 34.67, 34.67, 29.33, 42.67, 42.67, 38.67, 34.67, 29.33, 44.0, 41.33, 45.33, 40.0, 42.67, 40.0]
2025/03/12 19:09:59 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [27.92, 40.83, 32.5]
2025/03/12 19:09:59 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 40.83


2025/03/12 19:09:59 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 26 / 28 - Minibatch ==



Average Metric: 11.00 / 25 (44.0%): 100%|██████████| 25/25 [00:06<00:00,  4.14it/s]

2025/03/12 19:10:05 INFO dspy.evaluate.evaluate: Average Metric: 11.000000000000002 / 25 (44.0%)
2025/03/12 19:10:05 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 44.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 5', 'Predictor 0: Few-Shot Set 4', 'Predictor 1: Instruction 3', 'Predictor 1: Few-Shot Set 8'].
2025/03/12 19:10:05 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [42.67, 45.33, 30.67, 44.0, 44.0, 33.33, 30.67, 42.67, 34.67, 34.67, 29.33, 42.67, 42.67, 38.67, 34.67, 29.33, 44.0, 41.33, 45.33, 40.0, 42.67, 40.0, 44.0]
2025/03/12 19:10:05 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [27.92, 40.83, 32.5]
2025/03/12 19:10:05 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 40.83


2025/03/12 19:10:05 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 27 / 28 - Minibatch ==



Average Metric: 8.67 / 25 (34.7%): 100%|██████████| 25/25 [01:18<00:00,  3.16s/it]

2025/03/12 19:11:24 INFO dspy.evaluate.evaluate: Average Metric: 8.666666666666668 / 25 (34.7%)
2025/03/12 19:11:24 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 34.67 on minibatch of size 25 with parameters ['Predictor 0: Instruction 3', 'Predictor 0: Few-Shot Set 0', 'Predictor 1: Instruction 7', 'Predictor 1: Few-Shot Set 6'].
2025/03/12 19:11:24 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [42.67, 45.33, 30.67, 44.0, 44.0, 33.33, 30.67, 42.67, 34.67, 34.67, 29.33, 42.67, 42.67, 38.67, 34.67, 29.33, 44.0, 41.33, 45.33, 40.0, 42.67, 40.0, 44.0, 34.67]
2025/03/12 19:11:24 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [27.92, 40.83, 32.5]
2025/03/12 19:11:24 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 40.83


2025/03/12 19:11:24 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 28 / 28 - Full Evaluation =====
2025/03/12 19:11:24 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg 


Average Metric: 33.00 / 80 (41.2%): 100%|██████████| 80/80 [00:46<00:00,  1.73it/s]

2025/03/12 19:12:11 INFO dspy.evaluate.evaluate: Average Metric: 33.0 / 80 (41.2%)
2025/03/12 19:12:11 INFO dspy.teleprompt.mipro_optimizer_v2: [92mNew best full eval score![0m Score: 41.25
2025/03/12 19:12:11 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [27.92, 40.83, 32.5, 41.25]
2025/03/12 19:12:11 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 41.25
2025/03/12 19:12:11 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/03/12 19:12:11 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 41.25!





In [30]:
# Score the MIPROv2-optimized agent with the `evaluate` helper (defined outside this excerpt).
# The call is left as the cell's last expression so its return value -- the aggregate
# score -- is shown as the cell output alongside the per-example results table.
evaluate(optimized_react)

Average Metric: 40.33 / 100 (40.3%): 100%|██████████| 100/100 [01:16<00:00,  1.31it/s]

2025/03/12 20:11:03 INFO dspy.evaluate.evaluate: Average Metric: 40.33333333333333 / 100 (40.3%)





Unnamed: 0,claim,example_titles,trajectory,reasoning,pred_titles,top5_recall
0,The Church of England's movement that inspired the Trinity Episcop...,"[Oxford Movement, Trinity Episcopal Church (Houghton, Michigan), S...",{'thought_0': 'The claim mentions a Sussex clergyman who wrote dev...,The claim states that the Church of England's movement that inspir...,"[Oxford Movement, Sussex clergyman devotional literature Trinity E...",✔️ [0.333]
1,"Red, White & Crüe and this athlete both fight. The french fighter ...","[Mike Tyson, Bobby Stewart, Red, White &amp; Crüe]",{'thought_0': 'The claim mentions a French fighter trained by Bobb...,The claim mentions a French fighter trained by Bobby Stewart. The ...,"[Daniel Cormier, Bobby Stewart, French fighter]",✔️ [0.333]
2,The writer/director/actor from Glen or Glenda and Fernand Rivers s...,"[Fernand Rivers, Ed Wood, Glen or Glenda]",{'thought_0': 'The claim requires finding information about the ca...,The claim states that Ed Wood Jr. and Fernand Rivers share the car...,"[writer, producer, director]",
3,The film by Sandi Sissel was released before The End of Suburbia.,"[Sandi Sissel, Chicken Ranch (film), The End of Suburbia]",{'thought_0': 'The claim that the film by Sandi Sissel was release...,The claim states that the film by Sandi Sissel was released before...,[],
4,The actor who played captain hook in the live production with Tayl...,"[Taylor Louderman, Christopher Walken, Peter Pan Live!]",{'thought_0': 'The search results indicate that there is no clear ...,The claim states that the actor who played Captain Hook in a live ...,"[Cyril Ritchard, John Cazale, Taylor Louderman, The Deer Hunter, H...",✔️ [0.333]


40.33

In [31]:
# Print the single most recent LM interaction (n=1) so the prompt that the optimized
# agent actually sent -- including its optimized instruction -- can be read directly.
dspy.inspect_history(n=1)





[34m[2025-03-12T20:11:03.541089][0m

[31mSystem message:[0m

Your input fields are:
1. `claim` (str)
2. `trajectory` (str)

Your output fields are:
1. `reasoning` (str)
2. `titles` (list[str])

All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## claim ## ]]
{claim}

[[ ## trajectory ## ]]
{trajectory}

[[ ## reasoning ## ]]
{reasoning}

[[ ## titles ## ]]
{titles}        # note: the value you produce must adhere to the JSON schema: {"type": "array", "items": {"type": "string"}}

[[ ## completed ## ]]

In adhering to this structure, your objective is: 
        Identify and gather Wikipedia titles that are essential for assessing the validity of the given claim. Your process should involve step-by-step reasoning to determine the relevant entities and relationships within the claim, utilizing available tools to search and retrieve necessary information. Conclude with a comprehensive list of Wikipedia titles that serve as evidence

In [33]:
# Spot-check the optimized agent on one hand-picked multi-hop claim.
spot_check_claim = "The author of the 1960s unproduced script written for The Beatles, Up Against It, and Bernard-Marie Koltès are both playwrights."
optimized_prediction = optimized_react(claim=spot_check_claim)

# Last expression: the predicted Wikipedia titles are displayed as the cell output.
optimized_prediction.titles

['Joe Orton', 'Bernard-Marie Koltès', 'Arthur Miller']

In [35]:
# Run the un-optimized baseline agent on the same claim for a side-by-side comparison.
baseline_claim = "The author of the 1960s unproduced script written for The Beatles, Up Against It, and Bernard-Marie Koltès are both playwrights."
baseline_prediction = react(claim=baseline_claim)

# Only the first three predicted titles are displayed.
baseline_prediction.titles[:3]

['Joe Orton', 'Bernard-Marie Koltès', 'Jean-Jacques Bernard']

In [36]:
# Persist the optimized agent's state to disk.
state_file = "optimized_react.json"
optimized_react.save(state_file)

# Rebuild an agent with the same signature, tools and iteration budget,
# then load the saved state into it.
wiki_tools = [search_wikipedia, lookup_wikipedia]
loaded_react = dspy.ReAct("claim -> titles: list[str]", tools=wiki_tools, max_iters=20)
loaded_react.load(state_file)

# Sanity check: run the reloaded agent on the claim used for the spot checks above
# and display its predicted titles as the cell output.
roundtrip_claim = "The author of the 1960s unproduced script written for The Beatles, Up Against It, and Bernard-Marie Koltès are both playwrights."
roundtrip_prediction = loaded_react(claim=roundtrip_claim)
roundtrip_prediction.titles

['Joe Orton', 'Bernard-Marie Koltès', 'Arthur Miller']