In [2]:
import mlflow

# Send MLflow data to the tracking server at http://localhost:5000 and make
# "DSPy" the active experiment for the rest of the notebook.
# NOTE(review): assumes an MLflow server is already running on that port.
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("DSPy")

<Experiment: artifact_location='mlflow-artifacts:/374362034103955121', creation_time=1741686562632, experiment_id='374362034103955121', last_update_time=1741686562632, lifecycle_stage='active', name='DSPy', tags={}>

In [3]:
# Turn on MLflow's automatic logging integration for DSPy.
mlflow.dspy.autolog()

In [8]:
import os
import tempfile
from datasets import load_dataset
from typing import Dict, Any, List
import dspy

def load_conll_dataset() -> dict:
    """
    Loads the CoNLL-2003 dataset into train, validation, and test splits.

    While the dataset is being loaded, the ``HF_DATASETS_CACHE`` environment
    variable points at a throwaway temporary directory. Once loading finishes
    (or fails), the variable is put back to whatever it was before the call so
    the process is never left referring to a directory that has been deleted.

    Returns:
        dict: Dataset splits with keys 'train', 'validation', and 'test'.
    """
    cache_env_var = "HF_DATASETS_CACHE"
    # Remember the caller's setting (None means "not set") so it can be restored.
    previous_cache_dir = os.environ.get(cache_env_var)

    with tempfile.TemporaryDirectory() as temp_dir:
        # Use a temporary Hugging Face cache directory for compatibility with certain hosted notebook
        # environments that don't support the default Hugging Face cache directory
        # NOTE(review): the datasets library may read this variable at import time;
        # confirm the override still takes effect when set this late.
        os.environ[cache_env_var] = temp_dir
        try:
            return load_dataset("conll2003", trust_remote_code=True)
        finally:
            # The temporary directory is removed when the `with` block exits, so
            # the override must not outlive it.
            if previous_cache_dir is None:
                os.environ.pop(cache_env_var, None)
            else:
                os.environ[cache_env_var] = previous_cache_dir

def extract_people_entities(data_row: Dict[str, Any]) -> List[str]:
    """
    Collects the tokens of a CoNLL-2003 row whose NER tag marks a person.

    Args:
        data_row (Dict[str, Any]): Dataset row providing parallel "tokens" and
            "ner_tags" sequences.

    Returns:
        List[str]: The person-tagged tokens, in their original order.
    """
    # CoNLL entity codes 1 and 2 refer to people
    person_tag_ids = (1, 2)

    people_tokens = []
    for word, tag_id in zip(data_row["tokens"], data_row["ner_tags"]):
        if tag_id in person_tag_ids:
            people_tokens.append(word)
    return people_tokens

def prepare_dataset(data_split, start: int, end: int) -> List[dspy.Example]:
    """
    Converts rows ``start`` (inclusive) to ``end`` (exclusive) of a dataset split
    into DSPy examples.

    Args:
        data_split: The dataset split (e.g., train or test); must offer ``select``.
        start (int): Index of the first row to include.
        end (int): Index one past the last row to include.

    Returns:
        List[dspy.Example]: One example per selected row, carrying the row's
        tokens as the only input field and the person tokens as the expected label.
    """
    examples = []
    for row in data_split.select(range(start, end)):
        labelled = dspy.Example(
            tokens=row["tokens"],
            expected_extracted_people=extract_people_entities(row),
        )
        # Only `tokens` is fed to the program; the expected label stays hidden.
        examples.append(labelled.with_inputs("tokens"))
    return examples

# Load all CoNLL-2003 splits
dataset = load_conll_dataset()

# Keep the first 50 training rows for optimization and the first 200 test rows
# for evaluation
train_set = prepare_dataset(dataset["train"], 0, 50)
test_set = prepare_dataset(dataset["test"], 0, 200)

# Display the second training example as a sanity check
train_set[1]

Example({'tokens': ['Peter', 'Blackburn'], 'expected_extracted_people': ['Peter', 'Blackburn']}) (input_keys={'tokens'})

In [5]:
from typing import List

# DSPy signature for the person-extraction task.
# The class docstring and the `desc` strings below are prompt text sent to the
# language model, so rewording them changes the program's behavior.
class PeopleExtraction(dspy.Signature):
    """
    Extract contiguous tokens referring to specific people, if any, from a list of string tokens.
    Output a list of tokens. In other words, do not combine multiple tokens into a single value.
    """
    # Input: the sentence, already split into tokens.
    tokens: list[str] = dspy.InputField(desc="tokenized text")
    # Output: the person tokens, one list element per token.
    extracted_people: list[str] = dspy.OutputField(desc="all tokens referring to specific people extracted from the tokenized text")

# Baseline (unoptimized) program: chain-of-thought prompting over the signature,
# which adds a `reasoning` output ahead of `extracted_people`.
people_extractor = dspy.ChainOfThought(PeopleExtraction)

In [6]:
# Use OpenAI's gpt-4o-mini as the language model for DSPy in this notebook.
# NOTE(review): presumably needs OpenAI credentials in the environment (e.g. OPENAI_API_KEY) - confirm.
lm = dspy.LM(model="openai/gpt-4o-mini")
dspy.settings.configure(lm=lm)

In [9]:
def extraction_correctness_metric(example: dspy.Example, prediction: dspy.Prediction, trace=None) -> bool:
    """
    Exact-match metric for the people extraction program.

    Args:
        example (dspy.Example): Dataset example holding `expected_extracted_people`.
        prediction (dspy.Prediction): Program output holding `extracted_people`.
        trace: Accepted for compatibility with DSPy's metric signature; unused here.

    Returns:
        bool: True only when the predicted list equals the expected list
        (same tokens, same order), False otherwise.
    """
    predicted_people = prediction.extracted_people
    expected_people = example.expected_extracted_people
    return predicted_people == expected_people

# Reusable evaluator: scores a program on the 200-example test set with the
# exact-match metric, using 24 threads, and shows a progress bar plus a
# per-example results table.
evaluate_correctness = dspy.Evaluate(
    devset=test_set,
    metric=extraction_correctness_metric,
    num_threads=24,
    display_progress=True,
    display_table=True
)

In [10]:
# Score the unoptimized extractor to establish a baseline.
# `devset=test_set` repeats the dev set the evaluator was already built with.
evaluate_correctness(people_extractor, devset=test_set)

Average Metric: 181.00 / 200 (90.5%): 100%|██████████| 200/200 [00:16<00:00, 12.32it/s]

2025/03/13 14:25:55 INFO dspy.evaluate.evaluate: Average Metric: 181 / 200 (90.5%)





Unnamed: 0,tokens,expected_extracted_people,reasoning,extracted_people,extraction_correctness_metric
0,"[SOCCER, -, JAPAN, GET, LUCKY, WIN, ,, CHINA, IN, SURPRISE, DEFEAT...",[CHINA],The tokens provided do not contain any specific names of people. T...,[],
1,"[Nadim, Ladki]","[Nadim, Ladki]","The tokens ""Nadim"" and ""Ladki"" refer to specific individuals. They...","[Nadim, Ladki]",✔️ [True]
2,"[AL-AIN, ,, United, Arab, Emirates, 1996-12-06]",[],The provided tokens do not contain any references to specific peop...,[],✔️ [True]
3,"[Japan, began, the, defence, of, their, Asian, Cup, title, with, a...",[],The provided tokens do not contain any specific names of people. T...,[],✔️ [True]
4,"[But, China, saw, their, luck, desert, them, in, the, second, matc...",[],"In the provided tokens, ""China"" and ""Uzbekistan"" are the only toke...",[],✔️ [True]
...,...,...,...,...,...
195,"['The', 'Wallabies', 'have', 'their', 'sights', 'set', 'on', 'a', ...","[David, Campese]","The tokenized text mentions ""David Campese,"" who is a specific per...","[David, Campese]",✔️ [True]
196,"['The', 'Wallabies', 'currently', 'have', 'no', 'plans', 'to', 'ma...",[],"The text mentions ""the 34-year-old winger,"" which refers to a spec...",[],✔️ [True]
197,"['Campese', 'will', 'be', 'up', 'against', 'a', 'familiar', 'foe',...","[Campese, Rob, Andrew]","The tokens contain references to specific people, namely ""Campese""...","[Campese, Rob, Andrew]",✔️ [True]
198,"['""', 'Campo', 'has', 'a', 'massive', 'following', 'in', 'this', '...","[Campo, Andrew]","The tokenized text mentions ""Andrew"" as a specific person. It is t...",[Andrew],


90.5

In [11]:
import mlflow

# Repeat the baseline evaluation inside an MLflow run so that the aggregate score
# and the per-example results are stored with the run.
# Note: this rebinds `evaluate_correctness` to an evaluator with different settings.
with mlflow.start_run(run_name="extractor_evaluation"):
    evaluate_correctness = dspy.Evaluate(
        devset=test_set,
        metric=extraction_correctness_metric,
        num_threads=24,
        display_progress=True,
        # To record the outputs and detailed scores to MLflow
        return_all_scores=True,
        return_outputs=True,
    )

    # Evaluate the program as usual; with both return_* flags set, the call
    # yields the aggregate score, the per-example outputs, and the per-example scores
    aggregated_score, outputs, all_scores = evaluate_correctness(people_extractor)

    # Log the aggregated score
    mlflow.log_metric("exact_match", aggregated_score)
    # Log the detailed evaluation results as a table (one row per test example)
    mlflow.log_table(
        {
            "Tokens": [example.tokens for example in test_set],
            "Expected": [example.expected_extracted_people for example in test_set],
            # NOTE(review): `outputs` is logged as returned by the evaluator; confirm
            # its element type renders usefully in the MLflow table.
            "Predicted": outputs,
            "Exact match": all_scores,
        },
        artifact_file="eval_results.json",
    )

Average Metric: 181.00 / 200 (90.5%): 100%|██████████| 200/200 [00:01<00:00, 113.49it/s]

2025/03/13 14:28:54 INFO dspy.evaluate.evaluate: Average Metric: 181 / 200 (90.5%)



🏃 View run extractor_evaluation at: http://localhost:5000/#/experiments/374362034103955121/runs/9a305735ced747d897fb77fc4d195825
🧪 View experiment at: http://localhost:5000/#/experiments/374362034103955121


In [12]:
# Optimize the extractor's prompt with MIPROv2, scoring candidates with the
# exact-match metric. In the recorded run, auto="medium" resolved to 25 trials,
# 19 candidates, and a 40-example validation set.
mipro_optimizer = dspy.MIPROv2(
    metric=extraction_correctness_metric,
    auto="medium",
)
# Compile against the 50-example training set.
# - max_bootstrapped_demos=4: at most 4 bootstrapped demonstrations per few-shot set.
# - requires_permission_to_run=False: presumably skips the interactive
#   confirmation before running - confirm against the DSPy docs.
# - minibatch=False: each trial is scored on the full validation set.
optimized_people_extractor = mipro_optimizer.compile(
    people_extractor,
    trainset=train_set,
    max_bootstrapped_demos=4,
    requires_permission_to_run=False,
    minibatch=False
)

2025/03/13 14:30:01 INFO dspy.teleprompt.mipro_optimizer_v2: 
RUNNING WITH THE FOLLOWING MEDIUM AUTO RUN SETTINGS:
num_trials: 25
minibatch: False
num_candidates: 19
valset size: 40

2025/03/13 14:30:01 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==
2025/03/13 14:30:01 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.

2025/03/13 14:30:01 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=19 sets of demonstrations...


Bootstrapping set 1/19
Bootstrapping set 2/19
Bootstrapping set 3/19


 40%|████      | 4/10 [00:06<00:10,  1.74s/it]


Bootstrapped 4 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 4/19


 40%|████      | 4/10 [00:04<00:06,  1.03s/it]


Bootstrapped 4 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 5/19


 20%|██        | 2/10 [00:01<00:06,  1.33it/s]


Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 6/19


 20%|██        | 2/10 [00:00<00:00, 531.93it/s]


Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 7/19


 10%|█         | 1/10 [00:00<00:00, 610.17it/s]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 8/19


 20%|██        | 2/10 [00:00<00:00, 598.80it/s]


Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 9/19


 30%|███       | 3/10 [00:00<00:00, 728.52it/s]


Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.
Bootstrapping set 10/19


 10%|█         | 1/10 [00:01<00:13,  1.50s/it]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 11/19


 30%|███       | 3/10 [00:01<00:03,  2.20it/s]


Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.
Bootstrapping set 12/19


 20%|██        | 2/10 [00:00<00:00, 675.90it/s]


Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 13/19


 30%|███       | 3/10 [00:01<00:03,  2.16it/s]


Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.
Bootstrapping set 14/19


 20%|██        | 2/10 [00:00<00:00, 1076.01it/s]


Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 15/19


 10%|█         | 1/10 [00:00<00:00, 908.25it/s]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 16/19


 10%|█         | 1/10 [00:00<00:00, 736.88it/s]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 17/19


 30%|███       | 3/10 [00:00<00:00, 1089.34it/s]


Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.
Bootstrapping set 18/19


 20%|██        | 2/10 [00:00<00:00, 724.15it/s]


Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 19/19


 40%|████      | 4/10 [00:00<00:00, 918.09it/s]
2025/03/13 14:30:18 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==
2025/03/13 14:30:18 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.


Bootstrapped 4 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.


2025/03/13 14:30:23 INFO dspy.teleprompt.mipro_optimizer_v2: 
Proposing instructions...

2025/03/13 14:32:26 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:

2025/03/13 14:32:26 INFO dspy.teleprompt.mipro_optimizer_v2: 0: Extract contiguous tokens referring to specific people, if any, from a list of string tokens.
Output a list of tokens. In other words, do not combine multiple tokens into a single value.

2025/03/13 14:32:26 INFO dspy.teleprompt.mipro_optimizer_v2: 1: In a critical situation where food safety regulations are being debated in the European Union, it is essential to identify and extract the names of key individuals involved in the discussions. Given a list of tokenized words from news articles, your task is to extract any contiguous tokens that refer to specific people mentioned in the context of British lamb and mad cow disease regulations. Carefully analyze the tokens and provide a list of extracted names, as this information is vital fo

Average Metric: 37.00 / 40 (92.5%): 100%|██████████| 40/40 [00:10<00:00,  3.65it/s] 

2025/03/13 14:32:37 INFO dspy.evaluate.evaluate: Average Metric: 37 / 40 (92.5%)
2025/03/13 14:32:37 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 92.5

2025/03/13 14:32:37 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 2 / 25 =====



Average Metric: 39.00 / 40 (97.5%): 100%|██████████| 40/40 [00:13<00:00,  2.91it/s] 

2025/03/13 14:32:51 INFO dspy.evaluate.evaluate: Average Metric: 39 / 40 (97.5%)
2025/03/13 14:32:51 INFO dspy.teleprompt.mipro_optimizer_v2: [92mBest full score so far![0m Score: 97.5
2025/03/13 14:32:51 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 97.5 with parameters ['Predictor 0: Instruction 12', 'Predictor 0: Few-Shot Set 7'].
2025/03/13 14:32:51 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [92.5, 97.5]
2025/03/13 14:32:51 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 97.5


2025/03/13 14:32:51 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 3 / 25 =====



Average Metric: 30.00 / 40 (75.0%): 100%|██████████| 40/40 [00:16<00:00,  2.48it/s]

2025/03/13 14:33:07 INFO dspy.evaluate.evaluate: Average Metric: 30 / 40 (75.0%)
2025/03/13 14:33:07 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 75.0 with parameters ['Predictor 0: Instruction 10', 'Predictor 0: Few-Shot Set 7'].
2025/03/13 14:33:07 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [92.5, 97.5, 75.0]
2025/03/13 14:33:07 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 97.5


2025/03/13 14:33:07 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 4 / 25 =====



Average Metric: 38.00 / 40 (95.0%): 100%|██████████| 40/40 [00:10<00:00,  3.79it/s] 

2025/03/13 14:33:18 INFO dspy.evaluate.evaluate: Average Metric: 38 / 40 (95.0%)
2025/03/13 14:33:18 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 95.0 with parameters ['Predictor 0: Instruction 7', 'Predictor 0: Few-Shot Set 18'].
2025/03/13 14:33:18 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [92.5, 97.5, 75.0, 95.0]
2025/03/13 14:33:18 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 97.5


2025/03/13 14:33:18 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 5 / 25 =====



Average Metric: 37.00 / 40 (92.5%): 100%|██████████| 40/40 [00:16<00:00,  2.43it/s]

2025/03/13 14:33:34 INFO dspy.evaluate.evaluate: Average Metric: 37 / 40 (92.5%)
2025/03/13 14:33:34 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 92.5 with parameters ['Predictor 0: Instruction 15', 'Predictor 0: Few-Shot Set 2'].
2025/03/13 14:33:34 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [92.5, 97.5, 75.0, 95.0, 92.5]
2025/03/13 14:33:34 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 97.5


2025/03/13 14:33:34 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 6 / 25 =====



Average Metric: 38.00 / 40 (95.0%): 100%|██████████| 40/40 [00:14<00:00,  2.73it/s] 

2025/03/13 14:33:49 INFO dspy.evaluate.evaluate: Average Metric: 38 / 40 (95.0%)
2025/03/13 14:33:49 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 95.0 with parameters ['Predictor 0: Instruction 8', 'Predictor 0: Few-Shot Set 18'].
2025/03/13 14:33:49 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [92.5, 97.5, 75.0, 95.0, 92.5, 95.0]
2025/03/13 14:33:49 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 97.5


2025/03/13 14:33:49 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 7 / 25 =====



Average Metric: 32.00 / 40 (80.0%): 100%|██████████| 40/40 [00:12<00:00,  3.12it/s]

2025/03/13 14:34:02 INFO dspy.evaluate.evaluate: Average Metric: 32 / 40 (80.0%)
2025/03/13 14:34:02 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 80.0 with parameters ['Predictor 0: Instruction 7', 'Predictor 0: Few-Shot Set 1'].
2025/03/13 14:34:02 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [92.5, 97.5, 75.0, 95.0, 92.5, 95.0, 80.0]
2025/03/13 14:34:02 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 97.5


2025/03/13 14:34:02 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 8 / 25 =====



Average Metric: 38.00 / 40 (95.0%): 100%|██████████| 40/40 [00:14<00:00,  2.77it/s] 

2025/03/13 14:34:16 INFO dspy.evaluate.evaluate: Average Metric: 38 / 40 (95.0%)
2025/03/13 14:34:16 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 95.0 with parameters ['Predictor 0: Instruction 7', 'Predictor 0: Few-Shot Set 12'].
2025/03/13 14:34:16 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [92.5, 97.5, 75.0, 95.0, 92.5, 95.0, 80.0, 95.0]
2025/03/13 14:34:16 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 97.5


2025/03/13 14:34:16 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 9 / 25 =====



Average Metric: 39.00 / 40 (97.5%): 100%|██████████| 40/40 [00:15<00:00,  2.63it/s] 

2025/03/13 14:34:32 INFO dspy.evaluate.evaluate: Average Metric: 39 / 40 (97.5%)
2025/03/13 14:34:32 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 97.5 with parameters ['Predictor 0: Instruction 11', 'Predictor 0: Few-Shot Set 13'].
2025/03/13 14:34:32 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [92.5, 97.5, 75.0, 95.0, 92.5, 95.0, 80.0, 95.0, 97.5]
2025/03/13 14:34:32 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 97.5


2025/03/13 14:34:32 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 10 / 25 =====



Average Metric: 33.00 / 40 (82.5%): 100%|██████████| 40/40 [00:11<00:00,  3.61it/s]

2025/03/13 14:34:43 INFO dspy.evaluate.evaluate: Average Metric: 33 / 40 (82.5%)
2025/03/13 14:34:43 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 82.5 with parameters ['Predictor 0: Instruction 5', 'Predictor 0: Few-Shot Set 4'].
2025/03/13 14:34:43 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [92.5, 97.5, 75.0, 95.0, 92.5, 95.0, 80.0, 95.0, 97.5, 82.5]
2025/03/13 14:34:43 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 97.5


2025/03/13 14:34:43 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 11 / 25 =====



Average Metric: 39.00 / 40 (97.5%): 100%|██████████| 40/40 [00:12<00:00,  3.16it/s] 

2025/03/13 14:34:55 INFO dspy.evaluate.evaluate: Average Metric: 39 / 40 (97.5%)
2025/03/13 14:34:55 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 97.5 with parameters ['Predictor 0: Instruction 12', 'Predictor 0: Few-Shot Set 3'].
2025/03/13 14:34:55 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [92.5, 97.5, 75.0, 95.0, 92.5, 95.0, 80.0, 95.0, 97.5, 82.5, 97.5]
2025/03/13 14:34:55 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 97.5


2025/03/13 14:34:55 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 12 / 25 =====



Average Metric: 39.00 / 40 (97.5%): 100%|██████████| 40/40 [00:00<00:00, 4957.07it/s] 

2025/03/13 14:34:55 INFO dspy.evaluate.evaluate: Average Metric: 39 / 40 (97.5%)
2025/03/13 14:34:55 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 97.5 with parameters ['Predictor 0: Instruction 12', 'Predictor 0: Few-Shot Set 7'].
2025/03/13 14:34:55 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [92.5, 97.5, 75.0, 95.0, 92.5, 95.0, 80.0, 95.0, 97.5, 82.5, 97.5, 97.5]
2025/03/13 14:34:55 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 97.5


2025/03/13 14:34:55 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 13 / 25 =====



Average Metric: 34.00 / 40 (85.0%): 100%|██████████| 40/40 [00:14<00:00,  2.85it/s]

2025/03/13 14:35:09 INFO dspy.evaluate.evaluate: Average Metric: 34 / 40 (85.0%)
2025/03/13 14:35:09 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 85.0 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 13'].
2025/03/13 14:35:09 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [92.5, 97.5, 75.0, 95.0, 92.5, 95.0, 80.0, 95.0, 97.5, 82.5, 97.5, 97.5, 85.0]
2025/03/13 14:35:09 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 97.5


2025/03/13 14:35:09 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 14 / 25 =====



Average Metric: 39.00 / 40 (97.5%): 100%|██████████| 40/40 [00:00<00:00, 5003.94it/s] 

2025/03/13 14:35:09 INFO dspy.evaluate.evaluate: Average Metric: 39 / 40 (97.5%)
2025/03/13 14:35:09 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 97.5 with parameters ['Predictor 0: Instruction 11', 'Predictor 0: Few-Shot Set 13'].
2025/03/13 14:35:09 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [92.5, 97.5, 75.0, 95.0, 92.5, 95.0, 80.0, 95.0, 97.5, 82.5, 97.5, 97.5, 85.0, 97.5]
2025/03/13 14:35:09 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 97.5


2025/03/13 14:35:09 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 15 / 25 =====



Average Metric: 38.00 / 40 (95.0%): 100%|██████████| 40/40 [00:12<00:00,  3.20it/s] 

2025/03/13 14:35:22 INFO dspy.evaluate.evaluate: Average Metric: 38 / 40 (95.0%)
2025/03/13 14:35:22 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 95.0 with parameters ['Predictor 0: Instruction 11', 'Predictor 0: Few-Shot Set 17'].
2025/03/13 14:35:22 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [92.5, 97.5, 75.0, 95.0, 92.5, 95.0, 80.0, 95.0, 97.5, 82.5, 97.5, 97.5, 85.0, 97.5, 95.0]
2025/03/13 14:35:22 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 97.5


2025/03/13 14:35:22 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 16 / 25 =====



Average Metric: 38.00 / 40 (95.0%): 100%|██████████| 40/40 [00:13<00:00,  3.02it/s]

2025/03/13 14:35:35 INFO dspy.evaluate.evaluate: Average Metric: 38 / 40 (95.0%)
2025/03/13 14:35:35 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 95.0 with parameters ['Predictor 0: Instruction 6', 'Predictor 0: Few-Shot Set 6'].
2025/03/13 14:35:35 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [92.5, 97.5, 75.0, 95.0, 92.5, 95.0, 80.0, 95.0, 97.5, 82.5, 97.5, 97.5, 85.0, 97.5, 95.0, 95.0]
2025/03/13 14:35:35 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 97.5


2025/03/13 14:35:35 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 17 / 25 =====



Average Metric: 39.00 / 40 (97.5%): 100%|██████████| 40/40 [00:10<00:00,  3.66it/s] 

2025/03/13 14:35:46 INFO dspy.evaluate.evaluate: Average Metric: 39 / 40 (97.5%)
2025/03/13 14:35:46 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 97.5 with parameters ['Predictor 0: Instruction 13', 'Predictor 0: Few-Shot Set 10'].
2025/03/13 14:35:46 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [92.5, 97.5, 75.0, 95.0, 92.5, 95.0, 80.0, 95.0, 97.5, 82.5, 97.5, 97.5, 85.0, 97.5, 95.0, 95.0, 97.5]
2025/03/13 14:35:46 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 97.5


2025/03/13 14:35:46 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 18 / 25 =====



Average Metric: 31.00 / 40 (77.5%): 100%|██████████| 40/40 [00:13<00:00,  3.03it/s]

2025/03/13 14:35:59 INFO dspy.evaluate.evaluate: Average Metric: 31 / 40 (77.5%)
2025/03/13 14:35:59 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 77.5 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 16'].
2025/03/13 14:35:59 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [92.5, 97.5, 75.0, 95.0, 92.5, 95.0, 80.0, 95.0, 97.5, 82.5, 97.5, 97.5, 85.0, 97.5, 95.0, 95.0, 97.5, 77.5]
2025/03/13 14:35:59 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 97.5


2025/03/13 14:35:59 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 19 / 25 =====



Average Metric: 39.00 / 40 (97.5%): 100%|██████████| 40/40 [00:12<00:00,  3.27it/s] 

2025/03/13 14:36:12 INFO dspy.evaluate.evaluate: Average Metric: 39 / 40 (97.5%)
2025/03/13 14:36:12 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 97.5 with parameters ['Predictor 0: Instruction 9', 'Predictor 0: Few-Shot Set 7'].
2025/03/13 14:36:12 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [92.5, 97.5, 75.0, 95.0, 92.5, 95.0, 80.0, 95.0, 97.5, 82.5, 97.5, 97.5, 85.0, 97.5, 95.0, 95.0, 97.5, 77.5, 97.5]
2025/03/13 14:36:12 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 97.5


2025/03/13 14:36:12 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 20 / 25 =====



Average Metric: 36.00 / 40 (90.0%): 100%|██████████| 40/40 [00:11<00:00,  3.51it/s]

2025/03/13 14:36:23 INFO dspy.evaluate.evaluate: Average Metric: 36 / 40 (90.0%)
2025/03/13 14:36:23 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 90.0 with parameters ['Predictor 0: Instruction 17', 'Predictor 0: Few-Shot Set 13'].
2025/03/13 14:36:23 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [92.5, 97.5, 75.0, 95.0, 92.5, 95.0, 80.0, 95.0, 97.5, 82.5, 97.5, 97.5, 85.0, 97.5, 95.0, 95.0, 97.5, 77.5, 97.5, 90.0]
2025/03/13 14:36:23 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 97.5


2025/03/13 14:36:23 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 21 / 25 =====



Average Metric: 31.00 / 40 (77.5%): 100%|██████████| 40/40 [00:13<00:00,  3.05it/s]

2025/03/13 14:36:36 INFO dspy.evaluate.evaluate: Average Metric: 31 / 40 (77.5%)
2025/03/13 14:36:36 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 77.5 with parameters ['Predictor 0: Instruction 3', 'Predictor 0: Few-Shot Set 5'].
2025/03/13 14:36:36 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [92.5, 97.5, 75.0, 95.0, 92.5, 95.0, 80.0, 95.0, 97.5, 82.5, 97.5, 97.5, 85.0, 97.5, 95.0, 95.0, 97.5, 77.5, 97.5, 90.0, 77.5]
2025/03/13 14:36:36 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 97.5


2025/03/13 14:36:36 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 22 / 25 =====



Average Metric: 38.00 / 40 (95.0%): 100%|██████████| 40/40 [00:11<00:00,  3.55it/s] 

2025/03/13 14:36:48 INFO dspy.evaluate.evaluate: Average Metric: 38 / 40 (95.0%)
2025/03/13 14:36:48 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 95.0 with parameters ['Predictor 0: Instruction 14', 'Predictor 0: Few-Shot Set 3'].
2025/03/13 14:36:48 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [92.5, 97.5, 75.0, 95.0, 92.5, 95.0, 80.0, 95.0, 97.5, 82.5, 97.5, 97.5, 85.0, 97.5, 95.0, 95.0, 97.5, 77.5, 97.5, 90.0, 77.5, 95.0]
2025/03/13 14:36:48 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 97.5


2025/03/13 14:36:48 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 23 / 25 =====



Average Metric: 36.00 / 40 (90.0%): 100%|██████████| 40/40 [00:13<00:00,  2.88it/s]

2025/03/13 14:37:01 INFO dspy.evaluate.evaluate: Average Metric: 36 / 40 (90.0%)
2025/03/13 14:37:01 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 90.0 with parameters ['Predictor 0: Instruction 12', 'Predictor 0: Few-Shot Set 8'].
2025/03/13 14:37:01 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [92.5, 97.5, 75.0, 95.0, 92.5, 95.0, 80.0, 95.0, 97.5, 82.5, 97.5, 97.5, 85.0, 97.5, 95.0, 95.0, 97.5, 77.5, 97.5, 90.0, 77.5, 95.0, 90.0]
2025/03/13 14:37:01 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 97.5


2025/03/13 14:37:01 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 24 / 25 =====



Average Metric: 39.00 / 40 (97.5%): 100%|██████████| 40/40 [00:00<00:00, 4431.15it/s] 

2025/03/13 14:37:02 INFO dspy.evaluate.evaluate: Average Metric: 39 / 40 (97.5%)
2025/03/13 14:37:02 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 97.5 with parameters ['Predictor 0: Instruction 12', 'Predictor 0: Few-Shot Set 3'].
2025/03/13 14:37:02 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [92.5, 97.5, 75.0, 95.0, 92.5, 95.0, 80.0, 95.0, 97.5, 82.5, 97.5, 97.5, 85.0, 97.5, 95.0, 95.0, 97.5, 77.5, 97.5, 90.0, 77.5, 95.0, 90.0, 97.5]
2025/03/13 14:37:02 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 97.5


2025/03/13 14:37:02 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 25 / 25 =====



Average Metric: 33.00 / 40 (82.5%): 100%|██████████| 40/40 [00:16<00:00,  2.43it/s]

2025/03/13 14:37:18 INFO dspy.evaluate.evaluate: Average Metric: 33 / 40 (82.5%)
2025/03/13 14:37:18 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 82.5 with parameters ['Predictor 0: Instruction 18', 'Predictor 0: Few-Shot Set 11'].
2025/03/13 14:37:18 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [92.5, 97.5, 75.0, 95.0, 92.5, 95.0, 80.0, 95.0, 97.5, 82.5, 97.5, 97.5, 85.0, 97.5, 95.0, 95.0, 97.5, 77.5, 97.5, 90.0, 77.5, 95.0, 90.0, 97.5, 82.5]
2025/03/13 14:37:18 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 97.5


2025/03/13 14:37:18 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 97.5!





In [13]:
# Print the most recent language-model interaction to see the prompt in use.
dspy.inspect_history(n=1)





[34m[2025-03-13T14:37:18.514253][0m

[31mSystem message:[0m

Your input fields are:
1. `tokens` (list[str]): tokenized text

Your output fields are:
1. `reasoning` (str)
2. `extracted_people` (list[str]): all tokens referring to specific people extracted from the tokenized text

All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## tokens ## ]]
{tokens}

[[ ## reasoning ## ]]
{reasoning}

[[ ## extracted_people ## ]]
{extracted_people}        # note: the value you produce must adhere to the JSON schema: {"type": "array", "items": {"type": "string"}}

[[ ## completed ## ]]

In adhering to this structure, your objective is: 
        Analyze the provided list of tokenized text and extract any contiguous tokens that specifically refer to identifiable individuals. For each token sequence, apply step-by-step reasoning to determine if they represent names of people. Ensure that you output a list of extracted tokens without combining th

In [18]:
# Try the optimized extractor on a passage from outside the dataset.
# Splitting on single spaces leaves punctuation attached to words (e.g. '"Theon,"').
optimized_people_extractor(tokens='The night was windless, the snow drifting straight down out of a cold black sky, yet the leaves of the heart tree were rustling his name. "Theon," they seemed to whisper, "Theon." The old gods, he thought. They know me. They know my name. I was Theon of House Greyjoy. I was a ward of Eddard Stark, a friend and brother to his children.'.split(" "))

Prediction(
    reasoning='The tokens provided contain several references to specific individuals. "Theon" is mentioned multiple times, indicating it is a name of a person. Additionally, "Eddard Stark" is also referenced, which is another specific individual\'s name. Both names are extracted as they refer to identifiable characters. The extraction includes each name as separate tokens.',
    extracted_people=['Theon', 'Eddard', 'Stark']
)

In [None]:
# Run the unoptimized extractor on the same passage for comparison.
people_extractor(tokens='The night was windless, the snow drifting straight down out of a cold black sky, yet the leaves of the heart tree were rustling his name. "Theon," they seemed to whisper, "Theon." The old gods, he thought. They know me. They know my name. I was Theon of House Greyjoy. I was a ward of Eddard Stark, a friend and brother to his children.'.split(" "))