In [1]:
from dotenv import load_dotenv

load_dotenv()

True

In [2]:
import json
import os
import random
from pathlib import Path

import dspy
import numpy as np
import pandas as pd
import weave
from datasets import load_dataset
from dspy.evaluate import Evaluate

In [3]:
def set_seed(seed):
    np.random.seed(seed % (2**32 - 1))
    random.seed(seed)


def configure_lm(model, temperature):
    lm = dspy.LM(
        "openai/" + model,
        temperature=temperature,
        cache=False,
        api_base=os.getenv("OPENAI_BASE_URL"),
        api_key=os.getenv("OPENAI_API_KEY"),
    )
    dspy.configure(lm=lm)


In [4]:
# weave.init(project_name="llm-adaptation")

In [5]:
set_seed(89)

configure_lm('llama-3-8b', 0.0)

In [6]:
from typing import Callable


def compute_generalized_scores(
    pred_triples: list[str], 
    reference_triples: list[str], 
    match_function: Callable[[str, str], bool]
):
    """Compute precision, recall, and F1-score using a customizable match function."""

    pred_set = set(pred_triples)
    reference_set = set(reference_triples)

    if not pred_set and not reference_set:
        return {"precision": 1.0, "recall": 1.0, "f1": 1.0}

    if not pred_set or not reference_set:
        return {"precision": 0.0, "recall": 0.0, "f1": 0.0}

    # Count true positives using the provided match function
    true_positives = sum(any(match_function(pred, ref) for ref in reference_set) for pred in pred_set)
    
    precision = true_positives / len(pred_set)
    recall = true_positives / len(reference_set)
    f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0.0

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1
    }


In [7]:
from fuzzywuzzy import fuzz


def fuzzy_match(pred: str, ref: str) -> bool:
    return fuzz.ratio(pred, ref) > 80

def compute_scores(pred_triples: list[str], reference_triples: list[str]):
    exact_scores = {f"exact.{k}": v for k, v in  compute_generalized_scores(pred_triples, reference_triples, lambda x, y: x == y).items()}
    fuzzy_scores = {f"fuzzy.{k}": v for k, v in  compute_generalized_scores(pred_triples, reference_triples, fuzzy_match).items()}
    return {**exact_scores, **fuzzy_scores}

def parse_triples(triples_str: str):
    return [triple.strip() for triple in triples_str.split('\n') if triple.strip()]


@weave.op()
def evaluate_triples(example, pred, trace=None):
    return compute_scores(parse_triples(pred.triples_str), example.triples)['fuzzy.f1']



In [8]:
from copy import deepcopy


def dynamic_import(module, name):
    import importlib

    return getattr(importlib.import_module(module), name)

def make_optimizer(optimizer_config: dict):
    cls = dynamic_import("dspy.teleprompt", optimizer_config["class"])
    kwargs = deepcopy(optimizer_config["params"])
    if optimizer_config["with_metric"]:
        kwargs["metric"] = evaluate_triples
    return cls(**kwargs)

In [9]:
class EntityRelationExtraction(dspy.Signature):
    """Extract `subject | predicate | object` triples from text."""

    text: str = dspy.InputField()
    triples_str: str = dspy.OutputField(
        desc='The triples extracted from the text. Each triple should be in the format "subject | predicate | object". Triples should be separated by newlines.'
    )


def make_program():
    return dspy.Predict(EntityRelationExtraction)


In [10]:
text = """
Ankara is the capital of Turkey.
Claude Shannon is the father of information theory.
""".strip()

response = make_program()(text=text)
print(text)
print(response)


Ankara is the capital of Turkey.
Claude Shannon is the father of information theory.
Prediction(
    triples_str='Ankara | is the capital of | Turkey\nClaude Shannon | is the father of | information theory'
)


In [11]:
def train(
    dataset_path: str,
    dataset_name: str,
    dataset_split: str,
    model: str,
    temperature: float,
    optimizer_config: dict,
    out: str,
):
    # Set up LM
    configure_lm(model, temperature)

    # Load dataset
    ds = load_dataset(dataset_path, dataset_name, split=dataset_split)
    examples = [dspy.Example(text=x["text"], triples=x["triples"]).with_inputs("text") for x in ds]

    # Create program
    program = make_program()

    # Create and run optimizer
    optimizer = make_optimizer(optimizer_config)
    compile_params = optimizer_config.get("compile_params", {})
    trained_program = optimizer.compile(program, trainset=examples, **compile_params)

    # Save trained program
    Path(out).parent.mkdir(parents=True, exist_ok=True)
    trained_program.save(out)
    return trained_program

In [12]:
def evaluate(
    dataset_path: str,
    dataset_name: str,
    dataset_split: str,
    model: str,
    temperature: float,
    out_dir: str,
    load_from: str | None = None,
):
    # Set up LM
    configure_lm(model, temperature)

    # Load dataset
    ds = load_dataset(dataset_path, dataset_name, split=dataset_split)
    examples = [dspy.Example(text=x["text"], triples=x["triples"]).with_inputs("text") for x in ds]

    # Load program
    program = make_program()
    if load_from:
        program.load(load_from)

    # Evaluate
    evaluator = Evaluate(
        metric=evaluate_triples,
        devset=examples,
        num_threads=16,
        display_progress=True,
        return_outputs=True,
    )
    _, results = evaluator(program)

    # Save results
    out_path = Path(out_dir)
    out_path.mkdir(parents=True, exist_ok=True)

    # Process and save detailed results
    processed_results = []
    for example, pred, score in results:
        result = {
            "text": example.text,
            "triples": example.triples,
            "predicted_triples": parse_triples(pred.triples_str),
            **compute_scores(parse_triples(pred.triples_str), example.triples),
        }
        processed_results.append(result)
    
    result_df = pd.DataFrame(processed_results)
    result_df.to_json(out_path / "results.jsonl", orient="records", lines=True)

    # Save aggregate scores
    scores = result_df[["exact.precision", "exact.recall", "exact.f1", "fuzzy.precision", "fuzzy.recall", "fuzzy.f1"]].mean().to_dict()
    with open(out_path / "scores.json", "w") as f:
        json.dump(scores, f, indent=2)

    return results, scores

In [13]:
dataset_path = "bdsaglam/web_nlg-erx-concat"
dataset_name = "release_v3.0_en"

Before prompt optimization

In [14]:
%%capture
results, scores = evaluate(
    dataset_path=dataset_path,
    dataset_name=dataset_name,
    dataset_split="dev[:100]",
    model="llama-3-8b",
    temperature=0.0,
    out_dir="../tmp/erx/dspy/before",
)

In [15]:
scores

{'exact.precision': 0.011607142857142856,
 'exact.recall': 0.012751696832579184,
 'exact.f1': 0.012068311648983919,
 'fuzzy.precision': 0.34766847244117066,
 'fuzzy.recall': 0.3378176513648623,
 'fuzzy.f1': 0.33974898258125846}

Let's optimize the program with DSPy.

In [20]:
optimizer_config = {
    "class": "BootstrapFewShotWithRandomSearch",
    "params": {
        "max_bootstrapped_demos": 8,
        "max_labeled_demos": 8,
        "max_rounds": 16,
        "num_candidate_programs": 16,
        "num_threads": 16,
        "max_errors": 10,
        "metric_threshold": None,
        "stop_at_score": None,
    },
    "with_metric": True,
}

train(
    dataset_path=dataset_path,
    dataset_name=dataset_name,
    dataset_split="train[:100]",
    model="llama-3-8b",
    temperature=0.0,
    optimizer_config=optimizer_config,
    out="../tmp/erx/dspy/trained-program.json",
)

Going to sample between 1 and 8 traces per predictor.
Will attempt to bootstrap 16 candidate sets.
Average Metric: 31.60 / 100 (31.6%): 100%|██████████| 100/100 [00:17<00:00,  5.70it/s]

2025/02/14 13:25:14 INFO dspy.evaluate.evaluate: Average Metric: 31.600189246922703 / 100 (31.6%)



New best score: 31.6 for seed -3
Scores so far: [31.6]
Best score so far: 31.6
Average Metric: 32.02 / 100 (32.0%): 100%|██████████| 100/100 [00:17<00:00,  5.81it/s]

2025/02/14 13:25:31 INFO dspy.evaluate.evaluate: Average Metric: 32.01976466366833 / 100 (32.0%)



New best score: 32.02 for seed -2
Scores so far: [31.6, 32.02]
Best score so far: 32.02


  8%|▊         | 8/100 [00:14<02:41,  1.76s/it]


Bootstrapped 8 full traces after 8 examples for up to 16 rounds, amounting to 9 attempts.
Average Metric: 35.02 / 100 (35.0%): 100%|██████████| 100/100 [00:22<00:00,  4.44it/s]

2025/02/14 13:26:08 INFO dspy.evaluate.evaluate: Average Metric: 35.021371833660346 / 100 (35.0%)



New best score: 35.02 for seed -1
Scores so far: [31.6, 32.02, 35.02]
Best score so far: 35.02


  7%|▋         | 7/100 [00:11<02:27,  1.59s/it]


Bootstrapped 7 full traces after 7 examples for up to 16 rounds, amounting to 8 attempts.
Average Metric: 0.00 / 97 (0.0%):  96%|█████████▌| 96/100 [00:05<00:00, 29.66it/s]

2025/02/14 13:26:28 ERROR dspy.utils.parallelizer: Error processing item Example({'text': "Ann Arbor forms part of Washtenaw County in Michigan, United States. Detroit is the largest city in Michigan, where the capital is Lansing.\nThe TV character Bananaman was created by John Geering. The programme was broadcast by STV and starred Graeme Garden. Bananaman was first shown on 10th March 1983 and last aired on 15th April 1986.\nAkeem Adams' former clubs include Ferencvarosi TC and Palo Seco based United Petrotrin FC.\nVehicles that are related are the Alfa Romeo 164 (made in Italy) and Lancia Thema. The latter is related to the Saab 9000.\nThe AIDAstella was built by the German based company Meyer Werft. It is operated by AIDA Cruises and owned by Costa Crociere, a subsidiary of Carnival Corporation & Plc.\nAleksandr Prudnikov belonged to FC Spartak Moscow, which plays at Otkrytiye Arena. He has played fro FC Terek Grozny and plays for FC Amkar Perm, which is managed by Gadzhi Gadzhiyev

Average Metric: 0.00 / 99 (0.0%): 100%|██████████| 100/100 [00:13<00:00,  7.61it/s]

2025/02/14 13:26:33 INFO dspy.evaluate.evaluate: Average Metric: 0.0 / 100 (0.0%)



Scores so far: [31.6, 32.02, 35.02, 0.0]
Best score so far: 35.02


  3%|▎         | 3/100 [00:04<02:12,  1.36s/it]


Bootstrapped 3 full traces after 3 examples for up to 16 rounds, amounting to 3 attempts.
Average Metric: 32.74 / 100 (32.7%): 100%|██████████| 100/100 [00:19<00:00,  5.22it/s]

2025/02/14 13:26:57 INFO dspy.evaluate.evaluate: Average Metric: 32.74268601162082 / 100 (32.7%)



Scores so far: [31.6, 32.02, 35.02, 0.0, 32.74]
Best score so far: 35.02


  1%|          | 1/100 [00:03<06:07,  3.72s/it]


Bootstrapped 1 full traces after 1 examples for up to 16 rounds, amounting to 4 attempts.
Average Metric: 38.90 / 100 (38.9%): 100%|██████████| 100/100 [00:18<00:00,  5.52it/s]

2025/02/14 13:27:19 INFO dspy.evaluate.evaluate: Average Metric: 38.89527223804535 / 100 (38.9%)



New best score: 38.9 for seed 2
Scores so far: [31.6, 32.02, 35.02, 0.0, 32.74, 38.9]
Best score so far: 38.9


  5%|▌         | 5/100 [00:20<06:22,  4.03s/it]


Bootstrapped 4 full traces after 5 examples for up to 16 rounds, amounting to 20 attempts.
Average Metric: 42.75 / 100 (42.8%): 100%|██████████| 100/100 [00:19<00:00,  5.10it/s]

2025/02/14 13:28:00 INFO dspy.evaluate.evaluate: Average Metric: 42.75407322418495 / 100 (42.8%)



New best score: 42.75 for seed 3
Scores so far: [31.6, 32.02, 35.02, 0.0, 32.74, 38.9, 42.75]
Best score so far: 42.75


  4%|▍         | 4/100 [00:09<03:43,  2.32s/it]


Bootstrapped 4 full traces after 4 examples for up to 16 rounds, amounting to 4 attempts.
Average Metric: 31.46 / 100 (31.5%): 100%|██████████| 100/100 [00:21<00:00,  4.55it/s]

2025/02/14 13:28:31 INFO dspy.evaluate.evaluate: Average Metric: 31.457793875105306 / 100 (31.5%)



Scores so far: [31.6, 32.02, 35.02, 0.0, 32.74, 38.9, 42.75, 31.46]
Best score so far: 42.75


  5%|▌         | 5/100 [00:08<02:45,  1.75s/it]


Bootstrapped 5 full traces after 5 examples for up to 16 rounds, amounting to 5 attempts.
Average Metric: 33.81 / 100 (33.8%): 100%|██████████| 100/100 [00:20<00:00,  4.78it/s]

2025/02/14 13:29:01 INFO dspy.evaluate.evaluate: Average Metric: 33.812326256486195 / 100 (33.8%)



Scores so far: [31.6, 32.02, 35.02, 0.0, 32.74, 38.9, 42.75, 31.46, 33.81]
Best score so far: 42.75


  2%|▏         | 2/100 [00:15<12:18,  7.54s/it]


Bootstrapped 2 full traces after 2 examples for up to 16 rounds, amounting to 6 attempts.
Average Metric: 35.01 / 100 (35.0%): 100%|██████████| 100/100 [00:17<00:00,  5.56it/s]

2025/02/14 13:29:35 INFO dspy.evaluate.evaluate: Average Metric: 35.00951853133457 / 100 (35.0%)



Scores so far: [31.6, 32.02, 35.02, 0.0, 32.74, 38.9, 42.75, 31.46, 33.81, 35.01]
Best score so far: 42.75


  6%|▌         | 6/100 [00:10<02:45,  1.76s/it]


Bootstrapped 6 full traces after 6 examples for up to 16 rounds, amounting to 6 attempts.
Average Metric: 34.30 / 100 (34.3%): 100%|██████████| 100/100 [00:22<00:00,  4.39it/s]

2025/02/14 13:30:09 INFO dspy.evaluate.evaluate: Average Metric: 34.30256938145835 / 100 (34.3%)



Scores so far: [31.6, 32.02, 35.02, 0.0, 32.74, 38.9, 42.75, 31.46, 33.81, 35.01, 34.3]
Best score so far: 42.75


  4%|▍         | 4/100 [00:14<05:42,  3.57s/it]


Bootstrapped 4 full traces after 4 examples for up to 16 rounds, amounting to 16 attempts.
Average Metric: 36.15 / 100 (36.2%): 100%|██████████| 100/100 [00:20<00:00,  4.96it/s]

2025/02/14 13:30:44 INFO dspy.evaluate.evaluate: Average Metric: 36.153882269280516 / 100 (36.2%)



Scores so far: [31.6, 32.02, 35.02, 0.0, 32.74, 38.9, 42.75, 31.46, 33.81, 35.01, 34.3, 36.15]
Best score so far: 42.75


  8%|▊         | 8/100 [00:14<02:51,  1.86s/it]


Bootstrapped 8 full traces after 8 examples for up to 16 rounds, amounting to 8 attempts.
Average Metric: 31.24 / 100 (31.2%): 100%|██████████| 100/100 [00:23<00:00,  4.18it/s]

2025/02/14 13:31:23 INFO dspy.evaluate.evaluate: Average Metric: 31.23734205861276 / 100 (31.2%)



Scores so far: [31.6, 32.02, 35.02, 0.0, 32.74, 38.9, 42.75, 31.46, 33.81, 35.01, 34.3, 36.15, 31.24]
Best score so far: 42.75


  1%|          | 1/100 [00:03<05:35,  3.38s/it]


Bootstrapped 1 full traces after 1 examples for up to 16 rounds, amounting to 3 attempts.
Average Metric: 32.13 / 100 (32.1%): 100%|██████████| 100/100 [00:18<00:00,  5.34it/s]

2025/02/14 13:31:45 INFO dspy.evaluate.evaluate: Average Metric: 32.13415569360357 / 100 (32.1%)



Scores so far: [31.6, 32.02, 35.02, 0.0, 32.74, 38.9, 42.75, 31.46, 33.81, 35.01, 34.3, 36.15, 31.24, 32.13]
Best score so far: 42.75


  8%|▊         | 8/100 [00:16<03:06,  2.02s/it]


Bootstrapped 8 full traces after 8 examples for up to 16 rounds, amounting to 8 attempts.
Average Metric: 35.68 / 100 (35.7%): 100%|██████████| 100/100 [00:23<00:00,  4.23it/s]

2025/02/14 13:32:26 INFO dspy.evaluate.evaluate: Average Metric: 35.67776527737274 / 100 (35.7%)



Scores so far: [31.6, 32.02, 35.02, 0.0, 32.74, 38.9, 42.75, 31.46, 33.81, 35.01, 34.3, 36.15, 31.24, 32.13, 35.68]
Best score so far: 42.75


  8%|▊         | 8/100 [00:15<02:59,  1.95s/it]


Bootstrapped 8 full traces after 8 examples for up to 16 rounds, amounting to 8 attempts.
Average Metric: 36.20 / 100 (36.2%): 100%|██████████| 100/100 [00:23<00:00,  4.18it/s]

2025/02/14 13:33:06 INFO dspy.evaluate.evaluate: Average Metric: 36.1985664905158 / 100 (36.2%)



Scores so far: [31.6, 32.02, 35.02, 0.0, 32.74, 38.9, 42.75, 31.46, 33.81, 35.01, 34.3, 36.15, 31.24, 32.13, 35.68, 36.2]
Best score so far: 42.75


  5%|▌         | 5/100 [00:07<02:15,  1.43s/it]


Bootstrapped 5 full traces after 5 examples for up to 16 rounds, amounting to 8 attempts.
Average Metric: 42.92 / 100 (42.9%): 100%|██████████| 100/100 [00:18<00:00,  5.27it/s]

2025/02/14 13:33:33 INFO dspy.evaluate.evaluate: Average Metric: 42.917826619044305 / 100 (42.9%)



New best score: 42.92 for seed 13
Scores so far: [31.6, 32.02, 35.02, 0.0, 32.74, 38.9, 42.75, 31.46, 33.81, 35.01, 34.3, 36.15, 31.24, 32.13, 35.68, 36.2, 42.92]
Best score so far: 42.92


  2%|▏         | 2/100 [00:09<07:49,  4.79s/it]


Bootstrapped 2 full traces after 2 examples for up to 16 rounds, amounting to 4 attempts.
Average Metric: 36.60 / 100 (36.6%): 100%|██████████| 100/100 [00:20<00:00,  4.97it/s]

2025/02/14 13:34:03 INFO dspy.evaluate.evaluate: Average Metric: 36.59968675831522 / 100 (36.6%)



Scores so far: [31.6, 32.02, 35.02, 0.0, 32.74, 38.9, 42.75, 31.46, 33.81, 35.01, 34.3, 36.15, 31.24, 32.13, 35.68, 36.2, 42.92, 36.6]
Best score so far: 42.92


  4%|▍         | 4/100 [00:08<03:21,  2.10s/it]


Bootstrapped 4 full traces after 4 examples for up to 16 rounds, amounting to 6 attempts.
Average Metric: 41.68 / 100 (41.7%): 100%|██████████| 100/100 [00:19<00:00,  5.06it/s]

2025/02/14 13:34:31 INFO dspy.evaluate.evaluate: Average Metric: 41.68390665799123 / 100 (41.7%)



Scores so far: [31.6, 32.02, 35.02, 0.0, 32.74, 38.9, 42.75, 31.46, 33.81, 35.01, 34.3, 36.15, 31.24, 32.13, 35.68, 36.2, 42.92, 36.6, 41.68]
Best score so far: 42.92
19 candidate programs found.


Predict(EntityRelationExtraction(text -> triples_str
    instructions='Extract `subject | predicate | object` triples from text.'
    text = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Text:', 'desc': '${text}'})
    triples_str = Field(annotation=str required=True json_schema_extra={'desc': 'The triples extracted from the text. Each triple should be in the format "subject | predicate | object". Triples should be separated by newlines.', '__dspy_field_type': 'output', 'prefix': 'Triples Str:'})
))

In [18]:
results, scores = evaluate(
    dataset_path=dataset_path,
    dataset_name=dataset_name,
    dataset_split="dev[:100]",
    model="llama-3-8b",
    temperature=0.0,
    out_dir="../tmp/erx/dspy/after",
    load_from="../tmp/erx/dspy/trained-program.json",
)

Average Metric: 39.40 / 100 (39.4%): 100%|██████████| 100/100 [00:18<00:00,  5.47it/s]

2025/02/14 13:10:40 INFO dspy.evaluate.evaluate: Average Metric: 39.40046495393733 / 100 (39.4%)



