In [None]:
from dotenv import load_dotenv

load_dotenv()

True

In [2]:
import json
import os
import random
from pathlib import Path

import dspy
import numpy as np
import pandas as pd
import weave
from datasets import load_dataset
from dspy.evaluate import Evaluate

In [3]:
print(os.getenv("OPENAI_BASE_URL"))

http://0.0.0.0:8081/v1


In [29]:
def set_seed(seed):
    np.random.seed(seed % (2**32 - 1))
    random.seed(seed)


def configure_lm(model, temperature):
    lm = dspy.LM(
        "openai/" + model,
        temperature=temperature,
        cache=False,
        api_base=os.getenv("OPENAI_BASE_URL"),
        api_key=os.getenv("OPENAI_API_KEY"),
    )
    dspy.configure(lm=lm)


In [11]:
# weave.init(project_name="llm-adaptation")

In [12]:
set_seed(89)

configure_lm('llama-3-8b', 0.0)

In [13]:
from typing import Callable


def compute_generalized_scores(
    pred_triples: list[str], 
    reference_triples: list[str], 
    match_function: Callable[[str, str], bool]
):
    """Compute precision, recall, and F1-score using a customizable match function."""

    pred_set = set(pred_triples)
    reference_set = set(reference_triples)

    if not pred_set and not reference_set:
        return {"precision": 1.0, "recall": 1.0, "f1": 1.0}

    if not pred_set or not reference_set:
        return {"precision": 0.0, "recall": 0.0, "f1": 0.0}

    # Count true positives using the provided match function
    true_positives = sum(any(match_function(pred, ref) for ref in reference_set) for pred in pred_set)
    
    precision = true_positives / len(pred_set)
    recall = true_positives / len(reference_set)
    f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0.0

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1
    }


In [14]:
from fuzzywuzzy import fuzz


def fuzzy_match(pred: str, ref: str) -> bool:
    return fuzz.ratio(pred, ref) > 80

def compute_scores(pred_triples: list[str], reference_triples: list[str]):
    exact_scores = {f"exact.{k}": v for k, v in  compute_generalized_scores(pred_triples, reference_triples, lambda x, y: x == y).items()}
    fuzzy_scores = {f"fuzzy.{k}": v for k, v in  compute_generalized_scores(pred_triples, reference_triples, fuzzy_match).items()}
    return {**exact_scores, **fuzzy_scores}

def parse_triples(triples_str: str):
    return [triple.strip() for triple in triples_str.split('\n') if triple.strip()]


@weave.op()
def evaluate_triples(example, pred, trace=None):
    return compute_scores(parse_triples(pred.triples_str), example.triples)['fuzzy.f1']

In [15]:
from copy import deepcopy


def dynamic_import(module, name):
    import importlib

    return getattr(importlib.import_module(module), name)

def make_optimizer(optimizer_config: dict):
    cls = dynamic_import("dspy.teleprompt", optimizer_config["class"])
    kwargs = deepcopy(optimizer_config["params"])
    if optimizer_config["with_metric"]:
        kwargs["metric"] = evaluate_triples
    return cls(**kwargs)

In [16]:
class EntityRelationExtraction(dspy.Signature):
    """Extract `subject | predicate | object` triples from text."""

    text: str = dspy.InputField()
    triples_str: str = dspy.OutputField(
        desc='The triples extracted from the text. Each triple should be in the format "subject | predicate | object". Triples should be separated by newlines.'
    )


def make_program():
    return dspy.Predict(EntityRelationExtraction)


In [17]:
text = """
Ankara is the capital of Turkey.
Claude Shannon is the father of information theory.
""".strip()

response = make_program()(text=text)
print(text)
print(response)


APIError: litellm.APIError: APIError: OpenAIException - Connection error.

In [12]:
def train(
    dataset_path: str,
    dataset_name: str,
    dataset_split: str,
    model: str,
    temperature: float,
    optimizer_config: dict,
    out: str,
):
    # Set up LM
    configure_lm(model, temperature)

    # Load dataset
    ds = load_dataset(dataset_path, dataset_name, split=dataset_split)
    examples = [dspy.Example(text=x["text"], triples=x["triples"]).with_inputs("text") for x in ds]

    # Create program
    program = make_program()

    # Create and run optimizer
    optimizer = make_optimizer(optimizer_config)
    compile_params = optimizer_config.get("compile_params", {})
    trained_program = optimizer.compile(program, trainset=examples, **compile_params)

    # Save trained program
    Path(out).parent.mkdir(parents=True, exist_ok=True)
    trained_program.save(out)
    return trained_program

In [13]:
def evaluate(
    dataset_path: str,
    dataset_name: str,
    dataset_split: str,
    model: str,
    temperature: float,
    out_dir: str,
    load_from: str | None = None,
):
    # Set up LM
    configure_lm(model, temperature)

    # Load dataset
    ds = load_dataset(dataset_path, dataset_name, split=dataset_split)
    examples = [dspy.Example(text=x["text"], triples=x["triples"]).with_inputs("text") for x in ds]

    # Load program
    program = make_program()
    if load_from:
        program.load(load_from)

    # Evaluate
    evaluator = Evaluate(
        metric=evaluate_triples,
        devset=examples,
        num_threads=16,
        display_progress=True,
        return_outputs=True,
    )
    _, results = evaluator(program)

    # Save results
    out_path = Path(out_dir)
    out_path.mkdir(parents=True, exist_ok=True)

    # Process and save detailed results
    processed_results = []
    for example, pred, score in results:
        result = {
            "text": example.text,
            "triples": example.triples,
            "predicted_triples": parse_triples(pred.triples_str),
            **compute_scores(parse_triples(pred.triples_str), example.triples),
        }
        processed_results.append(result)
    
    result_df = pd.DataFrame(processed_results)
    result_df.to_json(out_path / "results.jsonl", orient="records", lines=True)

    # Save aggregate scores
    scores = result_df[["exact.precision", "exact.recall", "exact.f1", "fuzzy.precision", "fuzzy.recall", "fuzzy.f1"]].mean().to_dict()
    with open(out_path / "scores.json", "w") as f:
        json.dump(scores, f, indent=2)

    return results, scores

In [14]:
dataset_path = "bdsaglam/web_nlg-erx-concat"
dataset_name = "release_v3.0_en"

Before prompt optimization

In [15]:
%%capture
results, scores = evaluate(
    dataset_path=dataset_path,
    dataset_name=dataset_name,
    dataset_split="dev[:100]",
    model="llama-3-8b",
    temperature=0.0,
    out_dir="../tmp/erx/dspy/before",
)

In [16]:
scores

{'exact.precision': 0.012619047619047618,
 'exact.recall': 0.01310883968972204,
 'exact.f1': 0.012762189200004326,
 'fuzzy.precision': 0.36535511660831355,
 'fuzzy.recall': 0.3532727740199849,
 'fuzzy.f1': 0.3564569059040341}

Let's optimize the program with DSPy.

In [17]:
optimizer_config = {
    "class": "BootstrapFewShotWithRandomSearch",
    "params": {
        "max_bootstrapped_demos": 8,
        "max_labeled_demos": 8,
        "max_rounds": 16,
        "num_candidate_programs": 16,
        "num_threads": 16,
        "max_errors": 10,
        "metric_threshold": None,
        "stop_at_score": None,
    },
    "with_metric": True,
}

train(
    dataset_path=dataset_path,
    dataset_name=dataset_name,
    dataset_split="train[:100]",
    model="llama-3-8b",
    temperature=0.0,
    optimizer_config=optimizer_config,
    out="../tmp/erx/dspy/trained-program.json",
)

Going to sample between 1 and 8 traces per predictor.
Will attempt to bootstrap 16 candidate sets.
Average Metric: 32.95 / 100 (33.0%): 100%|██████████| 100/100 [00:16<00:00,  5.90it/s]

2025/02/14 15:46:28 INFO dspy.evaluate.evaluate: Average Metric: 32.95016723063732 / 100 (33.0%)



New best score: 32.95 for seed -3
Scores so far: [32.95]
Best score so far: 32.95
Average Metric: 33.69 / 100 (33.7%): 100%|██████████| 100/100 [00:17<00:00,  5.85it/s]

2025/02/14 15:46:46 INFO dspy.evaluate.evaluate: Average Metric: 33.69250472074029 / 100 (33.7%)



New best score: 33.69 for seed -2
Scores so far: [32.95, 33.69]
Best score so far: 33.69


  8%|▊         | 8/100 [00:19<03:40,  2.40s/it]


Bootstrapped 8 full traces after 8 examples for up to 16 rounds, amounting to 14 attempts.
Average Metric: 36.90 / 100 (36.9%): 100%|██████████| 100/100 [00:22<00:00,  4.51it/s]

2025/02/14 15:47:28 INFO dspy.evaluate.evaluate: Average Metric: 36.904625085136594 / 100 (36.9%)



New best score: 36.9 for seed -1
Scores so far: [32.95, 33.69, 36.9]
Best score so far: 36.9


  7%|▋         | 7/100 [00:16<03:37,  2.34s/it]


Bootstrapped 7 full traces after 7 examples for up to 16 rounds, amounting to 18 attempts.
Average Metric: 0.00 / 4 (0.0%):   4%|▍         | 4/100 [00:01<00:32,  2.98it/s]

2025/02/14 15:47:49 ERROR dspy.utils.parallelizer: Error processing item Example({'text': 'Alan Bean was a crew member of Apollo 12.\nBorn in Imst (in Austria-Hungary), Alfons Gorbach died in Graz, in Styria.\nAleksandre Guruli played for the Olympique Lyonnais club who play their home games at the Parc Olympique Lyonnais.\nTrane is a manufacturer of building materials, including building management systems and HVAC.\nChristopher Taylor, politician, leads Ann Arbor, Michigan.\nThe ISSN number for Abhandlungen aus dem Mathematischen Seminar der Universitat Hamburg (abbreviating to Abh. Math. Semin. Univ. Hambg.) is 1865-8784. The establishment is concerned with the academic discipline of Pure Mathematics.\nVfl Wolfsburg play in the Bundesliga.', 'triples': ['Alan Bean | mission | Apollo 12', 'Alfons Gorbach | death place | Styria', 'Alfons Gorbach | death place | Graz', 'Alfons Gorbach | birth place | Austria-Hungary', 'Alfons Gorbach | birth place | Imst', 'Aleksandre Guruli | club | O

Average Metric: 0.00 / 4 (0.0%):   5%|▌         | 5/100 [00:04<02:12,  1.39s/it]

2025/02/14 15:47:49 ERROR dspy.utils.parallelizer: Error processing item Example({'text': "Alfred N. Phillips was a member of the US Army, which fought in the Whiskey Rebellion.\nAndrew Rayel is associated with a number of musical artists, these include: Armin Van Buuren, Bobina, Mark Sixma, Jonathan Mendelsohn, Christian Burns, Jwaydan, Alexander Popov, Jano, Alexandre Bergheau, Jonny Rose, Sylvia Tosun, Lira Yin, and Alexandra Badoi.\n1634 The Ram Rebellion comes from the United States where the leader is known as the President and the Native Americans are an ethnic group.\nAlison O'Donnell plays the instrument called the bodhran.\nAMC Matador, known also as the American Motors Matador, has an AMC V8 engine and was made in Australia.\nThe musical genre of american, Ahmet Ertegun, is rhythm and blues, a derivative of which is disco.", 'triples': ['Alfred N. Phillips | military branch | United States Army', 'United States Army | battle | Whiskey Rebellion', 'Andrew Rayel | associated b

Average Metric: 0.00 / 4 (0.0%):   6%|▌         | 6/100 [00:04<01:30,  1.04it/s]

2025/02/14 15:47:50 ERROR dspy.utils.parallelizer: Error processing item Example({'text': 'Doris Bures is the leader of Austria where Alfons Gorbach died in Styria.\n10 Hygiea has an escape velocity of 0.21 kilometres per second and an apoapsis of 523951582.33968 kilometres.\nThe 1st runway at Alderney Airport is made from Poaceae which is member of the Poales order. Poaceae belongs to the Commelinids order, within the flowering plants and classed as Monocotyledon.\n"The Secret Scripture" followed the book "A Long Long Way," which was preceded by "Annie Dunne".', 'triples': ['Austria | leader | Doris Bures', 'Alfons Gorbach | death place | Styria', 'Alfons Gorbach | death place | Austria', '10 Hygiea | escape velocity | 0.21 (kilometrePerSeconds)', '10 Hygiea | apoapsis | 523951582.33968 (kilometres)', 'Poaceae | division | Flowering plant', 'Alderney Airport | 1st runway surface type | Poaceae', 'Poaceae | order | Poales', 'Poaceae | order | Commelinids', 'Poaceae | class | Monocotyle

Average Metric: 0.00 / 4 (0.0%):   6%|▌         | 6/100 [00:05<01:30,  1.04it/s]

2025/02/14 15:47:50 ERROR dspy.utils.parallelizer: Error processing item Example({'text': "AS Roma's manager is Luciano Spalletti, who played for Udinese Calcio. He plays for both, Empoli F.C. and Virtus Entella.\nThe Atlas II is from the United States, where the capital is Washington D.C. and the language English. The leader of the United States is the President of the United States and one of its ethnic groups is Asian Americans.\nThe runway length of Afonso Pena International Airport is 2215.0.\nAWH Engineering College at Kerala has 250 academic staff. Kerala's leader is Kochi.\nAbove the Veil followed the book Aenir.\nThe ISBN number of Aenir is 0-439-17684-0.\nTarrant County, with the largest city of Fort Worth, is home to Arlington, Texas, United States.", 'triples': ['Luciano Spalletti | club | Udinese Calcio', 'Luciano Spalletti | club | Empoli F.C.', 'A.S. Roma | manager | Luciano Spalletti', 'Luciano Spalletti | club | Virtus Entella', 'United States | ethnic group | Asian Am

Average Metric: 0.00 / 4 (0.0%):   7%|▋         | 7/100 [00:05<01:09,  1.34it/s]

2025/02/14 15:47:50 ERROR dspy.utils.parallelizer: Error processing item Example({'text': 'Akeem Priestley is connected to the Orange County Blues Football club which is managed by Oliver Wyss, but he also plays for Jacksonville Dolphins who have their baseball field in the John Sessions stadium.\nWilliam Anders (born in British Hong Kong) was a crew member of Apollo 8 and was selected by NASA in 1963. He retired on 1969-09-01.', 'triples': ['Orange County Blues FC | manager | Oliver Wyss', 'Akeem Priestley | club | Orange County Blues FC', 'Akeem Priestley | club | Jacksonville Dolphins', 'Jacksonville Dolphins | stadium | John Sessions Stadium', 'William Anders | date of retirement | 1969-09-01', 'William Anders | selected by nasa | 1963', 'William Anders | birth place | British Hong Kong', 'William Anders | mission | Apollo 8']}) (input_keys={'text'}): litellm.BadRequestError: OpenAIException - Error code: 400 - {'error': {'message': 'litellm.BadRequestError: OpenAIException - Faile

Average Metric: 0.00 / 4 (0.0%):   9%|▉         | 9/100 [00:05<00:42,  2.15it/s]

2025/02/14 15:47:50 ERROR dspy.utils.parallelizer: Error processing item Example({'text': 'Alpena County Regional Airport is located in Wilson Township, Alpena County, Michigan, United States. The airport has a runway length of 1533.0 and is at an elevation of 210 metres above sea level.\nThe temperature of 1097 Vicia is 171.0 (kelvins) and its apoapsis is 511592000.0 km.\nAdam Holloway was born in Kent and began his career on May 5th 2005. He is a membe of the Conservative Party in the UK and served in the Grenadier Guards. His alma mater is Magdalene College, Cambridge.', 'triples': ['Alpena County Regional Airport | location | Wilson Township, Alpena County, Michigan', 'Wilson Township, Alpena County, Michigan | country | United States', 'Alpena County Regional Airport | elevation above the sea level | 210', 'Alpena County Regional Airport | runway length | 1533.0', '1097 Vicia | apoapsis | 511592000.0 (kilometres)', '1097 Vicia | temperature | 171.0 (kelvins)', 'Adam Holloway | par

Average Metric: 0.00 / 4 (0.0%):   9%|▉         | 9/100 [00:05<00:42,  2.15it/s]

2025/02/14 15:47:50 ERROR dspy.utils.parallelizer: Error processing item Example({'text': "Huseyin Butuner and Hilmi Guner designed the red granite and white marble Baku Turkish Martyrs memorial which is dedicated to the Ottoman army soldiers killed in the Battle of Baku. The memorial is located in Azerbaijan where the leader is Artur Rasizade.\nThe leader of the United States has the title President of the United States.\nCleveland's governing body is Cleveland City Council.", 'triples': ["Baku Turkish Martyrs' Memorial | material | Red granite and white marble", "Baku Turkish Martyrs' Memorial | dedicated to | Ottoman Army soldiers killed in the Battle of Baku", "Baku Turkish Martyrs' Memorial | location | Azerbaijan", 'Azerbaijan | leader | Artur Rasizade', "Baku Turkish Martyrs' Memorial | designer | Hüseyin Bütüner and Hilmi Güner", 'United States | leader title | President of the United States', 'Cleveland | governing body | Cleveland City Council']}) (input_keys={'text'}): litel

Average Metric: 0.00 / 4 (0.0%):  11%|█         | 11/100 [00:05<00:27,  3.25it/s]

2025/02/14 15:47:50 ERROR dspy.utils.parallelizer: Error processing item Example({'text': 'African Americans are one of the ethnic groups in the United States, which is led by the President. Also in the U.S., is Angola which is part of the state of Indiana.\nAlpena, Michigan is located in the United States.\nAaron Hunt plays for SV Werder Bremen which is managed by Viktor Skrypnyk.', 'triples': ['United States | ethnic group | African Americans', 'Angola, Indiana | is part of | Indiana', 'United States | leader title | President of the United States', 'Angola, Indiana | country | United States', 'Alpena, Michigan | country | United States', 'Aaron Hunt | club | SV Werder Bremen', 'SV Werder Bremen | manager | Viktor Skrypnyk']}) (input_keys={'text'}): litellm.BadRequestError: OpenAIException - Error code: 400 - {'error': {'message': 'litellm.BadRequestError: OpenAIException - Failed to deserialize the JSON body into the target type: response_format: missing field `value` at line 1 colu

Average Metric: 0.00 / 4 (0.0%):  12%|█▏        | 12/100 [00:05<00:23,  3.76it/s]

BadRequestError: litellm.BadRequestError: OpenAIException - Error code: 400 - {'error': {'message': 'litellm.BadRequestError: OpenAIException - Failed to deserialize the JSON body into the target type: response_format: missing field `value` at line 1 column 9585\nReceived Model Group=llama-3-8b\nAvailable Model Group Fallbacks=None', 'type': None, 'param': None, 'code': '400'}}

In [18]:
results, scores = evaluate(
    dataset_path=dataset_path,
    dataset_name=dataset_name,
    dataset_split="dev[:100]",
    model="llama-3-8b",
    temperature=0.0,
    out_dir="../tmp/erx/dspy/after",
    load_from="../tmp/erx/dspy/trained-program.json",
)

Average Metric: 39.40 / 100 (39.4%): 100%|██████████| 100/100 [00:18<00:00,  5.47it/s]

2025/02/14 13:10:40 INFO dspy.evaluate.evaluate: Average Metric: 39.40046495393733 / 100 (39.4%)



