In [67]:
import importlib
import extractor
importlib.reload(extractor)
import crawl
importlib.reload(crawl)


<module 'crawl' from '/Users/davidburke/src/pragmatic_dspy/crawl.py'>

In [68]:
#all env variables are in local.env 
import dotenv
dotenv.load_dotenv('local.env')
import dspy
from dspy.evaluate import Evaluate
import os
from langfuse import Langfuse
from langfuse.api.resources.commons.types import DatasetStatus
from typing import Literal
from bs4 import BeautifulSoup
import requests
import json
import glob
from datetime import datetime
#DSPy is using litellm to call the LLM so its useful to tweak the litellm config
import litellm
from extractor import GroundTruthEvaluator
from extractor import Extractor
import uuid



In [14]:
#Ensure the env variables below are set in local.env or whatever you're using to run this notebook
litellm.success_callback = ["langfuse"]
litellm.failure_callback = ["langfuse"]
#langfuse client is used for storing and retrieving datasets
langfuse = Langfuse(secret_key=os.getenv('LANGFUSE_SECRET_KEY'),
    public_key=os.getenv('LANGFUSE_PUBLIC_KEY'),
    host=os.getenv('LANGFUSE_HOST'))

#4o-mini was quick to get up and running
lm = dspy.LM('openai/gpt-4o-mini', api_key=os.getenv('OPENAI_API_KEY'),temperature=0.0)
dspy.configure(lm=lm,suppress_debug_info=False)

In [5]:
@observe(as_type=Literal["trace"])
def fetch_html(url: str) -> str:
    """Fetch HTML content from a URL.
    
    Args:
        url: The URL to fetch HTML from
        
    Returns:
        The HTML content as a string, with just the contents of the <body> tag
    """
    response = requests.get(url)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    body = soup.find('body')
    return body.decode_contents() if body else ''

In [31]:
#extractor is a very simple Signature that expects html and url as input
# lets start with a trivial synthetic example to see if it works
prediction = extractor.extract(html="""
<html>
<body>
<h1>Event Title</h1>
<p>Event Description</p>
<p>Event Location</p>
<p>Start: 1pm 12th December</p>
<p>End: 2pm 12th December</p>
</body>
</html>
""",url="https://bobsevents.com/event1")
print(prediction)


Prediction(
    title='Event Title',
    description='Event Description',
    location='Event Location',
    start_time='1pm 12th December',
    end_time='2pm 12th December'
)


In [109]:

#set the url to the event page you want to extract the event details from
url = "https://theworkmansclub.com/events/overhead-the-albatross-4/"
html = fetch_html(url)
simple_extractor = Extractor()
simple_extractor.extract(html, url)


Prediction(
    title='OVERHEAD, THE ALBATROSS',
    description='To celebrate the release of their sophomore album titled “I Leave You This”, Overhead, The Albatross will be playing an intimate gig showcasing the new material in the Workman’s Cellar on November 23rd.',
    location='The Workman’s Cellar, Dublin, Ireland',
    start_time='2024-11-23T20:00:00',
    end_time='2024-11-23T22:00:00'
)

In [110]:
lm.history[-1]

{'prompt': None,
 'messages': [{'role': 'system',
   'content': 'Your input fields are:\n1. `url` (str): The url of the event page\n2. `html` (str): The html of the event page\n3. `title` (str): The extracted title of the event\n4. `description` (str): The extracted description of the event\n5. `location` (str): The extracted location of the event\n6. `start_time` (str): The extracted start time of the event\n7. `end_time` (str): The extracted end time of the event.\n\nYour output fields are:\n1. `score` (float): How close the extracted event information is to the ground truth from the html, out of 1. Inferred information should be scored lower than a lack of information\n2. `reasoning` (str): The reasoning for the score\n\nAll interactions will be structured in the following way, with the appropriate values filled in.\n\n[[ ## url ## ]]\n{url}\n\n[[ ## html ## ]]\n{html}\n\n[[ ## title ## ]]\n{title}\n\n[[ ## description ## ]]\n{description}\n\n[[ ## location ## ]]\n{location}\n\n[[ #

In [36]:
from crawl import crawl
#Optional crawls the websites(subpages of the url) and saves the html to a file in data folder
urls = []
for url in urls:
    crawl(url)

{'html': '<html><body style="margin:0"><div '
         'src="https://geo.captcha-delivery.com/captcha/?initialCid=AHrlqAAAAAMAf2xl1y6_XVkAwY4n_A==&amp;cid=9_QdmrUGqH6FKNuONrOkEVk21s7rqH9gfwcAxEU2BNrJJqh5ZOeRkFBs4IiXejnPso_6hxyDoVEucRINgZwcS_08CtnAuf0P2aMg96lKPhWWrICZd2h_r6HodQ~qad_1&amp;referer=https%3A%2F%2Fra.co%2Fevents%2Fie%2Fdublin&amp;hash=107A2F9ACF118F5EFF46550CD47084&amp;t=fe&amp;s=41462&amp;e=2dc03465270799d3fee2b82ede7b9de2495b642a5a66cfae470357823a0721ce&amp;ir=37&amp;dm=dc_ir" '
         'sandbox="allow-scripts allow-same-origin allow-forms" width="100%" '
         'height="100%" style="height:100vh;" frameborder="0" border="0" '
         'scrolling="yes" data-original-tag="iframe"><base '
         'target="_parent"><link rel="stylesheet" type="text/css" '
         'href="cid:css-51ba8fef-58d7-40ad-96d1-57b5aeb9533b@mhtml.blink"><link '
         'rel="stylesheet" type="text/css" '
         'href="cid:css-a271e1d5-1c94-4ff2-b523-dc92afa85552@mhtml.blink">\n'
         '     

In [48]:
#run all the examples from the data folder through the extractor

def run(data_dir: str, extractor: Extractor):
    run_id = str(uuid.uuid4())
    print(f"Running with run_id: {run_id}")
    json_files = glob.glob(os.path.join(data_dir, "*.json"))
    for json_file in json_files:
        with open(json_file, "r+") as f:
            if f.readable() and f.read().strip(): 
                f.seek(0)
                data = json.load(f)
                soup = BeautifulSoup(data["html"], 'html.parser')
                body = soup.find('body')
                for script in soup.find_all('script'):
                    script.decompose()
                extractor.extract(html=body.decode_contents(), url=data["metadata"]["url"],run_id=run_id)
            else:
                print(f"file not readable:{json_file}")
    return run_id

In [49]:
run("data/synthetic",simple_extractor)
#you may want to run for other websites
#run("data/<amazingevents>",simple_extractor)

Running with run_id: 63fda532-8663-450c-a4c4-de02fb356a82


'63fda532-8663-450c-a4c4-de02fb356a82'

In [None]:
#now I can easily do a quick check of the results in langtrace using the run_id tag to filter

In [74]:
#much like fine tuning a model in fastai I can now create a dataset to optimise the extractor
#ill bootstrap using the simple_extractor and then fine tune from there
#of course I could write some code to set the current output as the expected output but i've already reviews traces I am reasonable happy with so Ill create the dataset from them

def add_traces_to_dataset(run_id: str, dataset_name: str):
    langfuse.create_dataset(name=dataset_name,description="pragmatic dspy events")
    traces = langfuse.fetch_traces(tags=[run_id])
    for trace in traces.data:
        langfuse.create_dataset_item(dataset_name=dataset_name,input=trace.input,expected_output=trace.output,metadata={"trace_run_id":run_id})

In [76]:
#add whatever runs we are happy with to the dataset
add_traces_to_dataset('63fda532-8663-450c-a4c4-de02fb356a82','events')


In [111]:

ground_truth_evaluator = GroundTruthEvaluator()
def run_evaluation(name, model: str = "openai/gpt-4o-mini",dataset_name: str = "events", extractor: Extractor = Extractor(), evaluator: GroundTruthEvaluator = GroundTruthEvaluator()):
    dataset = langfuse.get_dataset(dataset_name)
    for item in dataset.items:
        print(item.id)
        if item.status != DatasetStatus.ARCHIVED:
            with item.observe(run_name=name, run_metadata={"model": model}) as trace_id:
                pred = extractor.extract(html=item.input["kwargs"]["html"], url=item.input["kwargs"]["url"],model=model)
                eval = evaluator.evaluate(html=item.input["kwargs"]["html"], url=item.input["kwargs"]["url"], title=pred.title, description=pred.description, location=pred.location, start_time=pred.start_time, end_time=pred.end_time)
                langfuse.score(name="GroundTruthEvaluator",value=eval.score,trace_id=trace_id,comment=eval.reasoning)


In [112]:
run_evaluation("baseline",dataset_name="events")


9fca1abf-9c5c-4116-afa0-7f246a9ac906
23b084cd-cf8e-4db2-ab27-5bde8ee73dd4
c2b1db76-ab44-46e1-9037-62b2024c75e2


In [80]:
#one thing to notice is that html pages can have very large token size even with just the body tag so 
#I've written a quick DSPy program to exclude pages with many events and limit to only the relvant html
#running across the data set it appears very reliable
from extractor import singular_event_page_evaluator
ds = langfuse.get_dataset("events")
ds_reduced = langfuse.create_dataset(name="events_reduced",description="pragmatic dspy events reduced")
for item in ds.items:
    pred = singular_event_page_evaluator(html=item.input["kwargs"]["html"], url=item.input["kwargs"]["url"])
    if pred.is_singular:
        langfuse.create_dataset_item(dataset_name=ds_reduced.name,input=item.input,expected_output=item.expected_output,metadata=item.metadata)



In [None]:
ground_truth_evaluator = GroundTruthEvaluator()
test_html = """
<div class="event-details">
    <h1>An Event</h1>
    <p class="description">Great event</p>
    <div class="location">Dublin</div>
    <div class="datetime">
        <time datetime="2024-12-12T13:00:00">1pm, 12th December 2024</time>
    </div>
</div>
"""
eval1 = ground_truth_evaluator.evaluate(html=test_html, url="https://bobsevents.com/event1", title="An Event", description="Great event", location="Dublin", start_time="2024-12-12T13:00:00", end_time=None)
eval2 = ground_truth_evaluator.evaluate(html=test_html, url="https://bobsevents.com/event1", title="An Event", description="Great event", location="Dublin", start_time="2024-12-12T13:00:00", end_time="2024-12-12T15:00:00")
print(eval1)
print(eval2)

In [85]:
#Now I can already see that the extractor often speculates on an end time when there is none specified
#I'd prefer to have the end date set to be None if there is ground truth in the html
#so I can now create a new dataset with the reduced set of pages and optimise the extractor
ds = langfuse.get_dataset("events_reduced")
eval_dataset = langfuse.create_dataset(name="eval_dataset",description="pragmatic dspy events reduced optimised")
for item in ds.items:
    if item.status != DatasetStatus.ARCHIVED:
        input = {"html":item.input["kwargs"]["html"],"url":item.input["kwargs"]["url"],"title":item.expected_output['_store']['title'],"description":item.expected_output['_store']['description'],"location":item.expected_output['_store']['location'],"start_time":item.expected_output['_store']['start_time'],"end_time":item.expected_output['_store']['end_time']}
        score = ground_truth_evaluator.evaluate(**input)
        if item.expected_output['_store']['end_time'] is None:
            score = 1
        langfuse.create_dataset_item(dataset_name=eval_dataset.name,input=input,expected_output=score)


{'_store': {'title': 'Tech Meetup Dublin', 'end_time': '2024-03-15T21:00:00', 'location': 'The Marker Hotel, Grand Canal Square, Dublin 2', 'start_time': '2024-03-15T18:00:00', 'description': "Join us for an evening of tech talks and networking in Dublin's Silicon Docks."}, '_completions': {'signature': {'__dict__': '<mappingproxy>', '__pydantic_extra__': '<member_descriptor>', '__pydantic_private__': '<member_descriptor>', '__pydantic_fields_set__': '<member_descriptor>'}, '_completions': {'title': ['Tech Meetup Dublin'], 'end_time': ['2024-03-15T21:00:00'], 'location': ['The Marker Hotel, Grand Canal Square, Dublin 2'], 'start_time': ['2024-03-15T18:00:00'], 'description': ["Join us for an evening of tech talks and networking in Dublin's Silicon Docks."]}}}


In [86]:
#split the dataset into train and test
dataset = langfuse.get_dataset("eval_dataset")
evaluator_examples = []
for item in dataset.items:
    evaluator_examples.append(dspy.Example(html=item.input["html"], url=item.input["url"], title=item.input["title"], description=item.input["description"], location=item.input["location"], start_time=item.input["start_time"], end_time=item.input["end_time"],score=item.expected_output["score"],reasoning=item.expected_output["reasoning"]).with_inputs("html","url","title","description","location","start_time","end_time"))
evaluator_train, evaluator_test = evaluator_examples[:int(len(evaluator_examples)*0.8)], evaluator_examples[int(len(evaluator_examples)*0.8):]
len(evaluator_train), len(evaluator_test), len(evaluator_examples)

(0, 1, 1)

In [None]:
def same_score(example, prediction, trace=None):
    return 1- abs(example.score - prediction.score) 

tp = dspy.MIPROv2(metric=same_score, auto="heavy", num_threads=6)
ground_truth_evaluator = tp.compile(ground_truth_evaluator, trainset=evaluator_train, max_bootstrapped_demos=2, max_labeled_demos=2, requires_permission_to_run=False)
ground_truth_evaluator.save("gteo.json")

In [103]:
ground_truth_evaluator = GroundTruthEvaluator(config_file="gteo.json")
eval1 = ground_truth_evaluator.evaluate(html=test_html, url="https://bobsevents.com/event1", title="An Event", description="Great event", location="Dublin", start_time="2024-12-12T13:00:00", end_time=None)
eval2 = ground_truth_evaluator.evaluate(html=test_html, url="https://bobsevents.com/event1", title="An Event", description="Great event", location="Dublin", start_time="2024-12-12T13:00:00", end_time="2024-12-12T15:00:00")
print(eval1)
print(eval2)

Prediction(
    score=0.8,
    reasoning='The extracted title, description, and location match the information found in the HTML. However, the end time is missing, which is a significant piece of information for an event. The start time is correctly extracted, but since the end time is not provided, it affects the overall score. Therefore, the score reflects a high accuracy for the available information but deducts points for the lack of end time.'
)
Prediction(
    score=0.8,
    reasoning='The extracted title, description, and location match the information found in the HTML. However, the end time is inferred and not explicitly stated in the HTML, which lowers the score. The start time is correctly extracted from the HTML. Overall, the information is mostly accurate, but the inferred end time affects the score.'
)


# now I can optimise the extractor
# first I need to split the dataset into train and test
# save the dataset back to langfuse

In [106]:
import random
random.seed(42)
from langfuse.api.resources.commons.types import DatasetStatus
dataset = langfuse.get_dataset("events_reduced")
train = []
test = []
data_size = len(dataset.items)
for item in dataset.items:
    if item.status != DatasetStatus.ARCHIVED:
        metadata = item.metadata
        if metadata is None:
            metadata = {}
        if random.random() < 0.8:
            train.append(dspy.Example(html=item.input["kwargs"]["html"], url=item.input["kwargs"]["url"], title=item.expected_output["_store"]["title"], description=item.expected_output["_store"]["description"], location=item.expected_output["_store"]["location"], start_time=item.expected_output["_store"]["start_time"], end_time=item.expected_output["_store"]["end_time"]).with_inputs("html","url"))
            metadata["set"] = "train"
        else:
            test.append(dspy.Example(html=item.input["kwargs"]["html"], url=item.input["kwargs"]["url"], title=item.expected_output["_store"]["title"], description=item.expected_output["_store"]["description"], location=item.expected_output["_store"]["location"], start_time=item.expected_output["_store"]["start_time"], end_time=item.expected_output["_store"]["end_time"]).with_inputs("html","url"))
            metadata["set"] = "test"
        langfuse.create_dataset_item("events_reduced",item.input,item.expected_output,metadata,item.source_trace_id,item.source_observation_id,item.status,item.id)
len(train), len(test), len(dataset.items)


(1, 0, 1)

In [None]:
@observe()
def ogt_evaluate(example, prediction, trace=None):
    ground_truth_evaluator = GroundTruthEvaluator(config_file="gteo.json")
    return ground_truth_evaluator.evaluate(html=example.html, url=example.url, title=prediction.title, description=prediction.description, location=prediction.location, start_time=prediction.start_time, end_time=prediction.end_time).score

def run_optimisation():
    prompt_lm = dspy.LM(model="bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0",temperature=0.0,aws_region_name="eu-central-1",cooldown_time=30,provide_traceback=True)
    tp = dspy.MIPROv2(metric=ogt_evaluate, auto="medium", num_threads=3,verbose=True, prompt_model=prompt_lm, task_model=prompt_lm,max_errors=20)
    return tp.compile(simple_extractor.extract, trainset=train, max_bootstrapped_demos=2, max_labeled_demos=5, requires_permission_to_run=False)

optimised_extractor = run_optimisation()
optimised_extractor.save("optimised_extractor.json")


In [115]:
run_evaluation("optimised",dataset_name="events_reduced",extractor=Extractor(config_file="optimized_extract_v5.json"),evaluator=GroundTruthEvaluator(config_file="gteo.json"))


12241d32-f1cd-4330-b7dc-2579ed3beaff
