# HTML Retrieval Exploration

This notebook explores retrieval strategies over raw HTML to help identify relevant content for extraction.

Goals:
- Load saved scenario HTML files.
- Implement multiple retrieval strategies:
  - Keyword scoring over DOM blocks
  - TF-IDF vector similarity
  - BM25 ranking
  - Show DOM-based context (CSS-like selectors) for interpretability
- Provide a simple runner to compare results per query.


In [95]:
import os
from pathlib import Path
import re
from collections import Counter

import numpy as np
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rank_bm25 import BM25Okapi

DATA_DIR = Path("../data/html").resolve()
print("Data dir:", DATA_DIR)

SCENARIOS = {
    "scenario1_books": DATA_DIR / "scenario1_books.html",
    "scenario2_jobs": DATA_DIR / "scenario2_jobs.html",
    "scenario3_clubs": DATA_DIR / "scenario3_clubs.html",
    "scenario4_property": DATA_DIR / "scenario4_property.html",
}

assert all(p.exists() for p in SCENARIOS.values()), "One or more scenario HTML files are missing."


Data dir: /Users/ardyh/Documents/job-applications/mrscraper/data/html


In [96]:
def read_html(path: Path) -> str:
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        return f.read()


def tokenize(text: str):
    # Simple alnum tokens, lowercase
    return re.findall(r"[A-Za-z0-9_]+", text.lower())


def css_path(tag):
    # Build a CSS-like path for a BeautifulSoup Tag
    parts = []
    el = tag
    while el and getattr(el, "name", None) and el.name != "[document]":
        sibling_index = 1
        sib = el
        while sib.previous_sibling is not None:
            sib = sib.previous_sibling
            if getattr(sib, "name", None) == el.name:
                sibling_index += 1
        parts.append(f"{el.name}:nth-of-type({sibling_index})")
        el = el.parent
    parts.reverse()
    return " > ".join(parts)


def block_text(block: dict) -> str:
    """Return the canonical text representation for a block."""
    return block.get("text") or block.get("content") or ""


def get_blocks(html: str, max_len: int = 50000):
    # Extract text blocks from common container elements
    soup = BeautifulSoup(html, "lxml")
    candidates = soup.find_all(["p", "li", "div", "article", "section", "tr"])
    blocks = []
    for c in candidates:
        txt = c.get_text(" ", strip=True)
        if not txt:
            continue
        # Filter very short/very long
        if len(txt) < 30:
            continue
        html_fragment = str(c)
        if len(txt) > max_len:
            txt = txt[:max_len]
        if len(html_fragment) > max_len:
            html_fragment = html_fragment[:max_len]
        blocks.append({
            "text": txt,
            "html": html_fragment,
            "content": txt,
            "path": css_path(c),
            "tag": c.name,
        })
    return blocks


def show_top(results, top_k=5):
    for i, r in enumerate(results[:top_k], 1):
        print(f"#{i} score={r['score']:.4f} tag={r['item']['tag']}")
        print("path:", r["item"]["path"])
        preview = block_text(r["item"])[:300].replace("\n", " ")
        print(preview)
        print("-")


In [97]:
def keyword_retrieval(blocks, query: str, top_k=5):
    qtokens = tokenize(query)
    qset = set(qtokens)
    results = []
    for b in blocks:
        tokens = tokenize(block_text(b))
        counts = Counter(tokens)
        score = sum(counts[t] for t in qset)
        if score:
            results.append({"item": b, "score": float(score)})
    results.sort(key=lambda x: x["score"], reverse=True)
    return results[: top_k * 5]  # oversample, then show_top trims


def tfidf_retrieval(blocks, query: str, top_k=5):
    texts = [block_text(b) for b in blocks]
    vectorizer = TfidfVectorizer(min_df=2, max_df=0.9)
    X = vectorizer.fit_transform(texts)
    q = vectorizer.transform([query])
    sims = cosine_similarity(q, X)[0]
    order = np.argsort(-sims)
    results = [{"item": blocks[i], "score": float(sims[i])} for i in order[: top_k * 5]]
    return results


def bm25_retrieval(blocks, query: str, top_k=5):
    tokenized_corpus = [tokenize(block_text(b)) for b in blocks]
    bm25 = BM25Okapi(tokenized_corpus)
    scores = bm25.get_scores(tokenize(query))
    order = np.argsort(-scores)
    results = [{"item": blocks[i], "score": float(scores[i])} for i in order[: top_k * 5]]
    return results


In [98]:
def run_strategies(html_path: Path, query: str, top_k=5):
    html = read_html(html_path)
    blocks = get_blocks(html)
    print(f"Loaded {len(blocks)} blocks from {html_path.name}")

    print("\n[Keyword Retrieval]")
    kw = keyword_retrieval(blocks, query, top_k=top_k)
    show_top(kw, top_k)

    print("\n[TF-IDF Retrieval]")
    tf = tfidf_retrieval(blocks, query, top_k=top_k)
    show_top(tf, top_k)

    print("\n[BM25 Retrieval]")
    bm = bm25_retrieval(blocks, query, top_k=top_k)
    show_top(bm, top_k)

    return {"keyword": kw, "tfidf": tf, "bm25": bm}


## Try the four scenarios
You can tweak `top_k` and the queries below. Results print top matching DOM blocks with a CSS-like path and a text preview.


In [99]:
# Scenario 1 — Books to Scrape
run_strategies(
    SCENARIOS["scenario1_books"],
    query="Can you return me the books: name and price?",
    top_k=5,
)


Loaded 52 blocks from scenario1_books.html

[Keyword Retrieval]
#1 score=17.0000 tag=div
path: html:nth-of-type(1) > body:nth-of-type(1) > div:nth-of-type(1)
Home All products Books Travel Mystery Historical Fiction Sequential Art Classics Philosophy Romance Womens Fiction Fiction Childrens Religion Nonfiction Music Default Science Fiction Sports and Games Add a comment Fantasy New Adult Young Adult Science Poetry Paranormal Art Psychology Autobiography 
-
#2 score=17.0000 tag=div
path: html:nth-of-type(1) > body:nth-of-type(1) > div:nth-of-type(1) > div:nth-of-type(1)
Home All products Books Travel Mystery Historical Fiction Sequential Art Classics Philosophy Romance Womens Fiction Fiction Childrens Religion Nonfiction Music Default Science Fiction Sports and Games Add a comment Fantasy New Adult Young Adult Science Poetry Paranormal Art Psychology Autobiography 
-
#3 score=17.0000 tag=div
path: html:nth-of-type(1) > body:nth-of-type(1) > div:nth-of-type(1) > div:nth-of-type(1) > div:

    'path': 'html:nth-of-type(1) > body:nth-of-type(1) > div:nth-of-type(1)',
    'tag': 'div'},
   'score': 17.0},
    'path': 'html:nth-of-type(1) > body:nth-of-type(1) > div:nth-of-type(1) > div:nth-of-type(1)',
    'tag': 'div'},
   'score': 17.0},
    'path': 'html:nth-of-type(1) > body:nth-of-type(1) > div:nth-of-type(1) > div:nth-of-type(1) > div:nth-of-type(1)',
    'tag': 'div'},
   'score': 17.0},
    'path': 'html:nth-of-type(1) > body:nth-of-type(1) > div:nth-of-type(1) > div:nth-of-type(1) > div:nth-of-type(1) > div:nth-of-type(1)',
    'tag': 'div'},
   'score': 14.0},
    'path': 'html:nth-of-type(1) > body:nth-of-type(1) > div:nth-of-type(1) > div:nth-of-type(1) > div:nth-of-type(1) > div:nth-of-type(1) > section:nth-of-type(1)',
    'tag': 'section'},
   'score': 14.0},
  {'item': {'text': "A Light in the ... £51.77 In stock Add to basket Tipping the Velvet £53.74 In stock Add to basket Soumission £50.10 In stock Add to basket Sharp Objects £47.82 In stock Add to baske

In [100]:
# Scenario 2 — Job Listings
run_strategies(
    SCENARIOS["scenario2_jobs"],
    query="Extract job title, location, salary, and company name from the listings",
    top_k=5,
)


Loaded 201 blocks from scenario2_jobs.html

[Keyword Retrieval]
#1 score=161.0000 tag=div
path: html:nth-of-type(1) > body:nth-of-type(1) > div:nth-of-type(1)
Job Search Doctors Resources Locum Doctor Jobs Australia Locum Doctor Jobs New Zealand Permanent Doctor Jobs Doctor Jobs in Australia Doctor Jobs in New Zealand International Medical Graduates Jobs for Senior Doctors Jobs for Junior Doctors Employers About us Our Services Our Consultant Team Our Reg
-
#2 score=161.0000 tag=div
path: html:nth-of-type(1) > body:nth-of-type(1) > div:nth-of-type(1) > div:nth-of-type(1)
Job Search Doctors Resources Locum Doctor Jobs Australia Locum Doctor Jobs New Zealand Permanent Doctor Jobs Doctor Jobs in Australia Doctor Jobs in New Zealand International Medical Graduates Jobs for Senior Doctors Jobs for Junior Doctors Employers About us Our Services Our Consultant Team Our Reg
-
#3 score=158.0000 tag=div
path: html:nth-of-type(1) > body:nth-of-type(1) > div:nth-of-type(1) > div:nth-of-type(1) > d

{'keyword': [{'item': {'text': "Job Search Doctors Resources Locum Doctor Jobs Australia Locum Doctor Jobs New Zealand Permanent Doctor Jobs Doctor Jobs in Australia Doctor Jobs in New Zealand International Medical Graduates Jobs for Senior Doctors Jobs for Junior Doctors Employers About us Our Services Our Consultant Team Our Registrar & RMO Team Careers Refer a Friend B Corp FAQS Articles Contact us Sign in Join us Search results Save this search to receive job alerts by email when new jobs match. Save\xa0alert 2247 jobs found Sort By Resident Medical Officer SAVE JOB Emergency Medicine (ED) North Tamworth , New South Wales AU Locum $160 per hour 18 Dec 2025 ~ 18 Dec 2025 This public hospital in Australia is located in a bustling city that offers a unique blend of urban and rural living. The hospital is situated in a picturesque location surrounded by rolling hills and lush greenery, providing a serene and peaceful environment for patients and staff alike. The hos... Share See Detail

In [101]:
# Scenario 3 — Club Listing
run_strategies(
    SCENARIOS["scenario3_clubs"],
    query="Get the club names, logo image links and their official websites",
    top_k=5,
)


Loaded 95 blocks from scenario3_clubs.html

[Keyword Retrieval]
#1 score=27.0000 tag=div
path: html:nth-of-type(1) > body:nth-of-type(1) > div:nth-of-type(1)
Skip to main content Skip to primary navigation Arizona Soccer Association Menu Menu About ASA Staff Member Clubs Board of Directors Bylaws/Policies/Meeting Minutes Arizona Soccer Foundation MaxInMotion Scholarship Player Safety Safeguarding Upcoming Camps Member Benefits Contact Us Diversity, Equit
-
#2 score=17.0000 tag=div
path: html:nth-of-type(1) > body:nth-of-type(1) > div:nth-of-type(1) > div:nth-of-type(3)
Member Clubs This is a list of every current member club of the Arizona Soccer Association. To identify a club near you please use the map or dropdown list below. Phoenix Arizona Soccer Academy Arizona Storm AYSO United AZ Arsenal AZFC Select AZ Golden Eagles FC Arizona Soccer Club AZ Inferno Brazas
-
#3 score=17.0000 tag=div
path: html:nth-of-type(1) > body:nth-of-type(1) > div:nth-of-type(1) > div:nth-of-type(3) > div:

{'keyword': [{'item': {'text': "Skip to main content Skip to primary navigation Arizona Soccer Association Menu Menu About ASA Staff Member Clubs Board of Directors Bylaws/Policies/Meeting Minutes Arizona Soccer Foundation MaxInMotion Scholarship Player Safety Safeguarding Upcoming Camps Member Benefits Contact Us Diversity, Equity, and Inclusion Annual General Meeting Competitions Global Credit Union Arizona Advanced Leagues MaxInMotion Open League Global Credit Union Youth Academy State Cup Driven by Desert Ford Dealers State Cup Merchandise Presidents Cup Driven by Desert Ford Dealers Presidents Cup Merchandise 2025-2026 Sanctioned Club Tournaments Youth Academy Tournament Series ODP ODP Homepage ODP Calendar Tryouts Register for Tryouts ODP Pool Training Rosters Girls Pool Training Rosters Boys Pool Training Rosters Register for Pool Training ODP Staff Camps/Clinics Event Page: 2026 ODP Far West Championships Coaching Coaching Course Schedule National License Overview Course Descri

In [102]:
# Scenario 4 — Hidden Information
run_strategies(
    SCENARIOS["scenario4_property"],
    query="Return the property name, address, latitude and longitude",
    top_k=5,
)


Loaded 121 blocks from scenario4_property.html

[Keyword Retrieval]
#1 score=48.0000 tag=div
path: html:nth-of-type(1) > body:nth-of-type(1) > div:nth-of-type(1)
Questions? Call us ・ (877) 640-7787 Park City, UT Silverado Bedrooms 4 Beds 5 Bathrooms 4 Guests 11 Communal Pool Sparkling Clean Check our enhanced cleaning and sanitation protocols. Show details Work-Friendly Space Laptop friendly with high-speed WiFi. Tech-Enabled Locks Check yourself in with the
-
#2 score=42.0000 tag=div
path: html:nth-of-type(1) > body:nth-of-type(1) > div:nth-of-type(1) > div:nth-of-type(4)
Questions? Call us ・ (877) 640-7787 Park City, UT Silverado Bedrooms 4 Beds 5 Bathrooms 4 Guests 11 Communal Pool Sparkling Clean Check our enhanced cleaning and sanitation protocols. Show details Work-Friendly Space Laptop friendly with high-speed WiFi. Tech-Enabled Locks Check yourself in with the
-
#3 score=42.0000 tag=div
path: html:nth-of-type(1) > body:nth-of-type(1) > div:nth-of-type(1) > div:nth-of-type(4) > 

{'keyword': [{'item': {'text': "Questions? Call us ・ (877) 640-7787 Park City, UT Silverado Bedrooms 4 Beds 5 Bathrooms 4 Guests 11 Communal Pool Sparkling Clean Check our enhanced cleaning and sanitation protocols. Show details Work-Friendly Space Laptop friendly with high-speed WiFi. Tech-Enabled Locks Check yourself in with the keypad. Welcome to Silverado. Modern mountain charm and sophistication come together to provide the ideal haven for family ski vacations or romantic mountain getaways. Majestic views, beautiful hardwood floors, and timeless finishes make this mountain retreat feel just like home. After a day on the slopes, warm up by the fireplace on the plush sofa with the après-ski snacks you prepared in the state-of-the-art kitchen and take in the majestic mountain landscape. When you've finished shredding for the day, head to the hot tub on the balcony and enjoy some much-deserved rest and relaxation. Local Attractions: Deer Valley Mountain Resort, Main Street, The Farm R

## LLM via LM Studio
This section adds a minimal client for LM Studio's OpenAI-compatible API (`/v1/chat/completions`). Configure:
- `LMSTUDIO_BASE_URL` (default `http://localhost:1234/v1`)
- `LMSTUDIO_MODEL` (name of the loaded model in LM Studio)
- `LMSTUDIO_API_KEY` (optional; some setups accept any token)


In [103]:
import os
import json
import time
import textwrap
import subprocess
from pathlib import Path
from typing import List, Dict, Any, Optional

import httpx

LMSTUDIO_BASE_URL = os.getenv("LMSTUDIO_BASE_URL", "http://localhost:1234/v1")
LMSTUDIO_MODEL = os.getenv("LMSTUDIO_MODEL", "qwen2.5-coder-7b-instruct")  # set in your env or here
LMSTUDIO_API_KEY = os.getenv("LMSTUDIO_API_KEY", "lm-studio")  # some setups accept any token


def lmstudio_chat(messages: List[Dict[str, str]], model: Optional[str] = None,
                   temperature: float = 0.0, max_tokens: int = 1024) -> str:
    assert LMSTUDIO_MODEL or model, "LMSTUDIO_MODEL not set. Please set the env var or pass model=."
    url = f"{LMSTUDIO_BASE_URL}/chat/completions" if LMSTUDIO_BASE_URL.endswith("/v1") else f"{LMSTUDIO_BASE_URL}/v1/chat/completions"
    headers = {"Authorization": f"Bearer {LMSTUDIO_API_KEY}"}
    payload = {
        "model": model or LMSTUDIO_MODEL,
        "messages": messages,
        "temperature": temperature,
        "max_tokens": max_tokens,
        "stream": False,
    }
    with httpx.Client(timeout=120) as client:
        resp = client.post(url, headers=headers, json=payload)
        resp.raise_for_status()
        data = resp.json()
    try:
        return data["choices"][0]["message"]["content"]
    except Exception:
        return json.dumps(data, indent=2)



In [104]:
from bs4 import BeautifulSoup

def html_skeleton(html: str, max_depth: int = 3, max_children: int = 6) -> str:
    """Return a compact tree-like skeleton of the HTML (tags + ids/classes),
    limited by depth and children per node for readability.
    """
    soup = BeautifulSoup(html, "lxml")
    root = soup.body or soup

    def node_label(el):
        if not getattr(el, "name", None) or el.name == "[document]":
            return None
        ident = []
        el_id = el.get("id")
        if el_id:
            ident.append(f"#{el_id}")
        classes = el.get("class")
        if classes:
            ident.extend([f".{c}" for c in classes[:3]])
        return el.name + ("" if not ident else " " + "".join(ident))

    def walk(el, depth: int) -> str:
        if depth > max_depth:
            return ""
        label = node_label(el)
        if not label:
            return ""
        lines = [label]
        children = [c for c in el.children if getattr(c, "name", None)]
        for child in children[:max_children]:
            child_repr = walk(child, depth + 1)
            if child_repr:
                lines.append("  " * depth + "- " + child_repr.replace("\n", f"\n{'  ' * depth}  "))
        if len(children) > max_children:
            lines.append("  " * depth + f"- ... ({len(children) - max_children} more)")
        return "\n".join(lines)

    return walk(root, 1) or "(empty)"


In [105]:
# Demo: HTML skeleton on Scenario 1 (first 120 lines)
html_1 = read_html(SCENARIOS["scenario1_books"])
skel = html_skeleton(html_1, max_depth=3, max_children=6)
print("\n".join(skel.splitlines()[:120]))


body #default.default
  - header .header.container-fluid
        - div .page_inner
  - div .container-fluid.page
        - div .page_inner
  - footer .footer.container-fluid
  - script
  - script
  - script
  - ... (4 more)


In [106]:
CODE_DIR = Path("../generated").resolve()
CODE_DIR.mkdir(parents=True, exist_ok=True)


def extract_code_block(text: str) -> str:
    import re
    match = re.search(r"```(?:python)?\s*([\s\S]+?)```", text)
    return (match.group(1) if match else text).strip()


In [107]:
# Demo: end-to-end on Scenario 1
run_test = False
if run_test:
    html = read_html(SCENARIOS["scenario1_books"])
    blocks = get_blocks(html)
    retrieved = bm25_retrieval(blocks, "Can you return me the books: name and price?", top_k=5)
    skel = html_skeleton(html, max_depth=3, max_children=6)

    res = codegen_execute_retry(
        query="Can you return me the books: name and price?",
        html=html,
        retrieved=retrieved,
        skeleton_text=skel,
        max_iterations=3,
    )
    print("\nSuccess:", res["success"]) 
    print("Iterations:", res["iterations"]) 
    if res["success"]:
        print(json.dumps(res["result"]["json"], indent=2)[:2000])
    else:
        print(res["result"]["stderr"][:1000])


## File-based code generation and execution
Instead of injecting HTML into the generated script, we now pass the HTML file path to the LLM and require the code to read from `HTML_FILE_PATH`.


In [108]:
import re

def build_codegen_prompt_from_file(query: str, blocks: List[Dict[str, Any]], skeleton_text: str,
                                   html_file_path: str, max_block_chars: int = 2000) -> List[Dict[str, str]]:
    blocks_joined = []
    for b in blocks[:10]:
        snippet_text = block_text(b["item"])[:max_block_chars]
        snippet_html = (b["item"].get("html") or "")[:max_block_chars]
        blocks_joined.append(
            f"PATH: {b['item']['path']}\nTEXT: {snippet_text}\nHTML_SNIPPET: {snippet_html}"
        )
    blocks_text = "\n\n".join(blocks_joined)

    system = (
        "You are a senior Python engineer. Generate ONLY Python code in a single block.\n"
        "Assumptions and constraints:\n"
        "- The page HTML file path is available in a variable named HTML_FILE_PATH (string).\n"
        "- Read the file contents, parse with BeautifulSoup (bs4), and extract the requested data.\n"
        "- Print ONLY a valid JSON object/array to stdout (no extra text).\n"
        "- Do not write files or make network calls.\n"
        "- Keep code self-contained and deterministic.\n"
    )

    user = (
        f"Query:\n{query}\n\n"
        f"HTML file path (read this):\n{html_file_path}\n\n"
        f"Page skeleton (truncated):\n{ skeleton_text[:4000] }\n\n"
        f"Retrieved blocks (top-N, truncated) with raw HTML snippets:\n{ blocks_text[:6000] }\n\n"
        "Return only python code fenced with triple backticks."
    )

    return [
        {"role": "system", "content": system},
        {"role": "user", "content": user},
    ]


# Redefine: run_generated_code_from_file with scenario-aware filenames
def _slug_from_path(html_file_path: str) -> str:
    from pathlib import Path
    stem = Path(html_file_path).stem
    return re.sub(r"[^a-zA-Z0-9]+", "_", stem).strip("_").lower()

def run_generated_code_from_file(code_str: str, html_file_path: str, code_path: Optional[Path] = None,
                                 timeout: int = 60) -> Dict[str, Any]:
    slug = _slug_from_path(html_file_path)
    code_path = code_path or (CODE_DIR / f"extract_{slug}_{int(time.time())}.py")
    header = 'HTML_FILE_PATH = r"' + html_file_path.replace('"', '\\"') + '"\n'
    full_code = header + "\n" + code_str
    code_path.write_text(full_code, encoding="utf-8")

    proc = subprocess.run(
        ["python", str(code_path)],
        capture_output=True,
        text=True,
        timeout=timeout,
    )
    ok = proc.returncode == 0
    out = (proc.stdout or "").strip()
    err = (proc.stderr or "").strip()
    parsed: Any = None
    if ok and out:
        try:
            parsed = json.loads(out)
        except Exception as e:
            ok = False
            err = f"Output is not valid JSON: {e}\nRaw: {out[:5000]}"
    return {"ok": ok, "stdout": out, "stderr": err, "path": str(code_path), "json": parsed}


def codegen_execute_retry_from_file_v2(query: str, html_file_path: str, retrieved: list[dict],
                                       skeleton_text: str, max_iterations: int = 3, debug: bool = False,
                                       min_items: int | None = None, required_fields: list[str] | None = None,
                                       expect_array: bool | None = None) -> dict:
    """
    Loop: on failure, ask code interpreter to fix. On success, ask LLM whether we have
    enough results or should retry for more/better extraction.
    """
    def build_messages_full(query, retrieved, skeleton_text, html_file_path, extra_user_content=None):
        # Always begin with prompt containing the html/blocks/skeleton; may append extra instructions from user if given.
        messages = build_codegen_prompt_from_file(query, retrieved, skeleton_text, html_file_path)
        if extra_user_content is not None:
            messages.append({"role": "user", "content": extra_user_content})
        return messages

    messages = build_messages_full(query, retrieved, skeleton_text, html_file_path)
    last_code = None
    last_json_preview = ""

    for i in range(1, max_iterations + 1):
        print(f"\n[Iteration {i}] Generating code...")
        if debug:
            print(messages[-2:])
        completion = lmstudio_chat(messages)
        code = extract_code_block(completion)
        last_code = code
        result = run_generated_code_from_file(code, html_file_path)

        if result["ok"]:
            parsed = result.get("json")
            json_preview = json.dumps(parsed, ensure_ascii=False)[:2000] if parsed is not None else ""
            print("Code executed successfully.")
            # Ask LLM: is this sufficient?
            # (We use a simple prompt; this avoids explicit validation logic in Python.)
            sufficiency_question = (
                f"You previously generated code to answer this query:\n"
                f"{query}\n\n"
                f"The code executed successfully and produced this JSON (truncated):\n"
                f"{json_preview}\n\n"
                f"Does this JSON output sufficiently and completely answer the query? "
                f"Reply Y if yes, or N if you see that important data is missing or incomplete. "
                f"(Reply with a single line: Y or N. If N, you may add a short reason after the N.)"
            )
            sufficiency_messages = messages + [
                {"role": "assistant", "content": completion},
                {"role": "user", "content": sufficiency_question},
            ]
            answer = lmstudio_chat(sufficiency_messages)
            answer_line = answer.strip().splitlines()[0].strip().upper()
            if answer_line.startswith("Y"):
                print("Success. Code path:", result["path"])
                return {"success": True, "iterations": i, "result": result}
            else:
                print("LLM says: Output is incomplete or insufficient.")
                reason = answer.strip()
                fix_user = (
                    "Your previous code executed but the JSON output did not fully answer the query, "
                    "perhaps due to missing/insufficient results. Please improve the code extraction."
                    "\n\n"
                    f"LLM assessment: {reason}\n\n"
                    f"Recent JSON (truncated):\n{json_preview}\n\n"
                    f"Your previous code:\n```python\n{last_code}\n```\n"
                    "Constraints:\n"
                    "- Extract ALL relevant items on the page (do not cap results).\n"
                    "- Do NOT truncate strings (no ellipses).\n"
                    "- Ensure all required fields are populated if present in the HTML.\n"
                    "- Output ONLY valid JSON to stdout.\n"
                )
                # On retry, always inject the full context prompt plus feedback and previous code
                messages = build_messages_full(
                    query, retrieved, skeleton_text, html_file_path, extra_user_content=fix_user
                )
        else:
            print("Failed run. Error follows:\n", result["stderr"][:1000])
            fix_user = (
                "The previous code failed. Please respond with a fixed Python code block only.\n\n"
                f"Error:\n{result['stderr'][:4000]}\n\n"
                f"Your previous code:\n```python\n{last_code}\n```"
            )
            # On retry, always inject the full context prompt plus feedback and previous code
            messages = build_messages_full(
                query, retrieved, skeleton_text, html_file_path, extra_user_content=fix_user
            )

    return {"success": False, "iterations": max_iterations, "result": {"stderr": "Max iterations reached"}}


In [109]:
# Redefine: get_blocks to return both text and raw HTML from expanded tags, skip header/footer/nav, and support grouping
from bs4 import BeautifulSoup

def get_blocks_expanded(
    html: str,
    min_chars: int = 30,
    max_chars: int = 4000,
    include_tags = ("p","li","div","article","section","tr","td","h1","h2","h3","h4","h5","h6","span","a"),
    exclude_containers = ("header","footer","nav","script","style","noscript","template"),
    group_contiguous: bool = False,
    group_window: int = 2,
    combined_only: bool = False,
    html_only: bool = True,
):
    """
    Extract text + raw HTML blocks from expanded set of tags, skipping typical non-content containers.
    Optionally produce contiguous combined blocks (neighbors of the same tag) to capture list rows/items.
    - group_contiguous: when True, for each candidate create a combined block with up to +/- group_window siblings.
    - combined_only: when True, return only the combined blocks (skip individual ones).
    - html_only: if True, the primary content is the raw HTML; otherwise it's text, but both representations are stored.
    """
    soup = BeautifulSoup(html, "lxml")

    def is_in_excluded(el) -> bool:
        p = el
        while p is not None and getattr(p, "name", None):
            if p.name in exclude_containers:
                return True
            p = p.parent
        return False

    def html_of(el) -> str:
        h = str(el) or ""
        if len(h) > max_chars:
            h = h[:max_chars]
        return h

    def text_of(el) -> str:
        t = el.get_text(" ", strip=True) or ""
        if len(t) > max_chars:
            t = t[:max_chars]
        return t

    candidates = soup.find_all(include_tags)

    blocks: list[dict] = []
    combined: list[dict] = []

    for c in candidates:
        if is_in_excluded(c):
            continue
        text_val = text_of(c)
        html_val = html_of(c)
        primary = html_val if html_only else text_val
        if not primary:
            primary = text_val or html_val
        if not primary or len(primary) < min_chars:
            continue
        block = {
            "text": text_val,
            "html": html_val,
            "content": primary,
            "path": css_path(c),
            "tag": c.name,
        }
        if not combined_only:
            blocks.append(block)

        if group_contiguous:
            parent = c.parent
            if parent is None:
                continue
            siblings = [child for child in parent.find_all(c.name, recursive=False)]
            try:
                idx = siblings.index(c)
            except ValueError:
                idx = None
            if idx is None:
                continue
            start = max(0, idx - group_window)
            end = min(len(siblings), idx + group_window + 1)
            grouped = siblings[start:end]
            text_parts = []
            html_parts = []
            for g in grouped:
                t = text_of(g)
                h = html_of(g)
                if t:
                    text_parts.append(t)
                if h:
                    html_parts.append(h)
            combined_text = "\n".join(text_parts)
            combined_html = "\n".join(html_parts)
            primary_combined = combined_html if html_only else combined_text
            if not primary_combined:
                primary_combined = combined_text or combined_html
            if not primary_combined or len(primary_combined) < min_chars:
                continue
            combined.append({
                "text": combined_text[:max_chars],
                "html": combined_html[:max_chars],
                "content": primary_combined[:max_chars],
                "path": css_path(parent) + " (grouped " + c.name + f"[{start}:{end}])",
                "tag": c.name,
            })

    return combined if combined_only else (combined + blocks)


In [110]:
# Demo: file-based run on Scenario 1
html_path = str(SCENARIOS["scenario1_books"])  # pass filename
html = read_html(SCENARIOS["scenario1_books"])  # still use content for retrieval & skeleton
blocks = get_blocks_expanded(html, group_contiguous=True, group_window=3, html_only=False)
retrieved = tfidf_retrieval(blocks, "Can you return me the books: name and price?", top_k=8)
skel = html_skeleton(html, max_depth=30, max_children=10)

res_file = codegen_execute_retry_from_file_v2(
    query="Can you return me the books: name and price?",
    html_file_path=html_path,
    retrieved=retrieved,
    skeleton_text=skel,
    max_iterations=5,
    debug=False
)
print("\nSuccess:", res_file["success"])
print("Iterations:", res_file["iterations"])
if res_file["success"]:
    print(json.dumps(res_file["result"]["json"], indent=2)[:2000])
else:
    print(res_file["result"]["stderr"][:1000])



[Iteration 1] Generating code...
Code executed successfully.
Success. Code path: /Users/ardyh/Documents/job-applications/mrscraper/generated/extract_scenario1_books_1762643414.py

Success: True
Iterations: 1
[
  {
    "name": "A Light in the ...",
    "price": "\u00a351.77"
  },
  {
    "name": "Tipping the Velvet",
    "price": "\u00a353.74"
  },
  {
    "name": "Soumission",
    "price": "\u00a350.10"
  },
  {
    "name": "Sharp Objects",
    "price": "\u00a347.82"
  },
  {
    "name": "Sapiens: A Brief History ...",
    "price": "\u00a354.23"
  },
  {
    "name": "The Requiem Red",
    "price": "\u00a322.65"
  },
  {
    "name": "The Dirty Little Secrets ...",
    "price": "\u00a333.34"
  },
  {
    "name": "The Coming Woman: A ...",
    "price": "\u00a317.93"
  },
  {
    "name": "The Boys in the ...",
    "price": "\u00a322.60"
  },
  {
    "name": "The Black Maria",
    "price": "\u00a352.15"
  },
  {
    "name": "Starving Hearts (Triangular Trade ...",
    "price": "\u00a313.99

In [None]:
# Demo: file-based run on Scenario 2 (jobs)
html_path = str(SCENARIOS["scenario2_jobs"])  # pass filename
html = read_html(SCENARIOS["scenario2_jobs"])  # still use content for retrieval & skeleton
blocks = get_blocks_expanded(html, group_contiguous=True, group_window=3, html_only=False)
retrieved = tfidf_retrieval(blocks, "Extract job title, location, salary, and company name from the job listings", top_k=8)
skel = html_skeleton(html, max_depth=30, max_children=10)

res_file = codegen_execute_retry_from_file_v2(
    query="Extract job title, location, salary, and company name from the job listings",
    html_file_path=html_path,
    retrieved=retrieved,
    skeleton_text=skel,
    max_iterations=5,
    debug=False
)
print("\nSuccess:", res_file["success"])
print("Iterations:", res_file["iterations"])
if res_file["success"]:
    print(json.dumps(res_file["result"]["json"], indent=2)[:2000])
else:
    print(res_file["result"]["stderr"][:1000])



[Iteration 1] Generating code...
[{'role': 'system', 'content': 'You are a senior Python engineer. Generate ONLY Python code in a single block.\nAssumptions and constraints:\n- The page HTML file path is available in a variable named HTML_FILE_PATH (string).\n- Read the file contents, parse with BeautifulSoup (bs4), and extract the requested data.\n- Print ONLY a valid JSON object/array to stdout (no extra text).\n- Do not write files or make network calls.\n- Keep code self-contained and deterministic.\n'}, {'role': 'user', 'content': 'Query:\nExtract job title, location, salary, and company name from the job listings\n\nHTML file path (read this):\n/Users/ardyh/Documents/job-applications/mrscraper/data/html/scenario2_jobs.html\n\nPage skeleton (truncated):\nbody\n  - div #__next\n        - div .style_outsideContainer__J0QRG\n                - div .style_bgImageTinyBanner__UoFoC\n                          - span\n                                      - img .style_bgImage__6KJyu\n    

In [None]:
# Demo: file-based run on Scenario 3
html_path = str(SCENARIOS["scenario3_clubs"])  # pass filename
html = read_html(SCENARIOS["scenario3_clubs"])  # still use content for retrieval & skeleton
blocks = get_blocks_expanded(html, html_only=False)
query = "Get the club names, logo image links and their official websites" 
retrieved = bm25_retrieval(blocks, query, top_k=5)
skel = html_skeleton(html, max_depth=20, max_children=5)

res_file = codegen_execute_retry_from_file_v2(
    query=query,
    html_file_path=html_path,
    retrieved=retrieved,
    skeleton_text=skel,
    max_iterations=3,
    debug=False
)
print("\nSuccess:", res_file["success"]) 
print("Iterations:", res_file["iterations"]) 
if res_file["success"]:
    print(json.dumps(res_file["result"]["json"], indent=2)[:2000])
else:
    print(res_file["result"]["stderr"][:1000])




[Iteration 1] Generating code...
Success. Code path: /Users/ardyh/Documents/job-applications/mrscraper/generated/extract_1762544348.py

Success: True
Iterations: 1
[]


In [None]:
# Demo: file-based run on Scenario 4
html_path = str(SCENARIOS["scenario4_property"])  # pass filename
html = read_html(SCENARIOS["scenario4_property"])  # still use content for retrieval & skeleton
blocks = get_blocks_expanded(html)
query = "Return the property name, address, latitude and longitude"
retrieved = bm25_retrieval(blocks, query, top_k=5)
skel = html_skeleton(html, max_depth=20, max_children=5)

res_file = codegen_execute_retry_from_file_v2(
    query=query,
    html_file_path=html_path,
    retrieved=retrieved,
    skeleton_text=skel,
    max_iterations=3,
    debug=False
)
print("\nSuccess:", res_file["success"]) 
print("Iterations:", res_file["iterations"]) 
if res_file["success"]:
    print(json.dumps(res_file["result"]["json"], indent=2)[:2000])
else:
    print(res_file["result"]["stderr"][:1000])



[Iteration 1] Generating code...
Failed run. Error follows:
 Traceback (most recent call last):
  File "/Users/ardyh/Documents/job-applications/mrscraper/generated/extract_1762544415.py", line 13, in <module>
    property_name = soup.find('meta', {'name': 'title'}).get('content')
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: 'NoneType' object has no attribute 'get'

[Iteration 2] Generating code...
Success. Code path: /Users/ardyh/Documents/job-applications/mrscraper/generated/extract_1762544436.py

Success: True
Iterations: 2
{
  "property_name": null,
  "address": null,
  "latitude": null,
  "longitude": null
}


# Sandbox

In [10]:
from bs4 import BeautifulSoup

# Example HTML snippet for testing
html = """
<html>
  <body>
    <div>
      <p>First paragraph</p>
      <p class="highlight">Second paragraph</p>
      <ul>
        <li>Item one</li>
        <li>Item two</li>
      </ul>
    </div>
    <section>
      <article>
        <div>
          <p>Nested paragraph</p>
        </div>
      </article>
    </section>
  </body>
</html>
"""

soup = BeautifulSoup(html, "lxml")
test_elements = [
    soup.find("p"),                         # First <p>
    soup.find("p", {"class": "highlight"}), # Second <p>
    soup.find("li"),                        # First <li>
    soup.find_all("li")[1],                 # Second <li>
    soup.find("section").find("p"),         # Nested <p>
]

print("Testing css_path:")
for el in test_elements:
    print(f"Text: {el.get_text(strip=True)!r}")
    print(f"css_path: {css_path(el)}")
    print("-" * 40)


Testing css_path:
Text: 'First paragraph'
css_path: html:nth-of-type(1) > body:nth-of-type(1) > div:nth-of-type(1) > p:nth-of-type(1)
----------------------------------------
Text: 'Second paragraph'
css_path: html:nth-of-type(1) > body:nth-of-type(1) > div:nth-of-type(1) > p:nth-of-type(2)
----------------------------------------
Text: 'Item one'
css_path: html:nth-of-type(1) > body:nth-of-type(1) > div:nth-of-type(1) > ul:nth-of-type(1) > li:nth-of-type(1)
----------------------------------------
Text: 'Item two'
css_path: html:nth-of-type(1) > body:nth-of-type(1) > div:nth-of-type(1) > ul:nth-of-type(1) > li:nth-of-type(2)
----------------------------------------
Text: 'Nested paragraph'
css_path: html:nth-of-type(1) > body:nth-of-type(1) > section:nth-of-type(1) > article:nth-of-type(1) > div:nth-of-type(1) > p:nth-of-type(1)
----------------------------------------
