In [2]:
import logging

logging.basicConfig(level=logging.INFO)

# Get the logger for docling and set its level
logging.getLogger('docling').setLevel(logging.INFO)
logging.getLogger('docling_core').setLevel(logging.INFO)
log = logging.getLogger(__name__)  # This makes your script a logging-aware application

In [7]:
def find_headers_in_html(doc, html_string, word):
    """Find headers in HTML that contain the given word, and include page info and parent H1 from doc."""
    try:
        from bs4 import BeautifulSoup
        from docling_core.types.doc.document import SectionHeaderItem
        soup = BeautifulSoup(html_string, 'html.parser')
        headers = []
        for tag in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
            text = tag.get_text().strip()
            if word.lower() in text.lower():
                # Find parent H1
                parent_h1 = None
                current = tag
                while current:
                    current = current.find_previous(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
                    if current and current.name == 'h1':
                        parent_h1 = current.get_text().strip()
                        break
                # Find corresponding SectionHeaderItem in doc
                page = None
                for item in doc.texts:
                    if isinstance(item, SectionHeaderItem) and item.text.strip() == text:
                        prov = getattr(item, "prov", None)
                        if prov:
                            for p in prov:
                                pg = getattr(p, "page_no", None)
                                if pg is not None:
                                    page = int(pg)
                                    break
                        break
                headers.append((tag.name, text, page, parent_h1))
        return headers
    except ImportError:
        print("BeautifulSoup not available. Install with: pip install beautifulsoup4")
        return []

# Call the function
html = export_doc_html(doc)
replacement_headers = find_headers_in_html(doc, html, 'replacement')
for level, text, page, parent_h1 in replacement_headers:
    print(f"{level}: {text} (page {page}) - Parent H1: {parent_h1}")

Saved HTML to artifacts/doc.html
h2: Filter replacement (page 492) - Parent H1: None
h2: Filter replacement (page 492) - Parent H1: None
h2: Blade replacement (page 494) - Parent H1: None
h2: Front windshield wiper blade replacement (page 495) - Parent H1: None
h2: Rear window wiper blade replacement (page 496) - Parent H1: None
h2: Tire replacement (page 503) - Parent H1: None
h2: Compact spare tire replacement (page 504) - Parent H1: None
h2: Wheel replacement (page 504) - Parent H1: None
h2: Instrument panel fuse replacement (page 514) - Parent H1: None
h2: Engine compartment panel fuse replacement (page 514) - Parent H1: None
h2: Front light replacement (page 526) - Parent H1: None
h2: Side repeater light replacement (page 527) - Parent H1: None
h2: Rear combination light replacement (page 527) - Parent H1: None
h2: High mounted stop light replacement (page 528) - Parent H1: None
h2: License plate light replacement (page 528) - Parent H1: None
h2: Interior light replacement (page 5

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


def compute_semantic_similarity(headers_list, query):
    """
    Compute semantic similarity between a list of headers and a query string using TF-IDF vectors.
    
    Args:
        headers_list: List of tuples (level, text, page, parent_h1) from find_headers_in_html or similar.
        query: The query string (phrase or word).
    
    Returns:
        List of dicts with 'header', 'cosine_similarity', and 'euclidean_distance'.
    """
    try:

        
        # Extract texts from headers
        texts = [text for _, text, _, _ in headers_list]
        texts.append(query)
        
        # Vectorize using TF-IDF
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(texts)
        
        # Query vector is the last one
        query_vec = tfidf_matrix[-1]
        
        similarities = []
        for i, header in enumerate(headers_list):
            header_vec = tfidf_matrix[i]
            
            # Cosine similarity
            cos_sim = cosine_similarity(query_vec, header_vec)[0][0]
            
            # Euclidean distance
            euclidean = np.linalg.norm(query_vec.toarray() - header_vec.toarray())
            
            similarities.append({
                'header': header,
                'cosine_similarity': cos_sim,
                'euclidean_distance': euclidean
            })
        
        return similarities
    except ImportError as e:
        print(f"Required libraries not available: {e}. Install scikit-learn and numpy.")
        return []

In [13]:
html = export_doc_html(doc)
replacement_headers = find_headers_in_html(doc, html, 'cabin air filter')
compute_semantic_similarity(replacement_headers, 'replace cabin air filter')

Saved HTML to artifacts/doc.html


[{'header': ('h2', 'Cabin air filter', 203, None),
  'cosine_similarity': np.float64(0.6705436749433179),
  'euclidean_distance': np.float64(0.81173434700853)},
 {'header': ('h2', 'Cabin air filter', 203, None),
  'cosine_similarity': np.float64(0.6705436749433179),
  'euclidean_distance': np.float64(0.81173434700853)},
 {'header': ('h2', 'Cabin Air Filter', 493, None),
  'cosine_similarity': np.float64(0.6705436749433179),
  'euclidean_distance': np.float64(0.81173434700853)}]

In [3]:
from docling_core.types.doc.document import SectionHeaderItem

def find_headers_with_word(doc, word):
    """Find all SectionHeaderItem that contain the given word in their text."""
    matches = []
    for text in doc.texts:
        if isinstance(text, SectionHeaderItem):
            if word.lower() in text.text.lower():
                matches.append(text)
    return matches

# Example usage
replacement_headers = find_headers_with_word(doc, 'replacement')
for header in replacement_headers:
    print(header)

self_ref='#/texts/9550' parent=RefItem(cref='#/body') children=[] content_layer=<ContentLayer.BODY: 'body'> label=<DocItemLabel.SECTION_HEADER: 'section_header'> prov=[ProvenanceItem(page_no=492, bbox=BoundingBox(l=36.734, t=514.1930122070312, r=138.19, b=503.7220122070312, coord_origin=<CoordOrigin.BOTTOMLEFT: 'BOTTOMLEFT'>), charspan=(0, 18))] orig='Filter replacement' text='Filter replacement' formatting=None hyperlink=None level=1
self_ref='#/texts/9579' parent=RefItem(cref='#/body') children=[] content_layer=<ContentLayer.BODY: 'body'> label=<DocItemLabel.SECTION_HEADER: 'section_header'> prov=[ProvenanceItem(page_no=493, bbox=BoundingBox(l=235.158, t=393.1940122070312, r=336.374, b=382.7230122070312, coord_origin=<CoordOrigin.BOTTOMLEFT: 'BOTTOMLEFT'>), charspan=(0, 18))] orig='Filter replacement' text='Filter replacement' formatting=None hyperlink=None level=1
self_ref='#/texts/9602' parent=RefItem(cref='#/body') children=[] content_layer=<ContentLayer.BODY: 'body'> label=<DocIt

In [None]:
doc

In [43]:
# Parameters for header-only selection
query = "how do i change the cabin air filter?"  # set this per user request
print("Query:", query)

Query: how do i change the cabin air filter?


In [47]:
# Header index with spaCy lemmatization and optional WordNet synonyms
import re
from typing import List, Any, Tuple, Dict, Set
from collections import defaultdict

from docling_core.types.doc.document import SectionHeaderItem

# Load spaCy model (lightweight) and optionally WordNet
try:
    import spacy
    try:
        nlp = spacy.load("en_core_web_sm")
    except Exception:
        # Try to download if missing (comment out if offline)
        import sys, subprocess
        subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_sm"], check=False)
        nlp = spacy.load("en_core_web_sm")
except Exception as e:
    nlp = None
    print("spaCy not available:", e)

try:
    import nltk
    from nltk.corpus import wordnet as wn
    try:
        _ = wn.synsets("test")
    except LookupError:
        nltk.download("wordnet")
        nltk.download("omw-1.4")
except Exception as e:
    wn = None
    print("WordNet not available:", e)


def is_body(x: Any) -> bool:
    v = getattr(x, "content_layer", None)
    return getattr(v, "value", v) == "body"

texts: List[Any] = list(doc.texts)
headers: List[Tuple[int, SectionHeaderItem]] = [
    (i, t) for i, t in enumerate(texts) if isinstance(t, SectionHeaderItem) and is_body(t)
]
print(f"Found {len(headers)} body headers.")

# --- Page helpers ---

def item_pages(obj: Any) -> Set[int]:
    pages: Set[int] = set()
    prov = getattr(obj, "prov", None)
    if prov:
        for p in prov:
            pg = getattr(p, "page_no", None)
            if pg is not None:
                try:
                    pages.add(int(pg))
                except Exception:
                    pass
    # Fallback if no provenance: use page_ref (0-indexed in many docling builds)
    pr = getattr(obj, "page_ref", None)
    if pr is not None and not pages:
        try:
            pages.add(int(pr) + 1)
        except Exception:
            pages.add(1)
    return pages


def nodes_pages(nodes: List[Any]) -> Set[int]:
    ps: Set[int] = set()
    for n in nodes:
        ps |= item_pages(n)
    return ps

# Slice a section and filter out empty/TOC

def slice_nodes(i: int) -> Tuple[SectionHeaderItem, List[Any]]:
    h = texts[i]
    lvl = getattr(h, "level", 3)
    nodes = []
    for j in range(i + 1, len(texts)):
        t = texts[j]
        if not is_body(t):
            continue
        if isinstance(t, SectionHeaderItem) and getattr(t, "level", 3) <= lvl:
            break
        nodes.append(t)
    return h, nodes


def has_content(nodes: List[Any]) -> bool:
    textish = 0
    structural = 0
    for n in nodes:
        name = n.__class__.__name__.lower()
        if hasattr(n, "text") and name != "sectionheaderitem":
            if re.search(r"\w", getattr(n, "text", "") or ""):
                textish += 1
        if hasattr(n, "items") or hasattr(n, "num_rows") or hasattr(n, "caption"):
            structural += 1
    return textish >= 1 or structural >= 1

# Build header index with POS-tagged lemmas and document pages
IndexItem = Dict[str, Any]
index: List[IndexItem] = []

for i, h in headers:
    title = getattr(h, "text", "") or ""
    header_ps = item_pages(h)
    nodes = slice_nodes(i)[1]
    if not has_content(nodes):
        continue
    section_ps = nodes_pages(nodes)

    nouns: Set[str] = set()
    verbs: Set[str] = set()

    if nlp is not None:
        doc_h = nlp(title)
        for tok in doc_h:
            if tok.is_stop or not tok.is_alpha:
                continue
            lemma = tok.lemma_.lower()
            if tok.pos_ in ("NOUN", "PROPN"):
                nouns.add(lemma)
            elif tok.pos_ in ("VERB",):
                verbs.add(lemma)
    else:
        # Fallback: simple regex tokenization, no POS
        for t in re.findall(r"\b[\w-]+\b", title.lower()):
            if len(t) >= 2:
                nouns.add(t)

    # Optional synonym expansion via WordNet
    syns_n: Set[str] = set()
    syns_v: Set[str] = set()
    if wn is not None:
        for n in nouns:
            for s in wn.synsets(n, pos=wn.NOUN):
                for l in s.lemma_names():
                    syns_n.add(l.replace("_", " ").lower())
        for v in verbs:
            for s in wn.synsets(v, pos=wn.VERB):
                for l in s.lemma_names():
                    syns_v.add(l.replace("_", " ").lower())

    index.append({
        "i": i,
        # First page where the header is located (doc page numbers)
        "header_pages": sorted(header_ps),
        # Pages covered by the section content
        "section_pages": sorted(section_ps),
        # Convenience union of all pages for this section (header + content)
        "doc_pages": sorted((header_ps | section_ps)),
        "level": getattr(h, "level", 3),
        "title": title,
        "nouns": nouns,
        "verbs": verbs,
        "syn_nouns": syns_n,
        "syn_verbs": syns_v,
    })

print(f"Indexed {len(index)} headers with content.")

Found 2133 body headers.
Indexed 1911 headers with content.


In [48]:
# Rank headers using query nouns/verbs + WordNet synonyms; pick best and save section
from html import escape
from difflib import SequenceMatcher
from pathlib import Path

# Extract query nouns/verbs via spaCy (or fallback)
q_nouns, q_verbs = set(), set()
q_text = query
if 'nlp' in globals() and nlp is not None:
    qdoc = nlp(q_text)
    for tok in qdoc:
        if tok.is_stop or not tok.is_alpha:
            continue
        lemma = tok.lemma_.lower()
        if tok.pos_ in ("NOUN", "PROPN"):
            q_nouns.add(lemma)
        elif tok.pos_ in ("VERB",):
            q_verbs.add(lemma)
else:
    for t in re.findall(r"\b[\w-]+\b", q_text.lower()):
        if len(t) >= 2:
            q_nouns.add(t)

# Expand query with WordNet if available
q_syn_n, q_syn_v = set(), set()
if 'wn' in globals() and wn is not None:
    for n in q_nouns:
        for s in wn.synsets(n, pos=wn.NOUN):
            for l in s.lemma_names():
                q_syn_n.add(l.replace("_", " ").lower())
    for v in q_verbs:
        for s in wn.synsets(v, pos=wn.VERB):
            for l in s.lemma_names():
                q_syn_v.add(l.replace("_", " ").lower())

# Score headers: prioritize noun/verb coverage, then synonyms, then fuzzy title
cands = []
for h in index:
    hn = h['nouns']; hv = h['verbs']
    syn_n = h['syn_nouns']; syn_v = h['syn_verbs']

    # Coverage
    noun_cov = len(q_nouns & hn) / max(1, len(q_nouns))
    verb_cov = len(q_verbs & hv) / max(1, len(q_verbs))

    # Synonym coverage
    noun_syn_cov = len(q_syn_n & (hn | syn_n)) / max(1, len(q_syn_n)) if q_syn_n else 0.0
    verb_syn_cov = len(q_syn_v & (hv | syn_v)) / max(1, len(q_syn_v)) if q_syn_v else 0.0

    fuzzy = SequenceMatcher(None, q_text.lower(), h['title'].lower()).ratio()

    score = (
        0.40 * noun_cov +
        0.30 * verb_cov +
        0.15 * noun_syn_cov +
        0.10 * verb_syn_cov +
        0.05 * fuzzy
    )

    # Derive a representative document page for display: first page where this section appears
    # Prefer header page if available, else earliest section content page
    doc_pages = h.get('doc_pages', [])
    first_doc_page = doc_pages[0] if doc_pages else (h.get('header_pages') or h.get('section_pages') or [1])[0]

    cands.append({**h, 'score': score, 'fuzzy': fuzzy,
                  'noun_cov': noun_cov, 'verb_cov': verb_cov,
                  'noun_syn_cov': noun_syn_cov, 'verb_syn_cov': verb_syn_cov,
                  'first_doc_page': first_doc_page})

cands.sort(key=lambda x: x['score'], reverse=True)
print("Top headers:")
for r, c in enumerate(cands[:5], start=1):
    # Show the full list of document pages for each candidate
    pages_str = ','.join(str(p) for p in c.get('doc_pages', []) or c.get('header_pages', []) or c.get('section_pages', []) or ["?"])
    print(f"{r:>2}. p{c['first_doc_page']:>4} h{c['level']} score={c['score']:.3f} | noun={c['noun_cov']:.2f} verb={c['verb_cov']:.2f} n_syn={c['noun_syn_cov']:.2f} v_syn={c['verb_syn_cov']:.2f} fuzz={c['fuzzy']:.2f} :: {c['title']}  [pages: {pages_str}]")

best = cands[0]
print(f"Chosen: doc pages {best.get('doc_pages', best.get('header_pages', best.get('section_pages', ['?'])))} '{best['title']}' score={best['score']:.3f}")

# Slice and save rendered HTML
h, nodes = slice_nodes(best['i'])

# Augment HTML header with page info
page_info = ', '.join(str(p) for p in best.get('doc_pages') or best.get('header_pages') or best.get('section_pages') or [])


def render_nodes(nodes: List[Any]) -> str:
    parts = []
    for n in nodes:
        name = n.__class__.__name__.lower()
        if hasattr(n, 'text') and name != 'sectionheaderitem':
            t = getattr(n, 'text', '')
            if t:
                parts.append(f"<p>{escape(t)}</p>")
        elif hasattr(n, 'items'):
            parts.append('<ul>')
            for it in getattr(n, 'items', []) or []:
                parts.append(f"<li>{escape(getattr(it, 'text', str(it)) or '')}</li>")
            parts.append('</ul>')
        elif hasattr(n, 'num_rows') and hasattr(n, 'num_cols') and hasattr(n, 'cells'):
            rows = []
            for r in range(n.num_rows):
                cells = []
                for c in range(n.num_cols):
                    cell = n.cells[r][c]
                    cells.append(f"<td>{escape(getattr(cell, 'text', '') or '')}</td>")
                rows.append('<tr>' + ''.join(cells) + '</tr>')
            parts.append('<table>' + ''.join(rows) + '</table>')
        elif hasattr(n, 'caption'):
            cap = getattr(getattr(n, 'caption', None), 'text', '') or ''
            if cap:
                parts.append(f"<figure><figcaption>{escape(cap)}</figcaption></figure>")
    return '\n'.join(parts)

html_out = f"""
<!doctype html>
<html><head><meta charset='utf-8'><title>{escape(best['title'])}</title>
<style>body{{font-family:Segoe UI, Roboto, Arial, sans-serif; line-height:1.5; padding:1rem}} ul{{margin-left:1.25rem}}</style>
</head><body>
<h2>{escape(best['title'])}</h2>
<p><em>Document pages: {escape(page_info)}</em></p>
{render_nodes(nodes)}
</body></html>
"""

out_dir = Path('artifacts/sections')
out_dir.mkdir(parents=True, exist_ok=True)
(out_dir / 'header_syn_lemma_best_section.html').write_text(html_out, encoding='utf-8')
print('Saved header_syn_lemma_best_section.html')

Top headers:
 1. p 203 h1 score=0.580 | noun=1.00 verb=0.00 n_syn=1.00 v_syn=0.00 fuzz=0.60 :: Cabin air filter  [pages: 203]
 2. p 212 h1 score=0.580 | noun=1.00 verb=0.00 n_syn=1.00 v_syn=0.00 fuzz=0.60 :: Cabin air filter  [pages: 212]
 3. p 483 h1 score=0.433 | noun=0.67 verb=0.00 n_syn=0.94 v_syn=0.00 fuzz=0.51 :: Air cleaner filter  [pages: 483]
 4. p 321 h1 score=0.421 | noun=0.00 verb=1.00 n_syn=0.00 v_syn=1.00 fuzz=0.42 :: · Changing lanes  [pages: 321,322]
 5. p 374 h1 score=0.421 | noun=0.00 verb=1.00 n_syn=0.00 v_syn=1.00 fuzz=0.42 :: · Changing lanes  [pages: 374]
Chosen: doc pages [203] 'Cabin air filter' score=0.580
Saved header_syn_lemma_best_section.html


# Subtree-based selection: rank a section together with its subsections

This section adds a minimal, reusable, and PEP8-compliant utility layer to:

- Build a header tree from `doc.texts` (parent/children relationships by header levels)
- Compute page coverage using provenance (preferred) and page_ref fallback
- Extract linguistic features (noun/verb lemmas and optional WordNet synonyms) for titles
- Construct a subtree index (parent + immediate children merged semantics)
- Rank subtrees against a query and explore the top candidates with pages and child headers

Outputs:
- Console summary of the top subtree candidates (parent + children, doc pages, coverage)
- Saved HTML for the best subtree slice: `artifacts/sections/subtree_best_section.html`

Design notes:
- We keep this independent from existing cells. Functions are documented and PEP8-compliant.
- We limit subtree semantics to parent + immediate children to stay fast and deterministic.
- Rendering uses the existing structural slice: from the parent header until the next header of the same or higher level.

In [None]:
    node = HeaderNode(index=i, level=level, title=title, parent=parent, children=[], header_pages=header_pages, section_pages=list(set().union(*section_pages)), doc_pages=doc_pages)
    node.nodes = nodes
    nodes_map[i] = node

In [74]:
# Minimal runner for subtree selection and action bump
from pathlib import Path

def run_subtree_selection(query: str, top_k: int = 5, out_dir: Path = Path("artifacts/sections"), out_name: str = "subtree_best_section.html"):
    tree = build_header_tree(doc)
    subtrees = build_subtree_index(tree)
    scored = [(score_subtree(query, st), st) for st in subtrees]
    scored.sort(key=lambda x: x[0], reverse=True)
    print("Top subtrees (parent + immediate children):")
    for rank, (score, st) in enumerate(scored[:top_k], start=1):
        pages_str = ",".join(str(p) for p in st["doc_pages"])
        print(f"{rank:>2}. p{pages_str} h{st['level']} score={score:.3f} :: {st['title']}")
        if st["child_titles"]:
            print(f"    child titles: {st['child_titles']}")
        pn = ", ".join(sorted(st["subtree_nouns"]))
        pv = ", ".join(sorted(st["subtree_verbs"]))
        if pn:
            print(f"    subtree noun matches: {pn}")
        if pv:
            print(f"    subtree verb matches: {pv}")
        if st["child_action_hits"]:
            print(f"    child action hits: {st['child_action_hits']}")
    # Save best subtree
    if scored:
        best_idx = scored[0][1]["idx"]
        out_path = out_dir / out_name
        save_best_subtree_html(tree, best_idx, out_path)
        print(f"Saved {out_path.name}")

# Example runner
if 'query' in globals():
    run_subtree_selection(query=query, top_k=5)
else:
    print("Set a `query variable (str) and re-run this cell to execute subtree selection.")

Top subtrees (parent + immediate children):
 1. p203 h1 score=0.589 :: Cabin air filter
    subtree noun matches: air, cabin, filter
 2. p212 h1 score=0.589 :: Cabin air filter
    subtree noun matches: air, cabin, filter
 3. p493 h1 score=0.589 :: Cabin Air Filter
    subtree noun matches: air, cabin, filter
 4. p483 h1 score=0.408 :: Air cleaner filter
    subtree noun matches: air, filter
 5. p139 h1 score=0.376 :: Replacing the battery
    subtree noun matches: battery
    subtree verb matches: replace
Saved subtree_best_section.html


In [63]:
query = "How do I replace the cabin air filter?"