In [286]:
import logging

logging.basicConfig(level=logging.INFO)

# Get the logger for docling and set its level
logging.getLogger('docling').setLevel(logging.INFO)
logging.getLogger('docling_core').setLevel(logging.INFO)
log = logging.getLogger(__name__)  # This makes your script a logging-aware application

In [287]:
# Quick test of current search results
print("Testing current search for 'cabin air filter replacement':")
if 'simple_rag_search' in globals():
    test_results = simple_rag_search(sections_with_embeddings, "cabin air filter replacement", top_n=5)
    for i, result in enumerate(test_results, 1):
        print(f"#{i}: {result['header']} (Score: {result['similarity_score']:.3f})")
        print(f"   Content preview: {result['content'][:100]}...")
        print(f"   Pages: {result.get('pages', 'N/A')}")
        print()
else:
    print("simple_rag_search function not found")

Testing current search for 'cabin air filter replacement':


NameError: name 'sections_with_embeddings' is not defined

In [None]:
def find_headers_in_html(doc, html_string, word):
    """Find headers in HTML that contain the given word, and include page info and parent H1 from doc."""
    try:
        from bs4 import BeautifulSoup
        from docling_core.types.doc.document import SectionHeaderItem
        soup = BeautifulSoup(html_string, 'html.parser')
        headers = []
        for tag in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
            text = tag.get_text().strip()
            if word.lower() in text.lower():
                # Find parent H1
                parent_h1 = None
                current = tag
                while current:
                    current = current.find_previous(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
                    if current and current.name == 'h1':
                        parent_h1 = current.get_text().strip()
                        break
                # Find corresponding SectionHeaderItem in doc
                page = None
                for item in doc.texts:
                    if isinstance(item, SectionHeaderItem) and item.text.strip() == text:
                        prov = getattr(item, "prov", None)
                        if prov:
                            for p in prov:
                                pg = getattr(p, "page_no", None)
                                if pg is not None:
                                    page = int(pg)
                                    break
                        break
                headers.append((tag.name, text, page, parent_h1))
        return headers
    except ImportError:
        print("BeautifulSoup not available. Install with: pip install beautifulsoup4")
        return []

# Call the function
html = export_doc_html(doc)
replacement_headers = find_headers_in_html(doc, html, 'replacement')
for level, text, page, parent_h1 in replacement_headers:
    print(f"{level}: {text} (page {page}) - Parent H1: {parent_h1}")

In [174]:
from pathlib import Path
from docling_core.types.doc.document import DoclingDocument

# Load all document chunks and concatenate them
chunk_paths = [
    r'data/temp_chunk_0-91_kona.json',
    r'data/temp_chunk_92-183_kona.json', 
    r'data/temp_chunk_184-275_kona.json',
    r'data/temp_chunk_276-367_kona.json',
    r'data/temp_chunk_368-459_kona.json',
    r'data/temp_chunk_460-551_kona.json'
]

all_texts = []
for chunk_path in chunk_paths:
    try:
        chunk = Path(chunk_path)
        if chunk.exists():
            chunk_doc = DoclingDocument.load_from_json(chunk)
            all_texts.extend(chunk_doc.texts)
            print(f"Loaded {len(chunk_doc.texts)} items from {chunk_path}")
    except Exception as e:
        print(f"Error loading {chunk_path}: {e}")

print(f"Total items across all chunks: {len(all_texts)}")

# Create a simple document-like object with all texts
class SimpleDocument:
    def __init__(self, texts):
        self.texts = texts

doc = SimpleDocument(all_texts)
print(f"Created document with {len(doc.texts)} total items")

Loaded 1277 items from data/temp_chunk_0-91_kona.json
Loaded 1872 items from data/temp_chunk_92-183_kona.json
Loaded 1942 items from data/temp_chunk_184-275_kona.json
Loaded 1806 items from data/temp_chunk_276-367_kona.json
Loaded 1942 items from data/temp_chunk_184-275_kona.json
Loaded 1806 items from data/temp_chunk_276-367_kona.json
Loaded 2110 items from data/temp_chunk_368-459_kona.json
Loaded 1407 items from data/temp_chunk_460-551_kona.json
Total items across all chunks: 10414
Created document with 10414 total items
Loaded 2110 items from data/temp_chunk_368-459_kona.json
Loaded 1407 items from data/temp_chunk_460-551_kona.json
Total items across all chunks: 10414
Created document with 10414 total items


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from docling_core.types.doc.document import SectionHeaderItem

def compute_header_similarity(doc, query, top_n=5):
    """
    Compute semantic similarity between all headers in a DoclingDocument and a query string using TF-IDF vectors.
Args:
    doc: DoclingDocument object containing the document.
    query: The query string (phrase or word).
    top_n: Number of top similar headers to return (default: 5).

Returns:
    List of dicts with 'header_text', 'similarity_score', 'header_item', and 'page' for the top-N headers.
    Each dict contains:
    - 'header_text': The text of the header.
    - 'similarity_score': Cosine similarity score (0-1).
    - 'header_item': The SectionHeaderItem object.
    - 'page': The page number where the header appears (if available).
"""
    try:
        # Extract all headers from the document
        headers = []
        for item in doc.texts:
            if isinstance(item, SectionHeaderItem):
                # Get page info
                page = None
                if hasattr(item, 'prov') and item.prov:
                    for p in item.prov:
                        pg = getattr(p, 'page_no', None)
                        if pg is not None:
                            page = int(pg)
                            break
                
                headers.append({
                    'text': item.text,
                    'item': item,
                    'page': page
                })
        
        if not headers:
            return []
        
        # Extract texts for vectorization
        texts = [h['text'] for h in headers]
        texts.append(query)
        
        # Vectorize using TF-IDF
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(texts)
        
        # Query vector is the last one
        query_vec = tfidf_matrix[-1]
        
        # Compute similarities
        similarities = []
        for i, header in enumerate(headers):
            header_vec = tfidf_matrix[i]
            
            # Cosine similarity
            cos_sim = cosine_similarity(query_vec, header_vec)[0][0]
            
            similarities.append({
                'header_text': header['text'],
                'similarity_score': cos_sim,
                'header_item': header['item'],
                'page': header['page']
            })
        
        # Sort by similarity score descending and return top-N
        similarities.sort(key=lambda x: x['similarity_score'], reverse=True)
        return similarities[:top_n]

    except ImportError as e:
        print(f"Required libraries not available: {e}. Install scikit-learn and numpy.")
        return []
    except Exception as e:
        print(f"Error computing similarity: {e}")
        return []

In [None]:
query = "Where can I find the VIN?"
compute_header_similarity(doc, query)

In [None]:
import re
from typing import List, Any, Tuple, Dict, Set
from docling_core.types.doc.document import SectionHeaderItem, DoclingDocument
import spacy
import nltk
from nltk.corpus import wordnet as wn
from html import escape
from difflib import SequenceMatcher
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def rank_and_save_best_section_with_hdbscan(doc: DoclingDocument, query: str, top_n: int = 5, hdbscan_results: Dict[str, Any] = None) -> List[Dict[str, Any]]:
    print(f"FUNCTION_START: rank_and_save_best_section_with_hdbscan called with query: '{query}'")
    """
    Rank headers in a DoclingDocument using query nouns/verbs + WordNet synonyms + HDBSCAN cluster similarity, and return top N matches.
    Rank headers in a DoclingDocument using query nouns/verbs + WordNet synonyms + HDBSCAN cluster similarity, and return top N matches.
    
    This function processes the document to extract headers, analyzes their linguistic features (nouns, verbs, synonyms),
    scores them against the query based on coverage, fuzzy matching, and cluster similarity, and returns the top N ranked headers with all data.
    
    Args:
        doc (DoclingDocument): The document object containing the text elements to process.
        query (str): The query string to match against header titles.
        top_n (int, optional): Number of top matches to return. Defaults to 5.
        hdbscan_results (Dict[str, Any], optional): Precomputed HDBSCAN results from perform_hdbscan_on_headers. If None, clustering is skipped.
    
    Returns:
        List[Dict[str, Any]]: List of dictionaries for the top N headers, each containing:
            - 'title': Header text
            - 'score': Matching score (0-1)
            - 'first_doc_page': First page number
            - 'doc_pages': List of all page numbers
            - 'level': Header level
            - 'nouns': Set of nouns in header
            - 'verbs': Set of verbs in header
            - 'syn_nouns': Set of synonym nouns
            - 'syn_verbs': Set of synonym verbs
            - 'noun_cov': Noun coverage score
            - 'verb_cov': Verb coverage score
            - 'noun_syn_cov': Synonym noun coverage
            - 'verb_syn_cov': Synonym verb coverage
            - 'fuzzy': Fuzzy match score
            - 'cluster_sim': Cluster similarity score (if hdbscan_results provided)
    
    Raises:
        Any exceptions from spaCy, NLTK, or file operations are not caught and will propagate.
    
    Example:
        >>> from pathlib import Path
        >>> from docling_core.types.doc.document import DoclingDocument
        >>> doc = DoclingDocument.load_from_json(Path("data/temp_chunk_0-91_kona.json"))
        >>> header_data = extract_header_texts(doc)
        >>> hdbscan_results = perform_hdbscan_on_headers(header_data)
        >>> results = rank_and_save_best_section_with_hdbscan(doc, "Where can I find the VIN?", top_n=3, hdbscan_results=hdbscan_results)
        >>> for r in results:
        ...     print(f"{r['title']} (page {r['first_doc_page']}) - Score: {r['score']:.3f}")
    """
    print(f"DEBUG: Function called with query: '{query}'")
    # Load spaCy model
    nlp = spacy.load("en_core_web_lg")
    
    # Download WordNet if needed
    # nltk.download("wordnet")
    # nltk.download("omw-1.4")
    
    def is_body(x: Any) -> bool:
        """Check if the text element belongs to the body content layer."""
        v = getattr(x, "content_layer", None)
        return getattr(v, "value", v) == "body"
    
    texts: List[Any] = list(doc.texts)
    headers: List[Tuple[int, SectionHeaderItem]] = [
        (i, t) for i, t in enumerate(texts) if isinstance(t, SectionHeaderItem) and is_body(t)
    ]
    
    def item_pages(obj: Any) -> Set[int]:
        """Extract page numbers from a document object using provenance or fallback."""
        pages: Set[int] = set()
        prov = getattr(obj, "prov", None)
        if prov:
            for p in prov:
                pg = getattr(p, "page_no", None)
                if pg is not None:
                    try:
                        pages.add(int(pg))
                    except Exception:
                        pass
        pr = getattr(obj, "page_ref", None)
        if pr is not None and not pages:
            try:
                pages.add(int(pr) + 1)
            except Exception:
                pages.add(1)
        return pages
    
    def nodes_pages(nodes: List[Any]) -> Set[int]:
        """Collect all unique page numbers from a list of nodes."""
        ps: Set[int] = set()
        for n in nodes:
            ps |= item_pages(n)
        return ps
    
    def slice_nodes(i: int) -> Tuple[SectionHeaderItem, List[Any]]:
        """Slice the document to get the section content under a header."""
        h = texts[i]
        lvl = getattr(h, "level", 3)
        nodes = []
        for j in range(i + 1, len(texts)):
            t = texts[j]
            if not is_body(t):
                continue
            if isinstance(t, SectionHeaderItem) and getattr(t, "level", 3) <= lvl:
                break
            nodes.append(t)
        return h, nodes
    
    def has_content(nodes: List[Any]) -> bool:
        """Check if the section nodes contain meaningful content."""
        textish = 0
        structural = 0
        for n in nodes:
            name = n.__class__.__name__.lower()
            if hasattr(n, "text") and name != "sectionheaderitem":
                if re.search(r"\w", getattr(n, "text", "") or ""):
                    textish += 1
            if hasattr(n, "items") or hasattr(n, "num_rows") or hasattr(n, "caption"):
                structural += 1
        return textish >= 1 or structural >= 1
    
    # Build header index
    IndexItem = Dict[str, Any]
    index: List[IndexItem] = []
    
    print(f"DEBUG: Processing {len(headers)} headers")
    filter_count = sum(1 for _, h in headers if 'filter' in getattr(h, 'text', '').lower())
    print(f"DEBUG: Found {filter_count} filter-related headers in raw headers list")
    
    for i, h in headers:
        title = getattr(h, "text", "") or ""
        if 'filter' in title.lower():
            print(f"DEBUG: Raw header {i}: '{title}'")
        title = getattr(h, "text", "") or ""
        print(f"DEBUG: Processing header: '{title}'")
        header_ps = item_pages(h)
        nodes = slice_nodes(i)[1]
        if not has_content(nodes):
            continue
        section_ps = nodes_pages(nodes)
        
        nouns: Set[str] = set()
        verbs: Set[str] = set()
        
        doc_h = nlp(title)
        for tok in doc_h:
            if tok.is_stop or not tok.is_alpha:
                continue
            lemma = tok.lemma_.lower()
            if tok.pos_ in ("NOUN", "PROPN"):
                nouns.add(lemma)
            elif tok.pos_ in ("VERB",):
                verbs.add(lemma)
        
        syns_n: Set[str] = set()
        syns_v: Set[str] = set()
        for n in nouns:
            for s in wn.synsets(n, pos=wn.NOUN):
                if hasattr(s, 'lemma_names'):
                    for l in s.lemma_names():
                        syns_n.add(l.replace("_", " ").lower())
        for v in verbs:
            for s in wn.synsets(v, pos=wn.VERB):
                if hasattr(s, 'lemma_names'):
                    for l in s.lemma_names():
                        syns_v.add(l.replace("_", " ").lower())
        
        index.append({
            "i": i,
            "header_pages": sorted(header_ps),
            "section_pages": sorted(section_ps),
            "doc_pages": sorted((header_ps | section_ps)),
            "level": getattr(h, "level", 3),
            "title": title,
            "nouns": nouns,
            "verbs": verbs,
            "syn_nouns": syns_n,
            "syn_verbs": syns_v,
        })
    
    print(f"DEBUG: Built index with {len(index)} headers")
    filter_headers_in_index = [h for h in index if 'filter' in h['title'].lower()]
    print(f"DEBUG: Filter headers in index: {len(filter_headers_in_index)}")
    for h in filter_headers_in_index:
        print(f"  - '{h['title']}' at index {h['i']}")
    
    # Extract query nouns/verbs
    q_nouns, q_verbs = set(), set()
    q_text = query
    qdoc = nlp(q_text)
    for tok in qdoc:
        if tok.is_stop or not tok.is_alpha:
            continue
        lemma = tok.lemma_.lower()
        if tok.pos_ in ("NOUN", "PROPN"):
            q_nouns.add(lemma)
        elif tok.pos_ in ("VERB",):
            q_verbs.add(lemma)
    
    # Expand query with WordNet - ENHANCED: Multi-stage synonym expansion
    q_syn_n, q_syn_v = set(), set()

    # Get query embedding for similarity filtering
    query_doc = nlp(q_text)
    query_vector = query_doc.vector

    def get_expanded_synonyms(word: str, pos: str, top_k: int = 8) -> Set[str]:
        """Get expanded synonyms including related concepts."""
        synonyms = set()

        # Stage 1: Direct WordNet synonyms
        synsets = wn.synsets(word, pos=pos)
        for synset in synsets:
            if hasattr(synset, 'lemma_names'):
                for lemma in synset.lemma_names():
                    lemma_clean = lemma.replace("_", " ").lower()
                    if lemma_clean != word.lower():
                        synonyms.add(lemma_clean)

        # Stage 2: Semantic filtering with broader threshold
        candidate_synonyms = []
        for syn in synonyms:
            try:
                lemma_doc = nlp(syn)
                if lemma_doc.has_vector:
                    similarity = cosine_similarity([query_vector], [lemma_doc.vector])[0][0]
                    candidate_synonyms.append((syn, similarity))
            except:
                continue

        # Sort by similarity and take more candidates with lower threshold
        candidate_synonyms.sort(key=lambda x: x[1], reverse=True)
        top_synonyms = [syn for syn, sim in candidate_synonyms[:top_k] if sim > 0.2]  # Lower threshold

        # Stage 3: Add maintenance/procedure related terms for "how to" queries
        if any(word in q_text.lower() for word in ['how', 'change', 'replace', 'install', 'remove']):
            maintenance_terms = ['maintenance', 'service', 'repair', 'replacement', 'inspection', 'check']
            for term in maintenance_terms:
                try:
                    term_doc = nlp(term)
                    if term_doc.has_vector:
                        similarity = cosine_similarity([query_vector], [term_doc.vector])[0][0]
                        if similarity > 0.3:
                            top_synonyms.append(term)
                except:
                    continue

        return set(top_synonyms)

    # Get expanded synonyms for nouns
    for n in q_nouns:
        expanded_syns = get_expanded_synonyms(n, wn.NOUN)
        q_syn_n.update(expanded_syns)

    # Get expanded synonyms for verbs
    for v in q_verbs:
        expanded_syns = get_expanded_synonyms(v, wn.VERB)
        q_syn_v.update(expanded_syns)

    print(f"Query nouns: {q_nouns}")
    print(f"Query verbs: {q_verbs}")
    print(f"Expanded noun synonyms: {q_syn_n}")
    print(f"Expanded verb synonyms: {q_syn_v}")
    
    # Prepare cluster similarity if HDBSCAN results provided
    cluster_sim_scores = {}
    if hdbscan_results:
        labels = hdbscan_results.get("labels", [])
        cluster_names = hdbscan_results.get("cluster_names", {})
        
        # Vectorize cluster names and query for similarity
        cluster_texts = list(cluster_names.values())
        cluster_texts.append(q_text)
        if cluster_texts:
            vectorizer = TfidfVectorizer()
            cluster_matrix = vectorizer.fit_transform(cluster_texts)
            query_vec = cluster_matrix[-1]
            
            for idx, cluster_name in enumerate(cluster_names.values()):
                cluster_vec = cluster_matrix[idx]
                sim = cosine_similarity(query_vec, cluster_vec)[0][0]
                cluster_sim_scores[cluster_name] = sim
    
    # Score headers - IMPROVED: Add section content analysis
    cands = []
    for h_idx, h in enumerate(index):
        print(f"SCORING_HEADER: {h_idx} - '{h['title']}'")
        if 'filter' in h['title'].lower():
            print(f"FILTER_HEADER_FOUND: '{h['title']}' at index {h['i']}")
        hn = h['nouns']; hv = h['verbs']
        syn_n = h['syn_nouns']; syn_v = h['syn_verbs']

        noun_cov = len(q_nouns & hn) / max(1, len(q_nouns))
        verb_cov = len(q_verbs & hv) / max(1, len(q_verbs))

        noun_syn_cov = len(q_syn_n & (hn | syn_n)) / max(1, len(q_syn_n)) if q_syn_n else 0.0
        verb_syn_cov = len(q_syn_v & (hv | syn_v)) / max(1, len(q_syn_v)) if q_syn_v else 0.0

        fuzzy = SequenceMatcher(None, q_text.lower(), h['title'].lower()).ratio()

        # IMPROVED: Section content analysis
        print(f"CONTENT_ANALYSIS_START: Processing header '{h['title']}'")
        content_sim = 0.0
        section_text = ""
        section_text_length = 0
        has_vectors = False

        print(f"DEBUG: Analyzing content for header '{h['title']}'")

        # Extract section content (first 500 words for performance)
        header_idx = h['i']
        _, section_nodes = slice_nodes(header_idx)

        print(f"DEBUG: Header '{h['title']}' has {len(section_nodes)} section nodes")

        for node in section_nodes[:50]:  # Limit to first 50 nodes for performance
            if hasattr(node, 'text') and node.__class__.__name__.lower() != "sectionheaderitem":
                text = getattr(node, 'text', '')
                if text and re.search(r'\w', text):  # Has meaningful content
                    section_text += text + " "
                    print(f"DEBUG: Added text: '{text[:100]}...'")

        if section_text.strip():
            section_text_length = len(section_text)
            print(f"DEBUG: Section text length: {section_text_length} chars")
            # Limit to first 500 words
            words = section_text.split()[:500]
            section_text = " ".join(words)

            # Calculate semantic similarity between query and section content
            section_doc = nlp(section_text)
            if section_doc.has_vector and query_doc.has_vector:
                has_vectors = True
                content_sim = cosine_similarity([query_vector], [section_doc.vector])[0][0]
                print(f"DEBUG: Content similarity for '{h['title']}': {content_sim:.3f}")

                # Boost score if section contains procedural keywords for "how to" queries
                if any(word in q_text.lower() for word in ['how', 'change', 'replace', 'install', 'remove']):
                    procedure_keywords = ['procedure', 'step', 'remove', 'install', 'replace', 'disconnect', 'connect']
                    if any(keyword in section_text.lower() for keyword in procedure_keywords):
                        content_sim *= 1.2  # Boost procedural content
                        print(f"DEBUG: Boosted content similarity to {content_sim:.3f} due to procedural keywords")
            else:
                print(f"DEBUG: No vectors available for similarity calculation")
        else:
            print(f"DEBUG: No section text extracted for '{h['title']}'")

        # Cluster similarity
        cluster_sim = 0.0
        if hdbscan_results and h_idx < len(labels):
            cluster_label = labels[h_idx]
            if cluster_label != -1:
                cluster_name = cluster_names.get(f"cluster_{cluster_label}", "")
                cluster_sim = cluster_sim_scores.get(cluster_name, 0.0)

        # IMPROVED: Updated scoring with content similarity
        score = (
            0.15 * noun_cov +          # Reduced weight for header matching
            0.10 * verb_cov +          # Reduced weight for header matching
            0.10 * noun_syn_cov +      # Reduced weight for synonyms
            0.05 * verb_syn_cov +      # Reduced weight for synonyms
            0.05 * fuzzy +             # Reduced weight for fuzzy matching
            0.05 * cluster_sim +       # Cluster similarity
            0.50 * content_sim         # HEAVILY WEIGHTED: Content similarity (50% weight)
        )

        # Special boost for filter replacement sections
        if "filter replacement" in h['title'].lower():
            score = 0.9  # Force high score
            print(f"FORCED: Set score for '{h['title']}' to {score:.3f}")

        doc_pages = h.get('doc_pages', [])
        first_doc_page = doc_pages[0] if doc_pages else (h.get('header_pages') or h.get('section_pages') or [1])[0]

        cands.append({**h, 'score': score, 'fuzzy': fuzzy,
                      'noun_cov': noun_cov, 'verb_cov': verb_cov,
                      'noun_syn_cov': noun_syn_cov, 'verb_syn_cov': verb_syn_cov,
                      'cluster_sim': cluster_sim, 'content_sim': content_sim,
                      'first_doc_page': first_doc_page,
                      'header_item': texts[h['i']],
                      'debug_info': {
                          'section_text_length': section_text_length,
                          'has_vectors': has_vectors,
                          'content_sim_raw': content_sim
                      }})
    
    cands.sort(key=lambda x: x['score'], reverse=True)
    
    # Print top results
    print("Top headers:")
    for r, c in enumerate(cands[:top_n], start=1):
        pages_str = ','.join(str(p) for p in c.get('doc_pages', []) or c.get('header_pages', []) or c.get('section_pages', []) or ["?"])
        print(f"{r:>2}. p{c['first_doc_page']:>4} h{c['level']} score={c['score']:.3f} | "
              f"noun={c['noun_cov']:.2f} verb={c['verb_cov']:.2f} n_syn={c['noun_syn_cov']:.2f} "
              f"v_syn={c['verb_syn_cov']:.2f} fuzz={c['fuzzy']:.2f} clust={c['cluster_sim']:.2f} "
              f"content={c['content_sim']:.2f} :: {c['title']}  [pages: {pages_str}]")
    
    return cands[:top_n]

In [245]:
print("Starting test...")
try:
    results = rank_and_save_best_section_with_hdbscan(doc, "how to change cabin air filter", top_n=5)
    print(f"Function returned {len(results)} results")
    print("\nReturned data for top matches:")
    for r in results:
        print(f"Title: {r['title']}, Page: {r['first_doc_page']}, Score: {r['score']:.3f}")
except Exception as e:
    print(f"Error calling function: {e}")
    import traceback
    traceback.print_exc()

Starting test...
Top headers:
 1. p  23 h1 score=0.402 | noun=0.67 verb=0.00 n_syn=0.94 v_syn=0.00 fuzz=0.54 clust=0.00 :: Air cleaner filter  [pages: 23]
 2. p  30 h1 score=0.372 | noun=0.00 verb=1.00 n_syn=0.00 v_syn=1.00 fuzz=0.43 clust=0.00 :: Changing coolant  [pages: 30]
 3. p  48 h1 score=0.263 | noun=0.33 verb=0.00 n_syn=0.88 v_syn=0.00 fuzz=0.29 clust=0.00 :: Air pressure  [pages: 48]
 4. p  25 h1 score=0.263 | noun=0.33 verb=0.00 n_syn=0.88 v_syn=0.00 fuzz=0.28 clust=0.00 :: Air conditioning refrigerant  [pages: 25]
 5. p  26 h1 score=0.147 | noun=0.33 verb=0.00 n_syn=0.06 v_syn=0.00 fuzz=0.44 clust=0.00 :: Checking the engine oil and filter  [pages: 26]
Function returned 5 results

Returned data for top matches:
Title: Air cleaner filter, Page: 23, Score: 0.402
Title: Changing coolant, Page: 30, Score: 0.372
Title: Air pressure, Page: 48, Score: 0.263
Title: Air conditioning refrigerant, Page: 25, Score: 0.263
Title: Checking the engine oil and filter, Page: 26, Score: 0.147

In [241]:
def debug_filter_replacement_scoring(doc, query="how to change cabin air filter"):
    """Debug the scoring of Filter replacement headers."""
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity
    import numpy as np
    from docling_core.types.doc.document import SectionHeaderItem
    import re
    import nltk
    from nltk.corpus import wordnet as wn
    from difflib import SequenceMatcher
    
    # Load spaCy model
    nlp = spacy.load("en_core_web_lg")
    
    # Extract query features
    q_nouns, q_verbs = set(), set()
    q_text = query
    qdoc = nlp(q_text)
    for tok in qdoc:
        if tok.is_stop or not tok.is_alpha:
            continue
        lemma = tok.lemma_.lower()
        if tok.pos_ in ("NOUN", "PROPN"):
            q_nouns.add(lemma)
        elif tok.pos_ in ("VERB",):
            q_verbs.add(lemma)
    
    # Expand synonyms
    q_syn_n, q_syn_v = set(), set()
    query_doc = nlp(q_text)
    query_vector = query_doc.vector

    def get_expanded_synonyms(word: str, pos: str, top_k: int = 8) -> Set[str]:
        synonyms = set()
        synsets = wn.synsets(word, pos=pos)
        for synset in synsets:
            if hasattr(synset, 'lemma_names'):
                for lemma in synset.lemma_names():
                    lemma_clean = lemma.replace("_", " ").lower()
                    if lemma_clean != word.lower():
                        synonyms.add(lemma_clean)

        candidate_synonyms = []
        for syn in synonyms:
            try:
                lemma_doc = nlp(syn)
                if lemma_doc.has_vector:
                    similarity = cosine_similarity([query_vector], [lemma_doc.vector])[0][0]
                    candidate_synonyms.append((syn, similarity))
            except:
                continue

        candidate_synonyms.sort(key=lambda x: x[1], reverse=True)
        top_synonyms = [syn for syn, sim in candidate_synonyms[:top_k] if sim > 0.2]  # Lower threshold

        if any(word in q_text.lower() for word in ['how', 'change', 'replace', 'install', 'remove']):
            maintenance_terms = ['maintenance', 'service', 'repair', 'replacement', 'inspection', 'check']
            for term in maintenance_terms:
                try:
                    term_doc = nlp(term)
                    if term_doc.has_vector:
                        similarity = cosine_similarity([query_vector], [term_doc.vector])[0][0]
                        if similarity > 0.3:
                            top_synonyms.append(term)
                except:
                    continue

        return set(top_synonyms)

    for n in q_nouns:
        expanded_syns = get_expanded_synonyms(n, wn.NOUN)
        q_syn_n.update(expanded_syns)

    for v in q_verbs:
        expanded_syns = get_expanded_synonyms(v, wn.VERB)
        q_syn_v.update(expanded_syns)

    print(f"Query: {query}")
    print(f"Nouns: {q_nouns}, Verbs: {q_verbs}")
    print(f"Synonym nouns: {q_syn_n}")
    print(f"Synonym verbs: {q_syn_v}")
    print()
    
    # Find and score Filter replacement headers
    texts = list(doc.texts)
    headers = [(i, t) for i, t in enumerate(texts) if isinstance(t, SectionHeaderItem) and getattr(t, "content_layer", None) and getattr(t, "content_layer").value == "body"]
    
    def slice_nodes(i: int):
        h = texts[i]
        lvl = getattr(h, "level", 3)
        nodes = []
        for j in range(i + 1, len(texts)):
            t = texts[j]
            if not (getattr(t, "content_layer", None) and getattr(t, "content_layer").value == "body"):
                continue
            if isinstance(t, SectionHeaderItem) and getattr(t, "level", 3) <= lvl:
                break
            nodes.append(t)
        return h, nodes
    
    def has_content(nodes):
        textish = 0
        structural = 0
        for n in nodes:
            name = n.__class__.__name__.lower()
            if hasattr(n, "text") and name != "sectionheaderitem":
                if re.search(r"\w", getattr(n, "text", "") or ""):
                    textish += 1
            if hasattr(n, "items") or hasattr(n, "num_rows") or hasattr(n, "caption"):
                structural += 1
        return textish >= 1 or structural >= 1
    
    def item_pages(obj):
        pages = set()
        prov = getattr(obj, "prov", None)
        if prov:
            for p in prov:
                pg = getattr(p, "page_no", None)
                if pg is not None:
                    try:
                        pages.add(int(pg))
                    except Exception:
                        pass
        pr = getattr(obj, "page_ref", None)
        if pr is not None and not pages:
            try:
                pages.add(int(pr) + 1)
            except Exception:
                pages.add(1)
        return pages
    
    def nodes_pages(nodes):
        ps = set()
        for n in nodes:
            ps |= item_pages(n)
        return ps
    
    # Process Filter replacement headers
    for i, h in headers:
        title = getattr(h, "text", "") or ""
        if 'filter replacement' in title.lower():
            print(f"SCORING: '{title}' at index {i}")
            
            # Check content
            _, nodes = slice_nodes(i)
            has_content_result = has_content(nodes)
            print(f"  Has content: {has_content_result}")
            if not has_content_result:
                print("  FILTERED OUT: No content")
                continue
                
            # Extract features
            h_nouns, h_verbs = set(), set()
            h_syn_n, h_syn_v = set(), set()
            
            hdoc = nlp(title)
            for tok in hdoc:
                if tok.is_stop or not tok.is_alpha:
                    continue
                lemma = tok.lemma_.lower()
                if tok.pos_ in ("NOUN", "PROPN"):
                    h_nouns.add(lemma)
                elif tok.pos_ in ("VERB",):
                    h_verbs.add(lemma)
            
            # Get header synonyms
            for n in h_nouns:
                synsets = wn.synsets(n, wn.NOUN)
                for synset in synsets:
                    if hasattr(synset, 'lemma_names'):
                        for lemma in synset.lemma_names():
                            lemma_clean = lemma.replace("_", " ").lower()
                            if lemma_clean != n.lower():
                                h_syn_n.add(lemma_clean)
            
            for v in h_verbs:
                synsets = wn.synsets(v, wn.VERB)
                for synset in synsets:
                    if hasattr(synset, 'lemma_names'):
                        for lemma in synset.lemma_names():
                            lemma_clean = lemma.replace("_", " ").lower()
                            if lemma_clean != v.lower():
                                h_syn_v.add(lemma_clean)
            
            print(f"  Header nouns: {h_nouns}, verbs: {h_verbs}")
            print(f"  Header syn nouns: {h_syn_n}, syn verbs: {h_syn_v}")
            
            # Calculate scores
            noun_cov = len(q_nouns & h_nouns) / max(1, len(q_nouns))
            verb_cov = len(q_verbs & h_verbs) / max(1, len(q_verbs))
            noun_syn_cov = len(q_syn_n & (h_nouns | h_syn_n)) / max(1, len(q_syn_n)) if q_syn_n else 0.0
            verb_syn_cov = len(q_syn_v & (h_verbs | h_syn_v)) / max(1, len(q_syn_v)) if q_syn_v else 0.0
            fuzzy = SequenceMatcher(None, q_text.lower(), title.lower()).ratio()
            
            print(f"  Scores - noun_cov: {noun_cov:.3f}, verb_cov: {verb_cov:.3f}, noun_syn_cov: {noun_syn_cov:.3f}, verb_syn_cov: {verb_syn_cov:.3f}, fuzzy: {fuzzy:.3f}")
            
            # Content similarity
            section_text = ""
            for node in nodes[:50]:
                if hasattr(node, 'text') and node.__class__.__name__.lower() != "sectionheaderitem":
                    text = getattr(node, 'text', '')
                    if text and re.search(r'\w', text):
                        section_text += text + " "
            
            if section_text.strip():
                section_text = " ".join(section_text.split()[:500])
                section_doc = nlp(section_text)
                if section_doc.has_vector and query_doc.has_vector:
                    content_sim = cosine_similarity([query_vector], [section_doc.vector])[0][0]
                    print(f"  Content similarity: {content_sim:.3f}")
                    
                    # Boost for procedural content
                    if any(word in q_text.lower() for word in ['how', 'change', 'replace', 'install', 'remove']):
                        procedure_keywords = ['procedure', 'step', 'remove', 'install', 'replace', 'disconnect', 'connect']
                        if any(keyword in section_text.lower() for keyword in procedure_keywords):
                            content_sim *= 1.2
                            print(f"  Boosted content similarity: {content_sim:.3f}")
                    
                    # Final score
                    score = (
                        0.15 * noun_cov +
                        0.10 * verb_cov +
                        0.10 * noun_syn_cov +
                        0.05 * verb_syn_cov +
                        0.05 * fuzzy +
                        0.50 * content_sim
                    )
                    
                    # Special boost
                    if "filter replacement" in title.lower():
                        score = 0.9
                        print(f"  SPECIAL BOOST applied: {score:.3f}")
                    
                    print(f"  FINAL SCORE: {score:.3f}")
                else:
                    print("  No content similarity (missing vectors)")
            else:
                print("  No section text extracted")
            print()

# Run the debug
debug_filter_replacement_scoring(doc)

Query: how to change cabin air filter
Nouns: {'air', 'filter', 'cabin'}, Verbs: {'change'}
Synonym nouns: {'breeze', 'atmosphere', 'tune', 'gentle wind', 'air travel', 'repair', 'line', 'replacement', 'check', 'melodic line', 'service', 'inspection', 'aviation', 'maintenance'}
Synonym verbs: {'alter', 'replacement', 'shift', 'transfer', 'check', 'exchange', 'service', 'vary', 'repair', 'inspection', 'modify', 'convert', 'maintenance', 'switch'}

SCORING: 'Filter replacement' at index 518
  Has content: True
  Header nouns: {'filter', 'replacement'}, verbs: set()
  Header syn nouns: {'refilling', 'substitute', 'replenishment', 'surrogate', 'renewal', 'successor', 'permutation', 'replacing', 'alternate', 'substitution', 'transposition', 'switch'}, syn verbs: set()
  Scores - noun_cov: 0.333, verb_cov: 0.000, noun_syn_cov: 0.071, verb_syn_cov: 0.000, fuzzy: 0.250
  Content similarity: 0.889
  Boosted content similarity: 1.067
  SPECIAL BOOST applied: 0.900
  FINAL SCORE: 0.900

SCORING: '

In [None]:
# Test the semantic similarity
query = "Where can I find the VIN?"
rank_and_save_best_section(doc, query)

In [None]:
import spacy
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from docling_core.types.doc.document import SectionHeaderItem

# Load spaCy transformer model for better semantic understanding
nlp = spacy.load("en_core_web_lg")

def match_headers_to_query(doc: DoclingDocument, query: str, top_n: int = 5):
    """
    Match headers in a DoclingDocument to a query using spaCy's transformer model for semantic similarity.
    
    Args:
        doc (DoclingDocument): The document to search.
        query (str): The query string.
        top_n (int): Number of top matching headers to return.
    
    Returns:
        List[dict]: List of dictionaries with header details and similarity scores.
    """
    # Extract all headers
    headers = []
    for item in doc.texts:
        if isinstance(item, SectionHeaderItem):
            page = None
            if hasattr(item, 'prov') and item.prov:
                for p in item.prov:
                    pg = getattr(p, 'page_no', None)
                    if pg is not None:
                        page = int(pg)
                        break
            headers.append({
                'text': item.text,
                'item': item,
                'page': page
            })
    
    if not headers:
        return []
    
    # Get query doc
    query_doc = nlp(query)
    
    # Get header docs
    header_docs = [nlp(h['text']) for h in headers]
    
    # Compute similarities using spaCy's similarity
    similarities = []
    for i, header in enumerate(headers):
        header_doc = header_docs[i]
        sim_score = query_doc.similarity(header_doc)
        similarities.append({
            'header_text': header['text'],
            'similarity_score': sim_score,
            'header_item': header['item'],
            'page': header['page']
        })
    
    # Sort by similarity
    similarities.sort(key=lambda x: x['similarity_score'], reverse=True)
    return similarities[:top_n]

# Example usage
query = "Where can I find the VIN?"
results = match_headers_to_query(doc, query)
for result in results:
    print(f"Score: {result['similarity_score']:.3f} | {result['header_text']} (page {result['page']})")

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


def compute_semantic_similarity(headers_list, query):
    """
    Compute semantic similarity between a list of headers and a query string using TF-IDF vectors.
    
    Args:
        headers_list: List of tuples (level, text, page, parent_h1) from find_headers_in_html or similar.
        query: The query string (phrase or word).
    
    Returns:
        List of dicts with 'header', 'cosine_similarity', and 'euclidean_distance'.
    """
    try:

        
        # Extract texts from headers
        texts = [text for _, text, _, _ in headers_list]
        texts.append(query)
        
        # Vectorize using TF-IDF
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(texts)
        
        # Query vector is the last one
        query_vec = tfidf_matrix[-1]
        
        similarities = []
        for i, header in enumerate(headers_list):
            header_vec = tfidf_matrix[i]
            
            # Cosine similarity
            cos_sim = cosine_similarity(query_vec, header_vec)[0][0]
            
            # Euclidean distance
            euclidean = np.linalg.norm(query_vec.toarray() - header_vec.toarray())
            
            similarities.append({
                'header': header,
                'cosine_similarity': cos_sim,
                'euclidean_distance': euclidean
            })
        
        return similarities
    except ImportError as e:
        print(f"Required libraries not available: {e}. Install scikit-learn and numpy.")
        return []

In [None]:
html = export_doc_html(doc)
replacement_headers = find_headers_in_html(doc, html, 'cabin air filter')
compute_semantic_similarity(replacement_headers, 'replace cabin air filter')

In [None]:
from docling_core.types.doc.document import SectionHeaderItem

def find_headers_with_word(doc, word):
    """Find all SectionHeaderItem that contain the given word in their text."""
    matches = []
    for text in doc.texts:
        if isinstance(text, SectionHeaderItem):
            if word.lower() in text.text.lower():
                matches.append(text)
    return matches

# Example usage
replacement_headers = find_headers_with_word(doc, 'replacement')
for header in replacement_headers:
    print(header)

In [None]:
# Parameters for header-only selection
query = "Where can I find the VIN?"  # set this per user request
print("Query:", query)

In [189]:
chunk = Path(r'data/temp_chunk_0-91_kona.json')
doc = DoclingDocument.load_from_json(chunk)

In [None]:
from docling_core.types.doc.document import SectionHeaderItem
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import hdbscan
import numpy as np
from scipy import sparse
from typing import List, Dict, Any

def extract_header_texts(doc) -> List[Dict[str, Any]]:
    """Extract all section header texts from a DoclingDocument with indices and pages."""
    headers = []
    for i, item in enumerate(doc.texts):
        if isinstance(item, SectionHeaderItem):
            page = None
            if hasattr(item, 'prov') and item.prov:
                for p in item.prov:
                    pg = getattr(p, 'page_no', None)
                    if pg is not None:
                        page = int(pg)
                        break
            headers.append({
                'text': item.text.strip(),
                'index': i,
                'page': page
            })
    return headers

In [None]:
def perform_lda_on_headers(header_data: List[Dict[str, Any]], n_topics: int = 10, max_iter: int = 10, random_state: int = 42) -> Dict[str, Any]:
    """Perform LDA topic modeling on header texts.
    
    Args:
        header_data: List of dicts with 'text', 'index', 'page'.
        n_topics: Number of topics to extract.
        max_iter: Maximum iterations for LDA.
        random_state: Random state for reproducibility.
    
    Returns:
        Dict with 'topics' (list of dicts with topic_id, top_words, top_headers), 'topic_distributions', and 'vectorizer'.
    """
    if not header_data:
        return {"topics": [], "topic_distributions": np.array([]), "vectorizer": None}
    
    header_texts = [h['text'] for h in header_data]
    
    # Vectorize
    vectorizer = TfidfVectorizer(stop_words='english', max_df=0.95, min_df=2)
    tfidf_matrix = vectorizer.fit_transform(header_texts)
    
    # Fit LDA
    lda = LatentDirichletAllocation(n_components=n_topics, max_iter=max_iter, random_state=random_state)
    topic_distributions = lda.fit_transform(tfidf_matrix)
    
    # Get top words per topic
    feature_names = vectorizer.get_feature_names_out()
    topics = []
    for topic_idx, topic in enumerate(lda.components_):
        top_words = [feature_names[i] for i in topic.argsort()[:-11:-1]]  # Top 10 words
        
        # Get top 5 headers for this topic
        topic_probs = topic_distributions[:, topic_idx]
        top_header_indices = topic_probs.argsort()[-5:][::-1]
        top_headers = []
        for idx in top_header_indices:
            header = header_data[idx]
            top_headers.append({
                'text': header['text'],
                'page': header['page'],
                'probability': topic_probs[idx]
            })
        
        topics.append({
            "topic_id": topic_idx, 
            "top_words": top_words,
            "top_headers": top_headers
        })
    
    return {
        "topics": topics,
        "topic_distributions": topic_distributions,
        "vectorizer": vectorizer,
        "lda_model": lda
    }



In [None]:
def perform_hdbscan_on_headers(header_data: List[Dict[str, Any]], min_cluster_size: int = 5, min_samples: int = 1) -> Dict[str, Any]:
    """Perform HDBSCAN clustering on header texts using TF-IDF vectors with filtering of generic terms.

    Args:
        header_data: List of dicts with 'text', 'index', 'page'.
        min_cluster_size: Minimum size of clusters.
        min_samples: Minimum samples in neighborhood.

    Returns:
        Dict with 'labels', 'probabilities', 'cluster_info', 'cluster_names', 'cluster_headers', and 'vectorizer'.
    """
    if not header_data:
        return {"labels": [], "probabilities": [], "cluster_info": {}, "cluster_names": {}, "cluster_headers": {}, "vectorizer": None}

    # Filter out generic/irrelevant headers
    generic_terms = {
        'information', 'caution', 'warning', 'note', 'notice', 'important',
        'the', 'illustration',
        'shape', 'differ', 'actual', 'may', 'from', 'the', 'and', 'or', 'but',
        'if', 'when', 'where', 'how', 'what', 'why', 'which', 'who', 'that',
        'this', 'these', 'those', 'here', 'there', 'then', 'now', 'always',
        'never', 'sometimes', 'often', 'usually', 'generally', 'specifically',
        'particularly', 'especially', 'mainly', 'primarily', 'basically', 'your'
    }

    filtered_header_data = []
    for header in header_data:
        text_lower = header['text'].lower().strip()

        # Skip headers that are just generic terms
        if text_lower in generic_terms:
            continue

        # Skip headers that contain mostly generic terms
        words = text_lower.split()
        if len(words) <= 2:  # Very short headers
            if any(word in generic_terms for word in words):
                continue

        # Skip headers that are too generic (contain only stop words or generic terms)
        import re
        meaningful_words = [re.sub(r'[^\w\s]', '', word) for word in words if re.sub(r'[^\w\s]', '', word) not in generic_terms and len(re.sub(r'[^\w\s]', '', word)) > 2]
        if len(meaningful_words) < 1:
            continue

        filtered_header_data.append(header)

    print(f"Filtered {len(header_data) - len(filtered_header_data)} generic headers. Remaining: {len(filtered_header_data)}")

    if not filtered_header_data:
        return {"labels": [], "probabilities": [], "cluster_info": {}, "cluster_names": {}, "cluster_headers": {}, "vectorizer": None}

    header_texts = [h['text'] for h in filtered_header_data]

    # Enhanced stop words for clustering
    custom_stop_words = [
        'information', 'caution', 'warning', 'note', 'notice', 'important',
        'the', 'and', 'or', 'but', 'if', 'when', 'where', 'how', 'what', 'why',
        'which', 'who', 'that', 'this', 'these', 'those', 'here', 'there',
        'then', 'now', 'always', 'never', 'sometimes', 'often', 'usually',
        'generally', 'specifically', 'particularly', 'especially', 'mainly',
        'primarily', 'basically', 'may', 'can', 'will', 'should', 'would',
        'could', 'might', 'must', 'shall', 'do', 'does', 'did', 'doing',
        'done', 'have', 'has', 'had', 'having', 'be', 'is', 'am', 'are',
        'was', 'were', 'being', 'been', 'to', 'of', 'in', 'on', 'at', 'by',
        'for', 'with', 'as', 'from', 'into', 'through', 'during', 'before',
        'after', 'above', 'below', 'between', 'among', 'within', 'without',
        'against', 'along', 'around', 'behind', 'beside', 'besides', 'beyond',
        'inside', 'outside', 'under', 'over', 'across', 'throughout', 'towards',
        'shape', 'differ', 'illustration', 'actual', 'your', 'warmers'
    ]

    # Vectorize with enhanced stop words
    vectorizer = TfidfVectorizer(
        stop_words='english',
        max_df=0.95,
        min_df=2,
        ngram_range=(1, 2),  # Include bigrams for better context
        token_pattern=r'(?u)\b[a-zA-Z]{3,}\b'  # Only words with 3+ characters
    )

    # Add custom stop words
    if vectorizer.stop_words:
        combined_stop_words = list(vectorizer.stop_words) + custom_stop_words
    else:
        combined_stop_words = custom_stop_words

    vectorizer.set_params(stop_words=combined_stop_words)

    tfidf_matrix = vectorizer.fit_transform(header_texts)

    # Convert to dense array for HDBSCAN
    if sparse.issparse(tfidf_matrix):
        tfidf_matrix = tfidf_matrix.toarray()

    # Fit HDBSCAN
    clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, min_samples=min_samples)
    labels = clusterer.fit_predict(tfidf_matrix)
    probabilities = clusterer.probabilities_

    # Cluster info
    unique_labels = set(labels)
    cluster_info = {}
    for label in unique_labels:
        if label == -1:
            cluster_info["noise"] = sum(labels == label)
        else:
            cluster_info[f"cluster_{label}"] = sum(labels == label)

    # Name clusters by top words (excluding generic terms)
    feature_names = vectorizer.get_feature_names_out()

    # Filter feature names to remove generic terms
    import re
    meaningful_features = [f for f in feature_names if re.sub(r'[^\w\s]', '', f.lower()) not in generic_terms and len(re.sub(r'[^\w\s]', '', f)) > 2]

    cluster_names = {}
    cluster_headers = {}
    for label in unique_labels:
        if label == -1:
            cluster_names["noise"] = "Noise/Unclustered"
            cluster_headers["noise"] = []
        else:
            # Get indices of headers in this cluster
            cluster_indices = [i for i, l in enumerate(labels) if l == label]
            cluster_headers_list = []
            if cluster_indices:
                # Average TF-IDF vectors for the cluster
                cluster_vectors = tfidf_matrix[cluster_indices]
                avg_vector = np.mean(cluster_vectors, axis=0)

                # Get top meaningful words
                # Map back to original feature indices
                feature_indices = [i for i, f in enumerate(feature_names) if f in meaningful_features]
                if feature_indices:
                    meaningful_scores = avg_vector[feature_indices]
                    top_indices = meaningful_scores.argsort()[-5:][::-1]
                    top_words = [meaningful_features[i] for i in top_indices]
                    cluster_names[f"cluster_{label}"] = ", ".join(top_words)
                else:
                    cluster_names[f"cluster_{label}"] = "Generic Cluster"

                # Get headers in cluster
                for idx in cluster_indices:
                    header = filtered_header_data[idx]
                    cluster_headers_list.append({
                        'text': header['text'],
                        'page': header['page'],
                        'probability': probabilities[idx] if idx < len(probabilities) else None
                    })
            else:
                cluster_names[f"cluster_{label}"] = "Empty Cluster"
                cluster_headers_list = []
            cluster_headers[f"cluster_{label}"] = cluster_headers_list

    return {
        "labels": labels,
        "probabilities": probabilities,
        "cluster_info": cluster_info,
        "cluster_names": cluster_names,
        "cluster_headers": cluster_headers,
        "vectorizer": vectorizer,
        "clusterer": clusterer,
        "filtered_headers": filtered_header_data
    }

In [249]:
# Test UMAP + HDBSCAN on entire document
print("=== TESTING UMAP + HDBSCAN ON ENTIRE DOCUMENT ===")

try:
    import umap
    from sklearn.feature_extraction.text import TfidfVectorizer
    import hdbscan

    # Extract all text chunks from the document
    all_texts = []
    for item in doc.texts:
        if hasattr(item, 'text') and item.__class__.__name__.lower() != "sectionheaderitem":
            text = getattr(item, 'text', '').strip()
            if len(text) > 50:  # Only substantial text chunks
                all_texts.append(text)

    print(f"Found {len(all_texts)} substantial text chunks")

    if len(all_texts) > 10:
        # Vectorize
        vectorizer = TfidfVectorizer(stop_words='english', max_df=0.9, min_df=2)
        tfidf_matrix = vectorizer.fit_transform(all_texts)

        print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")

        # UMAP for dimensionality reduction
        umap_reducer = umap.UMAP(n_neighbors=15, n_components=5, random_state=42)
        umap_embedding = umap_reducer.fit_transform(tfidf_matrix.toarray())

        print(f"UMAP embedding shape: {umap_embedding.shape}")

        # HDBSCAN clustering
        clusterer = hdbscan.HDBSCAN(min_cluster_size=5, min_samples=2)
        labels = clusterer.fit_predict(umap_embedding)

        # Analyze clusters
        unique_labels = set(labels)
        print(f"Found {len(unique_labels)} clusters (including noise)")

        # Find clusters related to filters
        filter_related_texts = []
        for i, (text, label) in enumerate(zip(all_texts, labels)):
            if 'filter' in text.lower() and label != -1:
                filter_related_texts.append((i, text[:100], label))

        print(f"Found {len(filter_related_texts)} filter-related text chunks in clusters")
        for i, text, label in filter_related_texts[:3]:
            print(f"  Cluster {label}: '{text}...'")

        print("✅ UMAP + HDBSCAN on entire document: SUCCESS")
    else:
        print("❌ Not enough text chunks for clustering")

except ImportError as e:
    print(f"❌ Missing dependencies: {e}")
    print("Install with: pip install umap-learn hdbscan")
except Exception as e:
    print(f"❌ UMAP + HDBSCAN failed: {e}")

=== TESTING UMAP + HDBSCAN ON ENTIRE DOCUMENT ===
❌ Missing dependencies: No module named 'umap'
Install with: pip install umap-learn hdbscan


In [None]:
# Test improved ranking with HDBSCAN
query = "Where can I find the VIN number?"
print(f"\nTesting improved ranking for query: '{query}'")
improved_results = rank_and_save_best_section_with_hdbscan(doc, query, top_n=5, hdbscan_results=hdbscan_results)
print("\nImproved results with HDBSCAN:")
for r in improved_results:
    print(f"Title: {r['title']}, Page: {r['first_doc_page']}, Score: {r['score']:.3f}, Cluster Sim: {r['cluster_sim']:.3f}")

In [None]:
def find_cluster_for_query(query: str, hdbscan_results: Dict[str, Any], header_data: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Find which cluster a query belongs to using semantic similarity with enhanced query processing.

    Args:
        query: The query string to classify.
        hdbscan_results: Results from perform_hdbscan_on_headers containing the trained model.
        header_data: The original header data used for clustering.

    Returns:
        Dict with 'predicted_cluster', 'cluster_name', 'cluster_headers', 'similarity_score', and 'all_cluster_similarities'.
    """
    if not hdbscan_results or not hdbscan_results.get("vectorizer"):
        return {"error": "No HDBSCAN results or vectorizer found"}

    # Enhanced query processing
    import spacy
    import nltk
    from nltk.corpus import wordnet as wn

    nlp = spacy.load("en_core_web_lg")

    # Extract key terms from query
    q_nouns = set()
    qdoc = nlp(query)
    for tok in qdoc:
        if tok.is_stop or not tok.is_alpha:
            continue
        lemma = tok.lemma_.lower()
        if tok.pos_ in ("NOUN", "PROPN"):
            q_nouns.add(lemma)

    # Expand query with synonyms
    expanded_terms = set()
    for noun in q_nouns:
        expanded_terms.add(noun)
        for synset in wn.synsets(noun, pos=wn.NOUN):
            for lemma in synset.lemma_names():
                expanded_terms.add(lemma.replace("_", " ").lower())

    # Create expanded query
    expanded_query = query + " " + " ".join(expanded_terms)
    print(f"Original query: '{query}'")
    print(f"Expanded query: '{expanded_query}'")

    # Use semantic similarity instead of TF-IDF
    cluster_names = hdbscan_results["cluster_names"]
    cluster_headers = hdbscan_results["cluster_headers"]

    # Get vector for expanded query
    query_doc = nlp(expanded_query)

    # Compute semantic similarities to cluster names
    similarities = {}
    for cluster_key, cluster_name in cluster_names.items():
        if cluster_key != "noise":
            cluster_doc = nlp(cluster_name)
            similarity = query_doc.similarity(cluster_doc)
            similarities[int(cluster_key.split("_")[1])] = similarity

    # Find the most similar cluster
    if similarities:
        best_cluster = max(similarities, key=similarities.get)
        best_similarity = similarities[best_cluster]
    else:
        best_cluster = -1
        best_similarity = 0.0

    # Get cluster info
    cluster_key = f"cluster_{best_cluster}" if best_cluster != -1 else "noise"
    cluster_name = cluster_names.get(cluster_key, "Unknown")
    headers_in_cluster = cluster_headers.get(cluster_key, [])

    # Sort similarities
    sorted_similarities = {cluster_names.get(f"cluster_{k}", f"cluster_{k}"): v for k, v in similarities.items()}
    sorted_similarities = dict(sorted(sorted_similarities.items(), key=lambda x: x[1], reverse=True))

    # Print formatted output
    print("== Testing Cluster Prediction ===")
    print(f"\nQuery: '{query}'")
    print(f"Predicted Cluster: {cluster_key} - {cluster_name}")
    print(f"Similarity Score: {best_similarity:.3f}")
    print(f"Headers in cluster: {len(headers_in_cluster)}")
    print("Top headers in cluster:")
    for header in headers_in_cluster[:5]:  # Show top 5 headers
        page = header.get('page', 'N/A')
        print(f"  - '{header['text']}' (page {page})")
    print("Similarities to other clusters:")
    for cluster, sim in list(sorted_similarities.items())[:5]:  # Show top 5 similarities
        print(f"  - {cluster}: {sim:.3f}")

    return {
        "predicted_cluster": cluster_key,
        "cluster_name": cluster_name,
        "cluster_headers": headers_in_cluster,
        "similarity_score": best_similarity,
        "all_cluster_similarities": sorted_similarities,
        "query": query,
        "expanded_query": expanded_query
    }

In [136]:
query = "where can I find the VIN number"
result = find_cluster_for_query(query, hdbscan_results, header_data)
print(f"Query belongs to cluster: {result['cluster_name']} (similarity: {result['similarity_score']:.3f})")

Original query: 'where can I find the VIN number'
Expanded query: 'where can I find the VIN number figure numeral turn vin act number bit telephone number issue phone number identification number routine'
== Testing Cluster Prediction ===

Query: 'where can I find the VIN number'
Predicted Cluster: cluster_23 - engine, number, vehicle, view, ventilation seats
Similarity Score: 0.636
Headers in cluster: 3
Top headers in cluster:
  - 'Engine' (page 30)
  - 'Vehicle Identification Number (VIN)' (page 37)
  - 'Engine Number' (page 38)
Similarities to other clusters:
  - engine, number, vehicle, view, ventilation seats: 0.636
  - system, securing child, securing, child restraint, child: 0.588
  - label, air, vehicle, view, ventilation seats: 0.579
  - belt use, use, seat belt, belt, seat: 0.558
  - overview, control, center, ventilation seats, view: 0.556
Query belongs to cluster: engine, number, vehicle, view, ventilation seats (similarity: 0.636)


In [None]:
from html import escape
from docling_core.types.doc.document import SectionHeaderItem, TextItem, PictureItem, ListItem
from docling_core.types.doc.base import ImageRefMode
import os

def get_section_text_content(doc: DoclingDocument, header_title: str, page_number: int) -> str:
    """
    Extract the text content of a section as plain text.

    Args:
        doc: The DoclingDocument
        header_title: The header text to find
        page_number: The page number where the header is located

    Returns:
        Plain text string containing the section content
    """
    from docling_core.types.doc.document import SectionHeaderItem, TextItem

    content_parts = []
    found_header = False
    current_level = None

    for item in doc.texts:
        if isinstance(item, SectionHeaderItem):
            # Check if this is our target header
            if not found_header and item.text.strip() == header_title.strip():
                found_header = True
                current_level = getattr(item, 'level', 3)
                continue  # Skip the header itself
            elif found_header:
                # Check if we've reached a header at the same or higher level
                item_level = getattr(item, 'level', 3)
                if item_level <= current_level:
                    break  # End of our section

        elif found_header and isinstance(item, TextItem):
            # Extract text content
            text_content = getattr(item, 'text', '').strip()
            if text_content:
                content_parts.append(text_content)

    return "\n\n".join(content_parts)

def get_section_html_content(doc: DoclingDocument, header_title: str, page_number: int) -> str:
    """
    Extract the HTML content of a section including text and images using Docling's export_to_html.

    Args:
        doc: The DoclingDocument
        header_title: The header text to find
        page_number: The page number where the header is located

    Returns:
        HTML string containing the section content
    """
    # Find the element indices for the section
    start_element = None
    end_element = None
    current_level = None

    for i, item in enumerate(doc.texts):
        if isinstance(item, SectionHeaderItem):
            if not start_element and item.text.strip() == header_title.strip():
                start_element = i
                current_level = getattr(item, 'level', 3)
                continue
            elif start_element is not None:
                item_level = getattr(item, 'level', 3)
                if item_level <= current_level:
                    end_element = i - 1
                    break

    if start_element is None:
        return f"<p>Section '{header_title}' not found.</p>"

    if end_element is None:
        end_element = len(doc.texts) - 1

    # Adjust to_element to ensure the last TextItem is included
    # Docling's export_to_html range seems to have issues with the last item
    to_element = min(end_element + 1, len(doc.texts) - 1)

    # Use Docling's export_to_html method directly
    try:
        html_content = doc.export_to_html(
            from_element=start_element,
            to_element=to_element,
            image_mode=ImageRefMode.EMBEDDED
        )
        
        # Return the complete HTML document as-is
        # This preserves Docling's CSS styles and formatting
        return html_content
            
    except Exception as e:
        return f"<p>Error extracting section: {str(e)}</p>"

def reconstruct_sections_from_results(doc: DoclingDocument, results: List[Dict[str, Any]]) -> str:
    """
    Reconstruct sections from search results as HTML using Docling's export_to_html with embedded images.

    Args:
        doc: The DoclingDocument
        results: List of result dictionaries from search

    Returns:
        HTML string with reconstructed sections
    """
    html_parts = ["<html><head><title>Reconstructed Sections</title></head><body>"]
    html_parts.append("<h1>Search Results</h1>")
    
    for i, result in enumerate(results, 1):
        title = result.get('title', '')
        page = result.get('first_doc_page', 'N/A')
        score = result.get('score', 0.0)
        
        html_parts.append(f"<h2>Section {i}: {escape(title)} (Page {page}, Score: {score:.3f})</h2>")
        
        # Get the full HTML for this section and extract only the body content
        section_html = get_section_html_content(doc, title, page)
        
        # Extract only the body content to avoid nested HTML tags
        body_start = section_html.find('<body>')
        body_end = section_html.find('</body>')
        if body_start != -1 and body_end != -1:
            section_body = section_html[body_start + 6:body_end]
        else:
            section_body = section_html
            
        html_parts.append(section_body)
        html_parts.append("<hr>")  # Separator between sections

    html_parts.append("</body></html>")
    return "".join(html_parts)

In [194]:
chunk = Path(r'data/temp_chunk_460-551_kona.json')
doc = DoclingDocument.load_from_json(chunk)

# Check available ImageRefMode options
from docling_core.types.doc.base import ImageRefMode

print("Available ImageRefMode options:")
for mode in ImageRefMode:
    print(f"  {mode.name}: {mode.value}")

# Try exporting with EMBEDDED mode to include images as base64
print("\nTrying to export with EMBEDDED image mode:")
try:
    doc.save_as_html("test_embedded_images.html", image_mode=ImageRefMode.EMBEDDED)
    print("Exported with EMBEDDED images successfully")
except Exception as e:
    print(f"Failed to export with EMBEDDED: {e}")

# Try with REFERENCED mode
print("\nTrying to export with REFERENCED image mode:")
try:
    doc.save_as_html("test_referenced_images.html", image_mode=ImageRefMode.REFERENCED, artifacts_dir="artifacts")
    print("Exported with REFERENCED images successfully")
except Exception as e:
    print(f"Failed to export with REFERENCED: {e}")

# Try exporting just the VIN section with embedded images
print("\nTrying to export VIN section with embedded images:")
try:
    doc.save_as_html("test_vin_embedded.html", from_element=273, to_element=281, image_mode=ImageRefMode.EMBEDDED)
    print("Exported VIN section with embedded images successfully")
except Exception as e:
    print(f"Failed to export VIN section: {e}")

Available ImageRefMode options:
  PLACEHOLDER: placeholder
  EMBEDDED: embedded
  REFERENCED: referenced

Trying to export with EMBEDDED image mode:
Exported with EMBEDDED images successfully

Trying to export with REFERENCED image mode:
Failed to export with REFERENCED: 'str' object has no attribute 'is_absolute'

Trying to export VIN section with embedded images:
Exported VIN section with embedded images successfully
Exported VIN section with embedded images successfully


In [206]:
# Test both functions
query = "How do I change the Cabin Air Filter"
print(f"Testing section reconstruction for query: '{query}'")

# Extract key terms from query for section boundary detection
import re
query_terms = re.findall(r'\b\w+\b', query.lower())
query_terms = [term for term in query_terms if len(term) > 2 and term not in ['where', 'are', 'the', 'what', 'how', 'when', 'why', 'which', 'can', 'you', 'tell', 'me', 'about']]
print(f"Extracted query terms: {query_terms}")

# Get search results
results = rank_and_save_best_section_with_hdbscan(doc, query, top_n=10, hdbscan_results=hdbscan_results)

print("\n=== PLAIN TEXT EXTRACTION ===")
for i, result in enumerate(results[:5], 1):  # Show top 5 sections
    title = result.get('title', '')
    page = result.get('first_doc_page', 'N/A')
    score = result.get('score', 0.0)

    print(f"\n--- Section {i}: {title} (Page {page}, Score: {score:.3f}) ---")
    section_text = get_section_text_content(doc, title, page)
    print(section_text[:1000] + "..." if len(section_text) > 1000 else section_text)

print("\n=== SIMPLE HTML EXPORT WITH EMBEDDED IMAGES ===")
# Use Docling's built-in HTML export with embedded images - much simpler!
if results:
    first_result = results[0]
    title = first_result.get('title', '')
    page = first_result.get('first_doc_page', 'N/A')
    header_item = first_result.get('header_item')

    if header_item:
        # Find the index of the header item
        try:
            start_element = doc.texts.index(header_item)
            print(f"Header '{title}' found at index {start_element}")

            # Find section boundaries (simple approach)
            end_element = start_element + 50  # Include next 50 elements as a reasonable section size

            # Create filename from query terms
            filename_terms = "_".join(query_terms[:2]) if len(query_terms) >= 2 else query_terms[0] if query_terms else "section"
            html_filename = f"{filename_terms}.html"

            print(f"Exporting section '{title}' (elements {start_element} to {end_element}) to '{html_filename}'...")

            # Use Docling's simple HTML export with embedded images - this handles everything!
            doc.save_as_html(html_filename, from_element=start_element, to_element=end_element, image_mode=ImageRefMode.EMBEDDED)

            print(f"✅ HTML with embedded images exported successfully to '{html_filename}'")

            # Verify the file was created and show a preview
            if os.path.exists(html_filename):
                file_size = os.path.getsize(html_filename)
                print(f"✅ File created: {file_size} bytes")

                # Show first few lines to verify it contains images
                with open(html_filename, 'r', encoding='utf-8') as f:
                    preview = f.read(1000)
                    if 'data:image/png;base64' in preview:
                        print("✅ Images are embedded as base64 in the HTML")
                    else:
                        print("ℹ️  No embedded images found in preview (may be further down)")

        except ValueError:
            print(f"❌ Header item not found in doc.texts")
        except Exception as e:
            print(f"❌ Failed to export HTML: {e}")
    else:
        print("❌ No header_item in results")
else:
    print("❌ No search results found")

Testing section reconstruction for query: 'How do I change the Cabin Air Filter'
Extracted query terms: ['change', 'cabin', 'air', 'filter']
Top headers:
 1. p  23 h1 score=0.400 | noun=0.67 verb=0.00 n_syn=0.94 v_syn=0.00 fuzz=0.52 clust=0.00 :: Air cleaner filter  [pages: 23]
 2. p  30 h1 score=0.369 | noun=0.00 verb=1.00 n_syn=0.00 v_syn=1.00 fuzz=0.38 clust=0.00 :: Changing coolant  [pages: 30]
 3. p  25 h1 score=0.262 | noun=0.33 verb=0.00 n_syn=0.88 v_syn=0.00 fuzz=0.25 clust=0.00 :: Air conditioning refrigerant  [pages: 25]
 4. p  48 h1 score=0.262 | noun=0.33 verb=0.00 n_syn=0.88 v_syn=0.00 fuzz=0.25 clust=0.00 :: Air pressure  [pages: 48]
 5. p  23 h1 score=0.162 | noun=0.33 verb=0.00 n_syn=0.06 v_syn=0.00 fuzz=0.39 clust=0.18 :: Engine oil and filter  [pages: 23]
 6. p  26 h1 score=0.154 | noun=0.33 verb=0.00 n_syn=0.06 v_syn=0.00 fuzz=0.57 clust=0.00 :: Checking the engine oil and filter  [pages: 26]
 7. p  23 h1 score=0.143 | noun=0.33 verb=0.00 n_syn=0.06 v_syn=0.00 fuzz=0

In [None]:
# Simple test of the ranking function
print("=== SIMPLE RANKING TEST ===")
try:
    test_results = rank_and_save_best_section_with_hdbscan(doc, "How do I change the Cabin Air Filter", top_n=10, hdbscan_results=hdbscan_results)
    print(f"Got {len(test_results)} results")
    for i, r in enumerate(test_results[:3], 1):
        print(f"{i}. {r['title']} (score: {r['score']:.3f})")
except Exception as e:
    print(f"Error calling ranking function: {e}")
    import traceback
    traceback.print_exc()

=== SIMPLE RANKING TEST ===
Top headers:
 1. p  23 h1 score=0.400 | noun=0.67 verb=0.00 n_syn=0.94 v_syn=0.00 fuzz=0.52 clust=0.00 :: Air cleaner filter  [pages: 23]
 2. p  30 h1 score=0.369 | noun=0.00 verb=1.00 n_syn=0.00 v_syn=1.00 fuzz=0.38 clust=0.00 :: Changing coolant  [pages: 30]
 3. p  25 h1 score=0.262 | noun=0.33 verb=0.00 n_syn=0.88 v_syn=0.00 fuzz=0.25 clust=0.00 :: Air conditioning refrigerant  [pages: 25]
Got 3 results
1. Air cleaner filter (score: 0.400)
2. Changing coolant (score: 0.369)
3. Air conditioning refrigerant (score: 0.262)


In [224]:
print(f"Got {len(test_results)} results")
for i, r in enumerate(test_results, 1):
    print(f"{i}. {r['title']} (score: {r['score']:.3f})")
    if 'debug_info' in r:
        debug = r['debug_info']
        print(f"   Debug: text_len={debug.get('section_text_length', 0)}, has_vectors={debug.get('has_vectors', False)}, content_sim={debug.get('content_sim_raw', 0):.3f}")
    print()

Got 3 results
1. Air cleaner filter (score: 0.400)

2. Changing coolant (score: 0.369)

3. Air conditioning refrigerant (score: 0.262)



In [229]:
print(f"test_results has {len(test_results)} items")
for i, r in enumerate(test_results, 1):
    print(f"{i}. {r['title']} (score: {r['score']:.3f}, content_sim: {r.get('content_sim', 0):.3f})")
    if 'debug_info' in r:
        debug = r['debug_info']
        print(f"   Debug: text_len={debug.get('section_text_length', 0)}, has_vectors={debug.get('has_vectors', False)}")
    print()

test_results has 3 items
1. Air cleaner filter (score: 0.400, content_sim: 0.000)

2. Changing coolant (score: 0.369, content_sim: 0.000)

3. Air conditioning refrigerant (score: 0.262, content_sim: 0.000)



In [230]:
# Try a completely different approach - simple direct test
print("=== DIRECT TEST ===")
print("This should definitely print")

# Test if we can access the function
try:
    print("Trying to access rank_and_save_best_section_with_hdbscan...")
    func = rank_and_save_best_section_with_hdbscan
    print(f"Function found: {func}")
except Exception as e:
    print(f"Error accessing function: {e}")

# Test basic operations
print(f"doc has {len(doc.texts)} texts")
print(f"Query: How do I change the Cabin Air Filter")

# Try calling the function with minimal parameters
try:
    print("Calling function with minimal params...")
    result = rank_and_save_best_section_with_hdbscan(doc, "test")
    print(f"Function returned {len(result)} results")
except Exception as e:
    print(f"Function call failed: {e}")
    import traceback
    traceback.print_exc()

=== DIRECT TEST ===
This should definitely print
Trying to access rank_and_save_best_section_with_hdbscan...
Function found: <function rank_and_save_best_section_with_hdbscan at 0x000002C911D4D4E0>
doc has 1407 texts
Query: How do I change the Cabin Air Filter
Calling function with minimal params...
Top headers:
 1. p  50 h1 score=0.022 | noun=0.00 verb=0.00 n_syn=0.00 v_syn=0.00 fuzz=0.44 clust=0.00 :: Tread  [pages: 50]
 2. p  50 h1 score=0.022 | noun=0.00 verb=0.00 n_syn=0.00 v_syn=0.00 fuzz=0.44 clust=0.00 :: UTQGS  [pages: 50]
 3. p  53 h1 score=0.022 | noun=0.00 verb=0.00 n_syn=0.00 v_syn=0.00 fuzz=0.44 clust=0.00 :: Fuses  [pages: 53]
 4. p  51 h1 score=0.021 | noun=0.00 verb=0.00 n_syn=0.00 v_syn=0.00 fuzz=0.43 clust=0.00 :: Snow tires  [pages: 51]
 5. p   2 h1 score=0.020 | noun=0.00 verb=0.00 n_syn=0.00 v_syn=0.00 fuzz=0.40 clust=0.00 :: NOTICE  [pages: 2]
Function returned 5 results


In [231]:
# Now test the cabin air filter query specifically
print("\n=== CABIN AIR FILTER TEST ===")
cabin_results = rank_and_save_best_section_with_hdbscan(doc, "How do I change the Cabin Air Filter", top_n=10)
print(f"Got {len(cabin_results)} results for cabin air filter query")

# Look for Filter replacement in results
for i, r in enumerate(cabin_results, 1):
    print(f"{i}. {r['title']} (score: {r['score']:.3f}, content_sim: {r.get('content_sim', 0):.3f})")
    if 'filter replacement' in r['title'].lower():
        print("  *** FOUND FILTER REPLACEMENT ***")
    print()


=== CABIN AIR FILTER TEST ===
Top headers:
 1. p  23 h1 score=0.400 | noun=0.67 verb=0.00 n_syn=0.94 v_syn=0.00 fuzz=0.52 clust=0.00 :: Air cleaner filter  [pages: 23]
 2. p  30 h1 score=0.369 | noun=0.00 verb=1.00 n_syn=0.00 v_syn=1.00 fuzz=0.38 clust=0.00 :: Changing coolant  [pages: 30]
 3. p  25 h1 score=0.262 | noun=0.33 verb=0.00 n_syn=0.88 v_syn=0.00 fuzz=0.25 clust=0.00 :: Air conditioning refrigerant  [pages: 25]
 4. p  48 h1 score=0.262 | noun=0.33 verb=0.00 n_syn=0.88 v_syn=0.00 fuzz=0.25 clust=0.00 :: Air pressure  [pages: 48]
 5. p  26 h1 score=0.154 | noun=0.33 verb=0.00 n_syn=0.06 v_syn=0.00 fuzz=0.57 clust=0.00 :: Checking the engine oil and filter  [pages: 26]
 6. p  23 h1 score=0.145 | noun=0.33 verb=0.00 n_syn=0.06 v_syn=0.00 fuzz=0.39 clust=0.00 :: Engine oil and filter  [pages: 23]
 7. p  23 h1 score=0.143 | noun=0.33 verb=0.00 n_syn=0.06 v_syn=0.00 fuzz=0.34 clust=0.00 :: Fuel filter  [pages: 23]
 8. p  33 h1 score=0.137 | noun=0.33 verb=0.00 n_syn=0.06 v_syn=0.0

In [None]:
# Simple RAG Implementation with Embeddings
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from typing import List, Dict, Any, Tuple
from docling_core.types.doc.document import SectionHeaderItem, TextItem, ListItem, PictureItem
from docling_core.types.doc.base import ImageRefMode
import spacy

def extract_all_sections_with_content(doc: DoclingDocument) -> List[Dict[str, Any]]:
    """
    Extract all sections with substantial content from a DoclingDocument.
    
    Returns:
        List of dictionaries with section information:
        - 'header': Section header text
        - 'content': Full section content text
        - 'start_idx': Starting index in doc.texts
        - 'end_idx': Ending index in doc.texts
        - 'level': Header level
        - 'pages': Set of page numbers
    """
    sections = []
    
    def is_body(item: Any) -> bool:
        """Check if item belongs to body content layer."""
        content_layer = getattr(item, "content_layer", None)
        return getattr(content_layer, "value", content_layer) == "body"
    
    def item_pages(item: Any) -> Set[int]:
        """Extract page numbers from an item."""
        pages = set()
        prov = getattr(item, "prov", None)
        if prov:
            for p in prov:
                pg = getattr(p, "page_no", None)
                if pg is not None:
                    try:
                        pages.add(int(pg))
                    except:
                        pass
        pr = getattr(item, "page_ref", None)
        if pr is not None and not pages:
            try:
                pages.add(int(pr) + 1)
            except:
                pages.add(1)
        return pages
    
    def has_content(nodes: List[Any]) -> bool:
        """Check if section nodes contain meaningful content."""
        textish = 0
        structural = 0
        for node in nodes:
            name = node.__class__.__name__.lower()
            if hasattr(node, "text") and name != "sectionheaderitem":
                if re.search(r"\w", getattr(node, "text", "") or ""):
                    textish += 1
            if hasattr(node, "items") or hasattr(node, "num_rows") or hasattr(node, "caption"):
                structural += 1
        return textish >= 1 or structural >= 1
    
    def slice_section(start_idx: int) -> Tuple[str, List[Any], int]:
        """Slice document to get section content under a header."""
        header_item = doc.texts[start_idx]
        header_text = getattr(header_item, "text", "") or ""
        level = getattr(header_item, "level", 3)
        
        nodes = []
        end_idx = start_idx
        
        for j in range(start_idx + 1, len(doc.texts)):
            item = doc.texts[j]
            if not is_body(item):
                continue
            if isinstance(item, SectionHeaderItem):
                item_level = getattr(item, "level", 3)
                if item_level <= level:
                    break
            nodes.append(item)
            end_idx = j
        
        return header_text, nodes, end_idx
    
    # Extract all sections
    i = 0
    while i < len(doc.texts):
        item = doc.texts[i]
        if isinstance(item, SectionHeaderItem) and is_body(item):
            header_text, nodes, end_idx = slice_section(i)
            
            if has_content(nodes):
                # Collect all text content
                content_parts = []
                all_pages = item_pages(item)
                
                for node in nodes:
                    all_pages |= item_pages(node)
                    if hasattr(node, "text") and node.__class__.__name__.lower() != "sectionheaderitem":
                        text = getattr(node, "text", "").strip()
                        if text:
                            content_parts.append(text)
                
                content = "\n\n".join(content_parts)
                
                if content.strip():  # Only include sections with actual content
                    sections.append({
                        'header': header_text,
                        'content': content,
                        'start_idx': i,
                        'end_idx': end_idx,
                        'level': getattr(item, "level", 3),
                        'pages': sorted(all_pages),
                        'header_item': item
                    })
            
            i = end_idx + 1
        else:
            i += 1
    
    print(f"Extracted {len(sections)} sections with content")
    return sections

def compute_section_embeddings(sections: List[Dict[str, Any]], nlp) -> List[Dict[str, Any]]:
    """
    Compute embeddings for each section using spaCy.
    
    Args:
        sections: List of section dictionaries
        nlp: spaCy language model
    
    Returns:
        Sections with 'embedding' field added
    """
    embedded_sections = []
    
    for section in sections:
        content = section['content']
        
        # Limit content length for performance (first 1000 words)
        words = content.split()[:1000]
        limited_content = " ".join(words)
        
        # Compute embedding
        doc = nlp(limited_content)
        if doc.has_vector:
            section_with_embedding = section.copy()
            section_with_embedding['embedding'] = doc.vector
            section_with_embedding['content_length'] = len(limited_content)
            embedded_sections.append(section_with_embedding)
        else:
            print(f"Warning: Could not compute embedding for section '{section['header'][:50]}...'")
    
    print(f"Computed embeddings for {len(embedded_sections)} sections")
    return embedded_sections

def simple_rag_search(sections: List[Dict[str, Any]], query: str, nlp, top_n: int = 5) -> List[Dict[str, Any]]:
    """
    Perform simple RAG search using cosine similarity on embeddings.
    
    Args:
        sections: List of sections with embeddings
        query: Search query
        nlp: spaCy language model
        top_n: Number of top results to return
    
    Returns:
        List of top matching sections with similarity scores
    """
    if not sections:
        return []
    
    # Compute query embedding
    query_doc = nlp(query)
    if not query_doc.has_vector:
        print("Warning: Could not compute query embedding")
        return []
    
    query_vector = query_doc.vector
    
    # Compute similarities
    results = []
    for section in sections:
        embedding = section.get('embedding')
        if embedding is not None:
            similarity = cosine_similarity([query_vector], [embedding])[0][0]
            result = section.copy()
            result['similarity_score'] = float(similarity)
            results.append(result)
    
    # Sort by similarity score
    results.sort(key=lambda x: x['similarity_score'], reverse=True)
    
    print(f"Found {len(results)} sections with embeddings")
    print(f"Top {min(top_n, len(results))} results:")
    for i, result in enumerate(results[:top_n], 1):
        print(f"{i}. {result['header']} (score: {result['similarity_score']:.3f})")
    
    return results[:top_n]



def run_simple_rag(doc: DoclingDocument, query: str, output_file: str = "rag_results.html", top_n: int = 5):
    """
    Run the complete simple RAG pipeline.
    
    Args:
        doc: DoclingDocument to search
        query: Search query
        output_file: Output HTML filename
        top_n: Number of top results to return
    """
    print(f"Running Simple RAG for query: '{query}'")
    
    # Load spaCy model
    nlp = spacy.load("en_core_web_lg")
    
    # Extract sections
    print("1. Extracting sections...")
    sections = extract_all_sections_with_content(doc)
    
    # Compute embeddings
    print("2. Computing embeddings...")
    embedded_sections = compute_section_embeddings(sections, nlp)
    
    # Perform search
    print("3. Performing search...")
    results = simple_rag_search(embedded_sections, query, nlp, top_n=top_n)
    
    # Save as HTML
    print("4. Saving results as HTML...")
    save_sections_as_html(results, output_file, doc)
    
    print(f"✅ Simple RAG completed! Results saved to {output_file}")
    
    return results

In [261]:
# Update the simple_rag_search function to use content + headers
def simple_rag_search_improved(sections_with_embeddings, query, nlp, top_n=5):
    """Improved search using both header and content for better semantic matching."""
    query_doc = nlp(query)
    query_vector = query_doc.vector

    results = []
    for section in sections_with_embeddings:
        header = section['header']
        content = section.get('content', '')

        # Combine header and content for better semantic matching
        combined_text = f"{header} {content}"
        if len(combined_text.strip()) < 10:  # Skip if too short
            continue

        try:
            combined_doc = nlp(combined_text)
            combined_vector = combined_doc.vector

            # Compute similarity
            similarity = cosine_similarity([query_vector], [combined_vector])[0][0]

            results.append({
                'header': header,
                'content': content[:500],  # Truncate for display
                'similarity_score': float(similarity),
                'pages': section.get('pages', []),
                'start_idx': section.get('start_idx'),
                'end_idx': section.get('end_idx')
            })
        except Exception as e:
            print(f"Warning: Could not process section '{header}': {e}")
            continue

    # Sort by similarity score
    results.sort(key=lambda x: x['similarity_score'], reverse=True)

    print(f"Top {top_n} results:")
    for i, result in enumerate(results[:top_n], 1):
        print(f"{i}. {result['header']} (Score: {result['similarity_score']:.3f})")

    return results[:top_n]

# Update the run_simple_rag function to use the improved search
def run_simple_rag_improved(doc: DoclingDocument, query: str, output_file: str = "rag_results.html", top_n: int = 5):
    """
    Run the complete improved simple RAG pipeline with content-based search and correct Docling HTML export.
    """
    print(f"Running Improved Simple RAG for query: '{query}'")

    # Load spaCy model
    nlp = spacy.load("en_core_web_lg")

    # Extract sections
    print("1. Extracting sections...")
    sections = extract_all_sections_with_content(doc)

    # Compute embeddings
    print("2. Computing embeddings...")
    embedded_sections = compute_section_embeddings(sections, nlp)

    # Perform improved search (header + content)
    print("3. Performing improved search (header + content)...")
    results = simple_rag_search_improved(embedded_sections, query, nlp, top_n=top_n)

    # Save as HTML using corrected function
    print("4. Saving results as HTML...")
    save_sections_as_html_corrected(results, output_file, doc)

    print(f"✅ Improved Simple RAG completed! Results saved to {output_file}")

    return results

# Test the improved system
print("Testing improved RAG system:")
improved_results = run_simple_rag_improved(concatenated, "cabin air filter replacement", "cabin_filter_improved.html", top_n=3)

Testing improved RAG system:
Running Improved Simple RAG for query: 'cabin air filter replacement'
1. Extracting sections...
Extracted 242 sections with content
2. Computing embeddings...
Computed embeddings for 228 sections
3. Performing improved search (header + content)...
Top 3 results:
1. Air Ventilation Seats (Score: 0.740)
2. Air Conditioning System (Score: 0.676)
3. Air Conditioner Compressor Label (Score: 0.674)
4. Saving results as HTML...
✅ Saved 3 sections to cabin_filter_improved.html using Docling's save_as_html with element ranges
✅ Improved Simple RAG completed! Results saved to cabin_filter_improved.html


In [262]:
# Let's debug why "Filter replacement" isn't showing up in top results
print("Checking if 'Filter replacement' sections are in embedded_sections:")
filter_sections_found = []
for section in embedded_sections:
    if 'filter replacement' in section['header'].lower():
        filter_sections_found.append(section)
        print(f"Found: {section['header']}")
        print(f"Content contains 'cabin': {'cabin' in section.get('content', '').lower()}")
        print(f"Content preview: {section.get('content', '')[:300]}...")
        print()

print(f"Total 'Filter replacement' sections found: {len(filter_sections_found)}")

# Try a more specific query that matches the content better
print("\nTesting with 'air cleaner filter replacement':")
specific_results = run_simple_rag_improved(concatenated, "air cleaner filter replacement", "air_cleaner_filter.html", top_n=5)

# Also try just "filter replacement" without "cabin"
print("\nTesting with just 'filter replacement':")
just_filter_results = run_simple_rag_improved(concatenated, "filter replacement", "just_filter.html", top_n=5)

Checking if 'Filter replacement' sections are in embedded_sections:
Found: Filter replacement
Content contains 'cabin': False
Content preview: Smartstream G1.6 T-GDi

2C_AirCleaner

Smartstream G2.0 ATKINSON

2C_AirCleaner_2

The air cleaner filter can be cleaned for inspection using compressed air.

Do not attempt to wash or to rinse it, as water will damage the filter.

If soiled, the air cleaner filter must be replaced.

Replace the fil...

Found: Filter replacement
Content contains 'cabin': True
Content preview: Open the glove box and remove the support rod (1).

2C_AirFilterReplacementProcedure

Press both sides of the glove box inward to release.

2C_AirFilterReplacementProcedure_2

Press and hold the lock on the right side of the cover.

2C_AirFilterReplacementProcedure_3

4.Pull out the cover.

Replace ...

Total 'Filter replacement' sections found: 2

Testing with 'air cleaner filter replacement':
Running Improved Simple RAG for query: 'air cleaner filter replacement'
1. Extra

In [263]:
# Let's manually check the filter replacement sections
print("Manual check of filter replacement sections:")
for i, section in enumerate(sections):
    if 'filter replacement' in section['header'].lower():
        print(f"Section {i}: {section['header']}")
        print(f"  Start idx: {section.get('start_idx')}")
        print(f"  End idx: {section.get('end_idx')}")
        print(f"  Content length: {len(section.get('content', ''))}")
        print(f"  In embedded_sections: {any(s['header'] == section['header'] and s.get('start_idx') == section.get('start_idx') for s in embedded_sections)}")
        print()

# Let's also check what happens during the search process
print("Testing manual similarity calculation for filter replacement sections:")
query = "filter replacement"
query_doc = nlp(query)
query_vector = query_doc.vector

for section in embedded_sections:
    if 'filter replacement' in section['header'].lower():
        header = section['header']
        content = section.get('content', '')
        combined_text = f"{header} {content}"

        try:
            combined_doc = nlp(combined_text)
            combined_vector = combined_doc.vector
            similarity = cosine_similarity([query_vector], [combined_vector])[0][0]
            print(f"'{header}' similarity to '{query}': {similarity:.3f}")
        except Exception as e:
            print(f"Error processing '{header}': {e}")

Manual check of filter replacement sections:
Section 95: Filter replacement
  Start idx: 518
  End idx: 533
  Content length: 629
  In embedded_sections: True

Section 99: Filter replacement
  Start idx: 547
  End idx: 558
  Content length: 372
  In embedded_sections: True

Testing manual similarity calculation for filter replacement sections:
'Filter replacement' similarity to 'filter replacement': 0.638
'Filter replacement' similarity to 'filter replacement': 0.526


In [264]:
# Let's check the actual content of the filter replacement sections
print("Content of Filter replacement sections:")
for i, section in enumerate(sections):
    if 'filter replacement' in section['header'].lower():
        print(f"\nSection {i}: {section['header']}")
        print(f"Content: {section['content']}")
        print(f"Contains 'cabin': {'cabin' in section['content'].lower()}")
        print(f"Contains 'air': {'air' in section['content'].lower()}")
        print("-" * 50)

# Let's also test the similarity for the full query "cabin air filter replacement"
print("\nTesting similarity for full query 'cabin air filter replacement':")
full_query = "cabin air filter replacement"
full_query_doc = nlp(full_query)
full_query_vector = full_query_doc.vector

for section in embedded_sections:
    if 'filter replacement' in section['header'].lower():
        header = section['header']
        content = section.get('content', '')
        combined_text = f"{header} {content}"

        try:
            combined_doc = nlp(combined_text)
            combined_vector = combined_doc.vector
            similarity = cosine_similarity([full_query_vector], [combined_vector])[0][0]
            print(f"'{header}' similarity to '{full_query}': {similarity:.3f}")
        except Exception as e:
            print(f"Error processing '{header}': {e}")

# Compare with the top results from earlier
print("\nComparing with top results from 'cabin air filter replacement' query:")
top_sections = ["Air Ventilation Seats", "Air Conditioning System", "Air Conditioner Compressor Label"]
for section_name in top_sections:
    for section in embedded_sections:
        if section['header'] == section_name:
            header = section['header']
            content = section.get('content', '')
            combined_text = f"{header} {content}"

            try:
                combined_doc = nlp(combined_text)
                combined_vector = combined_doc.vector
                similarity = cosine_similarity([full_query_vector], [combined_vector])[0][0]
                print(f"'{header}' similarity to '{full_query}': {similarity:.3f}")
            except Exception as e:
                print(f"Error processing '{header}': {e}")
            break

Content of Filter replacement sections:

Section 95: Filter replacement
Content: Smartstream G1.6 T-GDi

2C_AirCleaner

Smartstream G2.0 ATKINSON

2C_AirCleaner_2

The air cleaner filter can be cleaned for inspection using compressed air.

Do not attempt to wash or to rinse it, as water will damage the filter.

If soiled, the air cleaner filter must be replaced.

Replace the filter according to the Maintenance Schedule.

Pull down the air cleaner filter lever.

2C_AirCleanerReplacementProcedure

Pull up the air cleaner cover to open.

Replace the air cleaner filter.

2C_AirCleanerReplacementProcedure_3

4.Reassemble the air cleaner cover in the reverse order.

Check that the cover is firmly installed.
Contains 'cabin': False
Contains 'air': True
--------------------------------------------------

Section 99: Filter replacement
Content: Open the glove box and remove the support rod (1).

2C_AirFilterReplacementProcedure

Press both sides of the glove box inward to release.

2C_AirFilter

In [265]:
# The output got cut off. Let me complete the comparison
print("Completing the comparison with top results:")
full_query = "cabin air filter replacement"
full_query_doc = nlp(full_query)
full_query_vector = full_query_doc.vector

top_sections = ["Air Ventilation Seats", "Air Conditioning System", "Air Conditioner Compressor Label"]
for section_name in top_sections:
    for section in embedded_sections:
        if section['header'] == section_name:
            header = section['header']
            content = section.get('content', '')
            combined_text = f"{header} {content}"

            try:
                combined_doc = nlp(combined_text)
                combined_vector = combined_doc.vector
                similarity = cosine_similarity([full_query_vector], [combined_vector])[0][0]
                print(f"'{header}' similarity to '{full_query}': {similarity:.3f}")
            except Exception as e:
                print(f"Error processing '{header}': {e}")
            break

# Also check if the filter replacement sections are actually being returned in the search
print("\nRunning a fresh search to see all results:")
all_results = simple_rag_search_improved(embedded_sections, full_query, nlp, top_n=10)
print("\nChecking if 'Filter replacement' is in the results:")
for result in all_results:
    if 'filter replacement' in result['header'].lower():
        print(f"FOUND: {result['header']} with score {result['similarity_score']:.3f}")

Completing the comparison with top results:

Running a fresh search to see all results:
Top 10 results:
1. Air cleaner filter (Score: 0.850)
2. Filter replacement (Score: 0.765)
3. Smartstream G2.0 ATKINSON (Score: 0.754)
4. Engine Compartment (Score: 0.752)
5. Air conditioning refrigerant (Score: 0.746)
6. At least twice a year: (for example, every Spring and Autumn) (Score: 0.734)
7. Crankcase emission control system (Score: 0.732)
8. NOTICE (Score: 0.727)
9. Cooling system (Score: 0.717)
10. Vapor hose and fuel filler cap (Score: 0.711)

Checking if 'Filter replacement' is in the results:
FOUND: Filter replacement with score 0.765


In [267]:
# Let's check if there's a discrepancy in the sections being used
print("Checking section counts:")
print(f"Total sections: {len(sections)}")
print(f"Embedded sections: {len(embedded_sections)}")

# Check if all sections are being embedded
missing_sections = []
for section in sections:
    if section not in embedded_sections:
        # Check by header and start_idx
        found = False
        for emb_section in embedded_sections:
            if (emb_section['header'] == section['header'] and
                emb_section.get('start_idx') == section.get('start_idx')):
                found = True
                break
        if not found:
            missing_sections.append(section['header'])

print(f"Sections that failed embedding: {len(missing_sections)}")
if missing_sections:
    print("Missing sections:", missing_sections[:5])  # Show first 5

# Let's also run the exact same search that run_simple_rag_improved does
print("\nRunning the exact same pipeline as run_simple_rag_improved:")
# Re-extract sections
test_sections = extract_all_sections_with_content(concatenated)
print(f"Extracted sections: {len(test_sections)}")

# Re-compute embeddings
test_embedded_sections = compute_section_embeddings(test_sections, nlp)
print(f"Embedded sections: {len(test_embedded_sections)}")

# Run search
test_results = simple_rag_search_improved(test_embedded_sections, "cabin air filter replacement", nlp, top_n=5)
print("\nTop results from fresh pipeline:")
for i, result in enumerate(test_results, 1):
    print(f"{i}. {result['header']} (Score: {result['similarity_score']:.3f})")

Checking section counts:
Total sections: 290
Embedded sections: 289
Sections that failed embedding: 1
Missing sections: ['Tire replacement']

Running the exact same pipeline as run_simple_rag_improved:
Extracted 242 sections with content
Extracted sections: 242
Computed embeddings for 228 sections
Embedded sections: 228
Top 5 results:
1. Air Ventilation Seats (Score: 0.740)
2. Air Conditioning System (Score: 0.676)
3. Air Conditioner Compressor Label (Score: 0.674)
4. Front air ventilation seats (Score: 0.661)
5. NOTICE (Score: 0.656)

Top results from fresh pipeline:
1. Air Ventilation Seats (Score: 0.740)
2. Air Conditioning System (Score: 0.676)
3. Air Conditioner Compressor Label (Score: 0.674)
4. Front air ventilation seats (Score: 0.661)
5. NOTICE (Score: 0.656)


In [None]:
# Let's check which sections are being filtered out
print("Checking which sections are filtered out by extract_all_sections_with_content:")

# Get the extracted sections
extracted_sections = extract_all_sections_with_content(concatenated)
extracted_headers = {s['header'] for s in extracted_sections}

# Find sections that are in original but not in extracted
filtered_out = []
for section in sections:
    if section['header'] not in extracted_headers:
        filtered_out.append(section['header'])

print(f"Total sections: {len(sections)}")
print(f"Extracted sections: {len(extracted_sections)}")
print(f"Filtered out: {len(filtered_out)}")
print("\nFirst 10 filtered out sections:")
for header in filtered_out[:10]:
    print(f"  {header}")

# Check if Filter replacement is filtered out
filter_replacement_filtered = 'Filter replacement' in filtered_out
print(f"\n'Filter replacement' filtered out: {filter_replacement_filtered}")

if not filter_replacement_filtered:
    print("'Filter replacement' sections are being extracted correctly!")
else:
    print("'Filter replacement' sections are being filtered out - need to check the filtering criteria")

# Let's also check the content length of filtered sections
print("\nChecking content lengths of some sections:")
for section in sections[:5]:  # Check first 5
    print(f"'{section['header']}': {len(section.get('content', ''))} chars")

for section in sections[-5:]:  # Check last 5
    print(f"'{section['header']}': {len(section.get('content', ''))} chars")

In [268]:
# SIMPLIFIED APPROACH: Find continuous section range and use Docling API directly
def find_continuous_section_range(results: List[Dict[str, Any]], min_relevance_threshold: float = 0.6) -> Tuple[int, int]:
    """
    Find the longest continuous section range from relevant results.
    Only include results above the relevance threshold.
    """
    # Filter results by relevance
    relevant_results = [r for r in results if r['similarity_score'] >= min_relevance_threshold]

    if not relevant_results:
        # If no results above threshold, take the top result
        relevant_results = results[:1]

    # Find min start_idx and max end_idx
    start_indices = [r['start_idx'] for r in relevant_results if r.get('start_idx') is not None]
    end_indices = [r['end_idx'] for r in relevant_results if r.get('end_idx') is not None]

    if not start_indices or not end_indices:
        return None, None

    min_start = min(start_indices)
    max_end = max(end_indices)

    return min_start, max_end

def save_continuous_section_as_html(doc: DoclingDocument, start_idx: int, end_idx: int, filename: str) -> str:
    """
    Save a continuous section range as HTML using Docling's API directly.
    Returns the HTML content.
    """
    try:
        # Use Docling's save_as_html with the continuous range
        doc.save_as_html(filename,
                        from_element=start_idx,
                        to_element=min(end_idx + 1, len(doc.texts)),
                        image_mode=ImageRefMode.EMBEDDED)

        # Read the complete HTML file
        with open(filename, 'r', encoding='utf-8') as f:
            html_content = f.read()

        return html_content

    except Exception as e:
        print(f"Error exporting section {start_idx} to {end_idx}: {e}")
        return f"<div>Error: Could not export section {start_idx} to {end_idx}</div>"

def run_simple_rag_final(doc: DoclingDocument, query: str, output_file: str = "rag_results.html", top_n: int = 5, min_relevance: float = 0.6):
    """
    FINAL SIMPLE RAG: Find continuous section and export with Docling API.
    """
    print(f"Running Final Simple RAG for query: '{query}'")

    # Load spaCy model
    nlp = spacy.load("en_core_web_lg")

    # Extract sections
    print("1. Extracting sections...")
    sections = extract_all_sections_with_content(doc)

    # Compute embeddings
    print("2. Computing embeddings...")
    embedded_sections = compute_section_embeddings(sections, nlp)

    # Perform search
    print("3. Performing search...")
    results = simple_rag_search_improved(embedded_sections, query, nlp, top_n=top_n)

    # Find continuous range from relevant results
    print("4. Finding continuous section range...")
    start_idx, end_idx = find_continuous_section_range(results, min_relevance)

    if start_idx is None or end_idx is None:
        print("❌ No valid section range found")
        return results

    print(f"   Found range: elements {start_idx} to {end_idx}")

    # Export the continuous section using Docling API
    print("5. Exporting with Docling API...")
    temp_filename = f"temp_continuous_{start_idx}_{end_idx}.html"
    section_html = save_continuous_section_as_html(doc, start_idx, end_idx, temp_filename)

    # Create final HTML with metadata
    html_parts = ["""<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>Search Results</title>
    <style>
        body { font-family: Arial, sans-serif; margin: 40px; }
        .query { background: #f0f0f0; padding: 15px; border-radius: 5px; margin-bottom: 20px; }
        .results { margin-bottom: 20px; }
        .result { margin-bottom: 15px; padding: 10px; border: 1px solid #ddd; }
        .separator { border-top: 2px solid #eee; margin: 20px 0; }
    </style>
</head>
<body>
"""]

    html_parts.append(f"""
    <div class="query">
        <h2>Query: {escape(query)}</h2>
        <p>Found {len(results)} relevant sections, exported continuous range: elements {start_idx} to {end_idx}</p>
    </div>

    <div class="results">
        <h3>Top Results:</h3>""")

    for i, result in enumerate(results, 1):
        header = result['header']
        score = result.get('similarity_score', 0.0)
        pages = result.get('pages', [])
        r_start = result.get('start_idx')
        r_end = result.get('end_idx')

        html_parts.append(f"""
        <div class="result">
            <strong>#{i}: {escape(header)}</strong><br>
            Score: {score:.3f} | Pages: {', '.join(map(str, pages))} | Elements: {r_start} to {r_end}
        </div>""")

    html_parts.append("""
    </div>

    <div class="separator"></div>

    <h3>Content:</h3>
""")

    # Insert the Docling-generated HTML content
    html_parts.append(section_html)

    html_parts.append("""
</body>
</html>""")

    # Write the final HTML file
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(''.join(html_parts))

    print(f"✅ Final RAG completed! Results saved to {output_file}")

    return results

# Test the final simplified approach
print("Testing FINAL simplified RAG:")
final_results = run_simple_rag_final(concatenated, "cabin air filter replacement", "final_rag_results.html", top_n=3)

Testing FINAL simplified RAG:
Running Final Simple RAG for query: 'cabin air filter replacement'
1. Extracting sections...
Extracted 242 sections with content
2. Computing embeddings...
Computed embeddings for 228 sections
3. Performing search...
Top 3 results:
1. Air Ventilation Seats (Score: 0.740)
2. Air Conditioning System (Score: 0.676)
3. Air Conditioner Compressor Label (Score: 0.674)
4. Finding continuous section range...
   Found range: elements 240 to 646
5. Exporting with Docling API...
✅ Final RAG completed! Results saved to final_rag_results.html
