In [144]:
import json
import os
from typing import List, Dict
from tqdm import tqdm

import chromadb
from chromadb.config import Settings

from collections import defaultdict

from sentence_transformers import SentenceTransformer
import re
from bs4 import BeautifulSoup
from bs4.element import Tag
from FlagEmbedding import FlagReranker

In [145]:

with open('image_extracted_gpu.json', 'r', encoding = 'utf-8') as json_file:
    pages = json.load(json_file)

len(pages)


402

In [146]:
def sanitize_metadata(meta: dict):
    clean = {}
    for k, v in meta.items():
        if v is None:
            clean[k] = ""        # safest default
        elif isinstance(v, (str, int, float, bool)):
            clean[k] = v
        else:
            clean[k] = str(v)    # fallback (shouldn't happen)
    return clean


In [147]:
def flatten_page_blocks(blocks):
    """
    Returns:
    - embed_text: string used for embedding
    - html_tables: list of raw HTML tables (metadata only)
    """

    lines = []
    html_tables = []

    for block in blocks:
        btype = block.get("type")
        content = block.get("content")

        if not content:
            continue

        if btype == "page_number":
            lines.append(f"PAGE {content}")

        elif btype == "title":
            lines.append(f"TITLE: {content}")

        elif btype == "header":
            lines.append(f"HEADER: {content}")

        elif btype == "text":
            lines.append(content)

        elif btype == "list":
            lines.append(f"LIST: {content}")

        elif btype == "table_caption":
            lines.append(f"TABLE CAPTION: {content}")

        elif btype == "table":
            html_tables.append(content)

            soup = BeautifulSoup(content, "html.parser")
            for row in soup.find_all("tr"):
                cells = [c.get_text(strip=True) for c in row.find_all(["th", "td"])]
                if cells:
                    lines.append(" | ".join(cells))

    return "\n".join(lines), html_tables


In [148]:
page_docs = []
page_metas = []

for page in pages:
    blocks = page.get("blocks", [])
    image_name = page.get("image")

    embed_text, html_tables = flatten_page_blocks(blocks)

    if not embed_text.strip():
        continue

    page_no = next(
        (b["content"] for b in blocks if b.get("type") == "page_number"),
        ""
    )

    page_docs.append(embed_text)

    meta = {
        "page": page_no or "",
        "image": image_name or "",
        "type": "page",
        "html_tables": "\n\n".join(html_tables) if html_tables else ""
    }

    page_metas.append(sanitize_metadata(meta))


In [149]:
page_metas

[{'page': '', 'image': 'page_00001.png', 'type': 'page', 'html_tables': ''},
 {'page': 'ii', 'image': 'page_00002.png', 'type': 'page', 'html_tables': ''},
 {'page': 'iii', 'image': 'page_00003.png', 'type': 'page', 'html_tables': ''},
 {'page': 'iv', 'image': 'page_00004.png', 'type': 'page', 'html_tables': ''},
 {'page': 'V', 'image': 'page_00005.png', 'type': 'page', 'html_tables': ''},
 {'page': 'vi', 'image': 'page_00006.png', 'type': 'page', 'html_tables': ''},
 {'page': 'vii',
  'image': 'page_00007.png',
  'type': 'page',
  'html_tables': '<table><tr><td>Applicability</td><td>OSI seven layer</td><td colspan="6">Enhanced diagnostics services</td><td>WWH-OBD</td></tr><tr><td rowspan="7">Seven layer according to ISO/IEC 7498-1 and ISO/IEC 10731</td><td>Application (layer 7)</td><td colspan="6">ISO 14229-1, ISO 14229-3 UDSonCAN, ISO 14229-4 UDSonFR, ISO 14229-5 UDSonIP, ISO 14229-6 UDSonK-Line, ISO 14229-7 UDSonLIN, further standards</td><td>ISO 27145-3</td></tr><tr><td>Presentatio

In [150]:
embedder = SentenceTransformer("./bge-large-en")

In [151]:
import chromadb
from chromadb.config import Settings

client = chromadb.Client(
    Settings(
        persist_directory="./chroma_mineru_page_wise",
        anonymized_telemetry=False
    )
)

page_embeddings = embedder.encode(
    page_docs,                     
    show_progress_bar=True,
    normalize_embeddings=True
)

# page_col = client.create_collection("page_index")
page_col = client.get_collection("page_index")

page_col.add(
    documents=page_docs,
    embeddings=page_embeddings,
    ids=[f"text_{i}" for i in range(len(page_docs))],
    metadatas=page_metas           
)


Batches: 100%|████████████████████████████████████████████████████████████████████████| 13/13 [37:40<00:00, 173.85s/it]


In [154]:
def embed_query(query: str):
    return embedder.encode(
        f"Represent this sentence for searching relevant passages: {query}",
        normalize_embeddings=True
    )

query = "default sub function"
q_emb = embed_query(query)

page_hits = page_col.query(
    query_embeddings=[q_emb],
    n_results=20,
    include=["documents", "metadatas", "distances"]
)

In [155]:
def flatten_hits(hits, source_type):
    docs = []
    for doc_id, doc, meta, dist in zip(
        hits["ids"][0],          # ← IDs are HERE
        hits["documents"][0],
        hits["metadatas"][0],
        hits["distances"][0],
    ):
        docs.append({
            "id": doc_id,
            "content": doc,
            "metadata": meta,
            "distance": dist,
            "source": source_type
        })
    return docs


In [156]:
table_docs = flatten_hits(page_hits, "table")

In [157]:
reranker = FlagReranker(
    "./bge-reranker-v2-m3",
    use_fp16=True  # False if CPU-only
)


In [158]:
def rerank_results(query, retrieved_docs, reranker, top_k=5):
    pairs = [[query, d["content"]] for d in retrieved_docs]
    scores = reranker.compute_score(pairs, normalize=True)

    reranked = sorted(
        zip(retrieved_docs, scores),
        key=lambda x: x[1],
        reverse=True
    )
    return reranked[:top_k]


In [170]:
final_results[0]

({'id': 'text_45',
  'content': 'HEADER: ISO 14229-1:2013(E)\nTITLE: 9.2.2 Request message\nTITLE: 9.2.2.1 Request message definition\nTable 24 defines the request message.\nTABLE CAPTION: Table 24 - Request message definition\nA_Data byte | Parameter Name | Cvt | Byte Value | Mnemonic\n#1 | DiagnosticSessionControl Request SID | M | 0x10 | DSC\n#2 | sub-function = [ diagnosticSessionType ] | M | 0x00 - 0xFF | LEV_DS_\nTITLE: 9.2.2.2 Request message sub-function parameter $Level (LEV_) definition\nThe sub-function parameter diagnosticSessionType is used by the DiagnosticSessionControl service to select the specific behaviour of the server. Explanations and usage of the possible diagnostic sessions are detailed in Table 25.\nThe following sub-function values are specified (suppressPosRspMsgIndicationBit (bit 7) not shown).\nTABLE CAPTION: Table 25 - Request message sub-function parameter definition\nBit 6-0 | Description | Cvt | Mnemonic\n0x00 | ISOSAEReservedThis value is reserved by t

In [159]:
final_results = rerank_results(
    query,
    table_docs,
    reranker,
    top_k=5
)

top_k = 5
final_results = final_results[:top_k]

for doc, score in final_results:
    print(
        f"score={score:.4f} | source={doc['source']} | page={doc['metadata'].get('page')}"
    )




You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


score=0.7201 | source=table | page=39
score=0.3800 | source=table | page=200
score=0.3225 | source=table | page=143
score=0.3160 | source=table | page=206
score=0.2835 | source=table | page=383


In [160]:
final_results

[({'id': 'text_45',
   'content': 'HEADER: ISO 14229-1:2013(E)\nTITLE: 9.2.2 Request message\nTITLE: 9.2.2.1 Request message definition\nTable 24 defines the request message.\nTABLE CAPTION: Table 24 - Request message definition\nA_Data byte | Parameter Name | Cvt | Byte Value | Mnemonic\n#1 | DiagnosticSessionControl Request SID | M | 0x10 | DSC\n#2 | sub-function = [ diagnosticSessionType ] | M | 0x00 - 0xFF | LEV_DS_\nTITLE: 9.2.2.2 Request message sub-function parameter $Level (LEV_) definition\nThe sub-function parameter diagnosticSessionType is used by the DiagnosticSessionControl service to select the specific behaviour of the server. Explanations and usage of the possible diagnostic sessions are detailed in Table 25.\nThe following sub-function values are specified (suppressPosRspMsgIndicationBit (bit 7) not shown).\nTABLE CAPTION: Table 25 - Request message sub-function parameter definition\nBit 6-0 | Description | Cvt | Mnemonic\n0x00 | ISOSAEReservedThis value is reserved by

In [161]:
# eval_data = [

#     # ===============================
#     # Negative response / A_PDU
#     # ===============================
#     {
#         "query": "Negative response A_PDU parameters",
#         "relevant_ids": {"text_24"}   # Table 3 — Negative response A_PDU (page 18)
#     },
#     {
#         "query": "Target Address TA field in negative response",
#         "relevant_ids": {"text_24"}
#     },
#     {
#         "query": "Negative Response SID value",
#         "relevant_ids": {"text_24"}
#     },
#     {
#         "query": "responseCode parameter usage in negative response",
#         "relevant_ids": {"text_24"}
#     },

#     # ===============================
#     # Suppress Positive Response bit
#     # ===============================
#     {
#         "query": "suppress positive response indication bit meaning",
#         "relevant_ids": {"text_24"}
#     },
#     {
#         "query": "when server does not send positive response message",
#         "relevant_ids": {"text_24"}
#     },

#     # ===============================
#     # Server response rules
#     # ===============================
#     {
#         "query": "server response implementation rules",
#         "relevant_ids": {"text_24"}
#     },
#     {
#         "query": "conditions when server sends no response",
#         "relevant_ids": {"text_24"}
#     },

#     # ===============================
#     # Addressing schemes tables
#     # ===============================
#     {
#         "query": "server response behavior for physical addressing",
#         "relevant_ids": {"table_22"}  # page 22 table
#     },
#     {
#         "query": "functional addressing response rules",
#         "relevant_ids": {"table_24"}  # page 24 table
#     },
#     {
#         "query": "negative response when sub function not supported",
#         "relevant_ids": {"table_22"}
#     },

#     # ===============================
#     # Service identifiers
#     # ===============================
#     {
#         "query": "negative response service identifier value",
#         "relevant_ids": {"table_17"}  # SI ranges incl. 0x7F
#     },
#     {
#         "query": "service identifier ranges defined by ISO 14229",
#         "relevant_ids": {"table_17"}
#     },

#     # ===============================
#     # Supported NRC
#     # ===============================
#     {
#         "query": "supported negative response codes",
#         "relevant_ids": {"text_77"}   # Table 85
#     },
#     {
#         "query": "incorrect message length negative response code",
#         "relevant_ids": {"text_77"}
#     },

#     # ===============================
#     # Annex A – Global NRC
#     # ===============================
#     {
#         "query": "global negative response code definitions",
#         "relevant_ids": {"text_331"}  # Annex A.1
#     },
#     {
#         "query": "NRC 0x11 service not supported meaning",
#         "relevant_ids": {"text_331"}
#     },
#     {
#         "query": "difference between general reject and service not supported",
#         "relevant_ids": {"text_331"}
#     },

#     # ===============================
#     # OSI Layer Mapping
#     # ===============================
#     {
#         "query": "OSI layer mapping for UDS services",
#         "relevant_ids": {"table_7"}   # page vii table
#     },
#     {
#         "query": "which OSI layer does ISO 14229 define",
#         "relevant_ids": {"table_7"}
#     },

#     # ===============================
#     # Definitions
#     # ===============================
#     {
#         "query": "definition of diagnostic service",
#         "relevant_ids": {"text_20"}   # section 3.1.x
#     },
#     {
#         "query": "definition of diagnostic session",
#         "relevant_ids": {"text_20"}
#     },

#     # ===============================
#     # ClearDiagnosticInformation
#     # ===============================
#     {
#         "query": "ClearDiagnosticInformation supported negative response codes",
#         "relevant_ids": {"text_292"}  # Table 253
#     },
#     {
#         "query": "groupOfDTC parameter definition",
#         "relevant_ids": {"text_292"}
#     },

#     # ===============================
#     # SecurityAccess behavior
#     # ===============================
#     {
#         "query": "negative response during SecurityAccess sendKey",
#         "relevant_ids": {"text_369"}  # state table
#     },

#     # ===============================
#     # TransferData service
#     # ===============================
#     {
#         "query": "TransferData positive response message format",
#         "relevant_ids": {"text_388"}  # Table 405
#     }
# ]


In [162]:
# def recall_at_k(ranked_ids, relevant_ids, k):
#     return int(len(set(ranked_ids[:k]) & relevant_ids) > 0)


# def mrr_at_k(ranked_ids, relevant_ids, k):
#     for rank, doc_id in enumerate(ranked_ids[:k], start=1):
#         if doc_id in relevant_ids:
#             return 1.0 / rank
#     return 0.0


In [163]:
# def rerank_all(query, retrieved_docs, reranker):
#     pairs = [[query, d["content"]] for d in retrieved_docs]
#     scores = reranker.compute_score(pairs, normalize=True)

#     reranked = sorted(
#         zip(retrieved_docs, scores),
#         key=lambda x: x[1],
#         reverse=True
#     )
#     return reranked


In [164]:
# def evaluate_single_query(query, relevant_ids, k=10):
#     # ---- retrieve ----
#     q_emb = embed_query(query)

#     page_hits = page_col.query(
#         query_embeddings=[q_emb],
#         n_results=50,  # IMPORTANT: higher than k
#         include=["documents", "metadatas", "distances"]
#     )

#     docs = flatten_hits(page_hits, "page")

#     # ---- embedding ranking (original order) ----
#     embedding_ranked_ids = [d["id"] for d in docs]

#     # ---- reranker ranking (full rerank) ----
#     reranked = rerank_all(query, docs, reranker)
#     reranked_ids = [d["id"] for d, _ in reranked]

#     # ---- metrics ----
#     return {
#         "query": query,
#         "embedding": {
#             "recall@k": recall_at_k(embedding_ranked_ids, relevant_ids, k),
#             "mrr@k": mrr_at_k(embedding_ranked_ids, relevant_ids, k),
#         },
#         "reranker": {
#             "recall@k": recall_at_k(reranked_ids, relevant_ids, k),
#             "mrr@k": mrr_at_k(reranked_ids, relevant_ids, k),
#         }
#     }


In [165]:
# results = []

# for item in eval_data:
#     metrics = evaluate_single_query(
#         query=item["query"],
#         relevant_ids=item["relevant_ids"],
#         k=10
#     )
#     results.append(metrics)


In [166]:
# for r in results:
#     print("\nQuery:", r["query"])
#     print("Embedding → Recall@10:", r["embedding"]["recall@k"],
#           "MRR@10:", round(r["embedding"]["mrr@k"], 3))
#     print("Reranker  → Recall@10:", r["reranker"]["recall@k"],
#           "MRR@10:", round(r["reranker"]["mrr@k"], 3))
