In [1]:
import json
import os
from typing import List, Dict
from tqdm import tqdm

import chromadb
from chromadb.config import Settings

from collections import defaultdict

from sentence_transformers import SentenceTransformer
import re
from bs4 import BeautifulSoup
from bs4.element import Tag

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# Read the extraction results from disk; later cells iterate over `pages`
# and expect every entry to provide 'blocks' and 'image'.
with open('image_extracted_gpu.json', mode='r', encoding='utf-8') as fh:
    pages = json.loads(fh.read())

# Bare last expression so the notebook displays how many entries were loaded.
len(pages)


402

In [3]:
def flatten_text_chunk(chunk):
    """Render a text chunk as a single string, one ``TYPE: content`` line per block.

    Every entry of ``chunk["data_chunk"]`` contributes one line built from its
    upper-cased ``type`` followed by ``": "`` and its ``content``. Lines keep
    the order of the blocks and are joined with newlines.
    """
    rendered = []
    for block in chunk["data_chunk"]:
        label = block['type'].upper()
        rendered.append(f"{label}: {block['content']}")
    return "\n".join(rendered)


In [4]:
text_chunks = []
table_chunks = []

for page in pages:
    text_blocks = []
    table_blocks = []
    page_number = None
    has_table = False

    for block in page['blocks']:
        btype, content = block.get('type'), block.get('content')

        # Blocks without content carry nothing worth indexing.
        if not content:
            continue

        entry = {"type": btype, "content": content}

        if btype == 'page_number':
            # Remember the page number and keep the block in the text chunk too.
            page_number = content
            text_blocks.append(entry)
        elif btype in ('header', 'title', 'text', 'list'):
            text_blocks.append(entry)
        elif btype in ('table_caption', 'table'):
            # Captions are collected, but only an actual table marks the page
            # as having table content.
            if btype == 'table':
                has_table = True
            table_blocks.append(entry)
        # Any other block type is ignored.

    if text_blocks:
        text_chunks.append({
            "image_name": page["image"],
            "data_chunk": text_blocks,
        })

    if has_table:
        # Put the page number first so the table chunk can be tied back to its page.
        if page_number:
            table_blocks.insert(0, {"type": "page_number", "content": page_number})
        table_chunks.append({
            "image_name": page["image"],
            "data_chunk": table_blocks,
        })


In [5]:
def flatten_table_chunk(chunk):
    """Flatten a table chunk into embeddable text while keeping the raw HTML.

    Walks ``chunk["data_chunk"]`` in order and returns a dict with:

    * ``"embed_text"`` - newline-joined lines: ``Page: ...`` for page-number
      blocks, ``Table caption: ...`` for caption blocks, and one
      ``cell | cell | ...`` line for every table row that has at least one
      ``td``/``th`` cell.
    * ``"html"`` - the unmodified content string of every table block.

    Blocks of any other type are ignored.
    """
    text_lines = []
    html_sources = []

    for block in chunk["data_chunk"]:
        kind = block["type"]

        if kind == "page_number":
            text_lines.append(f"Page: {block['content']}")
            continue

        if kind == "table_caption":
            text_lines.append(f"Table caption: {block['content']}")
            continue

        if kind != "table":
            continue

        # Keep the original HTML alongside the flattened text.
        markup = block["content"]
        html_sources.append(markup)

        for tr in BeautifulSoup(markup, "html.parser").find_all("tr"):
            cell_texts = [cell.get_text(strip=True) for cell in tr.find_all(["td", "th"])]
            # Rows without any td/th cell produce no line.
            if cell_texts:
                text_lines.append(" | ".join(cell_texts))

    return {"embed_text": "\n".join(text_lines), "html": html_sources}


In [6]:
def extract_page_number(chunk):
    """Return the content of the first non-empty ``page_number`` block, or ``None``."""
    return next(
        (
            block["content"]
            for block in chunk["data_chunk"]
            if block["type"] == "page_number" and block["content"]
        ),
        None,
    )


In [7]:
def extract_html_tables(chunk):
    """Collect the content of every non-empty ``table`` block, in block order."""
    tables = []
    for block in chunk["data_chunk"]:
        if block["type"] != "table":
            continue
        html = block["content"]
        if html:
            tables.append(html)
    return tables


In [8]:
from collections import defaultdict

# Group the flattened text of every table chunk under its page number.
# Table chunks without a page number are left out.
page_to_tables = defaultdict(list)

for table_chunk in table_chunks:
    page_no = extract_page_number(table_chunk)
    if page_no is not None:
        page_to_tables[page_no].append(flatten_table_chunk(table_chunk)["embed_text"])


In [9]:
def serialize_html_tables(html_tables):
    """Join HTML table strings into one string, separated by blank lines.

    Returns an empty string when ``html_tables`` is empty or otherwise falsy.
    """
    return "\n\n".join(html_tables) if html_tables else ""


In [10]:
text_docs = []
text_metas = []

for text_chunk in text_chunks:
    page_no = extract_page_number(text_chunk)
    # Chunks without a page number are not turned into documents.
    if page_no is None:
        continue

    parts = [flatten_text_chunk(text_chunk)]

    # Append the flattened tables recorded for the same page, if any.
    if page_no in page_to_tables:
        parts.append("\n\nTABLE CONTEXT:\n")
        parts.append("\n".join(page_to_tables[page_no]))

    text_docs.append("".join(parts))
    text_metas.append({
        "page": page_no,
        "image": text_chunk["image_name"],
        "type": "text",
    })


In [11]:
table_docs = []
table_metas = []

for table_chunk in table_chunks:
    page_no = extract_page_number(table_chunk)
    # Table chunks without a page number are not turned into documents.
    if page_no is None:
        continue

    embed_text = flatten_table_chunk(table_chunk)["embed_text"]
    raw_html = serialize_html_tables(extract_html_tables(table_chunk))

    table_docs.append(embed_text)
    table_metas.append({
        "page": page_no,
        "image": table_chunk["image_name"],
        "type": "table",
        # Flattened text, identical to the document string appended above.
        "flattened_table_text": embed_text,
        # Raw HTML of all tables in the chunk, serialized into a single string.
        "html_table": raw_html,
    })


In [12]:
# text_chunks = []

# for page in pages:
#     blocks = page["blocks"]
#     current_chunk = []

#     for block in blocks:
#         if block["type"] in ("title", "header"):
#             if current_chunk:
#                 text_chunks.append({
#                     "content": " ".join(current_chunk),
#                     "page": page["image"],
#                     "type": "text"
#                 })
#                 current_chunk = []

#         if block["type"] in ("text", "list"):
#             if isinstance(block["content"], str):
#                 current_chunk.append(block["content"])

#         if block["type"] == "table":
#             # stop text chunk before table
#             if current_chunk:
#                 text_chunks.append({
#                     "content": " ".join(current_chunk),
#                     "page": page["image"],
#                     "type": "text"
#                 })
#                 current_chunk = []

#     if current_chunk:
#         text_chunks.append({
#             "content": " ".join(current_chunk),
#             "page": page["image"],
#             "type": "text"
#         })


In [None]:
# text_chunks[32]

In [14]:
# def row_to_facts(row, headers, table_caption):
#     facts = []

#     for col, val in row.items():
#         if val:
#             facts.append(f"{col}: {val}")

#     return (
#         f"In table '{table_caption}', "
#         f"the row contains the following values: "
#         f"{'; '.join(facts)}."
#     )


In [15]:
# table_chunks = []
# table_row_chunks = []

# for page in pages:
#     blocks = page["blocks"]

#     for i, block in enumerate(blocks):
#         if block["type"] != "table":
#             continue

#         # --- caption ---
#         caption = ""
#         for j in range(i - 1, max(i - 6, -1), -1):
#             if blocks[j]["type"] == "table_caption":
#                 caption = blocks[j]["content"]
#                 break
#         if not caption:
#             for j in range(i + 1, min(i + 6, len(blocks))):
#                 if blocks[j]["type"] == "table_caption":
#                     caption = blocks[j]["content"]
#                     break

#         # --- context above ---
#         context_text = []
#         for j in range(i - 1, max(i - 6, -1), -1):
#             if blocks[j]["type"] == "text":
#                 ct = blocks[j].get("content")
#                 if isinstance(ct, str) and ct.strip():
#                     context_text.append(ct.strip())
#         context_text = context_text[::-1]

#         # --- parse rows ---
#         rows = parse_table_html(block["content"])
#         row_facts = []

#         for r in rows:
#             if "A_PDU parameter" in r and "Cvt" in r:
#                 fact = (
#                     f"Parameter {r['A_PDU parameter']} "
#                     f"({r.get('Parameter Name','')}) "
#                     f"has Cvt value {r['Cvt']} "
#                     f"and byte value {r.get('Byte value','')} "
#                     f"in {caption}."
#                 )
#                 row_facts.append(fact)

#                 # row-level index (IMPORTANT)
#                 table_row_chunks.append({
#                     "content": fact,
#                     "page": page["image"],
#                     "parameter": r["A_PDU parameter"],
#                     "table": caption
#                 })

#         # table-level chunk (still useful)
#         embedding_text = f"""
#                             Context:
#                             {' '.join(context_text)}
                            
#                             Table Caption:
#                             {caption}
                            
#                             Row Facts:
#                             {' '.join(row_facts)}
#                             """

#         table_chunks.append({
#             "content": embedding_text.strip(),
#             "page": page["image"],
#             "type": "table"
#         })


In [18]:
# Load the sentence-embedding model from the local path "./bge-large-en";
# embed_query() below uses it to embed search queries.
embedder = SentenceTransformer("./bge-large-en")


In [28]:
# import chromadb
# from chromadb.config import Settings

# client = chromadb.Client(
#     Settings(
#         persist_directory="./chroma_mineru_fixed",
#         anonymized_telemetry=False
#     )
# )

# text_col = client.create_collection("text_index")
# table_col = client.create_collection("table_index")


In [29]:
# text_embeddings = embedder.encode(
#     text_docs,                     # list[str]
#     show_progress_bar=True,
#     normalize_embeddings=True
# )


In [30]:
# text_col.add(
#     documents=text_docs,
#     embeddings=text_embeddings,
#     ids=[f"text_{i}" for i in range(len(text_docs))],
#     metadatas=text_metas            # page, image, type
# )


In [31]:
# table_embeddings = embedder.encode(
#     table_docs,                    # flattened table text
#     show_progress_bar=True,
#     normalize_embeddings=True
# )


In [32]:
# table_col.add(
#     documents=table_docs,
#     embeddings=table_embeddings,
#     ids=[f"table_{i}" for i in range(len(table_docs))],
#     metadatas=table_metas           # page, image, type, html_tables
# )


In [33]:
import chromadb
from chromadb.config import Settings

# Create the Chroma client with "./chroma_mineru_fixed" as persist directory
# and anonymized telemetry disabled, then look up the two existing collections.
# NOTE(review): the cells that create and populate "text_index" and
# "table_index" are commented out above, so on a fresh kernel these
# get_collection() calls only succeed if both collections already exist in the
# store this client opens. Whether Settings(persist_directory=...) alone gives
# an on-disk store depends on the installed chromadb version - TODO confirm
# (newer releases document chromadb.PersistentClient(path=...) for this).
client = chromadb.Client(
    Settings(
        persist_directory="./chroma_mineru_fixed",
        anonymized_telemetry=False
    )
)

text_col = client.get_collection("text_index")
table_col = client.get_collection("table_index")


In [95]:
def embed_query(query: str):
    """Embed a search query with the module-level ``embedder``.

    The query is prefixed with the retrieval instruction
    "Represent this sentence for searching relevant passages: " before
    encoding, and the embedding is requested with normalization enabled.
    """
    prompt = f"Represent this sentence for searching relevant passages: {query}"
    return embedder.encode(prompt, normalize_embeddings=True)


In [106]:
# Example search query and its embedding; both are reused by the retrieval
# and reranking cells below.
query = "default sub function"
q_emb = embed_query(query)


In [107]:
# Run the same nearest-neighbour query (20 results, returning documents,
# metadata and distances) against the table collection first, then the text
# collection.
table_hits, text_hits = (
    collection.query(
        query_embeddings=[q_emb],
        n_results=20,
        include=["documents", "metadatas", "distances"],
    )
    for collection in (table_col, text_col)
)


In [108]:
# Display the raw query result for the table collection.
# NOTE(review): this dumps all 20 hits with full documents into the cell
# output; consider showing only a slice before sharing the notebook.
table_hits

{'ids': [['table_152',
   'table_146',
   'table_150',
   'table_137',
   'table_136',
   'table_140',
   'table_139',
   'table_156',
   'table_138',
   'table_9',
   'table_302',
   'table_102',
   'table_101',
   'table_8',
   'table_10',
   'table_172',
   'table_35',
   'table_4',
   'table_24',
   'table_187']],
 'embeddings': None,
 'documents': [['Page: 206\nTable caption: Table 279 - Response message definition - sub-function = reportDTCExtDataRecordByRecordNumber\nA_Data byte | Parameter Name | Cvt | Byte Value | Mnemonic\n#1 | ReadDTCInformation Response SID | M | 0x59 | RDTCIPR\n#2 | reportType = [ reportDTExtDataRecordByRecordNumber] | M | 0x16 | LEV_ RDTCEDRBR\n#3 | DTExtDataRecordNumber | M | 0x00 - 0xEF | DTCEDRN\n#4 | DTAndStatusRecord[#1 = [ | C1 | 0x00 - 0xFF | DTCASR_\n#5 | DTHighByte | C1 | 0x00 - 0xFF | DTCHB\n#6 | DTMiddleByte | C1 | 0x00 - 0xFF | DTCMB\n#7 | DTLowByte | C1 | 0x00 - 0xFF | DTCLB\n | statusOfDTC | C1 | 0x00 - 0xFF | SODTC\n#8 | DTExtDataRecord[#1 

In [109]:
def flatten_hits(hits, source_type):
    """Turn the result of a single-query search into a flat list of records.

    Reads the first entry of ``hits["documents"]``, ``hits["metadatas"]`` and
    ``hits["distances"]`` and zips them together. Each record is a dict with
    the keys ``content``, ``metadata``, ``distance`` and ``source``, where
    ``source`` is always ``source_type``.
    """
    documents = hits["documents"][0]
    metadatas = hits["metadatas"][0]
    distances = hits["distances"][0]

    return [
        {
            "content": content,
            "metadata": metadata,
            "distance": distance,
            "source": source_type,
        }
        for content, metadata, distance in zip(documents, metadatas, distances)
    ]


# Flatten both result sets and concatenate them, table hits first.
# NOTE(review): this rebinds `table_docs` and `text_docs`, which earlier cells
# build as lists of document strings for indexing; from here on they hold
# lists of hit dicts instead, so re-running the indexing cells afterwards
# would see the wrong data.
table_docs = flatten_hits(table_hits, "table")
text_docs = flatten_hits(text_hits, "text")

retrieved_docs = table_docs + text_docs   # total = 40


In [110]:
# Display the combined candidate list (table hits followed by text hits).
# NOTE(review): this prints every retrieved document in full; consider
# showing only a slice before sharing the notebook.
retrieved_docs

[{'content': 'Page: 206\nTable caption: Table 279 - Response message definition - sub-function = reportDTCExtDataRecordByRecordNumber\nA_Data byte | Parameter Name | Cvt | Byte Value | Mnemonic\n#1 | ReadDTCInformation Response SID | M | 0x59 | RDTCIPR\n#2 | reportType = [ reportDTExtDataRecordByRecordNumber] | M | 0x16 | LEV_ RDTCEDRBR\n#3 | DTExtDataRecordNumber | M | 0x00 - 0xEF | DTCEDRN\n#4 | DTAndStatusRecord[#1 = [ | C1 | 0x00 - 0xFF | DTCASR_\n#5 | DTHighByte | C1 | 0x00 - 0xFF | DTCHB\n#6 | DTMiddleByte | C1 | 0x00 - 0xFF | DTCMB\n#7 | DTLowByte | C1 | 0x00 - 0xFF | DTCLB\n | statusOfDTC | C1 | 0x00 - 0xFF | SODTC\n#8 | DTExtDataRecord[#1 = [ | C1 | 0x00 - 0xFF | DTCEDR_\n: | : | : | : | EDD11\n#8+(p-1) | : | C1 | 0x00 - 0xFF | EDD1p\n: | : | : | : | :\n#t | DTAndStatusRecord[#x = [ | C2 | 0x00 - 0xFF | DTCSSR\n#t+1 | DTHighByte | C2 | 0x00 - 0xFF | \n#t+2 | DTMiddleByte | C2 | 0x00 - 0xFF | \n#t+3 | DTLowByte | C2 | 0x00 - 0xFF | \n | statusOfDTC | C2 | 0x00 - 0xFF | \n#t+4 |

In [111]:
from FlagEmbedding import FlagReranker

# Load the reranker model from the local path "./bge-reranker-v2-m3" with
# half-precision enabled; it is passed to rerank_results() below.
reranker = FlagReranker(
    "./bge-reranker-v2-m3",
    use_fp16=True  # False if CPU-only
)


In [112]:
def rerank_results(query, retrieved_docs, reranker, top_k=5):
    """Re-score retrieved documents against the query and keep the best ones.

    Args:
        query: The search query; paired with every document's text.
        retrieved_docs: Sequence of dicts, each with a ``"content"`` entry
            holding the text to score.
        reranker: Object exposing ``compute_score(pairs, normalize=True)``.
        top_k: Maximum number of results to return.

    Returns:
        A list of ``(doc, score)`` tuples sorted by descending score, at most
        ``top_k`` long. Empty when ``retrieved_docs`` is empty.
    """
    if not retrieved_docs:
        # Nothing to score; do not call the model with an empty batch.
        return []

    pairs = [[query, d["content"]] for d in retrieved_docs]
    scores = reranker.compute_score(pairs, normalize=True)

    # A reranker may return a bare number instead of a one-element list when
    # it is given a single pair; normalise to a list so the zip below works
    # for any number of documents.
    try:
        scores = list(scores)
    except TypeError:
        scores = [scores]

    reranked = sorted(
        zip(retrieved_docs, scores),
        key=lambda item: item[1],
        reverse=True
    )
    return reranked[:top_k]


In [113]:
# Rerank all retrieved candidates against the query and keep the five
# highest-scoring (doc, score) pairs.
final_results = rerank_results(
    query,
    retrieved_docs,
    reranker,
    top_k=5
)


You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [114]:
# Print a one-line summary (score, source, page) for each reranked result.
# NOTE(review): rerank_results() was already called with top_k=5 above, so
# this re-slice does not change final_results as long as both values match.
top_k = 5
final_results = final_results[:top_k]

for doc, score in final_results:
    print(
        f"score={score:.4f} | source={doc['source']} | page={doc['metadata'].get('page')}"
    )


score=0.7064 | source=text | page=39
score=0.4324 | source=table | page=48
score=0.3622 | source=table | page=200
score=0.3570 | source=table | page=142
score=0.3562 | source=table | page=210


In [115]:
# Display the reranked (doc, score) pairs in full.
final_results

[({'content': 'HEADER: ISO 14229-1:2013(E)\nTITLE: 9.2.2 Request message\nTITLE: 9.2.2.1 Request message definition\nTEXT: Table 24 defines the request message.\nTITLE: 9.2.2.2 Request message sub-function parameter $Level (LEV_) definition\nTEXT: The sub-function parameter diagnosticSessionType is used by the DiagnosticSessionControl service to select the specific behaviour of the server. Explanations and usage of the possible diagnostic sessions are detailed in Table 25.\nTEXT: The following sub-function values are specified (suppressPosRspMsgIndicationBit (bit 7) not shown).\nPAGE_NUMBER: 39\n\nTABLE CONTEXT:\nPage: 39\nTable caption: Table 24 - Request message definition\nA_Data byte | Parameter Name | Cvt | Byte Value | Mnemonic\n#1 | DiagnosticSessionControl Request SID | M | 0x10 | DSC\n#2 | sub-function = [ diagnosticSessionType ] | M | 0x00 - 0xFF | LEV_DS_\nTable caption: Table 25 - Request message sub-function parameter definition\nBit 6-0 | Description | Cvt | Mnemonic\n0x0