In [1]:
# !pip install chromadb sentence-transformers tqdm

In [2]:
import json
import os
from typing import List, Dict
from tqdm import tqdm

import chromadb
from chromadb.config import Settings

from sentence_transformers import SentenceTransformer


In [3]:
JSON_PATH = "mineru_0_to_40.json"

# Read the whole file as UTF-8 text and parse it; later cells iterate over
# `pages` and read each entry's "blocks" list.
with open(JSON_PATH, "r", encoding="utf-8") as json_file:
    pages = json.loads(json_file.read())

# Last expression of the cell: shows how many entries were loaded.
len(pages)


40

In [4]:
def page_to_text(page: Dict) -> str:
    """Return the text of one page as a single newline-separated string.

    Walks ``page["blocks"]`` in order and keeps the ``"content"`` of every
    block whose content is present and truthy; blocks without content (or
    with empty content) are skipped.
    """
    kept = []
    for block in page["blocks"]:
        if block.get("content"):
            kept.append(block["content"])
    return "\n".join(kept)

def flatten_json_to_text(pages: List[Dict]) -> str:
    """Concatenate the text of every page into one newline-separated string.

    Each page is rendered with ``page_to_text`` and the results are joined
    in input order.
    """
    page_texts = [page_to_text(page) for page in pages]
    return "\n".join(page_texts)

def chunk_text(text, chunk_size=800, overlap=100):
    """Split ``text`` into overlapping fixed-size character windows.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters at the end of one chunk that are
            repeated at the start of the next; must be smaller than
            ``chunk_size``.

    Returns:
        A list of substrings of ``text`` in order. Empty input yields ``[]``.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size`` (the window would never advance).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    # Each window starts this many characters after the previous one.
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(text), step):
        end = start + chunk_size
        chunks.append(text[start:end])
        # Once a window reaches the end of the text, any further window would
        # only repeat a suffix of this one, so stop here.
        if end >= len(text):
            break
    return chunks

def is_table_page(page: Dict) -> bool:
    """Return True when at least one block of ``page`` has type ``"table"``.

    Blocks that carry no ``"type"`` key are treated as non-table blocks
    instead of raising ``KeyError``, matching the tolerant ``.get`` lookup
    that ``page_to_text`` uses for ``"content"``.
    """
    return any(block.get("type") == "table" for block in page["blocks"])


In [5]:
# def flatten_json_to_text(pages: List[Dict]) -> str:
#     texts = []
#     for page in pages:
#         for block in page["blocks"]:
#             if block.get("content"):
#                 texts.append(block["content"])
#     return "\n".join(texts)


# def chunk_text(text: str, chunk_size=800, overlap=100) -> List[str]:
#     chunks = []
#     start = 0
#     while start < len(text):
#         end = start + chunk_size
#         chunks.append(text[start:end])
#         start = end - overlap
#     return chunks


# def page_to_text(page: Dict) -> str:
#     texts = []
#     for block in page["blocks"]:
#         if block.get("content"):
#             texts.append(block["content"])
#     return "\n".join(texts)


# def is_table_page(page: Dict) -> bool:
#     return any(b["type"] == "table" for b in page["blocks"])


In [6]:
# Model used by `embed` below to vectorise documents for the A1-A3 collections.
MINILM_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
embedder = SentenceTransformer(MINILM_MODEL_NAME)

def embed(texts):
    """Encode ``texts`` with the notebook-level ``embedder``.

    The progress bar is switched off; the return value is whatever
    ``embedder.encode`` produces for the given list of strings.
    """
    vectors = embedder.encode(texts, show_progress_bar=False)
    return vectors


In [7]:
# Client shared by every collection created in this notebook; telemetry is
# switched off.
# NOTE(review): whether `persist_directory` alone makes this client write to
# disk depends on the installed chromadb version - confirm before relying on
# "./chroma_rnd" surviving a kernel restart.
chroma_settings = Settings(
    persist_directory="./chroma_rnd",
    anonymized_telemetry=False,
)
chroma_client = chromadb.Client(chroma_settings)


In [8]:


# A1: flatten every page into one string and index fixed-size character chunks.
collection_A1 = chroma_client.create_collection(name="A1_whole_json")

full_text = flatten_json_to_text(pages)
chunks = chunk_text(full_text, chunk_size=800)

# Ids are "A1_<chunk index>", aligned with `chunks` by position.
collection_A1.add(
    ids=[f"A1_{chunk_no}" for chunk_no, _ in enumerate(chunks)],
    documents=chunks,
    embeddings=embed(chunks),
)


In [9]:
# The stored vectors were produced by `embed`, so embed the query with the
# same helper. `query_texts` would instead run the collection's own embedding
# function, which is not guaranteed to be the model used for indexing.
collection_A1.query(
    query_embeddings=embed(["What does the Cvt column mean?"]),
    n_results=5
)


{'ids': [['A1_115', 'A1_22', 'A1_25', 'A1_51', 'A1_18']],
 'embeddings': None,
 'documents': [['ication starts with a description of the actions performed by the client and the server(s), which are specific to each service. The description of each service includes a table, which lists the parameters of its primitives: request/indication, response/confirmation for positive or negative result. All have the same structure:\nFor a given request/indication and response/confirmation A_PDU definition the presence of each parameter is described by one of the following convention (Cvt) values:\nTable 8 defines the A_PDU parameter conventions.\nTable 8-A_PDU parameter conventions\n<table><tr><td>Type</td><td>Name</td><td>Description</td></tr><tr><td>M</td><td>Mandatory</td><td>The parameter has to be present in the A_PDU.</td></tr><tr><td>C</td><td>Conditional</td><td>The parameter can be present ',
   'ient\nfunction that is part of the tester and that makes use of the diagnostic services\nNOTE

In [10]:
# A2: one document per page, no further splitting.
collection_A2 = chroma_client.create_collection(name="A2_page_wise")

docs = list(map(page_to_text, pages))

# Ids are "A2_page_<page index>", aligned with `docs` by position.
collection_A2.add(
    ids=[f"A2_page_{page_no}" for page_no, _ in enumerate(docs)],
    documents=docs,
    embeddings=embed(docs),
)


In [11]:
# The stored vectors were produced by `embed`, so embed the query with the
# same helper. `query_texts` would instead run the collection's own embedding
# function, which is not guaranteed to be the model used for indexing.
collection_A2.query(
    query_embeddings=embed(["What does the Cvt column mean?"]),
    n_results=5
)


{'ids': [['A2_page_39', 'A2_page_0', 'A2_page_9', 'A2_page_25', 'A2_page_8']],
 'embeddings': None,
 'documents': [['ISO 14229-1:2013(E)\nTable 12 - Request message sub-function parameter definition\n<table><tr><td>Bits 6 – 0</td><td>Description</td><td>Cvt</td><td>Mnemonic</td></tr><tr><td>xx</td><td>sub-function#1\ndescription of sub-function parameter#1</td><td>M/U</td><td>SUBFUNC1</td></tr><tr><td>:</td><td>:</td><td>:</td><td>:</td></tr><tr><td>xx</td><td>sub-function#m\ndescription of sub-function parameter#m</td><td>M/U</td><td>SUBFUNCm</td></tr></table>\nThe convention (Cvt) column in the Table 12 above shall be interpreted as defined in Table 13.\nTable 13 — SubFunction parameter conventions\n<table><tr><td>Type</td><td>Name</td><td>Description</td></tr><tr><td>M</td><td>Mandatory</td><td>The sub-function parameter has to be supported by the server in case the service is supported.</td></tr><tr><td>U</td><td>User option</td><td>The sub-function parameter may or may not be supp

In [12]:

# A3: pages containing a table are kept whole; all other pages are split into
# 600-character chunks. The resulting `docs` list is also what the later A4 and
# BM25 cells index.
collection_A3 = chroma_client.create_collection(name="A3_page_type_split")

docs, ids = [], []

for i, page in enumerate(pages):
    text = page_to_text(page)

    if is_table_page(page):
        # Whole page as a single document.
        docs.append(text)
        ids.append(f"A3_table_page_{i}")
        continue

    # Text-only page: one document per chunk, id carries page and chunk index.
    chunks = chunk_text(text, 600)
    docs.extend(chunks)
    ids.extend(f"A3_text_{i}_{j}" for j in range(len(chunks)))

collection_A3.add(
    ids=ids,
    documents=docs,
    embeddings=embed(docs),
)


In [13]:
# The stored vectors were produced by `embed`, so embed the query with the
# same helper. `query_texts` would instead run the collection's own embedding
# function, which is not guaranteed to be the model used for indexing.
collection_A3.query(
    query_embeddings=embed(["What does the Cvt column mean?"]),
    n_results=5
)


{'ids': [['A3_text_8_0',
   'A3_table_page_39',
   'A3_text_20_5',
   'A3_text_21_0',
   'A3_text_10_0']],
 'embeddings': None,
 'documents': [['INTERNATIONAL STANDARD\nISO 14229-1:2013(E)\nRoad vehicles — Unified diagnostic services (UDS) —\nPart 1: Specifications and requirements\n1 Scope\nThis part of ISO 14229 specifies data link independent requirements of diagnostic services, which allow a diagnostic tester (client) to control diagnostic functions in an on-vehicle Electronic Control Unit (ECU, server) such as an electronic fuel injection, automatic gear box, anti-lock braking system, etc. connected to a serial data link embedded in a road vehicle.\nIt specifies generic services, which allow the diagnostic tester (client) to stop or to ',
   'ISO 14229-1:2013(E)\nTable 12 - Request message sub-function parameter definition\n<table><tr><td>Bits 6 – 0</td><td>Description</td><td>Cvt</td><td>Mnemonic</td></tr><tr><td>xx</td><td>sub-function#1\ndescription of sub-function parameter#1<

In [14]:
# A4: index the current `docs` list once per embedding model so the models can
# be compared on identical documents.
EMBED_MODELS = {
    "minilm": "sentence-transformers/all-MiniLM-L6-v2",
    "mpnet": "sentence-transformers/all-mpnet-base-v2"
}

# Both dicts are keyed by the short model key from EMBED_MODELS.
collections_A4 = {}

embedders_A4 = {}

for key, model_name in EMBED_MODELS.items():
    # Deliberately NOT named `embedder`: rebinding that notebook-level name
    # here would make `embed()` silently use the last model loaded in this
    # loop for every later call.
    a4_model = SentenceTransformer(model_name)
    embedders_A4[key] = a4_model

    embeddings = a4_model.encode(docs, show_progress_bar=True)

    collection = chroma_client.create_collection(name=f"A4_{key}")

    # Ids are "<model key>_<doc index>", aligned with `docs` by position.
    collection.add(
        documents=docs,
        embeddings=embeddings,
        ids=[f"{key}_{i}" for i in range(len(docs))]
    )

    collections_A4[key] = collection




Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

In [16]:
def query_a4(query, model="minilm", k=5):
    """Query one of the A4 collections with its matching embedding model.

    Args:
        query: The query string.
        model: Key into ``embedders_A4`` / ``collections_A4``; an unknown key
            raises ``KeyError``.
        k: Number of results to request.

    Returns:
        The raw result of the collection's ``query`` call.
    """
    encoder, store = embedders_A4[model], collections_A4[model]
    return store.query(
        query_embeddings=encoder.encode([query]),
        n_results=k,
    )


In [17]:
res = query_a4("How is the sub-function byte value calculated?", model="mpnet", k=5)

# Preview only the first 150 characters of each returned document.
for hit in res["documents"][0]:
    print(hit[:150], "\n")


ISO 14229-1:2013(E)
Table 12 - Request message sub-function parameter definition
<table><tr><td>Bits 6 – 0</td><td>Description</td><td>Cvt</td><td>Mne 

ISO 14229-1:2013(E)
7.3.2 SI, Service Identifier
Type: 1 byte unsigned integer value
Range: \(0 \times 00 - 0 \times 00\) according to definitions in  

ISO 14229-1:2013(E)
NOTE The addressing information is shown in the table above for definition purpose. Further service request/indication definitions 

ISO 14229-1:2013(E)
Table 5 — Functionally addressed request message with sub-function parameter and server response behaviour
<table><tr><td rowspan= 

sionMode parameter definitions 351
C.5 Coding of UDS version number 352
Annex D (normative) Stored data transmission functional unit data-parameter de 



In [18]:
res = query_a4("How is the sub-function byte value calculated?", model="minilm", k=5)

# Preview only the first 150 characters of each returned document.
for hit in res["documents"][0]:
    print(hit[:150], "\n")


ISO 14229-1:2013(E)
Table 12 - Request message sub-function parameter definition
<table><tr><td>Bits 6 – 0</td><td>Description</td><td>Cvt</td><td>Mne 

ISO 14229-1:2013(E)
NOTE The addressing information is shown in the table above for definition purpose. Further service request/indication definitions 

 (and service indications) sent by a client on the main network, A_AE represents the remote server identifier (remote target address) for the server t 

ts identified by the value of the first byte of the A_PCI parameter. For all service requests and for service responses with first byte unequal to 0x7 

ISO 14229-1:2013(E)
6.4.1.3 A_SA, Application layer source address
Type: 2 byte unsigned integer value
Range: 0x0000 - 0xFFFF
Description:
The paramet 



In [24]:
def dense_retrieve(query, k=5, model="mpnet"):
    """Return the top-``k`` documents for ``query`` from an A4 collection.

    Args:
        query: The query string.
        k: Number of documents to request.
        model: Key into ``embedders_A4`` / ``collections_A4`` selecting which
            embedding model and collection to use. Defaults to ``"mpnet"``,
            the model this function previously had hard-coded.

    Returns:
        The list of document strings for the single query, in the order the
        collection returned them.
    """
    encoder = embedders_A4[model]
    store = collections_A4[model]

    hits = store.query(
        query_embeddings=encoder.encode([query]),
        n_results=k
    )

    # One query was sent, so the documents sit in the first (only) result row.
    return hits["documents"][0]


In [25]:
# Dense-only retrieval for the sample question; the cell displays the returned list.
dense_retrieve("How is the sub-function byte value calculated?", k=5)


['ISO 14229-1:2013(E)\nTable 12 - Request message sub-function parameter definition\n<table><tr><td>Bits 6 – 0</td><td>Description</td><td>Cvt</td><td>Mnemonic</td></tr><tr><td>xx</td><td>sub-function#1\ndescription of sub-function parameter#1</td><td>M/U</td><td>SUBFUNC1</td></tr><tr><td>:</td><td>:</td><td>:</td><td>:</td></tr><tr><td>xx</td><td>sub-function#m\ndescription of sub-function parameter#m</td><td>M/U</td><td>SUBFUNCm</td></tr></table>\nThe convention (Cvt) column in the Table 12 above shall be interpreted as defined in Table 13.\nTable 13 — SubFunction parameter conventions\n<table><tr><td>Type</td><td>Name</td><td>Description</td></tr><tr><td>M</td><td>Mandatory</td><td>The sub-function parameter has to be supported by the server in case the service is supported.</td></tr><tr><td>U</td><td>User option</td><td>The sub-function parameter may or may not be supported by the server, depending on the usage of the service.</td></tr></table>\nThe complete sub-function parameter 

In [20]:
!pip install rank-bm25

Collecting rank-bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank-bm25
Successfully installed rank-bm25-0.2.2



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [28]:
from rank_bm25 import BM25Okapi
import numpy as np
# Lexical index over the same `docs` list: each document is lower-cased and
# split on whitespace; `bm25_retrieve` tokenises queries the same way.
tokenized_docs = [doc_text.lower().split() for doc_text in docs]
bm25 = BM25Okapi(tokenized_docs)


In [29]:
def bm25_retrieve(query, k=5):
    """Return the ``k`` highest-scoring documents for ``query`` under BM25.

    The query is lower-cased and split on whitespace, scored against every
    entry of ``docs`` via the notebook-level ``bm25`` index, and the documents
    are returned from highest to lowest score.
    """
    query_tokens = query.lower().split()
    # argsort is ascending, so reverse it to put the best score first.
    ranking = np.argsort(bm25.get_scores(query_tokens))[::-1]
    return [docs[idx] for idx in ranking[:k]]


In [30]:
# BM25-only retrieval for a keyword-style query; the cell displays the returned list.
bm25_retrieve("Table 14 sub-function byte value", k=5)


['ISO 14229-1:2013(E)\nTable 12 - Request message sub-function parameter definition\n<table><tr><td>Bits 6 – 0</td><td>Description</td><td>Cvt</td><td>Mnemonic</td></tr><tr><td>xx</td><td>sub-function#1\ndescription of sub-function parameter#1</td><td>M/U</td><td>SUBFUNC1</td></tr><tr><td>:</td><td>:</td><td>:</td><td>:</td></tr><tr><td>xx</td><td>sub-function#m\ndescription of sub-function parameter#m</td><td>M/U</td><td>SUBFUNCm</td></tr></table>\nThe convention (Cvt) column in the Table 12 above shall be interpreted as defined in Table 13.\nTable 13 — SubFunction parameter conventions\n<table><tr><td>Type</td><td>Name</td><td>Description</td></tr><tr><td>M</td><td>Mandatory</td><td>The sub-function parameter has to be supported by the server in case the service is supported.</td></tr><tr><td>U</td><td>User option</td><td>The sub-function parameter may or may not be supported by the server, depending on the usage of the service.</td></tr></table>\nThe complete sub-function parameter 

In [31]:
def hybrid_retrieve(query, k_dense=5, k_bm25=5):
    """Merge dense and BM25 results for ``query`` without duplicates.

    Dense hits come first, followed by BM25 hits; a document that appears
    more than once keeps only its first position.
    """
    dense_hits = dense_retrieve(query, k_dense)
    lexical_hits = bm25_retrieve(query, k_bm25)

    # dict keys preserve insertion order, giving an order-preserving de-dup.
    return list(dict.fromkeys(dense_hits + lexical_hits))


In [32]:
# Hybrid (dense + BM25) retrieval; the cell displays the merged list.
hybrid_retrieve("Explain Table 14 calculation of sub-function byte value", k_dense=5, k_bm25=5)


['ISO 14229-1:2013(E)\nTable 12 - Request message sub-function parameter definition\n<table><tr><td>Bits 6 – 0</td><td>Description</td><td>Cvt</td><td>Mnemonic</td></tr><tr><td>xx</td><td>sub-function#1\ndescription of sub-function parameter#1</td><td>M/U</td><td>SUBFUNC1</td></tr><tr><td>:</td><td>:</td><td>:</td><td>:</td></tr><tr><td>xx</td><td>sub-function#m\ndescription of sub-function parameter#m</td><td>M/U</td><td>SUBFUNCm</td></tr></table>\nThe convention (Cvt) column in the Table 12 above shall be interpreted as defined in Table 13.\nTable 13 — SubFunction parameter conventions\n<table><tr><td>Type</td><td>Name</td><td>Description</td></tr><tr><td>M</td><td>Mandatory</td><td>The sub-function parameter has to be supported by the server in case the service is supported.</td></tr><tr><td>U</td><td>User option</td><td>The sub-function parameter may or may not be supported by the server, depending on the usage of the service.</td></tr></table>\nThe complete sub-function parameter 

In [33]:
def compare_retrievals(query, k=5):
    """Print a 200-character preview of each hit from all three retrievers.

    Sections are printed in the order Dense, BM25, Hybrid; each retriever is
    called only after its section header has been printed. The hybrid
    retriever is asked for ``k`` dense and ``k`` BM25 candidates.
    """
    sections = (
        ("\n=== Dense ===", lambda: dense_retrieve(query, k)),
        ("\n=== BM25 ===", lambda: bm25_retrieve(query, k)),
        ("\n=== Hybrid ===", lambda: hybrid_retrieve(query, k, k)),
    )

    for header, fetch in sections:
        print(header)
        for doc in fetch():
            print(doc[:200], "\n")


In [34]:
# Side-by-side previews of dense, BM25 and hybrid results for the sample question.
compare_retrievals("How is the sub-function byte value calculated?", k=5)



=== Dense ===
ISO 14229-1:2013(E)
Table 12 - Request message sub-function parameter definition
<table><tr><td>Bits 6 – 0</td><td>Description</td><td>Cvt</td><td>Mnemonic</td></tr><tr><td>xx</td><td>sub-function#1
d 

ISO 14229-1:2013(E)
7.3.2 SI, Service Identifier
Type: 1 byte unsigned integer value
Range: \(0 \times 00 - 0 \times 00\) according to definitions in Table 2.
Table 2 — Service identifier values
<tabl 

ISO 14229-1:2013(E)
NOTE The addressing information is shown in the table above for definition purpose. Further service request/indication definitions only specify the A_Data A_PDU parameter, because  

ISO 14229-1:2013(E)
Table 5 — Functionally addressed request message with sub-function parameter and server response behaviour
<table><tr><td rowspan="2">#</td><td colspan="2">Client request message</ 

sionMode parameter definitions 351
C.5 Coding of UDS version number 352
Annex D (normative) Stored data transmission functional unit data-parameter definitions 353
D.1 group