In [1]:
import json
import os
from typing import List, Dict
from tqdm import tqdm

import chromadb
from chromadb.config import Settings

from sentence_transformers import SentenceTransformer

KeyboardInterrupt: 

In [None]:

with open('image_extracted_gpu.json', 'r', encoding = 'utf-8') as json_file:
    pages = json.load(json_file)

len(pages)


In [3]:
table_data = []
for i in range(0,len(pages)):
    for j in pages[i]['blocks']:
        if j['type'] == 'table':
            table_data.append(pages[i])

In [4]:
table = []
for i in range(0,len(table_data)):
    for j in table_data[i]['blocks']:
        if j['type'] == 'table':
            table.append(j['content'])

In [5]:
len(table)

1233

In [6]:
import pandas as pd
from io import StringIO

html_string = table[7]

# Use StringIO to treat the string as a file
dfs = pd.read_html(StringIO(html_string))
df = dfs[0]


In [78]:
def page_to_text(page: Dict) -> str:
    return "\n".join(b["content"] for b in page["blocks"] if b.get("content"))

def flatten_json_to_text(pages: List[Dict]) -> str:
    return "\n".join(page_to_text(p) for p in pages)

def chunk_text(text, chunk_size=800, overlap=100):
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        start = end - overlap
    return chunks

def is_table_page(page: Dict) -> bool:
    return any(b["type"] == "table" for b in page["blocks"])

def semantic_chunk(text, max_chars=600):
    paragraphs = text.split("\n\n")
    chunks, current = [], ""

    for p in paragraphs:
        if len(current) + len(p) <= max_chars:
            current += p + "\n\n"
        else:
            chunks.append(current.strip())
            current = p + "\n\n"

    if current:
        chunks.append(current.strip())

    return chunks


In [8]:
embedder = SentenceTransformer("./all-MiniLM-L6-v2")

def embed(texts):
    return embedder.encode(texts, show_progress_bar=False)

In [9]:
chroma_client = chromadb.Client(
    Settings(
        persist_directory="./chroma_rnd",
        anonymized_telemetry=False
    )
)


In [10]:
from sentence_transformers import SentenceTransformer
from chromadb.api.types import EmbeddingFunction, Documents

class LocalSentenceTransformerEmbedding(EmbeddingFunction):
    def __init__(self, model_path: str):
        self.model = SentenceTransformer(model_path)

    def __call__(self, input: Documents):
        # Chroma expects List[List[float]]
        return self.model.encode(
            input,
            show_progress_bar=False
        ).tolist()


In [11]:
import chromadb
from chromadb.config import Settings

chroma_client = chromadb.Client(
    Settings(
        persist_directory="./chroma_rnd",
        anonymized_telemetry=False
    )
)


In [12]:
embedding_fn = LocalSentenceTransformerEmbedding("./all-MiniLM-L6-v2")

collection_page_wise = chroma_client.create_collection(
    name="page_wise2",
    embedding_function=embedding_fn
)


In [13]:
docs = [page_to_text(p) for p in pages]

collection_page_wise.add(
    documents=docs,
    ids=[f"_page_{i}" for i in range(len(docs))]
)


In [14]:
results_page_wise = collection_page_wise.query(
    query_texts=["value of cvt in negative response for A_PDU parameter TA"],
    n_results=5
)

In [15]:
results_page_wise

{'ids': [['_page_40', '_page_25', '_page_78', '_page_208', '_page_37']],
 'embeddings': None,
 'documents': [['ISO 14229-1:2013(E)\n8.3 Positive response message\n8.3.1 Positive response message definition\nThis subclause defines the A_PDU parameters for the service response / confirmation (see 7.2 for a detailed description of the application layer protocol data unit A_PDU). There might be a separate table for each sub-function parameter $Level when the response messages of the different sub-function parameters $Level differ in the structure of the A_Data parameters.\nNOTE The positive response message of a diagnostic service (if required) shall be sent after the execution of the diagnostic service. In case a diagnostic service requires a different handling (e.g. ECUReset service) then the appropriate description of when to send the positive response message can be found in the service description of the diagnostic service.\nTable 16 defines the positive response A_PDU.\nTable 16 - Po

In [16]:
collection_page_type_split = chroma_client.create_collection(
    name="page_type_split",
    embedding_function=embedding_fn
)

docs, ids = [], []

for i, page in enumerate(pages):
    text = page_to_text(page)
    if is_table_page(page):
        docs.append(text)
        ids.append(f"table_page_{i}")
    else:
        chunks = chunk_text(text, 600)
        for j, c in enumerate(chunks):
            docs.append(c)
            ids.append(f"text_{i}_{j}")

collection_page_type_split.add(
    documents=docs,
    embeddings=embed(docs),
    ids=ids
)


In [17]:
results_page_type_split = collection_page_type_split.query(
    query_texts=["value of cvt in negative response for a_PDU parameter TA"],
    n_results=5
)

In [18]:
results_page_type_split

{'ids': [['table_page_40',
   'table_page_25',
   'text_30_0',
   'table_page_78',
   'table_page_208']],
 'embeddings': None,
 'documents': [['ISO 14229-1:2013(E)\n8.3 Positive response message\n8.3.1 Positive response message definition\nThis subclause defines the A_PDU parameters for the service response / confirmation (see 7.2 for a detailed description of the application layer protocol data unit A_PDU). There might be a separate table for each sub-function parameter $Level when the response messages of the different sub-function parameters $Level differ in the structure of the A_Data parameters.\nNOTE The positive response message of a diagnostic service (if required) shall be sent after the execution of the diagnostic service. In case a diagnostic service requires a different handling (e.g. ECUReset service) then the appropriate description of when to send the positive response message can be found in the service description of the diagnostic service.\nTable 16 defines the positi

In [22]:

embedding_fn = LocalSentenceTransformerEmbedding("./all-mpnet-base-v2")

collection_page_wise_mpnet = chroma_client.create_collection(
    name="page_wise_mpnet1",
    embedding_function=embedding_fn
)


In [23]:
docs = [page_to_text(p) for p in pages]

collection_page_wise_mpnet.add(
    documents=docs,
    ids=[f"_page_{i}" for i in range(len(docs))]
)


In [24]:
results_page_wise_mpnet = collection_page_wise_mpnet.query(
    query_texts=["value of cvt in negative response for A_PDU parameter TA"],
    n_results=5
)

In [25]:
results_page_wise_mpnet

{'ids': [['_page_40', '_page_25', '_page_339', '_page_41', '_page_332']],
 'embeddings': None,
 'documents': [['ISO 14229-1:2013(E)\n8.3 Positive response message\n8.3.1 Positive response message definition\nThis subclause defines the A_PDU parameters for the service response / confirmation (see 7.2 for a detailed description of the application layer protocol data unit A_PDU). There might be a separate table for each sub-function parameter $Level when the response messages of the different sub-function parameters $Level differ in the structure of the A_Data parameters.\nNOTE The positive response message of a diagnostic service (if required) shall be sent after the execution of the diagnostic service. In case a diagnostic service requires a different handling (e.g. ECUReset service) then the appropriate description of when to send the positive response message can be found in the service description of the diagnostic service.\nTable 16 defines the positive response A_PDU.\nTable 16 - P

In [26]:

embedding_fn = LocalSentenceTransformerEmbedding("./bge-large-en")

collection_page_wise_bge = chroma_client.create_collection(
    name="page_wise_bge",
    embedding_function=embedding_fn
)


In [27]:
docs = [page_to_text(p) for p in pages]

collection_page_wise_bge.add(
    documents=docs,
    ids=[f"_page_{i}" for i in range(len(docs))]
)


In [121]:
results_page_wise_bge = collection_page_wise_bge.query(
    query_texts=["Request A_PDU parameter Mtype CVT VYTE value"],
    n_results=5
)

In [122]:
results_page_wise_bge

{'ids': [['_page_37', '_page_330', '_page_62', '_page_66', '_page_329']],
 'embeddings': None,
 'documents': [['ISO 14229-1:2013(E)\n8.2 Request message\n8.2.1 Request message definition\nThis subclause includes one or multiple tables, which define the A_PDU (Application layer protocol data unit, see 7) parameters for the service request/indication. There might be a separate table for each sub-function parameter (\\$Level) in case the request messages of the different sub-function parameters (\\$Level) differ in the structure of the A_Data parameters and cannot be specified clearly in single table.\nTable 9 defines the request A_PDU definition with sub-function\nTable 9 - Request A_PDU definition with sub-function\n<table><tr><td>A_PDU parameter</td><td>Parameter Name</td><td>Cvt</td><td>Byte Value</td><td>Mnemonic</td></tr><tr><td>MType</td><td>Message Type</td><td>M</td><td>xx</td><td>MT</td></tr><tr><td>SA</td><td>Source Address</td><td>M</td><td>xxxx</td><td>SA</td></tr><tr><td>TA<

In [82]:
embedding_fn = LocalSentenceTransformerEmbedding("./bge-large-en")

collection_page_type_split_bge = chroma_client.create_collection(
    name="page_type_split_bge3",
    embedding_function=embedding_fn
)

docs, ids = [], []

for i, page in enumerate(pages):
    text = page_to_text(page)
    if is_table_page(page):
        docs.append(text)
        ids.append(f"table_page_{i}")
    else:
        chunks = semantic_chunk(text, 600)
        for j, c in enumerate(chunks):
            docs.append(c)
            ids.append(f"text_{i}_{j}")

collection_page_type_split_bge.add(
    documents=docs,
    ids=ids
)


In [100]:
results_page_type_split_bge = collection_page_type_split_bge.query(
    query_texts=["value of cvt in negative response for a_PDU parameter TA"],
    n_results=5
)

In [101]:
results_page_type_split_bge

{'ids': [['table_page_25',
   'table_page_40',
   'table_page_78',
   'table_page_70',
   'table_page_41']],
 'embeddings': None,
 'documents': [['ISO 14229-1:2013(E)\n7.4 Negative response/confirmation service primitive\nEach diagnostic service has a negative response/negative confirmation message specified with message A_Data bytes according to Table 3. The first A_Data byte (A_PCI.NR_SI) is always the specific negative response service identifier. The second A_Data byte (A_PCI.SI) shall be a copy of the service identifier value from the service request/indication message that the negative response message corresponds to.\nTable 3 — Negative response A_PDU\n<table><tr><td>A_PDU parameter</td><td>Parameter Name</td><td>Cvt</td><td>Byte value</td><td>Mnemonic</td></tr><tr><td>SA</td><td>Source Address</td><td>M</td><td>0xXXXX</td><td>SA</td></tr><tr><td>TA</td><td>Target Address</td><td>M</td><td>0xXXXX</td><td>TA</td></tr><tr><td>TAtype</td><td>Target Address type</td><td>M</td><td>0x

In [96]:
results_page_type_split_bge = collection_page_type_split_bge.query(
    query_texts=["diagnostic session control byte value for diagnostic session controlSID"],
    n_results=5
)

In [97]:
results_page_type_split_bge

{'ids': [['table_page_43',
   'table_page_25',
   'table_page_327',
   'table_page_45',
   'table_page_46']],
 'embeddings': None,
 'documents': [['ISO 14229-1:2013(E)\nTable 22 — Diagnostic and Communication Management functional unit\n<table><tr><td>Service</td><td>Description</td></tr><tr><td>DiagnosticSessionControl</td><td>The client requests to control a diagnostic session with a server(s).</td></tr><tr><td>ECUReset</td><td>The client forces the server(s) to perform a reset.</td></tr><tr><td>SecurityAccess</td><td>The client requests to unlock a secured server(s).</td></tr><tr><td>CommunicationControl</td><td>The client controls the setting of communication parameters in the server (e.g., communication baudrate).</td></tr><tr><td>TesterPresent</td><td>The client indicates to the server(s) that it is still present.</td></tr><tr><td>AccessTimingParameter</td><td>The client uses this service to read/modify the timing parameters for an active communication.</td></tr><tr><td>SecuredDa

In [85]:
def recall_at_k(retrieved_ids, relevant_ids, k):
    return len(set(retrieved_ids[:k]) & set(relevant_ids)) / len(relevant_ids)


In [86]:
def mrr(retrieved_ids, relevant_ids):
    for i, doc_id in enumerate(retrieved_ids):
        if doc_id in relevant_ids:
            return 1 / (i + 1)
    return 0


In [87]:
ground_truth = {
    "value of cvt in negative response for A_PDU parameter TA": ["_page_25"]
}


In [88]:
query = "value of cvt in negative response for A_PDU parameter TA"

retrieved_ids = results_page_type_split_bge["ids"][0]   # list of page IDs
documents = results_page_type_split_bge["documents"][0]
distances = results_page_type_split_bge["distances"][0]


In [89]:
retrieved_ids

['table_page_25',
 'table_page_40',
 'table_page_78',
 'table_page_70',
 'table_page_41']

In [90]:
relevant_ids = ground_truth[query]

print("Recall@5:", recall_at_k(retrieved_ids, relevant_ids, 5))


Recall@5: 0.0


In [91]:
def precision_at_k(retrieved_ids, relevant_ids, k):
    return len(set(retrieved_ids[:k]) & set(relevant_ids)) / k


In [92]:
relevant_ids = ground_truth[query]

print("Precision@5:", precision_at_k(retrieved_ids, relevant_ids, 5))


Precision@5: 0.0


In [93]:
print("MRR:", mrr(retrieved_ids, relevant_ids))


MRR: 0


In [94]:
for i, (doc, dist) in enumerate(zip(documents, distances)):
    print(f"\nRank {i+1} | Distance: {dist}")
    print(doc[:500])



Rank 1 | Distance: 0.3312804698944092
ISO 14229-1:2013(E)
7.4 Negative response/confirmation service primitive
Each diagnostic service has a negative response/negative confirmation message specified with message A_Data bytes according to Table 3. The first A_Data byte (A_PCI.NR_SI) is always the specific negative response service identifier. The second A_Data byte (A_PCI.SI) shall be a copy of the service identifier value from the service request/indication message that the negative response message corresponds to.
Table 3 — Negativ

Rank 2 | Distance: 0.3412989377975464
ISO 14229-1:2013(E)
8.3 Positive response message
8.3.1 Positive response message definition
This subclause defines the A_PDU parameters for the service response / confirmation (see 7.2 for a detailed description of the application layer protocol data unit A_PDU). There might be a separate table for each sub-function parameter $Level when the response messages of the different sub-function parameters $Level differ in 