In [1]:
# !pip install chromadb sentence-transformers rank-bm25

In [2]:
import json
import re
from typing import List, Dict
from collections import defaultdict

import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi


In [3]:
JSON_PATH = "mineru_0_to_40.json"

# Read the export as UTF-8 text and parse it into `pages`.
with open(JSON_PATH, "r", encoding="utf-8") as f:
    pages = json.loads(f.read())

# Show how many top-level entries were loaded.
len(pages)


40

In [4]:
def extract_blocks(pages):
    """Yield every block of every page, tagged with the page it belongs to.

    Args:
        pages: iterable of page dicts; each must provide a "page" value and a
            "blocks" iterable of block dicts.

    Yields:
        A shallow copy of each block with a "page" key set to the parent
        page's "page" value (overwriting any "page" key the block already had).

    Raises:
        KeyError: if a page has no "blocks" key, or has blocks but no "page" key.
    """
    for page in pages:
        for block in page["blocks"]:
            # Copy rather than write into the caller's data: the loaded `pages`
            # structure stays untouched no matter how often this runs.
            yield {**block, "page": page["page"]}


In [5]:
# Materialise the generator so the blocks can be counted and iterated again later.
blocks = [block for block in extract_blocks(pages)]
len(blocks)


843

In [6]:
# Raw block type -> unit type used by the later cells. "text", "title" and
# "list" blocks are all treated as plain text; any other block type is dropped.
UNIT_TYPE_BY_BLOCK_TYPE = {
    "text": "text",
    "title": "text",
    "list": "text",
    "table": "table",
    "table_caption": "table_caption",
}

semantic_units = []

for block in blocks:
    unit_type = UNIT_TYPE_BY_BLOCK_TYPE.get(block["type"])
    content = block.get("content")

    # Apply the same content check to every kept type: a block with a missing
    # or empty "content" is skipped instead of raising KeyError here or
    # carrying None into the cells that slice and embed the content.
    if unit_type is None or not content:
        continue

    semantic_units.append({
        "type": unit_type,
        "page": block["page"],
        "content": content,
    })


In [7]:
# Number of units kept from the raw blocks (text, table and table_caption units).
len(semantic_units)


578

In [8]:
# Only this many leading characters of a table's HTML are put into the text
# that gets embedded for similarity search.
TABLE_EMBED_CHARS = 500

tables = []
pending_caption = None

for unit in semantic_units:
    if unit["type"] == "table_caption":
        # Hold the caption until the next table unit appears.
        # NOTE(review): this assumes a caption always precedes its table in
        # block order — confirm against the extractor's output.
        pending_caption = unit["content"]

    elif unit["type"] == "table":
        table_html = unit["content"] or ""
        html_excerpt = table_html[:TABLE_EMBED_CHARS]

        # A table without a caption must not embed the literal text "None",
        # which is what formatting a None caption into the string produced.
        if pending_caption:
            embedding_text = f"{pending_caption}\n{html_excerpt}"
        else:
            embedding_text = html_excerpt

        tables.append({
            "page": unit["page"],
            "caption": pending_caption,
            "table_html": table_html,
            "embedding_text": embedding_text,
        })
        # Each caption is consumed by at most one table.
        pending_caption = None

len(tables)


16

In [9]:
# Sentence-embedding model used for both indexing and querying in this notebook.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
embedder = SentenceTransformer(EMBEDDING_MODEL_NAME)


In [10]:
# NOTE(review): depending on the installed chromadb version, a plain Client(...)
# may keep everything in memory even though persist_directory is set — confirm
# whether ./chroma_phase_b is actually expected to be written.
chroma_client = chromadb.Client(
    Settings(persist_directory="./chroma_phase_b", anonymized_telemetry=False)
)

# get_or_create_collection keeps this cell re-runnable: create_collection
# raises when a collection with the same name already exists, which is what
# happens when the cell is executed a second time against the same store.
text_collection = chroma_client.get_or_create_collection("B_text_index")
table_collection = chroma_client.get_or_create_collection("B_table_index")


In [11]:
# Index only the plain-text units; tables are indexed in their own collection.
text_units = [u for u in semantic_units if u["type"] == "text"]

# Build the document list once so the embedded texts and the stored documents
# are guaranteed to line up.
text_documents = [u["content"] for u in text_units]

text_embeddings = embedder.encode(
    text_documents,
    show_progress_bar=True
)

# upsert overwrites records whose ids already exist, so re-running this cell
# refreshes the index instead of colliding with (or silently keeping) the
# "text_<i>" records written by an earlier run.
text_collection.upsert(
    documents=text_documents,
    embeddings=text_embeddings,
    ids=[f"text_{i}" for i in range(len(text_units))],
    metadatas=[{"page": u["page"]} for u in text_units]
)


Batches:   0%|          | 0/18 [00:00<?, ?it/s]

In [12]:
# Text embedded for each table: its caption plus the leading part of its HTML,
# as prepared in the "embedding_text" field.
table_documents = [t["embedding_text"] for t in tables]

table_embeddings = embedder.encode(
    table_documents,
    show_progress_bar=True
)

# upsert overwrites records whose ids already exist, so re-running this cell
# refreshes the index instead of colliding with (or silently keeping) the
# "table_<i>" records written by an earlier run. The numeric part of each id is
# the table's position in `tables`.
table_collection.upsert(
    documents=table_documents,
    embeddings=table_embeddings,
    ids=[f"table_{i}" for i in range(len(tables))],
    metadatas=[
        {
            "page": t["page"],
            # The original code substitutes "" for a missing caption rather than storing None.
            "caption": t["caption"] if t["caption"] is not None else ""
        }
        for t in tables
    ]
)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [13]:
# "Table <digits>" in any letter case. The leading \b stops words that merely
# end in "table" (e.g. "timetable 3") from being read as table references.
table_ref_pattern = re.compile(r"\bTable\s+(\d+)", re.IGNORECASE)

def find_table_references(text):
    """Return the table numbers referenced in ``text``.

    Args:
        text: string to scan for "Table <number>" mentions (case-insensitive).

    Returns:
        List of the referenced numbers as digit strings, in order of
        appearance; repeated mentions are repeated in the list.
    """
    return table_ref_pattern.findall(text)


In [14]:
def expand_with_related_tables(text_hits, table_hits):
    """Collect the tables that the retrieved text passages refer to by number.

    Args:
        text_hits: iterable of retrieved text passages (strings).
        table_hits: accepted for interface compatibility; this function does
            not read it.

    Returns:
        List of entries from the module-level ``tables`` whose caption names a
        table number mentioned in any text hit. Each table appears at most
        once, in the order it was first referenced.
    """
    related = []
    seen_indexes = set()

    for hit in text_hits:
        for ref in find_table_references(hit):
            # Match the exact number only: a plain substring test would let a
            # reference to "Table 1" also pull in "Table 11", "Table 14", ...
            caption_pattern = re.compile(
                rf"\bTable\s+{re.escape(ref)}(?!\d)", re.IGNORECASE
            )
            for index, table in enumerate(tables):
                if index in seen_indexes:
                    # Already collected via an earlier reference.
                    continue
                caption = table["caption"]
                if caption and caption_pattern.search(caption):
                    seen_indexes.add(index)
                    related.append(table)

    return related


In [15]:
def build_table_payload(table):
    """Turn a table record into the payload handed on with a query result.

    Args:
        table: dict providing "caption", "page" and "table_html".

    Returns:
        Dict with those three fields copied over, plus a fixed "instructions"
        string telling the consumer to explain the table from its rows and
        columns only.
    """
    payload = {field: table[field] for field in ("caption", "page", "table_html")}
    payload["instructions"] = (
        "Explain this table using only the provided rows and columns. "
        "Do not infer or add new fields."
    )
    return payload


In [16]:
def phase_b_query(query, k_text=5, k_table=3):
    """Retrieve text passages and tables relevant to ``query``.

    Args:
        query: natural-language query string; it is embedded as given.
        k_text: number of text passages to request from the text index.
        k_table: number of tables to request from the table index.

    Returns:
        Dict with:
            "text": list of retrieved text passages.
            "tables": list of table payloads (see ``build_table_payload``) —
                first the tables returned by the table index, then the tables
                referenced by number in the retrieved text; each table appears
                at most once.
    """
    q_emb = embedder.encode([query])

    text_res = text_collection.query(
        query_embeddings=q_emb,
        n_results=k_text
    )["documents"][0]

    table_res = table_collection.query(
        query_embeddings=q_emb,
        n_results=k_table
    )

    # Resolve the table hits back to their records. Previously these hits were
    # retrieved but never reached the returned payload. Ids were written as
    # "table_<index into tables>" when the table collection was populated.
    direct_tables = []
    for hit_id in table_res["ids"][0]:
        index_text = hit_id.rpartition("_")[2]
        if index_text.isdigit() and int(index_text) < len(tables):
            direct_tables.append(tables[int(index_text)])

    related_tables = expand_with_related_tables(
        text_res,
        table_res["documents"][0]
    )

    # Merge both sources, keeping the first occurrence of each table record.
    merged_tables = []
    seen_ids = set()
    for table in direct_tables + related_tables:
        if id(table) not in seen_ids:
            seen_ids.add(id(table))
            merged_tables.append(table)

    return {
        "text": text_res,
        "tables": [
            build_table_payload(t) for t in merged_tables
        ]
    }


In [17]:
# Example query whose answer is spread over text passages and the tables they cite.
res = phase_b_query("How is the sub-function byte value calculated?")
res


{'text': ['The sub-function parameter byte is divided into two parts (on bit-level) as defined in Table 11.',
  'Table 14 defines the calculation of the sub-function byte value.',
  'The complete sub-function parameter byte value is calculated based on the value of the suppressPosRspMsgIndicationBit and the sub-function parameter value chosen.',
  'The sub-function parameter value is a 7 bit value (bits 6-0 of the sub-function parameter byte) that can have multiple values to further specify the service behaviour.',
  'Each service contains a table that defines values for the sub-function parameter values, taking only into account the bits 0-6.'],
 'tables': [{'caption': 'Table 11—SubFunction parameter structure',
   'page': '039.png',
   'table_html': '<table><tr><td>Bit position</td><td>Description</td></tr><tr><td>7</td><td>suppressPosRspMsgIndicationBit</td></tr><tr><td></td><td>This bit indicates if a positive response message shall be suppressed by the server.‘0’ = FALSE, do not s

In [22]:
# Short keyword probe; the query string is passed on exactly as written,
# leading spaces included.
res = phase_b_query("  ISO")
res


{'text': ['Web www.iso.org',
  'ISO copyright office',
  '\\(\\odot\\) ISO 2013',
  'ISO (the International Organization for Standardization) is a worldwide federation of national standards bodies (ISO member bodies). The work of preparing International Standards is normally carried out through ISO technical committees. Each member body interested in a subject for which a technical committee has been established has the right to be represented on that committee. International organizations, governmental and non-governmental, in liaison with ISO, also take part in the work. ISO collaborates closely with the International Electrotechnical Commission (IEC) on all matters of electrotechnical standardization.',
  "All rights reserved. Unless otherwise specified, no part of this publication may be reproduced or utilized otherwise in any form or by any means, electronic or mechanical, including photocopying, or posting on the internet or an intranet, without prior written permission. Permissi