In [136]:
import sys
import os
import re

# Append the parent directory to sys.path using os.getcwd() instead of __file__
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "../")))

import tiktoken
from pathlib import Path
from typing import List, Dict
import json
from IPython.display import display, Markdown
from collections import deque

from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.embeddings import OllamaEmbeddings

import ollama

from src.utils.file_utils import *

In [None]:
def ask_LLM_with_JSON(prompt, ollama_client, model, model_options):

    messages = []
    messages.append({"role": "system", "content": "You are a helpful assistant, who helps the user with their query. You are designed to output JSON."})     
    messages.append({"role": "user", "content": prompt})   

    response = ollama_client.chat(
        model=model,
        messages=messages,
        options=model_options
    )

    return response["message"]["content"]

def recover_json(json_str, verbose = False):
    decoded_object = {}

    if '{' not in json_str:
        return json_str

    json_str = extract_json(json_str)

    try:
        decoded_object = json.loads(json_str)
        return decoded_object
    except Exception:
        try:
            decoded_object = json.loads(json_str.replace("'", '"'))
            return decoded_object
        except Exception:
            try:
                decoded_object = json_repair.loads(json_str.replace("'", '"'))

                for k, d in decoded_object.items():
                    dd = d.replace("'", '"')
                    decoded_object[k] = json.loads(dd)
                
                return decoded_object
            except:
                print(f"all json recovery operations have failed for {json_str}")
        
            if verbose:
                if isinstance(decoded_object, dict):
                    print(f"\n{bc.OKBLUE}>>> Recovering JSON:\n{bc.OKGREEN}{json.dumps(decoded_object, indent=3)}{bc.ENDC}")
                else:
                    print(f"\n{bc.OKBLUE}>>> Recovering JSON:\n{bc.OKGREEN}{json_str}{bc.ENDC}")


    return json_str

def extract_json(s):
    code = re.search(r"```json(.*?)```", s, re.DOTALL)
    if code:
        return code.group(1)
    else:
        return s
    
def chunk_markdown_table_with_overlap(md_table, cols=None, n_tokens=512, overlap=128):
    """
    Splits a markdown table into chunks with overlapping tokens.
    Each returned chunk already has the header prepended.

    Parameters:
        - md_table (str): The markdown table as a string.
        - cols (list of str], optional): List of column headers. If provided, overrides the header in md_table.
        - n_tokens (int): Maximum number of tokens per chunk (including header tokens).
        - overlap (int): Number of tokens to overlap between adjacent chunks (data‐rows only).

    Returns:
        - chunks (list of str): Each element is a string consisting of
                                (header + data‐rows with overlaps).
        - header (str): The header block (header row + separator row).
    """
    # Split into non‐empty, stripped lines
    mds = [line.rstrip() for line in md_table.strip().split("\n") if line.strip()]
    if not mds:
        return [], ""

    # Build header based on cols override or the first two lines of md_table
    if cols is not None:
        # Use the provided cols to generate a new header
        header_row = "| " + " | ".join(cols) + " |"
        separator = "| " + " | ".join(["---"] * len(cols)) + " |"
        header = header_row + "\n" + separator + "\n"
        # Skip the first two lines of mds (the original header + separator)
        data_start_idx = 2
    else:
        # Use the existing first line as header; check if second line is the separator
        header = mds[0] + "\n"
        if len(mds) > 1 and all(ch in "-:|" for ch in mds[1].replace(" ", "")):
            header += mds[1] + "\n"
            data_start_idx = 2
        else:
            data_start_idx = 1

    # Count how many tokens the header takes (so each chunk can include that cost)
    header_token_count = get_token_count(header)

    chunks = []                  # Will hold final strings: (header + data‐rows)
    current_chunk_data = []      # Collects just the data‐rows (each ending in "\n")
    current_token_count = header_token_count

    # Iterate over each data row from the markdown table
    for raw_row in mds[data_start_idx:]:
        row = raw_row.rstrip()
        if not row.startswith("|"):
            # Skip lines that aren't table rows
            continue
        row_with_nl = row + "\n"
        row_tokens = get_token_count(row_with_nl)

        # If adding this row would exceed n_tokens, finalize the current chunk
        if current_token_count + row_tokens > n_tokens:
            # Prepend the header to the collected rows and append to chunks
            chunk_text = header + "".join(current_chunk_data)
            chunks.append(chunk_text)

            # Build overlap buffer by walking backwards over the data rows
            overlap_buffer = []
            overlap_tokens = 0
            for prev_row_with_nl in reversed(current_chunk_data):
                prev_tokens = get_token_count(prev_row_with_nl)
                if overlap_tokens + prev_tokens > overlap:
                    break
                overlap_buffer.insert(0, prev_row_with_nl)
                overlap_tokens += prev_tokens

            # Start a new chunk with just the overlap rows
            current_chunk_data = list(overlap_buffer)
            current_token_count = header_token_count + overlap_tokens

        # Add this row to the (new or continuing) chunk
        current_chunk_data.append(row_with_nl)
        current_token_count += row_tokens

    # If any rows remain, append them as the final chunk
    if current_chunk_data:
        chunk_text = header + "".join(current_chunk_data)
        chunks.append(chunk_text)

    return chunks, header

# def chunk_markdown_tables(
#     tables_dir: str,
#     tables_summaries_dir: str,
#     prompt: str,
#     cols: List[List[str]],
#     n_tokens: int,
#     overlap: int,
#     ollama_client: ollama_client,
#     model: str,
#     model_options: Dict,
#     tables_chunks_dir: str
# ) -> None:
#     """
#     Opens markdown tables and split them into chunnks, then, stored the resulting tables chunks as well as summaries in dedicated folders.

#     Args:
#         - tables_dir (str): The path to the directory where to find the tables.
#         - tables_summaries_dir (str): The path to the directory where to find the tables.
#         - prompt (str): The prompt to use for the model.
#         - cols (List(List(str))): A list of lists of columns corresponding  to the tables respectively.
#         - n_token (int): The chunk size.
#         - overlap (int): Number of tokens to overlap between adjacent chunks (data‐rows only).
#         - model (str): The name of the model to use for chunking.
#         - ollama_client (Ollama Client): The Ollama client to use for querying the model.
#         - model_options (Dict): A dictionary containaing the options (like the temperature) to run the model.
#         - tables_chunks_dir (str): The path to the directory where to stored the generated tables chunks.

#     returns:
#         None.
#     """
#     tables_path = Path(tables_dir)
#     tables_summaries_path = Path(tables_summaries_dir)
#     tables_chunks_path = Path(tables_chunks_dir)

#     # tables_path.mkdir(parents=True, exist_ok=True)
#     tables_summaries_path.mkdir(parents=True, exist_ok=True)
#     tables_chunks_path.mkdir(parents=True, exist_ok=True)

#     if not tables_path.is_dir():
#         raise ValueError(f"Folder '{tables_path}' does not exist.")

#     # if not tables_path.is_dir() or not any(tables_path.iterdir()):
#     #     raise ValueError(f"Folder '{tables_path}' does not exist or is empty.")

#     print(f"\nProcessing the tables in {tables_path}...\n")

#     if not cols:
#         for table_path in tables_path.rglob("*.md"):
#             if table_path.is_file():
#                 print(f"\nReading file: {table_path}")
#                 with open(table_path, "r", encoding="utf-8") as f:
#                     md_table = f.read()
#                     chunks, _, summary = chunk_markdown_table(
#                         prompt, md_table, cols, n_tokens, overlap, ollama_client, model, model_options
#                     )

#                     sumary_file_name = (
#                         tables_summaries_path / f"{table_path.stem}_summary.txt"
#                     )
#                     with open(sumary_file_name, "w", encoding="utf-8") as file:
#                         file.write(summary)
#                     print(f"Saved table summary to: {sumary_file_name}")

#                     for idx, chunk in enumerate(chunks, 1):
#                         chunk_with_summary = f"{summary}\n\n{chunk}"
#                         table_name = (
#                             tables_chunks_path / f"{table_path.stem}_chunk_{idx}.md"
#                         )
#                         with open(table_name, "w", encoding="utf-8") as file:
#                             file.write(chunk_with_summary)
#                         print(f"Saved table chunk to: {table_name}")

#     elif len(cols) == len(list(tables_path.rglob("*.md"))):
#         for idx, table_path in enumerate(tables_path.rglob("*.md")):
#             if table_path.is_file():
#                 print(f"\nReading file: {table_path}")
#                 with open(table_path, "r", encoding="utf-8") as f:
#                     md_table = f.read()
#                     chunks, _, summary = chunk_markdown_table(
#                         prompt, md_table, cols[idx], n_tokens, overlap, ollama_client, model, model_options
#                     )
#                     sumary_file_name = (
#                         tables_summaries_path / f"{table_path.stem}_summary.txt"
#                     )
#                     with open(sumary_file_name, "w", encoding="utf-8") as file:
#                         file.write(summary)
#                     print(f"Saved table summary to: {sumary_file_name}")

#                     for idxx, chunk in enumerate(chunks, 1):
#                         table_name = (
#                             tables_chunks_path / f"{table_path.stem}_chunk_{idxx}.md"
#                         )
#                         with open(table_name, "w", encoding="utf-8") as file:
#                             file.write(chunk)
#                         print(f"Saved table to: {table_name}")

#     else:
#         print("Please provide a list of columns for each table.")

#     print(
#         f"\nAll tables have been processed.\nTables chunks were saved in Markdown to: {tables_dir}.\nTable summaries were saved in TXT to: {tables_summaries_dir}.\n"
#     )

def chunk_markdown_table(prompt, md_table, cols, n_tokens, overlap, ollama_client, model, model_options):
    prompt = prompt.format(
        table=md_table.split("\n")
    )
    output = ask_LLM_with_JSON(prompt, ollama_client, model, model_options)
    try:
        outd = recover_json(output)
        cols = outd["columns"].split(",")
        summary = outd["summary_of_the_table"]
    except:
        logc(f"Could not recover with malformed JSON {output}")
        return [], "", ""

    chunks, header = chunk_markdown_table_with_overlap(
        md_table, cols, n_tokens=n_tokens, overlap=overlap
    )
    print("Chunks:", len(chunks))

    return chunks, header, summary

def get_token_count(input_text: str) -> int:
    """Returns the number of tokens for the input text."""
    encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(input_text))

def hard_split(text, max_chunk_size):
    """
    Fallback simple splitter that breaks a text into chunks
    of at most max_chunk_size tokens, using whitespace as delimiter.
    """
    words = text.split()
    chunks = []
    current = ""

    for word in words:
        if not current:
            current = word
        else:
            candidate = f"{current} {word}"
            # If adding this word would exceed max_chunk_size, finalize current chunk
            if get_token_count(candidate) > max_chunk_size:
                chunks.append(current)
                current = word
            else:
                current = candidate

    # Add the last chunk if nonempty
    if current:
        chunks.append(current)

    return chunks

def process_semantic_split(texts, semantic_splitter, max_chunk_size):
    """Splits a list of texts semantically into chunks within the token limit."""
    # Initialize the queue and result list
    queue = deque(texts)
    result = []
    step = 0

    while queue:
        step += 1
        message = f"\nProcessing step {step} with {len(queue)} segments in the queue."
        print(message)
        print("=" * len(message))
        # Pop the next segment from the queue
        segment = queue.popleft()
        print(f"\nCurrent segment: {segment[:10]}... (length: {get_token_count(segment)} tokens)")

        # Skip empty or whitespace-only segments
        if not segment.strip():
            print("\nSkipping empty segment.")
            continue

        # Compute token count once per segment
        token_count = get_token_count(segment)
        if token_count == 0:
            print("\nSkipping segment with zero token count.")
            continue

        # If the segment exceeds max size, attempt semantic splitting
        if token_count > max_chunk_size:
            print(f"\nSegment exceeds max chunk size of {max_chunk_size} tokens (has {token_count}).")
            sub_segments = semantic_splitter.split_text(segment)

            # If no valid sub-segments were created, fallback to hard split
            if not sub_segments or sub_segments == [segment]:
                print("\nNo semantic chunks created; using hard split fallback.")
                fallback_chunks = hard_split(segment, max_chunk_size)
                for fc in fallback_chunks:
                    print(f"  - Adding fallback chunk (tokens: {get_token_count(fc)}): {fc[:30]}...")
                    result.append(fc)
                continue

            # Reinsert sub-segments at the front of the queue in reverse order
            for sub in reversed(sub_segments):
                queue.appendleft(sub)
            continue

        # If the segment is within the token limit, add it to the result
        print(f"\nSegment is within max chunk size of {max_chunk_size} tokens (has {token_count}).")
        result.append(segment)

    return result

# def semantic_chunk_text_file(
#     file_path: Path,
#     embed_model_info: Dict,
#     max_chunk_size: int,
#     buffer_size: int,
#     breakpoint_threshold_type: str,
#     breakpoint_threshold_amount: float,
#     sentence_split_regex: str,
#     verbose: bool,
# ):
#     """Processes a single text file and returns semantic chunks."""
#     # Initialize embeddings and splitters
#     embedding_model = AzureOpenAIEmbeddings(
#         azure_deployment=embed_model_info["AZURE_OPENAI_EMBEDDING_MODEL"],
#         azure_endpoint=f"https://{embed_model_info['AZURE_OPENAI_EMBEDDING_MODEL_RESOURCE']}.openai.azure.com",
#         openai_api_key=embed_model_info["AZURE_OPENAI_EMBEDDING_MODEL_RESOURCE_KEY"],
#         openai_api_version=embed_model_info["AZURE_OPENAI_EMBEDDING_MODEL_API_VERSION"],
#     )

#     semantic_splitter = SemanticChunker(
#         embedding_model,
#         buffer_size=buffer_size,
#         breakpoint_threshold_type=breakpoint_threshold_type,
#         breakpoint_threshold_amount=breakpoint_threshold_amount,
#         sentence_split_regex=sentence_split_regex,
#     )

#     # headers_to_split_on = [
#     #     ("#", "header_1"),
#     #     ("##", "header_2"),
#     #     ("###", "header_3"),
#     # ]
#     # markdown_splitter = MarkdownHeaderTextSplitter(
#     #     headers_to_split_on=headers_to_split_on,
#     #     strip_headers=False
#     # )

#     with open(file_path, "r", encoding="utf-8") as f:
#         input_text = f.read()

#     if verbose:
#         print(f"\nReading file: {file_path}")

#     # Markdown split
#     # md_header_splits = markdown_splitter.split_text(input_text)
#     # plain_content = [value.page_content for value in md_header_splits]

#     # Semantic split
#     semantic_chunks = process_semantic_split(
#         [input_text], semantic_splitter, max_chunk_size
#     )
#     # semantic_chunks = process_semantic_split(plain_content, semantic_splitter, max_chunk_size)

#     return semantic_chunks

### Chunk Table

In [3]:
DOCS_PATH = "../data/processed"

In [4]:
docs = [os.path.join(DOCS_PATH, d) for d in os.listdir(DOCS_PATH)]
tables = [d for d in docs if d.endswith(".md") and "_Table_" in d]

In [5]:
tables

['../data/processed/46- Unit of Activity (UoA)_page_1_Table_2.md',
 '../data/processed/46- Unit of Activity (UoA)_page_2_Table_1.md',
 '../data/processed/46- Unit of Activity (UoA)_page_1_Table_1.md']

In [6]:
with open(tables[1], "r", encoding="utf-8") as f:
    md_table = f.read()

In [7]:
print(md_table)

| HOB/Business name   | Pacesetter   | CHJV                            | PetroAlliance         | Slider                   | Radius Motors            | Radius DTSS               | PAWC-Cementing                   |
|---------------------|--------------|---------------------------------|-----------------------|--------------------------|--------------------------|---------------------------|----------------------------------|
| UoA Business Line   | WCM          | WCM                             | WCM                   | WCD                      | WCD                      | WCF                       | UoA Sub-Business Line            |
| PCSB                | DMHV         | DMHV                            | SLID                  | MOT                      | DTSS                     | WIT                       | UoA Geounit                      |
| CAL                 | CHG          | RUL                             | USL                   | RUL                      | RUL                 

In [8]:
with open("../src/prompts/markdown_extract_header_and_summarize_prompt.md", "r", encoding="utf-8") as f:
    prompt = f.read()
print(prompt)

You are a Data Engineer resonsible for reforming and preserving the quality of Markdown tables. A table will be passed to you in the form of a Markdown string. You are designed to output JSON. 

Your task is to extract the column names of the header of the table from the Markdown string in the form of a comma-separated list. If the column names do exist, please return them verbatim word-for-word with no change, except fixing format or alignment issues (extra spaces and new lines can be removed). 

If the table does not have a header, then please check the data rows and generate column names for the header that fit the data types of the columns and the nature of the data. 

**VERY IMPORTANT**: If the table has an unnamed index column, typically the leftmost column, you **MUST** generate a column name for it.

Finally, please generate a brief semantic summary of the table in English. This is not about the technical characteristics of the table. The summary should summarize the business p

In [9]:
prompt= prompt.format(
    table=md_table.split("\n")[:100]
)
print(prompt)

You are a Data Engineer resonsible for reforming and preserving the quality of Markdown tables. A table will be passed to you in the form of a Markdown string. You are designed to output JSON. 

Your task is to extract the column names of the header of the table from the Markdown string in the form of a comma-separated list. If the column names do exist, please return them verbatim word-for-word with no change, except fixing format or alignment issues (extra spaces and new lines can be removed). 

If the table does not have a header, then please check the data rows and generate column names for the header that fit the data types of the columns and the nature of the data. 

**VERY IMPORTANT**: If the table has an unnamed index column, typically the leftmost column, you **MUST** generate a column name for it.

Finally, please generate a brief semantic summary of the table in English. This is not about the technical characteristics of the table. The summary should summarize the business p

In [23]:
ollama_client = ollama.Client()
model = "phi4"
model_options = {
        "temperature": 0.2,
        "top_p": .95
}

json_output = ask_LLM_with_JSON(prompt, ollama_client, model, model_options)
json_output = recover_json(json_output)

In [25]:
type(json_output)

dict

In [26]:
json_output

{'columns': 'HOB/Business name, Pacesetter, CHJV, PetroAlliance, Slider, Radius Motors, Radius DTSS, PAWC-Cementing',
 'columns_inferred': False,
 'total_number_of_columns': 8,
 'summary_of_the_table': "The table provides a structured overview of various business units and their associated operational metrics or roles within an organization. It lists different business names alongside specific performance indicators or managerial responsibilities, such as 'WCM', 'DMHV', and 'CHG'. The columns represent distinct categories like 'Pacesetter', 'CHJV', and others, which likely correspond to key performance areas or strategic initiatives. Additionally, the table includes information on resource management roles, indicating who is responsible for certain operational aspects within these business units. This setup suggests a focus on tracking performance metrics and managerial oversight across different segments of the organization."}

In [29]:
Markdown(json_output["summary_of_the_table"])

The table provides a structured overview of various business units and their associated operational metrics or roles within an organization. It lists different business names alongside specific performance indicators or managerial responsibilities, such as 'WCM', 'DMHV', and 'CHG'. The columns represent distinct categories like 'Pacesetter', 'CHJV', and others, which likely correspond to key performance areas or strategic initiatives. Additionally, the table includes information on resource management roles, indicating who is responsible for certain operational aspects within these business units. This setup suggests a focus on tracking performance metrics and managerial oversight across different segments of the organization.

In [32]:
cols = json_output["columns"].split(",")
cols

['HOB/Business name',
 ' Pacesetter',
 ' CHJV',
 ' PetroAlliance',
 ' Slider',
 ' Radius Motors',
 ' Radius DTSS',
 ' PAWC-Cementing']

In [83]:
chunks, header = chunk_markdown_table_with_overlap(md_table, cols=None, n_tokens=512, overlap=128)

In [84]:
for idx, chunk in enumerate(chunks, 1):
    print(f"Chunk {idx}:\n{chunk}\n\n")
    display(Markdown(chunk))
    print("\n" + "="*80 + "\n")

Chunk 1:
| HOB/Business name   | Pacesetter   | CHJV                            | PetroAlliance         | Slider                   | Radius Motors            | Radius DTSS               | PAWC-Cementing                   |
|---------------------|--------------|---------------------------------|-----------------------|--------------------------|--------------------------|---------------------------|----------------------------------|
| UoA Business Line   | WCM          | WCM                             | WCM                   | WCD                      | WCD                      | WCF                       | UoA Sub-Business Line            |
| PCSB                | DMHV         | DMHV                            | SLID                  | MOT                      | DTSS                     | WIT                       | UoA Geounit                      |
| CAL                 | CHG          | RUL                             | USL                   | RUL                      | RUL        

| HOB/Business name   | Pacesetter   | CHJV                            | PetroAlliance         | Slider                   | Radius Motors            | Radius DTSS               | PAWC-Cementing                   |
|---------------------|--------------|---------------------------------|-----------------------|--------------------------|--------------------------|---------------------------|----------------------------------|
| UoA Business Line   | WCM          | WCM                             | WCM                   | WCD                      | WCD                      | WCF                       | UoA Sub-Business Line            |
| PCSB                | DMHV         | DMHV                            | SLID                  | MOT                      | DTSS                     | WIT                       | UoA Geounit                      |
| CAL                 | CHG          | RUL                             | USL                   | RUL                      | RUL                      | RUL                       | UoA Location                     |
| PKST                | CHJV         | PAD                             | PSL                   | n/a                      | n/a                      | PAWC                      | Revenue recognized under         |
| PCSB                | DMHV         | DMHV                            | SLID                  | MOT                      | DTSS                     | WIT                       | POC responsible for manual input |
| PCSB Controller     | CHG WEC PSCM | PetroAlliance Resources Manager | USL Resources Manager | Radius Resources Manager | Radius Resources Manager | WIT Resources Coordinator | Controlled                       |






In [89]:
with open("../src/prompts/markdown_extract_header_and_summarize_prompt.md", "r", encoding="utf-8") as f:
    prompt = f.read()
print(prompt)

You are a Data Engineer resonsible for reforming and preserving the quality of Markdown tables. A table will be passed to you in the form of a Markdown string. You are designed to output JSON. 

Your task is to extract the column names of the header of the table from the Markdown string in the form of a comma-separated list. If the column names do exist, please return them verbatim word-for-word with no change, except fixing format or alignment issues (extra spaces and new lines can be removed). 

If the table does not have a header, then please check the data rows and generate column names for the header that fit the data types of the columns and the nature of the data. 

**VERY IMPORTANT**: If the table has an unnamed index column, typically the leftmost column, you **MUST** generate a column name for it.

Finally, please generate a brief semantic summary of the table in English. This is not about the technical characteristics of the table. The summary should summarize the business p

In [90]:
cols=None
n_tokens=512
overlap=128
chunks, header, summary = chunk_markdown_table(prompt, md_table, cols, n_tokens, overlap, ollama_client, model, model_options)

Chunks: 1


In [94]:
print(header)

| HOB/Business name |  Pacesetter |  CHJV |  PetroAlliance |  Slider |  Radius Motors |  Radius DTSS |  PAWC-Cementing |
| --- | --- | --- | --- | --- | --- | --- | --- |



In [91]:
for idx, chunk in enumerate(chunks, 1):
    print(f"Chunk {idx}:\n{chunk}\n\n")
    display(Markdown(chunk))
    print("\n" + "="*80 + "\n")

Chunk 1:
| HOB/Business name |  Pacesetter |  CHJV |  PetroAlliance |  Slider |  Radius Motors |  Radius DTSS |  PAWC-Cementing |
| --- | --- | --- | --- | --- | --- | --- | --- |
| UoA Business Line   | WCM          | WCM                             | WCM                   | WCD                      | WCD                      | WCF                       | UoA Sub-Business Line            |
| PCSB                | DMHV         | DMHV                            | SLID                  | MOT                      | DTSS                     | WIT                       | UoA Geounit                      |
| CAL                 | CHG          | RUL                             | USL                   | RUL                      | RUL                      | RUL                       | UoA Location                     |
| PKST                | CHJV         | PAD                             | PSL                   | n/a                      | n/a                      | PAWC                      |

| HOB/Business name |  Pacesetter |  CHJV |  PetroAlliance |  Slider |  Radius Motors |  Radius DTSS |  PAWC-Cementing |
| --- | --- | --- | --- | --- | --- | --- | --- |
| UoA Business Line   | WCM          | WCM                             | WCM                   | WCD                      | WCD                      | WCF                       | UoA Sub-Business Line            |
| PCSB                | DMHV         | DMHV                            | SLID                  | MOT                      | DTSS                     | WIT                       | UoA Geounit                      |
| CAL                 | CHG          | RUL                             | USL                   | RUL                      | RUL                      | RUL                       | UoA Location                     |
| PKST                | CHJV         | PAD                             | PSL                   | n/a                      | n/a                      | PAWC                      | Revenue recognized under         |
| PCSB                | DMHV         | DMHV                            | SLID                  | MOT                      | DTSS                     | WIT                       | POC responsible for manual input |
| PCSB Controller     | CHG WEC PSCM | PetroAlliance Resources Manager | USL Resources Manager | Radius Resources Manager | Radius Resources Manager | WIT Resources Coordinator | Controlled                       |






In [93]:
Markdown(summary)

The table provides a structured overview of various business lines and their associated operational units within an organization. It details the alignment between different business names (HOB/Business name) and specific operational metrics or roles such as Pacesetter, CHJV, PetroAlliance, Slider, Radius Motors, Radius DTSS, and PAWC-Cementing. Each row represents a unique combination of these elements, indicating how various units like PCSB, CAL, PKST, and others are managed under different operational frameworks (e.g., WCM, DMHV). The table also highlights the roles responsible for manual input or control within these business lines, such as 'POC responsible for manual input' and 'Controlled,' suggesting a focus on accountability and resource management. Overall, this table serves to map out organizational structures and responsibilities across different operational units.

### Chunk Text

In [272]:
texts = [d for d in docs if d.endswith(".md") and "_Text" in d]
print(f"Found {len(texts)} text files.")
print(f"\nTexts:\n")
for text in texts:
    print(text)

Found 2 text files.

Texts:

../data/processed/46- Unit of Activity (UoA)_page_1_Text.md
../data/processed/46- Unit of Activity (UoA)_page_2_Text.md


In [273]:
with open(texts[0], "r", encoding="utf-8") as f:
    text = f.read()

print(text)

## Unit of Activity (UoA)

## OBJECTIVE

## WELL CONSTRUCTION UNIT OF ACTIVITY LEVELS

## SUB-BUSINESS LINE UNIT OF ACTIVITY MATRIX

## Unit of Activity (UoA)

- · WELL CONSTRUCTION UNIT OF ACTIVITY LEVELS
- · SUB-BUSINESS LINE UNIT OF ACTIVITY MATRIX
- · HOW TO FIND UNIT OF ACTIVITY
- · REGIONAL BUSINESS ACTIVITY CAPTURE GUIDELINES (Manual Entry in Sharepoint)

The objective of this page is to describe the rules used to build the Well Construction Unit of Activity (UoA) and, subsequently, the Well Construction Unit of Activity dashboard that can be found under https://biportfolio.data.slb.com/ in the PSD section.

Capturing activity in terms of job count, historical data, and forecast is critical for understanding the performance and trends of the Well Construction Business.  It allows for accurate monitoring of progress and identification of potential issues, as well as the ability to make informed decisions about future operations. The accuracy of forecasting depends on the quality 

In [274]:
# Create an embedding model instance
embedding_model = OllamaEmbeddings(
    model="mxbai-embed-large",      # The Ollama model name
    base_url="http://localhost:11434",  # Default Ollama server address
    # embed_instruction="passage: "   # (Optional) prefix if your model expects an instruction
)

buffer_size = 5
breakpoint_threshold_type = "percentile"
breakpoint_threshold_amount = 0.95
sentence_split_regex =  r"(?<=[.!?]) +" # r"\n\n\n"  # 
max_chunk_size = 64

In [275]:
semantic_splitter = SemanticChunker(
    embedding_model,
    buffer_size=buffer_size,
    breakpoint_threshold_type=breakpoint_threshold_type,
    breakpoint_threshold_amount=breakpoint_threshold_amount,
    sentence_split_regex=sentence_split_regex,
)

In [276]:
semntic_chunks = process_semantic_split([text], semantic_splitter, max_chunk_size)


Processing step 1 with 1 segments in the queue.

Current segment: ## Unit of... (length: 278 tokens)

Segment exceeds max chunk size of 64 tokens (has 278).

Processing step 2 with 1 segments in the queue.

Current segment: ## Unit of... (length: 277 tokens)

Segment exceeds max chunk size of 64 tokens (has 277).

No semantic chunks created; using hard split fallback.
  - Adding fallback chunk (tokens: 64): ## Unit of Activity (UoA) ## O...
  - Adding fallback chunk (tokens: 64): MATRIX - · HOW TO FIND UNIT OF...
  - Adding fallback chunk (tokens: 64): that can be found under https:...
  - Adding fallback chunk (tokens: 64): ability to make informed decis...
  - Adding fallback chunk (tokens: 15): in Well Construction - Divisio...


In [282]:
for idx, chunk in enumerate(semntic_chunks, 1):
    print(f"Semantic Chunk {idx}:\n{chunk}")
    print(f"\nToken count: {get_token_count(chunk)}")
    print("\n" + "="*80 + "\n")

Semantic Chunk 1:
## Unit of Activity (UoA) ## OBJECTIVE ## WELL CONSTRUCTION UNIT OF ACTIVITY LEVELS ## SUB-BUSINESS LINE UNIT OF ACTIVITY MATRIX ## Unit of Activity (UoA) - · WELL CONSTRUCTION UNIT OF ACTIVITY LEVELS - · SUB-BUSINESS LINE UNIT OF ACTIVITY

Token count: 64


Semantic Chunk 2:
MATRIX - · HOW TO FIND UNIT OF ACTIVITY - · REGIONAL BUSINESS ACTIVITY CAPTURE GUIDELINES (Manual Entry in Sharepoint) The objective of this page is to describe the rules used to build the Well Construction Unit of Activity (UoA) and, subsequently, the Well Construction Unit of Activity dashboard

Token count: 64


Semantic Chunk 3:
that can be found under https://biportfolio.data.slb.com/ in the PSD section. Capturing activity in terms of job count, historical data, and forecast is critical for understanding the performance and trends of the Well Construction Business. It allows for accurate monitoring of progress and identification of potential issues, as well as the

Token count: 64


Semantic

### Calculate Text Embeddings

In [1]:
import ollama

response = ollama.embeddings(
    model='mxbai-embed-large',
    prompt='Your document text here'
)

embed_vector = response['embedding']  # list of ~1,024 floats
print(len(embed_vector))  # ➝ 1024 

1024


In [1]:
import ollama

from llama_index.core import (
    StorageContext,
    Settings,
    SimpleKeywordTableIndex
)

In [2]:
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding

In [3]:



from llama_index.vector_stores.qdrant import QdrantVectorStore


from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams
from qdrant_client.http.exceptions import (
    ResponseHandlingException,
    UnexpectedResponse
)

In [9]:
from llama_index.core.indices.base import BaseIndex

In [11]:
from llama_index.embeddings.ollama import OllamaEmbedding

In [18]:
# Embedding Model
ollama_embed = OllamaEmbedding(
    model_name="mxbai-embed-large",
    base_url="http://localhost:11434" ,
    ollama_additional_kwargs={},   # e.g. {"mirostat": 0}
    client_kwargs=None,            # optional httpx.Client params
)

In [23]:
test_input = "This is a test input to determine embedding dimensions."
# OllamaEmbedding uses the method 'get_text_embedding' to get the embedding vector
embed_vector = ollama_embed.get_text_embedding(test_input)
embed_dim = len(embed_vector)
print(f"Embedding dimension: {embed_dim}")  # Should print 1024 for mxbai-embed-large

Embedding dimension: 1024


In [None]:
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding

In [54]:
import os

DOCS_PATH = "../data/processed"
docs = [os.path.join(DOCS_PATH, d) for d in os.listdir(DOCS_PATH)]

for doc in docs:
    print(doc)

../data/processed/46- Unit of Activity (UoA)_page_1_Text.md
../data/processed/172- Demand&Supply Visualization - 5.2 Soft Pegging_page_3_Text.md
../data/processed/172- Demand&Supply Visualization - 5.2 Soft Pegging_page_2_Text.md
../data/processed/172- Demand&Supply Visualization - 5.2 Soft Pegging_page_1_Picture_1_description.txt
../data/processed/172- Demand&Supply Visualization - 5.2 Soft Pegging_page_2_Picture_2_description.txt
../data/processed/46- Unit of Activity (UoA)_page_2_Text.md
../data/processed/172- Demand&Supply Visualization - 5.2 Soft Pegging_page_4_Picture_1_description.txt
../data/processed/46- Unit of Activity (UoA)_page_1_Table_2.md
../data/processed/172- Demand&Supply Visualization - 5.2 Soft Pegging_page_3_Picture_1_description.txt
../data/processed/46- Unit of Activity (UoA)_page_2_Table_1.md
../data/processed/172- Demand&Supply Visualization - 5.2 Soft Pegging_page_1_Text.md
../data/processed/46- Unit of Activity (UoA)_page_1_Table_1.md
../data/processed/172- D

In [55]:
with open(
    "../data/processed/172- Demand&Supply Visualization - 5.2 Soft Pegging_page_1_Picture_2_description.txt",
    "r",
    encoding="utf-8"
) as f:
    description = f.read()
print(description)

FileNotFoundError: [Errno 2] No such file or directory: '../data/processed/172- Demand&Supply Visualization - 5.2 Soft Pegging_page_1_Picture_2_description.txt'

In [51]:
import re

def extract_markdown_tables(text: str) -> list[str]:
    """
    Finds all Markdown tables in the text.
    A Markdown table is defined as:
      1) A header row    (a line starting and ending with '|')
      2) A separator row (a line of pipes, dashes, spaces, or colons)
      3) One or more data rows (lines starting and ending with '|')

    The tables are expected to be in the format:

    | Column1 | Column2 |
    |---------|---------|
    | Data1   | Data2   |
    
    If no tables are found, it returns an empty list.

    Returns a list of the raw table strings.
    """
    table_pattern = (
        r'(?:^[ \t]*\|.*\r?\n)'                    # header row
        r'(?:^[ \t]*\|[-\s|:]+\r?\n)'              # separator row
        r'(?:^[ \t]*\|.*\r?\n?)+'                  # one or more data rows
    )
    return re.findall(table_pattern, text, flags=re.MULTILINE)

def remove_markdown_tables(text: str) -> str:
    """
    Removes all Markdown tables (as defined above) from the text.
    """
    table_pattern = (
        r'(?:^[ \t]*\|.*\r?\n)'
        r'(?:^[ \t]*\|[-\s|:]+\r?\n)'
        r'(?:^[ \t]*\|.*\r?\n?)+'
    )
    return re.sub(table_pattern, '', text, flags=re.MULTILINE)

def remove_code_blocks(text: str) -> str:
    """
    Strips out any fenced code block (```...```), regardless of language.
    """
    return re.sub(r'```.*?```', '', text, flags=re.DOTALL)

def remove_mermaid_blocks(text: str) -> str:
    """
    Specifically strips out any ```mermaid ... ``` block.
    """
    return re.sub(r'```mermaid.*?```', '', text, flags=re.DOTALL)

def remove_extracted_text_blocks(text: str) -> str:
    """
    Removes any ```EXTRACTED TEXT ... ``` sections.
    """
    return re.sub(r'```EXTRACTED TEXT.*?```', '', text, flags=re.DOTALL)

def clean_up_text(text: str) -> str:
    """
    Cleans up the input text by:
      1. Removing all fenced code blocks
      2. Removing mermaid diagrams
      3. Removing EXTRACTED TEXT sections
      4. Removing all Markdown tables
    """
    text = remove_code_blocks(text)
    text = remove_mermaid_blocks(text)
    # text = remove_extracted_text_blocks(text)
    text = remove_markdown_tables(text)
    return text

# def extract_markdown_table(s):
#     """
#     Extracts Markdown tables from a given string.
#     This function uses a regular expression to find all Markdown tables in the input string.
#     The tables are expected to be in the format:
#     | Column1 | Column2 |
#     |---------|---------|
#     | Data1   | Data2   |
#     If no tables are found, it returns an empty list.

#     Args:
#         - s (str): The input string that may contain Markdown tables.

#     Returns:
#         - List[str]: A list of strings, each representing a Markdown table found in the input string.
#     """
#     try:
#         table_pattern = r"(\|.*\|\s*\n\|[-| ]+\|\s*\n(\|.*\|\s*\n)+)"
#         tables = re.findall(table_pattern, s, re.MULTILINE)
#     except:
#         tables = []
#         print("Error extracting markdown tables", f"Error extracting markdown tables from text '{s[:50]}'.")  # logc("Error extracting markdown tables", f"Error extracting markdown tables from text '{s[:50]}'.")

#     return tables

# def remove_code(s):
#     return re.sub(r"```python(.*?)```", "", s, flags=re.DOTALL)

# def remove_markdown(s):
#     return re.sub(r"```markdown(.*?)```", "", s, flags=re.DOTALL)

# def remove_mermaid(s):
#     return re.sub(r"```mermaid(.*?)```", "", s, flags=re.DOTALL)

# def remove_extracted_text(s):
#     return re.sub(r"```EXTRACTED TEXT(.*?)```", "", s, flags=re.DOTALL)

# def clean_up_text(text):
#     """
#     Cleans up the input text by removing code blocks, mermaid diagrams, markdown formatting (tables, images, etc.),
#     and any extracted text sections. This is useful for preparing text for further processing
#     or analysis, ensuring that the text is free from unnecessary formatting and code snippets.

#     Args:
#         text (str): The input text to be cleaned up.

#     Returns:
#         str: The cleaned-up text with all specified elements removed.
#     """
#     # Remove code blocks, mermaid diagrams, markdown formatting, and extracted text sections
#     tables = extract_markdown_table(text)
#     if tables:
#         for table in tables:
#             text = text.replace(table[0], "")
#     text = remove_code(text)
#     text = remove_markdown(text)
#     # text = remove_extracted_text(text)
#     # text = remove_mermaid(text)
#     return text

In [52]:
clean_description = clean_up_text(description)
print(clean_description)

The image provided is a screenshot of a pegging report in a business intelligence (BI) dashboard. The report is designed to analyze and manage the relationship between demand and supply within a supply chain management system. The document is structured to provide insights into the availability and allocation of materials and resources.

### 1. Information Conveyed by the Image:
The image outlines a pegging report that links demand and supply. It provides a detailed view of the supply and demand status, including open reservations, stock on hand (SOH), and planned orders. The report is refreshed daily and is used to manage and execute supply chain operations efficiently.

### 2. Description of the Image:
The image is a screenshot of a BI dashboard with a blue header and footer. The main content is divided into sections: a scope description, a data table, and a summary at the bottom. The scope describes the demand and supply elements, while the data table provides detailed information o

In [59]:
from PIL import Image, ImageStat
from pathlib import Path

def is_relevant_image(
        image_path: Path,
        image_pixel_threshold: int,
        image_pixel_variance_threshold: float
):
    """
    Determines if an image is relevant based on non-white pixel count and color variance.

    Parameters:
    - image_path (Path): Path to the image file.
    - pixel_threshold (int): Minimum number of non-white pixels required.
    - variance_threshold (float): Minimum average variance across color channels.

    Returns:
    - bool: True if the image is relevant, False otherwise.
    """
    try:
        with Image.open(image_path) as img:
            # Calculate non-white pixels in grayscale
            grayscale = img.convert("L")
            histogram = grayscale.histogram()
            non_white = sum(histogram[:-1])

            if non_white <= image_pixel_threshold:
                return False

            # Calculate average variance across all color channels
            stat = ImageStat.Stat(img)
            avg_variance = sum(stat.var) / len(stat.var)

            if avg_variance < image_pixel_variance_threshold:
                return False

            return True
    except (IOError, OSError) as e:
        print(f"Error processing {image_path}: {e}")
        return False

In [63]:
import os

DOCS_PATH = "../data/detections"
images = [os.path.join(DOCS_PATH, d) for d in os.listdir(DOCS_PATH)]
images = [img for img in images if "_Picture_" in img]

for img in images:
    print(img)

../data/detections/172- Demand&Supply Visualization - 5.2 Soft Pegging_page_1_Picture_5.jpg
../data/detections/172- Demand&Supply Visualization - 5.2 Soft Pegging_page_2_Picture_1.jpg
../data/detections/172- Demand&Supply Visualization - 5.2 Soft Pegging_page_2_Picture_2.jpg
../data/detections/172- Demand&Supply Visualization - 5.2 Soft Pegging_page_1_Picture_3.jpg
../data/detections/172- Demand&Supply Visualization - 5.2 Soft Pegging_page_1_Picture_2.jpg
../data/detections/172- Demand&Supply Visualization - 5.2 Soft Pegging_page_1_Picture_1.jpg
../data/detections/172- Demand&Supply Visualization - 5.2 Soft Pegging_page_4_Picture_3.jpg
../data/detections/172- Demand&Supply Visualization - 5.2 Soft Pegging_page_4_Picture_2.jpg
../data/detections/172- Demand&Supply Visualization - 5.2 Soft Pegging_page_4_Picture_1.jpg
../data/detections/172- Demand&Supply Visualization - 5.2 Soft Pegging_page_3_Picture_1.jpg
../data/detections/172- Demand&Supply Visualization - 5.2 Soft Pegging_page_3_Pi

In [65]:
IMAGE_PIXEL_THRESHOLD = 10000
IMAGE_PIXEL_VARIANCE_THRESHOLD = 500

for img in images:
    print(
        is_relevant_image(
        img,
        IMAGE_PIXEL_THRESHOLD,
        IMAGE_PIXEL_VARIANCE_THRESHOLD
        )

    )

True
True
True
True
True
True
True
True
True
True
True
