In [67]:
# Install required libraries (uncomment as needed)
# !pip install nltk transformers chromadb llama-index llama-index-embeddings-huggingface llama-index-vector-stores-chroma docling groq

In [68]:
import os
import tempfile
import nltk
from transformers import AutoTokenizer
from docling.document_converter import DocumentConverter
import chromadb
from uuid import uuid4
from llama_index.core.schema import Document, NodeWithScore, QueryBundle, TextNode
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.postprocessor.flag_embedding_reranker import FlagEmbeddingReranker

# Download punkt tokenizer for sentence splitting
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')


In [69]:
def chunk_markdown_by_heading(structured_text):
    """
    Chunk markdown text based on H2 headings (##).
    Each chunk starts with an H2 heading and includes all content until the next H2.
    """
    lines = structured_text.strip().split('\n')
    chunks = []
    current_chunk_lines = []
    chunk_number = 1

    for line in lines:
        if line.strip().startswith("##") and not line.strip().startswith("###"):
            if current_chunk_lines:
                content = '\n'.join(current_chunk_lines).strip()
                chunks.append((f"chunk {chunk_number}", content))
                chunk_number += 1
                current_chunk_lines = []
        current_chunk_lines.append(line)

    if current_chunk_lines:
        content = '\n'.join(current_chunk_lines).strip()
        chunks.append((f"chunk {chunk_number}", content))
    return chunks


In [70]:
# print(chunks)

In [71]:
def chunk_text_for_embedding(chunk_texts, model_name="BAAI/bge-base-en-v1.5", max_tokens=512, buffer_tokens=20):
    """
    Further chunk text into smaller pieces suitable for embedding models,
    respecting a maximum token limit and sentence boundaries.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    all_final_chunks = []

    for text in chunk_texts:
        sentences = nltk.sent_tokenize(text)
        current_chunk_sentences = []
        current_tokens = 0

        for sentence in sentences:
            token_count = len(tokenizer.encode(sentence, add_special_tokens=False))

            if current_tokens + token_count <= max_tokens - buffer_tokens:
                current_chunk_sentences.append(sentence)
                # current_tokens += token_count
            else:
                if current_chunk_sentences:
                    all_final_chunks.append(" ".join(current_chunk_sentences))
                current_chunk_sentences = [sentence]
                current_tokens = token_count

        if current_chunk_sentences:
            all_final_chunks.append(" ".join(current_chunk_sentences))
    return all_final_chunks


In [72]:
# def chunk_text_for_embedding(chunk_texts, model_name="BAAI/bge-base-en-v1.5", max_tokens=512, buffer_tokens=20):
#     """
#     Further chunk text into smaller pieces suitable for embedding models,
#     respecting a maximum token limit and sentence boundaries.
#     """
#     tokenizer = AutoTokenizer.from_pretrained(model_name)
#     all_final_chunks = []

#     for text in chunk_texts:
#         sentences = nltk.sent_tokenize(text)
#         current_chunk_sentences = []
#         current_tokens = 0

#         for sentence in sentences:
#             token_count = len(tokenizer.encode(sentence, add_special_tokens=False))

#             if current_tokens + token_count <= max_tokens - buffer_tokens:
#                 current_chunk_sentences.append(sentence)
#                 # current_tokens += token_count
#             else:
#                 if current_chunk_sentences:
#                     all_final_chunks.append(" ".join(current_chunk_sentences))
#                 current_chunk_sentences = [sentence]
#                 current_tokens = token_count

#         if current_chunk_sentences:
#             all_final_chunks.append(" ".join(current_chunk_sentences))
#     return all_final_chunks


from transformers import AutoTokenizer
import nltk

def chunk_text_for_embedding(
    chunk_texts, 
    model_name="BAAI/bge-base-en-v1.5", 
    max_tokens=512, 
    buffer_tokens=20, 
    overlap_tokens=50
):
    """
    Splits text into smaller overlapping chunks suitable for embedding.
    - Respects max token limit.
    - Uses sentence boundaries where possible.
    - Falls back to token-level splitting for very long sentences/paragraphs.
    """

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    all_final_chunks = []

    for text in chunk_texts:
        sentences = nltk.sent_tokenize(text)
        current_chunk_sentences = []
        current_tokens = 0

        for sentence in sentences:
            tokenized = tokenizer.encode(sentence, add_special_tokens=False)
            token_count = len(tokenized)

            # Case 1: Sentence fits in current chunk
            if current_tokens + token_count <= max_tokens - buffer_tokens:
                current_chunk_sentences.append(sentence)
                current_tokens += token_count

            else:
                # Save the current chunk before overflow
                if current_chunk_sentences:
                    all_final_chunks.append(" ".join(current_chunk_sentences))

                # Case 2: Sentence itself is too long → break into token chunks
                if token_count > max_tokens - buffer_tokens:
                    print(f"Warning: Sentence too long, splitting by tokens. Sentence length: {token_count} tokens.")
                    start = 0
                    while start < len(tokenized):
                        end = min(start + max_tokens - buffer_tokens, len(tokenized))
                        chunk_tokens = tokenized[start:end]
                        chunk_text = tokenizer.decode(chunk_tokens)
                        print(f"  Created token chunk of {len(chunk_tokens)} tokens.")
                        all_final_chunks.append(chunk_text)

                        if end == len(tokenized):  # ✅ stop if at the end
                            break
                        start = end - overlap_tokens  # move with overlap

                    current_chunk_sentences = []
                    current_tokens = 0

                # Case 3: Sentence starts new chunk
                else:
                    current_chunk_sentences = [sentence]
                    current_tokens = token_count

        # Save leftover chunk
        if current_chunk_sentences:
            all_final_chunks.append(" ".join(current_chunk_sentences))

    return all_final_chunks



In [73]:

def build_index_from_pdf(pdf_path):
    """
    Converts a PDF to markdown, chunks it, embeds it, and builds a ChromaDB vector index.
    """
    print(f"Building index from PDF: {pdf_path}")
    converter = DocumentConverter()
    result = converter.convert(pdf_path)
    structured_text = result.document.export_to_markdown()

    # Step 1: Chunk by markdown headings
    chunks_with_ids = chunk_markdown_by_heading(structured_text)
    print(f"Initial chunks created: {len(chunks_with_ids)}")
    chunk_texts_only = [text for _, text in chunks_with_ids]

    # Step 2: Further chunk for embedding model token limits
    token_chunks = chunk_text_for_embedding(chunk_texts_only)

    documents = [Document(text=text) for text in token_chunks]

    metadatas = [{"chunk_id": str(i+1), "source_pdf": os.path.basename(pdf_path)} for i in range(len(token_chunks))]
    ids = [str(uuid4()) for _ in token_chunks]

    chroma_client = chromadb.Client()
    vector_store = ChromaVectorStore(chroma_collection=chroma_client.get_or_create_collection("my_docs"))
    embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

    index = VectorStoreIndex.from_documents(
        documents=documents,
        metadatas=metadatas,
        ids=ids,
        embed_model=embed_model,
        vector_store=vector_store)
    print("Index built successfully.")
    return index,token_chunks,chunk_texts_only


In [74]:
def rerank_results(results, query, top_n=5):
    """
    Rerank retrieval results using FlagEmbeddingReranker.
    """
    print(f"Reranking results for query: '{query}'")
    reranker = FlagEmbeddingReranker(
        top_n=top_n,
        model="BAAI/bge-reranker-large",
        use_fp16=False
    )
    nodes_for_reranker = [NodeWithScore(node=TextNode(text=node.get_content())) for node in results]
    query_bundle = QueryBundle(query_str=query)
    ranked_nodes = reranker._postprocess_nodes(nodes_for_reranker, query_bundle)
    print(f"Reranked {len(ranked_nodes)} nodes.")
    return ranked_nodes


In [75]:
# from groq import Groq
from llama_index.llms.groq import Groq

# def llm_refine(ranked_nodes, query, groq_api_key):

#     """
#     Generate an answer using a Groq LLM based on reranked context.
#     """
    
#     print(f"Refining answer with LLM for query: '{query}'")
#     reranked_texts = [node.node.get_content() for node in ranked_nodes]
#     context = "\n\n".join(reranked_texts)
#     llm = Groq(api_key=groq_api_key, model="openai/gpt-oss-120b")
#     prompt = f"""Based on the following context, answer the query comprehensively and concisely.
#     If the context does not contain enough information, state that.\n\nQuery: {query}\n\nContext:\n{context}"""
#     response = llm.complete(prompt)
#     print("LLM refinement complete.")
#     return response.text



def llm_refine(ranked_nodes, query, groq_api_key):
    """
    Refines the answer using an LLM (Groq) based on reranked context.
    """
    print(f"Refining answer with LLM for query: '{query}'")
    reranked_texts = [node.node.get_content() for node in ranked_nodes]
    context = "\n\n".join(reranked_texts)
    print(context)
    # Initialize Groq LLM
    llm = Groq(api_key=groq_api_key, model="openai/gpt-oss-120b") # Using a powerful Groq model

    prompt = f"""Based on the following context, answer the query comprehensively and concisely. 
    If the context does not contain enough information, state that.\n\nQuery: {query}\n\nContext:\n{context}"""
    
    # Use llm.complete for simple text generation
    response = llm.complete(prompt)
    print("LLM refinement complete.")
    print(f"LLM {response.text}")
    return response.text # Access the text attribute of the CompletionResponse



In [76]:
# Example usage (run cells as needed):

# Step 1: User provides PDF file
pdf_path = "Avid Technology Inc - Global Terms and Conditions_EXECUTED_05.18.2_removed_removed.pdf"  # replace with actual filepath

# Step 2: Build the vector index from PDF
index,tk,chunk_txt = build_index_from_pdf(pdf_path)

# # Step 3: Handle user query
# query = "WHAT IS GLOBAL GOURMET"  # replace with your query
# retriever = index.as_retriever(similarity_top_k=10)
# results = retriever.retrieve(query)

# # Step 4: Rerank the results
# ranked_nodes = rerank_results(results, query, top_n=5)

# # Step 5: Use LLM for final answer
# groq_api_key = "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"  # set your actual key
# # answer = llm_refine(ranked_nodes, query, groq_api_key)

# # print("LLM Refined Answer:", answer)


Building index from PDF: Avid Technology Inc - Global Terms and Conditions_EXECUTED_05.18.2_removed_removed.pdf




Initial chunks created: 4
Index built successfully.


In [77]:
print(tk)

['<!-- image -->', '## GLOBAL  TERMS  AND  CONDITIONS\n\n1. Structure. These Global Terms and Conditions (\'Global  Terms  and  Conditions")  are  incorporated  by  reference into each country-specific Master  Services  Agreement entered into  by  Customer and the  applicable Digital Realty  entity  (each, a \'Country  MSA\'). The  terms  \'Customer\'  and \'Digital Realty\' are defined in the applicable Country MSA. Customer and Digital Realty may  enter  into orders  (whether  executed by a manual or electronic signature or via Digital Realty\'s online ordering platform, each  an \'Order\'), each  of  which incorporates  the  terms of the applicable Country MSA  and constitutes a separate and distinct  contract  for  data center-related  services  and/or  products to be provided by Digital Realty to Customer (\'Services\'), as more  particularly  set  forth on each  Order. An Order,  together  with its applicable Country MSA and these Global Terms and Conditions, is  referred  to  as

In [78]:
# # print(type(ranked_nodes))
for i in tk:
    print(i)
    print("#"*40)


<!-- image -->
########################################
## GLOBAL  TERMS  AND  CONDITIONS

1. Structure. These Global Terms and Conditions ('Global  Terms  and  Conditions")  are  incorporated  by  reference into each country-specific Master  Services  Agreement entered into  by  Customer and the  applicable Digital Realty  entity  (each, a 'Country  MSA'). The  terms  'Customer'  and 'Digital Realty' are defined in the applicable Country MSA. Customer and Digital Realty may  enter  into orders  (whether  executed by a manual or electronic signature or via Digital Realty's online ordering platform, each  an 'Order'), each  of  which incorporates  the  terms of the applicable Country MSA  and constitutes a separate and distinct  contract  for  data center-related  services  and/or  products to be provided by Digital Realty to Customer ('Services'), as more  particularly  set  forth on each  Order. An Order,  together  with its applicable Country MSA and these Global Terms and Conditions

In [92]:
len(chunk_txt)
print(type(chunk_txt))

<class 'list'>


In [None]:
# def chunk_text_for_embedding(chunk_texts, model_name="BAAI/bge-base-en-v1.5", max_tokens=512, buffer_tokens=20):
#     """
#     Further chunk text into smaller pieces suitable for embedding models,
#     respecting a maximum token limit and sentence boundaries.
#     """
#     tokenizer = AutoTokenizer.from_pretrained(model_name)
#     all_final_chunks = []

#     for text in chunk_texts:
#         sentences = nltk.sent_tokenize(text)
#         current_chunk_sentences = []
#         current_tokens = 0

#         for sentence in sentences:
#             token_count = len(tokenizer.encode(sentence, add_special_tokens=False))

#             if current_tokens + token_count <= max_tokens - buffer_tokens:
#                 current_chunk_sentences.append(sentence)
#                 # current_tokens += token_count
#             else:
#                 if current_chunk_sentences:
#                     all_final_chunks.append(" ".join(current_chunk_sentences))
#                 current_chunk_sentences = [sentence]
#                 current_tokens = token_count

#         if current_chunk_sentences:
#             all_final_chunks.append(" ".join(current_chunk_sentences))
#     return all_final_chunks


from transformers import AutoTokenizer
import nltk

def chunk_text_for_embedding(
    chunk_texts, 
    model_name="BAAI/bge-base-en-v1.5", 
    max_tokens=512, 
    buffer_tokens=20, 
    overlap_tokens=50
):
    """
    Splits text into smaller overlapping chunks suitable for embedding.
    - Respects max token limit.
    - Uses sentence boundaries where possible.
    - Falls back to token-level splitting for very long sentences/paragraphs.
    """

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    all_final_chunks = []
    print("ssss")
    for text in chunk_texts:
        print(f"chunk_text :: {text}")
        sentences = nltk.sent_tokenize(text)
        print(len(sentences))
        # print(type(sentences))
        # print(len(sentences))
        # print(f"sentence : {sentences}")
        current_chunk_sentences = []
        current_tokens = 0
        
        for sentence in sentences:
            # print(f"sentence1 :: {sentence}")
            tokenized = tokenizer.encode(sentence, add_special_tokens=False)
            # print(len(tokenized))
            token_count = len(tokenized)
        
            
            # Case 1: Sentence fits in current chunk
            if current_tokens + token_count <= max_tokens - buffer_tokens:
                current_tokens += token_count
                
                # print(f"sentence1 :: {sentence}")
                # print(f"current_tokens :: {current_tokens}")
                current_chunk_sentences.append(sentence)
    # print(f"current_tokens :: {current_tokens}")
        # break
  

                
        # print(f"current_sentencess :: {current_chunk_sentences}")
        # print(len(current_chunk_sentences))

        #     else:
        #         # print("eeee")
        #         # Save the current chunk before overflow
        #         if current_chunk_sentences:
        #             # print("dd")
        #             all_final_chunks.append(" ".join(current_chunk_sentences))

        #         # Case 2: Sentence itself is too long → break into token chunks
        #         if token_count > max_tokens - buffer_tokens:
        #             # print("fff")
        #             print(f"Warning: Sentence too long, splitting by tokens. Sentence length: {token_count} tokens.")
        #             start = 0
        #             while start < len(tokenized):
        #                 end = min(start + max_tokens - buffer_tokens, len(tokenized))
        #                 chunk_tokens = tokenized[start:end]
        #                 chunk_text = tokenizer.decode(chunk_tokens)
        #                 print(f"  Created token chunk of {len(chunk_tokens)} tokens.")
        #                 all_final_chunks.append(chunk_text)

        #                 if end == len(tokenized):  # ✅ stop if at the end
        #                     break
        #                 start = end - overlap_tokens  # move with overlap

        #             current_chunk_sentences = []
        #             current_tokens = 0

        #         # Case 3: Sentence starts new chunk
        #         else:
        #             current_chunk_sentences = [sentence]
        #             current_tokens = token_count

        # # Save leftover chunk
        # if current_chunk_sentences:
        #     all_final_chunks.append(" ".join(current_chunk_sentences))

    return all_final_chunks

chunk_text_for_embedding(chunk_txt)

ssss
chunk_text :: <!-- image -->
1
chunk_text :: ## GLOBAL  TERMS  AND  CONDITIONS

1. Structure. These Global Terms and Conditions ('Global  Terms  and  Conditions")  are  incorporated  by  reference into each country-specific Master  Services  Agreement entered into  by  Customer and the  applicable Digital Realty  entity  (each, a 'Country  MSA'). The  terms  'Customer'  and 'Digital Realty' are defined in the applicable Country MSA. Customer and Digital Realty may  enter  into orders  (whether  executed by a manual or electronic signature or via Digital Realty's online ordering platform, each  an 'Order'), each  of  which incorporates  the  terms of the applicable Country MSA  and constitutes a separate and distinct  contract  for  data center-related  services  and/or  products to be provided by Digital Realty to Customer ('Services'), as more  particularly  set  forth on each  Order.  An Order,  together  with its applicable Country MSA and these Global Terms and Conditions, is 

[]

In [133]:
print(type(chunk_txt))

<class 'list'>


In [137]:
from transformers import AutoTokenizer

def simple_token_chunks(texts, max_tokens=512, overlap=50):
    tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-base-en-v1.5")

    # Ensure texts is a list
    if isinstance(texts, str):
        texts = [texts]

    all_chunks = []
    for text in texts:
        tokens = tokenizer.encode(text, add_special_tokens=False)

        # If text is longer than max_tokens, break it
        start = 0
        while start < len(tokens):
            end = start + max_tokens
            chunk_tokens = tokens[start:end]
            chunk_text = tokenizer.decode(chunk_tokens)
            all_chunks.append(chunk_text)

            # Move forward with overlap
            start = end - overlap if end - overlap > 0 else end

    return all_chunks


In [None]:
from transformers import AutoTokenizer
import nltk

# nltk.download("punkt")

def sentence_token_chunks(text, max_tokens=512, overlap=100):
    tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-base-en-v1.5")

    sentences = nltk.sent_tokenize(text)
    print("ssss")
    print(len(sentences))
    chunks = []
    current_chunk = []
    current_tokens = 0

    for sentence in sentences:
        print(f"sentence {sentence}")
        print("#"*20)
        tokenized = tokenizer.encode(sentence, add_special_tokens=False)
        token_count = len(tokenized)

        # Case 1: Sentence itself too long
        if token_count > max_tokens:
            start = 0
            print("success")
            while start < token_count:
                end = start + max_tokens
                chunk_tokens = tokenized[start:end]
                chunks.append(tokenizer.decode(chunk_tokens))
                start = end - overlap
            continue

        # Case 2: Fits in current chunk
        if current_tokens + token_count <= max_tokens:
           
            current_chunk.append(sentence)
            current_tokens += token_count
        else:
            chunks.append(" ".join(current_chunk))
            current_chunk = [sentence]
            current_tokens = token_count

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks


def chunk_list(long_text_list, max_tokens=512, overlap=100):
    """Takes a list of texts and returns list of token-aware chunks."""
    all_chunks = []
    for text in long_text_list:
        all_chunks.extend(sentence_token_chunks(text, max_tokens, overlap))
    return all_chunks


# # Example
# long_text = [
#     "This is a very long paragraph... (1st)",
#     "Here is another paragraph... (2nd)",
#     "And yet another... (3rd)"
# ]

chunks = chunk_list(chunk_txt)

for i, ch in enumerate(chunks, 1):
    print(f"Chunk {i} {ch}")


ssss
1
senetnce <!-- image -->
####################
ssss
7
senetnce ## GLOBAL  TERMS  AND  CONDITIONS

1.
####################
senetnce Structure.
####################
senetnce These Global Terms and Conditions ('Global  Terms  and  Conditions")  are  incorporated  by  reference into each country-specific Master  Services  Agreement entered into  by  Customer and the  applicable Digital Realty  entity  (each, a 'Country  MSA').
####################
senetnce The  terms  'Customer'  and 'Digital Realty' are defined in the applicable Country MSA.
####################
senetnce Customer and Digital Realty may  enter  into orders  (whether  executed by a manual or electronic signature or via Digital Realty's online ordering platform, each  an 'Order'), each  of  which incorporates  the  terms of the applicable Country MSA  and constitutes a separate and distinct  contract  for  data center-related  services  and/or  products to be provided by Digital Realty to Customer ('Services'), as more 

In [155]:
from transformers import AutoTokenizer

# Load tokenizer once
tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-base-en-v1.5")

def paragraph_chunker(paragraphs, max_tokens=512, overlap=50):
    """
    Splits a list of paragraphs into token-limited chunks.
    Keeps words intact, adds overlap for context.
    
    Args:
        paragraphs (list[str]): List of paragraph strings
        max_tokens (int): Max token length per chunk
        overlap (int): Number of tokens to overlap
    
    Returns:
        list[str]: List of chunked text strings
    """
    chunks = []

    for para in paragraphs:
        # Convert paragraph into tokens
        tokens = tokenizer.encode(para, add_special_tokens=False)

        # If paragraph fits within limit -> keep as is
        if len(tokens) <= max_tokens:
            chunks.append(para)
            continue

        # If paragraph is too long -> split into sub-chunks
        start = 0
        while start < len(tokens):
            end = start + max_tokens
            chunk_tokens = tokens[start:end]
            chunk_text = tokenizer.decode(chunk_tokens)

            chunks.append(chunk_text)

            # move forward with overlap
            start = end - overlap

    return chunks


# Example usage
paragraphs = chunk_txt

chunks = paragraph_chunker(paragraphs, max_tokens=512, overlap=50)

for i, ch in enumerate(chunks, 1):
    print(f"\nChunk {i} ({len(tokenizer.encode(ch, add_special_tokens=False))} tokens):\n{ch}")


Token indices sequence length is longer than the specified maximum sequence length for this model (1006 > 512). Running this sequence through the model will result in indexing errors



Chunk 1 (8 tokens):
<!-- image -->

Chunk 2 (222 tokens):
## GLOBAL  TERMS  AND  CONDITIONS

1. Structure. These Global Terms and Conditions ('Global  Terms  and  Conditions")  are  incorporated  by  reference into each country-specific Master  Services  Agreement entered into  by  Customer and the  applicable Digital Realty  entity  (each, a 'Country  MSA'). The  terms  'Customer'  and 'Digital Realty' are defined in the applicable Country MSA. Customer and Digital Realty may  enter  into orders  (whether  executed by a manual or electronic signature or via Digital Realty's online ordering platform, each  an 'Order'), each  of  which incorporates  the  terms of the applicable Country MSA  and constitutes a separate and distinct  contract  for  data center-related  services  and/or  products to be provided by Digital Realty to Customer ('Services'), as more  particularly  set  forth on each  Order.  An Order,  together  with its applicable Country MSA and these Global Terms and Condit

In [156]:
import re
from typing import List
from transformers import AutoTokenizer

# Load once at module import time
TOKENIZER = AutoTokenizer.from_pretrained("BAAI/bge-base-en-v1.5")

def _token_len(text: str) -> int:
    """Accurate token count for a string using the chosen tokenizer."""
    return len(TOKENIZER.encode(text, add_special_tokens=False))

def _tail_words_for_overlap(words: List[str], overlap_tokens: int) -> List[str]:
    """
    Take as few words as possible from the end so that their
    token-sum >= overlap_tokens. Returns them in normal order.
    """
    tail = []
    acc = 0
    # Walk backwards and collect words until we hit the overlap budget
    for w in reversed(words):
        w_tokens = _token_len(w)
        tail.append(w)
        acc += w_tokens
        if acc >= overlap_tokens:
            break
    return list(reversed(tail))

def chunk_paragraphs_keep_words(
    paragraphs: List[str],
    max_tokens: int = 512,
    overlap_tokens: int = 50,
) -> List[str]:
    """
    Make overlapping chunks from paragraphs without breaking words.
    - Keeps paragraph boundaries (no chunk crosses paragraphs).
    - Uses token-based budget and token-based overlap.
    - Falls back for pathological 'words' longer than the budget.
    """
    chunks: List[str] = []

    for para in paragraphs:
        if not para.strip():
            continue  # skip empty paragraphs

        # Split into "words" while preserving the exact spacing after each token.
        # Example: "Hello world!" -> ["Hello ", "world!"]
        words = re.findall(r"\S+\s*", para)

        cur_words: List[str] = []
        cur_tokens = 0
        i = 0

        while i < len(words):
            w = words[i]
            w_tok = _token_len(w)

            # Pathological case: a single "word" (incl. trailing space) exceeds the budget.
            # We cannot keep words intact here; we slice by tokens as a last resort.
            if w_tok > max_tokens:
                # 1) Flush any current chunk
                if cur_words:
                    chunks.append("".join(cur_words).rstrip())
                    # Seed next chunk with overlap from the flushed chunk
                    cur_words = _tail_words_for_overlap(cur_words, overlap_tokens)
                    cur_tokens = sum(_token_len(x) for x in cur_words)

                # 2) Slice the long word by tokens
                w_tokens = TOKENIZER.encode(w, add_special_tokens=False)
                start = 0
                while start < len(w_tokens):
                    end = min(start + max_tokens, len(w_tokens))
                    part_text = TOKENIZER.decode(w_tokens[start:end])
                    chunks.append(part_text)
                    if end == len(w_tokens):
                        break
                    start = end - overlap_tokens  # token overlap inside the long 'word'

                i += 1
                # Reset current chunk after handling the long word
                cur_words = []
                cur_tokens = 0
                continue

            # Normal case: can we fit this whole word into the current chunk?
            if cur_tokens + w_tok <= max_tokens:
                cur_words.append(w)
                cur_tokens += w_tok
                i += 1
            else:
                # Finalize current chunk
                if cur_words:
                    chunks.append("".join(cur_words).rstrip())
                    # Prepare overlap tail for the next chunk
                    tail = _tail_words_for_overlap(cur_words, overlap_tokens)
                    cur_words = tail[:]  # start next chunk with overlap words
                    cur_tokens = sum(_token_len(x) for x in cur_words)
                else:
                    # (Shouldn't happen often) single small word didn't fit;
                    # start a new chunk with it.
                    cur_words = [w]
                    cur_tokens = w_tok
                    i += 1

        # Flush any remaining words from this paragraph
        if cur_words:
            chunks.append("".join(cur_words).rstrip())

    return chunks


paragraphs = chunk_txt
chunks = chunk_paragraphs_keep_words(paragraphs, max_tokens=512, overlap_tokens=50)

for i, ch in enumerate(chunks, 1):
    print(f"\n--- Chunk {i} (tokens={len(TOKENIZER.encode(ch, add_special_tokens=False))}) ---\n{ch}")



--- Chunk 1 (tokens=8) ---
<!-- image -->

--- Chunk 2 (tokens=222) ---
## GLOBAL  TERMS  AND  CONDITIONS

1. Structure. These Global Terms and Conditions ('Global  Terms  and  Conditions")  are  incorporated  by  reference into each country-specific Master  Services  Agreement entered into  by  Customer and the  applicable Digital Realty  entity  (each, a 'Country  MSA'). The  terms  'Customer'  and 'Digital Realty' are defined in the applicable Country MSA. Customer and Digital Realty may  enter  into orders  (whether  executed by a manual or electronic signature or via Digital Realty's online ordering platform, each  an 'Order'), each  of  which incorporates  the  terms of the applicable Country MSA  and constitutes a separate and distinct  contract  for  data center-related  services  and/or  products to be provided by Digital Realty to Customer ('Services'), as more  particularly  set  forth on each  Order.  An Order,  together  with its applicable Country MSA and these Global Te

In [80]:
current_chunk_sentences  = ["hese Global Terms and Conditions", "Global  Terms  and  Conditions"]

chunk_text = " ".join(current_chunk_sentences)