In [1]:
# ==============================
# Chunking Utilities Module
# ==============================

import re


def chunk_by_characters(text, chunk_size=200, overlap=50):
    """
    Split text into fixed-size character windows that overlap.

    Args:
        text: The text to chunk
        chunk_size: Number of characters per chunk (must be positive)
        overlap: Number of characters shared by consecutive chunks
            (must satisfy 0 <= overlap < chunk_size)

    Returns:
        List of text chunks (empty list for empty text). The final chunk
        may be shorter than chunk_size.

    Raises:
        ValueError: If chunk_size is not positive, or if overlap is negative
            or not smaller than chunk_size.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    # overlap >= chunk_size would make the window stand still or move
    # backwards, so the loop below would never terminate.
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    total = len(text)
    chunks = []

    for start in range(0, total, step):
        end = start + chunk_size
        chunks.append(text[start:end])

        # Once a window reaches the end of the text, any further window
        # would contain nothing but the overlap already emitted.
        if end >= total:
            break

    return chunks


def chunk_by_words(text, chunk_size=50, overlap=10):
    """
    Split text into overlapping chunks of a fixed number of words.

    Words are whatever ``str.split()`` yields (runs of non-whitespace), and
    each chunk is rebuilt by joining its words with single spaces, so the
    original whitespace (including newlines) is not preserved.

    Args:
        text: The text to chunk
        chunk_size: Number of words per chunk (must be positive)
        overlap: Number of words shared by consecutive chunks
            (must satisfy 0 <= overlap < chunk_size)

    Returns:
        List of text chunks (empty list when text has no words). The final
        chunk may contain fewer than chunk_size words.

    Raises:
        ValueError: If chunk_size is not positive, or if overlap is negative
            or not smaller than chunk_size.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    # overlap >= chunk_size would make the window stand still or move
    # backwards, so the loop below would never terminate.
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    total = len(words)
    chunks = []

    for start in range(0, total, step):
        end = start + chunk_size
        chunks.append(" ".join(words[start:end]))

        # Once a window reaches the last word, any further window would
        # contain nothing but the overlap already emitted.
        if end >= total:
            break

    return chunks


def chunk_by_sentences(text, max_chunk_size=500):
    """
    Group whole sentences into chunks of at most max_chunk_size characters.

    Sentences are split at whitespace that follows '.', '!' or '?'.
    Consecutive sentences are joined with a single space for as long as the
    joined result (including that space) fits within max_chunk_size.

    Args:
        text: The text to chunk
        max_chunk_size: Maximum characters per chunk. A single sentence
            longer than this is never cut; it becomes a chunk on its own
            and is the only way a chunk can exceed the limit.

    Returns:
        List of text chunks without leading/trailing whitespace (empty list
        for empty or whitespace-only text)
    """
    # Strip first so surrounding whitespace neither counts toward the size
    # limit nor produces an empty "sentence".
    sentences = re.split(r'(?<=[.!?])\s+', text.strip())

    chunks = []
    current_chunk = ""

    for sentence in sentences:
        if not sentence:
            # Only happens for empty/whitespace-only input.
            continue

        if not current_chunk:
            current_chunk = sentence
        elif len(current_chunk) + 1 + len(sentence) > max_chunk_size:
            # The "+ 1" accounts for the joining space, so a combined chunk
            # can never overshoot the limit.
            chunks.append(current_chunk)
            current_chunk = sentence
        else:
            current_chunk += " " + sentence

    if current_chunk:
        chunks.append(current_chunk)

    return chunks


def chunk_by_paragraphs(text, min_chunk_size=100):
    """
    Chunk text along blank-line ("\\n\\n") paragraph boundaries.

    Each paragraph is stripped and empty ones are dropped. A paragraph
    shorter than min_chunk_size is attached (separated by a blank line) to
    the chunk currently being built; a paragraph of at least min_chunk_size
    characters closes that chunk and begins a new one. Consequently short
    paragraphs that precede the first long paragraph form a chunk of their
    own, which may be shorter than min_chunk_size.

    Args:
        text: The text to chunk
        min_chunk_size: Paragraph length (in characters) below which a
            paragraph is merged into the chunk in progress

    Returns:
        List of text chunks
    """
    stripped = (block.strip() for block in text.split("\n\n"))

    result = []
    pending = []  # paragraphs that make up the chunk in progress

    for piece in filter(None, stripped):
        starts_new_chunk = len(piece) >= min_chunk_size
        if starts_new_chunk and pending:
            result.append("\n\n".join(pending))
            pending = []
        pending.append(piece)

    if pending:
        result.append("\n\n".join(pending))

    return result


In [2]:
# Document A: FAQ

strategy_A = "sentence-based chunking"
reason_A = (
    "FAQs contain short, direct statements. Sentence chunking preserves "
    "question–answer clarity without fragmenting meaning."
)

# Sample FAQ: three question/answer pairs separated by blank lines.
doc_A = """
Q: What is the return policy?
A: Items can be returned within 30 days of purchase with original receipt.

Q: Do you offer international shipping?
A: Yes, we ship to over 50 countries worldwide. Shipping times vary by location.

Q: How do I track my order?
A: Use the tracking number sent to your email after shipment.
"""

# NOTE(review): with max_chunk_size=200 the sentence "Shipping times vary by
# location." ends up in a different chunk from the rest of its answer, so
# this setting does not keep every question/answer pair together.
chunks_A = chunk_by_sentences(doc_A, max_chunk_size=200)

# Print each chunk followed by an 80-character rule.
for i, chunk in enumerate(chunks_A, 1):
    print(f"Chunk {i}:")
    print(chunk)
    print("-" * 80)


Chunk 1:
Q: What is the return policy? A: Items can be returned within 30 days of purchase with original receipt. Q: Do you offer international shipping? A: Yes, we ship to over 50 countries worldwide.
--------------------------------------------------------------------------------
Chunk 2:
Shipping times vary by location. Q: How do I track my order? A: Use the tracking number sent to your email after shipment.
--------------------------------------------------------------------------------


In [3]:
# Document B: Technical Documentation

# Chosen strategy and the rationale for it (kept alongside the demo).
strategy_B = "paragraph-based chunking"
reason_B = (
    "Technical documentation follows structured steps. Paragraph chunking "
    "keeps each step intact and easy to retrieve."
)

# Sample install guide: a title followed by three two-line steps,
# each separated by a blank line.
doc_B = """
Installation Guide

Step 1: Download the installer from our website.
Extract the zip file to your desired location.

Step 2: Run setup.exe as administrator.
Follow the on-screen instructions.

Step 3: Configure your API key in the settings file.
The settings file is located at config/settings.json.
"""

chunks_B = chunk_by_paragraphs(doc_B, min_chunk_size=80)

# One print per chunk: header, chunk text, then an 80-character rule.
for i, chunk in enumerate(chunks_B, start=1):
    print(f"Chunk {i}:", chunk, "-" * 80, sep="\n")


Chunk 1:
Installation Guide
--------------------------------------------------------------------------------
Chunk 2:
Step 1: Download the installer from our website.
Extract the zip file to your desired location.

Step 2: Run setup.exe as administrator.
Follow the on-screen instructions.
--------------------------------------------------------------------------------
Chunk 3:
Step 3: Configure your API key in the settings file.
The settings file is located at config/settings.json.
--------------------------------------------------------------------------------


In [4]:
# Document C: Article

# Chosen strategy and the rationale for it (kept alongside the demo).
strategy_C = "word-based chunking"
reason_C = (
    "Articles contain long explanatory text. Word-based chunking maintains "
    "semantic coherence while ensuring uniform chunk size."
)

# Sample article: three hard-wrapped paragraphs separated by blank lines.
doc_C = """
Solar and wind power have seen tremendous growth in recent years. As technology improves
and costs decrease, renewable energy becomes increasingly competitive with fossil fuels.

Energy storage solutions are critical for renewable adoption. Battery technology advances
enable better grid management and reliability. This addresses the intermittent nature of
solar and wind power.

Policy support and public awareness continue to drive the transition. Many countries have
set ambitious renewable energy targets for the coming decades.
"""

chunks_C = chunk_by_words(doc_C, chunk_size=40, overlap=8)

# One print per chunk: header, chunk text, then an 80-character rule.
for i, chunk in enumerate(chunks_C, start=1):
    print(f"Chunk {i}:", chunk, "-" * 80, sep="\n")


Chunk 1:
Solar and wind power have seen tremendous growth in recent years. As technology improves and costs decrease, renewable energy becomes increasingly competitive with fossil fuels. Energy storage solutions are critical for renewable adoption. Battery technology advances enable better grid management
--------------------------------------------------------------------------------
Chunk 2:
adoption. Battery technology advances enable better grid management and reliability. This addresses the intermittent nature of solar and wind power. Policy support and public awareness continue to drive the transition. Many countries have set ambitious renewable energy targets for the
--------------------------------------------------------------------------------
Chunk 3:
have set ambitious renewable energy targets for the coming decades.
--------------------------------------------------------------------------------
