In [None]:
def chunk_by_sentences(text, max_chunk_size=500):
    """
    Split text into chunks by sentences, keeping sentences intact.

    Sentences are packed greedily: each one is appended to the current
    chunk (joined with a single space) as long as the joined result stays
    within max_chunk_size. A single sentence longer than max_chunk_size is
    never split; it is emitted as its own oversized chunk.

    Args:
        text: The text to chunk
        max_chunk_size: Maximum characters per chunk

    Returns:
        List of text chunks (empty list if text has no non-whitespace content)
    """
    # Simple sentence splitting: break on whitespace that follows . ! or ?
    import re
    sentences = re.split(r'(?<=[.!?])\s+', text)

    chunks = []
    current_parts = []   # sentences collected for the chunk being built
    current_len = 0      # length of " ".join(current_parts)

    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            # Empty / whitespace-only input would otherwise yield a "" chunk
            continue

        # Length of the chunk if this sentence were added; the +1 accounts
        # for the joining space so the limit applies to the real chunk text.
        if current_parts:
            projected_len = current_len + 1 + len(sentence)
        else:
            projected_len = len(sentence)

        if current_parts and projected_len > max_chunk_size:
            # Save current chunk and start a new one with this sentence
            chunks.append(" ".join(current_parts))
            current_parts = [sentence]
            current_len = len(sentence)
        else:
            # Add sentence to current chunk
            current_parts.append(sentence)
            current_len = projected_len

    # Don't forget the last chunk
    if current_parts:
        chunks.append(" ".join(current_parts))

    return chunks

# Demo: run the sentence chunker on the sample document with max_chunk_size=400
chunks = chunk_by_sentences(sample_document, max_chunk_size=400)

# Report the chunk count, then each chunk with its length and a divider line
print(f"Number of chunks: {len(chunks)}\n")
for i, chunk in enumerate(chunks, start=1):
    print(f"Chunk {i} ({len(chunk)} chars):", chunk, "-" * 80, sep="\n")

In [None]:
def chunk_by_paragraphs(text, min_chunk_size=100):
    """
    Split text by paragraphs (double newlines), merging short paragraphs.

    Paragraphs are accumulated (joined by a blank line) until the pending
    chunk reaches min_chunk_size, so a short paragraph such as a heading is
    combined with the paragraph(s) that follow it. If short paragraphs
    remain at the end of the text, they are appended to the last chunk;
    when there is no previous chunk they are returned as a single chunk,
    which is the only case where a chunk can be below min_chunk_size.

    Args:
        text: The text to chunk
        min_chunk_size: Minimum characters per chunk (combine small paragraphs)

    Returns:
        List of text chunks (empty list if text has no non-whitespace content)
    """
    # Split by double newlines (paragraph separator)
    paragraphs = text.split('\n\n')

    chunks = []
    pending = []      # paragraphs collected for the chunk being built
    pending_len = 0   # length of "\n\n".join(pending)

    for para in paragraphs:
        para = para.strip()
        if not para:
            continue

        # Account for the "\n\n" separator when this is not the first piece
        pending_len += len(para) + (2 if pending else 0)
        pending.append(para)

        # Emit only once the minimum size is reached, so small paragraphs
        # are carried forward and combined with what follows.
        if pending_len >= min_chunk_size:
            chunks.append("\n\n".join(pending))
            pending = []
            pending_len = 0

    # Trailing paragraphs that never reached the minimum size
    if pending:
        leftover = "\n\n".join(pending)
        if chunks:
            # Nothing follows to merge with, so attach to the previous chunk
            chunks[-1] += "\n\n" + leftover
        else:
            chunks.append(leftover)

    return chunks

# Demo: run the paragraph chunker on the sample document with min_chunk_size=100
chunks = chunk_by_paragraphs(sample_document, min_chunk_size=100)

# Report the chunk count, then each chunk with its length and a divider line
print(f"Number of chunks: {len(chunks)}\n")
for i, chunk in enumerate(chunks, start=1):
    print(f"Chunk {i} ({len(chunk)} chars):", chunk, "-" * 80, sep="\n")