# Document Chunking Strategies

This notebook demonstrates five document chunking strategies using LangChain.


## 1. Character Text Splitting

Basic splitting on a single separator, with the chunk size measured in characters.


In [None]:
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader

# Load the text file.
# The encoding is pinned to UTF-8 so the file is decoded the same way on every
# platform instead of depending on the locale's default encoding.
loader = TextLoader('SteveJobsSpeech.txt', encoding='utf-8')
docs = loader.load()
# The first loaded document's content is the text the later cells split.
text = docs[0].page_content

# Quick sanity check of what was loaded.
print(f"Original text length: {len(text)} characters")
print(f"First 200 characters: {text[:200]}...")


In [None]:
# Character-based splitting: the text is cut on blank lines ("\n\n") and the
# size/overlap limits below are expressed in characters.
char_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    separator="\n\n",
)
char_chunks = char_splitter.split_text(text)

# Summarise the result and preview the first chunk (at most 300 characters).
print(f"Number of chunks: {len(char_chunks)}")
first_char_chunk = char_chunks[0]
print(f"First chunk length: {len(first_char_chunk)} characters")
print(f"First chunk: {first_char_chunk[:300]}...")


## 2. Token-based Chunking

Splitting based on token count using tiktoken.


In [None]:
from langchain.text_splitter import TokenTextSplitter

# Token-based splitting: here chunk_size and chunk_overlap are the limits
# handed to the token splitter rather than character counts.
token_splitter = TokenTextSplitter(chunk_overlap=20, chunk_size=200)
token_chunks = token_splitter.split_text(text)

# Summarise the result; the preview slice is still taken in characters.
print(f"Number of token-based chunks: {len(token_chunks)}")
first_token_chunk = token_chunks[0]
print(f"First token chunk: {first_token_chunk[:300]}...")


## 3. Recursive Character Text Splitting

Smart splitting that tries to preserve structure by using multiple separators.


In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Recursive character splitting: the separators are listed from the coarsest
# (blank line) down to the finest (empty string).
recursive_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_overlap=100,
    chunk_size=1000,
)
recursive_chunks = recursive_splitter.split_text(text)

# Summarise the result and preview the first chunk (at most 300 characters).
print(f"Number of recursive chunks: {len(recursive_chunks)}")
first_recursive_chunk = recursive_chunks[0]
print(f"First recursive chunk: {first_recursive_chunk[:300]}...")


## 4. Markdown Header Text Splitting

Splits markdown documents based on header structure.


In [None]:
from langchain.text_splitter import MarkdownHeaderTextSplitter

# Load markdown file.
# The encoding is pinned to UTF-8 so the file is decoded the same way on every
# platform instead of depending on the locale's default encoding.
md_loader = TextLoader('examplemdfile.md', encoding='utf-8')
md_docs = md_loader.load()
md_text = md_docs[0].page_content

# Markdown header splitting.
# Each pair is (header marker, metadata key); the key is the name under which
# the matching header shows up in the chunk metadata printed below.
headers_to_split_on = [
    ("#", "Header1"),
    ("##", "Header2"),
    ("###", "Header3"),
]

md_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
md_chunks = md_splitter.split_text(md_text)

# Unlike the plain-text splitters above, these chunks are objects exposing
# .metadata and .page_content, so both are shown for the first three chunks.
print(f"Number of markdown chunks: {len(md_chunks)}")
for i, chunk in enumerate(md_chunks[:3]):
    print(f"\nChunk {i+1}:")
    print(f"Metadata: {chunk.metadata}")
    print(f"Content preview: {chunk.page_content[:200]}...")


## 5. Semantic Chunking

AI-powered chunking based on semantic similarity using OpenAI embeddings.

**Note:** Requires an OpenAI API key, either set in the `OPENAI_API_KEY` environment variable or passed directly to `OpenAIEmbeddings`.


In [None]:
from langchain_openai import OpenAIEmbeddings
from langchain_experimental.text_splitter import SemanticChunker
import os

# Semantic chunking (requires OpenAI API key)
try:
    # Only the steps that need the embeddings service live in the ``try``
    # body, so the handler below cannot hide mistakes in the reporting code.

    # Initialize embeddings - add your API key here if not in environment
    embeddings = OpenAIEmbeddings()  # or OpenAIEmbeddings(api_key="your-key-here")

    # Create semantic chunker
    semantic_chunker = SemanticChunker(embeddings)

    # Split text semantically (using a 2000-character subset for the demo)
    semantic_chunks = semantic_chunker.split_text(text[:2000])
except Exception as e:
    # Notebook-level boundary: report the failure and let later cells run.
    print(f"❌ Error with semantic chunking: {e}")
    if os.environ.get("OPENAI_API_KEY"):
        # A key is present, so "missing key" would be a misleading diagnosis.
        print("OPENAI_API_KEY is set, so the key may be invalid or the request")
        print("failed for another reason (network, quota, rate limit).")
        print("See the error message above for details.")
    else:
        print("This likely means the OpenAI API key is missing or invalid.")
        print("To use semantic chunking:")
        print("1. Get an API key from https://platform.openai.com/api-keys")
        print("2. Set it as: export OPENAI_API_KEY='your-key-here'")
        print("3. Or pass directly: OpenAIEmbeddings(api_key='your-key-here')")
else:
    # Success path: summarise the chunks and preview each one.
    print(f"Number of semantic chunks: {len(semantic_chunks)}")
    for i, chunk in enumerate(semantic_chunks):
        print(f"\nSemantic Chunk {i+1} (length: {len(chunk)}):")
        print(f"{chunk[:200]}...")
