# 2 Document Splitting

In [1]:
import os
import openai

from dotenv import load_dotenv, find_dotenv
# Load settings from the .env file located by find_dotenv(); the return
# value of load_dotenv() is not needed, so it is bound to a throwaway name.
_ = load_dotenv(find_dotenv())
# Read the OpenAI key from the process environment. Indexing os.environ
# raises KeyError when OPENAI_API_KEY is not set.
openai.api_key = os.environ["OPENAI_API_KEY"]

In [2]:
import string
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter

# Sample input: the 52 ASCII letters, with no whitespace to split on.
text = string.ascii_letters

chunk_size = 26
chunk_overlap = 4

# Recursive splitter, left on its default separator list.
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)

# The character splitter cuts on one separator, much like Python's native
# str.split. Its separator defaults to "\n\n"; "" is passed here instead.
# A chunk can end up longer than chunk_size when the separator does not occur
# in the text, in which case a warning is emitted.
c_splitter = CharacterTextSplitter(
    separator="",
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)

print("Recursive splitter:", r_splitter.split_text(text))
print("Character splitter:", c_splitter.split_text(text))

Recursive splitter: ['abcdefghijklmnopqrstuvwxyz', 'wxyzABCDEFGHIJKLMNOPQRSTUV', 'STUVWXYZ']
Character splitter: ['abcdefghijklmnopqrstuvwxyz', 'wxyzABCDEFGHIJKLMNOPQRSTUV', 'STUVWXYZ']


In [3]:
# Sample prose used by the splitter cells below. It is assembled from adjacent
# string literals, so the only line breaks in the value are the explicit "\n\n"
# between the two paragraphs. The missing space in "space.and" is part of the
# sample text and is kept as is.
text = (
    "When writing documents, writers will use document structure to group content. "
    "This can convey to the reader, which idea's are related. For example, closely related ideas "
    "are in sentences. Similar ideas are in paragraphs. Paragraphs form a document. \n\n  "
    "Paragraphs are often delimited with a carriage return or two carriage returns. "
    'Carriage returns are the "backslash n" you see embedded in this string. '
    "Sentences have a period at the end, but also, have a space."
    "and words are separated by space."
)
len(text)

496

In [4]:
from pprint import pprint

r_splitter = RecursiveCharacterTextSplitter(
    # Separators are tried in order: "\n\n" first, then "\n", and so on, so the
    # coarsest boundary present in the text is used before finer ones.
    # The lookbehind pattern matches the empty position right after ". ", which
    # leaves the period at the end of the sentence it closes.
    separators=["\n\n", "\n", r"(?<=\. )", " ", ""],
    # Needed for the lookbehind entry: with the default (False) every separator
    # is escaped and matched literally, so the pattern would never match.
    is_separator_regex=True,
    chunk_size=450,
    chunk_overlap=0,
)

pprint(r_splitter.split_text(text))

['When writing documents, writers will use document structure to group '
 "content. This can convey to the reader, which idea's are related. For "
 'example, closely related ideas are in sentences. Similar ideas are in '
 'paragraphs. Paragraphs form a document.',
 'Paragraphs are often delimited with a carriage return or two carriage '
 'returns. Carriage returns are the "backslash n" you see embedded in this '
 'string. Sentences have a period at the end, but also, have a space.and words '
 'are separated by space.']


In [5]:
# Try with a longer document
from langchain.document_loaders import PyPDFLoader
# Load the lecture PDF and keep the list returned by the loader.
loader = PyPDFLoader("docs/MachineLearning-Lecture01.pdf")
pages = loader.load()

splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", r"(?<=\. )", " ", ""],
    # Needed for the lookbehind entry above: with the default (False) every
    # separator is escaped and matched literally, so the pattern would never match.
    is_separator_regex=True,
    chunk_size=1000,
    chunk_overlap=150,
    # Lengths are measured with len(), i.e. in characters. A token-counting
    # function can be passed instead, or langchain.text_splitter's
    # TokenTextSplitter can be used directly.
    length_function=len,
)
docs = splitter.split_documents(pages)

print("pages:", len(pages))
print("docs:", len(docs))

pages: 22
docs: 78


In [6]:
# Context-aware splitting (useful for Markdown docs)
from langchain.text_splitter import MarkdownHeaderTextSplitter

# Small markdown sample with three heading levels. Every piece ends in
# "\n\n " so the following piece starts after a single leading space; the
# extra "\n" after the "Lance" line makes "## Chapter 2" start at column 0.
md_doc = (
    "# Title\n\n "
    "## Chapter 1\n\n "
    "Hi this is Jim\n\n Hi this is Joe\n\n "
    "### Section \n\n "
    "Hi this is Lance \n\n \n"
    "## Chapter 2\n\n "
    "Hi this is Molly"
)

print(md_doc)

# Title

 ## Chapter 1

 Hi this is Jim

 Hi this is Joe

 ### Section 

 Hi this is Lance 

 
## Chapter 2

 Hi this is Molly


In [7]:
# Pair each heading marker ("#", "##", "###") with the metadata key
# ("Header 1" .. "Header 3") under which that heading's text is stored.
headers_to_split_on = [("#" * depth, f"Header {depth}") for depth in (1, 2, 3)]

md_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
md_splits = md_splitter.split_text(md_doc)

# Display the first split as the cell's result.
md_splits[0]

Document(metadata={'Header 1': 'Title', 'Header 2': 'Chapter 1'}, page_content='Hi this is Jim  \nHi this is Joe')