# Load Document

In [36]:
from llama_index.core import SimpleDirectoryReader

# Load the .txt files found under ../data/ (other extensions are filtered out).
documents = SimpleDirectoryReader("../data/", required_exts=[".txt"]).load_data()

# Raw text of the first loaded document; the character-based splitters below work on this string.
text = documents[0].text

# Level 1 : Fixed Size Chunking

## Manual Splitting

In [18]:
# Fixed-size chunking: slice `text` into consecutive, non-overlapping windows
# of `chunk_size` characters (the last window may be shorter).
chunk_size = 35
chunks = [
    text[offset: offset + chunk_size]
    for offset in range(0, len(text), chunk_size)
]

chunks

['What I Worked On\n\nFebruary 2021\n\nBe',
 'fore college the two main things I ',
 'worked on, outside of school, were ',
 "writing and programming. I didn't w",
 'rite essays. I wrote what beginning',
 ' writers were supposed to write the',
 'n, and probably still are: short st',
 'ories. My stories were awful. They ',
 'had hardly any plot, just character',
 's with strong feelings, which I ima',
 'gined made them deep.\n\nThe first pr',
 'ograms I tried writing were on the ',
 'IBM 1401 that our school district u',
 'sed for what was then called "data ',
 'processing." This was in 9th grade,',
 ' so I was 13 or 14. The school dist',
 "rict's 1401 happened to be in the b",
 'asement of our junior high school, ',
 'and my friend Rich Draves and I got',
 ' permission to use it. It was like ',
 "a mini Bond villain's lair down the",
 're, with all these alien-looking ma',
 'chines — CPU, disk drives, printer,',
 ' card reader — sitting up on a rais',
 'ed floor under bright fluorescen

## Langchain Character Text Splitter

In [19]:
from langchain_text_splitters import CharacterTextSplitter

# Character splitter that breaks on single spaces, configured with
# chunk_size=100 and chunk_overlap=10.
text_splitter = CharacterTextSplitter(
    separator=" ",
    chunk_size=100,
    chunk_overlap=10,
)
# Split `text` and wrap every resulting chunk in a Document.
texts = text_splitter.create_documents([text])

texts
# print(texts[4])

[Document(metadata={}, page_content='What I Worked On\n\nFebruary 2021\n\nBefore college the two main things I worked on, outside of school,'),
 Document(metadata={}, page_content="of school, were writing and programming. I didn't write essays. I wrote what beginning writers were"),
 Document(metadata={}, page_content='were supposed to write then, and probably still are: short stories. My stories were awful. They had'),
 Document(metadata={}, page_content='They had hardly any plot, just characters with strong feelings, which I imagined made them'),
 Document(metadata={}, page_content='made them deep.\n\nThe first programs I tried writing were on the IBM 1401 that our school district'),
 Document(metadata={}, page_content='district used for what was then called "data processing." This was in 9th grade, so I was 13 or 14.'),
 Document(metadata={}, page_content="13 or 14. The school district's 1401 happened to be in the basement of our junior high school, and"),
 Document(metadata={}, pag

## CharacterTextSplitter without any framework

In [20]:
def custom_text_splitter(text, chunk_size=100, chunk_overlap=20, separator=" "):
    """Split ``text`` into overlapping chunks of at most ``chunk_size`` characters.

    Each window is ``chunk_size`` characters wide. Where possible the chunk is
    ended at the last ``separator`` inside the window so it does not end in the
    middle of a word; if the window holds no usable separator the chunk is cut
    hard at ``chunk_size``. The following window begins ``chunk_overlap``
    characters before the end of the previous chunk (that start position is
    not aligned to a separator).

    Args:
        text: String to split.
        chunk_size: Window width in characters; must be positive.
        chunk_overlap: Number of characters shared by consecutive windows;
            must be non-negative and smaller than ``chunk_size``.
        separator: Preferred break string.

    Returns:
        List of chunks with surrounding whitespace stripped; chunks that are
        empty after stripping are omitted.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``chunk_overlap`` is
            negative or not smaller than ``chunk_size`` (such settings can
            never make forward progress).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= chunk_overlap < chunk_size:
        raise ValueError(
            "chunk_overlap must be non-negative and smaller than chunk_size"
        )

    chunks = []
    text_length = len(text)
    start = 0
    while start < text_length:
        end = start + chunk_size
        if end >= text_length:
            # Final window: take everything that is left.
            end = text_length
        else:
            # Prefer to end on a separator. A hit at `start` itself would give
            # an empty chunk, so it is treated like "not found" (hard cut).
            cut = text.rfind(separator, start, end)
            if cut > start:
                end = cut

        chunk = text[start:end].strip()
        if chunk:  # Avoid empty chunks
            chunks.append(chunk)

        if end >= text_length:
            # Everything has been emitted; stepping back by the overlap here
            # would only produce a redundant suffix of the last chunk.
            break

        # Step back by the overlap, but never to or before the current start:
        # if the chunk was shorter than the overlap, continue without overlap
        # so the loop always advances.
        next_start = end - chunk_overlap
        start = next_start if next_start > start else end

    return chunks

# Run the hand-written splitter on `text` with 150-character windows and a 25-character overlap.
chunks = custom_text_splitter(text, chunk_size=150, chunk_overlap=25)

chunks

["What I Worked On\n\nFebruary 2021\n\nBefore college the two main things I worked on, outside of school, were writing and programming. I didn't write",
 "ogramming. I didn't write essays. I wrote what beginning writers were supposed to write then, and probably still are: short stories. My stories were",
 'stories. My stories were awful. They had hardly any plot, just characters with strong feelings, which I imagined made them deep.\n\nThe first programs',
 'deep.\n\nThe first programs I tried writing were on the IBM 1401 that our school district used for what was then called "data processing." This was in',
 'processing." This was in 9th grade, so I was 13 or 14. The school district\'s 1401 happened to be in the basement of our junior high school, and my',
 "unior high school, and my friend Rich Draves and I got permission to use it. It was like a mini Bond villain's lair down there, with all these",
 'own there, with all these alien-looking machines — CPU, disk drives, printer, card 

## Llama Index - SentenceSplitter

In [21]:
from llama_index.core.node_parser import SentenceSplitter

# NOTE(review): chunk_size / chunk_overlap for SentenceSplitter are presumably
# measured in tokens rather than characters — confirm against the llama-index docs.
splitter = SentenceSplitter(
    chunk_size=100,
    chunk_overlap=20,
)
# Parse the loaded documents into nodes.
nodes = splitter.get_nodes_from_documents(documents)

nodes

[TextNode(id_='87647ce3-4916-4565-9626-888f759359da', embedding=None, metadata={'file_path': '/Users/harshabajaj/Desktop/RAG/chunking/../data/paul_graham.txt', 'file_name': 'paul_graham.txt', 'file_type': 'text/plain', 'file_size': 75041, 'creation_date': '2025-02-18', 'last_modified_date': '2025-02-18'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='812b2d82-2a32-49b3-a164-b79873167fc5', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': '/Users/harshabajaj/Desktop/RAG/chunking/../data/paul_graham.txt', 'file_name': 'paul_graham.txt', 'file_type': 'text/plain', 'file_size': 75041, 'creation_date': '2025-02-18', 'last_modified_date': '2025-02-18'}, hash='e53bb9b115852dddc4861cc7d6a45beaaaae3eed3

# Level 2: Recursive Chunking

In [22]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Short, self-contained sample for the recursive-splitting demo. It gets its
# own name so that `text` (the full document loaded at the top of the
# notebook) is not overwritten for the cells that run after this one.
sample_text = "What I Worked On\n\nFebruary 2021 \nBefore college the two main things I worked on, outside of school, were writing and programming. I didn't write essays. I wrote what beginning writers were supposed to write then, and probably still are: short stories. My stories were awful. They had hardly any plot, just characters with strong feelings, which I imagined made them deep."

text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=50,
    chunk_overlap=30,
)
texts = text_splitter.create_documents([sample_text])
print(texts)


[Document(metadata={}, page_content='What I Worked On'), Document(metadata={}, page_content='February 2021'), Document(metadata={}, page_content='Before college the two main things I worked on,'), Document(metadata={}, page_content='two main things I worked on, outside of school,'), Document(metadata={}, page_content='worked on, outside of school, were writing and'), Document(metadata={}, page_content="of school, were writing and programming. I didn't"), Document(metadata={}, page_content="and programming. I didn't write essays. I wrote"), Document(metadata={}, page_content="didn't write essays. I wrote what beginning"), Document(metadata={}, page_content='I wrote what beginning writers were supposed to'), Document(metadata={}, page_content='writers were supposed to write then, and probably'), Document(metadata={}, page_content='to write then, and probably still are: short'), Document(metadata={}, page_content='and probably still are: short stories. My stories'), Document(metadata={}, 

# Level 3 : Document Based Chunking

## Markdown

In [23]:
# Convert the PDF into a single markdown string

import pymupdf4llm

md_text = pymupdf4llm.to_markdown("../data/attention.pdf")

# Show the raw markdown string
md_text

Processing ../data/attention.pdf...


'### Provided proper attribution is provided, Google hereby grants permission to reproduce the tables and figures in this paper solely for use in journalistic or scholarly works.\n\n## Attention Is All You Need\n\n\n**Ashish Vaswani[∗]**\nGoogle Brain\n```\navaswani@google.com\n\n```\n**Llion Jones[∗]**\nGoogle Research\n```\n llion@google.com\n\n```\n\n**Noam Shazeer[∗]**\nGoogle Brain\n```\nnoam@google.com\n\n```\n\n**Aidan N. Gomez[∗†]**\nUniversity of Toronto\n```\naidan@cs.toronto.edu\n\n```\n\n**Niki Parmar[∗]**\nGoogle Research\n```\nnikip@google.com\n\n```\n\n**Jakob Uszkoreit[∗]**\nGoogle Research\n```\nusz@google.com\n\n```\n\n**Łukasz Kaiser[∗]**\nGoogle Brain\n```\nlukaszkaiser@google.com\n\n```\n\n**Illia Polosukhin[∗‡]**\n```\nillia.polosukhin@gmail.com\n\n### Abstract\n\n```\n\nThe dominant sequence transduction models are based on complex recurrent or\nconvolutional neural networks that include an encoder and a decoder. The best\nperforming models also connect the encod

In [24]:
# Imported from `langchain_text_splitters`, the same package the other
# LangChain splitters in this notebook come from.
from langchain_text_splitters import MarkdownTextSplitter

# Markdown-aware splitter configured with chunk_size=100 and chunk_overlap=10.
markdown_splitter = MarkdownTextSplitter(chunk_size=100, chunk_overlap=10)
docs = markdown_splitter.create_documents([md_text])

docs

[Document(metadata={}, page_content='### Provided proper attribution is provided, Google hereby grants permission to reproduce the tables'),
 Document(metadata={}, page_content='tables and figures in this paper solely for use in journalistic or scholarly works.'),
 Document(metadata={}, page_content='## Attention Is All You Need\n\n\n**Ashish Vaswani[∗]**\nGoogle Brain'),
 Document(metadata={}, page_content='```\navaswani@google.com\n\n```\n**Llion Jones[∗]**\nGoogle Research\n```\n llion@google.com'),
 Document(metadata={}, page_content='```\n\n**Noam Shazeer[∗]**\nGoogle Brain\n```\nnoam@google.com'),
 Document(metadata={}, page_content='```\n\n**Aidan N. Gomez[∗†]**\nUniversity of Toronto\n```\naidan@cs.toronto.edu'),
 Document(metadata={}, page_content='```\n\n**Niki Parmar[∗]**\nGoogle Research\n```\nnikip@google.com'),
 Document(metadata={}, page_content='```\n\n**Jakob Uszkoreit[∗]**\nGoogle Research\n```\nusz@google.com'),
 Document(metadata={}, page_content='```\n\n**Łukasz Ka

## Multi-Modal

In [25]:
#!pip3 install "unstructured[all-docs]"
from typing import Any

from pydantic import BaseModel
from unstructured.partition.pdf import partition_pdf


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [26]:
# PDF that is partitioned in the next cell.
filepath = "../data/attention.pdf"

In [27]:
# Partition the PDF into elements and chunk them by title.
# NOTE(review): `extract_images_in_pdf` / `image_output_dir_path` may have been
# renamed in newer `unstructured` releases — confirm against the installed version.
raw_pdf_elements = partition_pdf(
    filename=filepath,
    # Extract the image blocks embedded in the PDF
    extract_images_in_pdf=True,
    # Use layout model (YOLOX) to get bounding boxes (for tables) and find titles
    # Titles are any sub-section of the document
    infer_table_structure=True,
    # Post-processing: aggregate text under the title it belongs to
    chunking_strategy="by_title",
    # Chunking params used when aggregating text blocks:
    #   max_characters             - hard maximum chunk length
    #   new_after_n_chars          - try to start a new chunk after this many chars
    #   combine_text_under_n_chars - try to keep chunks above this many chars
    max_characters=4000,
    new_after_n_chars=3800,
    combine_text_under_n_chars=2000,
    # Directory the extracted images are written to
    image_output_dir_path="static/pdfImages/",
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

# Level 4: Semantic Chunking

In [1]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# llama-index embedding wrapper around the all-MiniLM-L6-v2 sentence-transformers model.
embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")


  from .autonotebook import tqdm as notebook_tqdm


## 1. SemanticSplitterNodeParser class - llama index

In [2]:
from llama_index.core.node_parser import (
    SentenceSplitter,
    SemanticSplitterNodeParser,
)

# Semantic splitter driven by `embed_model`, with buffer_size=1 and a
# 95th-percentile breakpoint threshold.
splitter = SemanticSplitterNodeParser(
    buffer_size=1, breakpoint_percentile_threshold=95, embed_model=embed_model
)

# also baseline splitter (SentenceSplitter with chunk_size=512) for comparison
base_splitter = SentenceSplitter(chunk_size=512)

In [6]:
# Run the semantic splitter over the loaded documents.
nodes = splitter.get_nodes_from_documents(documents)

In [15]:
# Show the content of the first node produced by the semantic splitter.
print(nodes[0].get_content())


What I Worked On

February 2021

Before college the two main things I worked on, outside of school, were writing and programming. I didn't write essays. I wrote what beginning writers were supposed to write then, and probably still are: short stories. My stories were awful. They had hardly any plot, just characters with strong feelings, which I imagined made them deep.

The first programs I tried writing were on the IBM 1401 that our school district used for what was then called "data processing." This was in 9th grade, so I was 13 or 14. The school district's 1401 happened to be in the basement of our junior high school, and my friend Rich Draves and I got permission to use it. It was like a mini Bond villain's lair down there, with all these alien-looking machines — CPU, disk drives, printer, card reader — sitting up on a raised floor under bright fluorescent lights.

The language we used was an early version of Fortran. You had to type programs on punch cards, then stack them in the

In [14]:
# Compare with the baseline splitter: run it on the same documents and print
# the content of its first node.

base_nodes = base_splitter.get_nodes_from_documents(documents)

print(base_nodes[0].get_content())

What I Worked On

February 2021

Before college the two main things I worked on, outside of school, were writing and programming. I didn't write essays. I wrote what beginning writers were supposed to write then, and probably still are: short stories. My stories were awful. They had hardly any plot, just characters with strong feelings, which I imagined made them deep.

The first programs I tried writing were on the IBM 1401 that our school district used for what was then called "data processing." This was in 9th grade, so I was 13 or 14. The school district's 1401 happened to be in the basement of our junior high school, and my friend Rich Draves and I got permission to use it. It was like a mini Bond villain's lair down there, with all these alien-looking machines — CPU, disk drives, printer, card reader — sitting up on a raised floor under bright fluorescent lights.

The language we used was an early version of Fortran. You had to type programs on punch cards, then stack them in the

## 2. SemanticChunker - langchain

In [40]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain.embeddings import HuggingFaceEmbeddings

# NOTE(review): this rebinds `embed_model`, which earlier in the notebook holds
# the llama-index HuggingFaceEmbedding; re-running the llama-index semantic
# splitter cell after this one would pick up this LangChain object instead.
# NOTE(review): the "-zh-" in the model id suggests a Chinese-language model,
# while the text being chunked is English — confirm this is the intended model.
embed_model = HuggingFaceEmbeddings(model_name="BAAI/bge-large-zh-v1.5")

# LangChain semantic chunker that uses the embedding model above.
text_splitter = SemanticChunker(embed_model)

# Chunk `text` into Documents.
docs = text_splitter.create_documents([text])

Error while downloading from https://cdn-lfs.hf.co/repos/4a/88/4a88d6caacd50c2f1573210b4304f3e335d0cda244a88621347fc76a0a72a629/74541fa3bddf35c2ed35a7c21542776fb830cde7d372dbe05ca42aa70f2bf904?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&Expires=1739881343&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTczOTg4MTM0M319LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9yZXBvcy80YS84OC80YTg4ZDZjYWFjZDUwYzJmMTU3MzIxMGI0MzA0ZjNlMzM1ZDBjZGEyNDRhODg2MjEzNDdmYzc2YTBhNzJhNjI5Lzc0NTQxZmEzYmRkZjM1YzJlZDM1YTdjMjE1NDI3NzZmYjgzMGNkZTdkMzcyZGJlMDVjYTQyYWE3MGYyYmY5MDQ%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qIn1dfQ__&Signature=V30OJ3yk3yIczDXijyBqEc8i0vO1EbHoEO4etCMjWYE0lTQw1rzd9B8x-seEtpgtn2oHqSVW81axwfVixHonknnJRp-y4ZedTzraTYXiiIAt9AhWZtDC%7Eqkm7eEGF%7EB1AQ4WrqjXDeChFBc3UEhiLVBf3DadMB6UviPmiJs8e-p9m6%7Eg6V9l1hr2MkB4hZ255ADXA4WZ0FkD9iyvm3g05zb6%7EVNkBC4%7Epr0Z%7ENZG3yMrBFTldZMyVpJvYVSLsVq

In [38]:
# Show the text of the first chunk produced by the semantic chunker.
print(docs[0].page_content)

What I Worked On

February 2021

Before college the two main things I worked on, outside of school, were writing and programming. I didn't write essays. I wrote what beginning writers were supposed to write then, and probably still are: short stories. My stories were awful. They had hardly any plot, just characters with strong feelings, which I imagined made them deep. The first programs I tried writing were on the IBM 1401 that our school district used for what was then called "data processing." This was in 9th grade, so I was 13 or 14. The school district's 1401 happened to be in the basement of our junior high school, and my friend Rich Draves and I got permission to use it. It was like a mini Bond villain's lair down there, with all these alien-looking machines — CPU, disk drives, printer, card reader — sitting up on a raised floor under bright fluorescent lights. The language we used was an early version of Fortran. You had to type programs on punch cards, then stack them in the c