In [1]:
import os
from typing import List, Dict, Any
import pandas as pd

In [2]:
from langchain_core.documents import Document
from langchain_text_splitters import (
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
    TokenTextSplitter
)


## Document structure in Langchain

In [3]:
doc = Document(page_content="THis is the main text content that will be embedded and searched.",
metadata={
    "source":"example.txt",
    "page":1,
    "author": "Suman",
    "date_created":"12-11-2025"
})
print(f"Document Structure\nContent:{doc.page_content}\nMetadata:{doc.metadata}")

Document Structure
Content:THis is the main text content that will be embedded and searched.
Metadata:{'source': 'example.txt', 'page': 1, 'author': 'Suman', 'date_created': '12-11-2025'}


In [4]:
def imp_metadata():
    print("Why metadata is important?\n"
      "1. Filtering search results,\n"
      "2. Tracking document sources\n"
      "3. Providing context in responses\n"
      "4. Debugging and auditing")

In [5]:
imp_metadata()

Why metadata is important?
1. Filtering search results,
2. Tracking document sources
3. Providing context in responses
4. Debugging and auditing


In [6]:
type(doc)

langchain_core.documents.base.Document

## Text loader- Read single file

In [7]:
from langchain.document_loaders import TextLoader
from langchain_community.document_loaders import TextLoader

In [8]:
loader = TextLoader("data/text_file/python_intro.txt")
document = loader.load()
print(type(document))
print(document)

<class 'list'>
[Document(metadata={'source': 'data/text_file/python_intro.txt'}, page_content='Python Programming Introduction\n\nPython is a high-level, interpreted programming language known for its simplicity and readability.\nCreated by Guido van Rossum and first released in 1991, Python has become one of the most widely\nused programming languages in the world.\n\nKey Features:\n- Easy to learn and use\n- Extensive standard library\n- Cross-platform compatibility\n- Strong community support\n\nPython is widely used in web development, data science, artificial intelligence, machine learning,\nautomation, scripting, and scientific computing. Its strong ecosystem of libraries makes it a top\nchoice for developers across many industries.\n')]


### Direcotry loader - for multiple file

In [9]:
from langchain_community.document_loaders import DirectoryLoader

## load all the file in the directory
dir_loader = DirectoryLoader(
    "data/text_file",
    glob="**/*.txt",        ## Pattern to match file
    loader_cls = TextLoader, ## Loader class to use
    show_progress=True
)
document1 = dir_loader.load()

print(f"length of hte document: {len(document1)}")
for i, doc in enumerate (document1):
    print(f"Document {i+1}: ")
    print(f"Source: {doc.metadata['source']}")
    print(f"Length: {len(doc.page_content)}")

100%|██████████| 2/2 [00:00<00:00, 1838.40it/s]

length of hte document: 2
Document 1: 
Source: data/text_file/python_intro.txt
Length: 640
Document 2: 
Source: data/text_file/python_advance.txt
Length: 361





In [10]:
def cha_advantages():
    print("Advantages of DirectoryLoader + Character Text Splitter:\n")
    print("1. Loads all files in a directory automatically.")
    print("2. Automatically adds useful metadata like file path and file name.")
    print("3. Works well with large numbers of documents.")
    print("4. Easy integration with character-based text splitters.")
    print("5. Supports filtering based on file extensions.")
    print("6. Useful for building RAG pipelines efficiently.")
    print("7. Handles multiple file formats through loaders.\n")

def cha_disadvantages():
    print("Disadvantages of DirectoryLoader + Character Text Splitter:\n")
    print("1. Semantic meaning can be lost when splitting text based on characters.")
    print("2. No automatic cleaning of noisy text (PDF artifacts, headers, footers).")
    print("3. Large directories may require a lot of memory.")
    print("4. Requires manual loaders for uncommon file types.")
    print("5. Slow loading for large documents (especially large PDFs).")
    print("6. No incremental loading – must reload everything if files change.")
    print("7. Deep directory structures may require custom handling.\n")


In [11]:
cha_advantages()
cha_disadvantages()

Advantages of DirectoryLoader + Character Text Splitter:

1. Loads all files in a directory automatically.
2. Automatically adds useful metadata like file path and file name.
3. Works well with large numbers of documents.
4. Easy integration with character-based text splitters.
5. Supports filtering based on file extensions.
6. Useful for building RAG pipelines efficiently.
7. Handles multiple file formats through loaders.

Disadvantages of DirectoryLoader + Character Text Splitter:

1. Semantic meaning can be lost when splitting text based on characters.
2. No automatic cleaning of noisy text (PDF artifacts, headers, footers).
3. Large directories may require a lot of memory.
4. Requires manual loaders for uncommon file types.
5. Slow loading for large documents (especially large PDFs).
6. No incremental loading – must reload everything if files change.
7. Deep directory structures may require custom handling.



## Text Spliting Strategies

In [32]:
from langchain_text_splitters import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
    TokenTextSplitter
)


In [33]:
document1

[Document(metadata={'source': 'data/text_file/python_intro.txt'}, page_content='Python Programming Introduction\n\nPython is a high-level, interpreted programming language known for its simplicity and readability.\nCreated by Guido van Rossum and first released in 1991, Python has become one of the most widely\nused programming languages in the world.\n\nKey Features:\n- Easy to learn and use\n- Extensive standard library\n- Cross-platform compatibility\n- Strong community support\n\nPython is widely used in web development, data science, artificial intelligence, machine learning,\nautomation, scripting, and scientific computing. Its strong ecosystem of libraries makes it a top\nchoice for developers across many industries.\n'),
 Document(metadata={'source': 'data/text_file/python_advance.txt'}, page_content='Advanced Python Concepts\n\nPython also supports advanced programming techniques such as:\n\n- Object-Oriented Programming (Classes & Objects)\n- Functional Programming (map, filt

In [34]:
text=document1[0].page_content
text

'Python Programming Introduction\n\nPython is a high-level, interpreted programming language known for its simplicity and readability.\nCreated by Guido van Rossum and first released in 1991, Python has become one of the most widely\nused programming languages in the world.\n\nKey Features:\n- Easy to learn and use\n- Extensive standard library\n- Cross-platform compatibility\n- Strong community support\n\nPython is widely used in web development, data science, artificial intelligence, machine learning,\nautomation, scripting, and scientific computing. Its strong ecosystem of libraries makes it a top\nchoice for developers across many industries.\n'

In [None]:
## Method 1- character text splitter

print("Characte Text Splitter")
char_splitter = CharacterTextSplitter(
    separator='\n',     # split on new lines
    chunk_size=200,     # max size of character per chunk
    chunk_overlap=20,   #overlap between chunks
    length_function=len # return the lenth of chunk
)
char_chunks=char_splitter.split_text(text)
print(f"number of chunks: {len(char_chunks)}\n")

for i, chunk in enumerate(char_chunks):
    print(f"{i+1} chunk: {char_chunks[i][:150]}\n") 

Characte Text Splitter
number of chunks: 4

1 chunk: Python Programming Introduction
Python is a high-level, interpreted programming language known for its simplicity and readability.

2 chunk: Created by Guido van Rossum and first released in 1991, Python has become one of the most widely
used programming languages in the world.
Key Features

3 chunk: - Extensive standard library
- Cross-platform compatibility
- Strong community support
Python is widely used in web development, data science, artific

4 chunk: automation, scripting, and scientific computing. Its strong ecosystem of libraries makes it a top
choice for developers across many industries.



In [36]:
print(char_chunks)

['Python Programming Introduction', 'Python is a high-level, interpreted programming language known for its simplicity and readability.\nCreated by Guido van Rossum and first released in 1991, Python has become one of the most widely', 'used programming languages in the world.', 'Key Features:\n- Easy to learn and use\n- Extensive standard library\n- Cross-platform compatibility\n- Strong community support', 'Python is widely used in web development, data science, artificial intelligence, machine learning,\nautomation, scripting, and scientific computing. Its strong ecosystem of libraries makes it a top', 'choice for developers across many industries.']


In [None]:
## Method 2 - Recursive Character Text Spliter
print("Recursive Character Text Spliter")
char_splitter = RecursiveCharacterTextSplitter(
    separators=['\n\n', '\n', ' ', ''],     # split on new lines
    chunk_size=200,     # max size of character per chunk
    chunk_overlap=20,   #overlap between chunks
    length_function=len # return the lenth of chunk
)
recur_chunks=char_splitter.split_text(text)
print(f"number of chunks: {len(recur_chunks)}\n")

for i, chunk in enumerate(recur_chunks):
    print(f"{i+1} chunk: {recur_chunks[i][:150]}\n") 

Recursive Character Text Spliter
number of chunks: 6

1 chunk: Python Programming Introduction

2 chunk: Python is a high-level, interpreted programming language known for its simplicity and readability.
Created by Guido van Rossum and first released in 1

3 chunk: used programming languages in the world.

4 chunk: Key Features:
- Easy to learn and use
- Extensive standard library
- Cross-platform compatibility
- Strong community support

5 chunk: Python is widely used in web development, data science, artificial intelligence, machine learning,
automation, scripting, and scientific computing. It

6 chunk: choice for developers across many industries.



In [None]:
char_splitter = RecursiveCharacterTextSplitter(
    separators=[' '],     # split on new lines
    chunk_size=200,     # max size of character per chunk
    chunk_overlap=20,   #overlap between chunks
    length_function=len # return the lenth of chunk
)
recur_chunks=char_splitter.split_text(text)
print(f"number of chunks: {len(recur_chunks)}\n")

for i, chunk in enumerate(recur_chunks):
    print(f"{i+1} chunk: {recur_chunks[i][:150]}\n") 

number of chunks: 4

1 chunk: Python Programming Introduction

Python is a high-level, interpreted programming language known for its simplicity and readability.
Created by Guido v

2 chunk: in 1991, Python has become one of the most widely
used programming languages in the world.

Key Features:
- Easy to learn and use
- Extensive standard

3 chunk: Strong community support

Python is widely used in web development, data science, artificial intelligence, machine learning,
automation, scripting, an

4 chunk: strong ecosystem of libraries makes it a top
choice for developers across many industries.



In [41]:
## Method 3 - Token Text Splitter
print("Token Text Splitter")
token_splitter = TokenTextSplitter(
    chunk_size=50,     # max size of character per chunk
    chunk_overlap=10,   #overlap between chunks
)
token_splitter=char_splitter.split_text(text)
print(f"number of chunks: {len(token_splitter)}\n")

for i, chunk in enumerate(token_splitter):
    print(f"{i+1} chunk: {token_splitter[i][:150]}\n") 

Token Text Splitter
number of chunks: 4

1 chunk: Python Programming Introduction

Python is a high-level, interpreted programming language known for its simplicity and readability.
Created by Guido v

2 chunk: in 1991, Python has become one of the most widely
used programming languages in the world.

Key Features:
- Easy to learn and use
- Extensive standard

3 chunk: Strong community support

Python is widely used in web development, data science, artificial intelligence, machine learning,
automation, scripting, an

4 chunk: strong ecosystem of libraries makes it a top
choice for developers across many industries.



In [42]:
def character_text_splitter_info():
    print("CharacterTextSplitter")
    print("Advantages:")
    print("1. Simple and fast splitting method.")
    print("2. Predictable chunk sizes based on characters.")
    print("3. Works well for clean and uniform text.")
    print("4. Easy to configure and use.\n")

    print("Disadvantages:")
    print("1. Can break sentences or words in the middle.")
    print("2. May lower embedding quality due to poor splits.")
    print("3. Not ideal for messy, unstructured text.")
    print("4. No semantic awareness.\n")

def recursive_character_text_splitter_info():
    print("RecursiveCharacterTextSplitter")
    print("Advantages:")
    print("1. Smarter splitting using multiple separators.")
    print("2. Maintains semantic meaning better than simple splitters.")
    print("3. Reduces fragmentation of text chunks.")
    print("4. Best choice for PDFs, articles, and long documents.\n")

    print("Disadvantages:")
    print("1. Slightly slower than CharacterTextSplitter.")
    print("2. May still split incorrectly on very messy text.")
    print("3. More configuration options, slightly more complex.")
    print("4. Larger overhead compared to naive splitters.\n")


def token_text_splitter_info():
    print("TokenTextSplitter")
    print("Advantages:")
    print("1. Splits based on tokens, not characters.")
    print("2. Avoids splitting in the middle of words.")
    print("3. Works well with LLM context limits.")
    print("4. Ideal for embedding and model input preparation.\n")

    print("Disadvantages:")
    print("1. Slower than character-based splitters.")
    print("2. Depends on tokenizer libraries (like tiktoken).")
    print("3. Token counts vary by model, making it less universal.")
    print("4. Requires extra dependencies.\n")

character_text_splitter_info()
recursive_character_text_splitter_info()
token_text_splitter_info()

CharacterTextSplitter
Advantages:
1. Simple and fast splitting method.
2. Predictable chunk sizes based on characters.
3. Works well for clean and uniform text.
4. Easy to configure and use.

Disadvantages:
1. Can break sentences or words in the middle.
2. May lower embedding quality due to poor splits.
3. Not ideal for messy, unstructured text.
4. No semantic awareness.

RecursiveCharacterTextSplitter
Advantages:
1. Smarter splitting using multiple separators.
2. Maintains semantic meaning better than simple splitters.
3. Reduces fragmentation of text chunks.
4. Best choice for PDFs, articles, and long documents.

Disadvantages:
1. Slightly slower than CharacterTextSplitter.
2. May still split incorrectly on very messy text.
3. More configuration options, slightly more complex.
4. Larger overhead compared to naive splitters.

TokenTextSplitter
Advantages:
1. Splits based on tokens, not characters.
2. Avoids splitting in the middle of words.
3. Works well with LLM context limits.
4. Id