In [1]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [2]:
# Load the syllabus PDF into LangChain Document objects.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
file_path = r"E:\2025\Generative_AI\Course\agentic-ai-2.0\data\syllabus.pdf"
pdf_loader = PyPDFLoader(file_path)
docs = pdf_loader.load()

In [3]:
# Number of Document objects returned by the PDF loader.
len(docs)

34

In [6]:
# Split the loaded documents into overlapping chunks
# (target size 500 characters, 50 characters shared between neighbours).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
)
documents = text_splitter.split_documents(docs)

In [7]:
# Number of chunks produced by the splitter.
len(documents)

81

In [8]:
# Inspect the first chunk (metadata plus page_content).
documents[0]

Document(metadata={'producer': 'Canva', 'creator': 'Canva', 'creationdate': '2025-01-30T20:27:03+00:00', 'title': 'Ultimate Data Science & GenAI Bootcamp', 'moddate': '2025-01-30T20:26:59+00:00', 'keywords': 'DAGdmhcqnYw,BAEmsmap8Lg,0', 'author': 'monal singh', 'containsaigeneratedcontent': 'Yes', 'source': 'E:\\2025\\Generative_AI\\Course\\agentic-ai-2.0\\data\\syllabus.pdf', 'total_pages': 34, 'page': 0, 'page_label': '1'}, page_content='MACHINE\nLEARNING\nDEEP\nLEARNING\nPYTHON +\nSTATS\nCOMPUTER VISIONNATURAL LANGUAGE PROCESSING\nGENERATIVE AI\nRETRIEVAL AUGUMENT GENERATION\nVECTOR DB')

### Using TextLoader

In [9]:
from langchain_community.document_loaders import TextLoader

# Load the speech transcript from a plain-text file.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
file_path = r"E:\2025\Generative_AI\Course\agentic-ai-2.0\data\speech.txt"
text_loader = TextLoader(file_path)
docs = text_loader.load()

In [10]:
# Number of Document objects returned by the text loader.
len(docs)

1

In [11]:
# Split the speech into small overlapping chunks
# (target size 100 characters, 10 characters shared between neighbours).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=10,
)
documents = text_splitter.split_documents(docs)

In [12]:
# Number of chunks produced by the splitter.
len(documents)

42

In [13]:
# Inspect the first chunk (metadata plus page_content).
documents[0]

Document(metadata={'source': 'E:\\2025\\Generative_AI\\Course\\agentic-ai-2.0\\data\\speech.txt'}, page_content='The world must be made safe for democracy. Its peace must be planted upon the tested foundations of')

#### Character Text Splitter

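- This is the simplest method. It splits on a single separator string, which defaults to "\n\n" (the example below overrides it with "\n"). Text between two separators is not split any further, so a chunk can end up longer than `chunk_size`.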

In [15]:
from langchain_community.document_loaders import TextLoader

# Reload the speech transcript so this section starts from the unsplit text.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
file_path = r"E:\2025\Generative_AI\Course\agentic-ai-2.0\data\speech.txt"
text_loader = TextLoader(file_path)
docs = text_loader.load()

In [16]:
from langchain_text_splitters import CharacterTextSplitter 

# Split on single newlines rather than the default separator.
# Pieces longer than chunk_size are kept whole, which is what produces the
# "Created a chunk of size ..." warnings when this cell runs.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=100,
    chunk_overlap=10,
)
documents = text_splitter.split_documents(docs)

Created a chunk of size 470, which is longer than the specified 100
Created a chunk of size 347, which is longer than the specified 100
Created a chunk of size 668, which is longer than the specified 100
Created a chunk of size 982, which is longer than the specified 100
Created a chunk of size 789, which is longer than the specified 100


In [17]:
# Number of chunks produced by the splitter.
len(documents)

7

In [18]:
# Show only the text of the first chunk (without its metadata).
documents[0].page_content

'The world must be made safe for democracy. Its peace must be planted upon the tested foundations of political liberty. We have no selfish ends to serve. We desire no conquest, no dominion. We seek no indemnities for ourselves, no material compensation for the sacrifices we shall freely make. We are but one of the champions of the rights of mankind. We shall be satisfied when those rights have been made as secure as the faith and the freedom of nations can make them.'

### HTML Text Splitter

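- It is a "structure-aware" chunker: it splits the document at the configured header tags and records the enclosing headers in each chunk's metadata.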

In [19]:
from langchain_text_splitters import HTMLHeaderTextSplitter

In [20]:
# Sample HTML page containing h1/h2/h3 headers, a list, a link, an image and a form;
# used below as input for the header-based splitter.
html_string = """ 
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Sample HTML Page</title>
</head>
<body>
    <h1>Welcome to My Sample Page</h1>
    <h2>About This Page</h2>
    <p>This is a <strong>sample HTML</strong> document used for testing purposes.</p>

    <h3>List of Features</h3>
    <ul>
        <li>HTML structure with headings</li>
        <li>Paragraphs and text formatting</li>
        <li>Links and images</li>
        <li>Forms and inputs</li>
    </ul>

    <p>Visit <a href="https://example.com">Example.com</a> for more information.</p>

    <img src="https://via.placeholder.com/150" alt="Sample Image">

    <h3>Contact Us</h3>
    <form action="/submit" method="POST">
        <label for="name">Name:</label>
        <input type="text" id="name" name="name"><br>

        <label for="email">Email:</label>
        <input type="email" id="email" name="email"><br>

        <input type="submit" value="Submit">
    </form>
</body>
</html>

"""


# (tag, metadata key) pairs: each listed header tag becomes a split point and
# its text is stored under the paired key.
headers_to_split_on = [("h1", "Header 1"), ("h2", "Header 2"), ("h3", "Header 3")]

# Split the HTML at the configured header tags and display the resulting Documents.
html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
html_header_splits = html_splitter.split_text(html_string)
html_header_splits

[Document(metadata={'Header 1': 'Welcome to My Sample Page'}, page_content='Welcome to My Sample Page'),
 Document(metadata={'Header 1': 'Welcome to My Sample Page', 'Header 2': 'About This Page'}, page_content='About This Page'),
 Document(metadata={'Header 1': 'Welcome to My Sample Page', 'Header 2': 'About This Page'}, page_content='This is a document used for testing purposes.  \nsample HTML'),
 Document(metadata={'Header 1': 'Welcome to My Sample Page', 'Header 2': 'About This Page', 'Header 3': 'List of Features'}, page_content='List of Features'),
 Document(metadata={'Header 1': 'Welcome to My Sample Page', 'Header 2': 'About This Page', 'Header 3': 'List of Features'}, page_content='HTML structure with headings  \nParagraphs and text formatting  \nLinks and images  \nForms and inputs  \nVisit for more information.  \nExample.com'),
 Document(metadata={'Header 1': 'Welcome to My Sample Page', 'Header 2': 'About This Page', 'Header 3': 'Contact Us'}, page_content='Contact Us'),
 

### How to split JSON data

In [21]:
from langchain_text_splitters import RecursiveJsonSplitter

In [24]:
# Sample nested user record (scalars, nested dicts and lists) used to
# demonstrate JSON splitting. Built from named parts for readability.
user_profile = {
    "age": 29,
    "gender": "female",
    "interests": ["reading", "traveling", "technology"],
}
user_address = {
    "street": "123 Maple Street",
    "city": "Springfield",
    "state": "IL",
    "zip": "62704",
}
user_orders = [
    {"orderId": "A1001", "amount": 250.75, "status": "shipped"},
    {"orderId": "A1002", "amount": 89.99, "status": "processing"},
]

# Key order is kept exactly as in the flat literal so downstream output is unchanged.
json_data = {
    "user": {
        "id": 101,
        "name": "Alice Johnson",
        "email": "alice@example.com",
        "isActive": True,
        "signupDate": "2024-12-01T10:30:00Z",
        "profile": user_profile,
        "address": user_address,
        "orders": user_orders,
    }
}


In [27]:
# Break the nested dict into smaller dicts with a size limit of 100 per chunk.
# Lists are not split by default, so a chunk that holds a list can exceed the
# limit (split_json accepts convert_lists=True to allow splitting them).
splitter = RecursiveJsonSplitter(max_chunk_size=100)
json_chunks = splitter.split_json(json_data=json_data)

In [28]:
# Display every chunk produced by the JSON splitter.
json_chunks

[{'user': {'id': 101,
   'name': 'Alice Johnson',
   'email': 'alice@example.com',
   'isActive': True}},
 {'user': {'signupDate': '2024-12-01T10:30:00Z',
   'profile': {'age': 29, 'gender': 'female'}}},
 {'user': {'profile': {'interests': ['reading', 'traveling', 'technology']}}},
 {'user': {'address': {'street': '123 Maple Street',
    'city': 'Springfield',
    'state': 'IL'}}},
 {'user': {'address': {'zip': '62704'},
   'orders': [{'orderId': 'A1001', 'amount': 250.75, 'status': 'shipped'},
    {'orderId': 'A1002', 'amount': 89.99, 'status': 'processing'}]}}]

In [29]:
# Number of chunks the JSON was split into.
len(json_chunks)

5

In [30]:
# Inspect the first JSON chunk.
json_chunks[0]

{'user': {'id': 101,
  'name': 'Alice Johnson',
  'email': 'alice@example.com',
  'isActive': True}}