## Data Transformer - Text Splitter

In [None]:
## PDF Loader
# Read './_data/gk.pdf' through PyPDFLoader and keep the loaded result in `docs`.
# Presumably `docs` is a list of Document objects (one per page?) -- confirm
# against the PyPDFLoader documentation.
from langchain_community.document_loaders import PyPDFLoader
loader = PyPDFLoader('./_data/gk.pdf')
docs = loader.load()
# Bare last expression: the notebook renders `docs` as this cell's output.
docs

In [None]:
## Required Packages
! pip install langchain-text-splitters langchain-community


### RecursiveCharacterTextSplitter
It is the recommended splitter for generic text. Its default separator list is ["\n\n", "\n", " ", ""], tried in that order.

In [20]:
# Split text by characters recursively
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,    # requested size limit for each chunk
    chunk_overlap=50   # amount of text shared between neighbouring chunks
)

# Note: the input here is a list of Document objects, not just strings.
# `docs` is whatever the most recently executed loader cell assigned, so run the
# intended loader cell immediately before this one.
# (For raw strings the equivalent call is
#  text_splitter.create_documents([doc.page_content for doc in docs]).)
final_documents = text_splitter.split_documents(docs)

# Slice to two items so the output matches the intent and does not dump every chunk.
final_documents[:2]  # Display the first two chunks



[Document(metadata={'source': './_data/speech.txt'}, page_content="Good morning everyone,\n\nToday, I want to talk about something incredibly simple, yet profoundly powerful: small steps.\n\nIn a world obsessed with big wins and overnight success, we often forget that every great achievement starts with a single small action.\n\nWhether you're trying to learn a new skill, change a habit, or build something meaningful — it always begins with the decision to take one small step forward."),
 Document(metadata={'source': './_data/speech.txt'}, page_content="Think about the tallest buildings. They're built one brick at a time. Olympic athletes? They train for years, often making tiny improvements day after day.\n\nSo, the next time you feel overwhelmed by your goals, just focus on the next step. Not the next ten, not the whole staircase — just the next one.\n\nProgress isn’t always loud. Sometimes, it whispers.\n\nBut those whispers? They build momentum.\n\nAnd that momentum? It builds succ

In [None]:
## Text Loader
# Load './_data/speech.txt' (decoded as UTF-8) through TextLoader.
from langchain_community.document_loaders import TextLoader

# NOTE: this rebinds the notebook-level names `loader` and `docs`; any cell that
# reads `docs` after this one runs sees the documents loaded from the text file.
loader = TextLoader('./_data/speech.txt', encoding='utf-8')
docs = loader.load()

# Bare last expression: the notebook renders `docs` as this cell's output.
docs

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter


# Read the whole speech into one string; create_documents() below takes raw text.
with open('./_data/speech.txt', 'r', encoding='utf-8') as f:
    speech = f.read()

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=80,    # requested size limit for each chunk
    chunk_overlap=20  # amount of text shared between neighbouring chunks
)

# Note: the input here is a list of text strings, not Document objects.
# (text_splitter.split_text(speech) is the alternative call for a single string.)
final_documents = text_splitter.create_documents([speech])

# Slice to two items so the output matches the intent and does not dump every chunk.
# The text of an individual chunk is available as final_documents[i].page_content.
final_documents[:2]  # Display the first two chunks


In [18]:
from langchain.text_splitter import CharacterTextSplitter

# Pull the entire speech into memory as a single string.
with open('./_data/speech.txt', encoding='utf-8') as speech_file:
    speech = speech_file.read()

# Split on single newlines only. In the captured output, lines longer than
# chunk_size are reported with a "Created a chunk of size ..." warning and kept whole.
text_splitter = CharacterTextSplitter(separator="\n", chunk_size=80, chunk_overlap=20)

# create_documents() is given a list of raw strings (not Document objects).
final_documents = text_splitter.create_documents([speech])

# Render every resulting chunk as the cell output.
final_documents[:]


Created a chunk of size 94, which is longer than the specified 80
Created a chunk of size 136, which is longer than the specified 80
Created a chunk of size 158, which is longer than the specified 80
Created a chunk of size 155, which is longer than the specified 80
Created a chunk of size 145, which is longer than the specified 80


[Document(metadata={}, page_content='Good morning everyone,'),
 Document(metadata={}, page_content='Today, I want to talk about something incredibly simple, yet profoundly powerful: small steps.'),
 Document(metadata={}, page_content='In a world obsessed with big wins and overnight success, we often forget that every great achievement starts with a single small action.'),
 Document(metadata={}, page_content="Whether you're trying to learn a new skill, change a habit, or build something meaningful — it always begins with the decision to take one small step forward."),
 Document(metadata={}, page_content="Think about the tallest buildings. They're built one brick at a time. Olympic athletes? They train for years, often making tiny improvements day after day."),
 Document(metadata={}, page_content='So, the next time you feel overwhelmed by your goals, just focus on the next step. Not the next ten, not the whole staircase — just the next one.'),
 Document(metadata={}, page_content='Progres

### HTMLHeaderTextSplitter
It is a structure-aware chunker that splits a document at the HTML element level and adds metadata for each header relevant to any given chunk.

In [19]:
from langchain.text_splitter import HTMLHeaderTextSplitter

# Load the sample page from disk as one string.
with open('./_data/sample.html', encoding='utf-8') as html_file:
    html_string = html_file.read()

# (tag, metadata key) pairs: each key shows up in a chunk's metadata together
# with the text of the matching header.
headers_to_split_on = [("h1", "Header 1"), ("h2", "Header 2"), ("h3", "Header 3")]
html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

final_documents = html_splitter.split_text(html_string)

# Render every resulting chunk as the cell output.
final_documents[:]

[Document(metadata={'Header 1': 'Introduction to AI'}, page_content='Introduction to AI'),
 Document(metadata={'Header 1': 'Introduction to AI'}, page_content='Artificial Intelligence (AI) is transforming how we live, work, and interact with the world.'),
 Document(metadata={'Header 1': 'Introduction to AI', 'Header 2': 'Applications of AI'}, page_content='Applications of AI'),
 Document(metadata={'Header 1': 'Introduction to AI', 'Header 2': 'Applications of AI'}, page_content='AI is being used in various industries including healthcare, finance, education, and entertainment.'),
 Document(metadata={'Header 1': 'Introduction to AI', 'Header 2': 'Applications of AI', 'Header 3': 'Healthcare'}, page_content='Healthcare'),
 Document(metadata={'Header 1': 'Introduction to AI', 'Header 2': 'Applications of AI', 'Header 3': 'Healthcare'}, page_content='AI helps in diagnostics, patient monitoring, and drug discovery.'),
 Document(metadata={'Header 1': 'Introduction to AI', 'Header 2': 'Applic

In [21]:
# Split a live web page by its h1/h2/h3 headers, addressing it by URL.
url = "https://oauthv2app.azurewebsites.net/"

# (tag, metadata key) pairs handed to the splitter.
headers_to_split_on = [("h1", "Header 1"), ("h2", "Header 2"), ("h3", "Header 3")]
html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

# The splitter is given the URL itself; this cell has no separate download step.
final_documents = html_splitter.split_text_from_url(url)

# Render every resulting chunk as the cell output.
final_documents[:]

[Document(metadata={}, page_content='Information Bar Modal Popup AlpineJS Font Awesome ChartJS <script src="https://cdnjs.cloudflare.com/ajax/libs/Chart.js/2.9.3/Chart.min.js" integrity="sha256-R4pqcOYV8lt7snxMQO/HSbVCFRPMdrhAFMH+vr9giYI=" crossorigin="anonymous"></script>  \nOAuth  \nDocument  \nOAuth Overview  \nAuthorization Code Flow  \nClient Credential Flow  \nImplicit Flow  \nPassword Credential Flow  \nApi Call (Anonymous)  \nRead KV - MI  \nRead KV - FIC  \nRead KV - CS  \nRead Weather - 3rd Party  \nSPA  \nRead Users  \ndocument.addEventListener("DOMContentLoaded", function() {\n            const activeItem = getCookie("activeNavItem");\n            const activeSubmenuItem = getCookie("activeSubmenuItem");\n            if (activeItem) {\n                const activeElement = document.querySelector(`[onclick*="${activeItem}"]`);\n                if (activeElement) {\n                    activeElement.classList.add("nav-item-active");\n                    const submenu = docume

### JSONTextSplitter
`RecursiveJsonSplitter` splits JSON data while allowing control over chunk sizes. It traverses the JSON data depth-first and builds smaller JSON chunks. It attempts to keep nested JSON objects whole, but will split them if needed to keep chunks between a min_chunk_size and max_chunk_size.

In [24]:
import requests
import json
from langchain.text_splitter import RecursiveJsonSplitter

# Download the OpenAPI description that serves as the sample JSON document.
url = "https://api.smith.langchain.com/openapi.json"
# Bound the request so a stalled connection cannot hang the notebook.
response = requests.get(url, timeout=30)
# Stop here with the HTTP error instead of handing an empty-string placeholder
# to the splitter when the download fails.
response.raise_for_status()
openapi_data = response.json()

splitter = RecursiveJsonSplitter(
    max_chunk_size=300,
)
# split_json() returns the chunks as Python dicts (see the rendered output).
json_chunks = splitter.split_json(openapi_data)
json_chunks[:3]  # Display the first three chunks

[{'openapi': '3.1.0',
  'info': {'title': 'LangSmith', 'version': '0.1.0'},
  'paths': {'/api/v1/sessions/{session_id}/dashboard': {'post': {'tags': ['tracer-sessions'],
     'summary': 'Get Tracing Project Prebuilt Dashboard',
     'description': 'Get a prebuilt dashboard for a tracing project.'}}}},
 {'paths': {'/api/v1/sessions/{session_id}/dashboard': {'post': {'operationId': 'get_tracing_project_prebuilt_dashboard_api_v1_sessions__session_id__dashboard_post',
     'security': [{'API Key': []}, {'Tenant ID': []}, {'Bearer Auth': []}]}}}},
 {'paths': {'/api/v1/sessions/{session_id}/dashboard': {'post': {'parameters': [{'name': 'session_id',
       'in': 'path',
       'required': True,
       'schema': {'type': 'string', 'format': 'uuid', 'title': 'Session Id'}},
      {'name': 'accept',
       'in': 'header',
       'required': False,
       'schema': {'anyOf': [{'type': 'string'}, {'type': 'null'}],
        'title': 'Accept'}}]}}}}]

In [27]:
import requests
import json
from langchain.text_splitter import RecursiveJsonSplitter

# Download the OpenAPI description that serves as the sample JSON document.
url = "https://api.smith.langchain.com/openapi.json"
# Bound the request so a stalled connection cannot hang the notebook.
response = requests.get(url, timeout=30)
# Stop here with the HTTP error instead of handing an empty-string placeholder
# to the splitter when the download fails.
response.raise_for_status()
openapi_data = response.json()

splitter = RecursiveJsonSplitter(
    max_chunk_size=300,
)
# create_documents() wraps each JSON chunk in a Document whose page_content is
# the chunk serialized as a JSON string (see the rendered output).
final_documents = splitter.create_documents(texts=[openapi_data])
final_documents[:3]  # Display the first three chunks

[Document(metadata={}, page_content='{"openapi": "3.1.0", "info": {"title": "LangSmith", "version": "0.1.0"}, "paths": {"/api/v1/sessions/{session_id}/dashboard": {"post": {"tags": ["tracer-sessions"], "summary": "Get Tracing Project Prebuilt Dashboard", "description": "Get a prebuilt dashboard for a tracing project."}}}}'),
 Document(metadata={}, page_content='{"paths": {"/api/v1/sessions/{session_id}/dashboard": {"post": {"operationId": "get_tracing_project_prebuilt_dashboard_api_v1_sessions__session_id__dashboard_post", "security": [{"API Key": []}, {"Tenant ID": []}, {"Bearer Auth": []}]}}}}'),
 Document(metadata={}, page_content='{"paths": {"/api/v1/sessions/{session_id}/dashboard": {"post": {"parameters": [{"name": "session_id", "in": "path", "required": true, "schema": {"type": "string", "format": "uuid", "title": "Session Id"}}, {"name": "accept", "in": "header", "required": false, "schema": {"anyOf": [{"type": "string"}, {"type": "null"}], "title": "Accept"}}]}}}}')]