## Text Splitting from Docs

In [1]:
from langchain_community.document_loaders import PyPDFLoader
loader = PyPDFLoader("data/attention.pdf")
docs = loader.load()
docs



[Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'author': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'data/attention.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1'}, page_content='Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗ †\nUniversity of Toronto\naidan@cs.toronto.edu\nŁukasz Kaiser∗\nGoogle Brain\nlu

### Split text by characters

#### RecursiveCharacter Text Splitter

In [2]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splitted_docs = text_splitter.split_documents(docs)
len(splitted_docs)

52

In [3]:
splitted_docs[:5]

[Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'author': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'data/attention.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1'}, page_content='Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗ †\nUniversity of Toronto\naidan@cs.toronto.edu\nŁukasz Kaiser∗\nGoogle Brain\nlu

##### Open text file and convert into documents

In [4]:

with open("data/speech.txt", "r") as f:
    speech_text = f.read()

speech_text[:500]

'Down through the ages, a traditional form has evolved for this type of speech, which is: Some old fart, his best years behind him, who, over the course of his life, has made a series of dreadful mistakes (that would be me), gives heartfelt advice to a group of shining, energetic young people, with all of their best years ahead of them (that would be you).\nAnd I intend to respect that tradition.\nNow, one useful thing you can do with an old person, in addition to borrowing money from them, or aski'

In [5]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=10)
speech_docs = text_splitter.create_documents([speech_text])
print(len(speech_docs))
print(speech_docs)

35
[Document(metadata={}, page_content='Down through the ages, a traditional form has evolved for this type of speech, which is: Some old'), Document(metadata={}, page_content='Some old fart, his best years behind him, who, over the course of his life, has made a series of'), Document(metadata={}, page_content='series of dreadful mistakes (that would be me), gives heartfelt advice to a group of shining,'), Document(metadata={}, page_content='shining, energetic young people, with all of their best years ahead of them (that would be you).'), Document(metadata={}, page_content='And I intend to respect that tradition.'), Document(metadata={}, page_content='Now, one useful thing you can do with an old person, in addition to borrowing money from them, or'), Document(metadata={}, page_content='them, or asking them to do one of their old-time “dances,” so you can watch, while laughing, is'), Document(metadata={}, page_content='is ask: “Looking back, what do you regret?” And they’ll tell you. S

#### Character Text Splitter
Need to provide the seperator

In [6]:
from langchain_community.document_loaders import TextLoader
loader = TextLoader("data/speech.txt")
documents = loader.load()
documents

[Document(metadata={'source': 'data/speech.txt'}, page_content='Down through the ages, a traditional form has evolved for this type of speech, which is: Some old fart, his best years behind him, who, over the course of his life, has made a series of dreadful mistakes (that would be me), gives heartfelt advice to a group of shining, energetic young people, with all of their best years ahead of them (that would be you).\nAnd I intend to respect that tradition.\nNow, one useful thing you can do with an old person, in addition to borrowing money from them, or asking them to do one of their old-time “dances,” so you can watch, while laughing, is ask: “Looking back, what do you regret?” And they’ll tell you. Sometimes, as you know, they’ll tell you even if you haven’t asked. Sometimes, even when you’ve specifically requested they not tell you, they’ll tell you.\nSo: What do I regret? Being poor from time to time? Not really. Working terrible jobs, like “knuckle-puller in a slaughterhouse?” (

In [7]:
from langchain_text_splitters import CharacterTextSplitter
text_splitter = CharacterTextSplitter(separator="\n\n",chunk_size=100, chunk_overlap=20)
text_splitter.split_documents(documents)

[Document(metadata={'source': 'data/speech.txt'}, page_content='Down through the ages, a traditional form has evolved for this type of speech, which is: Some old fart, his best years behind him, who, over the course of his life, has made a series of dreadful mistakes (that would be me), gives heartfelt advice to a group of shining, energetic young people, with all of their best years ahead of them (that would be you).\nAnd I intend to respect that tradition.\nNow, one useful thing you can do with an old person, in addition to borrowing money from them, or asking them to do one of their old-time “dances,” so you can watch, while laughing, is ask: “Looking back, what do you regret?” And they’ll tell you. Sometimes, as you know, they’ll tell you even if you haven’t asked. Sometimes, even when you’ve specifically requested they not tell you, they’ll tell you.\nSo: What do I regret? Being poor from time to time? Not really. Working terrible jobs, like “knuckle-puller in a slaughterhouse?” (

### HTML Text Splitter

In [7]:
from langchain_text_splitters import HTMLHeaderTextSplitter
html_content = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Sample Web Page</title>
</head>
<body>
    <h1>Hello, World!</h1>
    <p>This is a sample paragraph on my first HTML webpage.</p>
    <hr>
    <p>You can add a link to another page in your own website, like this: <a href="page2.html">another page</a>.</p>
    <h2>Subheader</h2>
    <p>Here is some more text under a subheader.</p>
    <h3>Another Subheader</h3>
    <p>Even more text under another subheader.</p>  
</body>
</html>
"""
headers_to_split = [("h1","Header 1"),("h2","Header 2"),("h3","Header 3")]

html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split)
html_header_splits = html_splitter.split_text(html_content)

In [8]:
html_header_splits

[Document(metadata={'Header 1': 'Hello, World!'}, page_content='Hello, World!'),
 Document(metadata={'Header 1': 'Hello, World!'}, page_content='This is a sample paragraph on my first HTML webpage.  \nYou can add a link to another page in your own website, like this: .  \nanother page'),
 Document(metadata={'Header 1': 'Hello, World!', 'Header 2': 'Subheader'}, page_content='Subheader'),
 Document(metadata={'Header 1': 'Hello, World!', 'Header 2': 'Subheader'}, page_content='Here is some more text under a subheader.'),
 Document(metadata={'Header 1': 'Hello, World!', 'Header 2': 'Subheader', 'Header 3': 'Another Subheader'}, page_content='Another Subheader'),
 Document(metadata={'Header 1': 'Hello, World!', 'Header 2': 'Subheader', 'Header 3': 'Another Subheader'}, page_content='Even more text under another subheader.')]

In [9]:
html_header_splits = html_splitter.split_text_from_url("https://plato.stanford.edu/entries/abilities/")
html_header_splits[:5]

[Document(metadata={}, page_content='End container NOTE: Script required for drop-down button to work (mirrors).  \nEnd header wrapper End content End footer  \nEnd header  \nEnd navigation End search  \nStanford Encyclopedia of Philosophy  \nMenu  \nBrowse  \nTable of Contents  \nWhat\'s New  \nRandom Entry  \nChronological  \nArchives  \nAbout  \nEditorial Information  \nAbout the SEP  \nEditorial Board  \nHow to Cite the SEP  \nSpecial Characters  \nAdvanced Tools  \nContact  \nSupport SEP  \nSupport the SEP  \nPDFs for SEP Friends  \nMake a Donation  \nSEPIA for Libraries  \nBegin article sidebar End article sidebar NOTE: Article content must have two wrapper divs: id="article" and id="article-content" End article NOTE: article banner is outside of the id="article" div. End article-banner  \nEntry Navigation  \nEntry Contents  \nBibliography  \nAcademic Tools  \nFriends PDF Preview  \nAuthor and Citation Info  \nBack to Top  \nEnd article-content  \nBEGIN ARTICLE HTML #aueditable D

### JSON Text Splitter

In [23]:
import json
import requests

json_data = requests.get("https://dummy-json.mock.beeceptor.com/posts").json()
json_data

[{'userId': 1,
  'id': 1,
  'title': 'Introduction to Artificial Intelligence',
  'body': 'Learn the basics of Artificial Intelligence and its applications in various industries.',
  'link': 'https://example.com/article1',
  'comment_count': 8},
 {'userId': 2,
  'id': 2,
  'title': 'Web Development with React',
  'body': 'Build modern web applications using React.js and explore its powerful features.',
  'link': 'https://example.com/article2',
  'comment_count': 12},
 {'userId': 3,
  'id': 3,
  'title': 'Data Science Fundamentals',
  'body': 'Dive into the world of Data Science and discover how to analyze and interpret data.',
  'link': 'https://example.com/article3',
  'comment_count': 5},
 {'userId': 4,
  'id': 4,
  'title': 'Machine Learning Algorithms',
  'body': 'Explore various Machine Learning algorithms and understand their workings.',
  'link': 'https://example.com/article4',
  'comment_count': 17},
 {'userId': 5,
  'id': 5,
  'title': 'Introduction to Blockchain',
  'body': '

In [24]:
len(json_data), type(json_data)

(10, list)

In [26]:
from langchain_text_splitters import RecursiveJsonSplitter
# json_data is a list at the top level; convert_lists=True handles lists
json_splitter = RecursiveJsonSplitter(max_chunk_size=200)
json_chunks = json_splitter.split_json(json_data, convert_lists=True) #If json output is a list at the top level, set convert_lists=True
json_chunks[:3]

[{'0': {'userId': 1,
   'id': 1,
   'title': 'Introduction to Artificial Intelligence',
   'body': 'Learn the basics of Artificial Intelligence and its applications in various industries.'}},
 {'0': {'link': 'https://example.com/article1', 'comment_count': 8}},
 {'1': {'userId': 2,
   'id': 2,
   'title': 'Web Development with React',
   'body': 'Build modern web applications using React.js and explore its powerful features.',
   'link': 'https://example.com/article2'}}]

In [27]:
json_docs = json_splitter.create_documents(texts=json_data, convert_lists=True)
json_docs[:3]

[Document(metadata={}, page_content='{"userId": 1, "id": 1, "title": "Introduction to Artificial Intelligence", "body": "Learn the basics of Artificial Intelligence and its applications in various industries."}'),
 Document(metadata={}, page_content='{"link": "https://example.com/article1", "comment_count": 8}'),
 Document(metadata={}, page_content='{"userId": 2, "id": 2, "title": "Web Development with React", "body": "Build modern web applications using React.js and explore its powerful features.", "link": "https://example.com/article2"}')]

In [29]:
json_texts = json_splitter.split_text(json_data, convert_lists=True)
json_texts[:3]

['{"0": {"userId": 1, "id": 1, "title": "Introduction to Artificial Intelligence", "body": "Learn the basics of Artificial Intelligence and its applications in various industries."}}',
 '{"0": {"link": "https://example.com/article1", "comment_count": 8}}',
 '{"1": {"userId": 2, "id": 2, "title": "Web Development with React", "body": "Build modern web applications using React.js and explore its powerful features.", "link": "https://example.com/article2"}}']