In [1]:
#
# LangChain supports several types of text splitter:
# • CharacterTextSplitter
# • RecursiveCharacterTextSplitter
# • TokenTextSplitter
# • HTMLHeaderTextSplitter
# • MarkdownHeaderTextSplitter
# • PythonCodeTextSplitter
#

In [1]:
!pip install langchain
!pip install llama-index
!pip install pypdf

Collecting langchain
  Downloading langchain-0.1.17-py3-none-any.whl (867 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m867.6/867.6 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)
  Downloading dataclasses_json-0.6.5-py3-none-any.whl (28 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)
Collecting langchain-community<0.1,>=0.0.36 (from langchain)
  Downloading langchain_community-0.0.36-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-core<0.2.0,>=0.1.48 (from langchain)
  Downloading langchain_core-0.1.48-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.9/302.9 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-text-splitters<0.1,>=0.0.1 (from langchain)
  Downl

In [None]:
# PyPDFLoader lives in langchain_community; importing it from
# `langchain.document_loaders` is a deprecated alias.
from langchain_community.document_loaders import PyPDFLoader

# Load the paper directly from arXiv. The loader is given a URL and returns a
# list of Document objects (stored as `pages`).
pdf_url = "https://arxiv.org/pdf/2005.11401.pdf"
loader = PyPDFLoader(pdf_url)
pages = loader.load()
len(pages)

19

In [None]:
# Text splitters demonstrated first: CharacterTextSplitter and RecursiveCharacterTextSplitter.

from langchain.text_splitter import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
)

# Toy input: two leading dots, one sentence break, and two consecutive
# newlines between "i " and "am".
text = "..hi i am brijesh. i \n\nam a machine learning engineer"

In [None]:
# Target number of characters per chunk. This is a goal, not a hard cap:
# a piece that contains no separator can come out longer.
chunk_size = 10
# Number of characters at the end of one chunk to repeat at the start of the next.
chunk_overlap = 0

# CharacterTextSplitter accepts exactly one separator string.
# (If `separator` is omitted, the library default "\n\n" is used.)
character_splitter = CharacterTextSplitter(
    separator='.',
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)

# split_text returns a list of plain strings, one per chunk.
chunks = character_splitter.split_text(text)
chunks



['hi i am brijesh', 'i \n\nam a machine learning engineer']

In [None]:
# RecursiveCharacterTextSplitter tries the separators in order and keeps
# splitting until the chunks are small enough.
recursive_splitter = RecursiveCharacterTextSplitter(
    separators=['.', '\n'],  # library default: ["\n\n", "\n", " ", ""]
    chunk_size=10,
    chunk_overlap=chunk_overlap,
)
chunks = recursive_splitter.split_text(text)
chunks

['.', '.hi i am brijesh', '. i', '\nam a machine learning engineer']

In [None]:
# Chunk the PDF pages loaded earlier.
chunk_size = 1500
chunk_overlap = 150

r_splitter = RecursiveCharacterTextSplitter(
    separators=['\n\n', '\n', '.'],
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)
# split_documents returns Document objects: each chunk carries its
# page_content together with the source metadata.
chunks = r_splitter.split_documents(pages)

In [None]:
# Peek at the first two chunk Documents.
chunks[:2]

[Document(page_content='Retrieval-Augmented Generation for\nKnowledge-Intensive NLP Tasks\nPatrick Lewis†‡, Ethan Perez⋆,\nAleksandra Piktus†, Fabio Petroni†, Vladimir Karpukhin†, Naman Goyal†, Heinrich Küttler†,\nMike Lewis†, Wen-tau Yih†, Tim Rocktäschel†‡, Sebastian Riedel†‡, Douwe Kiela†\n†Facebook AI Research;‡University College London;⋆New York University;\nplewis@fb.com\nAbstract\nLarge pre-trained language models have been shown to store factual knowledge\nin their parameters, and achieve state-of-the-art results when ﬁne-tuned on down-\nstream NLP tasks. However, their ability to access and precisely manipulate knowl-\nedge is still limited, and hence on knowledge-intensive tasks, their performance\nlags behind task-speciﬁc architectures. Additionally, providing provenance for their\ndecisions and updating their world knowledge remain open research problems. Pre-\ntrained models with a differentiable access mechanism to explicit non-parametric\nmemory have so far been only inv

In [None]:
# Compare page-level and chunk-level counts and text lengths.
print(
    f"Number of pages={len(pages)}",
    f"Number of chunks={len(chunks)}",
    f"in the second page: the text length is {len(pages[1].page_content)}",
    f"in the second chunk: the text length is {len(chunks[1].page_content)}",
    sep="\n",
)

Number of pages=19
Number of chunks=57
in the second page: the text length is 4564
in the second chunk: the text length is 1364


In [None]:
# Last 200 characters of the first chunk; compare with the start of the second chunk to see the overlap.
chunks[0].page_content[-200:]

'and the non-parametric memory is a dense\nvector index of Wikipedia, accessed with a pre-trained neural retriever. We com-\npare two RAG formulations, one which conditions on the same retrieved passages'

In [None]:
# First 200 characters of the second chunk: it begins with text that also ends the first chunk (the chunk_overlap).
chunks[1].page_content[:200]

'pare two RAG formulations, one which conditions on the same retrieved passages\nacross the whole generated sequence, and another which can use different passages\nper token. We ﬁne-tune and evaluate our'

In [3]:
# Let's look at the different levels of chunking

# Levels Of Text Splitting

# Level 1: Character Splitting - Simple static character chunks of data (seen above)
# Level 2: Recursive Character Text Splitting - Recursive chunking based on a list of separators (seen above)
# Level 3: Document Specific Splitting - Various chunking methods for different document types (PDF, Python, Markdown)
# Level 4: Semantic Chunking - Embedding based splitting
# Level 5: Agent-based Splitting (LLM-based) (we'll see later)

In [19]:
# Manual fixed-width chunking, with no library involved.
text = "This is the text I would like to chunk up. It is the example text for this exercise"

chunk_size = 35  # characters per chunk

# Slice the text at every multiple of chunk_size; the final slice may be shorter.
chunks = [
    text[start:start + chunk_size]
    for start in range(0, len(text), chunk_size)
]

print(chunks)

['This is the text I would like to ch', 'unk up. It is the example text for ', 'this exercise']


In [20]:
# Level 1: Character Splitting
#   Pros: easy and simple
#   Cons: very rigid; does not take the structure of your text into account

from langchain.text_splitter import CharacterTextSplitter

text_splitter = CharacterTextSplitter(
    separator='.',  # split on this string; an empty separator partitions purely by chunk_size
    chunk_size=35,
    chunk_overlap=0,
    strip_whitespace=False,
)

# create_documents expects a list of strings.
text_splitter.create_documents([text])



[Document(page_content='This is the text I would like to chunk up'),
 Document(page_content=' It is the example text for this exercise')]

In [23]:
text_splitter = CharacterTextSplitter(
    separator='',
    chunk_size=35,
    chunk_overlap=4,  # the tail of chunk #1 is repeated as the head of chunk #2
    strip_whitespace=False,  # LangChain removes trailing whitespace by default
)

text_splitter.create_documents([text])

[Document(page_content='This is the text I would like to ch'),
 Document(page_content='o chunk up. It is the example text '),
 Document(page_content='ext for this exercise')]

<langchain_community.document_loaders.pdf.PyPDFLoader at 0x7fc1613cad40>

In [29]:
# Use llama_index, which provides node relationships out of the box; these can aid retrieval later.
from llama_index.core.text_splitter import SentenceSplitter
from llama_index.core import SimpleDirectoryReader

# Read the single source file into llama_index documents.
documents = SimpleDirectoryReader(input_files=["/content/User.java"]).load_data()

splitter = SentenceSplitter(chunk_size=200, chunk_overlap=15)

# Split the documents into nodes and display the first one.
nodes = splitter.get_nodes_from_documents(documents)
nodes[0]

TextNode(id_='1ab19740-b168-44c7-8d1d-6a09a7f2c753', embedding=None, metadata={'file_path': '/content/User.java', 'file_name': 'User.java', 'file_type': 'text/x-java', 'file_size': 2981, 'creation_date': '2024-05-01', 'last_modified_date': '2024-05-01'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='16d962cd-6b4b-43dd-815e-ebdc8c06f274', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': '/content/User.java', 'file_name': 'User.java', 'file_type': 'text/x-java', 'file_size': 2981, 'creation_date': '2024-05-01', 'last_modified_date': '2024-05-01'}, hash='217a6a8b6959b0fc4911dc6b55c524980fa8c273c9b37515b05b3bd81ec6c79a'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='c7b31ae7-5a7d-4018-be

In [59]:
# Level 2: Recursive Character Text Splitting

# Splitting happens at the nearest separator that keeps a chunk within chunk_size.
from langchain.text_splitter import RecursiveCharacterTextSplitter

text = """
One of the most important things I didn't understand about the world when I was a child is the degree to which the returns for performance are superlinear.
Teachers and coaches implicitly told us the returns were linear. "You get out," I heard a thousand times, "what you put in." They meant well, but this is rarely true. If your product is only half as good as your competitor's, you don't get half as many customers. You get no customers, and you go out of business.

It's obviously true that the returns for performance are superlinear in business. Some think this is a flaw of capitalism, and that if we changed the rules it would stop being true. But superlinear returns for performance are a feature of the world, not an artifact of rules we've invented. We see the same pattern in fame, power, military victories, knowledge, and even benefit to humanity. In all of these, the rich get richer. [1]
"""

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=65,
    chunk_overlap=0,
    # Separators are tried in order, coarsest first. The list previously ended
    # with a second '.', which duplicated the earlier entry; the final entry is
    # now '' so that a piece with no other separator can still be cut
    # character by character.
    separators=['\n\n', '\n', '.', ' ', ''],
    strip_whitespace=False,  # keep leading/trailing whitespace in each chunk
)

# create_documents takes a list of strings and returns Document chunks.
chunks = text_splitter.create_documents([text])

In [60]:
chunks[0].page_content  # 62 characters of text plus the leading newline character

"\nOne of the most important things I didn't understand about the"

In [61]:
# Same text with larger chunks; separators and whitespace handling are left at their defaults.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=160, chunk_overlap=0)

documents = text_splitter.create_documents([text])

In [62]:
# The first split happened on a '\n'.
print(
    f"Document 1 : {documents[0].page_content}",
    f"Length of Document 1: {len(documents[0].page_content)}",
    sep="\n",
)

Document 1 : One of the most important things I didn't understand about the world when I was a child is the degree to which the returns for performance are superlinear.
Length of Document 1: 155


In [63]:
# This split happened on a space: there is no '\n\n' or '\n' close enough to
# break on, and '.' is not one of the default separators.
print(
    f"Document 2 : {documents[1].page_content}",
    f"Length of Document 2: {len(documents[1].page_content)}",
    sep="\n",
)

Document 2 : Teachers and coaches implicitly told us the returns were linear. "You get out," I heard a thousand times, "what you put in." They meant well, but this is
Length of Document 2: 153


In [75]:
# Level 3: Document Specific Splitting
# Partition Python source code. Separators, in priority order:
#   \nclass   - classes first
#   \ndef     - functions next
#   \n\tdef   - indented functions
#   \n\n      - double new lines
#   \n        - new lines
#   " "       - spaces
#   ""        - characters

from langchain.text_splitter import PythonCodeTextSplitter

python_text = """
class Person:
  def __init__(self, name, age):
    self.name = name
    self.age = age

p1 = Person("John", 36)

for i in range(10):
    print (i)
"""

python_splitter = PythonCodeTextSplitter(chunk_overlap=0, chunk_size=100)
documents = python_splitter.create_documents([python_text])

# You will need to experiment with chunk_size to get results like this for your application.

In [74]:
# First chunk of the Python source: the Person class together with the line that instantiates it.
documents[0].page_content

'class Person:\n  def __init__(self, name, age):\n    self.name = name\n    self.age = age\n\np1 = Person("John", 36)'

In [15]:
# Level 4: Semantic Chunking - Embedding based splitting
# At a high level, this splits into sentences, then groups into groups of 3 sentences,
# and then merges one that are similar in the embedding space

#courtesy to Greg Kmardt

!pip install --quiet langchain_experimental sentence-transformers

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/171.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m163.8/171.5 kB[0m [31m4.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [8]:
# Load the local PDF.
# PyPDFLoader is imported from langchain_community (the
# `langchain.document_loaders` path is a deprecated alias), matching the
# langchain_community import used for the embeddings in this notebook.
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader('/content/STEVEJOBS.pdf')
pages = loader.load()
pages = pages[1:]  # drop the first page, which is not needed

In [9]:
# Number of pages left after dropping the first one.
len(pages)

5

In [16]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings

In [17]:
# HuggingFaceEmbeddings() is called without a model name, so the library's
# default embedding model is downloaded and used.
# NOTE(review): the default is believed to be the sentence-transformers
# "all-mpnet-base-v2" model rather than plain BERT - confirm for the installed version.
text_splitter = SemanticChunker(
    HuggingFaceEmbeddings()
)

# Split the loaded pages into semantically grouped Document chunks.
documents = text_splitter.split_documents(pages)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [25]:
# Text of the first semantic chunk, skipping its first 300 characters.
documents[0].page_content[300:]

'y we all live and work. How Steve Jobs Started – The Winding Path  \nAs people around the world wondered if innovation at Apple had stopped wi th Steve \nJobs, we want to share with you a snapshot of the genius’s life. How did Steve Jobs start? His life story is not a straight line, but more like a winding \npath. From his early years it’s clear that Jobs had no grand plan in the  beginning. His searc h for himself took Jobs through India, Buddhism, psychedelic use, attempts to \nbecome an  astronaut  and start a computer company in the Soviet Union. However, winding his path at time, Jobs did find inspiration and creativity in himself at \ncertain periods of  his life. If there is a pattern of creativity and genius that his life can \nreveal, here is his timeline. Keep Looking, Don’t Settle  \nSteve Jobs summarized his guiding principle in life in 2005 at the commencement at \nStanford in a talk titled  “How  to Live Before  You Die” .'

In [27]:
# Full text of the second semantic chunk.
documents[1].page_content

'He said,  you’ve got to find what \nyou love. And that is as true for your work as it is for your lovers. Your work is going  to \nfill a large part of your life, and the only way to be truly satisfied is to do what you \nbelieve is great work. And the only way to do great work is to love what you do. If you \nhaven’t found it yet, keep looking.'

In [None]:
# This chunker works by deciding where to "break" sentences apart.
# It looks at the difference between the embeddings of two sentences; when
# that difference is past some threshold, the text is split there.
# There are a few ways to determine the threshold; here the
# "standard_deviation" method is selected.
text_splitter = SemanticChunker(
    HuggingFaceEmbeddings(),
    breakpoint_threshold_type="standard_deviation",
)

# Other options:
#   breakpoint_threshold_type='percentile'     ---> splits when the difference is greater than the X percentile
#   breakpoint_threshold_type='interquartile'  ---> splits when the difference is greater than the X quartile range