In [None]:
import sys
from pathlib import Path

# Make the parent directory importable so the `src.*` imports below resolve.
# The path is resolved to an absolute location once, here: a relative ".."
# entry would be re-interpreted against whatever the working directory is at
# import time. The membership check keeps the cell idempotent, so re-running
# it does not pile up duplicate sys.path entries.
_parent_dir = str(Path("..").resolve())
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)
from chonkie import TokenChunker, SentenceChunker, SemanticChunker
from src.core.embeddings.algorithms import HFDenseTextEmbeddingsGenerator
from src.core.embeddings.wrapper import ChonkieEmbeddingWrapper

# Text chunking with Chonkie

In [1]:
# Sample passage (six paragraphs about rabbits) fed to every chunker cell below.
# The literal opens and closes with a newline, so the text handed to the
# chunkers starts with "\n" before the first sentence.
sample_text = """
Rabbits are small mammals known for their long ears, fluffy tails, and strong hind legs. They belong to the family Leporidae and are found in many parts of the world. Rabbits are herbivores, feeding mainly on grasses, leaves, and vegetables. Their diet is rich in fiber, which helps maintain their digestive health.
In the wild, rabbits live in burrows called warrens, which provide protection from predators and harsh weather. These social animals communicate through various vocalizations, body language, and scent marking. Rabbits are also known for their impressive reproductive abilities; a female rabbit, called a doe, can give birth to several litters each year.
Domesticated rabbits have become popular pets due to their gentle nature and ease of care. They require a balanced diet, regular exercise, and social interaction to thrive. Grooming is important to prevent matting, especially in long-haired breeds. Owners should also provide safe environments free from hazards.
Rabbits play an important role in ecosystems as prey for many predators, including foxes, hawks, and owls. They also contribute to soil health through their digging activities, which aerate the soil and help with plant growth. In some regions, however, introduced rabbit populations have become invasive, causing damage to crops and native vegetation.
Interestingly, rabbits have a unique digestive system that allows them to reingest certain types of droppings, called cecotropes, to extract additional nutrients. This behavior, known as coprophagy, is essential for their health.
Overall, rabbits are fascinating creatures with complex behaviors and adaptations that have allowed them to thrive in diverse environments.
"""

## Token Chunker

In [None]:
# Fixed-size chunking: windows of 64 tokens, with 8 tokens shared between
# neighbouring windows. In the recorded output each chunk spans exactly 64
# characters and consecutive chunks start 56 characters apart, so tokens
# appear to be single characters here -- NOTE(review): confirm which
# tokenizer TokenChunker uses by default.
tc = TokenChunker(
    chunk_size=64,
    chunk_overlap=8,
)

# Last expression of the cell: the resulting list of chunks is displayed.
tc.chunk(sample_text)

[Chunk(text='
 Rabbits are small mammals known for their long ears, fluffy tai', token_count=64, start_index=0, end_index=64),
 Chunk(text='uffy tails, and strong hind legs. They belong to the family Lepo', token_count=64, start_index=56, end_index=120),
 Chunk(text='ily Leporidae and are found in many parts of the world. Rabbits ', token_count=64, start_index=112, end_index=176),
 Chunk(text='Rabbits are herbivores, feeding mainly on grasses, leaves, and v', token_count=64, start_index=168, end_index=232),
 Chunk(text='s, and vegetables. Their diet is rich in fiber, which helps main', token_count=64, start_index=224, end_index=288),
 Chunk(text='lps maintain their digestive health.
 In the wild, rabbits live i', token_count=64, start_index=280, end_index=344),
 Chunk(text='s live in burrows called warrens, which provide protection from ', token_count=64, start_index=336, end_index=400),
 Chunk(text='on from predators and harsh weather. These social animals commun', token_count=64, sta

## Sentence Chunker

In [None]:
# Sentence-based chunking with the same size/overlap settings as the token
# chunker cell. In the recorded output every chunk holds exactly one sentence
# and its token_count (90, 78, 75, ...) is larger than chunk_size=64 --
# presumably a sentence longer than the limit is kept whole rather than
# split; verify against the Chonkie documentation.
sc = SentenceChunker(
    chunk_size=64,
    chunk_overlap=8,
)

# Last expression of the cell: the resulting list of sentence chunks is displayed.
sc.chunk(sample_text)

[SentenceChunk(text=
 Rabbits are small mammals known for their long ears, fluffy tails, and strong hind legs. , start_index=0, end_index=90, token_count=90, sentences=[Sentence(text=
 Rabbits are small mammals known for their long ears, fluffy tails, and strong hind legs. , start_index=0, end_index=90, token_count=90)]),
 SentenceChunk(text=They belong to the family Leporidae and are found in many parts of the world. , start_index=90, end_index=168, token_count=78, sentences=[Sentence(text=They belong to the family Leporidae and are found in many parts of the world. , start_index=90, end_index=168, token_count=78)]),
 SentenceChunk(text=Rabbits are herbivores, feeding mainly on grasses, leaves, and vegetables. , start_index=168, end_index=243, token_count=75, sentences=[Sentence(text=Rabbits are herbivores, feeding mainly on grasses, leaves, and vegetables. , start_index=168, end_index=243, token_count=75)]),
 SentenceChunk(text=Their diet is rich in fiber, which helps maintain their 

## Semantic Chunker with custom model

In [None]:
# Project embedding generator configured with the multilingual E5-small model
# name and an embedding dimension of 384.
embedder = HFDenseTextEmbeddingsGenerator(
    model_name="intfloat/multilingual-e5-small",
    dim=384,
)

# Wrap the project embedder so it can be handed to Chonkie as `embedding_model`.
chonkie_embedder = ChonkieEmbeddingWrapper(embedder=embedder)

# Semantic chunking driven by the wrapped embedder, with chunk_size=64.
# NOTE(review): in the recorded output token_count no longer matches the
# character span (e.g. 45 tokens for 168 characters), so tokens are
# presumably counted by the embedding model's tokenizer -- confirm in the wrapper.
sec = SemanticChunker(
    embedding_model=chonkie_embedder,
    chunk_size=64,
)

# Last expression of the cell: the resulting list of chunks is displayed.
sec.chunk(sample_text)

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


[Chunk(text='
 Rabbits are small mammals known for their long ears, fluffy tails, and strong hind legs. They belong to the family Leporidae and are found in many parts of the world. ', token_count=45, start_index=0, end_index=168),
 Chunk(text='Rabbits are herbivores, feeding mainly on grasses, leaves, and vegetables. Their diet is rich in fiber, which helps maintain their digestive health.
 ', token_count=41, start_index=168, end_index=317),
 Chunk(text='In the wild, rabbits live in burrows called warrens, which provide protection from predators and harsh weather. ', token_count=28, start_index=317, end_index=429),
 Chunk(text='These social animals communicate through various vocalizations, body language, and scent marking. Rabbits are also known for their impressive reproductive abilities; a female rabbit, called a doe, can give birth to several litters each year.
 ', token_count=56, start_index=429, end_index=671),
 Chunk(text='Domesticated rabbits have become popular pets due to th