In [1]:
import pandas as pd
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Load the preprocessed reviews file; lines=True reads it as JSON Lines
# (one JSON record per line).
reviews_path = 'data/preprocessed/reviews.jsonl'
df = pd.read_json(reviews_path, lines=True)

In [2]:
# Character-based recursive splitter: lengths are measured with the built-in
# len(), so chunk_size and chunk_overlap are counted in characters.
recursive_splitter_options = {
    "chunk_size": 1000,
    "chunk_overlap": 200,
    "length_function": len,
    "is_separator_regex": False,
}
splitter = RecursiveCharacterTextSplitter(**recursive_splitter_options)

In [3]:
from pprint import pprint
# Use the 'body' text of the first record as the sample document that every
# splitter in this notebook is run against.
doc = df.iloc[0]['body']

# Show only a bounded preview: the full body is long enough to split into
# 100+ chunks, and printing all of it buries the rest of the notebook.
PREVIEW_CHARS = 1000
pprint(doc[:PREVIEW_CHARS])
print(f"... ({len(doc):,} characters in total)")

('1 Introduction Binary stars have long played a foundational role in '
 'astrophysics. They underpin precision measurements of stellar physical '
 'parameters, enable robust tests of general relativity, and give rise to an '
 'extraordinary zoo of observational phenomenology. Millennia after their '
 'discovery (e.g. Jetsu and Porceddu, 2015 ), binaries remain at the heart of '
 'many of the interesting open questions in astrophysics: binary evolution '
 'modeling is key for understanding the origin of gravitational wave events, '
 'the spectral energy distributions of high redshift galaxies, and the '
 'demographics of exoplanets in the solar neighborhood. Astrometry has played '
 'a particularly important role for binary star astronomy. Painstaking '
 'measurements of the relative positions of two stars in resolved optical '
 'pairs over the course of decades allowed Herschel (1803) to infer that most '
 'of the pairs he studied were orbiting one another. By monitoring the motion '


In [4]:
# Run the character-based recursive splitter over the sample document and
# report how many pieces it produced.
splits = splitter.split_text(doc)
n_splits = len(splits)
print(f"Number of splits: {n_splits}")

Number of splits: 112


In [5]:
# Preview the first three splits. Iterating over a slice (rather than indexing
# with range(3)) avoids an IndexError when fewer than three splits exist.
for i, split in enumerate(splits[:3]):
    print(f"Split {i}:")
    pprint(split)
    print()

Split 0:
('1 Introduction Binary stars have long played a foundational role in '
 'astrophysics. They underpin precision measurements of stellar physical '
 'parameters, enable robust tests of general relativity, and give rise to an '
 'extraordinary zoo of observational phenomenology. Millennia after their '
 'discovery (e.g. Jetsu and Porceddu, 2015 ), binaries remain at the heart of '
 'many of the interesting open questions in astrophysics: binary evolution '
 'modeling is key for understanding the origin of gravitational wave events, '
 'the spectral energy distributions of high redshift galaxies, and the '
 'demographics of exoplanets in the solar neighborhood. Astrometry has played '
 'a particularly important role for binary star astronomy. Painstaking '
 'measurements of the relative positions of two stars in resolved optical '
 'pairs over the course of decades allowed Herschel (1803) to infer that most '
 'of the pairs he studied were orbiting one another. By monitoring the 

In [6]:
from semantic_text_splitter import TextSplitter

# Use a dedicated name instead of rebinding `splitter`: the earlier
# `splitter.split_text(doc)` cell expects `splitter` to still be the
# RecursiveCharacterTextSplitter and would fail if re-run after this cell.
# No tokenizer is supplied here, unlike the from_huggingface_tokenizer cells
# further down.
semantic_splitter = TextSplitter(capacity=1000, overlap=200)
chunks = semantic_splitter.chunks(doc)
print(f"Number of chunks: {len(chunks)}")

Number of chunks: 113


In [7]:
# Preview the first three semantic_text_splitter chunks. Iterating over a
# slice avoids an IndexError when fewer than three chunks exist.
for i, chunk in enumerate(chunks[:3]):
    print(f"Split {i}:")
    pprint(chunk)
    print()

Split 0:
('1 Introduction Binary stars have long played a foundational role in '
 'astrophysics. They underpin precision measurements of stellar physical '
 'parameters, enable robust tests of general relativity, and give rise to an '
 'extraordinary zoo of observational phenomenology. Millennia after their '
 'discovery (e.g. Jetsu and Porceddu, 2015 ), binaries remain at the heart of '
 'many of the interesting open questions in astrophysics: binary evolution '
 'modeling is key for understanding the origin of gravitational wave events, '
 'the spectral energy distributions of high redshift galaxies, and the '
 'demographics of exoplanets in the solar neighborhood. Astrometry has played '
 'a particularly important role for binary star astronomy. Painstaking '
 'measurements of the relative positions of two stars in resolved optical '
 'pairs over the course of decades allowed Herschel (1803) to infer that most '
 'of the pairs he studied were orbiting one another.')

Split 1:
('By m

## Semchunk

In [8]:
from transformers import AutoTokenizer
import semchunk

# Build a semchunk chunker around the BAAI/bge-small-en tokenizer with a
# chunk_size of 256.
bge_model_id = "BAAI/bge-small-en"
tokenizer = AutoTokenizer.from_pretrained(bge_model_id)
chunker = semchunk.chunkerify(
    tokenizer,
    chunk_size=256,
)

In [9]:
# Chunk the sample document with semchunk and report the chunk count.
semchunks = chunker(doc)
n_semchunks = len(semchunks)
print(f"Number of semchunks: {n_semchunks}")

Number of semchunks: 87


In [10]:
# Preview the first three semchunk chunks. Iterating over a slice avoids an
# IndexError when fewer than three chunks exist.
for i, semchunk_text in enumerate(semchunks[:3]):
    print(f"Split {i}:")
    pprint(semchunk_text)
    print()

Split 0:
('1 Introduction Binary stars have long played a foundational role in '
 'astrophysics. They underpin precision measurements of stellar physical '
 'parameters, enable robust tests of general relativity, and give rise to an '
 'extraordinary zoo of observational phenomenology. Millennia after their '
 'discovery (e.g. Jetsu and Porceddu, 2015 ), binaries remain at the heart of '
 'many of the interesting open questions in astrophysics: binary evolution '
 'modeling is key for understanding the origin of gravitational wave events, '
 'the spectral energy distributions of high redshift galaxies, and the '
 'demographics of exoplanets in the solar neighborhood. Astrometry has played '
 'a particularly important role for binary star astronomy. Painstaking '
 'measurements of the relative positions of two stars in resolved optical '
 'pairs over the course of decades allowed Herschel (1803) to infer that most '
 'of the pairs he studied were orbiting one another. By monitoring the 

## `semantic_text_splitter` with HuggingFace Tokenizer

In [11]:
from tokenizers import Tokenizer

# Token-aware semantic_text_splitter built on the bert-base-uncased tokenizer
# (capacity 256, overlap 50).
# NOTE: this rebinds `tokenizer` and `splitter`, which earlier cells bound to
# different objects; the following cell relies on the new `splitter`.
tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
splitter = TextSplitter.from_huggingface_tokenizer(
    tokenizer,
    256,
    overlap=50,
)

In [12]:
# Chunk the sample document with the current `splitter` and report the count.
# NOTE: this overwrites the `chunks` list produced by the earlier
# TextSplitter(capacity=1000, overlap=200) cell.
chunks = splitter.chunks(doc)
n_chunks = len(chunks)
print(f"Number of chunks: {n_chunks}")

Number of chunks: 96


In [13]:
# Preview the first three token-based chunks. Iterating over a slice avoids an
# IndexError when fewer than three chunks exist.
for i, chunk in enumerate(chunks[:3]):
    print(f"Split {i}:")
    pprint(chunk)
    print()

Split 0:
('1 Introduction Binary stars have long played a foundational role in '
 'astrophysics. They underpin precision measurements of stellar physical '
 'parameters, enable robust tests of general relativity, and give rise to an '
 'extraordinary zoo of observational phenomenology. Millennia after their '
 'discovery (e.g. Jetsu and Porceddu, 2015 ), binaries remain at the heart of '
 'many of the interesting open questions in astrophysics: binary evolution '
 'modeling is key for understanding the origin of gravitational wave events, '
 'the spectral energy distributions of high redshift galaxies, and the '
 'demographics of exoplanets in the solar neighborhood. Astrometry has played '
 'a particularly important role for binary star astronomy. Painstaking '
 'measurements of the relative positions of two stars in resolved optical '
 'pairs over the course of decades allowed Herschel (1803) to infer that most '
 'of the pairs he studied were orbiting one another. By monitoring the 

In [14]:
# Same token-aware splitter as above, but measured with the BAAI/bge-small-en
# tokenizer and a smaller overlap (32).
bge_tokenizer = Tokenizer.from_pretrained("BAAI/bge-small-en")
bge_splitter = TextSplitter.from_huggingface_tokenizer(bge_tokenizer, 256, overlap=32)
bge_chunks = bge_splitter.chunks(doc)
print(f"Number of BGE chunks: {len(bge_chunks)}")

# Preview the first three chunks. Iterating over a slice avoids an IndexError
# when fewer than three chunks exist.
for i, bge_chunk in enumerate(bge_chunks[:3]):
    print(f"BGE Split {i}:")
    pprint(bge_chunk)
    print()

Number of BGE chunks: 90
BGE Split 0:
('1 Introduction Binary stars have long played a foundational role in '
 'astrophysics. They underpin precision measurements of stellar physical '
 'parameters, enable robust tests of general relativity, and give rise to an '
 'extraordinary zoo of observational phenomenology. Millennia after their '
 'discovery (e.g. Jetsu and Porceddu, 2015 ), binaries remain at the heart of '
 'many of the interesting open questions in astrophysics: binary evolution '
 'modeling is key for understanding the origin of gravitational wave events, '
 'the spectral energy distributions of high redshift galaxies, and the '
 'demographics of exoplanets in the solar neighborhood. Astrometry has played '
 'a particularly important role for binary star astronomy. Painstaking '
 'measurements of the relative positions of two stars in resolved optical '
 'pairs over the course of decades allowed Herschel (1803) to infer that most '
 'of the pairs he studied were orbiting on

The only issue with these tokenizer-based semantic splitters is the overlap: it is always applied, whether or not it is needed.

In [42]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_huggingface import HuggingFaceEmbeddings

import torch

model_name = "BAAI/bge-small-en"

# Choose the device at runtime instead of hard-coding "mps": the MPS backend
# only exists on Apple-silicon builds of PyTorch, so a fixed "mps" makes this
# cell fail everywhere else. Machines with MPS behave exactly as before.
if torch.backends.mps.is_available():
    embedding_device = "mps"
elif torch.cuda.is_available():
    embedding_device = "cuda"
else:
    embedding_device = "cpu"

# Embedding model used by the embedding-based chunkers; embeddings are
# normalized at encode time.
hf_embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs={"device": embedding_device},
    encode_kwargs={"normalize_embeddings": True},
)

# Embedding-based chunker using the "percentile" breakpoint rule with a
# threshold amount of 80 and a minimum chunk size of 64.
lc_splitter = SemanticChunker(
    embeddings=hf_embeddings,
    breakpoint_threshold_type="percentile",
    breakpoint_threshold_amount=80,
    min_chunk_size=64,
)

In [43]:
# Split the sample document with the percentile-based SemanticChunker and
# report the count.
# NOTE: this overwrites the `splits` list produced by the earlier
# RecursiveCharacterTextSplitter cell.
splits = lc_splitter.split_text(doc)
n_langchain_splits = len(splits)
print(f"Number of langchain splits: {n_langchain_splits}")

Number of langchain splits: 105


In [44]:
# Preview the first five percentile-based splits. Iterating over a slice
# avoids an IndexError when fewer than five splits exist.
for i, split in enumerate(splits[:5]):
    print(f"Langchain Split {i}:")
    pprint(split)
    print()

Langchain Split 0:
('1 Introduction Binary stars have long played a foundational role in '
 'astrophysics. They underpin precision measurements of stellar physical '
 'parameters, enable robust tests of general relativity, and give rise to an '
 'extraordinary zoo of observational phenomenology. Millennia after their '
 'discovery (e.g. Jetsu and Porceddu, 2015 ), binaries remain at the heart of '
 'many of the interesting open questions in astrophysics: binary evolution '
 'modeling is key for understanding the origin of gravitational wave events, '
 'the spectral energy distributions of high redshift galaxies, and the '
 'demographics of exoplanets in the solar neighborhood. Astrometry has played '
 'a particularly important role for binary star astronomy.')

Langchain Split 1:
('Painstaking measurements of the relative positions of two stars in resolved '
 'optical pairs over the course of decades allowed Herschel (1803) to infer '
 'that most of the pairs he studied were orbiting o

In [58]:
# Embedding-based chunker using the "gradient" breakpoint rule (threshold
# amount 80.0, minimum chunk size 100), reusing the same embedding model as
# the percentile-based splitter.
grad_splitter = SemanticChunker(
    embeddings=hf_embeddings,
    breakpoint_threshold_type="gradient",
    breakpoint_threshold_amount=80.0,
    min_chunk_size=100,
)

grad_splits = grad_splitter.split_text(doc)
# The label previously said "standard deviation", which did not match the
# "gradient" threshold type configured above.
print(f"Number of langchain gradient splits: {len(grad_splits)}")

Number of langchain standard deviation splits: 122


In [59]:
# Preview the first three gradient-based splits. Iterating over a slice avoids
# an IndexError when fewer than three splits exist.
for i, grad_split in enumerate(grad_splits[:3]):
    print(f"Langchain Split {i}:")
    pprint(grad_split)
    print()

Langchain Split 0:
('1 Introduction Binary stars have long played a foundational role in '
 'astrophysics. They underpin precision measurements of stellar physical '
 'parameters, enable robust tests of general relativity, and give rise to an '
 'extraordinary zoo of observational phenomenology. Millennia after their '
 'discovery (e.g. Jetsu and Porceddu, 2015 ), binaries remain at the heart of '
 'many of the interesting open questions in astrophysics: binary evolution '
 'modeling is key for understanding the origin of gravitational wave events, '
 'the spectral energy distributions of high redshift galaxies, and the '
 'demographics of exoplanets in the solar neighborhood.')

Langchain Split 1:
('Astrometry has played a particularly important role for binary star '
 'astronomy. Painstaking measurements of the relative positions of two stars '
 'in resolved optical pairs over the course of decades allowed Herschel (1803) '
 'to infer that most of the pairs he studied were orbiting o

In [57]:
# Character length of the first gradient-based split.
# NOTE(review): this cell's execution count (57) is lower than that of the
# cell defining grad_splits (58), and the stored result does not match the
# first split displayed above - re-run after the grad_splitter cell to refresh.
first_grad_split = grad_splits[0]
len(first_grad_split)

81