### Semantic Chunking
- Semantic Chunker is a document splitter that uses embedding similarity between sentences to decide chunk boundaries.
- It keeps each chunk semantically coherent instead of cutting it off mid-thought, as traditional character/token splitters can.

In [1]:
import warnings
warnings.filterwarnings('ignore')

import os
import numpy as np
from langchain_openai import OpenAIEmbeddings

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [22]:
from sentence_transformers import SentenceTransformer, util

## Semantic chunking demo: walk through the sentences in order and start a new
## chunk whenever the cosine similarity between a sentence and the one right
## before it falls below `threshold`.

## Initialize the model
model = SentenceTransformer('all-MiniLM-L6-v2')

## Sample text
text = """
LangChain is a framework for building applications with LLMs.
Langchain provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone.
You can create chains, agents, memory, and retrievers.
The Eiffel Tower is located in Paris.
France is a popular tourist destination.
"""

## Step 1: Split into sentences (one sentence per non-blank line)
sentences = [s.strip() for s in text.split("\n") if s.strip()]

## Step 2: Embed each sentence
embeddings = model.encode(sentences)

## Step 3: Initialize parameters
threshold = 0.7  # control chunk tightness: higher => more, smaller chunks
chunks = []
# Seed the first chunk with the first sentence; slicing (instead of indexing)
# leaves this empty rather than raising IndexError when there are no sentences.
current_chunk = sentences[:1]

## Step 4: Semantic grouping based on threshold
for i in range(1, len(sentences)):
    # Only adjacent sentences are compared. The vectors are passed directly
    # (not wrapped in lists) so no tensor has to be built from a Python list
    # of arrays; the result is still a 1x1 similarity matrix.
    sim = util.cos_sim(embeddings[i - 1], embeddings[i])[0][0]

    if sim >= threshold:
        # Similar enough to the previous sentence: extend the open chunk.
        current_chunk.append(sentences[i])
    else:
        # Topic shift: close the open chunk and start a new one.
        chunks.append(" ".join(current_chunk))
        current_chunk = [sentences[i]]

# Append the last chunk (skipped when there were no sentences at all)
if current_chunk:
    chunks.append(" ".join(current_chunk))

# Output the chunks
print("\n📌 Semantic Chunks:")
for idx, chunk in enumerate(chunks, start=1):
    print(f"\nChunk {idx}:\n{chunk}")



📌 Semantic Chunks:

Chunk 1:
LangChain is a framework for building applications with LLMs. Langchain provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone.

Chunk 2:
You can create chains, agents, memory, and retrievers.

Chunk 3:
The Eiffel Tower is located in Paris.

Chunk 4:
France is a popular tourist destination.
