### Semantic Chunking 
- Semantic Chunker is a document splitter that uses embedding similarity between sentences to decide chunk boundaries.
- It ensures that each chunk is semantically coherent and not cut off mid-thought like traditional character/token splitters.


In [1]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


In [3]:
## Initialize the sentence-embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

## Sample text: three sentences about LangChain followed by two about France,
## so a topic shift is available for the chunker to detect.
text = """
Langchain is a framework for building application with LLMs.
Langchain Provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone.
You can create chains, agents, memory, and retrievers.
The Eiffel Tower is loacated in Paris.
France is a popular tourist destination.
"""


## Step 1: Split into sentences (one sentence per non-blank line)
sentences = [s.strip() for s in text.split("\n") if s.strip()]
print(sentences)

## Step 2: Embed each sentence
embedding = model.encode(sentences)

## Step 3: Initialize parameters
threshold = 0.7                  # minimum cosine similarity to stay in the same chunk
chunks = []                      # finished chunks (each one a joined string)
current_chunk = [sentences[0]]   # sentences collected for the chunk being built

## Step 4: Semantic grouping based on threshold
# Each sentence is compared only with the sentence immediately before it.
for i in range(1, len(sentences)):
    sim = cosine_similarity(
        [embedding[i-1]],
        [embedding[i]]
    )[0][0]

    if sim >= threshold:
        # Similar enough: extend the chunk in progress.
        current_chunk.append(sentences[i])
    else:
        # Similarity dropped below the threshold: close the chunk and start a new one.
        chunks.append(" ".join(current_chunk))
        current_chunk = [sentences[i]]

    # Trace the state after every sentence (same output for either branch).
    print(f"Chunks: {chunks}")
    print(f"current_chunk: {current_chunk}")

print(chunks)
# Append the last chunk, which the loop never flushes
chunks.append(" ".join(current_chunk))
print(chunks)
# Output chunks
# \U0001F4CC is the pushpin emoji; written as an escape so the source survives re-encoding.
print("\n\U0001F4CC Semantic Chunks:")
for idx, chunk in enumerate(chunks):
    print(f"\nChunk {idx+1}: \n{chunk}")
  

['Langchain is a framework for building application with LLMs.', 'Langchain Provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone.', 'You can create chains, agents, memory, and retrievers.', 'The Eiffel Tower is loacated in Paris.', 'France is a popular tourist destination.']
Chunks: []
current_chunk: ['Langchain is a framework for building application with LLMs.', 'Langchain Provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone.']
Chunks: ['Langchain is a framework for building application with LLMs. Langchain Provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone.']
current_chunk: ['You can create chains, agents, memory, and retrievers.']
Chunks: ['Langchain is a framework for building application with LLMs. Langchain Provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone.', 'You can create chains, agents, memory, and retrievers.']
current_chunk: ['The Eiffel Tower is loacate

### RAG Pipeline in Modular Coding

In [10]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from langchain_classic.schema import Document
from langchain_classic.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.chat_models import init_chat_model
from langchain_classic.schema.runnable import RunnableLambda, RunnableMap
from langchain_classic.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
import os
import warnings

# GROQ_API_KEY is presumably read from the environment by the Groq-backed chat
# model created later in this notebook (confirm against the provider docs).
# Copying os.getenv(...) back into os.environ does nothing when the variable is
# set and raises TypeError when it is not (environ values must be str), so
# check for it explicitly and warn instead.
if not os.getenv("GROQ_API_KEY"):
    warnings.warn(
        "GROQ_API_KEY is not set; export it (or assign os.environ['GROQ_API_KEY']) "
        "before running the LLM cell.",
        stacklevel=2,
    )


In [5]:
## Custom semantic chunker with Threshold

class ThresholdSemanticChunker:
    """Group consecutive sentences into chunks by embedding similarity.

    Text is split into sentences on '.', each sentence is embedded, and a
    sentence joins the chunk in progress when its cosine similarity to the
    immediately preceding sentence is at least ``threshold``; otherwise it
    starts a new chunk.
    """

    def __init__(self, model_name="all-MiniLM-L6-v2", threshold=0.7):
        """Load the sentence-embedding model and store the similarity threshold.

        Args:
            model_name: Name passed to ``SentenceTransformer``.
            threshold: Minimum cosine similarity between adjacent sentences
                for them to stay in the same chunk.
        """
        self.model = SentenceTransformer(model_name)
        self.threshold = threshold

    def split(self, text: str):
        """Split ``text`` into a list of chunk strings.

        Args:
            text: Raw text to chunk.

        Returns:
            Chunk strings, each made of one or more sentences joined with
            ". " and terminated with ".". An empty list when ``text``
            contains no sentences (empty, whitespace-only or only periods).
        """
        # Naive sentence split: every '.' is a boundary, so periods inside
        # abbreviations or decimal numbers also split the text.
        sentences = [s.strip() for s in (text or "").split('.') if s.strip()]
        if not sentences:
            # Nothing to embed; avoids indexing sentences[0] on an empty list.
            return []

        embeddings = self.model.encode(sentences)
        chunks = []
        current_chunk = [sentences[0]]

        for i in range(1, len(sentences)):
            # Compare each sentence only with its direct predecessor.
            sim = cosine_similarity([embeddings[i-1]], [embeddings[i]])[0][0]
            if sim >= self.threshold:
                current_chunk.append(sentences[i])
            else:
                chunks.append(". ".join(current_chunk) + ".")
                current_chunk = [sentences[i]]
        # Flush the chunk still in progress when the loop ends.
        chunks.append(". ".join(current_chunk) + ".")
        return chunks

    def split_documents(self, docs):
        """Chunk every document in ``docs`` and return the chunks as Documents.

        Args:
            docs: Iterable of objects exposing ``page_content`` and ``metadata``.

        Returns:
            A flat list of ``Document`` objects, one per chunk, in input order.
            Each chunk gets its own shallow copy of the source metadata so
            editing one chunk's metadata does not affect its siblings or the
            source document.
        """
        result = []
        for doc in docs:
            for chunk in self.split(doc.page_content):
                result.append(
                    Document(page_content=chunk, metadata=dict(doc.metadata or {}))
                )
        return result

    # Backward-compatible alias: the method was originally published under
    # this misspelled name and existing callers still use it.
    split_doucments = split_documents

In [6]:
## Sample text: the same five sentences used in the first chunking demo
## (three about LangChain, two about France).
sample_text = """
Langchain is a framework for building application with LLMs.
Langchain Provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone.
You can create chains, agents, memory, and retrievers.
The Eiffel Tower is loacated in Paris.
France is a popular tourist destination.
"""

# Wrap the raw text in a Document; no metadata is supplied.
doc = Document(page_content=sample_text)

# Bare expression so the notebook displays the Document.
doc

Document(metadata={}, page_content='\nLangchain is a framework for building application with LLMs.\nLangchain Provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone.\nYou can create chains, agents, memory, and retrievers.\nThe Eiffel Tower is loacated in Paris.\nFrance is a popular tourist destination.\n')

In [7]:
## Chunking: split the sample Document with the custom threshold-based chunker
# `split_doucments` (sic) is the method name defined on ThresholdSemanticChunker.
chunker = ThresholdSemanticChunker(threshold=0.7)
chunks = chunker.split_doucments([doc])
# Bare expression so the notebook displays the resulting list of Documents.
chunks


[Document(metadata={}, page_content='Langchain is a framework for building application with LLMs. Langchain Provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone.'),
 Document(metadata={}, page_content='You can create chains, agents, memory, and retrievers.'),
 Document(metadata={}, page_content='The Eiffel Tower is loacated in Paris.'),
 Document(metadata={}, page_content='France is a popular tourist destination.')]

In [15]:
### Vector Store
from  langchain_huggingface import HuggingFaceEmbeddings

## Initialize a simple embedding model (no API key needed!)
# NOTE: this rebinds `embedding`, which the first demo cell used for the
# array returned by model.encode(); from here on it is the embeddings object.
embedding = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
# Build a FAISS vector store from the chunk Documents using that embedding model.
vector_store = FAISS.from_documents(chunks,embedding)

# Expose the store as a retriever (no search options are passed).
retriever = vector_store.as_retriever()

In [16]:
## Prompt template
# Two placeholders: {context} receives the retrieved material and
# {question} receives the user's question.
template = """
Answer the question based on the following context:
{context}
Question: {question}
"""

prompt = PromptTemplate.from_template(template)
# Bare expression so the notebook displays the PromptTemplate.
prompt

PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='\nAnswer the question based on the following context:\n{context}\nQuestion: {question}\n')

In [19]:
### LLM
llm = init_chat_model(
    model="groq:openai/gpt-oss-20b",
    temperature=0.4,
)


def format_docs(docs):
    """Join the text of the retrieved documents into a single context string.

    Args:
        docs: Iterable of objects exposing ``page_content``.

    Returns:
        The page contents separated by blank lines.
    """
    return "\n\n".join(doc.page_content for doc in docs)


### LCEL chain with retrieval
# The retriever returns a list of Document objects. Passing that list straight
# into the prompt would insert its repr (metadata and all) as the context, so
# it is flattened to plain text first.
rag_chain = (
    RunnableMap(
        {
            "context": lambda x: format_docs(retriever.invoke(x["question"])),
            "question": lambda x: x["question"],
        }
    )
    | prompt
    | llm
    | StrOutputParser()
)

query = {"question": "what is Langchain used for?"}
result = rag_chain.invoke(query)

print(result)


LangChain is a framework for building applications that use large language models (LLMs). It provides modular abstractions to combine LLMs with tools such as OpenAI and Pinecone, enabling developers to create chains, agents, memory, and retrievers for more powerful, integrated AI solutions.


### Semantic Chunker with Langchain



In [20]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_experimental.text_splitter import SemanticChunker
from langchain_classic.document_loaders import TextLoader

In [21]:
### Load the documents
# Pass the encoding explicitly so the file is decoded the same way on every
# platform instead of relying on the OS default.
loader = TextLoader("langchain_intro.txt", encoding="utf-8")
docs = loader.load()

## Initialize a simple embedding model (no API key needed!)
# NOTE: this cell rebinds `embedding`, `chunker` and `chunks` from earlier cells.
embedding = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

## Create the semantic chunker (only the embedding model is passed, so all
## other settings keep their defaults)
chunker = SemanticChunker(embedding)

## Split the documents
chunks = chunker.split_documents(docs)

## Result
for i, chunk in enumerate(chunks):
    print(f"\n chunk {i+1} \n {chunk.page_content}")


 chunk 1 
 Langchain is a framework for building application with LLMs. Langchain Provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone. You can create chains, agents, memory, and retrievers.

 chunk 2 
 The Eiffel Tower is loacated in Paris. France is a popular tourist destination. 
