### Semantic Chunking
- SemanticChunker is a document splitter that uses embedding similarity between sentences to decide chunk boundaries.

- Unlike traditional character/token splitters, which can cut text off mid-thought, it aims to keep each chunk semantically coherent.

In [6]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain.chat_models import init_chat_model
from langchain_classic.schema.runnable import RunnableLambda, RunnableMap
from langchain_classic.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
import os



In [5]:
## Initialize the model
model = SentenceTransformer("all-MiniLM-L6-v2")

## Sample text
text="""
LangChain is a framework for building applications with LLMs.
Langchain provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone.
You can create chains, agents, memory, and retrievers.
The Eiffel Tower is located in Paris.
France is a popular tourist destination.
"""

## Step 1: Split into sentences (the sample holds one sentence per non-blank line)
sentences = [s.strip() for s in text.split("\n") if s.strip()]

## Step 2: Embed each sentence
embeddings = model.encode(sentences)

## Step 3: Initialize parameters
threshold = 0.7  # control chunk tightness
chunks = []

# The first sentence always opens the first chunk.
current_chunk = [sentences[0]]

## Step 4: Semantic grouping based on threshold
for i in range(1, len(sentences)):
    # cosine_similarity returns a 1x1 matrix for two single-row inputs; pull out
    # the scalar so the comparison below is a plain float test rather than an
    # implicit array-truthiness check.
    sim = cosine_similarity(
        [embeddings[i-1]],
        [embeddings[i]]
    )[0][0]

    if sim > threshold:
        # Close enough to the previous sentence: keep growing the current chunk.
        current_chunk.append(sentences[i])
    else:
        # Topic shift: flush the current chunk and start a new one.
        chunks.append(" ".join(current_chunk))
        current_chunk = [sentences[i]]

# Append the last chunk
chunks.append(" ".join(current_chunk))

# Output the chunks
# \U0001F4CC is the pushpin emoji; the escape keeps this source line ASCII-safe.
print("\n\U0001F4CC Semantic Chunks:")
for idx, chunk in enumerate(chunks):
    print(f"\nChunk {idx+1}:\n{chunk}")


📌 Semantic Chunks:

Chunk 1:
LangChain is a framework for building applications with LLMs. Langchain provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone.

Chunk 2:
You can create chains, agents, memory, and retrievers.

Chunk 3:
The Eiffel Tower is located in Paris.

Chunk 4:
France is a popular tourist destination.


In [7]:
# os.environ only accepts str values, so copying os.getenv(...) back into it
# raises an opaque TypeError when the variable is unset (and is a no-op when it
# is set). Check explicitly and fail with an actionable message instead.
if not os.getenv("GROQ_API_KEY"):
    raise RuntimeError(
        "GROQ_API_KEY is not set; export it (or load it from a .env file) "
        "before running this notebook."
    )

In [8]:
### Custom Semantic Chunker With Threshold

class ThresholdSematicChunker:
    """Group consecutive sentences into chunks by embedding similarity.

    Text is split into sentences on ``'.'``. Each sentence is embedded, and a
    sentence joins the current chunk when its cosine similarity to the sentence
    immediately before it is at least ``threshold``; otherwise it starts a new
    chunk.

    Limitation: splitting on a bare ``'.'`` also breaks inside decimals and
    abbreviations (e.g. "3.14"), so this is meant for simple prose.
    """

    def __init__(self, model_name="all-MiniLM-L6-v2", threshold=0.7):
        """Load the embedding model and store the similarity threshold.

        Args:
            model_name: Name passed to ``SentenceTransformer``.
            threshold: Minimum cosine similarity between adjacent sentences
                for them to be placed in the same chunk.
        """
        self.model = SentenceTransformer(model_name)
        self.threshold = threshold

    def split(self, text: str) -> list[str]:
        """Split ``text`` into semantically grouped chunks.

        Args:
            text: Raw text to chunk.

        Returns:
            A list of chunk strings. Sentences within a chunk are joined with
            ``". "`` and every chunk ends with ``"."``. Returns an empty list
            when ``text`` contains no non-blank sentences.
        """
        sentences = [s.strip() for s in text.split('.') if s.strip()]
        if not sentences:
            # Empty, whitespace-only or punctuation-only input: nothing to
            # embed, and indexing sentences[0] below would raise IndexError.
            return []

        embeddings = self.model.encode(sentences)
        chunks = []
        current_chunk = [sentences[0]]

        for i in range(1, len(sentences)):
            # cosine_similarity returns a 1x1 matrix here; [0][0] is the scalar.
            sim = cosine_similarity([embeddings[i - 1]], [embeddings[i]])[0][0]
            if sim >= self.threshold:
                current_chunk.append(sentences[i])
            else:
                # Similarity dropped below the threshold: close the chunk.
                chunks.append(". ".join(current_chunk) + ".")
                current_chunk = [sentences[i]]

        # Flush the chunk that was still open when the loop ended.
        chunks.append(". ".join(current_chunk) + ".")
        return chunks

    def split_document(self, docs):
        """Split each document's ``page_content`` into chunk Documents.

        Args:
            docs: Iterable of objects exposing ``page_content`` and
                ``metadata`` (e.g. LangChain ``Document`` instances).

        Returns:
            A flat list of ``Document`` chunks in input order. Each chunk gets
            a shallow copy of its source document's metadata.
        """
        result = []
        for doc in docs:
            for chunk in self.split(doc.page_content):
                result.append(
                    Document(
                        page_content=chunk,
                        # Copy so chunks do not share the parent's metadata
                        # dict; a later edit to one must not leak to the others.
                        metadata=dict(doc.metadata),
                    )
                )
        return result

In [9]:
# Same five-sentence sample used in the first demo cell: three sentences about
# LangChain followed by two about Paris/France.
sample_text = """
LangChain is a framework for building applications with LLMs.
Langchain provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone.
You can create chains, agents, memory, and retrievers.
The Eiffel Tower is located in Paris.
France is a popular tourist destination.
"""

# Wrap the raw text in a Document (no metadata supplied) so it can be passed to
# split_document below; the bare `doc` displays it as the cell output.
doc = Document(page_content=sample_text)
doc

Document(metadata={}, page_content='\nLangChain is a framework for building applications with LLMs.\nLangchain provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone.\nYou can create chains, agents, memory, and retrievers.\nThe Eiffel Tower is located in Paris.\nFrance is a popular tourist destination.\n')

In [10]:
# Default settings: "all-MiniLM-L6-v2" embeddings with a 0.7 similarity threshold.
chunker = ThresholdSematicChunker()
# NOTE: this rebinds `chunks`, which held plain strings in the first demo cell;
# from here on it is a list of Document objects (one per semantic chunk).
chunks = chunker.split_document([doc])
chunks

[Document(metadata={}, page_content='LangChain is a framework for building applications with LLMs. Langchain provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone.'),
 Document(metadata={}, page_content='You can create chains, agents, memory, and retrievers.'),
 Document(metadata={}, page_content='The Eiffel Tower is located in Paris.'),
 Document(metadata={}, page_content='France is a popular tourist destination.')]

In [11]:
# os.environ only accepts str values, so copying os.getenv(...) back into it
# raises an opaque TypeError when the variable is unset. Fail fast with an
# actionable message instead.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it (or load it from a .env file) "
        "before running this notebook."
    )

embedding_open_ai = OpenAIEmbeddings(
    model="text-embedding-3-small",
    dimensions=1536
)

# Embed the semantic chunks and index them in a FAISS vector store.
vector_stores = FAISS.from_documents(chunks, embedding_open_ai)

# Similarity search returning the 3 closest chunks per query.
retriver = vector_stores.as_retriever(
    search_type="similarity",
    search_kwargs={"k":3}
)



In [12]:
# Persist the FAISS store under the local path "faiss_vectorstore" so it can be
# reloaded later with FAISS.load_local.
vector_stores.save_local("faiss_vectorstore")

In [14]:
# Reload the store from "faiss_vectorstore", passing the same embedding object
# that was used to build it.
# SECURITY: allow_dangerous_deserialization=True opts in to loading serialized
# (pickle-based, per the LangChain docs) data, which can execute arbitrary code
# on load. Acceptable here only because this notebook wrote the folder itself;
# never enable it for files from an untrusted source.
# NOTE(review): loaded_vector_stores is not referenced by any later cell visible
# here; the retriever is built from the in-memory `vector_stores` instead.
loaded_vector_stores = FAISS.load_local(
    "faiss_vectorstore", 
    embedding_open_ai,
    allow_dangerous_deserialization=True
)

loaded_vector_stores

<langchain_community.vectorstores.faiss.FAISS at 0x71439da42a20>

In [15]:
## Prompt Template

# --- 5. Prompt Template ---
# Two placeholders are filled in at run time: {context} (the retrieved text)
# and {question} (the user's query).
template = """Answer the question based on the following context:

{context}

Question: {question}
"""

# from_template derives the input variables ('context', 'question') from the
# {placeholders} in the string, as the displayed repr confirms.
prompt = PromptTemplate.from_template(template)
prompt

PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='Answer the question based on the following context:\n\n{context}\n\nQuestion: {question}\n')

In [24]:
# The model string has the form "<provider>:<model>": the Groq provider and the
# llama-3.1-8b-instant model.
# NOTE(review): presumably authenticates via the GROQ_API_KEY environment
# variable handled in an earlier cell - confirm.
llm = init_chat_model(
    model="groq:llama-3.1-8b-instant"
)

In [25]:
def get_context(input):
    """Retrieve the documents most relevant to the question.

    Args:
        input: Chain input mapping containing a ``"question"`` key.

    Returns:
        Whatever ``retriver.invoke`` returns for that question (for a
        vector-store retriever, a list of Documents).
    """
    return retriver.invoke(input["question"])


def get_question(input):
    """Pass the user's question through unchanged.

    Args:
        input: Chain input mapping containing a ``"question"`` key.

    Returns:
        The value stored under ``"question"``.
    """
    return input["question"]


def _format_docs(docs):
    """Join the retrieved documents' text into one plain-text context block."""
    return "\n\n".join(doc.page_content for doc in docs)


# Pipeline: build {"context", "question"} -> fill the prompt -> call the LLM ->
# parse the reply into a plain string.
rag_chain = (
    RunnableMap(
        {
            # Render the retrieved documents as text first: interpolating the
            # raw list into {context} would embed Document(...) reprs
            # (metadata, quoting, escaped newlines) in the prompt.
            "context": RunnableLambda(get_context) | _format_docs,
            "question": get_question,
        }
    )
    | prompt
    | llm
    | StrOutputParser()
)


In [26]:
# --- 8. Run Query ---
# The chain's first step reads input["question"], so invoke() is given a dict
# with that key; the final StrOutputParser step makes `result` a plain string.
result = rag_chain.invoke({"question": "What is LangChain used for?"})
result

'LangChain is a framework for building applications with LLMs (Large Language Models). It provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone.'