## Semantic Chunking

In [1]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# initialize the model
model = SentenceTransformer('all-MiniLM-L6-v2')

# sample text
text ="""
LangChain is a framework for building applications with LLMs.
Langchain provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone.
You can create chains, agents, memory, and retrievers.
The Eiffel Tower is located in Paris.
France is a popular tourist destination.
"""

# step 1 - split the sentences
sentences = [s.strip() for s in text.split('\n') if s.strip()]

# step 2- embed each sentence
embeddings=model.encode(sentences)

# step 3 - Initialize parameters
threshold = 0.7
chunks = []
current_chunk = [sentences[0]]

# step 4 - Semantic grouping based on threshold
for i in range(1, len(sentences)):
    similarity = cosine_similarity(
        [embeddings[i-1]],
        [embeddings[i]]
    )[0][0]

    if similarity>=threshold:
        current_chunk.append(sentences[i])
    else:
        chunks.append(' '.join(current_chunk))
        current_chunk=[sentences[i]]
    
chunks.append(" ".join(current_chunk))

# output
print("\nSemantic chunk ")
for index, chunk in enumerate(chunks):
    print(f"\nChunk {index+1}\nChunk: {chunk}")



Semantic chunk 

Chunk 1
Chunk: LangChain is a framework for building applications with LLMs. Langchain provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone.

Chunk 2
Chunk: You can create chains, agents, memory, and retrievers.

Chunk 3
Chunk: The Eiffel Tower is located in Paris.

Chunk 4
Chunk: France is a popular tourist destination.



## RAG Pipeline

In [3]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings, ChatOpenAI

from langchain_core.runnables import RunnableLambda, RunnableMap
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

import os
from dotenv import load_dotenv

In [4]:
load_dotenv()

True

In [None]:
## Custom Semantic chunk

class ThresholdSemanticChunker:
    def __init__(self, model_name='all-MiniLM-L6-v2', threshold=0.7):
        self.model=SentenceTransformer(model_name)
        self.threshold=threshold
    
    def split(self, text:str):
        sentences = [s.strip() for s in text.split('\n') if s.strip()]
        embeddings=model.encode(sentences)
        threshold = 0.7
        chunks = []
        current_chunk = [sentences[0]]
        for i in range(1, len(sentences)):
            similarity = cosine_similarity(
                [embeddings[i-1]],
                [embeddings[i]]
            )[0][0]

            if similarity>=threshold:
                current_chunk.append(sentences[i])
            else:
                chunks.append(' '.join(current_chunk))
                current_chunk=[sentences[i]]
    
        chunks.append(" ".join(current_chunk))
        return chunks
    
    def split_documents(self, docs):
        result=[]
        for doc in docs:
            for chunk_text in self.split(doc.page_content):
                result.append(
                    Document(
                        page_content=chunk_text,
                        metadata=doc.metadata 
                    )
                )
        return result

In [20]:
sample_text ="""
LangChain is a framework for building applications with LLMs.
Langchain provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone.
You can create chains, agents, memory, and retrievers.
The Eiffel Tower is located in Paris.
France is a popular tourist destination.
"""

doc = Document(page_content=sample_text)
doc

Document(metadata={}, page_content='\nLangChain is a framework for building applications with LLMs.\nLangchain provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone.\nYou can create chains, agents, memory, and retrievers.\nThe Eiffel Tower is located in Paris.\nFrance is a popular tourist destination.\n')

In [21]:
## chunking
chunker = ThresholdSemanticChunker()
chunks = chunker.split_documents([doc])
chunks

[Document(metadata={'chunk_id': 0, 'chunk_method': 'threshold_semantic'}, page_content='LangChain is a framework for building applications with LLMs. Langchain provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone.'),
 Document(metadata={'chunk_id': 1, 'chunk_method': 'threshold_semantic'}, page_content='You can create chains, agents, memory, and retrievers.'),
 Document(metadata={'chunk_id': 2, 'chunk_method': 'threshold_semantic'}, page_content='The Eiffel Tower is located in Paris.'),
 Document(metadata={'chunk_id': 3, 'chunk_method': 'threshold_semantic'}, page_content='France is a popular tourist destination.')]

In [24]:
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS

embeddings = OpenAIEmbeddings()

vectorstore = FAISS.from_documents(chunks, embeddings)
retriever = vectorstore.as_retriever()

In [26]:
## PROMPT TEMPLATE
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser

template = """Answere the question based on the following context"""
prompt = ChatPromptTemplate.from_messages([
    ("system", template),
    ("human", "Context:\n{context}\n\nQuestion:\n{question}")
])
prompt

ChatPromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], input_types={}, partial_variables={}, template='Answere the question based on the following context'), additional_kwargs={}), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='Context:\n{context}\n\nQuestion:\n{question}'), additional_kwargs={})])

In [31]:
# create document chain
# LLM
llm = ChatOpenAI(
    #model="gpt-4o-mini",
    model="groq:gemma2-9b-it",
    temperature=0.4
)

# RAG chain (NO document chain needed)
rag_chain = (
    {
        "context": retriever,
        "question": lambda x: x
    }
    | prompt
    | llm
    | StrOutputParser()
)

In [33]:
query = "What is LangChain used for?"
answer = rag_chain.invoke(query)

print(answer)

NotFoundError: Error code: 404 - {'error': {'message': 'The model `groq:gemma2-9b-it` does not exist or you do not have access to it.', 'type': 'invalid_request_error', 'param': None, 'code': 'model_not_found'}}