##### Semantic chunking

In [26]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import os

In [27]:
# Read the whole sample corpus into memory. The encoding is passed explicitly
# so decoding does not depend on the platform's default locale encoding.
with open(file='langchain_intro.txt',mode='r',encoding='utf-8') as f:
    text = f.read()
text

'Machine learning is a branch of artificial intelligence focused on learning patterns from data.Supervised learning uses labeled datasets to train predictive models.In unsupervised learning, algorithms discover hidden structures in unlabeled data.Reinforcement learning trains agents by rewarding good actions and penalizing bad ones.Deep learning relies on multi-layered neural networks to solve complex problems.Transformers have become the backbone of modern NLP models.Convolutional Neural Networks are widely used in image recognition tasks.NLP enables computers to understand, process, and generate human language.Sentiment analysis is an NLP task that identifies positive, negative, or neutral opinions in text.Machine translation allows systems to convert text from one language to another.Text summarization condenses large documents into concise versions while preserving meaning.Question answering systems use NLP to fetch and generate relevant responses.Large Language Models like GPT-4 a

In [None]:
# initialize the model
model = SentenceTransformer(
    'all-miniLM-L6-v2'
)

# step:1 split into sentences (one sentence per non-blank line)
# NOTE(review): this cell splits on newlines while ThresholdSemanticChunker.split
# splits on "." -- confirm which delimiter matches langchain_intro.txt.
sentences = [sentence.strip() for sentence in text.split("\n") if sentence.strip()]

# step:2 Embed each sentence (skipped when there is nothing to embed)
embeddings = model.encode(sentences) if sentences else []

# step:3 Initialize parameters
threshold = 0.6
chunks = []
# Seed the first chunk with the first sentence. Slicing instead of indexing
# keeps this safe for empty input, where sentences[0] would raise IndexError.
current_chunk = sentences[:1]

# step:4 Semantic grouping based on threshold
# Each sentence is compared with the one immediately before it: it joins the
# current chunk when similar enough, otherwise it starts a new chunk.
for i in range(1,len(sentences)):
    similarity = cosine_similarity(
        [embeddings[i-1]],
        [embeddings[i]]
    )[0][0]
    if similarity >= threshold:
        current_chunk.append(sentences[i])
    else:
        chunks.append(" ".join(current_chunk))
        current_chunk=[sentences[i]]

# append the last chunk (only if at least one sentence was found)
if current_chunk:
    chunks.append(" ".join(current_chunk))


print("Semantic chunks")
for idx,chunk in enumerate(chunks):
    print(f"\nChunk {idx+1}:\n{chunk}")

Semantice chunks

Chunk 1:
Machine learning is a branch of artificial intelligence focused on learning patterns from data.

Chunk 2:
Supervised learning uses labeled datasets to train predictive models.

Chunk 3:
In unsupervised learning, algorithms discover hidden structures in unlabeled data.

Chunk 4:
Reinforcement learning trains agents by rewarding good actions and penalizing bad ones.

Chunk 5:
Deep learning relies on multi-layered neural networks to solve complex problems.

Chunk 6:
Transformers have become the backbone of modern NLP models.

Chunk 7:
Convolutional Neural Networks are widely used in image recognition tasks.

Chunk 8:
NLP enables computers to understand, process, and generate human language.

Chunk 9:
Sentiment analysis is an NLP task that identifies positive, negative, or neutral opinions in text.

Chunk 10:
Machine translation allows systems to convert text from one language to another.

Chunk 11:
Text summarization condenses large documents into concise versio

In [1]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from langchain.schema import Document
from langchain.vectorstores import FAISS
from langchain.chat_models import init_chat_model
from langchain.schema.runnable import RunnableLambda,RunnableMap
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Fail fast with a readable message when the key is missing. The previous
# os.environ[...] = os.getenv(...) form was a no-op when the variable was set
# and raised an opaque TypeError (value is None) when it was not.
if not os.getenv("GROQ_API_KEY"):
    raise RuntimeError(
        "GROQ_API_KEY is not set. Export it in your environment before running this notebook."
    )

In [28]:
### custom semantic chunker with threshold

class ThresholdSemanticChunker:
    """Group consecutive sentences into chunks by embedding similarity.

    Text is split into sentences on ``"."``. Each sentence is embedded with a
    SentenceTransformer model; a sentence is appended to the current chunk when
    its cosine similarity to the previous sentence is at least ``threshold``,
    otherwise it starts a new chunk.
    """

    def __init__(self,model_name='all-miniLM-L6-v2',threshold=0.7):
        """Load the embedding model and store the similarity threshold.

        Args:
            model_name: Name passed to ``SentenceTransformer``.
            threshold: Minimum cosine similarity between neighbouring
                sentences for them to share a chunk.
        """
        self.model = SentenceTransformer(model_name)
        self.threshold = threshold

    def split(self,text:str) -> list:
        """Split ``text`` into semantically grouped chunks.

        Args:
            text: Raw text whose sentences are delimited by ``"."``.

        Returns:
            A list of chunk strings, each made of one or more sentences joined
            with ``". "`` and terminated with ``"."``. Empty or whitespace-only
            input yields an empty list.
        """
        # Strip each sentence so chunks do not carry leading whitespace or
        # doubled spaces when the source text has a space/newline after ".".
        # NOTE: a plain "." split also breaks on abbreviations and decimals.
        sentences = [part.strip() for part in text.split(".") if part.strip()]
        if not sentences:
            # Nothing to embed; avoids IndexError on sentences[0] below.
            return []

        embeddings = self.model.encode(sentences)
        chunks = []
        current_chunk = [sentences[0]]
        for i in range(1,len(sentences)):
            # Compare each sentence only with its immediate predecessor.
            similarity = cosine_similarity([embeddings[i-1]],[embeddings[i]])[0][0]
            if similarity >= self.threshold:
                current_chunk.append(sentences[i])
            else:
                chunks.append(". ".join(current_chunk) + ".")
                current_chunk=[sentences[i]]

        # Flush the chunk that was still being built when the loop ended.
        chunks.append(". ".join(current_chunk) + ".")
        return chunks

    def split_documents(self,docs):
        """Split every document's ``page_content`` into chunk documents.

        Args:
            docs: Iterable of objects exposing ``page_content`` and ``metadata``.

        Returns:
            A flat list of ``Document`` objects, one per chunk, in input order.
            Each chunk receives its own copy of the source metadata.
        """
        result=[]
        for doc in docs:
            for chunk in self.split(doc.page_content):
                # Copy the metadata so editing one chunk's metadata does not
                # silently change its siblings or the source document.
                result.append(Document(page_content=chunk,metadata=dict(doc.metadata)))

        return result

In [29]:
# Wrap the raw text in a Document so it can be passed to the chunker's
# split_documents method, which reads page_content and metadata.
doc = Document(page_content=text)

In [30]:
## chunking
# Split the single document with the custom chunker: neighbouring sentences
# stay in the same chunk only when their cosine similarity is at least 0.7.
chunker = ThresholdSemanticChunker(threshold=0.7)
# `chunks` is a list of Document objects here (one per chunk).
chunks = chunker.split_documents([doc])
chunks

[Document(metadata={}, page_content='Machine learning is a branch of artificial intelligence focused on learning patterns from data.'),
 Document(metadata={}, page_content='Supervised learning uses labeled datasets to train predictive models.'),
 Document(metadata={}, page_content='In unsupervised learning, algorithms discover hidden structures in unlabeled data.'),
 Document(metadata={}, page_content='Reinforcement learning trains agents by rewarding good actions and penalizing bad ones.'),
 Document(metadata={}, page_content='Deep learning relies on multi-layered neural networks to solve complex problems.'),
 Document(metadata={}, page_content='Transformers have become the backbone of modern NLP models.'),
 Document(metadata={}, page_content='Convolutional Neural Networks are widely used in image recognition tasks.'),
 Document(metadata={}, page_content='NLP enables computers to understand, process, and generate human language.'),
 Document(metadata={}, page_content='Sentiment analys

In [32]:
### vector store
from langchain_openai import OpenAIEmbeddings

# Fail fast with a readable message when the key is missing. The previous
# os.environ[...] = os.getenv(...) form was a no-op when the variable was set
# and raised an opaque TypeError (value is None) when it was not.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in your environment before running this cell."
    )

# Embed the chunk documents and index them in FAISS.
embeddings = OpenAIEmbeddings(model='text-embedding-3-small')
vector_store = FAISS.from_documents(chunks,embeddings)
# Retriever interface used by the RAG chain to fetch context for a question.
retriever = vector_store.as_retriever()

In [33]:
## prompt template
# Instruction prompt for the RAG chain. It has two placeholders, {context} and
# {question}, and tells the model to answer only from the supplied context.
template = """You are an AI assistant. Answer the question strictly based on the provided context. 
- If the answer is in the context, give it clearly and concisely. 
- If the answer cannot be found in the context, reply with "I don’t know based on the given context." 
Do not use outside knowledge.

Context:
{context}

Question:
{question}

Answer:
"""
# Build the prompt object from the template string above.
prompt = PromptTemplate.from_template(template)

In [35]:
## LLM
llm = init_chat_model(model='groq:gemma2-9b-it')


def format_docs(docs):
    """Join the text of retrieved documents into a single context string.

    Without this step the list of Document objects is interpolated into the
    prompt as a Python repr (metadata, quoting, brackets) instead of plain text.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# LCEL chain with retrieval:
# question -> retrieve documents -> format as text -> prompt -> LLM -> string

rag_chain = (
    RunnableMap({
        "context": lambda x: format_docs(retriever.invoke(x["question"])),
        "question": lambda x: x["question"],
    })
    | prompt
    | llm
    | StrOutputParser()
)

answer = rag_chain.invoke({"question": "What is deep learning?"})
print(answer)


Deep learning relies on multi-layered neural networks to solve complex problems.  



##### Semantic chunker with LangChain

In [36]:
from langchain_openai import OpenAIEmbeddings
from langchain_experimental.text_splitter import SemanticChunker
from langchain.document_loaders import TextLoader

In [38]:
## load the documents
# The encoding is passed explicitly so loading does not depend on the
# platform's default locale encoding.
loader = TextLoader('langchain_intro.txt', encoding='utf-8')
docs=loader.load()

# initialize embedding model
embedding = OpenAIEmbeddings()

# LangChain's experimental semantic splitter, driven by the embedding model above
chunker = SemanticChunker(embedding)

chunks = chunker.split_documents(docs)

for i,chunk in enumerate(chunks):
    print(f"\n chunk {i+1}:\n {chunk.page_content}")


 chunk 1:
 Machine learning is a branch of artificial intelligence focused on learning patterns from data. Supervised learning uses labeled datasets to train predictive models. In unsupervised learning, algorithms discover hidden structures in unlabeled data. Reinforcement learning trains agents by rewarding good actions and penalizing bad ones. Deep learning relies on multi-layered neural networks to solve complex problems. Transformers have become the backbone of modern NLP models. Convolutional Neural Networks are widely used in image recognition tasks. NLP enables computers to understand, process, and generate human language. Sentiment analysis is an NLP task that identifies positive, negative, or neutral opinions in text. Machine translation allows systems to convert text from one language to another. Text summarization condenses large documents into concise versions while preserving meaning. Question answering systems use NLP to fetch and generate relevant responses. Large Langu