In [9]:
import warnings
warnings.filterwarnings('ignore')

In [10]:
import os
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
from langchain_core.documents import Document
from langchain.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain.schema.runnable import RunnablePassthrough, RunnableMap
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from dotenv import load_dotenv

# Load variables from a local .env file (if present) into the process
# environment before reading the OpenAI credentials.
load_dotenv()

# Fail fast with a readable message when the key is missing or empty.
# Assigning None to os.environ raises an opaque
# "TypeError: str expected, not NoneType" instead.
openai_api_key = os.getenv('OPENAI_API_KEY')
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; define it in your environment or in a .env file."
    )
os.environ['OPENAI_API_KEY'] = openai_api_key


In [None]:
# Custom semantic chunker
 
class SemanticChunk:
    """Group consecutive lines of text into chunks by embedding similarity.

    The text is split on newlines; each non-empty line is embedded, and a
    line is merged into the current chunk when its cosine similarity to the
    line immediately before it is at least ``threshold``. Otherwise a new
    chunk is started.

    Args:
        model_name: Name or path of the sentence-transformers model to load.
        threshold: Minimum cosine similarity between neighbouring lines for
            them to stay in the same chunk.
    """

    def __init__(self, model_name='all-MiniLM-L6-v2', threshold=0.7):
        # The model id is the first positional parameter of
        # SentenceTransformer (``model_name_or_path``); there is no ``model``
        # keyword, so passing ``model=...`` raises a TypeError.
        self.model = SentenceTransformer(model_name)
        self.threshold = threshold

    def split(self, text: str):
        """Split ``text`` into semantically coherent chunks.

        Args:
            text: Raw text. Each non-empty line (after stripping whitespace)
                is treated as one unit to embed and compare.

        Returns:
            A list of strings, each the space-joined lines of one chunk, in
            their original order. An empty list is returned when ``text``
            contains no non-empty lines.
        """
        ## Step 1: Split into units (one per non-empty line)
        sentences = [s.strip() for s in text.split("\n") if s.strip()]
        if not sentences:
            # Nothing to embed; avoid calling the model on an empty batch.
            return []

        ## Step 2: Embed each unit
        embeddings = self.model.encode(sentences)

        ## Step 3: Walk neighbouring pairs and cut where similarity drops
        chunks = []
        current_chunk = [sentences[0]]

        for i in range(1, len(sentences)):
            # cos_sim returns a 1x1 similarity matrix for two single vectors.
            sim = float(util.cos_sim(embeddings[i - 1], embeddings[i])[0][0])

            if sim >= self.threshold:
                current_chunk.append(sentences[i])
            else:
                chunks.append(" ".join(current_chunk))
                current_chunk = [sentences[i]]

        # Append the last chunk
        chunks.append(" ".join(current_chunk))

        return chunks


In [6]:
import os

from langchain_openai import OpenAIEmbeddings
from langchain_experimental.text_splitter import SemanticChunker
from langchain.document_loaders import TextLoader

# Path of the sample document. The default is the original author's local
# location; set the LANGCHAIN_INTRO_PATH environment variable to run this
# cell on another machine without editing the notebook.
DOC_PATH = os.getenv(
    "LANGCHAIN_INTRO_PATH",
    r"E:\GenAI-Interview-Preparation\Track_5_AgenticAI_RAG\5_chunking_and_preprocessing\langchain_intro.txt",
)

# load the docs
loader = TextLoader(DOC_PATH)
docs = loader.load()

# embedding model used by the chunker to compare neighbouring sentences
embeddings = OpenAIEmbeddings()

# create semantic chunker
chunker = SemanticChunker(embeddings)

# split docs into semantically coherent chunks
chunks = chunker.split_documents(docs)

In [7]:
# Print each chunk's text under a 1-based heading.
for position, document in enumerate(chunks, start=1):
    print(f"\n chunk {position}:\n{document.page_content}")


 chunk 1:
LangChain is a framework for building applications with LLMs. Langchain provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone.

 chunk 2:
You can create chains, agents, memory, and retrievers. The Eiffel Tower is located in Paris. France is a popular tourist destination.
