## Import Dependencies

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter  
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.retrievers import BM25Retriever
from langchain_classic.retrievers import EnsembleRetriever
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain_classic.chains.combine_documents import create_stuff_documents_chain
from langchain_classic.chains.retrieval import create_retrieval_chain
from typing import List, Dict, Any
import os
from dotenv import load_dotenv

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load variables from a .env file (python-dotenv) into the process environment
# so that the os.getenv("OPENAI_API_KEY") lookup further down can find the key.
load_dotenv()

True

In [3]:
# Fail fast if the OpenAI credential is missing, instead of failing later
# inside the first LLM call.
openai_api_key = os.environ.get("OPENAI_API_KEY")
if openai_api_key is None:
    raise EnvironmentError("OPENAI_API_KEY environment variable not set")

# Re-export the key and set TOKENIZERS_PARALLELISM to "false"
# (presumably to silence HuggingFace tokenizers fork warnings -- TODO confirm).
os.environ.update(
    OPENAI_API_KEY=openai_api_key,
    TOKENIZERS_PARALLELISM="false",
)

## Chunking

In [4]:
def chunks_text(text: str, chunk_size: int = 500, overlap: int = 50) -> List[str]:
    """Split text into overlapping chunks.

    Args:
        text: Raw text to split.
        chunk_size: Maximum chunk length, measured with ``len`` (characters).
        overlap: Number of characters shared between consecutive chunks.

    Returns:
        The list of text chunks produced by the splitter.
    """
    splitter = RecursiveCharacterTextSplitter(
        # Separators are tried in this order: paragraphs, lines, words,
        # and finally individual characters.
        separators=["\n\n", "\n", " ", ""],
        # Forward the caller's arguments; previously these were hard-coded to
        # 500/50, so the function parameters were silently ignored.
        chunk_size=chunk_size,
        chunk_overlap=overlap,
        length_function=len,
    )
    return splitter.split_text(text)

## Convert the Chunks into LangChain Documents

In [5]:
def convert_to_document(chunks: List[str]) -> List[Document]:
    """Wrap every text chunk in its own ``Document``.

    Each chunk becomes the ``page_content`` of one document; the input order
    is preserved and no metadata is attached.
    """
    return [Document(page_content=piece) for piece in chunks]

## Create Hybrid Retriever

In [6]:
def create_retriever(
    docs: List[Document],
    *,
    dense_k: int = 10,
    sparse_k: int = 3,
    weights: tuple = (0.7, 0.3),
    model_name: str = "all-MiniLM-L6-v2",
):
    """Build a hybrid (dense + sparse) retriever over ``docs``.

    Args:
        docs: Documents to index. Must contain at least one document.
        dense_k: Number of results requested from the FAISS retriever.
        sparse_k: Number of results requested from the BM25 retriever.
        weights: Ensemble weights as ``(dense_weight, sparse_weight)``.
        model_name: HuggingFace embedding model used for the dense index.

    Returns:
        An ``EnsembleRetriever`` combining the dense and sparse retrievers.

    Raises:
        ValueError: If ``docs`` is empty.
    """
    # Neither index can be built from zero documents; fail with a clear
    # message rather than an obscure error from deep inside the libraries.
    if not docs:
        raise ValueError("create_retriever requires at least one document")

    # Dense retriever: embedding similarity search backed by FAISS.
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    dense_store = FAISS.from_documents(docs, embeddings)
    dense_retriever = dense_store.as_retriever(search_kwargs={"k": dense_k})

    # Sparse retriever: BM25 keyword matching.
    sparse_retriever = BM25Retriever.from_documents(docs)
    sparse_retriever.k = sparse_k

    # Weights are positional: first entry for dense, second for sparse.
    return EnsembleRetriever(
        retrievers=[dense_retriever, sparse_retriever],
        weights=list(weights),
    )

## RAG chain and Summarize Using LLM

In [7]:
def build_rag_chain(retriever):
    """Assemble a retrieval chain that summarizes the retrieved documents.

    The chain uses ``retriever`` to fetch documents, stuffs them into a
    single summarization prompt through the ``{context}`` placeholder, and
    sends that prompt to a ``gpt-4o-mini`` chat model.

    Args:
        retriever: Retriever that supplies the documents to summarize.

    Returns:
        The chain created by ``create_retrieval_chain``.
    """
    chat_model = ChatOpenAI(model="gpt-4o-mini", temperature=0.4)

    # The template text (including its indentation) is part of the prompt
    # sent to the model, so it is kept exactly as written.
    summary_prompt = PromptTemplate.from_template(
        """Summarize the following content clearly and concisely:

        {context}
        """
    )

    combine_chain = create_stuff_documents_chain(llm=chat_model, prompt=summary_prompt)
    return create_retrieval_chain(
        retriever=retriever,
        combine_docs_chain=combine_chain,
    )


## Summarizer Function

In [8]:
def summarize_text(text: str) -> str:
    """Summarize ``text`` with the hybrid-retrieval RAG pipeline.

    The text is chunked, wrapped into documents, indexed by the hybrid
    retriever, and the resulting chain is invoked with a fixed summarization
    query.

    Args:
        text: The document to summarize.

    Returns:
        The value stored under ``"answer"`` in the chain result, or an empty
        string when ``text`` is empty or contains only whitespace.
    """
    # Nothing to summarize: skip building the indexes (which cannot be built
    # from zero chunks) and avoid a pointless LLM call.
    if not text or not text.strip():
        return ""

    chunks = chunks_text(text)
    docs = convert_to_document(chunks)
    retriever = create_retriever(docs)
    rag_chain = build_rag_chain(retriever)

    result = rag_chain.invoke({"input": "Summarize the document"})
    return result["answer"]
