In [68]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from dotenv import load_dotenv
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# Read API keys from the local .env file into the environment
load_dotenv()

# Step 1: Read the source PDF
PDF_PATH = "../data/celsia_ebook.pdf"
loader = PyPDFLoader(PDF_PATH)
data = loader.load()  # one Document per PDF page, so len(data) is the page count
print("Total number of pages:", len(data))  # 32 pages for this e-book

# Step 2: Cut the pages into smaller chunks (target size: CHUNK_SIZE characters)
# so each piece can be embedded and retrieved on its own
CHUNK_SIZE = 1000
text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE)
docs = text_splitter.split_documents(data)
print("Total number of documents (chunks): ", len(docs))

# Step 3: Embedding model used to vectorise the chunks (and, via the retriever,
# the incoming queries)
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

# Step 4: Index the chunks in a Chroma vector store.
# Ids are position-based and therefore stable between runs. Chroma writes by id
# (upsert), so re-running this cell in the same kernel overwrites the existing
# entries instead of appending a second copy of every chunk to the default
# in-memory collection -- duplicates would otherwise crowd the top-k results.
chunk_ids = [f"chunk-{i}" for i in range(len(docs))]
vectorstore = Chroma.from_documents(
    documents=docs,
    embedding=embeddings,
    ids=chunk_ids,
)

# Step 5: Similarity-search retriever returning the 10 closest chunks per query
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 10},
)

# Step 6: Chat model that writes the final answer from the retrieved chunks
llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro", temperature=0.3, max_tokens=10000)

# Step 7: System instructions for the answering model.
# "{context}" is the placeholder that receives the retrieved document text.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, say that you don't know. "
    "Use as many sentences as needed to answer the question. "
    "\n\n{context}"
)

# Step 8: Two-message chat template -- the system instructions above followed
# by the user's question, supplied through the "{input}" variable
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{input}")]
)

# Step 9: Assemble the RAG pipeline.
# First a chain that puts the documents it is given into the prompt and calls the LLM ...
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
# ... then a wrapper that takes its documents from the retriever.
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=question_answer_chain,
)

# Step 10: Function to Query and Get Contextual Answer
def get_answer(query: str) -> str:
    """Answer *query* from the indexed PDF using the module-level ``rag_chain``.

    ``rag_chain`` is built with ``create_retrieval_chain``, which performs the
    retrieval itself: it runs the retriever on the ``"input"`` value and
    supplies the resulting documents as ``context`` to the answering chain.
    Retrieving the documents by hand and passing a ``"context"`` key is
    therefore unnecessary -- the chain overwrites that key with its own
    retrieval result, so a manual lookup only duplicates the embedding and
    vector-search work for every question.

    Args:
        query: The user's natural-language question.

    Returns:
        The generated answer, i.e. the ``"answer"`` entry of the chain output.
    """
    # Only "input" is needed; the chain populates "context" on its own.
    response = rag_chain.invoke({"input": query})
    return response["answer"]

# Step 11: Example query -- send one question through get_answer and print the reply
query = "Exactly how do you create a CSRD report? what are the steps?"
answer = get_answer(query)  # returns the "answer" entry produced by the RAG chain
print(answer)


Total number of pages: 32
Total number of documents (chunks):  55
The provided text does not describe the steps on how to create a CSRD report. Therefore, I cannot answer your question. 

