# RAG Decoder

In [8]:
import os
from langchain_community.document_loaders import PyPDFLoader
from dotenv import load_dotenv
from langchain_chroma import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain_anthropic import ChatAnthropic

# Load environment variables
load_dotenv()

# Load and process the PDF
file_path = "../../company_policy.pdf"
loader = PyPDFLoader(file_path)
docs = loader.load()

# Set up embeddings
os.environ["GOOGLE_API_KEY"] = os.getenv("GEMINI_API_KEY")
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

# Split documents
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

# Create vector store
vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings)

# Create retriever with k=3 to get top 3 results
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

# Set up the language model
os.environ["ANTHROPIC_API_KEY"] = os.getenv("ANTHROPIC_API_KEY")
llm = ChatAnthropic(model="claude-3-haiku-20240307", max_tokens=4092)

# Enhanced system prompt to work with top 3 documents
system_prompt = """
You are an assistant for question-answering tasks. 
Use the following 5 most relevant pieces of retrieved context to answer the question.
If the context doesn't contain enough information, say that you don't know.
Please synthesize the information from all provided contexts to create a comprehensive answer.

Retrieved contexts:
{context}
"""

prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
])

# Create the chain
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

# Function to show both retrieved documents and final answer
def get_answer_with_sources(question):
    results = rag_chain.invoke({"input": question})

    
    print("Retrieved Documents:")
    for i, doc in enumerate(results['context'], 1):
        print(f"\nDocument {i}:")
        print(doc.page_content[:200] + "...")
    
    print("\nFinal Answer:")
    print(results['answer'])
    
    return results

# Example usage
question = "What is social media policy in my organisation?"
results = get_answer_with_sources(question)
# print(results["context"][0].page_content)

Retrieved Documents:

Document 1:
the advice of a Legal Department.  
 
g) Employees should get appropriate permission before referring to or posting images of 
current or former employees, members, vendors or suppliers.  
 
h) Employ...

Document 2:
the advice of a Legal Department.  
 
g) Employees should get appropriate permission before referring to or posting images of 
current or former employees, members, vendors or suppliers.  
 
h) Employ...

Document 3:
the advice of a Legal Department.  
 
g) Employees should get appropriate permission before referring to or posting images of 
current or former employees, members, vendors or suppliers.  
 
h) Employ...

Document 4:
the advice of a Legal Department.  
 
g) Employees should get appropriate permission before referring to or posting images of 
current or former employees, members, vendors or suppliers.  
 
h) Employ...

Document 5:
the advice of a Legal Department.  
 
g) Employees should get appropriate permission before referr