In [None]:
# LangChain RAG Exercise: Build a "Chat with Your Document" System
# =================================================================
# This notebook walks you through building a simple RAG system step-by-step

# SETUP: Run this first to install required packages
!pip install langchain langchain-community langchain-openai pypdf chromadb openai tiktoken

# ============================================================================
# PART 1: SETUP AND IMPORTS
# ============================================================================

import os
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# Provide your OpenAI API key (get one from https://platform.openai.com/api-keys)
# IMPORTANT: Never commit your API key to version control!
# Preferred: export OPENAI_API_KEY in your shell before starting Jupyter.
# `setdefault` keeps a key that is already present in the environment and only
# falls back to the placeholder, so a real key is never overwritten.
API_KEY_PLACEHOLDER = "your-api-key-here"
os.environ.setdefault("OPENAI_API_KEY", API_KEY_PLACEHOLDER)

if os.environ["OPENAI_API_KEY"] == API_KEY_PLACEHOLDER:
    # Warn now rather than at the first embedding/LLM request further down.
    print("⚠️ OPENAI_API_KEY is still the placeholder - replace it before calling OpenAI.")

print("✅ Setup complete! Libraries imported successfully.")

# ============================================================================
# PART 2: LOAD YOUR PDF DOCUMENT
# ============================================================================
# Exercise 1: Load a PDF file
# Instructions: Place a PDF file in the same directory as this notebook
# or provide the full path to your PDF

# TODO: Replace 'sample_document.pdf' with your PDF filename
pdf_path = "sample_document.pdf"

# Load the PDF (the messages below report each returned document as a page)
loader = PyPDFLoader(pdf_path)
documents = loader.load()

# An empty result cannot be previewed; stop with a clear message instead of
# an IndexError on documents[0].
if not documents:
    raise ValueError(
        f"No pages could be loaded from {pdf_path!r}; "
        "check that it is a valid, non-empty PDF."
    )

print(f"✅ Loaded {len(documents)} pages from the PDF")
print(f"📄 First page preview:\n{documents[0].page_content[:500]}...")

# ============================================================================
# PART 3: SPLIT DOCUMENTS INTO CHUNKS
# ============================================================================
# Exercise 2: Experiment with chunk sizes
# Why chunking? LLMs have token limits, and smaller chunks improve retrieval accuracy

# Create a text splitter. Lengths are measured with `len`, i.e. in characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,        # TODO: Try different values (500, 1000, 1500)
    chunk_overlap=200,      # TODO: Try different overlaps (100, 200, 300)
    length_function=len,
)

# Split documents into chunks
chunks = text_splitter.split_documents(documents)

# Without any chunk there is nothing to preview; stop with a clear message
# instead of an IndexError on chunks[0].
if not chunks:
    raise ValueError(
        "Splitting produced no chunks; the PDF may contain no extractable "
        "text (for example, scanned images)."
    )

print(f"✅ Split into {len(chunks)} chunks")
print(f"📝 First chunk preview:\n{chunks[0].page_content[:300]}...")

# ============================================================================
# PART 4: CREATE EMBEDDINGS AND STORE IN VECTOR DATABASE
# ============================================================================
# Exercise 3: Understanding embeddings
# Embeddings convert text into numerical vectors that capture meaning

PERSIST_DIRECTORY = "./chroma_db"  # Where Chroma saves the index on disk

# Initialize embeddings model
embeddings = OpenAIEmbeddings()

# Re-running this cell (e.g. after changing chunk_size) would otherwise ADD
# the new chunks to whatever an earlier run persisted in PERSIST_DIRECTORY,
# so retrieval would mix old and new chunks. Drop the previously persisted
# collection before rebuilding it.
Chroma(
    persist_directory=PERSIST_DIRECTORY,
    embedding_function=embeddings,
).delete_collection()

# Create vector store (using Chroma, a simple local database)
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=PERSIST_DIRECTORY,  # Saves to disk
)

print("✅ Vector database created successfully!")
# NOTE: `_collection` is a private attribute of the vector store wrapper; it
# is used here only to report the number of stored vectors.
print(f"📊 Total vectors stored: {vectorstore._collection.count()}")

# ============================================================================
# PART 5: TEST SIMILARITY SEARCH
# ============================================================================
# Exercise 4: See how similarity search works
# This is the "retrieval" part of RAG

test_query = "What is this document about?"  # TODO: Change this question
top_k = 3  # TODO: Change how many chunks are requested

# Search for similar chunks
similar_docs = vectorstore.similarity_search(test_query, k=top_k)

# Report the number of chunks actually returned rather than a fixed "3":
# the header stays correct when top_k changes or fewer chunks come back.
print(f"\n🔍 Top {len(similar_docs)} most relevant chunks for: '{test_query}'")
print("=" * 80)
for i, doc in enumerate(similar_docs, 1):
    print(f"\n--- Result {i} ---")
    print(doc.page_content[:300])
    print("...")

# ============================================================================
# PART 6: BUILD THE Q&A SYSTEM
# ============================================================================
# Exercise 5: Create the complete RAG chain

# Initialize the LLM
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",  # TODO: Try "gpt-4" for better results
    temperature=0,               # TODO: Experiment with 0.0 to 1.0
)

# Create a custom prompt template.
# {context} and {question} are the two placeholders declared in
# input_variables below; the chain fills them in for every query.
prompt_template = """Use the following pieces of context to answer the question at the end. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Use three sentences maximum and keep the answer concise.

Context: {context}

Question: {question}

Answer:"""

PROMPT = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"],
)

# Create the QA chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",  # "stuff" means put all context into one prompt
    retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
    return_source_documents=True,  # ask_question() reads result['source_documents']
    chain_type_kwargs={"prompt": PROMPT},
)

print("✅ Q&A system ready!")

# ============================================================================
# PART 7: INTERACTIVE Q&A
# ============================================================================
# Exercise 6: Ask questions about your document!

def ask_question(question):
    """Ask the RAG chain one question and print the answer with its sources.

    Args:
        question: The question text; it is passed to the module-level
            ``qa_chain`` under the ``"query"`` key.

    Returns:
        None. The answer (``result['result']``) and a preview of each entry
        in ``result['source_documents']`` are printed.
    """
    preview_chars = 200
    divider = "=" * 80
    result = qa_chain.invoke({"query": question})

    print("\n" + divider)
    print(f"❓ Question: {question}")
    print(divider)
    print(f"💡 Answer: {result['result']}")
    print("\n📚 Sources used:")
    for i, doc in enumerate(result['source_documents'], 1):
        text = doc.page_content
        # Only mark the preview as truncated when text was actually cut off.
        suffix = "..." if len(text) > preview_chars else ""
        print(f"\nSource {i}:")
        print(text[:preview_chars] + suffix)
    print(divider)

# Sample questions - TODO: swap these for questions that fit YOUR document
questions = [
    "What is the main topic of this document?",
    "Can you summarize the key points?",
    "What are the most important takeaways?",
]

# Run every sample question through the Q&A helper, in list order
for question in questions:
    ask_question(question)

# ============================================================================
# BONUS: INTERACTIVE MODE
# ============================================================================
# Exercise 7: Create an interactive chat loop

def chat_with_document():
    """Run an interactive question loop until the user quits.

    Each non-empty line is forwarded to ``ask_question``. The loop ends when
    the user types ``quit``, ``exit`` or ``q`` (case-insensitive), when input
    is closed (EOF), or when the prompt is interrupted (Ctrl+C / kernel
    interrupt).

    Returns:
        None.
    """
    exit_commands = {'quit', 'exit', 'q'}

    print("\n🤖 Chat with Your Document (type 'quit' to exit)")
    print("=" * 80)

    while True:
        try:
            user_question = input("\nYour question: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed input or an interrupt is treated like 'quit' so the
            # session ends cleanly instead of with a traceback.
            print("\n👋 Goodbye!")
            break

        if user_question.lower() in exit_commands:
            print("👋 Goodbye!")
            break

        # Blank input: prompt again without calling the chain.
        if not user_question:
            continue

        ask_question(user_question)

# Uncomment the line below to start interactive mode
# chat_with_document()

# ============================================================================
# STUDENT EXERCISES
# ============================================================================

# Print the exercise sheet: an 80-column rule, the heading, then the task list.
print("\n" + "=" * 80)
print("📝 STUDENT EXERCISES")
print("=" * 80)
print("""
1. EXPERIMENT WITH CHUNK SIZE:
   - Change chunk_size to 500, 1000, and 1500
   - Ask the same question with each setting
   - Which works best for your document?

2. TRY DIFFERENT QUESTIONS:
   - Write 5 questions about your document
   - Ask questions that require:
     a) Direct facts
     b) Summaries
     c) Comparisons
   - Which types work best?

3. MODIFY THE PROMPT:
   - Change the prompt_template to be more specific
   - Try: "Answer like you're explaining to a 5-year-old"
   - Or: "Answer in bullet points"
   - How does this change responses?

4. EXPERIMENT WITH TEMPERATURE:
   - Try temperature values: 0, 0.5, 1.0
   - Temperature controls creativity/randomness
   - Which is better for factual Q&A?

5. TEST WITH DIFFERENT DOCUMENTS:
   - Try a textbook chapter
   - Try a research paper
   - Try a news article
   - Which type works best with RAG?

6. ANALYZE RETRIEVAL:
   - For each question, look at the source documents
   - Are they actually relevant?
   - Try increasing/decreasing k (number of chunks retrieved)

7. ADVANCED: Add conversation memory
   - Research LangChain's ConversationBufferMemory
   - Implement it so the system remembers previous questions
   - Test with follow-up questions

8. CHALLENGE: Build a comparison tool
   - Load TWO different PDFs
   - Create separate vector stores
   - Ask the same question to both
   - Compare answers
""")

# ============================================================================
# CLEAN UP (OPTIONAL)
# ============================================================================

# Uncomment to delete the vector database and start fresh
# (ignore_errors=True makes this a no-op when the directory does not exist)
# import shutil
# shutil.rmtree("./chroma_db", ignore_errors=True)
# print("🗑️ Vector database deleted")

print("\n✅ Notebook complete! Happy learning! 🎓")