# Advanced RAG Experimentation Notebook
## Harry Potter Book Analysis with Multi-Strategy Retrieval

This notebook demonstrates an advanced RAG system that analyzes Harry Potter books using multiple retrieval strategies, question rewriting, chain-of-thought reasoning, and a sophisticated plan-and-execute agent.

---

## 🔧 Environment Setup

Load environment variables and set up API keys for OpenAI and Groq.

In [None]:
# --- Environment and API Setup ---
import os
from dotenv import load_dotenv

# Load environment variables (e.g., API keys)
load_dotenv(override=True)

# Read the API keys for OpenAI and Groq.
# os.environ only accepts string values, so assigning the result of
# os.getenv() directly raises TypeError when the variable is missing.
# Re-export the OpenAI key only when it is actually present.
openai_api_key = os.getenv('OPENAI_API_KEY')
if openai_api_key:
    os.environ["OPENAI_API_KEY"] = openai_api_key
groq_api_key = os.getenv('GROQ_API_KEY')

# Optional: Set environment variable for debugging (increases timeout)
os.environ["PYDEVD_WARN_EVALUATION_TIMEOUT"] = "100000"

# Report missing keys up front instead of failing later inside an API call.
missing_keys = [
    key_name
    for key_name, key_value in (
        ("OPENAI_API_KEY", openai_api_key),
        ("GROQ_API_KEY", groq_api_key),
    )
    if not key_value
]
if missing_keys:
    print(f"WARNING: missing API keys: {', '.join(missing_keys)}")
else:
    print("‚úÖ Environment variables loaded successfully!")

## 1. 📖 Data Loading & Preprocessing

### 1.1 Load Harry Potter PDF and Split into Chapters

In [None]:
# --- Import Libraries for Document Loading ---
from langchain.document_loaders import PyPDFLoader
from helper_functions import split_into_chapters, replace_t_with_space

# Location of the source PDF (book 1 of the series)
hp_pdf_path = "Harry_Potter_Book_1_The_Sorcerers_Stone.pdf"

# Split the PDF into chapter documents, then run the text clean-up helper
# over the result in a single expression.
print("üìö Loading and splitting PDF into chapters...")
chapters = replace_t_with_space(split_into_chapters(hp_pdf_path))

print(f"‚úÖ Extracted {len(chapters)} chapters from the book.")

### 1.2 Extract Book Quotes as Separate Documents

In [None]:
# --- Extract Book Quotes ---
from helper_functions import extract_book_quotes_as_documents

# Load the PDF again with PyPDFLoader (independently of the chapter split),
# clean the loaded documents with the same helper used for the chapters, and
# turn the quotes found in them into separate documents.
print("üìñ Extracting book quotes...")
loader = PyPDFLoader(hp_pdf_path)
document = loader.load()  # presumably one Document per PDF page - TODO confirm
document_cleaned = replace_t_with_space(document)
book_quotes_list = extract_book_quotes_as_documents(document_cleaned)

print(f"‚úÖ Extracted {len(book_quotes_list)} quotes from the book.")

## 2. 🧠 Chapter Summarization with LLMs

Generate summaries for each chapter using GPT-3.5-turbo to enable high-level retrieval.

In [None]:
# --- Summarization Setup ---
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain.docstore.document import Document
from time import monotonic

# Prompt used by the summarization chain: the chapter text is substituted for
# {text} and the model is asked to continue after "SUMMARY:".
summarization_prompt = PromptTemplate(
    template="""Write an extensive summary of the following:

{text}

SUMMARY:""",
    input_variables=["text"]
)

def create_chapter_summary(chapter):
    """Summarize a single chapter with GPT-3.5-turbo.

    Args:
        chapter: Object exposing ``page_content`` (the chapter text) and
            ``metadata``.

    Returns:
        A ``Document`` whose content is the cleaned summary text and whose
        metadata is copied from ``chapter``.
    """
    from helper_functions import replace_double_lines_with_one_line

    # "stuff" chain: the whole chapter goes into one prompt.
    summarizer = load_summarize_chain(
        ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo-0125"),
        chain_type="stuff",
        prompt=summarization_prompt,
        verbose=False,
    )

    # Time only the summarization call itself.
    started = monotonic()
    output = summarizer.invoke([Document(page_content=chapter.page_content)])
    print(f"‚ú® Summarized in {monotonic() - started:.2f} seconds")

    return Document(
        page_content=replace_double_lines_with_one_line(output["output_text"]),
        metadata=chapter.metadata,
    )

# Generate summaries for all chapters, one LLM call per chapter.
# chapter_summaries keeps the same order as `chapters`.
print("üìù Generating summaries for each chapter...")
chapter_summaries = []

for i, chapter in enumerate(chapters):
    # i is zero-based; the printed chapter number is one-based.
    print(f"\n--- Processing Chapter {i+1} ---")
    summary = create_chapter_summary(chapter)
    chapter_summaries.append(summary)
    print(f"Chapter {i+1} summary length: {len(summary.page_content)} characters")

print(f"\n‚úÖ Generated {len(chapter_summaries)} chapter summaries!")

## 3. 🔍 Vector Store Creation

Create three separate vector stores: book chunks, chapter summaries, and book quotes. Each serves a different retrieval purpose.

In [None]:
# --- Vector Store Imports ---
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

def encode_book(path, chunk_size=1000, chunk_overlap=200):
    """Build a FAISS vector store from a PDF book.

    Args:
        path: Path of the PDF file to load.
        chunk_size: Maximum chunk length passed to the text splitter
            (measured with ``len``).
        chunk_overlap: Overlap between consecutive chunks.

    Returns:
        A FAISS vector store over the cleaned chunks, embedded with
        ``OpenAIEmbeddings``.
    """
    pages = PyPDFLoader(path).load()

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    # Split first, then clean the resulting chunks.
    chunks = replace_t_with_space(splitter.split_documents(pages))

    return FAISS.from_documents(chunks, OpenAIEmbeddings())

# Embeddings instance shared by the load and create paths below
embeddings = OpenAIEmbeddings()

# On-disk locations of the three vector stores
vector_store_paths = {
    "chunks": "chunks_vector_store",
    "summaries": "chapter_summaries_vector_store", 
    "quotes": "book_quotes_vectorstore"
}

# Reuse the stores only when ALL three exist on disk; if any one is missing,
# every store is rebuilt and saved again.
if all(os.path.exists(path) for path in vector_store_paths.values()):
    print("üìÇ Loading existing vector stores...")
    # NOTE(review): allow_dangerous_deserialization=True opts in to loading
    # serialized (pickle-based, per the flag's name) data; only point these
    # paths at directories this notebook created itself - confirm before
    # loading stores from anywhere else.
    chunks_vector_store = FAISS.load_local(vector_store_paths["chunks"], embeddings, allow_dangerous_deserialization=True)
    chapter_summaries_vector_store = FAISS.load_local(vector_store_paths["summaries"], embeddings, allow_dangerous_deserialization=True)
    book_quotes_vectorstore = FAISS.load_local(vector_store_paths["quotes"], embeddings, allow_dangerous_deserialization=True)
else:
    print("üîÑ Creating new vector stores...")
    # Encode the book chunks, the chapter summaries and the quotes
    chunks_vector_store = encode_book(hp_pdf_path, chunk_size=1000, chunk_overlap=200)
    chapter_summaries_vector_store = FAISS.from_documents(chapter_summaries, embeddings)
    book_quotes_vectorstore = FAISS.from_documents(book_quotes_list, embeddings)
    
    # Save to disk for future use
    chunks_vector_store.save_local(vector_store_paths["chunks"])
    chapter_summaries_vector_store.save_local(vector_store_paths["summaries"])
    book_quotes_vectorstore.save_local(vector_store_paths["quotes"])
    
print("‚úÖ Vector stores ready!")

In [None]:
# --- Create Retrievers from Vector Stores ---
# One retriever per store. 'k' is the number of documents each retriever
# returns per query; the values differ per store (3 chunks, 2 summaries,
# 5 quotes) and must stay in sync with the printed description below.
chunks_query_retriever = chunks_vector_store.as_retriever(search_kwargs={"k": 3})
chapter_summaries_query_retriever = chapter_summaries_vector_store.as_retriever(search_kwargs={"k": 2})
book_quotes_query_retriever = book_quotes_vectorstore.as_retriever(search_kwargs={"k": 5})

print("üîç Retrievers created with different configurations:")
print("- Book chunks: k=3 (detailed information)")
print("- Chapter summaries: k=2 (high-level overview)")
print("- Book quotes: k=5 (specific evidence)")

## 4. 🎯 Basic RAG Pipeline

Test a simple RAG query to verify everything works.

In [None]:
# --- Basic RAG Function ---
def basic_rag_query(question):
    """Answer a question with GPT-4o using context from all three retrievers.

    Args:
        question: The question text; it is used both as the retrieval query
            and in the final prompt.

    Returns:
        The content of the model's reply.
    """
    from helper_functions import escape_quotes

    # Query order matters for the prompt: chunks, then summaries, then quotes.
    sources = (
        chunks_query_retriever,
        chapter_summaries_query_retriever,
        book_quotes_query_retriever,
    )
    passages = []
    for source in sources:
        hits = source.get_relevant_documents(question)
        passages.extend(hit.page_content for hit in hits)

    # Join everything into one context string and escape quotes in it.
    context = escape_quotes(" ".join(passages))

    answer_llm = ChatOpenAI(temperature=0, model_name="gpt-4o")
    reply = answer_llm.invoke(
        f"Context: {context}\n\nQuestion: {question}\n\nProvide a concise answer:"
    )
    return reply.content

# Smoke test of the basic RAG pipeline with a single factual question.
# Note: this issues real retrieval and LLM calls.
print("üß™ Testing Basic RAG Pipeline:")
test_question = "Who is Fluffy?"
result = basic_rag_query(test_question)
print(f"Question: {test_question}")
print(f"Answer: {result}")

## 5. 🔄 Advanced RAG with LangGraph

### 5.1 Question Rewriting for Better Retrieval

Rewrite questions to improve vector store retrieval quality.

In [None]:
# --- Question Rewriting Setup ---
from langchain_groq import ChatGroq
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field

# Output schema for the question rewriter. It is handed to JsonOutputParser,
# which derives the prompt's format instructions from it, so the Field
# descriptions are part of what the model sees.
class RewriteQuestion(BaseModel):
    # The reformulated question, intended to be used as the retrieval query.
    rewritten_question: str = Field(description="Improved question for retrieval")
    # The model's account of what it changed.
    explanation: str = Field(description="Explanation of changes")

# Groq-hosted model that performs the rewriting
rewrite_llm = ChatGroq(
    temperature=0,
    model_name="llama3-70b-8192",
    groq_api_key=groq_api_key,
    max_tokens=4000
)

# One parser instance supplies the format instructions for the prompt and
# parses the model's JSON reply at the end of the chain.
_rewrite_parser = JsonOutputParser(pydantic_object=RewriteQuestion)

_rewrite_prompt = PromptTemplate(
    template="""You are a question re-writer that converts an input question to a better version optimized for vectorstore retrieval.
Analyze the input question {question} and try to reason about the underlying semantic intent / meaning.

{format_instructions}""",
    input_variables=["question"],
    partial_variables={"format_instructions": _rewrite_parser.get_format_instructions()},
)

# prompt -> LLM -> JSON dict with 'rewritten_question' and 'explanation'
rewrite_chain = _rewrite_prompt | rewrite_llm | _rewrite_parser

# Try the chain on a deliberately vague question
print("‚úçÔ∏è Testing Question Rewriting:")
test_question = "stuff about the three-headed dog"
rewritten = rewrite_chain.invoke({"question": test_question})
print(f"Original: {test_question}")
print(f"Rewritten: {rewritten['rewritten_question']}")
print(f"Explanation: {rewritten['explanation']}")

### 5.2 Chain-of-Thought Answering

Use step-by-step reasoning to answer questions.

In [None]:
# --- Chain-of-Thought Setup ---
# Output schema for the chain-of-thought answering chain; passed to
# JsonOutputParser when the chain is built.
class QuestionAnswerFromContext(BaseModel):
    # The answer the model derives from the supplied context.
    answer_based_on_content: str = Field(description="Answer based on context")

# The chain ends in a JSON parser, so the prompt has to tell the model to
# answer in JSON matching QuestionAnswerFromContext. Without the format
# instructions the model replies in free text and either the parser fails or
# the 'answer_based_on_content' lookup below raises KeyError.
_cot_parser = JsonOutputParser(pydantic_object=QuestionAnswerFromContext)

cot_chain = PromptTemplate(
    template="""Answer this question using chain-of-thought reasoning.

Context: {context}
Question: {question}

Think step by step and provide your reasoning before the final answer.

{format_instructions}""",
    input_variables=["context", "question"],
    partial_variables={"format_instructions": _cot_parser.get_format_instructions()},
) | ChatOpenAI(temperature=0, model_name="gpt-4o", max_tokens=2000) | _cot_parser

# Test CoT answering with a small hand-written context (no retrieval involved)
print("üß† Testing Chain-of-Thought Answering:")
test_context = "Harry Potter is a young wizard. He discovers he is famous in the wizarding world for surviving an attack by Voldemort as a baby."
test_q = "Why is Harry Potter famous?"

result = cot_chain.invoke({"context": test_context, "question": test_q})
print(f"Question: {test_q}")
print(f"Answer: {result['answer_based_on_content']}")

## 6. 🗺️ Sophisticated Pipeline with Plan-and-Execute

### 6.1 Question Anonymization

Replace named entities with variables to create unbiased plans.

In [None]:
# --- Question Anonymization Setup ---
# Output schema for the question anonymizer; passed to JsonOutputParser when
# the chain is built.
class AnonymizeQuestion(BaseModel):
    # The question with named entities replaced by variables.
    anonymized_question: str = Field(description="Question with entities replaced by variables")
    # Variable -> original entity, used later to restore the entities.
    mapping: dict = Field(description="Mapping of variables to original entities")
    # The model's account of what it replaced.
    explanation: str = Field(description="Explanation of the process")

# One parser instance supplies the format instructions and parses the reply.
_anonymize_parser = JsonOutputParser(pydantic_object=AnonymizeQuestion)

# The template is an f-string style template, so the literal braces of the
# example mapping must be doubled ({{ ... }}). Left single, the example is
# read as a template variable named '"X"' and the prompt cannot be formatted
# from the 'question' input alone.
anonymize_chain = PromptTemplate(
    template="""You are a question anonymizer. Replace all name entities in the question with variables.

Example: "who is harry potter?" ‚Üí "who is X?" with mapping {{"X": "harry potter"}}

Question: {question}

{format_instructions}""",
    input_variables=["question"],
    partial_variables={"format_instructions": _anonymize_parser.get_format_instructions()}
) | ChatOpenAI(temperature=0, model_name="gpt-4o") | _anonymize_parser

# Test anonymization on a question with two named entities
print("üé≠ Testing Question Anonymization:")
test_q = "how did harry beat quirrell?"
result = anonymize_chain.invoke({"question": test_q})
print(f"Original: {test_q}")
print(f"Anonymized: {result['anonymized_question']}")
print(f"Mapping: {result['mapping']}")

### 6.2 Multi-Step Planning

Create step-by-step plans to answer complex questions.

In [None]:
# --- Planning Setup ---
from typing import List

# Output schema for the planner chain; passed to JsonOutputParser when the
# chain is built.
class Plan(BaseModel):
    # Plan steps, in the order they should be carried out.
    steps: List[str] = Field(description="Ordered list of steps to answer the question")

# The chain ends in a JSON parser and callers read plan['steps'], so the
# prompt must ask for JSON matching the Plan schema. Without the format
# instructions the model is free to answer with a plain numbered list, which
# the parser cannot turn into a dict with a 'steps' key.
_planner_parser = JsonOutputParser(pydantic_object=Plan)

planner_chain = PromptTemplate(
    template="""Create a step-by-step plan to answer this question.

Question: {question}

Rules:
1. Each step should be executable by retrieval or answering
2. Include all necessary information
3. Don't skip steps

{format_instructions}""",
    input_variables=["question"],
    partial_variables={"format_instructions": _planner_parser.get_format_instructions()},
) | ChatOpenAI(temperature=0, model_name="gpt-4o") | _planner_parser

# Test planning on a multi-hop question
print("üìã Testing Multi-Step Planning:")
complex_question = "What class does the professor who helped the villain teach?"
plan = planner_chain.invoke({"question": complex_question})
print(f"Question: {complex_question}")
print("Plan Steps:")
for i, step in enumerate(plan['steps'], 1):
    print(f"{i}. {step}")

### 6.3 Plan Refinement

Refine plans to be more specific and executable.

In [None]:
# --- Plan Refinement Setup ---
# Output schema for the de-anonymization chain; passed to JsonOutputParser
# when the chain is built.
class DeAnonymizePlan(BaseModel):
    # The plan steps after variables have been replaced by the original entities.
    plan: List[str] = Field(description="Plan with variables replaced by original entities")

# The chain ends in a JSON parser and callers read deanon_result['plan'], so
# the prompt must ask for JSON matching the DeAnonymizePlan schema. Without
# the format instructions the reply format is unconstrained and the 'plan'
# lookup below is not guaranteed to work.
_deanonymize_parser = JsonOutputParser(pydantic_object=DeAnonymizePlan)

deanonymize_chain = PromptTemplate(
    template="""Replace variables in this plan with the mapped words.

Plan: {plan}
Mapping: {mapping}

Return the updated plan list.

{format_instructions}""",
    input_variables=["plan", "mapping"],
    partial_variables={"format_instructions": _deanonymize_parser.get_format_instructions()},
) | ChatOpenAI(temperature=0, model_name="gpt-4o") | _deanonymize_parser

# Test full pipeline: anonymize -> plan -> deanonymize
print("üîó Testing Full Planning Pipeline:")
question = "how did the main character beat the villain?"

# Step 1: Anonymize (yields the anonymized question and the variable mapping)
anon_result = anonymize_chain.invoke({"question": question})
print(f"Anonymized: {anon_result['anonymized_question']}")

# Step 2: Plan against the anonymized question
plan_result = planner_chain.invoke({"question": anon_result['anonymized_question']})
print(f"Plan for anonymized question: {plan_result['steps']}")

# Step 3: De-anonymize the plan using the mapping from step 1
deanon_result = deanonymize_chain.invoke({
    "plan": plan_result['steps'],
    "mapping": anon_result['mapping']
})
print(f"Final plan: {deanon_result['plan']}")

## 7. 🧪 Experimental Section: Testing Different Strategies

Compare different retrieval strategies to see which works best for different question types.

In [None]:
# --- Retrieval Strategy Comparison ---
def compare_retrieval_strategies(question, max_context_chars=None):
    """Answer one question with each retrieval strategy and collect the answers.

    Strategies are the three individual retrievers plus "Combined", which
    concatenates the results of all three (chunks, summaries, quotes).

    Args:
        question: The question used for retrieval and in the prompt.
        max_context_chars: Optional cap on the number of context characters
            sent to the model. ``None`` (default) sends the full context.
            Previously the context was always cut to 1000 characters, which
            is the size of a single book chunk, so "Combined" saw almost
            exactly the same text as "Basic Chunks" and the comparison was
            not meaningful. Pass ``1000`` to reproduce the old cap.

    Returns:
        Dict mapping strategy name to the model's answer text.
    """
    # Each strategy is an ordered list of retrievers whose results are joined.
    strategies = {
        "Basic Chunks": [chunks_query_retriever],
        "Chapter Summaries": [chapter_summaries_query_retriever],
        "Book Quotes": [book_quotes_query_retriever],
        "Combined": [
            chunks_query_retriever,
            chapter_summaries_query_retriever,
            book_quotes_query_retriever,
        ],
    }

    # The model is identical for every strategy, so build it once.
    llm = ChatOpenAI(temperature=0, model_name="gpt-4o")
    results = {}

    for strategy_name, retrievers in strategies.items():
        print(f"\nüéØ Testing {strategy_name}...")

        docs = [
            doc
            for retriever in retrievers
            for doc in retriever.get_relevant_documents(question)
        ]
        context = " ".join(doc.page_content for doc in docs)
        print(f"Retrieved {len(context)} characters of context")

        # Truncate only when asked to, and mark the cut with an ellipsis.
        if max_context_chars is not None and len(context) > max_context_chars:
            prompt_context = context[:max_context_chars] + "..."
        else:
            prompt_context = context

        prompt = f"Context: {prompt_context}\n\nQuestion: {question}\n\nAnswer:"
        answer = llm.invoke(prompt).content

        results[strategy_name] = answer
        print(f"Answer: {answer[:100]}...")

    return results

# Run the strategy comparison for one question, then print each strategy's
# full answer (the function itself only prints the first 100 characters).
test_question = "What spell does Hermione use to fix Harry's glasses?"
print(f"üß™ Comparing strategies for: {test_question}")
comparison_results = compare_retrieval_strategies(test_question)

print("\n" + "="*60)
print("üìä FINAL COMPARISON:")
print("="*60)
for strategy, answer in comparison_results.items():
    print(f"\n{strategy}:")
    print(f"  ‚Üí {answer}")

In [None]:
# --- Test Question Complexity Levels ---
# Questions of increasing difficulty. Each entry has the form
# "<level>: <question>"; test_complexity_levels() splits on the first ": "
# to recover the question, so every entry must keep that prefix.
complex_questions = [
    "Simple: Who is Harry Potter?",
    "Moderate: What house is Harry in and why?",
    "Complex: How does Harry's relationship with Dumbledore evolve?",
    "Reasoning: What does the Mirror of Erised reveal about human nature?"
]

def test_complexity_levels():
    """Run every entry of ``complex_questions`` through basic and rewritten RAG.

    For each "<level>: <question>" entry the question part is answered twice:
    once directly with ``basic_rag_query`` and once after rewriting it with
    ``rewrite_chain``.

    Returns:
        Dict keyed by the full entry (including its level prefix); each value
        is a dict with the keys "basic" and "rewritten" holding the answers.
    """
    separator = '=' * 60
    outcomes = {}

    for entry in complex_questions:
        print(f"\n{separator}")
        print(f"Testing: {entry}")
        print(f"{separator}")

        # Drop the "<level>: " prefix; only the first ": " is a separator.
        bare_question = entry.split(": ", 1)[1]

        print("\n[Basic RAG]")
        direct_answer = basic_rag_query(bare_question)
        print(f"Answer: {direct_answer}")

        print("\n[With Question Rewriting]")
        rewrite = rewrite_chain.invoke({"question": bare_question})
        improved_question = rewrite['rewritten_question']
        answer_after_rewrite = basic_rag_query(improved_question)
        print(f"Rewritten Q: {rewrite['rewritten_question']}")
        print(f"Answer: {answer_after_rewrite}")

        outcomes[entry] = {"basic": direct_answer, "rewritten": answer_after_rewrite}

    return outcomes

# Run the complexity test over all entries of complex_questions
print("üß† Testing Different Question Complexity Levels")
complexity_results = test_complexity_levels()

# Persist the answers as JSON (written to the current working directory;
# an existing file of the same name is overwritten)
import json
with open("complexity_test_results.json", "w") as f:
    json.dump(complexity_results, f, indent=2)
print("\nüíæ Results saved to complexity_test_results.json")

## 8. 📊 Evaluation with Ragas

Use Ragas framework to quantitatively evaluate our RAG system.

In [None]:
# --- Ragas Evaluation Setup ---
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import (
    answer_correctness,
    faithfulness,
    answer_relevancy,
    context_recall,
    answer_similarity
)

# Evaluation questions and ground truth answers.
# The two lists are paired by index, so they must stay the same length and
# in the same order.
eval_questions = [
    "What is the name of the three-headed dog?",
    "Who gave Harry his first broomstick?",
    "Which house did the Sorting Hat initially consider for Harry?",
    "What is the name of Harry's owl?",
    "How did Harry and his friends get past Fluffy?"
]

# The last reference answer previously misspelled the dog's name ("Flummy"),
# which penalized correct answers in the correctness/similarity metrics.
ground_truth_answers = [
    "Fluffy",
    "Professor McGonagall",
    "Slytherin",
    "Hedwig",
    "They played music to put Fluffy to sleep"
]

print("üéØ Starting Ragas Evaluation...")
print(f"Evaluating {len(eval_questions)} questions")

In [None]:
# --- Generate Answers and Collect Contexts ---
# Answers produced by the pipeline and the contexts they were based on,
# index-aligned with eval_questions.
generated_answers = []
retrieved_contexts = []

# basic_rag_query() builds its prompt from all three retrievers, in this
# order. The contexts recorded for evaluation must come from the same
# sources; recording only the book chunks (as before) makes the
# context-based metrics judge answers against text the model did not
# exclusively rely on.
eval_retrievers = (
    chunks_query_retriever,
    chapter_summaries_query_retriever,
    book_quotes_query_retriever,
)

print("\nü§ñ Generating answers for evaluation questions...")
for i, question in enumerate(eval_questions, 1):
    print(f"\n[{i}/{len(eval_questions)}] Q: {question}")

    # Generate answer using basic RAG
    answer = basic_rag_query(question)
    generated_answers.append(answer)
    print(f"A: {answer}")

    # Collect the contexts from every retriever the answer drew on
    contexts = [
        doc.page_content
        for retriever in eval_retrievers
        for doc in retriever.get_relevant_documents(question)
    ]
    retrieved_contexts.append(contexts)
    print(f"Retrieved {len(contexts)} context chunks")

print(f"\n‚úÖ Generated {len(generated_answers)} answers")

In [None]:
# --- Run Ragas Evaluation ---
# Prepare data for Ragas: four index-aligned columns, one row per question
data_samples = {
    'question': eval_questions,
    'answer': generated_answers,
    'contexts': retrieved_contexts,
    'ground_truth': ground_truth_answers
}

dataset = Dataset.from_dict(data_samples)

print("üìä Running Ragas evaluation...")
# LLM handed to Ragas for the evaluation
eval_llm = ChatOpenAI(temperature=0, model_name="gpt-4o", max_tokens=4000)

# Run evaluation with the five imported metrics
results = evaluate(
    dataset=dataset,
    metrics=[
        answer_correctness,
        faithfulness,
        answer_relevancy,
        context_recall,
        answer_similarity
    ],
    llm=eval_llm
)

# Convert to DataFrame and display
results_df = results.to_pandas()
print("\nüìà Evaluation Results:")
print("="*60)
print(results_df.to_string(index=False))

# Calculate and display mean scores.
# The column names below must match the metrics passed to evaluate() above.
print(f"\nüìä Mean Scores:")
for metric in ['answer_correctness', 'faithfulness', 'answer_relevancy', 'context_recall', 'answer_similarity']:
    mean_score = results_df[metric].mean()
    print(f"  {metric}: {mean_score:.3f}")

## 9. 🔬 Advanced Experiments

### 9.1 Test Hallucination Detection

In [None]:
# --- Hallucination Detection Test ---
def test_hallucination_scenarios():
    """Run three hand-written scenarios through ``basic_rag_query`` and print
    the answers next to the behavior that would be expected.

    Nothing is asserted or returned; the output is meant to be read by a
    person. Errors raised while answering a scenario are printed and the
    remaining scenarios still run.
    """
    # NOTE(review): each scenario's 'context' is only printed. The answer
    # comes from basic_rag_query(), which retrieves its own context and
    # receives only the question.
    scenarios = [
        dict(
            question="What is Harry's favorite color?",
            context="Harry Potter is a wizard who attends Hogwarts School of Witchcraft and Wizardry.",
            type="insufficient_context",
        ),
        dict(
            question="Who is Hagrid?",
            context="Rubeus Hagrid is the gamekeeper at Hogwarts and a loyal friend to Harry.",
            type="sufficient_context",
        ),
        dict(
            question="What is Voldemort's favorite ice cream flavor?",
            context="Lord Voldemort is the main antagonist in the Harry Potter series.",
            type="not_in_book",
        ),
    ]

    # Scenario types for which the ideal reply is a refusal.
    unanswerable_types = ('insufficient_context', 'not_in_book')

    print("üß™ Testing Hallucination Detection Scenarios")
    print("="*60)

    for number, scenario in enumerate(scenarios, 1):
        label = scenario['type'].replace('_', ' ').title()
        print(f"\nTest {number}: {label}")
        print(f"Q: {scenario['question']}")
        print(f"Context: {scenario['context']}")

        try:
            answer = basic_rag_query(scenario['question'])
            print(f"Generated Answer: {answer}")

            if scenario['type'] not in unanswerable_types:
                print("‚úÖ Should provide a factual answer")
            else:
                print("‚ö†Ô∏è  Should ideally respond with 'I don't have enough information'")

        except Exception as e:
            print(f"‚ùå Error: {e}")


# Run hallucination tests
test_hallucination_scenarios()

### 9.2 Chain-of-Thought Analysis

Deep dive into the reasoning process.

In [None]:
# --- Chain-of-Thought Analysis ---
def analyze_cot_detailed(question, context):
    """Ask GPT-4o for a structured, step-by-step analysis of a question.

    Args:
        question: The question to analyze.
        context: The text the analysis should be based on; it is inserted
            into the prompt as-is (no retrieval happens here).

    Returns:
        The content of the model's reply.
    """
    analysis_prompt = f"""Analyze this step by step:

Context: {context}
Question: {question}

Provide:
1. Key facts from context
2. Missing information (if any)
3. Logical connections
4. Final answer with reasoning"""

    analyst = ChatOpenAI(temperature=0, model_name="gpt-4o")
    return analyst.invoke(analysis_prompt).content

# Test CoT analysis on a complex question, using a hand-written context
# rather than retrieved passages
cot_test_question = "Why does Harry survive Voldemort's curse?"
cot_test_context = "Harry's mother's love protected him, creating a powerful magical shield. This ancient magic is stronger than any dark curse."

print(f"üîç Chain-of-Thought Analysis:")
print(f"Question: {cot_test_question}")
print(f"\nContext: {cot_test_context}")
print("\n" + "="*60)

# One LLM call; the reply is printed verbatim
detailed_analysis = analyze_cot_detailed(cot_test_question, cot_test_context)
print(detailed_analysis)

## 10. 📈 Results Analysis & Visualization

In [None]:
# --- Visualization of Ragas Results ---
import matplotlib.pyplot as plt
import numpy as np

def visualize_ragas_results(results_df):
    """Plot four Ragas metrics as bar charts and print summary statistics.

    Args:
        results_df: DataFrame with one row per evaluated question and the
            columns 'answer_correctness', 'faithfulness', 'answer_relevancy'
            and 'context_recall'.

    Shows a 2x2 grid of bar charts (one bar per question) and then prints
    ``results_df.describe()`` rounded to three decimals. Returns nothing.
    """
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle('RAGAS Evaluation Metrics', fontsize=16, fontweight='bold')

    question_positions = np.arange(len(results_df))

    # (column, panel title, bar colour), listed in row-major grid order.
    panels = (
        ('answer_correctness', 'Answer Correctness', 'skyblue'),
        ('faithfulness', 'Faithfulness', 'lightcoral'),
        ('answer_relevancy', 'Answer Relevancy', 'lightgreen'),
        ('context_recall', 'Context Recall', 'gold'),
    )

    for axis, (column, title, colour) in zip(axes.flat, panels):
        axis.bar(question_positions, results_df[column], color=colour, alpha=0.8)
        axis.set_title(title, fontweight='bold')
        axis.set_ylabel('Score (0-1)')
        axis.set_ylim(0, 1)  # same fixed scale on every panel
        axis.set_xlabel('Question Index')
        axis.set_xticks(question_positions)

    plt.tight_layout()
    plt.show()

    print("\nüìä Summary Statistics:")
    print(results_df.describe().round(3))

# Visualize the results, but only if the Ragas evaluation cell has already
# run in this session and defined results_df as a global
if 'results_df' in globals():
    visualize_ragas_results(results_df)
else:
    print("‚ùå Run the Ragas evaluation first to generate results!")

In [None]:
# --- Performance Comparison Table ---
def create_performance_summary():
    """Build a hand-written comparison table of the RAG approaches.

    The ratings are illustrative sample values, not measurements; they would
    normally come from systematic testing.

    Returns:
        A pandas DataFrame with one row per approach and the columns
        "Approach", "Answer Quality", "Speed", "Complexity" and "Best For".

    Raises:
        ImportError: If pandas is not installed.
    """
    # Nothing in this file imports pandas as `pd`, so the original body
    # failed with NameError. Import it here so the function is self-contained.
    import pandas as pd

    summary_data = {
        "Approach": [
            "Basic RAG (Chunks Only)",
            "Basic RAG (All Sources)",
            "RAG + Question Rewriting",
            "RAG + CoT Reasoning",
            "Plan-and-Execute Agent"
        ],
        "Answer Quality": ["‚≠ê‚≠ê", "‚≠ê‚≠ê‚≠ê", "‚≠ê‚≠ê‚≠ê", "‚≠ê‚≠ê‚≠ê‚≠ê", "‚≠ê‚≠ê‚≠ê‚≠ê"],
        "Speed": ["‚ö°‚ö°‚ö°‚ö°", "‚ö°‚ö°‚ö°", "‚ö°‚ö°", "‚ö°", "‚ö°"],
        "Complexity": ["Low", "Low", "Medium", "Medium", "High"],
        "Best For": [
            "Simple fact lookup",
            "General questions",
            "Poorly formulated questions",
            "Reasoning questions",
            "Multi-hop questions"
        ]
    }

    summary_df = pd.DataFrame(summary_data)
    print("üìà RAG Approaches Comparison:")
    print("="*80)
    return summary_df

# Catch only the failure the message describes. The previous bare `except:`
# also swallowed the NameError above and wrongly blamed a missing pandas
# install.
try:
    comparison_table = create_performance_summary()
    print(comparison_table.to_string(index=False))
except ImportError:
    print("üìä Performance comparison table requires pandas")
    print("Install with: pip install pandas")

## 11. 🎯 Key Takeaways & Experimentation Ideas

### 11.1 What Works Well
- ✅ **Multiple retrieval strategies** provide comprehensive coverage
- ✅ **Chapter summaries** help with high-level understanding and navigation
- ✅ **Book quotes** provide specific, verifiable evidence
- ✅ **Question rewriting** improves retrieval quality for ambiguous queries
- ✅ **Chain-of-thought reasoning** enhances answer quality for complex questions
- ✅ **Plan-and-execute agents** handle multi-hop reasoning effectively

### 11.2 Areas for Experimentation

In [None]:
# --- Experimentation Ideas ---
# Free-text list of follow-up experiments. It is only printed below; nothing
# in this cell parses or executes it.
experimentation_ideas = """
üî¨ FUTURE EXPERIMENTS:

1. CHUNK SIZE OPTIMIZATION
   - Test chunk sizes: [500, 1000, 1500, 2000, 2500]
   - Measure impact on retrieval quality and speed

2. EMBEDDING MODEL COMPARISON
   - Compare OpenAI embeddings vs. open-source alternatives
   - Test multilingual embeddings for non-English questions

3. RETRIEVAL PARAMETER TUNING
   - Vary k-values: [1, 3, 5, 10, 15]
   - Test different similarity thresholds

4. ADVANCED TECHNIQUES
   - Add reranking with cross-encoders
   - Implement hybrid search (semantic + keyword)
   - Try query expansion with generated synonyms
   - Add contextual compression for long contexts

5. MULTI-MODAL EXTENSIONS
   - Add movie stills/images to vector store
   - Include audio clips from audiobooks

6. SCALABILITY TESTS
   - Index all 7 Harry Potter books
   - Test with larger document collections
   - Measure latency vs. dataset size
"""

print(experimentation_ideas)

### 11.3 Quick Experiment Templates

In [None]:
# --- Quick Experiment: Chunk Size Comparison ---
def experiment_chunk_sizes():
    """Template for a chunk-size experiment (currently a simulation).

    The loop visits each candidate chunk size but does NOT rebuild the vector
    store: every iteration answers the same question with the existing
    ``basic_rag_query`` pipeline. To run a real experiment, build a store
    with ``encode_book(hp_pdf_path, chunk_size=...)``, create a retriever
    from it with ``as_retriever(search_kwargs={"k": 3})`` and answer with
    that retriever instead.

    Returns:
        Dict mapping each chunk size to the answer obtained in its iteration.
    """
    probe_question = "What is the function of a Remembrall?"
    answers_by_size = {}

    for chunk_size in (500, 1000, 1500, 2000):
        print(f"\nTesting chunk size: {chunk_size}")

        # Placeholder: chunk_size is not applied to retrieval yet.
        reply = basic_rag_query(probe_question)
        answers_by_size[chunk_size] = reply
        print(f"Answer: {reply[:100]}...")

    return answers_by_size

# Uncomment to run
# chunk_experiment_results = experiment_chunk_sizes()
# print("\nChunk Size Experiment Results:")
# for size, answer in chunk_experiment_results.items():
#     print(f"{size}: {answer}")

In [None]:
# --- Quick Experiment: Model Comparison ---
def experiment_different_llms():
    """Answer one fixed question with several LLMs and print each answer.

    Every model receives the same prompt, built from the book-chunk
    retriever's results for the question. Nothing is returned.
    """
    models = {
        "gpt-3.5-turbo": ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo"),
        "gpt-4o": ChatOpenAI(temperature=0, model_name="gpt-4o"),
        # "llama3-70b": ChatGroq(temperature=0, model_name="llama3-70b-8192", groq_api_key=groq_api_key)
    }

    question = "Explain the significance of Harry's scar"
    print(f"Question: {question}\n")

    # The question is the same for every model, so retrieve the context and
    # build the prompt once instead of repeating the retrieval per model.
    docs = chunks_query_retriever.get_relevant_documents(question)
    context = " ".join(doc.page_content for doc in docs)
    prompt = f"Context: {context}\n\nQuestion: {question}\n\nAnswer:"

    for model_name, llm in models.items():
        answer = llm.invoke(prompt).content

        print(f"--- {model_name.upper()} ---")
        print(f"{answer}\n")

# Uncomment to run model comparison
# experiment_different_llms()