# Phase 2: Policy Document Indexing for RAG

In [4]:
import pandas as pd
import numpy as np
import os
import re, unicodedata

from PyPDF2 import PdfReader
from collections import defaultdict
from rank_bm25 import BM25Okapi

## Text Extraction

In [5]:
def extract_text_from_pdfs(directory):
    pdf_texts = {}

    for filename in os.listdir(directory):
        if filename.endswith('.pdf'):
            filepath = os.path.join(directory, filename)

            with open(filepath, 'rb') as file:
                pdf_reader = PdfReader(file)

                page_texts = []
                for page in pdf_reader.pages:
                    page_texts.append(page.extract_text())

                pdf_texts[filename[:-4]] = page_texts

    max_pages = max(len(pages) for pages in pdf_texts.values())

    df = pd.DataFrame(columns=pdf_texts.keys(), index=[f'Page {i+1}' for i in range(max_pages)])

    for filename, pages in pdf_texts.items():
        for i, text in enumerate(pages):
            df.at[f'Page {i+1}', filename] = text

    return df

In [6]:
pdf_reader = extract_text_from_pdfs("assets/benefits")

## Text Preprocessing

In [7]:
def normalize_text(s):
    if s is None:
        return ''
    s = unicodedata.normalize('NFC',str(s)).strip()
    return re.sub(r"\s+", " ", s)


def chunk_text(text, chunk_size):
    text = normalize_text(text)
    if not text:
        return []

    words = text.split(" ")
    return [" ".join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]


def build_chunks(df_long,chunk_size, doc_col, page_col, text_col):
    rows = []

    for doc_id, group in df_long.groupby(doc_col):
        groupe_sorted = group.sort_values(page_col)
        full_text = " ".join([normalize_text(t) for t in groupe_sorted[text_col] if pd.notna(t) ])
        chunks = chunk_text(full_text,chunk_size)
        for i, ch in enumerate(chunks):
            rows.append({
                'doc_id' : doc_id,
                'chunk_id' : i,
                'n_words': len(ch.split(" ")),
                'text' : ch
            })
    return pd.DataFrame(rows, columns=['doc_id', 'chunk_id', 'n_words', 'text'])


In [8]:
df_long = pdf_reader.reset_index().melt(id_vars=['index'], var_name='document', value_name='text')
df_long = df_long.rename(columns={'index': 'page'})
df_long['page'] = df_long['page'].str.replace('Page','').astype(int)

df_long.head()

Unnamed: 0,page,document,text
0,1,401k-retirement-policy,TechLance Retirement Plan (401k) Policy\nIntro...
1,2,401k-retirement-policy,TechLance’s matching formula is designed to re...
2,3,401k-retirement-policy,the contribution amounts. You can opt out of a...
3,4,401k-retirement-policy,Investment performance and expense ratios are ...
4,5,401k-retirement-policy,The 401(k) plan allows loans for participants ...


In [9]:
df_chunks = build_chunks(df_long, 200, 'document', 'page', 'text')
df_chunks.head(15)

Unnamed: 0,doc_id,chunk_id,n_words,text
0,401k-retirement-policy,0,200,TechLance Retirement Plan (401k) Policy Introd...
1,401k-retirement-policy,1,200,can contribute between 1% and 100% of their sa...
2,401k-retirement-policy,2,200,"always 100% vested in your own contributions, ..."
3,401k-retirement-policy,3,200,of automatic escalation or adjust the increase...
4,401k-retirement-policy,4,200,"and small-cap funds, international developed a..."
5,401k-retirement-policy,5,200,"beneﬁt, but qualiﬁed withdrawals in retirement..."
6,401k-retirement-policy,6,200,impact your long-term retirement savings. You ...
7,401k-retirement-policy,7,200,"withdrawal penalties, and you’re suspended fro..."
8,401k-retirement-policy,8,200,roll it over to a new employer’s plan or indiv...
9,401k-retirement-policy,9,200,"accounts, and taxable investment accounts. Cre..."


In [10]:
# Check number of chunk per document 
print(df_chunks.groupby('doc_id').size())

doc_id
401k-retirement-policy          13
childcare-policy                11
gym-policy                      11
health-insurance-policy         11
life-insurance-policy           11
tuition-reimbursement-policy    11
vacation-policy                 10
work-from-home-policy           11
dtype: int64


In [11]:
# Save Dataframe into csv file 
df_chunks.to_csv('assets/benefits/policy_chunks.csv', index=False)

## Vector Store Creation

In [12]:
import getpass

from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from dotenv import load_dotenv

load_dotenv("env.txt")

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

In [13]:
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document

def build_faiss_store(chunks):
    embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

    documents = []
    for _, row in chunks.iterrows():
        doc = Document(
            page_content=row['text'],
            metadata={
                'doc_id': row['doc_id'],
                'chunk_id': row['chunk_id'],
                'n_words': row['n_words']
            }
        )
        documents.append(doc)

    vectorstore = FAISS.from_documents(
        documents=documents,
        embedding=embeddings
    )

    vectorstore.save_local("assets/faiss_db")

    return vectorstore, embeddings

vectorstore, embeddings = build_faiss_store(df_chunks)

  embeddings = OpenAIEmbeddings(model="text-embedding-3-large")


## Retrieval Testing

In [14]:
queries = [
    "What is the eligibility for Tuition Reimbursement?",
    "How many vacation days do employees get?",
    "What gym membership discounts can we expect?",
    "What are the work from home eligibity criteria?",
    "What health insurance options are available?"
]

for query in queries:
    print(f"\nQuery: {query}")
    print("-" * 50)
    docs = vectorstore.similarity_search(query, k=3)
    
    for i, doc in enumerate(docs, 1):
        print(f"   Document {i}: {doc.metadata['doc_id']}, Chunk: {doc.metadata['chunk_id']}")
        print(f"   Content: {doc.page_content[:200]}...\n")


Query: What is the eligibility for Tuition Reimbursement?
--------------------------------------------------
   Document 1: tuition-reimbursement-policy, Chunk: 1
   Content: to continued employment that helps both individual career development and organizational stability. Eligibility Requirements and Performance Standards To be eligible for tuition reimbursement, employe...

   Document 2: tuition-reimbursement-policy, Chunk: 8
   Content: and institutions that have been evaluated for quality and relevance, though employees can request consideration of other programs through the application process. HR can help you determine whether a s...

   Document 3: tuition-reimbursement-policy, Chunk: 9
   Content: multiple calendar years to maximize beneﬁts. Do I have to pay tuition upfront and wait for reimbursement? Yes, employees typically pay all educational expenses upfront and submit for reimbursement aft...


Query: How many vacation days do employees get?
----------------------------

### Individual Chunk Ratings (1-5 scale where 5 is highly relevant)

##### **Query 1: Tuition Reimbursement Eligibility**
- **Chunk 1**: **5/5** - Directly mentions "Eligibility Requirements"
- **Chunk 2**: **3/5** - Related but about program evaluation, not eligibility
- **Chunk 3**: **2/5** - About payment process, not eligibility criteria

##### **Query 2: Vacation Days**
- **Chunk 1**: **5/5** - Specific numbers: 25 days for 6+ years, 200 hours
- **Chunk 2**: **2/5** - About floating holidays, not vacation days
- **Chunk 3**: **1/5** - Generic introduction, no actual day counts

##### **Query 3: Gym Membership Discounts**
- **Chunk 1**: **5/5** - Exact answer: 50% off Tier 1, names specific gyms
- **Chunk 2**: **3/5** - About information sessions, indirectly relevant
- **Chunk 3**: **4/5** - Good info about family eligibility for discounts

##### **Query 4: Work From Home Eligibility**
- **Chunk 1**: **5/5** - Lists specific eligible roles (developers, analysts, etc.)
- **Chunk 2**: **1/5** - Generic introduction, no eligibility criteria
- **Chunk 3**: **3/5** - Technical requirements, part of eligibility

##### **Query 5: Health Insurance Options**
- **Chunk 1**: **2/5** - About enrollment timing, not plan options
- **Chunk 2**: **4/5** - Describes PPO plan features and flexibility
- **Chunk 3**: **4/5** - Mentions HDHP option and premium costs

#### **Average chunk relevance: 3.2/5** — There is room for improvement with better retrieval techniques.


## Advanced RAG Methods

### Metadata Filtering

In [15]:
from collections import defaultdict
from rank_bm25 import BM25Okapi



### Query Expansion

In [16]:
from haystack import Document as HaystackDocument
from haystack.document_stores.in_memory import InMemoryDocumentStore
from typing import List, Optional
from haystack import Pipeline, component
from haystack.components.builders import ChatPromptBuilder
from haystack.components.generators.chat import OpenAIChatGenerator
from haystack.components.retrievers import InMemoryBM25Retriever
from haystack.dataclasses import ChatMessage, Document
from haystack_experimental.components.query.query_expander import QueryExpander

haystack_documents = []
for _, row in df_chunks.iterrows():
    doc = HaystackDocument(
        content=row['text'],
        meta={
            'doc_id': row['doc_id'],
            'chunk_id': row['chunk_id'],
            'n_words': row['n_words']
        }
    )
    haystack_documents.append(doc)

doc_store = InMemoryDocumentStore()
doc_store.write_documents(haystack_documents)

@component
class MultiQueryInMemoryBM25Retriever:
    def __init__(self, retriever: InMemoryBM25Retriever, top_k: int = 3):
        self.retriever = retriever
        self.top_k = top_k

    @component.output_types(documents=List[Document])
    def run(self, queries: List[str], top_k: int = None):
        if top_k != None:
            self.top_k = top_k

        all_docs = {}
        for query in queries:
            result = self.retriever.run(query = query, top_k = self.top_k)
            for doc in result['documents']:
                all_docs[doc.id] = doc

        all_docs = list(all_docs.values())
        all_docs.sort(key=lambda x: x.score, reverse=True)
        return {"documents": all_docs}

chat_message = ChatMessage.from_user(
    text="""You are part of an information system that summarises related documents.
            You answer a query using the textual content from the documents retrieved for the
            following query.
            You build the summary answer based only on quoting information from the documents.
            You should reference the documents you used to support your answer.
            ###
            Original Query: "{{query}}"
            Retrieved Documents: {{documents}}
            Summary Answer:
            """
    )

query_expander = QueryExpander()
retriever = MultiQueryInMemoryBM25Retriever(InMemoryBM25Retriever(document_store=doc_store))
chat_prompt_builder = ChatPromptBuilder(template=[chat_message], required_variables="*")
llm = OpenAIChatGenerator()

query_expanded_rag_pipeline = Pipeline()
query_expanded_rag_pipeline.add_component("expander", query_expander)
query_expanded_rag_pipeline.add_component("keyword_retriever", retriever)
query_expanded_rag_pipeline.add_component("chat_prompt_builder", chat_prompt_builder)
query_expanded_rag_pipeline.add_component("llm", llm)

query_expanded_rag_pipeline.connect("expander.queries", "keyword_retriever.queries")
query_expanded_rag_pipeline.connect("keyword_retriever.documents", "chat_prompt_builder.documents")
query_expanded_rag_pipeline.connect("chat_prompt_builder.prompt", "llm.messages")

def get_extended_queries(query):
    expander = query_expanded_rag_pipeline.run({"query": query, "top_k": 3}, include_outputs_from=["keyword_retriever", "expander"])
    return expander['expander']['queries']

In [17]:
# EXAMPLE

get_extended_queries(queries[0])

['tuition reimbursement eligibility criteria',
 'who qualifies for tuition reimbursement',
 'requirements for tuition reimbursement',
 'tuition reimbursement qualification guidelines',
 'What is the eligibility for Tuition Reimbursement?']

### Hybrid Search (BM25 + Dense Vector)

In [18]:
# Tokenize text for BM25 processing(lowercase, ignore punctuation)
def tokenize_text(text):
    return re.findall(r'[A-Za-z0-9\-]+', text.lower())

# Reciprocal Rank Fusion (RFF)
def rrf_fuse(rankings, k):
    fused = defaultdict(float)
    for ranking in rankings:
        for rank, (rid, _score) in enumerate(ranking, start=1):
            fused[rid] += 1.0 / (k + rank)
    return sorted(fused.items(), key=lambda x: x[1], reverse=True)

def split_id(rid):
    d,c = rid.split('::')
    return d,c

In [19]:
# ----- Build BM25 index for keyword-based search-----

# 1. Exctract text content and create unique IDs for each chunk
bm25_texts = df_chunks['text'].tolist()
bm25_ids = [f'{d}::{c}' for d,c in zip(df_chunks['doc_id'], df_chunks['chunk_id'])]

#2. Tokenize all text chunks for BM25 processing
bm25_text_tokenize = [tokenize_text(t) for t in bm25_texts]

#3. Build index for BM25 
bm25_index = BM25Okapi(bm25_text_tokenize)

#4. Create mapping from chunks IDs to original text 
id_to_text = dict(zip(bm25_ids, bm25_texts))

In [20]:
# ----- Dense Search with FAISS (Sementic similarity)-----
def dense_search(query,k):
    # Perform sementic search using vector embeddings
    results = []
    docs = vectorstore.similarity_search(query,k)
    for rank, doc in enumerate(docs, start=1): 
        rid = f"{doc.metadata['doc_id']}::{doc.metadata['chunk_id']}"
        results.append((rid, 1.0/rank))
    return results

# ----- BM25 Search (keyword-based)-----
def bm25_search(query,k):
    # Perfom keyword-based search using BM25 algorithm
    q_tokens = tokenize_text(query)
    scores = bm25_index.get_scores(q_tokens)
    top_idx = np.argsort(scores)[::-1][:k]
    return [(bm25_ids[i], float(scores[i])) for i in top_idx]


In [21]:
# ----- Hybrid Search combining Dense + RM25 with RRF Fusion -----

def hybrid_search(query, k_dense, k_bm25, top_k_final):
    dense_results = dense_search(query,k_dense)
    bm25_results = bm25_search(query,k_bm25)

    fused = rrf_fuse([dense_results, bm25_results], 60)

    return fused[:top_k_final]

In [22]:
def show_results(result):
    for rank, (rid, score) in enumerate(results, start=1):
        doc_id, chunk_id = split_id(rid)
        text = id_to_text[rid]

        print(f"#{rank} - {doc_id} (chunk {chunk_id}) - RRF score = {score:.4f}")
        print("    ", text, "    \n")

In [23]:
# EXAMPLE

for q in queries: 
    print(f"\nQuery: {q}")
    print("-" * 50)   
    results = hybrid_search(q,50,50,5)
    show_results(results)


Query: What is the eligibility for Tuition Reimbursement?
--------------------------------------------------
#1 - tuition-reimbursement-policy (chunk 9) - RRF score = 0.0323
     multiple calendar years to maximize beneﬁts. Do I have to pay tuition upfront and wait for reimbursement? Yes, employees typically pay all educational expenses upfront and submit for reimbursement after successful course completion with appropriate documentation. What if my course costs exceed the annual maximum? You’re responsible for any costs above the annual maximum limits. Consider planning your educational timeline to stay within annual limits or spreading courses across multiple years. Can I take courses that aren’t directly related to my current job? Courses should relate to your current role or reasonable career progression within TechLance. Liberal arts or unrelated courses require special approval with strong business justiﬁcation. What happens to my service commitment if I get promoted? Service co

## TESTING UNDERNEATH

In [84]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate

extended_queries = []

def expand_query(original_query):
    llm = ChatOpenAI(temperature=0.3)

    expansion_prompt = PromptTemplate(
        input_variables=["query"],
        template="Expand this query with synonyms and related terms: {query}\nExpanded query:"
    )

    prompt = expansion_prompt.format(query=original_query)
    expanded = llm.invoke(prompt)

    return expanded.content

# Examples
for q in queries:
    extended_queries.append(expand_query(q))

extended_queries[0]

'What are the requirements for Tuition Reimbursement? What are the qualifications for Tuition Reimbursement? What are the criteria for Tuition Reimbursement? What are the guidelines for Tuition Reimbursement? What are the conditions for Tuition Reimbursement? What are the prerequisites for Tuition Reimbursement?'

In [56]:
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(temperature=0)
retriever = MultiQueryRetriever.from_llm(
    retriever=vectorstore.as_retriever(), 
    llm=llm
)

retriever.invoke("What is the vacation policy?")

[Document(id='18552270-689b-47fc-8609-77fe012aa8d7', metadata={'doc_id': 'vacation-policy', 'chunk_id': 5, 'n_words': 200}, page_content='or payroll deduction if employment ends before the time is earned. We also provide ﬂoating holidays speciﬁcally for religious and cultural observances that may not align with our standard company holidays. Each employee receives two ﬂoating holidays annually, which operate similarly to vacation days but are speciﬁcally intended for religious or cultural observances. If additional time is needed for religious holidays, employees may use vacation time or arrange ﬂexible scheduling with their manager. For employees who exhaust their vacation time but need additional time oﬀ, unpaid personal leave may be available with manager approval for up to 30 days. We also oﬀer sabbatical opportunities for employees with ﬁve or more years of service who wish to pursue personal or professional development opportunities that require extended time away from work. Best

In [53]:
from typing import List, Optional
from haystack import Pipeline, component

from haystack.components.builders import ChatPromptBuilder
from haystack.components.generators.chat import OpenAIChatGenerator
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.retrievers import InMemoryBM25Retriever
from haystack.components.writers import DocumentWriter

from haystack.dataclasses import ChatMessage, Document

from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.document_stores.types import DuplicatePolicy

from haystack_experimental.components.query.query_expander import QueryExpander

In [52]:
@component
class MultiQueryInMemoryBM25Retriever:

    def __init__(self, retriever: InMemoryBM25Retriever, top_k: int = 3):
        self.retriever = retriever
        self.top_k = top_k

    @component.output_types(documents=List[Document])
    def run(self, queries: List[str], top_k: int = None):
        if top_k != None:
            self.top_k = top_k

        all_docs = {}
        for query in queries:
            result = self.retriever.run(query = query, top_k = self.top_k)
            for doc in result['documents']:
                all_docs[doc.id] = doc

        all_docs = list(all_docs.values())
        all_docs.sort(key=lambda x: x.score, reverse=True)
        return {"documents": all_docs}

In [54]:
query_expander = QueryExpander()
retriever = MultiQueryInMemoryBM25Retriever(InMemoryBM25Retriever(document_store=doc_store))
chat_prompt_builder = ChatPromptBuilder(template=[chat_message], required_variables="*")
llm = OpenAIChatGenerator()

chat_message = ChatMessage.from_user(
    text="""You are part of an information system that summarises related documents. 
            You answer a query using the textual content from the documents retrieved for the following query. 
            You build the summary answer based only on quoting information from the documents.
            You should reference the documents you used to support your answer.
            ###
            Original Query: "{{query}}"
            Retrieved Documents: {{documents}}
            Summary Answer:
            """
    )

query_expander = QueryExpander()
retriever = MultiQueryInMemoryBM25Retriever(InMemoryBM25Retriever(document_store=doc_store))
chat_prompt_builder = ChatPromptBuilder(template=[chat_message], required_variables="*")
llm = OpenAIChatGenerator()

query_expanded_rag_pipeline = Pipeline()
query_expanded_rag_pipeline.add_component("expander", query_expander)
query_expanded_rag_pipeline.add_component("keyword_retriever", retriever)
query_expanded_rag_pipeline.add_component("chat_prompt_builder", chat_prompt_builder)
query_expanded_rag_pipeline.add_component("llm", llm)

query_expanded_rag_pipeline.connect("expander.queries", "keyword_retriever.queries")
query_expanded_rag_pipeline.connect("keyword_retriever.documents", "chat_prompt_builder.documents")
query_expanded_rag_pipeline.connect("chat_prompt_builder.prompt", "llm.messages")

NameError: name 'doc_store' is not defined

In [None]:
import numpy as np
from langchain.retrievers import ParentDocumentRetriever
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

# 1. Metadata Filtering RAG
class MetadataFilteringRAG:
    def __init__(self):
        self.embeddings = OpenAIEmbeddings()
        self.vectorstore = Chroma(embedding_function=self.embeddings)
        
    def add_documents(self, documents, metadata):
        # Add documents with metadata to vector store
        self.vectorstore.add_texts(documents, metadatas=metadata)
        
    def query(self, query, metadata_filter):
        # Retrieve documents matching metadata filter
        docs = self.vectorstore.similarity_search(
            query,
            filter=metadata_filter
        )
        return docs

# 2. Query Expansion RAG 
class QueryExpansionRAG:
    def __init__(self):
        self.llm = OpenAI()
        self.embeddings = OpenAIEmbeddings()
        self.vectorstore = Chroma(embedding_function=self.embeddings)
        
        # Prompt for query expansion
        expansion_template = """Generate 3 different versions of the following query 
        that capture the same meaning but use different words:
        Query: {query}
        
        Different versions:"""
        
        self.expansion_prompt = PromptTemplate(
            input_variables=["query"],
            template=expansion_template
        )
        self.expansion_chain = LLMChain(llm=self.llm, prompt=self.expansion_prompt)
        
    def expand_query(self, query):
        # Generate variations of the query
        expanded = self.expansion_chain.run(query)
        expanded_queries = [query] + expanded.strip().split("\n")
        return expanded_queries
    
    def query(self, query):
        # Get expanded queries
        expanded_queries = self.expand_query(query)
        
        # Search with all query versions and combine results
        all_docs = []
        for q in expanded_queries:
            docs = self.vectorstore.similarity_search(q)
            all_docs.extend(docs)
            
        # Remove duplicates
        seen = set()
        unique_docs = []
        for doc in all_docs:
            if doc.page_content not in seen:
                seen.add(doc.page_content)
                unique_docs.append(doc)
                
        return unique_docs

# 3. Hypothetical Document Embeddings (HyDE) RAG
class HyDERAG:
    def __init__(self):
        self.llm = OpenAI()
        self.embeddings = OpenAIEmbeddings()
        self.vectorstore = Chroma(embedding_function=self.embeddings)
        
        # Prompt to generate hypothetical document
        hyde_template = """Given a question, write a hypothetical passage 
        that would contain the answer to the question.
        
        Question: {query}
        
        Hypothetical passage:"""
        
        self.hyde_prompt = PromptTemplate(
            input_variables=["query"],
            template=hyde_template
        )
        self.hyde_chain = LLMChain(llm=self.llm, prompt=self.hyde_prompt)
        
    def generate_hypothetical_doc(self, query):
        # Generate hypothetical document that would answer query
        return self.hyde_chain.run(query)
    
    def query(self, query):
        # Generate hypothetical document
        hypothetical_doc = self.generate_hypothetical_doc(query)
        
        # Use hypothetical doc embedding to find similar real docs
        docs = self.vectorstore.similarity_search(hypothetical_doc)
        return docs

# Example usage and testing
def test_rag_methods():
    # Test documents
    documents = [
        "The sky is blue because of Rayleigh scattering.",
        "Photosynthesis is how plants convert sunlight to energy.",
        "The theory of relativity was proposed by Einstein."
    ]
    
    metadata = [
        {"topic": "physics", "difficulty": "basic"},
        {"topic": "biology", "difficulty": "intermediate"}, 
        {"topic": "physics", "difficulty": "advanced"}
    ]
    
    # Test metadata filtering
    meta_rag = MetadataFilteringRAG()
    meta_rag.add_documents(documents, metadata)
    physics_docs = meta_rag.query(
        "What physics concepts?", 
        {"topic": "physics"}
    )
    
    # Test query expansion
    expand_rag = QueryExpansionRAG()
    expand_rag.vectorstore.add_texts(documents)
    expanded_results = expand_rag.query("How do plants make food?")
    
    # Test HyDE
    hyde_rag = HyDERAG()
    hyde_rag.vectorstore.add_texts(documents)
    hyde_results = hyde_rag.query("What causes the sky's color?")
    
    return physics_docs, expanded_results, hyde_results

if __name__ == "__main__":
    physics_docs, expanded_results, hyde_results = test_rag_methods()