# Phase 2: Policy Document Indexing for RAG

In [11]:
import pandas as pd
import os
import re, unicodedata

from PyPDF2 import PdfReader

## Text Extraction

In [12]:
def extract_text_from_pdfs(directory):
    pdf_texts = {}

    for filename in os.listdir(directory):
        if filename.endswith('.pdf'):
            filepath = os.path.join(directory, filename)

            with open(filepath, 'rb') as file:
                pdf_reader = PdfReader(file)

                page_texts = []
                for page in pdf_reader.pages:
                    page_texts.append(page.extract_text())

                pdf_texts[filename[:-4]] = page_texts

    max_pages = max(len(pages) for pages in pdf_texts.values())

    df = pd.DataFrame(columns=pdf_texts.keys(), index=[f'Page {i+1}' for i in range(max_pages)])

    for filename, pages in pdf_texts.items():
        for i, text in enumerate(pages):
            df.at[f'Page {i+1}', filename] = text

    return df

In [13]:
pdf_reader = extract_text_from_pdfs("assets/benefits")

## Text Preprocessing

In [14]:
def normalize_text(s):
    if s is None:
        return ''
    s = unicodedata.normalize('NFC',str(s)).strip()
    return re.sub(r"\s+", " ", s)


def chunk_text(text, chunk_size):
    text = normalize_text(text)
    if not text:
        return []

    words = text.split(" ")
    return [" ".join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]


def build_chunks(df_long,chunk_size, doc_col, page_col, text_col):
    rows = []

    for doc_id, group in df_long.groupby(doc_col):
        groupe_sorted = group.sort_values(page_col)
        full_text = " ".join([normalize_text(t) for t in groupe_sorted[text_col] if pd.notna(t) ])
        chunks = chunk_text(full_text,chunk_size)
        for i, ch in enumerate(chunks):
            rows.append({
                'doc_id' : doc_id,
                'chunk_id' : i,
                'n_words': len(ch.split(" ")),
                'text' : ch
            })
    return pd.DataFrame(rows, columns=['doc_id', 'chunk_id', 'n_words', 'text'])


In [15]:
df_long = pdf_reader.reset_index().melt(id_vars=['index'], var_name='document', value_name='text')
df_long = df_long.rename(columns={'index': 'page'})
df_long['page'] = df_long['page'].str.replace('Page','').astype(int)

df_long.head()

Unnamed: 0,page,document,text
0,1,401k-retirement-policy,TechLance Retirement Plan (401k) Policy\nIntro...
1,2,401k-retirement-policy,TechLance’s matching formula is designed to re...
2,3,401k-retirement-policy,the contribution amounts. You can opt out of a...
3,4,401k-retirement-policy,Investment performance and expense ratios are ...
4,5,401k-retirement-policy,The 401(k) plan allows loans for participants ...


In [16]:
df_chunks = build_chunks(df_long, 200, 'document', 'page', 'text')
df_chunks.head(15)

Unnamed: 0,doc_id,chunk_id,n_words,text
0,401k-retirement-policy,0,200,TechLance Retirement Plan (401k) Policy Introd...
1,401k-retirement-policy,1,200,can contribute between 1% and 100% of their sa...
2,401k-retirement-policy,2,200,"always 100% vested in your own contributions, ..."
3,401k-retirement-policy,3,200,of automatic escalation or adjust the increase...
4,401k-retirement-policy,4,200,"and small-cap funds, international developed a..."
5,401k-retirement-policy,5,200,"beneﬁt, but qualiﬁed withdrawals in retirement..."
6,401k-retirement-policy,6,200,impact your long-term retirement savings. You ...
7,401k-retirement-policy,7,200,"withdrawal penalties, and you’re suspended fro..."
8,401k-retirement-policy,8,200,roll it over to a new employer’s plan or indiv...
9,401k-retirement-policy,9,200,"accounts, and taxable investment accounts. Cre..."


In [17]:
# Check number of chunk per document 
print(df_chunks.groupby('doc_id').size())

doc_id
401k-retirement-policy          13
childcare-policy                11
gym-policy                      11
health-insurance-policy         11
life-insurance-policy           11
tuition-reimbursement-policy    11
vacation-policy                 10
work-from-home-policy           11
dtype: int64


In [18]:
# Save Dataframe into csv file 
df_chunks.to_csv('assets/benefits/policy_chunks.csv', index=False)

## Vector Store Creation

In [19]:
import getpass

from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from dotenv import load_dotenv

load_dotenv("env.txt")

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

In [None]:
from langchain_community.vectorstores import FAISS

def build_faiss_store(chunks, metadatas):
    embeddings = OpenAIEmbeddings(model="text-embedding-4-large")
    
    documents = [Document(page_content=chunk, metadata=meta) for chunk, meta in zip(chunks, metadatas)]
    
    vectorstore = FAISS.from_documents(
        documents=documents,
        embedding=embeddings
    )
    
    vectorstore.save_local("./faiss_db")
    
    return vectorstore, embeddings

def load_faiss_store():
    embeddings = OpenAIEmbeddings(model="text-embedding-4-large")
    vectorstore = FAISS.load_local("./faiss_db", embeddings, allow_dangerous_deserialization=True)
    
    return vectorstore, embeddings

## Retrieval Testing

In [None]:
queries = ["What is the eligibility for Tuition Reimbursement?", 
           "What is the eligibility for Tuition Reimbursement?",
           "What is the eligibility for Tuition Reimbursement?",
           "What is the eligibility for Tuition Reimbursement?",
           "What is the eligibility for Tuition Reimbursement?"]

for query in queries:
    vectorstore.similarity_search(query)

## Advanced RAG Methods

In [None]:
Metadata filtering, Content-based filtering, Query expansion, HyDE, Reranking, Hybrid search, Context Distillation, Multi-hop question answering

In [None]:
import numpy as np
from langchain.retrievers import ParentDocumentRetriever
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

# 1. Metadata Filtering RAG
class MetadataFilteringRAG:
    def __init__(self):
        self.embeddings = OpenAIEmbeddings()
        self.vectorstore = Chroma(embedding_function=self.embeddings)
        
    def add_documents(self, documents, metadata):
        # Add documents with metadata to vector store
        self.vectorstore.add_texts(documents, metadatas=metadata)
        
    def query(self, query, metadata_filter):
        # Retrieve documents matching metadata filter
        docs = self.vectorstore.similarity_search(
            query,
            filter=metadata_filter
        )
        return docs

# 2. Query Expansion RAG 
class QueryExpansionRAG:
    def __init__(self):
        self.llm = OpenAI()
        self.embeddings = OpenAIEmbeddings()
        self.vectorstore = Chroma(embedding_function=self.embeddings)
        
        # Prompt for query expansion
        expansion_template = """Generate 3 different versions of the following query 
        that capture the same meaning but use different words:
        Query: {query}
        
        Different versions:"""
        
        self.expansion_prompt = PromptTemplate(
            input_variables=["query"],
            template=expansion_template
        )
        self.expansion_chain = LLMChain(llm=self.llm, prompt=self.expansion_prompt)
        
    def expand_query(self, query):
        # Generate variations of the query
        expanded = self.expansion_chain.run(query)
        expanded_queries = [query] + expanded.strip().split("\n")
        return expanded_queries
    
    def query(self, query):
        # Get expanded queries
        expanded_queries = self.expand_query(query)
        
        # Search with all query versions and combine results
        all_docs = []
        for q in expanded_queries:
            docs = self.vectorstore.similarity_search(q)
            all_docs.extend(docs)
            
        # Remove duplicates
        seen = set()
        unique_docs = []
        for doc in all_docs:
            if doc.page_content not in seen:
                seen.add(doc.page_content)
                unique_docs.append(doc)
                
        return unique_docs

# 3. Hypothetical Document Embeddings (HyDE) RAG
class HyDERAG:
    def __init__(self):
        self.llm = OpenAI()
        self.embeddings = OpenAIEmbeddings()
        self.vectorstore = Chroma(embedding_function=self.embeddings)
        
        # Prompt to generate hypothetical document
        hyde_template = """Given a question, write a hypothetical passage 
        that would contain the answer to the question.
        
        Question: {query}
        
        Hypothetical passage:"""
        
        self.hyde_prompt = PromptTemplate(
            input_variables=["query"],
            template=hyde_template
        )
        self.hyde_chain = LLMChain(llm=self.llm, prompt=self.hyde_prompt)
        
    def generate_hypothetical_doc(self, query):
        # Generate hypothetical document that would answer query
        return self.hyde_chain.run(query)
    
    def query(self, query):
        # Generate hypothetical document
        hypothetical_doc = self.generate_hypothetical_doc(query)
        
        # Use hypothetical doc embedding to find similar real docs
        docs = self.vectorstore.similarity_search(hypothetical_doc)
        return docs

# Example usage and testing
def test_rag_methods():
    # Test documents
    documents = [
        "The sky is blue because of Rayleigh scattering.",
        "Photosynthesis is how plants convert sunlight to energy.",
        "The theory of relativity was proposed by Einstein."
    ]
    
    metadata = [
        {"topic": "physics", "difficulty": "basic"},
        {"topic": "biology", "difficulty": "intermediate"}, 
        {"topic": "physics", "difficulty": "advanced"}
    ]
    
    # Test metadata filtering
    meta_rag = MetadataFilteringRAG()
    meta_rag.add_documents(documents, metadata)
    physics_docs = meta_rag.query(
        "What physics concepts?", 
        {"topic": "physics"}
    )
    
    # Test query expansion
    expand_rag = QueryExpansionRAG()
    expand_rag.vectorstore.add_texts(documents)
    expanded_results = expand_rag.query("How do plants make food?")
    
    # Test HyDE
    hyde_rag = HyDERAG()
    hyde_rag.vectorstore.add_texts(documents)
    hyde_results = hyde_rag.query("What causes the sky's color?")
    
    return physics_docs, expanded_results, hyde_results

if __name__ == "__main__":
    physics_docs, expanded_results, hyde_results = test_rag_methods()