# Library Imports

In [1]:

from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, StorageContext
from llama_index.core.node_parser import SentenceWindowNodeParser
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.embeddings.mistralai import MistralAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.core.schema import TextNode
from llama_index.core.settings import Settings
from llama_index.core.schema import Document
from llama_index.core.chat_engine.types import ChatMessage
from llama_index.core.base.llms.types import MessageRole
from collections import defaultdict
from datetime import datetime
import re
import os
import json
import random
from dotenv import load_dotenv
import chromadb
import time
from rank_bm25 import BM25Okapi
import numpy as np

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize






[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:997)>


## Setup: Load the Environment, Initialize LLM, Embedding, and Vector Store

We sets up the environment and core components for the RAG pipeline. It loads environment variables (such as the Mistral API key), initializes the GPT-4o-mini model from OpenAI for downstream tasks, and configures a persistent ChromaDB vector store to manage document embeddings. The embedding model used is from MistralAI, and batch processing parameters like `CHUNK_SIZE`, `BATCH_SIZE`, and `TOP_K` are also defined here as global paraemeters, to control retrieval behavior and performance during document indexing and querying.

In [2]:
# Load environment variables from .env file
load_dotenv()

# Get API keys from environment variables
mistral_api_key = os.getenv('MISTRAL_API_KEY')
print(f"Mistral API key loaded: {'Yes' if mistral_api_key else 'No'}")
llm = OpenAI(model="gpt-4o-mini", temperature=0.1)
persist_dir = "../index/vector_chunk_size_4"
os.makedirs(persist_dir, exist_ok=True)

# Initialize Chroma client and collection
chroma_client = chromadb.Client()
chroma_collection = chroma_client.create_collection("congressional_docs")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Initialize embedding model
embed_model = MistralAIEmbedding(api_key=mistral_api_key)

# Set batch processing parameters
BATCH_SIZE = 10
DELAY = 1  # seconds between batches
index = None
CHUNK_SIZE = 10 # or whatever default you want
TOP_K = 3 

Mistral API key loaded: Yes


## Load and Clean Data

We load the data, which is in PDF format, using 'SimpleDirectoryReader', which automatically selects the best file reader based on the file extensions. 

After loading, we proceed with cleaning the data by removing any sequence of whitespace characters, such as excessive spaces, tabs, or line breaks, and replacing them with a single white space. This step helps ensure consistent formatting before further processing.

In [3]:
def clean_text(text):
    """Clean text by normalizing whitespace."""
    return re.sub(r'\s+', ' ', text).strip()

def load_and_clean_documents(path):
    """Load and clean PDF documents from the specified path, grouping by file."""
    raw_docs = SimpleDirectoryReader(
        input_dir=path,
        required_exts=[".pdf"],
        filename_as_id=True
    ).load_data()

    # Group texts by file 
    grouped = defaultdict(list)
    for doc in raw_docs:
        file_key = doc.metadata.get('file_name', doc.doc_id)
        grouped[file_key].append(doc.text)

    cleaned_docs = [
        Document(text=clean_text(" ".join(texts)), metadata={"file_name": file_key}, doc_id=file_key)
        for file_key, texts in grouped.items()
    ]
    return cleaned_docs

# Prints use for debugging to show the three documents created
documents = load_and_clean_documents("/Users/alexnikoloudis/Desktop/research project/development/document")
print(f"Loaded and cleaned {len(documents)} documents")
for i, doc in enumerate(documents):
    print(f"\nDocument {i+1}:")
    print(doc.text[:500])
    print("-" * 80)

Ignoring wrong pointing object 126 0 (offset 0)
Ignoring wrong pointing object 79 0 (offset 0)
Ignoring wrong pointing object 89 0 (offset 0)


Loaded and cleaned 3 documents

Document 1:
--------------------------------------------------------------------------------

Document 2:
--------------------------------------------------------------------------------

Document 3:
--------------------------------------------------------------------------------


##  BM25 Indexing and Chunk Validation

This script processes a collection of cleaned document objects by:

1. **Chunking** each document into overlapping windows of sentences  
   - Window sizes: 4, 5, or 6 sentences per chunk  
   - Overlap: 1 sentence between chunks

2. **Tokenizing** each chunk into lowercase words

3. **Indexing** the tokenized chunks using the **BM25 algorithm**  
   - This supports lexical information retrieval based on keyword matching

The script also prints out:
- A sample chunk for each window size
- The number of sentences in each sample

This helps validate the correctness and behavior of the chunking process.

In [4]:
#custom splitter for the documents, to split the documents into sentences that will be used for the chunking
def split_into_sentences(text):
    """Split text into sentences using punctuation."""
    return re.split(r'(?<=[.!?]) +', text)

def chunk_documents_manual(documents, window_size):
    nodes = []
    for doc in documents:
        sentences = split_into_sentences(doc.text)
        for i in range(0, len(sentences), window_size):
            group = sentences[i : i + window_size]
            if not group:
                continue
            chunk_text = " ".join(group).strip()
            if chunk_text:
                nodes.append(TextNode(text=chunk_text, metadata=doc.metadata))
    return nodes

# Apply for multiple window sizes
for size in [4,5,6, 8]:
    print(f"\n--- Window size = {size} ---")
    nodes = chunk_documents_manual(documents, window_size=size)
    
    # Print the first node and its sentence count
    sample_text = nodes[0].text
    sent_count = len(split_into_sentences(sample_text))
    print("\nSample chunk (first node):")
    print(sample_text)
    print("Sentence count in this node:", sent_count)


--- Window size = 4 ---

Sample chunk (first node):
Sentence count in this node: 4

--- Window size = 5 ---

Sample chunk (first node):
Sentence count in this node: 5

--- Window size = 6 ---

Sample chunk (first node):
Sentence count in this node: 6

--- Window size = 8 ---

Sample chunk (first node):
Sentence count in this node: 8


## Purpose of Batched Ingestion with Retry Logic

This part of the pipeline is responsible for gradually adding all preprocessed document chunks into the system that supports question answering. Instead of uploading everything at once, the data is split into smaller groups and added in steps. This helps prevent issues that might occur if too much information is sent at once.

To ensure the system doesn’t break when it gets overwhelmed, it also includes a waiting mechanism. If the system signals that it’s too busy (due to too many requests), the process waits and tries again after a short pause. This retry approach helps make the upload process more stable and reliable, especially when working with large amounts of data.

In [None]:
max_retries = 5
nodes = chunk_documents_manual(documents, window_size=CHUNK_SIZE)
for i in range(0, len(nodes), BATCH_SIZE):
    batch = nodes[i:i + BATCH_SIZE]
    print(f"Processing batch {i//BATCH_SIZE + 1} of {(len(nodes) + BATCH_SIZE - 1)//BATCH_SIZE}")

    if index is None:
        index = VectorStoreIndex(
            batch,
            storage_context=storage_context,
            embed_model=embed_model
        )
    else:
        for attempt in range(max_retries):
            try:
                index.insert_nodes(batch)
                break  # success 
            except Exception as e:
                if "429" in str(e) or "rate limit" in str(e).lower():
                    wait_time = (2 ** attempt) + random.uniform(0, 1)
                    print(f"Rate limit hit. Retrying in {wait_time:.2f} seconds...")
                    time.sleep(wait_time)
                else:
                    raise

    if i + BATCH_SIZE < len(nodes):
        print(f"Waiting {DELAY} seconds before next batch...")
        time.sleep(DELAY)

Processing batch 1 of 44
Waiting 1 seconds before next batch...
Processing batch 2 of 44
Waiting 1 seconds before next batch...
Processing batch 3 of 44


## BM25 Retrieval


This part implements the BM25 Retrieval, wich filters the document collection using keyword matching. It identifies and ranks passages that share terms with the user’s query, selecting the top-scoring ones for further processing.

In [6]:
#bm25 retrieval function
def get_bm25_results(query: str, top_k: TOP_K):
    """Get top k results from BM25 search."""
    tokenized_query = query.lower().split()
    scores = bm25.get_scores(tokenized_query)

    results = []
    valid_indices = [(idx, score) for idx, score in enumerate(scores) if score > 0 and idx < len(nodes)]

    # Sort by score descending
    sorted_indices = sorted(valid_indices, key=lambda x: x[1], reverse=True)[:top_k]

    for idx, score in sorted_indices:
        results.append({
            "node": nodes[idx],
            "score": float(score),
            "text": nodes[idx].text
        })

    return results

## Retrieval Strategies

This step compares the BM25 and Vector Store Retrieval for finding relevant information. It runs both methods on the same user query and prints their top results side by side, allowing us to observe the differences in how each method identifies relevant content, together with the rank of each result.

In [7]:
tokenized_corpus = [node.text.lower().split() for node in nodes]
bm25 = BM25Okapi(tokenized_corpus)

def get_separate_retrievals(query: str):
    """Get separate results from BM25 and vector store retrievers."""
    print("=== BM25 Results ===")
    bm25_results = get_bm25_results(query, top_k=TOP_K)
    if bm25_results:
        for i, result in enumerate(bm25_results, 1):
            print(f"\n{i}. Score: {result['score']:.2f}")
            print(f"Text: {result['text'][:200]}...")
    
    print("\n=== Vector Store Results ===")
    # Create a query engine from the index
    query_engine = index.as_query_engine()
    response = query_engine.query(query)
    print(f"\nResponse: {response.response}")
    print("\nSource Nodes:")
    for i, node in enumerate(response.source_nodes, 1):
        print(f"\n{i}. Score: {node.score:.2f}")
        print(f"Text: {node.node.text[:200]}...")

# Example query for debugging and manual inspection
query = "what are the key issues discussed in the congressional hearings related to coronavirus?"
get_separate_retrievals(query)

=== BM25 Results ===

1. Score: 19.62
Text: Is that what we did? Dr. Walke. That is what--we had a multiprong approach to try to communicate the risk related to vaccines. But, again, I think we have a--there is a number of different communities...

2. Score: 18.16
Text: And so, when we're talking about lessons learned, these are the kind of systemic things within agencies that I would like to hear: What are we doing to fix these things? What are we doing to make sure...

3. Score: 17.64
Text: Cloud. That's my point, though. My point is that needs to change. So, you know, we brought this up in a number of Committee hearings how broke this process is. Are you doing anything to fix those thin...

=== Vector Store Results ===

Response: Key issues discussed in the congressional hearings related to coronavirus include the need for stronger oversight, better accountability, and improved structure within agencies to ensure they act within their statutory responsibilities and subject-matter ex

## Comparing Top Answers from BM25 and Vector Store (Debugging)

This function is used for **debugging and manual inspection** of the best answers retrieved by the two retrieval methods.

For a given query, the script prints the **top result** from each method, allowing for side-by-side comparison. This helps assess which retrieval strategy provides more relevant or contextually appropriate information and guides further tuning of retrieval settings.

In [8]:
def get_best_answers(query: str):
    """Get the best answer from both BM25 and vector store retrievers."""
    
    # 1. Get BM25 results
    print("=== Best BM25 Answer ===")
    bm25_results = get_bm25_results(query, top_k=TOP_K)  # Get only the best result
    if bm25_results:
        best_bm25 = bm25_results[0]
        print(f"Score: {best_bm25['score']:.2f}")
        print(f"Text: {best_bm25['text']}")
    
    # 2. Get Vector Store results
    print("\n=== Best Vector Store Answer ===")
    query_engine = index.as_query_engine()
    response = query_engine.query(query)
    print(f"Response: {response.response}")
    if response.source_nodes:
        best_source = response.source_nodes[0]  # Get the best source
        print(f"\nBest Source Score: {best_source.score:.2f}")
        print(f"Source Text: {best_source.node.text}")

# Example usage
query = "what are the key issues discussed in the congressional hearing related to coronavirus?"
get_best_answers(query)

=== Best BM25 Answer ===
Score: 19.62
Text: Is that what we did? Dr. Walke. That is what--we had a multiprong approach to try to communicate the risk related to vaccines. But, again, I think we have a--there is a number of different communities in our country, and we need to do better, I believe, at trying to reach those communities to talk to them about what are some of the issues they have with vaccine. Mr. Mfume. Dr. Tabak---- Dr. Wenstrup.

=== Best Vector Store Answer ===
Response: Key issues discussed in the congressional hearing related to coronavirus include the need for stronger oversight, better accountability, and improved structure within agencies to ensure they act within their statutory responsibilities and subject-matter expertise. Additionally, there was a focus on the transparency, consistency, and credibility of public-health decisions, as well as the importance of accepting both natural and lab-origin theories regarding the virus to enhance lab standards and prepared

## LLM-Based Reranker

This step uses OPENAI **GPT-4 Turbo** as a large language model to **rerank retrieved passages** based on their relevance to a given query. The goal is to refine the set of candidate chunks by prioritizing the most relevant ones.

This step provides a **more context-aware ranking** that combines the passges retrieved from the BM25 and Vector retrieval. It is used to improve the quality of the answers generated by the RAG model.

In [9]:
# Instantiate the new LLM that will be used for the reranking
llm_gpt45 = OpenAI(model="gpt-4-turbo", temperature=0)

def rerank_with_llm(query, candidates, llm, top_k=3):
    prompt = (
        f"You are a helpful assistant tasked with reranking text passages for a user query.\n"
        f"Query: \"{query}\"\n\n"
        f"Here are {len(candidates)} candidate passages:\n"
    )
    for i, passage in enumerate(candidates, 1):
        prompt += f"{i}. {passage.strip()}\n\n"

    prompt += (
        "Based on their relevance to the query, return the top "
        f"{top_k} most relevant passage numbers in descending order of relevance.\n"
        "Example format: 3, 7, 1\n\n"
        "Top passages:"
    )

    response = llm.complete(prompt).text.strip()

    # Extract passage indices from the LLM's response
    match = re.findall(r'\d+', response)
    top_indices = [int(i) for i in match[:top_k] if int(i) <= len(candidates)]

    # Return the actual top-k passages
    return [candidates[i - 1] for i in top_indices]

# --- Example usage ---

query = "what are the key issues discussed in the congressional hearings related to coronavirus?"

# Retrieve passages
bm25_results = get_bm25_results(query, top_k=TOP_K)
vector_engine = index.as_query_engine()
vector_response = vector_engine.query(query)
vector_results = [node.node.text for node in vector_response.source_nodes[:3]]

# Merge and deduplicate
bm25_texts = [r['text'] for r in bm25_results]
candidates = list(dict.fromkeys(bm25_texts + vector_results))  # Keeps order, removes duplicates

# Rerank
reranked_passages = rerank_with_llm(query, candidates, llm_gpt45, top_k=3)

# Output
for i, passage in enumerate(reranked_passages, 1):
    print(f"Top {i}: {passage[:300]}...\n")

Top 1: Stronger oversight, better accountability, and improved structure within our agencies are essential. Congress must consider a dedicated authority to oversee agency practices, ensuring that agencies act solely within their areas of statutory responsibilities and subject-matter expertise and insisting...

Top 2: And so, when we're talking about lessons learned, these are the kind of systemic things within agencies that I would like to hear: What are we doing to fix these things? What are we doing to make sure that we don't tread on people's civil rights again? What are we doing to make sure that we don't th...

Top 3: Cloud. That's my point, though. My point is that needs to change. So, you know, we brought this up in a number of Committee hearings how broke this process is. Are you doing anything to fix those things? Are you doing anything to fix these other--before you come in and ask is for money, we still hav...



## Main Question Answering Pipeline (Interactive Chatbot)

This is the **primary QA interface** of the system, enabling users to ask questions about congressional hearings and receive grounded responses. It integrates all key components of the RAG pipeline, combining retrieval, reranking, and generation in a conversational loop.

### Key Functionalities:
- **User Query Input:** Accepts free-form questions, including both open-ended and multiple-choice formats.
- **Dual Retrieval:** Retrieves relevant passages using two methods:
  - **BM25 (sparse)** — retrieves based on keyword overlap.
  - **Vector Store (dense)** — retrieves based on semantic similarity.
- **LLM-based Reranking:** Combines and reranks the retrieved passages using a powerful LLM to select the most relevant context for response generation.
- **Answer Generation:** Generates responses using the selected passages, adapting the prompt based on question type.
- **Multiple Response Views:** Returns answers generated from:
  - BM25-only passages
  - Vector-based passages
  - Combined and reranked passages (primary answer)
- **Contextual Dialogue Support:** Retains recent conversation history to support more coherent multi-turn interactions.

> This serves as the **core QA experience** for users, integrating all system components into a coherent and interactive retrieval-augmented generation workflow.

In [19]:

def is_multiple_choice(question):
    """Looks for at least two options like 'A)', 'B)', etc."""
    options = re.findall(r'\b[A-Z]\)', question, re.IGNORECASE)
    return len(options) >= 2

conversation_history = []  # List of (user_question, llm_reply)

def synthesize_answer_with_llm(question, passages, llm, is_mc=False, history=None):
    messages = []
    # Add conversation history if provided
    if history:
        for user_q, llm_a in history:
            messages.append(ChatMessage(role=MessageRole.USER, content=user_q))
            messages.append(ChatMessage(role=MessageRole.ASSISTANT, content=llm_a))
    if is_mc:
        system_prompt = (
            "You are a concise assistant. Only answer using the information in the provided passages. "
            "If the answer is not contained in the passages, reply: 'The answer is not available in the provided texts.' "
            "If the user asks a multiple choice question, reply ONLY with the letter (A, B, C, etc.) of the best answer, unless otherwise specified. "
            "If you are unsure, reply with the most likely answer."
        )
        user_prompt = (
            f"Question: {question}\n\n"
            "Use the following passages to answer:\n" +
            "\n".join([f"[{i+1}] {p}" for i, p in enumerate(passages)]) +
            "\n\nPlease reply with the letter (A, B, C, etc.) of the best answer only."
        )
    else:
        system_prompt = (
            "You are a concise assistant. Only answer using the information in the provided passages. "
            "If the answer is not contained in the passages, reply: 'The answer is not available in the provided texts.' "
            "Provide short, factual answers to user questions. When a one-word or short phrase answer is sufficient, such as yes or no, or an answer, avoid elaboration. "
            "When more information is needed, provide a more detailed answer."
        )
        user_prompt = (
            f"Question: {question}\n\n"
            "Use the following passages to answer:\n" +
            "\n".join([f"[{i+1}] {p}" for i, p in enumerate(passages)]) +
            "\n\nPlease provide a concise and informative answer."
        )
    messages.insert(0, ChatMessage(role=MessageRole.SYSTEM, content=system_prompt))
    messages.append(ChatMessage(role=MessageRole.USER, content=user_prompt))
    response = llm.chat(messages)
    return response.message.content

# Chat loop
print("\n=== Congressional RAG Chatbot ===")
print("Type your questions about congressional documents.")
print("================================\n")

#exit phrases to terminate the chat
exit_phrases = {"exit", "done", "quit", "the conversation is finished", "conversation over", "end chat", ""}

while True:
    question = input("\nYour question: ").strip()
    if question.lower() in exit_phrases:
        print("Goodbye!")
        break

    bm25_results = get_bm25_results(question, top_k=TOP_K)
    bm25_texts = [r["text"] for r in bm25_results]
    vector_response = index.as_query_engine().query(question)
    vector_texts = [node.node.text for node in vector_response.source_nodes[:3]]

    candidates = list({*bm25_texts, *vector_texts})
    mc_flag = is_multiple_choice(question)
    history = conversation_history[-5:]

    # 1. Answer with only BM25 passages
    bm25_answer = synthesize_answer_with_llm(question, bm25_texts, llm, is_mc=mc_flag, history=history)
    # 2. Answer with only vector store passages
    vector_answer = synthesize_answer_with_llm(question, vector_texts, llm, is_mc=mc_flag, history=history)
    # 3. Answer with all combined passages
    reranked_passage = rerank_with_llm(question, candidates, llm)
    combined_answer = synthesize_answer_with_llm(question, [reranked_passage], llm, is_mc=mc_flag, history=history)

    print("\n LLM Answer from BM25 passages only:\n", bm25_answer)
    print("\n LLM Answer from Vedctor Store passages only:\n", vector_answer)
    print("\n LLM Answer from combined sources:\n", combined_answer)

    # Add to conversation history (using the combined answer)
    conversation_history.append((question, combined_answer))


=== Congressional RAG Chatbot ===
Type your questions about congressional documents.


 LLM Answer from BM25 passages only:
 B

 LLM Answer from Vedctor Store passages only:
 B

 LLM Answer from combined sources:
 B
Goodbye!


## Model Evaluation

This function is used to automatically generate questions from congressional transcript segments. By leveraging OpenAI’s GPT-3.5-Turbo, it prompts the model to produce **a single, clear, and specific question** grounded in the content of a given passage. This is primarily used for constructing an **evaluation dataset**, ensuring that all questions are contextually relevant and answerable based solely on the provided text. The approach enables scalable, consistent generation of questions for use in both automated and human evaluation settings.


In [11]:
def generate_question(text):
    try:
        llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)
        
        messages = [
            ChatMessage(
                role=MessageRole.SYSTEM,
                content="You are an expert question generator. Generate exactly one clear, specific, and answerable question based strictly on the passage provided."
            ),
            ChatMessage(
                role=MessageRole.USER,
                content=f"Passage:\n{text}\n\nPlease generate one question."
            )
        ]
        
        response = llm.chat(messages)
        return response.message.content.strip()
    except Exception as e:
        print(f"Error generating question: {e}")
        return "No question generated"

This segment constructs a dataset of question-context pairs for evaluation purposes. It iterates through the previously chunked nodes, filtering out segments that are too short to generate meaningful questions. For each valid chunk, it invokes the question generation function to create one question per passage. The process is rate-limited to comply with API usage policies and continues until the desired number of questions is reached. Finally, the dataset is saved in JSON format for later use in LLM-based or human evaluation pipelines.

In [12]:

# Initialize list to store question-context pairs
evaluation_dataset = []
questions_window_size = 10

# Generate up to 50 questions from chunked nodes
for idx, node in enumerate(nodes):
    context = node.text.strip()
    
    # Ensure context is of sufficient length
    if len(context.split()) < 20:
        continue  # skip overly short chunks
    
    print(f"Processing node {idx + 1}/500")
    
    question = generate_question(context)
    
    if question != "No question generated":
        evaluation_dataset.append({
            "context": context,
            "question": question
        })
    
    time.sleep(1)  # Basic rate limiting

# Save the improved dataset
output_file = f"congress_qc_dataset_window{questions_window_size}.json"
with open(output_file, "w") as f:
    json.dump(evaluation_dataset, f, indent=2)

print(f"✅ Saved {len(evaluation_dataset)} improved question-context pairs to {output_file}")


Processing node 1/500
Processing node 2/500
Processing node 3/500
Processing node 4/500


KeyboardInterrupt: 

This section runs the full question-answering evaluation loop using a specified retrieval method — either BM25, dense vector retrieval, or a reranked version. For each question in the dataset, it retrieves relevant passages based on the chosen method and feeds them to the LLM to generate an answer. If reranking is enabled, the retrieved passages are refined using a second LLM pass. Retry logic is incorporated to handle potential rate limits, and final answers are saved along with their corresponding metadata for further evaluation.

In [12]:
# Choose retrieval method: "bm25", "vector", or "rerank2" for the answers to be generated
retrieval_method = "rerank2"  # Change this to switch methods based on the above

# Load your generated questions
with open(f"congress_qc_dataset_window{10}.json", "r") as f:
    qc_pairs = json.load(f)

# Initialize results container to keep track of the answers
answers = []

# Iterate over questions
for idx, pair in enumerate(qc_pairs):
    question = pair["question"]
    print(f"Answering question {idx + 1}/{len(qc_pairs)}")

    for attempt in range(3):  # Retry logic
        try:
            # Perform retrieval according to selected method
            if retrieval_method == "bm25":
                bm25_results = get_bm25_results(question, top_k=TOP_K)
                passages = [r["text"] for r in bm25_results if "text" in r]

            elif retrieval_method == "vector":
                vector_response = index.as_query_engine().query(question)
                passages = [node.node.text for node in vector_response.source_nodes[:TOP_K]] if vector_response.source_nodes else []

            #combined retrieval method is not used in the final report howeever it can be used in case more than one retrieval method is used and want to be combined
            elif retrieval_method == "combined" or retrieval_method == "rerank2":
                bm25_results = get_bm25_results(question, top_k=TOP_K)
                bm25_passages = [r["text"] for r in bm25_results if "text" in r]

                vector_response = index.as_query_engine().query(question)
                vector_passages = [node.node.text for node in vector_response.source_nodes[:TOP_K]] if vector_response.source_nodes else []

                passages = list(set(bm25_passages + vector_passages))

                # Apply reranking if requested
                if retrieval_method == "rerank2":
                    reranked_passage = rerank_with_llm(question, passages, llm)
                    print("ok")
                    passages = [reranked_passage]

            # Skip if no passages retrieved
            if not passages:
                print(f"No passages retrieved for question {idx + 1}, skipping.")
                answers.append({
                    "question": question,
                    "answer": "No relevant context retrieved.",
                    "window_size": CHUNK_SIZE,
                    "retrieval_method": retrieval_method
                })
                break

            # Synthesize answer
            answer = synthesize_answer_with_llm(question, passages, llm)

            # Save result
            answers.append({
                "question": question,
                "answer": answer,
                "window_size": CHUNK_SIZE,
                "retrieval_method": retrieval_method
            })
            #time.sleep(2)
            break  # Success

        except Exception as e:
            if "429" in str(e) or "rate limit" in str(e).lower():
                wait_time = (2 ** attempt) + random.uniform(0.5, 1.5)
                print(f" Rate limit. Retrying in {wait_time:.2f}s...")
                time.sleep(wait_time)
            else:
                print(f"↻ Attempt {attempt + 1} failed: {e}")
                if attempt == 2:
                    answers.append({
                        "question": question,
                        "answer": "Error",
                        "window_size": CHUNK_SIZE,
                        "retrieval_method": retrieval_method
                    })

# Save results
output_file = f"answers_{retrieval_method}_window{CHUNK_SIZE}.json"
with open(output_file, "w") as f:
    json.dump(answers, f, indent=2)

print(f"Answers saved to {output_file}")

Answering question 1/701
ok
Answering question 2/701
ok
Answering question 3/701
ok
Answering question 4/701
ok
Answering question 5/701
ok
Answering question 6/701
ok
Answering question 7/701
ok
Answering question 8/701
ok
Answering question 9/701
ok
Answering question 10/701
ok
Answering question 11/701
ok
Answering question 12/701
ok
Answering question 13/701
ok
Answering question 14/701
ok
Answering question 15/701
ok
Answering question 16/701
ok
Answering question 17/701
ok
Answering question 18/701
ok
Answering question 19/701
ok
Answering question 20/701
ok
Answering question 21/701
ok
Answering question 22/701
ok
Answering question 23/701
ok
Answering question 24/701
ok
Answering question 25/701
ok
Answering question 26/701
ok
Answering question 27/701
ok
Answering question 28/701
ok
Answering question 29/701
ok
Answering question 30/701
ok
Answering question 31/701
ok
Answering question 32/701
ok
Answering question 33/701
ok
Answering question 34/701
ok
Answering question 35/7

To assess the quality of answers generated by the system, an automatic evaluation pipeline is applied. For each question-answer pair, the system uses OpenAI’s GPT-3.5 to score the answer across three key aspects:

- **Completeness**: Whether the response fully addresses the question with appropriate details.
- **Relevance**: Whether the response stays focused on the original query without introducing unrelated information.
- **Faithfulness**: Whether the response stays truthful to the given context and avoids hallucinations.

Each aspect is rated on a strict 0–5 scale based on a predefined rubric, and a justification is provided for each score. The evaluation results are stored and summarized, including average scores across the dataset. This automated scoring provides a consistent and scalable way to measure model performance under different retrieval configurations which is used further in the report.

In [17]:
def evaluate_response(query: str, context: str, response: str) -> dict:
    """
    Evaluate a response based on Completeness, Relevance, and Faithfulness using a moderately strict rubric.
    """
    # Check if response indicates no answer available - automatically score as 0
    if response.strip().lower() == "the answer is not available in the provided texts.":
        return {
            "completeness": {"score": 0, "explanation": "No answer provided - system indicated answer not available"},
            "relevance": {"score": 0, "explanation": "No answer provided - system indicated answer not available"},
            "faithfulness": {"score": 0, "explanation": "No answer provided - system indicated answer not available"}
        }
    
    llm = OpenAI(model="gpt-3.5-turbo", temperature=0)

    evaluation_prompt = f"""You are a critical but fair evaluator. Evaluate the following response based on three metrics:

1. Completeness (0-5):
- 5: Fully answers the question with all key details and specific elements from the context. Minor omissions that don't alter the core are acceptable.
- 4: Good response that answers the question but lacks some secondary specifics or elaboration.
- 3: Partial answer that misses important content or is too generic.
- 2: Barely addresses the question or lacks coherence.
- 1: Mentions the topic but provides little to no substance.
- 0: Completely fails to address the question or answer is not available in the provided texts.

2. Relevance (0-5):
- 5: Response is tightly focused on the question with no off-topic content.
- 4: Mostly relevant, with only minor digressions or extra details.
- 3: Includes relevant content but is diluted by tangents or unrelated points.
- 2: Weak focus on the question; mostly off-topic.
- 1: Barely relevant.
- 0: Irrelevant or answer is not available in the provided texts.

3. Faithfulness (0-5):
- 5: Fully grounded in the provided context. All claims are supported.
- 4: Minor inferences or stylistic interpretations, but no factual hallucinations.
- 3: Some information not found in context or assumptions present.
- 2: Deviates clearly from the facts in the context.
- 1: Mostly incorrect based on the context.
- 0: Entirely misrepresents the context or answer is not available in the provided texts.

Question: {query}

Context: {context}

Response: {response}

Instructions:
- Assign a score from 0-5 for each metric.
- Justify each score with a clear explanation.
- Be fair but avoid giving perfect scores unless justified.

Return the result as JSON with this format:
{{
    "completeness": {{
        "score": int,
        "explanation": "string"
    }},
    "relevance": {{
        "score": int,
        "explanation": "string"
    }},
    "faithfulness": {{
        "score": int,
        "explanation": "string"
    }}
}}"""

    messages = [
        ChatMessage(
            role=MessageRole.SYSTEM,
            content="You are a fair and methodical evaluator of NLP model outputs. Follow the rubric exactly."
        ),
        ChatMessage(
            role=MessageRole.USER,
            content=evaluation_prompt
        )
    ]

    try:
        evaluation = llm.chat(messages)
        return json.loads(evaluation.message.content)
    except Exception as e:
        print(f"Error during evaluation: {e}")
        return {
            "completeness": {"score": 0, "explanation": "Evaluation failed"},
            "relevance": {"score": 0, "explanation": "Evaluation failed"},
            "faithfulness": {"score": 0, "explanation": "Evaluation failed"}
        }

def save_evaluations_to_json(answers_rerank_window, retrieval_method, output_file=None):
    """
    Evaluate and save results for all answers in the rerank window to a JSON file.
    Uses context from congress_qc_dataset_window10.json for each question.
    """
    if output_file is None:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_file = f"evaluation_results_{retrieval_method}_{timestamp}.json"
    
    # Load the original dataset to get the context for each question
    print(f"Loading original dataset to get question-specific context for {retrieval_method}...")
    try:
        with open('congress_qc_dataset_window10.json', 'r', encoding='utf-8') as f:
            original_dataset = json.load(f)
        
        # Create a mapping of questions to their original context
        question_to_context = {}
        for item in original_dataset:
            question_to_context[item['question']] = item['context']
        
        print(f"Loaded {len(question_to_context)} question-context pairs from original dataset")
    except FileNotFoundError:
        print("Warning: congress_qc_dataset_window10.json not found. Using fallback context.")
        question_to_context = {}
    except Exception as e:
        print(f"Error loading original dataset: {e}. Using fallback context.")
        question_to_context = {}
    
    # Evaluate all answers
    answers_to_evaluate = answers_rerank_window
    results = []
    
    print(f"Processing all {len(answers_to_evaluate)} answers for {retrieval_method}...")
    
    for idx, answer in enumerate(answers_to_evaluate):
        # Extract fields using the correct keys from your JSON
        query = answer.get('question', '')
        response = answer.get('answer', '')
        
        if not all([query, response]):
            print(f"Warning: Missing required fields in answer {idx}. Skipping...")
            continue
        
        # Get the original context for this question
        original_context = question_to_context.get(query, "Context not found in original dataset")
        
        print(f"Evaluating answer {idx + 1}/{len(answers_to_evaluate)} for {retrieval_method}")
        evaluation = evaluate_response(
            query=query,
            context=original_context,  # Use the original context from the dataset
            response=response
        )
        
        # Create a more concise result entry
        result_entry = {
            "question": query,
            "answer": response,
            "retrieval_method": retrieval_method,
            "original_context": original_context[:200] + "..." if len(original_context) > 200 else original_context,
            "scores": {
                "completeness": evaluation["completeness"]["score"],
                "relevance": evaluation["relevance"]["score"],
                "faithfulness": evaluation["faithfulness"]["score"]
            },
            "explanations": {
                "completeness": evaluation["completeness"]["explanation"],
                "relevance": evaluation["relevance"]["explanation"],
                "faithfulness": evaluation["faithfulness"]["explanation"]
            }
        }
        results.append(result_entry)
    
    if not results:
        print(f"No valid answers were found to evaluate for {retrieval_method}!")
        return None
    
    # Calculate average scores
    avg_scores = {
        "completeness": sum(r["scores"]["completeness"] for r in results) / len(results),
        "relevance": sum(r["scores"]["relevance"] for r in results) / len(results),
        "faithfulness": sum(r["scores"]["faithfulness"] for r in results) / len(results)
    }
    
    # Create final output structure
    output = {
        "timestamp": datetime.now().isoformat(),
        "retrieval_method": retrieval_method,
        "total_answers": len(results),
        "average_scores": avg_scores,
        "answers": results  # List of answers with their scores
    }
    
    # Save to JSON file
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output, f, indent=2, ensure_ascii=False)
    
    print(f"\nEvaluation results saved to: {output_file}")
    print(f"\nAverage Scores for {retrieval_method}:")
    for metric, score in avg_scores.items():
        print(f"{metric.capitalize()}: {score:.2f}")
    
    return output_file

# Define all retrieval methods to evaluate
retrieval_methods = ["bm25", "vector", "rerank2"]

# Load the original dataset once
print("Loading original dataset to get question-specific context...")
try:
    with open('congress_qc_dataset_window10.json', 'r', encoding='utf-8') as f:
        original_dataset = json.load(f)
    
    # Create a mapping of questions to their original context
    question_to_context = {}
    for item in original_dataset:
        question_to_context[item['question']] = item['context']
    
    print(f"Loaded {len(question_to_context)} question-context pairs from original dataset")
except FileNotFoundError:
    print("Warning: congress_qc_dataset_window10.json not found. Using fallback context.")
    question_to_context = {}
except Exception as e:
    print(f"Error loading original dataset: {e}. Using fallback context.")
    question_to_context = {}

# Run evaluation for all retrieval methods
all_results = {}

for retrieval_method in retrieval_methods:
    print(f"\n{'='*60}")
    print(f"EVALUATING {retrieval_method.upper()} RETRIEVAL METHOD")
    print(f"{'='*60}")
    
    try:
        filename = f'answers_{retrieval_method}_window{CHUNK_SIZE}.json'
        with open(filename, 'r', encoding='utf-8') as f:
            answers_rerank_window = json.load(f)
        
        output_filename = f"evaluation_scores_{retrieval_method}_window{CHUNK_SIZE}_all_questions.json"

        output_file = save_evaluations_to_json(
            answers_rerank_window,
            retrieval_method,
            output_filename
        )
        
        if output_file:
            all_results[retrieval_method] = output_file
            
    except FileNotFoundError:
        print(f"Error: answers file for {retrieval_method} not found!")
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON format in answers file for {retrieval_method}!")
    except Exception as e:
        print(f"An unexpected error occurred for {retrieval_method}: {e}")

# Print summary of all results
print(f"\n{'='*60}")
print("EVALUATION SUMMARY")
print(f"{'='*60}")
print(f"Successfully evaluated {len(all_results)} out of {len(retrieval_methods)} retrieval methods:")
for method, file_path in all_results.items():
    print(f" {method}: {file_path}")

if len(all_results) == len(retrieval_methods):
    print("\n All retrieval methods have been successfully evaluated!")
else:
    print(f"\n  {len(retrieval_methods) - len(all_results)} retrieval method(s) failed to evaluate.")

Loading original dataset to get question-specific context...
Loaded 699 question-context pairs from original dataset

EVALUATING BM25 RETRIEVAL METHOD
Loading original dataset to get question-specific context for bm25...
Loaded 699 question-context pairs from original dataset
Processing all 701 answers for bm25...
Evaluating answer 1/701 for bm25
Evaluating answer 2/701 for bm25
Evaluating answer 3/701 for bm25
Evaluating answer 4/701 for bm25
Evaluating answer 5/701 for bm25
Evaluating answer 6/701 for bm25
Evaluating answer 7/701 for bm25
Evaluating answer 8/701 for bm25
Evaluating answer 9/701 for bm25
Evaluating answer 10/701 for bm25
Evaluating answer 11/701 for bm25
Evaluating answer 12/701 for bm25
Evaluating answer 13/701 for bm25
Evaluating answer 14/701 for bm25
Evaluating answer 15/701 for bm25
Evaluating answer 16/701 for bm25
Evaluating answer 17/701 for bm25
Evaluating answer 18/701 for bm25
Evaluating answer 19/701 for bm25
Evaluating answer 20/701 for bm25
Evaluating an