# Evaluating RAG Pipelines for Document-Based Question Answering

In this notebook, we assess the performance of three distinct Retrieval-Augmented Generation (RAG) pipelines—**Standard RAG**, **Two-Stage Consecutive RAG**, and **Hybrid RAG**—for document-based question answering tasks. 
Utilizing LangChain's `QAGenerateChain`, we generate a comprehensive set of question-answer pairs from a collection of PDF documents. Each RAG pipeline processes these questions to generate answers, which are then evaluated and scored using LangChain's evaluation tools. 
Finally, we compare the accuracy rates of each pipeline to determine their effectiveness in delivering precise and contextually relevant responses based on the provided documents.

In [4]:
%load_ext autoreload
%autoreload 2
%matplotlib inline


# Required imports
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import os
import glob
from langchain.document_loaders import PyPDFLoader
from langchain.evaluation.qa import QAGenerateChain, QAEvalChain
from langchain_openai import ChatOpenAI
import numpy as np
from pprint import pprint
from hydra import compose, initialize
from omegaconf import DictConfig, OmegaConf
initialize(config_path="../configs", job_name="notebook_config")

import sys
sys.path.append('../src')

from helper_functions import PDFManager, Retrievers, QAchains, Hybrid_Retrieval

The version_base parameter is not specified.
Please specify a compatability version level, or None.
Will assume defaults for version 1.1
  initialize(config_path="../configs", job_name="notebook_config")
  from tqdm.autonotebook import tqdm, trange


streamlit is not running


# Parameters

In [None]:

data_root = "../data/pdfs_selected/"

config = compose(config_name="config")
modelID = config.llm.openai_modelID
top_k_BM25 = config.Retrieval.top_k_BM25
top_k_semantic = config.Retrieval.top_k_semantic
top_k_final = config.Retrieval.top_k_final

question = " According to the documents, what is Morningstar's view on the Federal Reserve's interest rate decisions for the remainder of 2024 and into 2025?"

# QA evaluator

## document-level chunks
Creating one chunk per pdf file

In [None]:
filenames = [f for f in glob.glob(data_root + '*.pdf') if os.path.isfile(f)]

documents = []
for file in filenames:    
    loader = PyPDFLoader(f'{file}')
    pages = loader.load()
    # add all pages together    
    combined_page_content = "".join([page.page_content for page in pages])
    document = {"page_content": combined_page_content, "metadata": {"source": file}}
    documents.append(document)
    print(file)
print(f'{len(documents)} PDF chunks created with one chunk per PDF\n')

## QA pair generation using LLM

In [None]:
# generating n_samples of QA pairs from available PDFs
n_samples = 10

index = np.random.randint(1, len(documents), n_samples)
sample_docs = [documents[i] for i in index]   
llm = ChatOpenAI(temperature = 0.0, model=modelID)

example_gen_chain = QAGenerateChain.from_llm(llm)
new_examples = example_gen_chain.apply_and_parse(
    [{"doc": t} for t in sample_docs],
)
print(f'{len(new_examples)} QA pairs generated from PDFs\n')

for i, example in enumerate(new_examples):
    source = sample_docs[i]['metadata']['source']
    print("\n Source file:\n", source)    
    example['source'] = source
    print(f'Question:')
    pprint(f'{example.get('qa_pairs').get('query')}')
    print('Answer:')
    pprint(f'{example.get('qa_pairs').get('answer')}')    

# Two-Stage RAG

In [None]:
import warnings
import numpy as np

# config.settings.verbose = True

pdf_manager = PDFManager(data_root, config)
pdf_manager.load_pdfs()
pdf_manager.chunk_documents()
pdf_manager.create_vectorstore()

retrievers = Retrievers(pdf_manager, config)
retrievers.setup_retrievers()
qa_chains = QAchains(retrievers, config)

def two_stage_rag(question):
    qa_chains.shorten_question(question)
    qa_chains.retrieve_context()
    answer = qa_chains.generate_answer()
    return answer

## QA pair evaluation loop

In [None]:
predictions = list()
examples = list()
for example, idoc in zip(new_examples, index):
    print(f'Document {idoc}')
    question = example['qa_pairs']['query']
    pprint(question)
    answer = two_stage_rag(question)
    prediction = {'query': example['qa_pairs']['query'], 'answer': example['qa_pairs']['answer']}
    prediction["result"] = answer
    predictions.append(prediction)
    examples.append(
            example.get('qa_pairs')
        )

eval_chain = QAEvalChain.from_llm(llm)
graded_outputs_2RAG = eval_chain.evaluate(examples, predictions, prediction_key="result")
print('The result of RAG evaluation for the given example questions: ')

accuracy_2RAG = len([result for result in graded_outputs_2RAG if result['results'] == 'CORRECT'])/len(graded_outputs_2RAG)
print(f'\n Accuracy of the RAG pipeline: {accuracy_2RAG} \n')

graded_outputs_2RAG

# Hybrid Rag

In [None]:
hybrid_retrieval = Hybrid_Retrieval(pdf_manager, retrievers, config)
hybrid_RAG_QA = QAchains(retrievers, config)
def hybrid_rag(question,top_k_BM25, top_k_semantic, top_k_final, rrf_k = 60, hybrid = True):
    top_score_docs = hybrid_retrieval.hybrid_retriever(question, top_k_BM25, top_k_semantic, top_k_final, rrf_k, hybrid)
    hybrid_RAG_QA.top_score_docs = top_score_docs
    hybrid_RAG_QA.question = question
    answer = hybrid_RAG_QA.generate_answer()
    return answer
# answer_hybrid = hybrid_rag(question, 200, 50, 10)    
# pprint(answer_hybrid)

## QA pair evaluation loop

In [None]:
predictions_hybrid = list()
examples = list()
for example, idoc in zip(new_examples, index):
    question = example['qa_pairs']['query']
    print(f'Question:')
    pprint(question)
    answer = hybrid_rag(question, top_k_semantic, top_k_semantic, top_k_final)
    print(f'Answer:')
    pprint(answer)
    prediction = {'query': example['qa_pairs']['query'], 'answer': example['qa_pairs']['answer']}
    prediction["result"] = answer
    predictions_hybrid.append(prediction)
    examples.append(
            example.get('qa_pairs')
        )

eval_chain = QAEvalChain.from_llm(llm)
graded_outputs_hybrid = eval_chain.evaluate(examples, predictions_hybrid, prediction_key="result")
print('The result of RAG evaluation for the given example questions: ')

accuracy_hybrid = len([result for result in graded_outputs_hybrid if result['results'] == 'CORRECT'])/len(graded_outputs_hybrid)
print(f'\n Accuracy of the RAG pipeline: {accuracy_hybrid} \n')

graded_outputs_hybrid

# Normal RAG

In [None]:
predictions_rag = list()
examples = list()
for example, idoc in zip(new_examples, index):
    question = example['qa_pairs']['query']
    print(f'Question:')
    pprint(question)
    answer = hybrid_rag(question, top_k_semantic, top_k_semantic, top_k_final, hybrid = False)
    print(f'Answer:')
    pprint(answer)
    prediction = {'query': example['qa_pairs']['query'], 'answer': example['qa_pairs']['answer']}
    prediction["result"] = answer
    predictions_rag.append(prediction)
    examples.append(
            example.get('qa_pairs')
        )

eval_chain = QAEvalChain.from_llm(llm)
graded_outputs_rag = eval_chain.evaluate(examples, predictions_rag, prediction_key="result")
print('The result of RAG evaluation for the given example questions: ')

accuracy_rag = len([result for result in graded_outputs_rag if result['results'] == 'CORRECT'])/len(graded_outputs_rag)
print(f'\n Accuracy of the RAG pipeline: {accuracy_rag} \n')

graded_outputs_rag

# Comparison of RAG pipelines

In [None]:
# Create a dataframe
data = {
    'Method': ['RAG', 'Hybrid RAG', 'Two-Stage RAG'],
    'Large Scale Accuracy (%)': [accuracy_rag_large, accuracy_hybrid_large, accuracy_2RAG_large],
    'Small Scale Accuracy (%)': [accuracy_rag_small, accuracy_hybrid_small, accuracy_2RAG_small]
}
df = pd.DataFrame(data)
df_melted = df.melt(id_vars='Method', var_name='Experiment Scale', value_name='Accuracy (%)')
df_melted['Accuracy (%)'] = (df_melted['Accuracy (%)']*100).round(2)
df_melted.head()


# Set the aesthetic style of the plots
sns.set(style="whitegrid")

# Create the bar plot
plt.figure(figsize=(10, 6))
bar_plot = sns.barplot(
    x='Method',
    y='Accuracy (%)',
    hue='Experiment Scale',
    data=df_melted,
    # palette='viridis'
)

# Add percentage labels on top of the bars
for p in bar_plot.patches:
    height = p.get_height()
    bar_plot.annotate(f'{height}%',
                      (p.get_x() + p.get_width() / 2., height),
                      ha='center', va='bottom',
                      fontsize=11)

# Set labels and title
plt.ylim(0, 100)  # Adjusted to fit percentage scale
plt.title('Accuracy Comparison of RAG Methods Across Experiment Scales', fontsize=16)
plt.ylabel('Accuracy (%)')
plt.xlabel('Method')
plt.legend(title='Experiment Scale')

# Display the plot
plt.show()