## Evaluating RAG pipelines

In this notebook we will present different metrics to evaluate your RAG pipeline.


In [None]:
%pip install -qU pypdf llama-cpp-python huggingface_hub -q
%pip install -qU sentence_transformers -q
%pip install rank_bm25 -q
%pip install -q ragas -q
%pip install chromadb -q
%pip install openai -q
%pip install langchain -q   
%pip install langchain_openai -q
%pip intsall langchain_community -q
%pip install langchain_core -q
%pip install requests==2.27.1 -q
%pip install python-dotenv -q

In [2]:
import os

# Environment switches are set before the remaining imports run.
# NOTE(review): an empty CURL_CA_BUNDLE presumably turns off TLS certificate
# verification for HTTP clients that honour it -- confirm this is intended.
os.environ['CURL_CA_BUNDLE'] = ''
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
pd.set_option('display.max_colwidth', None)
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())  # read local .env file


def _require_env(name: str) -> str:
    """Return the value of environment variable ``name``.

    Raises:
        RuntimeError: if the variable is not set. Without this check the
            failure would only show up later as a ``TypeError`` when ``None``
            is assigned into ``os.environ``.
    """
    value = os.getenv(name)
    if value is None:
        raise RuntimeError(
            f"Environment variable {name!r} is not set; "
            "define it in your shell or in the local .env file."
        )
    return value


openai_api_key = _require_env("AZURE_OPENAI_KEY")
openai_api_version = '2023-08-01-preview'
model_deployment_name = _require_env('MODEL_DEPLOYMENT_NAME')
# Not validated here: this value is only handed to the embeddings client later on.
azure_deployment = os.getenv('EMBEDDING_DEPLOYMENT_NAME')
os.environ["OPENAI_API_KEY"] = openai_api_key
os.environ["OPENAI_API_VERSION"] = openai_api_version
os.environ["MODEL_DEPLOYMENT_NAME"] = model_deployment_name

Load Your Documents


In [4]:
from pypdf import PdfReader   
  
    
# Open the policy document that the rest of the notebook indexes.
reader = PdfReader("../assets/Acceptable_Use_Policy.pdf")

# Concatenate the extracted text of every page, in page order and without
# any separator between pages.
text = ''.join(page.extract_text() for page in reader.pages)
reader

<pypdf._reader.PdfReader at 0x115e0cfa0>

Document Splitter


In [5]:
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma


# Character-based splitter: chunks of up to 1024 characters (measured with
# len) that overlap by 100 characters; separators are treated as plain
# strings rather than regular expressions.
text_splitter = RecursiveCharacterTextSplitter(
    is_separator_regex=False,
    length_function=len,
    chunk_overlap=100,
    chunk_size=1024,
)

# Wrap the single extracted string in a list and split it into documents.
texts = text_splitter.create_documents(texts=[text])

In [6]:
# `texts` was already produced by text_splitter.create_documents, so this
# passes those documents through the same splitter a second time.
split_docs = text_splitter.split_documents(texts)
# Report how many chunks will be indexed.
print(len(split_docs))

39


Instantiate the Vectorstore


In [7]:
import openai
from langchain_openai import AzureOpenAIEmbeddings

# Embedding client bound to the deployment named by EMBEDDING_DEPLOYMENT_NAME
# (read into `azure_deployment` in the configuration cell).
embeddings = AzureOpenAIEmbeddings(azure_deployment=azure_deployment)

# Embed every chunk and index it in a Chroma vector store.
vectorstore = Chroma.from_documents(
    embedding=embeddings,
    documents=split_docs,
)

<br>
Naive RAG
<img src="naiverag.png" width="950" align="center">


Advanced RAG
<img src="advancedrag.png" width="950" align="center">
<br>


In [None]:
from __future__ import annotations
from typing import Dict, Optional, Sequence
from langchain.schema import Document
from langchain_core.pydantic_v1 import Extra, root_validator
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import ContextualCompressionRetriever
# NOTE(review): despite its name, `bm25_retriever` is just another reference to
# the Chroma `vectorstore` object; no BM25Retriever is constructed here.
bm25_retriever = vectorstore
# Sets a `k` attribute on that same vectorstore object. Whether anything reads
# it is not visible in this notebook -- TODO confirm or remove.
bm25_retriever.k= 10
# Re-applies the empty CURL_CA_BUNDLE already set in the configuration cell.
os.environ['CURL_CA_BUNDLE'] = ''
from langchain.callbacks.manager import Callbacks
from langchain.retrievers.document_compressors.base import BaseDocumentCompressor
from langchain_community.document_transformers.embeddings_redundant_filter import EmbeddingsRedundantFilter
from langchain.retrievers.document_compressors.base import DocumentCompressorPipeline
from langchain.retrievers import ContextualCompressionRetriever
from langchain_community.document_transformers.long_context_reorder import LongContextReorder
from langchain.retrievers.multi_query import MultiQueryRetriever
from sentence_transformers import CrossEncoder
# from config import bge_reranker_large

class BgeRerank(BaseDocumentCompressor):
    """Document compressor that scores (query, document) pairs with a
    cross-encoder and keeps the ``top_n`` highest-scoring documents."""

    # Model name to use for reranking.
    model_name: str = 'BAAI/bge-reranker-large'
    # Number of documents to return.
    top_n: int = 3
    # CrossEncoder instance to use for reranking. It is created once, while
    # the class body executes, from the default `model_name` above.
    model: CrossEncoder = CrossEncoder(model_name)

    class Config:
        """Configuration for this pydantic object."""

        extra = Extra.forbid
        arbitrary_types_allowed = True

    def bge_rerank(self, query, docs):
        """Score every text in ``docs`` against ``query``.

        Returns:
            Up to ``top_n`` ``(index, score)`` pairs, highest score first,
            where ``index`` is the position of the text in ``docs``.
        """
        pairs = [[query, passage] for passage in docs]
        scores = self.model.predict(pairs)
        ranked = list(enumerate(scores))
        ranked.sort(key=lambda entry: entry[1], reverse=True)
        return ranked[:self.top_n]

    def compress_documents(
        self,
        documents: Sequence[Document],
        query: str,
        callbacks: Optional[Callbacks] = None,
    ) -> Sequence[Document]:
        """
        Compress documents using BAAI/bge-reranker models.

        Args:
            documents: A sequence of documents to compress.
            query: The query to use for compressing the documents.
            callbacks: Callbacks to run during the compression process
                (accepted but not used by this implementation).

        Returns:
            The selected documents in descending score order. Each returned
            document has its score written to ``metadata["relevance_score"]``
            (the input document objects are modified in place).
        """
        # Nothing to score: skip the model call entirely.
        if len(documents) == 0:
            return []

        candidates = list(documents)
        ranking = self.bge_rerank(
            query, [candidate.page_content for candidate in candidates]
        )

        reranked = []
        for position, score in ranking:
            hit = candidates[position]
            hit.metadata["relevance_score"] = score
            reranked.append(hit)
        return reranked
    


# Dense retriever over the vector store, returning 5 documents per query.
vs_retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

# NOTE(review): despite the name, no ensembling happens here -- this is the
# vector-store retriever on its own.
ensemble_retriever = vs_retriever

# Post-retrieval stages, applied in the order listed in the pipeline below.
redundant_filter = EmbeddingsRedundantFilter(embeddings=embeddings)
print(redundant_filter)
reordering = LongContextReorder()
reranker = BgeRerank()

# NOTE(review): the reranker runs after the reordering step and returns its
# results sorted by score, so the reordering presumably has no effect on the
# final order -- confirm the intended stage order.
pipeline_compressor = DocumentCompressorPipeline(
    transformers=[redundant_filter, reordering, reranker]
)

# Retriever that feeds the base retriever's results through the pipeline.
compression_pipeline = ContextualCompressionRetriever(
    base_retriever=ensemble_retriever,
    base_compressor=pipeline_compressor,
)


In [13]:
from langchain.chains import RetrievalQA
from langchain_openai import AzureChatOpenAI

# 
# Chat model used to answer questions in both RAG chains; the Azure deployment
# comes from the MODEL_DEPLOYMENT_NAME environment variable.
llm = AzureChatOpenAI(
    api_key=openai_api_key,
    api_version=openai_api_version,
    deployment_name=model_deployment_name,
    # Fixed typo: the original value was "gpt-3.5-turbo-613".
    model_name="gpt-3.5-turbo-0613",
)

# Second chat model, used later as the test-set critic and as the evaluation LLM.
# NOTE(review): the deployment name is hard-coded -- confirm it exists in the
# target Azure resource.
llm4 = AzureChatOpenAI(
    api_key=openai_api_key,
    api_version=openai_api_version,
    deployment_name="gpt-4-0613",
    model_name="gpt-4",
)

In [None]:
from langchain.chains import RetrievalQA
#
# Naive RAG: put the 5 nearest chunks from the vector store straight into the
# prompt ("stuff" chain) and keep the source documents for later evaluation.
qa_naive = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(search_kwargs={"k": 5}),
    return_source_documents=True,
)

# Use .invoke(), as the evaluation cells do, instead of calling the chain
# object directly (a deprecated calling style).
naive_response = qa_naive.invoke({"query": "What is the code of conduct?"})
naive_response["result"]


In [None]:
# Advanced RAG: same "stuff" chain, but retrieval goes through the
# compression pipeline (redundancy filter, reordering, reranker).
qa_advanced = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=compression_pipeline,
    return_source_documents=True,
)

# Use .invoke(), as the evaluation cells do, instead of calling the chain
# object directly (a deprecated calling style).
qa_adv_response = qa_advanced.invoke({"query": "What is the code of conduct?"})
qa_adv_response["result"]


Evaluating Naive RAG and Advanced RAG using RAGAS evaluation Framework
Synthetic Test Set Generation


We can leverage Ragas’ synthetic test data generation functionality to generate our own synthetic question–context (QC) pairs, as well as a synthetic ground truth, quite easily!

In [None]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context


from typing import Union, Optional
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.language_models.llms import BaseLLM
from ragas.testset.extractor import KeyphraseExtractor
from ragas.testset.docstore import Document, DocumentStore, InMemoryDocumentStore
from ragas.llms import  LangchainLLMWrapper
from ragas.embeddings.base import BaseRagasEmbeddings, LangchainEmbeddingsWrapper


class TestsetGeneratorAzure(TestsetGenerator):
    """TestsetGenerator whose factory takes ready-made LangChain models
    (for example Azure OpenAI clients) instead of building them itself."""

    @classmethod
    def with_openai(
            cls,
            generator_llm: Union[BaseLLM, BaseChatModel],
            critic_llm: Union[BaseLLM, BaseChatModel],
            embeddings,
            docstore: Optional[DocumentStore] = None,
            chunk_size: int = 1024
    ):
        """Build a generator from caller-supplied LangChain models.

        Args:
            generator_llm: LangChain LLM/chat model used to generate test data.
            critic_llm: LangChain LLM/chat model used as the critic.
            embeddings: LangChain embeddings object; wrapped for Ragas here.
            docstore: Existing document store to use. When ``None``, an
                in-memory store is created with a token splitter and a
                keyphrase extractor driven by ``generator_llm``.
            chunk_size: Chunk size for the token splitter of the default
                document store; ignored when ``docstore`` is given.

        Returns:
            An instance of ``cls`` wired to the wrapped models and the store.
        """
        generator_llm_model = LangchainLLMWrapper(generator_llm)
        critic_llm_model = LangchainLLMWrapper(critic_llm)
        embeddings_model = LangchainEmbeddingsWrapper(embeddings)

        if docstore is None:
            from langchain.text_splitter import TokenTextSplitter

            # Default store: token-based chunks with no overlap.
            docstore = InMemoryDocumentStore(
                splitter=TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=0),
                embeddings=embeddings_model,
                extractor=KeyphraseExtractor(llm=generator_llm_model),
            )

        # Both cases (default or caller-supplied store) construct the
        # generator the same way.
        return cls(
            generator_llm=generator_llm_model,
            critic_llm=critic_llm_model,
            embeddings=embeddings_model,
            docstore=docstore,
        )

Ragas is a powerful library that lets us evaluate our RAG pipeline by collecting input/output/context triplets and obtaining metrics relating to a number of different aspects of our RAG pipeline.



The RAGAS evaluation framework evaluates the two main components of the RAG pipeline:
Retriever
Generator


The metrics associated with evaluating retrieval are as follows:
Context Precision : How relevant is the context retrieved to the question asked.
Context Recall : Is the retriever able to retrieve all of the relevant context pertaining to ground truth.

The metrics associated with evaluating generation are as follows:

Answer Relevancy : How relevant is the answer to our initial question
Faithfulness : It tries to measure the factual consistency of the generated answers against the given context.

In [None]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context

# `llm` generates the questions; `llm4` acts as the critic.
generator = TestsetGeneratorAzure.with_openai(
    generator_llm=llm,
    critic_llm=llm4,
    embeddings=embeddings,
)

# Generate 4 test samples from the split documents, with question types drawn
# using the weights below (which sum to 1.0).
testset = generator.generate_with_langchain_docs(
    texts,
    test_size=4,
    distributions={
        simple: 0.2,
        reasoning: 0.4,
        multi_context: 0.4,
    },
)

In [None]:
import pandas as pd
# Convert the generated test set into a pandas DataFrame for inspection and reuse.
test_df = testset.to_pandas()

Generating Responses with RAG Pipeline

Now that we have some QC pairs and some ground truths, let’s evaluate our RAG pipeline using Ragas. The process is, again, quite straightforward, thanks to Ragas and LangChain! Let’s start by extracting our questions and ground truths from the test set we created. We can start by converting our test dataset into a Pandas DataFrame.

In [None]:
#test_df.to_csv('testset_AUP_15.csv')

In [None]:
# Reuse a previously saved test set when one exists so the evaluation is
# repeatable. Otherwise keep the test set generated above and save it, so the
# notebook also runs from a fresh checkout where the CSV is not yet present.
testset_csv_path = 'testset_AUP_15.csv'
if os.path.exists(testset_csv_path):
    test_df = pd.read_csv(testset_csv_path)
else:
    test_df.to_csv(testset_csv_path, index=False)

test_questions = test_df["question"].values.tolist()
test_groundtruths = test_df["ground_truth"].values.tolist()
test_df

Naive RAG

In [None]:
#Generate responses using Naive RAG pipeline using the questions we’ve generated.
from datasets import Dataset

# Collect the naive chain's answer and retrieved contexts for every test
# question. These lists carry the `naive_` prefix so they cannot be confused
# with the advanced pipeline's `adv_` lists.
naive_answers = []
naive_contexts = []

for question in test_questions:
    response = qa_naive.invoke({"query" : question})
    naive_answers.append(response["result"])
    naive_contexts.append([context.page_content for context in response['source_documents']])

# Wrap into a Hugging Face dataset with the columns passed to evaluate() below.
response_dataset_naive_retrieval = Dataset.from_dict({
    "question" : test_questions,
    "answer" : naive_answers,
    "contexts" : naive_contexts,
    "ground_truth" : test_groundtruths
})
response_dataset_naive_retrieval[0]

Advanced RAG

In [None]:
#Generate responses using Advanced RAG pipeline using the questions we’ve generated.
from datasets import Dataset

# Run every test question through the advanced chain, one call per question
# and in question order, then pull out the answers and retrieved contexts.
adv_responses = [qa_advanced.invoke({"query" : query}) for query in test_questions]
adv_answers = [reply["result"] for reply in adv_responses]
adv_contexts = [
    [source.page_content for source in reply['source_documents']]
    for reply in adv_responses
]

# Wrap into a Hugging Face dataset with the columns passed to evaluate() below.
response_dataset_advanced_retrieval = Dataset.from_dict(
    {
        "question" : test_questions,
        "answer" : adv_answers,
        "contexts" : adv_contexts,
        "ground_truth" : test_groundtruths,
    }
)
response_dataset_advanced_retrieval[0]

In [None]:
#!pip install mlflow
import mlflow

# Select, by name, the MLflow experiment that the evaluation runs below log to.
experiment_name = "ragas_metrices"
mlflow.set_experiment(experiment_name)

In [None]:
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    answer_correctness,
    context_recall,
    context_precision,
)

# Score the naive pipeline's responses and log the metrics to an MLflow run.
with mlflow.start_run(run_name="Naive_Rag_re") as run:
    metrics = [
        faithfulness,
        answer_relevancy,
        context_recall,
        context_precision,
        answer_correctness,
    ]

    # raise_exceptions=False: evaluation continues past per-sample failures
    # instead of aborting. llm4 and the embeddings are passed explicitly as
    # the models used for scoring.
    results_naive = evaluate(
        response_dataset_naive_retrieval,
        metrics,
        llm=llm4,
        embeddings=embeddings,
        raise_exceptions=False,
    )

    mlflow.log_metrics(results_naive)

# The with-block presumably already closes the run; this extra call is kept
# as in the original.
mlflow.end_run()


#


In [None]:
results_naive

In [None]:
# Score the advanced pipeline's responses and log the metrics to an MLflow run.
with mlflow.start_run(run_name="Advanced_Rag_re") as run:
    metrics = [
        faithfulness,
        answer_relevancy,
        context_recall,
        context_precision,
        answer_correctness,
    ]

    # raise_exceptions=False: evaluation continues past per-sample failures
    # instead of aborting. llm4 and the embeddings are passed explicitly as
    # the models used for scoring.
    results_advanced = evaluate(
        response_dataset_advanced_retrieval,
        metrics,
        llm=llm4,
        embeddings=embeddings,
        raise_exceptions=False,
    )

    mlflow.log_metrics(results_advanced)

# The with-block presumably already closes the run; this extra call is kept
# as in the original.
mlflow.end_run()

In [None]:
results_advanced

In [None]:
# Save the per-sample scores of both evaluations. The evaluation results are
# named results_naive and results_advanced (there is no plain `results`
# variable), and each is converted to a DataFrame before being written as CSV.
results_naive.to_pandas().to_csv('Naive_rag_results.csv')
results_advanced.to_pandas().to_csv('Adv_rag_results.csv')