# Demonstration RAG-Eval


Quick test: see if things work as planned

In [24]:
from rag_eval.protocol import LLMClient
from rag_eval.client import AzureClient
from rag_eval.components import ClaimExtractor, EntailmentJudge
from rag_eval.data_models import EvalContainer, ClaimVerdict
from rag_eval.evaluators import AnswerPrecisionEvaluator, RetrievalPrecisionEvaluator
from rag_eval.coordinator import RAGEvaluator
from rag_eval.helpers import _safe_json
import json
from data.rag_eval_samples import test_examples
from openai import AzureOpenAI
from dotenv import load_dotenv
import os

In [25]:

api_key = os.getenv("OPENAI_API_KEY")
api_version = os.getenv("OPENAI_API_VERSION")
base_url = os.getenv("OPENAI_CHATCOMPLETIONS_URL")
model = os.getenv("OPENAI_DEPLOYMENT")
password = os.getenv("CORRECT_PASSWORD")

llm = AzureClient(
    api_key=api_key,
    api_version=api_version,
    base_url=base_url,
    model=model
)


# Test Agent

Check if agent module actually works.

In [26]:
# load agent and pipeline for indexing docs
import agent.main as main
from rag.indexing import pdf_files, index_pipeline

In [27]:
index_pipeline.run({"converter": {"sources": pdf_files}})

incorrect startxref pointer(1)
parsing for Object Streams
incorrect startxref pointer(1)
parsing for Object Streams
incorrect startxref pointer(1)
parsing for Object Streams
incorrect startxref pointer(1)
parsing for Object Streams
incorrect startxref pointer(1)
parsing for Object Streams
incorrect startxref pointer(1)
parsing for Object Streams
incorrect startxref pointer(1)
parsing for Object Streams
Batches: 100%|██████████| 2/2 [00:00<00:00,  2.65it/s]
ID '6813de646a27f38897ceead174c0e733a49594824454166a61a9058a4eb27340' already exists
ID '55ea35af218d8f635cc1f164380ae56bd9ab934880719bd83532052a312b01d8' already exists
ID 'fa907c2693059c9626593992e357cafda724bc44afba9d0d7b86565380415d76' already exists
ID '8db7c025cfc37b01c412b348b9371a6a3259c15b4fe7b957fff2e749af64d3f1' already exists
ID '17f5daa9f6e8c9fbca59050f0cc03b48124ae7aca7bfe5dc101b3e92ef5fafc7' already exists
ID '9d17892efc6f60f289cc10133c069fcc3d75225288d139060930cbf5a20d7b0b' already exists
ID 'cc3053f925e71ae06c2e20569

{'writer': {'documents_written': 0}}

## RAG-Loop
Loop which passes questions to agent and extracts generated answers alongside retrieved contexts (plain text). Combines them with ground truth and query to form `EvalContainer`-object.

In [28]:
# quick test of my query_gt_extractor function
from src.rag_eval.helpers import query_gt_extractor

queries, gt = query_gt_extractor(path="data/agent_eval_questions.json", tokenize=False)
print(gt)


["Bei Il sogno de Anton ist der Espresso teurer: Anton's Pizza Espresso 2,00 € vs. Il sogno de Anton Espresso 3,50 €. ", "Risotto gibt es bei Anton's Pizza (z. B. ai Funghi 11,50 €, Meeresfrüchte 14,00 €) und bei Il sogno de Anton (al Tartufo Nero 28,00 €). Damit ist das günstigste Risotto 11,50 € (Anton’s), das teuerste 28,00 € (Il sogno). ", 'Französisch: mehrgängige Struktur (Entrée, Plat principal, Fromage, Dessert) mit Weinbegleitung aus Regionen wie Bordeaux/Burgund/Champagne; Italienisch: Antipasti → Primo (Pasta/Risotto) → Secondo → Contorno → Dolce mit begleitenden Weinen. ', "UNESCO 2010: das 'Gastronomische Mahl der Franzosen' (immaterielles Kulturerbe) und die mexikanische Küche (als erste Küche überhaupt immaterielles Kulturerbe). Beide 2010, aber unterschiedliche Gegenstände der Würdigung. ", "Mexikos 'heilige Fünf': Mais, Bohnen, Chili (u. a.); Italiens Grundpfeiler: z. B. Pizza und Pasta/Olivenöl-Wein-Getreide-Tradition. ", 'Prosecco (Il sogno de Anton) 50,00 € (0,75 l)

This code cell creates a RAG-loop for the agent. It:  
- feeds the questions from the test battery to the agent  
- extracts the generated answer and retrieved context from the RAG-call
- returns a list of `EvalContainer`-objects storing question, ground truth, generated answer and retrieved context

It thus lays the foundation for the `RAGEevaluator`. 

In [29]:
from src.rag_eval.eval_loop import rag_loop_agent

# commented out because I save the resulting data object to avoid frequent execution
# result = rag_loop_agent()

In [30]:
# show eval containers
#print(result)

import pickle

# save result
#with open("result.pkl", "wb") as f:
#    pickle.dump(result, f)

# access pickled result
with open("result.pkl", "rb") as f:
    result = pickle.load(f)

data = result


In [31]:
print(data)

[EvalContainer(query="In welchem Restaurant kostet der Espresso mehr: bei Anton's Pizza oder bei Il sogno de Anton, und wie hoch sind die jeweiligen Preise?", ground_truth_answer="Bei Il sogno de Anton ist der Espresso teurer: Anton's Pizza Espresso 2,00 € vs. Il sogno de Anton Espresso 3,50 €. ", generated_answer="Im bereitgestellten Kontext ist kein Preis für einen Espresso bei Anton's Pizza oder bei Il sogno de Anton aufgeführt. Die Getränkekarten und Preise listen zwar verschiedene Speisen und Getränke auf, einen Espresso oder dessen Preis findet man jedoch nicht in den genannten Auszügen. Daher kann leider nicht beantwortet werden, wie viel ein Espresso bei diesen Restaurants kostet.", retrieved_texts=["Anton's Pizza - Doggish Delights\nVorspeisen\nBruschetta mit Tomaten und Basilikum - 5,50 €\nKnoblauchbrot mit Kräuterbutter - 4,00 €\nCaprese (Mozzarella, Tomate, Basilikum) - 6,50 €\nFrittierte Mozzarella-Sticks - 6,00 €\nPizza\nMargherita (Tomate, Mozzarella, Basilikum) - 8,50 €

## Evaluation of Agentic RAG with RAGEvaluator


In [32]:
# initiate llm 
api_key = os.getenv("OPENAI_API_KEY")
api_version = os.getenv("OPENAI_API_VERSION")
base_url = os.getenv("OPENAI_CHATCOMPLETIONS_URL")
model = os.getenv("OPENAI_DEPLOYMENT")
password = os.getenv("CORRECT_PASSWORD")

llm = AzureClient(
    api_key=api_key,
    api_version=api_version,
    base_url=base_url,
    model=model
)


# run evaluator
coord = RAGEvaluator(llm_extractor=llm, llm_judge=llm)

In [33]:
# create EvalContainer-object for each question-ground truth-pair 
#from src.rag_eval.eval_loop import rag_loop_agent
import pandas as pd
#result = rag_loop_agent()

# initiate llm 
api_key = os.getenv("OPENAI_API_KEY")
api_version = os.getenv("OPENAI_API_VERSION")
base_url = os.getenv("OPENAI_CHATCOMPLETIONS_URL")
model = os.getenv("OPENAI_DEPLOYMENT")
password = os.getenv("CORRECT_PASSWORD")

llm = AzureClient(
    api_key=api_key,
    api_version=api_version,
    base_url=base_url,
    model=model
)


# run evaluator to create metrics: answer_precision, answer_recall, retrieval_precision, retrieval_recall
coord = RAGEvaluator(llm_extractor=llm, llm_judge=llm)

# # run rag_evaluation and extract metrics from it 

# answer_recall = []
# answer_precision = []
# retrieval_recall = []
# retrieval_precision = []

# for i, res in enumerate(data):
#     #iteration tracker
#     print(f"=== ITERATION-No. {i} ===")

#     result = coord.evaluate_all(res)

#     # extracting all variables needed
#     answer_recall.append(result["answer_quality"]["answer_recall"])
#     answer_precision.append(result["answer_quality"]["answer_precision"])
#     retrieval_recall.append(result["retrieval_quality"]["retrieval_recall"])
#     retrieval_precision.append(result["retrieval_quality"]["retrieval_precision"])


In [34]:
#print(answer_recall)

In [35]:
# turn data into df for visualisation
# df_data_agentic = {
#     "answer_recall": answer_recall,
#     "answer_precision": answer_precision,
#     "retrieval_recall": retrieval_recall,
#     "retrieval_precision": retrieval_precision
# }

# df_agentic = pd.DataFrame(df_data_agentic)

# print(df_agentic)

# df_agentic.to_csv("df_agentic.csv", index=True)


In [36]:
df_agentic = pd.read_csv("df_agentic.csv")
df_agentic.describe()

Unnamed: 0.1,Unnamed: 0,answer_recall,answer_precision,retrieval_recall,retrieval_precision
count,29.0,29.0,29.0,29.0,29.0
mean,14.0,0.515148,0.347249,0.581281,0.165517
std,8.514693,0.314428,0.320997,0.313003,0.097379
min,0.0,0.0,0.0,0.0,0.0
25%,7.0,0.333333,0.090909,0.4,0.1
50%,14.0,0.5,0.263158,0.5,0.2
75%,21.0,0.714286,0.5,0.833333,0.2
max,28.0,1.0,1.0,1.0,0.4


## Evaluation Plain RAG

In [37]:
from src.rag_eval.data_models import EvalContainer
from src.rag.retrieval import basic_rag
from src.rag_eval.helpers import query_gt_extractor

queries, gts = query_gt_extractor(path="data/agent_eval_questions.json", tokenize=False)

# initiate lists to store EvalContainer-Ojbects
eval_conts = []
predicted_answers = []
retrieved_contexts = []

for idx, query in enumerate(queries):

    # run RAG
    result = basic_rag.run({"query_embedder":{"text": query}, 
                            "prompt_builder":{"question": query}}, 
                            include_outputs_from="retriever")
    
    # store generated answer
    predicted_answers.append(result["llm"]["replies"][0])
    
    # for each question, store content from Document-object in list
    docs = result["retriever"]["documents"]
    retrieved_contexts.append([d.content for d in docs])

    # extract what is needed for EvalContainer-object
    query = queries[idx]
    ground_truth_answer = gts[idx]
    retrieved_texts = retrieved_contexts[idx]
    generated_answer = predicted_answers[idx]

    eval_cont = EvalContainer(query=query,
                                ground_truth_answer=ground_truth_answer,
                                generated_answer=generated_answer,
                                retrieved_texts=retrieved_texts # BUGFIX: retrieved_texts MUST be list, otherwise RetrievalPrecisionEvaluator will treat each letter of retrieved_texts as a doc_text
                                )
    
    eval_conts.append(eval_cont)
    


Batches: 100%|██████████| 1/1 [00:00<00:00, 31.78it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 21.58it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 31.13it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 15.20it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 29.44it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 63.68it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 13.05it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 21.14it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 44.21it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 37.57it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 21.61it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 20.99it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 20.44it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 16.97it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 20.07it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 30.41it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 284.30it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 27.25it/s]
Batches: 

In [38]:
print(retrieved_texts[0])

Tönis Tacos
Vorspeisen
Guacamole mit Tortilla-Chips - 5,50 €
Quesadillas mit Käse und Jalapeños - 6,00 €
Mexikanische Bohnen-Suppe - 4,50 €
Elote – gegrillter Mais mit Limette und Käse - 5,00 €
Tacos
Carne Asada Taco (Rindfleisch, Zwiebeln, Koriander) - 3,50 €
Pollo Taco (gegrilltes Hähnchen, Pico de Gallo) - 3,50 €
Veggie Taco (gegrilltes Gemüse, Avocado) - 


In [39]:
print(eval_conts[0])

EvalContainer(query="In welchem Restaurant kostet der Espresso mehr: bei Anton's Pizza oder bei Il sogno de Anton, und wie hoch sind die jeweiligen Preise?", ground_truth_answer="Bei Il sogno de Anton ist der Espresso teurer: Anton's Pizza Espresso 2,00 € vs. Il sogno de Anton Espresso 3,50 €. ", generated_answer="Die Menüauszüge im Kontext zeigen die Getränkepreise für Anton's Pizza und Chez Anton – Délices Canins, aber es gibt keine explizite Preisinformation zu „Espresso“ bei Anton's Pizza und auch nicht zu „Espresso“ oder „Café Espresso“ bei Il sogno de Anton (in den gegebenen Kontextauszügen wird Il sogno de Anton nicht explizit erwähnt, es tauchen nur Anton's Pizza und Chez Anton auf).\n\nDa im Kontext keine Preise für Espresso genannt werden, kann die Frage mit den verfügbaren Informationen nicht eindeutig beantwortet werden.\n\n**Antwort:**  \nIm bereitgestellten Kontext ist kein Espresso-Preis für Anton's Pizza noch für Il sogno de Anton angegeben, daher kann nicht festgestell

In [40]:
import pandas as pd
import importlib
import src.rag_eval.evaluators as evaluators
import rag_eval.coordinator as coordinator

importlib.reload(evaluators)  # force reimport after changes
importlib.reload(coordinator)


# initiate llm 
api_key = os.getenv("OPENAI_API_KEY")
api_version = os.getenv("OPENAI_API_VERSION")
base_url = os.getenv("OPENAI_CHATCOMPLETIONS_URL")
model = os.getenv("OPENAI_DEPLOYMENT")
password = os.getenv("CORRECT_PASSWORD")

llm = AzureClient(
    api_key=api_key,
    api_version=api_version,
    base_url=base_url,
    model=model
)


# run evaluator to create metrics: answer_precision, answer_recall, retrieval_precision, retrieval_recall
coord = RAGEvaluator(llm_extractor=llm, llm_judge=llm)

for i, res in enumerate(eval_conts):
    #iteration tracker
    print(f"=== ITERATION-No. {i+1} ===")

    result = coord.evaluate_all(res)

    # extracting all variables needed
    answer_recall.append(result["answer_quality"]["answer_recall"])
    answer_precision.append(result["answer_quality"]["answer_precision"])
    retrieval_recall.append(result["retrieval_quality"]["retrieval_recall"])
    retrieval_precision.append(result["retrieval_quality"]["retrieval_precision"])


=== ITERATION-No. 1 ===
GT Claim: Der Espresso kostet bei Anton's Pizza 2,00 €., Context: Anton's Pizza - Doggish Delights
Vorspeisen
Bruschetta mit Tomaten und Basilikum - 5,50 €
Knoblauchbrot mit Kräuterbutter - 4,00 €
Caprese (Mozzarella, Tomate, Basilikum) - 6,50 €
Frittierte Mozzarella-Sticks - 6,00 €
Pizza
Margherita (Tomate, Mozzarella, Basilikum) - 8,50 €
Doggish Special (Tomate, Mozzarella, Rinderhack, Speck, Zwiebeln) - 11,50 €
Funghi (Tomate, Mozzarella, Champignons) - 9,50 €
Veggie Delight (Tomate, 
GT Claim: Der Espresso kostet bei Il sogno de Anton 3,50 €., Context: Anton's Pizza - Doggish Delights
Vorspeisen
Bruschetta mit Tomaten und Basilikum - 5,50 €
Knoblauchbrot mit Kräuterbutter - 4,00 €
Caprese (Mozzarella, Tomate, Basilikum) - 6,50 €
Frittierte Mozzarella-Sticks - 6,00 €
Pizza
Margherita (Tomate, Mozzarella, Basilikum) - 8,50 €
Doggish Special (Tomate, Mozzarella, Rinderhack, Speck, Zwiebeln) - 11,50 €
Funghi (Tomate, Mozzarella, Champignons) - 9,50 €
Veggie Deli

NameError: name 'answer_recall' is not defined

In [None]:
    # turn data into df for visualisation
df_data_plain_rag = {
    "answer_recall": answer_recall,
    "answer_precision": answer_precision,
    "retrieval_recall": retrieval_recall,
    "retrieval_precision": retrieval_precision
}

df_plain_rag = pd.DataFrame(df_data_plain_rag)

print(df_plain_rag)
