In [212]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [282]:
import sys
if "../src" not in sys.path:
    sys.path.append("../src")
import settings
import json
import hashlib
from typing import List
import utils
import os

from retriever import Retriever
from reranker import Reranker
from chatbot import Chatbot
from langchain_openai import ChatOpenAI
from rag_evaluator import RAGEvaluator

# Logger for this notebook, obtained from the project's utils helper.
logger = utils.get_logger("evaluate")

# Cut-off passed to Retriever / Chatbot / RAGEvaluator below and used in the @k metric labels.
k = 5
# Per-k folders (relative paths) for chatbot logs and metric files.
LOGS_DIR = f"logs/k={k}"
RESULTS_DIR = f"results/k={k}"

# Accumulates metric name -> value across the cells below; re-running this cell empties it.
results = {}

In [214]:
def save_json(filename: str, obj) -> None:
    """Serialize ``obj`` to ``filename`` as indented, key-sorted JSON.

    Non-ASCII characters are written verbatim (``ensure_ascii=False``), so the
    file is always opened as UTF-8 rather than with the platform's default
    encoding. Missing parent directories are created first, so writing into a
    per-k folder that does not exist yet no longer fails.

    Args:
        filename: Destination path; an existing file is overwritten.
        obj: Any JSON-serializable value (this notebook passes dicts and lists).
    """
    parent_dir = os.path.dirname(filename)
    if parent_dir:  # a bare file name has no directory part to create
        os.makedirs(parent_dir, exist_ok=True)
    with open(filename, "w", encoding="utf-8") as file:
        json.dump(obj, file, indent=4, ensure_ascii=False, sort_keys=True)

def load_json(filename: str) -> "List[dict] | dict":
    """Read and parse a JSON file.

    The file is decoded as UTF-8 instead of the platform's default encoding,
    matching the non-ASCII text that ``json.dump(..., ensure_ascii=False)``
    produces elsewhere in this notebook.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The parsed JSON value (in this notebook usually a list of dicts), or an
        empty dict when the file content is not valid JSON. The empty dict is
        falsy, so callers can write ``load_json(path) or default``.

    Raises:
        FileNotFoundError: If ``filename`` does not exist (not swallowed here).
    """
    try:
        with open(filename, "r", encoding="utf-8") as f:
            return json.load(f)
    except json.JSONDecodeError:
        return {}

def save_result(name: str, value: float):
    """Store one metric as ``{name: value}`` in ``<RESULTS_DIR>/<name>.json``.

    Args:
        name: Metric name; used both as the JSON key and as the file stem.
        value: Metric value to record.
    """
    payload = {name: value}
    destination = f"{RESULTS_DIR}/{name}.json"
    with open(destination, "w") as handle:
        json.dump(payload, handle, indent=4)

def compute_entry_id(entry) -> str:
    """Return a SHA-256 hex digest identifying a logged chatbot entry.

    The digest is taken over a compact, key-sorted JSON encoding of the
    entry's ``user_input``, ``response`` and its ``retrieved_ids`` in sorted
    order, so the order in which ids were retrieved does not affect the result.

    Args:
        entry: Mapping with ``user_input``, ``response`` and ``retrieved_ids``.

    Returns:
        The 64-character hexadecimal digest.
    """
    canonical = json.dumps(
        {
            "query": entry["user_input"],
            "answer": entry["response"],
            "context": sorted(entry["retrieved_ids"]),
        },
        separators=(",", ":"),
        sort_keys=True,
    )
    hasher = hashlib.sha256()
    hasher.update(canonical.encode("utf-8"))
    return hasher.hexdigest()

def get_chatbot_responses(chatbot: Chatbot, questions: List[str], filename: str, rewrite_query: bool = False) -> None:
    """Run the chatbot on every question not yet logged and persist the log.

    Entries already stored in ``filename`` are kept, and any question whose
    text matches a stored ``user_input`` is skipped, so an interrupted run can
    be resumed without repeating chatbot calls. A question that appears more
    than once in ``questions`` is only run once.

    Args:
        chatbot: Object exposing ``run``, ``get_response``, ``get_context``,
            ``get_documents_ids`` and ``get_retrieved_scores``.
        questions: Questions to answer.
        filename: JSON log file to read existing entries from and write to.
        rewrite_query: Forwarded unchanged to ``chatbot.run``.
    """
    # A missing log file just means nothing has been answered yet (first run).
    try:
        responses = load_json(filename) or []
    except FileNotFoundError:
        responses = []

    # Set membership keeps the skip check cheap for long logs.
    answered = {entry["user_input"] for entry in responses}

    try:
        for question in questions:
            if question in answered:
                continue

            chatbot.run(query=question, rewrite_query=rewrite_query)
            response = {
                "user_input": question,
                "response": chatbot.get_response(),
                "retrieved_contexts": chatbot.get_context(),
                "retrieved_ids": chatbot.get_documents_ids(),
                "retrieved_scores": chatbot.get_retrieved_scores(),
            }
            response["id"] = compute_entry_id(response)
            responses.append(response)
            answered.add(question)
    finally:
        # Persist whatever was collected, even when a chatbot call failed
        # part-way, so the next run resumes instead of starting over.
        save_json(filename, responses)

Load questions

In [215]:
# Evaluation questions, read from a path relative to the working directory.
questions = load_json("data/questions.json")

#### Run chatbot with retriever only, save results

In [216]:
# Baseline pipeline: no reranker object is supplied and the rerank flag is switched off.
retriever = Retriever(k=k)
chatbot_llm = ChatOpenAI(
    model=settings.OPENAI_MODEL,
    temperature=settings.TEMPERATURE,
)
chatbot = Chatbot(retriever=retriever, reranker=None, llm=chatbot_llm, k=k)
chatbot.rerank = False

# Answer the questions and log the entries to the retriever-only file.
get_chatbot_responses(
    chatbot=chatbot,
    questions=questions,
    filename=f"{LOGS_DIR}/qa_retriever.json",
)

#### Run chatbot with reranker, save results

In [217]:
# Reranked pipeline: the retriever is asked for 15 documents while the chatbot keeps k.
# NOTE(review): presumably the reranker narrows the 15 candidates down to k; confirm in Chatbot.
retriever = Retriever(k=15)
reranker = Reranker()
chatbot_llm = ChatOpenAI(
    model=settings.OPENAI_MODEL,
    temperature=settings.TEMPERATURE,
)
chatbot = Chatbot(retriever=retriever, reranker=reranker, llm=chatbot_llm, k=k)

# Answer the questions and log the entries to the reranker file.
get_chatbot_responses(
    chatbot=chatbot,
    questions=questions,
    filename=f"{LOGS_DIR}/qa_reranker.json",
)

### 1. Retriever evaluation

In [218]:
# Evaluator for the retriever-only run, fed with the log file that run wrote to.
retriever_evaluator = RAGEvaluator(k=k)
data = load_json(f"{LOGS_DIR}/qa_retriever.json")
retriever_evaluator.load_data(data)

1.1 Retriever MAP@k

In [262]:
# MAP@k on the retriever-only log; the value is also kept in `results`.
results["retriever_map_at_k"] = retriever_map_at_k = retriever_evaluator.compute_map_at_k()
print(f"Retriever MAP@{k}: {retriever_map_at_k:.4f}")

Retriever MAP@7: 0.7151


1.2 Retriever MRR@k

In [263]:
# MRR@k on the retriever-only log; the value is also kept in `results`.
results["retriever_mrr_at_k"] = retriever_mrr_at_k = retriever_evaluator.compute_mrr_at_k()
print(f"Retriever MRR@{k}: {retriever_mrr_at_k:.4f}")

Retriever MRR@7: 0.7327


### 2. Reranker evaluation

In [264]:
# Evaluator for the reranker run, fed with the log file that run wrote to.
evaluator = RAGEvaluator(k=k)
data = load_json(f"{LOGS_DIR}/qa_reranker.json")
evaluator.load_data(data)

2.1 Reranker MAP@k

In [265]:
# MAP@k on the reranker log; the value is also kept in `results`.
results["reranker_map_at_k"] = reranker_map_at_k = evaluator.compute_map_at_k()
print(f"Reranker MAP@{k}: {reranker_map_at_k:.4f}")

Reranker MAP@7: 0.8155


2.2 Reranker MRR@k

In [266]:
# MRR@k on the reranker log; the value is also kept in `results`.
results["reranker_mrr_at_k"] = reranker_mrr_at_k = evaluator.compute_mrr_at_k()
print(f"Reranker MRR@{k}: {reranker_mrr_at_k:.4f}")

Reranker MRR@7: 0.8479


### 3. End-to-end system evaluation

#### 3.1 GPT Score - Faithfulness

Retriever-only

In [267]:
# Faithfulness GPT score for the retriever-only run; the value is also kept in `results`.
results["retriever_faithfulness_gpt_score"] = retriever_faithfulness_gpt_score = retriever_evaluator.compute_faithfulness_gpt_score()
print(f"Retriever faithfulness GPT Score: {retriever_faithfulness_gpt_score:.4f}")

Retriever faithfulness GPT Score: 0.5975


Reranker

In [268]:
# Faithfulness GPT score for the reranker run; the value is also kept in `results`.
results["faithfulness_gpt_score"] = faithfulness_gpt_score = evaluator.compute_faithfulness_gpt_score()
print(f"Faithfulness GPT Score: {faithfulness_gpt_score:.4f}")

Faithfulness GPT Score: 0.7275


#### 3.2 GPT Score - Answer relevance

Retriever-only

In [269]:
# Answer-relevance GPT score for the retriever-only run; the value is also kept in `results`.
results["retriever_answer_relevance_gpt_score"] = retriever_answer_relevance_gpt_score = retriever_evaluator.compute_answer_relevance_gpt_score()
print(f"Retriever answer relevance GPT Score: {retriever_answer_relevance_gpt_score:.4f}")

Retriever answer relevance GPT Score: 0.6025


Reranker

In [270]:
# Answer-relevance GPT score for the reranker run; the value is also kept in `results`.
results["answer_relevance_gpt_score"] = answer_relevance_gpt_score = evaluator.compute_answer_relevance_gpt_score()
print(f"Answer relevance GPT Score: {answer_relevance_gpt_score:.4f}")

Answer relevance GPT Score: 0.7050


#### 3.3 RAGAs - Faithfulness

Retriever only

In [271]:
retriever_faithfulness_ragas = await retriever_evaluator.compute_faithfulness_ragas()
results["retriever_faithfulness_ragas"] = retriever_faithfulness_ragas
print(f"Retriever faithfulness RAGAs: {retriever_faithfulness_ragas:.4f}")

Error saving prompts: The file '/home/ana/ACS/rag/test/prompts/faithfulness_n_l_i_statement_prompt_romanian.json' already exists.
All entries already evaluated.
Retriever faithfulness RAGAs: 0.7812


Reranker

In [272]:
faithfulness_ragas = await evaluator.compute_faithfulness_ragas()
results["faithfulness_ragas"] = faithfulness_ragas
print(f"Faithfulness RAGAs: {faithfulness_ragas:.4f}")

Error saving prompts: The file '/home/ana/ACS/rag/test/prompts/faithfulness_n_l_i_statement_prompt_romanian.json' already exists.
All entries already evaluated.
Faithfulness RAGAs: 0.8128


#### 3.4 RAGAs - Answer relevance

Retriever only

In [273]:
retriever_answer_relevance_ragas = await retriever_evaluator.compute_answer_relevance_ragas()
results["retriever_answer_relevance_ragas"] = retriever_answer_relevance_ragas
print(f"Retriever answer relevance RAGAs: {retriever_answer_relevance_ragas:.4f}")

Error saving prompts: The file '/home/ana/ACS/rag/test/prompts/answer_relevancy_response_relevance_prompt_romanian.json' already exists.
All entries already evaluated.
Retriever answer relevance RAGAs: 0.4724


Reranker

In [274]:
answer_relevance_ragas = await evaluator.compute_answer_relevance_ragas()
results["answer_relevance_ragas"] = answer_relevance_ragas
print(f"Answer relevance RAGAs: {answer_relevance_ragas:.4f}")

Error saving prompts: The file '/home/ana/ACS/rag/test/prompts/answer_relevancy_response_relevance_prompt_romanian.json' already exists.
All entries already evaluated.
Answer relevance RAGAs: 0.5720


In [275]:
# Write every metric collected in `results` so far to the per-k results file.
save_json(f"{RESULTS_DIR}/results.json", results)

---

### Sanity check

In [283]:
sanity_check_questions = load_json("data/sanity_check_questions.json")
sanity_check_answers = load_json("data/sanity_check_answers.json")

# Substitutes are paired with the real questions by position, so each list must
# provide at least one entry per question. Fail with a clear message instead of
# an opaque lookup error half-way through building the mappings.
if len(sanity_check_questions) < len(questions) or len(sanity_check_answers) < len(questions):
    raise ValueError(
        f"Need at least {len(questions)} sanity check questions and answers, "
        f"got {len(sanity_check_questions)} questions and {len(sanity_check_answers)} answers."
    )

# real question -> out-of-domain replacement question / answer at the same index
question_mapping = dict(zip(questions, sanity_check_questions))
answers_mapping = dict(zip(questions, sanity_check_answers))

In [284]:
# Sanity-check evaluator for the retriever log; it is pointed at its own logs/results folders.
sanity_check_evaluator = RAGEvaluator(k=k)
sanity_check_evaluator.logs_dir = os.path.join(
    settings.BASE_DIR, "test", "logs", "sanity_check"
)
sanity_check_evaluator.results_dir = os.path.join(
    settings.BASE_DIR, "test", "results", "sanity_check"
)

# Swap every logged query for its out-of-domain sanity-check question, in place.
sanity_check_data = load_json(f"{LOGS_DIR}/qa_retriever.json")
for item in sanity_check_data:
    item.update(user_input=question_mapping[item["user_input"]])
sanity_check_evaluator.load_data(sanity_check_data)

Retriever MAP@k

In [285]:
# MAP@k on the question-swapped retriever log; the value is also kept in `results`.
results["sanity_check_retriever_map_at_k"] = sanity_check_retriever_map_at_k = sanity_check_evaluator.compute_map_at_k()
print(f"Sanity check retriever MAP@{k}: {sanity_check_retriever_map_at_k:.4f}")

Sanity check retriever MAP@5: 0.0000


Retriever MRR@k

In [None]:
# MRR@k on the question-swapped retriever log; the value is also kept in `results`.
results["sanity_check_retriever_mrr_at_k"] = sanity_check_retriever_mrr_at_k = sanity_check_evaluator.compute_mrr_at_k()
print(f"Sanity check Retriever MRR@{k}: {sanity_check_retriever_mrr_at_k:.4f}")

Sanity check Retriever MRR@5: 0.0000


#### Reranker evaluation

In [287]:
# Sanity-check evaluator for the reranker log; it is pointed at its own logs/results folders.
sanity_check_evaluator = RAGEvaluator(k=k)
sanity_check_evaluator.logs_dir = os.path.join(
    settings.BASE_DIR, "test", "logs", "sanity_check"
)
sanity_check_evaluator.results_dir = os.path.join(
    settings.BASE_DIR, "test", "results", "sanity_check"
)

# Swap every logged query for its out-of-domain sanity-check question, in place.
sanity_check_data = load_json(f"{LOGS_DIR}/qa_reranker.json")
for item in sanity_check_data:
    item.update(user_input=question_mapping[item["user_input"]])
sanity_check_evaluator.load_data(sanity_check_data)

Reranker MAP@k

In [288]:
# MAP@k on the question-swapped reranker log; the value is also kept in `results`.
results["sanity_check_reranker_map_at_k"] = sanity_check_reranker_map_at_k = sanity_check_evaluator.compute_map_at_k()
print(f"Sanity check Reranker MAP@{k}: {sanity_check_reranker_map_at_k:.4f}")

Sanity check Reranker MAP@5: 0.0000


Reranker MRR@k

In [None]:
# MRR@k on the question-swapped reranker log; the value is also kept in `results`.
results["sanity_check_reranker_mrr_at_k"] = sanity_check_reranker_mrr_at_k = sanity_check_evaluator.compute_mrr_at_k()
print(f"Sanity check Reranker MRR@{k}: {sanity_check_reranker_mrr_at_k:.4f}")

Sanity check Reranker MRR@5: 0.0000


End-to-end system evaluation

Retriever only

In [290]:
# End-to-end sanity-check evaluator for the retriever-only log, with its own logs/results folders.
retriever_sanity_check_evaluator = RAGEvaluator(k=k)
retriever_sanity_check_evaluator.logs_dir = os.path.join(
    settings.BASE_DIR, "test", "logs", "sanity_check"
)
retriever_sanity_check_evaluator.results_dir = os.path.join(
    settings.BASE_DIR, "test", "results", "sanity_check"
)

# Swap every logged chatbot answer for its out-of-domain sanity-check answer, in place.
sanity_check_data = load_json(f"{LOGS_DIR}/qa_retriever.json")
for item in sanity_check_data:
    item.update(response=answers_mapping[item["user_input"]])
retriever_sanity_check_evaluator.load_data(sanity_check_data)

Reranker

In [291]:
# initialize sanity check evaluator, uses different logs and results directories
sanity_check_evaluator = RAGEvaluator(k=k)
sanity_check_evaluator.logs_dir = os.path.join(settings.BASE_DIR, "test", "logs", "sanity_check")
sanity_check_evaluator.results_dir = os.path.join(settings.BASE_DIR, "test", "results", "sanity_check")

# load data: replace chatbot answers with random, out of domain, sanity check answers.
# This is the reranker variant, so it reads the reranker log; it previously read
# qa_retriever.json, which made it a duplicate of the retriever-only sanity check.
# NOTE(review): entries keep the `id` hashed from the original response; if
# RAGEvaluator caches evaluations by id, confirm the swapped answers are re-scored.
sanity_check_data = load_json(f"{LOGS_DIR}/qa_reranker.json")
for item in sanity_check_data:
    item["response"] = answers_mapping[item["user_input"]]
sanity_check_evaluator.load_data(sanity_check_data)

#### End-to-end system

Faithfulness - GPT Score

Retriever only

In [295]:
# Faithfulness GPT score on the answer-swapped retriever-only data; the value is also kept in `results`.
results["retriever_sanity_check_faithfulness_gpt_score"] = retriever_sanity_check_faithfulness_gpt_score = retriever_sanity_check_evaluator.compute_faithfulness_gpt_score()
print(f"Retriever only sanity check Faithfulness GPT Score: {retriever_sanity_check_faithfulness_gpt_score:.4f}")

Retriever only sanity check Faithfulness GPT Score: 0.0000


Reranker

In [294]:
# Faithfulness GPT score from the current `sanity_check_evaluator`; the value is also kept in `results`.
results["sanity_check_faithfulness_gpt_score"] = sanity_check_faithfulness_gpt_score = sanity_check_evaluator.compute_faithfulness_gpt_score()
print(f"Sanity Check Faithfulness GPT Score: {sanity_check_faithfulness_gpt_score:.4f}")

Sanity Check Faithfulness GPT Score: 0.0000


Answer relevance - GPT Score

Retriever only

In [297]:
# Answer-relevance GPT score on the answer-swapped retriever-only data; the value is also kept in `results`.
results["retriever_sanity_check_answer_relevance_gpt_score"] = retriever_sanity_check_answer_relevance_gpt_score = retriever_sanity_check_evaluator.compute_answer_relevance_gpt_score()
print(f"Retriever only Sanity check Answer relevance GPT Score: {retriever_sanity_check_answer_relevance_gpt_score:.4f}")

Retriever only Sanity check Answer relevance GPT Score: 0.0000


Reranker

In [298]:
# Answer-relevance GPT score from the current `sanity_check_evaluator`; the value is also kept in `results`.
results["sanity_check_answer_relevance_gpt_score"] = sanity_check_answer_relevance_gpt_score = sanity_check_evaluator.compute_answer_relevance_gpt_score()
print(f"Sanity check Answer relevance GPT Score: {sanity_check_answer_relevance_gpt_score:.4f}")

Sanity check Answer relevance GPT Score: 0.0000


Faithfulness - RAGAs

Retriever only

In [303]:
retriever_sanity_check_faithfulness_ragas = await retriever_sanity_check_evaluator.compute_faithfulness_ragas()
results["retriever_sanity_check_faithfulness_ragas"] = retriever_sanity_check_faithfulness_ragas
print(f"Retriever only Sanity check Faithfulness RAGAs: {retriever_sanity_check_faithfulness_ragas:.4f}")

Error saving prompts: The file '/home/ana/ACS/rag/test/prompts/faithfulness_n_l_i_statement_prompt_romanian.json' already exists.
All entries already evaluated.
Retriever only Sanity check Faithfulness RAGAs: 0.0000


Reranker

In [304]:
sanity_check_faithfulness_ragas = await sanity_check_evaluator.compute_faithfulness_ragas()
results["sanity_check_faithfulness_ragas"] = sanity_check_faithfulness_ragas
print(f"Sanity check Faithfulness RAGAs: {sanity_check_faithfulness_ragas:.4f}")

Error saving prompts: The file '/home/ana/ACS/rag/test/prompts/faithfulness_n_l_i_statement_prompt_romanian.json' already exists.
All entries already evaluated.
Sanity check Faithfulness RAGAs: 0.0000


Answer relevance - RAGAs

Retriever only

In [307]:
retriever_sanity_check_answer_relevance_ragas = await retriever_sanity_check_evaluator.compute_answer_relevance_ragas()
results["retriever_sanity_check_answer_relevance_ragas"] = retriever_sanity_check_answer_relevance_ragas
print(f"Retriever only sanity check Answer relevance RAGAs: {retriever_sanity_check_answer_relevance_ragas:.4f}")

Error saving prompts: The file '/home/ana/ACS/rag/test/prompts/answer_relevancy_response_relevance_prompt_romanian.json' already exists.
All entries already evaluated.
Retriever only sanity check Answer relevance RAGAs: 0.1027


Reranker

In [308]:
sanity_check_answer_relevance_ragas = await sanity_check_evaluator.compute_answer_relevance_ragas()
results["sanity_check_answer_relevance_ragas"] = sanity_check_answer_relevance_ragas
print(f"Sanity check Answer relevance RAGAs: {sanity_check_answer_relevance_ragas:.4f}")

Error saving prompts: The file '/home/ana/ACS/rag/test/prompts/answer_relevancy_response_relevance_prompt_romanian.json' already exists.
All entries already evaluated.
Sanity check Answer relevance RAGAs: 0.1027


In [310]:
# Persist only the sanity-check metrics. `results` is shared with the main
# evaluation above, so on a top-to-bottom run it also holds those metrics;
# they are already written to the per-k results file and do not belong here.
# Every sanity-check key stored above contains "sanity_check" in its name.
sanity_check_results = {name: value for name, value in results.items() if "sanity_check" in name}
save_json("results/sanity_check/results.json", sanity_check_results)