In [114]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [115]:
import sys
if "../src" not in sys.path:
    sys.path.append("../src")
import settings
import json
from typing import List
import utils
import os

from retriever import Retriever
from reranker import Reranker
from chatbot import Chatbot
from langchain_openai import ChatOpenAI
from rag_evaluator import RAGEvaluator

# Named logger for this notebook, obtained from the project's utils helper.
logger = utils.get_logger("evaluate")

# Directory holding the saved chatbot responses (qa_*.json) that the evaluation cells load.
LOGS_DIR = "logs/"
# Directory that receives the metric JSON files written by save_result / save_json.
RESULTS_DIR = "results/"

# Collects every computed metric by name; written to results.json in the last cell.
results = {}

In [116]:
def save_json(filename: str, obj: dict) -> None:
    """Serialize ``obj`` to ``filename`` as pretty-printed, key-sorted JSON.

    Non-ASCII characters are written verbatim (``ensure_ascii=False``), so the
    file is opened explicitly as UTF-8 rather than with the platform's default
    encoding, which may be unable to represent them.

    Args:
        filename: Destination path; an existing file is overwritten.
        obj: JSON-serializable object to write.
    """
    with open(filename, "w", encoding="utf-8") as file:
        json.dump(obj, file, indent=4, ensure_ascii=False, sort_keys=True)

def load_json(filename) -> List[dict]:
    """Read and parse a UTF-8 encoded JSON file.

    The file is decoded explicitly as UTF-8 so that non-ASCII text loads
    correctly regardless of the platform's default encoding.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The parsed JSON content.
    """
    with open(filename, "r", encoding="utf-8") as file:
        return json.load(file)

def save_result(name: str, value: float):
    """Write a single metric to ``<RESULTS_DIR>/<name>.json``.

    The file holds one JSON object mapping ``name`` to ``value``, indented by
    four spaces. An existing file with the same name is overwritten.

    Args:
        name: Metric name; used both as the file stem and as the JSON key.
        value: Metric value to store.
    """
    with open(f"{RESULTS_DIR}/{name}.json", "w") as handle:
        handle.write(json.dumps({name: value}, indent=4))

def get_chatbot_responses(chatbot: Chatbot, questions: List[str], rewrite_query: bool = False) -> List[dict]:
    """Run the chatbot on each question and collect one record per question.

    Args:
        chatbot: Chatbot instance to query; it is run once per question.
        questions: Questions to ask, processed in order.
        rewrite_query: Forwarded unchanged to ``chatbot.run``.

    Returns:
        A list, in question order, of dicts with the keys ``user_input``,
        ``response``, ``retrieved_contexts`` and ``retrieved_ids``.
    """

    def _ask(question: str) -> dict:
        # The getters are read right after run(), before the next question is asked.
        chatbot.run(query=question, rewrite_query=rewrite_query)
        return {
            "user_input": question,
            "response": chatbot.get_response(),
            "retrieved_contexts": chatbot.get_context(),
            "retrieved_ids": chatbot.get_documents_ids(),
        }

    return [_ask(question) for question in questions]

Load questions

In [117]:
# Load the evaluation questions plus the out-of-domain sanity-check questions
# and answers that serve as negative controls in the sections below.
questions = load_json("data/questions.json")
sanity_check_questions = load_json("data/sanity_check_questions.json")
sanity_check_answers = load_json("data/sanity_check_answers.json")

# The mappings pair entries by position, so each sanity-check list must cover
# every question; fail here with a clear message rather than a bare IndexError.
assert len(sanity_check_questions) >= len(questions), "not enough sanity check questions"
assert len(sanity_check_answers) >= len(questions), "not enough sanity check answers"

# question -> sanity-check question / answer found at the same index.
question_mapping = {question: sanity_check_questions[i] for i, question in enumerate(questions)}
answers_mapping = {question: sanity_check_answers[i] for i, question in enumerate(questions)}

#### Run chatbot with retriever only, save results

In [118]:
# retriever = Retriever(k=5)
# reranker = Reranker()
# chatbot_llm = ChatOpenAI(model=settings.OPENAI_MODEL, temperature=0.00000001, top_p=1)
# chatbot = Chatbot(retriever=retriever, reranker=reranker, llm=chatbot_llm)

In [119]:
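# Run chatbot with retriever only (disabled: the evaluation below loads the saved qa_retriever.json instead of querying the chatbot again)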
# chatbot.rerank = False
# responses = get_chatbot_responses(chatbot=chatbot, questions=questions)
# save_json(f"{LOGS_DIR}/qa_retriever.json", responses)

#### Run chatbot with reranker, save results

In [120]:
# retriever = Retriever(k=15)
# reranker = Reranker()
# chatbot_llm = ChatOpenAI(model=settings.OPENAI_MODEL, temperature=0.00000001, top_p=1)
# chatbot = Chatbot(retriever=retriever, reranker=reranker, llm=chatbot_llm)

In [121]:
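# Run chatbot with reranker (disabled: the evaluation below loads the saved qa_reranker.json instead of querying the chatbot again)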
# responses = get_chatbot_responses(chatbot=chatbot, questions=questions)
# save_json(f"{LOGS_DIR}/qa_reranker.json", responses)

### 1. Retriever evaluation

In [122]:
k = 5

# Main evaluator, fed with the responses saved from the retriever-only run.
evaluator = RAGEvaluator(k=k)
data = load_json(f"{LOGS_DIR}/qa_retriever.json")
evaluator.load_data(data)

# Sanity-check evaluator: same k, but pointed at its own logs/results folders.
sanity_check_evaluator = RAGEvaluator(k=k)
sanity_check_evaluator.logs_dir = os.path.join(settings.BASE_DIR, "test", "logs", "sanity_check")
sanity_check_evaluator.results_dir = os.path.join(settings.BASE_DIR, "test", "results", "sanity_check")

# Sanity-check data: a fresh load of the same log in which every user query is
# swapped for its random, out-of-domain sanity-check question.
sanity_check_data = [
    {**entry, "user_input": question_mapping[entry["user_input"]]}
    for entry in load_json(f"{LOGS_DIR}/qa_retriever.json")
]
sanity_check_evaluator.load_data(sanity_check_data)

#### 1.1 Retriever MAP@k

In [123]:
# MAP@k (Mean Average Precision) from `evaluator`, which this section's setup cell
# loads from qa_retriever.json; kept in `results` for the final results.json dump.
retriever_map_at_k = evaluator.compute_map_at_k()
results["retriever_map_at_k"] = retriever_map_at_k
print(f"Retriever MAP@{k}: {retriever_map_at_k:.4f}")
# save_result(f"retriever_map_at_{k}", retriever_map_at_k)

Retriever MAP@5: 0.7213


Sanity check:

In [124]:
# Negative control: the same MAP@k computation on the sanity-check evaluator, whose
# user queries were swapped for out-of-domain questions in this section's setup cell.
sanity_check_retriever_map_at_k = sanity_check_evaluator.compute_map_at_k()
results["sanity_check_retriever_map_at_k"] = sanity_check_retriever_map_at_k
print(f"Sanity check retriever MAP@{k}: {sanity_check_retriever_map_at_k:.4f}")
# save_result(f"sanity_check_retriever_map_at_{k}", sanity_check_retriever_map_at_k)

Sanity check retriever MAP@5: 0.0000


#### 1.2 Retriever MRR@k

In [125]:
# MRR@k (Mean Reciprocal Rank) from `evaluator`, which this section's setup cell
# loads from qa_retriever.json; kept in `results` for the final results.json dump.
retriever_mrr_at_k = evaluator.compute_mrr_at_k()
results["retriever_mrr_at_k"] = retriever_mrr_at_k
print(f"Retriever MRR@{k}: {retriever_mrr_at_k:.4f}")
# save_result(f"retriever_mrr_at_{k}", retriever_mrr_at_k)

Retriever MRR@5: 0.7354


Sanity check:

In [126]:
# Negative control: the same MRR@k computation on the sanity-check evaluator, whose
# user queries were swapped for out-of-domain questions in this section's setup cell.
sanity_check_retriever_mrr_at_k = sanity_check_evaluator.compute_mrr_at_k()
results["sanity_check_retriever_mrr_at_k"] = sanity_check_retriever_mrr_at_k
print(f"Sanity check Retriever MRR@{k}: {sanity_check_retriever_mrr_at_k:.4f}")
# save_result(f"sanity_check_retriever_mrr_at_{k}", sanity_check_retriever_mrr_at_k)

Sanity check Retriever MRR@5: 0.0000


### 2. Reranker evaluation

In [127]:
k = 5

# Main evaluator, fed with the responses saved from the reranker run.
evaluator = RAGEvaluator(k=k)
data = load_json(f"{LOGS_DIR}/qa_reranker.json")
evaluator.load_data(data)

# Sanity-check evaluator: same k, but pointed at its own logs/results folders.
sanity_check_evaluator = RAGEvaluator(k=k)
sanity_check_evaluator.logs_dir = os.path.join(settings.BASE_DIR, "test", "logs", "sanity_check")
sanity_check_evaluator.results_dir = os.path.join(settings.BASE_DIR, "test", "results", "sanity_check")

# Sanity-check data: a fresh load of the same log in which every user query is
# swapped for its random, out-of-domain sanity-check question.
sanity_check_data = [
    {**entry, "user_input": question_mapping[entry["user_input"]]}
    for entry in load_json(f"{LOGS_DIR}/qa_reranker.json")
]
sanity_check_evaluator.load_data(sanity_check_data)

#### 2.1 Reranker MAP@k

In [128]:
# MAP@k (Mean Average Precision) from `evaluator`, which this section's setup cell
# loads from qa_reranker.json; kept in `results` for the final results.json dump.
reranker_map_at_k = evaluator.compute_map_at_k()
results["reranker_map_at_k"] = reranker_map_at_k
print(f"Reranker MAP@{k}: {reranker_map_at_k:.4f}")
# save_result(f"reranker_map_at_{k}", reranker_map_at_k)

Reranker MAP@5: 0.8292


Sanity check:

In [129]:
# Negative control: the same MAP@k computation on the sanity-check evaluator, whose
# user queries were swapped for out-of-domain questions in this section's setup cell.
sanity_check_reranker_map_at_k = sanity_check_evaluator.compute_map_at_k()
results["sanity_check_reranker_map_at_k"] = sanity_check_reranker_map_at_k
print(f"Sanity check Reranker MAP@{k}: {sanity_check_reranker_map_at_k:.4f}")
# save_result(f"sanity_check_reranker_map_at_{k}", sanity_check_reranker_map_at_k)

Sanity check Reranker MAP@5: 0.0000


#### 2.2 Reranker MRR@k

In [130]:
# MRR@k (Mean Reciprocal Rank) from `evaluator`, which this section's setup cell
# loads from qa_reranker.json; kept in `results` for the final results.json dump.
reranker_mrr_at_k = evaluator.compute_mrr_at_k()
results["reranker_mrr_at_k"] = reranker_mrr_at_k
print(f"Reranker MRR@{k}: {reranker_mrr_at_k:.4f}")
# save_result(f"reranker_mrr_at_{k}", reranker_mrr_at_k)

Reranker MRR@5: 0.8604


Sanity check:

In [131]:
# Negative control: the same MRR@k computation on the sanity-check evaluator, whose
# user queries were swapped for out-of-domain questions in this section's setup cell.
sanity_check_reranker_mrr_at_k = sanity_check_evaluator.compute_mrr_at_k()
results["sanity_check_reranker_mrr_at_k"] = sanity_check_reranker_mrr_at_k
print(f"Sanity check Reranker MRR@{k}: {sanity_check_reranker_mrr_at_k:.4f}")
# save_result(f"sanity_check_reranker_mrr_at_{k}", sanity_check_reranker_mrr_at_k)

Sanity check Reranker MRR@5: 0.0000


### 3. End-to-end system evaluation
Prompts translated and adapted from the [RAGAs paper](http://arxiv.org/abs/2309.15217).

#### 3.1 GPT Score - Faithfulness

In [132]:
k = 5

# Main evaluator, fed with the responses saved from the reranker run.
evaluator = RAGEvaluator(k=k)
data = load_json(f"{LOGS_DIR}/qa_reranker.json")
evaluator.load_data(data)

# Sanity-check evaluator: same k, but pointed at its own logs/results folders.
sanity_check_evaluator = RAGEvaluator(k=k)
sanity_check_evaluator.logs_dir = os.path.join(settings.BASE_DIR, "test", "logs", "sanity_check")
sanity_check_evaluator.results_dir = os.path.join(settings.BASE_DIR, "test", "results", "sanity_check")

# Sanity-check data: a fresh load of the same log in which every chatbot answer
# is swapped for the random, out-of-domain sanity-check answer of its question.
sanity_check_data = [
    {**entry, "response": answers_mapping[entry["user_input"]]}
    for entry in load_json(f"{LOGS_DIR}/qa_reranker.json")
]
sanity_check_evaluator.load_data(sanity_check_data)

In [133]:
# Faithfulness GPT score from `evaluator`, which this section's setup cell loads
# from qa_reranker.json; kept in `results` for the final results.json dump.
faithfulness_gpt_score = evaluator.compute_faithfulness_gpt_score()
results["faithfulness_gpt_score"] = faithfulness_gpt_score
print(f"Faithfulness GPT Score: {faithfulness_gpt_score:.4f}")
# save_result("gptscore_faithfulness", faithfulness_gpt_score)

Faithfulness GPT Score: 0.9450


Sanity check:

In [134]:
# Negative control: the same faithfulness GPT score on the sanity-check evaluator, whose
# chatbot answers were swapped for out-of-domain answers in this section's setup cell.
sanity_check_faithfulness_gpt_score = sanity_check_evaluator.compute_faithfulness_gpt_score()
results["sanity_check_faithfulness_gpt_score"] = sanity_check_faithfulness_gpt_score
print(f"Sanity Check Faithfulness GPT Score: {sanity_check_faithfulness_gpt_score:.4f}")
# save_result("sanity_check_gptscore_faithfulness", sanity_check_faithfulness_gpt_score)

Sanity Check Faithfulness GPT Score: 0.0000


#### 3.2 GPT Score - Answer relevance

In [135]:
# Answer-relevance GPT score from `evaluator`, which the setup cell of section 3.1
# loads from qa_reranker.json; kept in `results` for the final results.json dump.
answer_relevance_gpt_score = evaluator.compute_answer_relevance_gpt_score()
results["answer_relevance_gpt_score"] = answer_relevance_gpt_score
print(f"Answer relevance GPT Score: {answer_relevance_gpt_score:.4f}")
# save_result("gptscore_answer_relevance", answer_relevance_gpt_score)

Answer relevance GPT Score: 0.8125


Sanity check:

In [136]:
# Negative control: the same answer-relevance GPT score on the sanity-check evaluator,
# whose chatbot answers were swapped for out-of-domain answers in section 3.1's setup cell.
sanity_check_answer_relevance_gpt_score = sanity_check_evaluator.compute_answer_relevance_gpt_score()
results["sanity_check_answer_relevance_gpt_score"] = sanity_check_answer_relevance_gpt_score
print(f"Sanity check Answer relevance GPT Score: {sanity_check_answer_relevance_gpt_score:.4f}")
# save_result("sanity_check_gptscore_answer_relevance", sanity_check_answer_relevance_gpt_score)

Sanity check Answer relevance GPT Score: 0.0000


#### 3.3 RAGAs - Faithfulness

In [137]:
# RAGAs faithfulness from `evaluator`; the evaluator method is awaited, so this
# cell relies on the notebook's support for top-level `await`.
faithfulness_ragas = await evaluator.compute_faithfulness_ragas()
results["faithfulness_ragas"] = faithfulness_ragas
print(f"Faithfulness RAGAs: {faithfulness_ragas:.4f}")
# save_result("ragas_faithfulness", faithfulness_ragas)

Error saving prompts: The file '/home/ana/ACS/rag/test/prompts/faithfulness_n_l_i_statement_prompt_romanian.json' already exists.
All entries already evaluated.
Faithfulness RAGAs: 0.8155


Sanity check:

In [138]:
# Negative control: the same RAGAs faithfulness on the sanity-check evaluator, whose
# chatbot answers were swapped for out-of-domain answers in section 3.1's setup cell.
sanity_check_faithfulness_ragas = await sanity_check_evaluator.compute_faithfulness_ragas()
results["sanity_check_faithfulness_ragas"] = sanity_check_faithfulness_ragas
print(f"Sanity check Faithfulness RAGAs: {sanity_check_faithfulness_ragas:.4f}")
# save_result("sanity_check_ragas_faithfulness", sanity_check_faithfulness_ragas)

Error saving prompts: The file '/home/ana/ACS/rag/test/prompts/faithfulness_n_l_i_statement_prompt_romanian.json' already exists.
All entries already evaluated.
Sanity check Faithfulness RAGAs: 0.0000


#### 3.4 RAGAs - Answer relevance

In [139]:
# RAGAs answer relevance from `evaluator`; the evaluator method is awaited, so this
# cell relies on the notebook's support for top-level `await`.
answer_relevance_ragas = await evaluator.compute_answer_relevance_ragas()
results["answer_relevance_ragas"] = answer_relevance_ragas
print(f"Answer relevance RAGAs: {answer_relevance_ragas:.4f}")
# save_result("ragas_answer_relevance", answer_relevance_ragas)

Error saving prompts: The file '/home/ana/ACS/rag/test/prompts/answer_relevancy_response_relevance_prompt_romanian.json' already exists.
All entries already evaluated.
Answer relevance RAGAs: 0.6590


Sanity check:

In [140]:
# Negative control: the same RAGAs answer relevance on the sanity-check evaluator, whose
# chatbot answers were swapped for out-of-domain answers in section 3.1's setup cell.
sanity_check_answer_relevance_ragas = await sanity_check_evaluator.compute_answer_relevance_ragas()
results["sanity_check_answer_relevance_ragas"] = sanity_check_answer_relevance_ragas
print(f"Sanity check Answer relevance RAGAs: {sanity_check_answer_relevance_ragas:.4f}")
# save_result("sanity_check_ragas_answer_relevance", sanity_check_answer_relevance_ragas)

Error saving prompts: The file '/home/ana/ACS/rag/test/prompts/answer_relevancy_response_relevance_prompt_romanian.json' already exists.
All entries already evaluated.
Sanity check Answer relevance RAGAs: 0.1057


In [141]:
# Persist every metric collected in `results` as a single JSON file under RESULTS_DIR.
save_json(f"{RESULTS_DIR}/results.json", results)