In [14]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [16]:
import sys
if "../src" not in sys.path:
    sys.path.append("../src")
import settings
import json
from typing import List
import utils

from retriever import Retriever
from reranker import Reranker
from chatbot import Chatbot
from langchain_openai import ChatOpenAI
from rag_evaluator import RAGEvaluator

# Logger obtained from the project's utils helper under the name "evaluate".
logger = utils.get_logger("evaluate")

# Output directories, given as relative paths (resolved against the current
# working directory). Both already end in "/", and the cells/functions that use
# them insert another "/" when building file names, so the resulting paths
# contain a doubled separator.
LOGS_DIR = "logs/"
RESULTS_DIR = "results/"

In [17]:
def save_json(filename: str, obj: dict) -> None:
    """Serialize ``obj`` to ``filename`` as JSON indented by four spaces.

    The file is opened as UTF-8 explicitly. Because ``ensure_ascii=False``
    writes non-ASCII characters verbatim, relying on the platform's default
    encoding could raise ``UnicodeEncodeError`` or produce a file that other
    tools decode incorrectly.

    Args:
        filename: Destination path; an existing file is overwritten.
        obj: JSON-serializable object. Annotated as ``dict``, but any value
            ``json.dump`` accepts (e.g. a list of dicts) works.
    """
    with open(filename, "w", encoding="utf-8") as file:
        json.dump(obj, file, indent=4, ensure_ascii=False)

def load_json(filename: str) -> List[dict]:
    """Read ``filename`` and return the parsed JSON document.

    The file is decoded as UTF-8 explicitly so that non-ASCII content is read
    the same way on every platform, independent of the locale's default
    encoding.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file; annotated as a list of
        dicts.
    """
    with open(filename, "r", encoding="utf-8") as file:
        return json.load(file)

def save_result(name: str, value: float) -> None:
    """Write a single metric to ``<RESULTS_DIR>/<name>.json``.

    The file contains one JSON object mapping ``name`` to ``value``.

    Args:
        name: Metric name; used both as the file stem and as the JSON key.
        value: Metric value to store.
    """
    # Local import: the notebook's import cell does not bring ``os`` into scope.
    import os

    # Create the results directory on first use instead of failing with
    # FileNotFoundError when it does not exist yet.
    os.makedirs(RESULTS_DIR, exist_ok=True)
    # os.path.join avoids the doubled separator that "results/" + "/" produced.
    path = os.path.join(RESULTS_DIR, f"{name}.json")
    with open(path, "w", encoding="utf-8") as f:
        json.dump({name: value}, f, indent=4)

def get_chatbot_responses(chatbot: Chatbot, questions: List[str], rewrite_query: bool = False) -> List[dict]:
    """Run every question through ``chatbot`` and collect one record per question.

    For each question the chatbot is run first, then its answer, context and
    document ids are read back through its getter methods.

    Args:
        chatbot: Chatbot instance to query.
        questions: Questions to ask, processed in order.
        rewrite_query: Forwarded unchanged to ``chatbot.run``.

    Returns:
        A list, in input order, of dicts with the keys ``user_input``,
        ``response``, ``retrieved_contexts`` and ``retrieved_ids``.
    """

    def _ask(query_text: str) -> dict:
        # The run must happen before the getters are called: they report the
        # state left behind by the most recent run.
        chatbot.run(query=query_text, rewrite_query=rewrite_query)
        return {
            "user_input": query_text,
            "response": chatbot.get_response(),
            "retrieved_contexts": chatbot.get_context(),
            "retrieved_ids": chatbot.get_documents_ids(),
        }

    return [_ask(query_text) for query_text in questions]

#### Load questions

In [18]:
# Load the evaluation questions from a JSON file (path is relative to the
# current working directory). A plain string literal is used because the path
# has no placeholders to interpolate.
questions = load_json("data/questions.json")

#### Run chatbot with retriever only, save results

In [9]:
# Build the pipeline for the retriever-only run.
# k=5 equals the k used by the evaluation cells further down; presumably it is
# the number of documents the retriever returns — TODO confirm against Retriever.
retriever = Retriever(k=5)
# A reranker is still constructed because Chatbot is given one; the next cell
# sets chatbot.rerank = False before running.
reranker = Reranker()
# NOTE(review): temperature is a tiny positive value rather than 0, presumably
# to get near-deterministic answers — confirm why exactly 0 is not used.
chatbot_llm = ChatOpenAI(model=settings.OPENAI_MODEL, temperature=0.00000001, top_p=1)
chatbot = Chatbot(retriever=retriever, reranker=reranker, llm=chatbot_llm)

In [None]:
# Run chatbot with retriever only
# Presumably switches off the reranking step — verify against Chatbot.
chatbot.rerank = False
# One record per question: input, answer, retrieved contexts and ids.
responses = get_chatbot_responses(chatbot=chatbot, questions=questions)
# Persist the records; the retriever evaluation section loads this file.
save_json(f"{LOGS_DIR}/qa_retriever.json", responses)

#### Run chatbot with reranker, save results

In [19]:
# Rebuild the pipeline for the run with reranking. These names overwrite the
# objects created for the retriever-only run.
# k=15 here versus k=5 in the retriever-only setup; presumably a larger
# candidate pool for the reranker to choose from — TODO confirm.
retriever = Retriever(k=15)
reranker = Reranker()
# Same LLM settings as in the retriever-only setup.
chatbot_llm = ChatOpenAI(model=settings.OPENAI_MODEL, temperature=0.00000001, top_p=1)
chatbot = Chatbot(retriever=retriever, reranker=reranker, llm=chatbot_llm)

In [60]:
# Run chatbot with reranker
# NOTE(review): chatbot.rerank is not set in this cell, so the run relies on
# whatever value the current `chatbot` object has — presumably the default of
# a freshly constructed Chatbot enables reranking; confirm.
responses = get_chatbot_responses(chatbot=chatbot, questions=questions)
# Persist the records; the reranker and end-to-end evaluation sections load this file.
save_json(f"{LOGS_DIR}/qa_reranker.json", responses)

### 1. Retriever evaluation

In [19]:
# Cut-off used for the @k metrics in this section; also interpolated into the
# printed metric labels.
k = 5
evaluator = RAGEvaluator(k=k)
# Evaluate the responses saved by the retriever-only run.
data = load_json(f"{LOGS_DIR}/qa_retriever.json")
evaluator.load_data(data)

#### 1.1 Retriever MAP@k

In [20]:
# MAP@k over the data currently loaded into `evaluator` (retriever-only run).
retriever_map_at_k = evaluator.compute_map_at_k()
print(f"Retriever MAP@{k}: {retriever_map_at_k:.4f}")
# Uncomment to persist the metric as a JSON file under RESULTS_DIR:
# save_result(f"retriever_map_at_{k}", retriever_map_at_k)

Retriever MAP@5: 0.7273


#### 1.2 Retriever MRR@k

In [21]:
# MRR@k over the data currently loaded into `evaluator` (retriever-only run).
retriever_mrr_at_k = evaluator.compute_mrr_at_k()
print(f"Retriever MRR@{k}: {retriever_mrr_at_k:.4f}")
# Uncomment to persist the metric as a JSON file under RESULTS_DIR:
# save_result(f"retriever_mrr_at_{k}", retriever_mrr_at_k)

Retriever MRR@5: 0.7149


### 2. Reranker evaluation

In [None]:
# Cut-off used for the @k metrics in this section; also interpolated into the
# printed metric labels.
k = 5
# A fresh evaluator replaces the one that held the retriever-only data.
evaluator = RAGEvaluator(k=k)
# Evaluate the responses saved by the run with reranking.
data = load_json(f"{LOGS_DIR}/qa_reranker.json")
evaluator.load_data(data)

#### 2.1 Reranker MAP@k

In [9]:
# MAP@k over the data currently loaded into `evaluator` (run with reranking).
reranker_map_at_k = evaluator.compute_map_at_k()
print(f"Reranker MAP@{k}: {reranker_map_at_k:.4f}")
# Uncomment to persist the metric as a JSON file under RESULTS_DIR:
# save_result(f"reranker_map_at_{k}", reranker_map_at_k)

Reranker MAP@5: 0.8094


#### 2.2 Reranker MRR@k

In [10]:
# MRR@k over the data currently loaded into `evaluator` (run with reranking).
reranker_mrr_at_k = evaluator.compute_mrr_at_k()
print(f"Reranker MRR@{k}: {reranker_mrr_at_k:.4f}")
# Uncomment to persist the metric as a JSON file under RESULTS_DIR:
# save_result(f"reranker_mrr_at_{k}", reranker_mrr_at_k)

Reranker MRR@5: 0.8421


### 3. End-to-end system evaluation
Prompts translated and adapted from the [RAGAs paper](http://arxiv.org/abs/2309.15217).

#### 3.1 GPT Score - Faithfulness

In [None]:
# End-to-end evaluation uses the responses saved by the run with reranking,
# i.e. the same file as the reranker evaluation section. The evaluator is
# rebuilt here so this section does not depend on that section having been run.
k = 5
evaluator = RAGEvaluator(k=k)
data = load_json(f"{LOGS_DIR}/qa_reranker.json")
evaluator.load_data(data)

In [12]:
# Faithfulness GPT score over the data currently loaded into `evaluator`.
faithfulness_gpt_score = evaluator.compute_faithfulness_gpt_score()
print(f"Faithfulness GPT Score: {faithfulness_gpt_score:.4f}")
# Uncomment to persist the metric as a JSON file under RESULTS_DIR:
# save_result("gptscore_faithfulness", faithfulness_gpt_score)

Faithfulness GPT Score: 0.9053


#### 3.2 GPT Score - Answer relevance

In [13]:
# Answer-relevance GPT score over the data currently loaded into `evaluator`.
answer_relevance_gpt_score = evaluator.compute_answer_relevance_gpt_score()
print(f"Answer relevance GPT Score: {answer_relevance_gpt_score:.4f}")
# Uncomment to persist the metric as a JSON file under RESULTS_DIR:
# save_result("gptscore_answer_relevance", answer_relevance_gpt_score)

Answer relevance GPT Score: 0.8579


#### 3.3 RAGAs - Faithfulness

In [None]:
# RAGAs faithfulness over the data currently loaded into `evaluator`. The
# method is awaited, so this cell relies on the notebook's support for
# top-level `await`; it is not valid in a plain Python script as written.
faithfulness_ragas = await evaluator.compute_faithfulness_ragas()
print(f"Faithfulness RAGAs: {faithfulness_ragas:.4f}")
# Uncomment to persist the metric as a JSON file under RESULTS_DIR:
# save_result("ragas_faithfulness", faithfulness_ragas)

#### 3.4 RAGAs - Answer relevance

In [None]:
# RAGAs answer relevance over the data currently loaded into `evaluator`. The
# method is awaited, so this cell relies on the notebook's support for
# top-level `await`; it is not valid in a plain Python script as written.
answer_relevance_ragas = await evaluator.compute_answer_relevance_ragas()
print(f"Answer relevance RAGAs: {answer_relevance_ragas:.4f}")
# Uncomment to persist the metric as a JSON file under RESULTS_DIR:
# save_result("ragas_answer_relevance", answer_relevance_ragas)