In [1]:
%load_ext autoreload
%autoreload 2

In [1]:
import sys
if "../src" not in sys.path:
    sys.path.append("../src")
import settings
import json
import hashlib
from typing import List
import utils
import os

from retriever import Retriever
from reranker import Reranker
from chatbot import Chatbot
from langchain_openai import ChatOpenAI
from rag_evaluator import RAGEvaluator
import time

# Logger for this notebook, obtained from the project's `utils` helper.
logger = utils.get_logger("evaluate")

# Passed as `k` to every Chatbot and RAGEvaluator created below
# (presumably the number of retrieved documents - confirm against those classes).
k = 5
# Metric scores gathered during evaluation: results[model_name][metric_name] -> score.
results = {}

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def save_json(filename: str, obj: object) -> None:
    """Write ``obj`` to ``filename`` as indented, key-sorted JSON.

    Args:
        filename: Destination path. Missing parent directories are created
            so that a first run does not fail on a fresh checkout.
        obj: Any JSON-serializable value (this notebook passes both dicts
            and lists of dicts).

    The file is always written as UTF-8. ``ensure_ascii=False`` emits
    non-ASCII characters verbatim, so relying on the platform's default
    encoding could fail or corrupt them on non-UTF-8 locales.
    """
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(filename, "w", encoding="utf-8") as file:
        json.dump(obj, file, indent=4, ensure_ascii=False, sort_keys=True)

def load_json(filename) -> List[dict]:
    """Read and parse the JSON document stored at ``filename``.

    The file is decoded explicitly as UTF-8 (the encoding ``save_json``
    output and JSON in general are expected to use) instead of the
    platform's default encoding.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The parsed JSON value. If the content is not valid JSON (an empty
        file included), an empty dict is returned instead.

    Raises:
        FileNotFoundError: If ``filename`` does not exist. Only malformed
            content is tolerated; a missing file is left for the caller
            to handle.
    """
    try:
        with open(filename, "r", encoding="utf-8") as f:
            return json.load(f)
    except json.JSONDecodeError:
        return {}

def compute_entry_id(entry) -> str:
    """Derive a stable SHA-256 identifier for one question/answer record.

    The digest covers the user query, the generated answer and the retrieved
    document ids. The ids are sorted first, so the identifier does not depend
    on the order in which documents were retrieved.

    Args:
        entry: Mapping providing the keys ``user_input``, ``response`` and
            ``retrieved_ids``; any other keys are ignored.

    Returns:
        Hexadecimal SHA-256 digest of the compact, key-sorted JSON encoding
        of those three fields.
    """
    question = entry["user_input"]
    reply = entry["response"]
    document_ids = sorted(entry["retrieved_ids"])

    canonical = json.dumps(
        {"query": question, "answer": reply, "context": document_ids},
        sort_keys=True,
        separators=(",", ":"),
    )

    digest = hashlib.sha256()
    digest.update(canonical.encode("utf-8"))
    return digest.hexdigest()

def get_chatbot_responses(
    chatbot: Chatbot,
    questions: List[str],
    filename: str,
    rewrite_query: bool = False,
    delay_seconds: float = 10,
) -> None:
    """Run the chatbot on every not-yet-answered question and log the results.

    Records already stored in ``filename`` are kept, and questions whose
    ``user_input`` is already present there are skipped, so an interrupted
    run can simply be started again.

    Args:
        chatbot: Chatbot to query. It is driven through ``run`` and then read
            back through ``get_response``, ``get_context``,
            ``get_documents_ids`` and ``get_retrieved_scores``.
        questions: Questions to ask.
        filename: JSON log that is both the resume source and the output.
        rewrite_query: Forwarded unchanged to ``chatbot.run``.
        delay_seconds: Pause inserted between two consecutive chatbot calls
            (presumably to stay under provider rate limits - confirm).
            No pause is made before the first call or after the last one.

    The log is written even when a chatbot call raises, so the answers
    collected up to that point are not lost; the exception still propagates.
    """
    # Fail fast on an unwritable location, before any API call is paid for.
    output_dir = os.path.dirname(filename)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # A missing log only means that nothing has been answered yet.
    try:
        responses = load_json(filename) or []
    except FileNotFoundError:
        responses = []

    answered = {entry["user_input"] for entry in responses}
    pending = [question for question in questions if question not in answered]

    try:
        for index, question in enumerate(pending):
            if index > 0 and delay_seconds > 0:
                time.sleep(delay_seconds)

            start_time = time.perf_counter()
            chatbot.run(query=question, rewrite_query=rewrite_query)
            elapsed_time = time.perf_counter() - start_time

            response = {
                "user_input": question,
                "response": chatbot.get_response(),
                "retrieved_contexts": chatbot.get_context(),
                "retrieved_ids": chatbot.get_documents_ids(),
                "retrieved_scores": chatbot.get_retrieved_scores(),
                "response_time": elapsed_time,
            }
            response["id"] = compute_entry_id(response)
            responses.append(response)
    finally:
        # Persist whatever was gathered, including after a mid-run failure.
        save_json(filename, responses)

In [3]:
# Evaluation questions, read from a path relative to the notebook's working directory.
questions = load_json("data/questions.json")

#### Run chatbot with gpt-4o-mini, save results

In [None]:
# Assemble the chatbot from a retriever, a reranker and the gpt-4o-mini chat model;
# sampling parameters come from the project settings.
retriever = Retriever()
reranker = Reranker()
chatbot_llm = ChatOpenAI(model="gpt-4o-mini", temperature=settings.TEMPERATURE, top_p=settings.TOP_P)
chatbot = Chatbot(retriever=retriever, reranker=reranker, llm=chatbot_llm, k=k)

# Run chatbot with reranker; answers are logged to (and resumed from) the qa.json file.
get_chatbot_responses(chatbot=chatbot, questions=questions, filename="logs/gpt-4o-mini/qa.json")

#### Run chatbot with deepseek, save results

In [16]:
LOGS_DIR = "logs/deepseek"

retriever = Retriever()
reranker = Reranker()
chatbot_llm = ChatOpenAI(
    model="deepseek-chat",
    temperature=settings.TEMPERATURE,
    top_p=settings.TOP_P,
    # Credentials must never live in the notebook: the key is read from the
    # environment (KeyError if DEEPSEEK_API_KEY is unset). Any key that was
    # previously committed in this cell must be treated as leaked and revoked.
    openai_api_key=os.environ["DEEPSEEK_API_KEY"],
    # DeepSeek is reached through its OpenAI-compatible endpoint.
    openai_api_base="https://api.deepseek.com",
)
chatbot = Chatbot(retriever=retriever, reranker=reranker, llm=chatbot_llm, k=k)

# Answers are logged to (and resumed from) LOGS_DIR/qa.json.
get_chatbot_responses(chatbot=chatbot, questions=questions, filename=f"{LOGS_DIR}/qa.json")

#### Run chatbot with Gemma

In [53]:
from google import genai

class GemmaLLM:
    """Small adapter that serves a google-genai model through ``invoke``.

    ``invoke`` takes a prompt and returns an object exposing the generated
    text as ``content`` (presumably so the adapter can be passed as ``llm``
    wherever the chatbot expects a LangChain-style chat model - confirm
    against ``Chatbot``).
    """

    def __init__(self, model: str, temperature: float, api_key: str = None):
        """Create the API client.

        Args:
            model: Model name sent with every generation request.
            temperature: Sampling temperature sent with every request.
            api_key: API key for the client. When omitted it is read from the
                ``GOOGLE_API_KEY`` or ``GEMINI_API_KEY`` environment variable,
                so no credential has to be stored in the notebook. Any key
                previously committed here must be treated as leaked and revoked.

        Raises:
            ValueError: If no key is passed and neither variable is set.
        """
        if api_key is None:
            api_key = os.environ.get("GOOGLE_API_KEY") or os.environ.get("GEMINI_API_KEY")
        if not api_key:
            raise ValueError(
                "No API key available: pass api_key or set GOOGLE_API_KEY / GEMINI_API_KEY."
            )

        self.client = genai.Client(api_key=api_key)
        self.model_name = model
        self.temperature = temperature

    class LLMResponse:
        """Container exposing the generated text as ``content``."""

        def __init__(self, content: str):
            self.content = content

    def invoke(self, prompt: str) -> 'GemmaLLM.LLMResponse':
        """Send ``prompt`` to the model and wrap the returned text.

        Args:
            prompt: Text passed as the request contents.

        Returns:
            An ``LLMResponse`` whose ``content`` is the response's ``text``.
        """
        response = self.client.models.generate_content(
            model=self.model_name,
            contents=prompt,
            config=genai.types.GenerateContentConfig(
                temperature=self.temperature
            )
        )

        # NOTE(review): `response.text` may be None when the API returns no
        # text - confirm how the chatbot handles that case.
        return self.LLMResponse(response.text)

In [63]:
# Model served through the GemmaLLM adapter; the name also selects the log directory below.
model_name = "gemma-3n-e4b-it"

retriever = Retriever()
reranker = Reranker()
# Unlike the ChatOpenAI-based runs, only the temperature is forwarded here (no top_p).
chatbot_llm = GemmaLLM(model_name, temperature=settings.TEMPERATURE)
chatbot = Chatbot(retriever=retriever, reranker=reranker, llm=chatbot_llm, k=k)

# Run chatbot with reranker
get_chatbot_responses(chatbot=chatbot, questions=questions, filename=f"logs/{model_name}/qa.json")

### Evaluate responses

## gpt-4o-mini

In [None]:
# Model whose logged answers are evaluated in this section.
model_name = "gpt-4o-mini"
evaluator = RAGEvaluator(k=k)
# Point the evaluator at this model's log and result directories under <BASE_DIR>/test.
evaluator.logs_dir = os.path.join(settings.BASE_DIR, "test", f"logs/{model_name}")
evaluator.results_dir = os.path.join(settings.BASE_DIR, "test", f"results/{model_name}")

# Load the question/answer records saved for this model (path relative to the working directory).
data = load_json(f"logs/{model_name}/qa.json")
evaluator.load_data(data)

# Start from an empty score table for this model; re-running this cell discards earlier scores.
results[model_name] = {}

- GPT Score - Faithfulness

In [None]:
# Compute the evaluator's GPT-score faithfulness metric, record it under the
# current model in `results`, and print it with four decimals.
faithfulness_gpt_score = evaluator.compute_faithfulness_gpt_score()
results[model_name]["faithfulness_gpt_score"] = faithfulness_gpt_score
print(f"[{model_name}] faithfulness GPT Score: {faithfulness_gpt_score:.4f}")

[gpt-4o-mini] faithfulness GPT Score: 0.8400


- GPT Score - Answer Relevance

In [None]:
# Compute the evaluator's GPT-score answer-relevance metric, record it under
# the current model in `results`, and print it with four decimals.
answer_relevance_gpt_score = evaluator.compute_answer_relevance_gpt_score()
results[model_name]["answer_relevance_gpt_score"] = answer_relevance_gpt_score
print(f"[{model_name}] answer relevance GPT Score: {answer_relevance_gpt_score:.4f}")

- RAGAs - Faithfulness

In [None]:
# Await the evaluator's RAGAs faithfulness metric (top-level `await` relies on
# the notebook's event loop), record it under the current model, and print it.
faithfulness_ragas = await evaluator.compute_faithfulness_ragas()
results[model_name]["faithfulness_ragas"] = faithfulness_ragas
print(f"[{model_name}] Faithfulness RAGAs: {faithfulness_ragas:.4f}")

- RAGAs - Answer Relevance

In [None]:
# Await the evaluator's RAGAs answer-relevance metric (top-level `await` relies
# on the notebook's event loop), record it under the current model, and print it.
answer_relevance_ragas = await evaluator.compute_answer_relevance_ragas()
results[model_name]["answer_relevance_ragas"] = answer_relevance_ragas
print(f"[{model_name}] Answer relevance RAGAs: {answer_relevance_ragas:.4f}")

## deepseek

In [24]:
# Model whose logged answers are evaluated in this section. It is defined
# first so that every path below is derived from it instead of repeating the
# literal name.
model_name = "deepseek"
evaluator = RAGEvaluator(k=k)
# Point the evaluator at this model's log and result directories under <BASE_DIR>/test.
evaluator.logs_dir = os.path.join(settings.BASE_DIR, "test", f"logs/{model_name}")
evaluator.results_dir = os.path.join(settings.BASE_DIR, "test", f"results/{model_name}")

# Load the question/answer records saved for this model (path relative to the working directory).
data = load_json(f"logs/{model_name}/qa.json")
evaluator.load_data(data)

# Start from an empty score table for this model; re-running this cell discards earlier scores.
results[model_name] = {}

- GPT Score - Faithfulness

In [None]:
# Compute the evaluator's GPT-score faithfulness metric, record it under the
# current model in `results`, and print it with four decimals.
faithfulness_gpt_score = evaluator.compute_faithfulness_gpt_score()
results[model_name]["faithfulness_gpt_score"] = faithfulness_gpt_score
print(f"[{model_name}] faithfulness GPT Score: {faithfulness_gpt_score:.4f}")

[deepseek] faithfulness GPT Score: 0.9375


- GPT Score - Answer Relevance

In [26]:
# Compute the evaluator's GPT-score answer-relevance metric, record it under
# the current model in `results`, and print it with four decimals.
answer_relevance_gpt_score = evaluator.compute_answer_relevance_gpt_score()
results[model_name]["answer_relevance_gpt_score"] = answer_relevance_gpt_score
print(f"[{model_name}] answer relevance GPT Score: {answer_relevance_gpt_score:.4f}")

[deepseek] answer relevance GPT Score: 0.7650


- RAGAs - Faithfulness

In [29]:
# Await the evaluator's RAGAs faithfulness metric (top-level `await` relies on
# the notebook's event loop), record it under the current model, and print it.
faithfulness_ragas = await evaluator.compute_faithfulness_ragas()
results[model_name]["faithfulness_ragas"] = faithfulness_ragas
print(f"[{model_name}] Faithfulness RAGAs: {faithfulness_ragas:.4f}")

Error saving prompts: The file '/home/ana/ACS/rag/test/prompts/faithfulness_n_l_i_statement_prompt_romanian.json' already exists.
All entries already evaluated.
[deepseek] Faithfulness RAGAs: 0.7523


- RAGAs - Answer Relevance

In [31]:
# Await the evaluator's RAGAs answer-relevance metric (top-level `await` relies
# on the notebook's event loop), record it under the current model, and print it.
answer_relevance_ragas = await evaluator.compute_answer_relevance_ragas()
results[model_name]["answer_relevance_ragas"] = answer_relevance_ragas
print(f"[{model_name}] Answer relevance RAGAs: {answer_relevance_ragas:.4f}")

Error saving prompts: The file '/home/ana/ACS/rag/test/prompts/answer_relevancy_response_relevance_prompt_romanian.json' already exists.
All entries already evaluated.
[deepseek] Answer relevance RAGAs: 0.5138


In [35]:
# Persist only this model's scores. `results` is shared by every evaluation
# section of the notebook, so dumping it whole would copy whichever other
# models were evaluated in the same kernel session into this model's file.
save_json(f"results/{model_name}/results.json", {model_name: results[model_name]})

## gemma-3n-e4b-it

In [4]:
# Model whose logged answers are evaluated in this section.
model_name = "gemma-3n-e4b-it"
evaluator = RAGEvaluator(k=k)
# Point the evaluator at this model's log and result directories under <BASE_DIR>/test.
evaluator.logs_dir = os.path.join(settings.BASE_DIR, "test", f"logs/{model_name}")
evaluator.results_dir = os.path.join(settings.BASE_DIR, "test", f"results/{model_name}")

# Load the question/answer records saved for this model (path relative to the working directory).
data = load_json(f"logs/{model_name}/qa.json")
evaluator.load_data(data)

# Start from an empty score table for this model; re-running this cell discards earlier scores.
results[model_name] = {}

  self.embedding_model = LangchainEmbeddingsWrapper(HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"))


- GPT Score - Faithfulness

In [5]:
# Compute the evaluator's GPT-score faithfulness metric, record it under the
# current model in `results`, and print it with four decimals.
faithfulness_gpt_score = evaluator.compute_faithfulness_gpt_score()
results[model_name]["faithfulness_gpt_score"] = faithfulness_gpt_score
print(f"[{model_name}] faithfulness GPT Score: {faithfulness_gpt_score:.4f}")

[gemma-3n-e4b-it] faithfulness GPT Score: 0.8400


- GPT Score - Answer Relevance

In [6]:
# Compute the evaluator's GPT-score answer-relevance metric, record it under
# the current model in `results`, and print it with four decimals.
answer_relevance_gpt_score = evaluator.compute_answer_relevance_gpt_score()
results[model_name]["answer_relevance_gpt_score"] = answer_relevance_gpt_score
print(f"[{model_name}] answer relevance GPT Score: {answer_relevance_gpt_score:.4f}")

[gemma-3n-e4b-it] answer relevance GPT Score: 0.6300


- RAGAs - Faithfulness

In [10]:
# Await the evaluator's RAGAs faithfulness metric (top-level `await` relies on
# the notebook's event loop), record it under the current model, and print it.
faithfulness_ragas = await evaluator.compute_faithfulness_ragas()
results[model_name]["faithfulness_ragas"] = faithfulness_ragas
print(f"[{model_name}] Faithfulness RAGAs: {faithfulness_ragas:.4f}")

Error saving prompts: The file '/home/ana/ACS/rag/test/prompts/faithfulness_n_l_i_statement_prompt_romanian.json' already exists.
All entries already evaluated.
[gemma-3n-e4b-it] Faithfulness RAGAs: 0.6643


- RAGAs - Answer Relevance

In [14]:
# Await the evaluator's RAGAs answer-relevance metric (top-level `await` relies
# on the notebook's event loop), record it under the current model, and print it.
answer_relevance_ragas = await evaluator.compute_answer_relevance_ragas()
results[model_name]["answer_relevance_ragas"] = answer_relevance_ragas
print(f"[{model_name}] Answer relevance RAGAs: {answer_relevance_ragas:.4f}")

Error saving prompts: The file '/home/ana/ACS/rag/test/prompts/answer_relevancy_response_relevance_prompt_romanian.json' already exists.
All entries already evaluated.
[gemma-3n-e4b-it] Answer relevance RAGAs: 0.5124


In [15]:
# Persist only this model's scores. `results` is shared by every evaluation
# section of the notebook, so dumping it whole would copy whichever other
# models were evaluated in the same kernel session into this model's file.
save_json(f"results/{model_name}/results.json", {model_name: results[model_name]})