In [None]:
import json
from dotenv import load_dotenv
from openai import OpenAI
from qdrant_client import QdrantClient
from tqdm import tqdm
from os import environ
import sys
import time
import pandas as pd

sys.path.append('..')
from composables.files import open_json_file, save_json_file
from composables.search import llm, format_hits_response
from composables.data_processing import format_list_in_batch

load_dotenv()

True

In [64]:
# --- Notebook configuration -------------------------------------------------
# Qdrant connection settings come from the environment; environ.get returns
# None when a variable is unset, so a missing .env entry is not detected here.
QDRANT_URL = environ.get('QDRANT_URL')
QDRANT_API_KEY = environ.get('QDRANT_API_KEY')
# Name of the Qdrant collection. NOTE(review): not referenced by any visible
# cell of this notebook — presumably used by the indexing/search code; confirm.
COLLECTION_NAME = 'lotr-characters'
# Embedding settings. NOTE(review): none of the EMBEDDING_* / JINA_* /
# QUERYING_TASK values are referenced by the visible cells; they look like
# leftovers from the retrieval notebook — confirm before removing.
EMBEDDING_DIMENSION = 512
JINA_EMBEDDING_MODEL = "jina-embeddings-v4"
JINA_URL = "https://api.jina.ai/v1/embeddings"
JINA_API_KEY = environ.get('JINA_API_KEY')
QUERYING_TASK = "retrieval.query"
# Chat model settings. NOTE(review): the visible cells call `llm(...)` without
# passing either value, so whether they take effect depends on
# composables.search — TODO confirm.
OPENAI_MODEL = "gpt-4o-mini"
OPENAI_TEMPERATURE = 0.5

In [65]:
# OpenAI client built with no explicit arguments, so the library resolves its
# own credentials — presumably OPENAI_API_KEY loaded by load_dotenv(); TODO confirm.
openai_client = OpenAI()
# Qdrant client bound to the URL / API key read from the environment above.
# NOTE(review): neither client is referenced by the visible cells of this notebook.
qd_client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)

In [67]:
def format_rag_prompt(query: str, search_results: list[dict[str,str]]):
    """Build the prompts sent to the answering model.

    Args:
        query: The user's question, inserted verbatim into the user prompt.
        search_results: Retrieved context; it is rendered with its default
            string formatting (for a list this is the Python repr).

    Returns:
        A ``(user_prompt, system_prompt)`` tuple of strings.
    """
    # System instructions, one entry per line. The first two lines end with a
    # trailing space that is part of the original prompt text.
    system_lines = (
        "You are a helpful lore expert on J.R.R. Tolkien's Middle-earth. ",
        "You can only answer questions about characters using the provided context retrieved from the database. ",
        "The context includes structured information such as: name, race, titles, realm, family relations, birth and death dates, and short descriptions.",
        "",
        "Guidelines:",
        "- If the answer is found in the context, respond clearly and directly.",
        "- If the answer is not in the context, say you don’t know or that the information was not provided.",
        "- Do not invent new facts outside the context.",
        "- Keep your answers concise, but include all relevant details from the context.",
        '- If the user asks for speculation (e.g., "what would happen if X met Y?"), you can summarize based only on what the context says about their traits.',
    )

    # User message: retrieved context first, then the question, then the
    # grounding instruction.
    user_lines = (
        "Context from database:",
        f"{search_results}",
        "",
        "User question:",
        f"{query}",
        "",
        "Answer the question using ONLY the context above.",
    )

    return "\n".join(user_lines).strip(), "\n".join(system_lines)

In [68]:
def format_eval_prompt (payload: dict[str,str])-> tuple[str, str]:
    """Build the prompts sent to the judge model.

    The user prompt carries a JSON object with the ``question``, ``context``
    and ``answer`` fields. Each value is serialised with ``json.dumps`` so that
    quotes, backslashes and newlines inside the text are escaped and the
    payload stays valid JSON (pasting the raw text between double quotes broke
    the object as soon as a value contained ``"`` or a line break).

    Args:
        payload: Mapping read via the ``question``, ``context`` and ``answer``
            keys. Non-string values are converted with ``str()`` first; a
            missing key is rendered as the string ``"None"``.

    Returns:
        A ``(user_prompt, system_prompt)`` tuple of strings.
    """
    def _as_json_string(value) -> str:
        # Always emit a JSON *string*; ensure_ascii=False keeps non-ASCII
        # characters (e.g. names with diacritics) readable for the model.
        text = value if isinstance(value, str) else str(value)
        return json.dumps(text, ensure_ascii=False)

    raw_user_prompt = """
Evaluate the following RAG output.
{{
  "question": {question},
  "context": {context},
  "answer": {answer}
}}
""".strip()

    system_prompt = """
You are an impartial evaluator assessing the quality of a RAG (Retrieval-Augmented Generation) system that answers questions about J.R.R. Tolkien’s Middle-earth characters.

You will receive a JSON input with the following fields:
{
  "question": "<user query>",
  "context": "<retrieved context>",
  "answer": "<model-generated answer>"
}

Your task is to evaluate how well the answer satisfies the question, using only the information in the context.

Evaluate on four criteria:
1. Relevance — Does the answer directly address the question?
2. Groundedness — Are all facts supported by the provided context (no hallucinations)?
3. Completeness — Does the answer include all key details from the context?
4. Faithfulness — Does it follow the system rules (concise, factual, no invention, admits missing info)?

Scoring Guide (0–3 for each):
- 3: Excellent — fully meets the criterion
- 2: Fair — mostly correct, minor omissions or minor unsupported detail
- 1: Weak — noticeable errors, missing or irrelevant info
- 0: None — fails completely or contradicts context

Your output must be a single valid JSON object:
{
  "relevance": <0–3>,
  "groundedness": <0–3>,
  "completeness": <0–3>,
  "faithfulness": <0–3>,
  "comments": "<1–2 sentence summary of reasoning>"
}

Output only the JSON object — no markdown, no extra text.
""".strip()
    user_prompt = raw_user_prompt.format(
        question=_as_json_string(payload.get('question')),
        context=_as_json_string(payload.get('context')),
        answer=_as_json_string(payload.get('answer')),
    ).strip()

    return user_prompt, system_prompt

In [102]:
def rag_with_search_result(data: dict, query: str):
    """Answer ``query`` from retrieval hits that were computed beforehand.

    Args:
        data: Mapping whose ``'search_results'`` entry holds the hits handed
            to ``format_hits_response``.
        query: The user's question.

    Returns:
        A ``(answer, formatted_context)`` tuple: whatever ``llm`` returns, and
        the formatted hits that were placed in the prompt.
    """
    context = format_hits_response(hits=data.get('search_results'))
    prompts = format_rag_prompt(query=query, search_results=context)
    user_prompt, system_prompt = prompts
    return llm(user_prompt=user_prompt, system_prompt=system_prompt), context

In [96]:
def rag_eval_llm_as_judge (query: str, data: dict = None):
    """Answer ``query`` from pre-computed hits and have the judge model score it.

    ``rag_with_search_result`` requires the retrieval data, so this function
    now accepts it and forwards it; calling it with ``query`` alone could
    never succeed (it raised ``TypeError`` for the missing argument).

    Args:
        query: The user's question.
        data: Mapping with a ``'search_results'`` entry, forwarded to
            ``rag_with_search_result``. Optional only to keep the original
            positional signature valid.

    Returns:
        A dict with ``question``, ``answer`` and the fields of the judge's
        JSON verdict merged in.

    Raises:
        ValueError: If ``data`` is not supplied.
        json.JSONDecodeError: If the judge returns a string that is not JSON.
    """
    if data is None:
        raise ValueError(
            "rag_eval_llm_as_judge needs `data` containing 'search_results'; "
            "rag_with_search_result cannot run from the query alone."
        )

    answer, search_result = rag_with_search_result(data=data, query=query)
    payload = {
        "question": query,
        "context": search_result,
        "answer": answer
    }
    user_prompt, system_prompt = format_eval_prompt(payload=payload)
    res = llm(user_prompt=user_prompt, system_prompt=system_prompt)

    # `llm` may hand back raw text or an already-parsed mapping; normalise to a dict.
    verdict = json.loads(res) if isinstance(res, str) else res
    return {"question": query, "answer": answer, **verdict}

In [103]:
def rag_eval_with_retrieval_results(data: dict):
    """Generate an answer for one retrieval entry and have the judge score it.

    Args:
        data: Mapping read via the ``'question'`` and ``'search_results'`` keys.

    Returns:
        A dict with ``question``, ``answer`` and the fields of the judge's
        JSON verdict merged in.

    Raises:
        json.JSONDecodeError: If the judge returns a string that is not JSON.
    """
    search_result = data.get('search_results')
    question = data.get('question')
    formatted_search_result = format_hits_response(hits=search_result)
    rag_user_prompt, rag_sys_prompt = format_rag_prompt(query=question, search_results=formatted_search_result)
    answer = llm(user_prompt=rag_user_prompt, system_prompt=rag_sys_prompt)
    payload = {
        "question": question,
        # Judge against the exact context the answering model was shown (the
        # formatted hits), not the raw hits; groundedness and completeness are
        # only meaningful relative to what the answerer actually saw. This
        # also matches rag_eval_llm_as_judge.
        "context": formatted_search_result,
        "answer": answer
    }
    eval_user_prompt, eval_sys_prompt = format_eval_prompt(payload=payload)
    res = llm(user_prompt=eval_user_prompt, system_prompt=eval_sys_prompt)

    # `llm` may hand back raw text or an already-parsed mapping; normalise to a dict.
    verdict = json.loads(res) if isinstance(res, str) else res
    return {"question": question, "answer": answer, **verdict}

In [106]:
def generate_rag_eval_result_with_retrieval_resutls(data: list[dict]):
    """Run ``rag_eval_with_retrieval_results`` over every retrieval entry.

    Args:
        data: Retrieval entries, each passed unchanged as ``data=`` to
            ``rag_eval_with_retrieval_results``.

    Returns:
        The per-entry evaluation results, in input order. Any exception raised
        for one entry propagates and discards the results gathered so far.
    """
    progress = tqdm(data, desc="Processing documents")
    return [rag_eval_with_retrieval_results(data=entry) for entry in progress]

In [97]:
def generate_rag_result(data: list=None, previous_results=None, start_index=0, requests_per_minute=400):
    """Generate RAG answers for every question, with resume support.

    Questions are numbered consecutively across all documents. On a fresh run
    pass ``start_index=0``; to resume, pass the ``(results, index)`` pair a
    previous call returned as ``previous_results`` / ``start_index``.

    Args:
        data: Documents, each read via ``obj["id"]`` and ``obj["questions"]``.
            ``None`` is treated as an empty list.
        previous_results: Results from an earlier run; new results are
            appended to this same list.
        start_index: Number of leading questions to skip (already processed).
        requests_per_minute: Rate limit used to derive the sleep between calls.

    Returns:
        ``(rag_results, current_index)`` where ``current_index`` is the flat
        index of the next question that still needs processing.
    """
    rag_results = previous_results if previous_results is not None else []
    documents = data if data is not None else []
    current_index = start_index
    # Questions still to be skipped before real work starts. A separate
    # counter is needed: comparing current_index with start_index can never
    # skip anything because current_index *starts* at start_index, which made
    # a resumed run reprocess (and duplicate) the first questions.
    remaining_to_skip = start_index

    # Calculate delay between requests to stay under rate limit
    delay_seconds = 60.0 / requests_per_minute

    try:
        for obj in tqdm(documents, desc="Processing documents"):
            doc_id = obj["id"]
            questions = obj["questions"]
            for q_idx, question in enumerate(questions):
                if remaining_to_skip > 0:
                    remaining_to_skip -= 1
                    continue

                try:
                    # NOTE(review): rag_with_search_result as defined in this
                    # notebook also requires a `data` argument with
                    # 'search_results'; this call fails until per-question
                    # retrieval data is available to pass here.
                    answer, search_result = rag_with_search_result(query=question)
                    if answer is None:
                        raise ValueError("Search returned None")
                    rag_results.append({
                        "question": question,
                        "answer": answer,
                        "search_result": search_result
                    })
                    current_index += 1

                    # Add delay to respect rate limit
                    time.sleep(delay_seconds)
                except Exception as e:
                    # Stop at the first failure and hand back enough state to resume.
                    print(f"\n❌ Error at index {current_index}")
                    print(f"   Document ID: {doc_id}")
                    print(f"   Question {q_idx + 1}/{len(questions)}: {question}")
                    print(f"   Error: {type(e).__name__}: {str(e)}")
                    print(f"\n💾 Processed {len(rag_results)} questions before failure")
                    print(f"   Returning (rag_results, {current_index}) for resume")
                    return rag_results, current_index

    except KeyboardInterrupt:
        print(f"\n⚠️  Interrupted by user at index {current_index}")
        print(f"💾 Processed {len(rag_results)} questions")
        return rag_results, current_index

    return rag_results, current_index


In [90]:
def generate_evaluation_result(data: list=None, previous_results=None, start_index=0, requests_per_minute=400):
    """Run the LLM-as-judge evaluation for every question, with resume support.

    Questions are numbered consecutively across all documents. On a fresh run
    pass ``start_index=0``; to resume, pass the ``(results, index)`` pair a
    previous call returned as ``previous_results`` / ``start_index``.

    Args:
        data: Documents, each read via ``obj["id"]`` and ``obj["questions"]``.
            ``None`` is treated as an empty list.
        previous_results: Results from an earlier run; new results are
            appended to this same list.
        start_index: Number of leading questions to skip (already processed).
        requests_per_minute: Rate limit used to derive the sleep between calls.

    Returns:
        ``(eval_results, current_index)`` where ``current_index`` is the flat
        index of the next question that still needs processing.
    """
    eval_results = previous_results if previous_results is not None else []
    documents = data if data is not None else []
    current_index = start_index
    # Questions still to be skipped before real work starts. A separate
    # counter is needed: comparing current_index with start_index can never
    # skip anything because current_index *starts* at start_index, which made
    # a resumed run reprocess (and duplicate) the first questions.
    remaining_to_skip = start_index

    # Calculate delay between requests to stay under rate limit
    delay_seconds = 60.0 / requests_per_minute

    try:
        for obj in tqdm(documents, desc="Processing documents"):
            doc_id = obj["id"]
            questions = obj["questions"]
            for q_idx, question in enumerate(questions):
                if remaining_to_skip > 0:
                    remaining_to_skip -= 1
                    continue

                try:
                    result = rag_eval_llm_as_judge(query=question)
                    if result is None:
                        raise ValueError("Search returned None")

                    eval_results.append(result)
                    current_index += 1

                    # Add delay to respect rate limit
                    time.sleep(delay_seconds)
                except Exception as e:
                    # Stop at the first failure and hand back enough state to resume.
                    print(f"\n❌ Error at index {current_index}")
                    print(f"   Document ID: {doc_id}")
                    print(f"   Question {q_idx + 1}/{len(questions)}: {question}")
                    print(f"   Error: {type(e).__name__}: {str(e)}")
                    print(f"\n💾 Processed {len(eval_results)} questions before failure")
                    print(f"   Returning (eval_results, {current_index}) for resume")
                    return eval_results, current_index

    except KeyboardInterrupt:
        print(f"\n⚠️  Interrupted by user at index {current_index}")
        print(f"💾 Processed {len(eval_results)} questions")
        return eval_results, current_index

    return eval_results, current_index


In [118]:
def analyze_evaluation_result (file_path: str):
    """Print the average judge scores stored in an evaluation results file.

    Args:
        file_path: Path passed to ``open_json_file``; the file is expected to
            hold a list of dicts with numeric ``relevance``, ``groundedness``,
            ``completeness`` and ``faithfulness`` entries.

    Returns:
        None. The rubric, the per-criterion averages and the overall average
        are printed.

    Raises:
        ValueError: If an entry lacks one of the four scores.
    """
    eval_data: list[dict] = open_json_file(file_path=file_path)
    num_entries = len(eval_data)

    # An empty file would otherwise end in a ZeroDivisionError below.
    if num_entries == 0:
        print(f"Number of entries: {num_entries}")
        print("No evaluation entries found; averages are undefined.")
        return

    criteria = ("relevance", "groundedness", "completeness", "faithfulness")
    totals = dict.fromkeys(criteria, 0)

    for position, entry in enumerate(tqdm(eval_data, desc="Processing data")):
        for criterion in criteria:
            score = entry.get(criterion)
            if score is None:
                # Name the offending entry instead of failing with an opaque
                # "unsupported operand" TypeError.
                raise ValueError(f"Entry {position} has no '{criterion}' score")
            totals[criterion] += score

    total_avg_score = sum(totals.values()) / (num_entries * len(criteria))

    print("""
Evaluate on four criteria:
1. Relevance — Does the answer directly address the question?
2. Groundedness — Are all facts supported by the provided context (no hallucinations)?
3. Completeness — Does the answer include all key details from the context?
4. Faithfulness — Does it follow the system rules (concise, factual, no invention, admits missing info)?

Scoring Guide (0–3 for each):
- 3: Excellent — fully meets the criterion
- 2: Fair — mostly correct, minor omissions or minor unsupported detail
- 1: Weak — noticeable errors, missing or irrelevant info
- 0: None — fails completely or contradicts context
""")
    print(f"Number of entries: {num_entries}")
    for criterion in criteria:
        print(f"Average {criterion.capitalize()} Score: {totals[criterion] / num_entries}")
    print(f"Total Average Score: {total_avg_score}")

In [99]:
# Golden question set — presumably one object per document with "id" and
# "questions" keys, which is what generate_rag_result reads; TODO confirm.
golden_questions = open_json_file(file_path="../dist/golden_questions.json")
# Pre-computed retrieval output; its entries are later read via the
# 'question' and 'search_results' keys by rag_eval_with_retrieval_results.
raw_search_results = open_json_file(file_path="../dist/retrieval_search_results.json")
# Split the golden questions into batches of 50 and keep the first batch.
# NOTE(review): neither batched_data nor golden_questions_batch_1 is used by
# the visible cells that follow.
batched_data = format_list_in_batch(data=golden_questions, batch_size=50)
golden_questions_batch_1 = batched_data[0]

Total entries: 749


In [107]:
# Answer + judge every retrieval entry (two `llm` calls per entry). Expensive:
# the recorded run below took about 34 minutes for 500 entries, and results
# only exist in memory until the save cell is run.
eval_results = generate_rag_eval_result_with_retrieval_resutls(data=raw_search_results)

Processing documents: 100%|██████████| 500/500 [34:20<00:00,  4.12s/it]  


In [109]:
# Persist the judge results so the analysis cell can be re-run without
# repeating the LLM calls.
save_json_file(data=eval_results, file_path="../dist/evaluation_results_gpt_4o_mini.json")

In [119]:
# Print the rubric and the average scores from the file written by the save cell.
analyze_evaluation_result(file_path="../dist/evaluation_results_gpt_4o_mini.json")

Processing data: 100%|██████████| 500/500 [00:00<00:00, 2413293.44it/s]


Evaluate on four criteria:
1. Relevance — Does the answer directly address the question?
2. Groundedness — Are all facts supported by the provided context (no hallucinations)?
3. Completeness — Does the answer include all key details from the context?
4. Faithfulness — Does it follow the system rules (concise, factual, no invention, admits missing info)?

Scoring Guide (0–3 for each):
- 3: Excellent — fully meets the criterion
- 2: Fair — mostly correct, minor omissions or minor unsupported detail
- 1: Weak — noticeable errors, missing or irrelevant info
- 0: None — fails completely or contradicts context

Number of entries: 500
Average Relevance Score: 2.998
Average Groundedness Score: 2.992
Average Completeness Score: 2.738
Average Faithfulness Score: 2.99
Total Average Score: 2.9295





In [123]:
# Tabular copy of the in-memory results (one row per evaluated question).
gpt_4o_mini_eval_df = pd.DataFrame(data=eval_results)
# to_csv is called with its defaults, so the DataFrame index is written as
# the first, unnamed column of the CSV.
gpt_4o_mini_eval_df.to_csv("../dist/evaluation_results_gpt_4o_mini.csv")