In [1]:
import json
from dotenv import load_dotenv
from openai import OpenAI
from qdrant_client import QdrantClient
from tqdm import tqdm
from os import environ
import sys
import requests
import time
import itertools

# Load variables from a local .env file (if one is found) so the environ.get()
# lookups in the configuration cell can see them. Must run before that cell.
load_dotenv()

True

In [2]:
# --- Qdrant (vector store) -------------------------------------------------
# Connection details come from the environment; environ.get() yields None
# when a variable is unset rather than raising.
QDRANT_URL = environ.get("QDRANT_URL")
QDRANT_API_KEY = environ.get("QDRANT_API_KEY")
COLLECTION_NAME = "lotr-characters"
# Presumably the vector size used by the collection — TODO confirm against its schema.
EMBEDDING_DIMENSION = 512

# --- Jina embeddings API ---------------------------------------------------
JINA_API_KEY = environ.get("JINA_API_KEY")
JINA_URL = "https://api.jina.ai/v1/embeddings"
JINA_EMBEDDING_MODEL = "jina-embeddings-v4"
# Presumably sent as the task selector when embedding search queries — TODO confirm at call site.
QUERYING_TASK = "retrieval.query"

# --- OpenAI chat model -----------------------------------------------------
OPENAI_MODEL = "gpt-4o-mini"
OPENAI_TEMPERATURE = 0.5

In [3]:
# OpenAI client built with no explicit arguments; presumably it picks up its
# credentials from the environment populated by load_dotenv() — TODO confirm.
openai_client = OpenAI()
# Qdrant client pointed at the URL / API key read in the configuration cell
# (either may be None if the corresponding environment variable is unset).
qd_client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)

In [7]:
def format_eval_prompt(payload: dict[str, str]) -> tuple[str, str]:
    """Build the prompt pair used to have an LLM judge one RAG output.

    Args:
        payload: Mapping that should provide the keys ``"question"``,
            ``"context"`` and ``"answer"``. Any other keys are ignored.
            A missing key is emitted as JSON ``null``.

    Returns:
        A ``(user_prompt, system_prompt)`` tuple. ``user_prompt`` is a one-line
        instruction followed by a JSON object holding the three fields;
        ``system_prompt`` is the fixed evaluation rubric.
    """
    system_prompt = """
You are an impartial evaluator assessing the quality of a RAG (Retrieval-Augmented Generation) system that answers questions about J.R.R. Tolkien’s Middle-earth characters.

You will receive a JSON input with the following fields:
{
  "question": "<user query>",
  "context": "<retrieved context>",
  "answer": "<model-generated answer>"
}

Your task is to evaluate how well the answer satisfies the question, using only the information in the context.

Evaluate on four criteria:
1. Relevance — Does the answer directly address the question?
2. Groundedness — Are all facts supported by the provided context (no hallucinations)?
3. Completeness — Does the answer include all key details from the context?
4. Faithfulness — Does it follow the system rules (concise, factual, no invention, admits missing info)?

Scoring Guide (0–3 for each):
- 3: Excellent — fully meets the criterion
- 2: Fair — mostly correct, minor omissions or minor unsupported detail
- 1: Weak — noticeable errors, missing or irrelevant info
- 0: None — fails completely or contradicts context

Your output must be a single valid JSON object:
{
  "relevance": <0–3>,
  "groundedness": <0–3>,
  "completeness": <0–3>,
  "faithfulness": <0–3>,
  "comments": "<1–2 sentence summary of reasoning>"
}

Output only the JSON object — no markdown, no extra text.
""".strip()

    # Only these three fields are forwarded to the judge, in this order.
    record = {
        field: payload.get(field)
        for field in ("question", "context", "answer")
    }

    # Serialize with json.dumps instead of interpolating into a hand-written
    # template: quotes, backslashes and newlines inside the retrieved context
    # or the answer are escaped, so the embedded object is always valid JSON.
    # indent=2 reproduces the previous layout exactly for plain strings;
    # ensure_ascii=False keeps non-ASCII names readable; default=str tolerates
    # non-string values the way str.format() used to.
    serialized = json.dumps(record, indent=2, ensure_ascii=False, default=str)
    user_prompt = "Evaluate the following RAG output.\n" + serialized

    return user_prompt, system_prompt