In [7]:
import json
from dotenv import load_dotenv
from openai import OpenAI
from qdrant_client import QdrantClient
from tqdm import tqdm

# Load settings from a local .env file into the process environment before any client is created.
load_dotenv()

True

In [6]:
# Load the exported Qdrant records (a JSON document) into memory.
# The encoding is pinned to UTF-8 so the read does not depend on the platform's default locale.
with open("../dist/qdrant_records.json", 'r', encoding="utf-8") as file:
    qdrant_records = json.load(file)

In [25]:
# Model name and sampling temperature passed to every chat-completion request made by `llm`.
OPENAI_MODEL = "gpt-4o-mini"
OPENAI_TEMPERATURE = 0.5

In [26]:
# Shared client used by `llm`. It is built with no explicit arguments;
# presumably it picks up credentials from the environment populated by load_dotenv() — confirm.
openai_client = OpenAI()

In [43]:
def format_prompt(payload: dict[str, str]) -> tuple[str, str]:
    """
    Build the (user, system) prompt pair for question generation.

    The system prompt tells the model it will receive a JSON object, so the
    payload is serialised with ``json.dumps`` rather than interpolated as a
    Python dict repr (which uses single quotes and ``None`` and is not JSON).

    Args:
        payload: Character fields to show the model.

    Returns:
        A ``(user_prompt, system_prompt)`` tuple, in that order.
    """
    raw_user_prompt = """
Payload:
{payload}
""".strip()

    system_prompt = """
You are an assistant that generates evaluation questions to test retrieval quality on character data.
You will receive a JSON object describing a fictional character, which may include fields such as name, race, gender, birth, death, spouse, realm, biography, and others.

Your task:
1. Read and understand the JSON payload carefully.
2. Generate exactly 5 diverse and specific questions that can be answered using the information in the payload.
3. Focus on factual, grounded details — such as relationships, timeline, characteristics, or key events mentioned in the biography.
4. Avoid trivial or repetitive questions.
5. Do not include any reasoning, explanations, or text outside the JSON array.

Output format (JSON only):
[
"Question 1?",
"Question 2?",
"Question 3?",
"Question 4?",
"Question 5?"
]

If a field is null or missing, do not ask about it. If there is limited information, create general but relevant questions based on available content.
""".strip()

    # ensure_ascii=False keeps accented characters readable for the model;
    # default=str keeps the call from failing on values json cannot encode natively.
    payload_json = json.dumps(payload, ensure_ascii=False, default=str)
    user_prompt = raw_user_prompt.format(payload=payload_json).strip()

    return user_prompt, system_prompt

In [23]:
def format_records ():
    """
    Reduce every entry of the module-level `qdrant_records` to a flat dict.

    Each result holds the record's "id" plus whichever of the whitelisted
    payload fields are present with a truthy value; empty or missing fields
    are left out. Progress is shown through `tqdm`.

    Returns:
        A list with one dict per record, in the same order as `qdrant_records`.
    """
    wanted_fields = ['race', 'gender', 'realm', 'culture', 'birth', 'death', 'spouse', 'hair', 'height', 'biography', 'history']
    characters = []
    for entry in tqdm(qdrant_records):
        character = {"id": entry["id"]}
        payload = entry["payload"]
        for key in wanted_fields:
            # Skip fields that are absent or hold an empty/falsy value.
            if payload.get(key):
                character[key] = payload[key]
        characters.append(character)

    return characters

In [27]:
def llm(user_prompt: str, system_prompt: str) -> str:
    """
    Send one system + user prompt pair to the OpenAI chat completions API.

    Uses the module-level `openai_client`, `OPENAI_MODEL` and
    `OPENAI_TEMPERATURE`.

    Args:
        user_prompt: Content of the "user" message.
        system_prompt: Content of the "system" message.

    Returns:
        The message content of the first choice, returned as-is. The text is
        not parsed here.
    """
    response = openai_client.chat.completions.create(
        model=OPENAI_MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        temperature=OPENAI_TEMPERATURE
    )
    # Only the first choice is read.
    # NOTE(review): the SDK appears to type `content` as optional — confirm whether a None guard is needed.
    return response.choices[0].message.content

In [29]:
def generate_question(ctx: dict[str, str]) -> str:
    """
    Ask the model for evaluation questions about one character record.

    Args:
        ctx: Character fields, forwarded unchanged to `format_prompt`.

    Returns:
        The raw reply text from `llm`. The system prompt requests a JSON array
        of questions, but the reply is not parsed or validated here, so callers
        must decode it themselves (e.g. with ``json.loads``).
    """
    user_prompt, system_prompt = format_prompt(ctx)
    return llm(user_prompt=user_prompt, system_prompt=system_prompt)

In [44]:
# Smoke test: flatten every record, then generate questions for the first one only.
rau = format_records()
generate_question(rau[0])

100%|██████████| 749/749 [00:00<00:00, 240454.17it/s]


'[\n"Who is Hareth\'s spouse?",\n"In which year did Hareth marry Galdor?",\n"What is the name of Hareth\'s father?",\n"How many children did Hareth have?",\n"Which age of Middle-earth did Hareth live in?"\n]'