# Ground Truth Generation for Retrieval Evaluation


In [36]:
from openai import OpenAI
import json
from pathlib import Path
from tqdm import tqdm
from datetime import datetime, timezone
import getpass


## Config


In [37]:
# Model name passed to the OpenAI client when generating questions.
MODEL_NAME = "gpt-5-nano"

# Dev-stage sizes: questions requested per document, and how many canonical
# documents to process (raise NUM_DOCS for a full run).
QUESTIONS_PER_DOC = 2
NUM_DOCS = 5

# Input: the canonical documents file.
CANONICAL_PATH = Path("../data/canonical/all_documents.json")

# Output: the UTC timestamp in the file name keeps earlier runs from being
# overwritten. The parent directory is created up front.
timestamp = f"{datetime.now(tz=timezone.utc):%Y%m%dT%H%M%S}"
OUTPUT_PATH = Path("../data/eval") / f"ground_truth_gpt5nano_{timestamp}.json"
OUTPUT_PATH.parent.mkdir(exist_ok=True, parents=True)


## Initialise OpenAI client


In [4]:
# Prompt for the key interactively so it is never stored in the notebook.
# Errors from getpass are deliberately NOT caught: swallowing them would let
# the cell continue to the client construction with API_KEY undefined (or
# with a stale value left over in the kernel from an earlier run).
API_KEY = getpass.getpass("Enter your OpenAI API key: ").strip()
if not API_KEY:
    # Fail here rather than on the first API call further down the notebook.
    raise ValueError("No OpenAI API key was entered; cannot create the OpenAI client.")
print("API_KEY entered")

client = OpenAI(api_key=API_KEY)


Enter your OpenAI API key:  ········


API_KEY entered


## Load canonical documents


In [5]:
# Decode explicitly as UTF-8: without `encoding`, open() uses the platform's
# locale encoding, which can mis-decode or reject non-ASCII text in the JSON.
with open(CANONICAL_PATH, encoding="utf-8") as f:
    documents = json.load(f)

print(f"Loaded {len(documents)} canonical documents.")


Loaded 99 canonical documents.


## Prompt to generate questions


In [12]:
def build_question_prompt(text, n_questions):
    """Build the prompt that asks the model for search questions about a text.

    Args:
        text: The document text the questions should be able to retrieve.
        n_questions: How many distinct questions to request.

    Returns:
        The full prompt string: the instructions followed by ``text``.
    """
    instruction_lines = (
        "You are generating user search questions for a retrieval evaluation dataset.",
        "",
        "The user has NOT seen the text below.",
        "They are searching for information contained in it.",
        "",
        f"Generate {n_questions} DISTINCT, realistic user questions that could retrieve this text.",
        "- Questions should vary in wording and intent",
        "- Do NOT quote the text",
        "- Do NOT include answers",
        "- Do NOT number the questions",
        "- Each question must be on a separate line",
        "",
        "TEXT:",
    )
    # The document text goes last, on its own line after the "TEXT:" marker.
    return "\n".join(instruction_lines) + "\n" + f"{text}"


## Function to call GPT-5-nano


In [30]:
# def generate_questions(text, n_questions=5):
#     response = client.responses.create(
#         model=MODEL_NAME,
#         input=build_question_prompt(text, n_questions),
#         max_output_tokens=256
#     )
#     raw = response.output_text.strip()
    
#     # Deterministic: sorted set of unique questions
#     questions = sorted(set(
#         q.strip(" -\n\r\t")
#         for q in raw.split("\n")
#         if q.strip()
#     ))
    
#     return questions



# def generate_questions(text, n_questions=5):
#     response = client.responses.create(
#         model=MODEL_NAME,
#         input=build_question_prompt(text, n_questions),
#         max_output_tokens=256,
#         reasoning={"effort": "minimal"}
#     )
    
#     # Navigate the response structure: output -> message item -> content -> text
#     raw = ""
#     for item in response.output:
#         if item.type == 'message' and item.content:
#             for content_item in item.content:
#                 if content_item.type == 'output_text':
#                     raw += content_item.text
    
#     questions = [q.strip("- ").strip() for q in raw.strip().split("\n") if q.strip()]
#     return questions


def generate_questions(text, n_questions=5):
    """Ask the model for search questions that should retrieve ``text``.

    Sends the prompt from ``build_question_prompt`` to ``MODEL_NAME`` through
    the module-level ``client``, then parses the reply one question per line.

    Args:
        text: Document text to generate questions for.
        n_questions: Number of questions requested in the prompt. The model's
            reply is not truncated or padded to this count.

    Returns:
        A sorted list of unique, non-empty question strings. Leading and
        trailing hyphens and whitespace are removed from each line. The list
        is empty when the reply contains no usable text.
    """
    response = client.responses.create(
        model=MODEL_NAME,
        input=build_question_prompt(text, n_questions),
        max_output_tokens=256,
        reasoning={"effort": "minimal"},
    )

    # Walk output -> message items -> content parts and keep only the text
    # parts; items of other types, or with no content, are skipped.
    raw = "".join(
        part.text
        for item in response.output
        if item.type == "message" and item.content
        for part in item.content
        if part.type == "output_text"
    )

    # Strip bullet dashes and whitespace first, THEN drop empties, so a line
    # that is only "-" or " - - " cannot end up as an empty-string question.
    cleaned = (line.strip("- ").strip() for line in raw.strip().split("\n"))

    # Deterministic: sorted set of unique questions.
    return sorted({question for question in cleaned if question})


## Generate ground truth dataset


In [38]:
# Build one {"query", "relevant_doc_ids"} record per generated question.
# The list is reset here so re-running the cell does not accumulate duplicates.
ground_truth = []

for doc in tqdm(documents[:NUM_DOCS]):
    doc_id = doc["id"]
    text = doc["text"]
    try:
        questions = generate_questions(text, QUESTIONS_PER_DOC)

        # Nothing usable came back: warn and move on to the next document.
        if not questions:
            print(f"[WARNING] No questions generated for {doc_id}")
            continue

        # Fewer unique questions than requested is informational only.
        if QUESTIONS_PER_DOC > len(questions):
            print(f"[INFO] Generated {len(questions)}/{QUESTIONS_PER_DOC} unique questions for {doc_id}")

        ground_truth.extend(
            [{"query": q, "relevant_doc_ids": [doc_id]} for q in questions]
        )
    except Exception as e:
        # Best effort: report the failing document and keep processing the rest.
        print(f"[ERROR] Doc {doc_id}: {e}")


100%|██████████████████████████████████████████████████| 5/5 [00:07<00:00,  1.50s/it]


## Sanity checks


In [39]:
# Report how many questions were produced and which documents they cover.
# Each record's first relevant_doc_ids entry is the document it came from.
print(f"\nGenerated {len(ground_truth)} questions.")
unique_doc_ids = {record["relevant_doc_ids"][0] for record in ground_truth}
print(f"Questions cover {len(unique_doc_ids)} documents: {unique_doc_ids}")



Generated 10 questions.
Questions cover 5 documents: {'gHHjDRDNUNU__chunk_002', 'gHHjDRDNUNU__chunk_000', 'gHHjDRDNUNU__chunk_001', 'gHHjDRDNUNU__chunk_003', 'gHHjDRDNUNU__chunk_004'}


## Save dataset


In [40]:
# Write the ground-truth records to the timestamped output file as indented JSON.
with OUTPUT_PATH.open("w") as out_file:
    out_file.write(json.dumps(ground_truth, indent=2))

print(f"Ground truth saved to {OUTPUT_PATH}")

Ground truth saved to ../data/eval/ground_truth_gpt5nano_20260122T203300.json
