In [1]:
# A __future__ import must be the first statement of the cell; anywhere else
# it is a SyntaxError.
from __future__ import annotations

import json
import os
import uuid
from typing import Any, Dict, Iterable, List, Tuple

from dotenv import load_dotenv
from dotenv import dotenv_values

from openai import OpenAI



# Read key/value settings from the local ".env" file.
config = dotenv_values(".env")
# .get() yields None when the key is absent from the file.
# NOTE(review): confirm how OpenAI() behaves when api_key is None.
api_key = config.get("OPENAI_API_KEY")
# Shared client used by generate_questions() for chat completion requests.
client = OpenAI(api_key=api_key)

# Path of the input JSON file that is loaded into `data`.
file = "Data/slack_QA.json"


In [2]:
# Parse the JSON file named by `file` into `data`.
with open(file, mode="r", encoding="utf-8") as f:
    data = json.load(f)

In [3]:

def qdrant_point_id(name: str) -> str:
    """Return the string form of the UUIDv5 of ``name`` in the URL namespace.

    The result depends only on ``name``, so the same input always maps to
    the same identifier.
    """
    point_uuid = uuid.uuid5(uuid.NAMESPACE_URL, name)
    return str(point_uuid)

In [4]:
def iter_records(data: List[Dict]) -> Iterable[Tuple[str, Dict]]:
    """Flatten thread dicts into ``(point_id, record)`` pairs, one per Q&A entry.

    Args:
        data: Thread dicts. Each is read for ``channel``, ``thread_ts`` and
            ``qas`` (a list of Q&A dicts).

    Yields:
        ``(point_id, record)`` where ``point_id`` is ``qdrant_point_id`` applied
        to the entry's position in ``qas`` followed by ``thread_ts`` with its
        dots removed, and ``record`` holds ``channel``, ``thread_ts``,
        ``asked_by``, ``answered_by``, ``question`` and ``answer``.

    Malformed input is skipped instead of raising: threads that are not dicts,
    ``qas`` values that are not lists, and Q&A entries that are not dicts.
    """
    for thread in data:
        if not isinstance(thread, dict):
            continue
        thread_ts = str(thread.get("thread_ts", ""))
        thread_ts_nodot = thread_ts.replace(".", "")
        qas = thread.get("qas", [])
        if not isinstance(qas, list):
            continue
        for i, qa in enumerate(qas):
            if not isinstance(qa, dict):
                # Skip without renumbering: `i` is still the position in the
                # original list, so ids of the valid entries do not change.
                continue
            name = f"{i}{thread_ts_nodot}"
            point_id = qdrant_point_id(name)
            record = {
                "channel": thread.get("channel"),
                "thread_ts": thread_ts,
                "asked_by": qa.get("asked_by"),
                "answered_by": qa.get("answered_by"),
                "question": qa.get("question", ""),
                "answer": qa.get("answer", ""),
            }
            yield point_id, record

In [5]:
# Materialize the generator so the (point_id, record) pairs can be iterated more than once.
pairs = [*iter_records(data)]

In [6]:
# Store each point id inside its own record (the dicts in `pairs` are mutated
# in place) and collect the records in their original order.
data_with_id = []
for pid, rec in pairs:
    rec["id"] = pid
    data_with_id.append(rec)


In [7]:
# Save the id-annotated records as indented JSON, keeping non-ASCII characters as-is.
output_file = "Data/data_with_id.json"
with open(output_file, mode="w", encoding="utf-8") as f:
    json.dump(data_with_id, fp=f, indent=2, ensure_ascii=False)

In [10]:
# Prompt text sent to the chat model by generate_questions(). The {question}
# and {answer} placeholders are filled with str.format(); .strip() drops the
# leading and trailing newline of the triple-quoted literal.
prompt_template = """
You are a dataset generation assistant.
Given a FAQ question and its corresponding answer, generate 5 diverse and natural-sounding user questions that could be answered using the provided answer.
These questions will be used to evaluate a search system.

Requirements:
Each question must be fully answerable using the provided answer only (no external knowledge).
Use as few words from the answer as possible — aim for low lexical overlap.
Rephrase and vary the style of the questions: include paraphrased, inferred, and reworded forms.
Avoid copying the original FAQ question.
Make the questions sound like real user queries.

Input:

Original FAQ Question: {question}
Answer: {answer}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]
""".strip()

In [11]:
def generate_questions(doc, model='gpt-4o-mini'):
    """Ask the chat model for evaluation questions about one Q&A record.

    Args:
        doc: Mapping whose items are passed to ``prompt_template.format``; it
            must provide every placeholder the template uses (``question``
            and ``answer``). Extra keys are ignored by ``str.format``.
        model: Name of the chat model to query. Defaults to the value that
            was previously hard-coded, so existing calls behave the same.

    Returns:
        The content of the first choice's message, exactly as returned by the
        API. The prompt asks for a JSON array, but the reply is neither parsed
        nor validated here.

    Raises:
        KeyError: if ``doc`` lacks a placeholder used by the template.
    """
    prompt = prompt_template.format(**doc)

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    return response.choices[0].message.content

In [23]:
from tqdm.auto import tqdm

# Query the model once per distinct record id; a record whose id has already
# been handled in this run is not sent again.
results = {}
for doc in tqdm(data_with_id):
    doc_id = doc['id']
    if doc_id not in results:
        questions = generate_questions(doc)
        results[doc_id] = questions

  0%|          | 0/807 [00:00<?, ?it/s]

In [26]:
# Save the id -> raw model reply mapping as indented JSON, keeping non-ASCII characters as-is.
output_file = "Data/ground_truth.json"
with open(output_file, mode="w", encoding="utf-8") as f:
    json.dump(results, fp=f, indent=2, ensure_ascii=False)