In [1]:
import json

# Load the FAQ records. The encoding is pinned to UTF-8 so the read does not
# depend on the platform's default text encoding.
with open('../Data/documents-with-ids.json', 'rt', encoding='utf-8') as f_in:
    documents = json.load(f_in)

len(documents)

424

In [2]:
# Record fields, written the way they are referenced as str.format placeholders:
# intent: {intent}
# question: {question}
# response: {response}
# category: {category}
# tags: {tags}
# NOTE(review): the sample record displayed below has no 'tags' key (it has 'id' instead) - confirm whether 'tags' exists in the data.

# Show one record to inspect its keys and values.
documents[3]

{'intent': 'report_copyright_infringement',
 'question': 'can uhelp me submitting a notification of copyright  infringement',
 'response': 'To address copyright infringement concerns, please adhere to the following guidelines:\n\n1. Access our website at {{WEBSITE_URL}}.\n2. Go to the {{COPYRIGHT_SECTION}} area.\n3. Find and select the {{REPORT_COPYRIGHT_INFRINGEMENT_OPTION}} choice.\n4. Complete the necessary fields, providing comprehensive details regarding the infringement.\n5. Submit the completed form for our assessment.\n\nYour submission will be reviewed, and we will take the requisite steps in line with our policies.',
 'category': 'CONTENT',
 'id': '1e254822'}

In [None]:
# In Python .format() strings:
# - Single {} → means “replace with a variable” (e.g., {intent} → replaced by your dict).
# - Double braces {{ or }} → means “output a literal { or }”.
# - The FAQ responses contain literal double-brace placeholders such as {{WEBSITE_URL}},
#   so the placeholder list below writes them with four braces to render two and
#   match the response text the model actually sees.
# - The JSON schema at the end must close with }} (a literal "}"); closing it with {{
#   renders an unbalanced, invalid JSON example.
GEN_QUESTIONS_PROMPT_TEMPLATE = """
You are simulating a Media domain customer contacting customer support.
Your task is to generate **exactly 5 distinct natural-language customer questions** based on the provided FAQ record.

FAQ Record:
- category: {category}
- intent: {intent}
- question: {question}
- response: '''
{response}
'''

Requirements:
1. Each question must be clear, complete, natural-sounding, and aligned with the FAQ intent and category.
2. Paraphrase the original FAQ question. Do not copy it verbatim.
3. All generated questions must be answerable **solely using the given FAQ response**.
4. Replace placeholders with realistic examples:
   - {{{{WEBSITE_URL}}}} → "https://support.example.com"
   - {{{{DEVICE_PLATFORM}}}} → "Smart TV"
   - {{{{PROGRAM_TYPE}}}} → "documentary series"
   - {{{{LANGUAGE_OPTION}}}} → "Spanish"
   - {{{{PAYMENT_METHODS_INFORMATION}}}} → "credit card or PayPal"
   - {{{{COPYRIGHT_SECTION}}}} → "Copyright Policy"
   - {{{{REPORT_COPYRIGHT_INFRINGEMENT_OPTION}}}} → "Report Copyright Issue"
   - {{{{CONTACT_US_SECTION}}}} → "Contact Us"
   - {{{{EMAIL_OPTION}}}} → "support@example.com"
   - {{{{SUBMIT_BUTTON}}}} → "Submit"
5. Vary phrasing, tone, and style across the 5 questions (e.g., casual, formal, direct).
6. Keep each question concise but specific.
7. Output must be strictly valid JSON, with no extra commentary, code blocks, or formatting beyond the schema.

Output format:
{{
  "questions": [
    "question1",
    "question2",
    "question3",
    "question4",
    "question5"
  ]
}}
""".strip()

In [None]:
# Render the prompt for one sample record to eyeball the final text.
print(GEN_QUESTIONS_PROMPT_TEMPLATE.format_map(documents[3]))

You are simulating a Media domain customer contacting customer support.
Based on the provided FAQ record, generate 5 distinct natural-language questions the customer might ask.

FAQ Record:
- intent: report_copyright_infringement
- question: can uhelp me submitting a notification of copyright  infringement
- response: '''
To address copyright infringement concerns, please adhere to the following guidelines:

1. Access our website at {{WEBSITE_URL}}.
2. Go to the {{COPYRIGHT_SECTION}} area.
3. Find and select the {{REPORT_COPYRIGHT_INFRINGEMENT_OPTION}} choice.
4. Complete the necessary fields, providing comprehensive details regarding the infringement.
5. Submit the completed form for our assessment.

Your submission will be reviewed, and we will take the requisite steps in line with our policies.
'''
- category: CONTENT

Requirements Guidelines:
1. Each question must be clear, complete, natural, and aligned with the FAQ intent and category.
2. Paraphrase the original FAQ question; do 

In [21]:
import sys
sys.path.append("../assistant")

from config import SETTINGS

In [22]:
# Echo the active LLM configuration, one value per line.
# NOTE(review): this prints the API key verbatim; harmless for a dummy key,
# but redact it before sharing the notebook if a real provider key is configured.
print(
    SETTINGS.LLM_PROVIDER,
    SETTINGS.API_KEY,
    SETTINGS.BASE_URL,
    SETTINGS.MODEL_EMBED,
    SETTINGS.MODEL_CHAT,
    sep="\n",
)

OLLAMA
ollama
http://localhost:11434/v1
nomic-embed-text
phi3


In [None]:
from openai import OpenAI

# Ollama endpoint reached through a forwarded port (the port has to be made public).
# Reachability check: curl https://<your-forwarded-url-11434>.app.github.dev/v1
# NOTE(review): this overrides SETTINGS.BASE_URL with an environment-specific URL.
BASE_URL = "https://solid-halibut-49xw4wpwpvxc55vq-11434.app.github.dev/v1"

# Model: https://ollama.com/library/phi3 (fetch it with `ollama pull phi3`).
# The API key is a dummy value; Ollama ignores it.
client = OpenAI(api_key=SETTINGS.API_KEY, base_url=BASE_URL)

In [None]:
def generate_questions(doc):
    """Ask the chat model for customer questions about one FAQ record.

    The fields of ``doc`` are substituted into GEN_QUESTIONS_PROMPT_TEMPLATE and
    the result is sent as a single user message to ``SETTINGS.MODEL_CHAT``.

    Returns:
        The message content of the first choice, unmodified. It is raw text;
        nothing here checks that it is valid JSON.
    """
    user_message = {
        "role": "user",
        "content": GEN_QUESTIONS_PROMPT_TEMPLATE.format(**doc),
    }
    completion = client.chat.completions.create(
        model=SETTINGS.MODEL_CHAT,
        messages=[user_message],
    )
    return completion.choices[0].message.content

In [25]:
from tqdm.auto import tqdm

In [None]:
# Raw model replies, keyed by document id (filled by the generation loop).
results = dict()

In [None]:
# Generate questions for every document that has no stored reply yet.
# The `doc_id in results` check is what makes the run resumable, so the loop
# walks the full list rather than a hard-coded slice (a slice starting at 255
# skips the first 255 documents whenever `results` starts out empty).
for doc in tqdm(documents, desc="Generate questions"):
    doc_id = doc['id']
    if doc_id in results:
        continue

    # One LLM call per document; the raw reply text is stored as-is.
    results[doc_id] = generate_questions(doc)

    # Rewrite the checkpoint after each document so an interrupted run keeps
    # every reply collected so far.
    with open('results.json', 'w') as json_file:
        json.dump(results, json_file)

In [65]:
# Number of documents that have a stored reply.
len(results)

423

In [67]:
# Manual checkpoint helpers, left disabled: uncomment a snippet to reload or re-save results.json.
# Load
# with open('results.json', 'rt') as f_in:
#     results = json.load(f_in)

# # Save
# with open("results.json", "wt", encoding='utf-8') as f_out:
#     # ensure_ascii=False writes non-ASCII characters as-is (human-readable); the encoding is
#     # passed to open() explicitly because the text-mode default depends on the platform.
#     json.dump(results, f_out, ensure_ascii=False, indent=None)
# # Load
# with open("results.json", "rt", encoding="utf-8") as f_in:
#     results = json.load(f_in)

# Display the raw replies keyed by document id.
# NOTE(review): this echoes the whole dict; a small sample would keep the output readable.
results

{'34b742ae': '{\n   "questions": [\n      "I suspect my copyright was violated on your platform; what do I need to know about reporting it?",\n      "Could you explain the process for filing a report against unfair use or reproduction of \'Flipping History\', please, as per our Spanish guidelines?",\n      "For someone with limited tech skills watching their documentary series on Smart TV via example.com\'s services being infringed upon - what steps should they follow to report this offense for justice under your policies?",\n      "I need the procedure written out, from start till end – how does one convey a claim of copyright abuse while using their smart television at home when it comes to documentaries on \'History Uncovered\' series provided by you? Please specify for credit card or PayPal transactions.",\n      "How can we submit our detailed content violation account regarding the use in my living room against your guidelines through that email contact form?"\n   ]\n}',
 '521e79

In [None]:
# import joblib
# import pickle

# with open('results.pkl', 'wb') as f_out:
#     # Save the results to a binary file
#     pickle.dump(results, f_out)

# with open('results.pkl', 'rb') as f_in:
#     # Load the results from the binary file
#     loaded = pickle.load(f_in)
# print(loaded)

In [71]:
import re

def clean_json_string(json_string):
    """Extract the JSON payload from a raw LLM reply.

    Markdown code-fence markers (```json / ```) are stripped first. If the
    first ``{`` or ``[`` in the remaining text starts a well-formed JSON value,
    exactly that value is returned, so prose or stray braces that follow the
    payload are dropped. Otherwise the function falls back to the greedy
    first-bracket-to-last-bracket span, and finally to the stripped text itself
    when no bracketed span exists.

    Args:
        json_string: Raw reply text.

    Returns:
        A string meant to be passed to ``json.loads``. It is only guaranteed to
        be valid JSON when a well-formed value was found at the first bracket.
    """
    # Remove code block markers
    cleaned = re.sub(r'```json|```', '', json_string).strip()

    # Preferred path: decode exactly one JSON value starting at the first
    # opening bracket. The greedy regex below would instead run to the LAST
    # closing bracket and swallow trailing text such as "Note: {WEBSITE_URL} ...".
    first_bracket = re.search(r'[\[{]', cleaned)
    if first_bracket:
        start = first_bracket.start()
        try:
            _, end = json.JSONDecoder().raw_decode(cleaned, start)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            pass
        else:
            return cleaned[start:end]

    # Fallback: greedy span (either array or object), so malformed replies are
    # still handed to the caller, which reports the parse error.
    match = re.search(r'(\[.*\]|\{.*\})', cleaned, re.DOTALL)
    return match.group(0) if match else cleaned  # Return matched span or the stripped text

parsed_results = {}

# Decode every raw reply after cleaning it. Replies that are still not valid
# JSON are reported and left out of parsed_results.
for doc_id, raw_reply in results.items():
    try:
        payload = json.loads(clean_json_string(raw_reply))
    except json.JSONDecodeError as e:
        print(f"Error parsing JSON for doc_id {doc_id}: {e}")
    else:
        parsed_results[doc_id] = payload

Error parsing JSON for doc_id 521e7904: Expecting value: line 5 column 7 (char 294)
Error parsing JSON for doc_id 1e254822: Expecting ',' delimiter: line 6 column 7 (char 736)
Error parsing JSON for doc_id a93d2530: Invalid control character at: line 5 column 456 (char 1506)
Error parsing JSON for doc_id 929addc5: Expecting value: line 6 column 7 (char 515)
Error parsing JSON for doc_id dab953e3: Expecting ',' delimiter: line 5 column 9 (char 289)
Error parsing JSON for doc_id 722ba431: Expecting ',' delimiter: line 5 column 3 (char 389)
Error parsing JSON for doc_id d4961e9b: Expecting ',' delimiter: line 5 column 168 (char 544)
Error parsing JSON for doc_id f11c1526: Expecting value: line 5 column 9 (char 394)
Error parsing JSON for doc_id 632ead4a: Expecting ',' delimiter: line 4 column 207 (char 387)
Error parsing JSON for doc_id f495240f: Invalid control character at: line 6 column 186 (char 570)
Error parsing JSON for doc_id bdd32e1c: Expecting ',' delimiter: line 3 column 232 (c

In [72]:
# Lookup table from document id to the full record.
doc_index = dict((record['id'], record) for record in documents)
len(doc_index)

424

In [73]:
# Peek at the first three parsed entries.
[*parsed_results.items()][:3]

[('34b742ae',
  {'questions': ['I suspect my copyright was violated on your platform; what do I need to know about reporting it?',
    "Could you explain the process for filing a report against unfair use or reproduction of 'Flipping History', please, as per our Spanish guidelines?",
    "For someone with limited tech skills watching their documentary series on Smart TV via example.com's services being infringed upon - what steps should they follow to report this offense for justice under your policies?",
    "I need the procedure written out, from start till end – how does one convey a claim of copyright abuse while using their smart television at home when it comes to documentaries on 'History Uncovered' series provided by you? Please specify for credit card or PayPal transactions.",
    'How can we submit our detailed content violation account regarding the use in my living room against your guidelines through that email contact form?']}),
 ('804f1a37',
  {'questions': ["Can someone

In [74]:
final_results = []

# Flatten parsed_results into (question, category, doc_id) rows.
for doc_id, data in parsed_results.items():
    # Only ids present in doc_index have a category to attach.
    if doc_id not in doc_index:
        continue
    category = doc_index[doc_id]['category']

    # A parsed reply is either a bare list of questions or an object holding
    # them under 'questions'.
    if isinstance(data, list):
        questions = data
    elif isinstance(data, dict):
        questions = data.get('questions', [])
    else:
        # Any other JSON value (string, number, null, ...) carries no question
        # list. Without this branch `questions` kept the previous document's
        # value, attaching those questions to the wrong doc_id, or raised
        # NameError when it happened on the first iteration.
        questions = []

    final_results.extend((q, category, doc_id) for q in questions)

In [75]:
# Row count plus the first three rows.
(len(final_results), final_results[:3])

(1441,
 [('I suspect my copyright was violated on your platform; what do I need to know about reporting it?',
   'CONTENT',
   '34b742ae'),
  ("Could you explain the process for filing a report against unfair use or reproduction of 'Flipping History', please, as per our Spanish guidelines?",
   'CONTENT',
   '34b742ae'),
  ("For someone with limited tech skills watching their documentary series on Smart TV via example.com's services being infringed upon - what steps should they follow to report this offense for justice under your policies?",
   'CONTENT',
   '34b742ae')])

In [76]:
import pandas as pd

In [77]:
# One row per generated question: (question, category, source document id).
df = pd.DataFrame(
    data=final_results,
    columns=['question', 'category', 'document'],
)
df

Unnamed: 0,question,category,document
0,I suspect my copyright was violated on your pl...,CONTENT,34b742ae
1,Could you explain the process for filing a rep...,CONTENT,34b742ae
2,For someone with limited tech skills watching ...,CONTENT,34b742ae
3,"I need the procedure written out, from start t...",CONTENT,34b742ae
4,How can we submit our detailed content violati...,CONTENT,34b742ae
...,...,...,...
1436,Could you guide me on the procedure to file a ...,CONTENT,dad7cacf
1437,I need some assistance regarding how we can pr...,CONTENT,dad7cacf
1438,How do I initiate an official complaint for my...,CONTENT,dad7cacf
1439,'Report a Copyright Issue.' How do I use this ...,CONTENT,dad7cacf


In [78]:
# Count missing values per column.
df.isnull().sum()

question    0
category    0
document    0
dtype: int64

In [79]:
# Alternative starting point (reload a previously saved file):
# df = pd.read_csv('../Data/ground-truth-data.csv').rename(columns={"document":"id"})
# Name the document-id column "id".
df = df.rename({"document": "id"}, axis="columns")
df

Unnamed: 0,question,category,id
0,I suspect my copyright was violated on your pl...,CONTENT,34b742ae
1,Could you explain the process for filing a rep...,CONTENT,34b742ae
2,For someone with limited tech skills watching ...,CONTENT,34b742ae
3,"I need the procedure written out, from start t...",CONTENT,34b742ae
4,How can we submit our detailed content violati...,CONTENT,34b742ae
...,...,...,...
1436,Could you guide me on the procedure to file a ...,CONTENT,dad7cacf
1437,I need some assistance regarding how we can pr...,CONTENT,dad7cacf
1438,How do I initiate an official complaint for my...,CONTENT,dad7cacf
1439,'Report a Copyright Issue.' How do I use this ...,CONTENT,dad7cacf


In [80]:
# Write the ground-truth table without the index column.
df.to_csv(path_or_buf='../Data/ground-truth-data.csv', index=False)