In [5]:
import json

with open('../Data/documents-with-ids.json', 'rt') as f_in:
    documents = json.load(f_in)

In [6]:
documents[3]

{'intent': 'report_copyright_infringement',
 'question': 'can uhelp me submitting a notification of copyright  infringement',
 'response': 'To address copyright infringement concerns, please adhere to the following guidelines:\n\n1. Access our website at {{WEBSITE_URL}}.\n2. Go to the {{COPYRIGHT_SECTION}} area.\n3. Find and select the {{REPORT_COPYRIGHT_INFRINGEMENT_OPTION}} choice.\n4. Complete the necessary fields, providing comprehensive details regarding the infringement.\n5. Submit the completed form for our assessment.\n\nYour submission will be reviewed, and we will take the requisite steps in line with our policies.',
 'category': 'CONTENT',
 'id': '1e254822'}

In [None]:
# intent: {intent}
# question: {question}
# response: {response}
# category: {category}
# tags: {tags}
prompt_template = """
You are emulating a Media domain customer contacting customer support.
Based on the FAQ record provided, generate 5 distinct questions the customer might ask.

Requirements:
- Each question should be complete, natural, and aligned with the FAQ intent and category.
- Avoid copying phrases verbatim from the FAQ; paraphrase where possible.
- The questions must be answerable using the FAQ response.
- If entity placeholders (e.g., {{WEBSITE_URL}}, {{DEVICE_PLATFORM}}, {{PROGRAM_TYPE}}, {{LANGUAGE_OPTION}}, {{PAYMENT_METHODS_INFORMATION}}) are present, replace them with natural examples.
- If tags are provided, vary style or tone accordingly.
- Keep the questions concise but clear.
- Return exactly 5 questions.
- Output only valid JSON, with no extra text.

FAQ Record:

intent: {intent}
question: {question}
response: {response}
category: {category}

Output format in parsable JSON (JSON, no code blocks):

{{"questions": ["question1", "question2", ..., "question5"]}}
""".strip()

In [None]:
import os

os.getenv("OLLAMA_URL")

In [None]:
from openai import OpenAI

client = OpenAI(
    # https://ollama.com/library/phi3
    # ollama pull phi3
    # curl https://<your-forwarded-url>.app.github.dev/v1
    base_url='http://localhost:11434/v1/',  # Ollama API endpoint
    api_key='ollama',  # dummy key (ignored by Ollama)
)

In [None]:
def generate_questions(doc):
    prompt = prompt_template.format(**doc)

    response = client.chat.completions.create(
        model='phi3',
        messages=[{"role": "user", "content": prompt}]
    )
    json_response = response.choices[0].message.content
    return json_response

In [20]:
from tqdm.auto import tqdm

In [21]:
results = {}

In [None]:
for doc in tqdm(documents): 
    doc_id = doc['id']
    if doc_id in results:
        continue

    # Generate questions
    questions = generate_questions(doc)
    # Store results incrementally
    results[doc_id] = questions
    # Save progress incrementally
    with open('results.json', 'w') as json_file:
        json.dump(results, json_file)

  0%|          | 0/424 [00:00<?, ?it/s]

In [None]:
# Load
# with open('results.json', 'rt') as f_in:
#     results = json.load(f_in)

# # Save
# with open("results.json", "wt", encoding='utf-8') as f_out:
#     # Writes human-readable text (UTF-8 by default if you open the file in text mode "w")
#     json.dump(results, f_out, ensure_ascii=False, indent=None)
# # Load
# with open("results.json", "rt", encoding="utf-8") as f_in:
#     loaded = json.load(f_in)

results

{'34b742ae': '{\n  "questions": [\n    "How do I go about reporting a copyright violation on your platform?",\n    "What steps are necessary to report an infringement of content rights?",\n    "Could you guide me through the process of filing a copyright complaint with Media Domain?",\n    "When submitting evidence for potential piracy, what form do I need to complete on your website?",\n    "What kind of responses can users expect after they report suspected copyright infringement?"\n  ]\n}',
 '521e7904': '```json\n{\n  "questions": [\n    "I\'d like to report a copyright violation on your website. Could you guide me through the process?",\n    "Is there specific information I need to provide when reporting potential copyright infringement online with Media content?",\n    "What should my next steps be after finding material that may or not respect our copyright laws within a video platform and wanting to report it on your site via email, as opposed to live chat?",\n    "Could you exp

In [None]:
# import joblib
# import pickle

# with open('results.pkl', 'wb') as f_out:
#     # Save the results to a binary file
#     pickle.dump(results, f_out)

# with open('results.pkl', 'rb') as f_in:
#     # Load the results from the binary file
#     loaded = pickle.load(f_in)
# print(loaded)

In [None]:
import re

def clean_json_string(json_string):
    """Clean the input JSON string by removing code block markers and extracting valid JSON."""
    # Remove code block markers
    cleaned = re.sub(r'```json|```', '', json_string).strip()
    # print(cleaned)
    # Use regex to find valid JSON portion (either array or object)
    match = re.search(r'(\[.*\]|\{.*\})', cleaned, re.DOTALL)
    return match.group(0) if match else cleaned  # Return matched JSON or the original string

parsed_results = {}

# # Parsing each JSON string after cleaning
# for doc_id, json_questions in results.items():
#     print(doc_id)
#     # print(json_questions)
#     cleaned_json = clean_json_string(json_questions)
#     print(cleaned_json)
#     parsed_results[doc_id] = json.loads(cleaned_json)

# Parsing each JSON string after cleaning
for doc_id, json_questions in results.items():
    try:
        # print(doc_id)
        # print(json_questions)
        cleaned_json = clean_json_string(json_questions)
        # print(cleaned_json)
        parsed_results[doc_id] = json.loads(cleaned_json)
    except json.JSONDecodeError as e:
        print(f"Error parsing JSON for doc_id {doc_id}: {e}")

Error parsing JSON for doc_id 632ead4a: Expecting ',' delimiter: line 6 column 169 (char 572)
Error parsing JSON for doc_id a2693afb: Expecting ',' delimiter: line 6 column 3 (char 817)
Error parsing JSON for doc_id a81169ec: Invalid control character at: line 5 column 159 (char 395)
Error parsing JSON for doc_id 9269c04d: Expecting value: line 7 column 3 (char 545)
Error parsing JSON for doc_id c2150ed2: Expecting ',' delimiter: line 4 column 89 (char 174)
Error parsing JSON for doc_id 65f3dec0: Expecting property name enclosed in double quotes: line 8 column 1 (char 718)
Error parsing JSON for doc_id 43cfb284: Invalid control character at: line 6 column 735 (char 1046)
Error parsing JSON for doc_id 9f586862: Invalid control character at: line 6 column 310 (char 586)
Error parsing JSON for doc_id 528e2e85: Expecting ',' delimiter: line 5 column 303 (char 521)
Error parsing JSON for doc_id 47d9bfed: Expecting value: line 6 column 6 (char 306)
Error parsing JSON for doc_id b5266091: Inv

In [30]:
doc_index = {d['id']: d for d in documents}

In [31]:
parsed_results.items()



In [32]:
parsed_results

{'34b742ae': {'questions': ['How do I go about reporting a copyright violation on your platform?',
   'What steps are necessary to report an infringement of content rights?',
   'Could you guide me through the process of filing a copyright complaint with Media Domain?',
   'When submitting evidence for potential piracy, what form do I need to complete on your website?',
   'What kind of responses can users expect after they report suspected copyright infringement?']},
 '521e7904': {'questions': ["I'd like to report a copyright violation on your website. Could you guide me through the process?",
   'Is there specific information I need to provide when reporting potential copyright infringement online with Media content?',
   'What should my next steps be after finding material that may or not respect our copyright laws within a video platform and wanting to report it on your site via email, as opposed to live chat?',
   'Could you explain how I go about contacting customer service regar

In [33]:
final_results = []

# for doc_id, questions in parsed_results.items():
#     category = doc_index[doc_id]['category']
#     for q in questions:
#         final_results.append((q, category, doc_id))

for doc_id, data in parsed_results.items():
    # Ensure the doc_id exists in the doc_index to retrieve the category
    if doc_id in doc_index:
        category = doc_index[doc_id]['category']

        # Check if `data` is already a list of questions or a dictionary containing 'questions'
        if isinstance(data, list):
            questions = data  # If it's already a list of questions
        elif isinstance(data, dict):
            questions = data.get('questions', [])  # If it's a dict, extract 'questions'

        for q in questions:
            final_results.append((q, category, doc_id))

In [34]:
final_results

[('How do I go about reporting a copyright violation on your platform?',
  'CONTENT',
  '34b742ae'),
 ('What steps are necessary to report an infringement of content rights?',
  'CONTENT',
  '34b742ae'),
 ('Could you guide me through the process of filing a copyright complaint with Media Domain?',
  'CONTENT',
  '34b742ae'),
 ('When submitting evidence for potential piracy, what form do I need to complete on your website?',
  'CONTENT',
  '34b742ae'),
 ('What kind of responses can users expect after they report suspected copyright infringement?',
  'CONTENT',
  '34b742ae'),
 ("I'd like to report a copyright violation on your website. Could you guide me through the process?",
  'CONTENT',
  '521e7904'),
 ('Is there specific information I need to provide when reporting potential copyright infringement online with Media content?',
  'CONTENT',
  '521e7904'),
 ('What should my next steps be after finding material that may or not respect our copyright laws within a video platform and wantin

In [11]:
import pandas as pd

In [36]:
df = pd.DataFrame(final_results, columns=['question', 'category', 'document'])

In [37]:
df

Unnamed: 0,question,category,document
0,How do I go about reporting a copyright violat...,CONTENT,34b742ae
1,What steps are necessary to report an infringe...,CONTENT,34b742ae
2,Could you guide me through the process of fili...,CONTENT,34b742ae
3,"When submitting evidence for potential piracy,...",CONTENT,34b742ae
4,What kind of responses can users expect after ...,CONTENT,34b742ae
...,...,...,...
1926,I believe someone has used my content without ...,CONTENT,ecc7b5ed
1927,How do I report a copyright violation accordin...,CONTENT,dad7cacf
1928,What specific steps must be taken for submitti...,CONTENT,dad7cacf
1929,Upon completing my copyright infringement repo...,CONTENT,dad7cacf


In [15]:
df.to_csv('../Data/ground-truth-data.csv', index=False)

In [14]:
df = pd.read_csv('../Data/ground-truth-data.csv').rename(columns={"document":"id"})
df

Unnamed: 0,question,category,id
0,How do I go about reporting a copyright violat...,CONTENT,34b742ae
1,What steps are necessary to report an infringe...,CONTENT,34b742ae
2,Could you guide me through the process of fili...,CONTENT,34b742ae
3,"When submitting evidence for potential piracy,...",CONTENT,34b742ae
4,What kind of responses can users expect after ...,CONTENT,34b742ae
...,...,...,...
1926,I believe someone has used my content without ...,CONTENT,ecc7b5ed
1927,How do I report a copyright violation accordin...,CONTENT,dad7cacf
1928,What specific steps must be taken for submitti...,CONTENT,dad7cacf
1929,Upon completing my copyright infringement repo...,CONTENT,dad7cacf
