In [5]:
# --- DIVERSE Generator Script (Randomized Scenarios) ---
import sys
import os
import json
import pandas as pd
from openai import OpenAI
import toml
from tqdm import tqdm
import re
import random  

# Configuration
# Number of loop iterations, one generation request each. Iterations whose
# reply cannot be parsed or fails validation are skipped rather than retried,
# so the saved file may contain fewer rows than this.
TARGET_COUNT = 1000
# Destination CSV; the relative path is resolved against the current working
# directory at run time.
OUTPUT_PATH = "../data/synthetic_test_data.csv"
# Category labels that are joined into the prompt's category rule.
CATEGORIES = ["Food", "Transport", "Utilities", "Retail", "Entertainment", "Personal Care", "Other"]

# Define distinct scenarios to force diversity
# Each entry is "<style name>: <description>". One entry is chosen at random
# per iteration and injected verbatim into the prompt.
SCENARIOS = [
    "Simple List: The user simply lists 2-3 items they bought clearly.",
    "Correction: The user makes a mistake on the price or item and corrects themselves (e.g., 'wait, no...').",
    "Slang/Casual: The user uses words like 'bucks', 'quid', 'grand', or casual phrasing.",
    "Long Narrative: The user tells a short story about their day and mentions expenses implicitly.",
    "Mixed Categories: The user buys things from completely different categories (e.g., Gas and a Movie ticket).",
    "Rapid Fire: The user lists many small items quickly.",
    "Vague Context: The user mentions a brand name or a store instead of the item (e.g., 'Starbucks' instead of coffee).",
]

# Load API Key
# Read the DeepSeek key from the Streamlit secrets file. Only the failures this
# step can legitimately produce are caught (unreadable file, malformed TOML,
# missing entry); anything else is a programming error and should surface.
try:
    secrets = toml.load("../.streamlit/secrets.toml")
    DEEPSEEK_API_KEY = secrets["DEEPSEEK_API_KEY"]
except (OSError, KeyError, toml.TomlDecodeError) as exc:
    # Report the underlying cause and exit with a non-zero status so callers
    # (shell scripts, CI) can tell the run failed.
    print(f"Error: Key not found. ({type(exc).__name__}: {exc})", file=sys.stderr)
    sys.exit(1)

# DeepSeek exposes an OpenAI-compatible endpoint, so the OpenAI client is
# pointed at its base URL.
client = OpenAI(api_key=DEEPSEEK_API_KEY, base_url="https://api.deepseek.com")

# Keys every object inside "expected_json" must carry (prompt rules 2 and 3).
_REQUIRED_KEYS = frozenset({"item", "amount", "category"})

# Matches a Markdown code-fence marker, optionally tagged as JSON.
_FENCE_RE = re.compile(r"```(json|JSON)?")


def _build_prompt(scenario):
    """Return the generation prompt with *scenario* injected as the style.

    The inner quotes of the "expected_json" example are backslash-escaped in
    the rendered text so that the example itself is valid JSON.
    """
    return f"""
    Generate 1 unique expense tracking test case.
    
    Current Scenario Style: "{scenario}"
    (Make the speech input match this specific style).

    The output must contain:
    1. "speech_input": A natural spoken sentence.
    2. "expected_json": The strict JSON ground truth. 

    CRITICAL CATEGORY RULES:
    - Groceries, Supermarket runs, Snacks, Dining out -> 'Food'
    - Clothes, Electronics, Household items, Gifts -> 'Retail'
    - Categories must be one of: {', '.join(CATEGORIES)}.

    CRITICAL JSON FORMAT RULES:
    1. **The "item" value MUST be the EXACT substring from "speech_input".** (e.g., if input says "Starbucks", item is "Starbucks", NOT "Coffee").
    2. You MUST use the exact keys: "item", "amount", "category".
    3. Every object must have all 3 keys.

    Output STRICTLY valid JSON format like:
    {{
      "speech_input": "I bought an apple for 5 dollars",
      "expected_json": "[{{\\"item\\": \\"apple\\", \\"amount\\": 5.00, \\"category\\": \\"Food\\"}}]"
    }}
    """


def _parse_case(content):
    """Parse one model reply into a validated test case.

    Args:
        content: Raw reply text from the model (may be None or fenced).

    Returns:
        A dict with "speech_input" (str) and "expected_json" (JSON string of
        a non-empty list of expense objects), or None when the reply is not
        usable.
    """
    if not content:
        return None

    text = content.strip()
    if text.startswith("```"):
        text = _FENCE_RE.sub("", text).strip()

    try:
        data = json.loads(text)
    except json.JSONDecodeError:
        return None
    if not isinstance(data, dict):
        return None

    speech = data.get("speech_input")
    if not isinstance(speech, str) or not speech.strip():
        return None

    # The ground truth is requested as a JSON-encoded string, but a reply that
    # returns a real array is accepted too and re-encoded so the column stays
    # uniform.
    expected = data.get("expected_json")
    if isinstance(expected, str):
        try:
            items = json.loads(expected)
        except json.JSONDecodeError:
            return None
    else:
        items = expected
        expected = json.dumps(items)

    if not isinstance(items, list) or not items:
        return None
    for entry in items:
        if not isinstance(entry, dict) or not _REQUIRED_KEYS.issubset(entry):
            return None
        if entry["category"] not in CATEGORIES:
            return None

    return {"speech_input": speech, "expected_json": expected}


generated_data = []
skipped = 0
last_error = None

print(f"Generating {TARGET_COUNT} diverse synthetic cases (Phase I)...")

for _ in tqdm(range(TARGET_COUNT)):
    # Randomly select a scenario for this iteration
    current_scenario = random.choice(SCENARIOS)

    # Inject the scenario into the prompt
    PROMPT = _build_prompt(current_scenario)

    # Only the remote call is guarded. `except Exception` (not a bare except)
    # keeps the run best-effort while still letting Ctrl-C / SystemExit stop it.
    try:
        response = client.chat.completions.create(
            model="deepseek-chat",
            messages=[{"role": "user", "content": PROMPT}],
            temperature=0.9,
        )
        content = response.choices[0].message.content
    except Exception as exc:
        skipped += 1
        last_error = exc
        continue

    case = _parse_case(content)
    if case is None:
        skipped += 1
        continue

    generated_data.append(case)

# Surface how many iterations were dropped instead of hiding them.
if skipped:
    print(f"Skipped {skipped} of {TARGET_COUNT} iterations (API errors or invalid replies).")
    if last_error is not None:
        print(f"Last API error: {type(last_error).__name__}: {last_error}")

# Save to CSV
df = pd.DataFrame(generated_data)
# os.path.dirname() returns "" for a bare filename, and os.makedirs("") raises
# FileNotFoundError, so the directory is only created when the path has one.
output_dir = os.path.dirname(OUTPUT_PATH)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
df.to_csv(OUTPUT_PATH, index=False)
print(f"Success! Saved {len(df)} diverse cases to {OUTPUT_PATH}")

Generating 1000 diverse synthetic cases (Phase I)...


100%|██████████| 1000/1000 [1:07:13<00:00,  4.03s/it]

Success! Saved 1000 diverse cases to ../data/synthetic_test_data.csv



