# 04 Synthetic Data Generation

We use the LLM itself to generate test cases for our benchmark.
We will ask the model to generate a JSON object containing a list of words of varying lengths, including some hyphenated ones.

In [3]:
import sys
import os
import json
sys.path.append(os.path.abspath('..'))

from langchain_openai import ChatOpenAI
from dotenv import load_dotenv

# Pick up settings from the .env file located one directory above this notebook
env_file = os.path.join('..', '.env')
load_dotenv(dotenv_path=env_file)

True

In [7]:
import sys
import os
import json
sys.path.append(os.path.abspath('..'))

from langchain_openai import ChatOpenAI
from dotenv import load_dotenv

# Read variables from the .env file in the parent directory
env_file = os.path.join('..', '.env')
load_dotenv(dotenv_path=env_file)

# Plain chat-model client: generation is a direct call, so no agent loop is set up.
# The model name falls back to a default when MODEL_NAME is not set.
model_name = os.getenv("MODEL_NAME", "ibm-granite/granite-4.0-h-micro")
api_key = os.getenv("OPENROUTER_API_KEY")
llm = ChatOpenAI(
    base_url="https://openrouter.ai/api/v1",
    openai_api_key=api_key,
    model=model_name,
)

# Instruction text passed to the model. It requests a JSON object of the form
# {"test_cases": [...]} holding 10 words (simple, very long, and hyphenated)
# and tells the model not to add markdown formatting.
prompt = """
Generate a valid JSON object containing a list of 10 test cases for a character counting tool.
The format should be: {"test_cases": ["word1", "word2", ...]}.
Include some simple words, some very long words, and some with hyphens.
Do not include markdown formatting.
"""

def parse_test_cases(raw_text):
    """Pull the ``{"test_cases": [...]}`` object out of raw model output.

    The model is asked for bare JSON but may still wrap it in markdown code
    fences or add a sentence before/after it, so only the outermost
    ``{...}`` span is parsed.

    Args:
        raw_text: The text returned by the model.

    Returns:
        The parsed dict, guaranteed to hold a non-empty list of strings
        under the ``"test_cases"`` key.

    Raises:
        json.JSONDecodeError: If no parsable JSON is found in ``raw_text``.
        ValueError: If the JSON parses but does not have the expected shape.
    """
    text = raw_text.strip()

    # Drop anything outside the outermost braces (code fences, chatty
    # preambles, trailing remarks). If there are no braces, parse the text
    # as-is so the failure is reported by json.loads or the checks below.
    start = text.find("{")
    end = text.rfind("}")
    if start != -1 and end > start:
        text = text[start:end + 1]

    parsed = json.loads(text)

    # Validate before saving: this file becomes the benchmark fixture.
    cases = parsed.get("test_cases") if isinstance(parsed, dict) else None
    if not isinstance(cases, list) or not cases:
        raise ValueError('expected a JSON object with a non-empty "test_cases" list')
    if not all(isinstance(case, str) for case in cases):
        raise ValueError('every entry in "test_cases" must be a string')
    return parsed


print("Generating data...")
try:
    response = llm.invoke(prompt)
    content = response.content.strip()

    data = parse_test_cases(content)
    print(json.dumps(data, indent=2))

    # Save to file in the current directory; explicit encoding keeps the
    # result independent of the platform's default.
    with open('test_data.json', 'w', encoding='utf-8') as f:
        json.dump(data, f)

    print("\n✅ Saved to test_data.json")

except json.JSONDecodeError:
    # `content` is always bound here: decoding only happens after it is set.
    print("❌ Failed to decode JSON. Raw output:")
    print(content)
except Exception as e:
    # Covers API/network failures and the shape checks in parse_test_cases.
    print(f"❌ An error occurred: {e}")

Generating data...
{
  "test_cases": [
    "hello",
    "world",
    "character",
    "counting",
    "tool",
    "longwordwithmanycharacters",
    "hyphenated-word",
    "short",
    "tiny",
    "a"
  ]
}

✅ Saved to test_data.json
