Generating test datasets

In [1]:
from anthropic import Anthropic
from dotenv import load_dotenv
load_dotenv()

import json

In [2]:
# Model identifier that chat() sends with every request.
model = "claude-sonnet-4-0"
# Shared API client used by chat(); built with no explicit arguments
# (presumably it picks up credentials from the environment populated by
# load_dotenv() -- confirm).
client = Anthropic()

In [3]:
def add_user_message(messages, text):
    """Append a user-role turn containing `text` to `messages` (in place)."""
    messages.append({"role": "user", "content": text})

def add_assistant_message(messages, text):
    """Append an assistant-role turn containing `text` to `messages` (in place)."""
    messages.append({"role": "assistant", "content": text})

def chat(messages, system=None, temperature=1.0, stop_sequences=None, max_tokens=1000):
    """Send a conversation to the model and return the text of its reply.

    Uses the module-level `client` and `model`.

    Args:
        messages: List of ``{"role": ..., "content": ...}`` dicts, as built by
            add_user_message / add_assistant_message.
        system: Optional system prompt; left out of the request when falsy.
        temperature: Value forwarded as the request's ``temperature``.
        stop_sequences: Optional list of strings forwarded as
            ``stop_sequences``; left out of the request when None or empty.
            Defaults to None rather than a shared mutable ``[]``.
        max_tokens: Value forwarded as the request's ``max_tokens``
            (1000, the previously hard-coded value, by default).

    Returns:
        The ``text`` attribute of the first content block of the response.
    """
    params = {
        "model": model,
        "max_tokens": max_tokens,
        "messages": messages,
        "temperature": temperature
    }
    # Optional fields are only sent when the caller actually supplied them.
    if system:
        params["system"] = system
    if stop_sequences:
        params["stop_sequences"] = stop_sequences

    response = client.messages.create(**params)
    return response.content[0].text

Model Based Grading

In [4]:
# Generating Test Data with Code
def generate_dataset():
    """Ask the model for a small evaluation dataset and return it parsed.

    The assistant turn is prefilled with an opening code fence and generation
    stops at the next fence, so the reply is expected to be only the fenced
    body. That body is parsed with json.loads, which raises
    json.JSONDecodeError if the reply is not valid JSON.
    """
    dataset_prompt = """
Generate an evaluation dataset for a prompt evaluation. The dataset will be used to evaluate prompts that generate Python, JSON, or Regex specifically for AWS-related tasks. Generate an array of JSON objects, each representing task that requires Python, JSON, or a Regex to complete.

Example output:
```json
[
  {
    "task": "Description of task",
    "format": "json" or "python" or "regex",
    "solution_criteria": "Key criteria for evaluating the solution"
  },
  ...additional
]
```

* Focus on tasks that can be solved by writing a single Python function, a single JSON object, or a single regex
* Focus on tasks that do not require writing much code

Please generate 3 objects.
"""
    conversation = []
    add_user_message(conversation, dataset_prompt)
    # Prefill the reply with an opening fence; the stop sequence is the closing one.
    add_assistant_message(conversation, "```code")
    raw_reply = chat(conversation, stop_sequences=["```"])
    return json.loads(raw_reply)

In [5]:
# Generate a fresh dataset via generate_dataset() (which calls the model
# through chat()) and print the parsed result for a quick visual check.
dataset = generate_dataset()
print(dataset)

[{'task': "Create a JSON policy document that allows an IAM user to read objects from a specific S3 bucket named 'my-app-logs' and list the bucket contents", 'format': 'json', 'solution_criteria': 'JSON must be valid IAM policy syntax with correct Version, Statement structure, Effect set to Allow, Principal or Resource specified correctly, and Actions including s3:GetObject and s3:ListBucket for the specified bucket ARN'}, {'task': 'Write a Python function that takes an AWS CloudWatch log event and extracts the timestamp, log level, and message from a typical application log line format', 'format': 'python', 'solution_criteria': 'Function must accept a log event parameter, correctly parse standard log format (timestamp, level, message), handle common log levels (INFO, ERROR, WARN, DEBUG), return structured data (dict or tuple), and include basic error handling for malformed input'}, {'task': 'Create a regex pattern that validates AWS Lambda function names according to AWS naming rules'

In [6]:
# Save dataset to a JSON file
import os

dataset_path = r'dataset/test-case-dataset3.json'
# open(..., 'w') raises FileNotFoundError when the folder is missing (e.g. on
# a fresh checkout), so create it first; exist_ok keeps re-runs harmless.
os.makedirs(os.path.dirname(dataset_path), exist_ok=True)
with open(dataset_path, 'w', encoding='utf-8') as f:
    json.dump(dataset, f, indent=2)

In [7]:
# Function to grade a test case + output using a model
def grade_by_model(test_case, output):
    """Have the model review `output` against a test case and return its verdict.

    Args:
        test_case: Mapping that provides the "task" and "solution_criteria"
            entries interpolated into the grading prompt.
        output: The candidate solution text to be graded.

    Returns:
        The reply parsed with json.loads. The prompt asks for an object with
        "strengths", "weaknesses", "reasoning" and "score", but the shape is
        not validated here.
    """
    grading_prompt = f"""
You are an expert AWS code reviewer. Your task is to evaluate the following AI-generated solution.

Original Task:
<task>
{test_case["task"]}
</task>

Solution to Evaluate:
<solution>
{output}
</solution>

Criteria you should use to evaluate the solution:
<criteria>
{test_case["solution_criteria"]}
</criteria>

Output Format
Provide your evaluation as a structured JSON object with the following fields, in this specific order:
- "strengths": An array of 1-3 key strengths
- "weaknesses": An array of 1-3 key areas for improvement
- "reasoning": A concise explanation of your overall assessment
- "score": A number between 1-10

Respond with JSON. Keep your response concise and direct.
Example response shape:
{{
    "strengths": string[],
    "weaknesses": string[],
    "reasoning": string,
    "score": number
}}
    """

    conversation = []
    add_user_message(conversation, grading_prompt)
    # Prefill with an opening JSON fence and stop at the closing fence so the
    # reply is expected to contain only the JSON body.
    add_assistant_message(conversation, "```json")
    verdict_text = chat(conversation, stop_sequences=["```"])
    return json.loads(verdict_text)

In [8]:
# Passes a test case into Claude
def run_prompt(test_case):
    """Ask the model to solve the task in `test_case` and return its raw reply.

    Only test_case["task"] is used. The assistant turn is prefilled with an
    opening code fence and generation stops at the next fence, so the reply
    is expected to be just the solution body.
    """
    task_prompt = f"""
Please solve the following task:

{test_case["task"]}

* Respond only with Python, JSON, or a plain Regex
* Do not add any comments or commentary or explanation
"""

    conversation = []
    add_user_message(conversation, task_prompt)
    add_assistant_message(conversation, "```code")
    return chat(conversation, stop_sequences=["```"])

In [9]:
# Functions to validate the output structure
import re
import ast


def validate_json(text):
    """Return 10 if `text` (ignoring surrounding whitespace) parses as JSON, else 0."""
    candidate = text.strip()
    try:
        json.loads(candidate)
    except json.JSONDecodeError:
        return 0
    else:
        return 10


def validate_python(text):
    """Return 10 if `text` (ignoring surrounding whitespace) parses as Python source, else 0.

    The code is only parsed with ast.parse; it is never executed.
    """
    candidate = text.strip()
    try:
        ast.parse(candidate)
    except SyntaxError:
        return 0
    else:
        return 10


def validate_regex(text):
    """Return 10 if `text` (ignoring surrounding whitespace) compiles as a regex, else 0."""
    pattern = text.strip()
    try:
        re.compile(pattern)
    except re.error:
        return 0
    else:
        return 10


def grade_syntax(response, test_case):
    """Score `response` with the validator selected by test_case["format"].

    "json" and "python" use their own validators; every other value,
    including unrecognised ones, is checked as a regex.
    """
    expected_format = test_case["format"]
    if expected_format == "json":
        return validate_json(response)
    if expected_format == "python":
        return validate_python(response)
    return validate_regex(response)


In [10]:
# Function to execute a single test case and grade the output
def run_test_case(test_case):
    """Generate a solution for `test_case`, grade it, and return a result record.

    The final score is the plain average of the model grader's "score" and
    the syntax score from grade_syntax.

    Returns:
        A dict with the keys "output", "test_case", "score" and "reasoning".
    """
    solution = run_prompt(test_case)

    # Model review first: its verdict supplies both the score and the reasoning.
    verdict = grade_by_model(test_case, solution)
    model_score, reasoning = verdict["score"], verdict["reasoning"]

    # Format check for the format named by the test case.
    syntax_score = grade_syntax(solution, test_case)

    return {
        "output": solution,
        "test_case": test_case,
        "score": (model_score + syntax_score) / 2,
        "reasoning": reasoning,
    }

In [11]:
from statistics import mean


def run_eval(dataset):
    """Run every test case in `dataset` and print the average score.

    Args:
        dataset: Iterable of test-case dicts. It must already be loaded;
            this function does not read any file itself.

    Returns:
        A list with one run_test_case result per test case, in input order.
        An empty dataset gives an empty list instead of raising.
    """
    results = [run_test_case(test_case) for test_case in dataset]

    if not results:
        # statistics.mean raises StatisticsError on empty input, so report
        # the missing average explicitly instead of crashing the cell.
        print("Average score: n/a (no test cases)")
        return results

    average_score = mean([result["score"] for result in results])
    print(f"Average score: {average_score}")

    return results

In [12]:
# Reload the dataset from the JSON file written by the save cell, so the
# evaluation runs on the saved copy rather than on whatever `dataset`
# currently holds in the kernel.
with open(r"dataset/test-case-dataset3.json", "r") as f:
    dataset = json.load(f)

# Run every test case; run_eval prints the average score and returns the
# per-case result records.
results = run_eval(dataset)

Average score: 8.166666666666666


In [37]:
# Examining the Results
# Pretty-print the full list of result records returned by run_eval as JSON.
print(json.dumps(results, indent=2))

[
  {
    "output": "I'll write a Python function to validate AWS S3 bucket names according to the AWS naming conventions you specified.\n\n```python\nimport re\n\ndef validate_s3_bucket_name(bucket_name):\n    \"\"\"\n    Validates whether an AWS S3 bucket name follows AWS naming conventions.\n    \n    AWS S3 bucket naming rules:\n    - Must be between 3 and 63 characters long\n    - Can consist only of lowercase letters, numbers, and hyphens\n    - Must not contain consecutive hyphens\n    - Must not start or end with a hyphen\n    - Must not start or end with a period (additional AWS rule)\n    - Must not contain underscores or other special characters\n    - Must not be formatted as an IP address (additional AWS rule)\n    \n    Args:\n        bucket_name (str): The bucket name to validate\n        \n    Returns:\n        bool: True if the bucket name is valid, False otherwise\n    \"\"\"\n    \n    # Check if bucket_name is a string\n    if not isinstance(bucket_name, str):\n    

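Each result contains four key pieces of information:

- output: The complete response from Claude
- test_case: The original test case that was processed
- score: The evaluation score (the average of the model-graded score and the syntax-validation score)
- reasoning: The grader model's explanation of its assessment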