Generating test datasets

In [16]:
from anthropic import Anthropic
from dotenv import load_dotenv
load_dotenv()

import json

When evaluating AI models that generate code, you need more than just checking if the response makes sense. You also need to verify that the generated code actually has valid syntax and follows the correct format. This is where code-based grading comes in.

In [17]:
# Model identifier that chat() sends with every request.
model = "claude-sonnet-4-0"
# Shared API client, constructed with no explicit arguments.
# NOTE(review): presumably picks up credentials from the environment populated by load_dotenv() — confirm.
client = Anthropic()

In [18]:
def add_user_message(messages, text):
    """Append a user-role turn holding ``text`` to ``messages`` (mutated in place)."""
    messages.append({"role": "user", "content": text})

def add_assistant_message(messages, text):
    """Append an assistant-role turn holding ``text`` to ``messages`` (mutated in place)."""
    messages.append({"role": "assistant", "content": text})

def chat(messages, system=None, temperature=1.0, stop_sequences=None):
    """Send a conversation to the model and return the text of its reply.

    Args:
        messages: List of ``{"role": ..., "content": ...}`` dicts forming
            the conversation so far.
        system: Optional system prompt; left out of the request when falsy.
        temperature: Sampling temperature forwarded to the API.
        stop_sequences: Optional list of strings at which generation stops;
            left out of the request when ``None`` or empty.

    Returns:
        The ``text`` attribute of the first content block of the response.
    """
    params = {
        "model": model,
        "max_tokens": 1000,
        "messages": messages,
        "temperature": temperature,
    }
    # Optional fields are only sent when the caller actually supplied them,
    # so the request never carries an empty stop_sequences list.
    if system:
        params["system"] = system
    if stop_sequences:
        params["stop_sequences"] = stop_sequences

    response = client.messages.create(**params)
    return response.content[0].text

Code Based Grading

In [19]:
import re
import ast

In [20]:
# Functions to validate the output structure
import re
import ast


def validate_json(text):
    """Return 10 if ``text`` (whitespace-stripped) parses as JSON, otherwise 0."""
    candidate = text.strip()
    try:
        json.loads(candidate)
    except json.JSONDecodeError:
        return 0
    return 10


def validate_python(text):
    """Return 10 if ``text`` (whitespace-stripped) is syntactically valid Python, otherwise 0.

    The source is only parsed with ``ast.parse``; it is never executed.
    """
    source = text.strip()
    try:
        ast.parse(source)
    except (SyntaxError, ValueError):
        # Before Python 3.12, source containing null bytes raises ValueError
        # rather than SyntaxError; either way the text is not valid Python.
        return 0
    return 10


def validate_regex(text):
    """Return 10 if ``text`` (whitespace-stripped) compiles as a regex, otherwise 0."""
    pattern = text.strip()
    try:
        re.compile(pattern)
    except re.error:
        return 0
    else:
        return 10


def grade_syntax(response, test_case):
    """Score ``response`` with the validator matching ``test_case["format"]``.

    "json" and "python" select their own validators; any other format value
    is validated as a regex. Returns the validator's score (10 or 0).
    """
    expected_format = test_case["format"]
    if expected_format == "json":
        validator = validate_json
    elif expected_format == "python":
        validator = validate_python
    else:
        validator = validate_regex
    return validator(response)


In [21]:
# Generating Test Data with Code
def generate_dataset():
    """Ask the model for a small evaluation dataset and return it parsed from JSON.

    The assistant turn is prefilled with an opening ```json fence and the
    closing fence is used as a stop sequence, so the reply is expected to be
    just the JSON array, which is passed to ``json.loads``.
    """
    dataset_prompt = """
Generate an evaluation dataset for a prompt evaluation. The dataset will be used to evaluate prompts that generate Python, JSON, or Regex specifically for AWS-related tasks. Generate an array of JSON objects, each representing task that requires Python, JSON, or a Regex to complete.

Example output:
```json
[
  {
    "task": "Description of task",
    "format" : "python" or "json" or "regex",
  },
  ...additional
]
```

* Focus on tasks that can be solved by writing a single Python function, a single JSON object, or a single regex
* Focus on tasks that do not require writing much code

Please generate 3 objects.
"""
    conversation = []
    add_user_message(conversation, dataset_prompt)
    # Prefill the reply so the model continues straight into the JSON body.
    add_assistant_message(conversation, "```json")
    raw_json = chat(conversation, stop_sequences=["```"])
    return json.loads(raw_json)

In [22]:
# Generate the dataset by calling the model, then print it for inspection
dataset = generate_dataset()
print(dataset)

[{'task': "Create a Python function that parses an S3 bucket ARN and returns the bucket name. The function should extract the bucket name from ARNs in the format 'arn:aws:s3:::bucket-name'", 'format': 'python'}, {'task': 'Create a JSON configuration object for an AWS Lambda function that runs Python 3.9, has 512MB memory, 30 second timeout, and includes environment variables for DATABASE_URL and LOG_LEVEL', 'format': 'json'}, {'task': 'Write a regex pattern that matches valid AWS region names (e.g., us-east-1, eu-west-2, ap-southeast-1) which follow the format of 2-3 lowercase letters, hyphen, cardinal direction, hyphen, and a number', 'format': 'regex'}]


In [23]:
# Save dataset to a JSON file (pretty-printed with a 2-space indent)
# NOTE(review): open() in 'w' mode does not create the "dataset" directory — presumably it already exists; confirm.
with open(r'dataset/test-case-dataset2.json', 'w') as f:
    json.dump(dataset, f, indent=2)

In [24]:
# Passes a test case into Claude
def run_prompt(test_case):
    """Send the task in ``test_case["task"]`` to the model and return its raw reply.

    The assistant turn is prefilled with an opening ```code fence and the
    closing fence is used as a stop sequence, so the returned text is whatever
    the model wrote between the fences.
    """
    task_prompt = f"""
Please solve the following task:

{test_case["task"]}

* Respond only with Python, JSON, or a plain Regex
* Do not add any comments or commentary or explanation
"""

    conversation = []
    add_user_message(conversation, task_prompt)
    add_assistant_message(conversation, "```code")
    return chat(conversation, stop_sequences=["```"])

In [25]:
# Function to grade a test case + output using a model
def grade_by_model(test_case, output):
    """Have the model review ``output`` against the task and return its parsed verdict.

    The grader is asked for a JSON object with "strengths", "weaknesses",
    "reasoning" and "score" fields. The reply is prefilled with a ```json
    fence, cut at the closing fence, and parsed with ``json.loads``.
    """
    grading_prompt = f"""
You are an expert AWS code reviewer. Your task is to evaluate the following AI-generated solution.

Original Task:
<task>
{test_case["task"]}
</task>

Solution to Evaluate:
<solution>
{output}
</solution>

Output Format
Provide your evaluation as a structured JSON object with the following fields, in this specific order:
- "strengths": An array of 1-3 key strengths
- "weaknesses": An array of 1-3 key areas for improvement
- "reasoning": A concise explanation of your overall assessment
- "score": A number between 1-10

Respond with JSON. Keep your response concise and direct.
Example response shape:
{{
    "strengths": string[],
    "weaknesses": string[],
    "reasoning": string,
    "score": number
}}
    """

    conversation = []
    add_user_message(conversation, grading_prompt)
    # Prefill the reply so the model continues straight into the JSON body.
    add_assistant_message(conversation, "```json")
    verdict_json = chat(conversation, stop_sequences=["```"])
    return json.loads(verdict_json)

In [26]:
# Function to execute a single test case and grade the output
def run_test_case(test_case):
    """Run one test case through the model and grade the response.

    The final score is the average of the model grader's score and the
    syntax score from ``grade_syntax``. Returns a dict with the raw output,
    the test case, the combined score and the grader's reasoning.
    """
    answer = run_prompt(test_case)

    verdict = grade_by_model(test_case, answer)
    model_score, reasoning = verdict["score"], verdict["reasoning"]

    syntax_score = grade_syntax(answer, test_case)

    return {
        "output": answer,
        "test_case": test_case,
        "score": (model_score + syntax_score) / 2,
        "reasoning": reasoning,
    }

In [27]:
# This function coordinates the entire evaluation process:

from statistics import mean

def run_eval(dataset):
    """Run every test case in ``dataset`` and report the average score.

    Args:
        dataset: Iterable of test-case dicts, each passed to ``run_test_case``.

    Returns:
        A list with one result dict per test case, in dataset order. An empty
        dataset yields an empty list.
    """
    results = [run_test_case(test_case) for test_case in dataset]

    # mean() raises StatisticsError on empty input, so report that case explicitly.
    if not results:
        print("Average score: n/a (no test cases)")
        return results

    average_score = mean(result["score"] for result in results)
    print(f"Average score: {average_score}")

    return results

In [29]:
# Reload the saved test cases from disk and run the evaluation over them.
with open(r"dataset/test-case-dataset2.json", "r") as f:
    dataset = json.load(f)

# run_eval prints the average score and returns the per-test-case results.
results = run_eval(dataset)

Average score: 8.0


In [30]:
# Examining the Results: pretty-print every per-test-case result as indented JSON
print(json.dumps(results, indent=2))

[
  {
    "output": "\ndef parse_s3_bucket_arn(arn):\n    return arn.split(':::')[1]\n",
    "test_case": {
      "task": "Create a Python function that parses an S3 bucket ARN and returns the bucket name. The function should extract the bucket name from ARNs in the format 'arn:aws:s3:::bucket-name'",
      "format": "python"
    },
    "score": 7.0,
    "reasoning": "While the core logic works for valid S3 ARNs, the solution lacks robustness. It will fail catastrophically on invalid inputs without proper error handling or validation, making it unsuitable for production use."
  },
  {
    "output": "\n{\n  \"FunctionName\": \"my-lambda-function\",\n  \"Runtime\": \"python3.9\",\n  \"Role\": \"arn:aws:iam::123456789012:role/lambda-execution-role\",\n  \"Handler\": \"lambda_function.lambda_handler\",\n  \"Code\": {\n    \"ZipFile\": \"\"\n  },\n  \"Description\": \"\",\n  \"Timeout\": 30,\n  \"MemorySize\": 512,\n  \"Environment\": {\n    \"Variables\": {\n      \"DATABASE_URL\": \"\",\n

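Each result contains four key pieces of information:

- output: The complete response from Claude
- test_case: The original test case that was processed
- score: The evaluation score (the average of the model-graded score and the code-based syntax score)
- reasoning: The model grader's explanation of its assessment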