Generating test datasets

In [26]:
from anthropic import Anthropic
from dotenv import load_dotenv
load_dotenv()

import json

In [27]:
model = "claude-sonnet-4-0"
client = Anthropic()

In [28]:
def add_user_message(messages, text):
    user_message = {"role": "user", "content": text}
    messages.append(user_message)

def add_assistant_message(messages, text):
    assistant_message = {"role": "assistant", "content": text}
    messages.append(assistant_message)

def chat(messages, system=None, temperature=1.0, stop_sequences=[]):
    params = {
        "model": model,
        "max_tokens": 1000,
        "messages": messages,
        "temperature": temperature
    }
    if system:
        params["system"] = system
    if stop_sequences:
        params["stop_sequences"] = stop_sequences
    
    response = client.messages.create(**params)
    return response.content[0].text

Model Based Grading

In [29]:
# Generating Test Data with Code
def generate_dataset():
    prompt = """
Generate an evaluation dataset for a prompt evaluation. The dataset will be used to evaluate prompts that generate Python, JSON, or Regex specifically for AWS-related tasks. Generate an array of JSON objects, each representing task that requires Python, JSON, or a Regex to complete.

Example output:
```json
[
  {
    "task": "Description of task",
  },
  ...additional
]
```

* Focus on tasks that can be solved by writing a single Python function, a single JSON object, or a single regex
* Focus on tasks that do not require writing much code

Please generate 3 objects.
"""
    messages = []
    add_user_message(messages, prompt)
    add_assistant_message(messages, "```json")
    text = chat(messages, stop_sequences=["```"])
    return json.loads(text)

In [30]:
# Generate the dataset
dataset = generate_dataset()
print(dataset)

[{'task': "Write a Python function that validates whether an AWS S3 bucket name follows AWS naming conventions (3-63 characters, lowercase letters/numbers/hyphens only, no consecutive hyphens, doesn't start/end with hyphen)"}, {'task': "Create a JSON object representing an AWS IAM policy that grants read-only access to a specific S3 bucket named 'my-app-logs' and all objects within it"}, {'task': 'Write a regex pattern that matches valid AWS EC2 instance IDs (format: i- followed by 8 or 17 hexadecimal characters)'}]


In [31]:
# Save dataset to a JSON file
with open(r'dataset/test-case-dataset.json', 'w') as f:
    json.dump(dataset, f, indent=2)

In [32]:
# This function takes a test case and merges it with our prompt template:

def run_prompt(test_case):
    """Merges the prompt and test case input, then returns the result"""
    prompt = f"""
Please solve the following task:

{test_case["task"]}
"""
    
    messages = []
    add_user_message(messages, prompt)
    output = chat(messages)
    return output

In [33]:
# Implementing a Model Grader

def grade_by_model(test_case, output):
    # Create evaluation prompt
    eval_prompt = """
    You are an expert code reviewer. Evaluate this AI-generated solution.
    
    Task: {task}
    Solution: {solution}
    
    Provide your evaluation as a structured JSON object with:
    - "strengths": An array of 1-3 key strengths
    - "weaknesses": An array of 1-3 key areas for improvement  
    - "reasoning": A concise explanation of your assessment
    - "score": A number between 1-10
    """
    
    messages = []
    add_user_message(messages, eval_prompt)
    add_assistant_message(messages, "```json")
    
    eval_text = chat(messages, stop_sequences=["```"])
    return json.loads(eval_text)

In [34]:
# This function orchestrates running a single test case and grading the result:

def run_test_case(test_case):
    """Calls run_prompt, then grades the result"""
    output = run_prompt(test_case)
    
    model_grade = grade_by_model(test_case, output)
    score = model_grade["score"]
    # Grade the output
    reasoning = model_grade["reasoning"]
    
    return {
        "output": output,
        "test_case": test_case,
        "score": score,
        "reasoning": reasoning
    }

In [35]:
# This function coordinates the entire evaluation process:

from statistics import mean

def run_eval(dataset):
    """Loads the dataset and calls run_test_case with each case"""
    results = []
    
    for test_case in dataset:
        result = run_test_case(test_case)
        results.append(result)
    
    average_score = mean([result["score"] for result in results])
    print(f"Average score: {average_score}")

    return results

In [36]:
with open(r"dataset/test-case-dataset.json", "r") as f:
    dataset = json.load(f)

results = run_eval(dataset)

Average score: 4.666666666666667


In [37]:
# Examining the Results
print(json.dumps(results, indent=2))

[
  {
    "output": "I'll write a Python function to validate AWS S3 bucket names according to the AWS naming conventions you specified.\n\n```python\nimport re\n\ndef validate_s3_bucket_name(bucket_name):\n    \"\"\"\n    Validates whether an AWS S3 bucket name follows AWS naming conventions.\n    \n    AWS S3 bucket naming rules:\n    - Must be between 3 and 63 characters long\n    - Can consist only of lowercase letters, numbers, and hyphens\n    - Must not contain consecutive hyphens\n    - Must not start or end with a hyphen\n    - Must not start or end with a period (additional AWS rule)\n    - Must not contain underscores or other special characters\n    - Must not be formatted as an IP address (additional AWS rule)\n    \n    Args:\n        bucket_name (str): The bucket name to validate\n        \n    Returns:\n        bool: True if the bucket name is valid, False otherwise\n    \"\"\"\n    \n    # Check if bucket_name is a string\n    if not isinstance(bucket_name, str):\n    

Each result contains three key pieces of information:

- output: The complete response from Claude
- test_case: The original test case that was processed
- score: The evaluation score (currently hardcoded)