In [1]:
# Load env variables and create client
from dotenv import load_dotenv
from anthropic import Anthropic

# Load environment variables first (python-dotenv; typically sourced from a
# local .env file) so they are in place before the client is constructed.
load_dotenv()

# No credentials are passed explicitly.
# NOTE(review): presumably the SDK picks its API key up from the environment
# populated above - confirm against the anthropic client documentation.
client = Anthropic()
# Model identifier that chat() sends with every request.
model = "claude-3-5-haiku-latest"

In [2]:
# Helper functions
def add_user_message(messages, text):
    """Append a user-role turn carrying `text` to `messages` (mutated in place)."""
    messages.append({"role": "user", "content": text})


def add_assistant_message(messages, text):
    """Append an assistant-role turn carrying `text` to `messages` (mutated in place)."""
    messages.append({"role": "assistant", "content": text})


def chat(messages, system=None, temperature=1.0, stop_sequences=None):
    """Send a conversation to the model and return the text of its first content block.

    Args:
        messages: List of {"role": ..., "content": ...} dicts, oldest first.
        system: Optional system prompt; left out of the request when falsy.
        temperature: Sampling temperature forwarded to the API.
        stop_sequences: Optional list of strings that end generation early.
            None (the default) means no stop sequences.

    Returns:
        The `text` attribute of the first block in the response's `content`.
    """
    params = {
        "model": model,
        "max_tokens": 1000,
        "messages": messages,
        "temperature": temperature,
        # A fresh list per call: a `[]` default in the signature would be one
        # shared object reused (and potentially mutated) across every request.
        "stop_sequences": [] if stop_sequences is None else stop_sequences,
    }

    if system:
        params["system"] = system

    message = client.messages.create(**params)
    return message.content[0].text

In [3]:
def grade_by_model(test_case, output):
    """Ask the model to review `output` against `test_case["task"]`.

    Args:
        test_case: Dict with at least a "task" key describing what was asked.
        output: The generated solution to be evaluated.

    Returns:
        The JSON object parsed from the model's reply, guaranteed to carry a
        numeric "score". If the reply is not valid JSON, is not a JSON object,
        or has no numeric "score", a placeholder with score 0 is returned
        instead so callers can always do arithmetic on result["score"].
    """
    # Create evaluation prompt
    eval_prompt = f"""
You are an expert code reviewer. Evaluate this AI-generated solution.

Task: {test_case["task"]}
Solution: {output}

Provide your evaluation as a structured JSON object with:
- "strengths": An array of 1-3 key strengths
- "weaknesses": An array of 1-3 key areas for improvement  
- "reasoning": A concise explanation of your assessment
- "score": A number between 1-10
"""

    failed_evaluation = {
        "score": 0,
        "strengths": [],
        "weaknesses": [],
        "reasoning": "Failed to parse evaluation",
    }

    messages = []
    add_user_message(messages, eval_prompt)
    # Prefill the assistant turn with an opening JSON fence; the closing fence
    # is the stop sequence, so the reply should be the bare JSON body.
    add_assistant_message(messages, "```json")

    eval_text = chat(messages, stop_sequences=["```"])
    try:
        evaluation = json.loads(eval_text)
    except json.JSONDecodeError:
        return failed_evaluation

    # Well-formed JSON can still be the wrong shape (a list, a bare number, an
    # object without a usable score); treat that like a parse failure too.
    score = evaluation.get("score") if isinstance(evaluation, dict) else None
    if isinstance(score, bool) or not isinstance(score, (int, float)):
        return failed_evaluation

    return evaluation

In [4]:
# Function to generate a new dataset
import json


def generate_dataset(num_cases=3):
    """Ask the model to invent evaluation test cases and return them parsed.

    Args:
        num_cases: How many test-case objects to request. Defaults to 3.

    Returns:
        Whatever JSON value the model's reply decodes to; the prompt asks for
        an array of {"task": ..., "format": ...} objects.

    Raises:
        ValueError: If `num_cases` is not a positive integer.
        json.JSONDecodeError: If the model's reply is not valid JSON.
    """
    if isinstance(num_cases, bool) or not isinstance(num_cases, int) or num_cases < 1:
        raise ValueError("num_cases must be a positive integer")

    # Kept as a plain (non-f) string because the example JSON is full of
    # literal braces; the requested count is appended separately below.
    prompt = """
Generate a evaluation dataset for a prompt evaluation. The dataset will be used to evaluate prompts
that generate Python, JSON, or Regex specifically for AWS-related tasks. Generate an array of JSON objects,
each representing task that requires Python, JSON, or a Regex to complete.

Example output:
```json
[
    {
        "task": "Description of task",
        "format": "json" or "python" or "regex"
    },
    ...additional
]
```

* Focus on tasks that can be solved by writing a single Python function, a single JSON object, or a regular expression.
* Focus on tasks that do not require writing much code
* Respond only with Python, JSON, or a plain Regex
* Do not add any comments or commentary or explanation

"""
    prompt += f"Please generate {num_cases} objects.\n"

    messages = []
    add_user_message(messages, prompt)
    # Prefill an opening JSON fence and stop at the closing one so the reply
    # is the bare JSON array.
    add_assistant_message(messages, "```json")
    text = chat(messages, stop_sequences=["```"])
    return json.loads(text)

In [5]:
import json

# Ask the model for a fresh set of test cases and pretty-print them so they
# can be inspected before being written to disk.
dataset = generate_dataset()
print(json.dumps(dataset, indent=2))

[
  {
    "task": "Create a JSON schema that validates an AWS EC2 instance configuration with required fields like instance type, AMI ID, and security group",
    "format": "json"
  },
  {
    "task": "Write a Python function to parse and extract the AWS region from an ARN (Amazon Resource Name)",
    "format": "python"
  },
  {
    "task": "Create a regex pattern to validate an AWS S3 bucket name according to naming rules (lowercase, no spaces, 3-63 characters)",
    "format": "regex"
  }
]


In [6]:
# Persist the generated test cases to dataset.json in the working directory;
# the evaluation cell reloads them from this file instead of regenerating.
with open('dataset.json', 'w') as f:
    json.dump(dataset, f, indent=2)

In [7]:
def run_prompt(test_case):
    """Build the format-specific prompt for `test_case`, send it, and return the reply.

    The "python", "json" and "regex" formats each get a "Task / Requirements"
    prompt with their own rule list; any other format gets a generic
    "solve the following task" prompt.
    """
    kind = test_case["format"]
    task = test_case["task"]

    if kind == "python":
        rules = (
            "- Provide ONLY valid Python code",
            "- No markdown formatting, no backticks, no explanations",
            "- No comments or docstrings",
            "- Start directly with the function definition",
            "- Single function only",
        )
    elif kind == "json":
        rules = (
            "- Provide ONLY valid JSON",
            "- No markdown formatting, no backticks, no explanations",
            "- No comments before or after the JSON",
            "- Start directly with opening brace {",
        )
    elif kind == "regex":
        rules = (
            "- Provide ONLY the regex pattern",
            "- No markdown formatting, no backticks, no explanations",
            "- No anchors unless specifically needed",
            "- Just the raw regex pattern",
        )
    else:
        rules = None

    if rules is None:
        prompt = (
            "\nPlease solve the following task:\n\n"
            f"{task}"
            "\n\nProvide only the solution without any explanation or formatting.\n"
        )
    else:
        # Leading and trailing "" entries reproduce the newline that opens and
        # closes the prompt text.
        prompt = "\n".join(["", f"Task: {task}", "", "Requirements:", *rules, ""])

    conversation = []
    add_user_message(conversation, prompt)
    return chat(conversation)

In [8]:
# Functions to validate the output structure
import re
import ast


def extract_code_from_markdown(text, language):
    """Return the stripped contents of the first fenced code block in `text`.

    A fence tagged with `language` (case-insensitive) is preferred. Failing
    that, the first fence of any kind is used, with a leading info string
    (e.g. "js" in "```js") dropped when it sits on the opening fence line.

    Args:
        text: Markdown text that may contain ``` fenced blocks.
        language: Language tag to look for after the opening fence.

    Returns:
        The code inside the chosen fence with surrounding whitespace removed,
        or None when `text` contains no complete fenced block.
    """
    # Escape the tag so names such as "c++" are matched literally instead of
    # being interpreted as regex quantifiers.
    tagged_fence = rf'```{re.escape(language)}(.*?)```'
    match = re.search(tagged_fence, text, re.DOTALL | re.IGNORECASE)
    if match:
        return match.group(1).strip()

    # Any fence at all. The optional group swallows an info string that is
    # followed by a newline so a differently-tagged block does not leak its
    # tag into the extracted code.
    any_fence = r'```(?:[\w+#.-]*[ \t]*\r?\n)?(.*?)```'
    match = re.search(any_fence, text, re.DOTALL)
    if match:
        return match.group(1).strip()

    return None


def validate_json(text):
    """Score `text` 10 if it is JSON, bare or inside a markdown fence; otherwise 0."""

    def _is_json(candidate):
        try:
            json.loads(candidate)
        except json.JSONDecodeError:
            return False
        return True

    stripped = text.strip()
    if _is_json(stripped):
        return 10

    # Not bare JSON: fall back to whatever sits inside a fenced code block.
    fenced = extract_code_from_markdown(stripped, 'json')
    if fenced and _is_json(fenced):
        return 10

    return 0


def validate_python(text):
    """Score `text` 10 if it is parseable Python, bare or inside a markdown fence; otherwise 0.

    An empty or whitespace-only reply scores 0: it would parse as an empty
    module, but it is not a solution.
    """

    def _parses(source):
        try:
            ast.parse(source)
        except (SyntaxError, ValueError):
            # ValueError covers sources the compiler rejects outright (for
            # example embedded NUL bytes on some Python versions).
            return False
        return True

    text = text.strip()
    if not text:
        return 0

    # First try direct parsing
    if _parses(text):
        return 10

    # Try extracting from markdown
    extracted = extract_code_from_markdown(text, 'python')
    if extracted and _parses(extracted):
        return 10

    return 0


def validate_regex(text):
    """Score `text` 10 if a compilable regex can be found in it; otherwise 0.

    Tried in order: the whole reply, the contents of a markdown fence, then
    each individual line. Empty replies score 0, and lines that are comments
    or markdown fence markers are never counted as patterns.
    """

    def _compiles(pattern):
        try:
            re.compile(pattern)
        except re.error:
            return False
        return True

    text = text.strip()
    # The empty string compiles, but an empty reply is not a solution.
    if not text:
        return 0

    # First try direct compilation
    if _compiles(text):
        return 10

    # Try extracting from markdown
    extracted = extract_code_from_markdown(text, 'regex')
    if extracted and _compiles(extracted):
        return 10

    # Try finding regex patterns in text
    for line in text.split('\n'):
        line = line.strip()
        # A fence line such as "```regex" compiles as a regex of literal
        # characters; counting it would let an invalid fenced pattern pass.
        if not line or line.startswith(('#', '//', '```')):
            continue
        if _compiles(line):
            return 10

    return 0


def grade_syntax(response, test_case):
    """Run the syntax validator matching `test_case["format"]` on `response`.

    "json" and "python" use their own validators; every other format is
    checked as a regex.
    """
    kind = test_case["format"]

    if kind == "json":
        validator = validate_json
    elif kind == "python":
        validator = validate_python
    else:
        validator = validate_regex

    return validator(response)

In [9]:
def run_test_case(test_case):
    """Generate a response for `test_case` and score it.

    The final score is the plain average of the model-assigned score and the
    syntax score.
    """
    response = run_prompt(test_case)

    # Model-based grade first, then the syntax check on the same response.
    llm_score = grade_by_model(test_case, response)["score"]
    format_score = grade_syntax(response, test_case)

    return {
        "output": response,
        "test_case": test_case,
        "score": (llm_score + format_score) / 2,
    }

In [10]:
from statistics import mean

def run_eval(dataset):
    """Run every test case in `dataset`, print the average score, and return the results.

    Args:
        dataset: Iterable of test-case dicts, each passed to run_test_case.

    Returns:
        A list with one run_test_case result per test case, in dataset order.
        An empty dataset yields an empty list.
    """
    results = [run_test_case(test_case) for test_case in dataset]

    if results:
        average_score = mean(result["score"] for result in results)
        print(f"Average score: {average_score}")
    else:
        # statistics.mean raises StatisticsError on an empty sequence; an
        # empty dataset is reported instead of crashing the run.
        print("Average score: n/a (dataset is empty)")

    return results

In [11]:
# Reload the saved test cases from dataset.json so the evaluation runs against
# the file on disk rather than whatever `dataset` currently holds in the kernel.
with open("dataset.json", "r") as f:
    dataset = json.load(f)

# Run every test case; run_eval prints the average score and returns the
# per-case results.
results = run_eval(dataset)

# Dump the full per-case results (output, test case, score) as indented JSON.
print(json.dumps(results, indent=2))

Average score: 8.083333333333334
[
  {
    "output": "{\n    \"$schema\": \"http://json-schema.org/draft-07/schema#\",\n    \"type\": \"object\",\n    \"properties\": {\n        \"instanceType\": {\n            \"type\": \"string\",\n            \"enum\": [\n                \"t2.micro\",\n                \"t2.small\",\n                \"t2.medium\",\n                \"m5.large\",\n                \"c5.xlarge\"\n            ]\n        },\n        \"amiId\": {\n            \"type\": \"string\",\n            \"pattern\": \"^ami-[0-9a-f]{8,17}$\"\n        },\n        \"securityGroups\": {\n            \"type\": \"array\",\n            \"minItems\": 1,\n            \"items\": {\n                \"type\": \"string\",\n                \"pattern\": \"^sg-[0-9a-f]{8,17}$\"\n            }\n        },\n        \"tags\": {\n            \"type\": \"object\",\n            \"additionalProperties\": {\n                \"type\": \"string\"\n            }\n        },\n        \"networkInterfaces\": {\n 