In [4]:
# Load env variables and create client
from dotenv import load_dotenv
from anthropic import Anthropic

# Runs before Anthropic() is constructed; presumably so the client can pick up
# its credentials from the environment -- TODO confirm against the SDK docs.
load_dotenv()

# Shared client and model name; chat() reads both as module-level globals.
client = Anthropic()
model = "claude-3-5-haiku-latest"

In [5]:
# Helper functions
def add_user_message(messages, text):
    """Append a user-role turn holding ``text`` to ``messages`` (mutated in place)."""
    messages.append({"role": "user", "content": text})


def add_assistant_message(messages, text):
    """Append an assistant-role turn holding ``text`` to ``messages`` (mutated in place)."""
    messages.append({"role": "assistant", "content": text})


def chat(messages, system=None, temperature=1.0, stop_sequences=None):
    """Send ``messages`` to the model and return the text of the first content block.

    Args:
        messages: List of ``{"role": ..., "content": ...}`` dicts, as built by
            ``add_user_message`` / ``add_assistant_message``.
        system: Optional system prompt. It is only included in the request
            when truthy.
        temperature: Sampling temperature forwarded to the API.
        stop_sequences: Optional list of stop strings. ``None`` (the default)
            sends an empty list.

    Returns:
        ``message.content[0].text`` from the API response.
    """
    # A ``None`` sentinel replaces the former ``[]`` default: a mutable default
    # is a single list object shared by every call, so each request now gets
    # its own fresh list instead.
    if stop_sequences is None:
        stop_sequences = []

    params = {
        "model": model,
        "max_tokens": 1000,
        "messages": messages,
        "temperature": temperature,
        "stop_sequences": stop_sequences,
    }

    if system:
        params["system"] = system

    message = client.messages.create(**params)
    return message.content[0].text

In [None]:
def grade_by_model(test_case, output):
    """Have the model review ``output`` for ``test_case`` and return its parsed verdict.

    The prompt asks for a JSON object with "strengths", "weaknesses",
    "reasoning" and "score". Whatever JSON text comes back is parsed with
    ``json.loads`` and returned without further validation.
    """
    grading_prompt = f"""
    You are an expert code reviewer. Evaluate this AI-generated solution.

    Task: {test_case['task']}
    Solution: {output}

    Provide your evaluation as a structured JSON object with:
    - "strengths": An array of 1-3 key strengths
    - "weaknesses": An array of 1-3 key areas for improvement
    - "reasoning": A concise explanation of your assessment
    - "score": A number between 1-10
    """

    # Prefill the assistant turn with an opening JSON fence and stop at the
    # closing fence, so the reply is expected to be just the JSON payload.
    conversation = []
    add_user_message(conversation, grading_prompt)
    add_assistant_message(conversation, "```json")

    return json.loads(chat(conversation, stop_sequences=["```"]))

In [17]:
import json


def generate_dataset():
    """Ask the model for three AWS-themed test tasks and return them parsed from JSON.

    The reply is prefilled with an opening JSON fence and generation stops at
    the closing fence; the text in between is handed to ``json.loads``.
    """
    prompt = """
Generate a evaluation dataset for a prompt evaluation. The dataset will be used to evaluate prompts
that generate Python, JSON, or Regex specifically for AWS-related tasks. Generate an array of JSON objects,
each representing task that requires Python, JSON, or a Regex to complete.

Example output:
```json
[
    {
        "task": "Description of task",
    },
    ...additional
]
```

* Focus on tasks that can be solved by writing a single Python function, a single JSON object, or a regular expression.
* Focus on tasks that do not require writing much code

Please generate 3 objects.
"""

    conversation = []
    add_user_message(conversation, prompt)
    add_assistant_message(conversation, "```json")

    raw_json = chat(conversation, stop_sequences=["```"])
    return json.loads(raw_json)

In [18]:
import json

# Ask the model for a fresh dataset and pretty-print it for inspection.
dataset = generate_dataset()
print(json.dumps(dataset, indent=2))

[
  {
    "task": "Create a Python function that validates an AWS IAM username follows the required naming conventions (lowercase, 64 chars max, starts with a letter, contains only letters, numbers, and certain special characters)",
    "type": "Python Function"
  },
  {
    "task": "Write a JSON schema to define the structure for an AWS CloudFormation template parameter section, ensuring required fields and allowed value types",
    "type": "JSON Schema"
  },
  {
    "task": "Develop a regular expression that matches valid AWS EC2 instance IDs (format: i-[alphanumeric characters], exactly 17 characters long)",
    "type": "Regex"
  }
]


In [19]:
# Save the generated dataset as indented JSON to dataset.json (path is
# relative to the current working directory).
with open('dataset.json', 'w') as f:
    json.dump(dataset, f, indent=2)

In [20]:
def run_prompt(test_case):
    """Wrap the test case's task in the prompt under evaluation and return the model's reply."""
    prompt = f"""
Please solve the following task:

{test_case["task"]}
"""

    # Single-turn conversation: only the user prompt, default chat settings.
    conversation = []
    add_user_message(conversation, prompt)
    return chat(conversation)

In [22]:
def run_test_case(test_case):
    """Run the prompt for one test case, grade the output, and return the record.

    Args:
        test_case: Dict with at least a "task" key (read by ``run_prompt`` and
            ``grade_by_model``).

    Returns:
        Dict with the model "output", the original "test_case", and the
        grader's "score" and "reasoning".

    Raises:
        KeyError: If the grader's verdict lacks "score" or "reasoning".
    """
    output = run_prompt(test_case)

    model_grade = grade_by_model(test_case, output)
    score = model_grade["score"]
    reasoning = model_grade["reasoning"]

    return {
        "output": output,
        "test_case": test_case,
        "score": score,
        # Keep the grader's explanation next to the score so a low score can
        # be understood from the results alone.
        "reasoning": reasoning,
    }

In [None]:
from statistics import mean

def run_eval(dataset):
    """Run and grade every test case in ``dataset`` and print the average score.

    Args:
        dataset: Iterable of test-case dicts; each is passed to
            ``run_test_case``.

    Returns:
        List of the per-case result dicts, in dataset order. An empty dataset
        yields an empty list.
    """
    results = [run_test_case(test_case) for test_case in dataset]

    # statistics.mean raises StatisticsError on empty input, so an empty
    # dataset is reported explicitly instead of crashing the run.
    if results:
        average_score = mean([result["score"] for result in results])
        print(f"Average score: {average_score}")
    else:
        print("Average score: n/a (dataset is empty)")

    return results

In [15]:
# Reload the test cases from dataset.json rather than reusing the in-memory copy.
with open("dataset.json", "r") as f:
    dataset = json.load(f)

# Run and grade every test case; run_eval prints the average score itself.
results = run_eval(dataset)

# Dump the full per-case records as indented JSON.
print(json.dumps(results, indent=2))

[
  {
    "output": "Here's a regular expression that validates an AWS IAM username according to the specified requirements:\n\n```\n^[a-zA-Z][a-zA-Z0-9+=,.@_]*$\n```\n\nLet's break down the regex:\n\n- `^` - Anchors the start of the string\n- `[a-zA-Z]` - Ensures the first character is a letter (uppercase or lowercase)\n- `[a-zA-Z0-9+=,.@_]*` - Allows zero or more characters that are:\n  - Letters (a-z or A-Z)\n  - Numbers (0-9)\n  - Specific special characters: +, =, ,, ., @, _\n- `$` - Anchors the end of the string\n\nExamples of valid usernames:\n- john.doe\n- jane_smith\n- user123\n- admin+test\n- john.doe@company\n\nExamples of invalid usernames:\n- 123user (starts with a number)\n- user! (contains an unauthorized special character)\n- #username (starts with an unauthorized special character)\n\nYou can test this regex in most programming languages or regex testers to validate AWS IAM usernames.",
    "test_case": {
      "task": "Create a regular expression to validate an AWS IA