In [8]:
# %pip install anthropic python-dotenv

from dotenv import load_dotenv
# Runs before the client is constructed.
# NOTE(review): presumably this loads the API key from a local .env file into
# the environment so Anthropic() can pick it up — confirm.
load_dotenv()

from anthropic import Anthropic

# Shared API client used by chat(); constructed with no explicit arguments.
client = Anthropic()
# Model identifier that chat() sends with every request.
model = "claude-3-5-haiku-latest"

def add_user_message(messages, text):
  """Append a user-role turn holding `text` to `messages` (mutated in place)."""
  messages.append({"role": "user", "content": text})

def add_assistant_message(messages, text):
  """Append an assistant-role turn holding `text` to `messages` (mutated in place)."""
  messages.append({"role": "assistant", "content": text})

def add_system_message(messages, text):
  """Append a system-role entry holding `text` to `messages` (mutated in place).

  NOTE(review): chat() in this file sends the system prompt through a separate
  `system` argument rather than through `messages`; confirm the API accepts a
  "system" role inside `messages` before relying on this helper.
  """
  messages.append({"role": "system", "content": text})


def chat(messages, stop_sequences=None, system=None, temperature=1.0):
    """Send `messages` to the model and return the text of its reply.

    Args:
        messages: Conversation turns, forwarded to the API unchanged.
        stop_sequences: Optional iterable of strings at which generation
            should stop. Omitted or empty means no stop sequences.
        system: Optional system prompt; only included in the request when
            truthy.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The `.text` of the first content block of the response, or "" when
        the response carries no content blocks.
    """
    params = {
        "model": model,
        "max_tokens": 1000,
        "messages": messages,
        "temperature": temperature,
        # Build a fresh list on every call instead of sharing one mutable
        # default list object across calls (and with the client library).
        "stop_sequences": list(stop_sequences) if stop_sequences else [],
    }

    if system:
        params["system"] = system

    message = client.messages.create(**params)

    # Indexing [0] on an empty content list would raise IndexError.
    # NOTE(review): presumably this can happen when generation stops
    # immediately (e.g. right after a prefill) — confirm against the API.
    if not message.content:
        return ""
    return message.content[0].text

In [11]:
import json


def generate_dataset(num_cases=3):
  """Ask the model for an evaluation dataset and return it parsed from JSON.

  The assistant turn is prefilled with an opening ```json fence and generation
  is stopped at the next ``` so that the reply is just the JSON body, which is
  then parsed with json.loads.

  Args:
      num_cases: How many task objects to request in the prompt. Defaults to
          3, which renders the same prompt text as before this parameter
          existed.

  Returns:
      Whatever json.loads produces from the reply (the prompt asks for a list
      of {"task": ...} objects).

  Raises:
      json.JSONDecodeError: if the model's reply is not valid JSON.
  """
  # f-string: literal braces in the JSON example are doubled; only the
  # requested object count is interpolated.
  prompt = f"""
Generate a evaluation dataset for a prompt evaluation. The dataset will be used to evaluate prompts
that generate Python, JSON, or Regex specifically for AWS-related tasks. Generate an array of JSON objects,
each representing task that requires Python, JSON, or a Regex to complete.

Example output:
```json
[
    {{
        "task": "Description of task",
    }},
    ...additional
]
```

* Focus on tasks that can be solved by writing a single Python function, a single JSON object, or a regular expression.
* Focus on tasks that do not require writing much code

Please generate {num_cases} objects.
"""
  messages = []
  add_user_message(messages, prompt)
  # Prefill the reply with the opening fence; the stop sequence below cuts
  # generation at the closing fence.
  add_assistant_message(messages, "```json")
  output = chat(messages, stop_sequences=["```"])
  return json.loads(output)

# Generate a fresh dataset and persist it as indented JSON so that a later
# cell can reload it from disk.
dataset = generate_dataset()
with open("dataset.json", "w") as f:
  f.write(json.dumps(dataset, indent=2))



In [13]:
def run_prompt(test_case):
    """Wrap the test case's task in the prompt template and return the model's reply."""
    task = test_case["task"]
    conversation = []
    add_user_message(conversation, f"""
Please solve the following task:

{task}
""")
    return chat(conversation)

def run_test_case(test_case):
    """Run the prompt for one test case and package its output with a score."""
    # TODO - Grading
    # Until grading is implemented, every case receives the same fixed score.
    return {
        "output": run_prompt(test_case),
        "test_case": test_case,
        "score": 10,
    }

def run_eval(dataset):
    """Run every test case in `dataset` through run_test_case, returning the results in order."""
    return [run_test_case(case) for case in dataset]

In [14]:
# Reload the persisted test cases from disk, then run each one through the
# eval pipeline.
with open("dataset.json", "r") as f:
    dataset = json.loads(f.read())

results = run_eval(dataset)

In [17]:
# Pretty-print the collected eval results as JSON with 2-space indentation.
print(json.dumps(results, indent=2))


[
  {
    "output": "Here's a Python function to extract the AWS region from an ARN string:\n\n```python\ndef extract_aws_region_from_arn(arn):\n    \"\"\"\n    Extract the AWS region from a valid ARN string.\n    \n    Args:\n        arn (str): The Amazon Resource Name (ARN)\n    \n    Returns:\n        str: The AWS region extracted from the ARN\n        None: If the ARN is invalid or region cannot be extracted\n    \"\"\"\n    # Check if the ARN is a valid string\n    if not isinstance(arn, str):\n        return None\n    \n    # Split the ARN into its components\n    try:\n        # ARN format: arn:partition:service:region:account-id:resource-type/resource-id\n        arn_parts = arn.split(':')\n        \n        # Validate basic ARN structure\n        if len(arn_parts) < 5 or arn_parts[0] != 'arn':\n            return None\n        \n        # The region is typically the 4th component (index 3)\n        region = arn_parts[3]\n        \n        # Additional validation to ensure regi