In [None]:
# Load environment variables (via python-dotenv) before any API client is constructed.
# NOTE(review): presumably this is how the Anthropic API key reaches the client below —
# confirm that a .env file providing it is available to the notebook.
from dotenv import load_dotenv  
load_dotenv()

In [39]:
# Create the API client and pick the model used for every request in this notebook.
from anthropic import Anthropic
# Constructed without arguments. NOTE(review): credentials are presumably read from the
# environment loaded in the first cell — confirm.
client = Anthropic()
# Model identifier sent as the "model" field by chat().
model = "claude-sonnet-4-0"

In [40]:
# Helper: record a user turn in the conversation
def add_user_message(messages, text):
    """Append a ``{"role": "user", "content": text}`` entry to ``messages`` in place."""
    messages.append({"role": "user", "content": text})

# Helper: record an assistant turn in the conversation
def add_assistant_message(messages, text):
    """Append a ``{"role": "assistant", "content": text}`` entry to ``messages`` in place."""
    messages.append({"role": "assistant", "content": text})

#Make a request to the API
def chat(messages, system=None, temperature=1.0, stop_sequences=None, max_tokens=1000):
    """Send ``messages`` to the model and return the text of the first content block.

    Args:
        messages: Conversation so far, as a list of ``{"role": ..., "content": ...}``
            dicts (the shape built by add_user_message / add_assistant_message).
        system: Optional system prompt; only included in the request when truthy.
        temperature: Sampling temperature forwarded to the API.
        stop_sequences: Optional list of strings forwarded as the request's
            ``stop_sequences``. ``None`` (the default) sends an empty list.
        max_tokens: Value forwarded as the request's ``max_tokens``; defaults to
            the previously hard-coded 1000.

    Returns:
        ``response.content[0].text`` from the API response.
    """
    params = {
        "model": model,
        "max_tokens": max_tokens,
        "messages": messages,
        "temperature": temperature,
        # A None sentinel replaces the old mutable `[]` default, so every call
        # gets its own fresh list instead of sharing one object across calls.
        "stop_sequences": [] if stop_sequences is None else stop_sequences,
    }

    # Only send a system prompt when one was actually provided.
    if system:
        params["system"] = system

    response = client.messages.create(**params)
    # Text of the first content block in the response.
    return response.content[0].text

# System prompts: pass `system="..."` to chat() to send one with the request.

In [41]:
import json


def generate_dataset(num_cases=3):
    """Ask the model for an evaluation dataset and return it as parsed JSON.

    The assistant turn is pre-filled with an opening ```json fence and generation
    stops at the closing fence, so the reply is expected to be the bare JSON array.

    Args:
        num_cases: How many task objects to request. Defaults to 3, the value
            that used to be hard-coded in the prompt.

    Returns:
        The result of ``json.loads`` on the model's reply (the prompt asks for a
        list of ``{"task": ...}`` objects).

    Raises:
        ValueError: If ``num_cases`` is less than 1.
        json.JSONDecodeError: If the reply is not valid JSON.
    """
    if num_cases < 1:
        raise ValueError("num_cases must be at least 1")

    # Braces of the example object are doubled because this is an f-string.
    # The example no longer carries a trailing comma after the "task" entry:
    # the reply is fed to json.loads, so the example should itself be valid JSON.
    prompt = f"""
Generate a evaluation dataset for a prompt evaluation. The dataset will be used to evaluate prompts
that generate Python, JSON, or Regex specifically for AWS-related tasks. Generate an array of JSON objects,
each representing task that requires Python, JSON, or a Regex to complete.

Example output:
```json
[
    {{
        "task": "Description of task"
    }},
    ...additional
]
```

* Focus on tasks that can be solved by writing a single Python function, a single JSON object, or a regular expression.
* Focus on tasks that do not require writing much code

Please generate {num_cases} objects.
"""
    messages = []
    add_user_message(messages, prompt)
    # Pre-fill the assistant turn so the model continues inside a JSON code fence.
    add_assistant_message(messages, "```json")
    # Stop at the closing fence so only the JSON payload is returned.
    text = chat(messages, stop_sequences=["```"])
    return json.loads(text)

# Generate the test cases with a single model call.
dataset = generate_dataset()

# Save the dataset to a file
with open("dataset.json", "w") as f:
    json.dump(dataset, f, indent=2)

# Kept as the last expression of the cell so the notebook actually renders it;
# a bare expression in the middle of a cell is evaluated but never displayed.
dataset

In [42]:
def run_prompt(test_case):
    """Send the test case's task to the model and return the reply text.

    The value of ``test_case['task']`` is embedded in a fixed prompt template,
    sent as a single user message, and the result of ``chat`` is returned.
    """
    task = test_case['task']
    prompt = f"""
Please solve the following task:

{task}
"""
    conversation = []
    add_user_message(conversation, prompt)
    return chat(conversation)


In [43]:
def run_test_case(test_case):
    """Run one test case through run_prompt and bundle the result with a score.

    Grading is not implemented yet: every case is given the fixed score 10.
    """
    # TODO calculate the score from the output instead of hard-coding it
    return dict(
        output=run_prompt(test_case),
        test_case=test_case,
        score=10,
    )

In [44]:
def run_eval(dataset):
    """Apply run_test_case to each entry of ``dataset`` and return the results in order."""
    return [run_test_case(case) for case in dataset]


In [45]:
# Read the saved test cases back from disk, then evaluate every one of them.
with open("dataset.json", "r") as f:
    dataset = json.loads(f.read())

results = run_eval(dataset)


In [None]:
# Pretty-print every result record (output, test case, score) as indented JSON.
print(json.dumps(results, indent=2))