In [33]:
# Load environment variables via python-dotenv.
# NOTE(review): presumably reads a local .env file (e.g. holding the API key) — confirm.
from dotenv import load_dotenv  
load_dotenv()

True

In [34]:
# Create the API client; chat() sends every request through this object.
# No credentials are passed explicitly here.
# NOTE(review): presumably the SDK picks the API key up from the environment — confirm.
from anthropic import Anthropic
client = Anthropic()
# Model identifier that chat() sends with every request.
model = "claude-3-5-haiku-latest"

In [35]:
#helper method to add user messages
def add_user_message(messages, text):
    """Append a user-role entry carrying ``text`` to ``messages`` (mutates the list in place)."""
    messages.append({"role": "user", "content": text})

#helper method to add assistant messages
def add_assistant_message(messages, text):
    """Append an assistant-role entry carrying ``text`` to ``messages`` (mutates the list in place)."""
    messages.append({"role": "assistant", "content": text})

#Make a request to the API
def chat(messages, system=None, temperature=1.0, stop_sequences=None):
    """Send a conversation to the model and return the text of the first content block.

    Args:
        messages: List of ``{"role": ..., "content": ...}`` dicts, such as the
            ones built by ``add_user_message`` / ``add_assistant_message``.
        system: Optional system prompt; it is only included in the request
            when truthy.
        temperature: Value forwarded as the request's ``temperature``.
        stop_sequences: Optional list of strings forwarded as the request's
            ``stop_sequences``. ``None`` (the default) sends an empty list.

    Returns:
        ``response.content[0].text`` from the API response.
    """
    # A None sentinel replaces the original mutable default ([]), which is a
    # single list object shared by every call that omits the argument.
    if stop_sequences is None:
        stop_sequences = []

    params = {
        "model": model,
        "max_tokens": 1000,
        "messages": messages,
        "temperature": temperature,
        "stop_sequences": stop_sequences,
    }

    # Only send a system prompt when one was actually provided.
    if system:
        params["system"] = system

    response = client.messages.create(**params)
    return response.content[0].text  # text of the first content block

# Usage of system prompts

In [36]:
import json


def generate_dataset():
    """Ask the model for an evaluation dataset and return it as parsed JSON.

    The assistant turn is prefilled with an opening ```json fence and the
    closing fence is used as the stop sequence, so the returned text is
    expected to be the bare JSON body, which is then decoded with
    ``json.loads``.
    """
    dataset_prompt = """
Generate a evaluation dataset for a prompt evaluation. The dataset will be used to evaluate prompts
that generate Python, JSON, or Regex specifically for AWS-related tasks. Generate an array of JSON objects,
each representing task that requires Python, JSON, or a Regex to complete.

Example output:
```json
[
    {
        "task": "Description of task",
        "format": "json" or "python" or "regex",
        "solution_criteria": "key Criteria for evaluating the solution"
    },
    ...additional
]
```

* Focus on tasks that can be solved by writing a single Python function, a single JSON object, or a regular expression.
* Focus on tasks that do not require writing much code

Please generate 3 objects.
"""
    conversation = []
    add_user_message(conversation, dataset_prompt)
    # Prefill: the reply continues from inside a json code fence.
    add_assistant_message(conversation, "```json")
    raw_json = chat(conversation, stop_sequences=["```"])
    return json.loads(raw_json)

# Generate the evaluation dataset (this calls the model through generate_dataset()).
dataset = generate_dataset()

# The original bare `dataset` expression here was removed: it was not the
# last expression of the cell, so it displayed nothing and had no effect.

# Save the dataset to a file so a later cell can reload it from disk.
with open("dataset.json", "w") as f:
    json.dump(dataset, f, indent=2)

In [37]:
def run_prompt(test_case):
    """Build the task prompt from ``test_case["task"]`` and return the model's reply text."""
    task_prompt = f"""
Please solve the following task:

{test_case['task']}

* Respond only with Python, JSON, or a plain Regex
* Do not add any comments or commentary or explanation

"""
    conversation = []
    add_user_message(conversation, task_prompt)
    # Prefill the assistant turn with an opening code fence; the closing
    # fence is the stop sequence, so the reply is cut off there.
    add_assistant_message(conversation, "```code")
    return chat(conversation, stop_sequences=["```"])


In [38]:
def grade_by_model(test_case, output):
    """Have the model grade ``output`` against the test case and return the parsed verdict.

    The prompt embeds ``test_case["task"]``, ``output`` and
    ``test_case["solution_criteria"]``, and asks for a JSON object with
    "strengths", "weaknesses", "reasoning" and "score". The reply text is
    decoded with ``json.loads`` and returned.
    """
    grading_prompt = f"""
    You are an expert code reviewer. Evaluate this AI-generated solution against the solution criteria mentinoed the tast case.
    
Original Task:
<task>
{test_case["task"]}
</task>

Solution to Evaluate:
<solution>
{output}
</solution>

Criteria you should use to evaluate the solution:
<criteria>
{test_case["solution_criteria"]}
</criteria>
    
    Provide your evaluation as a structured JSON object with:
    - "strengths": An array of 1-3 key strengths
    - "weaknesses": An array of 1-3 key areas for improvement  
    - "reasoning": A concise explanation of your assessment
    - "score": A number between 1-10

Respond with JSON. Keep your response concise and direct.
Example response shape:
{{
    "strengths": string[],
    "weaknesses": string[],
    "reasoning": string,
    "score": number
}}

    """

    conversation = []
    add_user_message(conversation, grading_prompt)
    # Prefill with an opening json fence; the closing fence stops generation.
    add_assistant_message(conversation, "```json")

    raw_verdict = chat(conversation, stop_sequences=["```"])
    return json.loads(raw_verdict)

In [39]:
# Functions to validate the output structure
import re
import ast


def validate_json(text):
    """Return 10 if ``text`` (whitespace-stripped) parses as JSON, otherwise 0."""
    candidate = text.strip()
    try:
        json.loads(candidate)
    except json.JSONDecodeError:
        return 0
    return 10


def validate_python(text):
    """Return 10 if ``text`` (whitespace-stripped) parses as Python source, otherwise 0."""
    source = text.strip()
    try:
        ast.parse(source)
    except SyntaxError:
        return 0
    return 10


def validate_regex(text):
    """Return 10 if ``text`` (whitespace-stripped) compiles as a regular expression, otherwise 0."""
    pattern = text.strip()
    try:
        re.compile(pattern)
    except re.error:
        return 0
    return 10


def grade_syntax(response, test_case):
    """Run the syntax validator selected by ``test_case["format"]`` on ``response``.

    "json" and "python" go to their matching validators; every other format
    value is checked as a regular expression. Returns the validator's score
    (10 or 0).
    """
    expected_format = test_case["format"]
    if expected_format == "json":
        return validate_json(response)
    if expected_format == "python":
        return validate_python(response)
    # Fallback: any other format value is validated as a regex.
    return validate_regex(response)

In [40]:
def run_test_case(test_case):
    """Run one test case through the prompt, grade it, and return a result record.

    The final score is the average of the model grader's "score" and the
    syntax validator's score. The returned dict has the keys "output",
    "test_case", "score" and "reasoning".
    """
    solution = run_prompt(test_case)

    # Model-based grade first: pull out the numeric score and its explanation.
    verdict = grade_by_model(test_case, solution)
    model_score, reasoning = verdict["score"], verdict["reasoning"]

    # Format-specific syntax check (10 or 0).
    syntax_score = grade_syntax(solution, test_case)

    return {
        "output": solution,
        "test_case": test_case,
        "score": (model_score + syntax_score) / 2,
        "reasoning": reasoning,
    }

In [41]:
from statistics import mean

def run_eval(dataset):
    """Run every test case in ``dataset`` and report the average score.

    Args:
        dataset: Iterable of test-case dicts; each one is passed to
            ``run_test_case``.

    Returns:
        A list with one result dict per test case, in dataset order. For an
        empty dataset the list is empty and no average is computed.
    """
    results = [run_test_case(test_case) for test_case in dataset]

    # statistics.mean raises StatisticsError on empty input, so an empty
    # dataset is reported explicitly instead of crashing the evaluation.
    if not results:
        print("Average score: n/a (no test cases)")
        return results

    average_score = mean([result["score"] for result in results])
    print(f"Average score: {average_score}")

    return results


In [42]:
# Reload the saved dataset from disk.
with open("dataset.json", "r") as f:
    dataset = json.loads(f.read())

# Evaluate every test case; run_eval prints the average score.
results = run_eval(dataset)


Average score: 8.0


In [43]:
# Pretty-print the full result records returned by run_eval as indented JSON.
print(json.dumps(results, indent=2))

[
  {
    "output": "\n{\n    \"Version\": \"2012-10-17\",\n    \"Statement\": [\n        {\n            \"Effect\": \"Allow\",\n            \"Principal\": {\n                \"AWS\": \"arn:aws:iam::ACCOUNT_ID:role/ROLE_NAME\"\n            },\n            \"Action\": [\n                \"s3:GetObject\",\n                \"s3:ListBucket\",\n                \"s3:GetBucketLocation\"\n            ],\n            \"Resource\": [\n                \"arn:aws:s3:::BUCKET_NAME\",\n                \"arn:aws:s3:::BUCKET_NAME/*\"\n            ]\n        }\n    ]\n}\n",
    "test_case": {
      "task": "Create a JSON configuration for an AWS S3 bucket policy that allows read-only access to a specific IAM role",
      "format": "json",
      "solution_criteria": "Policy must include a valid IAM role ARN, specify read-only actions like s3:GetObject, and define the correct resource scope"
    },
    "score": 8.5,
    "reasoning": "The solution provides a solid template for S3 read-only access with corr