## Lab 4: Model deployment and inference

After training and evaluating our model, we want to make it available for inference. Amazon Bedrock provides a serverless endpoint for model deployment, allowing us to serve the model without managing infrastructure.

The Custom Model feature of Amazon Bedrock lets us import our fine-tuned model and access it through the same API as other foundation models.

### Prerequisites

In [None]:
import sagemaker
import boto3

# Set up the SageMaker session, the execution role and the Bedrock client
# that the rest of the notebook relies on.
sess = sagemaker.Session()
# Leave as None to fall back to the session's default bucket below.
sagemaker_session_bucket = None

if sagemaker_session_bucket is None and sess is not None:
    # set to default bucket if a bucket name is not given
    sagemaker_session_bucket = sess.default_bucket()

try:
    role = sagemaker.get_execution_role()
except ValueError:
    # get_execution_role() raised ValueError: look the role up by its
    # hard-coded name through IAM instead.
    iam = boto3.client("iam")
    role = iam.get_role(RoleName="sagemaker_execution_role")["Role"]["Arn"]

# Re-create the session so that it is bound to the resolved bucket.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)
bucket_name = sess.default_bucket()
default_prefix = sess.default_bucket_prefix

# Initialize the Bedrock client
bedrock = boto3.client("bedrock", region_name=sess.boto_region_name)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

### Utility functions

Retrieve last completed job from SageMaker AI

In [None]:
def get_last_job_name(job_name_prefix):
    """Return the newest completed training job whose name starts with a prefix.

    Completed training jobs whose name contains ``job_name_prefix`` are
    searched newest first, 100 per page; only names that actually start with
    the prefix are kept. Paging stops at the first page that yields a match.

    Parameters:
        job_name_prefix (str): Prefix the training job name must start with

    Returns:
        str | None: The first matching job name, or None (after printing a
        message) when no page contains a match
    """
    sm_client = boto3.client('sagemaker')

    found = []
    page_token = None

    while True:
        request = {
            'Resource': 'TrainingJob',
            'SearchExpression': {
                'Filters': [
                    {
                        'Name': 'TrainingJobName',
                        'Operator': 'Contains',
                        'Value': job_name_prefix,
                    },
                    {
                        'Name': 'TrainingJobStatus',
                        'Operator': 'Equals',
                        'Value': "Completed",
                    },
                ]
            },
            'SortBy': 'CreationTime',
            'SortOrder': 'Descending',
            'MaxResults': 100,
        }

        # Continue from the previous page when there is one
        if page_token:
            request['NextToken'] = page_token

        page = sm_client.search(**request)

        # "Contains" also matches names with the text in the middle, so keep
        # only the names that really start with the prefix
        for entry in page['Results']:
            if entry['TrainingJob']['TrainingJobName'].startswith(job_name_prefix):
                found.append(entry['TrainingJob']['TrainingJobName'])

        page_token = page.get('NextToken')
        if found or not page_token:
            break

    if found:
        return found[0]

    print(f"No completed training jobs found starting with prefix '{job_name_prefix}'")
    return None

Get Checkpoint configurations

In [None]:
def get_sagemaker_checkpoint_s3_uri(training_job_name):
    """Return the checkpoint S3 URI configured on a training job.

    Parameters:
        training_job_name (str): Name of the training job to describe

    Returns:
        str | None: The value of ``CheckpointConfig.S3Uri`` from the job
        description, or None (after printing the error) when the lookup fails
    """
    sm_client = boto3.client('sagemaker')
    try:
        description = sm_client.describe_training_job(TrainingJobName=training_job_name)
        return description['CheckpointConfig']['S3Uri']
    except Exception as e:
        print(f"Error retrieving checkpoint configuration: {e}")
        return None

Extract the right checkpoint path from the logs

In [None]:
import os
import re

def filter_s3_paths(message, base_path):
    """Collect the S3 URIs in a message that contain ``base_path``.

    A URI is any run of non-whitespace characters starting with ``s3://``.
    Trailing dots are removed from each URI that is kept.

    Parameters:
        message (str): Text to scan
        base_path (str): Substring a URI must contain to be kept

    Returns:
        list[str]: Matching URIs in order of appearance
    """
    selected = []
    for candidate in re.finditer(r's3://[^\s]+', message):
        uri = candidate.group(0)
        if base_path in uri:
            selected.append(uri.rstrip('.'))
    return selected

def get_logs_containing_text(log_group_name, search_text, region='us-east-1'):
    """Return the events of a CloudWatch log group whose message contains a text.

    Every page of ``filter_log_events`` for the log group is read and the
    substring check is done locally on each event message.

    Parameters:
        log_group_name (str): Log group to read
        search_text (str): Substring to look for in each event message
        region (str): Region used to create the logs client

    Returns:
        list[dict]: Matching events in the order they were returned
    """
    cw_logs = boto3.client('logs', region_name=region)
    pages = cw_logs.get_paginator('filter_log_events').paginate(logGroupName=log_group_name)

    hits = []
    for page in pages:
        hits.extend(
            record for record in page['events'] if search_text in record['message']
        )

    return hits

Utility functions to check the manifest.json file

In [None]:
import json
import os
import tarfile

def extract_tar_gz(tar_path, extract_to='.'):
    """Extract a gzip-compressed tar archive into a directory.

    Members that would be written outside ``extract_to`` are refused. When the
    running Python provides tarfile extraction filters, the "data" filter is
    used; otherwise every member's resolved destination is checked before
    anything is extracted.

    Parameters:
        tar_path (str): Path of the ``.tar.gz`` archive
        extract_to (str): Destination directory

    Raises:
        tarfile.TarError: If a member would be extracted outside ``extract_to``
    """
    with tarfile.open(tar_path, 'r:gz') as tar:
        if hasattr(tarfile, 'data_filter'):
            tar.extractall(extract_to, filter='data')
            return

        # Interpreter without extraction filters: validate paths by hand
        root = os.path.realpath(extract_to)
        for member in tar.getmembers():
            target = os.path.realpath(os.path.join(root, member.name))
            if os.path.commonpath([root, target]) != root:
                raise tarfile.TarError(
                    f"Refusing to extract '{member.name}' outside of '{extract_to}'"
                )
        tar.extractall(extract_to)

def download_s3_file(s3_path, local_path):
    """Download one S3 object to a local file.

    Parameters:
        s3_path (str): URI of the form ``s3://<bucket>/<key>``
        local_path (str): Destination file; missing parent directories are
            created

    Raises:
        ValueError: If ``s3_path`` has no bucket or no key
    """
    _, _, remainder = s3_path.partition('://')
    bucket, _, key = remainder.partition('/')
    if not bucket or not key:
        raise ValueError(f"Expected an S3 URI like s3://bucket/key, got '{s3_path}'")

    # os.makedirs('') raises, so only create a directory when local_path has one
    parent_dir = os.path.dirname(local_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    s3 = boto3.client('s3')
    s3.download_file(bucket, key, local_path)

def get_checkpoint_path(manifest_path):
    """Read a manifest JSON file and return its ``checkpoint_s3_bucket`` value.

    Parameters:
        manifest_path (str): Path of the manifest file

    Returns:
        The value stored under the ``checkpoint_s3_bucket`` key
    """
    with open(manifest_path) as manifest_file:
        manifest = json.load(manifest_file)
    return manifest['checkpoint_s3_bucket']

#### Update model configurations

In [None]:
# These names are read by later cells, so they are defined unconditionally:
# if no completed training job is found, the following cells still see them
# instead of failing with a NameError.

# Define name for imported model
imported_model_name = "nova-micro-sagemaker-sft-peft-reasoning"

# Define the guardrail ID (Optional)
guardrail_id = None

# Checkpoint location of the training job; stays None when no job is found
model_path = None

job_name = get_last_job_name("train-nova-micro-sft-peft")

print("Job name:", job_name)

if job_name:
    model_path = get_sagemaker_checkpoint_s3_uri(job_name)

    print("Model path:", model_path)

### Creating the Bedrock Custom Model

In [None]:
from botocore.exceptions import ClientError

def _create_custom_model(params, s3_uri):
    """Point ``params`` at ``s3_uri``, request the custom model and return its ARN."""
    params["modelSourceConfig"]["s3DataSource"]["s3Uri"] = s3_uri
    arn = bedrock.create_custom_model(**params)["modelArn"]

    print("/***************************************************/")
    print(f"Model import job created with ARN: {arn}")
    print("/***************************************************/")
    return arn


request_params = {
    "modelName": imported_model_name,
    "modelSourceConfig": {
        "s3DataSource": {
            "s3Uri": model_path,
        }
    },
    "roleArn": role,
    "clientRequestToken": "NovaRecipeSageMaker",
}

# Three attempts, each with a different guess for the checkpoint location:
#   1. the checkpoint URI configured on the training job
#   2. the path recorded in the manifest.json of the job output archive
#   3. an S3 path found in the training job logs
try:
    model_arn = _create_custom_model(request_params, model_path)
except ClientError as e:
    # Only a rejected S3 URI triggers the fallbacks; anything else is re-raised
    if e.response['Error']['Code'] != 'ValidationException':
        raise

    print("S3 URI invalid, downloading manifest to find correct path...")

    try:
        download_s3_file(f"s3://{sess.default_bucket()}/train-nova-micro-sft-peft/{job_name}/output/output.tar.gz", "./tmp_output/output.tar.gz")
        extract_tar_gz('./tmp_output/output.tar.gz', './tmp_output')
        model_path = get_checkpoint_path('./tmp_output/manifest.json')

        model_arn = _create_custom_model(request_params, model_path)
    except Exception as manifest_error:
        # Exception (not a bare except) so that interrupting the kernel still
        # works; the cause is printed because it is not always a missing manifest
        print("Manifest not found, searching CloudWatch logs for correct path...")
        print(f"Manifest lookup error: {manifest_error}")

        # Search the logs in the region of the SageMaker session
        logs = get_logs_containing_text(
            '/aws/sagemaker/TrainingJobs',
            model_path,
            region=sess.boto_region_name,
        )

        for log in logs:
            s3_paths = filter_s3_paths(log['message'], model_path)
            if s3_paths:
                model_path = s3_paths[0]
                print(f"Found S3 path in logs: {model_path}")
                break

        model_arn = _create_custom_model(request_params, model_path)

### Monitoring the Model status

After initiating the model import, we need to monitor its progress. The status goes through several states:

* CREATING: Model is being imported
* ACTIVE: Import successful
* FAILED: Import encountered errors

This cell polls the Bedrock API every 10 seconds to check the status of the model import, continuing until it reaches a terminal state (ACTIVE or FAILED). Once the import completes successfully, we'll have the model ARN, which we can use for inference.

In [None]:
from IPython.display import clear_output
import time

# Poll the custom model list until the model named imported_model_name
# reports ACTIVE or FAILED, remembering its ARN along the way.
model_arn = None

while True:
    # Newest models first.
    # NOTE(review): only this single response is inspected (no NextToken
    # handling here) — confirm the model always appears in it.
    response = bedrock.list_custom_models(sortBy='CreationTime',sortOrder='Descending')
    model_summaries = response["modelSummaries"]
    status = ""
    for model in model_summaries:
        if model["modelName"] == imported_model_name:
            status = model["modelStatus"].upper()
            model_arn = model["modelArn"]
            print(f'{model["modelStatus"].upper()} {model["modelArn"]} ...')
            # Stop scanning once a matching model is in a terminal state
            if status in ["ACTIVE", "FAILED"]:
                break
    # Terminal state reached: stop polling
    if status in ["ACTIVE", "FAILED"]:
        break
    # Otherwise clear the cell output and check again in 10 seconds
    clear_output(wait=True)
    time.sleep(10)

# Display the ARN as the cell result
model_arn

##### ⚠️ After the model is ACTIVE, deploy a custom model for on-demand inference!

Please refer to the official [AWS Documentation](https://docs.aws.amazon.com/nova/latest/userguide/deploy-custom-model.html)

In [None]:
# Request a deployment of the custom model. The deployment name is the model
# name with an "-odi" suffix; the polling cell below looks it up by that name.
request_params = {
    "clientRequestToken": "NovaRecipeSageMakerODI",
    "modelDeploymentName": f"{imported_model_name}-odi",
    "modelArn": model_arn,
}

response = bedrock.create_custom_model_deployment(**request_params)

# Display the API response as the cell result
response

In [None]:
from IPython.display import clear_output
import time

# Poll the deployment list until the deployment created above reports
# ACTIVE or FAILED, remembering its ARN along the way.
deployment_name = f"{imported_model_name}-odi"
custom_model_arn = None

while True:
    response = bedrock.list_custom_model_deployments(
        sortBy="CreationTime", sortOrder="Descending"
    )
    status = ""
    for deployment in response["modelDeploymentSummaries"]:
        if deployment["customModelDeploymentName"] == deployment_name:
            status = deployment["status"].upper()
            custom_model_arn = deployment["customModelDeploymentArn"]
            print(f'{status} {custom_model_arn} ...')
            # Summaries are sorted newest first, so the first match is the
            # deployment of interest; stop here so that an older entry with
            # the same name cannot overwrite its status and ARN
            break
    # Terminal state reached: stop polling
    if status in ["ACTIVE", "FAILED"]:
        break
    # Otherwise clear the cell output and check again in 10 seconds
    clear_output(wait=True)
    time.sleep(10)

# Display the ARN as the cell result
custom_model_arn

***

### Testing the Deployed Model

Now that our model is deployed to Amazon Bedrock, we can invoke it for inference. We'll set up the necessary clients and functions to interact with our model through the Bedrock Runtime API.

Inference Setup Components:
* Bedrock Runtime Client: AWS SDK client for making inference calls
* Helper Function: To handle retry logic and properly format requests

The `generate` function we're defining:
* Applies the proper chat template to user messages
* Handles retry logic for robustness
* Sets appropriate generation parameters like temperature and top-p

This setup allows us to easily test how well our training worked by sending queries to the model and evaluating its responses.

In [None]:
import boto3
from botocore.config import Config


# Initialize Bedrock Runtime client
# Bedrock Runtime client used by generate() for inference calls, created in
# the same region as the SageMaker session.
session = boto3.Session()
client = session.client(
    service_name="bedrock-runtime",
    region_name=sess.boto_region_name,
    config=Config(
        connect_timeout=300,  # 5 minutes
        read_timeout=300,  # 5 minutes
        retries={"max_attempts": 3},  # botocore-level retries, separate from generate()'s own loop
    ),
)

In [None]:
import time


def generate(
    model_id,
    messages,
    system_prompt=None,
    tools=None,
    temperature=0.3,
    max_tokens=4096,
    top_p=0.9,
    guardrail_id=None,
    max_retries=10,
    guardrail_version="1",
    retry_delay=30,
):
    """
    Call the model through the Bedrock Converse API, retrying on failure

    Uses the module-level Bedrock Runtime ``client``.

    Parameters:
        model_id (str): ID or ARN of the model to use
        messages (list): List of message dictionaries with 'role' and 'content'
        system_prompt (str, optional): System prompt to guide the model
        tools (dict, optional): Tool configuration for the model
        temperature (float): Controls randomness in generation (0.0-1.0)
        max_tokens (int): Maximum number of tokens to generate
        top_p (float): Nucleus sampling parameter (0.0-1.0)
        guardrail_id (str, optional): Identifier of the guardrail to apply
        max_retries (int): Maximum number of attempts
        guardrail_version (str): Version of the guardrail to apply
        retry_delay (float): Seconds to wait between attempts

    Returns:
        dict | None: The Converse response, or None when every attempt failed
        or the service reported an error that retrying cannot fix
    """
    # Errors caused by the request itself; sending it again cannot succeed
    non_retryable_codes = {"ValidationException", "AccessDeniedException"}

    # Prepare base parameters for the API call
    kwargs = {
        "inferenceConfig": {
            "temperature": temperature,
            "maxTokens": max_tokens,
            "topP": top_p,
        },
    }

    # Add optional parameters if provided
    if tools:
        kwargs["toolConfig"] = tools
    if system_prompt:
        kwargs["system"] = [{"text": system_prompt}]
    if guardrail_id:
        kwargs["guardrailConfig"] = {
            "guardrailIdentifier": guardrail_id,
            "guardrailVersion": guardrail_version,
        }

    # Retry logic
    for attempt in range(max_retries):
        try:
            return client.converse(modelId=model_id, messages=messages, **kwargs)
        except Exception as e:
            print(f"Attempt {attempt + 1} failed: {str(e)}")

            # botocore client errors carry the service error code in e.response
            error_details = getattr(e, "response", None)
            error_code = None
            if isinstance(error_details, dict):
                error_code = (error_details.get("Error") or {}).get("Code")

            if error_code in non_retryable_codes:
                print(f"{error_code} cannot be fixed by retrying. Unable to get response.")
                return None

            if attempt < max_retries - 1:
                time.sleep(retry_delay)
            else:
                print("Max retries reached. Unable to get response.")
                return None

    # Reached only when max_retries is zero or negative
    return None

Use the custom model deployment ARN created with the Bedrock Custom Model

In [None]:
import json
import textwrap

# Use the deployment ARN found by the polling cell; replace the placeholder
# manually when running this cell on its own.
model_arn = (
    custom_model_arn
    if custom_model_arn is not None
    else "<BEDROCK_CUSTOM_MODEL_DEPLOYMENT_ARN>"
)

system_prompt = """
You are an AI assistant that thinks in {language} but responds in English.

IMPORTANT: Follow this exact format for every response:
1. First, write your reasoning and thoughts inside <think>...</think> tags
2. Then, provide your final answer in English

Always think through the problem in {language}, then translate your conclusion to English for the final response.
"""

system_prompt = system_prompt.format(language="Spanish")

messages = [
    {
        "role": "user",
        "content": [
            {
                "text": "Hello, how are you?"
            }
        ],
    },
]

response = generate(
    model_id=model_arn,
    system_prompt=textwrap.dedent(system_prompt).strip(),
    messages=messages,
    max_tokens=512,
    temperature=0.1,
    top_p=0.9,
    guardrail_id=guardrail_id,
)

# generate() returns None when it gives up; fail with a clear message instead
# of a TypeError on the subscript below
if response is None:
    raise RuntimeError(
        f"No response from model '{model_arn}'; see the attempt log above."
    )

print(json.dumps(response["output"], indent=2))

***

## Lab 5: Model evaluation

We are now going to evaluate the model with the LLM as a Judge evaluation task. First, let's define the Bedrock client

In [None]:
import boto3
from botocore.config import Config

# Initialize Bedrock Runtime client
# Bedrock Runtime client used by generate() for inference calls, created in
# the same region as the SageMaker session.
session = boto3.Session()
client = session.client(
    service_name="bedrock-runtime",
    region_name=sess.boto_region_name,
    config=Config(
        connect_timeout=300,  # 5 minutes
        read_timeout=300,  # 5 minutes
        retries={"max_attempts": 3},  # botocore-level retries, separate from generate()'s own loop
    ),
)

To invoke the fine-tuned model for inference, we'll set up the necessary clients and functions to interact with our model through the Bedrock Runtime API.

Inference Setup Components:
* Bedrock Runtime Client: AWS SDK client for making inference calls
* Helper Function: To handle retry logic and properly format requests

The `generate` function we're defining:
* Applies the proper chat template to user messages
* Handles retry logic for robustness
* Sets appropriate generation parameters like temperature and top-p

This setup allows us to easily test how well our training worked by sending queries to the model and evaluating its responses.

In [None]:
import time


def generate(
    model_id,
    messages,
    system_prompt=None,
    tools=None,
    temperature=0.3,
    max_tokens=4096,
    top_p=0.9,
    guardrail_id=None,
    max_retries=10,
    guardrail_version="1",
    retry_delay=30,
):
    """
    Call the model through the Bedrock Converse API, retrying on failure

    Uses the module-level Bedrock Runtime ``client``.

    Parameters:
        model_id (str): ID or ARN of the model to use
        messages (list): List of message dictionaries with 'role' and 'content'
        system_prompt (str, optional): System prompt to guide the model
        tools (dict, optional): Tool configuration for the model
        temperature (float): Controls randomness in generation (0.0-1.0)
        max_tokens (int): Maximum number of tokens to generate
        top_p (float): Nucleus sampling parameter (0.0-1.0)
        guardrail_id (str, optional): Identifier of the guardrail to apply
        max_retries (int): Maximum number of attempts
        guardrail_version (str): Version of the guardrail to apply
        retry_delay (float): Seconds to wait between attempts

    Returns:
        dict | None: The Converse response, or None when every attempt failed
        or the service reported an error that retrying cannot fix
    """
    # Errors caused by the request itself; sending it again cannot succeed
    non_retryable_codes = {"ValidationException", "AccessDeniedException"}

    # Prepare base parameters for the API call
    kwargs = {
        "inferenceConfig": {
            "temperature": temperature,
            "maxTokens": max_tokens,
            "topP": top_p,
        },
    }

    # Add optional parameters if provided
    if tools:
        kwargs["toolConfig"] = tools
    if system_prompt:
        kwargs["system"] = [{"text": system_prompt}]
    if guardrail_id:
        kwargs["guardrailConfig"] = {
            "guardrailIdentifier": guardrail_id,
            "guardrailVersion": guardrail_version,
        }

    # Retry logic
    for attempt in range(max_retries):
        try:
            return client.converse(modelId=model_id, messages=messages, **kwargs)
        except Exception as e:
            print(f"Attempt {attempt + 1} failed: {str(e)}")

            # botocore client errors carry the service error code in e.response
            error_details = getattr(e, "response", None)
            error_code = None
            if isinstance(error_details, dict):
                error_code = (error_details.get("Error") or {}).get("Code")

            if error_code in non_retryable_codes:
                print(f"{error_code} cannot be fixed by retrying. Unable to get response.")
                return None

            if attempt < max_retries - 1:
                time.sleep(retry_delay)
            else:
                print("Max retries reached. Unable to get response.")
                return None

    # Reached only when max_retries is zero or negative
    return None

#### Generate model inference on the created test dataset

In the following cell, we are going to invoke the model to create the dataset for LLM as a judge.

The required dataset structure for LLM as a judge task is:

```
{
    "prompt": "String containing the input prompt and instructions.",
    "response_A": "String containing the ground truth output",
    "response_B": "String containing the customized model output."
}
```

In [None]:
# Judge prompt template. This is an f-string with no interpolated values: the
# doubled braces collapse to single braces, leaving {system_prompt},
# {question} and {answer} as placeholders that are filled in later with
# str.format().
system_promt_base = f"""
You are an expert LLM evaluator tasked with comparing responses from a base model and a fine-tuned model to determine if the fine-tuned model has successfully learned to follow specific formatting and reasoning instructions.

## EVALUATION CRITERIA

The fine-tuned model should demonstrate the following behaviors that the base model likely does not:

1. **Reasoning Language Compliance**: The model should think/reason in the language specified in the system prompt's "reasoning language" field
2. **Structured Thinking**: All reasoning must be enclosed within `<think>...</think>` tags
3. **English Response**: The final answer/response must be in English, regardless of the reasoning language
4. **Reasoning Quality**: The thinking process should be coherent and relevant to the user's question

## EVALUATION PROCESS

For each model response, evaluate:

### Format Compliance
- **Think Tags Present**: Does the response contain `<think>...</think>` tags?

- **Language Separation**: Is there clear separation between reasoning language and English response?
### Content Quality
- **Reasoning Coherence**: Is the thinking process logical and relevant?

- **Response Completeness** Does the English response adequately address the user's question?

### Instruction Following
- **System Prompt Adherence** : Does the model follow the specific instructions in the system prompt?

## OUTPUT FORMAT

For each evaluation, provide:

1. **Format Compliance Score**: X/60
   - Language Separation: X/40 (explanation)
   - Think Tags: X/20 (explanation)
2. **Instruction Following Score**: X/30 (explanation)
3. **Content Quality Score**: X/10
   - Reasoning Coherence: X/5 (explanation)
   - Response Completeness: X/5 (explanation)
4. **Total Score**: X/100
5. **Overall Assessment**: [Brief summary of strengths and weaknesses]
6. **Recommendation**: [Which model performed better and why]

## SPECIAL CONSIDERATIONS

- If the system prompt specifies a reasoning language other than English, pay close attention to whether the model actually thinks in that language
- Look for code-switching or language mixing within the think tags
- Note any creative or unexpected but valid interpretations of the instructions in your evaluation

Remember: The goal is to determine if fine-tuning successfully taught the model to follow the specific format and language instructions while maintaining response quality.

At the end of your evaluation, you MUST put your final preference in the tags <final_preference>...</final_preference> like:
<final_preference>
Response B - Score 80/100
</final_preference>

This is the baseline to evaluate:

System prompt:

{{system_prompt}}

Question:

{{question}}

Target answer:

{{answer}}
"""

In [None]:
from datasets import load_dataset
import pandas as pd
import textwrap
import time

def _wait_for_rate_limit(request_times, max_requests=5, window_seconds=60):
    """Sleep if needed so that at most ``max_requests`` calls start per window.

    Returns the updated list of request start times, including the request
    that is about to be sent.
    """
    now = time.time()
    recent = [t for t in request_times if now - t < window_seconds]

    if len(recent) >= max_requests:
        # Wait until the oldest request in the window is window_seconds old
        wait_time = window_seconds - (now - recent[0])
        print(f"Waiting for {wait_time:.2f}s")
        time.sleep(wait_time)
        recent = recent[1:]

    recent.append(time.time())
    return recent


test_dataset = load_dataset(
    "json", data_files="./data/test/gen_qa.jsonl", split="train"
)

llm_val_dataset = []

# Use the deployment ARN found by the polling cell; replace the placeholder
# manually when running this cell on its own.
model_id = (
    custom_model_arn
    if custom_model_arn is not None
    else "<CUSTOM_MODEL_DEPLOYMENT_ARN>"
)

request_times = []

for index, el in enumerate(test_dataset, start=1):
    print("Processing row ", index)

    messages = [
        {"role": "user", "content": [{"text": el["query"]}]},
    ]

    # Base model answer
    request_times = _wait_for_rate_limit(request_times)
    response_base = generate(
        model_id="us.amazon.nova-micro-v1:0",
        system_prompt=el["system"],
        messages=messages,
        temperature=0.1,
        top_p=0.9,
        guardrail_id=guardrail_id,
    )

    # generate() returns None when it gives up; skip the row instead of
    # failing on the subscript below and losing all rows collected so far
    if response_base is None:
        print(f"Skipping row {index}: no response from the base model")
        continue

    # Fine-tuned model answer
    request_times = _wait_for_rate_limit(request_times)
    response_ft = generate(
        model_id=model_id,
        system_prompt=el["system"],
        messages=messages,
        temperature=0.1,
        top_p=0.9,
        guardrail_id=guardrail_id,
    )

    if response_ft is None:
        print(f"Skipping row {index}: no response from the fine-tuned model")
        continue

    # Fill the judge prompt template with this row's reference data
    system_prompt = system_promt_base.format(
        system_prompt=el["system"],
        question=el["query"],
        answer=el["response"],
    )

    llm_val_dataset.append([
        textwrap.dedent(system_prompt).strip(),
        el["system"],
        el["query"],
        response_base["output"]["message"]["content"][0]["text"],
        response_ft["output"]["message"]["content"][0]["text"],
    ])

print("Inference completed!")

llm_judge_df = pd.DataFrame(
    llm_val_dataset, columns=["llm_eval_prompt", "system_prompt", "question", "response_A", "response_B"]
)

In [None]:
# Save the base vs. fine-tuned responses, one JSON record per row
llm_judge_df.to_json("./llm_judge_results_base_vs_pt.json", orient="records")

In [None]:
# Preview the first rows of the judge dataset
llm_judge_df.head()

### Use Amazon Nova Pro as Judge

We invoke Amazon Nova Pro on the generated dataset to evaluate the fine-tuned model against the base one

In [None]:
import re
import textwrap
import time

def extract_final_preference(text):
    """
    Return the text enclosed by the first <final_preference>...</final_preference> pair

    Args:
        text (str): The input text containing the tags

    Returns:
        str: The enclosed content with surrounding whitespace removed,
        or None if the tags are not found
    """
    # re.DOTALL lets "." match newlines, so the content may span several lines
    found = re.search(r"<final_preference>(.*?)</final_preference>", text, re.DOTALL)
    return found.group(1).strip() if found else None

request_times = []
results = []

# User message template; it does not change between rows, so build it once
prompt_template = textwrap.dedent(
    """
    Response A:
    {response_A}

    Response B:
    {response_B}
    """
).strip()

for index, el in llm_judge_df.iterrows():
    print(f"Index: {index + 1}")

    # Rate limiting: max 5 requests per 60 seconds
    current_time = time.time()
    request_times = [t for t in request_times if current_time - t < 60]

    if len(request_times) >= 5:
        wait_time = 60 - (current_time - request_times[0])
        print(f"Waiting for {wait_time:.2f}s")
        time.sleep(wait_time)
        request_times = request_times[1:]

    request_times.append(time.time())

    prompt = prompt_template.format(
        response_A=el["response_A"],
        response_B=el["response_B"],
    )

    messages = [
        {"role": "user", "content": [{"text": prompt}]},
    ]

    response = generate(
        model_id="us.amazon.nova-pro-v1:0",
        system_prompt=el["llm_eval_prompt"],
        messages=messages,
        temperature=0.1,
        top_p=0.9,
        guardrail_id=guardrail_id,
    )

    # generate() returns None when it gives up; skip the row instead of
    # failing on the subscript below and losing all judgments collected so far
    if response is None:
        print(f"Skipping index {index + 1}: no response from the judge model")
        continue

    judgement = response["output"]["message"]["content"][0]["text"]
    results.append([extract_final_preference(judgement), judgement])

results_df = pd.DataFrame(
    results, columns=["preference", "details"]
)

results_df.to_json("./llm_judge_results.json", orient="records")

#### Visualize results

In [None]:
import json
import matplotlib.pyplot as plt
import re
from collections import Counter

def parse_preference_data(json_file_path):
    """Parse the JSON file and extract preference data

    Each record's "preference" text is classified by the first response it
    names ("Response A" or "Response B"); the judge is asked to start its
    verdict with the preferred response. Records with a missing, null or
    unrecognized preference are counted as "Ties".

    Args:
        json_file_path (str): Path of a JSON file holding a list of records

    Returns:
        tuple: (preferences, scores_a, scores_b) where preferences has one
        label per record and the score lists hold the "Score N/100" values
        found in records that prefer A and B respectively
    """
    with open(json_file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    preferences = []
    scores_a = []
    scores_b = []

    for item in data:
        # "preference" is null when the judge output had no preference tags
        preference_text = item.get("preference") or ""

        # First response named in the verdict; a later mention of the other
        # response (e.g. "better than Response A") must not decide the label
        winner = re.search(r"Response ([AB])\b", preference_text)
        if winner is None:
            preferences.append("Ties")
            continue

        score_match = re.search(r"Score (\d+)/100", preference_text)

        if winner.group(1) == "A":
            preferences.append("A Preferred")
            scores = scores_a
        else:
            preferences.append("B Preferred")
            scores = scores_b

        if score_match:
            scores.append(int(score_match.group(1)))

    return preferences, scores_a, scores_b


def create_preference_pie_chart(preferences):
    """Create pie chart for preference distribution

    Args:
        preferences (list): Labels among "A Preferred", "B Preferred", "Ties"

    Returns:
        The current matplotlib figure. When no label has a count above zero
        (for example an empty list), the figure shows a short notice instead
        of a pie.
    """
    preference_counts = Counter(preferences)
    total = len(preferences)

    # Fixed category order and colors
    palette = {
        "A Preferred": "#FF6B6B",  # Red/pink
        "B Preferred": "#4ECDC4",  # Teal/turquoise
        "Ties": "#95E1D3",  # Light green
    }

    labels = []
    sizes = []
    colors = []

    for pref, color in palette.items():
        count = preference_counts.get(pref, 0)
        # Categories without entries get no slice; skipping them also means
        # the percentage below is never computed with total == 0
        if count == 0:
            continue
        percentage = (count / total) * 100
        labels.append(f"{pref}\n{percentage:.1f}%")
        sizes.append(count)
        colors.append(color)

    plt.figure(figsize=(8, 6))
    if sizes:
        plt.pie(sizes, labels=labels, colors=colors, autopct="", startangle=90)
    else:
        plt.text(0.5, 0.5, "No judgments to display", ha="center", va="center")
    plt.title(
        "Preference Distribution\n(Valid Judgments Only)",
        fontsize=14,
        fontweight="bold",
    )
    plt.axis("equal")

    return plt.gcf()


def create_score_comparison_bar_chart(scores_a, scores_b):
    """Create bar chart comparing how many scores A and B collected

    Args:
        scores_a (list): Scores recorded for A
        scores_b (list): Scores recorded for B

    Returns:
        The current matplotlib figure
    """
    plt.figure(figsize=(8, 6))

    # The bars show how many scores each side has, not the score values
    n_a, n_b = len(scores_a), len(scores_b)
    totals = [n_a, n_b]

    rects = plt.bar(
        ["A Scores", "B Scores"],
        totals,
        color=["#FF6B6B", "#4ECDC4"],  # Red for A, Teal for B
    )

    # Print each count just above its bar
    for rect, total in zip(rects, totals):
        plt.text(
            rect.get_x() + rect.get_width() / 2,
            rect.get_height() + 0.1,
            str(total),
            ha="center",
            va="bottom",
            fontweight="bold",
            fontsize=12,
        )

    # Annotate the lead, unless both sides have the same count
    if n_a != n_b:
        b_leads = n_b > n_a
        diff = n_b - n_a if b_leads else n_a - n_b
        caption = f"B leads by {diff}" if b_leads else f"A leads by {diff}"
        bar_index = 1 if b_leads else 0
        height = n_b if b_leads else n_a
        plt.annotate(
            caption,
            xy=(bar_index, height * 0.75),
            xytext=(0.5, height * 0.75),
            bbox=dict(boxstyle="round,pad=0.3", facecolor="yellow", alpha=0.7),
            ha="center",
            fontweight="bold",
        )

    plt.title("A vs B Score Comparison", fontsize=14, fontweight="bold")
    plt.ylabel("Score Count", fontsize=12)
    plt.ylim(0, max(totals) * 1.2)

    # Remove top and right spines
    axes = plt.gca()
    axes.spines["top"].set_visible(False)
    axes.spines["right"].set_visible(False)

    return plt.gcf()

In [None]:
import glob
import os

def find_json_files(path):
    """Return the files in ``path`` whose name ends with ``results.json``.

    The paths are sorted so that the result, and in particular its first
    element, does not depend on the order in which the file system lists them.

    Args:
        path (str): Directory to search (not recursive)

    Returns:
        list[str]: Matching paths in ascending order, empty when none match
    """
    return sorted(glob.glob(os.path.join(path, "*results.json")))

In [None]:
# Use the first "*results.json" file found in the working directory.
# NOTE(review): this raises IndexError when no such file exists.
evaluation_results_path = find_json_files("./")[0]

In [None]:
# Parse the data
# Parse the judge results and print a summary: label counts first, then the
# raw score lists.
preferences, scores_a, scores_b = parse_preference_data(evaluation_results_path)

print(f"Total judgments: {len(preferences)}")
print(f"A Preferred: {preferences.count('A Preferred')}")
print(f"B Preferred: {preferences.count('B Preferred')}")
print(f"Ties: {preferences.count('Ties')}")
print(f"A Scores: {scores_a}")
print(f"B Scores: {scores_b}")

In [None]:
# Create the plots
# Draw each chart, save it as a 300 dpi PNG in the working directory, then
# display it.
fig1 = create_preference_pie_chart(preferences)
plt.tight_layout()
plt.savefig(
    "./preference_distribution.png",
    dpi=300,
    bbox_inches="tight",
)
plt.show()

fig2 = create_score_comparison_bar_chart(scores_a, scores_b)
plt.tight_layout()
plt.savefig(
    "./score_comparison.png", dpi=300, bbox_inches="tight"
)
plt.show()