# Supervised Fine-Tuning (SFT) with Serverless customization on SageMaker AI

## Lab 3 - LLM Evaluation

In this notebook, we run an evaluation job on the fine-tuned model, using an LLM as a judge with custom metrics.

***

### Prerequisites

#### Setup and dependencies

In [None]:
import boto3
from sagemaker.core.helper.session_helper import Session, get_execution_role

# Start from a plain session so the default bucket can be looked up.
sess = Session()
sagemaker_session_bucket = None

if sess is not None and sagemaker_session_bucket is None:
    # No explicit bucket configured: use the one the session resolves by default.
    sagemaker_session_bucket = sess.default_bucket()

# Resolve the execution role; if that raises ValueError, look up the IAM role
# named "sagemaker_execution_role" and use its ARN instead.
try:
    role = get_execution_role()
except ValueError:
    iam = boto3.client("iam")
    role_description = iam.get_role(RoleName="sagemaker_execution_role")
    role = role_description["Role"]["Arn"]

# Rebuild the session pinned to the chosen bucket, then create the clients and
# read the bucket name / prefix that the following cells use.
sess = Session(default_bucket=sagemaker_session_bucket)
s3_client = boto3.client("s3")
sm_client = boto3.client("sagemaker", region_name=sess.boto_region_name)
bucket_name = sess.default_bucket()
default_prefix = sess.default_bucket_prefix

# Summarize the resolved configuration.
print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

Edit the model package group name and the model package version if needed.

In [None]:
from sagemaker.ai_registry.dataset import DataSet
from sagemaker.core.resources import ModelPackageGroup

# Identifier of the base model; the model package group name is derived from it.
base_model_id = "huggingface-llm-qwen2-5-7b-instruct"

# Model package version and group to evaluate (edit these if yours differ).
model_package_version = "1"
model_package_group_name = base_model_id + "-mpg"

In [None]:
# Look up the registered test dataset and the model package group by name.
test_dataset = DataSet.get(name="medical-o1-reasoning-sft-test")
print(f"Test dataset for evaluation: {test_dataset}")
model_package_group = ModelPackageGroup.get(model_package_group_name)

fine_tuned_model_package_group_arn = model_package_group.model_package_group_arn
print(f"Fine-tuned Model Package Group ARN: {fine_tuned_model_package_group_arn}")

# Derive the versioned model package ARN from the group ARN by swapping the
# first "model-package-group" for "model-package" and appending the version.
# The replacement is done outside the f-string on purpose: reusing double
# quotes inside a double-quoted f-string is a SyntaxError before Python 3.12.
model_package_arn_prefix = fine_tuned_model_package_group_arn.replace(
    "model-package-group", "model-package", 1
)
fine_tuned_model_package_arn = f"{model_package_arn_prefix}/{model_package_version}"
print(f"Fine-tuned Model Package ARN: {fine_tuned_model_package_arn}")

# Evaluation artifacts go under the session's default prefix when one is set.
if default_prefix:
    output_path = f"s3://{bucket_name}/{default_prefix}/{base_model_id}/evaluation"
else:
    output_path = f"s3://{bucket_name}/{base_model_id}/evaluation"

***

### Create custom metrics for evaluation

In [None]:
import json

In [None]:
# Model identifier handed to LLMAsJudgeEvaluator as `evaluator_model` (the judge).
EVALUATOR_MODEL = "amazon.nova-pro-v1:0"

In [None]:
BUILTIN_METRICS = ["Correctness", "Completeness", "Faithfulness", "Coherence"]


def _rating(definition, value):
    """Build one rating-scale entry: a textual definition plus its float score."""
    return {"definition": definition, "value": {"floatValue": value}}


def _binary_scale():
    """Return a fresh two-level Good (1) / Bad (0) rating scale."""
    return [_rating("Good", 1), _rating("Bad", 0)]


def _custom_metric(name, instructions, rating_scale):
    """Wrap a metric's name, judge instructions, and scale in the definition envelope."""
    return {
        "customMetricDefinition": {
            "name": name,
            "instructions": instructions,
            "ratingScale": rating_scale,
        }
    }


# Judge prompts use the {{prompt}}, {{prediction}} and {{ground_truth}}
# placeholders as written.
_reasoning_quality = _custom_metric(
    "MedicalReasoningQuality",
    (
        "Evaluate if the response demonstrates sound medical reasoning. "
        "Check if the thinking process logically connects symptoms, conditions, and conclusions. "
        "Prompt: {{prompt}}\nResponse: {{prediction}}"
    ),
    [
        _rating("Excellent - Clear logical chain from symptoms to diagnosis/answer", 3),
        _rating("Adequate", 2),
        _rating("Poor", 1),
        _rating("Incorrect", 0),
    ],
)

_clinical_accuracy = _custom_metric(
    "ClinicalAccuracy",
    (
        "Assess if the medical facts, terminology, and clinical conclusions are accurate. "
        "Consider diagnoses, treatments, mechanisms, and medical concepts mentioned. "
        "Prompt: {{prompt}}\nResponse: {{prediction}}\nReference: {{ground_truth}}"
    ),
    _binary_scale(),
)

_think_tag_structure = _custom_metric(
    "ThinkTagStructure",
    (
        "Check if the response follows the expected format with reasoning in <think> tags "
        "followed by a clear final answer outside the tags. "
        "Prompt: {{prompt}}\nResponse: {{prediction}}"
    ),
    _binary_scale(),
)

custom_metrics_list = [_reasoning_quality, _clinical_accuracy, _think_tag_structure]

# The evaluator is given the custom metrics as a JSON string.
custom_metrics_json = json.dumps(custom_metrics_list)

In [None]:
from sagemaker.train.evaluate import LLMAsJudgeEvaluator

# Gather the evaluator arguments in one place, then construct the evaluator.
evaluator_settings = dict(
    # Fine-tuned model under evaluation and the group it is registered in.
    model=fine_tuned_model_package_arn,
    model_package_group=fine_tuned_model_package_group_arn,
    # Required: judge model and the dataset to evaluate on.
    evaluator_model=EVALUATOR_MODEL,
    dataset=test_dataset,
    # Optional: built-in metrics can be combined with custom ones
    # (the custom metrics are passed as a JSON string).
    builtin_metrics=BUILTIN_METRICS,
    custom_metrics=custom_metrics_json,
    # Required: S3 location for the evaluation output.
    s3_output_path=output_path,
    # Skip the base model so that only the custom model is evaluated.
    evaluate_base_model=False,
)

evaluator = LLMAsJudgeEvaluator(**evaluator_settings)

In [None]:
# Run the configured evaluator and keep whatever evaluate() returns for inspection below.
execution = evaluator.evaluate()

In [None]:
# Bare expression: renders the object returned by evaluator.evaluate() as the cell output.
execution

***

### Analyze evaluation results

In this section we will further analyze the LLMAJ evaluation results produced by SageMaker AI serverless evaluation jobs, which are still accessible on S3.

In [None]:
from rich.pretty import pprint
from sagemaker.train.common_utils import show_results_utils
from sagemaker.train.evaluate import EvaluationPipelineExecution
from sagemaker.train.evaluate.constants import EvalType

In [None]:
# Take the first LLM-as-judge execution, in the order get_all() yields them,
# whose overall status is "Succeeded"; stays None when there is no such execution.
latest_succeeded = None
for candidate in EvaluationPipelineExecution.get_all(eval_type=EvalType.LLM_AS_JUDGE):
    if candidate.status.overall_status == "Succeeded":
        latest_succeeded = candidate
        break

pprint(latest_succeeded)

In [None]:
# Fail with an actionable message instead of an AttributeError on None.
if latest_succeeded is None:
    raise RuntimeError(
        "No succeeded LLM-as-judge evaluation execution was found; "
        "run the evaluation above and wait for it to finish."
    )

# Temporarily swap the module-level score formatter so scores are rendered as
# percentages (or "N/A" when a score is missing).
_original_format = show_results_utils._format_score
show_results_utils._format_score = lambda s: (
    f"{s * 100:.1f}%" if s is not None else "N/A"
)
try:
    latest_succeeded.show_results(limit=5, offset=0, show_explanations=False)
finally:
    # Always restore the original formatter, even if show_results() raises,
    # so the patch cannot leak into later cells.
    show_results_utils._format_score = _original_format

#### Download results

First we download the results from S3 as JSONL files.

In [None]:
import os
from urllib.parse import urlparse

In [None]:
# Split the execution's S3 output URI into the bucket name and the key prefix
# (the path without its leading slash).
parsed = urlparse(latest_succeeded.s3_output_path)
bucket, prefix = parsed.netloc, parsed.path.lstrip("/")

In [None]:
# List the objects written for this execution under the evaluation output prefix.
results_prefix = f"{prefix}/custom-llmaj-eval-{latest_succeeded.name}"
response = s3_client.list_objects_v2(Bucket=bucket, Prefix=results_prefix)

# Find the JSONL results file. The "Contents" key is absent from the response
# when no object matches the prefix, so fall back to an empty listing.
jsonl_key = next(
    (
        obj["Key"]
        for obj in response.get("Contents", [])
        if obj["Key"].endswith("_output.jsonl")
    ),
    None,
)
if jsonl_key is None:
    raise FileNotFoundError(
        f"No '*_output.jsonl' object found under s3://{bucket}/{results_prefix}"
    )

# os.mkdir() does not accept exist_ok; os.makedirs() does, which keeps this
# cell re-runnable once ./tmp already exists.
os.makedirs("./tmp", exist_ok=True)
s3_client.download_file(bucket, jsonl_key, "./tmp/evaluation_results.jsonl")

#### Visualize results

In [None]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Utility functions used to create different charts

In [None]:
def load_evaluation_results(filepath):
    """Load evaluation results from a JSONL file into a long-format DataFrame.

    Each non-blank line is parsed as a JSON object; every entry of its
    ``automatedEvaluationResult.scores`` list becomes one row.

    Args:
        filepath: Path to the JSONL results file.

    Returns:
        A DataFrame with columns ``metric`` (from ``metricName``) and ``score``
        (from ``result``). Both columns are present even when the file
        contains no scores.
    """
    rows = []
    with open(filepath, encoding="utf-8") as f:
        for line in f:
            # Skip blank lines (e.g. a trailing newline at the end of the
            # file), which would otherwise make json.loads() fail.
            if not line.strip():
                continue
            record = json.loads(line)
            for score in record["automatedEvaluationResult"]["scores"]:
                rows.append({"metric": score["metricName"], "score": score["result"]})

    # Explicit columns keep the schema stable for downstream groupby("metric")
    # even when there are no rows.
    return pd.DataFrame(rows, columns=["metric", "score"])


def plot_metrics_bar(df):
    """Draw one horizontal bar per metric showing its mean score.

    Bars are ordered by ascending mean, the x-axis is fixed to [0, 1], and each
    bar is annotated with its mean formatted as a percentage.
    """
    mean_by_metric = df.groupby("metric")["score"].mean().sort_values()

    _, ax = plt.subplots(figsize=(8, 5))
    rects = ax.barh(mean_by_metric.index, mean_by_metric.values, color="steelblue")
    ax.set_xlabel("Average Score")
    ax.set_title("LLM-as-Judge Evaluation Results")
    ax.set_xlim(0, 1)

    # Put the percentage label just to the right of each bar, vertically centered.
    for rect, mean_score in zip(rects, mean_by_metric.values):
        label_y = rect.get_y() + rect.get_height() / 2
        ax.text(mean_score + 0.02, label_y, f"{mean_score:.1%}", va="center")

    plt.tight_layout()
    plt.show()


def plot_metrics_radar(df):
    """Draw a radar (polar) chart of the mean score of every metric."""
    mean_by_metric = df.groupby("metric")["score"].mean()
    metric_names = list(mean_by_metric.index)

    # Repeat the first point at the end so the polygon closes.
    radii = mean_by_metric.values.tolist()
    radii.append(mean_by_metric.values[0])
    thetas = np.linspace(0, 2 * np.pi, len(metric_names), endpoint=False).tolist()
    thetas.append(0)

    fig, ax = plt.subplots(figsize=(6, 6), subplot_kw={"polar": True})
    ax.plot(thetas, radii, "o-", linewidth=2, color="steelblue")
    ax.fill(thetas, radii, alpha=0.25, color="steelblue")

    # One tick per metric (the closing point is excluded); drop the
    # "Builtin." marker from the labels.
    tick_labels = [name.replace("Builtin.", "") for name in metric_names]
    ax.set_xticks(thetas[:-1])
    ax.set_xticklabels(tick_labels, size=9)
    ax.set_ylim(0, 1)
    ax.set_title("Evaluation Metrics Overview")

    plt.tight_layout()
    plt.show()


def plot_metrics_bullet(df, target=0.8):
    """Draw a bullet chart of mean metric scores against a target.

    Args:
        df: DataFrame with ``metric`` and ``score`` columns.
        target: Target score, drawn as a dashed red vertical line.
    """
    mean_by_metric = df.groupby("metric")["score"].mean().sort_values()
    n_metrics = len(mean_by_metric)
    rows = range(n_metrics)

    fig, ax = plt.subplots(figsize=(8, 4))

    # Layers drawn back to front: full-scale band, target band, then the
    # thinner bar carrying the actual mean score.
    layers = (
        ([1] * n_metrics, "#eee", 0.6),
        ([target] * n_metrics, "#ddd", 0.6),
        (mean_by_metric.values, "steelblue", 0.3),
    )
    for widths, colour, thickness in layers:
        ax.barh(rows, widths, color=colour, height=thickness)

    ax.axvline(target, color="red", linestyle="--", label=f"Target ({target:.0%})")
    ax.set_yticks(rows)
    ax.set_yticklabels(mean_by_metric.index)
    ax.set_xlim(0, 1)
    ax.legend(loc="lower right")
    ax.set_title("Metrics vs Target")

    plt.tight_layout()
    plt.show()

In [None]:
# Load the results from the path the download step writes to
# ("./tmp/evaluation_results.jsonl"), then render each chart.
df = load_evaluation_results("./tmp/evaluation_results.jsonl")
plot_metrics_bar(df)
plot_metrics_radar(df)
plot_metrics_bullet(df, target=0.8)