# LLM evaluation on Amazon SageMaker AI

In this lab, we are going to evaluate the fine-tuned model using statistical metrics, such as `BERTScore` and `ROUGE`, and qualitative metrics through `LLM as a Judge`. The same metrics are also computed for the base model, so that the two models can be compared. For the purpose of this lab, the base model is pre-deployed and accessible in the AWS account.

![image.png](./images/llm_eval_image.png)

## Prerequisites

In [None]:
%pip install -U bert-score \
    "datasets>=4.0.0" \
    mlflow \
    numpy \
    pandas \
    rouge-score \
    sagemaker-mlflow==0.1.0 \
    scikit-learn \
    sentence-transformers \
    textstat \
    tiktoken

Define the model ID, the evaluation dataset, and the SageMaker endpoint names

In [None]:
model_id = "Qwen/Qwen3-VL-2B-Instruct"
eval_dataset = "pranavvmurthy26/DoclingMatix_500"

In [None]:
base_endpoint_name = "base-Qwen3-VL-2B-Instruct-ep"
tuned_endpoint_name = "tuned-Qwen3-VL-2B-Instruct-ep"

## Dataset

We are going to analyze the dataset used for evaluation

```json
[
    [
        {"images": [<PIL_IMAGE_1>, ..., <PIL_IMAGE_N>]},
        {"texts": [
                {"user": "<CONTENT>","assistant": "<CONTENT>","source": "<IMAGE_NAME>"},
                ...
                {"user": "<CONTENT>","assistant": "<CONTENT>","source": "<IMAGE_NAME>"}
            ]
        }
    ]
    ...
    [
        {"images": [<PIL_IMAGE_1>, ..., <PIL_IMAGE_N>]},
        {"texts": [
                {"user": "<CONTENT>","assistant": "<CONTENT>","source": "<IMAGE_NAME>"},
                ...
                {"user": "<CONTENT>","assistant": "<CONTENT>","source": "<IMAGE_NAME>"}
            ]
        }
    ]
]
```

In [None]:
import boto3
import sagemaker
from sagemaker.session import Session
from sagemaker.session import get_execution_role
from datasets import Dataset
from datasets import load_dataset
from nb_local_utils.helpers import (
    pretty_print_html,
    get_mlflow_server_arn, 
    get_tracking_server_uri
)

In [None]:
# Resolve the AWS region from the default boto3 session.
region = boto3.Session().region_name

# SageMaker session built on a boto3 session pinned to that region.
sess = Session(boto3.Session(region_name=region))

# Leave as None to fall back to the session's default bucket,
# or set an explicit bucket name here.
sagemaker_session_bucket = None
if sagemaker_session_bucket is None and sess is not None:
    # set to default bucket if a bucket name is not given
    sagemaker_session_bucket = sess.default_bucket()

# Execution role returned by the SageMaker SDK helper.
role = get_execution_role()

In [None]:
# Stream the "test" split and shuffle it through a 1000-element buffer.
dataset = load_dataset(eval_dataset, split="test", streaming=True).shuffle(
    buffer_size=1000
)

# Turn the streamed rows into a regular `Dataset`, reusing the stream's features.
dataset = Dataset.from_generator(lambda: dataset, features=dataset.features)

The sample dataset contains 100 rows taken from [HuggingFaceM4/DoclingMatix](https://huggingface.co/datasets/HuggingFaceM4/DoclingMatix/viewer/default/train), a multi-turn and multi-modal dataset for document intelligence

In [None]:
dataset = dataset.select(list(range(10)))

In [None]:
dataset

In [None]:
from IPython.display import display

display(dataset[0]["images"][0])

In [None]:
import json
import random

# Pick one question/answer turn at random from the first document.
random_index = random.randrange(len(dataset[0]["texts"]))

# Serialize outside the f-string: reusing double quotes inside a replacement
# field is only valid from Python 3.12 onwards and is a SyntaxError before.
sample_turn = json.dumps(dataset[0]["texts"][random_index], indent=2)

pretty_print_html(f"### First conversation:### \n\n{sample_turn}")

***

## Create dataset for evaluation

In the following steps, we are going to generate the dataset for evaluation, by invoking the deployed model, saving the answers and the ground truth in a proper dataset used for the evaluation later on

In [None]:
import os
import random
import base64
import pandas as pd
from io import BytesIO

In [None]:
def pil_to_base64(pil_img, resize_perc=0.5):
    """Convert a PIL image to a base64-encoded PNG string.

    Args:
        pil_img: Image object exposing ``size``, ``resize`` and ``save``.
        resize_perc: Scale factor applied to both width and height before
            encoding (0.5 halves each side).

    Returns:
        The PNG bytes of the resized image, base64-encoded as a UTF-8 string.
    """
    # Pass the size as a tuple and clamp every side to at least one pixel:
    # truncating a tiny side to 0 makes ``resize`` raise.
    new_size = tuple(max(1, int(resize_perc * side)) for side in pil_img.size)
    pil_img = pil_img.resize(new_size)
    buffer = BytesIO()
    pil_img.save(buffer, format="PNG")
    return base64.b64encode(buffer.getvalue()).decode("utf-8")

In [None]:
# Build one chat request per dataset row: all page images plus one randomly
# chosen question, with the matching answer kept as ground truth.
inference_dataset = []

for row in dataset:
    images = []
    for img in row["images"]:  # every image attached to this row
        image_base64 = pil_to_base64(img)
        images.append(
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/png;base64,{image_base64}"},
            }
        )

    # Choose which question/answer turn of this row to evaluate.
    random_index = random.randrange(len(row["texts"]))

    # Single user message: images first, then the question text.
    messages = [
        {
            "role": "user",
            "content": images + [
                {"type": "text", "text": row["texts"][random_index]["user"]}
            ],
        },
    ]

    inference_dataset.append(
        {
            "messages": messages,
            "question": row["texts"][random_index]["user"],
            "ground_truth": row["texts"][random_index]["assistant"],
        }
    )

In [None]:
# load into a df
df = pd.DataFrame(inference_dataset)

In [None]:
# write the dataset to a JSON Lines file (one record per line)
os.makedirs("./data/eval", exist_ok=True)
df.to_json("./data/eval/dataset.json", orient="records", lines=True)

***

## Run model inference

In [None]:
import io
import boto3
import time
import time
from collections import deque
import json

In [None]:
sagemaker_client = boto3.client(service_name="sagemaker-runtime")

### Iterator class for streaming inference

Utility class to parse streaming responses

In [None]:
class LineIterator:
    """Iterate over newline-delimited lines of a streamed endpoint response.

    The stream yields event dicts; the bytes under
    ``event["PayloadPart"]["Bytes"]`` are appended to an internal buffer and
    handed out one line at a time, without the trailing newline. Lines may be
    split across several events. Events without a ``"PayloadPart"`` key are
    skipped.
    """

    def __init__(self, stream):
        """Wrap ``stream``, an iterable of response event dicts."""
        self.byte_iterator = iter(stream)
        self.buffer = io.BytesIO()
        # Offset in ``buffer`` of the first byte not yet returned to the caller.
        self.read_pos = 0

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next line as bytes (newline stripped).

        Raises:
            StopIteration: when the stream is exhausted and the buffer has
                been fully consumed.
        """
        while True:
            self.buffer.seek(self.read_pos)
            line = self.buffer.readline()

            if line and line[-1] == ord("\n"):
                self.read_pos += len(line)
                return line[:-1]

            try:
                chunk = next(self.byte_iterator)
            except StopIteration:
                if line:
                    # The stream ended with an unterminated final line: emit
                    # it once instead of retrying forever on data that will
                    # never receive its newline.
                    self.read_pos += len(line)
                    return line
                raise

            if "PayloadPart" not in chunk:
                continue

            # Append at the end; the next loop pass re-reads from read_pos.
            self.buffer.seek(0, io.SEEK_END)
            self.buffer.write(chunk["PayloadPart"]["Bytes"])

Utility function to compute inference metrics

In [None]:
def compute_metrics(
    request_start, 
    request_end, 
    request_times, 
    first_token_time, 
    output_tokens
):
    """Compute and print latency, RPM, throughput, and TTFT for one request.

    Args:
        request_start: ``time.time()`` timestamp taken before the request.
        request_end: ``time.time()`` timestamp taken after the last chunk.
        request_times: Mutable sequence of completion timestamps;
            ``request_end`` is appended to it.
        first_token_time: Timestamp of the first received content, or a falsy
            value if no content arrived (TTFT is then reported as 0).
        output_tokens: Number of generated tokens as counted by the caller.

    Returns:
        dict with ``latency_seconds``, ``rpm``, ``throughput`` (tokens per
        second) and ``ttft_seconds``.
    """
    # All durations are differences of time.time() values, i.e. seconds.
    latency_seconds = request_end - request_start
    request_times.append(request_end)
    ttft_seconds = (first_token_time - request_start) if first_token_time else 0

    # Requests per minute: completions recorded within the last 60 seconds.
    current_time = time.time()
    rpm = sum(1 for t in request_times if current_time - t <= 60)
    throughput = output_tokens / latency_seconds if latency_seconds > 0 else 0

    # Two decimals so that sub-second latencies are not rounded away.
    print(
        f"Latency: {latency_seconds:.2f} s, RPM: {rpm}, Throughput: {throughput:.2f} tokens/s, TTFT: {ttft_seconds:.2f}s"
    )

    return {
        "latency_seconds": latency_seconds,
        "rpm": rpm,
        "throughput": throughput,
        "ttft_seconds": ttft_seconds,
    }

Utility function to parse model answer

In [None]:
def parse_streaming_response(line_str):
    """Parse one streaming response line and return its content, if any.

    Args:
        line_str: A decoded line, optionally prefixed with ``data:``.

    Returns:
        The ``content`` value of the first choice whose ``delta`` has one, or
        ``None`` for blank lines, the ``[DONE]`` sentinel, invalid JSON, or
        payloads without such a choice.
    """
    payload = line_str.strip()
    if not payload:
        return None

    # Accept the prefix with or without the space after the colon.
    if payload.startswith("data:"):
        payload = payload[len("data:"):].lstrip()

    if payload == "[DONE]":
        return None

    try:
        data = json.loads(payload)
    except json.JSONDecodeError:
        return None

    # Valid JSON is not necessarily an object (e.g. a bare number or string).
    if not isinstance(data, dict):
        return None

    for choice in data.get("choices") or []:
        delta = choice.get("delta") if isinstance(choice, dict) else None
        if isinstance(delta, dict) and "content" in delta:
            return delta["content"]

    return None

In [None]:
def run_streaming_eval(
    inference_dataset,
    sagemaker_client,
    endpoint_name,
    request_times,
):
    """Stream one completion per dataset element from the given endpoint.

    For every element, the request is sent with streaming enabled, the
    streamed content is concatenated, per-request metrics are printed through
    ``compute_metrics``, and the generated text is stored next to the question
    and ground truth.

    Returns:
        list of dicts with ``messages``, ``question``, ``output_text`` and
        ``ground_truth``.
    """
    collected = []

    for position, sample in enumerate(inference_dataset, start=1):
        print("Processing el", position)

        started_at = time.time()
        first_content_at = None
        pieces = []
        word_count = 0

        payload = {
            "messages": sample["messages"],
            # Generation budget is taken from the length of the reference answer.
            "max_tokens": len(sample["ground_truth"]),
            "temperature": 0.3,
            "top_p": 0.9,
            "stop": ["<|im_end|>"],
            "stream": True,
        }

        streamed = sagemaker_client.invoke_endpoint_with_response_stream(
            EndpointName=endpoint_name,
            Body=json.dumps(payload),
            ContentType="application/json",
        )

        for raw_line in LineIterator(streamed["Body"]):
            if not raw_line:
                continue
            content = parse_streaming_response(raw_line.decode("utf-8"))
            if not content:
                continue
            if first_content_at is None:
                first_content_at = time.time()
            pieces.append(content)
            # Whitespace-separated words stand in for the token count.
            word_count += len(content.split())

        finished_at = time.time()

        compute_metrics(
            request_start=started_at,
            request_end=finished_at,
            request_times=request_times,
            first_token_time=first_content_at,
            output_tokens=word_count,
        )

        collected.append(
            {
                "messages": sample["messages"],
                "question": sample["question"],
                "output_text": "".join(pieces),
                "ground_truth": sample["ground_truth"],
            }
        )

    return collected

**Run Inference with Base Model Endpoint**

In [None]:
request_times_base = deque()
eval_base = run_streaming_eval(inference_dataset, sagemaker_client, base_endpoint_name, request_times_base)

In [None]:
df_base = pd.DataFrame(eval_base)
df_base.to_json("./data/eval/base_dataset.json", orient="records", lines=True)

**Run Inference with Fine-tuned Model Endpoint**

In [None]:
request_times_tuned = deque()
eval_tuned = run_streaming_eval(inference_dataset, sagemaker_client, tuned_endpoint_name, request_times_tuned)

Save evaluation dataset to JSON

In [None]:
df_tuned = pd.DataFrame(eval_tuned)
df_tuned.to_json("./data/eval/tuned_dataset.json", orient="records", lines=True)

***

## Statistical evaluation using Managed MLflow

Define the MLflow tracking ARN

In [None]:
import boto3
import mlflow

In [None]:
MLFLOW_TRACKING_SERVER_ARN = get_mlflow_server_arn()

In [None]:
# Resolve the tracking server URL and render it as a clickable link.
mlflow_link = get_tracking_server_uri(sess, MLFLOW_TRACKING_SERVER_ARN)
pretty_print_html(f'<a href="{mlflow_link}" target="_blank">🔗 [Click Me to Open MLflow] 🔗</a>')

In [None]:
if MLFLOW_TRACKING_SERVER_ARN:
    mlflow.set_tracking_uri(MLFLOW_TRACKING_SERVER_ARN)
    mlflow.set_experiment(f"eval-{model_id.split('/')[-1].replace('.', '-')}")

**Statistical Metrics**

In [None]:
import mlflow
from pathlib import Path
from rouge_score import rouge_scorer
from bert_score import score as bert_score
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import plotly.express as px
from typing import Sequence

In [None]:
class StatisticalEvaluator:
    """Adds ROUGE, BERTScore and embedding-similarity columns to a dataframe."""

    def __init__(self):
        rouge_variants = ["rouge1", "rouge2", "rougeL"]
        self.rouge = rouge_scorer.RougeScorer(rouge_variants, use_stemmer=True)
        self.embedder = SentenceTransformer("all-MiniLM-L6-v2")

    def add_metrics(self, df):
        """Score ``df.outputs`` against ``df.ground_truth``, row by row.

        The dataframe is modified in place (columns ``rouge1``, ``rouge2``,
        ``rougeL``, ``bert_p``, ``bert_r``, ``bert_f1``, ``sem_sim``) and
        also returned.
        """
        predictions = df.outputs.tolist()
        references = df.ground_truth.tolist()

        # ROUGE: the scorer is called with the reference first, then the prediction.
        per_row_rouge = [
            self.rouge.score(reference, prediction)
            for prediction, reference in zip(predictions, references)
        ]
        for variant in ("rouge1", "rouge2", "rougeL"):
            df[variant] = [row_scores[variant].fmeasure for row_scores in per_row_rouge]

        # BERTScore precision / recall / F1
        precision, recall, f1 = bert_score(
            predictions, references, lang="en", verbose=False
        )
        bert_columns = (precision.tolist(), recall.tolist(), f1.tolist())
        df["bert_p"] = bert_columns[0]
        df["bert_r"] = bert_columns[1]
        df["bert_f1"] = bert_columns[2]

        # Cosine similarity between the embedding of each prediction and its reference
        prediction_vectors = self.embedder.encode(predictions)
        reference_vectors = self.embedder.encode(references)
        df["sem_sim"] = [
            cosine_similarity([pred_vec], [ref_vec])[0][0]
            for pred_vec, ref_vec in zip(prediction_vectors, reference_vectors)
        ]

        return df

Utility function to run evaluation with MLflow

In [None]:
def run_evaluation(df, run_name):
    """Evaluate pre-computed outputs in a named MLflow run and return the results table."""
    column_mapping = {
        "inputs": "inputs",
        "outputs": "outputs",
        "targets": "ground_truth",
    }

    def dummy_model(x):
        # Predictions already exist in the dataframe; the input is ignored.
        return df.outputs.tolist()

    with mlflow.start_run(run_name=run_name):
        evaluation = mlflow.evaluate(
            model=dummy_model,
            data=df,
            model_type="text",
            evaluators="default",
            evaluator_config={"col_mapping": column_mapping},
        )
        return evaluation.tables["eval_results_table"]

Utility function to print result metrics

In [None]:
def print_summary(combined_results: pd.DataFrame, metrics_to_show: Sequence[str]) -> None:
    """Print per-model metric means and show a grouped bar chart.

    Args:
        combined_results: Evaluation rows containing a ``model_type`` column
            (``"finetuned"`` / ``"unfinetuned"``) and one column per metric.
        metrics_to_show: Metric column names to report, in display order.
            Names repeated in the sequence are reported only once.

    Returns:
        None. The summary is printed; the chart is shown only if at least one
        metric has data.
    """
    # De-duplicate while keeping first-seen order, so a repeated metric name
    # is neither printed twice nor drawn as duplicate bars.
    metrics = list(dict.fromkeys(metrics_to_show))

    print("\n===================================")
    print("          EVALUATION SUMMARY       ")
    print("===================================\n")

    print("Available columns:", ", ".join(combined_results.columns))
    print("\n----------------------\n")

    summary_rows = []

    for model_type in ["finetuned", "unfinetuned"]:
        sub = combined_results[combined_results["model_type"] == model_type]

        if sub.empty:
            print(f"{model_type.upper()}: No rows found\n")
            continue

        print(f"{model_type.upper()}:")

        for metric in metrics:
            if metric not in sub.columns:
                print(f"  - {metric}: [column not found]")
                continue

            values = sub[metric].dropna()
            if values.empty:
                print(f"  - {metric}: [no data]")
                continue

            mean_val = values.mean()
            print(f"  - {metric:<20}: {mean_val:>8.3f}")

            summary_rows.append(
                {
                    "model_type": model_type,
                    "metric": metric,
                    "value": mean_val,
                }
            )

        print()  # blank line between model types

    if not summary_rows:
        print("No data available to plot.")
        return

    summary_df = pd.DataFrame(summary_rows)

    # Keep the requested order, restricted to metrics that actually have data.
    present_metrics = set(summary_df["metric"])
    ordered_metrics = [m for m in metrics if m in present_metrics]
    summary_df["metric"] = pd.Categorical(
        summary_df["metric"],
        categories=ordered_metrics,
        ordered=True,
    )

    fig = px.bar(
        summary_df,
        x="metric",
        y="value",
        color="model_type",
        barmode="group",
        text="value",
        # Let plotly express attach model_type to each trace's own rows; one
        # unsplit array shared by every trace would mislabel the hover text.
        custom_data=["model_type"],
        title="Evaluation Metrics by Model Type",
        labels={
            "metric": "Metric",
            "value": "Mean Score",
            "model_type": "Model Type",
        },
    )

    fig.update_traces(
        texttemplate="%{text:.3f}",
        textposition="outside",
        hovertemplate="<b>%{customdata[0]}</b><br>"
                      "Metric: %{x}<br>"
                      "Mean: %{y:.3f}<extra></extra>",
    )

    fig.update_layout(
        template="plotly_white",
        uniformtext_minsize=10,
        uniformtext_mode="hide",
        xaxis=dict(title="Metric"),
        yaxis=dict(title="Mean Score", rangemode="tozero"),
        legend=dict(
            title="Model Type",
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="center",
            x=0.5,
        ),
        margin=dict(l=40, r=40, t=60, b=80),
    )

    fig.show()

Utility function to format the input dataset

In [None]:
def create_dataframe(df, model_type):
    """Reshape inference results into the dataframe used for evaluation.

    Each input row contributes its ``output_text`` (as ``outputs``), its
    ``messages`` serialized to a JSON string (as ``inputs``) and its
    ``ground_truth``; every row is tagged with ``model_type``.
    """
    records = [
        (row["output_text"], json.dumps(row["messages"]), row["ground_truth"])
        for _, row in df.iterrows()
    ]

    return pd.DataFrame(
        {
            "inputs": [serialized for _, serialized, _ in records],
            "outputs": [generated for generated, _, _ in records],
            "ground_truth": [reference for _, _, reference in records],
            "model_type": model_type,
        }
    )

### Run Evaluation: Base Model Results

In [None]:
evaluator_for_base = StatisticalEvaluator()

In [None]:
df_base_final = create_dataframe(df_base, "unfinetuned")
df_base_final = evaluator_for_base.add_metrics(df_base_final)

# run evaluation
results_base_final = run_evaluation(df_base_final, "unfinetuned_eval")

### Run Evaluation: Fine-Tuned Model Results

In [None]:
evaluator_for_tuned = StatisticalEvaluator()

In [None]:
df_tuned_final = create_dataframe(df_tuned, "finetuned")
df_tuned_final = evaluator_for_tuned.add_metrics(df_tuned_final)

# run evaluation
results_tuned_final = run_evaluation(df_tuned_final, "finetuned_eval")

### Display Final Results

In [None]:
# concat both base and tuned results
results_all = pd.concat(
    [results_base_final, results_tuned_final], 
    axis=0,
    ignore_index=True
)

In [None]:
path = "./evaluation_results"

# Persist the combined per-row results before summarizing them.
Path(path).mkdir(parents=True, exist_ok=True)
out_path = Path(path) / "statistical_eval_results.csv"
results_all.to_csv(out_path, index=False)

# Each metric is listed once; repeated names would be reported twice.
print_summary(
    results_all, 
    metrics_to_show=["toxicity/v1/score", "rouge1", "rouge2", "rougeL", "bert_p", "bert_r", "bert_f1", "sem_sim"]
)

***

## Qualitative evaluation using LLM as a judge

We are now going to evaluate the model with the LLM as a Judge evaluation task. First, let's define the Bedrock client

![llm_judge.png](./images/llm_judge.png)

LLM-as-a-Judge is a framework where a large language model (LLM) acts as an evaluator, scoring the coherence of AI-generated outputs based on given inputs. In the following cells, we are going to score outputs generated by both the base and the fine-tuned models.

In [None]:
import time
import re
import textwrap
import boto3
from botocore.config import Config
import sagemaker
from tqdm import tqdm

sagemaker_session = sagemaker.Session()

# Initialize Bedrock Runtime client
session = boto3.Session()
client = session.client(
    service_name="bedrock-runtime",
    region_name=sagemaker_session.boto_region_name,
    config=Config(
        connect_timeout=300,  # 5 minutes
        read_timeout=300,  # 5 minutes
        retries={"max_attempts": 3},
    ),
)

To invoke the fine-tuned model for inference, we'll set up the necessary clients and functions to interact with our model through the Bedrock Runtime API.

Inference Setup Components:
* Bedrock Runtime Client: AWS SDK client for making inference calls
* Helper Function: To handle retry logic and properly format requests

The `generate` function we're defining:
* Applies the proper chat template to user messages
* Handles retry logic for robustness
* Sets appropriate generation parameters like temperature and top-p

This setup allows us to easily test how well our training worked by sending queries to the model and evaluating its responses.

In [None]:
def generate(
    model_id,
    messages,
    system_prompt=None,
    tools=None,
    temperature=0.3,
    max_tokens=4096,
    top_p=0.9,
    max_retries=10,
):
    """
    Call the Bedrock Converse API with a simple fixed-delay retry loop.

    Parameters:
        model_id (str): ID of the model to use
        messages (list): List of message dictionaries with 'role' and 'content'
        system_prompt (str, optional): System prompt to guide the model
        tools (dict, optional): Tool configuration for the model
        temperature (float): Controls randomness in generation (0.0-1.0)
        max_tokens (int): Maximum number of tokens to generate
        top_p (float): Nucleus sampling parameter (0.0-1.0)
        max_retries (int): Maximum number of attempts

    Returns:
        dict: The response of the first successful call, or None when every
        attempt failed (or no attempt was made).
    """
    request_options = {
        "inferenceConfig": {
            "temperature": temperature,
            "maxTokens": max_tokens,
            "topP": top_p,
        },
    }

    # Optional sections are only sent when a value was supplied.
    if tools:
        request_options["toolConfig"] = tools
    if system_prompt:
        request_options["system"] = [{"text": system_prompt}]

    final_attempt = max_retries - 1
    for attempt in range(max_retries):
        try:
            return client.converse(
                modelId=model_id, messages=messages, **request_options
            )
        except Exception as e:
            print(f"Attempt {attempt + 1} failed: {str(e)}")
            if attempt >= final_attempt:
                print("Max retries reached. Unable to get response.")
                return None
            # Fixed pause before the next attempt.
            time.sleep(30)

In [None]:
system_prompt_base = f"""
You are an expert evaluator for document Q&A systems. Evaluate the generated answer against the ground truth.

## EVALUATION CRITERIA

1. **Factual Accuracy** (40 points): Is the information correct based on the document?
2. **Completeness** (30 points): Does it fully answer the question?
3. **Relevance** (20 points): Is the response directly related to the question?
4. **Clarity** (10 points): Is the answer clear and well-structured?

## EVALUATION
Provide scores for each criterion and explain your reasoning.

**Factual Accuracy**: X/40 (explanation)
**Completeness**: X/30 (explanation)  
**Relevance**: X/20 (explanation)
**Clarity**: X/10 (explanation)
**Total Score**: X/100

At the end of your evaluation, you MUST put your final preference in the tags <final_score>...</final_score> like:
<final_score>
Score 90/100
</final_score>

You MUST put the details in the tags <details>...</details>
<details>
Factual Accuracy: 35/40
Completeness: 30/30
Relevance: 20/20
Clarity: 5/10
</details>

This is the baseline to evaluate:

## QUESTION
{{question}}

## GROUND TRUTH ANSWER
{{ground_truth}}
"""

### Use Amazon Nova Pro as Judge

We invoke Amazon Nova Pro on the generated dataset to evaluate the fine-tuned model

In [None]:
def extract_final_score(text):
    """Return the stripped text inside the first <final_score> tag pair, or None."""
    found = re.search(r"<final_score>(.*?)</final_score>", text, re.DOTALL)
    if found is None:
        return None
    return found.group(1).strip()


def extract_details(text):
    """Return the stripped text inside the first <details> tag pair, or None."""
    found = re.search(r"<details>(.*?)</details>", text, re.DOTALL)
    if found is None:
        return None
    return found.group(1).strip()


def run_evaluator(
    df,
    system_prompt_base: str,
    rate_limit: int = 5,
    window_seconds: int = 60,
    judge_model_id: str = "us.amazon.nova-pro-v1:0",
):
    """Run the LLM judge on each row of a DataFrame with built-in rate limiting.

    Args:
        df: DataFrame with ``question``, ``ground_truth`` and ``output_text``
            columns.
        system_prompt_base: Template with ``{question}`` and
            ``{ground_truth}`` placeholders, used as the judge's system prompt.
        rate_limit: Maximum number of requests started per window.
        window_seconds: Length of the rate-limiting window, in seconds.
        judge_model_id: Model ID passed to ``generate`` for the judge.

    Returns:
        list of ``[index, question, final_score, details]`` rows.
        ``final_score`` and ``details`` are None when the judge returned no
        usable response or the expected tags were missing.
    """

    request_times = []
    results = []

    for position, (index, el) in enumerate(df.iterrows(), start=1):
        # Count rows by position so a non-integer index does not break the log.
        print(f"Processing el: {position}")

        # Forget requests that have fallen out of the window.
        current_time = time.time()
        request_times = [t for t in request_times if current_time - t < window_seconds]

        if request_times and len(request_times) >= rate_limit:
            # Wait until the oldest request in the window expires.
            wait_time = max(0.0, window_seconds - (current_time - request_times[0]))
            print(f"Rate limit reached → waiting {wait_time:.2f}s...")
            time.sleep(wait_time)
            request_times = request_times[1:]

        request_times.append(time.time())

        prompt = system_prompt_base.format(
            question=el["question"],
            ground_truth=el["ground_truth"],
        )

        # The answer under evaluation is sent as the user message.
        messages = [
            {"role": "user", "content": [{"text": el["output_text"]}]},
        ]

        response = generate(
            model_id=judge_model_id,
            system_prompt=textwrap.dedent(prompt).strip(),
            messages=messages,
            temperature=0.1,
            top_p=0.9,
        )

        # `generate` returns None once its retries are exhausted; record the
        # row without a score instead of aborting the whole evaluation.
        model_text = None
        if response is not None:
            try:
                model_text = response["output"]["message"]["content"][0]["text"]
            except (KeyError, IndexError, TypeError):
                model_text = None

        if model_text is None:
            print(f"No judge response for el: {position}; recording empty score.")
            final_score = None
            details = None
        else:
            final_score = extract_final_score(model_text)
            details = extract_details(model_text)

        results.append(
            [
                index,
                el["question"],
                final_score,
                details,
            ]
        )

    return results

**Run Judge Evaluation for Base**

In [None]:
results_for_base = run_evaluator(df=df_base, system_prompt_base=system_prompt_base)

In [None]:
results_base_df = pd.DataFrame(
    results_for_base, columns=["index", "question", "final_score", "details"]
)

results_base_df.to_json("./evaluation_results/base_llm_judge_eval_results.json", orient="records")

**Run Judge Evaluation for Fine-tuned**

In [None]:
results_for_tuned = run_evaluator(df=df_tuned, system_prompt_base=system_prompt_base)

Save results into a JSON file

In [None]:
results_tuned_df = pd.DataFrame(
    results_for_tuned, columns=["index", "question", "final_score", "details"]
)

results_tuned_df.to_json("./evaluation_results/tuned_llm_judge_eval_results.json", orient="records")

## Compare evaluation results

We are now comparing the evaluation results between the base and the fine-tuned model

In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import re

In [None]:
# Load both files
with open("./evaluation_results/tuned_llm_judge_eval_results.json", "r") as f:
    data_finetuned = json.load(f)

with open("./evaluation_results/base_llm_judge_eval_results.json", "r") as f:
    data_base = json.load(f)

Utility function to extract scores and categorize results

In [None]:
def extract_score(score_str):
    """Return the integer N from the first "N/100" found in ``score_str``, or None."""
    found = re.search(r"(\d+)/100", score_str)
    if found is None:
        return None
    return int(found.group(1))


def categorize(score):
    """Map a 0-100 score to "Excellent" (>=90), "Good" (>=80), "Fair" (>=70) or "Poor"."""
    # Thresholds are checked from highest to lowest; the first match wins.
    bands = ((90, "Excellent"), (80, "Good"), (70, "Fair"))
    for lower_bound, label in bands:
        if score >= lower_bound:
            return label
    return "Poor"

In [None]:
# Build one DataFrame per model from the saved judge results and drop rows
# containing any missing value (e.g. a judge reply without <final_score> tags).
df_ft = pd.DataFrame(data_finetuned)
df_ft = df_ft.dropna()
# NOTE(review): this rebinds `df_base`, which earlier cells use for the base
# model's inference outputs; re-running those cells after this one would see
# the judge-results frame instead. Confirm this is intended.
df_base = pd.DataFrame(data_base)
df_base = df_base.dropna()

# Parse the numeric "N/100" value out of each final_score string.
df_ft["score"] = df_ft["final_score"].apply(extract_score)
df_base["score"] = df_base["final_score"].apply(extract_score)

Side-by-side comparison histogram

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
ax1.hist(df_ft["score"], bins=20, edgecolor="black", alpha=0.7, color="blue")
ax1.set_title("Fine-tuned Model Scores")
ax1.set_xlabel("Score")
ax1.set_ylabel("Frequency")

ax2.hist(df_base["score"], bins=20, edgecolor="black", alpha=0.7, color="orange")
ax2.set_title("Base Model Scores")
ax2.set_xlabel("Score")
ax2.set_ylabel("Frequency")
plt.tight_layout()
plt.show()

Overlapping distributions

In [None]:
plt.figure(figsize=(10, 6))
plt.hist(df_ft["score"], bins=20, alpha=0.5, label="Fine-tuned", color="blue")
plt.hist(df_base["score"], bins=20, alpha=0.5, label="Base", color="orange")
plt.title("Score Distribution Comparison")
plt.xlabel("Score")
plt.ylabel("Frequency")
plt.legend()
plt.show()

# 3. Box plot comparison
plt.figure(figsize=(8, 6))
plt.boxplot([df_ft["score"], df_base["score"]], labels=["Fine-tuned", "Base"])
plt.title("Score Distribution Comparison")
plt.ylabel("Score")
plt.show()

Statistics comparison

In [None]:
print("=== STATISTICS COMPARISON ===\n")
print(f"Fine-tuned Model:")
print(f"  Average: {df_ft['score'].mean():.1f}")
print(f"  Median: {df_ft['score'].median():.1f}")
print(f"  Min: {df_ft['score'].min()}")
print(f"  Max: {df_ft['score'].max()}")
print(f"\nBase Model:")
print(f"  Average: {df_base['score'].mean():.1f}")
print(f"  Median: {df_base['score'].median():.1f}")
print(f"  Min: {df_base['score'].min()}")
print(f"  Max: {df_base['score'].max()}")
print(f"\nImprovement: {df_ft['score'].mean() - df_base['score'].mean():.1f} points")

Score categories comparison

In [None]:
df_ft["category"] = df_ft["score"].apply(categorize)
df_base["category"] = df_base["score"].apply(categorize)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
df_ft["category"].value_counts().plot.pie(ax=ax1, autopct="%1.1f%%")
ax1.set_title("Fine-tuned Model Categories")
df_base["category"].value_counts().plot.pie(ax=ax2, autopct="%1.1f%%")
ax2.set_title("Base Model Categories")
plt.tight_layout()
plt.show()

---
---
END OF LAB 3
--- 
---
---