# Comparing Model Performance after Fine-Tuning
In this example, we will take the pre-existing SageMaker endpoints that you deployed in previous exercises and use them to generate data that can be leveraged for quality comparison. This data can be used to take a quantitative approach to judge the efficacy of fine-tuning your models.

This example will run through samples of the medical-o1-reasoning dataset (FreedomIntelligence/medical-o1-reasoning-SFT), a medical Q&A dataset hosted on the Hugging Face Hub, and use the [lighteval](https://huggingface.co/docs/lighteval/index) library from Hugging Face for analysis.

## Prerequisites

In [None]:
# Install the packages listed in ./scripts/requirements.txt into the notebook kernel's environment.
%pip install -r ./scripts/requirements.txt

## This cell will restart the kernel. Click "OK".

In [None]:
from IPython import get_ipython
# Shut the kernel down with the restart flag set to True, i.e. restart it.
# Presumably done so the packages installed in the previous cell are picked up.
get_ipython().kernel.do_shutdown(True)

In [None]:
# Import libraries
import os
import json
import time
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from datasets import load_dataset

# Import ROUGE scorer
from rouge_score import rouge_scorer
# Import v3 SDK
from sagemaker.core.resources import Endpoint

#### Fetch the saved endpoint names from previous sections, or set them manually by uncommenting the code below. 

In [None]:
# Restore the endpoint names that earlier notebooks saved with %store.
%store -r BASE_ENDPOINT_NAME
%store -r TUNED_ENDPOINT_NAME

# Manual override: uncomment and fill in if the stored values are unavailable.
#BASE_ENDPOINT_NAME = ""
#TUNED_ENDPOINT_NAME = ""

# Echo both names so a missing or wrong endpoint is visible before evaluation starts.
print(f"Base Endpoint: {BASE_ENDPOINT_NAME}")
print(f"Tuned Endpoint: {TUNED_ENDPOINT_NAME}")

In [None]:
# Fine-tuned model under evaluation: display name plus the endpoint serving it
model_to_evaluate = dict(name="Fine-tuned Model", endpoint=TUNED_ENDPOINT_NAME)

Here you will use the medical-o1-reasoning dataset. The dataset is pre-split into training and test data. We will limit the number of samples to evaluate for the fine-tuned and base models.

In [None]:
# Limit the number of samples to evaluate (for faster execution)
num_samples = 10

# Load the "en" subset of the medical-o1-reasoning dataset.
# NOTE(review): the split requested here is "train", not a held-out test split.
# If fine-tuning drew from the same split, evaluation samples may overlap with
# training samples — confirm against the training notebook.
dataset = load_dataset("FreedomIntelligence/medical-o1-reasoning-SFT", "en", split="train")

# Number of rows available before sub-sampling
max_samples = len(dataset)

# Shuffle, then keep at most num_samples rows.
# NOTE(review): shuffle() is called without a seed, so the selected rows
# presumably differ from run to run — confirm if reproducibility matters.
dataset = dataset.shuffle().select(range(min(num_samples, max_samples)))
print(f"Loaded medical-o1-reasoning dataset with {len(dataset)} samples out of {max_samples}")

# Display a sample from the dataset
sample = dataset[0]

# Show the three fields used downstream: question, chain of thought, final response
print("\nQuestion:\n", sample["Question"], "\n\n====\n")
print("Complex_CoT:\n", sample["Complex_CoT"], "\n\n====\n")
print("Response:\n", sample["Response"], "\n\n====\n")

In [None]:
SYSTEM_PROMPT = """You are a medical expert with advanced knowledge in clinical reasoning, diagnostics, and treatment planning. 
Below is an instruction that describes a task, paired with an input that provides further context. 
Write a response that appropriately completes the request.
Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response."""


# template dataset to add prompt to each sample
def convert_to_messages(sample, system_prompt="", include_answer=True):
    """Build a chat-style message list for one dataset sample.

    Args:
        sample: Mapping with a "Question" key and, when ``include_answer`` is
            true, "Complex_CoT" and "Response" keys.
        system_prompt: Text for the system message. An empty value falls back
            to the module-level ``SYSTEM_PROMPT``, so callers that omit the
            argument keep getting the default prompt.
        include_answer: When true, append an assistant message holding the
            chain of thought and the response separated by a blank line.

    Returns:
        A list of ``{"role": ..., "content": ...}`` dicts in the order
        system, user, and (optionally) assistant.
    """
    messages = [
        # Honor the caller-supplied prompt instead of always using the global.
        {"role": "system", "content": system_prompt or SYSTEM_PROMPT},
        {"role": "user", "content": sample["Question"]},
    ]

    if include_answer:
        messages.append({"role": "assistant", "content": f"{sample['Complex_CoT']}\n\n{sample['Response']}"})

    return messages

#### Next, we will create functions to interact with the SageMaker endpoints, define metrics we want to calculate (ROUGE), and define how to evaluate the models with the medical-o1-reasoning dataset. 

In [None]:
# Initialize ROUGE scorer
# One shared scorer for ROUGE-1, ROUGE-2 and ROUGE-L, created with use_stemmer=True;
# reused by calculate_metrics and by the per-example comparison further down.
rouge_metrics = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)

def calculate_metrics(predictions, references):
    """Average ROUGE-1, ROUGE-2 and ROUGE-L F-measures over paired texts.

    Args:
        predictions: Iterable of generated texts.
        references: Iterable of reference texts, paired positionally with
            ``predictions`` (pairing stops at the shorter of the two).

    Returns:
        Dict with keys ``rouge1_f``, ``rouge2_f`` and ``rougeL_f`` holding the
        mean F-measure for each ROUGE type. When there are no pairs to score,
        every value is 0.0 rather than raising ``ZeroDivisionError``.
    """
    rouge_scores = {'rouge1_f': [], 'rouge2_f': [], 'rougeL_f': []}

    for pred, ref in zip(predictions, references):
        # The scorer is called with the reference first, then the prediction.
        rouge_result = rouge_metrics.score(ref, pred)
        for rouge_type in ("rouge1", "rouge2", "rougeL"):
            rouge_scores[f"{rouge_type}_f"].append(rouge_result[rouge_type].fmeasure)

    # Guard the mean against an empty score list (no pairs were scored).
    return {
        key: (sum(values) / len(values) if values else 0.0)
        for key, values in rouge_scores.items()
    }

In [None]:
def generate_summaries_with_model(endpoint_name, dataset):
    """
    Collect one generated response per dataset example from a SageMaker endpoint.

    Every example is turned into chat messages without the reference answer,
    sent through the v3 SDK Endpoint.invoke() call, and the stripped content
    of the first returned choice is kept. If an invocation or the parsing of
    its response fails, the error is printed and a fixed placeholder string
    is recorded instead, so the result has one entry per example.
    """
    core_endpoint = Endpoint.get(endpoint_name=endpoint_name)

    # Generation settings are identical for every request
    generation_parameters = {
        "max_new_tokens": 512,
        "top_p": 0.9,
        "temperature": 0.6,
        "return_full_text": False,
    }

    def _respond(example):
        # Build the request body outside the try so only endpoint/parsing errors are caught
        chat = convert_to_messages(example, system_prompt=SYSTEM_PROMPT, include_answer=False)
        request_body = json.dumps({"messages": chat, "parameters": generation_parameters})

        try:
            response = core_endpoint.invoke(
                body=request_body,
                content_type="application/json",
                accept="application/json",
            )
            raw_text = response.body.read().decode("utf-8")
            first_choice = json.loads(raw_text)["choices"][0]
            return first_choice["message"]["content"].strip()
        except Exception as e:
            print(f"Error invoking SageMaker endpoint {endpoint_name}: {e}")
            return "Error generating summary."

    return [_respond(example) for example in tqdm(dataset, desc="Generating Responses")]

In [None]:
def evaluate_model_on_dataset(model_config, dataset):
    """Generate responses for every dataset row and score them with ROUGE.

    Args:
        model_config: Dict with a "name" (display label) and an "endpoint"
            (SageMaker endpoint name).
        dataset: Iterable of rows exposing "Complex_CoT" and "Response".

    Returns:
        Dict with the model name, endpoint name, sample count, averaged
        metrics, and the first five predictions and references.
    """
    model_name, endpoint_name = model_config["name"], model_config["endpoint"]
    print(f"\nEvaluating model: {model_name} on endpoint: {endpoint_name}")

    # Reference text = chain of thought, newline, final response
    references = ["\n".join((row["Complex_CoT"], row["Response"])) for row in dataset]

    print("\nGenerating Responses...")
    predictions = generate_summaries_with_model(endpoint_name, dataset)

    print("\nCalculating evaluation metrics with LightEval...")
    metrics = calculate_metrics(predictions, references)

    print(f"\nResults for {model_name}:")
    for label, key in (("ROUGE-1", "rouge1_f"), ("ROUGE-2", "rouge2_f"), ("ROUGE-L", "rougeL_f")):
        print(f"{label} F1: {metrics[key]:.4f}")

    # Only a five-item preview of the texts is kept in the result
    preview = slice(0, 5)
    return {
        "model_name": model_name,
        "endpoint_name": endpoint_name,
        "num_samples": len(dataset),
        "metrics": metrics,
        "predictions": predictions[preview],
        "references": references[preview],
    }

#### Evaluate both models

**Note: Since the model you trained in this example was only exposed to a small amount of training data and the testing sample is small, you may see varied results.**

In [None]:
# Evaluate the base and fine-tuned models using LightEval metrics
# Wall-clock start for the base-model run
start_time = time.time()

# Display name and endpoint of the untuned baseline
base_model_config = {
    "name": "Base Model",
    "endpoint": BASE_ENDPOINT_NAME
}

# Evaluate base model
base_model_results = evaluate_model_on_dataset(base_model_config, dataset)
# Elapsed seconds for the whole evaluation call (generation plus scoring)
base_model_results["evaluation_time"] = time.time() - start_time

In [None]:
# Start timing fine-tuned model
start_time = time.time()

# Evaluate fine-tuned model
finetuned_model_results = evaluate_model_on_dataset(model_to_evaluate, dataset)
# Elapsed seconds for the whole evaluation call (generation plus scoring)
finetuned_model_results["evaluation_time"] = time.time() - start_time

# Save results
# File stems come from the display names: spaces become underscores, then lowercased
base_file_name = base_model_config["name"].replace(' ', '_').lower()
finetuned_file_name = model_to_evaluate["name"].replace(' ', '_').lower()

# Write each result dict as JSON into the current working directory
with open(f"{base_file_name}_results.json", "w") as f:
    json.dump(base_model_results, f)
    
with open(f"{finetuned_file_name}_results.json", "w") as f:
    json.dump(finetuned_model_results, f)

Create a tabular view to compare the base model and fine-tuned model performance metrics

In [None]:
# Build one comparison row per model (base first, fine-tuned second)
comparison_data = [
    {
        "Model": config["name"],
        "ROUGE-1 F1": results["metrics"]["rouge1_f"],
        "ROUGE-2 F1": results["metrics"]["rouge2_f"],
        "ROUGE-L F1": results["metrics"]["rougeL_f"],
        "Evaluation Time (s)": results["evaluation_time"],
    }
    for config, results in (
        (base_model_config, base_model_results),
        (model_to_evaluate, finetuned_model_results),
    )
]

# Tabulate the rows; the trailing expression renders the table in the notebook
comparison_df = pd.DataFrame(comparison_data)
print("Model Comparison:")
comparison_df

Show a bar chart

In [None]:
# Plot ROUGE metrics for both models
metrics_to_plot = ["ROUGE-1 F1", "ROUGE-2 F1", "ROUGE-L F1"]
models = comparison_df["Model"].tolist()

plt.figure(figsize=(12, 6))
bar_width = 0.2
# One x position per metric; each model's bars are shifted right by i * bar_width
index = np.arange(len(metrics_to_plot))

for i, model in enumerate(models):
    # Row i of comparison_df holds this model's scores
    values = [comparison_df.loc[i, metric] for metric in metrics_to_plot]
    plt.bar(index + i*bar_width, values, bar_width, label=model)

plt.xlabel('Metrics')
plt.ylabel('Score')
plt.title('Summarization Performance Comparison')
# Place each tick midway between the first and second model's bars.
# NOTE(review): the bar_width/2 offset centers the label only when exactly two models are plotted.
plt.xticks(index + bar_width/2, metrics_to_plot)
plt.legend()
plt.tight_layout()
plt.grid(axis='y', alpha=0.3)
plt.show()

In [None]:
# Calculate improvement from base to fine-tuned model
# Row 0 of comparison_df is the base model, row 1 the fine-tuned model.
improvement_data = {}

for metric in ["ROUGE-1 F1", "ROUGE-2 F1", "ROUGE-L F1"]:
    base_value = comparison_df.loc[0, metric]
    finetuned_value = comparison_df.loc[1, metric]

    # Skip metrics that are missing for either model
    if pd.isna(base_value) or pd.isna(finetuned_value):
        continue

    abs_improvement = finetuned_value - base_value

    if base_value > 0:
        pct_improvement = (abs_improvement / base_value) * 100
    elif abs_improvement == 0:
        # Both scores are zero: report "no change" instead of an infinite gain
        pct_improvement = 0.0
    else:
        # A relative change from a zero baseline is undefined; keep the inf marker
        pct_improvement = float('inf')

    improvement_data[metric] = {
        "Base Model": base_value,
        "Fine-tuned Model": finetuned_value,
        "Absolute Improvement": abs_improvement,
        "% Improvement": pct_improvement
    }

# One row per metric that had values for both models
improvement_df = pd.DataFrame({
    "Metric": list(improvement_data.keys()),
    "Base Score": [improvement_data[m]["Base Model"] for m in improvement_data],
    "Fine-tuned Score": [improvement_data[m]["Fine-tuned Model"] for m in improvement_data],
    "Absolute Improvement": [improvement_data[m]["Absolute Improvement"] for m in improvement_data],
    "% Improvement": [f"{improvement_data[m]['% Improvement']:.2f}%" for m in improvement_data]
})

print("Improvement Analysis:")
improvement_df

## Larger Training/Evaluation Results

If you were to train **Qwen3-4B-Instruct-2507** on **5000** samples and evaluate on **100** test items (total training time 32 mins on an ml.g5.12xlarge instance), you would see the following results:

![](./images/sft_5000_train_100_test_scores.png)

![](images/sft_5000_train_100_test_bars.png)

![](images/sft_5000_train_100_test_compare.png)


## Detailed Comparison Between Models

In [None]:
# Display example predictions from both models
# Show at most two examples (fewer if the dataset is smaller)
num_examples = min(2, len(dataset))

for i in range(num_examples):
    print(f"\nExample {i+1}:")
    print(f"Question: {dataset[i]['Question']}")

    # Rebuild the reference as chain of thought + newline + response
    ref_cot_answer = '\n'.join([dataset[i]['Complex_CoT'],dataset[i]['Response']])
    print(f"\nReference CoT+Answer: {ref_cot_answer}")
    
    # The stored results keep only the first five predictions, indexed like the dataset rows
    print(f"\nBase Model Summary: {base_model_results['predictions'][i]}")
    print(f"\nFine-tuned Model Summary: {finetuned_model_results['predictions'][i]}")
    
    # Calculate ROUGE scores for this example
    base_rouge = rouge_metrics.score(ref_cot_answer, base_model_results['predictions'][i])
    finetuned_rouge = rouge_metrics.score(ref_cot_answer, finetuned_model_results['predictions'][i])
    
    print("\nROUGE Scores:")
    print(f"Base Model - ROUGE-1: {base_rouge['rouge1'].fmeasure:.4f}, ROUGE-2: {base_rouge['rouge2'].fmeasure:.4f}, ROUGE-L: {base_rouge['rougeL'].fmeasure:.4f}")
    print(f"Fine-tuned - ROUGE-1: {finetuned_rouge['rouge1'].fmeasure:.4f}, ROUGE-2: {finetuned_rouge['rouge2'].fmeasure:.4f}, ROUGE-L: {finetuned_rouge['rougeL'].fmeasure:.4f}")
    
    # Visual separator between examples
    print("\n" + "="*80)

# Clean Up Endpoints

Run the following code to clean up your base endpoint. It is no longer needed.

In [None]:
# boto3 SageMaker client used for the cleanup calls
sagemaker_client = boto3.client('sagemaker')

# Delete the base-model endpoint.
# NOTE(review): only the base endpoint is removed here; the fine-tuned
# endpoint (TUNED_ENDPOINT_NAME) is left running by this cell.
delete_base_response = sagemaker_client.delete_endpoint(
    EndpointName=BASE_ENDPOINT_NAME
)

# Print the raw API response
print(delete_base_response)

In [None]:
# Delete the endpoint configuration associated with the base endpoint.
# NOTE(review): the endpoint name is passed as the configuration name, which
# presumes both were created under the same name — confirm against how the
# endpoint was deployed.
delete_basecfg_response = sagemaker_client.delete_endpoint_config(
    EndpointConfigName=BASE_ENDPOINT_NAME
)
# Print the raw API response
print(delete_basecfg_response)