# 04 - Evaluation Analysis

This notebook provides post-training evaluation and analysis of the fine-tuned Qwen2-VL model
for nutrition table detection.

**What this notebook covers:**
1. Load the fine-tuned model (with LoRA adapters)
2. Evaluate on the validation dataset using IoU metrics
3. Compare performance with the base model (no fine-tuning)
4. Visualize IoU distribution and predictions
5. Analyze failure cases

**Prerequisites:**
- Completed training (have saved LoRA adapters)
- `src/` modules installed (run from project root)

## 1. Setup and Imports

In [1]:
import sys
sys.path.insert(0, '..')  # Add parent directory to path for src imports

import torch
import numpy as np
import matplotlib.pyplot as plt
from datasets import load_dataset
from PIL import Image

# Project imports
from src.training.evaluation import evaluate_model, print_evaluation_results, compare_models
from src.models.inference import parse_qwen_bbox_output
from src.utils.visualization import visualize_bbox_on_image, visualize_ground_truth_bbox
from src.utils.gpu import clear_memory

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")



PyTorch version: 2.4.1+cu121
CUDA available: True
CUDA device: NVIDIA RTX 6000 Ada Generation


## 2. Configuration

Set paths to your trained model and configure evaluation parameters.

In [2]:
# === CONFIGURATION ===
# Update these paths based on your training output

# Path to saved LoRA adapters (from training)
# Model v1: Trained on all tokens (original collator)
ADAPTER_PATH_V1 = "/ssd1/zhuoyuan/vlm_outputs/qwen2vl-nutrition-detection-lora"

# Model v3: Trained with assistant-only collator (collate_fn_fixed_3)
ADAPTER_PATH_V3 = "/ssd1/zhuoyuan/vlm_outputs/qwen2vl-nutrition-detection-lora-assistantonly"

# Base model ID
BASE_MODEL_ID = "Qwen/Qwen2-VL-7B-Instruct"

# Evaluation settings
NUM_EVAL_SAMPLES = 50  # Number of samples to evaluate (set lower for quick tests)
DEVICE_MAP = "balanced"  # "balanced" for multi-GPU, "auto" for single GPU

# Visualization settings
NUM_VISUALIZATION_SAMPLES = 5  # Number of predictions to visualize

print(f"Model v1 (all tokens): {ADAPTER_PATH_V1}")
print(f"Model v3 (assistant-only): {ADAPTER_PATH_V3}")
print(f"Base model: {BASE_MODEL_ID}")
print(f"Evaluation samples: {NUM_EVAL_SAMPLES}")

Model v1 (all tokens): /ssd1/zhuoyuan/vlm_outputs/qwen2vl-nutrition-detection-lora
Model v3 (assistant-only): /ssd1/zhuoyuan/vlm_outputs/qwen2vl-nutrition-detection-lora-assistantonly
Base model: Qwen/Qwen2-VL-7B-Instruct
Evaluation samples: 50


## 3. Load Dataset

In [4]:
# Load the OpenFoodFacts nutrition table detection dataset
print("Loading dataset...")
dataset = load_dataset("openfoodfacts/nutrition-table-detection")

# Get evaluation split
eval_dataset = dataset['validation']  # or 'validation' depending on dataset

print(f"Evaluation dataset size: {len(eval_dataset)}")
print(f"Dataset features: {eval_dataset.features}")

# Preview a sample
sample = eval_dataset[0]
print(f"\nSample keys: {sample.keys()}")
print(f"Image size: {sample['image'].size}")
print(f"Number of objects: {len(sample['objects']['bbox'])}")

Loading dataset...


KeyError: 'validation'

## 4. Load Fine-tuned Models

Load both trained models for comparison:
- **Model v1**: Trained on all tokens (original collator)
- **Model v3**: Trained with assistant-only collator (collate_fn_fixed_3)

In [None]:
from peft import PeftModel
from transformers import Qwen2VLForConditionalGeneration, Qwen2VLProcessor

# --- Load Model v1 (All Tokens) ---
print("Loading Model v1 (trained on all tokens)...")

model_v1 = Qwen2VLForConditionalGeneration.from_pretrained(
    BASE_MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map=DEVICE_MAP,
    attn_implementation="flash_attention_2",
)
model_v1 = PeftModel.from_pretrained(
    model_v1,
    ADAPTER_PATH_V1,
    torch_dtype=torch.bfloat16,
)
processor_v1 = Qwen2VLProcessor.from_pretrained(ADAPTER_PATH_V1)

print(f"  Loaded from: {ADAPTER_PATH_V1}")

In [None]:
# --- Load Model v3 (Assistant-Only) ---
print("\nLoading Model v3 (trained with assistant-only collator)...")

model_v3 = Qwen2VLForConditionalGeneration.from_pretrained(
    BASE_MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map=DEVICE_MAP,
    attn_implementation="flash_attention_2",
)
model_v3 = PeftModel.from_pretrained(
    model_v3,
    ADAPTER_PATH_V3,
    torch_dtype=torch.bfloat16,
)
processor_v3 = Qwen2VLProcessor.from_pretrained(ADAPTER_PATH_V3)

print(f"  Loaded from: {ADAPTER_PATH_V3}")
print("\nBoth models loaded successfully!")

## 5. Evaluate Both Fine-tuned Models

In [None]:
# --- Evaluate Model v1 ---
print(f"\nEvaluating Model v1 (all tokens) on {NUM_EVAL_SAMPLES} samples...")
print("This may take a few minutes...\n")

metrics_v1, ious_v1 = evaluate_model(
    model_v1,
    processor_v1,
    eval_dataset,
    num_samples=NUM_EVAL_SAMPLES
)

print_evaluation_results(metrics_v1, "Model v1 (All Tokens)")

In [None]:
# --- Evaluate Model v3 ---
print(f"\nEvaluating Model v3 (assistant-only) on {NUM_EVAL_SAMPLES} samples...")
print("This may take a few minutes...\n")

metrics_v3, ious_v3 = evaluate_model(
    model_v3,
    processor_v3,
    eval_dataset,
    num_samples=NUM_EVAL_SAMPLES
)

print_evaluation_results(metrics_v3, "Model v3 (Assistant-Only)")

In [None]:
# --- Compare v1 vs v3 ---
print("\n" + "="*60)
print("COMPARISON: Model v1 (All Tokens) vs Model v3 (Assistant-Only)")
print("="*60)

v1_better = metrics_v1['mean_iou'] > metrics_v3['mean_iou']
diff = abs(metrics_v1['mean_iou'] - metrics_v3['mean_iou'])

print(f"\nMean IoU:")
print(f"  Model v1: {metrics_v1['mean_iou']:.4f}")
print(f"  Model v3: {metrics_v3['mean_iou']:.4f}")
print(f"  Winner: {'v1' if v1_better else 'v3'} (by {diff:.4f})")

print(f"\nDetection Rate:")
print(f"  Model v1: {metrics_v1['detection_rate']:.2%}")
print(f"  Model v3: {metrics_v3['detection_rate']:.2%}")

print(f"\nIoU > 0.5:")
print(f"  Model v1: {metrics_v1['iou_threshold_0.5']:.2%}")
print(f"  Model v3: {metrics_v3['iou_threshold_0.5']:.2%}")

# For backward compatibility, set the best model as "finetuned"
if v1_better:
    model_finetuned = model_v1
    processor_finetuned = processor_v1
    metrics_finetuned = metrics_v1
    ious_finetuned = ious_v1
    print("\n--> Using Model v1 for visualizations (better performance)")
else:
    model_finetuned = model_v3
    processor_finetuned = processor_v3
    metrics_finetuned = metrics_v3
    ious_finetuned = ious_v3
    print("\n--> Using Model v3 for visualizations (better performance)")

## 6. Load and Evaluate Base Model (Optional Comparison)

Compare with the base model (without fine-tuning) to measure improvement.

In [None]:
# Set to True to run base model comparison (takes additional time)
RUN_BASE_COMPARISON = True

if RUN_BASE_COMPARISON:
    print("\nLoading base model for comparison...")

    # Clear some memory first
    torch.cuda.empty_cache()

    base_model = Qwen2VLForConditionalGeneration.from_pretrained(
        BASE_MODEL_ID,
        torch_dtype=torch.bfloat16,
        device_map=DEVICE_MAP,
        attn_implementation="flash_attention_2",
    )
    base_processor = Qwen2VLProcessor.from_pretrained(BASE_MODEL_ID)

    print(f"\nEvaluating base model on {NUM_EVAL_SAMPLES} samples...")
    metrics_base, ious_base = evaluate_model(
        base_model,
        base_processor,
        eval_dataset,
        num_samples=NUM_EVAL_SAMPLES
    )

    print_evaluation_results(metrics_base, "Base Model (No Fine-tuning)")

    # Compare models
    compare_models(metrics_base, metrics_finetuned)

    # Clean up base model
    del base_model
    del base_processor
    torch.cuda.empty_cache()
else:
    print("Skipping base model comparison (set RUN_BASE_COMPARISON=True to enable)")
    metrics_base = None
    ious_base = None

## 7. Visualize IoU Distribution

In [None]:
# Compare v1 vs v3 IoU distributions
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# IoU histogram for Model v1
axes[0].hist(ious_v1, bins=20, edgecolor='black', alpha=0.7, color='steelblue')
axes[0].axvline(metrics_v1['mean_iou'], color='red', linestyle='--',
                label=f"Mean: {metrics_v1['mean_iou']:.3f}")
axes[0].axvline(0.5, color='green', linestyle=':', alpha=0.7, label='IoU=0.5')
axes[0].set_xlabel('IoU Score')
axes[0].set_ylabel('Number of Samples')
axes[0].set_title('Model v1 (All Tokens) - IoU Distribution')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# IoU histogram for Model v3
axes[1].hist(ious_v3, bins=20, edgecolor='black', alpha=0.7, color='darkorange')
axes[1].axvline(metrics_v3['mean_iou'], color='red', linestyle='--',
                label=f"Mean: {metrics_v3['mean_iou']:.3f}")
axes[1].axvline(0.5, color='green', linestyle=':', alpha=0.7, label='IoU=0.5')
axes[1].set_xlabel('IoU Score')
axes[1].set_ylabel('Number of Samples')
axes[1].set_title('Model v3 (Assistant-Only) - IoU Distribution')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

# Comparison bar chart: v1 vs v3 vs base (if available)
metrics_names = ['Mean IoU', 'Median IoU', 'Det. Rate', 'IoU>0.5', 'IoU>0.7']
v1_values = [
    metrics_v1['mean_iou'],
    metrics_v1['median_iou'],
    metrics_v1['detection_rate'],
    metrics_v1['iou_threshold_0.5'],
    metrics_v1['iou_threshold_0.7']
]
v3_values = [
    metrics_v3['mean_iou'],
    metrics_v3['median_iou'],
    metrics_v3['detection_rate'],
    metrics_v3['iou_threshold_0.5'],
    metrics_v3['iou_threshold_0.7']
]

x = np.arange(len(metrics_names))
width = 0.25

if metrics_base:
    base_values = [
        metrics_base['mean_iou'],
        metrics_base['median_iou'],
        metrics_base['detection_rate'],
        metrics_base['iou_threshold_0.5'],
        metrics_base['iou_threshold_0.7']
    ]
    axes[2].bar(x - width, base_values, width, label='Base', alpha=0.8, color='gray')
    axes[2].bar(x, v1_values, width, label='v1 (All)', alpha=0.8, color='steelblue')
    axes[2].bar(x + width, v3_values, width, label='v3 (Asst)', alpha=0.8, color='darkorange')
else:
    axes[2].bar(x - width/2, v1_values, width, label='v1 (All Tokens)', alpha=0.8, color='steelblue')
    axes[2].bar(x + width/2, v3_values, width, label='v3 (Assistant-Only)', alpha=0.8, color='darkorange')

axes[2].set_ylabel('Score')
axes[2].set_title('Model Comparison')
axes[2].set_xticks(x)
axes[2].set_xticklabels(metrics_names, rotation=45, ha='right')
axes[2].legend()
axes[2].grid(True, alpha=0.3, axis='y')
axes[2].set_ylim(0, 1)

plt.tight_layout()
plt.show()

## 8. Visualize Predictions

Show example predictions comparing ground truth vs model predictions.

In [None]:
from qwen_vl_utils import process_vision_info

def run_inference_single(model, processor, image, prompt="Detect the bounding box of the nutrition table."):
    """Run inference on a single image and return parsed bbox."""
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": prompt},
            ],
        }
    ]

    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)

    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt"
    ).to(model.device)

    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=128, do_sample=False)

    generated_ids_trimmed = [out[len(inp):] for inp, out in zip(inputs.input_ids, generated_ids)]
    output_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True)[0]

    return output_text, parse_qwen_bbox_output(output_text)

In [None]:
print(f"\nVisualizing {NUM_VISUALIZATION_SAMPLES} predictions...")

fig, axes = plt.subplots(NUM_VISUALIZATION_SAMPLES, 2, figsize=(12, 5*NUM_VISUALIZATION_SAMPLES))

for idx in range(NUM_VISUALIZATION_SAMPLES):
    example = eval_dataset[idx]
    image = example['image']
    gt_bbox = example['objects']['bbox'][0]
    gt_category = example['objects']['category_name'][0]

    # Run inference
    output_text, parsed_bbox = run_inference_single(model_finetuned, processor_finetuned, image)

    # Ground truth visualization
    img_gt = visualize_ground_truth_bbox(
        image,
        [gt_bbox],
        [gt_category],
        format='openfoodfacts'
    )

    # Prediction visualization
    if parsed_bbox:
        img_pred = visualize_bbox_on_image(image, parsed_bbox, normalize_coords=True)
        pred_label = f"Predicted: {parsed_bbox.get('object', 'detected')}"
    else:
        img_pred = image.copy()
        pred_label = "No detection"

    # Plot
    axes[idx, 0].imshow(img_gt)
    axes[idx, 0].set_title(f"Sample {idx+1} - Ground Truth: {gt_category}")
    axes[idx, 0].axis('off')

    axes[idx, 1].imshow(img_pred)
    axes[idx, 1].set_title(f"Sample {idx+1} - {pred_label}")
    axes[idx, 1].axis('off')

plt.tight_layout()
plt.show()

## 9. Analyze Failure Cases

Look at samples with low IoU to understand where the model struggles.

In [None]:
# Find failure cases (IoU < 0.5)
failure_threshold = 0.5
failure_indices = [i for i, iou in enumerate(ious_finetuned) if iou < failure_threshold]

print(f"\nFailure Analysis (IoU < {failure_threshold}):")
print(f"  Total failures: {len(failure_indices)} / {len(ious_finetuned)} ({100*len(failure_indices)/len(ious_finetuned):.1f}%)")

if failure_indices:
    print(f"\n  Failure case IoU scores:")
    for i, idx in enumerate(failure_indices[:10]):  # Show first 10
        print(f"    Sample {idx}: IoU = {ious_finetuned[idx]:.3f}")

In [None]:
# Visualize a few failure cases
NUM_FAILURE_VIS = min(3, len(failure_indices))

if NUM_FAILURE_VIS > 0:
    print(f"\nVisualizing {NUM_FAILURE_VIS} failure cases...")

    fig, axes = plt.subplots(NUM_FAILURE_VIS, 2, figsize=(12, 5*NUM_FAILURE_VIS))
    if NUM_FAILURE_VIS == 1:
        axes = axes.reshape(1, -1)

    for i, idx in enumerate(failure_indices[:NUM_FAILURE_VIS]):
        example = eval_dataset[idx]
        image = example['image']
        gt_bbox = example['objects']['bbox'][0]
        gt_category = example['objects']['category_name'][0]

        # Run inference
        output_text, parsed_bbox = run_inference_single(model_finetuned, processor_finetuned, image)

        # Ground truth
        img_gt = visualize_ground_truth_bbox(image, [gt_bbox], [gt_category], format='openfoodfacts')

        # Prediction
        if parsed_bbox:
            img_pred = visualize_bbox_on_image(image, parsed_bbox, normalize_coords=True)
        else:
            img_pred = image.copy()

        axes[i, 0].imshow(img_gt)
        axes[i, 0].set_title(f"Failure {i+1} (idx={idx}) - Ground Truth\nIoU: {ious_finetuned[idx]:.3f}")
        axes[i, 0].axis('off')

        axes[i, 1].imshow(img_pred)
        axes[i, 1].set_title(f"Failure {i+1} - Model Prediction")
        axes[i, 1].axis('off')

    plt.tight_layout()
    plt.show()
else:
    print("No failure cases to visualize!")

## 10. Summary

Print final summary of evaluation results.

In [None]:
print("\n" + "="*70)
print("EVALUATION SUMMARY")
print("="*70)

print(f"\nEvaluated on: {NUM_EVAL_SAMPLES} samples")

print(f"\n{'='*70}")
print(f"{'Model':<30} {'Mean IoU':>10} {'Det Rate':>10} {'IoU>0.5':>10} {'IoU>0.7':>10}")
print(f"{'='*70}")

if metrics_base:
    print(f"{'Base (No Fine-tuning)':<30} {metrics_base['mean_iou']:>10.4f} {metrics_base['detection_rate']:>10.2%} {metrics_base['iou_threshold_0.5']:>10.2%} {metrics_base['iou_threshold_0.7']:>10.2%}")

print(f"{'v1 (All Tokens)':<30} {metrics_v1['mean_iou']:>10.4f} {metrics_v1['detection_rate']:>10.2%} {metrics_v1['iou_threshold_0.5']:>10.2%} {metrics_v1['iou_threshold_0.7']:>10.2%}")
print(f"{'v3 (Assistant-Only)':<30} {metrics_v3['mean_iou']:>10.4f} {metrics_v3['detection_rate']:>10.2%} {metrics_v3['iou_threshold_0.5']:>10.2%} {metrics_v3['iou_threshold_0.7']:>10.2%}")

print(f"{'='*70}")

# Determine winner
v1_better = metrics_v1['mean_iou'] > metrics_v3['mean_iou']
winner = "v1 (All Tokens)" if v1_better else "v3 (Assistant-Only)"
diff = abs(metrics_v1['mean_iou'] - metrics_v3['mean_iou'])

print(f"\nBest Model: {winner}")
print(f"  Margin: {diff:.4f} IoU difference")

if metrics_base:
    best_iou = max(metrics_v1['mean_iou'], metrics_v3['mean_iou'])
    improvement = best_iou - metrics_base['mean_iou']
    print(f"\nImprovement over Base Model:")
    print(f"  Mean IoU: +{improvement:.4f} ({100*improvement/max(metrics_base['mean_iou'], 0.001):.1f}%)")

print("\n" + "="*70)

## Next Steps

Based on these results, you might consider:

1. **If IoU is low**:
   - Train for more epochs
   - Adjust learning rate
   - Check data quality

2. **If detection rate is low**:
   - Model might not be learning the task format
   - Check collator is working correctly

3. **If both are good**:
   - Merge LoRA weights for faster inference
   - Deploy with vLLM or similar