# VLM Comparison: Base vs Fine-tuned

Compare your fine-tuned vision-language model (VLM) against the base model on the same test images to evaluate improvements.

## Usage
1. Set your model name and adapter path
2. Define your test prompt
3. Point to your test images folder
4. Run all cells

## 1. Configuration

In [None]:
# Model settings: the base checkpoint, the fine-tuned adapter compared against
# it, and the precision used when loading both models.
MODEL_NAME = "Qwen/Qwen2.5-VL-7B-Instruct"  # Base model name
ADAPTER_PATH = "/path/to/your/vlm/adapter"   # Fine-tuned adapter path (placeholder - set this before running)
PRECISION = "4bit"                           # "4bit" or "8bit" switch on the matching quantized load; any other value (e.g. "fp16") enables neither

In [None]:
# Test images folder
TEST_FOLDER = "test_images"  # Folder containing test images (with bbox drawn); subfolders are searched too

# Your evaluation prompt - customize for your use case.
# Replace the [YOUR_OBJECT] placeholder with the object you are checking for.
# The images should have red bounding boxes already drawn.
# The results summary buckets answers by looking for "yes" / "no" in the
# response, so keep the Yes/No answer format if you want meaningful counts.
PROMPT = """
Look at the red bounding box in the image.
Is there a [YOUR_OBJECT] inside the marked area?

Answer with only: Yes or No
""".strip()

# Optional: system prompt, passed to both the base and the fine-tuned model
# (set to None if not used during training)
SYSTEM_PROMPT = """
You are an object detection assistant.
When shown an image with a red bounding box, identify what is inside the marked area.
""".strip()

# Uncomment the line below if you didn't use a system prompt during training
# SYSTEM_PROMPT = None

In [None]:
# Find all test images
import fnmatch
import glob
import os

# Filename patterns treated as images. They are matched against the
# lower-cased file name, so "IMG_001.JPG" is found on case-sensitive
# filesystems as well.
image_extensions = ["*.jpg", "*.jpeg", "*.png", "*.webp"]


def _is_test_image(path):
    """Return True if *path* is a regular file whose name matches an image pattern."""
    name = os.path.basename(path).lower()
    return os.path.isfile(path) and any(
        fnmatch.fnmatchcase(name, pattern) for pattern in image_extensions
    )


# With recursive=True, "**" matches zero or more directories, so this single
# pattern covers TEST_FOLDER itself as well as every nested subfolder.
candidates = glob.glob(f"{TEST_FOLDER}/**/*", recursive=True)

# Sorted so both models process the images in the same, reproducible order.
test_images = sorted({path for path in candidates if _is_test_image(path)})

print(f"Found {len(test_images)} test images:")
for img in test_images[:10]:  # Show first 10
    print(f"  - {img}")
if len(test_images) > 10:
    print(f"  ... and {len(test_images) - 10} more")
if not test_images:
    # Make the empty case obvious instead of letting later cells run on nothing.
    print(f"  (no images found under '{TEST_FOLDER}' - check TEST_FOLDER)")

## 2. Load Base Model

In [None]:
from yologen.models.vlm.qwen import QwenVLM

# Constructor arguments for the plain base model: PRECISION selects at most
# one of the two quantization switches, and LoRA is turned off.
base_model_kwargs = {
    "model_name": MODEL_NAME,
    "load_in_4bit": PRECISION == "4bit",
    "load_in_8bit": PRECISION == "8bit",
    "use_lora": False,
}

print("Loading base model...")
base_vlm = QwenVLM(**base_model_kwargs)
base_vlm.load_model()
print("Base model ready!")

## 3. Run Base Model Inference

In [None]:
import os

# Run the base model over every test image. Results are stored in the same
# order as test_images so they can later be paired with the fine-tuned results.
base_results = []
total = len(test_images)

for idx, image_path in enumerate(test_images, start=1):
    # os.path.basename copes with the platform's path separator, unlike
    # splitting on a hard-coded "/".
    print(f"\r[Base] Processing {idx}/{total}: {os.path.basename(image_path)}", end="")

    response = base_vlm.generate(
        image=image_path,
        question=PROMPT,
        system_prompt=SYSTEM_PROMPT,
    )

    base_results.append({
        "image": image_path,
        "response": response.strip(),
    })

print(f"\nBase model: {len(base_results)} images processed")

## 4. Clear GPU Memory

In [None]:
import torch
import gc

# Free the base model before the fine-tuned model is loaded, so both do not
# have to fit in GPU memory at once.
# Drop the notebook's reference to the model first ...
del base_vlm
# ... then run the garbage collector so the now-unreferenced objects are freed ...
gc.collect()
# ... and finally ask PyTorch to release its cached, currently unused CUDA memory.
torch.cuda.empty_cache()
print("GPU memory cleared.")

## 5. Load Fine-tuned Model

In [None]:
# Same constructor arguments as the base model; the fine-tuned weights are
# attached afterwards through load_adapter().
finetuned_model_kwargs = {
    "model_name": MODEL_NAME,
    "load_in_4bit": PRECISION == "4bit",
    "load_in_8bit": PRECISION == "8bit",
    "use_lora": False,
}

print("Loading fine-tuned model...")
finetuned_vlm = QwenVLM(**finetuned_model_kwargs)
finetuned_vlm.load_model()
finetuned_vlm.load_adapter(ADAPTER_PATH)
print(f"Fine-tuned model ready! Adapter: {ADAPTER_PATH}")

## 6. Run Fine-tuned Model Inference

In [None]:
import os

# Run the fine-tuned model over the same images, in the same order, with the
# same prompt and system prompt as the base model run.
finetuned_results = []
total = len(test_images)

for idx, image_path in enumerate(test_images, start=1):
    # os.path.basename copes with the platform's path separator, unlike
    # splitting on a hard-coded "/".
    print(f"\r[Fine-tuned] Processing {idx}/{total}: {os.path.basename(image_path)}", end="")

    response = finetuned_vlm.generate(
        image=image_path,
        question=PROMPT,
        system_prompt=SYSTEM_PROMPT,
    )

    finetuned_results.append({
        "image": image_path,
        "response": response.strip(),
    })

print(f"\nFine-tuned model: {len(finetuned_results)} images processed")

## 7. Results Summary

In [None]:
from collections import Counter

# Count Yes/No responses
def count_responses(results):
    """Tally how many responses answer Yes, No, or something else.

    Each response is lower-cased and split into alphabetic words. It counts as
    "Yes" if the word "yes" appears, otherwise as "No" if the word "no"
    appears, otherwise as "Other". Matching whole words (rather than
    substrings) keeps answers such as "I don't know", "Unknown" or "cannot
    tell" from being counted as "No".

    Args:
        results: Iterable of dicts, each holding a "response" string.

    Returns:
        A Counter keyed by "Yes", "No" and "Other"; absent keys read as 0.
    """
    import re  # local import keeps this cell runnable on its own

    counts = Counter()
    for r in results:
        words = set(re.findall(r"[a-z]+", r["response"].lower()))
        # "yes" is checked first, so a response containing both words is a Yes.
        if "yes" in words:
            counts["Yes"] += 1
        elif "no" in words:
            counts["No"] += 1
        else:
            counts["Other"] += 1
    return counts

base_counts = count_responses(base_results)
ft_counts = count_responses(finetuned_results)

# Agreement rate: share of images where both models gave the same response
# text, ignoring case and surrounding whitespace.
agreements = sum(
    b["response"].lower().strip() == f["response"].lower().strip()
    for b, f in zip(base_results, finetuned_results)
)
# Guard the division so an empty test set reports 0% instead of raising
# ZeroDivisionError.
agreement_rate = agreements / len(test_images) * 100 if test_images else 0.0

print("=" * 50)
print("RESULTS SUMMARY")
print("=" * 50)
print(f"\nTotal images: {len(test_images)}")
print(f"\nBase Model:       Yes={base_counts['Yes']}, No={base_counts['No']}, Other={base_counts['Other']}")
print(f"Fine-tuned Model: Yes={ft_counts['Yes']}, No={ft_counts['No']}, Other={ft_counts['Other']}")
print(f"\nAgreement rate: {agreement_rate:.1f}% ({agreements}/{len(test_images)})")

## 8. Detailed Comparison Table

In [None]:
from html import escape
import os

from IPython.display import display, HTML

# Build the table as a list of fragments and join once at the end.
header_style = "padding:8px; border:1px solid #ddd;"
cell_style = "padding:6px; border:1px solid #ddd;"

parts = [
    "<table style='width:100%; border-collapse: collapse; font-size:12px;'>",
    "<tr style='background:#333; color:white;'>",
]
parts.extend(
    f"<th style='{header_style}'>{title}</th>"
    for title in ("#", "Image", "Base Model", "Fine-tuned", "Match")
)
parts.append("</tr>")

for i, (base, ft) in enumerate(zip(base_results, finetuned_results), start=1):
    # Model output and file names are arbitrary text: truncate first, then
    # escape, so characters like "<" or "&" cannot break the table markup.
    image_name = escape(os.path.basename(base["image"]))
    base_resp = escape(base["response"][:30])
    ft_resp = escape(ft["response"][:30])

    # A row matches when both responses are equal ignoring case and
    # surrounding whitespace.
    match = base["response"].lower().strip() == ft["response"].lower().strip()
    match_icon = "✓" if match else "✗"
    match_color = "#90EE90" if match else "#FFB6C1"

    parts.append("<tr>")
    parts.append(f"<td style='{cell_style}'>{i}</td>")
    parts.append(f"<td style='{cell_style}'>{image_name}</td>")
    parts.append(f"<td style='{cell_style}'>{base_resp}</td>")
    parts.append(f"<td style='{cell_style}'>{ft_resp}</td>")
    parts.append(
        f"<td style='{cell_style} background:{match_color}; text-align:center;'>{match_icon}</td>"
    )
    parts.append("</tr>")

parts.append("</table>")

html = "".join(parts)
display(HTML(html))

## 9. Visual Comparison

In [None]:
import math
import os
import re

import matplotlib.pyplot as plt
from PIL import Image


def get_color(resp):
    """Map a response to a label colour: Yes=green, No=red, Other=gray.

    Whole words are matched, so "I don't know" or "Unknown" stay gray instead
    of being coloured as a "No".
    """
    words = set(re.findall(r"[a-z]+", resp.lower()))
    if "yes" in words:
        return "green"
    if "no" in words:
        return "red"
    return "gray"


# Grid layout
n_images = min(len(test_images), 12)  # Show max 12 images
n_cols = 3
n_rows = math.ceil(n_images / n_cols)

if n_images == 0:
    # plt.subplots() cannot build a grid with zero rows.
    print("No test images to display - skipping visual comparison.")
else:
    # squeeze=False always yields a 2-D array of Axes, so flatten() works for
    # any image count (including a single image, which still gets a 1x3 grid).
    fig, axes = plt.subplots(
        n_rows, n_cols, figsize=(5 * n_cols, 5 * n_rows), squeeze=False
    )
    axes = axes.flatten()

    for i in range(n_images):
        base = base_results[i]
        ft = finetuned_results[i]
        ax = axes[i]

        # Best effort per image: a file that fails to open or draw shows the
        # error in its own panel instead of aborting the whole figure.
        try:
            img = Image.open(base["image"])
            ax.imshow(img)

            base_color = get_color(base["response"])
            ft_color = get_color(ft["response"])

            # Labels on image (axes-relative coordinates, top-left corner)
            ax.text(0.02, 0.98, f"Base: {base['response'][:15]}",
                    transform=ax.transAxes, fontsize=9, fontweight='bold',
                    color='white', backgroundcolor=base_color,
                    verticalalignment='top')
            ax.text(0.02, 0.88, f"FT: {ft['response'][:15]}",
                    transform=ax.transAxes, fontsize=9, fontweight='bold',
                    color='white', backgroundcolor=ft_color,
                    verticalalignment='top')

            ax.set_title(os.path.basename(base["image"]), fontsize=8)
            ax.axis('off')
        except Exception as e:
            ax.text(0.5, 0.5, f"Error: {e}", ha='center', va='center')
            ax.axis('off')

    # Hide empty subplots
    for unused_ax in axes[n_images:]:
        unused_ax.axis('off')

    plt.tight_layout()
    plt.savefig("comparison_results.png", dpi=150, bbox_inches='tight')
    plt.show()
    print("\nSaved to: comparison_results.png")

## 10. Export Results to CSV

In [None]:
import csv

output_file = "comparison_results.csv"

# newline="" lets the csv module control line endings. The explicit UTF-8
# encoding keeps non-ASCII model output from failing or being mangled when
# the platform's default encoding is something else.
with open(output_file, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Image", "Base_Response", "Finetuned_Response", "Match"])

    for base, ft in zip(base_results, finetuned_results):
        # Match = same response text, ignoring case and surrounding whitespace.
        match = base["response"].lower().strip() == ft["response"].lower().strip()
        writer.writerow([
            base["image"],
            base["response"],
            ft["response"],
            "Yes" if match else "No",
        ])

print(f"Results exported to: {output_file}")

## Cleanup

In [None]:
# Final cleanup: drop the notebook's reference to the fine-tuned model, run
# the garbage collector, and ask PyTorch to release its cached CUDA memory.
# Uses the `gc` and `torch` modules imported in the "Clear GPU Memory" cell.
del finetuned_vlm
gc.collect()
torch.cuda.empty_cache()
print("Done!")