# 06 - Florence-2 Visual Grounding Test

Test the Florence-2 vision-language model for:
- Scene captioning ("What is in front of me?")
- Visual grounding ("Where is my phone?")
- Object detection

All functionality is in `src/` modules - this notebook only calls functions.

In [None]:
import sys
sys.path.insert(0, '..')

import cv2
import numpy as np
from IPython.display import display, Image
import matplotlib.pyplot as plt

from src.config import Config, FlorenceConfig
from src.florence import FlorenceModel

## 1. Load Florence-2 Model

In [None]:
# Florence-2 settings used by the rest of this notebook.
florence_config = FlorenceConfig(
    enabled=True,
    device="cpu",  # switch to "cuda" when a GPU is available
    model="microsoft/Florence-2-base",  # alternative: Florence-2-large
)

# Instantiate the wrapper, then load it. The return value of load() is kept
# in `success` and echoed below so a failed load is visible in the output.
florence = FlorenceModel(florence_config)
print("Loading Florence-2 model (this may take a minute)...")
success = florence.load()
print(f"Model loaded: {success}")

## 2. Load Test Image

Use a captured frame from the Pi camera if one exists in `../data/captures/`; otherwise a simple synthetic test image is generated.

In [None]:
# Option 1: Load from captured frames
import glob

# glob returns matches in arbitrary order; sort so every run picks the same frame.
captured_frames = sorted(glob.glob('../data/captures/*.jpg'))

test_image = None
for frame_path in captured_frames:
    # cv2.imread reports an unreadable/corrupt file by returning None instead
    # of raising, so check the result before accepting the frame.
    test_image = cv2.imread(frame_path)
    if test_image is not None:
        test_image_path = frame_path
        print(f"Using captured frame: {test_image_path}")
        break
    print(f"Skipping unreadable frame: {frame_path}")

if test_image is None:
    # Option 2: Create a simple test image
    print("No readable captured frames found, creating test image")
    test_image = np.zeros((480, 640, 3), dtype=np.uint8)
    test_image[:] = (200, 200, 200)  # Gray background
    # Draw some shapes to simulate objects.
    # Colors are given in BGR order, because the image is converted with
    # COLOR_BGR2RGB before display (same convention as cv2.imread output).
    cv2.rectangle(test_image, (100, 100), (250, 350), (19, 69, 139), -1)  # Brown door
    cv2.rectangle(test_image, (400, 200), (550, 400), (150, 50, 50), -1)  # Blue chair
    cv2.circle(test_image, (300, 150), 30, (100, 100, 100), -1)  # Gray circle

# Display the image
plt.figure(figsize=(10, 7))
plt.imshow(cv2.cvtColor(test_image, cv2.COLOR_BGR2RGB))
plt.title("Test Image")
plt.axis('off')
plt.show()

print(f"Image shape: {test_image.shape}")

## 3. Scene Captioning

Answer: "What is in front of me?"

In [None]:
# Short caption first; the trailing blank line separates it from the
# detailed caption in the cell output.
print("Generating simple caption...")
simple_caption = florence.caption(test_image, detailed=False)
print(f"Simple Caption: {simple_caption}", end="\n\n")

# Same call with detailed=True for the longer description.
print("Generating detailed caption...")
detailed_caption = florence.caption(test_image, detailed=True)
print(f"Detailed Caption: {detailed_caption}")

## 4. Object Detection

In [None]:
# Run open-ended detection on the test image.
print("Running object detection...")
detections = florence.detect(test_image)

# List every detection with its phrase, bounding box and center.
print(f"\nDetected {len(detections)} objects:")
for detection in detections:
    print(f"  - {detection.phrase}: bbox={detection.bbox}, center={detection.center}")

# Overlay the detections and show the annotated image
# (converted from BGR to RGB for matplotlib).
result_img = florence.draw_results(test_image, detections)
annotated_rgb = cv2.cvtColor(result_img, cv2.COLOR_BGR2RGB)
plt.figure(figsize=(10, 7))
plt.imshow(annotated_rgb)
plt.title(f"Object Detection ({len(detections)} objects)")
plt.axis('off')
plt.show()

## 5. Visual Grounding (Object Search)

Answer: "Where is my phone?" / "Find the door"

In [None]:
# Phrases to look for in the test image, one grounding query per phrase.
search_targets = ["door", "chair", "table", "phone", "person"]

for target in search_targets:
    print(f"\nSearching for: '{target}'")
    results = florence.ground(test_image, target)

    # Nothing matched this phrase: report it and move on to the next target.
    if not results:
        print("  Not found")
        continue

    print(f"  Found {len(results)} match(es):")
    for match in results:
        print(f"    - {match.phrase}: bbox={match.bbox}, center={match.center}")

    # Draw all matches for this target on a copy of the image and show it.
    result_img = florence.draw_results(test_image, results)
    plt.figure(figsize=(8, 6))
    plt.imshow(cv2.cvtColor(result_img, cv2.COLOR_BGR2RGB))
    plt.title(f"Found: {target}")
    plt.axis('off')
    plt.show()

## 6. Test with Pipeline

Test the full interactive pipeline.

In [None]:
from src.config import Config
from src.pipeline import SmartAidPipeline

# Start from the default configuration and override the pipeline switches.
config = Config()

# Voice I/O is turned off for notebook testing; the vision components
# (Florence, YOLO-World, depth) are turned on.
pipeline_flags = {
    "use_voice_input": False,
    "use_voice_output": False,
    "use_florence": True,
    "use_yolo_world": True,
    "use_depth": True,
}
for flag_name, flag_value in pipeline_flags.items():
    setattr(config.pipeline, flag_name, flag_value)

# Reuse the Florence settings defined earlier in this notebook.
# NOTE(review): this shares only the config object, not the loaded model
# instance — confirm whether the pipeline loads its own copy.
config.florence = florence_config

print("Creating pipeline...")

In [None]:
# Create and load pipeline
# Build the pipeline from the `config` object assembled in the previous cell.
pipeline = SmartAidPipeline(config)
print("Loading pipeline components...")
# NOTE(review): presumably loads the components enabled by the use_* flags —
# confirm against src/pipeline.py.
pipeline.load()

In [None]:
# Natural-language questions sent to the pipeline, all against the same test image.
test_queries = [
    "What is in front of me?",
    "Where is the door?",
    "Find the chair",
    "What objects are here?",
    "How many chairs?",
]

# Three-line banner: rule, title, rule.
print("=" * 60, "Testing Interactive Queries", "=" * 60, sep="\n")

# Echo each query ("> ...") followed by the pipeline's answer ("< ...").
for query in test_queries:
    print(f"\n> {query}")
    response = pipeline.process_query(query, test_image)
    print(f"< {response}")

## 7. Performance Measurement

In [None]:
import time

# Use time.perf_counter() for interval timing: it is a monotonic,
# high-resolution clock, whereas time.time() follows the wall clock and can
# jump if the system time is adjusted during a measurement.

# Measure caption time
start = time.perf_counter()
_ = florence.caption(test_image, detailed=True)
caption_time = time.perf_counter() - start
print(f"Caption time: {caption_time:.2f}s")

# Measure detection time
start = time.perf_counter()
_ = florence.detect(test_image)
detect_time = time.perf_counter() - start
print(f"Detection time: {detect_time:.2f}s")

# Measure grounding time
start = time.perf_counter()
_ = florence.ground(test_image, "door")
ground_time = time.perf_counter() - start
print(f"Grounding time: {ground_time:.2f}s")

# Sum of the three single-run measurements above.
print(f"\nTotal estimated query time: {caption_time + detect_time + ground_time:.2f}s")

## 8. Summary

Florence-2 provides:
- **Captioning**: Describe scenes in natural language
- **Object Detection**: Find all objects in an image
- **Visual Grounding**: Locate specific objects by description

This enables conversational queries like:
- "What is in front of me?" → Uses captioning
- "Where is my phone?" → Uses visual grounding
- "What objects are here?" → Uses detection