# YoloGen Demo

**YOLO Object Detection + VLM Description Pipeline**

This notebook demonstrates how to use trained models for inference and visualization.

## 1. Setup

In [None]:
import sys
sys.path.insert(0, '..')

from pathlib import Path
import cv2
import numpy as np
import json
import matplotlib.pyplot as plt
from PIL import Image, ImageDraw, ImageFont

## 2. YOLO Detection Only

In [None]:
from yologen.core.predictor import YOLOPredictor

# Point the detector at your own trained checkpoint before running.
yolo = YOLOPredictor(weights="../runs/your_experiment/yolo/weights/best.pt")

# Detect objects in a single image (replace the placeholder path).
results = yolo.predict("path/to/your/image.jpg")

# Print one summary line per detection in the first result entry.
first_result = results[0]
for det in first_result['detections']:
    summary = f"Class: {det['class_name']}, Conf: {det['confidence']:.2f}, BBox: {det['bbox']}"
    print(summary)

## 3. YOLO + VLM (Unified Predictor)

In [None]:
from yologen.core.predictor import UnifiedPredictor

# Build the combined predictor (YOLO + VLM).
# Both paths are placeholders: point them at your own trained artifacts.
yolo_weights_path = "../runs/your_experiment/yolo/weights/best.pt"
vlm_adapter_path = "../runs/your_experiment/vlm/best"

predictor = UnifiedPredictor(
    yolo_weights=yolo_weights_path,
    vlm_adapter=vlm_adapter_path,
    vlm_precision="4bit",
)

In [None]:
# Run prediction with VLM description
img_path = "path/to/your/image.jpg"  # Replace with your image path

results = predictor.predict(
    source=img_path,
    vlm_question="What is in the red marked area?",
)
# Detections of the first result entry; reused by the text and image output below
detections = results[0]['detections']

# Display results as text
for det in detections:
    print(f"\n[{det['class_name']}] conf={det['confidence']:.2f}")
    print(f"  BBox: {det['bbox']}")
    print(f"  VLM: {det['vlm_answer']}")

# Quick visualization
# cv2.imread returns None (it does not raise) when the file is missing or
# unreadable, so fail here with a clear message instead of inside cvtColor.
img = cv2.imread(img_path)
if img is None:
    raise FileNotFoundError(f"Could not read image: {img_path}")
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

for det in detections:
    x1, y1, x2, y2 = [int(v) for v in det['bbox']]
    cv2.rectangle(img, (x1, y1), (x2, y2), (255, 0, 0), 3)
    # Keep the label on the canvas when the box touches the top edge
    label_y = max(y1 - 10, 20)
    cv2.putText(img, det['class_name'], (x1, label_y), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 0, 0), 2)

plt.figure(figsize=(12, 8))
plt.imshow(img)
plt.axis('off')

# Show VLM answer below image (only the first detection's answer is shown)
vlm_answer = detections[0]['vlm_answer'] if detections else "No detection"
plt.figtext(0.5, 0.02, f"VLM: {vlm_answer}", ha='center', fontsize=11,
            wrap=True, bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8))
plt.title("YOLO + VLM Prediction", fontsize=12)
plt.tight_layout()
plt.show()

## 4. Visualize Predictions

Display the image with detection boxes and VLM descriptions below.

In [None]:
def visualize_with_vlm(image_path, detections, box_color=(255, 0, 0), box_thickness=3):
    """
    Visualize detections with bounding boxes and VLM descriptions.

    The image is drawn in the upper part of the figure with numbered boxes;
    the matching numbered VLM answers are listed in a text panel below it.

    Args:
        image_path: Path to image file (str or Path)
        detections: List of detection dicts with 'bbox', 'class_name' and
            'confidence'; 'vlm_answer' is shown when present
        box_color: RGB tuple for box color (default: red)
        box_thickness: Line thickness for boxes

    Returns:
        The matplotlib figure that was shown.

    Raises:
        FileNotFoundError: If the image cannot be read from image_path.
    """
    # Load image. cv2.imread returns None (it does not raise) on a missing or
    # unreadable file, so check explicitly before converting colors.
    img = cv2.imread(str(image_path))
    if img is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # Draw boxes and labels
    for i, det in enumerate(detections):
        x1, y1, x2, y2 = [int(v) for v in det['bbox']]

        # Draw bounding box
        cv2.rectangle(img, (x1, y1), (x2, y2), box_color, box_thickness)

        # Draw label with a 1-based index that matches the text panel below.
        # Clamp the baseline so the label stays on the canvas for boxes at the top edge.
        label = f"[{i+1}] {det['class_name']}: {det['confidence']:.2f}"
        label_y = max(y1 - 10, 20)
        cv2.putText(img, label, (x1, label_y), cv2.FONT_HERSHEY_SIMPLEX, 0.6, box_color, 2)

    # Create figure with image on top, VLM results below
    fig = plt.figure(figsize=(14, 10))

    # Image axes: upper part of the figure
    ax_img = fig.add_axes([0.05, 0.30, 0.9, 0.65])
    ax_img.imshow(img)
    ax_img.axis('off')
    ax_img.set_title('YOLO Detection + VLM Description', fontsize=14, fontweight='bold')

    # Text axes: lower part of the figure
    ax_text = fig.add_axes([0.05, 0.02, 0.9, 0.25])
    ax_text.axis('off')

    # Build VLM results text, one numbered entry per detection
    text_parts = ["VLM Descriptions:\n" + "=" * 50 + "\n\n"]
    for i, det in enumerate(detections):
        # Detections without a VLM answer are shown with a placeholder
        answer = det.get('vlm_answer', 'N/A')
        text_parts.append(f"[{i+1}] {det['class_name']} (conf: {det['confidence']:.2f})\n")
        text_parts.append(f"    → {answer}\n\n")
    vlm_text = "".join(text_parts)

    ax_text.text(0, 1, vlm_text, transform=ax_text.transAxes, fontsize=11,
                 verticalalignment='top', fontfamily='monospace',
                 bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

    plt.show()
    return fig

In [None]:
# Render the current prediction's detections with red boxes
demo_detections = results[0]['detections']
fig = visualize_with_vlm(
    img_path,
    demo_detections,
    box_color=(255, 0, 0),  # Red (RGB)
    box_thickness=3,
)

## 5. Load Settings from Training Config

For consistent results, load box color and thickness from the training config.

In [None]:
def load_vlm_config(vlm_adapter_path):
    """
    Load VLM settings from training config.json.

    Looks for config.json in the adapter directory first, then in its parent
    directory, and uses the first one found.

    Args:
        vlm_adapter_path: Path (str or Path) to the VLM adapter directory

    Returns dict with: box_thickness, box_color (RGB), system_prompt, class_names.
    Keys missing from the file fall back to: 3, (255, 0, 0), None, [].
    The same defaults are returned when no config.json is found.

    Raises:
        json.JSONDecodeError: If a config.json is found but is not valid JSON.
    """
    default_color = (255, 0, 0)
    adapter_path = Path(vlm_adapter_path)

    # Search for config.json in adapter directory or parent
    search_paths = [
        adapter_path / "config.json",
        adapter_path.parent / "config.json",
    ]

    for config_path in search_paths:
        # is_file() rather than exists(): a directory named config.json is skipped
        if not config_path.is_file():
            continue
        # Explicit UTF-8 so a non-ASCII system_prompt is decoded the same way
        # regardless of the platform's default encoding.
        with open(config_path, encoding="utf-8") as f:
            config = json.load(f)
        return {
            'box_thickness': config.get('box_thickness', 3),
            # A null/empty box_color falls back to the default instead of crashing tuple()
            'box_color': tuple(config.get('box_color') or default_color),  # RGB
            'system_prompt': config.get('system_prompt', None),
            'class_names': config.get('class_names', []),
        }

    # Default values if config not found
    return {'box_thickness': 3, 'box_color': default_color, 'system_prompt': None, 'class_names': []}


# Read the settings stored with the trained VLM adapter.
# Replace the path with your own adapter directory.
vlm_config = load_vlm_config("../runs/your_experiment/vlm/best")

# Report only whether a system prompt is set, not its full text
has_system_prompt = 'Yes' if vlm_config['system_prompt'] else 'No'
summary_lines = [
    "Loaded VLM Config:",
    f"  box_thickness: {vlm_config['box_thickness']}",
    f"  box_color: {vlm_config['box_color']} (RGB)",
    f"  system_prompt: {has_system_prompt}",
    f"  class_names: {vlm_config['class_names']}",
]
print("\n".join(summary_lines))

In [None]:
# ------------------------------------------------------------
# OPTIONAL: override the loaded settings
# ------------------------------------------------------------
# Uncomment and edit any of these lines to replace values from the training config:
#
# vlm_config['box_color'] = (0, 255, 0)    # Green
# vlm_config['box_color'] = (0, 0, 255)    # Blue
# vlm_config['box_thickness'] = 5          # Thicker lines

# Draw the detections using the loaded (or overridden) settings
config_color = vlm_config['box_color']
config_thickness = vlm_config['box_thickness']
fig = visualize_with_vlm(
    image_path=img_path,
    detections=results[0]['detections'],
    box_color=config_color,
    box_thickness=config_thickness,
)

## 6. Custom Questions

Ask different questions about the detected objects.

In [None]:
# Example questions to ask the VLM
questions = [
    "What is in the red marked area?",
    "Describe the object in the red box.",
    "What color is the vehicle?",
    "Is this a truck or a car?",
]

print("Asking different questions about the same detection:\n")
for q in questions:
    # Use a dedicated name so the `results` read by the visualization cells
    # is not replaced by the answer to the last question asked here.
    question_results = predictor.predict(
        source=img_path,
        vlm_question=q,
    )
    question_detections = question_results[0]['detections']
    # Only the first detection's answer is reported
    answer = question_detections[0]['vlm_answer'] if question_detections else "No detection"
    print(f"Q: {q}")
    print(f"A: {answer}\n")

## 7. Batch Processing

Process multiple images from a directory.

In [None]:
# Process multiple images from a directory
# Replace with your image directory
image_dir = Path("../data/your_dataset/images/val")
# Sort before slicing so the same files are picked on every run
images = sorted(image_dir.glob("*.jpg"))[:5]  # First 5 images

print(f"Processing {len(images)} images...\n")

# The loop uses its own names so the demo `img_path` and `results` used by
# other cells are not overwritten (and `img_path` is not rebound to a Path).
for batch_img_path in images:
    batch_results = predictor.predict(
        source=str(batch_img_path),
        vlm_question="What is in the red marked area?",
    )

    print(f"{batch_img_path.name}:")
    for det in batch_results[0]['detections']:
        answer = det['vlm_answer']
        # Truncate long answers to 60 characters for a compact listing
        vlm_short = (answer[:60] + "...") if len(answer) > 60 else answer
        print(f"  [{det['class_name']}] {vlm_short}")
    print()

## 8. Save Results

Save predictions with visualizations and VLM descriptions.

In [None]:
# Save results by passing save=True to predict()
# Per the messages printed below, this is expected to write an annotated image
# and a .txt file with the VLM answers into save_dir.
save_dir = "./output"

results = predictor.predict(
    # str() keeps the argument type consistent with the other predict() calls,
    # since the batch-processing cell may have rebound img_path to a Path object
    source=str(img_path),
    vlm_question="What is in the red marked area?",
    save=True,
    save_dir=save_dir,
)

print(f"Results saved to {save_dir}/")
print("  - *_result.jpg: Annotated image")
print("  - *_result.txt: VLM descriptions")

## 9. Training Your Own Model

```bash
# Train YOLO + VLM with single command
python train.py --config configs/car_detection.yaml

# Train YOLO only
python train.py --data data/your_dataset/dataset.yaml --epochs 100

# Skip YOLO, train VLM only (if YOLO already trained)
python train.py --config configs/car_detection.yaml --skip-yolo
```