# YoloGen Demo

Object Detection + VLM Description Pipeline

**3 Usage Modes:**
- **YOLO Only** - Object detection
- **VLM Only** - Image description (with or without bbox)
- **YOLO + VLM** - Detection + Description

## 1. Setup

In [None]:
import sys
# Put the parent directory first on the module search path so the
# `yologen` imports in the cells below can be resolved
# (presumably the notebook lives one level below the package root — confirm).
sys.path.insert(0, '..')

# Your model paths (update these)
# These are placeholders: point them at your own YOLO weights file, VLM
# adapter path and a test image before running the remaining cells.
YOLO_WEIGHTS = "../runs/your_experiment/yolo/weights/best.pt"
VLM_ADAPTER = "../runs/your_experiment/vlm/best"
TEST_IMAGE = "path/to/your/image.jpg"

## 2. YOLO Only

Just object detection, no VLM.

In [None]:
from yologen.core.predictor import YOLOPredictor

# Build the detector from the configured weights and run it on the test image.
yolo = YOLOPredictor(weights=YOLO_WEIGHTS)
results = yolo.predict(TEST_IMAGE)

# Print one line per detection in the first result entry:
# class name, confidence (two decimals) and the bounding box.
detections = results[0]['detections']
for det in detections:
    print(f"{det['class_name']}: {det['confidence']:.2f} | bbox: {det['bbox']}")

## 3. VLM Only

Image description without YOLO detection.

In [None]:
from yologen.core.predictor import VLMPredictor

# Create the `vlm` predictor that Cases A-C below reuse.
# Default: loads box_color and box_thickness from training config
vlm = VLMPredictor(vlm_adapter=VLM_ADAPTER, vlm_precision="4bit")

# Or override manually (use same as training for best results!)
# Uncomment the block below (and drop the call above) to set the box
# style explicitly instead of relying on the training config.
# vlm = VLMPredictor(
#     vlm_adapter=VLM_ADAPTER,
#     vlm_precision="4bit",
#     box_color=(255, 0, 0),   # RGB Red
#     box_thickness=3,
# )

In [None]:
# Case A: You provide bbox coordinates -> we draw the red box
# The question then refers to the region marked by that box.
answer = vlm.predict(
    image=TEST_IMAGE,
    bbox=[100, 100, 300, 300],  # [x1, y1, x2, y2] (example values — replace with your own region)
    question="What is in the red marked area?",
)
# Show whatever the predictor returned for this question.
print(f"Answer: {answer}")

In [None]:
# Case B: Image already has red box drawn -> just ask question
# NOTE: "image_with_box_already.jpg" looks like a placeholder filename —
# replace it with the path of your own pre-annotated image.
answer = vlm.predict(
    image="image_with_box_already.jpg",
    bbox=None,  # No bbox = don't draw
    question="What is in the red marked area?",
)
# Show whatever the predictor returned for this question.
print(f"Answer: {answer}")

In [None]:
# Case C: question about the whole image — no `bbox` argument is passed,
# so the question is not tied to any marked region.
answer = vlm.predict(image=TEST_IMAGE, question="What do you see in this image?")
print(f"Answer: {answer}")

## 4. YOLO + VLM (Unified)

Detection + Description in one call.

In [None]:
from yologen.core.predictor import UnifiedPredictor

# A single predictor wraps both models. box_color / box_thickness are not
# passed here, so (per the project's note) they are loaded from the training
# config; add box_color=(255, 0, 0) (RGB) and/or box_thickness=3 to override.
predictor = UnifiedPredictor(
    yolo_weights=YOLO_WEIGHTS, vlm_adapter=VLM_ADAPTER, vlm_precision="4bit"
)

# Run detection and ask the VLM the same question in one call.
results = predictor.predict(
    source=TEST_IMAGE, vlm_question="What is in the red marked area?"
)

# For every detection in the first result entry, print the class and
# confidence, followed by the VLM answer attached to that detection.
detections = results[0]['detections']
for det in detections:
    print(f"[{det['class_name']}] {det['confidence']:.2f}")
    print(f"  VLM: {det['vlm_answer']}\n")

## 5. Visualize

In [None]:
import cv2
import matplotlib.pyplot as plt

# Draw the detections held in `results` (produced by the most recently
# executed prediction cell) on top of the test image and display it.

img = cv2.imread(TEST_IMAGE)
# cv2.imread does not raise on a missing/unreadable file; it returns None,
# which would otherwise surface as an opaque error inside cvtColor.
if img is None:
    raise FileNotFoundError(f"Could not read image: {TEST_IMAGE}")

# OpenCV loads images as BGR; convert first so matplotlib shows the right
# colours and so that (255, 0, 0) below really is red.
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

BOX_COLOR = (255, 0, 0)  # red, in RGB order (image was converted above)
FONT = cv2.FONT_HERSHEY_SIMPLEX
FONT_SCALE = 0.7
TEXT_THICKNESS = 2

for det in results[0]['detections']:
    x1, y1, x2, y2 = [int(v) for v in det['bbox']]
    cv2.rectangle(img, (x1, y1), (x2, y2), BOX_COLOR, 3)

    label = det['class_name']
    (_, text_h), _ = cv2.getTextSize(label, FONT, FONT_SCALE, TEXT_THICKNESS)
    # Normally the label sits 10 px above the box. For boxes touching the top
    # of the image that position would be clipped, so put it just inside the
    # box instead.
    label_y = y1 - 10 if y1 - 10 >= text_h else y1 + text_h + 10
    cv2.putText(img, label, (x1, label_y), FONT, FONT_SCALE, BOX_COLOR, TEXT_THICKNESS)

plt.figure(figsize=(12, 8))
plt.imshow(img)
plt.axis('off')
plt.show()

---

## Advanced Options

In [None]:
# Save results to file
# Same call as in section 4, with saving switched on.
# NOTE(review): what exactly gets written under save_dir (annotated image,
# JSON, ...) is decided inside UnifiedPredictor — confirm against its docs.
results = predictor.predict(
    source=TEST_IMAGE,
    vlm_question="What is in the red marked area?",
    save=True,
    save_dir="./output",
)

In [None]:
# Batch processing
from pathlib import Path

# Run the predictor on every .jpg directly inside ./images and report how
# many detections each file produced.
# sorted(): glob yields entries in no guaranteed order, so sort the paths to
# make the processing order (and the printed log) reproducible between runs.
for img_path in sorted(Path("./images").glob("*.jpg")):
    # No vlm_question is passed here, so the predictor's own default applies.
    results = predictor.predict(source=str(img_path))
    print(f"{img_path.name}: {len(results[0]['detections'])} detections")

In [None]:
# Custom questions
# Ask several different questions about the same image, one prediction
# call per question.
questions = [
    "What is in the red marked area?",
    "Is this dangerous?",
    "Describe this object.",
]

for q in questions:
    results = predictor.predict(source=TEST_IMAGE, vlm_question=q)

    # Only the first detection's answer is reported; fall back to a fixed
    # message when nothing was detected.
    detections = results[0]['detections']
    if detections:
        answer = detections[0]['vlm_answer']
    else:
        answer = "No detection"

    print(f"Q: {q}")
    print(f"A: {answer}\n")