In [None]:
import os, sys
import time

import cv2
import numpy as np
import yaml
from PIL.Image import Image as PILImage

# sys.path.append("/home/alicranck/almog/projects/vision-tools/vision_tools")

from vision_tools.core.tools.detection import OpenVocabularyDetector
from vision_tools.core.tools.captioning import Captioner
from vision_tools.core.tools.base_tool import BaseVisionTool
from vision_tools.engine.video_engine import VideoInferenceEngine
from vision_tools.utils.image_utils import base64_encode


## Test tools

### Helpers

In [None]:
# Remote sample clip that `time_run` opens with cv2.VideoCapture to obtain frames.
demo_video_url = "https://cdn.pixabay.com/video/2020/11/13/56310-479197605_large.mp4"


def time_run(tool: BaseVisionTool, n_frames: int, warmup_frames: int = 2):
    """Print the average per-frame latency of ``tool.process`` on the demo video.

    Frames are read from ``demo_video_url`` with OpenCV and handed one at a
    time to ``tool.process(frame, {})``. Only the ``process`` call is timed;
    frame decoding is excluded from the measurement.

    Args:
        tool: Tool whose ``process(frame, data)`` method is benchmarked.
        n_frames: Maximum number of frames to process.
        warmup_frames: Number of leading measurements left out of the average
            (presumably to hide one-off start-up cost of the first calls).
            If no measurements would remain, all of them are averaged instead.

    Raises:
        RuntimeError: If not a single frame could be processed (e.g. the video
            could not be opened or ``n_frames`` is not positive).
    """
    capture = cv2.VideoCapture(demo_video_url)
    timings = []
    try:
        while capture.isOpened() and len(timings) < n_frames:
            ok, frame = capture.read()
            if not ok:
                # No frame could be read: stop measuring.
                break

            # perf_counter is monotonic and high-resolution, unlike time.time().
            started = time.perf_counter()
            tool.process(frame, {})
            timings.append(time.perf_counter() - started)
    finally:
        # Always free the capture handle, even if tool.process raises.
        capture.release()

    if not timings:
        raise RuntimeError(
            f"No frames were processed from {demo_video_url!r}; cannot compute an average."
        )

    # Skip the warm-up measurements, but fall back to every measurement when
    # too few frames were timed (avoids dividing by zero).
    measured = timings[max(warmup_frames, 0):] or timings
    print(f"Average time: {sum(measured) / len(measured)}")
    

### Detector / Segmentor

In [None]:
# Read the open-vocabulary detection config, override its vocabulary entry,
# then construct the detector from the configured model and the full config.
# NOTE(review): cfg_path is an absolute, machine-specific path; it will only
# resolve on a machine with this exact directory layout.
cfg_path = "/home/alicranck/almog/projects/vision-tools/vision_tools/core/configs/ov_detection.yaml"
with open(cfg_path) as cfg_file:
    cfg = yaml.safe_load(cfg_file)

# Presumably the labels the detector should look for; verify against the tool.
cfg.update(vocabulary=["person", "car", "bus"])

detector = OpenVocabularyDetector(cfg["model"], cfg)

In [None]:
# Display the `names` attribute of the detector's underlying model
# (presumably the class-name mapping after the vocabulary override; TODO confirm).
detector.model.names

### Captioner

In [None]:
# Instantiate the captioner with a model identifier string and {"imgsz": 480}
# as its second argument.
# NOTE(review): `LlamaCppCaptioner` is not imported in the visible import cell
# (only `Captioner` from vision_tools.core.tools.captioning is). On a fresh
# kernel this line raises NameError unless the name is defined elsewhere;
# confirm which class is intended.
captioner = LlamaCppCaptioner("ggml-org/SmolVLM2-256M-Video-Instruct-GGUF:Q8_0", {"imgsz": 480})

In [None]:
# Benchmark the captioner on at most 10 frames of the demo video.
time_run(captioner, 10)

In [None]:
def extrapolate_box(boxes: list) -> list:
    """Predict the next box by linearly extrapolating a sequence of boxes.

    The step between consecutive boxes is averaged with linearly increasing
    weights (1 for the oldest step up to n for the newest), so recent motion
    counts more. That average step is added to the last box.

    Args:
        boxes: Chronologically ordered objects exposing ``box.xyxy.cpu().tolist()``.
            Assumes each ``xyxy`` has shape (1, 4), as the final ``[0]`` index
            implies. TODO confirm against the detector's box type.

    Returns:
        The extrapolated coordinates as a list of floats. With a single input
        box there is no motion to extrapolate, so that box is returned as is.

    Raises:
        ValueError: If ``boxes`` is empty.
    """
    xyxy_boxes = np.array([box.xyxy.cpu().tolist() for box in boxes], dtype=float)
    if xyxy_boxes.shape[0] == 0:
        raise ValueError("extrapolate_box requires at least one box")

    diffs = np.diff(xyxy_boxes, axis=0)
    if diffs.shape[0] == 0:
        # Only one observation: no step to average, keep the box where it is.
        return xyxy_boxes[-1].tolist()[0]

    # Weights start at 1 (not 0) so they never sum to zero; a zero sum made
    # the two-box case produce an undefined (masked) average.
    weights = np.arange(1, diffs.shape[0] + 1)
    mean_diff = np.average(diffs, axis=0, weights=weights)

    next_xyxy_box = xyxy_boxes[-1] + mean_diff
    return next_xyxy_box.tolist()[0]