In [None]:
import os, sys
import time

import cv2
import numpy as np
import yaml
from PIL.Image import Image as PILImage

# sys.path.append("/home/alicranck/almog/projects/vision-tools/vision_tools")

from vision_tools.core.tools.detection import OpenVocabularyDetector
from vision_tools.core.tools.captioning import Captioner
from vision_tools.core.tools.embedder import CLIPEmbedder, JinaEmbedder
from vision_tools.core.tools.base_tool import BaseVisionTool
from vision_tools.engine.video_engine import VideoInferenceEngine
from vision_tools.core.tools.pipeline import VisionPipeline, PipelineConfig
from vision_tools.utils.image_utils import base64_encode


## Test tools

### Helpers

In [None]:
# Remote sample clip that `time_run` opens with cv2.VideoCapture to feed frames to a tool.
demo_video_url = "https://cdn.pixabay.com/video/2020/11/13/56310-479197605_large.mp4"


def time_run(tool: BaseVisionTool, n_frames: int, warmup: int = 2):
    """Measure the average per-frame latency of ``tool.process`` on the demo video.

    Frames are read from ``demo_video_url`` and passed one by one to
    ``tool.process(frame, {})``; only the call itself is timed, not the decoding.
    The result is printed, nothing is returned.

    Args:
        tool: Object exposing ``process(frame, params)``.
        n_frames: Maximum number of frames to process.
        warmup: Number of leading measurements excluded from the average
            (the first calls typically include one-off initialisation cost).
            If no measurements remain after excluding them, all measurements
            are averaged instead.
    """
    cap = cv2.VideoCapture(demo_video_url)
    times = []

    try:
        while cap.isOpened() and len(times) < n_frames:
            ret, frame = cap.read()
            if not ret:
                # End of stream or read failure.
                break

            # perf_counter is monotonic and high-resolution, unlike time.time().
            start_time = time.perf_counter()
            tool.process(frame, {})
            times.append(time.perf_counter() - start_time)
    finally:
        # Always free the capture handle, even if the tool raises mid-run.
        cap.release()

    if not times:
        print("No frames were processed")
        return

    # Fall back to every measurement when the run was too short to discard warm-up calls.
    measured = times[warmup:] or times
    print(f"Average time: {sum(measured) / len(measured)}")
    

### Detector / Segmentor

In [None]:
# Load the open-vocabulary detection config from disk, override its vocabulary,
# and build the detector from the model named in that config.
cfg_path = "/home/alicranck/almog/projects/vision-tools/vision_tools/core/configs/ov_detection.yaml"
with open(cfg_path, 'r') as cfg_file:
    cfg = yaml.safe_load(cfg_file)

# Restrict detection to the classes of interest for this experiment.
cfg["vocabulary"] = ["person", "car", "bus"]

model_name = cfg['model']
detector = OpenVocabularyDetector(model_name, cfg)

In [None]:
# Display the class names exposed by the detector's underlying model
# (presumably the vocabulary configured above — confirm).
detector.model.names

### Captioner

In [None]:
# Build a captioner from a GGUF model identifier with a config that sets "imgsz" to 480.
# NOTE(review): `LlamaCppCaptioner` is not imported in any visible cell — the first cell
# imports only `Captioner` from vision_tools.core.tools.captioning — so this raises
# NameError on a fresh kernel. Confirm the intended class and import it.
captioner = LlamaCppCaptioner("ggml-org/SmolVLM2-256M-Video-Instruct-GGUF:Q8_0", {"imgsz": 480})

In [None]:
# Time the captioner on up to 10 frames of the demo video; prints the average latency.
time_run(captioner, 10)

In [None]:
def extrapolate_box(boxes: list) -> list:
    """Predict the next bounding box by extrapolating the motion of previous ones.

    The frame-to-frame deltas between consecutive boxes are averaged with
    linearly increasing weights (1 for the oldest delta, ``n`` for the most
    recent one) and the result is added to the last box.

    Args:
        boxes: Chronologically ordered objects whose ``xyxy`` attribute supports
            ``.cpu().tolist()``. Each ``xyxy`` is presumably a single-row
            ``[[x1, y1, x2, y2]]`` tensor — confirm against the detector output.

    Returns:
        The first row of the extrapolated box, as a plain list of floats.
        With a single input box there is no motion to extrapolate, so that
        box is returned unchanged.

    Raises:
        ValueError: If ``boxes`` is empty.
    """
    if not boxes:
        raise ValueError("extrapolate_box requires at least one box")

    xyxy_boxes = np.array([box.xyxy.cpu().tolist() for box in boxes])
    if len(xyxy_boxes) < 2:
        return xyxy_boxes[-1].tolist()[0]

    diffs = np.diff(xyxy_boxes, axis=0)
    # Weights start at 1, not 0: zero-based weights discard the oldest delta and
    # sum to zero when there is only one delta, leaving no usable average.
    weights = np.arange(1, len(diffs) + 1)
    mean_diff = np.average(diffs, axis=0, weights=weights)

    next_xyxy_box = xyxy_boxes[-1] + mean_diff
    return next_xyxy_box.tolist()[0]

### Embedding

In [None]:
from vision_tools.core.tools.embedder import SigLIP2Embedder


# Instantiate the SigLIP2 embedder for the given model id, on CPU, with an empty config.
embedder = SigLIP2Embedder(model_id="google/siglip2-base-patch16-384", config={}, device="cpu")

In [None]:
# Embed a local image through the embedder tool and keep the 'embedding' field of the first result.
# NOTE(review): `process` receives a file path and three positional arguments here, whereas
# `time_run` calls `tool.process(frame, {})` with a decoded frame and two arguments —
# confirm the embedder's `process` signature and accepted input types.
image_path = "/home/alicranck/Downloads/download.jpeg"
image_embedding = embedder.process(image_path, {}, None)[0]['embedding']

In [None]:
# Encode a text prompt with the same embedder and keep the first returned item.
text_embedding = embedder.encode_text("a red car")[0]

In [None]:
from sklearn.metrics.pairwise import euclidean_distances, cosine_similarity
import numpy as np


# Cosine similarity between the image and text embeddings; each one is reshaped to a
# single-row 2-D array, which is the input shape the pairwise function works on.
cosine_similarity(np.array(image_embedding).reshape(1, -1), np.array(text_embedding).reshape(1, -1))

In [None]:
# Compute image embeddings by calling a model/processor pair directly rather than via `embedder`.
# NOTE(review): `Image`, `processor`, `model` and `torch` are not defined or imported in any
# visible cell (the first cell imports only the PIL `Image` class under the alias `PILImage`),
# so this cell fails on a fresh kernel — confirm where these names are meant to come from.
image_path = "/home/alicranck/Downloads/download.jpeg"
image = Image.open(image_path).convert("RGB")

inputs = processor(images=[image], return_tensors="pt").to(model.device)
with torch.no_grad():
    image_features = model.get_image_features(**inputs)
    # L2-normalise along the last dimension.
    image_features = image_features / image_features.norm(p=2, dim=-1, keepdim=True)
    image_embeddings = image_features.cpu().numpy()

In [None]:
# Compute text embeddings for a handful of candidate captions with the same model.
# NOTE(review): `tokenizer`, `model` and `torch` are not defined or imported in any visible
# cell, so this cell fails on a fresh kernel — confirm where they are meant to come from.
texts = ["a green field", "a red car", "a black cat on a red couch", "a potato", "black shoes"]
# Every prompt is padded to a fixed length of 64 tokens
# (presumably what the model expects — confirm against its documentation).
tokens = tokenizer(texts, padding="max_length",
                    max_length=64, return_tensors="pt").to(model.device)
with torch.no_grad():
    text_features = model.get_text_features(**tokens)

# L2-normalise along the last dimension, then move to CPU as a NumPy array.
text_features = text_features / text_features.norm(p=2, dim=-1, keepdim=True)
text_embeddings = text_features.cpu().numpy()

In [None]:
from sklearn.metrics.pairwise import euclidean_distances, cosine_similarity
import numpy as np


# Score the image embedding(s) against every candidate caption.
similarities = cosine_similarity(image_embeddings, text_embeddings)

# Report the scores of the first image row, one caption per line.
print("Semantic Search Results (Cosine Similarity):")
first_image_scores = similarities[0]
for idx, caption in enumerate(texts):
    print(f"Score: {first_image_scores[idx]:.4f} | Text: {caption}")

## Video engine

In [None]:
# Per-tool settings: both tools use a "stride" trigger with value 150
# (presumably a frame interval — confirm against the pipeline implementation).
tool_settings = {
    "embedding": {
        "trigger": dict(type="stride", value=150),
    },
    "ov_detection": {
        "vocabulary": ["person", "car", "dog", "cat", "chair"],
        "trigger": dict(type="stride", value=150),
    },
}
pipeline_config = PipelineConfig(tool_settings=tool_settings)

# Wire the pipeline into a video engine reading from a local file.
pipeline = VisionPipeline(pipeline_config)
engine = VideoInferenceEngine(pipeline, "/home/alicranck/Downloads/הכנסה.mp4")

In [None]:
async def _persist_data(data):
    tools_run = data['tools_run']
    if not tools_run:
        return
    
    timestamp = data['metadata']['timestamp']
    metadata = {
        "timestamp": timestamp,
        "data": data["embedding"]
    }

    print(metadata)
    


# Run engine
async for _ in engine.run_inference(
    on_data=_persist_data, 
    buffer_delay=0, 
    realtime=False
):
    pass

# Cleanup
pipeline.unload_tools()
logger.info(f"Finished indexing {video_id}")