In [None]:
import os, sys
import time
import yaml
from PIL.Image import Image as PILImage
import cv2

sys.path.append("/home/almog_elharar/almog/Flash-Findr")

from app.ml_core.tools.detection import OpenVocabularyDetector
from app.ml_core.tools.captioning import LlamaCppCaptioner
from app.ml_core.tools.base_tool import BaseVisionTool
from app.api.engine import VideoInferenceEngine
from app.utils.image_utils import base64_encode


## Test tools

### Helpers

In [None]:
# Remote sample clip that time_run() opens with cv2.VideoCapture as its frame source.
demo_video_url = "https://cdn.pixabay.com/video/2020/11/13/56310-479197605_large.mp4"


def time_run(tool: BaseVisionTool, n_frames: int, warmup: int = 2):
    """Time ``tool.process`` on the first frames of the demo video.

    Opens ``demo_video_url``, feeds up to ``n_frames`` frames to
    ``tool.process(frame, {})`` one at a time and prints the mean wall-clock
    duration of a call, in seconds.

    Args:
        tool: Vision tool whose ``process`` method is timed.
        n_frames: Maximum number of frames to read and process.
        warmup: Number of leading measurements left out of the average, so
            that one-off start-up cost in the first calls does not skew the
            result. If no more than ``warmup`` frames were processed, all
            measurements are averaged instead of dividing by zero.

    Returns:
        None. The result is printed.
    """
    cap = cv2.VideoCapture(demo_video_url)
    times = []
    try:
        while cap.isOpened() and len(times) < n_frames:
            ret, frame = cap.read()
            if not ret:
                # No frame could be read (end of stream or read failure).
                break

            # perf_counter is monotonic and high-resolution, unlike time.time().
            start_time = time.perf_counter()
            tool.process(frame, {})
            times.append(time.perf_counter() - start_time)
    finally:
        # Always free the capture handle, even if tool.process raises.
        cap.release()

    if not times:
        print("Average time: n/a (no frames were processed)")
        return

    # Drop the warm-up measurements; fall back to everything if nothing is left.
    measured = times[max(warmup, 0):] or times
    print(f"Average time: {sum(measured) / len(measured)}")
    

### Detector / Segmentor

In [None]:
# Load the open-vocabulary detection config, narrow its vocabulary to a few
# test classes, and build the detector from it.
cfg_path = "/home/almog_elharar/almog/Flash-Findr/app/ml_core/configs/ov_detection.yaml"
with open(cfg_path, mode="r") as cfg_file:
    cfg = yaml.safe_load(cfg_file)

cfg.update({"vocabulary": ["person", "car", "bus"]})

detector = OpenVocabularyDetector(cfg["model"], cfg)

In [None]:
# Inspect the `names` attribute of the detector's underlying model
# (presumably the class labels it reports — confirm it reflects cfg["vocabulary"]).
detector.model.names

### Captioner

In [None]:
# Build the captioner from a model identifier string and an options dict (imgsz=480).
# NOTE(review): the identifier looks like a "<repo>:<quantization>" GGUF spec — confirm.
captioner = LlamaCppCaptioner("ggml-org/SmolVLM2-256M-Video-Instruct-GGUF:Q8_0", {"imgsz": 480})

In [None]:
# Run the captioner once on a local sample image.
image_path = '/home/almog_elharar/almog/Flash-Findr/app/ferrari-e-suv-2-copy-680287cac36b2.jpg'
frame = cv2.imread(image_path)
if frame is None:
    # cv2.imread does not raise on a missing/unreadable file; it returns None.
    raise FileNotFoundError(f"Could not read image: {image_path}")
ret = captioner.process(frame, {})

In [None]:
# Display what captioner.process returned for the sample image.
ret

In [None]:
import base64
import cv2

# Build a chat-completions style payload that carries the sample image as a
# base64-encoded JPEG data URL.
image_path = '/home/almog_elharar/almog/Flash-Findr/app/ferrari-e-suv-2-copy-680287cac36b2.jpg'
max_side = 480  # longest side sent to the model; same value as the captioner's imgsz option

frame = cv2.imread(image_path)
if frame is None:
    # cv2.imread does not raise on a missing/unreadable file; it returns None.
    raise FileNotFoundError(f"Could not read image: {image_path}")

# Downscale so the longest side is at most max_side, keeping the aspect ratio.
h, w = frame.shape[:2]
if max(h, w) > max_side:
    scale = max_side / max(h, w)
    new_w = max(1, int(w * scale))
    new_h = max(1, int(h * scale))
    frame = cv2.resize(frame, (new_w, new_h))

# Encode the (possibly downscaled) frame itself. Previously the untouched file
# on disk was encoded, so the resize above had no effect on what was sent.
ok, jpeg_buffer = cv2.imencode(".jpg", frame)
if not ok:
    raise RuntimeError(f"JPEG encoding failed for image: {image_path}")
encoded_string = base64.b64encode(jpeg_buffer.tobytes()).decode('utf-8')

# Earlier prompt-style payload, kept for reference:
# payload = {
#     "model": "smolvlm2",
#     "prompt": f"USER:[img-0]Describe the image\nASSISTANT:",
#     "image_data": [{"data": encoded_string, "id": 0}],
#     "n_predict": 128,
# }

payload = {
    "messages": [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Describe the image",
                },
                {
                    # The data URL must be nested under "image_url", matching
                    # the other request bodies in this notebook.
                    "type": "image_url",
                    "image_url": {"url": "data:image/jpeg;base64," + encoded_string},
                },
            ],
        }
    ],
    "stream": False,
}

In [None]:
# Send the request through the captioner's own inference() method.
# NOTE(review): `payload` already has a top-level "messages" key, so this passes
# {"messages": {"messages": [...], "stream": False}}. Confirm whether inference()
# expects the payload dict itself, i.e. captioner.inference(payload).
data = captioner.inference({"messages":payload})

In [None]:
# Reference chat-completions request body with the full set of sampling options.
# It was pasted as raw JSON, whose `true` literals raise NameError in Python;
# they are written as `True` here so the cell runs.
# NOTE(review): appears to be a request captured from a chat UI — confirm origin.
reference_request = {
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "describe the image"},
                {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,"}},
            ],
        }
    ],
    "stream": True,
    "reasoning_format": "auto",
    "temperature": 0.8,
    "max_tokens": -1,
    "dynatemp_range": 0,
    "dynatemp_exponent": 1,
    "top_k": 40,
    "top_p": 0.95,
    "min_p": 0.05,
    "xtc_probability": 0,
    "xtc_threshold": 0.1,
    "typ_p": 1,
    "repeat_last_n": 64,
    "repeat_penalty": 1,
    "presence_penalty": 0,
    "frequency_penalty": 0,
    "dry_multiplier": 0,
    "dry_base": 1.75,
    "dry_allowed_length": 2,
    "dry_penalty_last_n": 8192,
    "samplers": ["penalties", "dry", "top_n_sigma", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"],
    "timings_per_token": True,
}
reference_request

In [None]:
import requests

# POST the image directly to the chat-completions endpoint of a server on
# localhost, bypassing the captioner wrapper.
# NOTE(review): the port is hard-coded; presumably it belongs to the server
# backing `captioner` — confirm it is still the right one for this session.
response = requests.post(
    "http://127.0.0.1:34481/v1/chat/completions",
    json={
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Describe the image"},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded_string}"}},
                ],
            },
        ]
    },
    # Without a timeout requests waits forever if the server never answers.
    timeout=120,
)

In [None]:
# Show the JSON-decoded body of the server's reply.
response.json()

In [None]:
# Tear down the captioner via its unload_tool() hook
# (presumably frees the model / backing server — confirm).
captioner.unload_tool()