<img src="https://drive.google.com/uc?export=view&id=1JIIlkTWa2xbft5bTpzhGK1BxYL83bJNU" width="800"/>

In [None]:
from IPython.display import display, HTML
# Inject a CSS override so the notebook container spans the full browser width.
full_width_css = HTML("<style>.container { width:100% !important; }</style>")
display(full_width_css)

# 🔥 Video Search Demo
---

In this demo, we’ll use NOS to build an end-to-end semantic video search utility.

In [None]:
from nos.test.utils import get_benchmark_video
# Fetch the benchmark clip used by the rest of this notebook.
# NOTE(review): the return value of get_benchmark_video() is discarded and the
# file name below is hard-coded -- confirm the helper really places
# "test_video.mp4" in the current working directory.
get_benchmark_video()
FILENAME = "test_video.mp4"


In [None]:
from IPython.display import Video
# Preview the clip inline (640 px wide); as the cell's last expression it is
# rendered as the cell output.
Video(FILENAME, width=640)

#### Frame Inference

Let's embed the video with NOS. To keep the demo quick, we sample every 100th frame and compute one CLIP embedding per sampled frame.

In [None]:
from nos.common import tqdm
from nos.common.io.video.opencv import VideoReader
from nos.client import Client, TaskType

# Create the NOS client used for every inference call below.
client = Client()
# Presumably blocks until the inference server is reachable -- TODO confirm.
client.WaitForServer()
# NOTE(review): the result of IsHealthy() is discarded here; confirm whether it
# should be asserted or displayed.
client.IsHealthy()

# Open the video for reading (the reader is re-created in the next cell before embedding)
images = VideoReader(FILENAME)

In [None]:
from nos.common import tqdm
from nos.common.io.video.opencv import VideoReader
import torch
import numpy as np
from itertools import islice

# Only every FRAME_STRIDE-th frame of the video is embedded, so
# features[i] corresponds to video frame i * FRAME_STRIDE.
FRAME_STRIDE = 100

images = VideoReader(FILENAME)
features = []
for img in tqdm(islice(images, 0, None, FRAME_STRIDE), position=0, leave=True):
    # One remote CLIP call per sampled frame; keep only the embedding payload.
    response = client.Run(TaskType.IMAGE_EMBEDDING, "openai/clip", inputs={"images" : img})
    features.append(response['embedding'])

# Print a short summary instead of dumping every raw embedding into the cell output.
print(f"Embedded {len(features)} frames (1 of every {FRAME_STRIDE})")

# normalize embeddings to unit length along the last dimension
video_features = torch.from_numpy(np.stack(features))
video_features /= video_features.norm(dim=-1, keepdim=True)

In [None]:
from IPython.display import HTML, display
from nos.common.io import VideoReader
from PIL import Image

# Random-access reader used by search_video() to fetch the matching frames.
video = VideoReader(FILENAME)

def search_video(query: str, video_features: torch.Tensor, topk: int = 3, frame_stride: int = 100):
    """Show the video frames that best match a free-text query.

    The query is embedded with the "openai/clip" text model through the NOS
    client, compared against the pre-computed frame embeddings, and the
    best-scoring frames are displayed side by side.

    Args:
        query: Natural-language description of the scene to look for.
        video_features: Normalized frame embeddings, one entry per sampled
            frame along dim 0 (assumes the last dim matches the text
            embedding size -- TODO confirm).
        topk: Number of frames to display. Clamped to the number of
            embedded frames.
        frame_stride: Sampling step that was used when `video_features` was
            computed. Row i of `video_features` is taken to describe video
            frame i * frame_stride. The default of 100 matches the sampling
            step used by the embedding cell of this notebook.

    Returns:
        None. The result image is rendered with `display`.
    """
    # Encode text and normalize
    with torch.inference_mode():
        text_features = client.Run(TaskType.TEXT_EMBEDDING, "openai/clip", inputs={"texts":[query]})["embedding"]
        text_features = torch.from_numpy(text_features)
        text_features /= text_features.norm(dim=-1, keepdim=True)

    # Compute the similarity between the search query and each embedded frame
    similarities = (video_features @ text_features.T)
    # torch.topk raises if k exceeds the number of candidates, so clamp it.
    topk = min(topk, similarities.shape[0])
    _, best_feature_idx = similarities.topk(topk, dim=0)

    # Map each embedding index back to its real frame index before reading
    # the frame: only every frame_stride-th frame was embedded.
    results = np.hstack([video[int(feature_id) * frame_stride] for feature_id in best_feature_idx])
    display(Image.fromarray(results).resize((600, 400)))

### 🔍 Sample Queries

In [None]:
# Best single frame for this landmark query.
search_video(query="golden gate bridge", video_features=video_features, topk=1)

In [None]:
# Best single frame for this landmark query.
search_video(query="alcatraz prison", video_features=video_features, topk=1)

In [None]:
# Best single frame for this landmark query.
search_video(query="fishermans wharf", video_features=video_features, topk=1)

In [None]:
# Best single frame for this landmark query.
search_video(query="golden gate park windmill", video_features=video_features, topk=1)

In [None]:
# Best single frame for this landmark query.
search_video(query="chinatown", video_features=video_features, topk=1)

In [None]:
# Best single frame for this landmark query.
search_video(query="lombard street", video_features=video_features, topk=1)

In [None]:
# Best single frame for this landmark query.
search_video(query="pier 39", video_features=video_features, topk=1)

In [None]:
# Best single frame for this activity query.
search_video(query="riding the tram", video_features=video_features, topk=1)

In [None]:
# Best single frame for this landmark query.
search_video(query="ferry building", video_features=video_features, topk=1)