<img src="https://drive.google.com/uc?export=view&id=1JIIlkTWa2xbft5bTpzhGK1BxYL83bJNU" width="800"/>

In [1]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# 🔥 Video Search Demo
---

In this demo, we’ll use NOS to build an end-to-end semantic video search utility.

In [8]:
from nos.test.utils import get_benchmark_video
get_benchmark_video()
FILENAME = "test_video.mp4"


In [9]:
from IPython.display import Video
Video(FILENAME, width=640)

### 🔥 1. Inference with 🤗 transformers (OpenAI CLIP)
---

Let's say we're using the popular OpenAI CLIP for extracting image and text embeddings, and we're using the 🤗 transformers library. 

In [6]:
from typing import Union, List
from PIL import Image

import numpy as np
import torch

class CLIP:
    """Text and image encoding using OpenAI CLIP"""
    def __init__(self, model_name: str = "openai/clip-vit-base-patch32"):
        from transformers import CLIPModel
        from transformers import CLIPProcessor, CLIPTokenizer
        
        self.processor = CLIPProcessor.from_pretrained(model_name)
        self.tokenizer = CLIPTokenizer.from_pretrained(model_name)
        
        device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = CLIPModel.from_pretrained(model_name).to(device)
        self.model.eval()
        self.device = self.model.device
        
    def encode_image(self, images: Union[Image.Image, np.ndarray, List[Image.Image], List[np.ndarray]]):
        """Encode image into an embedding."""
        with torch.inference_mode():
            inputs = self.processor(images=images, return_tensors="pt").to(self.device)
            return self.model.get_image_features(**inputs).cpu().numpy()

    def encode_text(self, texts: Union[str, List[str]]) -> np.ndarray:
        """Encode text into an embedding."""
        with torch.inference_mode():
            if isinstance(texts, str):
                texts = [texts]
            inputs = self.tokenizer(
                texts,
                padding=True,
                return_tensors="pt",
            ).to(self.device)
            text_features = self.model.get_text_features(**inputs)
            return text_features.cpu().numpy()

#### Frame Inference

Let's embed the video frame by frame with the encoding scheme defined above to see the iterations per second:

In [19]:
from nos.common import tqdm
from nos.common.io.video.opencv import VideoReader

# Load the first image
images = VideoReader(FILENAME)
image = next(images)

# Load the Pytorch model
clip = CLIP()

In [20]:
for _ in tqdm(duration=5, desc="Naive implementation", unit=" images"):
    clip.encode_image(image)

Naive implementation: 284 images [00:05, 56.72 images/s]


### 2. 🔍 Video Search

The following snippet extracts embeddings from all the frames in the video and uses CLIP to cross-reference text queries with the image embeddings. 

In [None]:
from nos.client import Client, ask

In [25]:
from nos.common import tqdm
from nos.common.io.video.opencv import VideoReader

# Load frames from the video lazily
images = VideoReader(FILENAME)
images = ({"images": img} for img in tqdm(images, unit="images"))

# Set up a model using the Module syntax introduced in 'welcome to NOS'
model = client.Module(TaskType.IMAGE_EMBEDDING, "openai/clip")

# Batch inference using auto-scaled model, then normalize embeddings
video_features = torch.from_numpy(np.vstack(list(model.imap(images))))
video_features /= video_features.norm(dim=-1, keepdim=True)

  0%|                                                                                                 | 0/6059 [00:00<?, ?images/s]

NameError: name 'client' is not defined

In [26]:
from IPython.display import HTML, display
from nos.common.io import VideoReader

encode_text = CLIP().encode_text
video = VideoReader(FILENAME)

def search_video(query: str, video_features: np.ndarray, topk: int = 3):
    """Semantic video search demo in 8 lines of code"""
    # Encode text and normalize
    with torch.inference_mode():
        text_features = encode_text(texts=[query])
        text_features = torch.from_numpy(text_features)
        text_features /= text_features.norm(dim=-1, keepdim=True)

    # Compute the similarity between the search query and each video frame
    similarities = (video_features @ text_features.T)
    _, best_photo_idx = similarities.topk(topk, dim=0)
    
    # Display the top k frames
    results = np.hstack([video[int(frame_id)] for frame_id in best_photo_idx])
    display(Image.fromarray(results).resize((600, 400)))

### 🔍 Sample Queries

In [None]:
search_video("golden gate bridge", video_features, topk=1)

In [None]:
search_video("alcatraz prison", video_features, topk=1)

In [None]:
search_video("fishermans wharf", video_features, topk=1)

In [None]:
search_video("golden gate park windmill", video_features, topk=1)

In [None]:
search_video("chinatown", video_features, topk=1)

In [None]:
search_video("lombard street", video_features, topk=1)

In [None]:
search_video("pier 39", video_features, topk=1)

In [None]:
search_video("riding the tram", video_features, topk=1)

In [None]:
search_video("ferry building", video_features, topk=1)