### Import Libraries

In [None]:
import importlib
import json
import os
import sys

import pandas as pd
import torch

# Make the directory two levels above the current working directory importable
# (presumably where the local `models` and `actions` packages live — confirm).
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), "../.."))
if PROJECT_ROOT not in sys.path:
    # Guarded so that re-running this cell does not stack duplicate entries.
    sys.path.append(PROJECT_ROOT)

from models import clip, xclip
from actions import helpers

# Reload the local modules so edits made on disk are picked up without
# restarting the kernel.
importlib.reload(clip)
importlib.reload(xclip)
importlib.reload(helpers)

# Import the classes only after the reload so these names bind to the
# freshly reloaded definitions rather than the stale ones.
from models.clip import CLIPModel
from models.xclip import XCLIPVideoClassifier

In [2]:
# Check CUDA availability
# (`torch` is imported together with the other libraries in the top import cell.)
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")

PyTorch version: 2.9.0+cu126
CUDA available: True
CUDA version: 12.6


In [3]:
# Both models write their results as JSON files under the same results directory.
RESULTS_DIR = "../../data/results"
CLIP_OUTPUT_JSON_PATH = f"{RESULTS_DIR}/clip_results.json"
XCLIP_OUTPUT_JSON_PATH = f"{RESULTS_DIR}/xclip_results.json"

### Get video paths and candidate labels

In [4]:
# Get video paths from dataset metadata
# Fail fast with a clear message when the environment variable is missing:
# os.getenv would return None, and pd.read_csv(None) fails with an opaque
# error about an invalid path/buffer object type.
video_metadata_path = os.getenv("VIDEO_METADATA_PATH")
if not video_metadata_path:
    raise RuntimeError(
        "VIDEO_METADATA_PATH is not set; point it at the video metadata CSV "
        "before running this cell."
    )
video_metadata = pd.read_csv(video_metadata_path)

# Keep only the rows that are not flagged as size outliers and collect their paths.
# NOTE(review): `~` assumes `is_size_outlier` is a boolean column without missing
# values — confirm against the CSV schema.
video_paths = video_metadata.loc[
    ~video_metadata["is_size_outlier"], "video_path"
].tolist()
print(f"Total videos to process: {len(video_paths)}")

Total videos to process: 3556


In [5]:
# Read the label data from the path given by the LABELS_PATH environment variable.
# NOTE(review): os.getenv returns None when LABELS_PATH is unset — confirm that
# extract_vision_data copes with that, or make sure the variable is exported first.
labels_path = os.getenv("LABELS_PATH")
vision_labels, associated_objects = helpers.extract_vision_data(labels_path)

# The keys of vision_labels are used as the primary candidate labels below.
primary_labels = list(vision_labels.keys())

### Run CLIP

In [8]:
# 1. Initialize CLIP model
# Settings are collected in one place so they are easy to tweak before constructing the model.
clip_settings = {
    "model_name": "openai/clip-vit-base-patch32",  # or "openai/clip-vit-large-patch14"
    "sample_rate": 10,  # Extract every 10th frame
    "batch_size": 8,  # Process 8 frames at a time
}
clip_model = CLIPModel(**clip_settings)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Error while downloading from https://huggingface.co/openai/clip-vit-base-patch32/resolve/main/pytorch_model.bin: HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out.
Trying to resume download...
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Using a slow image processor as `use_fast` is unset an

Loaded CLIP model: openai/clip-vit-base-patch32


In [11]:
await clip_model.process_all_videos(
    video_paths=video_paths,
    candidate_labels=primary_labels,
    output_json=CLIP_OUTPUT_JSON_PATH,
    overwrite=False,  # Set to True to start fresh
    top_k=3,  # Keep top 3 labels per frame
)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Processed 3556 videos


In [12]:
await clip_model.process_all_videos(
    video_paths=video_paths,
    candidate_labels=associated_objects,
    output_json=CLIP_OUTPUT_JSON_PATH,
    overwrite=False,  # Set to True to start fresh
    top_k=3,  # Keep top 3 labels per frame
)

Processed 3556 videos


### Run X-CLIP

In [7]:
# Initialize X-CLIP model
# Settings are collected in one place so they are easy to tweak before constructing the model.
xclip_settings = {
    "model_name": "microsoft/xclip-base-patch16-zero-shot",  # or "microsoft/xclip-large-patch14"
    "clip_len": 32,  # Number of frames per segment
    "frame_sample_rate": 2,  # Sample every 2nd frame
}
xclip_model = XCLIPVideoClassifier(**xclip_settings)

Loaded XCLIP model: microsoft/xclip-base-patch16-zero-shot


In [14]:
await xclip_model.process_all_videos(
    video_paths=video_paths,
    labels=primary_labels,
    output_json=XCLIP_OUTPUT_JSON_PATH,
    overwrite=False,  # Set to True to start fresh
    top_k=3,  # Keep top 3 labels per segment
)

  return self.preprocess(images, **kwargs)


Processed 3556 videos


In [8]:
await xclip_model.process_all_videos(
    video_paths=video_paths,
    labels=associated_objects,
    output_json=XCLIP_OUTPUT_JSON_PATH,
    overwrite=False,  # Set to True to start fresh
    top_k=3,  # Keep top 3 labels per segment
)

  return self.preprocess(images, **kwargs)


Processed 3556 videos
