### Import Libraries

In [8]:
import importlib
import os
import json
import sys
import pandas as pd

# Put the directory two levels above the working directory on sys.path
# (presumably the project root that holds `models` and `actions` — confirm).
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry each time.
_import_root = os.path.abspath(os.path.join(os.getcwd(), "../.."))
if _import_root not in sys.path:
    sys.path.append(_import_root)

from models import clip, xclip
from actions import helpers

# Re-execute the project modules so that the names imported below are bound
# to the freshly reloaded definitions rather than to stale ones.
importlib.reload(clip)
importlib.reload(xclip)
importlib.reload(helpers)
from models.clip import CLIPModel
from models.xclip import XCLIPVideoClassifier

In [9]:
# JSON files handed to the CLIP and X-CLIP runs as their `output_json` target.
# Both live in the same results directory, relative to the notebook's cwd.
_RESULTS_DIR = "../../data/results"
CLIP_OUTPUT_JSON_PATH = f"{_RESULTS_DIR}/clip_results.json"
XCLIP_OUTPUT_JSON_PATH = f"{_RESULTS_DIR}/xclip_results.json"

### Get video paths and candidate labels

In [10]:
# Path of the sample video: "abuse2.mp4" inside the directory named by the
# XD_VIOLENCE_PATH environment variable.
# os.getenv returns None when the variable is unset, and os.path.join(None, ...)
# then fails with an opaque TypeError, so fail early with a message that names
# the missing variable instead.
_xd_violence_dir = os.getenv("XD_VIOLENCE_PATH")
if _xd_violence_dir is None:
    raise RuntimeError(
        "Environment variable XD_VIOLENCE_PATH is not set; "
        "it must point to the directory containing abuse2.mp4."
    )
xd_violence = os.path.join(_xd_violence_dir, "abuse2.mp4")

In [11]:
# Load the label data from the file named by the LABELS_PATH environment
# variable. The helper returns a pair: a mapping whose keys become the primary
# labels, and a second collection used later as an alternative label set.
# NOTE(review): os.getenv returns None when LABELS_PATH is unset — confirm that
# helpers.extract_vision_data handles a None path or fails with a clear error.
labels_path = os.getenv("LABELS_PATH")
vision_labels, associated_objects = helpers.extract_vision_data(labels_path)
primary_labels = [*vision_labels.keys()]

### Run CLIP

In [15]:
# Instantiate the CLIP model wrapper from an explicit settings mapping.
clip_settings = {
    # Alternative checkpoint: "openai/clip-vit-large-patch14"
    "model_name": "openai/clip-vit-base-patch32",
    "sample_rate": 10,  # extract every 10th frame
    "batch_size": 8,  # process 8 frames at a time
}
clip_model = CLIPModel(**clip_settings)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Device set to use cpu


Loaded CLIP model: openai/clip-vit-base-patch32


In [16]:
await clip_model.process_all_videos(
    video_paths=[xd_violence],
    candidate_labels=primary_labels,
    output_json=CLIP_OUTPUT_JSON_PATH,
    overwrite=False,  # Set to True to start fresh
    top_k=3,  # Keep top 3 labels per frame
)

Processed 1 videos


In [17]:
await clip_model.process_all_videos(
    video_paths=[xd_violence],
    candidate_labels=associated_objects,
    output_json=CLIP_OUTPUT_JSON_PATH,
    overwrite=False,  # Set to True to start fresh
    top_k=3,  # Keep top 3 labels per frame
)

Processed 1 videos


### Run X-CLIP

In [12]:
# Instantiate the X-CLIP video classifier from an explicit settings mapping.
xclip_settings = {
    # Alternative checkpoint: "microsoft/xclip-large-patch14"
    "model_name": "microsoft/xclip-base-patch16-zero-shot",
    "clip_len": 32,  # number of frames per segment
    "frame_sample_rate": 2,  # sample every 2nd frame
}
xclip_model = XCLIPVideoClassifier(**xclip_settings)

Loaded XCLIP model: microsoft/xclip-base-patch16-zero-shot


In [None]:
await xclip_model.process_all_videos(
    video_paths=[xd_violence],
    labels=primary_labels,
    output_json=XCLIP_OUTPUT_JSON_PATH,
    overwrite=False,  # Set to True to start fresh
    top_k=3,  # Keep top 3 labels per segment
)

In [None]:
await xclip_model.process_all_videos(
    video_paths=[xd_violence],
    labels=associated_objects,
    output_json=XCLIP_OUTPUT_JSON_PATH,
    overwrite=False,  # Set to True to start fresh
    top_k=3,  # Keep top 3 labels per segment
)