### Import Libraries

In [1]:
import importlib
import os
import json
import sys
import pandas as pd

# Append the absolute path of the directory two levels above the current working
# directory to sys.path. Presumably this is the project root that holds the local
# `models` and `actions` packages imported below — confirm against the repo layout.
# NOTE(review): this depends on the kernel's working directory being the notebook's
# own folder; launching the kernel from elsewhere changes what "../.." resolves to.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "../..")))

from models import clap, beats
from actions import helpers

# Re-execute the local modules so that edits made to them on disk are picked up
# without restarting the kernel.
importlib.reload(clap)
importlib.reload(beats)
importlib.reload(helpers)
# Import the classes only after the reloads so these names bind to the reloaded definitions.
from models.clap import CLAPModel
from models.beats import BeatsModel

In [2]:
# JSON files passed as `output_json` to the process_all_videos calls further down
# (one file for the CLAP runs, one for the BEATs run).
# The paths are relative, so they resolve against the kernel's current working directory.
CLAP_OUTPUT_JSON_PATH = "../../data/results/clap_results.json"
BEATS_OUTPUT_JSON_PATH = "../../data/results/beats_results.json"

### Get video paths and candidate labels

In [3]:
# Get video paths from dataset metadata.
# The CSV location comes from the VIDEO_METADATA_PATH environment variable.
video_metadata_path = os.getenv("VIDEO_METADATA_PATH")
if not video_metadata_path:
    # os.getenv returns None when the variable is unset; fail here with an actionable
    # message, because pd.read_csv would otherwise fail on the missing path with an
    # error that never mentions the variable name.
    raise ValueError(
        "Environment variable VIDEO_METADATA_PATH is not set; "
        "point it at the video metadata CSV before running this cell."
    )

video_metadata = pd.read_csv(video_metadata_path)

# Keep only rows NOT flagged as size outliers and collect their video paths.
# A single .loc call selects rows and column together instead of chaining two lookups.
# `~` negates the mask element-wise, so `is_size_outlier` is assumed to be a boolean column.
video_paths = video_metadata.loc[~video_metadata['is_size_outlier'], 'video_path'].tolist()
print(f"Total videos to process: {len(video_paths)}")

Total videos to process: 3556


In [4]:
# Location of the label definitions, read from the LABELS_PATH environment variable.
labels_path = os.getenv("LABELS_PATH")

# The project helper returns a pair; the first element exposes .keys() and the second
# is used later as a label set for CLAP.
# NOTE(review): exact structure of both return values lives in actions.helpers — confirm there.
audio_labels, audioset_labels = helpers.extract_audio_data(labels_path)

# The keys of audio_labels are used as the primary candidate labels.
primary_labels = [label_name for label_name in audio_labels.keys()]

### Run CLAP

In [5]:
# Initialize CLAP model.
# Constructor settings are gathered in one mapping so they can be read (and tweaked) at a glance.
clap_config = {
    "model_name": "laion/clap-htsat-fused",
    "sr": 48000,       # CLAP expects 48kHz audio
    "window_s": 10.0,  # 10-second windows
    "hop_s": 10.0,     # hop equals window length -> non-overlapping chunks
}
clap_model = CLAPModel(**clap_config)

Device set to use cuda:0


Loaded CLAP model: ClapFeatureExtractor {
  "chunk_length_s": 10,
  "feature_extractor_type": "ClapFeatureExtractor",
  "feature_size": 64,
  "fft_window_size": 1024,
  "frequency_max": 14000,
  "frequency_min": 50,
  "hop_length": 480,
  "max_length_s": 10,
  "n_fft": 1024,
  "nb_frequency_bins": 513,
  "nb_max_frames": 1000,
  "nb_max_samples": 480000,
  "padding": "repeatpad",
  "padding_side": "right",
  "padding_value": 0.0,
  "processor_class": "ClapProcessor",
  "return_attention_mask": false,
  "sampling_rate": 48000,
  "top_db": null,
  "truncation": "fusion"
}



In [None]:
# Run CLAP over every entry of video_paths, using primary_labels (the keys of
# audio_labels) as the label set, with CLAP_OUTPUT_JSON_PATH as the output file
# and top_k=3.
# Top-level `await` relies on IPython/Jupyter's async cell support.
# NOTE(review): overwrite=False presumably keeps existing results in the output file
# rather than recomputing them — confirm in CLAPModel.process_all_videos.
# NOTE(review): the following cell calls the same method with the same output_json but
# a different label set (audioset_labels); verify the two runs do not clobber or
# short-circuit each other. This cell has no execution count, i.e. it was not run in
# the saved session.
await clap_model.process_all_videos(
    video_paths=video_paths,
    labels=primary_labels,
    output_json=CLAP_OUTPUT_JSON_PATH,
    overwrite=False,
    top_k=3,
)

In [10]:
# Run CLAP over every entry of video_paths again, this time with audioset_labels
# (second value returned by helpers.extract_audio_data) as the label set.
# NOTE(review): output_json is the same CLAP_OUTPUT_JSON_PATH used by the primary-label
# run above, with overwrite=False. If process_all_videos skips videos already present
# in the file, running both cells in order would leave only one label set's results —
# confirm the intended behaviour, or give each label set its own output file.
# NOTE(review): this cell's execution count is out of sequence with its neighbours;
# re-check the notebook with Restart Kernel -> Run All.
await clap_model.process_all_videos(
    video_paths=video_paths,
    labels=audioset_labels,
    output_json=CLAP_OUTPUT_JSON_PATH,
    overwrite=False,
    top_k=3,
)

Processed 3556 videos


### Run BEATs

In [None]:
# Initialize BEATs model.
# Checkpoint and label-ontology locations are read from environment variables;
# the remaining values are fixed audio-chunking settings.
beats_config = {
    "checkpoint_path": os.getenv("BEATS_MODEL_PATH"),
    "human_labels_path": os.getenv("ONTOLOGY_JSON_PATH"),
    "sr": 16000,     # BEATs expects 16kHz audio
    "win_ms": 2000,  # 2-second windows
    "hop_ms": 2000,  # hop equals window length -> non-overlapping chunks
}
beats_model = BeatsModel(**beats_config)

In [None]:
await beats_model.process_all_videos(
    video_paths=video_paths[1000:],
    output_json=BEATS_OUTPUT_JSON_PATH,
    overwrite=False,
    top_k=3,
)