### Import Libraries

In [None]:
import importlib
import os
import sys
import plotly.express as px
import pandas as pd
import torch
from tqdm import tqdm

# Add the parent of the current working directory to the module search path;
# presumably that is where the local `actions` and `models` packages live.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

from actions import extract_dataset
from actions import extract_metadata
from models import vad

# Reload the local modules so that re-running this cell picks up edits made to
# them on disk without restarting the kernel.
importlib.reload(extract_dataset)
importlib.reload(extract_metadata)
importlib.reload(vad)
# Import the classes only after the reload so these names refer to the
# reloaded definitions.
from actions.extract_metadata import VideoMetadataExtractor
from models.vad import SileroVAD

In [None]:
# Report the PyTorch build and whatever GPU (if any) it can see.
cuda_ready = torch.cuda.is_available()

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ready}")
print(f"CUDA version: {torch.version.cuda}")
print(f"Number of GPUs: {torch.cuda.device_count()}")

if not cuda_ready:
    print("PyTorch is running on CPU only")
else:
    # Only the first device (index 0) is described.
    first_gpu = torch.cuda.get_device_properties(0)
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {first_gpu.total_memory / 1e9:.2f} GB")

In [2]:
# CSV holding the video download links and their labels; it is written by the
# dataset-extraction cell and read back by the later analysis cells.
VIDEO_URLS_PATH = "../data/metadata/video_links_labels.csv"
# Column of that CSV used for the class-distribution plot
# (0 = non-violent, 1 = violent).
LABEL_COLUMN = "binary_label"

### Extract video URLs and labels (binary and multilabel) from the Hugging Face dataset

In [None]:
# Create the XD-Violence dataset builder and hand it to the extraction helper,
# which writes its output to VIDEO_URLS_PATH (presumably one row per video
# with its download link and labels).
builder = extract_dataset.XDViolence()
extract_dataset.extract_video_links_and_labels_to_csv(builder, VIDEO_URLS_PATH)
# NOTE(review): the message hard-codes the file name instead of using
# VIDEO_URLS_PATH, so it goes stale if the constant is changed.
print("CSV file created: video_links_labels.csv")

CSV file created: video_links_labels.csv


In [4]:
df = pd.read_csv(VIDEO_URLS_PATH)

# Tally the videos per class, ordered by label value, then swap the numeric
# labels for readable names.
CLASS_NAMES = {0: "Non-violent", 1: "Violent"}
counts = df[LABEL_COLUMN].value_counts().sort_index()
counts.index = counts.index.map(CLASS_NAMES)

# Figure options are kept apart from the data so the call below stays short.
bar_options = dict(
    labels={"x": "Class", "y": "Count"},
    title="Distribution of Violence Labels",
    width=500,
    height=400,
)
fig = px.bar(x=counts.index, y=counts.values, **bar_options)
fig.show()

### Get local video paths

In [5]:
# Folder holding the downloaded videos, taken from the VIDEOS_PATH environment
# variable. Fail fast with a clear message when it is missing: os.listdir(None)
# would silently list the current working directory and os.path.join would
# then raise an obscure TypeError.
video_folder = os.getenv("VIDEOS_PATH")
if not video_folder:
    raise RuntimeError(
        "VIDEOS_PATH environment variable is not set; "
        "point it at the folder containing the videos."
    )

# os.listdir returns entries in arbitrary order, so sort to make the list (and
# everything derived from it) reproducible across runs and machines.
video_paths = sorted(
    os.path.join(video_folder, fname) for fname in os.listdir(video_folder)
)

### Extract video metadata

In [None]:
# Initialize the extractor
extractor = VideoMetadataExtractor(video_paths)
# Extract metadata for all videos
print(f"Extracting metadata for {len(video_paths)} videos...")
# Top-level `await` relies on the notebook kernel's support for it; this line
# is not valid in a plain Python script outside an async function.
metadata_list = await extractor.extract_all_metadata()

In [7]:
# Convert to DataFrame
metadata_df = pd.DataFrame(metadata_list)
print(f"Extracted metadata for {len(metadata_df)} videos")
# Display just the first row as a quick check of the available columns.
metadata_df.head(1)

Extracted metadata for 3950 videos


Unnamed: 0,video_path,video_name,has_audio,frame_count,fps,duration,audio_sr,size_bytes,size_mb
0,C:/Users/CG/Desktop/Multimodal-Techniques-for-...,A.Beautiful.Mind.2001__#00-01-45_00-02-50_label_A,True,1561,24.0,65.041667,48000.0,6828150,6.51


In [18]:
# Attach the labels to the metadata: each video's name is matched against the
# `id` column of the labels CSV, after which the redundant key is dropped.
labels_df = pd.read_csv(VIDEO_URLS_PATH)
merged_df = (
    metadata_df
    .merge(labels_df, left_on="video_name", right_on="id")
    .drop(columns=["id"])
)
merged_df.head(1)

Unnamed: 0,video_path,video_name,has_audio,frame_count,fps,duration,audio_sr,size_bytes,size_mb,download_url,binary_label,multilabel
0,C:/Users/CG/Desktop/Multimodal-Techniques-for-...,A.Beautiful.Mind.2001__#00-01-45_00-02-50_label_A,True,1561,24.0,65.041667,48000.0,6828150,6.51,https://huggingface.co/datasets/jherng/xd-viol...,0,0


### Visualizations on video features

In [20]:
# Videos whose size is above this quantile are flagged as outliers
# (0.9 -> the largest 10% of videos).
SIZE_OUTLIER_QUANTILE = 0.9


def plot_size_box(frame, title):
    """Build a box plot of the ``size_mb`` column of ``frame``.

    Args:
        frame: DataFrame with ``size_mb`` and ``video_name`` columns.
        title: Title shown above the figure.

    Returns:
        A 700x400 plotly figure that draws every point next to the box and
        includes the video name in the hover data.
    """
    box_fig = px.box(
        frame,
        y="size_mb",
        title=title,
        labels={"size_mb": "Size (MB)"},
        points="all",  # Show all points
        hover_data=["video_name"],
    )
    box_fig.update_layout(height=400, width=700)
    return box_fig


# Video file size distribution
fig = plot_size_box(merged_df, "Video File Size Distribution (MB)")
fig.show()

# Identify outliers (files above the configured size quantile) and add a flag
# column. The mask is computed once and reused for both the report and the flag.
size_threshold = merged_df["size_mb"].quantile(SIZE_OUTLIER_QUANTILE)
is_outlier = merged_df["size_mb"] > size_threshold
outliers = merged_df[is_outlier]
print(f"Outliers (size > {size_threshold:.2f} MB): {len(outliers)} videos")
merged_df["is_size_outlier"] = is_outlier

# New video file size distribution
fig = plot_size_box(
    merged_df[~merged_df["is_size_outlier"]],
    "Video File Size Distribution after removing outliers (MB)",
)
fig.show()

Outliers (size > 33.21 MB): 394 videos


In [21]:
################# After filtering out outliers #########################
# Every statistic in this cell is computed on the non-outlier videos only.
non_outlier_df = merged_df[~merged_df["is_size_outlier"]]

# Number of videos with and without an audio track.
counts = non_outlier_df["has_audio"].value_counts().sort_index()
print(counts)

# Audio presence broken down by violence label.
audio_stats = (
    non_outlier_df
    .groupby(["binary_label", "has_audio"])
    .size()
    .reset_index(name="count")
)
fig = px.bar(
    audio_stats,
    x="binary_label",
    y="count",
    color="has_audio",
    title="Audio Presence by Violence Label",
    labels={"binary_label": "Label (0=Non-violent, 1=Violent)", "count": "Number of Videos"},
    barmode="group",
)
fig.update_layout(height=400, width=700)
fig.show()

has_audio
True    3556
Name: count, dtype: int64


### Run voice activity detection (VAD) on non-outlier videos

In [11]:
# Initialize SileroVAD with default 16kHz sampling rate
# NOTE(review): this rebinds `vad`, which the import cell bound to the
# `models.vad` module, to a SileroVAD instance. Reusing one name for two
# different kinds of object is easy to trip over when cells are re-run out of
# order; consider a distinct name for the instance.
vad = SileroVAD(sr=16000)

In [None]:
# Filter videos that are not outliers and have audio
videos_to_process = merged_df[
    (~merged_df["is_size_outlier"])
    & merged_df["has_audio"].eq(True)
].copy()
print(f"Processing VAD for {len(videos_to_process)} videos with audio (non-outliers)...")

# (Re)initialize the result columns so the cell can be re-run safely; videos
# that are not processed keep these defaults.
merged_df["vad_segments"] = None
merged_df["speech_ratio"] = 0.0  # speech duration / total duration

for idx, row in tqdm(videos_to_process.iterrows(), total=len(videos_to_process)):
    # Extract speech segments
    segments = vad.extract_speech_segments(row["video_path"])

    # Total speech time is the sum of the segment lengths.
    # assumes `start`/`end` use the same unit as `duration` (seconds) — TODO confirm
    total_speech = sum(seg["end"] - seg["start"] for seg in segments)
    duration = row["duration"]
    # Guard against zero/invalid durations to avoid a division error.
    speech_ratio = total_speech / duration if duration > 0 else 0.0

    # Write the results back to the main DataFrame; `idx` is the row label
    # shared by `videos_to_process` and `merged_df`.
    merged_df.at[idx, "vad_segments"] = segments
    merged_df.at[idx, "speech_ratio"] = speech_ratio
print("\nVAD extraction completed!")

In [26]:
# Save to CSV
# The destination comes from the VIDEO_METADATA_PATH environment variable.
# Fail fast when it is missing: DataFrame.to_csv(None) writes no file and
# instead returns the whole CSV as a string, so the results would be lost.
metadata_csv_path = os.getenv("VIDEO_METADATA_PATH")
if not metadata_csv_path:
    raise RuntimeError(
        "VIDEO_METADATA_PATH environment variable is not set; "
        "cannot save the video metadata CSV."
    )
# Note: the list values in `vad_segments` are stored as their string
# representation in the CSV and must be parsed again after loading.
merged_df.to_csv(metadata_csv_path, index=False)