### Import Libraries

In [1]:
import torch

torch.cuda.is_available()

True

In [3]:
import importlib
import os
import json
import sys
import pandas as pd

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "../..")))

from models import whisper
from actions import helpers

importlib.reload(whisper)
importlib.reload(helpers)
from models.whisper import WhisperSegmentTranscriber

In [4]:
WHISPER_OUTPUT_JSON_PATH = "../../data/results/whisper_transcripts.json"
SR = 16000
# Configuration for filtering vad_segments 
MIN_SPEECH_RATIO = 0.005  # 0.5% of video must contain speech
MIN_TOTAL_SPEECH_SECONDS = 0.5  # At least 0.5 seconds total speech
MIN_SEGMENT_DURATION = 0.3  # Each individual segment >= 0.3s

### Get video paths and metadata

In [5]:
# Get video paths from dataset metadata
video_metadata = pd.read_csv(os.getenv("VIDEO_METADATA_PATH"))
# Filter out size outliers
filtered_df = video_metadata[~video_metadata['is_size_outlier']].copy()
after_outliers = len(filtered_df)
print(f"After removing size outliers: {after_outliers} videos")
print(f"\nSpeech ratio statistics:")
print(filtered_df['speech_ratio'].describe())

After removing size outliers: 3556 videos

Speech ratio statistics:
count    3556.000000
mean        0.263725
std         0.281537
min         0.000000
25%         0.000000
50%         0.150753
75%         0.482025
max         0.976950
Name: speech_ratio, dtype: float64


In [6]:
# Remove videos for which there are no vad segments
no_segments = len(filtered_df.loc[filtered_df['vad_segments']=='[]'])
print(f"Videos with no segments: {no_segments}")
filtered_df = filtered_df[filtered_df['vad_segments']!="[]"].copy()

Videos with no segments: 913


In [7]:
import plotly.express as px
import plotly.graph_objects as go

# 1. Distribution of speech ratio
fig = px.histogram(
    filtered_df, 
    x='speech_ratio',
    nbins=50,
    title='Distribution of Speech Ratio',
    labels={'speech_ratio': 'Speech Ratio', 'count': 'Number of Videos'},
    marginal='box'  # Add box plot on top
)
fig.update_layout(height=400, width=800)
fig.show()

In [8]:
#  Filter out videos with very low speech ratio
low_speech_videos = filtered_df[filtered_df['speech_ratio'] < MIN_SPEECH_RATIO]
print(f"\nVideos with speech ratio < {MIN_SPEECH_RATIO}: {len(low_speech_videos)}")
print(f"Percentage: {len(low_speech_videos)/len(filtered_df)*100:.2f}%")


Videos with speech ratio < 0.005: 45
Percentage: 1.70%


In [10]:
import json
import ast

# Apply initial filters
final_df = filtered_df[
    (filtered_df['speech_ratio'] >= MIN_SPEECH_RATIO) &
    (filtered_df['has_audio'] == True)
].copy()

print(f"Initial videos: {after_outliers}")
print(f"After removing videos with no vad-segments: {no_segments}")
print(f"After speech ratio filter (>= {MIN_SPEECH_RATIO}): {len(final_df)} videos")

# Create dictionary with additional filtering
videos_to_process = {}
filtered_out = {
    'no_segments': 0,
    'too_short_total': 0,
    'too_short_segments': 0,
    'success': 0
}

for idx, row in final_df.iterrows():
    video_path = row['video_path']
    vad_segments = row['vad_segments']
    
    # Parse VAD segments if stored as JSON string
    if isinstance(vad_segments, str):
        vad_segments = ast.literal_eval(vad_segments)
    
    # Calculate total speech duration
    total_speech = sum(seg['end'] - seg['start'] for seg in vad_segments)
    
    # Filter: Check minimum total speech duration
    if total_speech < MIN_TOTAL_SPEECH_SECONDS:
        filtered_out['too_short_total'] += 1
        continue
    
    # Add to processing dictionary
    videos_to_process[video_path] = vad_segments
    filtered_out['success'] += 1



print(f"Videos whose total speech < {MIN_TOTAL_SPEECH_SECONDS}s:            {filtered_out['too_short_total']}")
print(f"\n✅ Ready for Whisper:                {filtered_out['success']}")

Initial videos: 3556
After removing videos with no vad-segments: 913
After speech ratio filter (>= 0.005): 2598 videos
Videos whose total speech < 0.5s:            17

✅ Ready for Whisper:                2581


### Run Whisper

In [None]:
# Initialize Whisper model
whisper_transcriber = WhisperSegmentTranscriber(
    model_name="openai/whisper-large-v3-turbo"
)

In [None]:
# Transcribe only these segments
whisper_transcriber.process_all_videos(
    video_segments_dict=videos_to_process,
    output_json=WHISPER_OUTPUT_JSON_PATH,
    sr=SR,
)