Then, we add the summary of the video by the transcription and the ai agent supported by youtube

In [5]:
import os
import json
import random
import shutil
import whisper
from yt_dlp import YoutubeDL
import subprocess
import cv2
import base64
import isodate
def parse_duration_iso8601(duration_str):
    try:
        duration = isodate.parse_duration(duration_str)
        return duration.total_seconds()
    except Exception:
        return None

def get_all_video_ids_with_duration_limit(base_dir, extension=".json", max_duration=120):
    video_ids = []
    for category in os.listdir(base_dir):
        category_dir = os.path.join(base_dir, category)
        if not os.path.isdir(category_dir):
            continue
        for filename in os.listdir(category_dir):
            if filename.endswith(extension):
                file_path = os.path.join(category_dir, filename)
                try:
                    with open(file_path, "r", encoding="utf-8") as f:
                        data = json.load(f)
                    duration_str = data.get("contentDetails", {}).get("duration", None)
                    duration = parse_duration_iso8601(duration_str) if duration_str else None
                    if duration is not None and duration < max_duration:
                        video_id = os.path.splitext(filename)[0]
                        video_ids.append(video_id)
                except Exception as e:
                    print(f"Error reading {file_path}: {e}")
    return video_ids

def sample_video_ids_from_dir(base_dir, k=200, output_path="sampled_100_ids.json"):
    all_ids = get_all_video_ids_with_duration_limit(base_dir,max_duration=180)
    sampled = random.sample(all_ids, min(k, len(all_ids)))

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(sampled, f, indent=2)

    print(f"Found {len(all_ids)} videos. Sampled {len(sampled)} IDs to {output_path}.")
    return sampled




In [9]:
from yt_dlp import YoutubeDL
import os
import glob

def is_video_available(video_id):
    url = f"https://www.youtube.com/watch?v={video_id}"
    ydl_opts = {'quiet': True, 'noplaylist': True, 'skip_download': True}

    try:
        with YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(url, download=False)
            return info.get("is_live", False) is False
    except Exception:
        return False

def download_low_quality_video(video_id, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    url = f"https://www.youtube.com/watch?v={video_id}"
    output_template = os.path.join(output_dir, f"{video_id}.%(ext)s")

    if not is_video_available(video_id):
        print(f"[SKIPPED] {video_id} is unavailable.")
        return False

    ydl_opts = {
        'quiet': True,
        'noprogress': True,
        'format': 'worstvideo[ext=mp4]+worstaudio[ext=m4a]/worst[ext=mp4]',
        'outtmpl': output_template,
        'merge_output_format': 'mp4',
        'noplaylist': True,
        'retries': 3,
        'force_generic_extractor': False,
        'ignoreerrors': False,
        'postprocessors': [{
            'key': 'FFmpegVideoConvertor',
            'preferedformat': 'mp4'
        }]
    }

    try:
        with YoutubeDL(ydl_opts) as ydl:
            print(f"[DOWNLOADING LOW QUALITY] {video_id}...")
            ydl.download([url])

        matches = glob.glob(os.path.join(output_dir, f"{video_id}.*"))
        if matches:
            print(f"[SUCCESS] Downloaded {video_id}")
            return True
        else:
            print(f"[ERROR] No file found for {video_id}")
            return False

    except Exception as e:
        print(f"[ERROR] Failed to download {video_id}: {e}")
        return False


def get_frame_every_20s(video_path, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"[ERROR] Failed to open video: {video_path}")
        return []

    fps = cap.get(cv2.CAP_PROP_FPS)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    duration = int(total_frames / fps)

    saved_paths = []
    for t in range(0, duration, 20):
        frame_number = int(t * fps)
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number)
        success, frame = cap.read()
        if success:
            frame_path = os.path.join(output_dir, f"frame_{t:04d}.jpg")
            cv2.imwrite(frame_path, frame)
            saved_paths.append((t, frame_path))
    return saved_paths

def generate_captions_with_videollama(frame_dir, output_json_path):
    import openai
    import base64
    import json
    import os
  
    openai.api_key = "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" 

    def encode_img(img_path):
        with open(img_path, "rb") as f:
            return base64.b64encode(f.read()).decode("utf-8")

    def batch_get_image_captions(img_paths):
        images = [{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encode_img(p)}"}} for p in img_paths]
        prompt = [{"type": "text", "text": "Give a one-sentence caption for each image in order. Return a list of captions."}] + images
        response = openai.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
            max_tokens=500
        )
        return response.choices[0].message.content.strip().split("\n")

    frame_files = sorted([
        f for f in os.listdir(frame_dir)
        if f.lower().endswith((".jpg", ".jpeg", ".png"))
    ])
    batch_size = 6
    captions = {}
    for i in range(0, len(frame_files), batch_size):
        batch = frame_files[i:i + batch_size]
        batch_paths = [os.path.join(frame_dir, f) for f in batch]
        try:
            batch_captions = batch_get_image_captions(batch_paths)
            for fname, cap in zip(batch, batch_captions):
                cap_clean = cap.strip("-•123. ").strip()
                captions[fname] = cap_clean
                print(f"{fname}: {cap_clean}")
        except Exception as e:
            print(f"[ERROR] Batch {i} failed: {e}")

    with open(output_json_path, "w", encoding="utf-8") as f:
        json.dump(captions, f, ensure_ascii=False, indent=2)
    print(f"Saved captions to {output_json_path}")

from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
import whisper
def get_transcript_or_whisper(video_id, video_path, whisper_model="base"):
    """
    Try to get YouTube transcript first.
    If not available, fall back to Whisper transcription.
    """
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=["en"])
        grouped = {}
        for seg in transcript:
            t = int(seg['start']) // 10 * 10
            grouped.setdefault(t, []).append(seg['text'])

        grouped_transcript = [
            {"start": t, "end": t + 20, "text": " ".join(lines)}
            for t, lines in sorted(grouped.items())
        ]
        print(f"[INFO] YouTube transcript used for {video_id}")
        return grouped_transcript

    except (TranscriptsDisabled, NoTranscriptFound):
        print(f"[WARN] No YouTube transcript found for {video_id}, falling back to Whisper...")
        model = whisper.load_model(whisper_model)
        result = model.transcribe(video_path, verbose=False, word_timestamps=False)
        segments = result.get("segments", [])
        grouped = {}
        for seg in segments:
            t = int(seg['start']) // 10 * 10
            grouped.setdefault(t, []).append(seg['text'])

        grouped_transcript = [
            {"start": t, "end": t + 20, "text": " ".join(lines)}
            for t, lines in sorted(grouped.items())
        ]
        print(f"[INFO] Whisper transcript used for {video_id}")
        return grouped_transcript

In [10]:
import os
import subprocess
import librosa
import openai

def extract_audio_from_video(video_path, audio_path):
    cmd = f'ffmpeg -y -i "{video_path}" -vn -acodec pcm_s16le -ar 44100 -ac 2 "{audio_path}"'
    print(f"Extracting audio: {cmd}") 
    subprocess.run(cmd, shell=True, check=True)

def separate_vocals_bgm_demucs(audio_path, output_dir):
    import os
    import subprocess
    os.makedirs(output_dir, exist_ok=True)
    if not os.path.exists(audio_path):
        raise FileNotFoundError(f"not_exit: {audio_path}")
    cmd = f'demucs --two-stems=vocals "{audio_path}" -o "{output_dir}"'
    print(f"Running demucs command: {cmd}")
    subprocess.run(cmd, shell=True, check=True)
    vocals_path = None
    bgm_path = None
    for root, dirs, files in os.walk(output_dir):
        if "vocals.wav" in files and "no_vocals.wav" in files:
            vocals_path = os.path.join(root, "vocals.wav")
            bgm_path = os.path.join(root, "no_vocals.wav")
            break
    if not vocals_path or not bgm_path:
        for root, dirs, files in os.walk(output_dir):
            print("DEBUG:", root, files)
        raise FileNotFoundError(f"[ERROR] Demucs output not found for {audio_path}")
    return vocals_path, bgm_path
def extract_audio_features(audio_path):
    y, sr = librosa.load(audio_path, sr=None, mono=True)
    tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
    centroid = float(librosa.feature.spectral_centroid(y=y, sr=sr).mean())
    zcr = float(librosa.feature.zero_crossing_rate(y).mean())
    rms = float(librosa.feature.rms(y=y).mean())
    chroma = float(librosa.feature.chroma_stft(y=y, sr=sr).mean())
    return {
        "tempo": round(float(tempo), 2),
        "spectral_centroid": round(centroid, 2),
        "zero_crossing_rate": round(zcr, 4),
        "rms_energy": round(rms, 4),
        "chroma_stft": round(chroma, 4),
    }

def classify_bgm_with_gpt(audio_path):
    API_KEY = "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"
    openai.api_key = API_KEY
    features = extract_audio_features(audio_path)
    prompt = f"""Given the following audio features extracted from a video's background music:\n- Tempo: {features['tempo']} BPM\n- Spectral Centroid: {features['spectral_centroid']}\n- Zero Crossing Rate: {features['zero_crossing_rate']}\n- RMS Energy: {features['rms_energy']}\n- Chroma STFT: {features['chroma_stft']}\n\nDescribe the likely style of the background music in one concise sentence.\nFor example, you may say "lo-fi instrumental", "upbeat dance music", or "slow orchestral background music".\n"""
    client = openai.OpenAI(api_key=API_KEY)
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.5,
        max_tokens=60
    )
    return response.choices[0].message.content.strip()

  from pkg_resources import resource_filename


In [11]:
def process_video(video_id, video_dir, work_dir, whisper_model="base"):
    """
    Process a single video:
    - Transcribe speech
    - Separate BGM
    - Classify BGM style using GPT
    - Extract frames
    - Generate captions via VideoLLaMA
    """
    print(f"[INFO] Downloading video: {video_id}")
    success = download_low_quality_video(video_id, video_dir)
    video_path = os.path.join(video_dir, f"{video_id}.mp4")

    if not success or not os.path.exists(video_path):
        raise FileNotFoundError(f"Video download failed or file not found: {video_path}")
    
    out_dir = os.path.join(work_dir, video_id)
    os.makedirs(out_dir, exist_ok=True)

    try:
       transcription_segments = get_transcript_or_whisper(video_id=video_id, video_path=video_path, whisper_model=whisper_model)

    except Exception as e:
        raise RuntimeError(f"[{video_id}] Whisper transcription failed: {e}")   
    try:
        audio_path = os.path.join(out_dir, "audio.wav")
        extract_audio_from_video(video_path, audio_path)
        vocals_path, bgm_path = separate_vocals_bgm_demucs(audio_path, out_dir)
        bgm_style = classify_bgm_with_gpt(bgm_path)
    except Exception as e:
        raise RuntimeError(f"[{video_id}] BGM extraction/classification failed: {e}")
    
    try:
        frame_dir = os.path.join(out_dir, "frames")
        frame_infos = get_frame_every_20s(video_path, frame_dir)
    except Exception as e:
        raise RuntimeError(f"[{video_id}] Frame extraction failed: {e}")
    
    try:
        caption_json = os.path.join(out_dir, "videollama_captions.json")
        generate_captions_with_videollama(frame_dir, caption_json)
        with open(caption_json, "r", encoding="utf-8") as f:
            llama_captions = json.load(f)
    except Exception as e:
        raise RuntimeError(f"[{video_id}] Caption generation or loading failed: {e}")
    
    caption_segments = []
    for t, frame_path in frame_infos:
        summary = llama_captions.get(os.path.basename(frame_path), "[No Caption]")
        caption_segments.append({"start": t, "end": t+10, "summary": summary})

    return {
        "video_id": video_id,
        "captions": caption_segments,
        "transcription_segments": transcription_segments,
        "bgm_style": bgm_style
    }
def batch_process_videos(video_ids, video_dir, work_dir, output_json_path, whisper_model="base", max_success=100):
    results = []
    idx = 0
    while len(results) < max_success and idx < len(video_ids):
        video_id = video_ids[idx]
        print(f"\n=== Processing {idx + 1}/{len(video_ids)}: {video_id} ===")
        try:
            result = process_video(video_id, video_dir, work_dir, whisper_model)
            results.append(result)
        except Exception as e:
            print(f"[ABORT] Processing failed for {video_id}. Reason:\n{e}")
            print("Skipping this video and continuing with the next one.")
        finally:
            os.makedirs(os.path.dirname(output_json_path), exist_ok=True)
            with open(output_json_path, "w", encoding="utf-8") as f:
                json.dump(results, f, ensure_ascii=False, indent=2)
        idx += 1

    print(f"\nSuccessfully processed {len(results)} videos.")



In [3]:
def run_pipeline():
    from pathlib import Path
    import os
    # Configurations
    current_dir = Path.cwd()
    project_root = current_dir.parent
    base_dir = "youtube_videos/short"
    video_dir = project_root / base_dir  # Directory with original videos
    work_dir = project_root /"processed_results"  # Output dir for all results
    output_json_path = work_dir / "final_results.json"
    whisper_model = "base"  # or "small", "medium", etc.
  
    # Ensure output directory exists
    work_dir.mkdir(parents=True, exist_ok=True)

    # Step 1: Sample video IDs
    sampled_ids = sample_video_ids_from_dir( base_dir = project_root/"youtube_videos"/"short", k=100, output_path=work_dir / "sampled_100_ids.json")

    # Step 2: Run batch processing
    batch_process_videos(
        video_ids=sampled_ids,
        video_dir=video_dir,
        work_dir=work_dir,
        output_json_path=output_json_path,
        whisper_model=whisper_model
    )

if __name__ == "__main__":
    run_pipeline()


NameError: name 'sample_video_ids_from_dir' is not defined

In [12]:
import json
from pathlib import Path
import os
    # Configurations
current_dir = Path.cwd()
project_root = current_dir.parent
base_dir = "youtube_videos/short"
video_dir = project_root / base_dir  # Directory with original videos
work_dir = project_root /"processed_results"  # Output dir for all results
output_json_path = work_dir / "final_results.json"
with open(work_dir / "final_results.json", "r", encoding="utf-8") as f:
    old_results = json.load(f)
old_success_ids = set(r["video_id"] for r in old_results)
more_sampled_ids = sample_video_ids_from_dir(
    base_dir=project_root/"youtube_videos"/"short",
    k=20,
    output_path=work_dir /"sampled_more_ids.json"
)

remaining_ids = [vid for vid in more_sampled_ids if vid not in old_success_ids]

batch_process_videos(
    video_ids=remaining_ids,
    video_dir=video_dir,
    work_dir=work_dir,
    output_json_path=work_dir / "new_results.json", 
    max_success=10
)

with open(work_dir / "new_results.json", "r", encoding="utf-8") as f:
    new_results = json.load(f)
all_results = old_results + new_results
with open(work_dir / "final_results.json", "w", encoding="utf-8") as f:
    json.dump(all_results, f, ensure_ascii=False, indent=2)

print(f"finally success: {len(all_results)}")

Found 13479 videos. Sampled 20 IDs to /Users/lyfan/Desktop/data/processed_results/sampled_more_ids.json.

=== Processing 1/20: HfycScoL2zA ===
[INFO] Downloading video: HfycScoL2zA
[DOWNLOADING LOW QUALITY] HfycScoL2zA...
[SUCCESS] Downloaded HfycScoL2zA
[INFO] YouTube transcript used for HfycScoL2zA
Extracting audio: ffmpeg -y -i "/Users/lyfan/Desktop/data/youtube_videos/short/HfycScoL2zA.mp4" -vn -acodec pcm_s16le -ar 44100 -ac 2 "/Users/lyfan/Desktop/data/processed_results/HfycScoL2zA/audio.wav"
Running demucs command: demucs --two-stems=vocals "/Users/lyfan/Desktop/data/processed_results/HfycScoL2zA/audio.wav" -o "/Users/lyfan/Desktop/data/processed_results/HfycScoL2zA"


ffmpeg version 7.1.1 Copyright (c) 2000-2025 the FFmpeg developers
  built with Apple clang version 17.0.0 (clang-1700.0.13.3)
  configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/7.1.1_3 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libharfbuzz --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex

[1mImportant: the default model was recently changed to `htdemucs`[0m the latest Hybrid Transformer Demucs model. In some cases, this model can actually perform worse than previous models. To get back the old default model use `-n mdx_extra_q`.
Selected model is a bag of 1 models. You will see that many progress bars per track.
Separated tracks will be stored in /Users/lyfan/Desktop/data/processed_results/HfycScoL2zA/htdemucs
Separating track /Users/lyfan/Desktop/data/processed_results/HfycScoL2zA/audio.wav


100%|████████████████████████████████████████████████████████████████████████| 29.25/29.25 [00:11<00:00,  2.48seconds/s]


frame_0000.jpg: A gamer is intensely focused during a streaming session
frame_0020.jpg: The streamer smiles with "OKAY" text overlayed during gameplay
Saved captions to /Users/lyfan/Desktop/data/processed_results/HfycScoL2zA/videollama_captions.json

=== Processing 2/20: Qpc8m5jrGow ===
[INFO] Downloading video: Qpc8m5jrGow




[DOWNLOADING LOW QUALITY] Qpc8m5jrGow...
[SUCCESS] Downloaded Qpc8m5jrGow
[WARN] No YouTube transcript found for Qpc8m5jrGow, falling back to Whisper...




Detected language: German


100%|██████████| 1465/1465 [00:01<00:00, 1418.47frames/s]
ffmpeg version 7.1.1 Copyright (c) 2000-2025 the FFmpeg developers
  built with Apple clang version 17.0.0 (clang-1700.0.13.3)
  configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/7.1.1_3 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libharfbuzz --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enabl

[INFO] Whisper transcript used for Qpc8m5jrGow
Extracting audio: ffmpeg -y -i "/Users/lyfan/Desktop/data/youtube_videos/short/Qpc8m5jrGow.mp4" -vn -acodec pcm_s16le -ar 44100 -ac 2 "/Users/lyfan/Desktop/data/processed_results/Qpc8m5jrGow/audio.wav"
Running demucs command: demucs --two-stems=vocals "/Users/lyfan/Desktop/data/processed_results/Qpc8m5jrGow/audio.wav" -o "/Users/lyfan/Desktop/data/processed_results/Qpc8m5jrGow"
[1mImportant: the default model was recently changed to `htdemucs`[0m the latest Hybrid Transformer Demucs model. In some cases, this model can actually perform worse than previous models. To get back the old default model use `-n mdx_extra_q`.
Selected model is a bag of 1 models. You will see that many progress bars per track.
Separated tracks will be stored in /Users/lyfan/Desktop/data/processed_results/Qpc8m5jrGow/htdemucs
Separating track /Users/lyfan/Desktop/data/processed_results/Qpc8m5jrGow/audio.wav


100%|██████████████████████████████████████████████| 17.549999999999997/17.549999999999997 [00:07<00:00,  2.41seconds/s]


frame_0000.jpg: "A woman smiles as she stands next to a person wearing a funny mask in a garden."
Saved captions to /Users/lyfan/Desktop/data/processed_results/Qpc8m5jrGow/videollama_captions.json

=== Processing 3/20: AGdSjTHv4_M ===
[INFO] Downloading video: AGdSjTHv4_M
[DOWNLOADING LOW QUALITY] AGdSjTHv4_M...
[SUCCESS] Downloaded AGdSjTHv4_M
[WARN] No YouTube transcript found for AGdSjTHv4_M, falling back to Whisper...




Detected language: English


100%|██████████| 1462/1462 [00:01<00:00, 801.12frames/s]
ffmpeg version 7.1.1 Copyright (c) 2000-2025 the FFmpeg developers
  built with Apple clang version 17.0.0 (clang-1700.0.13.3)
  configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/7.1.1_3 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libharfbuzz --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable

[INFO] Whisper transcript used for AGdSjTHv4_M
Extracting audio: ffmpeg -y -i "/Users/lyfan/Desktop/data/youtube_videos/short/AGdSjTHv4_M.mp4" -vn -acodec pcm_s16le -ar 44100 -ac 2 "/Users/lyfan/Desktop/data/processed_results/AGdSjTHv4_M/audio.wav"
Running demucs command: demucs --two-stems=vocals "/Users/lyfan/Desktop/data/processed_results/AGdSjTHv4_M/audio.wav" -o "/Users/lyfan/Desktop/data/processed_results/AGdSjTHv4_M"
[1mImportant: the default model was recently changed to `htdemucs`[0m the latest Hybrid Transformer Demucs model. In some cases, this model can actually perform worse than previous models. To get back the old default model use `-n mdx_extra_q`.
Selected model is a bag of 1 models. You will see that many progress bars per track.
Separated tracks will be stored in /Users/lyfan/Desktop/data/processed_results/AGdSjTHv4_M/htdemucs
Separating track /Users/lyfan/Desktop/data/processed_results/AGdSjTHv4_M/audio.wav


100%|██████████████████████████████████████████████| 17.549999999999997/17.549999999999997 [00:07<00:00,  2.45seconds/s]


frame_0000.jpg: "The game screen indicates maxed-out inventory slots at 25/25 and unopened packages at 33/33."
Saved captions to /Users/lyfan/Desktop/data/processed_results/AGdSjTHv4_M/videollama_captions.json

=== Processing 4/20: QocbdMMlqQM ===
[INFO] Downloading video: QocbdMMlqQM


         player = https://www.youtube.com/s/player/e12fbea4/player_ias.vflset/en_US/base.js
         Please report this issue on  https://github.com/yt-dlp/yt-dlp/issues?q= , filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U


[DOWNLOADING LOW QUALITY] QocbdMMlqQM...


         player = https://www.youtube.com/s/player/e12fbea4/player_ias.vflset/en_US/base.js
         Please report this issue on  https://github.com/yt-dlp/yt-dlp/issues?q= , filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U
ERROR: [youtube] QocbdMMlqQM: Requested format is not available. Use --list-formats for a list of available formats


[ERROR] Failed to download QocbdMMlqQM: ERROR: [youtube] QocbdMMlqQM: Requested format is not available. Use --list-formats for a list of available formats
[ABORT] Processing failed for QocbdMMlqQM. Reason:
Video download failed or file not found: /Users/lyfan/Desktop/data/youtube_videos/short/QocbdMMlqQM.mp4
Skipping this video and continuing with the next one.

=== Processing 5/20: XVuooOqR4qQ ===
[INFO] Downloading video: XVuooOqR4qQ
[DOWNLOADING LOW QUALITY] XVuooOqR4qQ...
[SUCCESS] Downloaded XVuooOqR4qQ
[INFO] YouTube transcript used for XVuooOqR4qQ
Extracting audio: ffmpeg -y -i "/Users/lyfan/Desktop/data/youtube_videos/short/XVuooOqR4qQ.mp4" -vn -acodec pcm_s16le -ar 44100 -ac 2 "/Users/lyfan/Desktop/data/processed_results/XVuooOqR4qQ/audio.wav"
Running demucs command: demucs --two-stems=vocals "/Users/lyfan/Desktop/data/processed_results/XVuooOqR4qQ/audio.wav" -o "/Users/lyfan/Desktop/data/processed_results/XVuooOqR4qQ"


ffmpeg version 7.1.1 Copyright (c) 2000-2025 the FFmpeg developers
  built with Apple clang version 17.0.0 (clang-1700.0.13.3)
  configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/7.1.1_3 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libharfbuzz --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex

[1mImportant: the default model was recently changed to `htdemucs`[0m the latest Hybrid Transformer Demucs model. In some cases, this model can actually perform worse than previous models. To get back the old default model use `-n mdx_extra_q`.
Selected model is a bag of 1 models. You will see that many progress bars per track.
Separated tracks will be stored in /Users/lyfan/Desktop/data/processed_results/XVuooOqR4qQ/htdemucs
Separating track /Users/lyfan/Desktop/data/processed_results/XVuooOqR4qQ/audio.wav


100%|██████████████████████████████████████████████████████████████████████| 146.25/146.25 [00:53<00:00,  2.72seconds/s]


frame_0000.jpg: Two commentators discuss the backlash over political defection
frame_0020.jpg: Discussion focuses on significant predicted Conservative losses in upcoming local elections
frame_0040.jpg: The conversation continues with analysis of political forecasts and potential impacts
frame_0060.jpg: 4. A lively exchange takes place about the expected Conservative setbacks in elections
frame_0080.jpg: 5. The debate highlights concerns over predicted political shifts in 2024
frame_0100.jpg: 6. Commentary concludes with reflections on the polling predictions and their implications
frame_0120.jpg: A television segment discusses the predictions for the 2024 local elections and possible Conservative losses to Labour
frame_0140.jpg: The TALK logo is set against a dark blue abstract background
Saved captions to /Users/lyfan/Desktop/data/processed_results/XVuooOqR4qQ/videollama_captions.json

=== Processing 6/20: 5fKYnRHtLiQ ===
[INFO] Downloading video: 5fKYnRHtLiQ
[DOWNLOADING LOW QUALITY

ffmpeg version 7.1.1 Copyright (c) 2000-2025 the FFmpeg developers
  built with Apple clang version 17.0.0 (clang-1700.0.13.3)
  configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/7.1.1_3 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libharfbuzz --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex

Running demucs command: demucs --two-stems=vocals "/Users/lyfan/Desktop/data/processed_results/5fKYnRHtLiQ/audio.wav" -o "/Users/lyfan/Desktop/data/processed_results/5fKYnRHtLiQ"
[1mImportant: the default model was recently changed to `htdemucs`[0m the latest Hybrid Transformer Demucs model. In some cases, this model can actually perform worse than previous models. To get back the old default model use `-n mdx_extra_q`.
Selected model is a bag of 1 models. You will see that many progress bars per track.
Separated tracks will be stored in /Users/lyfan/Desktop/data/processed_results/5fKYnRHtLiQ/htdemucs
Separating track /Users/lyfan/Desktop/data/processed_results/5fKYnRHtLiQ/audio.wav


100%|██████████████████████████████████████████████| 163.79999999999998/163.79999999999998 [01:00<00:00,  2.71seconds/s]


frame_0000.jpg: A screenshot shows a discussion on an internet forum
frame_0020.jpg: A person is speaking in front of a bookshelf during an interview
frame_0040.jpg: A group of people wearing helmets and masks gather outdoors
frame_0060.jpg: 4. The same person continues their discussion from a different angle
frame_0080.jpg: 5. Another person gestures while speaking in a formal setting
frame_0100.jpg: 6. A woman delivers a speech at a podium in a congressional chamber
frame_0120.jpg: A crowd gathers near a government building with flags and banners
frame_0140.jpg: A person walks past a memorial with crosses and flowers
Saved captions to /Users/lyfan/Desktop/data/processed_results/5fKYnRHtLiQ/videollama_captions.json

=== Processing 7/20: WjWvdmYcZDI ===
[INFO] Downloading video: WjWvdmYcZDI


         player = https://www.youtube.com/s/player/e12fbea4/player_ias.vflset/en_US/base.js
         Please report this issue on  https://github.com/yt-dlp/yt-dlp/issues?q= , filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U


[DOWNLOADING LOW QUALITY] WjWvdmYcZDI...


         player = https://www.youtube.com/s/player/e12fbea4/player_ias.vflset/en_US/base.js
         Please report this issue on  https://github.com/yt-dlp/yt-dlp/issues?q= , filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U
ERROR: [youtube] WjWvdmYcZDI: Requested format is not available. Use --list-formats for a list of available formats


[ERROR] Failed to download WjWvdmYcZDI: ERROR: [youtube] WjWvdmYcZDI: Requested format is not available. Use --list-formats for a list of available formats
[ABORT] Processing failed for WjWvdmYcZDI. Reason:
Video download failed or file not found: /Users/lyfan/Desktop/data/youtube_videos/short/WjWvdmYcZDI.mp4
Skipping this video and continuing with the next one.

=== Processing 8/20: 6Fjs9ZU1f1w ===
[INFO] Downloading video: 6Fjs9ZU1f1w
[DOWNLOADING LOW QUALITY] 6Fjs9ZU1f1w...
[SUCCESS] Downloaded 6Fjs9ZU1f1w
[WARN] No YouTube transcript found for 6Fjs9ZU1f1w, falling back to Whisper...




Detected language: Hindi


100%|██████████| 4126/4126 [00:14<00:00, 276.48frames/s]
ffmpeg version 7.1.1 Copyright (c) 2000-2025 the FFmpeg developers
  built with Apple clang version 17.0.0 (clang-1700.0.13.3)
  configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/7.1.1_3 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libharfbuzz --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable

[INFO] Whisper transcript used for 6Fjs9ZU1f1w
Extracting audio: ffmpeg -y -i "/Users/lyfan/Desktop/data/youtube_videos/short/6Fjs9ZU1f1w.mp4" -vn -acodec pcm_s16le -ar 44100 -ac 2 "/Users/lyfan/Desktop/data/processed_results/6Fjs9ZU1f1w/audio.wav"
Running demucs command: demucs --two-stems=vocals "/Users/lyfan/Desktop/data/processed_results/6Fjs9ZU1f1w/audio.wav" -o "/Users/lyfan/Desktop/data/processed_results/6Fjs9ZU1f1w"
[1mImportant: the default model was recently changed to `htdemucs`[0m the latest Hybrid Transformer Demucs model. In some cases, this model can actually perform worse than previous models. To get back the old default model use `-n mdx_extra_q`.
Selected model is a bag of 1 models. You will see that many progress bars per track.
Separated tracks will be stored in /Users/lyfan/Desktop/data/processed_results/6Fjs9ZU1f1w/htdemucs
Separating track /Users/lyfan/Desktop/data/processed_results/6Fjs9ZU1f1w/audio.wav


100%|██████████████████████████████████████████████████████████████████████████| 46.8/46.8 [00:17<00:00,  2.63seconds/s]


frame_0000.jpg: The man gestures with his hand in a thoughtful pose
frame_0020.jpg: A person smiles while text on the image discusses cookies and tracking
frame_0040.jpg: The individual taps their forehead, suggesting an idea or insight
Saved captions to /Users/lyfan/Desktop/data/processed_results/6Fjs9ZU1f1w/videollama_captions.json

=== Processing 9/20: SSVrGnFsgbs ===
[INFO] Downloading video: SSVrGnFsgbs
[DOWNLOADING LOW QUALITY] SSVrGnFsgbs...
[SUCCESS] Downloaded SSVrGnFsgbs
[INFO] YouTube transcript used for SSVrGnFsgbs
Extracting audio: ffmpeg -y -i "/Users/lyfan/Desktop/data/youtube_videos/short/SSVrGnFsgbs.mp4" -vn -acodec pcm_s16le -ar 44100 -ac 2 "/Users/lyfan/Desktop/data/processed_results/SSVrGnFsgbs/audio.wav"
Running demucs command: demucs --two-stems=vocals "/Users/lyfan/Desktop/data/processed_results/SSVrGnFsgbs/audio.wav" -o "/Users/lyfan/Desktop/data/processed_results/SSVrGnFsgbs"


ffmpeg version 7.1.1 Copyright (c) 2000-2025 the FFmpeg developers
  built with Apple clang version 17.0.0 (clang-1700.0.13.3)
  configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/7.1.1_3 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libharfbuzz --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex

[1mImportant: the default model was recently changed to `htdemucs`[0m the latest Hybrid Transformer Demucs model. In some cases, this model can actually perform worse than previous models. To get back the old default model use `-n mdx_extra_q`.
Selected model is a bag of 1 models. You will see that many progress bars per track.
Separated tracks will be stored in /Users/lyfan/Desktop/data/processed_results/SSVrGnFsgbs/htdemucs
Separating track /Users/lyfan/Desktop/data/processed_results/SSVrGnFsgbs/audio.wav


100%|████████████████████████████████████████████████████████████████████████| 64.35/64.35 [00:24<00:00,  2.64seconds/s]


frame_0000.jpg: ["A person discusses a water-powered engine.", "The speaker explains how water is broken apart to power the engine.", "The concept releases only water vapor into the air."]
Saved captions to /Users/lyfan/Desktop/data/processed_results/SSVrGnFsgbs/videollama_captions.json

=== Processing 10/20: hB0p43Ezx0M ===
[INFO] Downloading video: hB0p43Ezx0M
[DOWNLOADING LOW QUALITY] hB0p43Ezx0M...
[SUCCESS] Downloaded hB0p43Ezx0M
[INFO] YouTube transcript used for hB0p43Ezx0M
Extracting audio: ffmpeg -y -i "/Users/lyfan/Desktop/data/youtube_videos/short/hB0p43Ezx0M.mp4" -vn -acodec pcm_s16le -ar 44100 -ac 2 "/Users/lyfan/Desktop/data/processed_results/hB0p43Ezx0M/audio.wav"
Running demucs command: demucs --two-stems=vocals "/Users/lyfan/Desktop/data/processed_results/hB0p43Ezx0M/audio.wav" -o "/Users/lyfan/Desktop/data/processed_results/hB0p43Ezx0M"


ffmpeg version 7.1.1 Copyright (c) 2000-2025 the FFmpeg developers
  built with Apple clang version 17.0.0 (clang-1700.0.13.3)
  configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/7.1.1_3 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libharfbuzz --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex

[1mImportant: the default model was recently changed to `htdemucs`[0m the latest Hybrid Transformer Demucs model. In some cases, this model can actually perform worse than previous models. To get back the old default model use `-n mdx_extra_q`.
Selected model is a bag of 1 models. You will see that many progress bars per track.
Separated tracks will be stored in /Users/lyfan/Desktop/data/processed_results/hB0p43Ezx0M/htdemucs
Separating track /Users/lyfan/Desktop/data/processed_results/hB0p43Ezx0M/audio.wav


100%|████████████████████████████████████████████████████████████████████████| 64.35/64.35 [00:24<00:00,  2.65seconds/s]


frame_0000.jpg: "Do 10% of developers do *nothing*?"
frame_0020.jpg: "A tweet highlights 'ghost engineers' who avoid making changes."
frame_0040.jpg: "Exploring the idea of being paged for doing minimal work."
Saved captions to /Users/lyfan/Desktop/data/processed_results/hB0p43Ezx0M/videollama_captions.json

=== Processing 11/20: d4JdPY4n0Qg ===
[INFO] Downloading video: d4JdPY4n0Qg
[DOWNLOADING LOW QUALITY] d4JdPY4n0Qg...




[SUCCESS] Downloaded d4JdPY4n0Qg
[WARN] No YouTube transcript found for d4JdPY4n0Qg, falling back to Whisper...




Detected language: Nynorsk


100%|██████████| 1879/1879 [00:05<00:00, 336.41frames/s]
ffmpeg version 7.1.1 Copyright (c) 2000-2025 the FFmpeg developers
  built with Apple clang version 17.0.0 (clang-1700.0.13.3)
  configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/7.1.1_3 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libharfbuzz --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable

[INFO] Whisper transcript used for d4JdPY4n0Qg
Extracting audio: ffmpeg -y -i "/Users/lyfan/Desktop/data/youtube_videos/short/d4JdPY4n0Qg.mp4" -vn -acodec pcm_s16le -ar 44100 -ac 2 "/Users/lyfan/Desktop/data/processed_results/d4JdPY4n0Qg/audio.wav"
Running demucs command: demucs --two-stems=vocals "/Users/lyfan/Desktop/data/processed_results/d4JdPY4n0Qg/audio.wav" -o "/Users/lyfan/Desktop/data/processed_results/d4JdPY4n0Qg"
[1mImportant: the default model was recently changed to `htdemucs`[0m the latest Hybrid Transformer Demucs model. In some cases, this model can actually perform worse than previous models. To get back the old default model use `-n mdx_extra_q`.
Selected model is a bag of 1 models. You will see that many progress bars per track.
Separated tracks will be stored in /Users/lyfan/Desktop/data/processed_results/d4JdPY4n0Qg/htdemucs
Separating track /Users/lyfan/Desktop/data/processed_results/d4JdPY4n0Qg/audio.wav


100%|██████████████████████████████████████████████████████████████████████████| 23.4/23.4 [00:09<00:00,  2.52seconds/s]


frame_0000.jpg: "A tal amount of blue paperclips are neatly arranged in rows on a white surface."
Saved captions to /Users/lyfan/Desktop/data/processed_results/d4JdPY4n0Qg/videollama_captions.json

=== Processing 12/20: wqntEFzNd1g ===
[INFO] Downloading video: wqntEFzNd1g


ERROR: [youtube] wqntEFzNd1g: Video unavailable. This video contains content from SME, who has blocked it in your country on copyright grounds


[SKIPPED] wqntEFzNd1g is unavailable.
[ABORT] Processing failed for wqntEFzNd1g. Reason:
Video download failed or file not found: /Users/lyfan/Desktop/data/youtube_videos/short/wqntEFzNd1g.mp4
Skipping this video and continuing with the next one.

=== Processing 13/20: bAqQeq2M-LM ===
[INFO] Downloading video: bAqQeq2M-LM
[DOWNLOADING LOW QUALITY] bAqQeq2M-LM...
[SUCCESS] Downloaded bAqQeq2M-LM
[WARN] No YouTube transcript found for bAqQeq2M-LM, falling back to Whisper...




Detected language: Nynorsk


100%|██████████| 1304/1304 [00:05<00:00, 236.39frames/s]
ffmpeg version 7.1.1 Copyright (c) 2000-2025 the FFmpeg developers
  built with Apple clang version 17.0.0 (clang-1700.0.13.3)
  configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/7.1.1_3 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libharfbuzz --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable

[INFO] Whisper transcript used for bAqQeq2M-LM
Extracting audio: ffmpeg -y -i "/Users/lyfan/Desktop/data/youtube_videos/short/bAqQeq2M-LM.mp4" -vn -acodec pcm_s16le -ar 44100 -ac 2 "/Users/lyfan/Desktop/data/processed_results/bAqQeq2M-LM/audio.wav"
Running demucs command: demucs --two-stems=vocals "/Users/lyfan/Desktop/data/processed_results/bAqQeq2M-LM/audio.wav" -o "/Users/lyfan/Desktop/data/processed_results/bAqQeq2M-LM"
[1mImportant: the default model was recently changed to `htdemucs`[0m the latest Hybrid Transformer Demucs model. In some cases, this model can actually perform worse than previous models. To get back the old default model use `-n mdx_extra_q`.
Selected model is a bag of 1 models. You will see that many progress bars per track.
Separated tracks will be stored in /Users/lyfan/Desktop/data/processed_results/bAqQeq2M-LM/htdemucs
Separating track /Users/lyfan/Desktop/data/processed_results/bAqQeq2M-LM/audio.wav


100%|██████████████████████████████████████████████| 17.549999999999997/17.549999999999997 [00:07<00:00,  2.42seconds/s]


frame_0000.jpg: "A creative lipstick art with emojis showcasing food and flowers."
Saved captions to /Users/lyfan/Desktop/data/processed_results/bAqQeq2M-LM/videollama_captions.json

Successfully processed 10 videos.
finally success: 110
