In [None]:
!pip install yt-dlp




import yt_dlp
import os

def download_video(video_id, download_root='ExpandedVideos'):
    """Download one YouTube video as MP4 into ``<download_root>/<video_id>/``.

    The file is written as ``<video_id>.<ext>`` inside a sub-folder named after
    the video ID. Progress is printed on a single, carriage-return-updated line.

    Args:
        video_id: YouTube video ID (the value of the ``v=`` query parameter).
        download_root: Folder that holds one sub-folder per video. Created if
            it does not exist.

    Returns:
        bool: True when yt-dlp reports success, False when the download raised
        or yt-dlp returned a non-zero status code.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(download_root, exist_ok=True)

    video_url = f"https://www.youtube.com/watch?v={video_id}"
    # Create a subfolder for each video using the video ID
    output_template = os.path.join(download_root, video_id, f"{video_id}.%(ext)s")

    def progress_hook(d):
        """Print download progress and a completion line for this video."""
        if d['status'] == 'downloading':
            downloaded = d.get('_percent_str', '0%').strip()
            print(f"Downloading {video_id}: {downloaded}", end='\r')
        elif d['status'] == 'finished':
            print(f"\nFinished downloading {video_id}")

    ydl_opts = {
        'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/mp4',
        'outtmpl': output_template,  # This saves into a folder named after the video ID
        'noplaylist': True,
        'quiet': True,
        'retries': 3,
        'ignoreerrors': True,
        'merge_output_format': 'mp4',
        'progress_hooks': [progress_hook],
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        try:
            print(f"Starting download for {video_url}")
            retcode = ydl.download([video_url])
        except Exception as e:
            print(f"Failed to download {video_url}: {e}")
            return False

    # With 'ignoreerrors': True yt-dlp does not raise on a failed download;
    # it signals the failure through a non-zero return code instead, so the
    # except branch above would never report it.
    if retcode:
        print(f"Failed to download {video_url}: yt-dlp returned code {retcode}")
        return False
    return True

import pandas as pd

# Read the ground-truth table that lists the videos to fetch
df = pd.read_csv("ground_truth.csv")

# Distinct, non-null video IDs in first-seen order
video_ids = pd.unique(df["Video ID"].dropna()).tolist()

# Fetch every listed video, one at a time
for vid_id in video_ids:
    download_video(vid_id)


In [None]:
import os

# Your list of original video IDs.
# NOTE: this is a placeholder; while it is empty nothing can be reported as
# missing and the log file below is written empty.
urls = []
download_root = 'ExpandedVideos'
failed_log = 'missing_or_failed_videos.txt'
# An .mp4 must be larger than this many bytes to count as a complete download.
min_valid_size = 1 * 1024 * 1024

# Get list of subfolders that actually exist (successful downloads).
# os.listdir raises FileNotFoundError when the root was never created (for
# example when no download has run yet), so treat a missing root as "no folders".
if os.path.isdir(download_root):
    existing_folders = set(os.listdir(download_root))
else:
    existing_folders = set()

# A folder only counts as successful when it holds a large-enough <id>.mp4
successful_ids = set()

for folder in existing_folders:
    video_path = os.path.join(download_root, folder, f"{folder}.mp4")
    if os.path.exists(video_path) and os.path.getsize(video_path) > min_valid_size:
        successful_ids.add(folder)

# Find which video IDs are missing or incomplete
missing_or_failed = [vid for vid in urls if vid not in successful_ids]

# Output failed ones to a text file, one ID per line
with open(failed_log, 'w') as f:
    for vid in missing_or_failed:
        f.write(vid + '\n')

# Print summary
print(f"Total videos: {len(urls)}")
print(f"Successful downloads: {len(successful_ids)}")
print(f"Failed or missing: {len(missing_or_failed)}")
print(f"Failed video IDs written to: {failed_log}")


In [None]:
from pytubefix import YouTube
import os

def download_pytube_video(video_id, root_folder='ExpandedVideos', min_size_bytes=1 * 1024 * 1024):
    """Download one YouTube video with pytubefix into ``<root_folder>/<video_id>/``.

    A progressive 720p MP4 stream is preferred; if none exists, the
    highest-resolution progressive MP4 stream is used instead.

    Args:
        video_id: YouTube video ID (the value of the ``v=`` query parameter).
        root_folder: Folder that holds one sub-folder per video.
        min_size_bytes: An existing ``<video_id>.mp4`` is treated as already
            downloaded only when it is larger than this many bytes. Smaller
            files are assumed to be incomplete and are downloaded again.

    Returns:
        bool: True if the video is already on disk or was downloaded, False if
        no suitable stream exists or any error occurred.
    """
    video_url = f"https://www.youtube.com/watch?v={video_id}"
    target_folder = os.path.join(root_folder, video_id)
    output_path = os.path.join(target_folder, f"{video_id}.mp4")

    try:
        # Check the disk before touching the network: a video that is already
        # present must not be reported as failed because YouTube is unreachable,
        # and a truncated file must not be mistaken for a finished download.
        if os.path.exists(output_path) and os.path.getsize(output_path) > min_size_bytes:
            print(f"Already downloaded: {video_id}")
            return True

        print(f"Starting download for {video_url}")
        yt = YouTube(video_url)

        stream = yt.streams.filter(progressive=True, file_extension='mp4', res='720p').first()
        if not stream:
            # fallback: highest progressive stream
            stream = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first()

        if not stream:
            print(f"No downloadable stream found for {video_id}")
            return False

        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(target_folder, exist_ok=True)

        print(f"Downloading to {output_path}")
        stream.download(output_path=target_folder, filename=f"{video_id}.mp4")
        print(f"Finished downloading {video_id}")
        return True

    except Exception as e:
        # Best effort: report the problem and let the caller log the failure.
        print(f"Error downloading {video_id}: {e}")
        return False

def load_video_ids(file_path='missing_or_failed_videos.txt'):
    """Read video IDs from a text file, one per line.

    Surrounding whitespace is stripped from each line and lines that are
    empty after stripping are dropped.

    Args:
        file_path: Path of the text file to read.

    Returns:
        list[str]: The IDs in file order.
    """
    with open(file_path, 'r') as handle:
        # filter(None, ...) discards the empty strings left by blank lines
        return list(filter(None, map(str.strip, handle)))

def save_failed_video(video_id, file_path='failed_downloads.txt'):
    """Append one video ID, followed by a newline, to a log file.

    Args:
        video_id: The ID to record.
        file_path: Log file to append to; it is created if missing.
    """
    record = video_id + '\n'
    with open(file_path, 'a') as log_file:
        log_file.writelines([record])

# Retry every video listed in missing_or_failed_videos.txt with pytubefix
video_ids = load_video_ids()
failed_log = 'failed_downloads.txt'

# Start each run with a fresh failure log, because save_failed_video appends
if os.path.exists(failed_log):
    os.remove(failed_log)

for vid in video_ids:
    success = download_pytube_video(vid)
    if not success:
        save_failed_video(vid, failed_log)

# The check mark was previously stored as mojibake (UTF-8 bytes read as cp1252)
print("\n✅ Download complete.")
print(f"Any failed videos were written to: {failed_log}")


In [None]:
import os
from moviepy.editor import VideoFileClip

def load_video_ids(file_path='missing_or_failed_videos.txt'):
    """Return the non-blank lines of a text file as a list of video IDs.

    Each line is stripped of surrounding whitespace; lines that end up empty
    are skipped. File order is preserved.

    NOTE: a function with this same name is also defined in an earlier cell of
    this notebook; this definition replaces it once the cell runs.

    Args:
        file_path: Path of the text file to read.

    Returns:
        list[str]: The stripped, non-empty lines.
    """
    collected = []
    with open(file_path, 'r') as source:
        for raw_line in source:
            cleaned = raw_line.strip()
            if cleaned:
                collected.append(cleaned)
    return collected

def extract_audio_for_listed_videos(video_ids, root_folder='ExpandedVideos'):
    """Write ``audio.mp3`` next to ``<video_id>.mp4`` for each listed video.

    For every ID the function looks in ``<root_folder>/<video_id>/``. It skips
    the ID, with a message, when the folder or the .mp4 is missing or when
    ``audio.mp3`` already exists. Failures on one video are reported and do
    not stop the loop.

    Args:
        video_ids: Iterable of video IDs to process.
        root_folder: Folder that holds one sub-folder per video.

    Returns:
        None
    """
    for video_id in video_ids:
        subfolder = os.path.join(root_folder, video_id)
        if not os.path.isdir(subfolder):
            print(f"Folder not found: {subfolder}")
            continue

        mp4_file = os.path.join(subfolder, f"{video_id}.mp4")
        if not os.path.exists(mp4_file):
            print(f"Video not found: {mp4_file}")
            continue

        audio_output_path = os.path.join(subfolder, 'audio.mp3')
        if os.path.exists(audio_output_path):
            print(f"Already extracted: {audio_output_path}")
            continue

        clip = None
        extracted = False
        try:
            print(f"Extracting audio from {mp4_file}...")
            clip = VideoFileClip(mp4_file)
            # Guard against a missing audio attribute so the failure message is
            # explicit instead of an AttributeError on None.
            if clip.audio is None:
                raise ValueError("video has no audio track")
            clip.audio.write_audiofile(audio_output_path)
            extracted = True
            print(f"Saved audio to {audio_output_path}")
        except Exception as e:
            print(f"Failed to extract audio for {video_id}: {e}")
        finally:
            # Always release the clip, even when extraction failed part-way.
            if clip is not None:
                try:
                    clip.close()
                except Exception as close_error:
                    print(f"Failed to close {mp4_file}: {close_error}")
            # audio.mp3 did not exist before this attempt, so anything present
            # after a failure is a partial file. Remove it; otherwise the next
            # run would report "Already extracted" and never retry.
            if not extracted and os.path.exists(audio_output_path):
                try:
                    os.remove(audio_output_path)
                except OSError as cleanup_error:
                    print(f"Could not remove partial audio {audio_output_path}: {cleanup_error}")

# Read the IDs from the missing/failed list, then extract audio for each one
video_ids_to_process = load_video_ids(file_path='missing_or_failed_videos.txt')
extract_audio_for_listed_videos(video_ids=video_ids_to_process)


In [None]:
import os
import shutil

# List of video IDs to delete
video_ids_to_delete = []

root_folder = 'ExpandedVideos'


def resolve_video_folder(root, vid_id):
    """Return the folder path for ``vid_id`` if it is a direct child of ``root``.

    Guards the recursive delete below. An empty ID makes
    ``os.path.join(root, '')`` point at the root itself, and IDs such as
    ``'..'``, ``'a/b'`` or an absolute path resolve outside the per-video
    folder. Any of those would make ``shutil.rmtree`` remove far more than a
    single video.

    Args:
        root: Folder that holds one sub-folder per video.
        vid_id: Candidate video ID.

    Returns:
        str | None: ``os.path.join(root, vid_id)`` when the ID is a non-blank
        string naming a direct child of ``root``; otherwise None.
    """
    if not isinstance(vid_id, str) or not vid_id.strip():
        return None
    root_abs = os.path.abspath(root)
    candidate_abs = os.path.abspath(os.path.join(root, vid_id))
    # A direct child has the root as its immediate parent directory.
    if os.path.dirname(candidate_abs) != root_abs:
        return None
    return os.path.join(root, vid_id)


# Delete each matching folder
for vid_id in video_ids_to_delete:
    folder_path = resolve_video_folder(root_folder, vid_id)
    if folder_path is None:
        print(f"Skipping unsafe or empty video ID: {vid_id!r}")
        continue

    if os.path.isdir(folder_path):
        try:
            shutil.rmtree(folder_path)
            print(f"Deleted folder: {folder_path}")
        except Exception as e:
            # Best effort: report the problem and move on to the next ID.
            print(f"Failed to delete {folder_path}: {e}")
    else:
        print(f"Folder not found or already deleted: {folder_path}")
