In [None]:
import os
import subprocess

In [None]:
# Create the output folders. "videos" holds the downloaded clips and "images"
# holds one frame extracted from each clip.
os.makedirs("videos", exist_ok=True)
os.makedirs("images", exist_ok=True)

# Cookies exported to a file are passed to yt-dlp to bypass YouTube restrictions.
COOKIES_PATH = "youtube_cookies.txt"

# Without the cookie file nothing is downloaded; report it and do nothing else.
if not os.path.exists(COOKIES_PATH):
    print(f"Cookie file not found: {COOKIES_PATH}")
else:
    # For each row of df_sampled (defined outside this cell), download the
    # video and extract the frame at the midpoint of the annotated clip.
    for _, row in df_sampled.iterrows():
        video_id = row["video_id"]
        url = row["url"]
        start = row.get("start time")
        end = row.get("end time")

        # Skip rows whose start or end time is missing.
        if pd.isna(start) or pd.isna(end):
            print(f"Skipping {video_id}: missing start or end time")
            continue

        try:
            # Midpoint of the clip, in seconds, rounded to two decimals.
            mid = round((float(start) + float(end)) / 2, 2)
        except (TypeError, ValueError, OverflowError):
            # Only conversion failures are handled here, so that interrupting
            # the cell (KeyboardInterrupt) is not swallowed as a "skip".
            print(f"Skipping {video_id}: invalid start/end values")
            continue

        # Output locations for this video and its extracted frame.
        video_path = f"videos/{video_id}.mp4"
        image_path = f"images/{video_id}.jpg"

        # Download the video unless a previous run already produced it.
        if not os.path.exists(video_path):
            print(f"Downloading {video_id}...")
            try:
                # yt-dlp: best video+audio (or best single file), merged to mp4.
                subprocess.run([
                    "yt-dlp",
                    "--cookies", COOKIES_PATH,
                    "-f", "bv*+ba/best",
                    "--merge-output-format", "mp4",
                    "-o", video_path,
                    url
                ], check=True)
                print(f"Downloaded {video_id}")
            except subprocess.CalledProcessError:
                # A failed download leaves nothing to extract from; move on.
                print(f"Failed to download {video_id}")
                continue
        else:
            print(f"{video_id} already downloaded.")

        # Extract the frame unless a previous run already produced it.
        if not os.path.exists(image_path):
            print(f"Extracting frame at {mid}s for {video_id}...")
            try:
                # ffmpeg: seek to the midpoint and write a single JPEG frame.
                subprocess.run([
                    "ffmpeg", "-ss", str(mid), "-i", video_path,
                    "-frames:v", "1", "-q:v", "2", image_path
                ], check=True)
            except subprocess.CalledProcessError:
                print(f"Failed to extract frame for {video_id}")
            else:
                # A zero exit status does not guarantee an image was written
                # (e.g. when the seek point lies beyond the end of the video),
                # so confirm the file exists before reporting success.
                if os.path.exists(image_path):
                    print(f"Extracted frame for {video_id}")
                else:
                    print(f"Failed to extract frame for {video_id}: no frame at {mid}s")
        else:
            print(f"Frame for {video_id} already exists.")
