In [None]:
import os
import re
from googleapiclient.discovery import build
from datetime import timedelta
# yt-dlp must already be installed in the environment (e.g. `%pip install yt-dlp`); the next line only imports it.
import yt_dlp


# The API key is a credential and must not be stored in the notebook itself.
# Read it from the YOUTUBE_API_KEY environment variable; when that is unset,
# prompt for it interactively so the value never lands in the saved file.
# NOTE(review): the key that used to be hard-coded here has been exposed and
# should be revoked and replaced.
API_KEY = os.environ.get("YOUTUBE_API_KEY")
if not API_KEY:
    from getpass import getpass

    API_KEY = getpass("YouTube Data API key: ")

# Channel passed to get_video_ids() by main().
CHANNEL_ID = "UC_HK0fs_YyxhvkkiPoL_w6A"

# Initialize YouTube API
youtube = build("youtube", "v3", developerKey=API_KEY)

# Function to get video IDs from a YouTube channel
def get_video_ids(channel_id):
    """Collect the IDs of every video the search endpoint returns for a channel.

    Pages through ``youtube.search().list`` (restricted to ``type="video"``,
    50 results per page) until a response carries no ``nextPageToken``.

    Args:
        channel_id: Channel identifier passed as ``channelId`` to the search call.

    Returns:
        list: The ``videoId`` of each returned item, in response order.
    """
    collected = []
    page_token = None
    has_more = True

    while has_more:
        # One search request per page; the very first request carries no token.
        request = youtube.search().list(
            part="id",
            channelId=channel_id,
            maxResults=50,
            pageToken=page_token,
            type="video",
        )
        page = request.execute()

        # Pull the video ID out of every search hit on this page.
        collected.extend(entry["id"]["videoId"] for entry in page.get("items", []))

        # A missing/empty token means this was the last page.
        page_token = page.get("nextPageToken")
        has_more = bool(page_token)

    return collected

# Function to convert ISO 8601 duration to seconds
def parse_duration(duration):
    """Convert an ISO 8601 duration string to a whole number of seconds.

    Accepts the ``P[nD][T[nH][nM][nS]]`` forms, for example ``"PT4M13S"``,
    ``"PT1H"`` or ``"P1DT2H3M4S"``. The day component matters: without it a
    duration of 24 hours or more would not match and would be reported as 0.

    Args:
        duration: Duration string such as ``"PT5M30S"``.

    Returns:
        int: Total seconds, or 0 when the string is not in a recognised form.
    """
    match = re.fullmatch(
        r"P(?:(\d+)D)?(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?)?",
        duration,
    )
    if not match:
        return 0

    # Absent components come back as None; treat them as zero.
    days, hours, minutes, seconds = (
        int(part) if part else 0 for part in match.groups()
    )
    return ((days * 24 + hours) * 60 + minutes) * 60 + seconds

# Function to filter videos longer than 5 minutes
def get_video_durations(video_ids, min_seconds=300):
    """Return watch URLs for the videos whose duration exceeds a threshold.

    Video details are requested from ``youtube.videos().list`` in batches of
    50 IDs, and each item's ``contentDetails.duration`` is converted to
    seconds with ``parse_duration``.

    Args:
        video_ids: List of video ID strings to inspect.
        min_seconds: A video is kept only when its duration is strictly
            greater than this many seconds. Defaults to 300 (5 minutes).

    Returns:
        list: ``https://www.youtube.com/watch?v=<id>`` URLs of the videos that
        passed the filter, in response order.
    """
    long_videos = []
    for start in range(0, len(video_ids), 50):
        batch = video_ids[start:start + 50]

        # Fetch video details for this batch in a single request.
        response = youtube.videos().list(
            part="contentDetails",
            id=",".join(batch)
        ).execute()

        for item in response.get("items", []):
            # Skip entries that carry no duration instead of raising KeyError.
            duration = item.get("contentDetails", {}).get("duration")
            if not duration:
                continue

            # Convert ISO 8601 duration to seconds and apply the threshold.
            if parse_duration(duration) > min_seconds:
                long_videos.append(f"https://www.youtube.com/watch?v={item['id']}")

    return long_videos

# Function to download subtitles
def download_subtitles(video_urls, download_folder):
    """Fetch English subtitle files for each URL into ``download_folder``.

    The folder is created when missing. A single ``yt_dlp.YoutubeDL`` instance
    is reused for all URLs; a failure on one URL is printed and the loop moves
    on to the next.

    Args:
        video_urls: Iterable of video URLs to process.
        download_folder: Directory the subtitle files are written to, named
            after the video ID.

    Returns:
        None
    """
    os.makedirs(download_folder, exist_ok=True)

    # Subtitle-only configuration: both uploaded and auto-generated English
    # tracks are requested, and the video stream itself is skipped.
    subtitle_only_opts = dict(
        writeautomaticsub=True,
        writesubtitles=True,
        subtitleslangs=['en'],
        skip_download=True,
        outtmpl=f'{download_folder}/%(id)s.%(ext)s',
    )

    with yt_dlp.YoutubeDL(subtitle_only_opts) as downloader:
        for video_url in video_urls:
            try:
                print(f"Downloading subtitles for {video_url}...")
                downloader.download([video_url])
            except Exception as err:
                # Best effort: report the failure and keep going.
                print(f"Error downloading subtitles for {video_url}: {err}")

# Main flow
def main(channel_id=None, download_folder="/content/drive/MyDrive/Downloads/subtitles"):
    """Run the whole pipeline: list videos, keep the long ones, fetch subtitles.

    Args:
        channel_id: Channel to scan. When ``None`` the module-level
            ``CHANNEL_ID`` is used.
        download_folder: Directory the subtitle files are saved to.

    Returns:
        None
    """
    if channel_id is None:
        channel_id = CHANNEL_ID

    # Get video IDs from the channel
    video_ids = get_video_ids(channel_id)
    print(f"Found {len(video_ids)} videos on the channel.")

    # Keep only the videos longer than 5 minutes
    long_videos = get_video_durations(video_ids)
    print(f"Found {len(long_videos)} videos longer than 5 minutes.")

    # Download subtitles for videos longer than 5 minutes
    download_subtitles(long_videos, download_folder)
    print(f"Subtitles downloaded to {download_folder}")

# Run the script: calling main() here executes the whole pipeline as soon as this cell runs.
main()


Found 12 videos on the channel.
Found 12 videos longer than 5 minutes.
Downloading subtitles for https://www.youtube.com/watch?v=NJAmQoYH75U...
[youtube] Extracting URL: https://www.youtube.com/watch?v=NJAmQoYH75U
[youtube] NJAmQoYH75U: Downloading webpage
[youtube] NJAmQoYH75U: Downloading ios player API JSON
[youtube] NJAmQoYH75U: Downloading mweb player API JSON
[youtube] NJAmQoYH75U: Downloading m3u8 information
[info] NJAmQoYH75U: Downloading subtitles: en
[info] NJAmQoYH75U: Downloading 1 format(s): 244+251
[info] Writing video subtitles to: /content/drive/MyDrive/Downloads/subtitles/NJAmQoYH75U.en.vtt
[download] Destination: /content/drive/MyDrive/Downloads/subtitles/NJAmQoYH75U.en.vtt
[download] 100% of  442.23KiB in 00:00:00 at 2.46MiB/s
Downloading subtitles for https://www.youtube.com/watch?v=3ORZlDTesng...
[youtube] Extracting URL: https://www.youtube.com/watch?v=3ORZlDTesng
[youtube] 3ORZlDTesng: Downloading webpage
[youtube] 3ORZlDTesng: Downloading ios player API JSON
[y

In [None]:
import os

# Directory where .vtt files are saved (read by the conversion below)
vtt_folder = "/content/drive/MyDrive/Downloads/subtitles"
# New folder to store .txt files
txt_folder = "/content/drive/MyDrive/Downloads/text_subtitles"
# Create the output folder up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(txt_folder, exist_ok=True)

# Function to convert .vtt to .txt
def _extract_vtt_text(lines):
    """Return the caption text lines of a WebVTT document, without markup."""
    import re

    text_lines = []
    # A WebVTT file opens with a "WEBVTT" header block that is not caption text.
    in_header = bool(lines) and lines[0].lstrip("\ufeff").startswith("WEBVTT")

    for raw in lines:
        stripped = raw.strip()

        if in_header:
            # The header ends at the first blank line (or at the first cue
            # timing line if the blank separator is missing).
            if not stripped or "-->" in stripped:
                in_header = False
            continue

        if "-->" in stripped:  # Cue timing line
            continue

        # Drop inline tags such as <00:00:01.000> and <c>...</c>.
        cleaned = re.sub(r"<[^>]*>", "", stripped).strip()
        if not cleaned:
            continue

        # Auto-generated captions repeat the previous line in the next cue;
        # keep only one copy of consecutive identical lines.
        if text_lines and text_lines[-1] == cleaned:
            continue

        text_lines.append(cleaned)

    return text_lines


def convert_vtt_to_txt(vtt_folder, txt_folder):
    """Convert every ``.vtt`` file in ``vtt_folder`` to a plain-text file.

    For each ``<name>.vtt`` a ``<name>.txt`` is written to ``txt_folder``
    containing only the caption text, one line per entry: the WebVTT header,
    cue timing lines, inline tags, blank lines and consecutive duplicate lines
    are removed. Files without the ``.vtt`` extension are ignored.

    Args:
        vtt_folder: Directory containing the ``.vtt`` subtitle files.
        txt_folder: Directory the ``.txt`` files are written to; created when
            it does not exist.

    Returns:
        None
    """
    os.makedirs(txt_folder, exist_ok=True)

    # Iterate over all .vtt files in the vtt_folder (sorted for stable output)
    for filename in sorted(os.listdir(vtt_folder)):
        if not filename.endswith(".vtt"):
            continue

        vtt_path = os.path.join(vtt_folder, filename)

        # utf-8-sig also accepts files that start with a byte-order mark.
        with open(vtt_path, "r", encoding="utf-8-sig") as file:
            lines = file.readlines()

        subtitle_text = _extract_vtt_text(lines)

        # Swap only the trailing extension; ".vtt" elsewhere in the name stays.
        txt_filename = os.path.splitext(filename)[0] + ".txt"
        txt_path = os.path.join(txt_folder, txt_filename)

        with open(txt_path, "w", encoding="utf-8") as file:
            file.write("\n".join(subtitle_text))

        print(f"Converted {filename} to {txt_filename}")

# Run the conversion on the vtt_folder/txt_folder defined above, then report the output location
convert_vtt_to_txt(vtt_folder, txt_folder)
print(f"Subtitles converted to text and saved in {txt_folder}")


Converted RnZbCnf6MMw.en.vtt to RnZbCnf6MMw.en.txt
Converted OOYljgw7P6M.en.vtt to OOYljgw7P6M.en.txt
Converted LRLGVq9OGHs.en.vtt to LRLGVq9OGHs.en.txt
Converted 8FhLkwvSHkA.en.vtt to 8FhLkwvSHkA.en.txt
Converted vnvQAqzcBAM.en.vtt to vnvQAqzcBAM.en.txt
Converted _UQeqPRyup0.en.vtt to _UQeqPRyup0.en.txt
Converted wSi9F00yfBc.en.vtt to wSi9F00yfBc.en.txt
Converted JGxLVCHiMGs.en.vtt to JGxLVCHiMGs.en.txt
Converted ysaZw1EPWx8.en.vtt to ysaZw1EPWx8.en.txt
Converted H3sB3SGj-5o.en.vtt to H3sB3SGj-5o.en.txt
Converted TbvgVnFR_tY.en.vtt to TbvgVnFR_tY.en.txt
Converted KMTtzM_n8Cg.en.vtt to KMTtzM_n8Cg.en.txt
Converted pVhVrP6ceko.en.vtt to pVhVrP6ceko.en.txt
Converted tleuOOLhyH8.en.vtt to tleuOOLhyH8.en.txt
Converted YNbY9Juix6g.en.vtt to YNbY9Juix6g.en.txt
Converted 7V-dlNgMkWM.en.vtt to 7V-dlNgMkWM.en.txt
Converted M90hQZEOMGc.en.vtt to M90hQZEOMGc.en.txt
Converted Ig7Ux3PP4As.en.vtt to Ig7Ux3PP4As.en.txt
Converted QsHs4XXebjw.en.vtt to QsHs4XXebjw.en.txt
Converted juwrQVV5Vgc.en.vtt to

In [None]:
import shutil
from google.colab import files

# Path to the folder to download
folder_to_download = "/content/drive/MyDrive/Downloads/subtitles"
# Path to the output zip file
zip_path = "/content/subtitles.zip"

# Zip the folder. make_archive appends ".zip" itself, so it is given the path
# without the extension. Only a trailing ".zip" is removed: str.replace would
# strip every occurrence in the path (e.g. a parent directory "backup.zip").
zip_suffix = ".zip"
archive_base = zip_path[:-len(zip_suffix)] if zip_path.endswith(zip_suffix) else zip_path
# Use the path make_archive reports so the download always targets the file
# that was actually written.
zip_path = shutil.make_archive(archive_base, 'zip', folder_to_download)

# Download the zip file
files.download(zip_path)

print(f"The folder has been zipped and is ready for download: {zip_path}")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

The folder has been zipped and is ready for download: /content/subtitles.zip
