# **YouTube Video to Audio**
This Google Colab file automates the process of downloading YouTube videos as MP3 files and uploading them to an Amazon S3 bucket. The workflow involves the following steps:

*   **Extract Video ID**: The notebook extracts the video ID from a given YouTube URL.
*   **Download Video**: It downloads the audio from the YouTube video in the best available format.

*   **Convert to MP3**: The downloaded audio is then converted to MP3 format.

*   **Check for Duplicates**: Before downloading, it checks if the video already exists in the S3 bucket to avoid duplicates.
*   **Upload to S3**: The converted MP3 files are then uploaded to an Amazon S3 bucket.

<br>

**Instructions to Run in Google Colab:**




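*   **Add AWS Credentials:** Store your AWS access key and secret key as Colab secrets named `aws_access_key` and `aws_secret_key`; the notebook reads them with `userdata.get`.
*   **Set Bucket Names:** Update the variables BUCKET_NAME and CHUNKS_BUCKET_NAME in the notebook with your Amazon S3 bucket names where the MP3 files will be uploaded.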
*   **Provide YouTube URLs:** In the notebook, add the YouTube URLs (single videos, playlists, or channels) you want to download and convert to MP3 format to the `urls` list.



*   **Run the Notebook:** Execute each cell one by one by clicking Run or pressing Shift + Enter to download the YouTube videos, convert them to MP3 format, and upload them to your S3 bucket.

*  **Check for Duplicate Files:** The script will automatically check if the video has already been uploaded to the S3 bucket and skip downloading if a duplicate is found.


# Run the following cells

In [None]:
# Install yt-dlp straight from its GitHub repository, plus boto3 for S3 access.
# The -q flag keeps pip's output short.
!pip install git+https://github.com/yt-dlp/yt-dlp.git -q
!pip install boto3 -q

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for yt-dlp (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.2/139.2 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m80.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.2/83.2 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import json
import os
from yt_dlp import YoutubeDL
import boto3

In [None]:
from google.colab import userdata

# Read the AWS credentials from Colab's `userdata` store under the names
# "aws_access_key" and "aws_secret_key", so no key material is hardcoded here.
AWS_ACCESS_KEY = userdata.get("aws_access_key")
AWS_SECRET_KEY = userdata.get("aws_secret_key")

# Module-level S3 client; check_s3_for_file() and uploader_to_s3() use it.
s3 = boto3.client('s3', aws_access_key_id=AWS_ACCESS_KEY, aws_secret_access_key=AWS_SECRET_KEY)

In [None]:
# Bucket that uploader_to_s3() writes the converted MP3 files to.
BUCKET_NAME = 'yt-dl-mp3'
# Second bucket consulted (under the "chunks/" prefix) when checking for duplicates.
CHUNKS_BUCKET_NAME = 'yt-chunk-mp3'

def check_s3_for_file(file_key, bucket_name):
    """Report whether an object whose key equals `file_key` exists in `bucket_name`.

    The bucket is listed with `file_key` as the prefix and the returned keys
    are compared for an exact match. Any error raised while listing or
    comparing is printed and treated as "not found".

    Returns:
        True when an exact key match is listed, otherwise False.
    """
    try:
        listing = s3.list_objects_v2(Bucket=bucket_name, Prefix=file_key)
        listed_objects = listing.get('Contents', [])
        # Objects that merely start with the key do not count; the key must be equal.
        if any(entry['Key'] == file_key for entry in listed_objects):
            print(f"File {file_key} already exists in S3 (bucket: {bucket_name}).")
            return True
    except Exception as err:
        print(f"Error checking S3 for file {file_key} in bucket {bucket_name}: {err}")
    return False


def uploader_to_s3(file_path):
    """Upload a local file to the BUCKET_NAME S3 bucket.

    The object key is the file's path relative to the 'downloads' directory,
    so 'downloads/<id>.mp3' is stored as '<id>.mp3'. No duplicate check is
    performed here; callers that want one must do it before downloading.

    Args:
        file_path: Path of the local file to upload.

    Returns:
        True if the upload succeeded, False if it failed. Failures are
        printed rather than raised.
    """
    try:
        s3_key = os.path.relpath(file_path, start='downloads')

        s3.upload_file(file_path, BUCKET_NAME, s3_key)
        print(f"Uploaded to S3: s3://{BUCKET_NAME}/{s3_key}")
        return True

    except Exception as e:
        print(f"Failed to upload {file_path} to S3: {e}")
        return False

def on_complete(d):
    """Postprocessor hook: upload a finished MP3 to S3 and delete the local copy.

    Registered through the 'postprocessor_hooks' option. Only notifications
    with status 'finished' whose info_dict 'filepath' ends in '.mp3' trigger
    an upload; everything else is ignored.

    Args:
        d: Status dictionary passed by yt-dlp. The keys read here are
            'status', 'info_dict' (for its 'filepath') and 'error'.
    """
    status = d.get('status')
    if status == 'finished':
        filename = (d.get('info_dict') or {}).get('filepath')
        if filename is not None and filename.endswith(".mp3"):
            print(f"Saved: {filename}")
            uploader_to_s3(filename)
            # The local copy is removed whether or not the upload succeeded
            # (uploader_to_s3 reports failures itself).
            try:
                os.remove(filename)
            except FileNotFoundError:
                # Already gone (e.g. a repeated notification for the same file);
                # raising from a hook would abort the surrounding download.
                pass
    elif status == 'error':
        print("Download failed")
        # Not every status dict carries an 'error' entry, so avoid a KeyError.
        print(d.get('error', 'unknown error'))

# yt-dlp options for downloading audio and converting it to MP3.
# NOTE(review): download_video() builds its own, near-identical dict, so edits
# made here do not change the per-video downloads.
ydl_opts = {
    'abort_on_unavailable_fragments': True,
    'format': 'bestaudio/best',  # prefer an audio-only stream, otherwise the best available format
    'postprocessors': [{
        'key': 'FFmpegExtractAudio',  # convert the download to audio via ffmpeg
        'preferredcodec': 'mp3',
        'preferredquality': '128',
    }],
    # Extra arguments handed to the postprocessor; for ffmpeg, -ar is the
    # sample rate and -ac the channel count (16000 Hz, 1 channel).
    'postprocessor_args': [
        '-ar', '16000',
        '-ac', '1'
    ],
    'prefer_ffmpeg': True,
    'keepvideo': False,
    'outtmpl': 'downloads/%(id)s.%(ext)s',  # output goes to downloads/<video id>.<extension>
    'postprocessor_hooks': [on_complete]  # on_complete uploads the finished MP3 and deletes it locally
}

# Sources to process. A later cell accepts playlist URLs, channel URLs
# (https://www.youtube.com/@...) and single watch URLs; anything else is skipped.
urls = [
    "https://www.youtube.com/@kollol"
]

In [None]:
def list_videos(url):
    """Fetch metadata for `url` without downloading any media.

    Uses a quiet YoutubeDL instance with flat extraction enabled and returns
    whatever `extract_info` produces for the URL.
    """
    metadata_only_opts = dict(
        quiet=True,
        extract_flat=True,  # metadata only; entries are not resolved into downloads
        force_generic_extractor=False,
    )
    with YoutubeDL(metadata_only_opts) as lister:
        return lister.extract_info(url, download=False)

In [None]:
# Expand each entry of `urls` (channel, playlist or single video) into the
# individual video URLs that the download step works through.
# list_videos() creates its own YoutubeDL instance, so no downloader needs to
# be opened around this loop.
accepted_prefixes = (
    'https://www.youtube.com/playlist?list=',
    'https://www.youtube.com/@',
    'https://www.youtube.com/watch?v=',
)
actual_urls = []

for url in urls:
    print(f"Received URL: {url}")

    if not url.startswith(accepted_prefixes):
        print(f"Skipping invalid URL: {url}")
        continue

    # A channel URL is pointed at the channel's "videos" tab before listing.
    if url.startswith('https://www.youtube.com/@') and not url.endswith('/videos'):
        url = url + '/videos'

    videos = list_videos(url)
    print("Video metadata downloaded")
    entries = videos.get('entries')
    if entries is not None:
        print(f"Found {len(entries)} videos in the URL.")
        for video in entries:
            # Skip entries that are empty or carry no URL instead of crashing the cell.
            video_url = video.get('url') if video else None
            if video_url:
                actual_urls.append(video_url)
    else:
        # No 'entries' in the metadata: treat the URL itself as the video to download.
        actual_urls.append(url)

Received URL: https://www.youtube.com/@kollol
Video metadata downloaded
Found 1005 videos in the URL.


In [None]:
# Number of individual video URLs collected for download.
len(actual_urls)

1005

In [None]:
from concurrent.futures import ThreadPoolExecutor
import re

def extract_video_id(url):
    """Extract the 11-character video ID from a YouTube URL.

    The ID is taken from the first `v=` parameter or path segment that
    consists of exactly 11 ID characters (letters, digits, '_' or '-').

    Args:
        url: The URL to inspect. None or an empty string is accepted.

    Returns:
        The video ID, or None when the URL contains no such ID.
    """
    if not url:
        return None
    # The trailing negative lookahead rejects longer runs of ID characters, so
    # the first 11 characters of e.g. a long channel name are not mistaken for an ID.
    match = re.search(r"(?:v=|\/)([0-9A-Za-z_-]{11})(?![0-9A-Za-z_-])", url)
    return match.group(1) if match else None

def check_for_duplicates(video_id):
    """Check both S3 buckets for material that already exists for `video_id`.

    A video counts as a duplicate when either
      * BUCKET_NAME holds an object whose key is exactly '<video_id>.mp3', or
      * CHUNKS_BUCKET_NAME holds at least one object whose key starts with
        'chunks/<video_id>'.

    Errors while listing the chunks bucket are printed and treated as
    "no duplicate", matching the best-effort behaviour of check_s3_for_file.

    Returns:
        True if a duplicate was found, otherwise False.
    """
    mp3_key = f"{video_id}.mp3"
    if check_s3_for_file(mp3_key, BUCKET_NAME):
        return True

    # 'chunks/<video_id>' is a key prefix, not a full key, so an exact-key
    # comparison would never match; ask S3 whether anything exists under it.
    chunks_prefix = f"chunks/{video_id}"
    try:
        response = s3.list_objects_v2(
            Bucket=CHUNKS_BUCKET_NAME,
            Prefix=chunks_prefix,
            MaxKeys=1,  # one hit is enough to know chunks exist
        )
    except Exception as e:
        print(f"Error checking S3 for prefix {chunks_prefix} in bucket {CHUNKS_BUCKET_NAME}: {e}")
        return False

    if response.get('Contents'):
        print(f"Chunks for {video_id} already exist in S3 (bucket: {CHUNKS_BUCKET_NAME}).")
        return True
    return False

def _build_download_options(video_id):
    """Return the yt-dlp options for fetching one video's audio as MP3.

    The output template is pinned to 'downloads/<video_id>.<ext>' and
    on_complete is registered as the postprocessor hook.
    """
    extract_audio_step = {
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'mp3',
        'preferredquality': '128',
    }
    # Passed to the postprocessor: -ar 16000 (sample rate) and -ac 1 (channels).
    resample_args = ['-ar', '16000', '-ac', '1']
    return {
        'abort_on_unavailable_fragments': True,
        'format': 'bestaudio/best',
        'postprocessors': [extract_audio_step],
        'postprocessor_args': resample_args,
        'prefer_ffmpeg': True,
        'keepvideo': False,
        'outtmpl': f'downloads/{video_id}.%(ext)s',
        'postprocessor_hooks': [on_complete],
    }


def download_video(url):
    """Download one video's audio unless it is already present in S3.

    URLs without a recognisable video ID are reported and skipped, as are
    videos that check_for_duplicates() reports as already stored. Errors
    raised by the download itself are printed instead of propagated.
    """
    video_id = extract_video_id(url)
    if not video_id:
        print(f"Invalid URL: {url}")
        return

    print(f"Checking for duplicates: {video_id}")
    already_stored = check_for_duplicates(video_id)
    if already_stored:
        print(f"Skipping download: {video_id} already exists in S3.")
        return

    download_options = _build_download_options(video_id)
    with YoutubeDL(download_options) as downloader:
        try:
            print(f"Downloading: {url}")
            downloader.download([url])
        except Exception as err:
            print(f"Error downloading {url}: {err}")

# Run downloads in parallel. Leaving the `with` block waits for every submitted
# task, so all downloads have finished by the time it exits.
with ThreadPoolExecutor(max_workers=4) as executor:
    futures = [executor.submit(download_video, url) for url in actual_urls]

# Surface exceptions that escaped download_video(). Each future is checked on
# its own so that one failure does not hide the ones after it.
failed_urls = []
for url, future in zip(actual_urls, futures):
    try:
        future.result()
    except Exception as e:
        failed_urls.append(url)
        print(f"Unexpected error while processing {url}: {e}")

print(f"Finished {len(futures)} tasks; {len(failed_urls)} raised an unexpected error.")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[youtube] 2rQeONhd5SE: Downloading webpage
[youtube] 2rQeONhd5SE: Downloading ios player API JSON
[youtube] 2rQeONhd5SE: Downloading mweb player API JSON
[youtube] 2rQeONhd5SE: Downloading m3u8 information
[info] 2rQeONhd5SE: Downloading 1 format(s): 251
[download] Destination: downloads/2rQeONhd5SE.webm
[download]  39.4% of   35.24MiB at    9.64MiB/s ETA 00:02Deleting original file downloads/bFVy8m7iDQw.webm (pass -k to keep)
Saved: downloads/bFVy8m7iDQw.mp3
[download] 100% of   35.24MiB in 00:00:03 at 9.34MiB/s   
[ExtractAudio] Destination: downloads/2rQeONhd5SE.mp3
Uploaded to S3: s3://yt-dl-mp3/bFVy8m7iDQw.mp3
Checking for duplicates: 5rLoe0CFn3c
Downloading: https://www.youtube.com/watch?v=5rLoe0CFn3c
[youtube] Extracting URL: https://www.youtube.com/watch?v=5rLoe0CFn3c
[youtube] 5rLoe0CFn3c: Downloading webpage
[youtube] 5rLoe0CFn3c: Downloading ios player API JSON
[youtube] 5rLoe0CFn3c: Downloading mweb player API