#### Dependencies

In [None]:
import yt_dlp as youtube_dl
import os
import pandas as pd
import multiprocessing
import os
from dotenv import load_dotenv

#### Global Constants

In [None]:
# Read configuration from the environment (optionally populated from a .env file).
load_dotenv()
DOWNLOAD_FOLDER = os.getenv("DOWNLOAD_FOLDER")
if not DOWNLOAD_FOLDER:
    # Fail fast with a clear message: an unset folder would otherwise only
    # surface later as a TypeError from os.path.join.
    raise RuntimeError(
        "DOWNLOAD_FOLDER is not set; define it in the environment or in a .env file"
    )
# Make sure the target folder exists before anything tries to list or write it.
os.makedirs(DOWNLOAD_FOLDER, exist_ok=True)
# Number of worker processes used for parallel downloads.
CPU_THREADS = multiprocessing.cpu_count()

#### Data

In [None]:
# Song table loaded from CSV; the download code below reads its "videoID" column.
songs_data = pd.read_csv("data/songs_final.csv")

#### Functions

In [None]:
def is_song_downloaded(song_filename: str) -> bool:
    """Return True if a regular file named *song_filename* is in DOWNLOAD_FOLDER.

    Only direct children of the folder count: an empty name, or a name that
    contains a path component, is never reported as downloaded.

    The check looks at the single candidate path instead of listing the whole
    folder, so its cost does not grow with the number of files already
    downloaded. A missing download folder simply yields False.
    """
    # Reject anything that is not a bare filename so the lookup cannot
    # escape DOWNLOAD_FOLDER or match a file in a subdirectory.
    if not song_filename or os.path.basename(song_filename) != song_filename:
        return False
    return os.path.isfile(os.path.join(DOWNLOAD_FOLDER, song_filename))

In [None]:
def download_song(args):
    """Download one YouTube video's audio as an MP3 into DOWNLOAD_FOLDER.

    The output file is named ``(index)^(video id)^(title).mp3``,
    e.g. ``0^LlWGt_84jpg^Special Breed.mp3``. The title part is taken from
    yt-dlp's own rendering of the output template, so the "already
    downloaded" check compares against the name yt-dlp actually writes
    rather than the raw video title.

    Args:
        args: A ``(video_id, video_index)`` pair, packed into a single
            argument so the function can be passed to ``Pool.map``.

    Returns:
        The expected path of the MP3 inside DOWNLOAD_FOLDER once a download
        has been attempted, or ``None`` when the song was already present,
        no metadata could be fetched, or an exception was raised. The path
        is returned without verifying that the file was produced.
    """
    video_id, video_index = args
    video_url = f"https://www.youtube.com/watch?v={video_id}"

    ydl_opts = {
        "format": "bestaudio/best",
        "postprocessors": [
            {
                "key": "FFmpegExtractAudio",
                "preferredcodec": "mp3",
            }
        ],
        "outtmpl": os.path.join(
            DOWNLOAD_FOLDER, f"{video_index}^{video_id}^" + "%(title)s.%(ext)s"
        ),
        "quiet": True,
        "no_warnings": True,
        "verbose": False,
        "noplaylist": True,
        "nocheckcertificate": True,
        "ignoreerrors": True,
        "retries": 3,
        "continuedl": True,
        "max_filesize": 10 * 1024 * 1024,
    }

    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        try:
            info_dict = ydl.extract_info(video_url, download=False)
            # With "ignoreerrors" enabled, a failed extraction yields None
            # instead of raising, so handle it explicitly.
            if info_dict is None:
                print(f"YDL ERROR: no metadata returned, videoID: {video_id}")
                return None

            # Ask yt-dlp for the filename it would write for this template,
            # then swap the source extension for the ".mp3" produced by the
            # audio-extraction postprocessor.
            source_path = ydl.prepare_filename(info_dict)
            base_name = os.path.splitext(os.path.basename(source_path))[0]
            song_filename = base_name + ".mp3"

            if is_song_downloaded(song_filename):
                return None

            ydl.download([video_url])
        except Exception as e:
            # Worker boundary: report and move on so that one bad video does
            # not abort the whole batch.
            print(f"YDL ERROR: {e}, videoID: {video_id}")
            return None

    return os.path.join(DOWNLOAD_FOLDER, song_filename)

In [None]:
def download_songs_parallel(songs_data, lower, upper):
    """Download the songs at positions ``lower`` to ``upper - 1`` in parallel.

    Args:
        songs_data: Table with a "videoID" column.
        lower: First position to download (inclusive).
        upper: Position to stop at (exclusive).

    Each selected video id is paired with its position, and the pairs are
    handed to ``download_song`` across a pool of ``CPU_THREADS`` processes.
    """
    id_slice = songs_data["videoID"].to_numpy()[lower:upper]
    # One (video_id, position) job per selected row.
    jobs = [(vid, idx) for vid, idx in zip(id_slice, range(lower, upper))]

    with multiprocessing.Pool(CPU_THREADS) as workers:
        workers.map(download_song, jobs)

In [None]:
# Guard the entry point: under the "spawn" start method, worker processes
# re-import the main module, and an unguarded call would start the whole
# download run again inside every worker.
if __name__ == "__main__":
    download_songs_parallel(songs_data, 0, len(songs_data))