## Install/Import Dependencies

In [232]:
import yt_dlp as youtube_dl
import os
import numpy as np
import matplotlib.pyplot as plt
import json
import pandas as pd
import requests
import librosa
import time
import concurrent.futures
import multiprocessing

from essentia.standard import (
    MonoLoader,
    Danceability,
    Spectrum,
    FrameCutter,
    Loudness,
    RhythmExtractor2013,
    KeyExtractor,
    Energy,
    TonalExtractor,
    Inharmonicity,
    MFCC,
    OnsetRate,
    SpectralCentroidTime,
    DynamicComplexity,
    SpectralPeaks,
    NoveltyCurve,
    Spectrum,
    FrameGenerator,
    Windowing,
    MelBands,
    BeatsLoudness,
    Beatogram,
    Meter,
    HumDetector,
    TensorflowPredictEffnetDiscogs,
    TensorflowPredict2D,
    TensorflowPredictVGGish
)

## Download Songs Functions

In [233]:
YT_API_KEY = "AIzaSyBHm24xn7ooAVJW5KrZWf-Unw1-QnZJTwI"

In [234]:
def is_youtube_video_available(video_id):
    url = f'https://www.googleapis.com/youtube/v3/videos?id={video_id}&key={YT_API_KEY}&part=status'
    response = requests.get(url)

    if response.status_code == 200:
        data = response.json()
        if data['pageInfo']['totalResults'] > 0:
            return True  # Video is available
        else:
            return False  # Video not found or unavailable
    else:
        print(f"Error: Unable to check video status (HTTP {response.status_code})")
        return False

In [235]:
def download_song_as_mp3(video_id, download_folder):
    video_url = f"https://www.youtube.com/watch?v={video_id}"

    if not os.path.exists(download_folder):
        os.makedirs(download_folder)

    if not is_youtube_video_available(video_id):
        print(f"ERROR: Video is not available: video_id={video_id}")
        return

    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        'outtmpl': os.path.join(download_folder, '%(title)s.%(ext)s'),
        'quiet': True,
        'no_warnings': True, 
        'verbose': False
    }

    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        try:
            info_dict = ydl.extract_info(video_url, download=True)
        except Exception as e:
            return
        title = info_dict.get('title', None)

    song_path = os.path.join(download_folder, f"{title}.mp3")

    if os.path.exists(song_path):
        file_size_mb = os.path.getsize(song_path) / (1024 * 1024)
        #print(f"Downloaded MP3 file size: {file_size_mb:.2f} MB")
    else:
        print("Download failed or file not found.")
        return
        
    return song_path

## Extract Features Functions

In [236]:
with open('data/mtg_jamendo_moodtheme-discogs-effnet-1.json', 'r') as jamendo_file:
    jamendo_metadata = json.load(jamendo_file)
jamendo_classes = jamendo_metadata['classes']

with open('data/mtg_jamendo_instrument-discogs-effnet-1.json', 'r') as jamendo_file:
    jamendo_instrument_metadata = json.load(jamendo_file)
jamendo_instrument_classes = jamendo_instrument_metadata['classes']

In [237]:
models_path = 'models'

In [238]:
def convert_mp3_to_spectrogram(audio_path, create_image=False):
    # Load the MP3 file using librosa
    mp3, sample_rate = librosa.load(audio_path, sr=22050)

    # Create a mel-spectrogram (frequently used spectrogram for audio analysis)
    spectrogram = librosa.feature.melspectrogram(y=mp3, sr=sample_rate, n_mels=128, fmax=11025)

    # Convert the power spectrogram (amplitude squared) to decibels
    spectrogram_db = librosa.power_to_db(spectrogram, ref=np.max)

    if create_image:
        create_spectrogram_image(spectrogram_db, sample_rate)

    return spectrogram_db, sample_rate


def create_spectrogram_image(spectrogram_db, sample_rate):
    # Plot and save the spectrogram
    plt.figure(figsize=(10, 4))
    librosa.display.specshow(spectrogram_db, sr=sample_rate, x_axis='time', y_axis='mel', fmax=11025)
    plt.colorbar(format='%+2.0f dB')
    plt.title(f"Mel-Spectrogram")
    plt.tight_layout()
    plt.show()
    plt.close()

In [239]:
def get_mel_bands(audio):
    spectrum = Spectrum()
    frame_generator = FrameGenerator(audio, frameSize=2048, hopSize=1024)
    window = Windowing(type='hann')

    mel_bands = MelBands(numberBands=40)
    mel_band_energies = []

    for frame in frame_generator:
        spec = spectrum(window(frame))
        mel_band_energies.append(mel_bands(spec))

    mel_band_energies = np.array(mel_band_energies)
    return mel_band_energies

In [240]:
def run_essentia_algorithms(audio44k):
    _, mfcc_coeffs = MFCC(inputSize=len(audio44k))(audio44k)
    danceability_score = Danceability()(audio44k)
    loudness_score = Loudness()(audio44k)
    bpm, beat_positions, _, _, _ = RhythmExtractor2013(method="multifeature")(audio44k)
    key, scale, _ = KeyExtractor()(audio44k)
    energy_score = Energy()(audio44k)

    ### Chord Significances
    _, _, _, _, chords, _, _, _, _, _, _, _ = TonalExtractor()(audio44k)
    unique_chords, counts = np.unique(chords, return_counts=True)
    chords_significance = {chord: significance for (chord, significance) in zip(unique_chords, counts)}
    
    ### Inharmonicity
    frames = []
    frameCutter = FrameCutter()
    while True:
        frame = frameCutter(audio44k)
        if not len(frame):
            break
        frames.append(frame)
        
    spectrum_magnitudes = []
    for frame in frames:
        spectrum_magnitudes_frame = Spectrum()(frame)
        spectrum_magnitudes.append(spectrum_magnitudes_frame)
    spectrum_magnitudes = np.array(spectrum_magnitudes).flatten()
    
    frequencies, magnitudes = SpectralPeaks()(audio44k)
    hnr_score = None
    if frequencies[0]: 
        hnr_score = Inharmonicity()(frequencies, magnitudes)
    ###
    
    onset_rate_score = OnsetRate()(audio44k)
    brightness_score = SpectralCentroidTime()(audio44k)
    dynamic_complexity_score, _ = DynamicComplexity()(audio44k)
    
    mel_bands = get_mel_bands(audio44k)
    novelty_curve = NoveltyCurve()(mel_bands)
    novelty_score = np.median(np.abs(np.diff(novelty_curve)))
    
    beats_loudness, beats_loudness_band_ratio = BeatsLoudness(beats=beat_positions)(audio44k)
    beatogram = Beatogram()(beats_loudness, beats_loudness_band_ratio)
    time_signature = Meter()(beatogram)
    
    _, _, saliences, hum_starts, hum_ends = HumDetector()(audio44k)
    hum_intervals = [(hum_start, hum_end, salience) for hum_start, hum_end, salience in zip(hum_starts, hum_ends, saliences)]
        
    features = {
        'Danceability': danceability_score[0],
        'Loudness': loudness_score,
        'BPM': bpm,
        'Key': key,
        'Key Scale': scale,
        'Energy': energy_score,
        'Chords Significance': chords_significance,
        'Inharmonicity': hnr_score,
        'Timbre (MFCC Coefficients Mean)': np.mean(mfcc_coeffs),
        'Onset Rate': onset_rate_score[1],
        'Brightness': brightness_score,
        'Dynamic Complexity': dynamic_complexity_score,
        'Novelty': novelty_score,
        'Time Signature': time_signature,
        'Hum Intervals': hum_intervals
    }
    return features

In [241]:


def run_essentia_algorithms_parallel(audio44k):
    manager = multiprocessing.Manager()
    features = manager.dict()  # Shared dictionary across processes

    def compute_mfcc():
        _, mfcc_coeffs = MFCC(inputSize=len(audio44k))(audio44k)
        features['Timbre (MFCC Coefficients Mean)'] = np.mean(mfcc_coeffs)

    def compute_danceability():
        danceability_score = Danceability()(audio44k)
        features['Danceability'] = danceability_score[0]

    def compute_loudness():
        loudness_score = Loudness()(audio44k)
        features['Loudness'] = loudness_score

    def compute_rhythm():
        bpm, beat_positions, _, _, _ = RhythmExtractor2013(method="multifeature")(audio44k)
        features['BPM'] = bpm
        features['Beat Positions'] = beat_positions

    def compute_key():
        key, scale, _ = KeyExtractor()(audio44k)
        features['Key'] = key
        features['Key Scale'] = scale

    def compute_energy():
        energy_score = Energy()(audio44k)
        features['Energy'] = energy_score

    def compute_chords():
        _, _, _, _, chords, _, _, _, _, _, _, _ = TonalExtractor()(audio44k)
        unique_chords, counts = np.unique(chords, return_counts=True)
        features['Chords Significance'] = {chord: significance for (chord, significance) in zip(unique_chords, counts)}

    def compute_inharmonicity():
        frequencies, magnitudes = SpectralPeaks()(audio44k)
        hnr_score = None
        if frequencies[0]:
            hnr_score = Inharmonicity()(frequencies, magnitudes)
        features['Inharmonicity'] = hnr_score

    def compute_onset_rate():
        onset_rate_score = OnsetRate()(audio44k)
        features['Onset Rate'] = onset_rate_score[1]

    def compute_brightness():
        brightness_score = SpectralCentroidTime()(audio44k)
        features['Brightness'] = brightness_score

    def compute_dynamic_complexity():
        dynamic_complexity_score, _ = DynamicComplexity()(audio44k)
        features['Dynamic Complexity'] = dynamic_complexity_score

    def compute_novelty():
        mel_bands = get_mel_bands(audio44k)
        novelty_curve = NoveltyCurve()(mel_bands)
        novelty_score = np.median(np.abs(np.diff(novelty_curve)))
        features['Novelty'] = novelty_score

    def compute_time_signature():
        bpm, beat_positions, _, _, _ = RhythmExtractor2013(method="multifeature")(audio44k)
        beats_loudness, beats_loudness_band_ratio = BeatsLoudness(beats=beat_positions)(audio44k)
        beatogram = Beatogram()(beats_loudness, beats_loudness_band_ratio)
        time_signature = Meter()(beatogram)
        features['Time Signature'] = time_signature

    with concurrent.futures.ProcessPoolExecutor() as executor:
        futures = [
            executor.submit(compute_mfcc),
            executor.submit(compute_danceability),
            executor.submit(compute_loudness),
            executor.submit(compute_rhythm),
            executor.submit(compute_key),
            executor.submit(compute_energy),
            executor.submit(compute_chords),
            executor.submit(compute_inharmonicity),
            executor.submit(compute_onset_rate),
            executor.submit(compute_brightness),
            executor.submit(compute_dynamic_complexity),
            executor.submit(compute_novelty),
            executor.submit(compute_time_signature)
        ]

        # Wait for all processes to complete
        concurrent.futures.wait(futures)

    return dict(features)


In [242]:
def run_essentia_models(audio16k, audio44k):
    features = {}
    
    # Get embeddings
    discogs_embeddings = TensorflowPredictEffnetDiscogs(graphFilename=models_path+'/discogs-effnet-bs64-1.pb', output="PartitionedCall:1")(audio16k)
    vggish_embeddings = TensorflowPredictVGGish(graphFilename=models_path+'/audioset-vggish-3.pb', output="model/vggish/embeddings")(audio16k)

    # Approachability
    approachability_predictions = TensorflowPredict2D(graphFilename=models_path+'/approachability_regression-discogs-effnet-1.pb', output="model/Identity")(discogs_embeddings)
    approachability = np.median(np.squeeze(approachability_predictions))
    
    # Engagement
    engagement_predictions = TensorflowPredict2D(graphFilename=models_path+'/engagement_regression-discogs-effnet-1.pb', output="model/Identity")(discogs_embeddings)
    engagement = np.median(np.squeeze(engagement_predictions))
    
    # Arousal/Valence
    arousal_valence_predictions = np.median(TensorflowPredict2D(graphFilename=models_path+'/deam-audioset-vggish-2.pb', output="model/Identity")(vggish_embeddings), axis=0)
    valence = arousal_valence_predictions[0]
    arousal = arousal_valence_predictions[1]
    
    # Aggressive
    aggressive_predictions = TensorflowPredict2D(graphFilename=models_path+'/mood_aggressive-audioset-vggish-1.pb', output="model/Softmax")(vggish_embeddings)
    aggressive = np.median(aggressive_predictions, axis=0)[0]
    
    # Happy
    happy_predictions = TensorflowPredict2D(graphFilename=models_path+'/mood_happy-audioset-vggish-1.pb', output="model/Softmax")(vggish_embeddings)
    happy = np.median(happy_predictions, axis=0)[0]
    
    # Party
    party_predictions = TensorflowPredict2D(graphFilename=models_path+'/mood_party-audioset-vggish-1.pb', output="model/Softmax")(vggish_embeddings)
    party = np.median(party_predictions, axis=0)[0]
    
    # Relaxed
    relaxed_predictions = TensorflowPredict2D(graphFilename=models_path+'/mood_relaxed-audioset-vggish-1.pb', output="model/Softmax")(vggish_embeddings)
    relaxed = np.median(relaxed_predictions, axis=0)[0]
    
    # Sad
    sad_predictions = TensorflowPredict2D(graphFilename=models_path+'/mood_sad-audioset-vggish-1.pb', output="model/Softmax")(vggish_embeddings)
    sad = np.median(sad_predictions, axis=0)[0]
    
    # Jamendo labels
    jamendo_predictions = TensorflowPredict2D(graphFilename=models_path+'/mtg_jamendo_moodtheme-discogs-effnet-1.pb')(discogs_embeddings)
    jamendo_values = np.median(jamendo_predictions, axis=0)
    jamendo_dict = {jamendo_class:jamendo_value for (jamendo_class, jamendo_value) in zip(jamendo_classes, jamendo_values)}
    
    # Jamendo instrument labels
    jamendo_instrument_predictions = TensorflowPredict2D(graphFilename=models_path+'/mtg_jamendo_instrument-discogs-effnet-1.pb')(discogs_embeddings)
    jamendo_instrument_values = np.median(jamendo_instrument_predictions, axis=0)
    jamendo_instrument_dict = {jamendo_class:jamendo_value for (jamendo_class, jamendo_value) in zip(jamendo_instrument_classes, jamendo_instrument_values)}
    
    # Acoustic
    acoustic_predictions = TensorflowPredict2D(graphFilename=models_path+'/mood_acoustic-audioset-vggish-1.pb', output="model/Softmax")(vggish_embeddings)
    acoustic = np.median(acoustic_predictions, axis=0)[0]
    
    # Electronic
    electronic_predictions = TensorflowPredict2D(graphFilename=models_path+'/mood_electronic-audioset-vggish-1.pb', output="model/Softmax")(vggish_embeddings)
    electronic = np.median(electronic_predictions, axis=0)[0]
    
    # Voice/Instrumental
    voice_instrumental_predictions = np.median(TensorflowPredict2D(graphFilename=models_path+'/voice_instrumental-audioset-vggish-1.pb', output="model/Softmax")(vggish_embeddings), axis=0)
    voice = voice_instrumental_predictions[0]
    instrumental = voice_instrumental_predictions[1]
    
    # Gender (Male/Female)
    gender_predictions = np.median(TensorflowPredict2D(graphFilename=models_path+'/gender-audioset-vggish-1.pb', output="model/Softmax")(vggish_embeddings), axis=0)
    female = gender_predictions[0]
    male = gender_predictions[1]
    
    # Timbre (Bright/Dark)
    timbre_predictions = np.median(TensorflowPredict2D(graphFilename=models_path+'/timbre-discogs-effnet-1.pb', output="model/Softmax")(discogs_embeddings), axis=0)
    bright = timbre_predictions[0]
    dark = timbre_predictions[1]   
    
    # Reverb (Dry/Wet)
    reverb_predictions = np.median(TensorflowPredict2D(graphFilename=models_path+'/nsynth_reverb-discogs-effnet-1.pb', output="model/Softmax")(discogs_embeddings), axis=0)
    dry = reverb_predictions[0]
    wet = reverb_predictions[1]
    
    # Return model results
    features = {
        'Embeddings': vggish_embeddings,
        'Approachability': approachability,
        'Engagement': engagement,
        'Valence': valence,
        'Arousal': arousal,
        'Aggressive': aggressive,
        'Happy': happy,
        'Party': party,
        'Relaxed': relaxed,
        'Sad': sad,
        'Jamendo Labels': jamendo_dict,
        'Jamendo Instruments': jamendo_instrument_dict,
        'Acoustic': acoustic,
        'Electronic': electronic,
        'Voice': voice,
        'Instrumental': instrumental,
        'Male': male,
        'Female': female,
        'Bright': bright,
        'Dark': dark,
        'Dry': dry,
        'Wet': wet
    }
    return features

In [243]:
def run_essentia_models(audio16k, audio44k):
    features = {}
    
    # Get embeddings
    start_time = time.time()
    discogs_embeddings = TensorflowPredictEffnetDiscogs(graphFilename=models_path+'/discogs-effnet-bs64-1.pb', output="PartitionedCall:1")(audio16k)
    end_time = time.time()
    print(f"Discogs embeddings: {end_time - start_time:.2f} seconds")
    
    start_time = time.time()
    vggish_embeddings = TensorflowPredictVGGish(graphFilename=models_path+'/audioset-vggish-3.pb', output="model/vggish/embeddings")(audio16k)
    end_time = time.time()
    print(f"VGGish embeddings: {end_time - start_time:.2f} seconds")
    
    # Approachability
    start_time = time.time()
    approachability_predictions = TensorflowPredict2D(graphFilename=models_path+'/approachability_regression-discogs-effnet-1.pb', output="model/Identity")(discogs_embeddings)
    approachability = np.median(np.squeeze(approachability_predictions))
    end_time = time.time()
    print(f"Approachability: {end_time - start_time:.2f} seconds")
    
    # Engagement
    start_time = time.time()
    engagement_predictions = TensorflowPredict2D(graphFilename=models_path+'/engagement_regression-discogs-effnet-1.pb', output="model/Identity")(discogs_embeddings)
    engagement = np.median(np.squeeze(engagement_predictions))
    end_time = time.time()
    print(f"Engagement: {end_time - start_time:.2f} seconds")
    
    # Arousal/Valence
    start_time = time.time()
    arousal_valence_predictions = np.median(TensorflowPredict2D(graphFilename=models_path+'/deam-audioset-vggish-2.pb', output="model/Identity")(vggish_embeddings), axis=0)
    valence = arousal_valence_predictions[0]
    arousal = arousal_valence_predictions[1]
    end_time = time.time()
    print(f"Arousal/Valence: {end_time - start_time:.2f} seconds")
    
    # Aggressive
    start_time = time.time()
    aggressive_predictions = TensorflowPredict2D(graphFilename=models_path+'/mood_aggressive-audioset-vggish-1.pb', output="model/Softmax")(vggish_embeddings)
    aggressive = np.median(aggressive_predictions, axis=0)[0]
    end_time = time.time()
    print(f"Aggressive: {end_time - start_time:.2f} seconds")
    
    # Happy
    start_time = time.time()
    happy_predictions = TensorflowPredict2D(graphFilename=models_path+'/mood_happy-audioset-vggish-1.pb', output="model/Softmax")(vggish_embeddings)
    happy = np.median(happy_predictions, axis=0)[0]
    end_time = time.time()
    print(f"Happy: {end_time - start_time:.2f} seconds")
    
    # Party
    start_time = time.time()
    party_predictions = TensorflowPredict2D(graphFilename=models_path+'/mood_party-audioset-vggish-1.pb', output="model/Softmax")(vggish_embeddings)
    party = np.median(party_predictions, axis=0)[0]
    end_time = time.time()
    print(f"Party: {end_time - start_time:.2f} seconds")
    
    # Relaxed
    start_time = time.time()
    relaxed_predictions = TensorflowPredict2D(graphFilename=models_path+'/mood_relaxed-audioset-vggish-1.pb', output="model/Softmax")(vggish_embeddings)
    relaxed = np.median(relaxed_predictions, axis=0)[0]
    end_time = time.time()
    print(f"Relaxed: {end_time - start_time:.2f} seconds")
    
    # Sad
    start_time = time.time()
    sad_predictions = TensorflowPredict2D(graphFilename=models_path+'/mood_sad-audioset-vggish-1.pb', output="model/Softmax")(vggish_embeddings)
    sad = np.median(sad_predictions, axis=0)[0]
    end_time = time.time()
    print(f"Sad: {end_time - start_time:.2f} seconds")
    
    # Jamendo labels
    start_time = time.time()
    jamendo_predictions = TensorflowPredict2D(graphFilename=models_path+'/mtg_jamendo_moodtheme-discogs-effnet-1.pb')(discogs_embeddings)
    jamendo_values = np.median(jamendo_predictions, axis=0)
    jamendo_dict = {jamendo_class: jamendo_value for (jamendo_class, jamendo_value) in zip(jamendo_classes, jamendo_values)}
    end_time = time.time()
    print(f"Jamendo labels: {end_time - start_time:.2f} seconds")
    
    # Jamendo instrument labels
    start_time = time.time()
    jamendo_instrument_predictions = TensorflowPredict2D(graphFilename=models_path+'/mtg_jamendo_instrument-discogs-effnet-1.pb')(discogs_embeddings)
    jamendo_instrument_values = np.median(jamendo_instrument_predictions, axis=0)
    jamendo_instrument_dict = {jamendo_class: jamendo_value for (jamendo_class, jamendo_value) in zip(jamendo_instrument_classes, jamendo_instrument_values)}
    end_time = time.time()
    print(f"Jamendo instrument labels: {end_time - start_time:.2f} seconds")
    
    # Acoustic
    start_time = time.time()
    acoustic_predictions = TensorflowPredict2D(graphFilename=models_path+'/mood_acoustic-audioset-vggish-1.pb', output="model/Softmax")(vggish_embeddings)
    acoustic = np.median(acoustic_predictions, axis=0)[0]
    end_time = time.time()
    print(f"Acoustic: {end_time - start_time:.2f} seconds")
    
    # Electronic
    start_time = time.time()
    electronic_predictions = TensorflowPredict2D(graphFilename=models_path+'/mood_electronic-audioset-vggish-1.pb', output="model/Softmax")(vggish_embeddings)
    electronic = np.median(electronic_predictions, axis=0)[0]
    end_time = time.time()
    print(f"Electronic: {end_time - start_time:.2f} seconds")
    
    # Voice/Instrumental
    start_time = time.time()
    voice_instrumental_predictions = np.median(TensorflowPredict2D(graphFilename=models_path+'/voice_instrumental-audioset-vggish-1.pb', output="model/Softmax")(vggish_embeddings), axis=0)
    voice = voice_instrumental_predictions[0]
    instrumental = voice_instrumental_predictions[1]
    end_time = time.time()
    print(f"Voice/Instrumental: {end_time - start_time:.2f} seconds")
    
    # Gender (Male/Female)
    start_time = time.time()
    gender_predictions = np.median(TensorflowPredict2D(graphFilename=models_path+'/gender-audioset-vggish-1.pb', output="model/Softmax")(vggish_embeddings), axis=0)
    female = gender_predictions[0]
    male = gender_predictions[1]
    end_time = time.time()
    print(f"Gender (Male/Female): {end_time - start_time:.2f} seconds")
    
    # Timbre (Bright/Dark)
    start_time = time.time()
    timbre_predictions = np.median(TensorflowPredict2D(graphFilename=models_path+'/timbre-discogs-effnet-1.pb', output="model/Softmax")(discogs_embeddings), axis=0)
    bright = timbre_predictions[0]
    dark = timbre_predictions[1]
    end_time = time.time()
    print(f"Timbre (Bright/Dark): {end_time - start_time:.2f} seconds")
    
    # Reverb (Dry/Wet)
    start_time = time.time()
    reverb_predictions = np.median(TensorflowPredict2D(graphFilename=models_path+'/nsynth_reverb-discogs-effnet-1.pb', output="model/Softmax")(discogs_embeddings), axis=0)
    dry = reverb_predictions[0]
    wet = reverb_predictions[1]
    end_time = time.time()
    print(f"Reverb (Dry/Wet): {end_time - start_time:.2f} seconds")
    
    # Return model results
    features = {
        'Embeddings': vggish_embeddings,
        'Approachability': approachability,
        'Engagement': engagement,
        'Valence': valence,
        'Arousal': arousal,
        'Aggressive': aggressive,
        'Happy': happy,
        'Party': party,
        'Relaxed': relaxed,
        'Sad': sad,
        'Jamendo Labels': jamendo_dict,
        'Jamendo Instruments': jamendo_instrument_dict,
        'Acoustic': acoustic,
        'Electronic': electronic,
        'Voice': voice,
        'Instrumental': instrumental,
        'Male': male,
        'Female': female,
        'Bright': bright,
        'Dark': dark,
        'Dry': dry,
        'Wet': wet
    }
    return features


In [244]:
def extract_features(audio_file):
    # Load the audio file
    #audio16k = MonoLoader(filename=audio_file, sampleRate=16000)()
    audio44k = MonoLoader(filename=audio_file)()

    algorithm_features = run_essentia_algorithms(audio44k)
    #model_features = run_essentia_models(audio16k, audio44k)
    spectrogram, sample_rate = convert_mp3_to_spectrogram(audio_file)

    # Merge results and return
    features = algorithm_features | {'Spectrogram': spectrogram, 'Spectrogram Sample Rate': sample_rate}
    return features

## Main Code

In [245]:
# Constants
DOWNLOAD_FOLDER = 'songs'
NUM_BATCHES = 8 # I have 16 threads

In [246]:
# Datasets
songs_data = pd.read_csv('data/songs_final.csv')
songs_data_full = songs_data.copy(deep=True)

In [247]:
def process_song(song, song_index, namespace):
    song_id = song.get('videoID')
    if not song_id:
        print(f"Error: videoID not found. Song: {song}")
        return

    # Download song
    song_path = download_song_as_mp3(song_id, DOWNLOAD_FOLDER)
    if not song_path:
        return

    # Extract song features
    song_features = extract_features(song_path)

    for feature, value in song_features.items():
        if isinstance(value, (tuple, set, list, np.ndarray, dict)) and feature not in namespace.shared_songs_data.columns:
            namespace.shared_songs_data[feature] = np.nan
            namespace.shared_songs_data[feature] = namespace.shared_songs_data[feature].astype(object)
        namespace.shared_songs_data.at[song_index, feature] = value

    # Delete song
    os.remove(song_path)

In [248]:
def process_batch(args, shared_songs_data):
    song_batch, batch_num = args
    process_times = []
    song_count = 0
    
    for song_index, song in song_batch.iterrows():
        song_count += 1
        startTime = time.time()
        process_song(song, song_index, shared_songs_data)
        process_time = time.time() - startTime
        process_times.append(process_time)
        print(f"Batch {batch_num}, Song {song_count}/{len(song_batch)}: {process_time:.2f} seconds. Avg extraction time: {np.mean(process_times):.2f}")
        break

def download_songs_and_extract_features():
    manager = multiprocessing.Manager()
    shared_songs_data = manager.dict({index: song for index, song in songs_data.iterrows()})

    batches = np.array_split(songs_data, NUM_BATCHES)
    batch_args = [(batches[i], i) for i in range(NUM_BATCHES)]

    with multiprocessing.Pool(processes=NUM_BATCHES) as pool:
        pool.starmap(process_batch, [(args, shared_songs_data) for args in batch_args])

    songs_data_full = pd.DataFrame.from_dict(shared_songs_data, orient="index")
    return songs_data_full

In [249]:
songs_data_full_ = download_songs_and_extract_features()
songs_data_full_

                               title  \
0                      Special Breed   
1                       Unnoticeable   
2             Time Dawdles Immersion   
3                           Justness   
4                     INTRANSIGEANCE   
...                              ...   
101040                        Misery   
101041             What Lies Beneath   
101042               Yeh Dil Deewana   
101043  Work (Freemasons Radio Edit)   
101044                     Baby Stop   

                                              artist     views      videoID  \
0                                        PolyCulture        34  LlWGt_84jpg   
1                                     Lost Ambitions         2  TD3za_a4uWo   
2                                  Happy Moppy Puppy        19  LlyA3VwQGfk   
3                                       Generallykoi         1  J14sCvTWh3Q   
4                                                BFV        47  uAjBGvZFLi4   
...                                          

  return bound(*args, **kwds)
ERROR: [youtube] aoSK7vju_l8: Video unavailable. This video is not available


Batch 7, Song 1/12630: 1.31 seconds. Avg extraction time: 1.31
                               title  \
0                      Special Breed   
1                       Unnoticeable   
2             Time Dawdles Immersion   
3                           Justness   
4                     INTRANSIGEANCE   
...                              ...   
101040                        Misery   
101041             What Lies Beneath   
101042               Yeh Dil Deewana   
101043  Work (Freemasons Radio Edit)   
101044                     Baby Stop   

                                              artist     views      videoID  \
0                                        PolyCulture        34  LlWGt_84jpg   
1                                     Lost Ambitions         2  TD3za_a4uWo   
2                                  Happy Moppy Puppy        19  LlyA3VwQGfk   
3                                       Generallykoi         1  J14sCvTWh3Q   
4                                                BFV        4

KeyboardInterrupt: 

In [120]:
songs_data_full.head(20)

Unnamed: 0,title,artist,views,videoID,duration
0,Special Breed,PolyCulture,34,LlWGt_84jpg,331
1,Unnoticeable,Lost Ambitions,2,TD3za_a4uWo,61
2,Time Dawdles Immersion,Happy Moppy Puppy,19,LlyA3VwQGfk,47
3,Justness,Generallykoi,1,J14sCvTWh3Q,86
4,INTRANSIGEANCE,BFV,47,uAjBGvZFLi4,228
5,Largehearted,Ratliff Riggs,0,WaLBwUXUEXA,174
6,Reevaluating What Matters,Oti$,15,cLXxWV-PWgA,183
7,Quotha,KC4K,18,zW5qPmgJBxI,300
8,Brigid Bisley,Alexander Ivashkin,54,a2EkhHFfW1A,527
9,Corporatist Utopia (Rap Version),,0,0VseA79g9DE,210


In [121]:
songs_data_full.to_csv(f'songs_full.csv', index=False)