## Install/Import Dependencies

In [1]:
import yt_dlp as youtube_dl
import os
import numpy as np
import IPython
import matplotlib.pyplot as plt
import json
import pandas as pd
import glob
import threading
from tqdm import tqdm
import requests
import concurrent.futures
import re
import librosa
import time

from essentia.standard import (
    MonoLoader,
    Danceability,
    Spectrum,
    FrameCutter,
    Loudness,
    RhythmExtractor2013,
    KeyExtractor,
    PredominantPitchMelodia,
    Energy,
    TonalExtractor,
    Inharmonicity,
    SpectralComplexity,
    MFCC,
    OnsetRate,
    SpectralCentroidTime,
    DynamicComplexity,
    SpectralPeaks,
    PitchYinFFT,
    NoveltyCurve,
    Spectrum,
    FrameGenerator,
    Windowing,
    MelBands,
    BeatsLoudness,
    Beatogram,
    Meter,
    HumDetector,
    TensorflowPredictEffnetDiscogs,
    TensorflowPredict2D,
    TensorflowPredictVGGish
)

2024-10-16 14:41:34.241810: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-10-16 14:41:34.241840: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
[   INFO   ] MusicExtractorSVM: no classifier models were configured by default
2024-10-16 14:41:34.553929: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE3 SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-10-16 14:41:34.555273: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2024-10-16 14:41:3

## Download Songs Functions

In [2]:
# Read the YouTube Data API key from the environment instead of hard-coding it:
# a key stored in a notebook is visible to everyone who can read the file, so
# the previously embedded key should be treated as exposed and revoked.
# Falls back to an empty string when the variable is unset so this cell still
# runs; requests made with an empty key are expected to be rejected by the API.
YT_API_KEY = os.environ.get("YT_API_KEY", "")

In [3]:
def is_youtube_video_available(video_id, timeout=10):
    """Check through the YouTube Data API whether a video can be looked up.

    Args:
        video_id: YouTube video identifier to query.
        timeout: Seconds to wait for the API before giving up.

    Returns:
        True when the API answers HTTP 200 and reports at least one result
        for ``video_id``; False otherwise (no result, non-200 status,
        network failure, or an unexpected response body).
    """
    try:
        response = requests.get(
            'https://www.googleapis.com/youtube/v3/videos',
            # Let requests build the query string so the id and key are URL-encoded.
            params={'id': video_id, 'key': YT_API_KEY, 'part': 'status'},
            # Without a timeout a stalled connection would block the caller forever.
            timeout=timeout,
        )
    except requests.RequestException as error:
        # Only the exception type is printed: the full message can contain the
        # request URL, which includes the API key.
        print(f"Error: Unable to check video status ({type(error).__name__})")
        return False

    if response.status_code != 200:
        print(f"Error: Unable to check video status (HTTP {response.status_code})")
        return False

    try:
        total_results = response.json()['pageInfo']['totalResults']
    except (ValueError, KeyError, TypeError):
        print("Error: Unable to check video status (unexpected response body)")
        return False

    # Zero results means the video was not found or is unavailable.
    return total_results > 0

In [4]:
def download_song_as_mp3(video_id, download_folder):
    """Download a YouTube video's audio track and convert it to MP3.

    Args:
        video_id: YouTube video identifier.
        download_folder: Directory to store the MP3 in; created if missing.

    Returns:
        Path of the MP3 file, or None when the video is unavailable, the
        download fails, or the converted file cannot be found.
    """
    video_url = f"https://www.youtube.com/watch?v={video_id}"

    os.makedirs(download_folder, exist_ok=True)

    if not is_youtube_video_available(video_id):
        print(f"ERROR: Video is not available: video_id={video_id}")
        return None

    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        'outtmpl': os.path.join(download_folder, '%(title)s.%(ext)s'),
        'quiet': True,
        'no_warnings': True,
        'verbose': False
    }

    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        try:
            info_dict = ydl.extract_info(video_url, download=True)
        except Exception as error:
            # Best-effort download: report the failure instead of hiding it,
            # and let the caller skip this song.
            print(f"ERROR: Download failed: video_id={video_id} ({error})")
            return None
        if not info_dict:
            print(f"ERROR: Download failed: video_id={video_id} (no metadata returned)")
            return None
        # Ask yt-dlp for the file name it actually used: the raw title may
        # contain characters that are replaced when the output template is
        # expanded, so rebuilding the path from the title can miss the file.
        downloaded_path = ydl.prepare_filename(info_dict)

    # The FFmpegExtractAudio post-processor swaps the source extension for .mp3.
    song_path = os.path.splitext(downloaded_path)[0] + '.mp3'

    if not os.path.exists(song_path):
        print("Download failed or file not found.")
        return None

    return song_path

## Extract Features Functions

In [5]:
# Class-label lists for the two Jamendo models, taken from the 'classes' entry
# of each model's metadata JSON under data/.
with open('data/mtg_jamendo_moodtheme-discogs-effnet-1.json', mode='r') as jamendo_file:
    jamendo_metadata = json.loads(jamendo_file.read())
jamendo_classes = jamendo_metadata['classes']

with open('data/mtg_jamendo_instrument-discogs-effnet-1.json', mode='r') as jamendo_file:
    jamendo_instrument_metadata = json.loads(jamendo_file.read())
jamendo_instrument_classes = jamendo_instrument_metadata['classes']

In [6]:
# Directory that holds the .pb graph files; each graph file name is appended to it.
models_path = "models"

In [7]:
def convert_mp3_to_spectrogram(audio_path, output_folder='./spectrograms', create_image=False):
    """Compute a decibel-scaled mel-spectrogram for an audio file.

    Args:
        audio_path: File to load with librosa, requesting a 16000 Hz sample rate.
        output_folder: Accepted but not used by this function.
        create_image: When true, also plot the spectrogram.

    Returns:
        Tuple ``(spectrogram_db, sample_rate)``.
    """
    target_rate = 16000
    signal, sample_rate = librosa.load(audio_path, sr=target_rate)

    # Mel-spectrogram with 128 bands and fmax=8000, then power -> dB using the
    # array maximum as the reference.
    mel_power = librosa.feature.melspectrogram(y=signal, sr=sample_rate, n_mels=128, fmax=8000)
    spectrogram_db = librosa.power_to_db(mel_power, ref=np.max)

    if not create_image:
        return spectrogram_db, sample_rate

    create_spectrogram_image(spectrogram_db, sample_rate)
    return spectrogram_db, sample_rate


def create_spectrogram_image(spectrogram_db, sample_rate):
    """Plot a mel-spectrogram with a dB colour bar and show it.

    The figure is displayed and then closed; nothing is written to disk.

    Args:
        spectrogram_db: Spectrogram data handed to ``librosa.display.specshow``.
        sample_rate: Sample rate passed to ``specshow`` as ``sr``.
    """
    # `import librosa` alone does not guarantee the plotting submodule is
    # loaded (older librosa releases require importing it explicitly).
    import librosa.display

    figure = plt.figure(figsize=(10, 4))
    librosa.display.specshow(spectrogram_db, sr=sample_rate, x_axis='time', y_axis='mel', fmax=8000)
    plt.colorbar(format='%+2.0f dB')
    plt.title("Mel-Spectrogram")
    plt.tight_layout()
    plt.show()
    # Close this specific figure so repeated calls do not accumulate open figures.
    plt.close(figure)

In [8]:
def get_mel_bands(audio):
    """Compute mel-band energies for every frame of ``audio``.

    Each frame (frameSize=2048, hopSize=1024) is Hann-windowed, passed through
    ``Spectrum`` and then through ``MelBands`` configured with 40 bands.

    Returns:
        numpy array built from the per-frame ``MelBands`` outputs, in frame order.
    """
    to_spectrum = Spectrum()
    hann = Windowing(type='hann')
    to_mel = MelBands(numberBands=40)

    frames = FrameGenerator(audio, frameSize=2048, hopSize=1024)
    return np.array([to_mel(to_spectrum(hann(frame))) for frame in frames])

In [9]:
def run_essentia_algorithms(audio16k, audio44k):
    """Compute signal-level descriptors with Essentia's standard algorithms.

    Args:
        audio16k: Not used by this function.
        audio44k: Mono signal that every algorithm below is run on.

    Returns:
        Dict mapping feature names (``'Danceability'``, ``'BPM'``, ``'Key'``,
        ...) to the computed values. ``'Inharmonicity'`` is None when no
        usable spectral peak is found.
    """
    # NOTE(review): Essentia documents MFCC and SpectralPeaks as taking a
    # magnitude spectrum; both receive the raw signal here — confirm intent.
    _, mfcc_coeffs = MFCC(inputSize=len(audio44k))(audio44k)
    danceability_score = Danceability()(audio44k)
    loudness_score = Loudness()(audio44k)
    bpm, beat_positions, _, _, _ = RhythmExtractor2013(method="multifeature")(audio44k)
    key, scale, _ = KeyExtractor()(audio44k)
    energy_score = Energy()(audio44k)

    # Only the chord sequence (5th output) is used: count how often each chord occurs.
    _, _, _, _, chords, _, _, _, _, _, _, _ = TonalExtractor()(audio44k)
    unique_chords, counts = np.unique(chords, return_counts=True)
    chords_significance = dict(zip(unique_chords, counts))

    # Inharmonicity. The earlier per-frame spectrum pass was removed: its
    # result was never read, so it only cost time.
    frequencies, magnitudes = SpectralPeaks()(audio44k)
    inharmonicity_score = None
    # Skip when there are no peaks (indexing an empty array would raise
    # IndexError) or when the first peak frequency is zero.
    if len(frequencies) and frequencies[0]:
        inharmonicity_score = Inharmonicity()(frequencies, magnitudes)

    onset_rate_score = OnsetRate()(audio44k)
    brightness_score = SpectralCentroidTime()(audio44k)
    dynamic_complexity_score, _ = DynamicComplexity()(audio44k)

    # Novelty: median absolute step of the novelty curve over the mel bands.
    mel_bands = get_mel_bands(audio44k)
    novelty_curve = NoveltyCurve()(mel_bands)
    novelty_score = np.median(np.abs(np.diff(novelty_curve)))

    # Time signature estimated from the loudness around the detected beats.
    beats_loudness, beats_loudness_band_ratio = BeatsLoudness(beats=beat_positions)(audio44k)
    beatogram = Beatogram()(beats_loudness, beats_loudness_band_ratio)
    time_signature = Meter()(beatogram)

    _, _, _, hum_starts, _ = HumDetector()(audio44k)

    features = {
        'Danceability': danceability_score[0],
        'Loudness': loudness_score,
        'BPM': bpm,
        'Key': key,
        'Key Scale': scale,
        'Energy': energy_score,
        'Chords Significance': chords_significance,
        'Inharmonicity': inharmonicity_score,
        'Timbre (MFCC Coefficients Mean)': np.mean(mfcc_coeffs),
        'Onset Rate': onset_rate_score[1],
        'Brightness': brightness_score,
        'Dynamic Complexity': dynamic_complexity_score,
        'Novelty': novelty_score,
        'Time Signature': time_signature,
        'Hum Positions': hum_starts
    }
    return features

In [10]:
def _predict_head(graph_file, embeddings, output=None):
    """Run one TensorflowPredict2D graph from ``models_path`` on precomputed embeddings.

    ``output`` names the graph node to read; when None the argument is omitted
    so TensorflowPredict2D falls back to its own default.
    """
    options = {'graphFilename': models_path + '/' + graph_file}
    if output is not None:
        options['output'] = output
    return TensorflowPredict2D(**options)(embeddings)


def _median_head(graph_file, embeddings, output="model/Softmax"):
    """Median over axis 0 of a head's predictions (one value per output column)."""
    return np.median(_predict_head(graph_file, embeddings, output), axis=0)


def run_essentia_models(audio16k, audio44k):
    """Run the pretrained Essentia TensorFlow models and summarise their outputs.

    Two embedding extractors (Discogs-EffNet and VGGish) are run on
    ``audio16k``; every prediction head then consumes one of those embeddings
    and its predictions are reduced with a median.

    Args:
        audio16k: Mono signal fed to both embedding extractors.
        audio44k: Not used by this function.

    Returns:
        Dict with the VGGish embeddings under ``'Embeddings'``, one scalar per
        regression/classification output, and two dicts
        (``'Jamendo Labels'``, ``'Jamendo Instruments'``) mapping class names
        to median activations.
    """
    # Embeddings shared by all heads below.
    discogs_embeddings = TensorflowPredictEffnetDiscogs(
        graphFilename=models_path + '/discogs-effnet-bs64-1.pb',
        output="PartitionedCall:1",
    )(audio16k)
    vggish_embeddings = TensorflowPredictVGGish(
        graphFilename=models_path + '/audioset-vggish-3.pb',
        output="model/vggish/embeddings",
    )(audio16k)

    # Regression heads: median over all (squeezed) predictions.
    approachability = np.median(np.squeeze(_predict_head(
        'approachability_regression-discogs-effnet-1.pb', discogs_embeddings, "model/Identity")))
    engagement = np.median(np.squeeze(_predict_head(
        'engagement_regression-discogs-effnet-1.pb', discogs_embeddings, "model/Identity")))

    # Column 0 is read as valence and column 1 as arousal.
    arousal_valence = _median_head('deam-audioset-vggish-2.pb', vggish_embeddings, "model/Identity")
    valence = arousal_valence[0]
    arousal = arousal_valence[1]

    # NOTE(review): every binary head below reads column 0 as the named class;
    # the column order is defined by each model's metadata and is not visible
    # here — confirm against the 'classes' list shipped with each model.
    aggressive = _median_head('mood_aggressive-audioset-vggish-1.pb', vggish_embeddings)[0]
    happy = _median_head('mood_happy-audioset-vggish-1.pb', vggish_embeddings)[0]
    party = _median_head('mood_party-audioset-vggish-1.pb', vggish_embeddings)[0]
    relaxed = _median_head('mood_relaxed-audioset-vggish-1.pb', vggish_embeddings)[0]
    sad = _median_head('mood_sad-audioset-vggish-1.pb', vggish_embeddings)[0]

    # Multi-label heads use the predictor's default output node and are paired
    # with the class names loaded from the metadata JSON files.
    jamendo_values = _median_head(
        'mtg_jamendo_moodtheme-discogs-effnet-1.pb', discogs_embeddings, output=None)
    jamendo_dict = dict(zip(jamendo_classes, jamendo_values))

    jamendo_instrument_values = _median_head(
        'mtg_jamendo_instrument-discogs-effnet-1.pb', discogs_embeddings, output=None)
    jamendo_instrument_dict = dict(zip(jamendo_instrument_classes, jamendo_instrument_values))

    acoustic = _median_head('mood_acoustic-audioset-vggish-1.pb', vggish_embeddings)[0]
    electronic = _median_head('mood_electronic-audioset-vggish-1.pb', vggish_embeddings)[0]

    # Two-class heads where both columns are reported.
    voice_instrumental = _median_head('voice_instrumental-audioset-vggish-1.pb', vggish_embeddings)
    gender = _median_head('gender-audioset-vggish-1.pb', vggish_embeddings)
    timbre = _median_head('timbre-discogs-effnet-1.pb', discogs_embeddings)
    reverb = _median_head('nsynth_reverb-discogs-effnet-1.pb', discogs_embeddings)

    # Key order is kept stable because it determines the order in which the
    # caller adds columns to its table.
    features = {
        'Embeddings': vggish_embeddings,
        'Approachability': approachability,
        'Engagement': engagement,
        'Valence': valence,
        'Arousal': arousal,
        'Aggressive': aggressive,
        'Happy': happy,
        'Party': party,
        'Relaxed': relaxed,
        'Sad': sad,
        'Jamendo Labels': jamendo_dict,
        'Jamendo Instruments': jamendo_instrument_dict,
        'Acoustic': acoustic,
        'Electronic': electronic,
        'Voice': voice_instrumental[0],
        'Instrumental': voice_instrumental[1],
        'Male': gender[1],
        'Female': gender[0],
        'Bright': timbre[0],
        'Dark': timbre[1],
        'Dry': reverb[0],
        'Wet': reverb[1]
    }
    return features

In [11]:
def extract_features(audio_file):
    """Extract every feature for one audio file and merge them into one dict.

    Args:
        audio_file: Path of the audio file to analyse.

    Returns:
        Dict combining the signal-level features, the model-based features,
        and the mel-spectrogram (``'Spectrogram'``) with its sample rate
        (``'Spectrogram Sample Rate'``).
    """
    # Load the audio twice: resampled to 16 kHz, and at MonoLoader's default rate.
    audio16k = MonoLoader(filename=audio_file, sampleRate=16000)()
    audio44k = MonoLoader(filename=audio_file)()

    print("START")
    stage_start = time.time()

    # Pass each signal to its matching parameter. Previously the 16 kHz signal
    # was passed for both, so the second load was never used.
    algorithm_features = run_essentia_algorithms(audio16k, audio44k)
    algo_done = time.time()
    print("ALGO: ", algo_done - stage_start)

    model_features = run_essentia_models(audio16k, audio44k)
    model_done = time.time()
    print("MODEL: ", model_done - algo_done)

    spectrogram, sample_rate = convert_mp3_to_spectrogram(audio_file)
    # Measured from the end of the previous stage, so each printed duration
    # covers exactly one stage.
    print("SPEC: ", time.time() - model_done)

    # Merge results and return
    features = algorithm_features | model_features | {'Spectrogram': spectrogram, 'Spectrogram Sample Rate': sample_rate}
    return features

## Main Code

In [12]:
# Run configuration
DOWNLOAD_FOLDER = "songs"  # folder passed to the downloader for the MP3 files
BATCH_NUM = 0              # batch index; also part of the output CSV file name
BATCH_SIZE = 10            # batch length used by the processing loop

In [13]:
# Input table of songs, plus an independent deep copy that receives the
# extracted feature columns.
songs_data = pd.read_csv('data/songs_final.csv')
songs_data_full = songs_data.copy()

In [14]:
def process_song(song, song_index):
    """Download one song, store its features in ``songs_data_full``, delete the MP3.

    Args:
        song: Row-like object exposing ``.get('videoID')``.
        song_index: Row label in ``songs_data_full`` that receives the features.

    Returns:
        None. The function returns early when the row has no video id or the
        download yields no file. Exceptions raised during feature extraction
        propagate to the caller after the downloaded file has been removed.
    """
    song_id = song.get('videoID')
    # A missing value read from CSV is NaN, which is truthy, so it has to be
    # tested explicitly in addition to None and the empty string.
    if song_id is None or pd.isna(song_id) or not song_id:
        print(f"Error: videoID not found. Song: {song}")
        return

    # Download song
    song_path = download_song_as_mp3(song_id, DOWNLOAD_FOLDER)
    if not song_path:
        return

    try:
        # Extract song features
        song_features = extract_features(song_path)
        for feature, value in song_features.items():
            # Container values need an object-dtype column to be stored in a
            # single cell; create it on first use.
            if isinstance(value, (tuple, set, list, np.ndarray, dict)) and feature not in songs_data_full.columns:
                songs_data_full[feature] = np.nan
                songs_data_full[feature] = songs_data_full[feature].astype(object)
            songs_data_full.at[song_index, feature] = value
    finally:
        # Delete song even when extraction fails, so failed songs do not pile
        # up in the download folder.
        if os.path.exists(song_path):
            os.remove(song_path)

In [15]:
def download_songs_and_extract_features():
    """Process the songs that belong to the current batch.

    The batch is the slice of ``songs_data`` at positions
    ``[BATCH_NUM * BATCH_SIZE, (BATCH_NUM + 1) * BATCH_SIZE)``; the last batch
    may be shorter. Each row is handed to ``process_song`` together with its
    index label.
    """
    batch_start = BATCH_SIZE * BATCH_NUM
    # Bound the batch at both ends. Previously only rows before the batch were
    # skipped, so the loop ran on to the end of the dataset.
    batch = songs_data.iloc[batch_start:batch_start + BATCH_SIZE]

    # The bar counts only the rows that are actually processed.
    progress_bar = tqdm(total=len(batch), desc='Processing song batches')
    try:
        for song_index, song in batch.iterrows():
            process_song(song, song_index)
            progress_bar.update(1)
    finally:
        progress_bar.close()

In [16]:
# Run the download + feature-extraction loop; results are written into `songs_data_full`.
download_songs_and_extract_features()

Processing song batches:   0%|                                                                   | 0/10 [00:00<?, ?it/s]

START                                                      
ALGO:  4.513674259185791


2024-10-16 14:41:53.616330: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:923] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-10-16 14:41:53.616385: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: NVIDIA GeForce RTX 3050 Laptop GPU computeCapability: 8.6
coreClock: 1.5GHz coreCount: 16 deviceMemorySize: 4.00GiB deviceMemoryBandwidth: 178.84GiB/s
2024-10-16 14:41:53.616397: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1766] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2024-10-16 14:41:53.616407: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258

In [17]:
# Display the working table that process_song writes the extracted features into.
songs_data_full

Unnamed: 0,title,artist,views,videoID,duration
0,Special Breed,PolyCulture,34,LlWGt_84jpg,331
1,Unnoticeable,Lost Ambitions,2,TD3za_a4uWo,61
2,Time Dawdles Immersion,Happy Moppy Puppy,19,LlyA3VwQGfk,47
3,Justness,Generallykoi,1,J14sCvTWh3Q,86
4,INTRANSIGEANCE,BFV,47,uAjBGvZFLi4,228
...,...,...,...,...,...
101040,Misery,Maroon 5,10005663,qHCHVHkTpbo,216
101041,What Lies Beneath,Breaking Benjamin,10005140,KuSPuDFHY4s,215
101042,Yeh Dil Deewana,"Sonu Nigam, Hema Sardesai, Shankar Mahadevan",10005017,d7MDzmchS50,426
101043,Work (Freemasons Radio Edit),Kelly Rowland,10004642,vUj62L2fS1c,192


In [18]:
# Save the working table without the index; the batch number is part of the file name.
songs_data_full.to_csv(f'songs_full_{BATCH_NUM}.csv', index=False)

In [None]:
# NOTE(review): the batch number is hard-coded here rather than taken from
# BATCH_NUM; presumably a manual save for batch 1 — confirm it is still needed.
songs_data_full.to_csv('songs_full_1.csv', index=False)

In [None]:
# NOTE(review): `tf` is not imported in any cell shown here; this cell presumably
# needs `import tensorflow as tf` in the imports cell to run — confirm.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

In [None]:
# Show the version string of the Python interpreter running this kernel.
import sys
sys.version