### Environment Setup

In [1]:
import tensorflow as tf
# Enable GPU memory growth
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print("Memory growth enabled on GPU.")
    except RuntimeError as e:
        print(e)

import os
# Hide unnecessary TensorFlow messages
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import sys
import essentia.standard as es
import pandas as pd
from pprint import pprint

import numpy as np
%matplotlib inline


# Determine project root (assuming the notebook is in the notebooks/ folder)
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
if project_root not in sys.path:
    sys.path.insert(0, project_root)
print("Project root:", project_root)
print("Current PYTHONPATH (first few entries):", sys.path[:3])

# Define paths
raw_dir = os.path.join(project_root, "data", "raw")
sample_audio = os.path.join(raw_dir, "example.mp3")  # Ensure this file exists
tempo_model_file = os.path.join(project_root, "src", "deeptemp-k16-3.pb")
print("Sample audio file:", sample_audio)
print("Tempo model file:", tempo_model_file)


2025-02-13 18:41:05.586357: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:922] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2025-02-13 18:41:05.637322: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:922] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2025-02-13 18:41:05.637382: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:922] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.


Memory growth enabled on GPU.


[   INFO   ] MusicExtractorSVM: no classifier models were configured by default


Project root: /home/cepatinog/amplab
Current PYTHONPATH (first few entries): ['/home/cepatinog/amplab', '/home/cepatinog/miniconda3/envs/amplab_essentia/lib/python310.zip', '/home/cepatinog/miniconda3/envs/amplab_essentia/lib/python3.10']
Sample audio file: /home/cepatinog/amplab/data/raw/example.mp3
Tempo model file: /home/cepatinog/amplab/src/deeptemp-k16-3.pb


### Test Audio Loading

In [2]:
from src.load_audio import load_audio_file

# Load the sample file once; the later cells read their inputs from audio_dict.
audio_dict = load_audio_file(
    sample_audio,
    targetMonoSampleRate=44100,
    targetTempoSampleRate=11025,
)

print("Returned keys:")
pprint(list(audio_dict.keys()))

print("\nDetails of loaded audio:")
print("Stereo audio (first 5 samples):")
stereo_preview = audio_dict['stereo_audio'][:5]
pprint(stereo_preview)

# Report the length of each mono signal next to its label.
for label, key in (
    ("Mono audio length (for key extraction):", 'mono_audio'),
    ("Mono audio length (for tempo extraction):", 'mono_tempo'),
):
    print(label, len(audio_dict[key]))

# Report the scalar metadata entries as returned by the loader.
for label, key in (
    ("Sample rate used for mono audio:", 'sampleRate'),
    ("Number of channels in original file:", 'numChannels'),
):
    print(label, audio_dict[key])


Returned keys:
['stereo_audio', 'mono_audio', 'mono_tempo', 'sampleRate', 'numChannels']

Details of loaded audio:
Stereo audio (first 5 samples):
array([[-0.10971069,  0.09204102],
       [-0.17526245,  0.07278442],
       [-0.29336548, -0.0501709 ],
       [-0.28570557,  0.00180054],
       [-0.22473145,  0.12976074]], dtype=float32)
Mono audio length (for key extraction): 1323001
Mono audio length (for tempo extraction): 330750
Sample rate used for mono audio: 44100.0
Number of channels in original file: 2


### Test Individual Feature Extractors

In [3]:
from src.extract_tempo import extract_tempo_features
from src.extract_key import extract_key_features
from src.extract_loudness import extract_loudness_features

# Tempo: run the 'tempocnn' method on the mono signal prepared for tempo
# extraction, using the model file defined in the setup cell.
tempo_features = extract_tempo_features(
    audio_dict['mono_tempo'],
    method='tempocnn',
    model_file=tempo_model_file,
)
print("Tempo Features:")
pprint(tempo_features)

# Key: computed from the mono signal prepared for key extraction.
key_features = extract_key_features(audio_dict['mono_audio'])
print("\nKey Features:")
pprint(key_features)

# Loudness: computed from the stereo signal.
# NOTE(review): 1024/44100 looks like a 1024-sample hop expressed in seconds
# at 44.1 kHz -- confirm against extract_loudness_features.
loudness_hop = 1024/44100
loudness_features = extract_loudness_features(
    audio_dict['stereo_audio'],
    hopSize=loudness_hop,
    sampleRate=44100,
    startAtZero=True,
)
print("\nLoudness Features:")
pprint(loudness_features)


[   INFO   ] TensorflowPredict: Successfully loaded graph file: `/home/cepatinog/amplab/src/deeptemp-k16-3.pb`


Tempo Features:
{'global_bpm': 128.0,
 'local_bpms': [128.0, 128.0, 128.0, 128.0],
 'local_probs': [0.9546994566917419,
                 0.9014107584953308,
                 0.8928113579750061,
                 0.8102709054946899]}

Key Features:
{'edma_key': 'F#',
 'edma_scale': 'major',
 'edma_strength': 0.9310499429702759,
 'krumhansl_key': 'F#',
 'krumhansl_scale': 'major',
 'krumhansl_strength': 0.985231876373291,
 'temperley_key': 'F#',
 'temperley_scale': 'major',
 'temperley_strength': 0.9225367903709412}

Loudness Features:
{'integrated_loudness': -7.263375759124756,
 'loudness_range': 1.1281824111938477,
 'momentary_loudness': array([-9.38477 , -9.104578, -8.783603, ..., -8.875216, -9.161896,
       -9.573801], dtype=float32),
 'short_term_loudness': array([-9.571363, -9.482447, -9.383096, ..., -9.565043, -9.629582,
       -9.684465], dtype=float32)}


### Test Embedding Extraction

In [4]:
# Embedding extractors provided by the project.
from src.extract_embeddings import (
    extract_discogs_effnet_embeddings,
    extract_msd_musicnn_embeddings,
)

# Decode the sample file again as mono at 16 kHz; this signal is the input
# for both embedding extractors below.
embedding_loader = es.MonoLoader(
    filename=sample_audio,
    sampleRate=16000,
    resampleQuality=4,
)
audio_embeddings = embedding_loader()
print("Loaded audio for embeddings length:", len(audio_embeddings))

# Model graph files (adjust the filenames if your copies are named differently).
discogs_model_file = os.path.join(project_root, "src", "discogs-effnet-bs64-1.pb")
musicnn_model_file = os.path.join(project_root, "src", "msd-musicnn-1.pb")

# Discogs-EffNet embedding.
discogs_embedding = extract_discogs_effnet_embeddings(
    audio_embeddings,
    model_file=discogs_model_file,
)
print("Discogs-Effnet embedding shape:", discogs_embedding.shape)
print("Discogs-Effnet embedding:")
pprint(discogs_embedding)

# MSD-MusiCNN embedding.
musicnn_embedding = extract_msd_musicnn_embeddings(
    audio_embeddings,
    model_file=musicnn_model_file,
)
print("MSD-MusicCNN embedding shape:", musicnn_embedding.shape)
print("MSD-MusicCNN embedding:")
pprint(musicnn_embedding)



Loaded audio for embeddings length: 480001


[   INFO   ] TensorflowPredict: Successfully loaded graph file: `/home/cepatinog/amplab/src/discogs-effnet-bs64-1.pb`


Discogs-Effnet embedding shape: (1280,)
Discogs-Effnet embedding:
array([-0.00818077,  0.04617088,  0.08795192, ..., -0.0115188 ,
        0.22933835, -0.03071781], dtype=float32)
MSD-MusicCNN embedding shape: (200,)
MSD-MusicCNN embedding:
array([ 1.6210680e+00,  3.7390992e-01, -1.4761760e+00, -1.8138838e+00,
       -1.9977241e+00,  3.1101890e+00,  2.1713927e+00, -1.4830803e+00,
       -5.1325660e+00,  1.1591920e-01,  2.4978249e+00,  2.1131213e+00,
       -2.4963405e+00, -1.3414558e+00,  7.5094149e-02,  3.0526948e+00,
        1.6317570e-01, -4.4245654e-01, -2.4706023e+00, -2.8656778e+00,
        1.4657718e+00,  5.5882281e-01, -2.9023263e+00, -1.3423108e+00,
        7.5930852e-01, -2.9156735e+00,  1.9484429e+00,  3.3846447e+00,
        2.6429994e+00, -2.7658646e+00,  2.5999832e-01, -2.6172974e+00,
        7.8670716e-01, -3.4677293e+00,  3.1758797e+00,  3.5159130e+00,
       -2.7254109e+00,  3.6064587e+00, -2.2145446e-02,  3.6668515e-01,
        2.1951363e+00, -1.8875448e-01, -1.8936421e

[   INFO   ] TensorflowPredict: Successfully loaded graph file: `/home/cepatinog/amplab/src/msd-musicnn-1.pb`


### Test Genre Extraction

In [5]:
from src.extract_genre import extract_genre_features

# Location of the Genre Discogs400 classifier graph.
genre_model_file = os.path.join(
    project_root, "src", "genre_discogs400-discogs-effnet-1.pb"
)
print("Genre model file:", genre_model_file)

# Feed the Discogs-EffNet embedding computed in the embedding cell.
# It is expected to be a 1D numpy array here (averaged over frames).
genre_predictions = extract_genre_features(
    discogs_embedding,
    model_file=genre_model_file,
)

print("Genre predictions shape:", genre_predictions.shape)
print("Genre predictions:")
pprint(genre_predictions)

Genre model file: /home/cepatinog/amplab/src/genre_discogs400-discogs-effnet-1.pb
Genre predictions shape: (400,)
Genre predictions:
array([1.30847070e-07, 8.30177783e-07, 1.30965827e-06, 3.13870174e-07,
       9.18692592e-07, 3.17890851e-07, 8.42679100e-08, 1.78020230e-07,
       6.08618893e-06, 2.35749280e-06, 7.01915724e-06, 3.13942792e-06,
       6.39352447e-06, 1.09354453e-07, 7.15663688e-08, 6.15305225e-06,
       1.07923632e-04, 5.59011369e-06, 8.48912293e-07, 4.40602491e-07,
       1.42137806e-05, 9.19592640e-06, 1.76541889e-07, 4.29456395e-06,
       1.42761326e-06, 8.22987658e-06, 1.53997871e-06, 6.95729104e-05,
       8.18130616e-07, 1.29890353e-07, 2.93427820e-06, 1.21864286e-04,
       1.14853930e-04, 3.45757639e-04, 1.05018982e-04, 8.45106028e-04,
       3.02613154e-02, 1.17283489e-05, 5.58645252e-06, 1.86647649e-03,
       1.11828895e-05, 2.31459201e-03, 7.85330922e-05, 1.31600490e-03,
       1.15292751e-04, 8.69157538e-03, 9.03651642e-04, 9.72750187e-02,
       8.411210

[   INFO   ] TensorflowPredict: Successfully loaded graph file: `/home/cepatinog/amplab/src/genre_discogs400-discogs-effnet-1.pb`
[   INFO   ] TensorflowPredict: Successfully loaded graph file: `/home/cepatinog/amplab/src/genre_discogs400-discogs-effnet-1.pb`


### Test Voice/Instrumental Classification

In [6]:
from src.extract_voice_instrumental import extract_voice_instrumental

# The classifier is given a 2D array, so a 1D embedding gets a leading axis.
# The 2D array is stored under a NEW name instead of overwriting
# `discogs_embedding`: the genre cell expects the 1D vector, and overwriting it
# here would make that cell depend on which cells were run before it.
discogs_embedding_2d = (
    np.expand_dims(discogs_embedding, axis=0)
    if discogs_embedding.ndim == 1
    else discogs_embedding
)

voice_result = extract_voice_instrumental(
    discogs_embedding_2d,
    model_file=os.path.join(project_root, "src", "voice_instrumental-discogs-effnet-1.pb"),
)
print("Voice/Instrumental Classification:")
pprint(voice_result)


Voice/Instrumental Classification:
{'predicted_class': 'voice',
 'predictions': array([0.00353792, 0.99646205], dtype=float32)}


[   INFO   ] TensorflowPredict: Successfully loaded graph file: `/home/cepatinog/amplab/src/voice_instrumental-discogs-effnet-1.pb`
[   INFO   ] TensorflowPredict: Successfully loaded graph file: `/home/cepatinog/amplab/src/voice_instrumental-discogs-effnet-1.pb`


### Test Danceability

In [7]:
from src.extract_danceability import extract_danceability_features

# Signal-based danceability, computed from the mono audio signal.
dance_signal = extract_danceability_features(audio_dict['mono_audio'], mode="signal", sampleRate=44100)
print("Signal-based Danceability:")
pprint(dance_signal)

# Classifier-based danceability, computed from the Discogs-EffNet embedding.
# The classifier is given a 2D array, so a 1D embedding gets a leading axis.
# The result is bound to a NEW name rather than overwriting
# `discogs_embedding`, so re-running cells that expect the 1D vector still works.
discogs_embedding_2d = (
    np.expand_dims(discogs_embedding, axis=0)
    if discogs_embedding.ndim == 1
    else discogs_embedding
)
dance_classifier = extract_danceability_features(
    discogs_embedding_2d,
    mode="classifier",
    model_file=os.path.join(project_root, "src", "danceability-discogs-effnet-1.pb"),
)
print("\nClassifier-based Danceability:")
pprint(dance_classifier)



Signal-based Danceability:
{'danceability': 1.4231427907943726,
 'dfa': [0.7849090099334717,
         0.7395235896110535,
         0.6986629962921143,
         0.6825172901153564,
         0.6640278100967407,
         0.6067122220993042,
         0.5174141526222229,
         0.45753443241119385,
         0.4432242214679718,
         0.45849862694740295,
         0.47390294075012207,
         0.5290201902389526,
         0.6048661470413208,
         0.6491430997848511,
         0.6958308219909668,
         0.7307958006858826,
         0.7874581813812256,
         0.832729160785675,
         0.8220929503440857,
         0.8494006991386414,
         0.8841837048530579,
         0.8800502419471741,
         0.8848588466644287,
         0.9146839380264282,
         0.8871573805809021,
         0.8608101606369019,
         0.922595739364624,
         0.8846914172172546,
         0.8137402534484863,
         0.7432535886764526,
         0.6876935362815857,
         0.645575761795044,
        

[   INFO   ] TensorflowPredict: Successfully loaded graph file: `/home/cepatinog/amplab/src/danceability-discogs-effnet-1.pb`
[   INFO   ] TensorflowPredict: Successfully loaded graph file: `/home/cepatinog/amplab/src/danceability-discogs-effnet-1.pb`


### Test Integrated Pipeline

In [8]:
from src.audio_analysis import extract_all_features

# Model graph files used by the integrated pipeline. They are (re)defined here
# so this cell does not rely on the per-extractor test cells having been run.
src_dir = os.path.join(project_root, "src")
genre_model_file = os.path.join(src_dir, "genre_discogs400-discogs-effnet-1.pb")
voice_model_file = os.path.join(src_dir, "voice_instrumental-discogs-effnet-1.pb")
discogs_model_file = os.path.join(src_dir, "discogs-effnet-bs64-1.pb")
musicnn_model_file = os.path.join(src_dir, "msd-musicnn-1.pb")

# Keyword arguments for the pipeline: tempo method plus one model file per stage.
pipeline_options = {
    "tempo_method": 'tempocnn',
    "tempo_model_file": tempo_model_file,
    "emb_discogs_model_file": discogs_model_file,
    "emb_msd_model_file": musicnn_model_file,
    "genre_model_file": genre_model_file,
    "voice_model_file": voice_model_file,
}

# Run every extractor in one call on the audio loaded earlier.
all_features = extract_all_features(audio_dict, **pipeline_options)

print("\nAll Integrated Extracted Features (with embeddings, genre, and voice classification):")
pprint(all_features)

# Show the same result as a one-row DataFrame (one column per feature).
df = pd.DataFrame([all_features])
df



[   INFO   ] TensorflowPredict: Successfully loaded graph file: `/home/cepatinog/amplab/src/deeptemp-k16-3.pb`
[   INFO   ] TensorflowPredict: Successfully loaded graph file: `/home/cepatinog/amplab/src/discogs-effnet-bs64-1.pb`
[   INFO   ] TensorflowPredict: Successfully loaded graph file: `/home/cepatinog/amplab/src/genre_discogs400-discogs-effnet-1.pb`
[   INFO   ] TensorflowPredict: Successfully loaded graph file: `/home/cepatinog/amplab/src/genre_discogs400-discogs-effnet-1.pb`
[   INFO   ] TensorflowPredict: Successfully loaded graph file: `/home/cepatinog/amplab/src/voice_instrumental-discogs-effnet-1.pb`
[   INFO   ] TensorflowPredict: Successfully loaded graph file: `/home/cepatinog/amplab/src/voice_instrumental-discogs-effnet-1.pb`



All Integrated Extracted Features (with embeddings, genre, and voice classification):
{'danceability_signal': {'danceability': 1.4231427907943726,
                         'dfa': [0.7849090099334717,
                                 0.7395235896110535,
                                 0.6986629962921143,
                                 0.6825172901153564,
                                 0.6640278100967407,
                                 0.6067122220993042,
                                 0.5174141526222229,
                                 0.45753443241119385,
                                 0.4432242214679718,
                                 0.45849862694740295,
                                 0.47390294075012207,
                                 0.5290201902389526,
                                 0.6048661470413208,
                                 0.6491430997848511,
                                 0.6958308219909668,
                                 0.7307958006858826,
 

[   INFO   ] TensorflowPredict: Successfully loaded graph file: `/home/cepatinog/amplab/src/msd-musicnn-1.pb`


Unnamed: 0,tempo_global_bpm,tempo_local_bpms,tempo_local_probs,temperley_key,temperley_scale,temperley_strength,krumhansl_key,krumhansl_scale,krumhansl_strength,edma_key,...,edma_strength,loudness_momentary_loudness,loudness_short_term_loudness,loudness_integrated_loudness,loudness_loudness_range,emb_discogs,genre_activations,voice_instrumental,emb_msd,danceability_signal
0,128.0,"[128.0, 128.0, 128.0, 128.0]","[0.9546994566917419, 0.9014107584953308, 0.892...",F#,major,0.922537,F#,major,0.985232,F#,...,0.93105,"[-9.38477, -9.104578, -8.783603, -8.4422655, -...","[-9.571363, -9.482447, -9.383096, -9.306011, -...",-7.263376,1.128182,"[-0.007370607927441597, 0.04884450510144234, 0...","[1.0846021325505717e-07, 8.567224654143502e-07...","{'predictions': [0.0051579396, 0.9948421], 'pr...","[0.7219624519348145, -0.3700222671031952, 0.89...","{'danceability': 1.4231427907943726, 'dfa': [0..."
