### Environment Setup

In [None]:
import tensorflow as tf
# Enable GPU memory growth
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print("Memory growth enabled on GPU.")
    except RuntimeError as e:
        print(e)

import os
# Hide unnecessary TensorFlow messages
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import sys
import essentia.standard as es
import pandas as pd
from pprint import pprint

import numpy as np
%matplotlib inline


# Determine project root (assuming the notebook is in the notebooks/ folder)
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
if project_root not in sys.path:
    sys.path.insert(0, project_root)
print("Project root:", project_root)
print("Current PYTHONPATH (first few entries):", sys.path[:3])

# Define paths
raw_dir = os.path.join(project_root, "data", "raw")
sample_audio = os.path.join(raw_dir, "example.mp3")  # Ensure this file exists
tempo_model_file = os.path.join(project_root, "src", "deeptemp-k16-3.pb")
print("Sample audio file:", sample_audio)
print("Tempo model file:", tempo_model_file)


### Test Audio Loading

In [None]:
from src.load_audio import load_audio_file

# Load the sample file once; the keys read below are the ones this notebook
# relies on in later cells.
audio_dict = load_audio_file(
    sample_audio,
    targetMonoSampleRate=44100,
    targetTempoSampleRate=11025,
)

print("Returned keys:")
returned_keys = list(audio_dict.keys())
pprint(returned_keys)

print("\nDetails of loaded audio:")
print("Stereo audio (first 5 samples):")
stereo_head = audio_dict['stereo_audio'][:5]
pprint(stereo_head)

# Lengths of the two mono signals, then the loader's reported metadata.
print(
    "Mono audio length (for key extraction):",
    len(audio_dict['mono_audio']),
)
print(
    "Mono audio length (for tempo extraction):",
    len(audio_dict['mono_tempo']),
)
print(
    "Sample rate used for mono audio:",
    audio_dict['sampleRate'],
)
print(
    "Number of channels in original file:",
    audio_dict['numChannels'],
)


### Test Individual Feature Extractors

In [None]:
from src.extract_tempo import extract_tempo_features
from src.extract_key import extract_key_features
from src.extract_loudness import extract_loudness_features


def _show_features(heading, features):
    """Print `heading` on its own line, then pretty-print `features`."""
    print(heading)
    pprint(features)


# Tempo: runs on the mono signal loaded for tempo extraction.
tempo_features = extract_tempo_features(
    audio_dict['mono_tempo'],
    method='tempocnn',
    model_file=tempo_model_file,
)
_show_features("Tempo Features:", tempo_features)

# Key: runs on the mono signal loaded for key extraction.
key_features = extract_key_features(audio_dict['mono_audio'])
_show_features("\nKey Features:", key_features)

# Loudness: runs on the stereo signal.
loudness_features = extract_loudness_features(
    audio_dict['stereo_audio'],
    hopSize=1024/44100,
    sampleRate=44100,
    startAtZero=True,
)
_show_features("\nLoudness Features:", loudness_features)


### Test Embedding Extraction

In [None]:
# Embedding extractors under test.
from src.extract_embeddings import (
    extract_discogs_effnet_embeddings,
    extract_msd_musicnn_embeddings,
)

# Decode the sample as mono at 16 kHz for the embedding extractors.
embedding_loader = es.MonoLoader(
    filename=sample_audio,
    sampleRate=16000,
    resampleQuality=4,
)
audio_embeddings = embedding_loader()
print("Loaded audio for embeddings length:", len(audio_embeddings))

# Model graph locations (update the filenames if your copies are named differently).
model_dir = os.path.join(project_root, "src")
discogs_model_file = os.path.join(model_dir, "discogs-effnet-bs64-1.pb")
musicnn_model_file = os.path.join(model_dir, "msd-musicnn-1.pb")

# Discogs-Effnet embedding.
discogs_embedding = extract_discogs_effnet_embeddings(
    audio_embeddings,
    model_file=discogs_model_file,
)
print("Discogs-Effnet embedding shape:", discogs_embedding.shape)
print("Discogs-Effnet embedding:")
pprint(discogs_embedding)

# MSD-MusicCNN embedding.
musicnn_embedding = extract_msd_musicnn_embeddings(
    audio_embeddings,
    model_file=musicnn_model_file,
)
print("MSD-MusicCNN embedding shape:", musicnn_embedding.shape)
print("MSD-MusicCNN embedding:")
pprint(musicnn_embedding)



### Test Genre Extraction

In [None]:
from src.extract_genre import extract_genre_features

# Location of the Genre Discogs400 model file.
model_dir = os.path.join(project_root, "src")
genre_model_file = os.path.join(model_dir, "genre_discogs400-discogs-effnet-1.pb")
print("Genre model file:", genre_model_file)

# Feed the Discogs-Effnet embedding computed earlier in the notebook.
# It is expected to be a 1D numpy array (averaged over frames) at this point.
genre_predictions = extract_genre_features(
    discogs_embedding,
    model_file=genre_model_file,
)

genre_shape = genre_predictions.shape
print("Genre predictions shape:", genre_shape)
print("Genre predictions:")
pprint(genre_predictions)

### Test Voice/Instrumental Classification

In [None]:
from src.extract_voice_instrumental import extract_voice_instrumental

# Assumes discogs_embedding was already extracted by the embedding cell.
# The classifier is given a 2D array, so a 1D vector gets a leading axis.
# The 2D view is bound to a cell-local name instead of overwriting
# `discogs_embedding`: rebinding the shared variable would change its shape
# for any other cell that is (re-)run afterwards, e.g. the genre cell, which
# documents a 1D input.
discogs_embedding_2d = (
    np.expand_dims(discogs_embedding, axis=0)
    if discogs_embedding.ndim == 1
    else discogs_embedding
)

voice_result = extract_voice_instrumental(
    discogs_embedding_2d,
    model_file=os.path.join(project_root, "src", "voice_instrumental-discogs-effnet-1.pb"),
)
print("Voice/Instrumental Classification:")
pprint(voice_result)


### Test Danceability

In [None]:
from src.extract_danceability import extract_danceability_features

# Test signal-based danceability extraction (runs directly on the mono audio).
dance_signal = extract_danceability_features(audio_dict['mono_audio'], mode="signal", sampleRate=44100)
print("Signal-based Danceability:")
pprint(dance_signal)

# Test classifier-based danceability extraction.
# The classifier is given a 2D array, so a 1D vector gets a leading axis.
# The 2D view is bound to a cell-local name instead of overwriting
# `discogs_embedding`, so running this cell does not change the shape that
# other cells see when they are (re-)run afterwards.
discogs_embedding_2d = (
    np.expand_dims(discogs_embedding, axis=0)
    if discogs_embedding.ndim == 1
    else discogs_embedding
)
dance_classifier = extract_danceability_features(
    discogs_embedding_2d,
    mode="classifier",
    model_file=os.path.join(project_root, "src", "danceability-discogs-effnet-1.pb"),
)
print("\nClassifier-based Danceability:")
pprint(dance_classifier)



### Test Emotion

In [None]:
from src.extract_arousal_valence import extract_arousal_valence_features

# Decode the sample as mono at 16 kHz for emotion extraction.
emotion_loader = es.MonoLoader(
    filename=sample_audio,
    sampleRate=16000,
    resampleQuality=4,
)
audio_emotion = emotion_loader()
print("Loaded audio for emotion extraction length:", len(audio_emotion))

# Arousal/valence extraction takes two model files: the MSD-MusicCNN
# embedding model and the regression model applied on top of it.
emo_embedding_model_path = os.path.join(project_root, "src", "msd-musicnn-1.pb")
emo_regression_model_path = os.path.join(project_root, "src", "emomusic-msd-musicnn-2.pb")
emotion_predictions = extract_arousal_valence_features(
    audio_emotion,
    embedding_model_file=emo_embedding_model_path,
    regression_model_file=emo_regression_model_path,
)
print("Arousal/Valence predictions:")
pprint(emotion_predictions)


### Test Integrated Pipeline

In [None]:
from src.audio_analysis2 import extract_all_features

# Model files for genre, voice/instrumental, embeddings, danceability and emotion.
# All of them live in the project's src/ folder.
model_dir = os.path.join(project_root, "src")
genre_model_file = os.path.join(model_dir, "genre_discogs400-discogs-effnet-1.pb")
voice_model_file = os.path.join(model_dir, "voice_instrumental-discogs-effnet-1.pb")
discogs_model_file = os.path.join(model_dir, "discogs-effnet-bs64-1.pb")
musicnn_model_file = os.path.join(model_dir, "msd-musicnn-1.pb")
danceability_model_file = os.path.join(model_dir, "danceability-discogs-effnet-1.pb")
emotion_model_file = os.path.join(model_dir, "emomusic-msd-musicnn-2.pb")

# Keyword arguments that tell the pipeline which model file serves which stage.
pipeline_models = {
    "tempo_model_file": tempo_model_file,
    "emb_discogs_model_file": discogs_model_file,
    "emb_msd_model_file": musicnn_model_file,
    "genre_model_file": genre_model_file,
    "voice_model_file": voice_model_file,
    "danceability_model_file": danceability_model_file,
    "emotion_model_file": emotion_model_file,
}

# Run the whole pipeline on the already-loaded sample: embeddings, genre
# activations, voice/instrumental, danceability (classifier mode) and
# emotion (arousal/valence).
all_features = extract_all_features(
    audio_dict,
    tempo_method='tempocnn',
    **pipeline_models,
)

print("\nAll Integrated Extracted Features (with embeddings, genre, voice, danceability, and emotion):")
pprint(all_features)

# Show the same result as a one-row DataFrame (rendered as the cell output).
df = pd.DataFrame([all_features])
df



### Estimate total extraction time

In [None]:
import time
import os
import glob
import csv
from tqdm import tqdm

# Maximum number of files that are actually processed for the timing sample.
SAMPLE_LIMIT = 10

# Define the sample folder (e.g., one of the audio chunk folders).
sample_folder = os.path.join(project_root, "data", "raw", "audio_chunks", "audio.000")

# Recursively find all .mp3 files in that folder. glob returns them in
# arbitrary order, so sort to time the same sample on every run.
sample_files = sorted(glob.glob(os.path.join(sample_folder, "**", "*.mp3"), recursive=True))
# Filter out non-audio artifacts (e.g., files with ":" in the name).
# Only the file NAME is tested, matching the full-directory count further
# down: the directory part may legitimately contain ":" (e.g. a Windows
# drive letter), which would otherwise discard every file.
sample_files = [f for f in sample_files if ":" not in os.path.basename(f)]

if not sample_files:
    raise ValueError("No sample audio files were found in the folder: " + sample_folder)

# Limit the sample size.
sample_files = sample_files[:SAMPLE_LIMIT]
print(f"Processing a sample of {len(sample_files)} files for time estimation...")

# Sample CSV that receives one row per successfully processed file.
sample_csv = os.path.join(project_root, "data", "processed", "sample_features.csv")
# Make sure the output folder exists; open(..., 'w') does not create it.
os.makedirs(os.path.dirname(sample_csv), exist_ok=True)

failed_files = []  # paths whose loading/extraction raised
start_time = time.time()

# Explicit UTF-8 so non-ASCII file paths are written the same way on every platform.
with open(sample_csv, 'w', newline='', encoding='utf-8') as csvfile:
    writer = None  # We'll create the DictWriter once we have a feature dictionary.
    for file_path in tqdm(sample_files, desc="Processing sample files"):
        try:
            # Load the audio and extract features. A loop-local name is used
            # so the notebook-level `audio_dict` (the example file) is not
            # overwritten by the last sample file.
            sample_audio_dict = load_audio_file(file_path, targetMonoSampleRate=44100, targetTempoSampleRate=11025)
            features = extract_all_features(
                sample_audio_dict,
                tempo_method='tempocnn',
                tempo_model_file=tempo_model_file,
                emb_discogs_model_file=discogs_model_file,
                emb_msd_model_file=musicnn_model_file,
                genre_model_file=genre_model_file,
                voice_model_file=voice_model_file,
                danceability_model_file=danceability_model_file,
                emotion_model_file=emotion_model_file
            )
            # Include the file path for reference.
            features['file'] = file_path
            # Initialize the CSV writer on the first successful extraction;
            # the first feature dict fixes the column set.
            if writer is None:
                fieldnames = list(features.keys())
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                writer.writeheader()
            writer.writerow(features)
        except Exception as e:
            # Best effort: report the failure and continue with the next file.
            print(f"Error processing {file_path}: {e}")
            failed_files.append(file_path)
end_time = time.time()
sample_duration = end_time - start_time
# Average over all attempted files (failed attempts included).
avg_time_per_file = sample_duration / len(sample_files)

if failed_files:
    # Failed files may abort early, which makes the average optimistic.
    print(f"Warning: {len(failed_files)} of {len(sample_files)} sample files failed; "
          "the estimate below includes those failed attempts.")

# Count all audio files in the entire raw directory.
audio_extensions = (".mp3", ".wav", ".flac", ".ogg", ".m4a")
all_files = []
for root, _, files in os.walk(os.path.join(project_root, "data", "raw")):
    for file in files:
        if file.lower().endswith(audio_extensions) and ":" not in file:
            all_files.append(os.path.join(root, file))
total_files = len(all_files)
estimated_total_time = avg_time_per_file * total_files

print(f"\nSample processing time: {sample_duration:.2f} seconds for {len(sample_files)} files")
print(f"Average time per file: {avg_time_per_file:.2f} seconds")
print(f"Estimated total processing time for {total_files} files: {estimated_total_time:.2f} seconds (~{estimated_total_time/60:.2f} minutes)")
print(f"Sample features saved to: {sample_csv}")


In [None]:
# Run the checkpoint-based extraction over the raw data directory.

from src.audio_analysis2 import process_all_audio_with_checkpoint

# Input directory with the raw audio and output directory for the
# checkpoint (features) files.
raw_dir = os.path.join(project_root, "data", "raw")
checkpoint_dir = os.path.join(project_root, "data", "processed", "features")

# Model file paths; all model graphs live in the project's src/ folder.
model_dir = os.path.join(project_root, "src")
tempo_model_file = os.path.join(model_dir, "deeptemp-k16-3.pb")
emb_discogs_model_file = os.path.join(model_dir, "discogs-effnet-bs64-1.pb")
emb_msd_model_file = os.path.join(model_dir, "msd-musicnn-1.pb")
genre_model_file = os.path.join(model_dir, "genre_discogs400-discogs-effnet-1.pb")
voice_model_file = os.path.join(model_dir, "voice_instrumental-discogs-effnet-1.pb")
danceability_model_file = os.path.join(model_dir, "danceability-discogs-effnet-1.pb")
emotion_model_file = os.path.join(model_dir, "emomusic-msd-musicnn-2.pb")

# Keyword arguments that map each pipeline stage to its model file.
checkpoint_models = {
    "tempo_model_file": tempo_model_file,
    "emb_discogs_model_file": emb_discogs_model_file,
    "emb_msd_model_file": emb_msd_model_file,
    "genre_model_file": genre_model_file,
    "voice_model_file": voice_model_file,
    "danceability_model_file": danceability_model_file,
    "emotion_model_file": emotion_model_file,
}

# Run the extraction with checkpointing.
process_all_audio_with_checkpoint(
    raw_dir,
    checkpoint_dir,
    tempo_method="tempocnn",
    **checkpoint_models,
)
