### Audio data

We have scraped the 30-second previews of the targeted songs from Deezer, and now we want to create a dataset containing an embedding representation of each audio clip. We will leverage the MERT model and its encoder to produce the vector representations.

In [38]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [39]:
# Build the audio manifest in a single chain:
#   1. read the manifest CSV,
#   2. discard every row that has at least one missing value,
#   3. prepend 'audio_' to each audio_path entry.
# NOTE(review): presumably the stored paths start with 'data/...', so the
# prefix turns them into 'audio_data/...' — confirm against the CSV.
audio_manifest = (
    pd.read_csv('audio_data/audio_manifest_20k.csv')
    .dropna()
    .assign(audio_path=lambda manifest: 'audio_' + manifest['audio_path'])
)

In [40]:
# Load the MERT model
from transformers import AutoProcessor, AutoModel
import torch
import torchaudio

# Hub checkpoint shared by the processor and the model, named once so the two
# loads cannot drift apart.
MERT_CHECKPOINT = "m-a-p/MERT-v1-330M"

# trust_remote_code=True allows transformers to execute the Python code that
# ships with the checkpoint repository when building these objects.
processor = AutoProcessor.from_pretrained(MERT_CHECKPOINT, trust_remote_code=True)
model = AutoModel.from_pretrained(MERT_CHECKPOINT, trust_remote_code=True)




In [43]:
# Extract embeddings for all songs
import torch
import torchaudio
import pandas as pd
import numpy as np
from tqdm import tqdm

# Embed every song of the manifest with MERT: load the clip, downmix to mono,
# resample, run the encoder and average the last hidden state over time.
# Results are accumulated in `embeddings_list` / `track_ids` (same order);
# clips that cannot be read are collected in `failed_tracks` instead of
# aborting the whole (long) run.

# Run on the GPU when torch can see one, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Set evaluation mode explicitly instead of relying on how the model was
# loaded or on what earlier cells may have done to it.
model.eval()

# Sampling rate the processor is configured for. The previous version of this
# cell hard-coded 24 kHz; that value is kept as the fallback in case the
# processor object does not expose the attribute.
TARGET_SR = getattr(processor, "sampling_rate", 24000)

embeddings_list = []   # one 1-D numpy vector per successfully embedded song
track_ids = []         # track_id of each entry in embeddings_list, same order
failed_tracks = []     # (track_id, audio_path, reason) for every skipped song

total_songs = len(audio_manifest)
print(f"Starting audio embedding extraction for {total_songs} songs...\n")

# Only two columns are needed, so iterate over them directly rather than
# building a Series per row with iterrows().
song_iter = zip(audio_manifest['track_id'], audio_manifest['audio_path'])

for track_id, audio_path in tqdm(song_iter, total=total_songs, desc="Embedding songs"):
    # Load audio. A missing or corrupt file is recorded and skipped so that a
    # single bad clip does not throw away everything computed so far.
    try:
        waveform, sr = torchaudio.load(audio_path)
    except (RuntimeError, OSError) as err:
        failed_tracks.append((track_id, audio_path, str(err)))
        continue

    # A file that decodes to zero samples cannot be embedded.
    if waveform.numel() == 0:
        failed_tracks.append((track_id, audio_path, "empty audio"))
        continue

    # Convert multi-channel audio to mono by averaging the channels
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    # Resample to the rate expected by the processor
    if sr != TARGET_SR:
        waveform = torchaudio.functional.resample(waveform, sr, TARGET_SR)

    # Squeeze: [1, N] → [N]
    waveform = waveform.squeeze(0)

    # Prepare inputs and move every tensor to the model's device
    inputs = processor(waveform, sampling_rate=TARGET_SR, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Forward pass (no gradients needed for feature extraction)
    with torch.no_grad():
        outputs = model(**inputs)

    # Embedding: mean over the time axis, drop the batch axis → (1024,)
    song_emb = outputs.last_hidden_state.mean(dim=1).squeeze(0).cpu().numpy()

    # Store results; both lists grow together so they stay aligned
    track_ids.append(track_id)
    embeddings_list.append(song_emb)

print("\nAudio embedding extraction completed!")

# Report skipped songs explicitly so a row-count mismatch downstream is explainable.
if failed_tracks:
    print(f"[WARN] Skipped {len(failed_tracks)} of {total_songs} songs that could not be embedded:")
    for failed_id, failed_path, reason in failed_tracks[:10]:
        print(f"  - track_id={failed_id} ({failed_path}): {reason}")
    if len(failed_tracks) > 10:
        print(f"  ... and {len(failed_tracks) - 10} more (see `failed_tracks`)")


Starting audio embedding extraction for 20405 songs...



Embedding songs: 100%|██████████| 20405/20405 [1:15:38<00:00,  4.50it/s]


Audio embedding extraction completed!





In [44]:
# Assemble the embeddings table and persist it as parquet.

# Stack the per-song vectors into one 2-D array: one row per song.
emb_matrix = np.vstack(embeddings_list)
# One column per embedding dimension, named audio_emb_0, audio_emb_1, ...
emb_columns = [f"audio_emb_{i}" for i in range(len(embeddings_list[0]))]

audio_emb_df = pd.DataFrame(emb_matrix, columns=emb_columns)

# Put track_id in front as the first column.
audio_emb_df.insert(0, 'track_id', track_ids)

# index=False: the positional index carries no information worth storing.
audio_emb_df.to_parquet("audio_data/audio_embeddings_mert_330M.parquet", index=False)

print("Embeddings saved to audio_embeddings_mert_330M.parquet")


Embeddings saved to audio_embeddings_mert_330M.parquet


In [45]:
# Verify that the embeddings table has one row per manifest entry and one
# column per embedding dimension plus the leading track_id column.
expected_rows, expected_cols = len(audio_manifest), 1 + len(embeddings_list[0])
actual_rows, actual_cols = audio_emb_df.shape

print(f"\n[CHECK] Expected shape: ({expected_rows}, {expected_cols})")
print(f"[CHECK] Actual shape:   ({actual_rows}, {actual_cols})")

rows_ok = actual_rows == expected_rows
cols_ok = actual_cols == expected_cols

if not (rows_ok and cols_ok):
    # Report which dimension(s) disagree.
    print("[FAIL] Shape mismatch!")
    if not rows_ok:
        print(f"  → Row mismatch: expected {expected_rows}, got {actual_rows}")
    if not cols_ok:
        print(f"  → Col mismatch: expected {expected_cols}, got {actual_cols}")
else:
    print("[PASS] audio_emb_df shape is correct!")



[CHECK] Expected shape: (20405, 1025)
[CHECK] Actual shape:   (20405, 1025)
[PASS] audio_emb_df shape is correct!
