In [10]:
!pip -q install librosa soundfile pandas scikit-learn tensorflow==2.16.1

In [11]:
import os, json, math, random, itertools, functools
import numpy as np
import pandas as pd
import librosa, soundfile as sf
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import MultiLabelBinarizer


Constructing Pathing and Metadata

In [12]:
# Assuming database is in a folder "FSD50K" in the same directory as this .ipynb
ROOT = "FSD50K"

DEV_AUDIO = os.path.join(ROOT, "FSD50K.dev_audio")
EVAL_AUDIO = os.path.join(ROOT, "FSD50K.eval_audio")
GT_DIR    = os.path.join(ROOT, "FSD50K.ground_truth")
# print(GT_DIR)
dev_df  = pd.read_csv(GT_DIR + "/" + "dev.csv")
eval_df = pd.read_csv(GT_DIR + "/" + "eval.csv") # testing data
vocab   = pd.read_csv(GT_DIR + "/" + "vocabulary.csv", header=None)
# print(vocab)

# Labels in dev/eval are comma-separated strings
def split_labels(s): 
    return [t for t in str(s).split(",") if t]

all_labels = sorted(vocab.iloc[:, 1].tolist()) # Extract all possible labels
mlb = MultiLabelBinarizer(classes=all_labels)    # fixed class ordering
mlb.fit([all_labels])  # initialize with full set so order is stable


Getting the Train/Test Split

In [None]:
train_df = dev_df[dev_df["split"]=="train"].copy() # extract training data
val_df   = dev_df[dev_df["split"]=="val"].copy() # extract validation data

# Helper function to convert metadata rows into file paths and label arrays
# df is either the training, validation, or testing data frame
# audio_base is the directory path that has the audio files
def rows_to_examples(df, audio_base):
    fnames = df["fname"].astype(str).tolist()
    paths  = [os.path.join(audio_base, f"{f}.wav") for f in fnames]
    labels = [split_labels(s) for s in df["labels"].tolist()]
    Y      = mlb.transform(labels).astype("float32")
    return paths, Y

train_paths, y_train = rows_to_examples(train_df, DEV_AUDIO)
val_paths,   y_val   = rows_to_examples(val_df,   DEV_AUDIO)
eval_paths,  y_eval  = rows_to_examples(eval_df,  EVAL_AUDIO)
NUM_CLASSES = len(all_labels)


Preprocessing audio to make all samples 32 kHz and 10s

In [None]:
# Hyperparameters
SR          = 22050
DURATION    = 5.0
N_MELS      = 64
HOP_LENGTH  = 512
N_FFT       = 1024

import math, numpy as np, librosa, tensorflow as tf

# Exact frame count used in training
T = math.ceil((SR * DURATION) / HOP_LENGTH) + 1

def load_mel(path):
    # unwrap what tf.numpy_function hands us
    if isinstance(path, np.ndarray):     # 0-D object array
        path = path.item()
    if isinstance(path, (bytes, np.bytes_)):
        path = path.decode("utf-8")
    path = str(path)

    # Use librosa.load for robust decoding on Windows
    y, _ = librosa.load(path, sr=SR, mono=True)

    # pad/crop waveform to fixed duration
    target_len = int(SR * DURATION)
    if len(y) < target_len:
        y = np.pad(y, (0, target_len - len(y)))
    else:
        y = y[:target_len]

    # log-mel
    S = librosa.feature.melspectrogram(
        y=y, sr=SR, n_fft=N_FFT, hop_length=HOP_LENGTH, n_mels=N_MELS, power=2.0
    )
    S_db = librosa.power_to_db(S, ref=np.max).astype(np.float32)  # [mels, frames]
    return np.expand_dims(S_db, -1)  # [mels, frames, 1]

def tf_load_mel(path, label):
    path = tf.cast(path, tf.string)

    mel = tf.numpy_function(load_mel, [path], tf.float32)
    mel = mel[:, :T, :]
    pad_frames = tf.maximum(0, T - tf.shape(mel)[1])
    mel = tf.pad(mel, paddings=[[0, 0], [0, pad_frames], [0, 0]])

    # give Keras a static shape
    mel.set_shape([N_MELS, T, 1])

    return mel, tf.cast(label, tf.float32)


tf.data pipelines

In [36]:
BATCH = 32

# one-time precompute pass (can be interrupted/resumed)
import numpy as np, os, hashlib
def mel_cache_path(wav_path):
    h = hashlib.md5(wav_path.encode()).hexdigest()
    return os.path.join("mel_store", h + ".npy")

os.makedirs("mel_store", exist_ok=True)

def materialize(paths):
    out = []
    for p in paths:
        npy = mel_cache_path(p)
        if not os.path.exists(npy):
            mel = load_mel(p)                  
            np.save(npy, mel.astype(np.float32))
        out.append(npy)
    return out

train_mels = materialize(train_paths)
val_mels   = materialize(val_paths)
eval_mels  = materialize(eval_paths)

def npy_loader(npy_path, label):
    def _load(x):
        if isinstance(x, (bytes, np.bytes_)): x = x.decode()
        return np.load(x)
    mel = tf.numpy_function(_load, [npy_path], tf.float32)
    mel.set_shape([N_MELS, None, 1])
    
    return mel, label

def make_cached_ds(npy_paths, labels, shuffle):
    ds = tf.data.Dataset.from_tensor_slices((npy_paths, labels))
    ds = ds.map(npy_loader, num_parallel_calls=tf.data.AUTOTUNE)
    if shuffle: ds = ds.shuffle(2048)
    return ds.batch(BATCH).cache().prefetch(tf.data.AUTOTUNE)

train_ds = make_cached_ds(train_mels, y_train, True)
val_ds   = make_cached_ds(val_mels,   y_val,   False)
test_ds  = make_cached_ds(eval_mels,  y_eval,  False)

CNN Classifier

In [None]:
from tensorflow.keras import layers as L, models

"""
Build a convolutional neural network (CNN) for multi-label audio classification.
Input: log-mel spectrograms of shape [N_MELS, frames, 1]
Output: per-class probabilities (sigmoid activation) for n_classes labels.
"""
def build_model(n_classes=NUM_CLASSES, input_shape=(N_MELS, None, 1)):
    # input layer, takes a mel-spectrogram with a fixed # of mel bins, fixed # of time frames, and one channel
    inp = L.Input(shape=(N_MELS, math.ceil((SR*DURATION)/HOP_LENGTH)+1, 1))

    # First conv. block with 32 filters, 3x3 kernel size, batch normalization for stabalizing, and ReLU for nonlinearity
    x = L.Conv2D(32, 3, padding="same")(inp); x = L.BatchNormalization()(x); x = L.ReLU()(x)
    x = L.MaxPool2D((2,2))(x)

    # Second conv. block
    x = L.Conv2D(64, 3, padding="same")(x); x = L.BatchNormalization()(x); x = L.ReLU()(x)
    x = L.MaxPool2D((2,2))(x)

    # third conv. block
    x = L.Conv2D(128, 3, padding="same")(x); x = L.BatchNormalization()(x); x = L.ReLU()(x)
    x = L.MaxPool2D((2,2))(x)

    # fourth conv. block
    x = L.Conv2D(256, 3, padding="same")(x); x = L.BatchNormalization()(x); x = L.ReLU()(x)

    # Global pooling to aggregate each feature map into a single val
    # compact representation for a clip
    x = L.GlobalAveragePooling2D()(x)

    # Done to reduce overfitting
    x = L.Dropout(0.3)(x)

    # Output layer, one neuron per class, independent probabilities per class for multi-label classification
    out = L.Dense(n_classes, activation="sigmoid")(x)  # multi-label → sigmoid

    return models.Model(inp, out)

model = build_model()
model.summary()

model.compile(
    optimizer=keras.optimizers.Adam(3e-4), # general learning rate
    loss=keras.losses.BinaryCrossentropy(), # so each class ifs treated as a separate yes/no prediction

    # AUPRC (Area under Precision-Recall curve) - good for imbalanced data.
    # AUROC (Area under ROC curve) - general discriminative metric.
    metrics=[
        keras.metrics.AUC(curve="PR", multi_label=True, num_labels=NUM_CLASSES, name="AUPRC"),
        keras.metrics.AUC(curve="ROC", multi_label=True, num_labels=NUM_CLASSES, name="AUROC"),
    ],
)


Training Model

In [18]:
callbacks = [
    keras.callbacks.ReduceLROnPlateau(patience=3, factor=0.5, verbose=1),
    keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True, verbose=1),
    keras.callbacks.ModelCheckpoint("fsd50k_cnn.h5", monitor="val_AUPRC", mode="max", save_best_only=True),
]

history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=20,
    callbacks=callbacks,
)


Epoch 1/20
[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 181ms/step - AUPRC: 0.0342 - AUROC: 0.5075 - loss: 0.1429



[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m221s[0m 188ms/step - AUPRC: 0.0474 - AUROC: 0.6666 - loss: 0.0727 - val_AUPRC: 0.0346 - val_AUROC: 0.5840 - val_loss: 0.0818 - learning_rate: 3.0000e-04
Epoch 2/20
[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 96ms/step - AUPRC: 0.0852 - AUROC: 0.6138 - loss: 0.0440



[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m158s[0m 99ms/step - AUPRC: 0.1280 - AUROC: 0.7817 - loss: 0.0421 - val_AUPRC: 0.0544 - val_AUROC: 0.6288 - val_loss: 0.0746 - learning_rate: 3.0000e-04
Epoch 3/20
[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 99ms/step - AUPRC: 0.1146 - AUROC: 0.6518 - loss: 0.0399



[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m117s[0m 102ms/step - AUPRC: 0.1716 - AUROC: 0.8267 - loss: 0.0384 - val_AUPRC: 0.0707 - val_AUROC: 0.6515 - val_loss: 0.0721 - learning_rate: 3.0000e-04
Epoch 4/20
[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 97ms/step - AUPRC: 0.1367 - AUROC: 0.6863 - loss: 0.0370



[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m115s[0m 100ms/step - AUPRC: 0.2078 - AUROC: 0.8605 - loss: 0.0358 - val_AUPRC: 0.0901 - val_AUROC: 0.6819 - val_loss: 0.0689 - learning_rate: 3.0000e-04
Epoch 5/20
[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 95ms/step - AUPRC: 0.1571 - AUROC: 0.7052 - loss: 0.0350



[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 98ms/step - AUPRC: 0.2419 - AUROC: 0.8828 - loss: 0.0340 - val_AUPRC: 0.0957 - val_AUROC: 0.6826 - val_loss: 0.0702 - learning_rate: 3.0000e-04
Epoch 6/20
[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 96ms/step - AUPRC: 0.1754 - AUROC: 0.7228 - loss: 0.0334



[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m114s[0m 99ms/step - AUPRC: 0.2708 - AUROC: 0.8975 - loss: 0.0324 - val_AUPRC: 0.1069 - val_AUROC: 0.6849 - val_loss: 0.0702 - learning_rate: 3.0000e-04
Epoch 7/20
[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 97ms/step - AUPRC: 0.1919 - AUROC: 0.7338 - loss: 0.0322



[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m114s[0m 99ms/step - AUPRC: 0.2968 - AUROC: 0.9078 - loss: 0.0312 - val_AUPRC: 0.1145 - val_AUROC: 0.7122 - val_loss: 0.0681 - learning_rate: 3.0000e-04
Epoch 8/20
[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 99ms/step - AUPRC: 0.3197 - AUROC: 0.9165 - loss: 0.0301 - val_AUPRC: 0.1092 - val_AUROC: 0.6893 - val_loss: 0.0716 - learning_rate: 3.0000e-04
Epoch 9/20
[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 95ms/step - AUPRC: 0.2185 - AUROC: 0.7533 - loss: 0.0302



[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m112s[0m 98ms/step - AUPRC: 0.3416 - AUROC: 0.9237 - loss: 0.0292 - val_AUPRC: 0.1199 - val_AUROC: 0.7186 - val_loss: 0.0688 - learning_rate: 3.0000e-04
Epoch 10/20
[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 94ms/step - AUPRC: 0.2306 - AUROC: 0.7568 - loss: 0.0293
Epoch 10: ReduceLROnPlateau reducing learning rate to 0.0001500000071246177.




[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 97ms/step - AUPRC: 0.3617 - AUROC: 0.9279 - loss: 0.0285 - val_AUPRC: 0.1253 - val_AUROC: 0.7175 - val_loss: 0.0699 - learning_rate: 3.0000e-04
Epoch 11/20
[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 94ms/step - AUPRC: 0.2312 - AUROC: 0.7633 - loss: 0.0303



[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 97ms/step - AUPRC: 0.3632 - AUROC: 0.9300 - loss: 0.0288 - val_AUPRC: 0.1428 - val_AUROC: 0.7517 - val_loss: 0.0642 - learning_rate: 1.5000e-04
Epoch 12/20
[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 95ms/step - AUPRC: 0.2466 - AUROC: 0.7717 - loss: 0.0288



[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m112s[0m 98ms/step - AUPRC: 0.3813 - AUROC: 0.9349 - loss: 0.0280 - val_AUPRC: 0.1462 - val_AUROC: 0.7470 - val_loss: 0.0654 - learning_rate: 1.5000e-04
Epoch 13/20
[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 96ms/step - AUPRC: 0.2513 - AUROC: 0.7739 - loss: 0.0282



[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 99ms/step - AUPRC: 0.3930 - AUROC: 0.9369 - loss: 0.0275 - val_AUPRC: 0.1502 - val_AUROC: 0.7555 - val_loss: 0.0641 - learning_rate: 1.5000e-04
Epoch 14/20
[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 94ms/step - AUPRC: 0.2547 - AUROC: 0.7756 - loss: 0.0278
Epoch 14: ReduceLROnPlateau reducing learning rate to 7.500000356230885e-05.
[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 97ms/step - AUPRC: 0.4009 - AUROC: 0.9395 - loss: 0.0271 - val_AUPRC: 0.1479 - val_AUROC: 0.7543 - val_loss: 0.0646 - learning_rate: 1.5000e-04
Epoch 15/20
[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 93ms/step - AUPRC: 0.2544 - AUROC: 0.7774 - loss: 0.0286



[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m110s[0m 96ms/step - AUPRC: 0.3980 - AUROC: 0.9385 - loss: 0.0275 - val_AUPRC: 0.1644 - val_AUROC: 0.7604 - val_loss: 0.0618 - learning_rate: 7.5000e-05
Epoch 16/20
[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 98ms/step - AUPRC: 0.4105 - AUROC: 0.9408 - loss: 0.0270 - val_AUPRC: 0.1628 - val_AUROC: 0.7549 - val_loss: 0.0622 - learning_rate: 7.5000e-05
Epoch 17/20
[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 95ms/step - AUPRC: 0.2653 - AUROC: 0.7829 - loss: 0.0272



[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m112s[0m 98ms/step - AUPRC: 0.4161 - AUROC: 0.9421 - loss: 0.0267 - val_AUPRC: 0.1655 - val_AUROC: 0.7599 - val_loss: 0.0619 - learning_rate: 7.5000e-05
Epoch 18/20
[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 96ms/step - AUPRC: 0.2703 - AUROC: 0.7852 - loss: 0.0269
Epoch 18: ReduceLROnPlateau reducing learning rate to 3.7500001781154424e-05.




[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 98ms/step - AUPRC: 0.4209 - AUROC: 0.9439 - loss: 0.0265 - val_AUPRC: 0.1663 - val_AUROC: 0.7566 - val_loss: 0.0623 - learning_rate: 7.5000e-05
Epoch 19/20
[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 95ms/step - AUPRC: 0.2659 - AUROC: 0.7833 - loss: 0.0278



[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 98ms/step - AUPRC: 0.4133 - AUROC: 0.9431 - loss: 0.0270 - val_AUPRC: 0.1777 - val_AUROC: 0.7649 - val_loss: 0.0610 - learning_rate: 3.7500e-05
Epoch 20/20
[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 96ms/step - AUPRC: 0.2715 - AUROC: 0.7831 - loss: 0.0272



[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m114s[0m 99ms/step - AUPRC: 0.4200 - AUROC: 0.9430 - loss: 0.0268 - val_AUPRC: 0.1787 - val_AUROC: 0.7586 - val_loss: 0.0615 - learning_rate: 3.7500e-05
Restoring model weights from the end of the best epoch: 19.


Getting Evaluation Scores

In [38]:
model.load_weights("fsd50k_cnn.h5")
metrics = model.evaluate(test_ds, return_dict=True)
metrics


[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 24ms/step - AUPRC: 0.1746 - AUROC: 0.7387 - loss: 0.0834


{'AUPRC': 0.1746385544538498,
 'AUROC': 0.7387352585792542,
 'loss': 0.08339846134185791}

Predictions on any specific evaluation sample

In [43]:
# Function to get top k class predictions with scores for a given audio file path
def predict_clip(path, k=5):
    x, _ = tf_load_mel(tf.convert_to_tensor(path), tf.zeros([NUM_CLASSES], tf.float32))
    x = tf.expand_dims(x, 0)
    probs = model.predict(x, verbose=0)[0]
    topk = probs.argsort()[-k:][::-1]

    print(path)

    return [(all_labels[i], float(probs[i])) for i in topk]

In [44]:
# Example to get predictions for an audio file
predict_clip(eval_paths[0], k=5)

FSD50K\FSD50K.eval_audio\37199.wav


[('Musical_instrument', 0.985999584197998),
 ('Music', 0.985095202922821),
 ('Guitar', 0.5938268899917603),
 ('Plucked_string_instrument', 0.5418901443481445),
 ('Electric_guitar', 0.250101774930954)]