In [50]:
!pip -q install librosa soundfile pandas scikit-learn tensorflow==2.16.1

In [51]:
import os, json, math, random, itertools, functools
import numpy as np
import pandas as pd
import librosa, soundfile as sf
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import MultiLabelBinarizer


Constructing Paths and Metadata

In [52]:
# The dataset is expected in a folder named "FSD50K" located in the same
# directory as this .ipynb.
ROOT = "FSD50K"

DEV_AUDIO  = os.path.join(ROOT, "FSD50K.dev_audio")
EVAL_AUDIO = os.path.join(ROOT, "FSD50K.eval_audio")
GT_DIR     = os.path.join(ROOT, "FSD50K.ground_truth")

# Ground-truth tables. Paths are built with os.path.join, like the directories
# above, instead of hand-concatenating "/" separators.
dev_df  = pd.read_csv(os.path.join(GT_DIR, "dev.csv"))
eval_df = pd.read_csv(os.path.join(GT_DIR, "eval.csv"))  # testing data
# vocabulary.csv is read without a header row, so its columns are addressed by
# position further down.
vocab   = pd.read_csv(os.path.join(GT_DIR, "vocabulary.csv"), header=None)

# Labels in dev/eval are comma-separated strings
def split_labels(s):
    """Split a comma-separated label string into a list of label names.

    Args:
        s: Content of a ``labels`` cell. ``None`` and float NaN (a missing
            cell) are treated as "no labels"; any other value is converted
            with ``str`` before splitting.

    Returns:
        list[str]: The non-empty labels, in their original order, with
        surrounding whitespace removed.
    """
    # Without this guard str(None) / str(nan) would yield the bogus labels
    # "None" / "nan", which are not part of the vocabulary.
    if s is None or (isinstance(s, float) and math.isnan(s)):
        return []
    stripped = (token.strip() for token in str(s).split(","))
    return [token for token in stripped if token]

# Label names live in column 1 of the header-less vocabulary table; sorting
# them gives a class order that does not depend on the row order of the file.
all_labels = vocab.iloc[:, 1].tolist()
all_labels.sort()

# The binarizer is created with that explicit class list and fitted once on the
# full label set, so every later transform() uses the same column order.
mlb = MultiLabelBinarizer(classes=all_labels)
mlb.fit([all_labels])


Getting the Train/Validation/Test Split

In [53]:
# dev.csv carries a "split" column; rows are routed to the training or the
# validation frame by its value. .copy() detaches each frame from dev_df.
train_df = dev_df.loc[dev_df["split"].eq("train")].copy()
val_df   = dev_df.loc[dev_df["split"].eq("val")].copy()

def rows_to_examples(df, audio_base):
    """Turn a ground-truth frame into parallel lists of audio paths and targets.

    Args:
        df: Frame with an ``fname`` column (clip identifier) and a ``labels``
            column (comma-separated label string).
        audio_base: Directory that contains the ``<fname>.wav`` files.

    Returns:
        tuple: ``(paths, Y)`` where ``paths`` is a list of ``.wav`` paths and
        ``Y`` is the float32 multi-hot matrix produced by ``mlb.transform``.
    """
    paths = []
    for clip_id in df["fname"].astype(str).tolist():
        paths.append(os.path.join(audio_base, f"{clip_id}.wav"))

    label_lists = list(map(split_labels, df["labels"].tolist()))
    multi_hot = mlb.transform(label_lists)
    return paths, multi_hot.astype("float32")

# Width of the multi-hot target vectors (one column per vocabulary label).
NUM_CLASSES = len(all_labels)

# Train and validation clips both live in the dev audio folder; the evaluation
# (test) clips have their own folder.
(train_paths, y_train) = rows_to_examples(train_df, DEV_AUDIO)
(val_paths, y_val)     = rows_to_examples(val_df, DEV_AUDIO)
(eval_paths, y_eval)   = rows_to_examples(eval_df, EVAL_AUDIO)


Preprocessing audio: resample every clip to 32 kHz and pad/truncate it to 10 s

In [54]:
# Audio front-end settings shared by the loader, the tf.data wrapper and the model.
SR         = 32_000   # target sample rate in Hz
DURATION   = 10.0     # clip length in seconds; shorter clips get zero-padded
N_MELS     = 128      # number of mel bands
N_FFT      = 1024     # FFT window size in samples
HOP_LENGTH = 320      # hop between frames in samples (10 ms at 32 kHz)

def load_mel(path):
    """Load one audio file and return its log-mel spectrogram.

    The signal is down-mixed to mono, resampled to ``SR`` if necessary, and
    zero-padded or truncated to exactly ``SR * DURATION`` samples before the
    mel spectrogram is computed.

    Args:
        path: Audio file path as ``str`` or ``bytes``. ``tf.numpy_function``
            delivers string tensors as bytes, so bytes are decoded first.

    Returns:
        np.ndarray: float32 array of shape ``[N_MELS, frames, 1]`` in dB,
        referenced to the clip's own maximum power (``ref=np.max``).
    """
    # Decode so soundfile always receives an ordinary text path.
    # NOTE(review): assumes the bytes are UTF-8, which is how TensorFlow
    # encodes Python strings — confirm if paths can come from elsewhere.
    if isinstance(path, bytes):
        path = path.decode("utf-8")

    y, sr = sf.read(path, dtype='float32', always_2d=False)
    if y.ndim > 1:
        # Multi-channel audio is (samples, channels): average the channels.
        y = np.mean(y, axis=1)
    if sr != SR:
        y = librosa.resample(y, orig_sr=sr, target_sr=SR)

    # Force a fixed number of samples so every clip yields the same frame count.
    target_len = int(SR * DURATION)
    if len(y) < target_len:
        y = np.pad(y, (0, target_len - len(y)))
    else:
        y = y[:target_len]

    S = librosa.feature.melspectrogram(
        y=y, sr=SR, n_fft=N_FFT, hop_length=HOP_LENGTH, n_mels=N_MELS, power=2.0
    )
    S_db = librosa.power_to_db(S, ref=np.max).astype(np.float32)  # [mels, frames]
    # Add channel dim → [mels, frames, 1]
    return np.expand_dims(S_db, -1)

def tf_load_mel(path, label):
    """tf.data wrapper around ``load_mel`` that yields a fixed-shape spectrogram.

    Args:
        path: Scalar string tensor with the audio file path.
        label: Target tensor; passed through unchanged.

    Returns:
        tuple: ``(mel, label)`` where ``mel`` is a float32 tensor with static
        shape ``[N_MELS, T, 1]`` and ``T = ceil(SR * DURATION / HOP_LENGTH) + 1``.
    """
    T = math.ceil((SR*DURATION)/HOP_LENGTH) + 1

    mel = tf.numpy_function(load_mel, [path], tf.float32)
    # numpy_function loses all shape information; restore what is known.
    mel.set_shape([N_MELS, None, 1])

    # Keep the first T frames and zero-pad at the end if there are fewer
    # (head crop / tail pad, not a centre crop).
    mel = mel[:, :T, :]
    paddings = tf.maximum(0, T - tf.shape(mel)[1])
    mel = tf.pad(mel, [[0,0], [0,paddings], [0,0]])

    # After crop + pad the frame axis is always T; declare it so downstream
    # consumers see a fully static shape.
    mel.set_shape([N_MELS, T, 1])
    return mel, label


tf.data pipelines

In [55]:
# Pipeline settings: AUTOTUNE is tf.data's sentinel for "choose this value
# automatically"; BATCH is the number of clips per batch.
AUTOTUNE = tf.data.AUTOTUNE
BATCH = 32

def make_ds(paths, labels, shuffle):
    """Build a batched, prefetched ``tf.data`` pipeline of (mel, label) pairs.

    Args:
        paths: Sequence of audio file paths (strings).
        labels: Array-like of targets aligned with ``paths``; cast to float32.
        shuffle: If true, reshuffle the examples on every iteration.

    Returns:
        tf.data.Dataset: Batches of ``BATCH`` examples produced by
        ``tf_load_mel``, with prefetching enabled.
    """
    ds = tf.data.Dataset.from_tensor_slices((paths, labels))

    if shuffle:
        # Shuffle the cheap (path, label) pairs *before* decoding audio. A
        # buffer as large as the dataset gives a uniform shuffle, and the
        # buffer no longer holds thousands of decoded spectrograms in memory.
        # max(..., 1) keeps the buffer size valid for an empty input.
        ds = ds.shuffle(max(len(paths), 1), reshuffle_each_iteration=True)

    # Paths are already strings, so only the labels need a dtype cast.
    ds = ds.map(
        lambda p, l: tf_load_mel(p, tf.cast(l, tf.float32)),
        num_parallel_calls=AUTOTUNE,
    )
    return ds.batch(BATCH).prefetch(AUTOTUNE)

# Only the training stream is shuffled; validation and test keep file order.
train_ds = make_ds(train_paths, y_train, shuffle=True)
val_ds, test_ds = (
    make_ds(val_paths, y_val, shuffle=False),
    make_ds(eval_paths, y_eval, shuffle=False),
)


CNN Classifier

In [56]:
from tensorflow.keras import layers as L, models

def build_model(n_classes=NUM_CLASSES, input_shape=(N_MELS, None, 1)):
    """Build the CNN multi-label classifier.

    Four Conv-BatchNorm-ReLU stages (32, 64, 128, 256 filters) with 2x2 max
    pooling after the first three, followed by global average pooling,
    dropout and a sigmoid output layer.

    Args:
        n_classes: Number of output units (one sigmoid per label).
        input_shape: ``(mels, frames, channels)`` of one input example. This
            argument used to be ignored; it is now honoured. A ``None`` frame
            count is replaced by ``ceil(SR * DURATION / HOP_LENGTH) + 1``, the
            fixed length this function has always used, so ``build_model()``
            builds the same model as before.

    Returns:
        keras Model mapping a spectrogram batch to per-class probabilities.
    """
    def conv_block(tensor, filters):
        # 3x3 "same" convolution followed by batch norm and ReLU.
        tensor = L.Conv2D(filters, 3, padding="same")(tensor)
        tensor = L.BatchNormalization()(tensor)
        return L.ReLU()(tensor)

    n_mels, n_frames, n_channels = input_shape
    if n_frames is None:
        n_frames = math.ceil((SR*DURATION)/HOP_LENGTH) + 1
    inp = L.Input(shape=(n_mels, n_frames, n_channels))

    x = conv_block(inp, 32)
    x = L.MaxPool2D((2,2))(x)
    x = conv_block(x, 64)
    x = L.MaxPool2D((2,2))(x)
    x = conv_block(x, 128)
    x = L.MaxPool2D((2,2))(x)
    x = conv_block(x, 256)
    x = L.GlobalAveragePooling2D()(x)
    x = L.Dropout(0.3)(x)
    out = L.Dense(n_classes, activation="sigmoid")(x)  # multi-label → sigmoid

    return models.Model(inp, out)

model = build_model()
model.summary()

# Multi-label setup: every class is an independent binary decision, hence
# binary cross-entropy. Per-label PR-AUC and ROC-AUC are tracked as metrics;
# a macro-F1 callback could be added later if wanted.
model.compile(
    loss=keras.losses.BinaryCrossentropy(),
    optimizer=keras.optimizers.Adam(learning_rate=3e-4),
    metrics=[
        keras.metrics.AUC(
            name="AUPRC", curve="PR", multi_label=True, num_labels=NUM_CLASSES
        ),
        keras.metrics.AUC(
            name="AUROC", curve="ROC", multi_label=True, num_labels=NUM_CLASSES
        ),
    ],
)


Training Model

In [None]:
CHECKPOINT_PATH = "fsd50k_cnn.h5"
TOTAL_EPOCHS = 20
RESUME_FROM_EPOCH = 8  # epochs already completed; set to 0 to train from scratch

# Resuming is only meaningful if the previously trained weights are actually
# restored. Passing initial_epoch on its own merely renumbers the epochs, so
# after a kernel restart a freshly initialised model would silently be trained
# for epochs 9..20 only.
start_epoch = 0
if RESUME_FROM_EPOCH > 0:
    if os.path.exists(CHECKPOINT_PATH):
        # NOTE(review): this reloads layer weights only; optimizer state and
        # any learning-rate reduction from the earlier run presumably start
        # over — confirm this is acceptable.
        model.load_weights(CHECKPOINT_PATH)
        start_epoch = RESUME_FROM_EPOCH
    else:
        print(f"Checkpoint {CHECKPOINT_PATH!r} not found; training from scratch.")

callbacks = [
    # Neither of the next two callbacks is given a monitor, so both use their
    # default (val_loss). The checkpoint instead keeps the best val_AUPRC, so
    # "best weights" can differ between EarlyStopping and the saved file.
    keras.callbacks.ReduceLROnPlateau(patience=3, factor=0.5, verbose=1),
    keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True, verbose=1),
    keras.callbacks.ModelCheckpoint(CHECKPOINT_PATH, monitor="val_AUPRC", mode="max", save_best_only=True),
]

history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=TOTAL_EPOCHS,
    initial_epoch=start_epoch,
    callbacks=callbacks,
)


Epoch 1/20


  "cipher": algorithms.TripleDES,
  "class": algorithms.Blowfish,
  "class": algorithms.TripleDES,


[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - AUPRC: 0.0309 - AUROC: 0.5009 - loss: 0.1415



[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1291s[0m 1s/step - AUPRC: 0.0412 - AUROC: 0.6467 - loss: 0.0737 - val_AUPRC: 0.0338 - val_AUROC: 0.5692 - val_loss: 0.0720 - learning_rate: 3.0000e-04
Epoch 2/20
[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1266s[0m 1s/step - AUPRC: 0.0913 - AUROC: 0.7498 - loss: 0.0459 - val_AUPRC: 0.0312 - val_AUROC: 0.5601 - val_loss: 0.0891 - learning_rate: 3.0000e-04
Epoch 3/20
[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - AUPRC: 0.0752 - AUROC: 0.6110 - loss: 0.0452



[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1237s[0m 1s/step - AUPRC: 0.1132 - AUROC: 0.7835 - loss: 0.0436 - val_AUPRC: 0.0497 - val_AUROC: 0.6129 - val_loss: 0.0798 - learning_rate: 3.0000e-04
Epoch 4/20
[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - AUPRC: 0.0879 - AUROC: 0.6316 - loss: 0.0434
Epoch 4: ReduceLROnPlateau reducing learning rate to 0.0001500000071246177.




[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1231s[0m 1s/step - AUPRC: 0.1335 - AUROC: 0.8132 - loss: 0.0418 - val_AUPRC: 0.0600 - val_AUROC: 0.6467 - val_loss: 0.0731 - learning_rate: 3.0000e-04
Epoch 5/20
[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - AUPRC: 0.0899 - AUROC: 0.6494 - loss: 0.0442



[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1232s[0m 1s/step - AUPRC: 0.1302 - AUROC: 0.8099 - loss: 0.0420 - val_AUPRC: 0.0750 - val_AUROC: 0.6866 - val_loss: 0.0663 - learning_rate: 1.5000e-04
Epoch 6/20
[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1446s[0m 1s/step - AUPRC: 0.1480 - AUROC: 0.8340 - loss: 0.0406 - val_AUPRC: 0.0730 - val_AUROC: 0.6926 - val_loss: 0.0655 - learning_rate: 1.5000e-04
Epoch 7/20
[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - AUPRC: 0.1094 - AUROC: 0.6793 - loss: 0.0411



[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1408s[0m 1s/step - AUPRC: 0.1618 - AUROC: 0.8482 - loss: 0.0396 - val_AUPRC: 0.0758 - val_AUROC: 0.6967 - val_loss: 0.0658 - learning_rate: 1.5000e-04
Epoch 8/20
[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - AUPRC: 0.1178 - AUROC: 0.6923 - loss: 0.0396



[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1409s[0m 1s/step - AUPRC: 0.1749 - AUROC: 0.8598 - loss: 0.0386 - val_AUPRC: 0.0810 - val_AUROC: 0.7138 - val_loss: 0.0663 - learning_rate: 1.5000e-04
Epoch 9/20
[1m 210/1150[0m [32m━━━[0m[37m━━━━━━━━━━━━━━━━━[0m [1m26:50[0m 2s/step - AUPRC: 0.0668 - AUROC: 0.4389 - loss: 0.0454

In [58]:
from pathlib import Path
import os

# Sanity check for a single clip (the one named in an earlier loading error):
# does the file exist, and how large is it?
# The path is built from DEV_AUDIO so the check uses the same location as the
# data pipeline and works with either path separator.
bad_example = os.path.join(DEV_AUDIO, "64760.wav")
# Joining with Path.cwd() already leaves an absolute bad_example untouched, so
# no separate isabs() branch is needed.
p = Path.cwd() / bad_example
print("Exists:", p.exists(), "Size:", p.stat().st_size if p.exists() else "N/A", "Path:", p)


Exists: False Size: N/A Path: c:\Users\Austin\Documents\ECEN758\FSD50K\FSD50K.dev_audio\64760.wav
