In [52]:
!pip -q install librosa soundfile pandas scikit-learn tensorflow==2.16.1

In [5]:
import os, json, math, random, itertools, functools
import numpy as np
import pandas as pd
import librosa, soundfile as sf
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import MultiLabelBinarizer


Constructing Paths and Metadata

In [6]:
# Assuming database is in a folder "FSD50K" in the same directory as this .ipynb
ROOT = "FSD50K"

DEV_AUDIO = os.path.join(ROOT, "FSD50K.dev_audio")
EVAL_AUDIO = os.path.join(ROOT, "FSD50K.eval_audio")
GT_DIR = os.path.join(ROOT, "FSD50K.ground_truth")

# Build the CSV paths with os.path.join as well, so every path in this cell is
# assembled the same way instead of mixing in manual "/" concatenation.
dev_df = pd.read_csv(os.path.join(GT_DIR, "dev.csv"))
eval_df = pd.read_csv(os.path.join(GT_DIR, "eval.csv"))  # testing data
# vocabulary.csv is read without a header row, so its columns are positional.
vocab = pd.read_csv(os.path.join(GT_DIR, "vocabulary.csv"), header=None)

# Labels in dev/eval are comma-separated strings
def split_labels(s):
    """Split a comma-separated label string into a list of label names.

    Parameters
    ----------
    s : str or None or float
        Raw value of a "labels" cell. Missing cells (``None`` or a float NaN,
        which is what pandas produces for empty CSV fields) yield no labels
        instead of the bogus labels ``"None"`` / ``"nan"``.

    Returns
    -------
    list[str]
        Label names with surrounding whitespace removed; empty tokens
        (e.g. from a trailing comma) are dropped.
    """
    if s is None or (isinstance(s, float) and math.isnan(s)):
        return []
    # Strip each token so "Bark, Dog" matches the vocabulary entries exactly.
    tokens = (t.strip() for t in str(s).split(","))
    return [t for t in tokens if t]

# Column 1 of vocabulary.csv holds the label names; sort them once so the
# class order (and therefore the column order of every label matrix) is fixed.
label_column = vocab.iloc[:, 1]
all_labels = sorted(label_column.tolist())

# Fit on the complete label set so the binarizer is ready to transform any row.
mlb = MultiLabelBinarizer(classes=all_labels).fit([all_labels])


Getting the Train/Test Split

In [7]:
# dev.csv marks each row's subset in its "split" column; take independent
# copies so later edits never touch dev_df.
is_train = dev_df["split"] == "train"
is_val = dev_df["split"] == "val"
train_df = dev_df.loc[is_train].copy()  # training rows
val_df = dev_df.loc[is_val].copy()      # validation rows

def rows_to_examples(df, audio_base):
    """Convert metadata rows into WAV file paths and a multi-hot label matrix.

    df         -- the training, validation, or testing data frame; its "fname"
                  and "labels" columns are read.
    audio_base -- directory path that holds the audio files.

    Returns ``(paths, Y)`` where ``paths`` is a list of
    ``<audio_base>/<fname>.wav`` strings and ``Y`` is the float32 matrix
    produced by ``mlb.transform`` (one row per clip).
    """
    clip_names = df["fname"].astype(str).tolist()
    paths = [os.path.join(audio_base, f"{name}.wav") for name in clip_names]

    label_lists = list(map(split_labels, df["labels"].tolist()))
    Y = mlb.transform(label_lists).astype("float32")
    return paths, Y

# Resolve WAV paths and multi-hot targets for each split. Train and val clips
# are both looked up in the dev audio folder; eval clips in the eval folder.
train_paths, y_train = rows_to_examples(train_df, DEV_AUDIO)
val_paths,   y_val   = rows_to_examples(val_df,   DEV_AUDIO)
eval_paths,  y_eval  = rows_to_examples(eval_df,  EVAL_AUDIO)
# Number of labels in the vocabulary; used as the default output width of the model.
NUM_CLASSES = len(all_labels)


Preprocessing audio to make all samples 22.05 kHz and 15 s

In [8]:
# Hyperparameters
# SR / DURATION define the fixed-length waveform (SR * DURATION samples);
# N_FFT / HOP_LENGTH / N_MELS are passed to librosa.feature.melspectrogram.
SR          = 22050
DURATION    = 15.0
N_MELS      = 64
HOP_LENGTH  = 512
N_FFT       = 1024

import math, numpy as np, librosa, tensorflow as tf

# Number of time frames every spectrogram is cropped/padded to (647 with the
# values above) before it is handed to the model.
# NOTE(review): librosa's centred STFT normally yields 1 + n_samples // hop
# frames (646 here), so this looks one frame larger than the real spectrogram
# and the loaders would zero-pad the last frame — confirm before changing,
# since build_model hard-codes the same formula.
T = math.ceil((SR * DURATION) / HOP_LENGTH) + 1

def load_mel(path):
    """Load one audio file and return its log-mel spectrogram.

    ``path`` may be a ``str``, ``bytes`` or a 0-D NumPy array wrapping either
    (the forms ``tf.numpy_function`` passes in). The waveform is resampled to
    ``SR``, mixed down to mono, and zero-padded or truncated to exactly
    ``int(SR * DURATION)`` samples before the spectrogram is computed.

    Returns a float32 array shaped ``[mels, frames, 1]`` in dB relative to the
    clip's own maximum power.
    """
    # Normalise whatever tf.numpy_function handed over into a plain str.
    if isinstance(path, np.ndarray):
        path = path.item()
    if isinstance(path, (bytes, np.bytes_)):
        path = path.decode("utf-8")
    wav_path = str(path)

    samples, _ = librosa.load(wav_path, sr=SR, mono=True)

    # Force a fixed number of samples: pad short clips at the end, cut long ones.
    n_target = int(SR * DURATION)
    n_missing = n_target - len(samples)
    if n_missing > 0:
        samples = np.pad(samples, (0, n_missing))
    else:
        samples = samples[:n_target]

    mel_power = librosa.feature.melspectrogram(
        y=samples,
        sr=SR,
        n_fft=N_FFT,
        hop_length=HOP_LENGTH,
        n_mels=N_MELS,
        power=2.0,
    )
    # Power -> dB, referenced to this clip's peak; shape is [mels, frames].
    mel_db = librosa.power_to_db(mel_power, ref=np.max).astype(np.float32)
    # Append a trailing channel axis -> [mels, frames, 1].
    return mel_db[..., np.newaxis]

def tf_load_mel(path, label):
    """TensorFlow wrapper around ``load_mel`` for a single (path, label) pair.

    Runs ``load_mel`` through ``tf.numpy_function``, then crops/pads the time
    axis to ``T`` frames and pins the static shape to ``(N_MELS, T, 1)``.
    Returns ``(mel, label)`` with the label cast to float32.
    """
    wav_path = tf.cast(path, tf.string)
    spec = tf.numpy_function(load_mel, [wav_path], tf.float32)

    # Crop anything beyond T frames, then pad the tail if fewer than T remain.
    spec = spec[:, :T, :]
    n_missing = tf.maximum(0, T - tf.shape(spec)[1])
    time_padding = [[0, 0], [0, n_missing], [0, 0]]
    spec = tf.pad(spec, paddings=time_padding)

    # numpy_function loses shape information; restore it for Keras.
    spec.set_shape([N_MELS, T, 1])

    return spec, tf.cast(label, tf.float32)


tf.data pipelines

In [None]:
BATCH = 32 # samples per batch

# Computing mel-spectrograms on-the-fly is extremely slow
# To fix this, we preprocess each WAV file once and save the mel-spectrogram as a .npy file. Future runs load the .npy instantly
# This caching optimization resulted in a 6x speedup for training time

# one-time precompute pass (can be interrupted/resumed)
import numpy as np, os, hashlib
def mel_cache_path(wav_path):
    """Return the cache file path for ``wav_path`` inside ``mel_store``.

    The file name is the MD5 hex digest of the path string plus ``.npy``; it
    depends only on the path text, not on the audio content or on any
    preprocessing setting.
    """
    digest = hashlib.md5(wav_path.encode()).hexdigest()
    return os.path.join("mel_store", f"{digest}.npy")

os.makedirs("mel_store", exist_ok=True)

def materialize(paths):
    """Ensure a cached mel-spectrogram exists for every WAV path.

    For each WAV file path:
      - compute its cache .npy file name
      - if the .npy file does NOT exist: load the WAV, compute the mel, save it

    The spectrogram is first written to a temporary sibling file and then
    renamed onto the final name. If the pass is interrupted mid-write, no
    truncated file is left under the final name, so a resumed run recomputes
    the clip instead of trusting a corrupt cache entry.

    Parameters
    ----------
    paths : iterable of str
        WAV file paths.

    Returns
    -------
    list[str]
        The .npy cache paths, in the same order as ``paths``.
    """
    out = []
    for p in paths:
        npy = mel_cache_path(p)
        if not os.path.exists(npy):
            mel = load_mel(p)
            # The temp name must itself end in ".npy", otherwise np.save
            # appends the extension and the rename target would not match.
            tmp = npy + ".tmp.npy"
            np.save(tmp, mel.astype(np.float32))
            os.replace(tmp, npy)
        out.append(npy)
    return out

# Build (or reuse) the on-disk mel cache for every split. Each result is a list
# of .npy paths in the same order as the corresponding *_paths list.
train_mels = materialize(train_paths)
val_mels   = materialize(val_paths)
eval_mels  = materialize(eval_paths)

def npy_loader(npy_path, label):
    """Load one cached mel-spectrogram and pair it with its label.

    Takes the path to a cached .npy mel file and a multi-hot label vector.
    Returns the mel tensor, cropped/padded to ``(N_MELS, T, 1)``, and the
    label cast to float32.
    """
    def _read_npy(raw_path):
        # tf.numpy_function may pass the path as bytes.
        if isinstance(raw_path, (bytes, np.bytes_)):
            raw_path = raw_path.decode()
        return np.load(raw_path)

    spec = tf.numpy_function(_read_npy, [npy_path], tf.float32)
    # Rank and the fixed axes are known; only the time axis may vary.
    spec.set_shape([N_MELS, None, 1])

    # Bring the time axis to exactly T frames: crop the excess, pad the tail.
    spec = spec[:, :T, :]
    n_missing = tf.maximum(0, T - tf.shape(spec)[1])
    time_padding = [[0, 0], [0, n_missing], [0, 0]]
    spec = tf.pad(spec, time_padding)

    # Fully static shape so Keras can build the model graph.
    spec.set_shape([N_MELS, T, 1])

    return spec, tf.cast(label, tf.float32)

def make_cached_ds(npy_paths, labels, shuffle):
    """Build a batched ``tf.data`` pipeline over cached mel-spectrograms.

    Parameters
    ----------
    npy_paths : list[str]
        Paths of the cached .npy spectrograms.
    labels : array-like
        Multi-hot label matrix, one row per path.
    shuffle : bool
        Whether to shuffle examples (use True for training only).

    Returns
    -------
    tf.data.Dataset
        Batches of ``(mel, label)`` with ``BATCH`` examples each.

    The decoded examples are cached *before* shuffling and batching. Caching
    after ``shuffle().batch()`` would record the first epoch's order and batch
    composition and replay it unchanged in every later epoch, so the training
    data would never be reshuffled.
    """
    ds = tf.data.Dataset.from_tensor_slices((npy_paths, labels))
    ds = ds.map(npy_loader, num_parallel_calls=tf.data.AUTOTUNE)
    ds = ds.cache()
    if shuffle:
        ds = ds.shuffle(2048, reshuffle_each_iteration=True)
    return ds.batch(BATCH).prefetch(tf.data.AUTOTUNE)

# Only the training pipeline is shuffled; validation and test keep the order of
# val_mels / eval_mels so predictions line up row-for-row with y_val / y_eval.
train_ds = make_cached_ds(train_mels, y_train, True)
val_ds   = make_cached_ds(val_mels,   y_val,   False)
test_ds  = make_cached_ds(eval_mels,  y_eval,  False)

CNN Classifier

In [10]:
from tensorflow.keras import layers as L, models

"""
Build a convolutional neural network (CNN) for multi-label audio classification.
Input: log-mel spectrograms of shape [N_MELS, frames, 1]
Output: per-class probabilities (sigmoid activation) for n_classes labels.
"""
def build_model(n_classes=NUM_CLASSES, input_shape=(N_MELS, None, 1)):
    """Build a CNN for multi-label audio classification.

    Parameters
    ----------
    n_classes : int
        Number of output labels (one sigmoid unit each).
    input_shape : tuple
        Shape of one log-mel spectrogram, ``(mel bins, time frames, channels)``.
        The time dimension may be ``None``: the network is fully convolutional
        up to the global average pooling layer, so it accepts any clip length.
        (This argument used to be ignored in favour of a hard-coded frame count.)

    Returns
    -------
    keras Model mapping a spectrogram batch to per-class probabilities.
    """
    inp = L.Input(shape=input_shape)

    # Four conv blocks (3x3 conv -> batch norm -> ReLU) with 32/64/128/256
    # filters. Every block except the last is followed by 2x2 max pooling.
    x = inp
    conv_filters = (32, 64, 128, 256)
    for block_idx, n_filters in enumerate(conv_filters):
        x = L.Conv2D(n_filters, 3, padding="same")(x)
        x = L.BatchNormalization()(x)
        x = L.ReLU()(x)
        if block_idx < len(conv_filters) - 1:
            x = L.MaxPool2D((2, 2))(x)

    # Collapse each feature map to a single value: a compact clip-level
    # representation that does not depend on the number of time frames.
    x = L.GlobalAveragePooling2D()(x)

    # Dropout to reduce overfitting.
    x = L.Dropout(0.3)(x)

    # One sigmoid unit per class: independent probabilities (multi-label).
    out = L.Dense(n_classes, activation="sigmoid")(x)

    return models.Model(inp, out)

# Instantiate the network with its default arguments and print the layer table.
model = build_model()
model.summary()

model.compile(
    optimizer=keras.optimizers.Adam(3e-4), # general learning rate
    loss=keras.losses.BinaryCrossentropy(), # so each class is treated as a separate yes/no prediction

    # AUPRC (Area under Precision-Recall curve) - good for imbalanced data.
    # AUROC (Area under ROC curve) - general discriminative metric.
    # multi_label=True: the AUC is computed per label and then averaged.
    metrics=[
        keras.metrics.AUC(curve="PR", multi_label=True, num_labels=NUM_CLASSES, name="AUPRC"),
        keras.metrics.AUC(curve="ROC", multi_label=True, num_labels=NUM_CLASSES, name="AUROC")
    ],
)


Training Model

In [59]:
# NOTE(review): the first two callbacks pass no `monitor`, so they use the
# Keras default (presumably val_loss), while the checkpoint tracks val_AUPRC.
# The weights restored by EarlyStopping and the weights saved to disk can
# therefore come from different epochs — confirm this is intended.
callbacks = [
    # Halve the learning rate after 3 epochs without improvement.
    keras.callbacks.ReduceLROnPlateau(patience=3, factor=0.5, verbose=1),
    # Stop after 5 epochs without improvement and roll back to the best weights.
    keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True, verbose=1),
    # Keep only the checkpoint with the highest validation AUPRC.
    keras.callbacks.ModelCheckpoint("fsd50k_cnn.h5", monitor="val_AUPRC", mode="max", save_best_only=True),
]

history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=20,
    callbacks=callbacks,
)


Epoch 1/20
[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 312ms/step - AUPRC: 0.0274 - AUROC: 0.4849 - loss: 0.1413



[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m372s[0m 320ms/step - AUPRC: 0.0362 - AUROC: 0.6254 - loss: 0.0740 - val_AUPRC: 0.0281 - val_AUROC: 0.5344 - val_loss: 0.0858 - learning_rate: 3.0000e-04
Epoch 2/20
[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 293ms/step - AUPRC: 0.0578 - AUROC: 0.5736 - loss: 0.0472



[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m344s[0m 299ms/step - AUPRC: 0.0838 - AUROC: 0.7465 - loss: 0.0456 - val_AUPRC: 0.0387 - val_AUROC: 0.5845 - val_loss: 0.0811 - learning_rate: 3.0000e-04
Epoch 3/20
[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 300ms/step - AUPRC: 0.0762 - AUROC: 0.6078 - loss: 0.0438



[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m352s[0m 306ms/step - AUPRC: 0.1125 - AUROC: 0.7881 - loss: 0.0423 - val_AUPRC: 0.0484 - val_AUROC: 0.6160 - val_loss: 0.0763 - learning_rate: 3.0000e-04
Epoch 4/20
[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 287ms/step - AUPRC: 0.0941 - AUROC: 0.6391 - loss: 0.0410



[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m338s[0m 294ms/step - AUPRC: 0.1402 - AUROC: 0.8245 - loss: 0.0396 - val_AUPRC: 0.0589 - val_AUROC: 0.6510 - val_loss: 0.0758 - learning_rate: 3.0000e-04
Epoch 5/20
[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 287ms/step - AUPRC: 0.1131 - AUROC: 0.6727 - loss: 0.0388



[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m338s[0m 294ms/step - AUPRC: 0.1696 - AUROC: 0.8522 - loss: 0.0375 - val_AUPRC: 0.0642 - val_AUROC: 0.6463 - val_loss: 0.0795 - learning_rate: 3.0000e-04
Epoch 6/20
[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 286ms/step - AUPRC: 0.1307 - AUROC: 0.6907 - loss: 0.0367



[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m336s[0m 292ms/step - AUPRC: 0.1970 - AUROC: 0.8744 - loss: 0.0357 - val_AUPRC: 0.0765 - val_AUROC: 0.6749 - val_loss: 0.0738 - learning_rate: 3.0000e-04
Epoch 7/20
[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 287ms/step - AUPRC: 0.1464 - AUROC: 0.7081 - loss: 0.0350



[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m337s[0m 293ms/step - AUPRC: 0.2232 - AUROC: 0.8925 - loss: 0.0341 - val_AUPRC: 0.0873 - val_AUROC: 0.6946 - val_loss: 0.0715 - learning_rate: 3.0000e-04
Epoch 8/20
[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 286ms/step - AUPRC: 0.1619 - AUROC: 0.7226 - loss: 0.0335



[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m337s[0m 293ms/step - AUPRC: 0.2488 - AUROC: 0.9058 - loss: 0.0327 - val_AUPRC: 0.0960 - val_AUROC: 0.6967 - val_loss: 0.0703 - learning_rate: 3.0000e-04
Epoch 9/20
[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 286ms/step - AUPRC: 0.1773 - AUROC: 0.7382 - loss: 0.0322



[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m337s[0m 293ms/step - AUPRC: 0.2724 - AUROC: 0.9167 - loss: 0.0316 - val_AUPRC: 0.1002 - val_AUROC: 0.7084 - val_loss: 0.0700 - learning_rate: 3.0000e-04
Epoch 10/20
[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 286ms/step - AUPRC: 0.1882 - AUROC: 0.7372 - loss: 0.0312



[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m336s[0m 292ms/step - AUPRC: 0.2906 - AUROC: 0.9221 - loss: 0.0306 - val_AUPRC: 0.1110 - val_AUROC: 0.7130 - val_loss: 0.0684 - learning_rate: 3.0000e-04
Epoch 11/20
[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m337s[0m 293ms/step - AUPRC: 0.3111 - AUROC: 0.9283 - loss: 0.0297 - val_AUPRC: 0.1039 - val_AUROC: 0.6867 - val_loss: 0.0726 - learning_rate: 3.0000e-04
Epoch 12/20
[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 286ms/step - AUPRC: 0.2119 - AUROC: 0.7532 - loss: 0.0293



[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m337s[0m 293ms/step - AUPRC: 0.3312 - AUROC: 0.9331 - loss: 0.0289 - val_AUPRC: 0.1205 - val_AUROC: 0.7234 - val_loss: 0.0688 - learning_rate: 3.0000e-04
Epoch 13/20
[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 286ms/step - AUPRC: 0.2188 - AUROC: 0.7568 - loss: 0.0286
Epoch 13: ReduceLROnPlateau reducing learning rate to 0.0001500000071246177.
[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m337s[0m 293ms/step - AUPRC: 0.3440 - AUROC: 0.9369 - loss: 0.0283 - val_AUPRC: 0.1118 - val_AUROC: 0.6913 - val_loss: 0.0718 - learning_rate: 3.0000e-04
Epoch 14/20
[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 286ms/step - AUPRC: 0.2292 - AUROC: 0.7650 - loss: 0.0286



[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m336s[0m 292ms/step - AUPRC: 0.3583 - AUROC: 0.9423 - loss: 0.0279 - val_AUPRC: 0.1354 - val_AUROC: 0.7237 - val_loss: 0.0703 - learning_rate: 1.5000e-04
Epoch 15/20
[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m337s[0m 293ms/step - AUPRC: 0.3723 - AUROC: 0.9439 - loss: 0.0273 - val_AUPRC: 0.1351 - val_AUROC: 0.7270 - val_loss: 0.0678 - learning_rate: 1.5000e-04
Epoch 16/20
[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 287ms/step - AUPRC: 0.2420 - AUROC: 0.7649 - loss: 0.0271



[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m338s[0m 294ms/step - AUPRC: 0.3815 - AUROC: 0.9468 - loss: 0.0268 - val_AUPRC: 0.1536 - val_AUROC: 0.7613 - val_loss: 0.0639 - learning_rate: 1.5000e-04
Epoch 17/20
[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m337s[0m 293ms/step - AUPRC: 0.3909 - AUROC: 0.9482 - loss: 0.0264 - val_AUPRC: 0.1420 - val_AUROC: 0.7349 - val_loss: 0.0664 - learning_rate: 1.5000e-04
Epoch 18/20
[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m336s[0m 292ms/step - AUPRC: 0.4008 - AUROC: 0.9489 - loss: 0.0261 - val_AUPRC: 0.1424 - val_AUROC: 0.7334 - val_loss: 0.0670 - learning_rate: 1.5000e-04
Epoch 19/20
[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 287ms/step - AUPRC: 0.2580 - AUROC: 0.7674 - loss: 0.0259
Epoch 19: ReduceLROnPlateau reducing learning rate to 7.500000356230885e-05.
[1m1150/1150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m338s[0m 294ms/step - AUPRC: 0.4104 - AUROC: 0.9499 - l

Getting Evaluation Scores

In [11]:
# Evaluation
# Reload the checkpoint file written during training ("fsd50k_cnn.h5", saved
# only when val_AUPRC improved) and score it on the held-out eval split.
model.load_weights("fsd50k_cnn.h5")
metrics = model.evaluate(test_ds, return_dict=True)
metrics

[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 98ms/step - AUPRC: 0.1429 - AUROC: 0.7442 - loss: 0.0836


{'AUPRC': 0.14291605353355408,
 'AUROC': 0.7442101836204529,
 'loss': 0.083570696413517}

In [13]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Predict probabilities for every evaluation example.
# test_ds is built without shuffling, so row i of y_prob corresponds to y_eval[i].
y_prob = model.predict(test_ds, verbose=1)

# Convert to 0/1 predictions at several thresholds and report micro/macro scores.
thresholds = [0.5, 0.25, 0.1, 0.05, 0.02, 0.01]
for t in thresholds:
    y_pred = (y_prob > t).astype(int)

    # zero_division=0 scores a class with no predicted (or no true) positives
    # as 0 — the same value sklearn falls back to by default — without emitting
    # an undefined-metric warning for every such class.
    precision_micro = precision_score(y_eval, y_pred, average='micro', zero_division=0)
    recall_micro    = recall_score(y_eval, y_pred, average='micro', zero_division=0)
    f1_micro        = f1_score(y_eval, y_pred, average='micro', zero_division=0)
    # Micro accuracy: fraction of all (clip, class) decisions that are correct.
    accuracy_micro  = accuracy_score(y_eval.flatten(), y_pred.flatten())

    precision_macro = precision_score(y_eval, y_pred, average='macro', zero_division=0)
    recall_macro    = recall_score(y_eval, y_pred, average='macro', zero_division=0)
    f1_macro        = f1_score(y_eval, y_pred, average='macro', zero_division=0)

    # Per-class accuracy, (tp + tn) / n_clips, computed for all classes at once.
    # Every class is scored on the same clips, so the macro mean matches the
    # micro accuracy; it is still printed to keep the report layout unchanged.
    accuracies = (y_eval == y_pred).mean(axis=0)
    accuracy_macro = np.mean(accuracies)

    print("Evaluation Metrics with a threshold of", t)
    print("Micro Accuracy:", accuracy_micro)
    print("Micro Precision:", precision_micro)
    print("Micro Recall:", recall_micro)
    print("Micro F1:", f1_micro, '\n')

    print("Macro Accuracy:", accuracy_macro)
    print("Macro Precision:", precision_macro)
    print("Macro Recall:", recall_macro)
    print("Macro F1:", f1_macro, '\n')

[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 86ms/step


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Evaluation Metrics with a threshold of 0.5
Micro Accuracy: 0.9816342488515296
Micro Precision: 0.5587419056429233
Micro Recall: 0.12519432065499014
Micro F1: 0.20455507577681822 

Macro Accuracy: 0.9816342488419348
Macro Precision: 0.13007667595782718
Macro Recall: 0.03074771039874952
Macro F1: 0.03845799807178719 



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Evaluation Metrics with a threshold of 0.25
Micro Accuracy: 0.9786707066757893
Micro Precision: 0.38542896050839764
Micro Recall: 0.21999689086952015
Micro F1: 0.28011084353247784 

Macro Accuracy: 0.9786707066662234
Macro Precision: 0.15921206312055644
Macro Recall: 0.07806428375830234
Macro F1: 0.06997934826855841 



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Evaluation Metrics with a threshold of 0.1
Micro Accuracy: 0.9678340338187861
Micro Precision: 0.24253773691975938
Micro Recall: 0.332210591771168
Micro F1: 0.28037873652445827 

Macro Accuracy: 0.9678340338093261
Macro Precision: 0.15882723699682674
Macro Recall: 0.15709384629255976
Macro F1: 0.09823427578646429 



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Evaluation Metrics with a threshold of 0.05
Micro Accuracy: 0.9520457433290979
Micro Precision: 0.1753136249590924
Micro Recall: 0.4163902995129029
Micro F1: 0.2467412832204873 

Macro Accuracy: 0.9520457433197923
Macro Precision: 0.16748673971419056
Macro Recall: 0.23453130960033491
Macro F1: 0.11731576364294045 



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Evaluation Metrics with a threshold of 0.02
Micro Accuracy: 0.9158953181507185
Micro Precision: 0.11761942679720215
Micro Recall: 0.5319722251010467
Micro F1: 0.1926449270263043 

Macro Accuracy: 0.9158953181417661
Macro Precision: 0.14476551297037396
Macro Recall: 0.35792810050747653
Macro F1: 0.13354099712541323 

Evaluation Metrics with a threshold of 0.01
Micro Accuracy: 0.8720731111328316
Micro Precision: 0.08861828182951018
Micro Recall: 0.6227847445331123
Micro F1: 0.15515850218501281 

Macro Accuracy: 0.8720731111243076
Macro Precision: 0.12314062243320681
Macro Recall: 0.46771676862125766
Macro F1: 0.1322362609744049 



In [14]:
from sklearn.metrics import classification_report
# Per-class report at a single operating point (threshold 0.05).
# y_pred is reused by the confusion-matrix cells below.
y_pred = (y_prob > 0.05).astype(int)
# zero_division=0 reports 0 for classes that are never predicted, without the
# undefined-metric warnings sklearn otherwise prints.
print(classification_report(y_eval, y_pred, target_names=all_labels, zero_division=0))


                                                precision    recall  f1-score   support

            Accelerating_and_revving_and_vroom       0.09      0.61      0.15       114
                                     Accordion       0.00      0.00      0.00        50
                               Acoustic_guitar       0.33      0.04      0.08       134
                                      Aircraft       0.06      0.81      0.12        88
                                         Alarm       0.35      0.46      0.40       584
                                        Animal       0.17      0.89      0.29      1082
                                      Applause       0.00      0.00      0.00       150
                                          Bark       0.00      0.00      0.00       122
                                     Bass_drum       0.00      0.00      0.00       119
                                   Bass_guitar       0.25      0.01      0.02        78
                  Bathtub_(fill

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Confusion Matrices

In [15]:
from sklearn.metrics import confusion_matrix
import numpy as np

y_true = y_eval

# One 2x2 confusion matrix per class. labels=[0, 1] pins the layout so the
# matrix stays 2x2 even when a class has no positive truth or prediction.
conf_matrices = [
    confusion_matrix(y_true[:, class_idx], y_pred[:, class_idx], labels=[0, 1])
    for class_idx in range(NUM_CLASSES)
]

In [18]:
# Display the confusion matrix for a selected class.
# Matrices were built with labels=[0, 1]; in sklearn's layout rows are the true
# label and columns the predicted label.
k = 185  # pick any class index
print(all_labels[k])
print(conf_matrices[k])


Vehicle_horn_and_car_horn_and_honking
[[10160     3]
 [   68     0]]


Predictions on any specific evaluation sample

In [66]:
def predict_clip(path, k=5):
    """Return the top-k predicted classes with scores for one audio file.

    Parameters
    ----------
    path : str
        Path to the audio file; it is preprocessed through ``tf_load_mel``.
    k : int
        Number of classes to return. Values below 1 yield an empty list and
        values above the number of classes yield all classes.

    Returns
    -------
    list[tuple[str, float]]
        ``(label, probability)`` pairs, highest probability first.

    Also prints ``path``.
    """
    # The label argument is a placeholder; tf_load_mel requires one.
    x, _ = tf_load_mel(tf.convert_to_tensor(path), tf.zeros([NUM_CLASSES], tf.float32))
    x = tf.expand_dims(x, 0)  # add the batch axis
    probs = model.predict(x, verbose=0)[0]

    # Sort descending, then take the head. The previous `argsort()[-k:]`
    # returned every class for k=0 (because -0 == 0) and a meaningless slice
    # for negative k.
    topk = np.argsort(probs)[::-1][:max(k, 0)]

    print(path)

    return [(all_labels[i], float(probs[i])) for i in topk]

In [70]:
# Example to get predictions for an audio file
predict_clip("FSD50K/FSD50K.eval_audio/117304.wav", k=5)

FSD50K/FSD50K.eval_audio/117304.wav


[('Liquid', 0.8960501551628113),
 ('Domestic_sounds_and_home_sounds', 0.3265989124774933),
 ('Animal', 0.06631539762020111),
 ('Water', 0.019509442150592804),
 ('Sink_(filling_or_washing)', 0.013542539440095425)]