# 01b — Model Training on FMA (UNet_Audio_Classifier)

Allena solo `UNet_Audio_Classifier` su FMA e salva i risultati: il checkpoint migliore in `models/` e il riepilogo in `reports/training_summary_FMA.csv`.


In [1]:
# Load processed FMA data
import os, pickle, time, numpy as np, pandas as pd, tensorflow as tf, keras
from keras import layers, models, callbacks
from keras.utils import to_categorical
from pathlib import Path
# Project directories are located relative to the current working directory
# (PROJECT_ROOT is two levels above it).
PROJECT_ROOT = Path.cwd().resolve().parents[1]
PROCESSED = PROJECT_ROOT / 'data' / 'processed_fma'
MODELS = PROJECT_ROOT / 'models'
REPORTS = PROJECT_ROOT / 'reports'
MODELS.mkdir(exist_ok=True)
REPORTS.mkdir(exist_ok=True)

# Feature and label arrays for each split, read from PROCESSED.
X_train = np.load(PROCESSED / 'X_train.npy')
y_train = np.load(PROCESSED / 'y_train.npy')
X_val = np.load(PROCESSED / 'X_val.npy')
y_val = np.load(PROCESSED / 'y_val.npy')
X_test = np.load(PROCESSED / 'X_test.npy')
y_test = np.load(PROCESSED / 'y_test.npy')

# NOTE(review): unpickling can execute arbitrary code; presumably this file is
# written by the project's own preprocessing step — confirm before loading
# an encoder obtained from anywhere else.
with open(PROCESSED / 'label_encoder.pkl', 'rb') as f:
    le = pickle.load(f)
num_classes = len(le.classes_)

# Convert the label arrays of all three splits with the same class count.
y_train_cat, y_val_cat, y_test_cat = (
    to_categorical(labels, num_classes) for labels in (y_train, y_val, y_test)
)

print('FMA shapes:', X_train.shape, X_val.shape, X_test.shape, '| classes:', num_classes)

2025-08-17 16:18:05.574974: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-17 16:18:05.822587: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755440285.923459   64084 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755440285.946691   64084 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1755440286.179659   64084 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

FMA shapes: (60, 128, 128, 1) (30, 128, 128, 1) (50, 128, 128, 1) | classes: 8


In [2]:
# Define UNet_Audio_Classifier (matching GTZAN version used)
def build_unet_audio_classifier(input_shape, num_classes):
    """Build the Keras model named 'UNet_Audio_Classifier'.

    The network built here is a plain feed-forward convolutional stack (no
    decoder or skip connections are created in this function): four
    conv -> batch-norm -> PReLU units with 32, 32, 64 and 128 filters, a 2x2
    max-pool after the second and third units, then global average pooling,
    dropout (0.5) and a softmax output layer.

    Args:
        input_shape: Shape of one input sample, passed to ``layers.Input``.
        num_classes: Number of units in the softmax output layer.

    Returns:
        An uncompiled ``models.Model`` mapping the input tensor to class
        probabilities; the output layer is forced to ``float32``.
    """
    def conv_unit(tensor, filters):
        """3x3 same-padded, bias-free conv, then batch norm, then PReLU."""
        tensor = layers.Conv2D(filters, 3, padding='same', use_bias=False)(tensor)
        tensor = layers.BatchNormalization()(tensor)
        # PReLU slopes are shared across axes 1 and 2.
        return layers.PReLU(shared_axes=[1, 2])(tensor)

    inputs = layers.Input(shape=input_shape)

    features = conv_unit(inputs, 32)
    features = conv_unit(features, 32)
    features = layers.MaxPooling2D(2)(features)

    features = conv_unit(features, 64)
    features = layers.MaxPooling2D(2)(features)

    features = conv_unit(features, 128)

    # Classification head.
    pooled = layers.GlobalAveragePooling2D()(features)
    pooled = layers.Dropout(0.5)(pooled)
    outputs = layers.Dense(num_classes, activation='softmax', dtype='float32')(pooled)

    return models.Model(inputs, outputs, name='UNet_Audio_Classifier')

# Seed the random number generators before any weights are created so that
# repeated runs start from the same initialisation.
# NOTE(review): some GPU kernels may still be non-deterministic — confirm if
# bit-exact repeatability is required.
SEED = 42
keras.utils.set_random_seed(SEED)

# Per-sample input shape: the training array's shape without its leading axis.
input_shape = X_train.shape[1:]
model = build_unet_audio_classifier(input_shape, num_classes)

# Targets are one-hot encoded, hence categorical cross-entropy.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

I0000 00:00:1755440288.629067   64084 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 10162 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4070, pci bus id: 0000:01:00.0, compute capability: 8.9


In [3]:
# Train the model, evaluate it on the test split, and write a one-row summary CSV.
AUTOTUNE = tf.data.AUTOTUNE

# Input pipelines: batches of 64 with prefetching. Only the training pipeline
# is shuffled, with a shuffle buffer as large as the training array.
train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train_cat))
train_ds = train_ds.shuffle(len(X_train)).batch(64).prefetch(AUTOTUNE)

val_ds = tf.data.Dataset.from_tensor_slices((X_val, y_val_cat))
val_ds = val_ds.batch(64).prefetch(AUTOTUNE)

test_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test_cat))
test_ds = test_ds.batch(64).prefetch(AUTOTUNE)

cb = [
    # Stop once validation accuracy has not improved for 15 epochs and roll
    # the model back to its best weights.
    callbacks.EarlyStopping(
        monitor='val_accuracy',
        patience=15,
        restore_best_weights=True,
    ),
    # Multiply the learning rate by 0.2 after 7 epochs without a better
    # validation loss.
    callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.2,
        patience=7,
    ),
    # Keep only the checkpoint with the best validation accuracy.
    callbacks.ModelCheckpoint(
        MODELS/'UNet_Audio_Classifier_best_FMA.keras',
        monitor='val_accuracy',
        save_best_only=True,
    ),
]

h = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=100,
    verbose=1,
    callbacks=cb,
)

test_loss, test_acc = model.evaluate(test_ds, verbose=0)
print('FMA Test Accuracy:', test_acc)

# One summary row: best validation accuracy seen, test accuracy, and the
# number of epochs that produced a validation accuracy entry.
history = h.history
summary_path = REPORTS/'training_summary_FMA.csv'
summary_row = {
    'Model': 'UNet_Audio_Classifier',
    'Dataset': 'FMA_SMALL',
    'Best_Val_Accuracy': float(np.max(history.get('val_accuracy', [0]))),
    'Test_Accuracy': float(test_acc),
    'Epochs_Run': int(len(history.get('val_accuracy', []))),
}
pd.DataFrame([summary_row]).to_csv(summary_path, index=False)
print('Saved:', summary_path)

Epoch 1/100


I0000 00:00:1755440290.506077   64215 service.cc:152] XLA service 0x75f8c8012510 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1755440290.506091   64215 service.cc:160]   StreamExecutor device (0): NVIDIA GeForce RTX 4070, Compute Capability 8.9
2025-08-17 16:18:10.559645: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1755440290.824691   64215 cuda_dnn.cc:529] Loaded cuDNN version 90300
I0000 00:00:1755440290.824691   64215 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6s/step - accuracy: 0.0000e+00 - loss: 2.9084

I0000 00:00:1755440295.484276   64215 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7s/step - accuracy: 0.0000e+00 - loss: 2.9084 - val_accuracy: 0.0000e+00 - val_loss: 2.1122 - learning_rate: 0.0010
Epoch 2/100
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - accuracy: 0.0500 - loss: 2.9718 - val_accuracy: 0.0000e+00 - val_loss: 2.0922 - learning_rate: 0.0010
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - accuracy: 0.0167 - loss: 2.6941 - val_accuracy: 0.0000e+00 - val_loss: 2.0680 - learning_rate: 0.0010
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - accuracy: 0.0500 - loss: 2.5912 - val_accuracy: 0.0000e+00 - val_loss: 2.0605 - learning_rate: 0.0010
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step - accuracy: 0.0833 - loss: 2.4607 - val_accuracy: 0.1000 - val_loss: 2.0398 - learning_rate: 0.0010
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m