#some imports 

In [1]:
import os
import librosa
import numpy as np
import pandas as pd
import tensorflow as tf

#example with one audio

In [None]:
metadata_path = 'c:/Users/leonor/desktop/ACII...folders/MCII-Project/UrbanSound8K/metadata/UrbanSound8K.csv'
audio_folder = 'c:/Users/leonor/desktop/ACII...folders/MCII-Project/UrbanSound8K/audio'

metadata = pd.read_csv(metadata_path)
print(metadata.head())


'''#ex: load single audio file
example_row = metadata.iloc[0]
file_path = os.path.join(audio_folder,f"fold{example_row['fold']}", example_row['slice_file_name'])
label = example_row['class']

#loading audio file
y, sr = librosa.load(file_path, sr = 22050) #resample to 22050 Hz
print(f"Audio file: {file_path}, Sample rate: {sr}, Duration: {len(y)/sr:.2f} seconds")
'''


      slice_file_name    fsID  start        end  salience  fold  classID  \
0    100032-3-0-0.wav  100032    0.0   0.317551         1     5        3   
1  100263-2-0-117.wav  100263   58.5  62.500000         1     5        2   
2  100263-2-0-121.wav  100263   60.5  64.500000         1     5        2   
3  100263-2-0-126.wav  100263   63.0  67.000000         1     5        2   
4  100263-2-0-137.wav  100263   68.5  72.500000         1     5        2   

              class  
0          dog_bark  
1  children_playing  
2  children_playing  
3  children_playing  
4  children_playing  


'#ex: load single audio file\nexample_row = metadata.iloc[0]\nfile_path = os.path.join(audio_folder,f"fold{example_row[\'fold\']}", example_row[\'slice_file_name\'])\nlabel = example_row[\'class\']\n\n#load audio file\ny, sr = librosa.load(file_path, sr = 22050) #resample to 22050 Hz\nprint(f"Audio file: {file_path}, Sample rate: {sr}, Duration: {len(y)/sr:.2f} seconds")\n'

#pre-processing of all audio files
#feature extraction

In [3]:
from tqdm import tqdm #to track progress

def extract_features(metadata, audio_folder, fixed_length=128):
    features = []
    labels = []

    for _, row in tqdm(metadata.iterrows(), total=len(metadata)):
        file_path = None #Initialize to avoid issues in the 'except' block
        try:
            #construct file path
            file_path = os.path.join(audio_folder, f"fold{row['fold']}", row['slice_file_name'])
            #File loading: It reads the audio file using librosa.load and converts the sound into a format your program understands (a waveform and its sample rate)
            y, sr = librosa.load(file_path, sr=22050)
            #adjust n_fft dynamically for short clips
            n_fft = min(2048, len(y))
            #compute mel-spectrogram
            S = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=n_fft, hop_length=512, n_mels=128)
            S_dB = librosa.power_to_db(S, ref=np.max)
            #ensure fixed-length feature by padding/truncating
            if S_dB.shape[1]<fixed_length:
                #ensure fixed length features
                repeat_times = (fixed_length // S_dB.shape[1])+1
                extended = np.tile(S_dB, (1,repeat_times))
                features.append(extended[:, :fixed_length]) 
            else:
                #truncate if too long
                features.append(S_dB[:, :fixed_length])

            labels.append(row['classID'])
        except Exception as e:
            print(f"Error processing {file_path}: {e}")
    return features, labels

#extract features and labels
features, labels = extract_features(metadata, audio_folder)

#convert to NumPy arrays for model compatibility
x = np.array(features)
y = np.array(labels)

print(f"Feature array shape: {x.shape}")
print(f"Labels array shape: {y.shape}")

100%|██████████| 8732/8732 [05:22<00:00, 27.05it/s]


Feature array shape: (8732, 128, 128)
Labels array shape: (8732,)


#10-fold cross validation splits

In [4]:
from sklearn.model_selection import GroupKFold

def prepare_splits(x, y, metadata, n_splits=10):
    group_kfold = GroupKFold(n_splits=n_splits)
    groups = metadata['fold']
    
    for train_idx, test_idx in group_kfold.split(x, y, groups):
        val_idx = train_idx[:len(train_idx)//10]  # Use 10% of training data for validation
        train_idx = train_idx[len(train_idx)//10:]

        x_train, x_val, x_test = x[train_idx], x[val_idx], x[test_idx]
        y_train, y_val, y_test = y[train_idx], y[val_idx], y[test_idx]

        yield x_train[..., np.newaxis], x_val[..., np.newaxis], x_test[..., np.newaxis], y_train, y_val, y_test


#CNN MODEL

In [7]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam

def new_cnn_model(input_shape=(128,128,1), num_classes=10):
    model =Sequential()

    #1st conv + MaxPooling
    model.add(Conv2D(32, kernel_size=(3,3), activation='relu', input_shape=input_shape))
    model.add(MaxPooling2D(pool_size=(2,2)))
    model.add(Dropout(0.25))

    #2nd Conv + MaxPooling
    model.add(Conv2D(64, kernel_size=(3,3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2,2)))
    model.add(Dropout(0.25))

    #3rd Conv + MaxPooling
    model.add(Conv2D(128, kernel_size=(3,3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2,2)))
    model.add(Dropout(0.4))

    #Flattening
    model.add(Flatten())

    #Fully connected layers
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.5))

    #Output layer
    model.add(Dense(num_classes, activation='softmax'))

    #Compile the model
    model.compile(
        optimizer=Adam(learning_rate=0.001), #Adaptive gradient optimizer
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

cnn_model = new_cnn_model()
cnn_model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


#simple supervised training

In [None]:
'''
#training parameters
batch_size = 32
epochs = 20

#callbacks
early_stopping = EarlyStopping(
    monitor='val_loss', patience=5, restore_best_weights=True, verbose=1
)
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6, verbose=1
)

#train model
history = cnn_model.fit(
    x_train, y_train,
    validation_data=(x_val,y_val),
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[early_stopping, reduce_lr],
    verbose=1
)

#evaluate model
test_loss, test_accuracy = cnn_model.evaluate(x_test, y_test)
print(f"Test accuracy: {test_accuracy:.2f}, Test loss: {test_loss:.2f}")'''

'from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau\n\n#training parameters\nbatch_size = 32\nepochs = 20\n\n#callbacks\nearly_stopping = EarlyStopping(\n    monitor=\'val_loss\', patience=5, restore_best_weights=True, verbose=1\n)\nreduce_lr = ReduceLROnPlateau(\n    monitor=\'val_loss\', factor=0.5, patience=3, min_lr=1e-6, verbose=1\n)\n\n#train model\nhistory = cnn_model.fit(\n    x_train, y_train,\n    validation_data=(x_val,y_val),\n    batch_size=batch_size,\n    epochs=epochs,\n    callbacks=[early_stopping, reduce_lr],\n    verbose=1\n)\n\n#evaluate model\ntest_loss, test_accuracy = cnn_model.evaluate(x_test, y_test)\nprint(f"Test accuracy: {test_accuracy:.2f}, Test loss: {test_loss:.2f}")'

#10-fold cross-validation with CNN implemented

In [None]:
#from sklearn.model_selection import KFold
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau


#fold results
fold_accuracies = []
fold_losses = []

for x_train, x_val, x_test, y_train, y_val, y_test in prepare_splits(x, y, metadata):
    cnn_model = new_cnn_model(input_shape=(128, 128, 1), num_classes=len(np.unique(y)))

       # Callbacks
    early_stopping = EarlyStopping(
        monitor='val_loss', patience=5, restore_best_weights=True, verbose=1
    )
    reduce_lr = ReduceLROnPlateau(
        monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6, verbose=1
    )

    #training model
    history = cnn_model.fit(
        x_train, y_train,
        validation_data=(x_val, y_val),
        batch_size=32,
        epochs=20,
        callbacks=[early_stopping, reduce_lr],
        verbose=1
    )

    #evatuation on the test set
    test_loss, test_accuracy = cnn_model.evaluate(x_test, y_test, verbose=0)
    print(f"Test accuracy: {test_accuracy:.2f}, Test loss: {test_loss:.2f}")
    fold_accuracies.append(test_accuracy)
    fold_losses.append(test_loss)

#averages
average_accuracy = np.mean(fold_accuracies)
average_loss = np.mean(fold_losses)

print(f"Average Test Accuracy: {average_accuracy:.2f}")
print(f"Average Test Loss: {average_loss:.2f}")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 290ms/step - accuracy: 0.1129 - loss: 10.7524 - val_accuracy: 0.0711 - val_loss: 2.2571 - learning_rate: 0.0010
Epoch 2/20
[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 286ms/step - accuracy: 0.1206 - loss: 2.2707 - val_accuracy: 0.0711 - val_loss: 2.2499 - learning_rate: 0.0010
Epoch 3/20
[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 287ms/step - accuracy: 0.1232 - loss: 2.2648 - val_accuracy: 0.0891 - val_loss: 2.2504 - learning_rate: 0.0010
Epoch 4/20
[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 287ms/step - accuracy: 0.1247 - loss: 2.2594 - val_accuracy: 0.0711 - val_loss: 2.2342 - learning_rate: 0.0010
Epoch 5/20
[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 289ms/step - accuracy: 0.1674 - loss: 2.1849 - val_accuracy: 0.1240 - val_loss: 2.1137 - learning_rate: 0.0010
Epoch 6/20
[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[

"\n#previously defined folds from metadata\nfolds = metadata['fold'].values\nclass_labels = metadata['classID'].values\n\nfor test_fold in range(1,11):\n    for val_fold in range(1,11):\n        if val_fold == test_fold:\n            continue\n        #create train, test and val splits\n        train_indices = metadata.index[~metadata['fold'].isin([test_fold, val_fold])].tolist()\n        val_indices = metadata.index[metadata['fold'] == val_fold].tolist()\n        test_indices = metadata.index[metadata['fold'] == test_fold].tolist()\n\n        x_train = x[train_indices]\n        y_train = y[train_indices]\n        x_val = x[val_indices]\n        y_val = y[val_indices]\n        x_test = x[test_indices]\n        y_test = y[test_indices]\n\n        #CNN model\n        model = Sequential([\n            Conv2D(32,(3,3), activation='relu', input_shape=x_train.shape[1:]),\n            MaxPooling2D(pool_size=(2,2)),\n            Dropout(0.25),\n            Conv2D(64,(3,3), activation='relu'),\