In [9]:
import os
import librosa
import numpy as np
import pandas as pd
import soundfile as sf

In [31]:
data_folder = './data/'
preprocessed_folder = './preprocessed/'
if not os.path.exists(preprocessed_folder):
    os.makedirs(preprocessed_folder)
metadata = pd.read_csv('./data/UrbanSound8K.csv')

In [32]:
for _, row in metadata.iterrows():
    file_path = os.path.join(data_folder, f'fold{row["fold"]}', row['slice_file_name'])
    original_duration = librosa.get_duration(path=file_path)
    y, sr = librosa.load(file_path, sr=None, mono=True, duration=4.0)

    if len(y) < sr * 4:
        repeats = int(np.ceil((sr * 4) / len(y)))
        y = np.tile(y, repeats)[:sr * 4]
    else:
        y = y[:sr * 4]

    preprocessed_path = os.path.join(preprocessed_folder, f'fold{row["fold"]}', f'{row["slice_file_name"][:-4]}.wav')
    
    if not os.path.exists(os.path.dirname(preprocessed_path)):
        os.makedirs(os.path.dirname(preprocessed_path))

    sf.write(preprocessed_path, y, sr)
    print(f"Processed: {row['slice_file_name']} - Original Duration: {original_duration} seconds")

Processed: 100032-3-0-0.wav - Original Duration: 0.31755102040816324 seconds
Processed: 100263-2-0-117.wav - Original Duration: 4.0 seconds
Processed: 100263-2-0-121.wav - Original Duration: 4.0 seconds
Processed: 100263-2-0-126.wav - Original Duration: 4.0 seconds
Processed: 100263-2-0-137.wav - Original Duration: 4.0 seconds
Processed: 100263-2-0-143.wav - Original Duration: 4.0 seconds
Processed: 100263-2-0-161.wav - Original Duration: 4.0 seconds
Processed: 100263-2-0-3.wav - Original Duration: 4.0 seconds
Processed: 100263-2-0-36.wav - Original Duration: 4.0 seconds
Processed: 100648-1-0-0.wav - Original Duration: 0.6485260770975056 seconds
Processed: 100648-1-1-0.wav - Original Duration: 1.05 seconds
Processed: 100648-1-2-0.wav - Original Duration: 0.40532879818594103 seconds
Processed: 100648-1-3-0.wav - Original Duration: 1.62 seconds
Processed: 100648-1-4-0.wav - Original Duration: 1.86 seconds
Processed: 100652-3-0-0.wav - Original Duration: 4.0 seconds
Processed: 100652-3-0-

In [24]:
def extract_features(path):
    y, sr = librosa.load(path,sr=22050)
    melspectrogram = np.mean(librosa.feature.melspectrogram(y=y, sr=sr, n_mels=50, n_fft=1024, hop_length=1024).T, axis=0)
    mfcc = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=50).T, axis=0)
    features=np.reshape(np.vstack((melspectrogram,mfcc)),(50,2))
    return features.reshape(50, 2, 1)


In [33]:
extract_features('preprocessed/fold1/40722-8-0-0.wav').shape

(50, 2, 1)

In [45]:
preprocessed_folder = "./preprocessed"  

fold_features = [[] for _ in range(10)]
fold_labels = [[] for _ in range(10)]

for fold_num in range(1, 11):
    fold_folder = os.path.join(preprocessed_folder, f"fold{fold_num}")

    for index, row in metadata.iterrows():
        if row["fold"] == fold_num:
            file_path = os.path.join(fold_folder, row["slice_file_name"])
            class_label = row["class"]
            fold_features[fold_num - 1].append(extract_features(file_path))
            fold_labels[fold_num - 1].append(class_label)


In [46]:
len(fold_features)

10

In [47]:
len(fold_labels)

10

In [48]:
fold_features[0]

[array([[[ 1.82446768e-03],
         [ 3.87084647e-03]],
 
        [[ 5.07727861e-02],
         [ 1.34652793e-01]],
 
        [[ 2.93138981e-01],
         [ 3.66722298e+00]],
 
        [[ 1.84244728e+01],
         [ 1.16331167e+01]],
 
        [[ 7.02173889e-01],
         [ 2.61485308e-01]],
 
        [[ 1.67348847e-01],
         [ 2.95935780e-01]],
 
        [[ 1.12984872e+00],
         [ 5.61654687e-01]],
 
        [[ 3.54807258e-01],
         [ 1.50444373e-01]],
 
        [[ 4.95883524e-02],
         [ 2.69178171e-02]],
 
        [[ 6.15677238e-02],
         [ 1.00730412e-01]],
 
        [[ 8.50106701e-02],
         [ 5.73805831e-02]],
 
        [[ 1.69383548e-02],
         [ 3.22522549e-03]],
 
        [[ 2.25361716e-03],
         [ 2.59597762e-03]],
 
        [[ 1.36030803e-03],
         [ 6.58849254e-04]],
 
        [[ 1.54410343e-04],
         [ 1.74145884e-04]],
 
        [[ 1.07680600e-04],
         [ 4.77173926e-05]],
 
        [[ 4.62757853e-05],
         [ 2.32593629e-05]],

In [49]:
fold_labels[0]

['dog_bark',
 'dog_bark',
 'dog_bark',
 'dog_bark',
 'gun_shot',
 'dog_bark',
 'dog_bark',
 'dog_bark',
 'dog_bark',
 'jackhammer',
 'jackhammer',
 'jackhammer',
 'jackhammer',
 'jackhammer',
 'jackhammer',
 'jackhammer',
 'jackhammer',
 'jackhammer',
 'jackhammer',
 'jackhammer',
 'jackhammer',
 'jackhammer',
 'jackhammer',
 'jackhammer',
 'jackhammer',
 'jackhammer',
 'jackhammer',
 'jackhammer',
 'jackhammer',
 'jackhammer',
 'jackhammer',
 'engine_idling',
 'engine_idling',
 'engine_idling',
 'engine_idling',
 'engine_idling',
 'engine_idling',
 'engine_idling',
 'engine_idling',
 'engine_idling',
 'engine_idling',
 'engine_idling',
 'engine_idling',
 'engine_idling',
 'engine_idling',
 'engine_idling',
 'engine_idling',
 'engine_idling',
 'engine_idling',
 'engine_idling',
 'engine_idling',
 'children_playing',
 'children_playing',
 'children_playing',
 'children_playing',
 'children_playing',
 'children_playing',
 'children_playing',
 'children_playing',
 'siren',
 'siren',
 'sir

In [50]:
fold_features[0][0]

array([[[ 1.82446768e-03],
        [ 3.87084647e-03]],

       [[ 5.07727861e-02],
        [ 1.34652793e-01]],

       [[ 2.93138981e-01],
        [ 3.66722298e+00]],

       [[ 1.84244728e+01],
        [ 1.16331167e+01]],

       [[ 7.02173889e-01],
        [ 2.61485308e-01]],

       [[ 1.67348847e-01],
        [ 2.95935780e-01]],

       [[ 1.12984872e+00],
        [ 5.61654687e-01]],

       [[ 3.54807258e-01],
        [ 1.50444373e-01]],

       [[ 4.95883524e-02],
        [ 2.69178171e-02]],

       [[ 6.15677238e-02],
        [ 1.00730412e-01]],

       [[ 8.50106701e-02],
        [ 5.73805831e-02]],

       [[ 1.69383548e-02],
        [ 3.22522549e-03]],

       [[ 2.25361716e-03],
        [ 2.59597762e-03]],

       [[ 1.36030803e-03],
        [ 6.58849254e-04]],

       [[ 1.54410343e-04],
        [ 1.74145884e-04]],

       [[ 1.07680600e-04],
        [ 4.77173926e-05]],

       [[ 4.62757853e-05],
        [ 2.32593629e-05]],

       [[ 1.26909290e-05],
        [ 1.64383873e

In [55]:
np.save('./preprocessed/x',fold_features)
np.save('./preprocessed/y',fold_labels)

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (10,) + inhomogeneous part.