In [1]:
import os
import librosa
import pickle
import numpy as np
import tensorflow as tf
from glob import glob
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import regularizers
from tensorflow.keras.layers import GlobalAveragePooling1D, Conv1D, Lambda, MaxPooling1D, Activation, Dense, BatchNormalization
from tensorflow.keras.models import Sequential
from tensorflow.keras.metrics import Precision, Recall
import tensorflow.keras.backend as K
from concurrent.futures import ThreadPoolExecutor
import multiprocessing

In [2]:
# Creating directories in Kaggle
!mkdir -p /kaggle/working/audio
!mkdir -p /kaggle/working/output

In [3]:
# Downloading the dataset from Dropbox
!curl -L https://www.dropbox.com/s/4jw31k5mlzcmgis/genres.tar.gz?dl=1 -o /kaggle/working/genres.tar.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   130  100   130    0     0    717      0 --:--:-- --:--:-- --:--:--   714
100    17  100    17    0     0     12      0  0:00:01  0:00:01 --:--:--   151
100   475    0   475    0     0    246      0 --:--:--  0:00:01 --:--:--   246
100 1168M  100 1168M    0     0  17.2M      0  0:01:07  0:01:07 --:--:-- 16.1M:28  0:00:35 16.9M


In [4]:
# Extracting the dataset
!tar -xvzf /kaggle/working/genres.tar.gz -C /kaggle/working/audio
!ls /kaggle/working/audio/genres

genres/
genres/blues/
genres/blues/blues.00000.au
genres/blues/blues.00001.au
genres/blues/blues.00002.au
genres/blues/blues.00003.au
genres/blues/blues.00004.au
genres/blues/blues.00005.au
genres/blues/blues.00006.au
genres/blues/blues.00007.au
genres/blues/blues.00008.au
genres/blues/blues.00009.au
genres/blues/blues.00010.au
genres/blues/blues.00011.au
genres/blues/blues.00012.au
genres/blues/blues.00013.au
genres/blues/blues.00014.au
genres/blues/blues.00015.au
genres/blues/blues.00016.au
genres/blues/blues.00017.au
genres/blues/blues.00018.au
genres/blues/blues.00019.au
genres/blues/blues.00020.au
genres/blues/blues.00021.au
genres/blues/blues.00022.au
genres/blues/blues.00023.au
genres/blues/blues.00024.au
genres/blues/blues.00025.au
genres/blues/blues.00026.au
genres/blues/blues.00027.au
genres/blues/blues.00028.au
genres/blues/blues.00029.au
genres/blues/blues.00030.au
genres/blues/blues.00031.au
genres/blues/blues.00032.au
genres/blues/blues.00033.au
genres/blues/blues.00034.a

In [5]:
# Defining paths and constants

DATA_AUDIO_DIR = '/kaggle/working/audio/genres'
TARGET_SR = 8000
OUTPUT_DIR_TRAIN = '/kaggle/working/output/train'
OUTPUT_DIR_TEST = '/kaggle/working/output/test'
AUDIO_LENGTH = 10000
NUM_CLASSES = 10
BATCH_SIZE = 128

os.makedirs(OUTPUT_DIR_TRAIN, exist_ok=True)
os.makedirs(OUTPUT_DIR_TEST, exist_ok=True)

In [6]:
# Definition of data handling functions

def extract_class_id(AU_filename):
    genres = {
        'blues': 1,
        'classical': 2,
        'country': 3,
        'disco': 4,
        'hiphop': 5,
        'jazz': 6,
        'metal': 7,
        'pop': 8,
        'reggae': 9,
        'rock': 10
    }

    for genre, class_id in genres.items():
        if genre in AU_filename:
            return class_id
        return None

def read_audio(filename):
    audio, _ = librosa.load(filename, sr=TARGET_SR, mono=True)
    audio = (audio - np.mean(audio)) / np.std(audio)
    if len(audio) < AUDIO_LENGTH:
        audio = np.pad(audio, (0, AUDIO_LENGTH - len(audio)), 'constant')
    return audio[:AUDIO_LENGTH].reshape(-1, 1)

def process_audio_file(i, filename):
    class_id = extract_class_id(filename)
    audio_buf = read_audio(filename)
    output_folder = OUTPUT_DIR_TRAIN if i % 10 != 0 else OUTPUT_DIR_TEST
    output_filename = os.path.join(output_folder, f'{i}.pkl')
    with open(output_filename, 'wb') as f:
        pickle.dump({'class_id': class_id, 'audio': audio_buf, 'sr': TARGET_SR}, f)

def convert_data_parallel():
    filenames = list(glob(os.path.join(DATA_AUDIO_DIR, '**/*.au'), recursive=True))
    with ThreadPoolExecutor(max_workers=multiprocessing.cpu_count()) as executor:
        executor.map(process_audio_file, range(len(filenames)), filenames)

convert_data_parallel()

In [8]:
# Defining and Compiling Model

def model_10(num_classes=10):
    m = Sequential()
    m.add(Conv1D(128,
                 input_shape=[AUDIO_LENGTH, 1],
                 kernel_size=80,
                 strides=4,
                 padding='same',
                 kernel_initializer='glorot_uniform',
                 kernel_regularizer=regularizers.l2(0.0001)))
    m.add(BatchNormalization())
    m.add(Activation('relu'))
    m.add(MaxPooling1D(pool_size=4, strides=None))
    m.add(Conv1D(128,
                 kernel_size=3,
                 strides=1,
                 padding='same',
                 kernel_initializer='glorot_uniform',
                 kernel_regularizer=regularizers.l2(0.0001)))
    m.add(BatchNormalization())
    m.add(Activation('relu'))
    m.add(MaxPooling1D(pool_size=4, strides=None))
    m.add(Conv1D(256,
                 kernel_size=3,
                 strides=1,
                 padding='same',
                 kernel_initializer='glorot_uniform',
                 kernel_regularizer=regularizers.l2(0.0001)))
    m.add(BatchNormalization())
    m.add(Activation('relu'))
    m.add(MaxPooling1D(pool_size=4, strides=None))
    m.add(Conv1D(512,
                 kernel_size=3,
                 strides=1,
                 padding='same',
                 kernel_initializer='glorot_uniform',
                 kernel_regularizer=regularizers.l2(0.0001)))
    m.add(BatchNormalization())
    m.add(Activation('relu'))
    m.add(MaxPooling1D(pool_size=4, strides=None))
    m.add(Lambda(lambda x: K.mean(x, axis=1)))
    m.add(Dense(num_classes, activation='softmax'))
    return m

def f1_score(precision, recall):
    return 2 * (precision * recall) / (precision + recall + K.epsilon())

num_classes = 10
model = model_10(num_classes=num_classes)

if model is None:
    exit('Something went wrong!!')

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy', Precision(name='precision'), Recall(name='recall'), f1_score])
print(model.summary())

None


In [9]:
# Load Training and Testing Data

def get_data(file_list):
    x, y = [], []
    for filename in file_list:
        with open(filename, 'rb') as f:
            audio_element = pickle.load(f)
            x.append(audio_element['audio'])
            y.append(int(audio_element['class_id']))
    return np.array(x), np.array(y)

train_files = glob(os.path.join(OUTPUT_DIR_TRAIN, '**/*.pkl'))
x_tr, y_tr = get_data(train_files)
y_tr = to_categorical(y_tr, num_classes=num_classes)

test_files = glob(os.path.join(OUTPUT_DIR_TEST, '**/*.pkl'))
x_te, y_te = get_data(test_files)
y_te = to_categorical(y_te, num_classes=num_classes)

In [10]:
print('X Train shape: ', x_tr.shape)
print('Y Train shape: ', y_tr.shape)
print('X Test shape: ', x_te.shape)
print('Y Test shape: ', y_te.shape)

X Train shape:  (0,)
Y Train shape:  (0, 10)
X Test shape:  (0,)
Y Test shape:  (0, 10)


In [11]:
# Training Model

reduce_lr = ReduceLROnPlateau(monitor='acc', factor=0.5, patience=10, min_lr=0.0001, verbose=1)
batch_size = 128

model.fit(x=x_tr,
          y=y_tr,
          batch_size=batch_size,
          epochs=10,
          verbose=1,
          shuffle=True,
          validation_data=(x_te, y_te),
          callbacks=[reduce_lr])

Epoch 1/10


ValueError: Exception encountered when calling Sequential.call().

[1mInvalid input shape for input Tensor("data:0", shape=(128,), dtype=float32). Expected shape (None, 10000, 1), but input has incompatible shape (128,)[0m

Arguments received by Sequential.call():
  • inputs=tf.Tensor(shape=(128,), dtype=float32)
  • training=True
  • mask=None

In [None]:
# Evaluate the model
loss, accuracy, precision, recall, f1 = model.evaluate(x_te, y_te, verbose=0)

print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test Precision: {precision:.4f}")
print(f"Test Recall: {recall:.4f}")
print(f"Test F1-score: {f1:.4f}")