In [25]:
import os
import librosa
import numpy as np
import tensorflow as tf
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Dropout, BatchNormalization, Dense, GlobalMaxPooling1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.metrics import Precision, Recall
from glob import glob
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [26]:
# Creating directories in Kaggle
!mkdir -p /kaggle/working/audio
!mkdir -p /kaggle/working/output

  pid, fd = os.forkpty()


In [27]:
# Downloading the dataset from Dropbox
!curl -L https://www.dropbox.com/s/4jw31k5mlzcmgis/genres.tar.gz?dl=1 -o /kaggle/working/genres.tar.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   130  100   130    0     0    636      0 --:--:-- --:--:-- --:--:--   637
100    17  100    17    0     0     16      0  0:00:01  0:00:01 --:--:--     0
100   475    0   475    0     0    256      0 --:--:--  0:00:01 --:--:--     0
100 1168M  100 1168M    0     0  75.3M      0  0:00:15  0:00:15 --:--:-- 89.6M    0  0:00:15  0:00:13  0:00:02 92.0M


In [28]:
# Extracting the dataset
!tar -xvzf /kaggle/working/genres.tar.gz -C /kaggle/working/audio
!ls /kaggle/working/audio/genres

genres/
genres/blues/
genres/blues/blues.00000.au
genres/blues/blues.00001.au
genres/blues/blues.00002.au
genres/blues/blues.00003.au
genres/blues/blues.00004.au
genres/blues/blues.00005.au
genres/blues/blues.00006.au
genres/blues/blues.00007.au
genres/blues/blues.00008.au
genres/blues/blues.00009.au
genres/blues/blues.00010.au
genres/blues/blues.00011.au
genres/blues/blues.00012.au
genres/blues/blues.00013.au
genres/blues/blues.00014.au
genres/blues/blues.00015.au
genres/blues/blues.00016.au
genres/blues/blues.00017.au
genres/blues/blues.00018.au
genres/blues/blues.00019.au
genres/blues/blues.00020.au
genres/blues/blues.00021.au
genres/blues/blues.00022.au
genres/blues/blues.00023.au
genres/blues/blues.00024.au
genres/blues/blues.00025.au
genres/blues/blues.00026.au
genres/blues/blues.00027.au
genres/blues/blues.00028.au
genres/blues/blues.00029.au
genres/blues/blues.00030.au
genres/blues/blues.00031.au
genres/blues/blues.00032.au
genres/blues/blues.00033.au
genres/blues/blues.00034.a

In [29]:
# Defining paths and constants
DATA_AUDIO_DIR = '/kaggle/working/audio/genres'
TARGET_SR = 22050
NUM_CLASSES = 10

# Custom transformer for feature extraction
class AudioFeatureExtractor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        features = []
        for filename in X:
            audio, _ = librosa.load(filename, sr=TARGET_SR)
            mfccs = librosa.feature.mfcc(y=audio, sr=TARGET_SR, n_mfcc=40)
            chroma = librosa.feature.chroma_stft(y=audio, sr=TARGET_SR)
            spectral_contrast = librosa.feature.spectral_contrast(y=audio, sr=TARGET_SR)
            zero_crossing_rate = librosa.feature.zero_crossing_rate(y=audio)
            spectral_rolloff = librosa.feature.spectral_rolloff(y=audio, sr=TARGET_SR)
            spectral_centroid = librosa.feature.spectral_centroid(y=audio, sr=TARGET_SR)

            # Combine features into a single vector
            combined_features = np.hstack([
                np.mean(mfccs.T, axis=0),
                np.mean(chroma.T, axis=0),
                np.mean(spectral_contrast.T, axis=0),
                np.mean(zero_crossing_rate.T, axis=0),
                np.mean(spectral_rolloff.T, axis=0),
                np.mean(spectral_centroid.T, axis=0)
            ])
            features.append(combined_features)
        return np.array(features)

In [32]:
# Definition of data handling functions

# Function to extract class ID from filename
def extract_class_id(filename):
    genre = filename.split('/')[-2]
    genre_to_class_id = {
        'blues': 0,
        'classical': 1,
        'country': 2,
        'disco': 3,
        'hiphop': 4,
        'jazz': 5,
        'metal': 6,
        'pop': 7,
        'reggae': 8,
        'rock': 9
    }
    return genre_to_class_id.get(genre)
    
# Function to gather all audio files and their labels
def load_data():
    filenames = list(glob(os.path.join(DATA_AUDIO_DIR, '**/*.au'), recursive=True))
    x_data = []
    y_data = []

    for filename in filenames:
        class_id = extract_class_id(filename)
        if class_id is not None:
            x_data.append(filename)
            y_data.append(class_id)

    return np.array(x_data), np.array(y_data)

# Load data and create train/test split (90% train, 10% test)
x_data, y_data = load_data()
x_tr, x_te, y_tr, y_te = train_test_split(x_data, y_data, test_size=0.1, random_state=42)

# Create a pipeline for feature extraction and model training
pipeline = Pipeline([
    ('feature_extractor', AudioFeatureExtractor())
])

# Fit the pipeline on training data and transform both train and test sets
x_tr_features = pipeline.fit_transform(x_tr)
x_te_features = pipeline.transform(x_te)

# One-hot encode labels
y_tr_encoded = to_categorical(y_tr, num_classes=NUM_CLASSES)
y_te_encoded = to_categorical(y_te, num_classes=NUM_CLASSES)

# Reshape data for CNN input (adding channel dimension)
x_tr_features = np.expand_dims(x_tr_features, axis=-1)
x_te_features = np.expand_dims(x_te_features, axis=-1)

In [33]:
print('X Train shape: ', x_tr_features.shape)
print('Y Train shape: ', y_tr_encoded.shape)
print('X Test shape: ', x_te_features.shape)
print('Y Test shape: ', y_te_encoded.shape)

X Train shape:  (900, 62, 1)
Y Train shape:  (900, 10)
X Test shape:  (100, 62, 1)
Y Test shape:  (100, 10)


In [34]:
# Define the model architecture
model = Sequential()

# First Conv Layer
model.add(Conv1D(128, kernel_size=3, strides=1, padding='same', input_shape=(x_tr_features.shape[1], 1)))
model.add(BatchNormalization())
model.add(tf.keras.layers.ReLU())
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.3))

# Second Conv Layer
model.add(Conv1D(256, kernel_size=3, strides=1, padding='same'))
model.add(BatchNormalization())
model.add(tf.keras.layers.ReLU())
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.3))

# Third Conv Layer
model.add(Conv1D(512, kernel_size=3, strides=1, padding='same'))
model.add(BatchNormalization())
model.add(tf.keras.layers.ReLU())
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.4))

# Global Pooling and Dense Layers
model.add(GlobalMaxPooling1D())
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(NUM_CLASSES, activation='softmax'))

# Compile the model with Adam optimizer and custom metrics
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='categorical_crossentropy',
              metrics=['accuracy', Precision(), Recall()])

# Print the model summary
print(model.summary())

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


None


In [35]:
# Callbacks for learning rate reduction and early stopping
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-5, verbose=1)
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Training the model with validation data
model.fit(x_tr_features,
                    y_tr_encoded,
                    batch_size=32,
                    epochs=100,
                    validation_data=(x_te_features, y_te_encoded),
                    shuffle=True,
                    callbacks=[reduce_lr, early_stopping])

Epoch 1/100
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 299ms/step - accuracy: 0.1183 - loss: 4.4516 - precision_2: 0.1027 - recall_2: 0.0527 - val_accuracy: 0.1300 - val_loss: 2.3169 - val_precision_2: 0.0000e+00 - val_recall_2: 0.0000e+00 - learning_rate: 0.0010
Epoch 2/100
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.1319 - loss: 2.3840 - precision_2: 0.1614 - recall_2: 0.0044 - val_accuracy: 0.1400 - val_loss: 2.2860 - val_precision_2: 0.5000 - val_recall_2: 0.0100 - learning_rate: 0.0010
Epoch 3/100
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.1760 - loss: 2.2630 - precision_2: 0.4115 - recall_2: 0.0143 - val_accuracy: 0.1600 - val_loss: 2.1688 - val_precision_2: 1.0000 - val_recall_2: 0.0100 - learning_rate: 0.0010
Epoch 4/100
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.2080 - loss: 2.1385 - precision_2: 0.6279 - recall_2: 0.0626 - va

<keras.src.callbacks.history.History at 0x7ebbe7e000d0>

In [36]:
# Evaluate the model on test data
loss, accuracy, precision_val, recall_val = model.evaluate(x_te_features, y_te_encoded, verbose=2)

# Calculate F1 score using sklearn's f1_score function
f1_val = f1_score(np.argmax(y_te_encoded, axis=-1), np.argmax(model.predict(x_te_features), axis=-1), average='weighted')

print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test Precision: {precision_val:.4f}")
print(f"Test Recall: {recall_val:.4f}")
print(f"Test F1-score: {f1_val:.4f}")

4/4 - 0s - 7ms/step - accuracy: 0.4600 - loss: 1.6530 - precision_2: 0.8125 - recall_2: 0.1300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 134ms/step
Test Loss: 1.6530
Test Accuracy: 0.4600
Test Precision: 0.8125
Test Recall: 0.1300
Test F1-score: 0.3918
