   Music data recognition
   GTZAN Dataset - Music Genre Classification, Exracted_music_json

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import os
import json
import math
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import tensorflow.keras as keras
from tensorflow.keras.optimizers import legacy
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
from glob import glob
import librosa
import librosa.display
import IPython.display as ipd

from itertools import cycle
sns.set_theme(style='white', palette=None)
color_pal = plt.rcParams['axes.prop_cycle'].by_key()['color']
color_cycle = cycle(plt.rcParams['axes.prop_cycle'].by_key()['color'])

In [None]:
DATASET_PATH = '/Users/anagha/Desktop/Project/Data/genres_original'
JSON_PATH = 'data.json'
SAMPLE_RATE = 22050
DURATION = 30   # measured in seconds
SAMPLES_PER_TRACK = SAMPLE_RATE * DURATION
def save_mfcc(dataset_path, json_path, n_mfcc=13, n_fft=2048, hop_length=512, num_segments=5):
    #dictionary to store  data
    data = {
        'mapping': [],
        'mfcc': [],
        'labels':[]
    }
    num_samples_per_segment = int(SAMPLES_PER_TRACK / num_segments)
    expected_num_mfcc_vectors_per_segment = math.ceil(num_samples_per_segment / hop_length)
    # loop through all the genres
    for i, (dirpath, dirnames, filenames) in enumerate(os.walk(dataset_path)):
        
        #ensure that we're not at the root level
        if dirpath is not dataset_path:
            
            # save the semantic level
            dir_path_components = dirpath.split('/') #genre/blue => ['genre', 'blues']
            semantic_label = dir_path_components[-1]
            data['mapping'].append(semantic_label)
            print('\nProcessing {}'.format(semantic_label))
            # process files for a specific genre 
            for f in filenames:
                #load audio file
                file_path = os.path.join(dirpath, f)
                try:
                    signal, sr = librosa.load(file_path, sr=SAMPLE_RATE)
                    # process segments extracting mfcc and storing data
                    for s in range(num_segments):
                        start_sample = num_samples_per_segment * s  # s=0 -> 0
                        finish_sample = start_sample + num_samples_per_segment  # s=0 -> num_samples_per_segment
                        
                        # store mfcc for segment if it hasthe expected length
                        mfcc = librosa.feature.mfcc(y=signal[start_sample:finish_sample],
                                                    sr=sr,
                                                    n_fft=n_fft,
                                                    hop_length=hop_length,
                                                    n_mfcc=n_mfcc)
                        mfcc = mfcc.T
                        if len(mfcc) == expected_num_mfcc_vectors_per_segment:
                            data['mfcc'].append(mfcc.tolist())
                            data['labels'].append(i - 1)
#                             print('{}, segment:{}'.format(file_path, s+1))
                except Exception as e:
                    print(f"Error processing {file_path}: {e}")
    
    with open(json_path, 'w') as fp:
        json.dump(data, fp, indent=4)
                    
                    
if __name__ == "__main__":
    save_mfcc(DATASET_PATH, JSON_PATH, num_segments=50)


Processing pop

Processing metal

Processing disco

Processing blues

Processing reggae

Processing classical

Processing rock


In [None]:
genres = ['blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal', 'pop', 'reggae', 'rock']

for genre in genres:
    audio_files = glob(f'/Users/anagha/Desktop/Project/Data/genres_original/{genre}/*.wav')
    if len(audio_files) > 0:
        audio_file = audio_files[0]  # Select the first audio file
        print(f"Click to Play {genre.capitalize()}")
        # Play the audio file
        ipd.display(ipd.Audio(audio_file))
    else:
        print(f"No audio files found for genre: {genre}")

In [None]:
genres = ['blues', 'classical', 'country', 'disco', 'hiphop', 'metal', 'jazz','pop', 'reggae', 'rock']
fig, axs = plt.subplots(len(genres), 3, figsize=(20, 60))

for i, genre in enumerate(genres):
    audio_files = glob(f'/Users/anagha/Desktop/Project/Data/genres_original/{genre}/*.wav')
    if len(audio_files) > 0:
        audio_file = audio_files[0]

        # Raw Audio
        y, sr = librosa.load(audio_file)
        axs[i, 0].plot(np.arange(len(y)) / sr, y, lw=1)
        axs[i, 0].set_title(f"Raw Audio - {genre}")
        axs[i, 0].set_xlabel('Time (s)')
        axs[i, 0].set_ylabel('Amplitude')

        # Trimmed Audio
        y_trimmed, _ = librosa.effects.trim(y, top_db=20)
        pd.Series(y_trimmed).plot(ax=axs[i, 1], lw=1, title=f"Raw Audio Trimmed ({genre})", color=color_pal[1])

        # Spectrogram
        D = librosa.stft(y)
        S_db = librosa.amplitude_to_db(np.abs(D), ref=np.max)
        img = librosa.display.specshow(S_db, x_axis='time', y_axis='log', ax=axs[i, 2])
        axs[i, 2].set_title(f'Spectrogram Example ({genre.capitalize()})', fontsize=15)
        fig.colorbar(img, ax=axs[i, 2], format='%+2.0f dB')

    else:
        for j in range(3):
            axs[i, j].axis('off')
        axs[i, 0].text(0.5, 0.5, f"No audio files found for genre: {genre}", horizontalalignment='center', verticalalignment='center', transform=axs[i, 0].transAxes)

plt.tight_layout()
plt.show()

In [None]:
# Anagha S
# anaghasudarshan23@gmail.com

In [None]:
genres = ['blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal', 'pop', 'reggae', 'rock']

for genre in genres:
    audio_files = glob(f'/Users/anagha/Desktop/Project/Data/genres_original/{genre}/*.wav')
    if len(audio_files) > 0:
        audio_file = audio_files[0]  # Select the first audio file

        y, sr = librosa.load(audio_file)

        # Compute the Mel spectrogram
        S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128 * 2)
        S_db_mel = librosa.amplitude_to_db(S, ref=np.max)

        # Plot the Mel spectrogram
        fig, ax = plt.subplots(figsize=(15, 5))
        img = librosa.display.specshow(S_db_mel, x_axis='time', y_axis='log', ax=ax)
        ax.set_title(f'Mel Spectrogram - {genre.capitalize()}', fontsize=20)
        fig.colorbar(img, ax=ax, format='%+2.0f dB')

        # Display the plot
        plt.show()
    else:
        print(f"No audio files found for genre: {genre}")

Machine Learning

Load Data:
The load_data function reads a dataset from a JSON file containing MFCC (Mel-frequency cepstral coefficients) features and their corresponding labels. MFCCs are commonly used in audio processing for representing the characteristics of audio signals.

Split Data:
The loaded data is split into training and testing sets using the train_test_split function from scikit-learn. This is a common practice in machine learning to evaluate the model's performance on unseen data.

Build Neural Network:
The neural network model is constructed using the Sequential API in Keras. This model consists of several layers:

Input Layer:
A Flatten layer is used to flatten the input MFCC data into a one-dimensional array. Dense Layers: Three dense (fully connected) layers with ReLU activation functions are added. These layers learn complex patterns in the data.

Dropout Layers:
Dropout layers are added after each dense layer to prevent overfitting. They randomly drop a fraction of the neurons during training. Output Layer: The output layer is a dense layer with a softmax activation function, which is suitable for multi-class classification problems. It outputs the probability distribution over the classes.

Compile Model:
The model is compiled using the Adam optimizer, sparse categorical crossentropy loss function, and accuracy metric. This prepares the model for training by specifying how it should learn from the data and evaluate its performance.

Train Model:
The model is trained using the fit method. The training data (X_train, y_train) is used for training, and the validation data (X_test, y_test) is used to evaluate the model's performance after each epoch. The batch_size parameter specifies the number of samples per gradient update, and the epochs parameter specifies the number of training iterations.

Plot Training History:
The plot_history function is used to visualize the training history, including the training and validation accuracy and loss over epochs. This helps to understand how the model is learning and whether it is overfitting or underfitting.

In [None]:
DATA_PATH = '/Users/anagha/Desktop/Project/data_50.json'
def load_data(data_path):
    '''Loads training dataset from json file.
    
        :param data_path (str): Path to json file containing data
        :return X (ndarray): Inputs
        :return y (ndarray): Targets
    '''
    with open(data_path, 'r') as fp:
        data = json.load(fp)
    
    X = np.array(data['mfcc'])
    y = np.array(data['labels'])
    print("Data succesfully loaded!")
    return X, y


def plot_history(history):
    
    fig = make_subplots(rows=2, cols=1, subplot_titles=('Accuracy eval', 'Error eval'))

    fig.add_trace(go.Scatter(x=list(range(1, len(history.history['accuracy']) + 1)),
                             y=history.history['accuracy'], mode='lines', name='train accuracy'),
                  row=1, col=1)
    fig.add_trace(go.Scatter(x=list(range(1, len(history.history['val_accuracy']) + 1)),
                             y=history.history['val_accuracy'], mode='lines', name='test accuracy'),
                  row=1, col=1)
    fig.update_yaxes(title_text='Accuracy', row=1, col=1)
    fig.update_xaxes(title_text='Epoch', row=1, col=1)

    fig.add_trace(go.Scatter(x=list(range(1, len(history.history['loss']) + 1)),
                             y=history.history['loss'], mode='lines', name='train error'),
                  row=2, col=1)
    fig.add_trace(go.Scatter(x=list(range(1, len(history.history['val_loss']) + 1)),
                             y=history.history['val_loss'], mode='lines', name='test error'),
                  row=2, col=1)
    fig.update_yaxes(title_text='Error', row=2, col=1)
    fig.update_xaxes(title_text='Epoch', row=2, col=1)

    fig.update_layout(height=800, showlegend=True)
    fig.show()
    
    


if __name__ == '__main__':
    #load data
    X, y = load_data(DATA_PATH)
    
    # Create train/tets split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    
    # Build network topology
    model = keras.Sequential([
        # input layer
        keras.layers.Flatten(input_shape=(X.shape[1], X.shape[2])),
        # 1st dense layer
        keras.layers.Dense(512, input_dim= X_train.shape[1], activation='relu'),
        # 2nd dense layer
        keras.layers.Dense(256, activation='relu'),
        # 3rd dense layer
        keras.layers.Dense(64, activation='relu'),
        # output layer
        keras.layers.Dense(10, activation='softmax')
    ])
    
    optimizer = keras.optimizers.Adam(learning_rate=0.0001)
    model.compile(optimizer=optimizer,
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    model.summary()
    
    history = model.fit(X_train, y_train, validation_data=(X_test, y_test), batch_size=32, epochs=100)
    
    plot_history(history)

In deep learning, particularly in the context of training neural networks, the terms "loss," "accuracy," "validation loss," and "validation accuracy" are commonly used metrics to evaluate the performance of a model during training and validation. Here's a brief explanation of each:

Loss: The loss, often represented as a single scalar value, indicates how well the model's predictions match the actual ground truth labels in the training data. It measures the error between the predicted output and the actual target values. The goal during training is to minimize this loss, as a lower loss indicates better alignment between predictions and actual labels.

Accuracy: Accuracy is a metric that measures the proportion of correctly classified examples out of the total number of examples. It is usually expressed as a percentage. For example, an accuracy of 0.85 (or 85%) means that the model correctly predicted 85% of the examples in the dataset.

Validation Loss: During training, a portion of the data (validation set) is typically set aside to evaluate the model's performance on unseen data. The validation loss is the loss calculated on this validation set. It helps to assess how well the model generalizes to new, unseen data. A decreasing validation loss indicates that the model is learning and improving its performance.

Validation Accuracy: Similar to validation loss, validation accuracy is the accuracy calculated on the validation set. It provides a measure of how well the model is performing on unseen data. A high validation accuracy indicates that the model generalizes well to new data.

In summary, the loss is a measure of how well the model is performing on the training data, while accuracy and validation metrics provide insights into its performance on both the training and unseen validation data. Lower loss and higher accuracy and validation metrics generally indicate better model performance

In [None]:
#load data
X, y = load_data(DATA_PATH)

# Create train/tets split
X_train, X_remaining, y_train, y_remaining = train_test_split(X, y, test_size=0.2, random_state=42)

# Split the remaining data into validation and test sets
X_val, X_test, y_val, y_test = train_test_split(X_remaining, y_remaining, test_size=0.5, random_state=42)

# Build network topology
model = keras.Sequential([
    # input layer
    keras.layers.Flatten(input_shape=(X.shape[1], X.shape[2])),
    # 1st dense layer
    keras.layers.Dense(512, input_dim= X_train.shape[1], activation='relu', kernel_regularizer=keras.regularizers.l2(0.001)),
    keras.layers.Dropout(0.3),

    # 2nd dense layer
    keras.layers.Dense(256, activation='relu', kernel_regularizer=keras.regularizers.l2(0.001)),
    keras.layers.Dropout(0.3),

    # 3rd dense layer
    keras.layers.Dense(64, activation='relu', kernel_regularizer=keras.regularizers.l2(0.001)),
    keras.layers.Dropout(0.3),

    # output layer
    keras.layers.Dense(10, activation='softmax')
])
optimizer = keras.optimizers.Adam(learning_rate=0.0001)
model.compile(optimizer=optimizer,
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
model.summary()
# Define the EarlyStopping callback
early_stopping = EarlyStopping(
    monitor='val_loss',  # Monitor validation loss
    patience=10,          # Stop after no improvement for 10 epochs
    restore_best_weights=True  # Restore weights to the best model
)

# Modify model.fit to include the early stopping callback
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    batch_size=32,
    epochs=100,
    callbacks=[early_stopping]  # Pass the callback to the callbacks argument
)

plot_history(history)

In [None]:
def predict(model, X, y):
   X = X[np.newaxis, ...]
   # prediction = [[0.1, 0.2, ...]]
   prediction = model.predict(X)   # X-> (1, 130, 13, 1)
   
   # extract index with max value
   predicted_index = np.argmax(prediction, axis=1)  # [4]
   print("Expected index: {}, Predicted index: {}".format(y, predicted_index))
# make prediction on a sample

# sample 1
X = X_test[30]
y = y_test[30]
print('First sample')
predict(model, X, y)

# sample 2
X = X_test[60]
y = y_test[60]
print('Second sample')
predict(model, X, y)

# sample 3
X = X_test[100]
y = y_test[100]
print('Third sample')
predict(model, X, y)



test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {test_loss}, Test Accuracy: {test_accuracy}')
predictions = model.predict(X_test)
predicted_labels = np.argmax(predictions, axis=1)

model achieved an accuracy of 76.4%
music genre classification,common task in music data recognition, achieving an accuracy above 70% is often considered acceptable.