## Overview

- The main purpose of this notebook is to process the 1000 .wav samples found in the local Data directory, slice them into smaller audio clips, convert these audio clips into mel-spectrograms, and convert the mel-spectrograms into numpy arrays.
- Once the audio data is processed, it is converted into a dataset, which is then split into training data and testing data.
- The training and testing data are then used to train and validate our model, which is then saved in /backend/genre_categorization.

In [1]:
import tensorflow as tf
import numpy as np
import os
import librosa
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

In [2]:
# Encoder used later to turn genre-name strings into integer class ids.
label_encoder = LabelEncoder()

# Root folder (relative path) that is walked to find the audio files.
DATA_DIRECTORY = '../Data'

In [3]:
def get_audio_data(path_name, extensions=None):
    """Collect file paths and their genre labels from a directory tree.

    Walks ``path_name`` recursively. Each file that is kept contributes one
    entry to both returned lists; its genre is the name of the directory that
    directly contains it.

    Directories and files are visited in sorted order so that repeated runs
    (and runs on different machines) return the lists in the same order;
    the order in which ``os.walk`` reports entries is otherwise unspecified.

    Args:
        path_name: Root directory to search.
        extensions: Optional file extension (``'.wav'``) or iterable of
            extensions (``('.wav', '.au')``) to keep. Matching ignores case.
            ``None`` (the default) keeps every file found.

    Returns:
        A ``(paths, genres)`` tuple of two lists of equal length, where
        ``genres[i]`` is the parent-directory name of ``paths[i]``. Both
        lists are empty when nothing is found.
    """
    if extensions is not None:
        if isinstance(extensions, str):
            extensions = (extensions,)
        # str.endswith accepts a tuple of suffixes.
        extensions = tuple(ext.lower() for ext in extensions)

    paths, genres = [], []
    for root, dirs, files in os.walk(path_name):
        # Sorting in place controls the order in which os.walk descends.
        dirs.sort()
        genre = os.path.basename(root)
        for name in sorted(files):
            if extensions is not None and not name.lower().endswith(extensions):
                continue
            paths.append(os.path.join(root, name))
            genres.append(genre)
    return paths, genres

In [4]:
def create_melspectrogram(audio_file, sample_rate):
    """Return the decibel-scaled mel-spectrogram of an audio signal.

    ``audio_file`` is handed to librosa as the signal (``y``) and
    ``sample_rate`` as its sampling rate (``sr``).
    """
    mel_params = dict(n_fft=2048, hop_length=512, n_mels=128)
    power_spectrogram = librosa.feature.melspectrogram(
        y=audio_file, sr=sample_rate, **mel_params
    )
    # Convert power to decibels, using the spectrogram's own maximum as reference.
    return librosa.power_to_db(power_spectrogram, ref=np.max)

In [5]:
def pad_audio(mel_db_spect, max_length, pad_value=0.0):
    """Force a 2-D spectrogram to exactly ``max_length`` columns.

    Arrays with fewer than ``max_length`` columns are right-padded along
    axis 1 with ``pad_value``; arrays with at least ``max_length`` columns are
    truncated to their first ``max_length`` columns. Axis 0 is never changed.

    Args:
        mel_db_spect: 2-D NumPy array (indexed as ``[row, column]``).
        max_length: Desired number of columns in the result.
        pad_value: Constant written into the added columns. Defaults to 0,
            which is what this function has always padded with.

    Returns:
        An array with shape ``(mel_db_spect.shape[0], max_length)``.

    NOTE(review): the spectrograms in this notebook are produced with
    ``power_to_db(..., ref=np.max)``, where 0 dB is the loudest value, so the
    default pads with peak level rather than silence. Passing a low value
    (e.g. the array minimum) pads with "quiet" frames instead. The default is
    left at 0 so inputs stay consistent with already-trained models; confirm
    before changing it.
    """
    n_frames = mel_db_spect.shape[1]
    if n_frames < max_length:
        missing = max_length - n_frames
        return np.pad(
            mel_db_spect,
            pad_width=((0, 0), (0, missing)),
            mode='constant',
            constant_values=pad_value,
        )
    return mel_db_spect[:, :max_length]

In [6]:
def split_songs(audio_paths, genres, max_length=78, window=0.06, overlap=0.3):
    """Split songs into overlapping clips and convert each clip to a
    fixed-width, decibel-scaled mel-spectrogram.

    Adapted from code found at:
    https://github.com/chittalpatel/Music-Genre-Classification-GTZAN/blob/master/Music%20Genre%20Classification/CNN_train(1).ipynb

    Args:
        audio_paths: Sequence of audio file paths (must support ``len``).
        genres: Genre label for each entry of ``audio_paths``, same order.
        max_length: Number of spectrogram columns every clip is padded or
            truncated to.
        window: Clip length as a fraction of the whole track, in ``(0, 1]``.
        overlap: Fraction of a clip shared with the next clip, in ``[0, 1)``.

    Returns:
        ``(spectrograms, clip_genres)``: a list with one spectrogram array per
        clip and a list with the genre label of the track each clip came from.

    Raises:
        ValueError: If ``window`` or ``overlap`` is outside its valid range.
    """
    import warnings

    if not 0 < window <= 1:
        raise ValueError(f"window must be in (0, 1], got {window!r}")
    if not 0 <= overlap < 1:
        raise ValueError(f"overlap must be in [0, 1), got {overlap!r}")

    split_spects_mel_db = []
    split_genres = []

    for path, genre in tqdm(zip(audio_paths, genres), total=len(audio_paths),desc='Processing Audio Files'):
        audio, sample_rate = librosa.load(path)
        audio_shape = audio.shape[0]
        chunk = int(audio_shape * window)       # clip length in samples
        offset = int(chunk * (1 - overlap))     # step between clip starts

        # Empty or extremely short audio yields a zero-length clip or a zero
        # step; range() would raise on a zero step, so skip the file instead.
        if chunk <= 0 or offset <= 0:
            warnings.warn(f"Skipping {path!r}: audio too short to split "
                          f"({audio_shape} samples)")
            continue

        # Only start positions that leave room for a full-length clip are
        # visited, so every slice below has exactly `chunk` samples.
        for start in range(0, audio_shape - chunk + 1, offset):
            sample = audio[start:start + chunk]

            # Reuse the shared helper so clip features always match it.
            mel_spec_db = create_melspectrogram(sample, sample_rate)

            # Pad/truncate so all outputs share the same width.
            mel_spec_db = pad_audio(mel_spec_db, max_length)

            split_spects_mel_db.append(mel_spec_db)
            split_genres.append(genre)

    return split_spects_mel_db, split_genres

In [7]:
# Gather every file path under DATA_DIRECTORY together with its genre label.
paths, gens = get_audio_data(path_name=DATA_DIRECTORY)

In [8]:
# Slice each track into clips and convert the clips to padded mel-spectrograms.
spects, gens2 = split_songs(audio_paths=paths, genres=gens)

Processing Audio Files: 100%|██████████| 1000/1000 [01:51<00:00,  8.98it/s]


Determine the sizes of the training and test sets

In [9]:
# Total number of spectrogram clips produced by split_songs.
spects_len = len(spects)

# 90% of the clips go to training, the remainder to testing
# (both sizes are floats at this point).
train_size = .90 * spects_len
test_size = spects_len - train_size

# Sizes expressed in units of 50 clips (the batch size used when the dataset
# is batched); they are truncated with int() where the split is taken.
train_take, test_take = train_size / 50, test_size / 50

Create Dataset

In [10]:
# Stack the per-clip labels and spectrograms into arrays.
g1 = np.array(gens2)
s1 = np.array(spects)

# Map genre names to integer class ids.
genres_encoded = label_encoder.fit_transform(g1)

dataset = tf.data.Dataset.from_tensor_slices((s1, genres_encoded))

# Shuffle once, with a fixed seed and WITHOUT reshuffling on every iteration.
# The train/test split is taken later with take()/skip() on this dataset, and
# each of those is iterated separately. With the default
# reshuffle_each_iteration=True the element order can differ between those
# passes, letting the same clip appear in both the training and the test
# split. A single fixed order keeps the two splits disjoint and reproducible.
# Trade-off: the training batches are then identical every epoch; reshuffle
# the training split on its own if per-epoch shuffling is wanted.
dataset = dataset.shuffle(len(s1), seed=42, reshuffle_each_iteration=False)

In [11]:
# Cache the elements, group them into batches of 50 and prefetch up to 25 batches.
final_data = dataset.cache().batch(50).prefetch(25)

# Whole number of batches assigned to each split.
n_train_batches = int(train_take)
n_test_batches = int(test_take)

# The first n_train_batches batches train the model; the batches that follow
# them are used for testing.
train = final_data.take(n_train_batches)
test = final_data.skip(n_train_batches).take(n_test_batches)

CNN Model

In [12]:
# Input geometry: 128 mel bands x 78 frames, single channel.
spectrogram_height = 128
spectrogram_width = 78
num_channels = 1  # one value per spectrogram cell (not a multi-channel color image)
num_classes = 10

input_dims = (spectrogram_height, spectrogram_width, num_channels)

# Feature extractor: three Conv2D + MaxPooling2D stages with 16, 32 and 64 filters.
feature_layers = [
    tf.keras.layers.Conv2D(16, (3, 3), activation='relu', input_shape=input_dims),
    tf.keras.layers.MaxPooling2D((2, 2)),
]
for n_filters in (32, 64):
    feature_layers.append(tf.keras.layers.Conv2D(n_filters, (3, 3), activation='relu'))
    feature_layers.append(tf.keras.layers.MaxPooling2D((2, 2)))

# Classifier head: flatten, one hidden dense layer, softmax over the classes.
classifier_layers = [
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(num_classes, activation='softmax'),
]

model = tf.keras.Sequential(feature_layers + classifier_layers)

In [13]:
# The sparse variant of the loss is used because the labels are integer class
# ids (output of the label encoder) rather than one-hot vectors.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

Train Model

In [14]:
# Train for 15 epochs, evaluating on the test split after every epoch.
model.fit(x=train, validation_data=test, epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.src.callbacks.History at 0x265e0c31ae0>

Save Model

In [15]:
# Write the trained model to the genre_categorization folder one level up.
model.save(filepath='../genre_categorization')

INFO:tensorflow:Assets written to: ../genre_categorization\assets


INFO:tensorflow:Assets written to: ../genre_categorization\assets
