In [15]:
import librosa
import os
import numpy as np

In [32]:
def load_dataset(dataset_rootpath, target_sr=22050, duration=30):
    """
    Load the waveform of every track together with its genre label
    
    Parameters:
    ================================================
    dataset_rootpath: string
        The location of all the genres directories (one sub-directory per genre)
    
    target_sr: int
        The desired sample rate for all music
    
    duration: int or float
        The length, in seconds, every waveform is zero-padded / truncated to
    
    Returns
    =================================================
    X: ndarray of shape (n_samples, duration * target_sr), dtype float32
        The waveform data of each music sample
    
    y: ndarray of shape(n_samples)
        The label for each waveform. Labels are 1-based: the genre directories
        are sorted by name and the i-th directory (0-based) gets label i + 1
        
    """
    
    waveform_shape = int(duration * target_sr)
    
    # Only visible sub-directories are genres. Stray entries such as .DS_Store
    # or .ipynb_checkpoints would otherwise shift the labels or make the
    # os.listdir call below fail.
    GENRES = sorted(
        entry for entry in os.listdir(dataset_rootpath)
        if not entry.startswith('.')
        and os.path.isdir(os.path.join(dataset_rootpath, entry))
    )
    X = []
    y = []
    
    # Iterate over all genres
    for genre_index, genre in enumerate(GENRES):
        label = genre_index + 1
        genre_path = os.path.join(dataset_rootpath, genre)
        
        # os.listdir returns entries in arbitrary order -> sort them so the
        # sample order is the same on every run and every machine
        for file in sorted(os.listdir(genre_path)):
            file_path = os.path.join(genre_path, file)
            
            # Skip hidden files and nested directories
            if file.startswith('.') or not os.path.isfile(file_path):
                continue
            
            # Load the music file at its native sample rate
            audio, sr = librosa.load(file_path, sr=None)
            
            # Resample if it doesnt have the desired sr
            if sr != target_sr:
                audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)
            
            # Copy into a zero-filled float32 buffer of the target length:
            # short clips end up zero-padded, long clips are truncated, and
            # no float64 temporary is created.
            waveform = np.zeros(waveform_shape, dtype=np.float32)
            kept = min(len(audio), waveform_shape)
            waveform[:kept] = audio[:kept]
            
            # Store the waveform and its label
            X.append(waveform)
            y.append(label)
            if len(X) % 100 == 0:
                print('Already process %d music' % len(X))
    
    # Report the final total when it is not a multiple of 100
    if len(X) % 100 != 0:
        print('Already process %d music' % len(X))
    
    # Convert the waveforms and labels into ndarrays; keep the documented
    # 2-D shape even when no track was found
    if X:
        waveforms = np.stack(X)
    else:
        waveforms = np.empty((0, waveform_shape), dtype=np.float32)
    return waveforms, np.array(y, dtype=int)

In [49]:
def one_hot_encoding(y):
    """
    Convert an array of labels into their corresponding one hot encoding
    
    Parameters:
    =========================================================
    y: array-like of shape (n_samples)
        The labels of the dataset. Multi-dimensional input is flattened
    
    Returns:
    =========================================================
    y_onehot: ndarray of shape (n_samples, n_classes)
        The one-hot encoded labels. Column j corresponds to the j-th smallest
        distinct label found in y. An empty y gives an array of shape (0, 0)
    """
    print("Encoding the labels...")
    
    labels = np.asarray(y).ravel()
    
    # classes: the sorted distinct labels
    # class_index: for every sample, the position of its label inside classes
    classes, class_index = np.unique(labels, return_inverse=True)
    
    # Row i of the identity matrix is the one-hot vector of class i, so
    # indexing it with class_index builds every encoded row in one step
    return np.eye(len(classes), dtype=int)[class_index]

In [58]:
def get_melspec_feature(X, target_sr, frame_size, hop_length, n_mels):
    """
    Get the dB-scaled mel-spectrograms of an ndarray of waveforms
    
    Parameters:
    ============================================
    X: ndarray of shape (n_samples, waveform_shape)
        The waveform for each music sample
    
    target_sr: int
        The sampling rate of the sample
    
    frame_size: int
        The size of the frame window calculating the STFT (passed as n_fft)
    
    hop_length: int
        The number of samples between the starts of consecutive frames
    
    n_mels: int
        The number of mel frequency bands
    
    Returns:
    ============================================
    melspec_feature: ndarray of shape (n_samples, n_frames, n_mels), dtype float32
        The extracted mel-spectrograms in dB, time on axis 1 and frequency on
        axis 2. n_frames is decided by librosa (roughly
        waveform_shape / hop_length -- confirm against the librosa version in use)
    """
    print('Extracting melspectrograms......')
    melspec_feature = []
    for audio in X:
        # Power mel-spectrogram of one track. Everything is passed by keyword:
        # recent librosa releases reject a positional waveform, and n_mels has
        # to be forwarded explicitly or librosa falls back to its own default.
        audio_melspec = librosa.feature.melspectrogram(
            y=audio,
            sr=target_sr,
            n_fft=frame_size,
            hop_length=hop_length,
            n_mels=n_mels,
        )
        
        # Convert the power values to decibels
        audio_melspec = librosa.power_to_db(audio_melspec)
        
        # Transpose the spectrogram -> time(x)-frequency(y)
        melspec_feature.append(audio_melspec.T)
        if len(melspec_feature) % 100 == 0:
            print('Already process %d music' % len(melspec_feature))
    
    # Report the final total when it is not a multiple of 100
    if len(melspec_feature) % 100 != 0:
        print('Already process %d music' % len(melspec_feature))
    
    # Convert the list into an ndarray
    return np.array(melspec_feature, dtype=np.float32)

In [51]:
# --- Processing settings -------------------------------------------------
target_sr = 22050       # sample rate every track is converted to
frame_size = 2048       # STFT window size handed to the mel-spectrogram step
hop_length = 1024       # hop between consecutive STFT frames
n_mels = 128            # number of mel bands
dataset_rootpath = './Data/genres_original'

# --- Pipeline: waveforms + labels -> one-hot labels -> mel-spectrograms ---
X, y = load_dataset(dataset_rootpath=dataset_rootpath, target_sr=target_sr)
y_onehot = one_hot_encoding(y=y)
melspec_feature = get_melspec_feature(
    X=X,
    target_sr=target_sr,
    frame_size=frame_size,
    hop_length=hop_length,
    n_mels=n_mels,
)

Already process 100 music
Already process 200 music
Already process 300 music
Already process 400 music
Already process 500 music
Already process 600 music
Already process 700 music
Already process 800 music
Already process 900 music
Already process 999 music
Encoding the labels...
Extracting melspectrograms......
Already process 100 music
Already process 200 music
Already process 300 music
Already process 400 music
Already process 500 music
Already process 600 music
Already process 700 music
Already process 800 music
Already process 900 music
Already process 999 music


In [52]:
# np.save does not create missing parent directories, so make sure the
# output folder exists before writing anything into it.
os.makedirs('./Data/GTZAN_Processing', exist_ok=True)

# NOTE(review): 'raw_labes' looks like a typo for 'raw_labels'. The name is
# kept as-is because other code may load this exact path -- confirm before renaming.
np.save('./Data/GTZAN_Processing/raw_labes.npy', y)
np.save('./Data/GTZAN_Processing/onehot_labels.npy', y_onehot)
np.save('./Data/GTZAN_Processing/raw_audio.npy', X)
# The "_2048" suffix matches the frame_size value set in the configuration cell
np.save('./Data/GTZAN_Processing/melspec_feature_2048.npy', melspec_feature)