In [2]:
import librosa
import numpy as np
import numpy as np
from Utils import load, get_audio_path
from tqdm import tqdm

In [23]:
metadata_file_path = '../Data/fma_metadata/tracks.csv'

# Load the metadata file into a dataframe
df = load(metadata_file_path)

In [24]:
small = df['set', 'subset'] <= 'small'

# Get the small dataset
data = df.loc[small, ('track', 'genre_top')]
data = data.to_frame()

# Remove the multi-indexing from the created dataframe
data = data.droplevel(level=0, axis=1)
data.head()

Unnamed: 0_level_0,genre_top
track_id,Unnamed: 1_level_1
2,Hip-Hop
5,Hip-Hop
10,Pop
140,Folk
141,Folk


In [25]:
audio_dir = "../Data/fma_small"
data['relative_path'] = list(map((lambda x: get_audio_path(audio_dir, int(x))), data.index))
data.head()

Unnamed: 0_level_0,genre_top,relative_path
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1
2,Hip-Hop,../Data/fma_small/000/000002.mp3
5,Hip-Hop,../Data/fma_small/000/000005.mp3
10,Pop,../Data/fma_small/000/000010.mp3
140,Folk,../Data/fma_small/000/000140.mp3
141,Folk,../Data/fma_small/000/000141.mp3


In [26]:
genres_types = list(data['genre_top'].unique())
genre_to_classid = {genre:idx for idx, genre in enumerate(genres_types)}
classid_to_genre = {idx:genre for idx, genre in enumerate(genres_types)}
data['classID'] = [genre_to_classid[genre_top] for genre_top in data['genre_top'].values]
data.head()

Unnamed: 0_level_0,genre_top,relative_path,classID
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,Hip-Hop,../Data/fma_small/000/000002.mp3,0
5,Hip-Hop,../Data/fma_small/000/000005.mp3,0
10,Pop,../Data/fma_small/000/000010.mp3,1
140,Folk,../Data/fma_small/000/000140.mp3,2
141,Folk,../Data/fma_small/000/000141.mp3,2


In [31]:
# # NOT working files
# data_dir = data['relative_path'].values
# dirs = []
# orig_path = "../Data/fma_small/"
# for folder in os.listdir(orig_path):
#     for file in os.listdir(orig_path+folder):
#         dirs.append(orig_path+folder+"/"+file)
# not_working_paths=[]
# for real in data_dir:
#     if real not in dirs:
#         not_working_paths.append(real)

# not_working_paths
# not_working_paths = ['../Data/fma_small/099/099134.mp3', '../Data/fma_small/108/108925.mp3']
# for path in not_working_paths:
#     data = data[data['relative_path'] != path]

['../Data/fma_small/099/099134.mp3',
 '../Data/fma_small/108/108925.mp3',
 '../Data/fma_small/133/133297.mp3']

In [18]:
def load_dataset(data_csv, target_sr=22050):
    """
    Load the waveform of the music and their corresponding labels
    
    Paramters:
    ================================================
    dataset_rootpath: string
        The location of all the genres directories
    
    target_sr: int
        The desired sample rate for all music
    
    Returns
    =================================================
    X: ndarray of shape (n_samples, 30 * target_size)
        The waveform data of each music sample
    
    y: ndarray of shape(n_samples)
        The label for each waveform       
        
    """
    
    # Get all genres names
    X = []
    y = []
    samples = 0
    
    waveform_shape = 30 * target_sr
    
    print("Getting Data out...")
    
    for file, class_id in tqdm(zip(data['relative_path'], data['classID'])):
        
        # Load the music file
        audio, sr = librosa.load(file, sr=None)
        
        # Resample if it doesnt have the desired sr
        if sr != target_sr:
            audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)

        # If it's less than the target shape -> padding with zeros
        if len(audio) < waveform_shape:
            audio = np.append(audio, np.zeros(shape=(waveform_shape - len(audio)), ))

        # If it's more than the target shape -> truncate
        if len(audio) > waveform_shape:
            audio = audio[:waveform_shape]
        
        # Store the waveform and its label
        X.append(audio)
        y.append(class_id)
        
        samples += 1
        
        if samples % 100 == 0:
            print('Already process %d music' % samples)

    # Convert the waveforms and labels into ndarrays
    return np.array(X, dtype=np.float32), np.array(y)

In [24]:
not_working_paths = ['../Data/fma_small/099/099134.mp3',
                 '../Data/fma_small/108/108925.mp3',
                 '../Data/fma_small/133/133297.mp3']
for path in not_working_paths:
    data = data[data['relative_path'] != path]

'../Data/fma_small/000/000002.mp3'

In [34]:
def one_hot_encoding(y):
    """
    Convert an array of labels into their corresponding one hot encoding
    
    Parameters:
    =========================================================
    y: ndarray of shape (n_samples)
        The labels of the dataset
    
    Returns:
    =========================================================
    y_onehot: ndarray of shape (n_sample, n_classes)
        The one-hot encoded labels
    """
    print("Encoding the labels...")
    y_onehot = []
    
    # get the each genre name
    y_unique = sorted(set(y))
    num_classes = len(y_unique)
    for label in y:
        
        # Store an initial vector of zeros
        cur = [0]*num_classes
        
        # Get the index of the label
        encode_index = y_unique.index(label)
        
        # Store a one in the label index
        cur[encode_index] = 1
        
        # Add the vector to the list
        y_onehot.append(cur)
        
    # Convert the list to ndarray
    return np.array(y_onehot)

In [10]:
def get_melspec_feature(X, target_sr, frame_size, hop_length, n_mels):
    """
    Get the mel-spectrograms of an ndarray
    
    Parameters:
    ============================================
    X: ndarray of shape (n_samples, waveform_shape)
        The waveform for each music sample
    
    target_sr: int
        The sampling rate of the sample
    
    frame_size: int
        The size of the frame window calculating the STFT
    
    hop_length: int
        The overlapping between the frames
    
    n_mels: int
        The number of frequency bands
    
    Returns:
    ============================================
    melspec_feature: ndarray of shape (n_samples, waveform_shape/hop_length, n_mels)
        The extracted mel_spectrograms
    """
    print('Extracting melspectrograms......')
    melspec_feature = []
    count = 0
    for audio in X:
        # Get the spectrogram for each audio
        audio_melspec = librosa.feature.melspectrogram(audio, sr=target_sr, n_fft=frame_size, hop_length=hop_length)
        
        # Convert the spectrograms to mel-scale
        audio_melspec = librosa.power_to_db(audio_melspec)
        
        # Transpose the spectrogram -> time(x)-frequeny(y)
        audio_melspec = audio_melspec.T
        
        # Append the spectrograms to the list
        melspec_feature.append(audio_melspec)
        count += 1
        if count % 100 == 0:
                print('Already process %d music' % count)
                
    # Convert the list into an ndarray
    return np.array(melspec_feature, dtype=np.float32)

In [None]:
target_sr = 22050
frame_size = 2048
hop_length = 1024
n_mels = 128
X, y = load_dataset(data, target_sr=target_sr)

0it [00:00, ?it/s]

Getting Data out...


100it [01:39,  1.05s/it]

Already process 100 music


200it [03:23,  1.03s/it]

Already process 200 music


300it [05:07,  1.02it/s]

Already process 300 music


400it [06:46,  1.11s/it]

Already process 400 music


500it [08:32,  1.03s/it]

Already process 500 music


600it [10:15,  1.01s/it]

Already process 600 music


700it [11:58,  1.04s/it]

Already process 700 music


801it [13:40,  2.08it/s]

Already process 800 music


900it [15:18,  1.03s/it]

Already process 900 music


1000it [17:03,  1.03s/it]

Already process 1000 music


1100it [18:47,  1.02s/it]

Already process 1100 music


1200it [20:23,  1.01it/s]

Already process 1200 music


1300it [22:07,  1.07s/it]

Already process 1300 music


1400it [23:53,  1.04s/it]

Already process 1400 music


1500it [25:37,  1.01it/s]

Already process 1500 music


1600it [27:21,  1.05s/it]

Already process 1600 music


1700it [29:06,  1.06s/it]

Already process 1700 music


1800it [30:48,  1.05s/it]

Already process 1800 music


1900it [32:31,  1.06s/it]

Already process 1900 music


2000it [34:15,  1.08s/it]

Already process 2000 music


2100it [35:58,  1.01s/it]

Already process 2100 music


2200it [37:42,  1.02it/s]

Already process 2200 music


2300it [39:21,  1.06s/it]

Already process 2300 music


2400it [41:02,  1.10s/it]

Already process 2400 music


2500it [42:49,  1.07s/it]

Already process 2500 music


2600it [44:30,  1.02s/it]

Already process 2600 music


2700it [46:15,  1.01s/it]

Already process 2700 music


2800it [48:00,  1.04s/it]

Already process 2800 music


2900it [49:44,  1.03s/it]

Already process 2900 music


3000it [51:27,  1.02it/s]

Already process 3000 music


3100it [53:06,  1.03it/s]

Already process 3100 music


3200it [54:48,  1.04s/it]

Already process 3200 music


3300it [56:28,  1.01it/s]

Already process 3300 music


3371it [57:38,  1.02it/s]

In [52]:
y_onehot = one_hot_encoding(y)
melspec_feature = get_melspec_feature(X, target_sr, frame_size, hop_length, n_mels)
np.save('./Data/FMA_Processing/raw_labes.npy', y)
np.save('./Data/FMA_Processing/onehot_labels.npy', y_onehot)
np.save('./Data/FMA_Processing/raw_audio.npy', X)
np.save('./Data/FMA_Processing/melspec_feature_2048.npy', melspec_feature)