# Make training, testing and validation datasets

- Split entire dataset into training, testing and validation
- .txt files (provided by Kaggle) contain filenames of testing and validation audio files
- Remove the testing and validation filenames from the set of all filenames (equivalent to a set XOR, because both lists are subsets of the full set) to obtain the set of training filenames
- Create Mel-power spectrograms for each audio file
- Save all as `.npy` files

In [None]:
import glob
import librosa
import numpy as np
import os
from tqdm import tqdm

**Directory structure**:<br>
data:<br>
| - testing_list.txt<br>
| - validation_list.txt<br>
| audio<br>
. . | - .<br>
. . | - .

In [None]:
# Dataset root: a `data` directory inside the current working directory.
PATH_TO_DATA = os.path.join(os.getcwd(), 'data')
# Audio files live one level down, in `data/audio/<sub-directory>/<file>`.
PATH_TO_AUDIO = os.path.join(PATH_TO_DATA, 'audio')
SAMPLING_RATE = 16000

# Paths of every audio file, expressed relative to PATH_TO_AUDIO
# (i.e. '<sub-directory>/<file>').  os.path.relpath strips only the leading
# directory, and the comprehension also works when the glob finds nothing
# (np.vectorize raises ValueError on empty input).
all_data_paths = np.array(
    [os.path.relpath(path, PATH_TO_AUDIO)
     for path in glob.glob(os.path.join(PATH_TO_AUDIO, '*', '*'))],
    dtype=str,
)

In [None]:
def split_join(x):
    """Convert a '/'-separated path into one using this platform's separator."""
    return os.path.join(*str.split(x, '/'))


def _read_path_list(list_file_name):
    """Read a list file from PATH_TO_DATA and return its paths as a string array.

    Each non-blank line is treated as one '/'-separated path.  Surrounding
    whitespace (including '\\r' from Windows line endings) is stripped and
    blank lines are skipped so they never turn into empty paths.
    """
    with open(os.path.join(PATH_TO_DATA, list_file_name)) as f:
        stripped_lines = (line.strip() for line in f)
        return np.array([split_join(line) for line in stripped_lines if line],
                        dtype=str)


test_data_paths = _read_path_list('testing_list.txt')
validation_data_paths = _read_path_list('validation_list.txt')

# Training files are all files that are neither validation nor test files.
# A set difference is used instead of a chained XOR: XOR would *add* any listed
# file that is missing from all_data_paths (or that appears in both lists) to
# the training set.  Sorting makes the training order reproducible across runs.
_held_out_paths = set(validation_data_paths) | set(test_data_paths)
train_data_paths = sorted(set(all_data_paths) - _held_out_paths)

In [None]:
def read_audio_file(audio_file_path):
    """Load an audio file and return exactly SAMPLING_RATE samples.

    Parameters
    ----------
    audio_file_path : str
        Path of the audio file relative to PATH_TO_AUDIO.

    Returns
    -------
    numpy.ndarray
        1-D array of length SAMPLING_RATE.  Shorter clips are zero-padded with
        the audio kept in the center (when the padding is odd, the extra zero
        goes at the end); longer clips are center-cropped.
    """
    # Pass sr explicitly: librosa.load otherwise resamples to its own default
    # rate (22050 Hz), which would not match SAMPLING_RATE used everywhere else.
    samples = librosa.load(os.path.join(PATH_TO_AUDIO, audio_file_path),
                           sr=SAMPLING_RATE)[0]
    n_samples = len(samples)

    if n_samples < SAMPLING_RATE:
        missing = SAMPLING_RATE - n_samples
        pad_left = missing // 2
        # np.pad keeps the input dtype (np.append with [0] would upcast the
        # clip to float64) and the asymmetric widths guarantee the exact length.
        samples = np.pad(samples, (pad_left, missing - pad_left), mode='constant')
    elif n_samples > SAMPLING_RATE:
        start = (n_samples - SAMPLING_RATE) // 2
        samples = samples[start: start + SAMPLING_RATE]
    return samples

In [None]:
def get_mel_power_spectrogram(audio_file):
    """Return the Mel power spectrogram of an audio signal in decibels.

    Parameters
    ----------
    audio_file : numpy.ndarray
        Audio samples, as returned by `read_audio_file`.

    Returns
    -------
    numpy.ndarray
        Mel spectrogram converted with `librosa.power_to_db`, using the
        spectrogram's own maximum as the reference (so the peak is 0 dB).
    """
    # `y` must be passed by keyword: recent librosa releases make the
    # arguments of melspectrogram keyword-only, so a positional call fails.
    mel_spectrogram = librosa.feature.melspectrogram(y=audio_file,
                                                     sr=SAMPLING_RATE,
                                                     n_fft=1024,
                                                     hop_length=256,
                                                     fmax=3000)
    return librosa.power_to_db(mel_spectrogram, ref=np.max)

Create and save training, testing and validation datasets as `.npy` files

In [None]:
def _build_split(paths):
    """Return (spectrogram array, label list) for the given relative audio paths."""
    spectrograms = []
    labels = []
    for path in paths:
        spectrograms.append(get_mel_power_spectrogram(read_audio_file(path)))
        # The label is the first component of the relative path.
        labels.append(path.split(os.path.sep)[0])
    # Convert once; the array is reused for the mean, the std and the result.
    return np.array(spectrograms), labels


def _standardize(data):
    """Shift and scale `data` by its own global mean and standard deviation."""
    return (data - np.mean(data)) / np.std(data)


def create_data_split_spectrogram():
    """Create the train, validation and test sets and save them as `.npy` files.

    For each split, every audio file is converted to a Mel power spectrogram,
    the whole split is standardized, and `<split>_data.npy` and
    `<split>_labels.npy` are written to PATH_TO_DATA.  Each split is saved
    before the next one is built, so only one split is held in memory at a time.
    """
    # NOTE(review): every split is standardized with its OWN mean/std, as in
    # the original code.  Consider reusing the training statistics for the
    # validation and test sets - confirm what downstream code expects.
    splits = (
        ('train', train_data_paths),
        ('validation', validation_data_paths),
        ('test', test_data_paths),
    )
    for split_name, paths in splits:
        print('Creating {} data'.format(split_name))
        data, labels = _build_split(paths)
        np.save(os.path.join(PATH_TO_DATA, split_name + '_data'),
                _standardize(data))
        np.save(os.path.join(PATH_TO_DATA, split_name + '_labels'), labels)

In [None]:
# Build the spectrograms for all three splits and write the resulting
# `.npy` files into PATH_TO_DATA.
create_data_split_spectrogram()
# Printed only after every split has been saved.
print("Done")