# Make training, testing and validation datasets

- Split entire dataset into training, testing and validation
- `testing_list.txt` and `validation_list.txt` contain the filenames of the testing and validation audio files
- Combine the set of all filenames with the testing and validation filenames (set XOR / difference) so that only the training filenames remain

In [83]:
import glob
import librosa
import numpy as np
import os

**Directory structure**:<br>
data:<br>
| - testing_list.txt<br>
| - validation_list.txt<br>
| - audio<br>
. . | - .<br>
. . | - .

In [1]:
# Dataset locations, resolved against the directory the notebook is run from.
PATH_TO_DATA = os.path.join(os.getcwd(), 'data')
PATH_TO_AUDIO = os.path.join(PATH_TO_DATA, 'audio')
# Fixed length, in samples, that read_audio_file pads or truncates every clip to.
SAMPLING_RATE = 16000

# Collect every file matching data/audio/*/* and express it relative to
# PATH_TO_AUDIO, i.e. as '<sub-directory>/<file>'.
# os.path.relpath strips only the leading directory (str.replace would also
# remove any later occurrence of that prefix), and a plain comprehension copes
# with an empty glob result, on which np.vectorize raises ValueError.
# Sorting makes the order independent of the filesystem's listing order.
all_data_paths = sorted(
    os.path.relpath(path, PATH_TO_AUDIO)
    for path in glob.glob(os.path.join(PATH_TO_AUDIO, '*', '*'))
)

NameError: name 'os' is not defined

In [115]:
def split_join(path):
    """Re-join a '/'-separated path using the path separator of the current OS."""
    return os.path.join(*path.split('/'))


def _read_path_list(list_filename):
    """Return the paths listed, one per line, in PATH_TO_DATA/list_filename.

    Surrounding whitespace (including the trailing newline) is stripped and
    blank lines are skipped. Each entry is passed through split_join so it uses
    the same separator as the glob-derived entries of all_data_paths.
    """
    with open(os.path.join(PATH_TO_DATA, list_filename)) as f:
        stripped_lines = (line.strip() for line in f)
        return [split_join(line) for line in stripped_lines if line]


test_data_paths = _read_path_list('testing_list.txt')
validation_data_paths = _read_path_list('validation_list.txt')

# Training set = every audio file that appears in neither held-out list.
# A set difference is used rather than XOR: with XOR, a name that is listed but
# not present on disk, or that is present in both lists, ends up in the
# training set. Sorting gives a reproducible order (plain set iteration order
# varies between runs), which matters when a leading slice is taken later.
held_out_paths = set(validation_data_paths) | set(test_data_paths)
train_data_paths = sorted(set(all_data_paths) - held_out_paths)

In [111]:
# Keep only the first N_SUBSET_SAMPLES entries of the training and validation
# lists (the test list is left untouched). The value matches the '_100' suffix
# of the files these subsets are saved to further down.
N_SUBSET_SAMPLES = 100
train_data_paths = train_data_paths[:N_SUBSET_SAMPLES]
validation_data_paths = validation_data_paths[:N_SUBSET_SAMPLES]

In [112]:
def read_audio_file(audio_file_path):
    """Load one audio clip and return it as exactly SAMPLING_RATE samples.

    Parameters
    ----------
    audio_file_path : str
        Path of the clip relative to PATH_TO_AUDIO.

    Returns
    -------
    numpy.ndarray
        The samples returned by librosa.load, zero-padded on both sides when
        the clip is shorter than SAMPLING_RATE samples, or centre-cropped when
        it is longer.
    """
    # librosa.load resamples to 22050 Hz unless `sr` is given, so request the
    # notebook's rate explicitly.
    audio_file, _ = librosa.load(
        os.path.join(PATH_TO_AUDIO, audio_file_path), sr=SAMPLING_RATE
    )
    n_samples = len(audio_file)
    if n_samples < SAMPLING_RATE:
        missing = SAMPLING_RATE - n_samples
        pad_before = missing // 2
        # When `missing` is odd the extra sample goes after the clip, so the
        # result is always exactly SAMPLING_RATE long.
        audio_file = np.pad(
            audio_file, pad_width=(pad_before, missing - pad_before), mode='constant'
        )
    elif n_samples > SAMPLING_RATE:
        start = (n_samples - SAMPLING_RATE) // 2
        audio_file = audio_file[start:start + SAMPLING_RATE]
    return audio_file

Create and save training, testing and validation datasets as `.npy` files

In [113]:
# Pre-allocate one row of SAMPLING_RATE samples per training clip, then fill
# the rows one file at a time.
train_data = np.zeros(shape=(len(train_data_paths), SAMPLING_RATE))
for clip_index, clip_path in enumerate(train_data_paths):
    train_data[clip_index, :] = read_audio_file(clip_path)

In [79]:
# Write the training matrix to disk; np.save appends the '.npy' extension.
np.save(file='./data/train_data_100', arr=train_data)

In [19]:
# Pre-allocate one row of SAMPLING_RATE samples per test clip, then fill the
# rows one file at a time.
test_data = np.zeros(shape=(len(test_data_paths), SAMPLING_RATE))
for clip_index, clip_path in enumerate(test_data_paths):
    test_data[clip_index, :] = read_audio_file(clip_path)

In [20]:
# Write the test matrix to disk; np.save appends the '.npy' extension.
np.save(file='./data/test_data', arr=test_data)

In [100]:
# Pre-allocate one row of SAMPLING_RATE samples per validation clip, then fill
# the rows one file at a time.
validation_data = np.zeros(shape=(len(validation_data_paths), SAMPLING_RATE))
for clip_index, clip_path in enumerate(validation_data_paths):
    validation_data[clip_index, :] = read_audio_file(clip_path)

In [81]:
# Write the validation matrix to disk; np.save appends the '.npy' extension.
np.save(file='./data/validation_data_100', arr=validation_data)