# Splitting data

In [1]:
import glob
from scipy.io.wavfile import read
import numpy as np

In [2]:
# Fixed number of samples every clip is padded or cropped to further down,
# and therefore the row width of each saved array.
# NOTE(review): presumably 16 kHz audio, i.e. one row = one second - confirm.
SAMPLING_RATE = 16 * 1000

In [3]:
# Every entry two levels below ./data/audio/, with the './data/audio/' text
# stripped out so each path is in the relative form that later gets
# re-joined with './data/audio/' when the file is read.
all_data_paths = [
    found.replace('./data/audio/', '')
    for found in glob.glob('./data/audio/*/*')
]

In [4]:
# Load the validation split listing: one path per line, each stored without
# its trailing newline.
with open('./data/validation_list.txt') as f:
    validation_data_paths = [entry.rstrip('\n') for entry in f]

In [5]:
# Load the test split listing: one path per line, each stored without its
# trailing newline.
with open('./data/testing_list.txt') as f:
    test_data_paths = [entry.rstrip('\n') for entry in f]

In [6]:
# Training set = every discovered clip that is NOT listed for validation or
# testing.  Set difference (``-``) is used rather than symmetric difference
# (``^``): ``^`` would pull a listed path back into the training set whenever
# it is missing from ``all_data_paths`` or appears in both list files.
# Sorting fixes the row order of the training array built from this list;
# plain iteration order of a set of strings can differ between interpreter
# runs because of hash randomization.
train_data_paths = sorted(
    set(all_data_paths) - set(validation_data_paths) - set(test_data_paths)
)

In [10]:
# Build the training matrix: one row per clip, every row exactly
# SAMPLING_RATE samples wide.
train_data = np.zeros((len(train_data_paths), SAMPLING_RATE))
for row, train_data_path in enumerate(train_data_paths):
    # read() returns (rate, samples); only the samples are kept.
    clip = read('./data/audio/' + train_data_path)[1]
    # Positive: clip is too long; negative: clip is too short.
    excess = len(clip) - SAMPLING_RATE
    if excess < 0:
        # Zero-pad at the end up to the target length.
        clip = np.pad(clip, pad_width=(0, -excess), mode='constant')
    elif excess > 0:
        # Keep a centred window of SAMPLING_RATE samples.
        start = excess // 2
        clip = clip[start:start + SAMPLING_RATE]
    train_data[row, :] = clip

In [11]:
# Persist the training matrix; np.save appends '.npy' when the file name
# does not already end with it.
np.save(file='./data/train_data', arr=train_data)

In [14]:
# Rebind the name to an empty list so this variable no longer references the
# training array (presumably to free memory before the next split is built).
train_data = list()

In [16]:
# Build the validation matrix: one row per listed clip, every row exactly
# SAMPLING_RATE samples wide.
validation_data = np.zeros((len(validation_data_paths), SAMPLING_RATE))
for row, validation_data_path in enumerate(validation_data_paths):
    # read() returns (rate, samples); only the samples are kept.
    clip = read('./data/audio/' + validation_data_path)[1]
    # Positive: clip is too long; negative: clip is too short.
    excess = len(clip) - SAMPLING_RATE
    if excess < 0:
        # Zero-pad at the end up to the target length.
        clip = np.pad(clip, pad_width=(0, -excess), mode='constant')
    elif excess > 0:
        # Keep a centred window of SAMPLING_RATE samples.
        start = excess // 2
        clip = clip[start:start + SAMPLING_RATE]
    validation_data[row, :] = clip

In [17]:
# Persist the validation matrix; np.save appends '.npy' when the file name
# does not already end with it.
np.save(file='./data/validation_data', arr=validation_data)

In [18]:
# Rebind the name to an empty list so this variable no longer references the
# validation array (presumably to free memory before the next split is built).
validation_data = list()

In [19]:
# Build the test matrix: one row per listed clip, every row exactly
# SAMPLING_RATE samples wide.
test_data = np.zeros((len(test_data_paths), SAMPLING_RATE))
for row, test_data_path in enumerate(test_data_paths):
    # read() returns (rate, samples); only the samples are kept.
    clip = read('./data/audio/' + test_data_path)[1]
    # Positive: clip is too long; negative: clip is too short.
    excess = len(clip) - SAMPLING_RATE
    if excess < 0:
        # Zero-pad at the end up to the target length.
        clip = np.pad(clip, pad_width=(0, -excess), mode='constant')
    elif excess > 0:
        # Keep a centred window of SAMPLING_RATE samples.
        start = excess // 2
        clip = clip[start:start + SAMPLING_RATE]
    test_data[row, :] = clip

In [20]:
# Persist the test matrix; np.save appends '.npy' when the file name does
# not already end with it.
np.save(file='./data/test_data', arr=test_data)