In [1]:
import numpy as np
import librosa as lb
import pandas as pd
import numpy as np
import os
import gc
from tqdm import tqdm

# Class vocabulary: the position of a word in this list is its integer class id.
idx_to_label = 'bed bird cat dog down eight five four go happy house left marvin nine no off on one right seven sheila six stop three tree two up wow yes zero'.split(' ')

NUM_CLASSES = len(idx_to_label)

# Inverse lookup: word -> integer class id.
label_to_idx = {word: position for position, word in enumerate(idx_to_label)}


# Dataset roots (relative to the notebook's working directory).
train_data_path, test_data_path = 'data/train', 'data/test'

In [2]:
from sklearn.utils import shuffle

def _stack_samples(samples):
    """Pack a list of per-file results into one NumPy array.

    Equal-shaped entries are stacked into a regular array. Entries whose
    shapes differ (e.g. raw waveforms of different lengths) are stored in a
    1-D object array with one element per entry. Recent NumPy releases raise
    ``ValueError`` instead of building that object array implicitly, so the
    fallback is constructed by hand.
    """
    try:
        return np.asarray(samples)
    except ValueError:
        packed = np.empty(len(samples), dtype=object)
        # Assign element by element so NumPy never tries to broadcast.
        for position, sample in enumerate(samples):
            packed[position] = sample
        return packed


def preprocess_train(pipeline):
    """Load every training clip, apply ``pipeline`` and return shuffled data.

    For each label in ``idx_to_label`` the files under
    ``train_data_path/<label>`` are loaded with librosa at sample rate ``SR``
    (``SR`` is not defined in this cell; it must exist at module level before
    the call), passed through ``pipeline`` and cast to float32.

    Args:
        pipeline: callable taking the loaded waveform and returning an object
            with an ``astype`` method (a NumPy array).

    Returns:
        ``(x, y)`` shuffled in unison with ``random_state=7``. ``x`` is a
        stacked array when all samples share a shape, otherwise a 1-D object
        array; ``y`` holds the int64 class indices from ``label_to_idx``.
    """
    x, y = [], []
    for label in idx_to_label:
        label_dir = os.path.join(train_data_path, label)
        # os.listdir returns entries in arbitrary order; sorting makes the
        # seeded shuffle below reproducible across machines and runs.
        for wav_file in tqdm(sorted(os.listdir(label_dir))):
            wav, _ = lb.load(os.path.join(label_dir, wav_file), sr=SR)
            x.append(pipeline(wav).astype('float32'))
            y.append(label_to_idx[label])
    x, y = shuffle(_stack_samples(x), np.asarray(y), random_state=7)
    return x, y.astype('int64')


def preprocess_test(pipeline):
    """Load every test clip, apply ``pipeline`` and return data with file names.

    Files directly under ``test_data_path`` are loaded with librosa at sample
    rate ``SR``, passed through ``pipeline`` and cast to float32.

    Args:
        pipeline: callable taking the loaded waveform and returning an object
            with an ``astype`` method (a NumPy array).

    Returns:
        ``(x, keys)`` where ``keys[i]`` is the file name that produced
        ``x[i]``. Entries are ordered by sorted file name. ``x`` is a stacked
        array when all samples share a shape, otherwise a 1-D object array.
    """
    x, keys = [], []
    # Sorted for a deterministic, machine-independent ordering.
    for wav_file in tqdm(sorted(os.listdir(test_data_path))):
        wav, _ = lb.load(os.path.join(test_data_path, wav_file), sr=SR)
        x.append(pipeline(wav).astype('float32'))
        keys.append(wav_file)
    return _stack_samples(x), np.asarray(keys)

In [3]:
from transforms import *

# Feature pipeline applied to the test clips. Compose, crop_or_pad and
# ToLogMelspectrogram come from the star import of `transforms`; judging by
# their names this fixes the clip length and produces a log-mel spectrogram
# -- TODO confirm against transforms.py.
normal_transform = Compose([
    crop_or_pad,
    ToLogMelspectrogram(config='1x32x32'),
])

# Augmentation steps chained in front of the feature pipeline. Not used in
# this cell; presumably applied at training time -- verify against later cells.
data_aug_transform = Compose([
    TimeShiftAudio(),
    ChangeAmplitude(),
    ChangeSpeedAndPitchAudio(),
    normal_transform,
])

# Training clips go through an identity pipeline, so they are kept as loaded
# (only cast to float32 inside preprocess_train).
x_train, y_train = preprocess_train(lambda wav: wav)
x_test, test_keys = preprocess_test(normal_transform)

gc.collect()

# Display the shapes of all four arrays as the cell output.
tuple(arr.shape for arr in (x_train, y_train, x_test, test_keys))

100%|██████████| 1537/1537 [00:00<00:00, 1671.71it/s]
100%|██████████| 1573/1573 [00:00<00:00, 1721.44it/s]
100%|██████████| 1567/1567 [00:00<00:00, 1914.91it/s]
100%|██████████| 1566/1566 [00:00<00:00, 1828.13it/s]
100%|██████████| 2106/2106 [00:01<00:00, 1665.94it/s]
100%|██████████| 2095/2095 [00:01<00:00, 1652.99it/s]
100%|██████████| 2086/2086 [00:01<00:00, 1759.90it/s]
100%|██████████| 2119/2119 [00:01<00:00, 1821.06it/s]
100%|██████████| 2121/2121 [00:01<00:00, 1748.08it/s]
100%|██████████| 1562/1562 [00:00<00:00, 1818.18it/s]
100%|██████████| 1600/1600 [00:00<00:00, 1781.12it/s]
100%|██████████| 2086/2086 [00:01<00:00, 1814.15it/s]
100%|██████████| 1584/1584 [00:00<00:00, 1713.46it/s]
100%|██████████| 2105/2105 [00:01<00:00, 1585.52it/s]
100%|██████████| 2123/2123 [00:01<00:00, 1743.76it/s]
100%|██████████| 2095/2095 [00:01<00:00, 1696.81it/s]
100%|██████████| 2121/2121 [00:01<00:00, 1735.32it/s]
100%|██████████| 2122/2122 [00:01<00:00, 1721.89it/s]
100%|██████████| 2108/2108 [

((57886,), (57886,), (6835, 1, 32, 32), (6835,))

In [4]:
# Persist the arrays as compressed .npz archives in the working directory.
# NOTE(review): the recorded x_train shape (57886,) suggests an object array of
# variable-length clips; if so, reading it back needs
# np.load(..., allow_pickle=True) -- confirm in the consuming notebook.
np.savez_compressed(
    'raw_train_data.npz',
    x_train=x_train,
    y_train=y_train,
)
np.savez_compressed(
    'test_data_mel32.npz',
    x_test=x_test,
    test_keys=test_keys,
)