In [None]:
from audio_dataset import *
from tqdm.notebook import tqdm

In [None]:
# audio_dataset.py
import librosa
import os
import random

import numpy as np

# Lower-case file extensions recognized as audio (matched case-insensitively).
audio_endings = ('.mp3', '.wav', '.flac')


class AudioDataSet:
    """Organize and lazily load an audio dataset spread across directories.

    Only the file paths are collected up front. Because decoded audio takes up
    so much space, clips are read from disk only when they are requested.
    """

    def __init__(self, ds_dir, sr, duration, shuffle=True, random_state=42):
        """Index every audio file found under ds_dir.

        Args:
            ds_dir: Root directory; all subdirectories are scanned recursively.
            sr: Default sample rate applied when clips are loaded.
            duration: Default clip length in seconds.
            shuffle: If true, shuffle the file list after indexing.
            random_state: Seed for the shuffle.
        """
        self.sr = sr
        self.duration = duration

        # Cursor used by load_next / reset_next.
        self.current_file_idx = 0

        self.files = []
        for root, _dirs, names in os.walk(ds_dir):
            for name in names:
                # Compare in lower case so e.g. 'CLIP.WAV' is not skipped.
                if name.lower().endswith(audio_endings):
                    self.files.append(os.path.join(root, name))

        # os.walk yields entries in an arbitrary, filesystem-dependent order.
        # Sorting first makes the unshuffled order stable and lets the same
        # random_state produce the same shuffle on every machine.
        self.files.sort()

        if shuffle:
            random.Random(random_state).shuffle(self.files)

    def num_samples(self):
        """Return the number of files/samples in the audio set."""
        return len(self.files)

    def load(self, file_idxs, sr=None, duration=None, pbar=None):
        """Load the given file indices as an np array.

        Args:
            file_idxs: Sized iterable of indices into self.files.
            sr: Sample rate; defaults to the rate given at construction.
            duration: Clip length in seconds; defaults to the constructor value.
            pbar: Optional tqdm-style progress bar; it is reset to
                len(file_idxs) and advanced once per loaded file.

        Returns:
            Array of shape (len(file_idxs), int(sr*duration)). Clips shorter
            than the target length are padded by wrapping around.
        """
        if sr is None:
            sr = self.sr
        if duration is None:
            duration = self.duration

        # The target length is identical for every clip, so compute it once.
        n_samples = int(sr * duration)

        if pbar is not None:
            pbar.reset(total=len(file_idxs))

        ys = []
        for file_idx in file_idxs:
            y, _ = librosa.load(self.files[file_idx], sr=sr, mono=True, offset=0.0, duration=duration)
            # `size` must be passed by keyword: it is keyword-only in
            # librosa >= 0.10, and the keyword form also works on older releases.
            y = librosa.util.fix_length(y, size=n_samples, mode='wrap')
            ys.append(y)
            if pbar is not None:
                pbar.update(1)
        return np.array(ys)

    def load_next(self, batch_size, sr=None, duration=None, pbar=None):
        """Load the next batch_size files as an np array of shape (batch_size, sr*duration).

        When fewer than batch_size files remain after the cursor, the cursor
        wraps back to the start and the leftover tail files are skipped.

        Raises:
            IndexError: If batch_size is larger than the whole dataset.
        """
        total = self.num_samples()
        if batch_size > total:
            # Fail before touching the cursor or decoding any audio, instead of
            # running off the end of self.files part-way through the batch.
            raise IndexError(
                f"batch_size ({batch_size}) exceeds the number of files ({total})"
            )
        if self.current_file_idx + batch_size > total:
            self.current_file_idx = 0

        file_idxs = range(self.current_file_idx, self.current_file_idx + batch_size)
        self.current_file_idx += batch_size
        return self.load(file_idxs, sr=sr, duration=duration, pbar=pbar)

    def reset_next(self):
        """Reset the file index for the load_next function."""
        self.current_file_idx = 0

# Reformatting datasets

In [None]:
# Shared loading parameters: 8 kHz sample rate, 5-second clips.
sr = 8000
duration = 5.0
voice_ds = AudioDataSet('datasets/LibriSpeech/LibriSpeech/dev-clean', sr=sr, duration=duration)
noise_ds = AudioDataSet('datasets/urban/UrbanSound8K/audio', sr=sr, duration=duration)

# Both datasets are indexed with range(num_samples) in the cells below, so cap
# the count at the smaller set; using only the voice count would raise an
# IndexError whenever the noise set holds fewer files.
num_samples = min(voice_ds.num_samples(), noise_ds.num_samples())

In [None]:
# Decode the voice clips once and cache them as a single array on disk.
# The context manager closes the progress bar even if loading fails.
with tqdm() as pbar:
    voice = voice_ds.load(range(num_samples), pbar=pbar)
print(voice.shape)
np.save('datasets/voices', voice)  # np.save appends the '.npy' extension
del voice  # release the array before the next large load

In [None]:
# Decode the noise clips once and cache them as a single array on disk.
# The context manager closes the progress bar even if loading fails.
with tqdm() as pbar:
    noise = noise_ds.load(range(num_samples), pbar=pbar)
print(noise.shape)
np.save('datasets/noises', noise)  # np.save appends the '.npy' extension
del noise  # release the array once it has been written

# Testing Fast Load Speeds

In [None]:
%%time
# Read the cached arrays back from disk; the cell magic above times this step.
voice = np.load('datasets/voices.npy')
noise = np.load('datasets/noises.npy')
# Report both shapes, one per line.
print(voice.shape, noise.shape, sep='\n')