# This is a Jupyter notebook for testing and deploying audio_dataset.py

# Code:

In [2]:
# audio_dataset.py
import librosa
import os
import random

import numpy as np

audio_endings = ('.mp3', '.wav', '.flac')


class AudioDataSet:
    """Index a directory tree of audio files and load clips from it on demand.

    Only the file paths are kept in memory. Audio is decoded when ``load`` or
    ``load_next`` is called, so just the requested clips are ever loaded.
    """

    def __init__(self, ds_dir, sr, duration, shuffle=True, random_state=42):
        """Scan ``ds_dir`` recursively for audio files.

        Args:
            ds_dir: Root directory; every subdirectory is searched.
            sr: Default sample rate used when loading clips.
            duration: Default clip length in seconds.
            shuffle: If true, shuffle the file list once after scanning.
            random_state: Seed for the shuffle.
        """
        self.sr = sr
        self.duration = duration

        self.current_file_idx = 0

        # os.walk yields entries in an arbitrary, filesystem-dependent order.
        # Sorting first makes the file order (and therefore the seeded
        # shuffle below) reproducible across machines and runs.
        # Extensions are matched case-insensitively so e.g. "CLIP.WAV" counts.
        self.files = sorted(
            os.path.join(root, name)
            for root, _dirs, names in os.walk(ds_dir)
            for name in names
            if name.lower().endswith(audio_endings)
        )

        if shuffle:
            random.Random(random_state).shuffle(self.files)

    def num_samples(self):
        """Return the number of audio files found in the dataset."""
        return len(self.files)

    def load(self, file_idxs, sr=None, duration=None, pbar=None):
        """Load the given file indices as one array.

        Args:
            file_idxs: Sized iterable of indices into ``self.files``.
            sr: Sample rate; defaults to ``self.sr``.
            duration: Clip length in seconds; defaults to ``self.duration``.
            pbar: Optional progress bar exposing ``reset(total=...)`` and
                ``update(n)`` (e.g. a tqdm bar).

        Returns:
            np.ndarray of shape ``(len(file_idxs), int(sr * duration))``.
            Clips shorter than ``duration`` are padded by wrapping around.
        """
        if sr is None:
            sr = self.sr
        if duration is None:
            duration = self.duration
        n_samples = int(sr * duration)

        if pbar is not None:
            pbar.reset(total=len(file_idxs))

        ys = []
        for file_idx in file_idxs:
            y, _ = librosa.load(self.files[file_idx], sr=sr, mono=True, offset=0.0, duration=duration)
            # `size` must be passed by keyword: it is keyword-only in
            # librosa >= 0.10 and accepted by keyword in older releases too.
            y = librosa.util.fix_length(y, size=n_samples, mode='wrap')
            ys.append(y)
            if pbar is not None:
                pbar.update(1)

        if not ys:
            # Keep the documented 2-D shape for an empty request; float32 is
            # the dtype librosa.load produces by default.
            return np.empty((0, n_samples), dtype=np.float32)
        return np.array(ys)

    def load_next(self, batch_num, sr=None, duration=None, pbar=None):
        """Load the next ``batch_num`` files as an array of shape (batch_num, sr*duration).

        Batches are always full-size: when fewer than ``batch_num`` files
        remain, the cursor wraps to the start and the leftover tail is skipped.

        Raises:
            IndexError: If ``batch_num`` exceeds the number of files.
        """
        total = self.num_samples()
        if batch_num > total:
            # Fail before moving the cursor or decoding any audio. IndexError
            # is what the out-of-range file lookup would eventually raise.
            raise IndexError(
                f"batch_num ({batch_num}) exceeds the number of files in the dataset ({total})"
            )
        if self.current_file_idx + batch_num > total:
            self.current_file_idx = 0

        start = self.current_file_idx
        self.current_file_idx = start + batch_num
        return self.load(range(start, start + batch_num), sr=sr, duration=duration, pbar=pbar)

    def reset_next(self):
        """Reset the file cursor used by ``load_next`` to the first file."""
        self.current_file_idx = 0

def generate_tone(sr, duration, freq, amplitude=None):
    """Generate one or more sine tones.

    Args:
        sr: Sample rate in samples per second.
        duration: Tone length in seconds.
        freq: Frequency or frequencies; a number or a 1-D array-like.
        amplitude: Amplitude or amplitudes; a number or a 1-D array-like
            that broadcasts against ``freq``. Defaults to 1 for every tone.

    Returns:
        np.ndarray of shape ``(num_tones, int(sr * duration))``, one row per
        tone. A scalar ``freq`` yields a single row.

    Raises:
        ValueError: If ``freq`` and ``amplitude`` cannot be broadcast together.
    """
    # Promote scalars and lists to 1-D arrays so the supported combinations
    # (number/number, array/number, array/array) share one code path.
    freq = np.atleast_1d(np.asarray(freq))
    if amplitude is None:
        amplitude = np.ones_like(freq)
    else:
        amplitude = np.atleast_1d(np.asarray(amplitude))
    freq, amplitude = np.broadcast_arrays(freq, amplitude)

    sample_idx = np.arange(int(sr * duration))
    # Column vectors of per-tone parameters broadcast against the shared
    # sample index row, giving one row of samples per tone.
    return amplitude[:, None] * np.sin(2 * np.pi / sr * freq[:, None] * sample_idx)

In [27]:
# Sanity check: a single 1000 Hz tone of amplitude 2, 4 s long at a 10 kHz
# sample rate, should come back with shape (1, 40000).
generate_tone(10000, 4, np.array([1000]), np.array([2])).shape

(1, 40000)

# Testing:

In [7]:
from common_audio import play_audio

# Spot-check a few clips by ear: load four files by index and play each one
# back at the dataset's sample rate.
clip_idxs = [2, 100, 800, 283]
ys = voice_ds.load(clip_idxs)

for clip in ys:
    play_audio(clip, voice_ds.sr)

In [16]:
from tqdm.notebook import tqdm


# Create the bar without a total; load() resets it to the number of requested
# files and advances it by one after each file is loaded.
pbar = tqdm()
# Load the first 100 files in dataset order; the returned array is the cell output.
voice_ds.load(range(100), pbar=pbar)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

array([[-0.00160518, -0.00204003, -0.00119617, ..., -0.0021244 ,
        -0.00188904, -0.00897742],
       [ 0.00030777,  0.00022551,  0.00041451, ...,  0.00103126,
         0.00122304,  0.00102236],
       [-0.00349758,  0.00699128,  0.0138538 , ...,  0.00054207,
         0.0187066 ,  0.0249278 ],
       ...,
       [-0.00022078, -0.00031068, -0.00035698, ...,  0.0375695 ,
         0.03703025,  0.03605785],
       [-0.00112331, -0.00151507,  0.00101135, ..., -0.01408388,
         0.0261877 , -0.08615743],
       [-0.00025601, -0.00088416,  0.00025037, ..., -0.00662123,
         0.00517262, -0.00867649]], dtype=float32)

In [None]:



# Build the dataset with an 8000 Hz sample rate and 5-second clips.
# The class is named AudioDataSet (capital S); the previous spelling
# `AudioDataset` raised NameError.
# NOTE(review): 'jfdskla' looks like a placeholder path - point it at the real
# audio directory before running.
voice_ds = AudioDataSet('jfdskla', 8000, 5)

# Smoke test: load the first five files.
voice_ds.load([0,1,2,3,4])