## Loading data

https://www.kaggle.com/competitions/tensorflow-speech-recognition-challenge

In [1]:
import os 

import librosa
from librosa.feature import melspectrogram
from librosa.display import specshow
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# Root folder of the raw data; the split lists are read from here and the
# per-label audio folders are read from its `audio` sub-folder.
raw_data_base_path = 'data/raw'
audio_base_path = f'{raw_data_base_path}/audio'

In [3]:
# Read the split lists into sets of whitespace-stripped lines. They are later
# matched against '<label>/<filename>' names, so sets keep those lookups cheap.
with open(f'{raw_data_base_path}/validation_list.txt', 'r') as file:
    validation_list = set(map(str.strip, file))

with open(f'{raw_data_base_path}/testing_list.txt', 'r') as file:
    testing_list = set(map(str.strip, file))

In [6]:
def generate_mel_spectogram(signal, sample_rate, folder: str, name: str):
    """
    Render the mel-spectrogram of an audio signal and save it as a PNG image.

    The image is written to ``data/processed/{folder}/{name without extension}.png``;
    missing parent directories are created.

    signal - audio samples, passed to librosa's ``melspectrogram`` as ``y``
    sample_rate - sampling rate of ``signal``
    folder - sub-directory of ``data/processed``, e.g. "train/known"
    name - source file name, optionally with a sub-directory,
           e.g. "bed/00f0204f_nohash_0.wav"
    """
    mel_spectrogram = melspectrogram(y=signal, sr=sample_rate)
    mel_spectrogram_abs = np.abs(mel_spectrogram)
    # ref=np.max: decibels are measured relative to the strongest bin of this clip
    mel_spectrogram_db = librosa.power_to_db(mel_spectrogram_abs, ref=np.max)

    # The Figure class is instantiated directly (instead of calling plt.figure),
    # so the figure is not registered with pyplot and does not have to be closed.
    fig = plt.Figure(figsize=(8, 7))
    ax = fig.add_subplot(1, 1, 1)
    specshow(
        mel_spectrogram_db, sr=sample_rate, x_axis='time', y_axis='mel',
        cmap='magma', ax=ax,
    )
    # The saved image should contain only the spectrogram, no axes or labels
    ax.axis('off')

    # splitext drops whatever extension the source file has, not just a
    # three-letter one such as ".wav"
    stem, _ = os.path.splitext(name)
    save_path = f'data/processed/{folder}/{stem}.png'
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    fig.savefig(save_path, pad_inches=-0.1, bbox_inches='tight')

# Generate mel-spectrograms for known classes

In [7]:
# Split names. The list positions (0 = train, 1 = valid, 2 = test) are used as
# dataset ids by the processing loops below.
datasets = ['train', 'valid', 'test']

In [36]:
folders_known = ['up', 'down', 'left', 'right', 'on', 'off', 'yes', 'no']
# Running totals over all known classes, indexed like `datasets` (train/valid/test)
known_counts = [0, 0, 0]

for label in folders_known:
    print(f'Processing {label} class...')
    for wav_name in os.listdir(f'{audio_base_path}/{label}'):
        relative_name = f'{label}/{wav_name}'
        waveform, sample_rate = librosa.load(f'{audio_base_path}/{relative_name}')

        # Files named in the validation/testing lists go to those splits,
        # everything else is training data.
        dataset_id = (
            1 if relative_name in validation_list
            else 2 if relative_name in testing_list
            else 0
        )

        known_counts[dataset_id] += 1
        generate_mel_spectogram(
            waveform, sample_rate, f'{datasets[dataset_id]}/known', relative_name
        )

# Generate mel-spectrograms for unknown class

In [37]:
# Known-class sample counts per split (train, valid, test)
known_counts

[14792, 2071, 2067]

In [40]:
# 30 is the total number of word classes; every word that is not one of the
# known commands contributes to the merged "unknown" class.
total_word_classes = 30
unknown_classes_count = total_word_classes - len(folders_known)

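We want the unknown class to be varied, so we take the same number of samples from each unknown word. The per-word quota is the known-class count of each split divided by the number of unknown words, so the merged unknown class ends up roughly as large as all known classes combined.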

In [41]:
# Per-split quota for each unknown word: the known-class total of that split
# divided (rounding down) by the number of unknown words.
subset_size_per_class = [c // unknown_classes_count for c in known_counts]
subset_size_per_class

[672, 94, 93]

In [47]:
rng = np.random.default_rng(28)

# os.listdir returns entries in arbitrary order. Both the labels and the files
# are sorted first so that the seeded shuffle selects the same files on every
# machine and on every run.
for label in sorted(os.listdir(audio_base_path)):
    if label in folders_known or label == '_background_noise_':
        continue

    path = f'{audio_base_path}/{label}'
    if not os.path.isdir(path):
        continue

    files = sorted(os.listdir(path))
    # randomise files order
    rng.shuffle(files)
    # track the number of files we processed for this class
    curr_counts = [0, 0, 0]

    print(f'Processing {label} class...')
    for filename in files:
        # every split has reached its quota - nothing left to do for this class
        if all(done >= quota for done, quota in zip(curr_counts, subset_size_per_class)):
            break

        full_name = f'{label}/{filename}'

        # check which subset
        if full_name in validation_list:
            dataset_id = 1
        elif full_name in testing_list:
            dataset_id = 2
        else:
            dataset_id = 0

        # if we have enough of this class in this dataset then move on to the next file
        # (checked before loading, so skipped files are never decoded)
        if curr_counts[dataset_id] >= subset_size_per_class[dataset_id]:
            continue

        audio_signal, sample_rate = librosa.load(f'{audio_base_path}/{full_name}')

        folder = datasets[dataset_id]
        curr_counts[dataset_id] += 1
        # here we merge class label with the filename
        generate_mel_spectogram(audio_signal, sample_rate, f'{folder}/unknown', full_name.replace('/', ''))

Processing marvin class...
Processing six class...
Processing sheila class...
Processing four class...
Processing go class...
Processing stop class...
Processing five class...
Processing seven class...
Processing bird class...
Processing bed class...
Processing nine class...
Processing two class...
Processing dog class...
Processing happy class...
Processing tree class...
Processing eight class...
Processing cat class...
Processing wow class...
Processing house class...
Processing three class...
Processing zero class...
Processing one class...


# Generate mel-spectrograms for silence class

We want the silence samples to have the same length as the other audio clips, so each background-noise recording is cut into consecutive clips of that length. The last piece of a recording is usually too short; to keep all clips the same duration, we pad it at the end with silence (zeros).

In [65]:
# Use one sample clip as the reference length (in samples) for the silence clips.
# NOTE(review): assumes this clip has the typical length of the dataset - verify.
signal, _ = librosa.load(f'{audio_base_path}/bed/00f0204f_nohash_0.wav')
audio_len = len(signal)

In [96]:
# Share of all known-class samples that fall into the validation split (index 1)
valid_frac = known_counts[1] / np.sum(known_counts)
valid_frac

0.10940306391970417

In [100]:
# Silence clips per split, indexed like `datasets`: train, valid, test
silence_signals = [[], [], []]
bg_noise_path = f'{audio_base_path}/_background_noise_'

rng = np.random.default_rng(26)

# os.listdir returns names in arbitrary order, and the order in which the
# recordings are visited decides which random draws each shuffle receives.
# Sorting makes the seeded split repeatable.
for filename in sorted(os.listdir(bg_noise_path)):
    if not filename.endswith('.wav'):
        continue

    full_silence_signal, sample_rate = librosa.load(f'{bg_noise_path}/{filename}')
    # cut the recording into consecutive clips of audio_len samples
    silence_segments = []
    for boundary in range(0, len(full_silence_signal), audio_len):
        silence = full_silence_signal[boundary:boundary + audio_len]

        silence_len = len(silence)
        if silence_len < audio_len:
            # the last clip of a recording may be short: fill with 0s at the end
            silence = np.pad(silence, (0, audio_len - silence_len), mode='constant')

        silence_segments.append(silence)
    # add new samples to the respective datasets
    rng.shuffle(silence_segments)
    # the same number of clips is used for the validation and for the test split
    valid_size = int(valid_frac * len(silence_segments))
    # valid
    silence_signals[1].extend(silence_segments[:valid_size])
    # test
    silence_signals[2].extend(silence_segments[valid_size:2*valid_size])
    # train
    silence_signals[0].extend(silence_segments[2*valid_size:])

In [102]:
# Number of silence clips per split (train, valid, test)
[len(s) for s in silence_signals]

[322, 40, 40]

In [104]:
# Known-class sample counts per split (train, valid, test)
known_counts

[14792, 2071, 2067]

# Generate a small subset of data for testing purposes

In [8]:
import shutil

In [9]:
def generate_subset(size: int):
    """
    Copy a small sample of the processed 'yes' and 'no' images to data/subset.

    For every split in `datasets` and each of the two labels, the first `size`
    files (in sorted name order) of ``data/processed/{split}/known/{label}`` are
    copied to ``data/subset/{split}/known/{label}``. Fewer files are copied when
    the source folder holds fewer than `size`. The target folder is created even
    when nothing is copied.

    size - maximum number of files to copy per label and split; 0 copies nothing

    Raises ValueError if `size` is negative.
    """
    if size < 0:
        raise ValueError(f'size must be non-negative, got {size}')

    for label in ['yes', 'no']:
        for dataset in datasets:
            processed = f'data/processed/{dataset}/known/{label}'
            subset = f'data/subset/{dataset}/known/{label}'
            os.makedirs(subset, exist_ok=True)
            # sorted(): os.listdir order is arbitrary; sorting makes the choice
            # of files repeatable. Slicing also handles size == 0 correctly.
            for filename in sorted(os.listdir(processed))[:size]:
                shutil.copy(f'{processed}/{filename}', f'{subset}/{filename}')

In [10]:
# Smallest useful subset: one image per label and split
generate_subset(1)