# Speech Digit Dataset

This notebook shows you how to prepare a synthetic dataset of speech data for digits 0-9.

This project is motivated by the desire to create a dataset similar to MNIST but for speech data.

## Installation

Python==3.10
piper-tts==1.2.0



In [3]:

voices = {
    "amy": {
        "model": "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US/amy/medium/en_US-amy-medium.onnx?download=true",
        "config": "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US/amy/medium/en_US-amy-medium.onnx.json?download=true.json"
    },
    "arctic": {
        "model": "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US/arctic/medium/en_US-arctic-medium.onnx?download=true",
        "config": "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US/arctic/medium/en_US-arctic-medium.onnx.json?download=true.json"
    },
    "bryce": {
        "model": "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US/bryce/medium/en_US-bryce-medium.onnx?download=true",
        "config": "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US/bryce/medium/en_US-bryce-medium.onnx.json?download=true.json"
    },
    "danny": {
        "model": "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US/danny/low/en_US-danny-low.onnx?download=true",
        "config": "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US/danny/low/en_US-danny-low.onnx.json?download=true.json"
    },
    "hfc_female": {
        "model": "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US/hfc_female/medium/en_US-hfc_female-medium.onnx?download=true",
        "config": "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US/hfc_female/medium/en_US-hfc_female-medium.onnx.json?download=true.json"
    },
    "hfc_male": {
        "model": "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US/hfc_male/medium/en_US-hfc_male-medium.onnx?download=true",
        "config": "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US/hfc_male/medium/en_US-hfc_male-medium.onnx.json?download=true.json"
    },
    "joe": {
        "model": "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US/joe/medium/en_US-joe-medium.onnx?download=true",
        "config": "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US/joe/medium/en_US-joe-medium.onnx.json?download=true.json"
    },
    "john": {
        "model": "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US/john/medium/en_US-john-medium.onnx?download=true",
        "config": "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US/john/medium/en_US-john-medium.onnx.json?download=true.json"
    },
    "kathleen": {
        "model": "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US/kathleen/low/en_US-kathleen-low.onnx?download=true",
        "config": "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US/kathleen/low/en_US-kathleen-low.onnx.json?download=true.json"
    },
    "kristin": {
        "model": "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US/kristin/medium/en_US-kristin-medium.onnx?download=true",
        "config": "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US/kristin/medium/en_US-kristin-medium.onnx.json?download=true.json"
    },
    "kusal": {
        "model": "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US/kusal/medium/en_US-kusal-medium.onnx?download=true",
        "config": "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US/kusal/medium/en_US-kusal-medium.onnx.json?download=true.json"
    },
    "l2arctic": {
        "model": "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US/l2arctic/medium/en_US-l2arctic-medium.onnx?download=true",
        "config": "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US/l2arctic/medium/en_US-l2arctic-medium.onnx.json?download=true.json"
    },
    "lessac": {
        "model": "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US/lessac/medium/en_US-lessac-medium.onnx?download=true",
        "config": "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US/lessac/medium/en_US-lessac-medium.onnx.json?download=true.json"
    },
    "libritts": {
        "model": "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US/libritts/high/en_US-libritts-high.onnx?download=true",
        "config": "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US/libritts/high/en_US-libritts-high.onnx.json?download=true.json"
    },
    "libritts_r": {
        "model": "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US/libritts_r/medium/en_US-libritts_r-medium.onnx?download=true",
        "config": "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US/libritts_r/medium/en_US-libritts_r-medium.onnx.json?download=true.json"
    },
    "ljspeech": {
        "model": "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US/ljspeech/medium/en_US-ljspeech-medium.onnx?download=true",
        "config": "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US/ljspeech/medium/en_US-ljspeech-medium.onnx.json?download=true.json"
    },
    "norman": {
        "model": "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US/norman/medium/en_US-norman-medium.onnx?download=true",
        "config": "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US/norman/medium/en_US-norman-medium.onnx.json?download=true.json"
    },
    "ryan": {
        "model": "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US/ryan/medium/en_US-ryan-medium.onnx?download=true",
        "config": "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US/ryan/medium/en_US-ryan-medium.onnx.json?download=true.json"
    }
}



In [None]:

# download voice models

import os
import subprocess

for voice, urls in voices.items():
    # Create directory for the voice
    voice_dir = os.path.join('voices', voice)
    os.makedirs(voice_dir, exist_ok=True)
    print(voice_dir)
    
    
    # Download model file
    model_url = urls['model']
    model_filename = os.path.join(voice_dir, f'{voice}_model.onnx')
    subprocess.run(['wget', '-O', model_filename, model_url], check=True)
    
    # Download config file
    config_url = urls['config']
    config_filename = os.path.join(voice_dir, f'{voice}_config.json')
    subprocess.run(['wget', '-O', config_filename, config_url], check=True)
    
    print(f"Downloaded files for {voice}")
    

In [8]:
# playing with some data augmentation
import numpy as np
import librosa

def noise(data, noise_amt=0.035):
    noise_amp = noise_amt*np.random.uniform()*np.amax(data)
    data = data + noise_amp * np.random.normal(size=data.shape[0])
    return data

def stretch(data, rate=0.8):
    return librosa.effects.time_stretch(data, rate=rate)

def shift(data):
    shift_range = int(np.random.uniform(low=-5, high = 5) * 1000)
    return np.roll(data, shift_range)


def pitch(data, sampling_rate, n_steps=2):
    return librosa.effects.pitch_shift(data, sr=sampling_rate, n_steps=n_steps)

In [62]:
import os
import subprocess
import librosa
import numpy as np
import soundfile as sf

# select a voice
for voice, urls in voices.items():

    voice_dir = os.path.join('voices', voice)
    #print(voice_dir)

    # Download model file
    model_url = urls['model']
    model_filename = os.path.join(voice_dir, f'{voice}_model.onnx')
    
    # Download config file
    config_url = urls['config']
    config_filename = os.path.join(voice_dir, f'{voice}_config.json')

    #print(model_filename)
    #print(voice)

    # use that voice to create audio for digits 0-10
    for digit in tqdm(range(10)):

        # Create directory for the voice
        digit_dir = os.path.join('speech_digits', str(digit))
        #print(digit_dir)
        os.makedirs(digit_dir, exist_ok=True)
        #print(voice_dir)
    
        output_path = f'speech_digits/{str(digit)}/{voice}_{str(digit)}_original.wav'
        piper_cmd = ['piper', '--model', model_filename, '--config', config_filename, '--output_file', output_path]
        #print(piper_cmd)

        subprocess.run(piper_cmd, input=str(digit), text=True, capture_output=True, check=True)

        # load up that voice
        audio, sample_rate = librosa.load(output_path)

        for i in tqdm(range(1000 // 4)):
            # add some random noise
            noise_data = noise(audio, np.random.uniform(0.001, 0.01))

            stretch_data = stretch(audio, rate=np.random.uniform(0.1, 0.9))

            shift_data = shift(audio)

            pitch_data = pitch(audio, sample_rate, n_steps=np.random.randint(-2, 2))

            sf.write(f'speech_digits/{str(digit)}/{voice}_{str(digit)}_noise_{i}.wav', noise_data, sample_rate)
            sf.write(f'speech_digits/{str(digit)}/{voice}_{str(digit)}_stretch_{i}.wav', stretch_data, sample_rate)
            sf.write(f'speech_digits/{str(digit)}/{voice}_{str(digit)}_shift_{i}.wav', shift_data, sample_rate)
            sf.write(f'speech_digits/{str(digit)}/{voice}_{str(digit)}_pitch_{i}.wav', pitch_data, sample_rate)

100%|██████████| 250/250 [00:10<00:00, 23.25it/s]
100%|██████████| 250/250 [00:07<00:00, 32.22it/s]
100%|██████████| 250/250 [00:08<00:00, 29.94it/s]
100%|██████████| 250/250 [00:08<00:00, 30.45it/s]
100%|██████████| 250/250 [00:09<00:00, 27.05it/s]
100%|██████████| 250/250 [00:09<00:00, 25.18it/s]
100%|██████████| 250/250 [00:08<00:00, 29.73it/s]
100%|██████████| 250/250 [00:10<00:00, 24.41it/s]
100%|██████████| 250/250 [00:09<00:00, 27.49it/s]
100%|██████████| 250/250 [00:09<00:00, 26.81it/s]
100%|██████████| 10/10 [01:46<00:00, 10.63s/it]
100%|██████████| 250/250 [00:08<00:00, 30.79it/s]
100%|██████████| 250/250 [00:08<00:00, 30.02it/s]
100%|██████████| 250/250 [00:05<00:00, 44.10it/s]
100%|██████████| 250/250 [00:08<00:00, 28.34it/s]
100%|██████████| 250/250 [00:09<00:00, 27.56it/s]
100%|██████████| 250/250 [00:08<00:00, 30.88it/s]
100%|██████████| 250/250 [00:08<00:00, 29.67it/s]
100%|██████████| 250/250 [00:10<00:00, 24.86it/s]
100%|██████████| 250/250 [00:05<00:00, 42.12it/s]
10

KeyboardInterrupt: 

In [36]:
# get all of the fies in speech_digits with glob
import glob

files = glob.glob('speech_digits/*/*')

print(len(files))
print(files)

900
['speech_digits/4/john_4_shift.wav', 'speech_digits/4/john_4_stretch.wav', 'speech_digits/4/norman_4_pitch.wav', 'speech_digits/4/norman_4_noise.wav', 'speech_digits/4/kathleen_4_noise.wav', 'speech_digits/4/libritts_r_4_stretch.wav', 'speech_digits/4/ljspeech_4_stretch.wav', 'speech_digits/4/bryce_4_noise.wav', 'speech_digits/4/hfc_male_4_noise.wav', 'speech_digits/4/ryan_4_stretch.wav', 'speech_digits/4/kusal_4_stretch.wav', 'speech_digits/4/libritts_r_4_shift.wav', 'speech_digits/4/hfc_female_4_shift.wav', 'speech_digits/4/danny_4_stretch.wav', 'speech_digits/4/joe_4_shift.wav', 'speech_digits/4/danny_4_shift.wav', 'speech_digits/4/kathleen_4_stretch.wav', 'speech_digits/4/danny_4_original.wav', 'speech_digits/4/lessac_4_shift.wav', 'speech_digits/4/norman_4_original.wav', 'speech_digits/4/hfc_male_4_stretch.wav', 'speech_digits/4/bryce_4_shift.wav', 'speech_digits/4/bryce_4_original.wav', 'speech_digits/4/john_4_noise.wav', 'speech_digits/4/lessac_4_pitch.wav', 'speech_digits/4

In [None]:
# display spectogramand audio player for all files in speech_digits/0
import librosa
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import Audio, display


# Define the directory for digit 0
digit_dir = 'speech_digits/0'

# Get all files in the digit directory
files = glob.glob(os.path.join(digit_dir, '*.wav'))

# Display spectogram and audio player for each file
for file in files:
    # Load the audio file
    y, sr = librosa.load(file, sr=None)
    
    # Display the spectogram
    plt.figure(figsize=(10, 4))
    librosa.display.waveshow(y, sr=sr)
    plt.title(f'Waveform for {os.path.basename(file)}')
    plt.show()
    
    # Display the spectogram
    plt.figure(figsize=(10, 4))
    D = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)
    
    plt.figure(figsize=(10, 4))
    librosa.display.specshow(D, sr=sr, x_axis='time', y_axis='log')
    plt.colorbar(format='%+2.0f dB')
    plt.title(f'Spectrogram for {os.path.basename(file)}')
    plt.show()


    display(Audio(file))




In [58]:
import glob
import torch as t
import librosa
from torch.utils.data import Dataset, DataLoader

files = glob.glob('speech_digits/*/*')
print(f"Total files: {len(files)}")

audio_data = []
labels = []
longest_tensor_size = 0

# First pass: load data and find longest tensor
for f in files:
    try:
        audio, sample_rate = librosa.load(f)
        if len(audio) == 0:
            print(f"Warning: Empty audio file: {f}")
            continue
        audio_tensor = t.from_numpy(audio).float()
        audio_data.append(audio_tensor)
        labels.append(int(f.split('/')[-2]))  # Adjust this based on your file structure
        longest_tensor_size = max(longest_tensor_size, audio_tensor.size(0))
    except Exception as e:
        print(f"Error processing file {f}: {str(e)}")

print(f"Longest tensor size: {longest_tensor_size}")
print(f"Number of valid audio files: {len(audio_data)}")

# Second pass: pad tensors
padded_audio_data = []
for audio_tensor in audio_data:
    current_size = audio_tensor.size(0)
    pad_size = longest_tensor_size - current_size
    left_pad = pad_size // 2
    right_pad = pad_size - left_pad
    
    padded_tensor = t.nn.functional.pad(audio_tensor, (left_pad, right_pad))
    padded_audio_data.append(padded_tensor)

# Convert labels to tensor
label_tensor = t.tensor(labels)

class AudioDataset(Dataset):
    def __init__(self, audio_data, labels):
        self.audio_data = audio_data
        self.labels = labels

    def __len__(self):
        return len(self.audio_data)

    def __getitem__(self, idx):
        return self.audio_data[idx], self.labels[idx]

# Create datasets
full_dataset = AudioDataset(padded_audio_data, label_tensor)
train_size = int(0.8 * len(full_dataset))
test_size = len(full_dataset) - train_size
train_dataset, test_dataset = t.utils.data.random_split(full_dataset, [train_size, test_size])

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True)

print(f"Number of training batches: {len(train_loader)}")
print(f"Number of test batches: {len(test_loader)}")


Total files: 900
Longest tensor size: 309182
Number of valid audio files: 900
Number of training batches: 23
Number of test batches: 6


In [59]:
# create our model, a simple MLP

model = t.nn.Sequential(
    t.nn.Flatten(),
    t.nn.Linear(longest_tensor_size, 512),
    t.nn.ReLU(),
    t.nn.Linear(512, 512),
    t.nn.ReLU(),
    t.nn.Linear(512, 10),
    t.nn.Softmax(dim=1)
)

In [60]:
# train our model
from tqdm import tqdm

device = t.device('cuda' if t.cuda.is_available() else 'cpu')

model.to(device)

loss_fn = t.nn.CrossEntropyLoss()
optimizer = t.optim.Adam(model.parameters(), lr=0.001)

epochs = 10

for epoch in tqdm(range(epochs)):
    model.train()
    for batch in train_loader:
        inputs, targets = batch
        inputs, targets = inputs.to(device), targets.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")

# evaluate our model

model.eval()

correct = 0
total = 0

with t.no_grad():
    for batch in test_loader:
        inputs, targets = batch
        inputs, targets = inputs.to(device), targets.to(device)

        outputs = model(inputs)
        _, predicted = t.max(outputs.data, 1)
        total += targets.size(0)
        correct += (predicted == targets).sum().item()

print(f"Accuracy of the model on the test set: {100 * correct / total}%")



 10%|█         | 1/10 [00:24<03:42, 24.78s/it]

Epoch 1/10, Loss: 2.2205870151519775


 20%|██        | 2/10 [00:49<03:15, 24.49s/it]

Epoch 2/10, Loss: 1.6150628328323364


 30%|███       | 3/10 [01:13<02:50, 24.42s/it]

Epoch 3/10, Loss: 1.6584043502807617


 40%|████      | 4/10 [01:37<02:26, 24.41s/it]

Epoch 4/10, Loss: 1.4612865447998047


 50%|█████     | 5/10 [02:02<02:02, 24.42s/it]

Epoch 5/10, Loss: 1.4622478485107422


 60%|██████    | 6/10 [02:26<01:37, 24.41s/it]

Epoch 6/10, Loss: 1.462381362915039


 70%|███████   | 7/10 [02:51<01:13, 24.42s/it]

Epoch 7/10, Loss: 1.4614914655685425


 80%|████████  | 8/10 [03:15<00:48, 24.42s/it]

Epoch 8/10, Loss: 1.4613432884216309


 90%|█████████ | 9/10 [03:39<00:24, 24.40s/it]

Epoch 9/10, Loss: 1.5236897468566895


100%|██████████| 10/10 [04:04<00:00, 24.43s/it]

Epoch 10/10, Loss: 1.5237348079681396





Accuracy of the model on the test set: 47.77777777777778%
