# Speech-to-text

We will try speech-to-text (automatic speech recognition) instead of keyword spotting, because the datasets available for keyword spotting are fairly limited and do not contain the desired commands ("sit", "down", "stay", "come").

In [2]:
import torch
import librosa
from datasets import load_dataset
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

LANG_ID = "en"  # Common Voice language configuration
MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-english"  # checkpoint on the Hugging Face hub
SAMPLES = 10  # number of test clips to transcribe

# `test_dataset` is consumed further down in this cell (`test_dataset.map(...)`),
# so it must be defined here; with this line commented out the cell fails with
# a NameError on a fresh kernel.
# Caveat: in the recorded run of this cell the legacy "common_voice" builder
# started fetching a 60.6 GB archive even though only SAMPLES rows are requested,
# and it reports itself as deprecated in favour of
# "mozilla-foundation/common_voice_11_0". Expect a long first run.
test_dataset = load_dataset("common_voice", LANG_ID, split=f"test[:{SAMPLES}]")

# The processor and the CTC model are both loaded from the same checkpoint.
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)

# Preprocessing the datasets.
# We need to read the audio files as arrays
def speech_file_to_array_fn(batch):
    """Attach the decoded audio to a dataset row and upper-case its transcript.

    Args:
        batch: mapping with a ``"path"`` entry (audio file location) and a
            ``"sentence"`` entry (reference transcript string).

    Returns:
        The same ``batch`` object, modified in place: ``"speech"`` holds the
        array returned by ``librosa.load(..., sr=16_000)`` and ``"sentence"``
        is replaced by its upper-cased form.
    """
    # librosa.load also returns the sampling rate; it is not needed here
    # because the rate is fixed by the ``sr`` argument.
    waveform, _ = librosa.load(batch["path"], sr=16_000)
    batch["speech"] = waveform

    transcript = batch["sentence"]
    batch["sentence"] = transcript.upper()
    return batch

# Decode every clip into an array, then feed all clips to the processor as one
# padded batch of PyTorch tensors.
test_dataset = test_dataset.map(speech_file_to_array_fn)
inputs = processor(
    test_dataset["speech"],
    sampling_rate=16_000,
    return_tensors="pt",
    padding=True,
)

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits

# Take the arg-max over the last logits dimension and let the processor turn
# the resulting ids back into text.
predicted_ids = logits.argmax(dim=-1)
predicted_sentences = processor.batch_decode(predicted_ids)

# Print each reference transcript next to the model's prediction.
separator = "-" * 100
for idx, hypothesis in enumerate(predicted_sentences):
    print(separator)
    reference = test_dataset[idx]["sentence"]
    print("Reference:", reference)
    print("Prediction:", hypothesis)


Downloading builder script: 100%|██████████| 26.7k/26.7k [00:00<00:00, 26.8MB/s]
Downloading metadata: 100%|██████████| 174k/174k [00:00<00:00, 3.97MB/s]
Downloading readme: 100%|██████████| 62.4k/62.4k [00:00<00:00, 676kB/s]
            This version of the Common Voice dataset is deprecated.
            You can download the latest one with
            >>> load_dataset("mozilla-foundation/common_voice_11_0", "en")
            


Downloading and preparing dataset common_voice/en to C:/Users/eliot/.cache/huggingface/datasets/common_voice/en/6.1.0/220833898d6a60c50f621126e51fb22eb2dfe5244392c70dccd8e6e2f055f4bf...


Downloading data:   0%|          | 48.4M/60.6G [00:04<1:32:27, 10.9MB/s]  


KeyboardInterrupt: 

In [5]:
from huggingsound import SpeechRecognitionModel
import pyaudio
import wave
import tempfile
import os

In [37]:
CHUNK = 320  # audio frames requested per stream.read() call (20 ms at 16 kHz)
FORMAT = pyaudio.paInt16  # sample format passed to PyAudio
CHANNELS = 1  # mono audio
RATE = 16000  # sampling rate in Hz
RECORD_SECONDS = 1  # duration of each recording in seconds
FILE_NAME = "temp.wav"  # WAV file that every recording overwrites (relative to the working directory)

def record_audio():
    """Record fixed-length audio windows from the input device, forever.

    Each iteration reads ``int(RATE / CHUNK * RECORD_SECONDS)`` chunks of
    ``CHUNK`` frames from a PyAudio input stream, writes them to ``FILE_NAME``
    as a WAV file (overwriting the previous window) and yields that file name.
    Successive windows do not overlap.

    Yields:
        str: ``FILE_NAME``, after the file has been rewritten with the most
        recently recorded window.

    Notes:
        A ``KeyboardInterrupt`` raised inside the generator ends the recording
        quietly. The stream and the PyAudio instance are released in
        ``finally`` blocks, so they are also released when the consumer stops
        iterating early (``generator.close()``, ``break``, or an exception in
        the consuming loop followed by garbage collection).
    """
    # Number of stream reads that make up one recording window.
    chunks_per_window = int(RATE / CHUNK * RECORD_SECONDS)

    p = pyaudio.PyAudio()
    try:
        stream = p.open(format=FORMAT,
                        channels=CHANNELS,
                        rate=RATE,
                        input=True,
                        frames_per_buffer=CHUNK)
        try:
            while True:
                # Collect one full window of raw audio chunks.
                frames = [stream.read(CHUNK) for _ in range(chunks_per_window)]
                print(len(frames))

                # Write the window to the WAV file; the context manager closes
                # the file even if writing fails part-way.
                wav_filename = FILE_NAME
                with wave.open(wav_filename, 'wb') as wf:
                    wf.setnchannels(CHANNELS)
                    wf.setsampwidth(p.get_sample_size(FORMAT))
                    wf.setframerate(RATE)
                    wf.writeframes(b''.join(frames))

                yield wav_filename
        except KeyboardInterrupt:
            # Deliberate: Ctrl-C while recording simply stops the generator.
            pass
        finally:
            stream.stop_stream()
            stream.close()
    finally:
        p.terminate()

In [None]:
# Load the huggingsound wrapper for the same checkpoint id that the first cell
# stores in MODEL_ID. This rebinds `model`, which the first cell assigned to a
# Wav2Vec2ForCTC instance.
model = SpeechRecognitionModel(
    "jonatasgrosman/wav2vec2-large-xlsr-53-english"
)

In [38]:
# Transcribe each recorded window as it becomes available. Note that
# `wav_data` is the path of the WAV file yielded by record_audio(), not the
# audio bytes themselves.
recorder = record_audio()
try:
    for wav_data in recorder:
        print(model.transcribe([wav_data]))
finally:
    # Close the generator explicitly so it is finalised right away when the
    # loop is interrupted, instead of whenever it gets garbage-collected.
    recorder.close()

50


100%|██████████| 1/1 [00:00<00:00,  3.92it/s]


[{'transcription': 'th e e e', 'start_timestamps': [0, 20, 40, 60, 80, 140, 160, 320], 'end_timestamps': [20, 40, 60, 80, 120, 160, 280, 340], 'probabilities': [0.16704869270324707, 0.24431806802749634, 0.2124597579240799, 0.1689237654209137, 0.15427014231681824, 0.1589607447385788, 0.1805240958929062, 0.15466219186782837]}]
50


100%|██████████| 1/1 [00:00<00:00,  3.90it/s]


[{'transcription': 'e', 'start_timestamps': [280], 'end_timestamps': [300], 'probabilities': [0.3504197895526886]}]
50


100%|██████████| 1/1 [00:00<00:00,  4.13it/s]


[{'transcription': 'he e  e e e', 'start_timestamps': [0, 20, 40, 100, 160, 200, 280, 340, 460, 520, 580], 'end_timestamps': [20, 40, 60, 120, 180, 220, 300, 360, 480, 540, 600], 'probabilities': [0.46855443716049194, 0.3873225152492523, 0.28562167286872864, 0.2563060522079468, 0.1692150980234146, 0.19211958348751068, 0.3819015920162201, 0.3070848286151886, 0.2072964310646057, 0.22003218531608582, 0.24269701540470123]}]
50


100%|██████████| 1/1 [00:00<00:00,  4.24it/s]


[{'transcription': 'e e e    e', 'start_timestamps': [20, 40, 80, 180, 280, 360, 460, 640, 680, 740], 'end_timestamps': [40, 60, 100, 220, 300, 420, 480, 660, 700, 760], 'probabilities': [0.16614550352096558, 0.16085714101791382, 0.158889040350914, 0.21185928583145142, 0.21634039282798767, 0.19379360973834991, 0.2384834587574005, 0.2526828348636627, 0.23227396607398987, 0.21912403404712677]}]
50


100%|██████████| 1/1 [00:00<00:00,  4.15it/s]


[{'transcription': 'he  te', 'start_timestamps': [0, 20, 40, 180, 240, 280], 'end_timestamps': [20, 40, 100, 220, 260, 300], 'probabilities': [0.125918909907341, 0.20426006615161896, 0.18182748556137085, 0.21807429194450378, 0.17109593749046326, 0.37004363536834717]}]
50


100%|██████████| 1/1 [00:00<00:00,  4.12it/s]


[{'transcription': 'the e te e e  ee', 'start_timestamps': [0, 20, 40, 60, 120, 160, 240, 280, 340, 440, 460, 580, 620, 760, 820, 860], 'end_timestamps': [20, 40, 60, 100, 140, 220, 260, 300, 380, 460, 500, 600, 660, 780, 840, 880], 'probabilities': [0.17368084192276, 0.2302727997303009, 0.27368634939193726, 0.21767188608646393, 0.17029349505901337, 0.20116424560546875, 0.212190642952919, 0.3703060448169708, 0.24967092275619507, 0.15805868804454803, 0.2164023369550705, 0.26077836751937866, 0.26555484533309937, 0.24262267351150513, 0.2391595095396042, 0.19524741172790527]}]
50


100%|██████████| 1/1 [00:00<00:00,  4.16it/s]


[{'transcription': '', 'start_timestamps': None, 'end_timestamps': None, 'probabilities': None}]
50


100%|██████████| 1/1 [00:00<00:00,  4.06it/s]


[{'transcription': '', 'start_timestamps': None, 'end_timestamps': None, 'probabilities': None}]


In [8]:
wav_data

'temp.wav'