In [13]:
from dotenv import load_dotenv
load_dotenv()
import os
import azure.cognitiveservices.speech as speechsdk
import time

RECOGNIZED: SpeechRecognitionEventArgs(session_id=bd6290c5730a48eeb8805452242b3613, result=SpeechRecognitionResult(result_id=573fe51bb27047e48c142213fc2bd6f7, text="I see now that the circumstances of 1's birth are irrelevant. It is what you do with the gift of life that determines who you are.", reason=ResultReason.RecognizedSpeech))
CANCELED SpeechRecognitionCanceledEventArgs(session_id=bd6290c5730a48eeb8805452242b3613, result=SpeechRecognitionResult(result_id=e729158cdf0a484099929cdbd337a2f0, text="", reason=ResultReason.Canceled))
CLOSING on SpeechRecognitionCanceledEventArgs(session_id=bd6290c5730a48eeb8805452242b3613, result=SpeechRecognitionResult(result_id=e729158cdf0a484099929cdbd337a2f0, text="", reason=ResultReason.Canceled))
SESSION STOPPED SessionEventArgs(session_id=bd6290c5730a48eeb8805452242b3613)
CLOSING on SessionEventArgs(session_id=bd6290c5730a48eeb8805452242b3613)


# Speech Recognition
This notebook aims to test how speech recognition can be achieved using Azure Speech Services. [It uses code from this article.](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/how-to-recognize-speech?pivots=programming-language-python)

In [5]:
def check_result_from_azure(speech_recognition_result):
    """
    Having received a result from Azure, check whether it worked or not
    """
    if speech_recognition_result.reason == speechsdk.ResultReason.RecognizedSpeech:
        return speech_recognition_result.text, 200
   
    elif speech_recognition_result.reason == speechsdk.ResultReason.NoMatch:
        e = "No speech could be recognized: {}.".format(speech_recognition_result.no_match_details)
        return e, 400
   
    elif speech_recognition_result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = speech_recognition_result.cancellation_details
        e = "Speech Recognition canceled: {}.".format(cancellation_details.reason)

        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            extra_details = f" Error details: {cancellation_details.error_details}. Did you set the speech resource key and endpoint values?"
        else:
            extra_details = ""

        return e + extra_details, 400

def transcribe_file(filename: str, language: str="en-GB"):
    speech_key = os.getenv("SPEECH_KEY")
    service_region = "uksouth"

    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    audio_config = speechsdk.audio.AudioConfig(filename=filename)
    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config) 
    speech_config.speech_recognition_language=language 
    print(speech_config.speech_recognition_language)
    speech_recognition_result = speech_recognizer.recognize_once_async().get()

    return check_result_from_azure(speech_recognition_result)

In [6]:
output, status_code = transcribe_file(r"C:\Users\Beth\Documents\Bupa\Real_Time_STT\audio\mewtwo.wav")
if status_code == 200:
    print(output)

en-GB
The human sacrificed himself to save the Pokémon. I pitted them against each other, but not until they set aside their differences did I see the true power they all shared deep inside.


In [7]:
output, status_code = transcribe_file(r"C:\Users\Beth\Documents\Bupa\Real_Time_STT\audio\deutsches_audio.wav", "de-DE")
if status_code == 200:
    print(output)

de-DE
And you guested. I want to feel a flush linger console goodbye. Garnish them.


The issue with the above code is that it only lasts for a short period of time: "The previous examples use single-shot recognition, which recognizes a single utterance. The end of a single utterance is determined by listening for silence at the end or until a maximum of 15 seconds of audio is processed". This is for both files and mic recordings. Therefore, I looked into the continuous transcription options:

In [45]:
def speech_recognize_continuous_from_file(filename: str, language:str="en-GB"):
    """
    performs continuous speech recognition with input from an audio file
    """
    speech_config = speechsdk.SpeechConfig(subscription=os.getenv("SPEECH_KEY"), endpoint="https://uksouth.api.cognitive.microsoft.com")
    speech_config.speech_recognition_language=language 
    audio_config = speechsdk.audio.AudioConfig(filename=filename)
    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)

    done = False

    def stop_cb(evt: speechsdk.SessionEventArgs):
        """callback that signals to stop continuous recognition upon receiving an event `evt`"""
        print("CLOSING on {}".format(evt))
        nonlocal done
        done = True

    final_transcription = []

    def text_recognized(evt):
        final_transcription.append(evt.result.text)

    # collect recognized text
    speech_recognizer.recognized.connect(text_recognized)

    # Connect callbacks to the events fired by the speech recognizer
    speech_recognizer.session_started.connect(lambda evt: print("SESSION STARTED: {}".format(evt)))
    speech_recognizer.session_stopped.connect(lambda evt: print("SESSION STOPPED {}".format(evt)))
    speech_recognizer.canceled.connect(lambda evt: print("CANCELED {}".format(evt)))
    # Stop continuous recognition on either session stopped or canceled events
    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)

    # Start continuous speech recognition
    speech_recognizer.start_continuous_recognition()
    while not done:
        time.sleep(0.5)

    speech_recognizer.stop_continuous_recognition()    

    return " ".join(final_transcription)


def speech_recognize_continuous_async_from_microphone(language: str="en-GB"):
    """performs continuous speech recognition asynchronously with input from microphone"""
    speech_config = speechsdk.SpeechConfig(subscription=os.getenv("SPEECH_KEY"), endpoint="https://uksouth.api.cognitive.microsoft.com")
    speech_config.speech_recognition_language=language 
    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config)

    done = False

    def recognizing_cb(evt: speechsdk.SpeechRecognitionEventArgs):
        print("RECOGNIZING: {}".format(evt))

    final_transcription = []
    def text_recognized(evt):
        final_transcription.append(evt.result.text)

    def recognized_cb(evt: speechsdk.SpeechRecognitionEventArgs):
        print("RECOGNIZED: {}".format(evt))
        text_recognized(evt)       
    
    def stop_cb(evt: speechsdk.SessionEventArgs):
        """callback that signals to stop continuous recognition"""
        print("CLOSING on {}".format(evt))
        nonlocal done
        done = True

    # Connect callbacks to the events fired by the speech recognizer
    #speech_recognizer.recognizing.connect(recognizing_cb)
    speech_recognizer.recognized.connect(recognized_cb)
    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)

    # Perform recognition. `start_continuous_recognition_async asynchronously initiates continuous recognition operation,
    # Other tasks can be performed on this thread while recognition starts...
    # wait on result_future.get() to know when initialization is done.
    # Call stop_continuous_recognition_async() to stop recognition.
    result_future = speech_recognizer.start_continuous_recognition_async()

    result_future.get()  # wait for voidfuture, so we know engine initialization is done.
    print("Continuous Recognition is now running, say something.")

    while not done:
        # No real sample parallel work to do on this thread, so just wait for user to type stop.
        # Can't exit function or speech_recognizer will go out of scope and be destroyed while running.
        print('type "stop" then enter when done')
        stop = input()
        if stop.lower() == "stop":
            print("Stopping async recognition.")
            speech_recognizer.stop_continuous_recognition_async()
            break

    print("recognition stopped, main thread can exit now.")

    return " ".join(final_transcription)

In [42]:
transcription = speech_recognize_continuous_from_file(r"C:\Users\Beth\Documents\Bupa\Real_Time_STT\audio\converted.wav")

transcription

SESSION STARTED: SessionEventArgs(session_id=30a47fa795e041ffbe8f6e41de4a718d)
CANCELED SpeechRecognitionCanceledEventArgs(session_id=30a47fa795e041ffbe8f6e41de4a718d, result=SpeechRecognitionResult(result_id=7e8393923de146958041d5b72da1a3e6, text="", reason=ResultReason.Canceled))
CLOSING on SpeechRecognitionCanceledEventArgs(session_id=30a47fa795e041ffbe8f6e41de4a718d, result=SpeechRecognitionResult(result_id=7e8393923de146958041d5b72da1a3e6, text="", reason=ResultReason.Canceled))
SESSION STOPPED SessionEventArgs(session_id=30a47fa795e041ffbe8f6e41de4a718d)
CLOSING on SessionEventArgs(session_id=30a47fa795e041ffbe8f6e41de4a718d)


"According to all known laws of aviation, there is no way a bee should be able to fly. Its wings are too small to get its fat little body off the ground. The bee, of course, flies anyway. Because bees don't care what humans think is impossible. Yellow. Black. Yellow. Black. Yellow, Black. Yellow, Black. Oh, black and yellow. Let's shake it up a little. Barry. Breakfast is ready. Coming. Hang on a second. Hello, Barry. Adam, Can you believe this is happening? I can't. I'll pick you up. Looking sharp. As your father pay good money for those. Sorry, I'm excited. Here's their graduate. We're very proud of you, son. A perfect report card. All bees. Very proud of thing going here. You got a lint on your fuzz. Oh, that's me. Wave to us. Will be in row 118,000. Bye, Barry. I told you, stop flying in the house. Hey, Adam. Hey, Barry. Is that fuzz? Joe? A little special day graduation."

In [43]:
speech_recognize_continuous_from_file(r"C:\Users\Beth\Documents\Bupa\Real_Time_STT\audio\deutsches_audio.wav", "de-DE")

SESSION STARTED: SessionEventArgs(session_id=ed636ce972c7406181317e10ad370b4e)
CANCELED SpeechRecognitionCanceledEventArgs(session_id=ed636ce972c7406181317e10ad370b4e, result=SpeechRecognitionResult(result_id=4e25102ff37e47648f0038de83e59d66, text="", reason=ResultReason.Canceled))
CLOSING on SpeechRecognitionCanceledEventArgs(session_id=ed636ce972c7406181317e10ad370b4e, result=SpeechRecognitionResult(result_id=4e25102ff37e47648f0038de83e59d66, text="", reason=ResultReason.Canceled))
SESSION STOPPED SessionEventArgs(session_id=ed636ce972c7406181317e10ad370b4e)
CLOSING on SessionEventArgs(session_id=ed636ce972c7406181317e10ad370b4e)


'Manche Gäste, darunter viele Flüchtlinge, können so gut wie gar nicht schwimmen. Auch die. Auch diesen indischen Studenten fällt es schwer, den Kopf über Wasser zu halten. Dem Rettungsschwimmer ist das nicht geheuer, er greift ein, sie sollen im Nichtschwimmerbereich bleiben, während. Der Welle sind fast 23 Stunden davon sind runtergegangen und deswegen nur hab ich sag ich mal so verwandt, dass sie sich bis zur Linie eigentlich halten müssen und ja nicht mehr in die Tiefe reinkommen. Viele Nationalitäten, viele sprachen, das macht die Verständigung. Recht. Eben wird er gerade untergegangen. Beinahe, oder? Ja, ja, wie war das? Das war sehr schön.'

In [46]:
speech_recognize_continuous_async_from_microphone()

Continuous Recognition is now running, say something.
type "stop" then enter when done
RECOGNIZED: SpeechRecognitionEventArgs(session_id=ba7f3c07d987495a875763cd5ed77538, result=SpeechRecognitionResult(result_id=7c02db30fb3048b2ba7a77a0a7cb57a2, text="Hello, I hope you can hear me. I don't really understand the stopping command. I got it wrong last time, so we were recording for a while and I broke everything, so I'm sorry about that. Maybe this time I might get it right, but I have to keep talking for longer than what was it? 15 seconds? 30 seconds? That's a long time. I'll get the B movie script up again. Where is it? Am I able to find it while I'm talking? I probably can. No, I don't have it open anymore by the looks of it. Oh, we didn't need it. It's fine. I think it's already been 20.", reason=ResultReason.RecognizedSpeech))
Stopping async recognition.
recognition stopped, main thread can exit now.


"Hello, I hope you can hear me. I don't really understand the stopping command. I got it wrong last time, so we were recording for a while and I broke everything, so I'm sorry about that. Maybe this time I might get it right, but I have to keep talking for longer than what was it? 15 seconds? 30 seconds? That's a long time. I'll get the B movie script up again. Where is it? Am I able to find it while I'm talking? I probably can. No, I don't have it open anymore by the looks of it. Oh, we didn't need it. It's fine. I think it's already been 20."

In [49]:
speech_recognize_continuous_async_from_microphone("de-DE")

Continuous Recognition is now running, say something.
type "stop" then enter when done
Stopping async recognition.
recognition stopped, main thread can exit now.


''