# Usage of Speech to Text APIs in Python

In [None]:
import azure.cognitiveservices.speech as speechsdk
from dotenv import load_dotenv
from openai import AzureOpenAI
import time

import os
import json

# Load settings via python-dotenv, then read the credentials used by the rest
# of this notebook. os.getenv returns None for any variable that is not set.
load_dotenv()
speech_key = os.getenv("SPEECH_KEY")  # Speech resource key
service_region = os.getenv("SERVICE_REGION")  # Speech resource region
azure_openai_api_key = os.getenv("AZURE_OPENAI_API_KEY")
azure_openai_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")

In [None]:
# Shared Speech configuration (key + region) reused by every recognizer and
# transcriber created later in this notebook.
speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)

## Recognise from mic

In [None]:
def from_mic() -> speechsdk.SpeechRecognitionResult:
    """Run one recognition pass with a recognizer built from ``speech_config`` only.

    No audio configuration is passed to the recognizer. The recognized text is
    printed and the full result object is returned.

    Returns:
        The result produced by a single ``recognize_once_async`` call.
    """
    recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config)

    print("Speak into your microphone.")
    # Block until the single recognition attempt has finished.
    pending = recognizer.recognize_once_async()
    outcome = pending.get()
    print(outcome.text)
    return outcome


# Run a single recognition pass and keep the result for inspection.
speech_recognition_result = from_mic()

# Pretty-print the result's JSON payload (``.json`` is parsed, then re-indented).
print(json.dumps(json.loads(speech_recognition_result.json), indent=4))

## From a file

In [None]:
# Sample recording used by the file-based examples, and an audio input that
# reads from it instead of a microphone.
FILE_NAME = "../data/dummy-call-centre.wav"
audio_config = speechsdk.AudioConfig(filename=FILE_NAME)


def from_file() -> speechsdk.SpeechRecognitionResult:
    """Run one recognition pass over the audio described by ``audio_config``.

    Announces ``FILE_NAME`` on stdout, then blocks on a single
    ``recognize_once_async`` call.

    Returns:
        The result produced by that single recognition call.
    """
    recognizer = speechsdk.SpeechRecognizer(
        audio_config=audio_config,
        speech_config=speech_config,
    )

    print(f"Recognizing speech from file: {FILE_NAME}")
    pending = recognizer.recognize_once_async()
    return pending.get()


# Recognize from the sample file; this rebinds the result variable used by the
# microphone example above.
speech_recognition_result = from_file()

# Pretty-print the result's JSON payload (``.json`` is parsed, then re-indented).
print(json.dumps(json.loads(speech_recognition_result.json), indent=4))

### Understanding `speechsdk.SpeechRecognitionResult`

The SDK returns a `speechsdk.SpeechRecognitionResult`, which can be used to understand and process the output in various situations. This will be used in the next section when we perform continuous speech recognition.

In [None]:
def recognize_from_microphone():
    """Recognize once from the default microphone and report the outcome.

    Prints one of three things depending on the result's ``reason``:
    the recognized text, the no-match details, or the cancellation reason
    (plus error details and a credentials hint when the cancellation is an
    error). Any other reason prints nothing. Returns ``None``.
    """
    mic_input = speechsdk.audio.AudioConfig(use_default_microphone=True)
    recognizer = speechsdk.SpeechRecognizer(
        speech_config=speech_config, audio_config=mic_input
    )

    print("Speak into your microphone.")
    outcome = recognizer.recognize_once_async().get()
    reason = outcome.reason

    # Success: speech was recognized.
    if reason == speechsdk.ResultReason.RecognizedSpeech:
        print("Recognized: {}".format(outcome.text))
        return

    # Audio was captured but nothing matched.
    if reason == speechsdk.ResultReason.NoMatch:
        print("No speech could be recognized: {}".format(outcome.no_match_details))
        return

    # Anything other than a cancellation is not reported.
    if reason != speechsdk.ResultReason.Canceled:
        return

    details = outcome.cancellation_details
    print("Speech Recognition canceled: {}".format(details.reason))
    if details.reason == speechsdk.CancellationReason.Error:
        print("Error details: {}".format(details.error_details))
        print("Did you set the speech resource key and region values?")


# Tip: stay silent when prompted to see one of the non-recognized outcomes
# (for example the NoMatch branch) instead of recognized text.
recognize_from_microphone()

## Continuous Speech recognition

We can use `start_continuous_recognition()` and `stop_continuous_recognition()` to start and stop recognizing speech in the background. The SDK invokes _callbacks_ when data is available.

In [None]:
# Module-level recognizer bound to the default microphone; the following cells
# attach callbacks to it and start/stop continuous recognition on it.
# Note: this rebinds the global ``audio_config`` used by the file example above.
audio_config = speechsdk.audio.AudioConfig(use_default_microphone=True)
speech_recognizer = speechsdk.SpeechRecognizer(
    speech_config=speech_config, audio_config=audio_config
)


## Callback function that is called each time a speech recognition event occurs
def process_callback(evt: speechsdk.SpeechRecognitionEventArgs):
    """Print a one-line summary of a recognizer event.

    Final results are printed on their own line, partial results overwrite the
    current line (``end="\\r"``), and every other event is printed as-is.

    Args:
        evt: The event object passed by the SDK. Besides recognition events,
            this callback is also connected to session started/stopped events,
            which do not carry a ``result``.
    """
    # Session events have no ``result`` attribute; reading it directly would
    # raise AttributeError inside the callback and the event would never be
    # reported. Fall through to the generic branch instead.
    result = getattr(evt, "result", None)
    reason = getattr(result, "reason", None)

    if reason == speechsdk.ResultReason.RecognizedSpeech:
        # Print final recognised text
        print("Recognised: ", result.text)
    elif reason == speechsdk.ResultReason.RecognizingSpeech:
        # Continuously print recognised text
        print("Recognising: ", result.text, end="\r")
    else:
        print("Event: {}".format(evt))

In [None]:
# We are using the same callback function for each kind of event.
#   The most interesting events are RecognizingSpeech and RecognizedSpeech.
#   RecognizingSpeech is called when the speech recognizer has hypothesized a partial recognition result
#   RecognizedSpeech is called when the speech recognizer has recognized a final recognition result
# Each signal is connected exactly once: connecting the same callback twice
# makes it fire twice per event.
speech_recognizer.recognizing.connect(process_callback)
speech_recognizer.recognized.connect(process_callback)
speech_recognizer.session_started.connect(process_callback)
speech_recognizer.session_stopped.connect(process_callback)
speech_recognizer.canceled.connect(process_callback)

In [None]:
# Start continuous speech recognition on the microphone recognizer; output
# arrives through the callbacks connected above.
speech_recognizer.start_continuous_recognition()

In [None]:
# Run this cell to stop the continuous recognition started in the previous cell.
speech_recognizer.stop_continuous_recognition()

### Optional: Continuous Speech recognition on File

In [None]:
# Point the global audio input back at the sample recording (it was rebound to
# the microphone for the previous example).
FILE_NAME = "../data/dummy-call-centre.wav"
audio_config = speechsdk.AudioConfig(filename=FILE_NAME)


def from_file():
    """Continuously recognize the audio described by ``audio_config`` until it ends.

    Partial and final results are printed by ``process_callback``. The function
    blocks until the recognizer reports that the session stopped or was
    canceled, or until the user interrupts it. Returns ``None``.
    """
    speech_recognizer = speechsdk.SpeechRecognizer(
        speech_config=speech_config, audio_config=audio_config
    )

    print(f"Recognizing speech from file: {FILE_NAME}")

    done = False

    def stop_recognition(evt):
        """Stop recognition and release the waiting loop below."""
        nonlocal done
        print("CLOSING on {}".format(evt))
        speech_recognizer.stop_continuous_recognition()
        done = True

    speech_recognizer.recognizing.connect(process_callback)
    speech_recognizer.recognized.connect(process_callback)
    speech_recognizer.session_stopped.connect(stop_recognition)
    speech_recognizer.canceled.connect(stop_recognition)

    speech_recognizer.start_continuous_recognition()

    # Sleep between checks instead of spinning: a bare `while not done: pass`
    # keeps a CPU core fully busy for the whole duration of the recording.
    try:
        while not done:
            time.sleep(0.5)
    except KeyboardInterrupt:
        # Allow the user to abort a long recording cleanly.
        speech_recognizer.stop_continuous_recognition()


# Blocks until the whole file has been processed (session stopped or canceled).
from_file()

## Continuous Speech recognition with diarization

In [None]:
# Mutates the shared ``speech_config``, so it affects everything created from
# it afterwards. NOTE(review): going by the property name, this presumably
# enables speaker labels on intermediate results as well; confirm in SDK docs.
speech_config.set_property(
    property_id=speechsdk.PropertyId.SpeechServiceResponse_DiarizeIntermediateResults,
    value="true",
)


def process_transcription_callback(evt: speechsdk.SpeechRecognitionEventArgs):
    """Print a transcription event, labelled with the speaker when one is present.

    Final results end with a newline; partial results end with a carriage
    return so that the next update overwrites them. Events with any other
    result reason are printed as-is.

    Args:
        evt: Event object passed by the SDK; must expose ``result``.
    """
    result = evt.result
    reason = result.reason
    is_final = reason == speechsdk.ResultReason.RecognizedSpeech
    is_partial = reason == speechsdk.ResultReason.RecognizingSpeech

    if not (is_final or is_partial):
        print("Event: {}".format(evt))
        return

    # Final text gets its own line, partial text is overwritten in place.
    line_end = "\n" if is_final else "\r"

    if result.speaker_id:
        print(f"Speaker {result.speaker_id}: {result.text}", end=line_end)
    elif is_final:
        print("Recognised: ", result.text, end=line_end)
    else:
        print("Recognising: ", result.text, end=line_end)


def transcribe(file=None):
    """Transcribe a conversation from a file or from the default microphone.

    Results are printed by ``process_transcription_callback``. The function
    blocks until the session stops, is canceled, or the user interrupts it.

    Args:
        file: Path of the audio file to transcribe. When omitted (or falsy),
            audio is taken from the default microphone instead.
    """
    if file:
        # Use the path the caller passed in, not the notebook-level FILE_NAME.
        audio_config = speechsdk.AudioConfig(filename=file)
    else:
        audio_config = speechsdk.AudioConfig(use_default_microphone=True)
    conversation_transcriber = speechsdk.transcription.ConversationTranscriber(
        speech_config=speech_config, audio_config=audio_config
    )

    # Announce the audio source that is actually in use.
    if file:
        print(f"Recognizing speech from file: {file}")
    else:
        print("Speak into your microphone.")

    done = False

    def stop_transcription(evt):
        """Stop transcribing and release the waiting loop below."""
        nonlocal done
        print("CLOSING on {}".format(evt))
        conversation_transcriber.stop_transcribing_async()
        done = True

    conversation_transcriber.transcribing.connect(process_transcription_callback)
    conversation_transcriber.transcribed.connect(process_transcription_callback)
    conversation_transcriber.session_stopped.connect(stop_transcription)
    conversation_transcriber.canceled.connect(stop_transcription)

    conversation_transcriber.start_transcribing_async()

    # Keep looping until the session ends or a keyboard interrupt arrives
    try:
        while not done:
            time.sleep(0.5)
    except KeyboardInterrupt:
        conversation_transcriber.stop_transcribing_async()

In [None]:
# No file given: transcribe from the default microphone (interrupt the kernel to stop).
transcribe()

In [None]:
# Transcribe the sample recording, passing its path via ``file``.
transcribe(file="../data/dummy-call-centre.wav")

## Fast Transcription

In [None]:
import requests

# Call the fast transcription REST endpoint with the sample recording.
# The endpoint is built from ``service_region`` so that it matches the region
# ``speech_key`` belongs to, instead of a hard-coded region.
url = (
    f"https://{service_region}.api.cognitive.microsoft.com"
    "/speechtotext/transcriptions:transcribe?api-version=2024-11-15"
)
headers = {"Ocp-Apim-Subscription-Key": speech_key}

# Open the audio inside a context manager so the handle is closed once the
# upload has finished, even if the request raises.
with open("../data/dummy-call-centre.wav", "rb") as audio_file:
    files = {
        "audio": audio_file,
        "definition": (None, '{"locales":["en-US"]}'),
    }
    response = requests.post(url, headers=headers, files=files)

if response.status_code == 200:
    for phrase in response.json()["phrases"]:
        print(phrase["text"])
else:
    # Surface failures instead of silently printing nothing.
    print(f"Fast transcription failed ({response.status_code}): {response.text}")

## Azure OpenAI Whisper

In [None]:
# Transcribe the sample recording with a Whisper deployment on Azure OpenAI.
client = AzureOpenAI(
    api_key=azure_openai_api_key,
    api_version="2024-02-01",
    azure_endpoint=azure_openai_endpoint,
)

# Must match the custom name you chose when you deployed the model.
deployment_id = "whisper"
audio_test_file = "../data/dummy-call-centre.wav"

# Use a context manager so the audio file handle is closed after the upload.
with open(audio_test_file, "rb") as audio_file:
    result = client.audio.transcriptions.create(file=audio_file, model=deployment_id)

print(result.text)