In [13]:
#!/usr/bin/env python
# coding: utf-8

# Copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license. See LICENSE.md file in the project root for full license information.

import logging
import sys
import requests
import time
import swagger_client

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG, format="%(asctime)s %(message)s", datefmt="%m/%d/%Y %I:%M:%S %p %Z")

In [14]:
import os
from dotenv import load_dotenv
load_dotenv()

API_VERSION = "2024-11-15"

# Your subscription key and region for the speech service
SUBSCRIPTION_KEY = os.getenv("AZURE_KEY")
SERVICE_REGION = os.getenv("AZURE_SERVICE_REGION")

NAME = "Simple transcription"
DESCRIPTION = "Simple transcription description"

LOCALE = "en-US"
RECORDINGS_BLOB_URI = "https://smlbird.blob.core.windows.net/speechtotext/The Links Golf 1.m4a"

# Provide the uri of a container with audio files for transcribing all of them
# with a single request. At least 'read' and 'list' (rl) permissions are required.
RECORDINGS_CONTAINER_URI = "https://smlbird.blob.core.windows.net/speechtotext?sp=r&st=2025-01-21T10:43:59Z&se=2025-01-21T18:43:59Z&spr=https&sv=2022-11-02&sr=c&sig=Vcv5Zhz9hLKIJ0yM1AxcY52oNvLghj6ycPv%2FoHi4RKQ%3D"

# Set model information when doing transcription with custom models
MODEL_REFERENCE = None  # guid of a custom model

In [25]:
def transcribe_from_single_blob(uri, properties):
    """
    Transcribe a single audio file located at `uri` using the settings specified in `properties`
    using the base model for the specified locale.
    """
    transcription_definition = swagger_client.Transcription(
        display_name=NAME,
        description=DESCRIPTION,
        locale=LOCALE,
        content_urls=[uri],
        properties=properties
    )

    return transcription_definition


def transcribe_with_custom_model(client, uri, properties):
    """
    Transcribe a single audio file located at `uri` using the settings specified in `properties`
    using the base model for the specified locale.
    """
    # Model information (ADAPTED_ACOUSTIC_ID and ADAPTED_LANGUAGE_ID) must be set above.
    if MODEL_REFERENCE is None:
        logging.error("Custom model ids must be set when using custom models")
        sys.exit()

    model = {'self': f'{client.configuration.host}/models/{MODEL_REFERENCE}'}

    transcription_definition = swagger_client.Transcription(
        display_name=NAME,
        description=DESCRIPTION,
        locale=LOCALE,
        content_urls=[uri],
        model=model,
        properties=properties
    )

    return transcription_definition


def transcribe_from_container(uri, properties):
    """
    Transcribe all files in the container located at `uri` using the settings specified in `properties`
    using the base model for the specified locale.
    """
    transcription_definition = swagger_client.Transcription(
        display_name=NAME,
        description=DESCRIPTION,
        locale=LOCALE,
        content_container_url=uri,
        properties=properties
    )

    return transcription_definition


def _paginate(api, paginated_object):
    """
    The autogenerated client does not support pagination. This function returns a generator over
    all items of the array that the paginated object `paginated_object` is part of.
    """
    yield from paginated_object.values
    typename = type(paginated_object).__name__
    auth_settings = ["api_key"]
    while paginated_object.next_link:
        link = paginated_object.next_link[len(api.api_client.configuration.host):]
        paginated_object, status, headers = api.api_client.call_api(link, "GET",
            response_type=typename, auth_settings=auth_settings)

        if status == 200:
            yield from paginated_object.values
        else:
            raise Exception(f"could not receive paginated data: status {status}")


def delete_all_transcriptions(api):
    """
    Delete all transcriptions associated with your speech resource.
    """
    logging.info("Deleting all existing completed transcriptions.")

    # get all transcriptions for the subscription
    transcriptions = list(_paginate(api, api.get_transcriptions()))

    # Delete all pre-existing completed transcriptions.
    # If transcriptions are still running or not started, they will not be deleted.
    for transcription in transcriptions:
        transcription_id = transcription._self.split('/')[-1]
        logging.debug(f"Deleting transcription with id {transcription_id}")
        try:
            api.delete_transcription(transcription_id)
        except swagger_client.rest.ApiException as exc:
            logging.error(f"Could not delete transcription {transcription_id}: {exc}")


def transcribe():
    logging.info("Starting transcription client...")

    # Configure API key authorization: subscription_key
    configuration = swagger_client.Configuration()
    configuration.api_key["Ocp-Apim-Subscription-Key"] = SUBSCRIPTION_KEY
    configuration.host = f"https://{SERVICE_REGION}.api.cognitive.microsoft.com/speechtotext"

    # Create the client object and authenticate
    client = swagger_client.ApiClient(configuration)

    # Create an instance of the transcription API class
    api = swagger_client.CustomSpeechTranscriptionsApi(api_client=client)

    # Specify transcription properties by passing a dict to the properties parameter
    properties = swagger_client.TranscriptionProperties(6)

    # Use base models for transcription
    transcription_definition = transcribe_from_single_blob(RECORDINGS_BLOB_URI, properties)

    # Create the transcription
    created_transcription, status, headers = api.transcriptions_create_with_http_info(
        transcription=transcription_definition
    )

    # Get the transcription ID from the location URI
    transcription_id = headers["location"].split("/")[-1].split("?")[0]

    logging.info(f"Created new transcription with id '{transcription_id}' in region {SERVICE_REGION}")
    logging.info("Checking status.")

    completed = False

    while not completed:
        time.sleep(5)

        transcription = api.transcriptions_get(transcription_id)
        logging.info(f"Transcriptions status: {transcription.status}")

        if transcription.status in ("Failed", "Succeeded"):
            completed = True

        if transcription.status == "Succeeded":
            if properties.destination_container_url is not None:
                logging.info("Transcription succeeded. Results are located in your Azure Blob Storage.")
                break

            pag_files = api.transcriptions_list_files(transcription_id)
            for file_data in _paginate(api, pag_files):
                if file_data.kind != "Transcription":
                    continue

                audiofilename = file_data.name
                results_url = file_data.links.content_url
                results = requests.get(results_url)
                logging.info(f"Results for {audiofilename}:\n{results.content.decode('utf-8')}")


if __name__ == "__main__":
    transcribe()

01/23/2025 09:51:19 AM SE Asia Standard Time Starting transcription client...


ApiException: (404)
Reason: Resource Not Found
HTTP response headers: HTTPHeaderDict({'Content-Length': '56', 'Content-Type': 'application/json', 'apim-request-id': '489aa807-9b38-44b9-b9be-7f0d0506b245', 'Strict-Transport-Security': 'max-age=31536000; includeSubDomains; preload', 'x-content-type-options': 'nosniff', 'Date': 'Thu, 23 Jan 2025 02:51:20 GMT'})
HTTP response body: {"error":{"code":"404","message": "Resource not found"}}


In [19]:
def list_methods():
    # configure API key authorization: subscription_key
    configuration = swagger_client.Configuration()
    configuration.api_key["Ocp-Apim-Subscription-Key"] = SUBSCRIPTION_KEY
    configuration.host = f"https://{SERVICE_REGION}.api.cognitive.microsoft.com/speechtotext"

    # create the client object and authenticate
    client = swagger_client.ApiClient(configuration)

    # create an instance of the transcription api class
    api = swagger_client.CustomSpeechTranscriptionsApi(api_client=client)

    # Print available methods
    print(dir(api))

if __name__ == "__main__":
    list_methods()

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', 'api_client', 'transcriptions_create', 'transcriptions_create_with_http_info', 'transcriptions_delete', 'transcriptions_delete_with_http_info', 'transcriptions_get', 'transcriptions_get_file', 'transcriptions_get_file_with_http_info', 'transcriptions_get_with_http_info', 'transcriptions_list', 'transcriptions_list_files', 'transcriptions_list_files_with_http_info', 'transcriptions_list_supported_locales', 'transcriptions_list_supported_locales_with_http_info', 'transcriptions_list_with_http_info', 'transcriptions_update', 'transcriptions_update_with_http_info']


# Old version

In [33]:
import time, os, requests, json

AZURE_SERVICE_REGION = os.getenv("AZURE_SERVICE_REGION")
AZURE_KEY = os.getenv("AZURE_KEY")
transcription_endpoint = f"https://{AZURE_SERVICE_REGION}.api.cognitive.microsoft.com/speechtotext/v3.2/transcriptions"

language_code = 'id-ID'

# Audio files in Azure Blob Storage
audio_files = [
    "https://smlbird.blob.core.windows.net/speechtotext/The Links Golf 1.m4a"
]

In [35]:
# Create transcription job
headers = {
    "Ocp-Apim-Subscription-Key": AZURE_KEY,
    "Content-Type": "application/json"
}

body = {
    "contentUrls": audio_files,
    "locale": language_code,
    "displayName": "Batch Transcription Example",
    "properties": {
    "diarizationEnabled": False,
    "wordLevelTimestampsEnabled": True,
    "displayFormWordLevelTimestampsEnabled": False,
    "punctuationMode": "DictatedAndAutomatic",
    "profanityFilterMode": "Masked"
}
}

response = requests.post(transcription_endpoint, headers=headers, data=json.dumps(body))
if response.status_code == 201:
    print("Batch transcription job created successfully.")
    job_location = response.headers["Location"]
else:
    print(f"Failed to create batch transcription job: {response.status_code}")
    print("Response:", response.json())
    exit()

# Check transcription job status
def check_transcription_status(job_location):
    headers = {
        "Ocp-Apim-Subscription-Key": AZURE_KEY
    }
    response = requests.get(job_location, headers=headers)
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Failed to get job status: {response.status_code}")
        print("Response:", response.json())
        exit()

# Wait for the transcription job to complete
while True:
    job_status = check_transcription_status(job_location)
    status = job_status["status"]
    if status in ["Succeeded", "Failed"]:
        break
    print(f"Job status: {status}. Waiting for 30 seconds...")
    time.sleep(30)

# Retrieve transcription results and save to a variable
transcription_results = []
if status == "Succeeded":
    files_url = job_status["links"]["files"]
    files_response = requests.get(files_url, headers=headers)
    if files_response.status_code == 200:
        files = files_response.json()
        for file in files["values"]:
            result_url = file["links"]["contentUrl"]
            result_response = requests.get(result_url)
            if result_response.status_code == 200:
                result_content = result_response.json()
                transcription_results.append(result_content)
            else:
                print(f"Failed to get transcription result: {result_response.status_code}")
    else:
        print(f"Failed to get result files: {files_response.status_code}")
else:
    print(f"Transcription job failed: {job_status.get('error', {}).get('message', 'Unknown error')}")

# Process results
if transcription_results:
    print("Transcription results:", transcription_results[0])

# Function to extract the best transcription result
def get_best_transcription(transcription_result):
    # Extract the 'combinedRecognizedPhrases' list
    phrases = transcription_result.get('combinedRecognizedPhrases', [])
    
    # Check if there are any phrases
    if not phrases:
        return None
    
    # Extract the first phrase (assuming it's the best one)
    best_phrase = phrases[0]
    
    # Return the 'display' transcription as it's usually the most readable
    return best_phrase.get('display', '')

# Example of accessing the first transcription result
if transcription_results:
    first_result = transcription_results[0]
    best_transcription = get_best_transcription(first_result)
    print(best_transcription)

    with open("output.txt", "w") as file:
        # Write the variable to the file
        file.write(best_transcription)

Batch transcription job created successfully.
Job status: Running. Waiting for 30 seconds...
Transcription job failed: Unknown error


In [38]:
job_status

{'self': 'https://southeastasia.api.cognitive.microsoft.com/speechtotext/v3.2/transcriptions/be314c87-b815-4591-9009-5a030f326bfc',
 'displayName': 'Batch Transcription Example',
 'locale': 'id-ID',
 'createdDateTime': '2025-01-23T03:21:14Z',
 'lastActionDateTime': '2025-01-23T03:21:24Z',
 'status': 'Failed',
 'model': {'self': 'https://southeastasia.api.cognitive.microsoft.com/speechtotext/v3.2/models/base/36cc2887-f827-4311-8178-003dd2a78ba7'},
 'links': {'files': 'https://southeastasia.api.cognitive.microsoft.com/speechtotext/v3.2/transcriptions/be314c87-b815-4591-9009-5a030f326bfc/files'},
 'properties': {'diarizationEnabled': False,
  'wordLevelTimestampsEnabled': True,
  'displayFormWordLevelTimestampsEnabled': False,
  'channels': [0, 1],
  'punctuationMode': 'DictatedAndAutomatic',
  'profanityFilterMode': 'Masked',
  'error': {'code': 'InvalidData',
   'message': 'The recordings URI contains invalid data.'}}}

In [29]:
import logging
import time
from azure.cognitiveservices.speech import SpeechConfig, SpeechRecognizer
from azure.cognitiveservices.speech.transcription import TranscriptionDefinition, TranscriptionProperties
from azure.identity import DefaultAzureCredential


# Replace these with your details
SUBSCRIPTION_KEY = os.getenv("AZURE_KEY")
REGION = os.getenv("AZURE_SERVICE_REGION")
AUDIO_URI = "https://smlbird.blob.core.windows.net/speechtotext/The Links Golf 1.m4a"
LOCALE = "id-ID"  # Specify the language of the audio

def transcribe_batch(audio_uri, subscription_key, region, locale):
    """
    Submits a batch transcription job to Azure.
    """
    from azure.cognitiveservices.speech.transcription import TranscriptionDefinition, TranscriptionProperties
    from azure.cognitiveservices.speech import SpeechConfig
    import azure.cognitiveservices.speech.transcription as speech

    # Set up speech configuration
    config = SpeechConfig(subscription=subscription_key, region=region)

    # Define transcription properties
    properties = TranscriptionProperties()
    properties.locale = locale
    properties.content_urls = [audio_uri]

    # Create a transcription definition
    transcription_definition = TranscriptionDefinition(
        content_urls=[audio_uri],
        locale=locale,
        properties=properties,
    )

    # Submit transcription job
    client = speech.SpeechTranscriptionClient(config)
    transcription = client.transcriptions.create(transcription_definition)

    logging.info(f"Created transcription with ID: {transcription.id}")
    return transcription.id

def check_status(transcription_id, subscription_key, region):
    """
    Poll the transcription job status until it is complete.
    """
    from azure.cognitiveservices.speech.transcription import SpeechTranscriptionClient
    from azure.cognitiveservices.speech import SpeechConfig

    # Set up speech configuration
    config = SpeechConfig(subscription=subscription_key, region=region)
    client = SpeechTranscriptionClient(config)

    while True:
        transcription = client.transcriptions.get(transcription_id)
        logging.info(f"Transcription status: {transcription.status}")

        if transcription.status in ("Succeeded", "Failed"):
            break

        time.sleep(10)

    if transcription.status == "Succeeded":
        logging.info("Transcription completed successfully.")
        for file_data in transcription.results:
            logging.info(f"Result URL: {file_data.content_url}")

# Submit batch transcription
transcription_id = transcribe_batch(AUDIO_URI, SUBSCRIPTION_KEY, REGION, LOCALE)

# Poll status
check_status(transcription_id, SUBSCRIPTION_KEY, REGION)


ImportError: cannot import name 'TranscriptionDefinition' from 'azure.cognitiveservices.speech.transcription' (c:\Users\firmansyah.atmojo\AppData\Local\Programs\Python\Python311\Lib\site-packages\azure\cognitiveservices\speech\transcription.py)

# New Try 31 Jan 2025: Azure Batch Transcription

### Single File

In [17]:
# Source: https://github.com/Azure-Samples/cognitive-services-speech-sdk/blob/master/samples/batch/python/python-client/main.py

import requests
import time
import json
import os

# Replace these with your Azure Speech Service details
SUBSCRIPTION_KEY = os.getenv("AZURE_KEY")
SERVICE_REGION = "southeastasia"
API_VERSION = "2024-11-15"

# Replace with your audio file's SAS URI (Azure Blob Storage)
RECORDINGS_BLOB_URI = "https://smlbird.blob.core.windows.net/speechtotext/Recording%208.wav?sp=r&st=2025-01-31T08:42:12Z&se=2025-01-31T16:42:12Z&spr=https&sv=2022-11-02&sr=b&sig=8LWuvhpdCDUDf423H%2F%2BvS5RBOKLKyKRL%2BoUL%2FXAfE%2Fk%3D"

# Transcription settings
DISPLAY_NAME = "Simple Transcription"
DESCRIPTION = "Transcription of a single audio file"
LOCALE = "en-US"

# API endpoints
TRANSCRIPTION_API_URL = f"https://{SERVICE_REGION}.api.cognitive.microsoft.com/speechtotext/v3.1/transcriptions"

def create_transcription():
    """Create a new transcription job."""
    headers = {
        "Ocp-Apim-Subscription-Key": SUBSCRIPTION_KEY,
        "Content-Type": "application/json"
    }
    body = {
        "displayName": DISPLAY_NAME,
        "description": DESCRIPTION,
        "locale": LOCALE,
        "contentUrls": [RECORDINGS_BLOB_URI],
        "properties": {
            "timeToLive": "PT6H"  # Transcription will expire after 6 hours
        }
    }
    response = requests.post(TRANSCRIPTION_API_URL, headers=headers, json=body)
    response.raise_for_status()
    return response.headers["Location"]  # URL to check transcription status

def get_transcription_status(transcription_url):
    """Check the status of the transcription job."""
    headers = {
        "Ocp-Apim-Subscription-Key": SUBSCRIPTION_KEY
    }
    response = requests.get(transcription_url, headers=headers)
    response.raise_for_status()
    return response.json()

def get_transcription_results(transcription_url):
    """Retrieve the transcription results."""
    headers = {
        "Ocp-Apim-Subscription-Key": SUBSCRIPTION_KEY
    }
    response = requests.get(f"{transcription_url}/files", headers=headers)
    response.raise_for_status()
    
    # Find the JSON file containing the transcription results
    for file in response.json()["values"]:
        if file["kind"] == "Transcription":
            results_url = file["links"]["contentUrl"]
            results_response = requests.get(results_url)
            results_response.raise_for_status()
            return results_response.json()

def main():
    # Step 1: Create a transcription job
    print("Creating transcription job...")
    transcription_url = create_transcription()
    transcription_id = transcription_url.split("/")[-1]
    print(f"Transcription ID: {transcription_id}")

    # Step 2: Poll for transcription status
    print("Waiting for transcription to complete...")
    while True:
        status_response = get_transcription_status(transcription_url)
        status = status_response["status"]
        print(f"Status: {status}")

        if status in ["Succeeded", "Failed"]:
            break
        time.sleep(5)  # Wait 5 seconds before checking again

    # Step 3: Retrieve and process results
    if status == "Succeeded":
        print("Transcription succeeded. Retrieving results...")
        results = get_transcription_results(transcription_url)

        # Save the results to a variable
        transcription_results = extract_transcription_text(results)
        print("Transcription Results:")
        print(transcription_results)
        return transcription_results
    else:
        print("Transcription failed.")

def extract_transcription_text(results):
    """
    Extract and format the transcription text from the results JSON.
    """
    combined_text = ""
    
    # Extract text from the "combinedRecognizedPhrases" field
    if "combinedRecognizedPhrases" in results:
        for phrase in results["combinedRecognizedPhrases"]:
            combined_text += phrase["display"] + " "

    # Extract text from the "recognizedPhrases" field (if needed)
    if "recognizedPhrases" in results:
        for phrase in results["recognizedPhrases"]:
            if phrase["recognitionStatus"] == "Success":
                for nbest in phrase["nBest"]:
                    combined_text += nbest["display"] + " "

    # Remove extra spaces and return the cleaned text
    return combined_text.strip()

transcription = main()

Creating transcription job...
Transcription ID: ed3268f0-2509-48f9-8f1e-50d8a32c9f89
Waiting for transcription to complete...
Status: Running
Status: Running
Status: Running
Status: Running
Status: Running
Status: Running
Status: Running
Status: Succeeded
Transcription succeeded. Retrieving results...
Transcription Results:
Three years ago a one bedroom off plant has been 150, so now it's 215 after wow, three years and something Yes, yes, the land prices are increasing, the construction prices are increasing. If we want to resell also, they are making the profit. Yeah. So if you resell what? Sorry, what land title do they get? We only build on pink commercial zones. This is what you mean or what? I know if they sell the land, yes, or the house, yes. Freehold. No. No, no, not freehold. Leasehold. No, no. So we transfer from the owner, it transfers to the new buyer. Yes. Leasehold. Still. Still leasehold. Yeah, exactly. With the remaining years now we're going to go, right. OK. Rain are 

In [18]:
transcription

"Three years ago a one bedroom off plant has been 150, so now it's 215 after wow, three years and something Yes, yes, the land prices are increasing, the construction prices are increasing. If we want to resell also, they are making the profit. Yeah. So if you resell what? Sorry, what land title do they get? We only build on pink commercial zones. This is what you mean or what? I know if they sell the land, yes, or the house, yes. Freehold. No. No, no, not freehold. Leasehold. No, no. So we transfer from the owner, it transfers to the new buyer. Yes. Leasehold. Still. Still leasehold. Yeah, exactly. With the remaining years now we're going to go, right. OK. Rain are shining. Maybe you will go. Maybe you'll visit your son in Spain and pass by. If you have so many friends, we can go right here now. And we're going to look over the fence somehow. I'll show you where. Three years ago a one bedroom off plant has been 150, so now it's 215 after wow, three years and something Yes, yes, the la

### Multiple Files: Seems Failed

In [None]:
import requests
import time

file1 = "https://smlbird.blob.core.windows.net/speechtotext/Pandawa%202.wav?sp=r&st=2025-01-31T10:27:09Z&se=2025-01-31T18:27:09Z&spr=https&sv=2022-11-02&sr=b&sig=i7p0%2BCIWQjBTz%2FclN3VwL1hjsNKA4aGGramGNpnT51U%3D"
file2 = "https://smlbird.blob.core.windows.net/speechtotext/Recording%208.wav?sp=r&st=2025-01-31T08:42:12Z&se=2025-01-31T16:42:12Z&spr=https&sv=2022-11-02&sr=b&sig=8LWuvhpdCDUDf423H%2F%2BvS5RBOKLKyKRL%2BoUL%2FXAfE%2Fk%3D"
# List of SAS URIs for the audio files
RECORDINGS_BLOB_URIS = [
    file1,
    file2
]

# Transcription settings
DISPLAY_NAME = "Batch Transcription"
DESCRIPTION = "Transcription of multiple audio files"
LOCALE = "en-US"

# API endpoints
TRANSCRIPTION_API_URL = f"https://{SERVICE_REGION}.api.cognitive.microsoft.com/speechtotext/v3.1/transcriptions"

def create_transcription(content_urls):
    """Create a new transcription job for multiple files."""
    headers = {
        "Ocp-Apim-Subscription-Key": SUBSCRIPTION_KEY,
        "Content-Type": "application/json"
    }
    body = {
        "displayName": DISPLAY_NAME,
        "description": DESCRIPTION,
        "locale": LOCALE,
        "contentUrls": content_urls,
        "properties": {
            "timeToLive": "PT6H"  # Transcription will expire after 6 hours
        }
    }
    response = requests.post(TRANSCRIPTION_API_URL, headers=headers, json=body)
    response.raise_for_status()
    return response.headers["Location"]  # URL to check transcription status

def get_transcription_status(transcription_url):
    """Check the status of the transcription job."""
    headers = {
        "Ocp-Apim-Subscription-Key": SUBSCRIPTION_KEY
    }
    response = requests.get(transcription_url, headers=headers)
    response.raise_for_status()
    return response.json()

def get_transcription_results(transcription_url):
    """Retrieve the transcription results."""
    headers = {
        "Ocp-Apim-Subscription-Key": SUBSCRIPTION_KEY
    }
    response = requests.get(f"{transcription_url}/files", headers=headers)
    response.raise_for_status()
    
    results = []
    # Find the JSON files containing the transcription results
    for file in response.json()["values"]:
        if file["kind"] == "Transcription":
            results_url = file["links"]["contentUrl"]
            results_response = requests.get(results_url)
            results_response.raise_for_status()
            results.append(results_response.json())
    return results

def extract_transcription_text(results):
    """
    Extract and format the transcription text from the results JSON.
    """
    combined_text = ""
    
    for result in results:
        # Extract text from the "combinedRecognizedPhrases" field
        if "combinedRecognizedPhrases" in result:
            for phrase in result["combinedRecognizedPhrases"]:
                combined_text += phrase["display"] + " "

        # Extract text from the "recognizedPhrases" field (if needed)
        if "recognizedPhrases" in result:
            for phrase in result["recognizedPhrases"]:
                if phrase["recognitionStatus"] == "Success":
                    for nbest in phrase["nBest"]:
                        combined_text += nbest["display"] + " "

    # Remove extra spaces and return the cleaned text
    return combined_text.strip()

def main():
    # Step 1: Create a transcription job for multiple files
    print("Creating transcription job for multiple files...")
    transcription_url = create_transcription(RECORDINGS_BLOB_URIS)
    transcription_id = transcription_url.split("/")[-1]
    print(f"Transcription ID: {transcription_id}")

    # Step 2: Poll for transcription status
    print("Waiting for transcription to complete...")
    while True:
        status_response = get_transcription_status(transcription_url)
        status = status_response["status"]
        print(f"Status: {status}")

        if status in ["Succeeded", "Failed"]:
            break
        time.sleep(5)  # Wait 5 seconds before checking again

    # Step 3: Retrieve and process results
    if status == "Succeeded":
        print("Transcription succeeded. Retrieving results...")
        results = get_transcription_results(transcription_url)

        # Save the results to a variable
        transcription_results = extract_transcription_text(results)
        print("Transcription Results:")
        print(transcription_results)

        # Save the transcription to a text file
        with open("transcription.txt", "w", encoding="utf-8") as file:
            file.write(transcription_results)
        print("Transcription saved to 'transcription.txt'.")
        
        return transcription_results
    else:
        print("Transcription failed.")

transcription = main()

Creating transcription job for multiple files...
Transcription ID: 0fcec08a-db2b-4da2-a3d7-2979facef363
Waiting for transcription to complete...
Status: Running
Status: Running
Status: Running
Status: Running
Status: Running
Status: Running
Status: Running


In [23]:
transcription_url = create_transcription(RECORDINGS_BLOB_URIS)

In [24]:
get_transcription_status(transcription_url)

{'self': 'https://southeastasia.api.cognitive.microsoft.com/speechtotext/v3.1/transcriptions/18b9b2a4-bdb3-48aa-b935-e3ec1283414a',
 'displayName': 'Batch Transcription',
 'description': 'Transcription of multiple audio files',
 'locale': 'en-US',
 'createdDateTime': '2025-01-31T10:00:15Z',
 'lastActionDateTime': '2025-01-31T10:00:15Z',
 'status': 'Running',
 'model': {'self': 'https://southeastasia.api.cognitive.microsoft.com/speechtotext/v3.1/models/base/10e98dd4-3d36-4296-b383-3508d63b1e0b'},
 'links': {'files': 'https://southeastasia.api.cognitive.microsoft.com/speechtotext/v3.1/transcriptions/18b9b2a4-bdb3-48aa-b935-e3ec1283414a/files'},
 'properties': {'diarizationEnabled': False,
  'wordLevelTimestampsEnabled': False,
  'displayFormWordLevelTimestampsEnabled': False,
  'channels': [0, 1],
  'punctuationMode': 'DictatedAndAutomatic',
  'profanityFilterMode': 'Masked',
  'timeToLive': 'PT6H'}}

In [28]:
import logging
import sys
import requests
import time
import swagger_client as cris_client

# Configure Logging
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG, format="%(asctime)s %(message)s", datefmt="%d/%m/%Y %I:%M:%S %p %Z")

# Your subscription key and region for the speech service
#SUBSCRIPTION_KEY = "YourSubscriptionKey"  # Replace with your key
#SERVICE_REGION = "YourServiceRegion"  # Replace with your region

NAME = "Transcriptions Test"
DESCRIPTION = "Demo"

LOCALE = "en-US"
RECORDINGS_BLOB_URI = "https://smlbird.blob.core.windows.net/speechtotext/Recording%208.wav?sp=r&st=2025-01-31T10:09:43Z&se=2025-01-31T18:09:43Z&spr=https&sv=2022-11-02&sr=b&sig=cE7jmhPkPoB9FuE%2F2leY1gO6zYsTaQ9Y53M9m8z5gqE%3D"

# Provide the URI of a container with audio files for transcribing all of them with a single request
RECORDINGS_CONTAINER_URI = "https://smlbird.blob.core.windows.net/speechtotext?sp=r&st=2025-01-31T10:12:29Z&se=2025-01-31T18:12:29Z&spr=https&sv=2022-11-02&sr=c&sig=FxefrdFuEK2ht4eOLerPMs89vaA9G2yI9WrJ%2FWTjKpc%3D"

def transcribe_from_single_blob(uri, properties):
    """
    Transcribe a single audio file located at `uri` using the settings specified in `properties`
    using the base model for the specified locale.
    """
    transcription_definition = cris_client.Transcription(
        display_name=NAME,
        description=DESCRIPTION,
        locale=LOCALE,
        content_urls=[uri],
        properties=properties
    )
    return transcription_definition

def transcribe_from_container(uri, properties):
    """
    Transcribe all files in the container located at `uri` using the settings specified in `properties`
    using the base model for the specified locale.
    """
    transcription_definition = cris_client.Transcription(
        display_name=NAME,
        description=DESCRIPTION,
        locale=LOCALE,
        content_container_url=uri,
        properties=properties
    )
    return transcription_definition

def _paginate(api, paginated_object):
    """
    The autogenerated client does not support pagination. This function returns a generator over
    all items of the array that the paginated object `paginated_object` is part of.
    """
    yield from paginated_object.values
    typename = type(paginated_object).__name__
    auth_settings = ["apiKeyHeader", "apiKeyQuery"]
    while paginated_object.next_link:
        link = paginated_object.next_link[len(api.api_client.configuration.host):]
        paginated_object, status, headers = api.api_client.call_api(link, "GET", response_type=typename, auth_settings=auth_settings)
        if status == 200:
            yield from paginated_object.values
        else:
            raise Exception(f"could not receive paginated data: status {status}")

def transcribe():
    logging.info("Starting transcription client...")

    # Configure API key authorization: subscription_key
    configuration = cris_client.Configuration()
    configuration.api_key["Ocp-Apim-Subscription-Key"] = SUBSCRIPTION_KEY
    configuration.host = f"https://{SERVICE_REGION}.api.cognitive.microsoft.com/speechtotext/v3.1"

    # Create the client object and authenticate
    client = cris_client.ApiClient(configuration)

    # Create an instance of the transcription API class
    api = cris_client.CustomSpeechTranscriptionsApi(api_client=client)

    # Specify transcription properties
    properties = {
        "punctuationMode": "DictatedAndAutomatic",
        "profanityFilterMode": "Masked",
        "wordLevelTimestampsEnabled": True,
        "diarizationEnabled": True,
        "destinationContainerUrl": "<Your SAS URI to the transcripts container>",  # TODO: Supply SAS URI
        "timeToLive": "PT1H"
    }

    # Choose your transcription method
    transcription_definition = transcribe_from_container(RECORDINGS_CONTAINER_URI, properties)

    # Create the transcription job
    created_transcription, status, headers = api.transcriptions_submit_with_http_info(transcription=transcription_definition)

    # Get the transcription ID from the location URI
    transcription_id = headers["location"].split("/")[-1]

    # Log information about the created transcription
    logging.info(f"Created new transcription with id '{transcription_id}' in region {SERVICE_REGION}")

    logging.info("Checking status.")

    completed = False

    while not completed:
        # Wait for 5 seconds before refreshing the transcription status
        time.sleep(5)

        transcription = api.transcriptions_get(transcription_id)
        logging.info(f"Transcriptions status: {transcription.status}")

        if transcription.status in ("Failed", "Succeeded"):
            completed = True

        if transcription.status == "Succeeded":
            pag_files = api.transcriptions_list_files(transcription_id)
            for file_data in _paginate(api, pag_files):
                if file_data.kind != "Transcription":
                    continue

                audiofilename = file_data.name
                results_url = file_data.links.content_url
                results = requests.get(results_url)
                logging.info(f"Results for {audiofilename}:\n{results.content.decode('utf-8')}")
        elif transcription.status == "Failed":
            logging.info(f"Transcription failed: {transcription.properties.error.message}")

if __name__ == "__main__":
    transcribe()

31/01/2025 05:24:16 PM SE Asia Standard Time Starting transcription client...


AttributeError: 'CustomSpeechTranscriptionsApi' object has no attribute 'transcriptions_submit_with_http_info'