In [13]:
#!/usr/bin/env python
# coding: utf-8

# Copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license. See LICENSE.md file in the project root for full license information.

import logging
import sys
import requests
import time
import swagger_client

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG, format="%(asctime)s %(message)s", datefmt="%m/%d/%Y %I:%M:%S %p %Z")

In [14]:
import os
from dotenv import load_dotenv
load_dotenv()

API_VERSION = "2024-11-15"

# Your subscription key and region for the speech service
SUBSCRIPTION_KEY = os.getenv("AZURE_KEY")
SERVICE_REGION = os.getenv("AZURE_SERVICE_REGION")

NAME = "Simple transcription"
DESCRIPTION = "Simple transcription description"

LOCALE = "en-US"
RECORDINGS_BLOB_URI = "https://smlbird.blob.core.windows.net/speechtotext/The Links Golf 1.m4a"

# Provide the uri of a container with audio files for transcribing all of them
# with a single request. At least 'read' and 'list' (rl) permissions are required.
RECORDINGS_CONTAINER_URI = "https://smlbird.blob.core.windows.net/speechtotext?sp=r&st=2025-01-21T10:43:59Z&se=2025-01-21T18:43:59Z&spr=https&sv=2022-11-02&sr=c&sig=Vcv5Zhz9hLKIJ0yM1AxcY52oNvLghj6ycPv%2FoHi4RKQ%3D"

# Set model information when doing transcription with custom models
MODEL_REFERENCE = None  # guid of a custom model

In [25]:
def transcribe_from_single_blob(uri, properties):
    """
    Transcribe a single audio file located at `uri` using the settings specified in `properties`
    using the base model for the specified locale.
    """
    transcription_definition = swagger_client.Transcription(
        display_name=NAME,
        description=DESCRIPTION,
        locale=LOCALE,
        content_urls=[uri],
        properties=properties
    )

    return transcription_definition


def transcribe_with_custom_model(client, uri, properties):
    """
    Transcribe a single audio file located at `uri` using the settings specified in `properties`
    using the base model for the specified locale.
    """
    # Model information (ADAPTED_ACOUSTIC_ID and ADAPTED_LANGUAGE_ID) must be set above.
    if MODEL_REFERENCE is None:
        logging.error("Custom model ids must be set when using custom models")
        sys.exit()

    model = {'self': f'{client.configuration.host}/models/{MODEL_REFERENCE}'}

    transcription_definition = swagger_client.Transcription(
        display_name=NAME,
        description=DESCRIPTION,
        locale=LOCALE,
        content_urls=[uri],
        model=model,
        properties=properties
    )

    return transcription_definition


def transcribe_from_container(uri, properties):
    """
    Transcribe all files in the container located at `uri` using the settings specified in `properties`
    using the base model for the specified locale.
    """
    transcription_definition = swagger_client.Transcription(
        display_name=NAME,
        description=DESCRIPTION,
        locale=LOCALE,
        content_container_url=uri,
        properties=properties
    )

    return transcription_definition


def _paginate(api, paginated_object):
    """
    The autogenerated client does not support pagination. This function returns a generator over
    all items of the array that the paginated object `paginated_object` is part of.
    """
    yield from paginated_object.values
    typename = type(paginated_object).__name__
    auth_settings = ["api_key"]
    while paginated_object.next_link:
        link = paginated_object.next_link[len(api.api_client.configuration.host):]
        paginated_object, status, headers = api.api_client.call_api(link, "GET",
            response_type=typename, auth_settings=auth_settings)

        if status == 200:
            yield from paginated_object.values
        else:
            raise Exception(f"could not receive paginated data: status {status}")


def delete_all_transcriptions(api):
    """
    Delete all transcriptions associated with your speech resource.
    """
    logging.info("Deleting all existing completed transcriptions.")

    # get all transcriptions for the subscription
    transcriptions = list(_paginate(api, api.get_transcriptions()))

    # Delete all pre-existing completed transcriptions.
    # If transcriptions are still running or not started, they will not be deleted.
    for transcription in transcriptions:
        transcription_id = transcription._self.split('/')[-1]
        logging.debug(f"Deleting transcription with id {transcription_id}")
        try:
            api.delete_transcription(transcription_id)
        except swagger_client.rest.ApiException as exc:
            logging.error(f"Could not delete transcription {transcription_id}: {exc}")


def transcribe():
    logging.info("Starting transcription client...")

    # Configure API key authorization: subscription_key
    configuration = swagger_client.Configuration()
    configuration.api_key["Ocp-Apim-Subscription-Key"] = SUBSCRIPTION_KEY
    configuration.host = f"https://{SERVICE_REGION}.api.cognitive.microsoft.com/speechtotext"

    # Create the client object and authenticate
    client = swagger_client.ApiClient(configuration)

    # Create an instance of the transcription API class
    api = swagger_client.CustomSpeechTranscriptionsApi(api_client=client)

    # Specify transcription properties by passing a dict to the properties parameter
    properties = swagger_client.TranscriptionProperties(6)

    # Use base models for transcription
    transcription_definition = transcribe_from_single_blob(RECORDINGS_BLOB_URI, properties)

    # Create the transcription
    created_transcription, status, headers = api.transcriptions_create_with_http_info(
        transcription=transcription_definition
    )

    # Get the transcription ID from the location URI
    transcription_id = headers["location"].split("/")[-1].split("?")[0]

    logging.info(f"Created new transcription with id '{transcription_id}' in region {SERVICE_REGION}")
    logging.info("Checking status.")

    completed = False

    while not completed:
        time.sleep(5)

        transcription = api.transcriptions_get(transcription_id)
        logging.info(f"Transcriptions status: {transcription.status}")

        if transcription.status in ("Failed", "Succeeded"):
            completed = True

        if transcription.status == "Succeeded":
            if properties.destination_container_url is not None:
                logging.info("Transcription succeeded. Results are located in your Azure Blob Storage.")
                break

            pag_files = api.transcriptions_list_files(transcription_id)
            for file_data in _paginate(api, pag_files):
                if file_data.kind != "Transcription":
                    continue

                audiofilename = file_data.name
                results_url = file_data.links.content_url
                results = requests.get(results_url)
                logging.info(f"Results for {audiofilename}:\n{results.content.decode('utf-8')}")


if __name__ == "__main__":
    transcribe()

01/23/2025 09:51:19 AM SE Asia Standard Time Starting transcription client...


ApiException: (404)
Reason: Resource Not Found
HTTP response headers: HTTPHeaderDict({'Content-Length': '56', 'Content-Type': 'application/json', 'apim-request-id': '489aa807-9b38-44b9-b9be-7f0d0506b245', 'Strict-Transport-Security': 'max-age=31536000; includeSubDomains; preload', 'x-content-type-options': 'nosniff', 'Date': 'Thu, 23 Jan 2025 02:51:20 GMT'})
HTTP response body: {"error":{"code":"404","message": "Resource not found"}}


In [19]:
def list_methods():
    # configure API key authorization: subscription_key
    configuration = swagger_client.Configuration()
    configuration.api_key["Ocp-Apim-Subscription-Key"] = SUBSCRIPTION_KEY
    configuration.host = f"https://{SERVICE_REGION}.api.cognitive.microsoft.com/speechtotext"

    # create the client object and authenticate
    client = swagger_client.ApiClient(configuration)

    # create an instance of the transcription api class
    api = swagger_client.CustomSpeechTranscriptionsApi(api_client=client)

    # Print available methods
    print(dir(api))

if __name__ == "__main__":
    list_methods()

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', 'api_client', 'transcriptions_create', 'transcriptions_create_with_http_info', 'transcriptions_delete', 'transcriptions_delete_with_http_info', 'transcriptions_get', 'transcriptions_get_file', 'transcriptions_get_file_with_http_info', 'transcriptions_get_with_http_info', 'transcriptions_list', 'transcriptions_list_files', 'transcriptions_list_files_with_http_info', 'transcriptions_list_supported_locales', 'transcriptions_list_supported_locales_with_http_info', 'transcriptions_list_with_http_info', 'transcriptions_update', 'transcriptions_update_with_http_info']


# Old version

In [33]:
import time, os, requests, json

AZURE_SERVICE_REGION = os.getenv("AZURE_SERVICE_REGION")
AZURE_KEY = os.getenv("AZURE_KEY")
transcription_endpoint = f"https://{AZURE_SERVICE_REGION}.api.cognitive.microsoft.com/speechtotext/v3.2/transcriptions"

language_code = 'id-ID'

# Audio files in Azure Blob Storage
audio_files = [
    "https://smlbird.blob.core.windows.net/speechtotext/The Links Golf 1.m4a"
]

In [35]:
# Create transcription job
headers = {
    "Ocp-Apim-Subscription-Key": AZURE_KEY,
    "Content-Type": "application/json"
}

body = {
    "contentUrls": audio_files,
    "locale": language_code,
    "displayName": "Batch Transcription Example",
    "properties": {
    "diarizationEnabled": False,
    "wordLevelTimestampsEnabled": True,
    "displayFormWordLevelTimestampsEnabled": False,
    "punctuationMode": "DictatedAndAutomatic",
    "profanityFilterMode": "Masked"
}
}

response = requests.post(transcription_endpoint, headers=headers, data=json.dumps(body))
if response.status_code == 201:
    print("Batch transcription job created successfully.")
    job_location = response.headers["Location"]
else:
    print(f"Failed to create batch transcription job: {response.status_code}")
    print("Response:", response.json())
    exit()

# Check transcription job status
def check_transcription_status(job_location):
    headers = {
        "Ocp-Apim-Subscription-Key": AZURE_KEY
    }
    response = requests.get(job_location, headers=headers)
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Failed to get job status: {response.status_code}")
        print("Response:", response.json())
        exit()

# Wait for the transcription job to complete
while True:
    job_status = check_transcription_status(job_location)
    status = job_status["status"]
    if status in ["Succeeded", "Failed"]:
        break
    print(f"Job status: {status}. Waiting for 30 seconds...")
    time.sleep(30)

# Retrieve transcription results and save to a variable
transcription_results = []
if status == "Succeeded":
    files_url = job_status["links"]["files"]
    files_response = requests.get(files_url, headers=headers)
    if files_response.status_code == 200:
        files = files_response.json()
        for file in files["values"]:
            result_url = file["links"]["contentUrl"]
            result_response = requests.get(result_url)
            if result_response.status_code == 200:
                result_content = result_response.json()
                transcription_results.append(result_content)
            else:
                print(f"Failed to get transcription result: {result_response.status_code}")
    else:
        print(f"Failed to get result files: {files_response.status_code}")
else:
    print(f"Transcription job failed: {job_status.get('error', {}).get('message', 'Unknown error')}")

# Process results
if transcription_results:
    print("Transcription results:", transcription_results[0])

# Function to extract the best transcription result
def get_best_transcription(transcription_result):
    # Extract the 'combinedRecognizedPhrases' list
    phrases = transcription_result.get('combinedRecognizedPhrases', [])
    
    # Check if there are any phrases
    if not phrases:
        return None
    
    # Extract the first phrase (assuming it's the best one)
    best_phrase = phrases[0]
    
    # Return the 'display' transcription as it's usually the most readable
    return best_phrase.get('display', '')

# Example of accessing the first transcription result
if transcription_results:
    first_result = transcription_results[0]
    best_transcription = get_best_transcription(first_result)
    print(best_transcription)

    with open("output.txt", "w") as file:
        # Write the variable to the file
        file.write(best_transcription)

Batch transcription job created successfully.
Job status: Running. Waiting for 30 seconds...
Transcription job failed: Unknown error


In [38]:
job_status

{'self': 'https://southeastasia.api.cognitive.microsoft.com/speechtotext/v3.2/transcriptions/be314c87-b815-4591-9009-5a030f326bfc',
 'displayName': 'Batch Transcription Example',
 'locale': 'id-ID',
 'createdDateTime': '2025-01-23T03:21:14Z',
 'lastActionDateTime': '2025-01-23T03:21:24Z',
 'status': 'Failed',
 'model': {'self': 'https://southeastasia.api.cognitive.microsoft.com/speechtotext/v3.2/models/base/36cc2887-f827-4311-8178-003dd2a78ba7'},
 'links': {'files': 'https://southeastasia.api.cognitive.microsoft.com/speechtotext/v3.2/transcriptions/be314c87-b815-4591-9009-5a030f326bfc/files'},
 'properties': {'diarizationEnabled': False,
  'wordLevelTimestampsEnabled': True,
  'displayFormWordLevelTimestampsEnabled': False,
  'channels': [0, 1],
  'punctuationMode': 'DictatedAndAutomatic',
  'profanityFilterMode': 'Masked',
  'error': {'code': 'InvalidData',
   'message': 'The recordings URI contains invalid data.'}}}

In [29]:
import logging
import time
from azure.cognitiveservices.speech import SpeechConfig, SpeechRecognizer
from azure.cognitiveservices.speech.transcription import TranscriptionDefinition, TranscriptionProperties
from azure.identity import DefaultAzureCredential


# Replace these with your details
SUBSCRIPTION_KEY = os.getenv("AZURE_KEY")
REGION = os.getenv("AZURE_SERVICE_REGION")
AUDIO_URI = "https://smlbird.blob.core.windows.net/speechtotext/The Links Golf 1.m4a"
LOCALE = "id-ID"  # Specify the language of the audio

def transcribe_batch(audio_uri, subscription_key, region, locale):
    """
    Submits a batch transcription job to Azure.
    """
    from azure.cognitiveservices.speech.transcription import TranscriptionDefinition, TranscriptionProperties
    from azure.cognitiveservices.speech import SpeechConfig
    import azure.cognitiveservices.speech.transcription as speech

    # Set up speech configuration
    config = SpeechConfig(subscription=subscription_key, region=region)

    # Define transcription properties
    properties = TranscriptionProperties()
    properties.locale = locale
    properties.content_urls = [audio_uri]

    # Create a transcription definition
    transcription_definition = TranscriptionDefinition(
        content_urls=[audio_uri],
        locale=locale,
        properties=properties,
    )

    # Submit transcription job
    client = speech.SpeechTranscriptionClient(config)
    transcription = client.transcriptions.create(transcription_definition)

    logging.info(f"Created transcription with ID: {transcription.id}")
    return transcription.id

def check_status(transcription_id, subscription_key, region):
    """
    Poll the transcription job status until it is complete.
    """
    from azure.cognitiveservices.speech.transcription import SpeechTranscriptionClient
    from azure.cognitiveservices.speech import SpeechConfig

    # Set up speech configuration
    config = SpeechConfig(subscription=subscription_key, region=region)
    client = SpeechTranscriptionClient(config)

    while True:
        transcription = client.transcriptions.get(transcription_id)
        logging.info(f"Transcription status: {transcription.status}")

        if transcription.status in ("Succeeded", "Failed"):
            break

        time.sleep(10)

    if transcription.status == "Succeeded":
        logging.info("Transcription completed successfully.")
        for file_data in transcription.results:
            logging.info(f"Result URL: {file_data.content_url}")

# Submit batch transcription
transcription_id = transcribe_batch(AUDIO_URI, SUBSCRIPTION_KEY, REGION, LOCALE)

# Poll status
check_status(transcription_id, SUBSCRIPTION_KEY, REGION)


ImportError: cannot import name 'TranscriptionDefinition' from 'azure.cognitiveservices.speech.transcription' (c:\Users\firmansyah.atmojo\AppData\Local\Programs\Python\Python311\Lib\site-packages\azure\cognitiveservices\speech\transcription.py)