In [654]:
import datetime
import io
import os
import subprocess
import tempfile
import time

from docx import Document
import IPython.display as ipd
from etils import epath as ep
from google.api_core.client_options import ClientOptions
from google.cloud.speech_v2 import SpeechClient
from google.cloud.speech_v2.types import cloud_speech
import jiwer
import pandas as pd
import plotly.graph_objs as go
from pydub import AudioSegment

In [None]:
# Use the environment variable if the user doesn't provide Project ID.
import os

PROJECT_ID = "glassy-groove-461110-u0"  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}
if not PROJECT_ID or PROJECT_ID == "[glassy-groove-461110-u0]":
    PROJECT_ID = str(os.environ.get("GOOGLE_CLOUD_PROJECT"))

LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", "europe-west4")
print(f"Using project {PROJECT_ID} in location {LOCATION}")

BUCKET_NAME = "blob_speech"  # @param {type:"string", isTemplate: true}
BUCKET_URI = f"gs://{BUCKET_NAME}"  # @param {type:"string"}
FILE_NAME = "S1.wav" # @param {type:"string", isTemplate: true}
print(f"Using bucket {BUCKET_URI} to convert {FILE_NAME} to text")

Using project verdant-branch-457906-a2 in location europe-west4
Using bucket gs://bdav42 to convert S1.wav to text


In [666]:
from google.cloud import storage

client = SpeechClient(
    client_options=ClientOptions(
        api_endpoint=f"{LOCATION}-speech.googleapis.com",
    )
)

storage_client = storage.Client()
blob = storage_client.bucket(BUCKET_URI.replace("gs://", "")).blob(f"Spch2txt/AudioInput/{FILE_NAME}")

if not blob.exists():
    print(f"Error: File '{BUCKET_URI}/Spch2txt/AudioInput/{FILE_NAME}' does not exist in Google Cloud Storage.")
    exit(1)  # Stop execution if file is missing

RECOGNIZER = client.recognizer_path(PROJECT_ID, LOCATION, "_")

In [656]:
API_ENDPOINT = f"{LOCATION}-speech.googleapis.com"

client = SpeechClient(
    client_options=ClientOptions(
        api_endpoint=API_ENDPOINT,
    )
)

# INPUT_AUDIO_SAMPLE_FILE_URI = (
#     "gs://github-repo/audio_ai/speech_recognition/attention_is_all_you_need_podcast.wav"
# )
INPUT_LONG_AUDIO_SAMPLE_FILE_URI = (
    f"{BUCKET_URI}/Spch2txt/AudioInput/{FILE_NAME}"
)

RECOGNIZER = client.recognizer_path(PROJECT_ID, LOCATION, "_")


In [657]:
def read_audio_file(audio_file_path: str) -> bytes:
    """
    Read audio file as bytes.
    """
    if audio_file_path.startswith("gs://"):
        with ep.Path(audio_file_path).open("rb") as f:
            audio_bytes = f.read()
    else:
        with open(audio_file_path, "rb") as f:
            audio_bytes = f.read()
    return audio_bytes


def save_audio_sample(audio_bytes: bytes, output_file_uri: str) -> None:
    """
    Save audio sample as a file in Google Cloud Storage.
    """

    output_file_path = ep.Path(output_file_uri)
    if not output_file_path.parent.exists():
        output_file_path.parent.mkdir(parents=True, exist_ok=True)

    with output_file_path.open("wb") as f:
        f.write(audio_bytes)


def extract_audio_sample(audio_bytes: bytes, duration: int) -> bytes:
    """
    Extracts a random audio sample of a given duration from an audio file.
    """
    audio = AudioSegment.from_file(io.BytesIO(audio_bytes))
    start_time = 0
    audio_sample = audio[start_time : start_time + duration * 1000]

    audio_bytes = io.BytesIO()
    audio_sample.export(audio_bytes, format="wav")
    audio_bytes.seek(0)

    return audio_bytes.read()


def play_audio_sample(audio_bytes: bytes) -> None:
    """
    Plays the audio sample in a notebook.
    """
    ipd.display(ipd.Audio(io.BytesIO(audio_bytes).read(), rate=44100))


def audio_sample_chunk_n(audio_bytes: bytes, num_chunks: int) -> list[bytes]:
    """
    Chunks an audio sample into a specified number of chunks and returns a list of bytes for each chunk.
    """
    audio = AudioSegment.from_file(io.BytesIO(audio_bytes))
    total_duration = len(audio)
    chunk_duration = total_duration // num_chunks

    chunks = []
    start_time = 0

    for _ in range(num_chunks):
        end_time = min(start_time + chunk_duration, total_duration)
        chunk = audio[start_time:end_time]

        audio_bytes_chunk = io.BytesIO()
        chunk.export(audio_bytes_chunk, format="wav")
        audio_bytes_chunk.seek(0)
        chunks.append(audio_bytes_chunk.read())

        start_time = end_time

    return chunks


def audio_sample_merge(audio_chunks: list[bytes]) -> bytes:
    """
    Merges a list of audio chunks into a single audio sample.
    """
    audio = AudioSegment.empty()
    for chunk in audio_chunks:
        audio += AudioSegment.from_file(io.BytesIO(chunk))

    audio_bytes = io.BytesIO()
    audio.export(audio_bytes, format="wav")
    audio_bytes.seek(0)

    return audio_bytes.read()


def compress_for_streaming(audio_bytes: bytes) -> bytes:
    """
    Compresses audio bytes for streaming using ffmpeg, ensuring the output size is under MAX_CHUNK_SIZE bytes.
    """
    # Temporary file to store original audio
    with open("temp_original.wav", "wb") as f:
        f.write(audio_bytes)

    # Initial compression attempt with moderate bitrate
    bitrate = "32k"
    subprocess.run(
        [
            "ffmpeg",
            "-i",
            "temp_original.wav",
            "-b:a",
            bitrate,
            "-y",
            "temp_compressed.mp3",
        ]
    )

    # Check if compressed size is within limit
    compressed_size = os.path.getsize("temp_compressed.mp3")
    if compressed_size <= MAX_CHUNK_SIZE:
        with open("temp_compressed.mp3", "rb") as f:
            compressed_audio_bytes = f.read()
    else:
        # If too large, reduce bitrate and retry
        while compressed_size > MAX_CHUNK_SIZE:
            bitrate = str(int(bitrate[:-1]) - 8) + "k"  # Reduce bitrate by 8kbps
            subprocess.run(
                [
                    "ffmpeg",
                    "-i",
                    "temp_original.wav",
                    "-b:a",
                    bitrate,
                    "-y",
                    "temp_compressed.mp3",
                ]
            )
            compressed_size = os.path.getsize("temp_compressed.mp3")

        with open("temp_compressed.mp3", "rb") as f:
            compressed_audio_bytes = f.read()

    # Clean up temporary files
    os.remove("temp_original.wav")
    os.remove("temp_compressed.mp3")

    return compressed_audio_bytes


def parse_streaming_recognize_response(response) -> list[tuple[str, int]]:
    """Parse streaming responses from the Speech-to-Text API"""
    streaming_recognize_results = []
    for r in response:
        for result in r.results:
            streaming_recognize_results.append(
                (result.alternatives[0].transcript, result.result_end_offset)
            )
    return streaming_recognize_results


def parse_real_time_recognize_response(response) -> list[tuple[str, int]]:
    """Parse real-time responses from the Speech-to-Text API"""
    real_time_recognize_results = []
    for result in response.results:
        real_time_recognize_results.append(
            (result.alternatives[0].transcript, result.result_end_offset)
        )
    return real_time_recognize_results


def parse_batch_recognize_response(
    response, audio_sample_file_uri: str = INPUT_LONG_AUDIO_SAMPLE_FILE_URI
) -> list[tuple[str, int]]:
    """Parse batch responses from the Speech-to-Text API"""
    batch_recognize_results = []
    for result in response.results[
        audio_sample_file_uri
    ].inline_result.transcript.results:
        batch_recognize_results.append(
            (result.alternatives[0].transcript, result.result_end_offset)
        )
    return batch_recognize_results


def get_recognize_output(
    audio_bytes: bytes, recognize_results: list[tuple[str, int]]
) -> list[tuple[bytes, str]]:
    """
    Get the output of recognize results, handling 0 timedelta and ensuring no overlaps or gaps.
    """
    audio = AudioSegment.from_file(io.BytesIO(audio_bytes))
    recognize_output = []
    start_time = 0

    initial_end_time = recognize_results[0][1].total_seconds() * 1000

    # This loop handles the streaming case where result timestamps might be zero.
    if initial_end_time == 0:
        for i, (transcript, timedelta) in enumerate(recognize_results):
            if i < len(recognize_results) - 1:
                # Use the next timedelta if available
                next_end_time = recognize_results[i + 1][1].total_seconds() * 1000
                end_time = next_end_time
            else:
                next_end_time = len(audio)
                end_time = next_end_time

            # Ensure no gaps between chunks
            chunk = audio[start_time:end_time]
            chunk_bytes = io.BytesIO()
            chunk.export(chunk_bytes, format="wav")
            chunk_bytes.seek(0)
            recognize_output.append((chunk_bytes.read(), transcript))

            # Set start_time for the next iteration
            start_time = end_time
    else:
        for i, (transcript, timedelta) in enumerate(recognize_results):
            # Calculate end_time in milliseconds
            end_time = timedelta.total_seconds() * 1000

            # Ensure no gaps between chunks
            chunk = audio[start_time:end_time]
            chunk_bytes = io.BytesIO()
            chunk.export(chunk_bytes, format="wav")
            chunk_bytes.seek(0)
            recognize_output.append((chunk_bytes.read(), transcript))

            # Set start_time for the next iteration
            start_time = end_time

    return recognize_output


def print_transcription(audio_sample_bytes: bytes, transcription: str) -> None:
    """Prettify the play of the audio and the associated print of the transcription text in a notebook"""

    # Play the audio sample
    display(ipd.HTML("<b>Audio:</b>"))
    play_audio_sample(audio_sample_bytes)
    display(ipd.HTML("<br>"))

    # Display the transcription text
    display(ipd.HTML("<b>Transcription:</b>"))
    formatted_text = f"<pre style='font-family: monospace; white-space: pre-wrap;'>{transcription}</pre>"
    display(ipd.HTML(formatted_text))


# def evaluate_stt(
#     actual_transcriptions: list[str],
#     reference_transcriptions: list[str],
#     audio_sample_file_uri: str = INPUT_LONG_AUDIO_SAMPLE_FILE_URI,
# ) -> pd.DataFrame:
#     """
#     Evaluate speech-to-text (STT) transcriptions against reference transcriptions.
#     """
#     audio_uris = [audio_sample_file_uri] * len(actual_transcriptions)
#     evaluations = []
#     for audio_uri, actual_transcription, reference_transcription in zip(
#         audio_uris, actual_transcriptions, reference_transcriptions
#     ):
#         evaluation = {
#             "audio_uri": audio_uri,
#             "actual_transcription": actual_transcription,
#             "reference_transcription": reference_transcription,
#             "wer": jiwer.wer(reference_transcription, actual_transcription),
#             "cer": jiwer.cer(reference_transcription, actual_transcription),
#         }
#         evaluations.append(evaluation)

#     evaluations_df = pd.DataFrame(evaluations)
#     evaluations_df.reset_index(inplace=True, drop=True)
#     return evaluations_df


def plot_evaluation_results(
    evaluations_df: pd.DataFrame,
) -> go.Figure:
    """
    Plot the mean Word Error Rate (WER) and Character Error Rate (CER) from the evaluation results.
    """
    mean_wer = evaluations_df["wer"].mean()
    mean_cer = evaluations_df["cer"].mean()

    trace_means = go.Bar(
        x=["WER", "CER"], y=[mean_wer, mean_cer], name="Mean Error Rate"
    )

    trace_baseline = go.Scatter(
        x=["WER", "CER"], y=[0.5, 0.5], mode="lines", name="Baseline (0.5)"
    )

    layout = go.Layout(
        title="Speech-to-Text Evaluation Results",
        xaxis=dict(title="Metric"),
        yaxis=dict(title="Error Rate", range=[0, 1]),
        barmode="group",
    )

    fig = go.Figure(data=[trace_means, trace_baseline], layout=layout)
    return fig

In [658]:
# # Read the audio file
# input_audio_bytes = read_audio_file(INPUT_LONG_AUDIO_SAMPLE_FILE_URI)

# # Extract a random audio sample 
# short_audio_sample_bytes = extract_audio_sample(input_audio_bytes, 30)

# play_audio_sample(short_audio_sample_bytes)

In [659]:
### Perform batch recognition

# def batch_recognize
batch_recognition_config = cloud_speech.RecognitionConfig(
    language_codes=["fa-IR"],
    model="chirp_2",
    features=cloud_speech.RecognitionFeatures(
        enable_automatic_punctuation=True,
    ),
    auto_decoding_config=cloud_speech.AutoDetectDecodingConfig(),
)

# Set the audio file URI
audio_metadata = cloud_speech.BatchRecognizeFileMetadata(
    uri=INPUT_LONG_AUDIO_SAMPLE_FILE_URI
)

# Create the request
batch_recognition_request = cloud_speech.BatchRecognizeRequest(
    config=batch_recognition_config,
    files=[audio_metadata],
    recognition_output_config=cloud_speech.RecognitionOutputConfig(
        inline_response_config=cloud_speech.InlineOutputConfig(),
    ),
    recognizer=RECOGNIZER,
)


In [660]:
# Run the batch recognition operation
operation = client.batch_recognize(request=batch_recognition_request)
# Wait for the operation to complete

while True:
    if not operation.done():
        print("Waiting for operation to complete...")
        time.sleep(60)
    else:
        print("Operation completed.")
        break

response = operation.result()

Waiting for operation to complete...
Waiting for operation to complete...
Operation completed.


In [661]:
# Visualize the results

batch_recognize_results = parse_batch_recognize_response(
    response, audio_sample_file_uri=INPUT_LONG_AUDIO_SAMPLE_FILE_URI
)
# batch_recognize_output = get_recognize_output(
#     long_audio_sample_bytes, batch_recognize_results
# )
# for audio_sample_bytes, transcription in batch_recognize_output:
#     print_transcription(audio_sample_bytes, transcription)
print(batch_recognize_results)
type(batch_recognize_results)

[('سلام. هی، چطور هستید؟ من خوبم. اوکی، ببخشید، درباره تغییرات. اوکی، من آماده هستم تا به شما گوش کنم و دلیل اینکه شما می خواستید با کسی صحبت کنید و کمی به خودتان درباره خودتان بگویید.', datetime.timedelta(seconds=30)), ('اوکی فرسلی ام گون تو تیل یو باف میسلف ام ام شالی نایلز ام سیری کنسٹرکشن انسپکٹر ام ورکنگ ویت نیما اچولی یا یا ہیز دا ون دت ریکمینڈڈ یو ٹو می یا اچولی ہی ہیز سینڈ ون میسج اینڈ ریگاردنگ یو اینڈ انٹریڈ یو سٹ یو ٹو می یا اوکی یا ای نو اوکی اوکی یا', datetime.timedelta(seconds=60)), ("So, um, what happened? I'm feeling, I have a lot going on in my mind and I'm feeling kind of overwhelmed also. Because what happened? I came to Canada in 2018. I was married and I was living with my husband. Unfortunately, the marriage didn't last because he was very like bossy and I walked out of my marriage like about a month and a half ago.", datetime.timedelta(seconds=90)), ('و نیم ساعت بعد از اینکه به کانادا آمدم. از آن زمان من با عمویم زندگی می کردم. من دقیقاً نمی دانم شما الان جدا شده 

list

In [662]:
    # document = Document()
    # for result in response.results:
    #     for alternative in result.alternatives:
    #         document.add_paragraph(alternative.transcript)
    #         for word_info in alternative.words:
    #             document.add_paragraph(f"Word: {word_info.word}, Speaker: {word_info.speaker_tag}")
    #         document.add_paragraph("-" * 20)

    # document.save("transcription.docx")
    # print("Transcription saved to transcription.docx")

document = Document()
document.add_heading(f'Persian Translation Results {FILE_NAME}', level=0)

for transcription, duration in batch_recognize_results:
    document.add_paragraph(transcription)
    # document.add_paragraph(f"Duration: {duration}")   # Optional: Add duration if needed# 
    document.add_paragraph() # Add an empty paragraph for spacing   

document.save(f'{FILE_NAME[:-4]}.docx')
print(f"Persian translation saved to {FILE_NAME}.docx")

Persian translation saved to Shelly.wav.docx


In [663]:
# # Evaluate the results: if you have a reference transcription, you can compare it with the actual transcription.

# actual_transcriptions = [t for _, t in batch_recognize_output]
# reference_transcriptions = [
#     """sentence 1""",
#     """next""",
#     """next""",
#     """next""",
# ]

# evaluation_df = evaluate_stt(actual_transcriptions, reference_transcriptions)
# plot_evaluation_results(evaluation_df)