In [17]:
# Name of the audio object to transcribe (looked up under Spch2txt/AudioInput/ in the bucket).
FILE_NAME = "S56.wav" # @param {type:"string", isTemplate: true}
# Google Cloud project that owns the Speech-to-Text recognizer.
PROJECT_ID = "verdant-branch-457906-a2"

In [None]:
# Phrases passed to the recognizer's inline phrase set; each entry carries the
# phrase text ("value") and its adaptation weight ("boost").
CUSTOM_PHRASES = [
    {"value": "کاستاندا", "boost": 10.0},
    {"value": "شامن", "boost": 5.0},
    {"value": "فلوریندا", "boost": 15.0},
    {"value": "دون خوان", "boost": 20.0},
    {"value": "دون خنارو", "boost": 20.0},
]

In [19]:
import io
import os
import subprocess
import time

from docx import Document
import IPython.display as ipd
from etils import epath as ep
from google.api_core.client_options import ClientOptions
from google.cloud.speech_v2 import SpeechClient
from google.cloud.speech_v2.types import cloud_speech
import jiwer
import pandas as pd
import plotly.graph_objs as go
from pydub import AudioSegment

In [20]:
# Resolve the project ID: an explicit value wins, otherwise fall back to the
# GOOGLE_CLOUD_PROJECT environment variable.
import os

PROJECT_ID = "verdant-branch-457906-a2"
# A value still wrapped in square brackets is an unfilled template placeholder.
if not PROJECT_ID or (PROJECT_ID.startswith("[") and PROJECT_ID.endswith("]")):
    # Default to "" rather than str(None), which would silently yield the
    # bogus project id "None".
    PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT", "")
if not PROJECT_ID:
    raise ValueError(
        "No project ID configured: set PROJECT_ID or the GOOGLE_CLOUD_PROJECT environment variable."
    )

# Region comes from the environment when set, otherwise europe-west4.
LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", "europe-west4")
print(f"Using project {PROJECT_ID} in location {LOCATION}")

BUCKET_NAME = "bdav42"  # @param {type:"string", isTemplate: true}
BUCKET_URI = f"gs://{BUCKET_NAME}"  # @param {type:"string"}

print(f"Using bucket {BUCKET_URI} to convert {FILE_NAME} to text")

Using project verdant-branch-457906-a2 in location europe-west4
Using bucket gs://bdav42 to convert S56.wav to text


In [21]:
from google.cloud import storage

# Speech client bound to the regional endpoint derived from LOCATION.
client = SpeechClient(
    client_options=ClientOptions(
        api_endpoint=f"{LOCATION}-speech.googleapis.com",
    )
)

# Check that the input audio object exists before a recognition job is started.
storage_client = storage.Client()
blob = storage_client.bucket(BUCKET_URI.replace("gs://", "")).blob(f"Spch2txt/AudioInput/{FILE_NAME}")

if not blob.exists():
    # Raise instead of calling exit(1): an exception stops this cell (and a
    # "Run All") with a traceback, whereas exit() is an interactive-shell
    # helper and is not a reliable way to abort a notebook run.
    raise FileNotFoundError(
        f"Error: File '{BUCKET_URI}/Spch2txt/AudioInput/{FILE_NAME}' does not exist in Google Cloud Storage."
    )

# Recognizer resource path; "_" is passed as the recognizer ID.
RECOGNIZER = client.recognizer_path(PROJECT_ID, LOCATION, "_")

In [22]:
# Regional Speech-to-Text endpoint and a client bound to it.
API_ENDPOINT = f"{LOCATION}-speech.googleapis.com"
client = SpeechClient(client_options=ClientOptions(api_endpoint=API_ENDPOINT))

# gs:// URI of the audio object that will be transcribed.
INPUT_LONG_AUDIO_SAMPLE_FILE_URI = f"{BUCKET_URI}/Spch2txt/AudioInput/{FILE_NAME}"

# Recognizer resource path for this project and location.
RECOGNIZER = client.recognizer_path(PROJECT_ID, LOCATION, "_")


In [23]:
def read_audio_file(audio_file_path: str) -> bytes:
    """
    Return the full contents of an audio file as bytes.

    Paths starting with "gs://" are opened through ``ep.Path``; any other
    path is opened with the built-in ``open``.
    """
    if not audio_file_path.startswith("gs://"):
        with open(audio_file_path, "rb") as local_file:
            return local_file.read()

    with ep.Path(audio_file_path).open("rb") as remote_file:
        return remote_file.read()




def parse_batch_recognize_response(
    response, audio_sample_file_uri: str = INPUT_LONG_AUDIO_SAMPLE_FILE_URI
) -> list[tuple[str, "datetime.timedelta"]]:
    """Extract the top transcript of every segment in a batch response.

    Args:
        response: Batch recognition response whose ``results`` mapping holds
            an inline transcript for the requested file.
        audio_sample_file_uri: Key into ``response.results`` selecting the
            file whose transcript is parsed.

    Returns:
        One ``(transcript, result_end_offset)`` tuple per segment that has at
        least one alternative, in response order. The offset is returned as
        provided by the client library (this notebook's output shows
        ``datetime.timedelta`` values).

    Raises:
        KeyError: presumably, if ``audio_sample_file_uri`` is not present in
            ``response.results`` (depends on the mapping type; not verified
            here).
    """
    segments = response.results[
        audio_sample_file_uri
    ].inline_result.transcript.results
    return [
        (segment.alternatives[0].transcript, segment.result_end_offset)
        for segment in segments
        # A segment without alternatives would make [0] raise IndexError;
        # there is no transcript to report for it, so skip it.
        if segment.alternatives
    ]



In [24]:
### Perform batch recognition

# Inline phrase set built from CUSTOM_PHRASES, used for model adaptation.
custom_phrase_set = cloud_speech.SpeechAdaptation.AdaptationPhraseSet(
    inline_phrase_set=cloud_speech.PhraseSet(phrases=CUSTOM_PHRASES)
)

# Recognition settings: auto-detected decoding, the language codes to use,
# the model name, and the adaptation phrase set.
batch_recognition_config = cloud_speech.RecognitionConfig(
    auto_decoding_config=cloud_speech.AutoDetectDecodingConfig(),
    language_codes=["fa-IR", "en-US"],
    model="chirp_2",
    adaptation=cloud_speech.SpeechAdaptation(phrase_sets=[custom_phrase_set]),
)

# Point the request at the audio file in Cloud Storage.
audio_metadata = cloud_speech.BatchRecognizeFileMetadata(
    uri=INPUT_LONG_AUDIO_SAMPLE_FILE_URI
)

# Ask for the transcript to be returned inline in the operation response.
inline_output_config = cloud_speech.RecognitionOutputConfig(
    inline_response_config=cloud_speech.InlineOutputConfig(),
)

# Assemble the batch request.
batch_recognition_request = cloud_speech.BatchRecognizeRequest(
    config=batch_recognition_config,
    files=[audio_metadata],
    recognition_output_config=inline_output_config,
    recognizer=RECOGNIZER,
)


In [25]:
# Start the batch recognition operation.
operation = client.batch_recognize(request=batch_recognition_request)

# Poll once a minute until the operation reports completion.
while not operation.done():
    print("Waiting for operation to complete...")
    time.sleep(60)
print("Operation completed.")

response = operation.result()

Waiting for operation to complete...
Waiting for operation to complete...
Waiting for operation to complete...
Waiting for operation to complete...
Waiting for operation to complete...
Waiting for operation to complete...
Waiting for operation to complete...
Operation completed.


In [26]:
# Parse the response into (transcript, end offset) pairs and inspect them.
batch_recognize_results = parse_batch_recognize_response(
    response, INPUT_LONG_AUDIO_SAMPLE_FILE_URI
)
print(batch_recognize_results)
type(batch_recognize_results)

[('این فایل ها رو که میفرستادم برای برزو که اینا رو به تکست تبدیل کنه متوجه شدم که ما تو جلسات فکر کنم دو جلسه رو اشتباه کردیم یعنی این جلسه الان پنجاه و ششمین نیست پنجاه و چهارمین یعنی چون ریکورد ها رو ما دو ساعت دو ساعت بود دو تا دو تا یه ساعت گرفتیم وقتی زدیم یعنی اونا رو من مثلا یه جلسه دو تا جلسه حساب شده بود برای همین ولی به هر حال الان پنجاه و چهارمین جلسه است که ما رسیدیم به کتاب چرخ', datetime.timedelta(seconds=30)), ('زمان حالا منم برای اینکه چون براتون رو خونی کنم ویدیو رو میبندم چون این کتاب خدمت شما عرض کنم که با کتاب های دیگه یه مقداری فرق میکنه و اونم اینه که در واقع کاستاندا بعد از اینکه هشت کتاب اولشو نوشت قدرت سکوت که هشتمین کتابش بود بعد از اون اومد این هشت تا کتاب رو', datetime.timedelta(seconds=60)), ('در واقع اولین بار اومده بود اینا رو میخواست موضوع بندی کنه بر اساس موضوعات بیاد توضیح بده ولی وقتی رفت جلو دید چنین کاری امکان پذیر نیست و این مطلب خیلی مطلب مهمیه که کسی که خودش تمام این کتاب ها رو نوشته بود نمیتونست مثلا فرض کن موضوع کمین و شکار موضوع خدمت شما عرض 

list

In [27]:
    # document = Document()
    # for result in response.results:
    #     for alternative in result.alternatives:
    #         document.add_paragraph(alternative.transcript)
    #         for word_info in alternative.words:
    #             document.add_paragraph(f"Word: {word_info.word}, Speaker: {word_info.speaker_tag}")
    #         document.add_paragraph("-" * 20)

    # document.save("transcription.docx")
    # print("Transcription saved to transcription.docx")

# Write the transcripts to a Word document: a heading, then one paragraph per
# recognized segment.
document = Document()
document.add_heading(f'Persian Translation Results {FILE_NAME}', level=0)

for transcription, _end_offset in batch_recognize_results:
    document.add_paragraph(transcription)
    document.add_paragraph()  # Empty paragraph for spacing between segments

# Derive the output name by stripping the audio extension, whatever its length,
# and reuse the same path for saving and for the message so the two cannot
# disagree.
output_docx_path = f"{os.path.splitext(FILE_NAME)[0]}.docx"
document.save(output_docx_path)
print(f"Persian translation saved to {output_docx_path}")

Persian translation saved to S56.wav.docx


In [28]:
# # Evaluate the results: if you have a reference transcription, you can compare it with the actual transcription.

# actual_transcriptions = [t for _, t in batch_recognize_output]
# reference_transcriptions = [
#     """sentence 1""",
#     """next""",
#     """next""",
#     """next""",
# ]

# evaluation_df = evaluate_stt(actual_transcriptions, reference_transcriptions)
# plot_evaluation_results(evaluation_df)