In [20]:
# Index of the last audio file to process.
# Example: if the last file is "S (6).wav", set END_INDEX = 6.
END_INDEX = 3

In [21]:
# Phrase hints used for speech adaptation: each entry pairs a phrase ("value")
# with a numeric "boost" weight. The list is passed as the `phrases` of an
# inline PhraseSet when the recognition config is built.
# NOTE(review): "تولنکهای کهن" may be a typo for "تولتکهای کهن" — confirm before changing.
CUSTOM_PHRASES = [
    {"value": "کاستاندا", "boost": 10.0},
    {"value": "شامن", "boost": 5.0},
    {"value": "فلوریندا", "boost": 15.0},
    {"value": "دون خوان", "boost": 20.0},
    {"value": "دون خنارو", "boost": 20.0},
    {"value": "تولتک", "boost": 15.0},
    {"value": "تولنکهای کهن", "boost": 12.0},
    {"value": "دقت اول", "boost": 12.0},
    {"value": "دقت دوم", "boost": 12.0},
]

In [None]:
# Prepare the environment
import io
import os
import subprocess
import time

from docx import Document
import IPython.display as ipd
from etils import epath as ep
from google.api_core.client_options import ClientOptions
from google.cloud.speech_v2 import SpeechClient
from google.cloud.speech_v2.types import cloud_speech
import jiwer
import pandas as pd
import plotly.graph_objs as go
from pydub import AudioSegment
import os 

# --- Notebook configuration -------------------------------------------------
# Google Cloud project that owns the recognizer.
PROJECT_ID = "glassy-groove-461110-u0"

# Region of the Speech-to-Text endpoint; taken from GOOGLE_CLOUD_REGION when
# that environment variable is set, otherwise "europe-west4".
LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", "europe-west4")
BUCKET_NAME = "blob_speech"  # @param {type:"string", isTemplate: true}
BUCKET_URI = f"gs://{BUCKET_NAME}"  # @param {type:"string"}

print(f"Using project {PROJECT_ID} in location {LOCATION}")
# FILE_NAME is only assigned inside the transcription loop further down, so it
# must not be referenced here: on a fresh kernel that raised NameError.
print(f"Using bucket {BUCKET_URI} as the audio source")

Using project verdant-branch-457906-a2 in location europe-west4
Using bucket gs://bdav42 to convert S (2).wav to text


In [23]:
from google.cloud import storage

# Speech-to-Text v2 client pointed at the regional endpoint derived from LOCATION.
speech_client_options = ClientOptions(
    api_endpoint=f"{LOCATION}-speech.googleapis.com",
)
client = SpeechClient(client_options=speech_client_options)

# Cloud Storage client, constructed with default arguments.
storage_client = storage.Client()

# Recognizer resource path built from the project, the location and the "_" recognizer ID.
RECOGNIZER = client.recognizer_path(PROJECT_ID, LOCATION, "_")

# Empty placeholder; the transcription loop reassigns this for each audio file.
INPUT_LONG_AUDIO_SAMPLE_FILE_URI = ""

In [25]:
def read_audio_file(audio_file_path: str) -> bytes:
    """
    Return the raw bytes of an audio file.

    Paths beginning with "gs://" are opened through ``ep.Path``; every other
    path is opened with the built-in ``open``. Both are read in binary mode.
    """
    if not audio_file_path.startswith("gs://"):
        with open(audio_file_path, "rb") as local_file:
            return local_file.read()

    with ep.Path(audio_file_path).open("rb") as remote_file:
        return remote_file.read()

def parse_batch_recognize_response(
    response, audio_sample_file_uri: "str | None" = None
) -> list[tuple[str, int]]:
    """Extract (transcript, end offset) pairs from a batch recognition response.

    Args:
        response: Batch recognition response whose ``results`` mapping is keyed
            by audio file URI and carries an inline transcript.
        audio_sample_file_uri: Key to look up in ``response.results``. When
            omitted, the current value of the module-level
            ``INPUT_LONG_AUDIO_SAMPLE_FILE_URI`` is used. It is resolved at call
            time; a definition-time default would stay frozen at the empty
            placeholder.

    Returns:
        One ``(transcript, result_end_offset)`` tuple per result, in response
        order, using the first alternative of each result. Results that carry
        no alternatives are skipped. The offset is passed through unchanged;
        the notebook's captured output shows ``datetime.timedelta`` values even
        though the annotation says ``int``.
    """
    if audio_sample_file_uri is None:
        audio_sample_file_uri = INPUT_LONG_AUDIO_SAMPLE_FILE_URI

    transcript = response.results[audio_sample_file_uri].inline_result.transcript
    return [
        (result.alternatives[0].transcript, result.result_end_offset)
        for result in transcript.results
        # A result without alternatives has no text; indexing [0] would raise IndexError.
        if result.alternatives
    ]

In [26]:
# Transcribe every "S (i).wav" for i in 1..END_INDEX and save each transcript
# as a Word document in the current working directory.

# The recognition settings and the bucket handle are the same for every file,
# so they are built once instead of on each loop iteration.
batch_recognition_config = cloud_speech.RecognitionConfig(
    auto_decoding_config=cloud_speech.AutoDetectDecodingConfig(),
    language_codes=["fa-IR", "en-US"],
    model="chirp_2",
    # Model adaptation using the phrases and boosts from CUSTOM_PHRASES.
    adaptation=cloud_speech.SpeechAdaptation(
        phrase_sets=[
            cloud_speech.SpeechAdaptation.AdaptationPhraseSet(
                inline_phrase_set=cloud_speech.PhraseSet(phrases=CUSTOM_PHRASES)
            )
        ]
    ),
)
input_bucket = storage_client.bucket(BUCKET_URI.replace("gs://", ""))

for i in range(1, END_INDEX + 1):
    FILE_NAME = f"S ({i}).wav"
    INPUT_LONG_AUDIO_SAMPLE_FILE_URI = f"{BUCKET_URI}/Spch2txt/AudioInput/{FILE_NAME}"

    # Fail fast when the input is missing. Raising stops the cell without
    # calling exit() from inside the notebook kernel.
    blob = input_bucket.blob(f"Spch2txt/AudioInput/{FILE_NAME}")
    if not blob.exists():
        raise FileNotFoundError(
            f"File '{BUCKET_URI}/Spch2txt/AudioInput/{FILE_NAME}' does not exist in Google Cloud Storage."
        )

    # Build the batch request for this file; results are returned inline.
    audio_metadata = cloud_speech.BatchRecognizeFileMetadata(
        uri=INPUT_LONG_AUDIO_SAMPLE_FILE_URI
    )
    batch_recognition_request = cloud_speech.BatchRecognizeRequest(
        config=batch_recognition_config,
        files=[audio_metadata],
        recognition_output_config=cloud_speech.RecognitionOutputConfig(
            inline_response_config=cloud_speech.InlineOutputConfig(),
        ),
        recognizer=RECOGNIZER,
    )

    # Start the long-running operation and poll once a minute until it finishes.
    operation = client.batch_recognize(request=batch_recognition_request)
    while not operation.done():
        print(f"Waiting for transcribing {FILE_NAME} to complete...")
        time.sleep(60)
    print(f"Operation completed for {FILE_NAME}.")

    response = operation.result()
    batch_recognize_results = parse_batch_recognize_response(
        response, audio_sample_file_uri=INPUT_LONG_AUDIO_SAMPLE_FILE_URI
    )
    print(batch_recognize_results)

    # Save the results to a Word document: one paragraph per recognized
    # segment, each followed by an empty paragraph for spacing.
    document = Document()
    document.add_heading(f'Persian Translation Results {FILE_NAME}', level=0)
    for transcription, _end_offset in batch_recognize_results:
        document.add_paragraph(transcription)
        document.add_paragraph()

    # Strip the audio extension once and use the same name for saving and for
    # the message, so the message reports the file that was actually written.
    docx_name = f"{os.path.splitext(FILE_NAME)[0]}.docx"
    document.save(docx_name)
    print(f"Persian translation saved to {docx_name}")

print("🎉 All files processed!")

Waiting for transcribing S (1).wav to complete...
Waiting for transcribing S (1).wav to complete...
Waiting for transcribing S (1).wav to complete...
Waiting for transcribing S (1).wav to complete...
Waiting for transcribing S (1).wav to complete...
Waiting for transcribing S (1).wav to complete...
Waiting for transcribing S (1).wav to complete...
Waiting for transcribing S (1).wav to complete...
Operation completed for S (1).wav.
[('محاسبه نمیکنم بزن خرداد و خرداد آها تو این یه جای خوب پیدا کنم ویدیو رو بگیرم آره با موسیقی های زیبا بگیر که موسیقی هم پخش میشه حتماً یادگار بمونه پس نگاه کن پس اول ولیوه', datetime.timedelta(seconds=30)), ('ولیو میره رو هدف هدف پراجکت حالا بعضی وقتا یه پراجکت چند تا ساب پراجکت میشه یا مینی پراجکت میشه یا هم نباشه بعد میشن تسک تسک میشه دیوی اسکچول حالا مثلا فرض کن من میخوام پیشرفت کنم به فرض میخوام مهاجرت کنم میخوام برم امریکا', datetime.timedelta(seconds=60)), ('می\u200cخوام برم کانادا می\u200cخوام برم یه جایی این هدفه ولی چه ولیویی داره ولیوش اینه که من 