# **Google Cloud Speech to Text (STT) API com Python**

## Configuração do ambiente

In [None]:
# Install or upgrade the third-party packages for this notebook (IPython shell escape).
# NOTE(review): with --user installs the kernel may need a restart before the
# upgraded packages are importable — confirm in your environment.
! pip3 install --user --upgrade google-cloud-speech librosa jiwer protobuf --quiet

Substitua os valores abaixo pelos dados do seu projeto:

In [None]:
# Project-specific settings; PROJECT_ID holds a placeholder that must be replaced.
PROJECT_ID: str = "INSIRA SEU PROJECT ID"
REGION: str = "us-central1"  # change if needed
BUCKET_URI: str = "gs://audios-transcrever-poc"  # change if needed

In [None]:
# File name (with extension) of the audio to transcribe; placeholder to be replaced.
AUDIO_NAME: str = "INSIRA O NOME DO AUDIO COM EXTENSAO ex.: test.wav"

In [None]:
# URI of the audio file to transcribe: the bucket URI joined with the file name.
# NOTE(review): the trailing `@param` marker looks like a Colab form annotation
# tied to this exact layout — confirm before moving or reformatting it.
long_audio_origin_uri = (
    f"{BUCKET_URI}/{AUDIO_NAME}"  # @param {type:"string"}
)

In [None]:
from pathlib import Path as p
import librosa
from IPython.display import Audio as play
from google.cloud.speech_v2 import SpeechClient
from google.cloud.speech_v2.types import cloud_speech
from google.api_core.client_options import ClientOptions
import time
import json
from pprint import pprint
import jiwer

In [None]:
# Local "data" directory under the current working directory; created if missing.
data_folder = p.cwd().joinpath("data")
data_folder.mkdir(parents=True, exist_ok=True)

In [None]:
# Local "transcriptions" directory under the current working directory; created if missing.
transcriptions_folder = p.cwd().joinpath("transcriptions")
transcriptions_folder.mkdir(parents=True, exist_ok=True)

## Configuração da requisição para STT

In [None]:
# Speech client bound to a regional endpoint whose host name is derived from REGION.
speech_endpoint = f"{REGION}-speech.googleapis.com"
client = SpeechClient(client_options=ClientOptions(api_endpoint=speech_endpoint))

Caso ainda não tenha criado um *Recognizer*, execute os passos abaixo:

In [None]:
# Language handled by the new Recognizer; its ID embeds the lower-cased code.
language_code = "pt-BR"
recognizer_id = f"chirp-{language_code.lower()}-test"

# Recognizer definition: the "chirp" model restricted to the single language above.
# NOTE(review): newer Speech-to-Text v2 releases appear to prefer setting these
# fields via `default_recognition_config` — confirm against the installed version.
recognizer_spec = cloud_speech.Recognizer(
    model="chirp",
    language_codes=[language_code],
)

# Creation request scoped to the configured project and region.
recognizer_request = cloud_speech.CreateRecognizerRequest(
    recognizer=recognizer_spec,
    recognizer_id=recognizer_id,
    parent=f"projects/{PROJECT_ID}/locations/{REGION}",
)

In [None]:
# Submit the creation request; the returned object exposes .result().
create_operation = client.create_recognizer(request=recognizer_request)
# Retrieve the operation's result (the created recognizer resource).
recognizer_creation = create_operation.result()
# Keep its resource name; later requests pass it as their `recognizer` argument.
recognizer = recognizer_creation.name

Caso já possua seu *Recognizer*, utilize:

In [None]:
# ID of an already-existing Recognizer (plain string: there is nothing to interpolate).
RECOGNIZER_NAME = "chirp-pt-br-test"

In [None]:
# Full resource name of the existing Recognizer, built from project, region and ID.
recognizer = (
    f"projects/{PROJECT_ID}/locations/{REGION}"
    f"/recognizers/{RECOGNIZER_NAME}"
)

Lembre-se de atualizar a configuração abaixo de acordo com as funcionalidades disponíveis no modelo a ser testado.

In [None]:
# Optional recognition features requested for the transcription:
# automatic punctuation and per-word time offsets.
recognition_features = cloud_speech.RecognitionFeatures(
    enable_word_time_offsets=True,
    enable_automatic_punctuation=True,
)

# Recognition settings for the long-audio request.
long_audio_config = cloud_speech.RecognitionConfig(
    # Empty auto-decoding message, i.e. no explicit decoding parameters are given.
    auto_decoding_config=cloud_speech.AutoDetectDecodingConfig(),
    features=recognition_features,
)

## Inferência

In [None]:
# Output destination: the "transcriptions" prefix inside the configured bucket.
long_audio_output_config = {
    "gcs_output_config": {"uri": f"{BUCKET_URI}/transcriptions"}
}
# Single input entry pairing the audio URI with its recognition config.
long_audio_file = {"config": long_audio_config, "uri": long_audio_origin_uri}

# Batch request tying the recognizer, the input file and the output destination together.
long_audio_request = cloud_speech.BatchRecognizeRequest(
    recognizer=recognizer,
    files=[long_audio_file],
    recognition_output_config=long_audio_output_config,
)

In [None]:
# Submit the batch recognition request and keep the returned operation object.
long_audio_operation = client.batch_recognize(request=long_audio_request)

In [None]:
# Fetch the operation's result; as the last expression in the cell, the notebook
# also displays the returned value.
# NOTE(review): presumably this blocks until the batch job finishes — confirm the
# default timeout is long enough for long audio files.
long_audio_operation.result()

## Análise do resultado

In [None]:
# Remote object to download from the bucket's "transcriptions" prefix.
# NOTE(review): the object name is hard-coded; confirm it matches the file the
# batch request actually wrote to that prefix.
transcriptions_uri = f"{BUCKET_URI}/transcriptions/transcriptions_model_adapt.json"
# Local destination inside data_folder (note the ".text" suffix on a ".json" source).
transcriptions_file_path = str(data_folder / "transcriptions2.text")

# Copy the remote object to the local path (IPython shell escape with interpolation).
! gsutil cp {transcriptions_uri} {transcriptions_file_path}

In [None]:
# Load the downloaded result file. The context manager closes the handle, and
# UTF-8 is requested explicitly so accented pt-BR text does not depend on the
# platform's default encoding.
with open(transcriptions_file_path, "r", encoding="utf-8") as transcriptions_file:
    transcriptions = json.load(transcriptions_file)

# A payload without a "results" key is treated as having no results.
transcriptions = transcriptions.get("results", [])

# Keep the first alternative's transcript of each result; entries whose
# "alternatives" key is missing or empty are skipped instead of raising.
transcriptions = [
    transcription["alternatives"][0]["transcript"]
    for transcription in transcriptions
    if transcription.get("alternatives")
]

# Join the per-result transcripts into one space-separated string and show it.
long_audio_transcription = " ".join(transcriptions)
print(long_audio_transcription)