# Referência:

https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3/client/upload_file.html

https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3/client/download_file.html

https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/transcribe/client/start_transcription_job.html

https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/transcribe/client/get_transcription_job.html

# Dependências

In [None]:
import time
import os
import json

import boto3

# Constantes

In [None]:
# --- Remote (S3) locations ---------------------------------------------
# Name of the S3 bucket used to store videos and transcriptions.
BUCKET_NAME = "nome-do-seu-bucket"
# Object key of the video inside the S3 bucket.
S3_KEY = "input/video.mp4"
# S3 folder prefix under which transcriptions are stored.
OUTPUT_PREFIX = "output/"

# --- Local locations ---------------------------------------------------
# Path of the video on the local machine.
VIDEO_PATH = "caminho/local/video.mp4"
# Local folder where responses from the AWS calls are saved.
OUTPUT_FOLDER = "respostas"

# --- Transcription settings --------------------------------------------
# Format of the media file.
MEDIA_FORMAT = "mp4"
# Language code used for the transcription.
LANGUAGE_CODE = "pt-BR"

# Criar pasta local para respostas

In [None]:
# Create the local output folder; exist_ok=True makes this a no-op when it already exists.
os.makedirs(name=OUTPUT_FOLDER, exist_ok=True)

# Cliente AWS

In [None]:
# Low-level boto3 clients for the two services used below. No explicit
# arguments besides the service name are passed.
transcribe_client = boto3.client(service_name='transcribe')
s3_client = boto3.client(service_name='s3')

# Upload do vídeo para S3

In [None]:
# Upload the local video file to the bucket under the S3_KEY object key.
s3_client.upload_file(
    Filename=VIDEO_PATH,
    Bucket=BUCKET_NAME,
    Key=S3_KEY,
)

# Teste do método Start Transcription Job

In [None]:
# Job name suffixed with the current Unix time, truncated to whole seconds.
job_name = 'transcribe-job-' + str(int(time.time()))

In [None]:
# Start the transcription job for the uploaded video and keep the raw
# API response for inspection.
response = transcribe_client.start_transcription_job(
    TranscriptionJobName=job_name,
    # Input: the video object, addressed by its S3 URI.
    Media={'MediaFileUri': f's3://{BUCKET_NAME}/{S3_KEY}'},
    MediaFormat=MEDIA_FORMAT,
    LanguageCode=LANGUAGE_CODE,
    # Output: same bucket, under the configured prefix.
    OutputBucketName=BUCKET_NAME,
    OutputKey=OUTPUT_PREFIX,
)

In [None]:
# Export the response to a JSON file so its overall structure can be inspected.
# default=str converts values json cannot serialize natively into strings.
with open(f"{OUTPUT_FOLDER}/response.json", mode='w', encoding='utf-8') as response_file:
    json.dump(
        response,
        response_file,
        ensure_ascii=False,
        indent=4,
        default=str,
    )

# Aguardar conclusão da Transcrição

In [None]:
# Wait for the transcription job to finish.
#
# A single get_transcription_job call only reports the job's current state,
# so poll until the job reaches a terminal status. The download step that
# follows needs the output object, which is not available while the job is
# still QUEUED or IN_PROGRESS.
while True:
    status = transcribe_client.get_transcription_job(
        TranscriptionJobName=job_name
    )
    job_status = status['TranscriptionJob']['TranscriptionJobStatus']
    if job_status in ('COMPLETED', 'FAILED'):
        break
    # Pause between polls to avoid hammering the API.
    time.sleep(5)

# Save the final status response for inspection (written for both outcomes).
with open(f"{OUTPUT_FOLDER}/status.json", 'w', encoding='utf-8') as json_file:
    json.dump(status, json_file, ensure_ascii=False, indent=4, default=str)

# Fail loudly here instead of letting the later download fail on a missing object.
if job_status == 'FAILED':
    failure_reason = status['TranscriptionJob'].get('FailureReason', 'unknown reason')
    raise RuntimeError(
        f'Transcription job {job_name} failed: {failure_reason}'
    )

# Download da transcrição

In [None]:
# The transcription is expected at "<prefix><job name>.json" in the bucket
# and is stored locally under the same file name.
s3_output_key = OUTPUT_PREFIX + job_name + '.json'
local_file = os.path.join(OUTPUT_FOLDER, job_name + '.json')

In [None]:
# Download the transcription JSON from the bucket into the local output folder.
s3_client.download_file(
    Filename=local_file,
    Bucket=BUCKET_NAME,
    Key=s3_output_key,
)

# Extrair texto e salvar em TXT

In [None]:
# Parse the downloaded transcription file and take the text of the first
# entry in its results/transcripts list.
with open(local_file, mode='r', encoding='utf-8') as transcription_file:
    transcription_data = json.loads(transcription_file.read())

transcript_text = (
    transcription_data['results']['transcripts'][0]['transcript']
)

In [None]:
# Save the plain transcript text next to the JSON, using the job name as file name.
local_txt = os.path.join(OUTPUT_FOLDER, job_name + '.txt')
with open(local_txt, mode='w', encoding='utf-8') as txt_file:
    txt_file.write(transcript_text)