In [2]:
import subprocess
import wave

from google.cloud import speech_v1p1beta1 as speech
from google.cloud import texttospeech
from google.cloud import translate_v2 as translate


In [3]:

def extract_audio(video_file):
    """Extract the audio track of ``video_file`` into ``temp_audio.wav``.

    Runs ffmpeg to drop the video stream (``-vn``) and write 16-bit PCM,
    44.1 kHz, 2-channel audio.

    Args:
        video_file: Path of the input video.

    Returns:
        The path of the WAV file that was written ("temp_audio.wav").

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    audio_file = "temp_audio.wav"
    command = [
        "ffmpeg",
        # Overwrite a WAV left behind by a previous run; without this ffmpeg
        # asks for confirmation (or aborts) when the file already exists,
        # which breaks re-running the cell.
        "-y",
        "-i", video_file,
        "-vn",
        "-acodec", "pcm_s16le",
        "-ar", "44100",
        "-ac", "2",
        audio_file,
    ]
    subprocess.run(command, check=True)
    return audio_file

In [4]:

def transcribe_audio(audio_file, language_code):
    """Transcribe a WAV file with Google Cloud Speech-to-Text.

    The sample rate and channel count sent to the API are read from the WAV
    header so the request always matches the audio actually on disk. A stereo
    file sent without ``audio_channel_count`` is treated as mono by the
    service and rejected.

    Args:
        audio_file: Path to a 16-bit PCM WAV file.
        language_code: Language of the speech, e.g. "ja".

    Returns:
        The top transcript of every result, joined with single spaces
        (an empty string when the service returns no results).

    NOTE(review): this uses the synchronous ``recognize`` call with inline
    audio; the service is documented to limit that to short clips — confirm
    before running on long videos.
    """
    try:
        with wave.open(audio_file, "rb") as wav:
            sample_rate = wav.getframerate()
            channel_count = wav.getnchannels()
    except (wave.Error, EOFError):
        # Header could not be parsed; fall back to the fixed 44.1 kHz
        # 2-channel format that the ffmpeg extraction step requests.
        sample_rate, channel_count = 44100, 2

    with open(audio_file, "rb") as audio_stream:
        content = audio_stream.read()

    client = speech.SpeechClient()
    recognition_audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=sample_rate,
        audio_channel_count=channel_count,
        language_code=language_code,
    )
    response = client.recognize(config=config, audio=recognition_audio)
    # Skip results that carry no alternative instead of raising IndexError.
    return " ".join(
        result.alternatives[0].transcript
        for result in response.results
        if result.alternatives
    )

In [5]:

def translate_text(text, target_language):
    """Translate ``text`` into ``target_language`` with Cloud Translation (v2).

    Args:
        text: Plain-text string to translate.
        target_language: Target language code, e.g. "en".

    Returns:
        The translated string, or "" when ``text`` is empty (no API call is
        made in that case).
    """
    if not text:
        # Nothing to translate; avoid a pointless (and possibly rejected) request.
        return ""
    translate_client = translate.Client()
    # Request plain-text output: the client's default format is HTML, which
    # returns entities such as "&#39;" that would then be passed on verbatim
    # to the speech synthesizer.
    result = translate_client.translate(
        text, target_language=target_language, format_="text"
    )
    return result["translatedText"]

In [6]:

def generate_audio(text, target_language, output_file="dubbed_audio.wav"):
    """Synthesize ``text`` with Cloud Text-to-Speech and save it to a file.

    Args:
        text: Plain text to be spoken.
        target_language: Language code used to select the voice, e.g. "en".
        output_file: Where to write the audio. Defaults to "dubbed_audio.wav",
            the path that used to be hard-coded, so existing calls behave
            exactly as before.

    Returns:
        The path the audio was written to.
    """
    client = texttospeech.TextToSpeechClient()
    synthesis_input = texttospeech.SynthesisInput(text=text)
    voice = texttospeech.VoiceSelectionParams(
        language_code=target_language,
        ssml_gender=texttospeech.SsmlVoiceGender.NEUTRAL,
    )
    # LINEAR16 is requested so the result can be stored with a .wav name.
    audio_config = texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.LINEAR16
    )
    response = client.synthesize_speech(
        input=synthesis_input, voice=voice, audio_config=audio_config
    )
    with open(output_file, "wb") as out:
        out.write(response.audio_content)
    return output_file

In [6]:
def combine_audio_video(video_file, audio_file, output_file):
    """Mux the video of ``video_file`` with the audio of ``audio_file``.

    Invokes ffmpeg with two inputs, keeps the first video stream of the first
    input (copied, not re-encoded) and the first audio stream of the second
    input (encoded as AAC), and writes the result to ``output_file``,
    overwriting it if present. Raises ``subprocess.CalledProcessError`` when
    ffmpeg fails.
    """
    ffmpeg_cmd = ["ffmpeg"]
    ffmpeg_cmd += ["-i", video_file, "-i", audio_file]   # input 0, input 1
    ffmpeg_cmd += ["-c:v", "copy", "-c:a", "aac"]        # codecs
    ffmpeg_cmd += ["-map", "0:v:0", "-map", "1:a:0"]     # stream selection
    ffmpeg_cmd += ["-y", output_file]                    # overwrite target
    subprocess.run(ffmpeg_cmd, check=True)

In [None]:
if __name__ == "__main__":
    # End-to-end dubbing run: extract audio -> transcribe -> translate ->
    # synthesize -> mux the new audio back onto the original video.
    #
    # Forward slashes are used on purpose: the previous literal contained the
    # invalid escape sequence "\j" (a SyntaxWarning on recent Python versions)
    # and only resolved to a directory separator on Windows.
    video_file = "videos/jap_audio.mkv"
    source_language = "ja"
    target_language = "en"

    audio_file = extract_audio(video_file)
    transcript = transcribe_audio(audio_file, source_language)
    translated_text = translate_text(transcript, target_language)
    generate_audio(translated_text, target_language)
    combine_audio_video(video_file, "dubbed_audio.wav", "dubbed_video.mp4")

In [None]:

# Step 1: configure the run and pull the audio track out of the video.
# Forward slashes avoid the invalid "\j" escape sequence of the previous
# literal and work as a path separator on every platform.
video_file = "videos/jap_audio.mkv"
source_language = "ja"
target_language = "en"

audio_file = extract_audio(video_file)

In [None]:

# Step 2: speech-to-text on the extracted audio, in the source language.
transcript = transcribe_audio(audio_file=audio_file, language_code=source_language)

In [None]:

# Step 3: translate the transcript into the target language.
translated_text = translate_text(text=transcript, target_language=target_language)

In [None]:

# Step 4: synthesize the translated text; the function writes dubbed_audio.wav.
generate_audio(text=translated_text, target_language=target_language)

In [None]:

# Step 5: put the synthesized audio onto the original video.
combine_audio_video(
    video_file=video_file,
    audio_file="dubbed_audio.wav",
    output_file="dubbed_video.mp4",
)

# Notes
- Cloud Translation overview: https://cloud.google.com/translate/docs/overview
- Authenticating the Python client libraries: https://cloud.google.com/docs/authentication/client-libraries#python
