<a href="https://colab.research.google.com/github/bemxio/colab-notebooks/blob/main/AutomaticVoiceover/AutomaticVoiceover.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Automatic Voice-Over
A Colab notebook that takes the audio track of a video file, processes it with [Whisper](https://github.com/openai/whisper) to produce subtitles translated to English, then uses [gTTS](https://github.com/pndurette/gTTS) together with [pydub](https://github.com/jiaaro/pydub) to generate the voice-over audio, and finally uses [ffmpeg](https://github.com/FFmpeg/FFmpeg) to add the voice-over audio to the video.

It also uses [`srt`](https://pypi.org/project/srt/) for reading the subtitle file and [`audio-effects`](https://pypi.org/project/audio-effects/) for slowing down a sample.

Made purely for fun and laughs; do not use it for professional work, unless you want to look dumb. :3

#### Install required dependencies

In [None]:
%pip install openai-whisper srt gTTS pydub audio-effects

#### Set the parameters and upload the video file

In [None]:
import pathlib

from moviepy.editor import ipython_display
from google.colab import files

# constants set by the user in the notebook; keep each trailing `# @param`
# comment on the same line as its assignment (presumably this is what Colab
# reads to render the form field -- TODO confirm)

# model name passed to the `--model` option of the whisper command
MODEL = "large" # @param ["tiny.en", "tiny", "base.en", "base", "small.en", "small", "medium.en", "medium", "large-v1", "large-v2", "large"]
# language name passed to the `--language` option of the whisper command
LANGUAGE = "English" # @param ["Afrikaans", "Albanian", "Amharic", "Arabic", "Armenian", "Assamese", "Azerbaijani", "Bashkir", "Basque", "Belarusian", "Bengali", "Bosnian", "Breton", "Bulgarian", "Burmese", "Castilian", "Catalan", "Chinese", "Croatian", "Czech", "Danish", "Dutch", "Estonian", "Faroese", "Finnish", "Flemish", "French", "Galician", "Georgian", "German", "Greek", "Gujarati", "Haitian", "Haitian Creole", "Hausa", "Hawaiian", "Hebrew", "Hindi", "Hungarian", "Icelandic", "Indonesian", "Italian", "Japanese", "Javanese", "Kannada", "Kazakh", "Khmer", "Korean", "Lao", "Latin", "Latvian", "Letzeburgesch", "Lingala", "Lithuanian", "Luxembourgish", "Macedonian", "Malagasy", "Malay", "Malayalam", "Maltese", "Maori", "Marathi", "Moldavian", "Moldovan", "Mongolian", "Myanmar", "Nepali", "Norwegian", "Nynorsk", "Occitan", "Panjabi", "Pashto", "Persian", "Polish", "Portuguese", "Punjabi", "Pushto", "Romanian", "Russian", "Sanskrit", "Serbian", "Shona", "Sindhi", "Sinhala", "Sinhalese", "Slovak", "Slovenian", "Somali", "Spanish", "Sundanese", "Swahili", "Swedish", "Tagalog", "Tajik", "Tamil", "Tatar", "Telugu", "Thai", "Tibetan", "Turkish", "Turkmen", "Ukrainian", "Urdu", "Uzbek", "Valencian", "Vietnamese", "Welsh", "Yiddish", "Yoruba"]

# values handed to gTTS as its `lang` and `tld` arguments when the
# voice-over speech is synthesised
TTS_LANGUAGE = "en" # @param {type: "string"}
TTS_REGION = "co.uk" # @param {type: "string"}

# show a prompt for file upload
uploads = files.upload()

# an empty result means nothing was uploaded (e.g. the prompt was dismissed);
# fail with a readable message instead of a bare StopIteration from next()
if not uploads:
    raise RuntimeError("No file was uploaded - run this cell again and choose a video file.")

# get the path of the first uploaded file (any additional uploads are ignored)
path = pathlib.Path(next(iter(uploads)))

# show a preview of the video
ipython_display(str(path), filetype="video", maxduration=300)

#### Generate the subtitles based on the video file

In [None]:
!python3 -m whisper "{path}" --model {MODEL} --language {LANGUAGE} --task translate

#### Generate the voice-over based on subtitles

In [None]:
from io import BytesIO

import srt
import gtts
from pydub import AudioSegment

from pydub.effects import speedup as speed_up
from audio_effects import speed_down

# a helper function for changing the speed of a sample
def change_length(segment: AudioSegment, length: float) -> AudioSegment:
    """Re-time `segment` towards a target duration of `length` seconds.

    The ratio between the current duration of the sample and `length` is
    used as the speed factor: a ratio above 1 is passed to `speed_up`,
    a ratio below 1 is passed to `speed_down`.

    Args:
        segment: the audio sample to re-time.
        length: the target duration, in seconds.

    Returns:
        The re-timed sample, or `segment` itself when it already has the
        target duration or when `length` is not a positive number.
    """
    # a subtitle whose start and end coincide has no usable target duration:
    # dividing by it would raise ZeroDivisionError (and a negative value
    # would yield a negative speed factor), so leave the sample untouched
    if length <= 0:
        return segment

    multiplier = segment.duration_seconds / length

    if multiplier > 1:
        return speed_up(segment, multiplier, chunk_size=50)
    if multiplier < 1:
        return speed_down(segment, multiplier)

    return segment

# cache of decoded speech samples keyed by subtitle text, so a repeated line
# neither calls the TTS API again nor gets decoded from MP3 a second time
cache = {}

# path of the subtitle file: same name as the video, with an .srt suffix
subtitle_path = path.with_suffix(".srt")

# read the subtitles into a list; the file is only needed for parsing,
# so it is closed before the (slow) synthesis loop starts
with open(subtitle_path, "r", encoding="utf-8") as file:
    subtitles = list(srt.parse(file.read()))

# without a single subtitle there is nothing to voice over
if not subtitles:
    raise ValueError(f"No subtitles found in {subtitle_path}")

# get the duration of the whole voice-over in milliseconds; the latest end
# time is used, so the result does not depend on the order of the cues
duration = max(subtitle.end for subtitle in subtitles).total_seconds() * 1000

# make a silent audio segment to overlay the speech samples on
audio = AudioSegment.silent(duration=duration)

for subtitle in subtitles:
    speech = cache.get(subtitle.content)

    if speech is None:
        # synthesise the text into an in-memory MP3 stream
        stream = BytesIO()
        gtts.gTTS(subtitle.content, lang=TTS_LANGUAGE, tld=TTS_REGION, slow=True).write_to_fp(stream)

        # seek to the beginning of the stream and import the audio from it
        stream.seek(0)
        speech = AudioSegment.from_file(stream, format="mp3")

        cache[subtitle.content] = speech

    # get the start, end and the duration of a subtitle
    start = subtitle.start
    end = subtitle.end

    length = (end - start).total_seconds()

    # change the speed of the sample and overlay it to the voice-over
    audio = audio.overlay(change_length(speech, length), start.total_seconds() * 1000)

    # print debug information about the subtitle
    print(f"Subtitle: {subtitle.content}")
    print(f"Start: {start}")
    print(f"End: {end}")
    print(f"Length: {length}\n")

# get the voice-over path
voiceover = path.with_suffix(".wav")

# export the voice-over to a WAV file
audio.export(voiceover, format="wav")

# show a preview of the voice-over
ipython_display(str(voiceover), filetype="audio", maxduration=300)

#### Make a video file with the new voice-over

In [None]:
# get the video output path
output = path.with_stem(path.stem + ' voiceover')

# replace the audio track in the video file
!ffmpeg -i "{path}" -i "{voiceover}" -c:v copy -map 0:v:0 -map 1:a:0 "{output}"

# show a preview of the video file
ipython_display(str(output), filetype="video", maxduration=300)

# download the video file
files.download(str(output))