In [2]:
import whisper
import moviepy as mp
import eyed3

In [None]:

def generate_subtitles(video_path, audio_path, output_srt_path):
    """Generates subtitles for a video using Whisper.
    models (https://github.com/openai/whisper):
        "tiny.en" | "tiny"
        "base.en" | "base"
        "small.en" | small"
        "medium.en" | edium"
        "large-v1"
        "large-v2"
        "large-v3"
        "large"
        "large-v3-turbo"
        "turbo"
    """

    # Extract audio from the video
    video = mp.VideoFileClip(video_path)
    video.audio.write_audiofile(audio_path)

    # Load the Whisper model
    model = whisper.load_model("turbo", device="cuda")  # Or another model like "small", "medium", "large", "turbo"

    # Transcribe the audio
    result = model.transcribe(audio_path, verbose=False, condition_on_previous_text=False)

    # Generate SRT file
    with open(output_srt_path, "w", encoding="utf-8") as f:
        for i, segment in enumerate(result["segments"]):
            start = segment["start"]
            end = segment["end"]
            text = segment["text"]

            f.write(f"{i+1}\n")
            f.write(f"{format_time(start)} --> {format_time(end)}\n")
            f.write(f"{text}\n\n")

def format_time(seconds):
    """Formats time in seconds to SRT format."""
    hours, remainder = divmod(seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return f"{int(hours):02}:{int(minutes):02}:{int(seconds):02},{int(seconds*1000 % 1000):03}"


In [15]:
def add_tag(audio_file: str, artist: str, album: str):
    file = eyed3.load(audio_file)
    file.tag.artist = artist
    file.tag.album = album
    file.tag.save()

In [None]:
# %%timeit
if __name__ == "__main__":
    artist = "Sarah Beth Durst"
    album_name = "The Spellshop"
    file_name = "36 The Spellshop"
    video_path = f"videos/{album_name}/{file_name}.mp4"
    audio_path = f"videos/{album_name}/{file_name}.mp3"
    output_srt_path = f"videos/{file_name}_gemini.srt"
    generate_subtitles(video_path, audio_path, output_srt_path)
    add_tag(audio_path, artist, album_name)

''' results
1g= base nocuda:9m 38s base cuda:4m 27s turbo cuda: 9m 36s
10g= turbo cuda: 4m 17s
369k=
sample_4g=turbo cuda: 11s

'''

frame_index:   2%|▏         | 44/2880 [07:15<01:30, 31.32it/s, now=None]

MoviePy - Writing audio in videos/TheSpellshop/36 The Spellshop.mp3


frame_index:   2%|▏         | 44/2880 [07:23<01:30, 31.32it/s, now=None]

MoviePy - Done.


' results\n1g= base nocuda:9m 38s base cuda:4m 27s turbo cuda: 9m 36s\n10g= turbo cuda: 4m 17s\n369k=\nsample_4g=turbo cuda: 11s\n\n'

In [38]:

#! does not work
def embed_subtitles(video_input, subtitle_path, video_output):
    """Embeds subtitles into a video using MoviePy."""

    video = mp.VideoFileClip(video_input)
    subtitles = mp.TextClip(font=r'fonts\swansea-font\SwanseaItalic-AwqD.ttf', text=subtitle_path, font_size=24, color='white', duration=video.duration)

    # Composite video and subtitles
    final_video = mp.CompositeVideoClip([video, subtitles])

    # Write the final video to a file
    print(f'{video.duration=}')
    final_video.write_videofile(video_output, codec='libx264')


if __name__ == "__main__":
    file_name = "sample_1"
    video_input = f"videos/{file_name}.mkv"
    subtitle_path = f"videos/{file_name}_gemini.srt"
    video_output = f"videos/{file_name}_embedded.mkv"
    
    embed_subtitles(video_input, subtitle_path, video_input)


video.duration=120.03
MoviePy - Building video videos/sample_1.mkv.
MoviePy - Writing audio in sample_1TEMP_MPY_wvf_snd.mp3


                                                                      

MoviePy - Done.
MoviePy - Writing video videos/sample_1.mkv



frame_index:   2%|▏         | 44/2880 [00:01<01:30, 31.32it/s, now=None]

KeyboardInterrupt: 

frame_index:   2%|▏         | 44/2880 [00:18<01:30, 31.32it/s, now=None]