<a href="https://colab.research.google.com/github/alexfazio/Python/blob/main/whisper_vid_2_subs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Subtitles Generator 🔤

By [Alex Fazio](https://twitter.com/alxfazio) | [GitHub Repo](https://github.com/alexfazio/Python/)

Automatically generate bilingual subtitles for your videos using OpenAI's [Whisper](https://github.com/openai/whisper) speech recognition model! 🌍

Get started in just a few steps:
1. Run the setup cells to install dependencies
2. Provide your video file path or YouTube URL
3. Choose your desired Whisper model and subtitle format
4. Press ▶️ and let the magic happen!

# Install Requirements


The commands below install OpenAI's Whisper together with the helper Python packages this notebook relies on (`pytube`, `transformers`, `sentencepiece`, and `tqdm`).

In [None]:
# Whisper is installed directly from its GitHub repository; the second command
# installs the helper packages. pytube and tqdm are imported by the
# transcription cell; transformers and sentencepiece are not used by the
# cells visible here (presumably needed by a later step -- confirm).
!pip install -q git+https://github.com/openai/whisper.git
!pip install -q pytube transformers sentencepiece tqdm

In [None]:
#@markdown ### Check Type of GPU and VRAM available.
# Queries the GPU name, total memory and free memory, printed as CSV without a header row.
!nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv,noheader

# Transcribe the Video

In [None]:
import os
from tqdm import tqdm
import whisper
import numpy as np
from pathlib import Path
from pytube import YouTube


# Notebook parameters. The `#@markdown` / `#@param` annotations are Colab form
# markup, so keep each assignment and its trailing annotation on one line.

#@markdown If `video_path` is a YouTube link, the video will be downloaded at the `save_path`.
# transcribe() treats any value starting with 'http' as a URL to download.
video_path = 'filepath.mp4' #@param {type: 'string'}
#@markdown Choose a Whisper model. `base` is the fastest and uses the least amount of memory.
model_type = 'small'  #@param ["base", "small", "medium", "large"]
#@markdown Video Language Code
# Read as a module-level global inside transcribe() (it is not passed as an argument).
video_lang = 'en'   #@param {type: 'string'}
#@markdown Where to save the video and subtitle.
save_path = 'data'  #@param {type: 'string'}
# Convert to a Path and create the directory (including parents) if it is missing.
save_path = Path(save_path)
save_path.mkdir(exist_ok=True, parents=True)
#@markdown What to name the saved video and subtitle.
filename = 'demo' #@param {type: 'string'}
#@markdown Which format to save the subtitle in.
# NOTE: this name shadows the builtin format(); convert_to_subtitle() reads it as a global.
format = 'srt' #@param ["srt", "txt"]



def get_video_from_youtube_url(url, save_path=None, filename=None):
    """Download a YouTube video as an MP4 file and return its path.

    Args:
        url: Address of the YouTube video.
        save_path: Directory (a ``pathlib.Path``) the file name is joined to.
        filename: Base name of the output file; ``.mp4`` is appended.

    Returns:
        The target path ``<save_path>/<filename>.mp4`` as a string.
    """
    youtube = YouTube(url)
    target = str(save_path / f'{filename}.mp4')

    # Only progressive MP4 streams are considered; after sorting by
    # resolution in descending order the first entry is taken.
    progressive_mp4 = youtube.streams.filter(progressive=True, file_extension='mp4')
    chosen_stream = progressive_mp4.order_by('resolution').desc().first()

    chosen_stream.download(filename=target)
    return target


def transcribe(video, save_path, filename, model_type='small'):
    """Transcribe a local video file or a YouTube link with Whisper.

    Args:
        video: Path of a media file, or a URL (any string starting with
            ``http``), which is downloaded first.
        save_path: Directory handed to the downloader for URL inputs.
        filename: Base name handed to the downloader for URL inputs.
        model_type: Name of the Whisper model to load.

    Returns:
        A ``(result, video)`` tuple: the value returned by
        ``model.transcribe`` and the path of the file that was transcribed
        (the downloaded file when ``video`` was a URL).

    The spoken language is taken from the module-level ``video_lang``.
    """
    is_url = video.startswith('http')
    if is_url:
        print("Downloading Youtube Video\n")
        video = get_video_from_youtube_url(
            video, save_path=save_path, filename=filename
        )

    # All fields of the DecodingOptions instance are forwarded to
    # transcribe() as keyword arguments.
    decoding_options = whisper.DecodingOptions(fp16=False, language=video_lang)
    whisper_model = whisper.load_model(model_type)
    transcription = whisper_model.transcribe(
        video, **vars(decoding_options), verbose=False
    )
    return transcription, video


def _srt_timestamp(seconds):
    """Format a time offset in seconds as an SRT ``HH:MM:SS,mmm`` timestamp."""
    # Work in whole milliseconds so rounding carries into the seconds,
    # minutes and hours fields. Formatting each float field separately can
    # produce invalid values such as "00:00:60,000" for 59.9996 seconds.
    total_ms = int(round(seconds * 1000))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1_000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"


def segments_to_srt(segs):
    """Render transcription segments as SubRip (SRT) text.

    Args:
        segs: Sequence of mappings, each providing ``'start'`` and ``'end'``
            offsets in seconds and a ``'text'`` string.

    Returns:
        A single string in which every cue consists of its 1-based index,
        a ``start --> end`` timestamp line and the stripped text, with a
        blank line between consecutive cues.
    """
    lines = []
    # Wrap the sequence itself rather than the enumerate object so the
    # progress bar can pick up the total when ``segs`` has a length.
    for index, seg in enumerate(tqdm(segs), start=1):
        lines.append(str(index))
        lines.append(
            _srt_timestamp(seg['start']) + " --> " + _srt_timestamp(seg['end'])
        )
        # The trailing newline yields the blank separator line after joining.
        lines.append(seg['text'].strip() + "\n")

    return "\n".join(lines)


def convert_to_subtitle(segs):
    """Convert segments to subtitle text in the globally selected format.

    The output format is read from the module-level ``format`` variable:
    ``'srt'`` produces SRT cues and ``'txt'`` produces plain text lines.

    Args:
        segs: Transcription segments to convert.

    Returns:
        The subtitle content as a string.

    Raises:
        ValueError: If ``format`` is neither ``'srt'`` nor ``'txt'``.
    """
    if format == 'srt':
        return segments_to_srt(segs)
    if format == 'txt':
        return transcribed_text(segs)
    raise ValueError(f"format {format} is not supported!")


def save_subtitle(sub, save_path, filename, format='srt'):
    """Write subtitle text to ``<save_path>/<filename>.<format>``.

    Args:
        sub: Subtitle content to write.
        save_path: Target directory, as a ``pathlib.Path`` or a string.
        filename: Base name of the output file (without extension).
        format: File extension to use, e.g. ``'srt'`` or ``'txt'``.

    Returns:
        The ``pathlib.Path`` of the written file.
    """
    sub_file = Path(save_path) / f'{filename}.{format}'
    # Write UTF-8 explicitly: the platform default encoding may be unable to
    # represent non-ASCII subtitle text, which would fail or garble output.
    with open(sub_file, 'w', encoding='utf-8') as f:
        f.write(sub)
    return sub_file


def transcribed_text(segs):
    """Return the ``'text'`` of every segment joined by newlines.

    The segment texts are used as-is (no stripping).
    """
    return '\n'.join(segment['text'] for segment in segs)


# Driver: transcribe `video_path`, convert the segments to the selected
# subtitle format and save the result inside `save_path`.
print("Loading the model")
# NOTE(review): transcribe() below calls whisper.load_model(model_type) itself
# and nothing visible in this cell uses `model`, so this looks like a second
# load of the same model -- confirm whether later cells rely on this global
# before removing it.
model = whisper.load_model(f'{model_type}')
print("Transcribing")
# `video` is the path that was transcribed (the downloaded file for URL input).
result, video = transcribe(video_path, save_path, filename, model_type=model_type)
sub = convert_to_subtitle(result['segments'])
# The subtitle file is named "<filename>-sub.<format>".
sub_transcribed = save_subtitle(sub, save_path, filename+'-sub', format=format)
print(f"\n\nsubtitle is saved at {sub_transcribed}")