In [5]:
import os
import whisper
import warnings
import tempfile
import ffmpeg

def get_audio(paths):
    """Extract the audio track of each media file into a WAV file in the temp dir.

    Each input is converted with ffmpeg using ``acodec="pcm_s16le"``, ``ac=1``
    and ``ar="16k"``. The WAV file is named after the input file with only its
    final extension replaced, so ``talk.v2.mp4`` becomes ``talk.v2.wav``.

    Args:
        paths: Iterable of media file paths.

    Returns:
        dict mapping each input path to the path of its extracted WAV file.
        Empty when ``paths`` is empty.
    """
    temp_dir = tempfile.gettempdir()
    audio_paths = {}
    for path in paths:
        base_name = os.path.basename(path)
        # splitext strips only the last extension; splitting on the first "."
        # would truncate dotted names and make distinct inputs collide.
        stem = os.path.splitext(base_name)[0]
        print(f"Extracting audio from {base_name}...")
        output_path = os.path.join(temp_dir, f"{stem}.wav")

        ffmpeg.input(path).output(
            output_path,
            acodec="pcm_s16le", ac=1, ar="16k"
        ).run(quiet=True, overwrite_output=True)
        audio_paths[path] = output_path
    return audio_paths

def write_transcript(audio_path, srt_path, transcribe):
    """Transcribe one audio file and write the transcript text to ``srt_path``.

    Args:
        audio_path: Path of the audio file handed to ``transcribe``.
        srt_path: Destination text file; overwritten if it exists.
        transcribe: Callable taking the audio path and returning a mapping
            that has a ``"text"`` entry.

    Returns:
        The object returned by ``transcribe``, unchanged.
    """
    print(f"Generating transcript for {os.path.basename(audio_path)} audio... This might take a while.")

    # Silence warnings only for the duration of the call. catch_warnings
    # restores the caller's filter configuration on exit, even when
    # transcribe raises, instead of permanently rewriting the global filters.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        result = transcribe(audio_path)

    with open(srt_path, "w", encoding="utf-8") as f:
        # Text mode already translates "\n" to the platform line ending;
        # writing os.linesep here would yield "\r\r\n" on Windows.
        f.write(result["text"] + "\n")
    return result

def get_transcript(audio_paths: dict, output_text: bool, output_dir: str, transcribe: callable):
    """Transcribe every extracted audio file and write one ``.txt`` per input.

    Args:
        audio_paths: Mapping of original media path -> extracted audio path.
        output_text: When true, transcripts are written into ``output_dir``;
            otherwise they go to the system temp directory.
        output_dir: Directory used for transcripts when ``output_text`` is true.
        transcribe: Callable forwarded to ``write_transcript``.

    Returns:
        The transcription result of the LAST entry in ``audio_paths``, or
        ``None`` when ``audio_paths`` is empty.
    """
    text_path = output_dir if output_text else tempfile.gettempdir()
    # Pre-bind so an empty mapping returns None instead of raising
    # UnboundLocalError at the return statement.
    result = None
    for path, audio_path in audio_paths.items():
        # Strip only the final extension so dotted file names stay distinct.
        stem = os.path.splitext(os.path.basename(path))[0]
        srt_path = os.path.join(text_path, f"{stem}.txt")

        result = write_transcript(audio_path, srt_path, transcribe)
    return result

def initiate_stt(video_path: "str | list", model: str, output_dir: str, srt: bool, verbose: bool,
                ):
    """Run the speech-to-text pipeline: extract audio, then transcribe it.

    Args:
        video_path: A single media path or an iterable of media paths.
        model: Model name passed to ``whisper.load_model``.
        output_dir: Directory created if missing; receives the transcript
            files when ``srt`` is true.
        srt: Forwarded to ``get_transcript`` as its ``output_text`` flag.
        verbose: Forwarded to the model's ``transcribe`` call.

    Returns:
        Whatever ``get_transcript`` returns for the given inputs.
    """
    os.makedirs(output_dir, exist_ok=True)
    if model.endswith(".en"):
        print(f"{model} is a English model")

    # A bare path must be wrapped: get_audio iterates its argument, so a plain
    # string would be processed one character at a time.
    if isinstance(video_path, (str, os.PathLike)):
        video_paths = [os.fspath(video_path)]
    else:
        video_paths = list(video_path)

    # Keep the loaded model under its own name instead of rebinding the
    # `model` parameter from a string to a model object.
    stt_model = whisper.load_model(model)

    def transcribe(audio_path):
        return stt_model.transcribe(audio_path, verbose=verbose, task='transcribe')

    audio = get_audio(video_paths)
    subtitle = get_transcript(audio, srt, output_dir, transcribe)
    return subtitle

In [6]:
import sys
# Input recording to transcribe.
# NOTE(review): absolute, machine-specific path — point this at your own file.
vid_path = r"D:\Recordings\final\pyproject\003_Project Setup.mp4"
try:
    subtitle = initiate_stt(
        video_path=[vid_path],
        model='tiny.en',
        output_dir='transcript_text',
        srt=True,
        verbose=False,
    )
except Exception as e:
    # Show the actual failure (the exception, not the stderr stream object)
    # and re-raise: later cells need `subtitle`, so continuing would only
    # surface a confusing NameError further down.
    print(f'error {e!r}', file=sys.stderr)
    raise

tiny.en is a English model
Extracting audio from 003_Project Setup.mp4...
Generating transcript for 003_Project Setup.wav audio... This might take a while.


100%|██████████| 18421/18421 [00:16<00:00, 1147.14frames/s]


In [7]:
from transformers import AutoModelForSeq2SeqLM,AutoTokenizer

# Single source of truth for the summarisation checkpoint, so the tokenizer
# and the model are always loaded from the same pretrained weights.
SUMMARIZER_CHECKPOINT = "sshleifer/distilbart-cnn-12-6"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(SUMMARIZER_CHECKPOINT)
# Load model
model = AutoModelForSeq2SeqLM.from_pretrained(SUMMARIZER_CHECKPOINT)

In [8]:
# Tokenize the transcript. Inputs longer than 512 tokens are truncated, so
# only the beginning of a long transcript is summarised.
inputs = tokenizer(subtitle['text'],
                max_length=512,
                truncation=True,
                return_tensors="pt")

# Pass the tokenizer's attention mask explicitly rather than relying on
# generate() to infer it from the input ids.
summary_ids = model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_new_tokens=512,
)

# Keep the decoded summary under a name so later cells can reuse it;
# the bare last expression still displays it as the cell output.
summary = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
summary

" In this video, we will be creating our basic project setup . Let's start with GitHub, then click on Repostries and create a private repository . Next step is we will just clone this report to our local system . We can make whatever changes we want in this local repo and then push it after every change ."