In [1]:
import ffmpeg
import whisper
import srt
import os
import datetime
import cv2
import numpy as np
import io
from pydub import AudioSegment
from transformers import pipeline
import textwrap


def extract_audio(video_path, output_audio_path):
    """Extract audio from video using FFmpeg."""
    try:
        ffmpeg.input(video_path).output(output_audio_path).run(overwrite_output=True)
        print("Audio extracted successfully.")
    except ffmpeg.Error as e:
        print(f"FFmpeg error: {e.stderr.decode()}")

def preprocess_audio(input_file):
    """
    Preprocess the audio file:
    - Converts stereo to mono.
    - Resamples to 16kHz for compatibility with Whisper.
    - Returns the processed audio as a NumPy array and sample rate.
    """
    try:
        print("Preprocessing audio...")
        audio = AudioSegment.from_file(input_file)
        audio = audio.set_channels(1)  # Convert to mono
        audio = audio.set_frame_rate(16000)  # Resample to 16kHz
        
        # Export to raw audio bytes
        raw_audio = io.BytesIO()
        audio.export(raw_audio, format="wav")
        raw_audio.seek(0)

        # Load audio into a NumPy array
        waveform = np.frombuffer(raw_audio.read(), dtype=np.int16)
        sample_rate = 16000  # Since we resampled to 16kHz
        print("Audio preprocessing completed.")
        return waveform, sample_rate
    except Exception as e:
        print(f"Error during preprocessing: {e}")
        return None, None

def transcribe_with_whisper(waveform, sample_rate):
    """
    Transcribe the given audio waveform to text using Whisper.
    """
    try:
        print("Loading Whisper model...")
        model = whisper.load_model("base")  # large
        print("Transcribing audio with Whisper...")
        
        # Whisper requires audio in float32 format, normalized to [-1, 1]
        audio_float = waveform.astype(np.float32) / 32768.0
        
        # Transcribe with Whisper
        result = model.transcribe(audio_float, fp16=False)
        return result["text"], result["segments"]
    except Exception as e:
        return f"Error during transcription: {e}", None

def generate_srt_from_segments(segments, output_srt_path):
    """Generate SRT file from Whisper segments."""
    try:
        subtitles = []
        for i, segment in enumerate(segments):
            start_time = datetime.timedelta(seconds=segment["start"])
            end_time = datetime.timedelta(seconds=segment["end"])
            subtitle = srt.Subtitle(index=i + 1, start=start_time, end=end_time, content=segment["text"])
            subtitles.append(subtitle)
        with open(output_srt_path, "w", encoding="utf-8") as f:
            f.write(srt.compose(subtitles))
        print(f"SRT file generated: {output_srt_path}")
    except Exception as e:
        print(f"Error generating SRT file: {e}")

def restore_punctuation(text):
    """Restore punctuation using a Hugging Face model."""
    try:
        punctuator = pipeline("text2text-generation", model="oliverguhr/fullstop-punctuation-multilang-large")
        restored_text = punctuator(text, max_length=1024)[0]["generated_text"]
        print("Punctuation restored successfully.")
        return restored_text
    except Exception as e:
        print(f"Error restoring punctuation: {e}")
        return text

def video_to_text_advanced(video_path, output_srt_path):
    """Main function to process video and generate accurate subtitles."""
    raw_audio_path = "temp_raw_audio.wav"
    try:
        # Extract audio from video
        extract_audio(video_path, raw_audio_path)

        # Preprocess audio for Whisper
        waveform, sample_rate = preprocess_audio(raw_audio_path)
        if waveform is None:
            print("Audio preprocessing failed.")
            return

        # Transcribe audio with Whisper
        transcript, segments = transcribe_with_whisper(waveform, sample_rate)
        if segments is None:
            print("Transcription failed.")
            return

        # Restore punctuation
        transcript_with_punctuation = restore_punctuation(transcript)

        # Generate SRT file
        generate_srt_from_segments(segments, output_srt_path)

        # Display the final transcription
        print("Final Transcription with Punctuation:\n")
        print(transcript_with_punctuation)
    finally:
        if os.path.exists(raw_audio_path):
            os.remove(raw_audio_path)

def wrap_text(text, width=50):
    """Wrap text into lines of specified width."""
    return "\n".join(textwrap.wrap(text, width))

def synchronize_text_with_video(video_path, srt_path):
    """Display subtitles on video."""
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print("Error opening video file.")
        return
    
    with open(srt_path, "r", encoding="utf-8") as f:
        subtitles = list(srt.parse(f.read()))
    
    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 1
    font_color = (255, 255, 255)
    thickness = 2
    
    frame_rate = int(cap.get(cv2.CAP_PROP_FPS))
    subtitle_index = 0
    
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        
        current_time = datetime.timedelta(seconds=cap.get(cv2.CAP_PROP_POS_MSEC) / 1000)
        if subtitle_index < len(subtitles):
            subtitle = subtitles[subtitle_index]
            if subtitle.start <= current_time <= subtitle.end:
                cv2.putText(frame, subtitle.content, (50, 50), font, font_scale, font_color, thickness)
            elif current_time > subtitle.end:
                subtitle_index += 1
        
        frame = cv2.resize(frame, (500, 750))
        cv2.imshow("Video with Subtitles", frame)
        if cv2.waitKey(1000 // frame_rate) & 0xFF == ord("q"):
            break
    
    cap.release()
    cv2.destroyAllWindows()


# Example Usage
video_path = "best_advice_for_life.mp4"
output_srt_path = "output_subtitles_advanced.srt"

# Process video and generate subtitles
video_to_text_advanced(video_path, output_srt_path)

synchronize_text_with_video(video_path, output_srt_path)


Audio extracted successfully.
Preprocessing audio...
Audio preprocessing completed.
Loading Whisper model...
Transcribing audio with Whisper...



Device set to use cpu
The model 'XLMRobertaForTokenClassification' is not supported for text2text-generation. Supported models are ['BartForConditionalGeneration', 'BigBirdPegasusForConditionalGeneration', 'BlenderbotForConditionalGeneration', 'BlenderbotSmallForConditionalGeneration', 'EncoderDecoderModel', 'FSMTForConditionalGeneration', 'GPTSanJapaneseForConditionalGeneration', 'LEDForConditionalGeneration', 'LongT5ForConditionalGeneration', 'M2M100ForConditionalGeneration', 'MarianMTModel', 'MBartForConditionalGeneration', 'MT5ForConditionalGeneration', 'MvpForConditionalGeneration', 'NllbMoeForConditionalGeneration', 'PegasusForConditionalGeneration', 'PegasusXForConditionalGeneration', 'PLBartForConditionalGeneration', 'ProphetNetForConditionalGeneration', 'Qwen2AudioForConditionalGeneration', 'SeamlessM4TForTextToText', 'SeamlessM4Tv2ForTextToText', 'SwitchTransformersForConditionalGeneration', 'T5ForConditionalGeneration', 'UMT5ForConditionalGeneration', 'XLMProphetNetForCondit

Error restoring punctuation: 'Text2TextGenerationPipeline' object has no attribute 'prefix'
SRT file generated: output_subtitles_advanced.srt
Final Transcription with Punctuation:

 You work hard, you make money, you do it for yourself. That's not life. You go out, you seek for people who need your help, you make their lives better. You become that sponge which can absorb all the negativity and you become that person who can emit beautiful positive vibes and when you realize that you have changed someone's life and because of you, this person didn't give up. That is the day when you live.
