In [2]:
import ffmpeg
import whisper
import srt
import os
import datetime
import cv2
import numpy as np
import io
from pydub import AudioSegment
from transformers import pipeline
import textwrap
import re


def extract_audio(video_path, output_audio_path):
    """Extract audio from video using FFmpeg."""
    try:
        ffmpeg.input(video_path).output(output_audio_path).run(overwrite_output=True)
        print("Audio extracted successfully.")
    except ffmpeg.Error as e:
        print(f"FFmpeg error: {e.stderr.decode()}")

def preprocess_audio(input_file):
    """
    Preprocess the audio file:
    - Converts stereo to mono.
    - Resamples to 16kHz for compatibility with Whisper.
    - Returns the processed audio as a NumPy array and sample rate.
    """
    try:
        print("Preprocessing audio...")
        audio = AudioSegment.from_file(input_file)
        audio = audio.set_channels(1)  # Convert to mono
        audio = audio.set_frame_rate(16000)  # Resample to 16kHz
        
        # Export to raw audio bytes
        raw_audio = io.BytesIO()
        audio.export(raw_audio, format="wav")
        raw_audio.seek(0)

        # Load audio into a NumPy array
        waveform = np.frombuffer(raw_audio.read(), dtype=np.int16)
        sample_rate = 16000  # Since we resampled to 16kHz
        print("Audio preprocessing completed.")
        return waveform, sample_rate
    except Exception as e:
        print(f"Error during preprocessing: {e}")
        return None, None

def transcribe_with_whisper(waveform, sample_rate):
    """
    Transcribe the given audio waveform to text using Whisper.
    """
    try:
        print("Loading Whisper model...")
        model = whisper.load_model("base")  # large
        print("Transcribing audio with Whisper...")
        
        # Whisper requires audio in float32 format, normalized to [-1, 1]
        audio_float = waveform.astype(np.float32) / 32768.0
        
        # Transcribe with Whisper
        result = model.transcribe(audio_float, fp16=False)
        return result["text"], result["segments"]
    except Exception as e:
        return f"Error during transcription: {e}", None

def generate_srt_from_segments(segments, output_srt_path):
    """Generate SRT file from Whisper segments."""
    try:
        subtitles = []
        for i, segment in enumerate(segments):
            start_time = datetime.timedelta(seconds=segment["start"])
            end_time = datetime.timedelta(seconds=segment["end"])
            subtitle = srt.Subtitle(index=i + 1, start=start_time, end=end_time, content=segment["text"])
            subtitles.append(subtitle)
        with open(output_srt_path, "w", encoding="utf-8") as f:
            f.write(srt.compose(subtitles))
        print(f"SRT file generated: {output_srt_path}")
    except Exception as e:
        print(f"Error generating SRT file: {e}")

def restore_punctuation(text):
    """Restore punctuation using a Hugging Face model."""
    try:
        punctuator = pipeline("text2text-generation", model="oliverguhr/fullstop-punctuation-multilang-large")
        restored_text = punctuator(text, max_length=1024)[0]["generated_text"]
        print("Punctuation restored successfully.")
        return restored_text
    except Exception as e:
        print(f"Error restoring punctuation: {e}")
        return text

def video_to_text_advanced(video_path, output_srt_path):
    """Main function to process video and generate accurate subtitles."""
    raw_audio_path = "temp_raw_audio.wav"
    try:
        # Extract audio from video
        extract_audio(video_path, raw_audio_path)

        # Preprocess audio for Whisper
        waveform, sample_rate = preprocess_audio(raw_audio_path)
        if waveform is None:
            print("Audio preprocessing failed.")
            return

        # Transcribe audio with Whisper
        transcript, segments = transcribe_with_whisper(waveform, sample_rate)
        if segments is None:
            print("Transcription failed.")
            return

        # Restore punctuation
        transcript_with_punctuation = restore_punctuation(transcript)

        # Generate SRT file
        generate_srt_from_segments(segments, output_srt_path)

        # Display the final transcription
        print("Final Transcription with Punctuation:\n")
        print(transcript_with_punctuation)
    finally:
        if os.path.exists(raw_audio_path):
            os.remove(raw_audio_path)

def wrap_text(text, width=50):
    """Wrap text into lines of specified width."""
    return "\n".join(textwrap.wrap(text, width))

# Braille Unicode character mapping
braille_alphabet = {
    'a': '⠁', 'b': '⠃', 'c': '⠉', 'd': '⠙', 'e': '⠑', 'f': '⠋', 'g': '⠛', 'h': '⠓', 
    'i': '⠊', 'j': '⠚', 'k': '⠅', 'l': '⠇', 'm': '⠍', 'n': '⠝', 'o': '⠕', 'p': '⠏', 
    'q': '⠟', 'r': '⠗', 's': '⠎', 't': '⠞', 'u': '⠥', 'v': '⠧', 'w': '⠺', 'x': '⠭', 
    'y': '⠽', 'z': '⠵', '1': '⠁', '2': '⠃', '3': '⠉', '4': '⠙', '5': '⠑', '6': '⠋', 
    '7': '⠛', '8': '⠓', '9': '⠊', '0': '⠚', ' ': ' ', ',': '⠂', '.': '⠲', '?': '⠦', 
    '!': '⠮', ':': '⠰', ';': '⠱', '-': '⠤', '"': '⠦', "'": '⠄'
}

def convert_to_braille(text):
    """
    Converts the given text to Braille using Unicode Braille patterns.
    """
    braille_text = ''
    for char in text.lower():
        braille_text += braille_alphabet.get(char, '')  # Get Braille character or skip if not found
    return braille_text

def process_srt_file(file_path):
    """
    Processes the SRT file, extracts the text and converts it to Braille.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    
    # Regular expression to extract subtitle text from the SRT file
    subtitle_blocks = re.findall(r'\d+\n(.*?)(?=\n\d+|$)', content, re.DOTALL)
    
    braille_subtitles = []
    
    for block in subtitle_blocks:
        # Remove timestamps and extra whitespace
        text = re.sub(r'\d{2}:\d{2}:\d{2},\d{3}', '', block).strip()
        braille_text = convert_to_braille(text)
        braille_subtitles.append(braille_text)
    
    return '\n'.join(braille_subtitles)

def save_braille_output(output_file, braille_text):
    """
    Saves the Braille text to an output file.
    """
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write(braille_text)


def wrap_text_to_width(text, max_pixel_width, font, scale, thickness):
    """Wrap text into multiple lines so each line fits within max_pixel_width."""
    words = text.split()
    lines = []
    current_line = ""

    for word in words:
        test_line = current_line + " " + word if current_line else word
        text_size = cv2.getTextSize(test_line, font, scale, thickness)[0][0]
        if text_size <= max_pixel_width:
            current_line = test_line
        else:
            lines.append(current_line)
            current_line = word

    if current_line:
        lines.append(current_line)

    return lines


def synchronize_text_with_video(video_path, srt_path):
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print("Error opening video file.")
        return

    # Load subtitles
    with open(srt_path, "r", encoding="utf-8") as f:
        subtitles = list(srt.parse(f.read()))

    # Video properties
    frame_rate = int(cap.get(cv2.CAP_PROP_FPS))
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # Font settings (scaled based on video height)
    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = height / 720 * 0.8
    thickness = 2
    color = (255, 255, 255)
    margin = 30  # left/right margin from edge

    subtitle_index = 0

    while True:
        ret, frame = cap.read()
        if not ret:
            break
        
        width = 400
        height = 600
        frame = cv2.resize(frame, (width, height))

        current_time = datetime.timedelta(seconds=cap.get(cv2.CAP_PROP_POS_MSEC) / 1000)
        if subtitle_index < len(subtitles):
            sub = subtitles[subtitle_index]
            if sub.start <= current_time <= sub.end:
                lines = wrap_text_to_width(sub.content, width - 2 * margin, font, font_scale, thickness)

                for i, line in enumerate(lines):
                    text_size = cv2.getTextSize(line, font, font_scale, thickness)[0]
                    x = (width - text_size[0]) // 2
                    y = height - 50 - (len(lines) - i - 1) * int(40 * font_scale)
                    # Draw background box
                    (text_w, text_h), baseline = cv2.getTextSize(line, font, font_scale, thickness)
                    box_coords = ((x - 10, y - text_h - 10), (x + text_w + 10, y + baseline + 10))

                    overlay = frame.copy()
                    cv2.rectangle(overlay, box_coords[0], box_coords[1], (0, 0, 0), cv2.FILLED)
                    alpha = 0.5
                    cv2.addWeighted(overlay, alpha, frame, 1 - alpha, 0, frame)

                    # Draw text
                    cv2.putText(frame, line, (x, y), font, font_scale, color, thickness, cv2.LINE_AA)


            elif current_time > sub.end:
                subtitle_index += 1

        cv2.imshow("Video with Subtitles", frame)
        if cv2.waitKey(1000 // frame_rate) & 0xFF == ord("q"):
            break

    cap.release()
    cv2.destroyAllWindows()


# Example Usage
# video_path = "best_advice_for_life.mp4"
video_path = "motivation.mp4"
output_srt_path = "output_subtitles_advanced.srt"

# Example usage
input_srt_file = output_srt_path  # Path to your SRT file
output_braille_file = 'braille_output.txt'  # Output file to store Braille text


# Process video and generate subtitles
video_to_text_advanced(video_path, output_srt_path)

synchronize_text_with_video(video_path, output_srt_path)

braille_text = process_srt_file(input_srt_file)
save_braille_output(output_braille_file, braille_text)

print("Conversion complete! Braille text saved to:", output_braille_file)


Audio extracted successfully.
Preprocessing audio...
Audio preprocessing completed.
Loading Whisper model...
Transcribing audio with Whisper...


Device set to use cpu
The model 'XLMRobertaForTokenClassification' is not supported for text2text-generation. Supported models are ['BartForConditionalGeneration', 'BigBirdPegasusForConditionalGeneration', 'BlenderbotForConditionalGeneration', 'BlenderbotSmallForConditionalGeneration', 'EncoderDecoderModel', 'FSMTForConditionalGeneration', 'GPTSanJapaneseForConditionalGeneration', 'LEDForConditionalGeneration', 'LongT5ForConditionalGeneration', 'M2M100ForConditionalGeneration', 'MarianMTModel', 'MBartForConditionalGeneration', 'MT5ForConditionalGeneration', 'MvpForConditionalGeneration', 'NllbMoeForConditionalGeneration', 'PegasusForConditionalGeneration', 'PegasusXForConditionalGeneration', 'PLBartForConditionalGeneration', 'ProphetNetForConditionalGeneration', 'Qwen2AudioForConditionalGeneration', 'SeamlessM4TForTextToText', 'SeamlessM4Tv2ForTextToText', 'SwitchTransformersForConditionalGeneration', 'T5ForConditionalGeneration', 'UMT5ForConditionalGeneration', 'XLMProphetNetForCondit

Error restoring punctuation: 'Text2TextGenerationPipeline' object has no attribute 'prefix'
SRT file generated: output_subtitles_advanced.srt
Final Transcription with Punctuation:

 Ever tried, ever failed. No matter. Try again. Fail again. Fail better.
Conversion complete! Braille text saved to: braille_output.txt


In [1]:
import whisper

model = whisper.load_model("base")
print(model.device)  # Check if it's running on CPU or GPU
print(whisper.__file__)  # Get the location of the whisper module


  checkpoint = torch.load(fp, map_location=device)


cpu
c:\Users\DELL\AppData\Local\Programs\Python\Python312\Lib\site-packages\whisper\__init__.py
