In [5]:
import whisper
from moviepy.editor import VideoFileClip
import os

def extract_audio_from_video(video_path, audio_path):
    """
    Extracts the audio track of a video file and saves it as a separate file.

    Args:
        video_path: Path of the video file to read.
        audio_path: Destination path for the audio; handed to MoviePy's
            ``write_audiofile``.

    Raises:
        ValueError: If the video has no audio track.
    """
    video = VideoFileClip(video_path)
    try:
        # A clip without an audio stream has ``audio`` set to None; fail with
        # a clear message instead of an AttributeError on None.
        if video.audio is None:
            raise ValueError(f"No audio track found in {video_path}")
        video.audio.write_audiofile(audio_path)
    finally:
        # Release the underlying reader even when extraction fails.
        video.close()

def generate_subtitles(video_path, model_name="base"):
    """
    Generates an SRT subtitle file for a video using the Whisper model.

    The audio is extracted to a temporary "temp_audio.wav" in the working
    directory, transcribed, and the resulting segments are written to a file
    with the video's base name and an ".srt" extension.

    Args:
        video_path: Path of the video file to subtitle.
        model_name: Name passed to ``whisper.load_model``.

    Returns:
        None. If the model cannot be loaded, an error is printed and the
        function returns early without creating any file.
    """
    # Load the Whisper model; report and bail out if that fails
    try:
        model = whisper.load_model(model_name)
    except Exception as e:
        print(f"Error loading Whisper model: {e}")
        return

    audio_path = "temp_audio.wav"
    try:
        # Extract audio from video
        extract_audio_from_video(video_path, audio_path)

        # Transcribe audio
        print("Transcribing audio...")
        result = model.transcribe(audio_path)
    finally:
        # Clean up the temporary audio even if extraction/transcription fails
        if os.path.exists(audio_path):
            os.remove(audio_path)

    # Create SRT file: one numbered cue per Whisper segment
    srt_path = os.path.splitext(video_path)[0] + ".srt"
    with open(srt_path, "w", encoding="utf-8") as srt_file:
        for i, segment in enumerate(result["segments"]):
            start = format_timestamp(segment["start"])
            end = format_timestamp(segment["end"])
            text = segment["text"].strip()
            srt_file.write(f"{i + 1}\n{start} --> {end}\n{text}\n\n")

    print(f"Subtitles saved to {srt_path}")

def format_timestamp(seconds):
    """
    Formats a time offset in seconds as an SRT timestamp (HH:MM:SS,mmm).

    Args:
        seconds: Offset in seconds (int or float). Negative values are
            clamped to zero.

    Returns:
        The timestamp string, e.g. "01:01:01,500" for 3661.5.
    """
    # Round to whole milliseconds first: truncating (seconds % 1) * 1000
    # loses a millisecond to float error (1.001 would render as ",000").
    total_ms = max(0, int(round(seconds * 1000)))
    total_secs, milliseconds = divmod(total_ms, 1000)
    total_minutes, secs = divmod(total_secs, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02}:{minutes:02}:{secs:02},{milliseconds:03}"

# Example usage: subtitle "video.mp4" with the "base" Whisper model.
# The SRT file is written under the video's base name with an ".srt" extension.
video_path = "video.mp4"  # Replace with your video file path
generate_subtitles(video_path, model_name="base")


MoviePy - Writing audio in temp_audio.wav


                                                                      

MoviePy - Done.
Transcribing audio...
Subtitles saved to video.srt


In [8]:
import os
import torch
import librosa
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from moviepy.editor import VideoFileClip
import numpy as np

def extract_audio_from_video(video_path, audio_path):
    """
    Extracts the audio track of a video file and saves it as a separate file.

    Args:
        video_path: Path of the video file to read.
        audio_path: Destination path for the audio; handed to MoviePy's
            ``write_audiofile``.

    Raises:
        ValueError: If the video has no audio track.
    """
    video = VideoFileClip(video_path)
    try:
        # A clip without an audio stream has ``audio`` set to None; fail with
        # a clear message instead of an AttributeError on None.
        if video.audio is None:
            raise ValueError(f"No audio track found in {video_path}")
        video.audio.write_audiofile(audio_path)
    finally:
        # Release the underlying reader even when extraction fails.
        video.close()

def generate_subtitles_from_model(video_path, model, processor):
    """
    Generates an SRT subtitle file for a video using the self-trained Wav2Vec2 model.

    The audio is extracted to a temporary "temp_audio.wav", loaded at 16 kHz,
    run through the model in a single forward pass and decoded by taking the
    argmax of the logits. The segments returned by ``segment_transcription``
    are written to "<video base name>asrn_model_subtitles.srt".

    Args:
        video_path: Path of the video file to subtitle.
        model: Called as ``model(input_values)``; the result must expose ``.logits``.
        processor: Builds the model input from raw audio and decodes predicted ids.
    """
    audio_path = "temp_audio.wav"
    try:
        # Extract audio from video
        extract_audio_from_video(video_path, audio_path)

        # Load the audio file using librosa
        audio, sr = librosa.load(audio_path, sr=16000)  # Ensure 16kHz sampling rate
    finally:
        # Remove the temporary audio file even if extraction or loading fails
        if os.path.exists(audio_path):
            os.remove(audio_path)

    # Preprocess the audio and get model predictions (no gradients needed)
    input_values = processor(audio, return_tensors="pt", sampling_rate=sr).input_values
    with torch.no_grad():
        logits = model(input_values).logits

    # Decode the logits into transcription text
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0])

    # Filter out the padding tokens (if any)
    transcription = transcription.replace("<pad>", "").strip()

    # Split the prediction into timed segments; audio length is given in seconds
    segments = segment_transcription(transcription, predicted_ids, processor, audio_length=len(audio)/sr)

    # Create SRT file with timestamps: one numbered cue per segment
    srt_path = os.path.splitext(video_path)[0] + "asrn_model_subtitles.srt"
    with open(srt_path, "w", encoding="utf-8") as srt_file:
        for i, segment in enumerate(segments):
            start = format_timestamp(segment['start_time'])
            end = format_timestamp(segment['end_time'])
            text = segment['text'].strip()
            srt_file.write(f"{i + 1}\n{start} --> {end}\n{text}\n\n")

    print(f"Subtitles saved to {srt_path}")

def segment_transcription(transcription, predicted_ids, processor, audio_length, window_size=10, stride=5):
    """
    Splits the predicted tokens into fixed-size segments with approximate timestamps.

    The audio duration is divided evenly across the non-padding tokens, so
    every token is assumed to last ``audio_length / num_tokens`` seconds.

    Args:
        transcription: Decoded transcript. Currently unused; kept for
            interface compatibility.
        predicted_ids: Batch of predicted token ids; only ``predicted_ids[0]``
            is read and it must provide ``.tolist()``.
        processor: Object whose ``tokenizer.convert_ids_to_tokens`` maps ids to tokens.
        audio_length: Audio duration in seconds.
        window_size: Number of tokens per segment.
        stride: Currently unused; kept for interface compatibility.

    Returns:
        A list of dicts with "start_time", "end_time" and "text" keys. The
        list is empty when no non-padding token was predicted.
    """
    tokens = processor.tokenizer.convert_ids_to_tokens(predicted_ids[0].tolist())

    # Remove padding tokens from the transcription
    tokens = [token for token in tokens if token != "<pad>"]

    num_tokens = len(tokens)
    if num_tokens == 0:
        # Nothing but padding (e.g. silent audio): avoid dividing by zero below.
        return []

    # Calculate time per token (based on total audio length and number of tokens)
    time_per_token = audio_length / num_tokens

    # NOTE(review): tokens are joined with spaces as-is; repeated tokens and any
    # word-delimiter token are not collapsed here - confirm this is intended.
    segments = []
    start_time = 0
    for i in range(0, num_tokens, window_size):
        end_time = start_time + (window_size * time_per_token)  # Approximate based on window size

        segment_text = " ".join(tokens[i:i + window_size]).strip()

        segments.append({
            "start_time": start_time,
            "end_time": min(end_time, audio_length),  # Ensure the end time doesn't exceed audio length
            "text": segment_text
        })

        start_time = end_time

    return segments

def format_timestamp(seconds):
    """
    Formats a time offset in seconds as an SRT timestamp (HH:MM:SS,mmm).

    Args:
        seconds: Offset in seconds (int or float). Negative values are
            clamped to zero.

    Returns:
        The timestamp string, e.g. "01:01:01,500" for 3661.5.
    """
    # Round to whole milliseconds first: truncating (seconds % 1) * 1000
    # loses a millisecond to float error (1.001 would render as ",000").
    total_ms = max(0, int(round(seconds * 1000)))
    total_secs, milliseconds = divmod(total_ms, 1000)
    total_minutes, secs = divmod(total_secs, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02}:{minutes:02}:{secs:02},{milliseconds:03}"

# Load your self-trained model and processor from the local
# "./subtitle-generator" directory (relative to the working directory).
processor = Wav2Vec2Processor.from_pretrained("./subtitle-generator")
model = Wav2Vec2ForCTC.from_pretrained("./subtitle-generator")

# Example usage: subtitle "video2.mp4" with the model loaded above.
video_path = "video2.mp4"  # Replace with your video file path
generate_subtitles_from_model(video_path, model, processor)


MoviePy - Writing audio in temp_audio.wav


                                                                      

MoviePy - Done.
Subtitles saved to video2asrn_model_subtitles.srt


In [4]:
import whisper
from moviepy.editor import VideoFileClip
import os
from transformers import pipeline

def extract_audio_from_video(video_path, audio_path):
    """
    Extracts the audio track of a video file and saves it as a separate file.

    Args:
        video_path: Path of the video file to read.
        audio_path: Destination path for the audio; handed to MoviePy's
            ``write_audiofile``.

    Raises:
        ValueError: If the video has no audio track.
    """
    video = VideoFileClip(video_path)
    try:
        # A clip without an audio stream has ``audio`` set to None; fail with
        # a clear message instead of an AttributeError on None.
        if video.audio is None:
            raise ValueError(f"No audio track found in {video_path}")
        video.audio.write_audiofile(audio_path)
    finally:
        # Release the underlying reader even when extraction fails.
        video.close()

def format_timestamp(seconds):
    """
    Formats a time offset in seconds as an SRT timestamp (HH:MM:SS,mmm).

    Args:
        seconds: Offset in seconds (int or float). Negative values are
            clamped to zero.

    Returns:
        The timestamp string, e.g. "01:01:01,500" for 3661.5.
    """
    # Round to whole milliseconds first: truncating (seconds % 1) * 1000
    # loses a millisecond to float error (1.001 would render as ",000").
    total_ms = max(0, int(round(seconds * 1000)))
    total_secs, milliseconds = divmod(total_ms, 1000)
    total_minutes, secs = divmod(total_secs, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02}:{minutes:02}:{secs:02},{milliseconds:03}"

def generate_subtitles(video_path, model_path, processor_path, whisper_model_name="base"):
    """
    Generates an SRT subtitle file for a video from Whisper's transcription.

    The fine-tuned Wav2Vec2 model is also run on the audio, but its transcript
    is not used when writing the subtitles: the SRT cues come solely from
    Whisper's segments. The output is "<video base name>asr.srt".

    Args:
        video_path: Path of the video file to subtitle.
        model_path: Model location passed to the Transformers ASR pipeline.
        processor_path: Tokenizer location passed to the Transformers ASR pipeline.
        whisper_model_name: Name passed to ``whisper.load_model``.

    Returns:
        None. If the Whisper model cannot be loaded, an error is printed and
        the function returns early without creating any file.
    """
    # Load Whisper model; report and bail out if that fails
    try:
        whisper_model = whisper.load_model(whisper_model_name)
    except Exception as e:
        print(f"Error loading Whisper model: {e}")
        return

    # Load the fine-tuned Wav2Vec2 model and processor
    asr_pipeline = pipeline(
        "automatic-speech-recognition",
        model=model_path,
        tokenizer=processor_path
    )

    audio_path = "temp_audio.wav"
    try:
        # Extract audio from video
        extract_audio_from_video(video_path, audio_path)

        # Step 1: Transcribe audio with Wav2Vec2 model
        print("Transcribing audio using self-trained Wav2Vec2 model...")
        result_wav2vec = asr_pipeline(audio_path)
        # Lowercased transcript; computed but not consumed by the steps below.
        wav2vec_transcript = result_wav2vec['text'].lower()

        # Step 2: Transcribe with Whisper, whose segments provide text and timing
        print("Transcribing audio using Whisper model for correction...")
        result_whisper = whisper_model.transcribe(audio_path)
    finally:
        # Clean up temporary audio file even if a transcription step fails
        if os.path.exists(audio_path):
            os.remove(audio_path)

    # Create SRT file path
    srt_path = os.path.splitext(video_path)[0]+"asr" + ".srt"

    # Step 3: Write one numbered SRT cue per Whisper segment
    with open(srt_path, "w", encoding="utf-8") as srt_file:
        for i, segment in enumerate(result_whisper["segments"]):
            start = format_timestamp(segment["start"])
            end = format_timestamp(segment["end"])
            text = segment["text"].strip()
            srt_file.write(f"{i + 1}\n{start} --> {end}\n{text}\n\n")

    print(f"Subtitles saved to {srt_path}")

# Example usage
# NOTE(review): the model/processor paths below are absolute, machine-specific
# Windows paths; other cells in this notebook use "./subtitle-generator" -
# presumably the same directory, confirm before sharing the notebook.
video_path = "video2.mp4"  # Replace with your video file path
model_path = "E:\\video_player\\subtitle-generator"  # Path to the fine-tuned Wav2Vec2 model
processor_path = "E:\\video_player\\subtitle-generator"  # Path to the saved processor
generate_subtitles(video_path, model_path, processor_path)



MoviePy - Writing audio in temp_audio.wav


                                                                      

MoviePy - Done.
Transcribing audio using self-trained Wav2Vec2 model...
Transcribing audio using Whisper model for correction...
Subtitles saved to video2asr.srt


Main Code

In [None]:
import whisper
from moviepy.editor import VideoFileClip
import os
from transformers import pipeline

def extract_audio_from_video(video_path, audio_path):
    """
    Extracts the audio track of a video file and saves it as a separate file.

    Args:
        video_path: Path of the video file to read.
        audio_path: Destination path for the audio; handed to MoviePy's
            ``write_audiofile``.

    Raises:
        ValueError: If the video has no audio track.
    """
    video = VideoFileClip(video_path)
    try:
        # A clip without an audio stream has ``audio`` set to None; fail with
        # a clear message instead of an AttributeError on None.
        if video.audio is None:
            raise ValueError(f"No audio track found in {video_path}")
        video.audio.write_audiofile(audio_path)
    finally:
        # Release the underlying reader even when extraction fails.
        video.close()

def format_timestamp(seconds):
    """
    Formats a time offset in seconds as an SRT timestamp (HH:MM:SS,mmm).

    Args:
        seconds: Offset in seconds (int or float). Negative values are
            clamped to zero.

    Returns:
        The timestamp string, e.g. "01:01:01,500" for 3661.5.
    """
    # Round to whole milliseconds first: truncating (seconds % 1) * 1000
    # loses a millisecond to float error (1.001 would render as ",000").
    total_ms = max(0, int(round(seconds * 1000)))
    total_secs, milliseconds = divmod(total_ms, 1000)
    total_minutes, secs = divmod(total_secs, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02}:{minutes:02}:{secs:02},{milliseconds:03}"

def generate_subtitles(video_path, model_path, processor_path, whisper_model_name="base"):
    """
    Generates an SRT subtitle file for a video from Whisper's transcription.

    The fine-tuned Wav2Vec2 model is run first and its lowercased transcript
    is printed for comparison. The SRT cues themselves come solely from
    Whisper's segments, which are also printed. The output file has the
    video's base name with an ".srt" extension.

    Args:
        video_path: Path of the video file to subtitle.
        model_path: Model location passed to the Transformers ASR pipeline.
        processor_path: Tokenizer location passed to the Transformers ASR pipeline.
        whisper_model_name: Name passed to ``whisper.load_model``.

    Returns:
        None. If the Whisper model cannot be loaded, an error is printed and
        the function returns early without creating any file.
    """
    # Load Whisper model; report and bail out if that fails
    try:
        whisper_model = whisper.load_model(whisper_model_name)
    except Exception as e:
        print(f"Error loading Whisper model: {e}")
        return

    # Load the fine-tuned Wav2Vec2 model and processor
    asr_pipeline = pipeline(
        "automatic-speech-recognition",
        model=model_path,
        tokenizer=processor_path
    )

    audio_path = "temp_audio.wav"
    try:
        # Extract audio from video
        extract_audio_from_video(video_path, audio_path)

        # Step 1: Transcribe audio with Wav2Vec2 model
        print("Transcribing audio using self-trained Wav2Vec2 model...")
        result_wav2vec = asr_pipeline(audio_path)
        wav2vec_transcript = result_wav2vec['text'].lower()  # Convert transcript to lowercase

        # Display the transcript from the self-trained Wav2Vec2 model
        print("\nTranscript from self-trained Wav2Vec2 model:")
        print(wav2vec_transcript)

        # Step 2: Transcribe with Whisper, whose segments provide text and timing
        print("\nTranscribing audio using Whisper model for correction...")
        result_whisper = whisper_model.transcribe(audio_path)
    finally:
        # Clean up temporary audio file even if a transcription step fails
        if os.path.exists(audio_path):
            os.remove(audio_path)

    # Display the subtitles produced by Whisper
    print("\nCorrected subtitles from Whisper model:")
    for segment in result_whisper["segments"]:
        print(f"{format_timestamp(segment['start'])} --> {format_timestamp(segment['end'])}: {segment['text'].strip()}")

    # Create SRT file path
    srt_path = os.path.splitext(video_path)[0] + ".srt"

    # Step 3: Write one numbered SRT cue per Whisper segment
    with open(srt_path, "w", encoding="utf-8") as srt_file:
        for i, segment in enumerate(result_whisper["segments"]):
            start = format_timestamp(segment["start"])
            end = format_timestamp(segment["end"])
            text = segment["text"].strip()
            srt_file.write(f"{i + 1}\n{start} --> {end}\n{text}\n\n")

    print(f"\nSubtitles saved to {srt_path}")

# Example usage
# NOTE(review): the model/processor paths below are absolute, machine-specific
# Windows paths; other cells in this notebook use "./subtitle-generator" -
# presumably the same directory, confirm before sharing the notebook.
video_path = "video2.mp4"  # Replace with your video file path
model_path = "E:\\video_player\\subtitle-generator"  # Path to the fine-tuned Wav2Vec2 model
processor_path = "E:\\video_player\\subtitle-generator"  # Path to the saved processor
generate_subtitles(video_path, model_path, processor_path)



MoviePy - Writing audio in temp_audio.wav


                                                                      

MoviePy - Done.
Transcribing audio using self-trained Wav2Vec2 model...

Transcript from self-trained Wav2Vec2 model:
so what's new mark how is your new job going to be honest i can't complain i really love the company that i am working for my co workers are all really friendly and helpful they really help me feel welcome it's a really energetic and fun atmosphere my boss is hilarius and he's really flexible really how so he allows me to come in when i want and make my own hours i can also leave early if i start early there is no real dress coat either i can wer genes and a tea shirt if i want i can even wear shorts in the summer wow it sounds really cool i can't stand wearing a suit every day which do you prefer working late or finishing early i prefer finishing early i really enjoy the morning i love getting up early and going for a rant there is nothing like watching the sunrise while drinking my morning coffee really i am opposite i love sleeping in iam most alert in the evenings i

In [3]:
def convert_srt_to_vtt(srt_file_path, vtt_file_path):
    """
    Converts an SRT subtitle file to WebVTT.

    The output starts with the "WEBVTT" header followed by the SRT content,
    with the decimal comma of each cue timestamp replaced by a dot. Errors
    are reported with a printed message rather than raised.

    Args:
        srt_file_path: Path of the UTF-8 SRT file to read (a leading BOM is tolerated).
        vtt_file_path: Path of the VTT file to write (UTF-8).
    """
    import re  # local import: this cell has no import block of its own

    # Matches the "HH:MM:SS,mmm" form so only timestamp commas are rewritten.
    timestamp_comma = re.compile(r'(\d+:\d{2}:\d{2}),(\d{1,3})')

    try:
        # utf-8-sig strips a BOM if present; otherwise the BOM would end up
        # after the header instead of at the start of the output file.
        with open(srt_file_path, 'r', encoding='utf-8-sig') as srt_file:
            srt_content = srt_file.readlines()

        vtt_content = ['WEBVTT\n\n']  # VTT files need to start with "WEBVTT"

        for line in srt_content:
            # Replace commas with dots in timestamps (cue timing lines only)
            if '-->' in line:
                line = timestamp_comma.sub(r'\1.\2', line)
            vtt_content.append(line)

        with open(vtt_file_path, 'w', encoding='utf-8') as vtt_file:
            vtt_file.writelines(vtt_content)

        print(f"Successfully converted {srt_file_path} to {vtt_file_path}")

    except FileNotFoundError:
        print("Error: SRT file not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

# Usage example: convert 'video2.srt' into the WebVTT file 'video2.vtt'.
srt_file = 'video2.srt'  # Replace with your .srt file path
vtt_file = 'video2.vtt'  # Replace with desired .vtt file path
convert_srt_to_vtt(srt_file, vtt_file)


Successfully converted video2.srt to video2.vtt


In [2]:
import whisper
from moviepy.editor import VideoFileClip
import os
from transformers import pipeline

def extract_audio_from_video(video_path, audio_path):
    """
    Extracts the audio track of a video file and saves it as a separate file.

    Args:
        video_path: Path of the video file to read.
        audio_path: Destination path for the audio; handed to MoviePy's
            ``write_audiofile``.

    Raises:
        ValueError: If the video has no audio track.
    """
    video = VideoFileClip(video_path)
    try:
        # A clip without an audio stream has ``audio`` set to None; fail with
        # a clear message instead of an AttributeError on None.
        if video.audio is None:
            raise ValueError(f"No audio track found in {video_path}")
        video.audio.write_audiofile(audio_path)
    finally:
        # Release the underlying reader even when extraction fails.
        video.close()

def format_timestamp(seconds):
    """
    Formats a time offset in seconds as an SRT timestamp (HH:MM:SS,mmm).

    Args:
        seconds: Offset in seconds (int or float). Negative values are
            clamped to zero.

    Returns:
        The timestamp string, e.g. "01:01:01,500" for 3661.5.
    """
    # Round to whole milliseconds first: truncating (seconds % 1) * 1000
    # loses a millisecond to float error (1.001 would render as ",000").
    total_ms = max(0, int(round(seconds * 1000)))
    total_secs, milliseconds = divmod(total_ms, 1000)
    total_minutes, secs = divmod(total_secs, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02}:{minutes:02}:{secs:02},{milliseconds:03}"

def _words_in_window(word_chunks, start, end):
    """Joins (lowercased) the words whose start timestamp lies in [start, end)."""
    picked = []
    for chunk in word_chunks:
        word_start = chunk["timestamp"][0]
        if word_start is not None and start <= word_start < end:
            picked.append(chunk["text"].strip().lower())
    return " ".join(word for word in picked if word)


def generate_subtitles(video_path, model_path, processor_path, whisper_model_name="base"):
    """
    Generates an SRT subtitle file from Whisper's transcription and logs how it
    differs from the fine-tuned Wav2Vec2 transcript.

    For each Whisper segment, the Wav2Vec2 words whose start time falls inside
    the segment are collected; when that text differs from Whisper's, the pair
    is recorded. The SRT goes to "<video base name>.srt" and the recorded
    differences to "output.txt".

    Args:
        video_path: Path of the video file to subtitle.
        model_path: Model location passed to the Transformers ASR pipeline.
        processor_path: Tokenizer location passed to the Transformers ASR pipeline.
        whisper_model_name: Name passed to ``whisper.load_model``.

    Returns:
        None. If the Whisper model cannot be loaded, an error is printed and
        the function returns early without creating any file.
    """
    # Load Whisper model; report and bail out if that fails
    try:
        whisper_model = whisper.load_model(whisper_model_name)
    except Exception as e:
        print(f"Error loading Whisper model: {e}")
        return

    # Load the fine-tuned Wav2Vec2 model and processor
    asr_pipeline = pipeline(
        "automatic-speech-recognition",
        model=model_path,
        tokenizer=processor_path
    )

    audio_path = "temp_audio.wav"
    try:
        # Extract audio from video
        extract_audio_from_video(video_path, audio_path)

        # Step 1: Transcribe audio with Wav2Vec2 model. Word-level timestamps are
        # requested so the transcript can be aligned to Whisper's segments by
        # time; slicing the transcript string by milliseconds is not meaningful.
        print("Transcribing audio using self-trained Wav2Vec2 model...")
        result_wav2vec = asr_pipeline(audio_path, return_timestamps="word")
        wav2vec_transcript = result_wav2vec['text'].lower()  # Convert transcript to lowercase
        # Expected per the pipeline docs: [{"text": ..., "timestamp": (start, end)}, ...]
        word_chunks = result_wav2vec.get("chunks", [])

        # Display the transcript from the self-trained Wav2Vec2 model
        print("\nTranscript from self-trained Wav2Vec2 model:")
        print(wav2vec_transcript)

        # Step 2: Transcribe with Whisper, whose segments provide text and timing
        print("\nTranscribing audio using Whisper model for correction...")
        result_whisper = whisper_model.transcribe(audio_path)
    finally:
        # Clean up temporary audio file even if a transcription step fails
        if os.path.exists(audio_path):
            os.remove(audio_path)

    # Differences between the Wav2Vec2 text and Whisper's text, per segment
    changes = []

    # Output paths
    srt_path = os.path.splitext(video_path)[0] + ".srt"
    output_txt_path = "output.txt"

    # Step 3: Generate subtitles using Whisper's segments
    with open(srt_path, "w", encoding="utf-8") as srt_file, open(output_txt_path, "w", encoding="utf-8") as txt_file:
        for i, segment in enumerate(result_whisper["segments"]):
            start = format_timestamp(segment['start'])
            end = format_timestamp(segment['end'])
            whisper_text = segment["text"].strip()

            # Wav2Vec2 words spoken during this Whisper segment
            wav2vec_segment = _words_in_window(word_chunks, segment['start'], segment['end'])

            # Check for changes and save them
            if wav2vec_segment != whisper_text:
                changes.append(f"Original: '{wav2vec_segment}' --> Corrected: '{whisper_text}'")

            # Write SRT segment using Whisper's timing and text
            srt_file.write(f"{i + 1}\n{start} --> {end}\n{whisper_text}\n\n")

        # Save the changes made by Whisper to the text file
        txt_file.write("Changes made by Whisper model:\n")
        for change in changes:
            txt_file.write(change + "\n")

    print(f"\nSubtitles saved to {srt_path}")
    print(f"\nChanges saved to {output_txt_path}")

# Example usage: subtitle "video3.mp4" using the model stored in
# "./subtitle-generator" (relative to the working directory).
video_path = "video3.mp4"  # Replace with your video file path
model_path = "./subtitle-generator"  # Path to the fine-tuned Wav2Vec2 model
processor_path = "./subtitle-generator"  # Path to the saved processor
generate_subtitles(video_path, model_path, processor_path)


MoviePy - Writing audio in temp_audio.wav


                                                                      

MoviePy - Done.
Transcribing audio using self-trained Wav2Vec2 model...

Transcript from self-trained Wav2Vec2 model:
you probably heard the story of how redit is selling their data to gugle for a i scraping how automatic is selling all the word present tumbler data and now how financial times to selling their data to opena i and is doesn't sound right because we've been told that these  i companies are taking old is data and basically invalidating the data sourcsis ter coming from by sharing the data so you don't get to the source so what exactly's happening here this is more complicated than it sounds you see they're two different things these a i companies can do with the data one miss use it for training in o building nou models basem adata and theyre definitely doing that in some respect but the other one is to use the data ask a grounded source and that's really interesting so thet me explain here sa very simplifid drawing of what happens when you use a a i system like chache b t

In [4]:
pip freeze requirement.txt

absl-py==2.1.0
accelerate==1.2.1
addict==2.4.0
aiohappyeyeballs==2.4.4
aiohttp==3.11.11
aiosignal==1.3.2
annotated-types==0.7.0
anyio==4.4.0
argon2-cffi==23.1.0
argon2-cffi-bindings==21.2.0
arrow==1.3.0
astral==3.2
asttokens==2.4.1
astunparse==1.6.3
async-lru==2.0.4
attrs==23.2.0
audioread==3.0.1
Babel==2.15.0
beautifulsoup4==4.12.3
bleach==6.1.0
cachetools==5.5.0
certifi==2024.6.2
cffi==1.16.0
chardet==5.2.0
charset-normalizer==3.3.2
click==8.1.8
cloudpickle==3.1.0
cmdstanpy==1.2.4
colorama==0.4.6
coloredlogs==15.0.1
comm==0.2.2
contourpy==1.3.0
cycler==0.12.1
Cython==3.0.11
datasets==3.2.0
debugpy==1.8.1
decorator==4.4.2
defusedxml==0.7.1
dill==0.3.8
distro==1.9.0
docx2pdf==0.1.8
executing==2.0.1
faiss-cpu==1.9.0.post1
fastjsonschema==2.19.1
filelock==3.16.0
flatbuffers==24.3.25
fonttools==4.54.1
fqdn==1.5.1
frozenlist==1.5.0
fsspec==2024.9.0
gast==0.6.0
geopandas==1.0.1
google-api-core==2.19.2
google-auth==2.34.0
google-cloud==0.34.0
google-cloud-speech==2.27.0
google-pasta==0.2.0
g

: 