In [1]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset


# Pick the compute backend in order of preference: Apple MPS, then CUDA, then CPU.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Tie the precision to the device that was actually selected: half precision
# is used only when running on CUDA, full precision on every other backend.
torch_dtype = torch.float16 if device.startswith("cuda") else torch.float32


  warn(


In [2]:
# Checkpoint identifier shared by the model and the processor.
model_id = "openai/whisper-large-v3-turbo"

# Load the speech-to-text model with the configured dtype and move it to the
# configured device in one expression.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
).to(device)

# The processor is used below for its tokenizer, its feature extractor and
# for decoding generated ids.
processor = AutoProcessor.from_pretrained(model_id)

In [8]:
import librosa
import numpy as np

def get_video_subtitles(video_path):
    """Transcribe the media file at ``video_path`` and return only its text.

    A new ``automatic-speech-recognition`` pipeline is assembled on every call
    from the notebook-level ``model`` and ``processor`` (tokenizer and feature
    extractor), using the notebook-level ``torch_dtype`` and ``device`` and
    with ``return_timestamps=True``.

    Args:
        video_path: Path handed to the pipeline as its input.

    Returns:
        The ``"text"`` entry of the pipeline result.
    """
    asr_options = dict(
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        torch_dtype=torch_dtype,
        device=device,
        return_timestamps=True,
    )
    transcriber = pipeline("automatic-speech-recognition", **asr_options)

    # The pipeline result also carries other entries; only the text is exposed.
    return transcriber(video_path)["text"]

def get_video_subtitles_and_chunks(video_path):
    """Transcribe ``video_path`` and return the decoded text with offsets.

    The audio is loaded through ``librosa`` at 16 kHz, converted to input
    features by the notebook-level ``processor`` (without truncation), and
    passed to ``model.generate`` with timestamps and segments enabled. The
    generated ``"sequences"`` are then decoded with ``output_offsets=True``.

    Args:
        video_path: Path of the media file handed to ``librosa.load``.

    Returns:
        Whatever ``processor.batch_decode`` returns for the generated
        sequences. In this notebook's recorded output that is a list of dicts
        with a ``"text"`` key and an ``"offsets"`` key holding
        ``{"text", "timestamp"}`` entries.
    """
    waveform, sample_rate = librosa.load(video_path, sr=16000)

    # Build the model inputs, then place them on the same device and dtype as
    # the model.
    features = processor(
        waveform,
        sampling_rate=sample_rate,
        return_tensors="pt",
        truncation=False,
    ).input_features.to(device, dtype=torch_dtype)

    # With return_segments=True the result is indexed by key below rather
    # than being used directly as a tensor of ids.
    generation = model.generate(
        features,
        return_timestamps=True,
        return_segments=True,
    )

    return processor.batch_decode(
        generation["sequences"],
        skip_special_tokens=True,
        output_offsets=True,
    )
    

# Input media file to transcribe; the path is relative, so it depends on the
# notebook's working directory.
# Alternative sample (video file):
# sample = "../../data/test_data/bel.mp4"
sample = "../../data/test_data/long_conversation.mp3"

# Run the timestamped transcription; `res` is reused by a later cell.
res = get_video_subtitles_and_chunks(sample)

# Prints the entire decoded structure (full text plus offsets), which is
# very long for this sample.
print(res)

[{'text': " I don't know if you know this, but some people call you a fascist. Yeah, they do. So I figure it's all right to call them a communist. Yeah, they call me a lot worse than I call them. A lot of people listening to this, myself included, that doesn't think that Kamala is a communist. I believe you have to fight fire with fire. Politics is a dirty game. It is a dirty game. It's certainly true. How do you win at that game? They suffer from massive Trump derangement syndrome, TDS, and I don't know if it's curable from their standpoint. I think we'd probably have a better world if everybody in Congress took some mushrooms, perhaps. First of all, medical marijuana has been amazing. I've had friends and I've had others and doctors telling me that it's been absolutely amazing. The list of clients that went to the island has not been made public. Yeah, it's very interesting, isn't it? The following is a conversation with Donald Trump on this, the Lex Friedman podcast. They're getting

In [10]:
# Show the timestamped segments ('text' + 'timestamp' pairs) of the first
# decoded transcript.
res[0]["offsets"]

[{'text': " I don't know if you know this, but some people call you a fascist.",
  'timestamp': (0.0, 3.0)},
 {'text': ' Yeah, they do.', 'timestamp': (3.2600000000000002, 3.94)},
 {'text': " So I figure it's all right to call them a communist.",
  'timestamp': (4.16, 6.42)},
 {'text': ' Yeah, they call me a lot worse than I call them.',
  'timestamp': (6.6000000000000005, 8.66)},
 {'text': " A lot of people listening to this, myself included, that doesn't think that Kamala is a communist.",
  'timestamp': (8.66, 15.24)},
 {'text': ' I believe you have to fight fire with fire.',
  'timestamp': (15.540000000000001, 17.8)},
 {'text': ' Politics is a dirty game.', 'timestamp': (17.92, 19.32)},
 {'text': ' It is a dirty game.', 'timestamp': (19.46, 20.56)},
 {'text': " It's certainly true.", 'timestamp': (20.78, 21.8)},
 {'text': ' How do you win at that game?', 'timestamp': (21.98, 23.26)},
 {'text': " They suffer from massive Trump derangement syndrome, TDS, and I don't know if it's cura