#### Download audio and video using pytube

In [None]:
from pytube import YouTube

# URL of the YouTube video to download
video_url = "https://www.youtube.com/watch?v=lks-x8ZM554"

# Directory (relative to the working directory) that receives both downloads
media_directory = "jeff dean leaders connect bangalore"

# Create a YouTube object
yt = YouTube(video_url)

# Get only the audio stream; `.first()` yields None when nothing matches,
# so fail with a clear message instead of an AttributeError on `.download`.
audio_stream = yt.streams.filter(only_audio=True).first()
if audio_stream is None:
    raise RuntimeError(f"No audio-only stream found for {video_url}")

# Download the audio stream
audio_stream.download(output_path=f"./{media_directory}", filename="audio_full_talk.mp4")

# Get the video stream
# NOTE(review): `streams.first()` applies no filter, so the resolution/format
# picked depends on pytube's stream ordering — confirm it is the intended one.
video_stream = yt.streams.first()
if video_stream is None:
    raise RuntimeError(f"No stream found for {video_url}")

# Download the video stream
video_stream.download(output_path=f"./{media_directory}", filename="jeff_dean_leaders_connect_full_talk.mp4")

#### Extract relevant portion from video using ffmpeg

In [None]:
# NOTE(review): `! cd` runs in a temporary subshell, so it would not change the
# kernel's working directory for later cells; the `%cd` magic does. Later cells
# open "audio_fireside_chat.mp4" by relative path, so the working directory matters.
# ! cd jeff\ dean\ leaders\ connect\ bangalore

In [None]:
# Trim the full-talk video and audio to the 00:46:00-01:30:00 window.
# `-c copy` copies the streams without re-encoding.
# NOTE(review): with stream copy the cut points presumably snap to keyframes,
# so the clip may not start exactly at 00:46:00 — verify.
# ! ffmpeg -i jeff_dean_leaders_connect_full_talk.mp4 -ss 00:46:00 -to 01:30:00 -c copy jeff_dean_leaders_connect_fireside_chat.mp4

# ! ffmpeg -i audio_full_talk.mp4 -ss 00:46:00 -to 01:30:00 -c copy audio_fireside_chat.mp4

#### Generate transcript using whisperX

Installation instructions are available at https://github.com/m-bain/whisperX

In [18]:
import whisperx
import gc 

# Settings for this run. Inference runs on CPU here; the memory hints below
# matter mostly when `device` is switched to a GPU.
device = "cpu"  # device string passed to every whisperx call below
audio_file = "audio_fireside_chat.mp4"  # relative path: resolved against the kernel's working directory
batch_size = 4 # reduce if low on memory
compute_type = "int8" # already the low-memory setting (may reduce accuracy)

# 1. Transcribe with original whisper (batched)
model = whisperx.load_model("base", device, compute_type=compute_type)

# save model to local path (optional)
# model_dir = "/path/"
# model = whisperx.load_model("large-v2", device, compute_type=compute_type, download_root=model_dir)

audio = whisperx.load_audio(audio_file)
result = model.transcribe(audio, batch_size=batch_size)
print(result["segments"]) # before alignment

# free the transcription model if low on memory: drop the reference first, then
# collect (the CUDA cache call needs `import torch` and only applies on GPU)
# del model; gc.collect(); torch.cuda.empty_cache()

# 2. Align whisper output
# `result["language"]` is read before `result` is rebound to the aligned output
# on the next line; later cells see only the aligned `result`.
model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)

print(result["segments"]) # after alignment

# free the alignment model if low on memory (same caveats as above)
# del model_a; gc.collect(); torch.cuda.empty_cache()

Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.2.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../.cache/torch/whisperx-vad-segmentation.bin`


No language specified, language will be first be detected for each audio file (increases inference time).
Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.0.0. Bad things might happen unless you revert torch to 1.x.
Detected language: en (0.90) in first 30s of audio...
[{'text': " Lower to you. Thank you. Hello, everybody. Welcome. Welcome again, Jeff. Promote. I hope that you will indulge me as I introduce you to the audience with a quick set of rapid fire questions. Jeff, you've been in Bangalore day or two. I hope you're prepared with your answers. Is that okay?", 'start': 0.009, 'end': 25.52}, {'text': " Okay, first one. Coffee or tea? I've had both today, but I'm generally coffee. Coffee or tea? Okay, second one. Masala dosa or avocado toast? Masala dosa. Good one. Is there something called avocado toast? You're a morning person or an evening person. N

#### Generate speaker labels using whisperX

In [19]:
# 3. Assign speaker labels
import os
from getpass import getpass

# Never hardcode the Hugging Face access token in the notebook: read it from
# the HF_TOKEN environment variable, falling back to an interactive prompt.
# NOTE: the token that was previously committed in this cell is exposed and
# should be revoked in the Hugging Face account settings.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_token, device=device)

# add min/max number of speakers if known
diarize_segments = diarize_model(audio, min_speakers=3)
# diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers)

# Merge the diarization turns into the aligned transcript
result = whisperx.assign_word_speakers(diarize_segments, result)
print(diarize_segments)
print(result["segments"]) # segments are now assigned speaker IDs

                               segment label     speaker        start  \
0    [ 00:00:00.008 -->  00:00:00.993]     A  SPEAKER_02     0.008489   
1    [ 00:00:00.993 -->  00:00:01.349]     B  SPEAKER_01     0.993209   
2    [ 00:00:08.174 -->  00:00:12.011]     C  SPEAKER_01     8.174873   
3    [ 00:00:12.877 -->  00:00:15.509]     D  SPEAKER_01    12.877759   
4    [ 00:00:16.307 -->  00:00:23.064]     E  SPEAKER_01    16.307301   
..                                 ...   ...         ...          ...   
595  [ 00:43:38.938 -->  00:43:46.358]    VX  SPEAKER_00  2618.938879   
596  [ 00:43:47.156 -->  00:43:48.837]    VY  SPEAKER_02  2627.156197   
597  [ 00:43:49.414 -->  00:43:49.465]    VZ  SPEAKER_02  2629.414261   
598  [ 00:43:49.465 -->  00:43:49.821]    WA  SPEAKER_00  2629.465195   
599  [ 00:43:51.078 -->  00:44:00.144]    WB  SPEAKER_01  2631.078098   

             end  intersection        union  
0       0.993209  -2638.855791  2640.000511  
1       1.349745  -2638.499255 

- Time for transcription - 2 minutes 45 seconds
- Time for wav2vec2 alignment (word-level timestamp generation) - 6 minutes 37 seconds [*9 minutes 22 seconds total, minus the time for transcription*]
- Time for diarization - 46 minutes 42 seconds


In [16]:
# result_2 = result.copy()

In [22]:
# Preview a slice of the diarized transcript as "SPEAKER: text".
# NOTE(review): a segment presumably lacks the "speaker" key when no
# diarization turn overlaps it, so use a placeholder instead of raising
# KeyError — confirm against whisperx.assign_word_speakers.
for segment in result['segments'][10:20]:
    print(f"{segment.get('speaker', 'UNKNOWN')}: {segment['text']}")
    print("\n")

SPEAKER_01:  Okay, first one.


SPEAKER_01: Coffee or tea?


SPEAKER_00: I've had both today, but I'm generally coffee.


SPEAKER_01: Coffee or tea?


SPEAKER_01: Okay, second one.


SPEAKER_01: Masala dosa or avocado toast?


SPEAKER_00: Masala dosa.


SPEAKER_01: Good one.


SPEAKER_02: Is there something called avocado toast?


SPEAKER_01: You're a morning person or an evening person.




#### Create SRT subtitle file

In [25]:
def format_time(seconds):
    """Convert a time offset in seconds to the SRT timestamp format HH:MM:SS,mmm.

    Args:
        seconds: Offset in seconds (int or float).

    Returns:
        A string such as "01:01:01,500".
    """
    # Work in whole milliseconds. Rounding once up front avoids truncating the
    # float fractional part (25.52 used to render as ",519") and lets a value
    # such as 59.9996 carry into the next second/minute.
    total_ms = int(round(float(seconds) * 1000))
    total_seconds, milliseconds = divmod(total_ms, 1000)
    hours, remainder = divmod(total_seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    return f"{hours:02}:{minutes:02}:{secs:02},{milliseconds:03}"

def convert_to_srt(segments):
    """Render transcript segments as the text of an SRT subtitle file.

    Each segment is indexed by "start", "end" and "text"; the start and end
    values are formatted with format_time. Cues are numbered from 1 in input
    order and each is followed by a blank line.
    """
    cues = []
    for index, cue in enumerate(segments, start=1):
        timing = f"{format_time(cue['start'])} --> {format_time(cue['end'])}"
        cues.append(f"{index}\n{timing}\n{cue['text']}\n\n")
    return "".join(cues)

import os

# Convert the speech to text output to SRT format
srt_content = convert_to_srt(result["segments"])

# Name the subtitle file after the audio file. splitext strips whatever
# extension is present instead of assuming it is exactly four characters long.
srt_filename = f"{os.path.splitext(audio_file)[0]}_subtitle.srt"

# Write the SRT content to a file. The encoding is set explicitly so that
# non-ASCII transcript text does not depend on the platform default.
with open(srt_filename, "w", encoding="utf-8") as srt_file:
    srt_file.write(srt_content)

print(f"SRT file '{srt_filename}' has been created.")

SRT file 'audio_fireside_chat_subtitle.srt' has been created.


#### Burn subtitles to the video file

In [None]:
# Render the generated SRT onto the trimmed video with the `subtitles` video
# filter; `-c:a copy` keeps the audio stream as-is.
# !ffmpeg -i jeff_dean_leaders_connect_fireside_chat.mp4 -vf "subtitles=audio_fireside_chat_subtitle.srt" -c:a copy -max_muxing_queue_size 2048 jeff_dean_leaders_connect_fireside_chat_with_subtitles.mp4