In [2]:
from pytubefix import YouTube
from pytubefix.cli import on_progress
from pyannote.audio import Pipeline
from pydub import AudioSegment
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import torch
import os
import whisper
import warnings
import pandas as pd
import accelerate
import pprint 
# Notebook-wide configuration.
# Let pandas render up to 500 rows before truncating a displayed frame.
pd.options.display.max_rows = 500
# Silence every Python warning for the rest of the session.
warnings.simplefilter('ignore')
 

In [3]:
# Load the speech metadata spreadsheet from the working directory.
# Later cells read its "url", "date", "state" and "trimming_needed" columns.
speeches = pd.read_excel("speeches.xlsx")

In [3]:
#script to save audio files
def save_audio(url, output_path, filename):
    """Download the audio track of a YouTube video and store it as an MP3.

    The audio-only stream is first downloaded to a temporary
    ``<filename>.webm`` file inside ``output_path`` and then re-encoded to
    ``<filename>.mp3`` in the same directory. The temporary file is removed
    whether or not the conversion succeeds.

    Failures are deliberately best-effort: any exception is reported on
    stdout and swallowed so that a batch of downloads can continue.

    Args:
        url: Address of the YouTube video.
        output_path: Directory that receives the MP3 (created if missing).
        filename: Base name of the output file, without extension.

    Returns:
        The path of the written MP3 on success, ``None`` if the download or
        the conversion failed.
    """
    # Stays None until a download has been attempted, so the cleanup below
    # never touches a file this call did not create.
    temp_filepath = None
    try:
        yt = YouTube(url)
        print(f"Downloading: {yt.title}")

        # Get audio-only stream; fail with a readable message when the
        # video offers none instead of an AttributeError on None.
        ys = yt.streams.get_audio_only()
        if ys is None:
            raise RuntimeError("No audio-only stream available for this video.")

        # Ensure output directory exists
        os.makedirs(output_path, exist_ok=True)

        # Temporary file name (original format)
        temp_filename = f"{filename}.webm"
        temp_filepath = os.path.join(output_path, temp_filename)

        # Download audio in original format
        ys.download(output_path=output_path, filename=temp_filename)

        # Convert to MP3
        final_filename = f"{filename}.mp3"
        final_filepath = os.path.join(output_path, final_filename)
        audio = AudioSegment.from_file(temp_filepath)
        audio.export(final_filepath, format="mp3")

        print(f"Downloaded and converted to MP3: {final_filename}")
        return final_filepath

    except Exception as e:
        print("----- Download failed: -----")
        print(f"URL: {url}")
        print(f"Error: {e}")
        return None

    finally:
        # Remove the temporary file even when the conversion raised, so
        # failed runs do not leave partial downloads next to the MP3s.
        if temp_filepath is not None and os.path.exists(temp_filepath):
            try:
                os.remove(temp_filepath)
            except OSError as cleanup_error:
                print(f"Could not remove temporary file {temp_filepath}: {cleanup_error}")


In [8]:
# Plain Python list of the video links, in spreadsheet row order.
url_list = speeches["url"].tolist()

In [None]:

# Download every speech as ./audio_files/speech_<n>_<date>_<state>.mp3,
# where <n> is the 1-based row number in the spreadsheet.
output_path = "./audio_files"  # same target folder for every download

# Walk the columns positionally (row order) rather than by index label, so
# the numbering does not depend on `speeches` having a default RangeIndex.
for i, (url, date, state) in enumerate(
    zip(url_list, speeches["date"], speeches["state"]), start=1
):
    # Plain names inside the f-string: reusing double quotes inside a
    # replacement field is a SyntaxError before Python 3.12.
    filename = f"speech_{i}_{date}_{state}"
    save_audio(url, output_path=output_path, filename=filename)

Downloading: 🇺🇸 Donald Trump at Turning Point USA Rally in Phoenix, Arizona (June 6, 2024)
Downloaded and converted to MP3: speech_1_"2024-06-06"_AZ.mp3
Downloading: 🇺🇸 Donald Trump | Save America Rally at Sunset Park in Las Vegas, Nevada (June 9, 2024) [LIVE]
Downloaded and converted to MP3: speech_2_"2024-06-09"_NV.mp3
Downloading: 🇺🇸 Donald Trump | The People’s Convention by Turning Point USA in Detroit, Michigan (June 15, 2024)
Downloaded and converted to MP3: speech_3_"2024-06-15"_MI.mp3
Downloading: 🇺🇸 Donald Trump | Full Speech at Rally in Racine, Wisconsin (Subtitles) [Multilanguage CC]
Downloaded and converted to MP3: speech_4_"2024-06-18"_WI.mp3
Downloading: Donald Trump's speech Philadelphia June 22, 2024 I Temple University I @DonaldJTrumpforPresident
Downloaded and converted to MP3: speech_5_"2024-06-22"_PA.mp3
Downloading: Donald Trump campaign rally in Chesapeake, Virginia, one day after first 2024 presidential debate
Downloaded and converted to MP3: speech_6_"2024-06-28

In [4]:
#initiate model to script the audio files
# Use the first CUDA GPU in half precision when one is available;
# otherwise fall back to full precision on the CPU.
if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

model_id = "openai/whisper-large-v3"

# Load the pretrained speech-to-text weights and place them on the device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=False,
    use_safetensors=True,
)
model.to(device)

# The processor bundles the tokenizer and the audio feature extractor
# that the pipeline below is wired with.
processor = AutoProcessor.from_pretrained(model_id)

# `pipe` is the transcription entry point used by the later cells.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    feature_extractor=processor.feature_extractor,
    tokenizer=processor.tokenizer,
    device=device,
    torch_dtype=torch_dtype,
)

print("Model and pipe has been initiated.")

Device set to use cpu


Model and pipe has been initiated.


In [11]:
# Transcribe every downloaded speech that does not need manual trimming and
# write the text to text_files/speech_<n>_<date>_<state>.txt, where <n> is
# the 1-based row number in the spreadsheet.

# Create the output folder once instead of on every iteration.
os.makedirs("text_files", exist_ok=True)

# Walk the columns positionally (row order) rather than by index label, so
# the numbering does not depend on `speeches` having a default RangeIndex.
speech_rows = zip(speeches["trimming_needed"], speeches["date"], speeches["state"])

for speech_number, (trimming_needed, date, state) in enumerate(speech_rows, start=1):
    # Rows flagged "Yes" are left out of this pass.
    if trimming_needed == "Yes":
        continue

    # Build the shared base name once; plain names inside the f-string also
    # keep the cell valid on Python versions before 3.12.
    speech_name = f"speech_{speech_number}_{date}_{state}"
    audio_path = f"./audio_files/{speech_name}.mp3"

    # save_audio() reports failed downloads without raising, so an MP3 may
    # be absent; skip it rather than abort the whole long-running loop.
    if not os.path.isfile(audio_path):
        print(f"Skipping {speech_name}.mp3: audio file not found.")
        continue

    print(f"Generating transcript of {speech_name}.mp3")

    # NOTE(review): return_timestamps=True is presumably kept because the
    # recordings are long-form; only the plain text is stored below.
    result_txt = pipe(audio_path, return_timestamps=True)
    # Read the transcript by key instead of relying on the order of the
    # entries in the result dictionary.
    text_to_add = result_txt["text"]

    print("Transcript has been generated.")
    file_path = os.path.join("text_files", f"{speech_name}.txt")
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(text_to_add)

    print("Transcript has been added to the folder.")


Generating transcript of speech_1_"2024-06-06"_AZ.mp3


Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_2_"2024-06-09"_NV.mp3


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_3_"2024-06-15"_MI.mp3
Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_4_"2024-06-18"_WI.mp3


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_5_"2024-06-22"_PA.mp3


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_6_"2024-06-28"_VA.mp3


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_7_"2024-07-09"_FL.mp3
Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_9_"2024-07-20"_MI.mp3


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_10_"2024-07-24"_NC.mp3


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_11_"2024-07-27"_MN.mp3
Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_12_"2024-07-31"_PA.mp3
Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_13_"2024-08-03"_GA.mp3
Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_14_"2024-08-09"_MT.mp3
Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_15_"2024-08-14"_NC.mp3
Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_16_"2024-08-17"_PA.mp3
Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_17_"2024-08-21"_NC.mp3
Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_18_"202

Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_22_"2024-09-12"_AZ.mp3


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_23_"2024-09-13"_NV.mp3


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_24_"2024-09-17"_MI.mp3
Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_25_"2024-09-18"_NY.mp3


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_26_"2024-09-21"_NC.mp3


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_27_"2024-09-23"_PA.mp3


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_28_"2024-09-24"_GA.mp3


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_29_"2024-09-25"_NC.mp3


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_30_"2024-09-27"_MI.mp3
Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_31_"2024-09-28"_WI.mp3
Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_32_"2024-09-29"_PA.mp3
Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_33_"2024-10-01"_WI.mp3
Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_34_"2024-10-03"_MI.mp3
Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_35_"2024-10-04"_NC.mp3
Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_36_"2024-10-05"_PA.mp3


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_37_"2024-10-06"_WI.mp3


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_38_"2024-10-09"_PA.mp3


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_39_"2024-10-11"_NV.mp3


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_40_"2024-10-11"_CO.mp3
Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_41_"2024-10-12"_CA.mp3


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_42_"2024-10-13"_AZ.mp3
Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_43_"2024-10-14"_PA.mp3


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_44_"2024-10-15"_GA.mp3


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_45_"2024-10-18"_MI.mp3


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_46_"2024-10-19"_PA.mp3


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_47_"2024-10-20"_PA.mp3
Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_48_"2024-10-21"_NC.mp3
Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_49_"2024-10-22"_NC.mp3


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_50_"2024-10-23"_GA.mp3
Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_51_"2024-10-24"_AZ.mp3


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_52_"2024-10-24"_NV.mp3


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_53_"2024-10-25"_MI.mp3


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_54_"2024-10-26"_PA.mp3
Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_55_"2024-10-27"_NY.mp3


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_56_"2024-10-28"_GA.mp3
Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_57_"2024-10-29"_PA.mp3
Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_58_"2024-10-30"_NC.mp3


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_59_"2024-10-30"_WI.mp3
Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_60_"2024-10-31"_NM.mp3
Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_61_"2024-10-31"_NV.mp3
Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_62_"2024-10-31"_AZ.mp3


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_63_"2024-11-01"_MI.mp3
Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_64_"2024-11-01"_WI.mp3
Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_65_"2024-11-02"_NC.mp3


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_66_"2024-11-02"_VA.mp3


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_67_"2024-11-02"_NC.mp3


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_68_"2024-11-03"_PA.mp3
Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_69_"2024-11-03"_NC.mp3
Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_70_"2024-11-03"_GA.mp3
Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_71_"2024-11-04"_NC.mp3
Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_72_"2024-11-04"_PA.mp3
Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_73_"2024-11-04"_PA.mp3
Transcript has been generated.
Transcript has been added to the folder.
Generating transcript of speech_74_"2024-11-04"_MI.mp3
Transcript has been generated.
Transcript has been added to the folder.
