<a href="https://colab.research.google.com/github/acrching/WhisperX/blob/main/WhisperX_with_Speaker_Tags.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

If the webcast link does not end in `.mp3` or `.mp4`, download the recording here using yt-dlp; otherwise, skip this cell.

In [None]:
# Install yt-dlp, then run it to download the webcast.
# The second line is a placeholder: replace the trailing comment with the
# yt-dlp command obtained from The Stream Detector.
!pip install yt-dlp
!yt-dlp #insert command from The Stream Detector here

Let's install WhisperX and its pinned dependency.

In [None]:
# Install WhisperX directly from its GitHub repository.
!pip install git+https://github.com/m-bain/whisperx.git
# Pin ctranslate2 to version 4.4.0.
# NOTE(review): presumably a workaround for a CUDA/cuDNN incompatibility with
# newer ctranslate2 releases on Colab — confirm before changing the pin.
!pip install ctranslate2==4.4.0

Time to transcribe!

In [None]:
import gc
import os

import whisperx

# --- Run configuration -------------------------------------------------------
device = "cuda"
compute_type = "float16"
batch_size = 32  # reduce if low on GPU mem
audio_file = "audio.mp3"  # you can directly paste the URL if mp3 or mp4

# Initial prompt: company-specific vocabulary followed by common financial
# vocabulary. Append the company name, product names, and speaker names to the
# end of `company_specific_terms`.
company_specific_terms = "These company-specific terms might be mentioned in the call: "
financial_terms = ", ".join(
    [
        "revenue",
        "earnings",
        "EBITDA",
        "gross margin",
        "net income",
        "operating income",
        "cash flow",
        "CapEx",
        "OpEx",
        "dividends",
        "share buyback",
        "guidance",
        "fiscal year",
        "quarterly earnings",
        "profit margin",
        "cost of goods sold",
        "EPS",
        "diluted EPS",
        "working capital",
        "debt",
        "equity",
        "assets",
        "liabilities",
        "return on investment",
        "operating expenses",
        "net profit",
        "free cash flow",
        "balance sheet",
        "income statement",
    ]
)

# The two vocabularies are joined with a comma and a space.
initial_prompt_text = company_specific_terms + ", " + financial_terms

# 1. Transcribe with WhisperX, using the initial prompt parameter
# The prompt is handed to the ASR model through `asr_options` at load time.
model = whisperx.load_model(
    "large-v2",
    device,
    compute_type=compute_type,
    asr_options={"initial_prompt": initial_prompt_text},
)
audio = whisperx.load_audio(audio_file)
# Pass the configured batch size explicitly; otherwise the `batch_size`
# setting defined above has no effect on GPU memory usage.
result = model.transcribe(audio, batch_size=batch_size, language='en')

# Extract and save the transcript without timestamps and speaker tags
transcript_text = " ".join(segment['text'] for segment in result['segments'])

with open("output.txt", "w", encoding="utf-8") as f:  # you can change the text file name
    f.write(transcript_text)

print("Transcript saved")

print(result["segments"])  # before alignment

# 2. Align whisper output
# Load the alignment model for the language reported by the transcription
# step, then run alignment over the transcribed segments.
detected_language = result["language"]
unaligned_segments = result["segments"]

model_a, metadata = whisperx.load_align_model(
    language_code=detected_language,
    device=device,
)
result = whisperx.align(
    unaligned_segments,
    model_a,
    metadata,
    audio,
    device,
    return_char_alignments=False,
)

print(result["segments"])  # after alignment

# 3. Assign speaker labels
# The Hugging Face access token is read from the HF_TOKEN environment variable
# so that no credential is stored in this file. Set it before running, e.g.
# os.environ["HF_TOKEN"] = "<your token>" in a private cell.
# NOTE: any token that was previously committed here should be revoked.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    raise RuntimeError(
        "Set the HF_TOKEN environment variable to a Hugging Face access token "
        "before running speaker diarization."
    )

diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_token, device=device)
diarize_segments = diarize_model(audio)
# Attach the diarization result's speaker labels to the aligned transcript.
result = whisperx.assign_word_speakers(diarize_segments, result)

# Group consecutive segments that share a speaker label into "turns".
turns = []  # each entry: (speaker label, list of segment texts)
for seg in result["segments"]:
    # Segments without a "speaker" key are labelled "UNKNOWN".
    label = seg.get("speaker", "UNKNOWN")
    if not turns or turns[-1][0] != label:
        turns.append((label, []))
    turns[-1][1].append(seg["text"])

# Render each turn as "<speaker>^ <text>". Turns are separated by a blank
# line and the final turn ends with a single newline. Turns whose label is
# falsy are not written.
last_index = len(turns) - 1
rendered = []
for position, (label, texts) in enumerate(turns):
    if not label:
        continue
    terminator = "\n" if position == last_index else "\n\n"
    rendered.append(f"{label}^ {' '.join(texts).strip()}{terminator}")
output_text = "".join(rendered)

# Write the speaker-tagged transcript as UTF-8 so Unicode characters survive.
output_file = "output with speakers.txt"  # you can change the text file name
with open(output_file, "w", encoding="utf-8") as f:
    f.write(output_text)

print(diarize_segments)
print(output_text)