In [None]:
import gc
import os

import whisperx
#!pip install jiwer
from jiwer import wer, cer

Collecting jiwer
  Downloading jiwer-3.0.5-py3-none-any.whl.metadata (2.7 kB)
Collecting rapidfuzz<4,>=3 (from jiwer)
  Downloading rapidfuzz-3.10.1-cp310-cp310-win_amd64.whl.metadata (11 kB)
Downloading jiwer-3.0.5-py3-none-any.whl (21 kB)
Downloading rapidfuzz-3.10.1-cp310-cp310-win_amd64.whl (1.6 MB)
   ---------------------------------------- 0.0/1.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.6 MB ? eta -:--:--
   ------ --------------------------------- 0.3/1.6 MB ? eta -:--:--
   ------------------- -------------------- 0.8/1.6 MB 1.3 MB/s eta 0:00:01
   -------------------------- ------------- 1.0/1.6 MB 1.3 MB/s eta 0:00:01
   -------------------------------- ------- 1.3/1.6 MB 1.4 MB/s eta 0:00:01
   ---------------------------------------- 1.6/1.6 MB 1.4 MB/s eta 0:00:00
Installing collected packages: rapidfuzz, jiwer
Successfully installed jiwer-3.0.5 rapidfuzz-3.10.1


In [12]:
# --- Run configuration -------------------------------------------------------
# Device string handed to the whisperx model loaders.
device = "cuda"

# Input files: the audio to transcribe and the accompanying text file.
audio_file = "data/1min_sample.wav"
text_file = "data/1min_sample.txt"

# Transcription batch size; reduce if low on GPU memory.
batch_size = 16

# Numeric precision for the model; change to "int8" if low on GPU memory
# (may reduce accuracy).
compute_type = "float16"

In [None]:
# 1. Transcribe with original whisper (batched)
# Load the "large-v2" model on the configured device with the language set to English.
model = whisperx.load_model(
    "large-v2",
    device,
    compute_type=compute_type,
    language="en",
)

# Optional: store the model files under a local directory instead
# model_dir = "/path/"
# model = whisperx.load_model("large-v2", device, compute_type=compute_type, download_root=model_dir)

# Load the audio from `audio_file` and transcribe it using `batch_size`.
audio = whisperx.load_audio(audio_file)
result = model.transcribe(audio, batch_size=batch_size)

# Show the raw segments (these are the pre-alignment segments).
print(result["segments"])

In [None]:
# Build the hypothesis text by concatenating every segment's text.
# strip() replaces the former `[1:]` slice: the slice removed the first
# character unconditionally (presumably a leading space emitted by the model),
# which would silently drop a real letter whenever that space is absent.
transcription = "".join(segment["text"] for segment in result["segments"]).strip()

# Read the reference transcript from the path configured in `text_file`
# (previously the same path was hard-coded here a second time).
# The encoding is explicit so the read does not depend on the platform default.
with open(text_file, "r", encoding="utf-8") as file:
    # Collapse newlines and any run of whitespace into single spaces so that
    # line breaks or blank lines in the file do not end up as doubled or
    # trailing spaces in the text being scored.
    reference_transcription = " ".join(file.read().split())

print(transcription) # before alignment
print(reference_transcription)

# NOTE: no case or punctuation normalisation is applied in this cell before
# scoring, so differences such as "dual mandate" vs "dual-mandate" are passed
# to the metrics as-is.

# Calculate Word Error Rate (WER)
word_error_rate = wer(reference_transcription, transcription)
print(f"Word Error Rate (WER): {word_error_rate}")

# Calculate Character Error Rate (CER)
character_error_rate = cer(reference_transcription, transcription)
print(f"Character Error Rate (CER): {character_error_rate}")

Good afternoon. My colleagues and I remain squarely focused on achieving our dual mandate goals of maximum employment and stable prices for the benefit of the American people. Our economy is strong overall and has made significant progress toward our goals over the past two years. The labor market has cooled from its formerly overheated state. Inflation has eased substantially from a peak of 7% to an estimated 2.2% as of August. We're committed to maintaining our economy's strength by supporting maximum employment and returning inflation to our 2% goal. Today, the Federal Open Market Committee decided to reduce the degree of policy restraint by lowering our policy interest rate by a half percentage point.
Good afternoon. My colleagues and I remain squarely focused on achieving our dual-mandate goals of maximum employment and stable prices for the benefit of the American people. Our economy is strong overall and has made significant progress toward our goals over the past two years. The

In [None]:
# Optional: free the transcription model first if low on GPU resources
# (needs `import torch`; delete the reference before collecting):
# del model; gc.collect(); torch.cuda.empty_cache()

# 2. Align whisper output
# Load the alignment model for the language reported in the transcription result.
model_a, metadata = whisperx.load_align_model(
    language_code=result["language"],
    device=device,
)

# NOTE(review): `result` is overwritten with the aligned output below. If that
# output has no "language" key, re-running this cell on its own would fail at
# the lookup above -- confirm, and re-run the transcription cell first if so.
result = whisperx.align(
    result["segments"],
    model_a,
    metadata,
    audio,
    device,
    return_char_alignments=False,
)

print(result["segments"]) # after alignment

In [None]:
# Optional: free the alignment model first if low on GPU resources
# (needs `import torch`; delete the reference before collecting):
# del model_a; gc.collect(); torch.cuda.empty_cache()

# 3. Assign speaker labels
# Resolve the Hugging Face access token without hard-coding it in the notebook.
# `YOUR_HF_TOKEN` was an undefined placeholder and raised NameError on a fresh
# kernel; it is still honoured if an earlier cell defines it, otherwise the
# token is read from the HF_TOKEN environment variable (None when unset).
try:
    hf_token = YOUR_HF_TOKEN
except NameError:
    hf_token = os.environ.get("HF_TOKEN")

diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_token, device=device)

# add min/max number of speakers if known
diarize_segments = diarize_model(audio)
# diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers)

# Merge the diarization output into the transcription result, then show both.
result = whisperx.assign_word_speakers(diarize_segments, result)
print(diarize_segments)
print(result["segments"]) # segments are now assigned speaker IDs