In [26]:
from faster_whisper import WhisperModel

model_size = "large-v3"

# Run on GPU with FP16
model = WhisperModel(model_size, device="cuda", compute_type="float16")

audio_path = "part_1.wav"

faster_whisper_segments, info = model.transcribe(audio_path, beam_size=5, word_timestamps=True)

print("Detected language '%s' with probability %f" % (info.language, info.language_probability))

Detected language 'zh' with probability 0.910645


Segment(id=1, seek=2770, start=3.2000000000000046, end=5.58, text='那这段话首先就是', tokens=[50365, 4184, 5562, 28427, 21596, 36490, 5620, 50658], temperature=0.0, avg_logprob=-0.23497024150121779, compression_ratio=1.2933333333333332, no_speech_prob=0.67578125, words=[Word(start=3.2000000000000046, end=3.6400000000000023, word='那', probability=0.09747314453125), Word(start=3.6400000000000023, end=4.08, word='这', probability=0.356689453125), Word(start=4.08, end=4.24, word='段', probability=0.2142333984375), Word(start=4.24, end=4.4, word='话', probability=0.65283203125), Word(start=4.4, end=5.12, word='首先', probability=0.658203125), Word(start=5.12, end=5.58, word='就是', probability=0.75048828125)])

In [27]:
word_segments = []
segments = []
for faster_whisper_segment in faster_whisper_segments:
    word_segment = []
    for word in faster_whisper_segment.words:
        word_segment.append({
            'word': word.word,
            'start': round(word.start, 3),
            'end': round(word.end, 3),
            'score': round(word.probability,3)
        })
    segments.append({
        'start': faster_whisper_segment.start,
        'end': faster_whisper_segment.end,
        'text': faster_whisper_segment.text,
        'words': word_segment
    })
    word_segments.extend(word_segment)

In [28]:
segments[0]

{'start': 3.2000000000000046,
 'end': 5.58,
 'text': '那这段话首先就是',
 'words': [{'word': '那', 'start': 3.2, 'end': 3.64, 'score': 0.097},
  {'word': '这', 'start': 3.64, 'end': 4.08, 'score': 0.357},
  {'word': '段', 'start': 4.08, 'end': 4.24, 'score': 0.214},
  {'word': '话', 'start': 4.24, 'end': 4.4, 'score': 0.653},
  {'word': '首先', 'start': 4.4, 'end': 5.12, 'score': 0.658},
  {'word': '就是', 'start': 5.12, 'end': 5.58, 'score': 0.75}]}

In [29]:
segments[0].keys()

dict_keys(['start', 'end', 'text', 'words'])

In [30]:
word_segments

[{'word': '那', 'start': 3.2, 'end': 3.64, 'score': 0.097},
 {'word': '这', 'start': 3.64, 'end': 4.08, 'score': 0.357},
 {'word': '段', 'start': 4.08, 'end': 4.24, 'score': 0.214},
 {'word': '话', 'start': 4.24, 'end': 4.4, 'score': 0.653},
 {'word': '首先', 'start': 4.4, 'end': 5.12, 'score': 0.658},
 {'word': '就是', 'start': 5.12, 'end': 5.58, 'score': 0.75},
 {'word': '在', 'start': 6.34, 'end': 6.78, 'score': 0.768},
 {'word': 'G', 'start': 6.78, 'end': 7.06, 'score': 0.471},
 {'word': 'AT', 'start': 7.06, 'end': 7.16, 'score': 0.891},
 {'word': '这', 'start': 7.16, 'end': 7.36, 'score': 0.813},
 {'word': '边', 'start': 7.36, 'end': 7.54, 'score': 0.999},
 {'word': '就是', 'start': 7.54, 'end': 7.92, 'score': 0.625},
 {'word': '台', 'start': 7.92, 'end': 8.28, 'score': 0.872},
 {'word': '北', 'start': 8.28, 'end': 8.36, 'score': 0.966},
 {'word': 'IT', 'start': 8.36, 'end': 8.5, 'score': 0.972},
 {'word': '这', 'start': 8.5, 'end': 8.72, 'score': 0.816},
 {'word': '边', 'start': 8.72, 'end': 8.8,

In [31]:
results = [[{
    'segments': segments,
    'word_segments': word_segments
}, audio_path]]

In [32]:
results[0][0]['word_segments']

[{'word': '那', 'start': 3.2, 'end': 3.64, 'score': 0.097},
 {'word': '这', 'start': 3.64, 'end': 4.08, 'score': 0.357},
 {'word': '段', 'start': 4.08, 'end': 4.24, 'score': 0.214},
 {'word': '话', 'start': 4.24, 'end': 4.4, 'score': 0.653},
 {'word': '首先', 'start': 4.4, 'end': 5.12, 'score': 0.658},
 {'word': '就是', 'start': 5.12, 'end': 5.58, 'score': 0.75},
 {'word': '在', 'start': 6.34, 'end': 6.78, 'score': 0.768},
 {'word': 'G', 'start': 6.78, 'end': 7.06, 'score': 0.471},
 {'word': 'AT', 'start': 7.06, 'end': 7.16, 'score': 0.891},
 {'word': '这', 'start': 7.16, 'end': 7.36, 'score': 0.813},
 {'word': '边', 'start': 7.36, 'end': 7.54, 'score': 0.999},
 {'word': '就是', 'start': 7.54, 'end': 7.92, 'score': 0.625},
 {'word': '台', 'start': 7.92, 'end': 8.28, 'score': 0.872},
 {'word': '北', 'start': 8.28, 'end': 8.36, 'score': 0.966},
 {'word': 'IT', 'start': 8.36, 'end': 8.5, 'score': 0.972},
 {'word': '这', 'start': 8.5, 'end': 8.72, 'score': 0.816},
 {'word': '边', 'start': 8.72, 'end': 8.8,

In [33]:
results[0][0]['segments'][0]

{'start': 3.2000000000000046,
 'end': 5.58,
 'text': '那这段话首先就是',
 'words': [{'word': '那', 'start': 3.2, 'end': 3.64, 'score': 0.097},
  {'word': '这', 'start': 3.64, 'end': 4.08, 'score': 0.357},
  {'word': '段', 'start': 4.08, 'end': 4.24, 'score': 0.214},
  {'word': '话', 'start': 4.24, 'end': 4.4, 'score': 0.653},
  {'word': '首先', 'start': 4.4, 'end': 5.12, 'score': 0.658},
  {'word': '就是', 'start': 5.12, 'end': 5.58, 'score': 0.75}]}

In [34]:
results[0][1]

'part_1.wav'

In [46]:
hf_token = 'hf_FAIPqXr.......' # replace your HF token here

In [36]:
from whisperx.diarize import DiarizationPipeline, assign_word_speakers

In [37]:
# >> Diarize
device = 'cuda'
min_speakers = None
max_speakers = None

tmp_results = results
print(">>Performing diarization...")
results = []
diarize_model = DiarizationPipeline(use_auth_token=hf_token, device=device)
for result, input_audio_path in tmp_results:
    diarize_segments = diarize_model(input_audio_path, min_speakers=min_speakers, max_speakers=max_speakers)
    #import pdb; pdb.set_trace()
    # assign_word_speakers 函數的主要目的是將說話者標識（speaker labels）分配給轉錄文本中的段落和單詞。
    # 該函數通過將轉錄結果與分段的說話者識別結果（diarization segments）對齊，來確定每個段落和單詞所對應的說話者。
    result = assign_word_speakers(diarize_segments, result) # 基於時間重疊信息，將說話者標識分配給每個段落和單詞。
    
    results.append((result, input_audio_path))

>>Performing diarization...


In [43]:
results[0][0]['segments'][0]

{'start': 3.2000000000000046,
 'end': 5.58,
 'text': '那这段话首先就是',
 'words': [{'word': '那',
   'start': 3.2,
   'end': 3.64,
   'score': 0.097,
   'speaker': 'SPEAKER_04'},
  {'word': '这',
   'start': 3.64,
   'end': 4.08,
   'score': 0.357,
   'speaker': 'SPEAKER_04'},
  {'word': '段',
   'start': 4.08,
   'end': 4.24,
   'score': 0.214,
   'speaker': 'SPEAKER_04'},
  {'word': '话',
   'start': 4.24,
   'end': 4.4,
   'score': 0.653,
   'speaker': 'SPEAKER_04'},
  {'word': '首先',
   'start': 4.4,
   'end': 5.12,
   'score': 0.658,
   'speaker': 'SPEAKER_04'},
  {'word': '就是',
   'start': 5.12,
   'end': 5.58,
   'score': 0.75,
   'speaker': 'SPEAKER_04'}],
 'speaker': 'SPEAKER_04'}

In [44]:
from whisperx.utils import get_writer
output_format = "all"
output_dir = "."
writer = get_writer(output_format, output_dir)

word_options = {
    "highlight_words": False,
    "max_line_count": None,
    "max_line_width": None
}
writer_args = {arg: word_options[arg] for arg in word_options}


In [45]:
# >> Write
for result, audio_path in results:
    align_language = info.language
    result["language"] = align_language
    writer(result, audio_path, writer_args)