# Transcript Alignment Prototype
정답 텍스트만으로 WhisperX align 모델을 호출하는 간단한 실험입니다.

In [None]:
from pathlib import Path
import json
from pprint import pprint

import whisper
import whisperx
from src.utils.config_loader import load_yaml


In [None]:
# Load the project configuration and pull out the sections this notebook uses.
config = load_yaml(Path('configs/default_config.yaml'))
paths = config['paths']
align_cfg = config['aligner']

# The sample is the first record of the test split's raw alignment JSONL file.
alignment_path = Path(paths['label_dir']) / 'test' / 'raw_alignment.jsonl'
with alignment_path.open('r', encoding='utf-8') as fh:
    # Skip blank lines so a leading empty line does not reach json.loads, and
    # use a None sentinel so an empty file is reported clearly below instead of
    # surfacing as a bare StopIteration.
    first_line = next((line for line in fh if line.strip()), None)
if first_line is None:
    raise ValueError(f'No records found in {alignment_path}')
record = json.loads(first_line)
record  # bare expression: the notebook displays the selected record


In [None]:
# Locate the audio file referenced by the record. Absolute paths are used
# verbatim; relative ones are resolved against the configured input audio dir.
recorded_audio_path = Path(record['audio_path'])
if recorded_audio_path.is_absolute():
    audio_path = recorded_audio_path
else:
    audio_path = Path(paths['input_audio_dir']) / recorded_audio_path

# Reference transcript that will be aligned against the audio.
text = record['text']

print('audio:', audio_path)
print('text:', text)


In [None]:
# Alignment settings, with fallbacks for keys the config omits.
device = align_cfg.get('device', 'cpu')
language = align_cfg.get('language', 'ko')
model_name = align_cfg.get('model_name', 'large-v3')

# Model names ending in '.en' are treated as English-only; anything else is
# tokenized with the multilingual vocabulary.
multilingual = not model_name.endswith('.en')
tokenizer = whisper.tokenizer.get_tokenizer(multilingual=multilingual, language=language)

align_model, metadata = whisperx.load_align_model(language_code=language, device=device)
audio = whisperx.load_audio(str(audio_path))

# whisperx.load_audio resamples to 16 kHz by default, so the clip duration in
# seconds is the sample count divided by this rate.
sample_rate = 16000

# whisperx.align reads 'start' and 'end' (in seconds) from every segment to
# pick the audio window to align against. There is no prior transcription pass
# here, so one segment spanning the whole clip carries the reference text.
segments = [{
    'text': text,
    'start': 0.0,
    'end': len(audio) / sample_rate,
    # Full token id list; indexing with [0] would keep only the first token.
    'tokens': tokenizer.encode(text),
}]
aligned = whisperx.align(segments, align_model, metadata, audio, device=device)

# Preview the first few word-level timestamps.
pprint(aligned['word_segments'][:10])
