# Alignment Debug Notebook
특정 샘플(예: `062fce2e3a36e053`)에 대해 여러 정렬 모델을 실험하는 노트입니다.
- 정답 transcript만 사용해 WhisperX align을 수행합니다.
- align 모델을 바꿔가며 word segments를 비교할 수 있습니다.

In [12]:
from pathlib import Path
import json
import pandas as pd
from IPython.display import Audio, display

import whisper
import whisperx

PROJECT_ROOT = Path('..').resolve()
CONFIG_PATH = PROJECT_ROOT / 'configs/default_config.yaml'


In [2]:
import yaml
with CONFIG_PATH.open('r', encoding='utf-8') as fh:
    CONFIG = yaml.safe_load(fh)
CONFIG['paths']


{'input_audio_dir': './assets/zeroth',
 'noise_dir': './assets/noises',
 'output_dir': './data/augmented_audio',
 'label_dir': './data/labels',
 'hf_dataset_dir': './data/hf_dataset',
 'noise_catalog': './data/noise/noise_catalog.csv',
 'noise_resampled_dir': './data/noise/resampled',
 'raw_samples_path': './data/zeroth/raw_samples_train.jsonl',
 'alignment_output_dir': './data/labels'}

In [3]:
sample_id = '062fce2e3a36e053'
split = 'test'
alignment_path = PROJECT_ROOT / CONFIG['paths']['label_dir'] / split / 'raw_alignment.jsonl'
record = None
with alignment_path.open('r', encoding='utf-8') as fh:
    for line in fh:
        rec = json.loads(line)
        if rec.get('sample_id') == sample_id:
            record = rec
            break
if record is None:
    raise ValueError('sample not found')
record


{'sample_id': '062fce2e3a36e053',
 'audio_path': '/data/MyProject/hallucination-data-synthesizer/assets/zeroth/test/test_000106.wav',
 'text': '대한주택보증 입장에서는 손실 가능성을 최소화하기 위해 수수료를 높게 요구할 수밖에 없는 구조다',
 'split': 'test',
 'language': 'ko',
 'alignment': {'words': [{'w': '대한주택보증',
    'start': 0.0,
    'end': 3.272,
    'score': 0.338},
   {'w': '입장에서는', 'start': 3.292, 'end': 4.075, 'score': 0.931},
   {'w': '손실', 'start': 4.335, 'end': 4.637, 'score': 1.0},
   {'w': '가능성을', 'start': 4.737, 'end': 6.343, 'score': 0.238},
   {'w': '최소화하기', 'start': 6.363, 'end': 6.463, 'score': 0.0},
   {'w': '위해', 'start': 6.483, 'end': 6.604, 'score': 0.692},
   {'w': '수수료를', 'start': 10.236, 'end': 10.317, 'score': 0.002},
   {'w': '높게', 'start': 10.337, 'end': 10.377, 'score': 0.004},
   {'w': '요구할', 'start': 10.397, 'end': 10.457, 'score': 0.003},
   {'w': '수밖에', 'start': 10.477, 'end': 10.538, 'score': 0.005},
   {'w': '없는', 'start': 10.558, 'end': 10.598, 'score': 0.003},
   {'w': '구조다', 'start': 10.

In [4]:
audio_path = Path(record['audio_path'])
if not audio_path.is_absolute():
    audio_path = (PROJECT_ROOT / CONFIG['paths']['input_audio_dir']) / audio_path
text = record['text']
audio_path, text


(PosixPath('/data/MyProject/hallucination-data-synthesizer/assets/zeroth/test/test_000106.wav'),
 '대한주택보증 입장에서는 손실 가능성을 최소화하기 위해 수수료를 높게 요구할 수밖에 없는 구조다')

## Helper: run alignment
`tokenizer_model`는 Whisper 토크나이저 기준 모델 (예: `large-v3`),
`align_model`에는 Hugging Face의 CTC 모델을 지정합니다.

In [5]:
def run_alignment(tokenizer_model: str, align_model: str, device: str = 'cuda', language: str = 'ko'):
    multilingual = not tokenizer_model.endswith('.en')
    tokenizer = whisper.tokenizer.get_tokenizer(multilingual=multilingual, language=language)
    align_model_obj, metadata = whisperx.load_align_model(
        model_name=align_model,
        language_code=language,
        device=device,
    )
    audio = whisperx.load_audio(str(audio_path))
    duration = len(audio) / 16000
    segments = [{
        'id': 0,
        'seek': 0,
        'start': 0.0,
        'end': duration,
        'text': text,
        'tokens': tokenizer.encode(text)[0],
        'temperature': 0.0,
        'avg_logprob': 0.0,
        'compression_ratio': 0.0,
        'no_speech_prob': 0.0,
    }]
    alignment = whisperx.align(segments, align_model_obj, metadata, audio, device=device)
    df = pd.DataFrame(alignment['word_segments'])
    return alignment, df


In [6]:
tokenizer_model = 'large-v3'
align_model = 'kresnik/wav2vec2-large-xlsr-korean'
alignment, df_words = run_alignment(tokenizer_model, align_model, device='cuda', language='ko')
df_words[['word', 'start', 'end', 'score']]


  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,word,start,end,score
0,대한주택보증,0.0,3.272,0.338
1,입장에서는,3.292,4.075,0.931
2,손실,4.335,4.637,1.0
3,가능성을,4.737,6.343,0.238
4,최소화하기,6.363,6.463,0.0
5,위해,6.483,6.604,0.692
6,수수료를,10.236,10.317,0.002
7,높게,10.337,10.377,0.004
8,요구할,10.397,10.457,0.003
9,수밖에,10.477,10.538,0.005


In [7]:
df_words[['word', 'start', 'end', 'score']].style.format({'start': '{:.3f}', 'end': '{:.3f}', 'score': '{:.3f}'})


Unnamed: 0,word,start,end,score
0,대한주택보증,0.0,3.272,0.338
1,입장에서는,3.292,4.075,0.931
2,손실,4.335,4.637,1.0
3,가능성을,4.737,6.343,0.238
4,최소화하기,6.363,6.463,0.0
5,위해,6.483,6.604,0.692
6,수수료를,10.236,10.317,0.002
7,높게,10.337,10.377,0.004
8,요구할,10.397,10.457,0.003
9,수밖에,10.477,10.538,0.005


In [8]:
print('Average score:', df_words['score'].mean())
print('Duration seconds:', alignment.get('audio_duration_sec'))


Average score: 0.26866666666666666
Duration seconds: None


In [10]:
audio = whisperx.load_audio(str(audio_path))
audio.shape

(170527,)

In [14]:
display(Audio(audio, rate=16000))

### 참고
- 다른 모델을 시험하려면 `align_model` 값을 변경하세요.
- 자모 기반 모델일 경우, `text`를 자모로 변환해 주어야 할 수도 있습니다.