# Gap Analysis for Alignment Data
이 노트북은 `data/labels/train/raw_alignment.jsonl`을 기반으로 단어 간 gap 분포와 문장별 최대 gap 분포를 분석합니다.

In [None]:
from pathlib import Path
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline


In [None]:
# Scan the alignment JSONL once and collect (a) every positive gap between
# consecutive words and (b) the largest such gap inside each sample.
alignment_path = Path('../data/labels/train/raw_alignment.jsonl')
if not alignment_path.exists():
    raise FileNotFoundError(f'Alignment file not found: {alignment_path}')

word_gaps = []           # all positive inter-word gaps, pooled over samples
max_gap_per_sample = []  # one entry per sample with at least one positive gap
total_words = 0          # words that carry both a start and an end timestamp

with alignment_path.open('r', encoding='utf-8') as fh:
    for line in fh:
        if not line.strip():
            continue  # tolerate blank lines in the JSONL file
        record = json.loads(line)
        # dict.get(key, default) only falls back when the key is absent, so an
        # explicit JSON null ("alignment": null or "words": null) would raise
        # on the next access; treat null the same as a missing key.
        alignment = record.get('alignment') or {}
        words = [
            w for w in (alignment.get('words') or [])
            if w.get('start') is not None and w.get('end') is not None
        ]
        if not words:
            continue
        # Order by start time so each gap is measured between temporal neighbours.
        words.sort(key=lambda w: float(w['start']))
        total_words += len(words)
        deltas = (
            float(nxt['start']) - float(prev['end'])
            for prev, nxt in zip(words, words[1:])
        )
        # Only strictly positive gaps count; touching or overlapping words are skipped.
        sample_gaps = [gap for gap in deltas if gap > 0]
        if sample_gaps:
            word_gaps.extend(sample_gaps)
            max_gap_per_sample.append(max(sample_gaps))

# Convert to float32 arrays for the statistics and plots that follow.
word_gaps = np.array(word_gaps, dtype=np.float32)
max_gap_per_sample = np.array(max_gap_per_sample, dtype=np.float32)
print(f'Total words processed: {total_words}')
print(f'Total word gaps: {len(word_gaps)}')
print(f'Samples with gap info: {len(max_gap_per_sample)}')


In [None]:
def summarize_gaps(values, label):
    """Print and display descriptive statistics for a collection of gaps.

    Args:
        values: Gap lengths as a 1-D array-like (NumPy array, list, tuple).
        label: Heading text identifying the data being summarized.

    Returns:
        None. When ``values`` is empty, prints ``'<label>: no data'`` and
        returns without building a summary. Otherwise prints a heading and
        displays a one-column table (column name ``'seconds'``) containing
        count, mean, std, min, max and the 10/25/50/75/90/95/99th percentiles.
    """
    # Coerce so plain Python sequences work too; an ndarray passes through
    # unchanged (no copy), so existing callers are unaffected.
    values = np.asarray(values)
    if values.size == 0:
        print(f'{label}: no data')
        return
    summary = pd.Series(values).describe(
        percentiles=[0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]
    )
    print(f'--- {label} ---')
    # Relies on the `display` helper provided by the notebook (IPython) session.
    display(summary.to_frame(name='seconds'))

# Summarize the pooled gaps first, then the per-sample maxima.
for values, label in (
    (word_gaps, 'Word-to-word gaps'),
    (max_gap_per_sample, 'Max gap per sample'),
):
    summarize_gaps(values, label)


In [None]:
# Histogram of all word-to-word gaps, restricted to the 0-2 s window.
visible_range = (0, 2.0)
plt.figure(figsize=(8, 4))
# Bin over the visible window only. Without `range`, the 100 bins are spread
# across the full data span, so a few very long gaps leave only a handful of
# coarse bars inside the window selected by xlim below.
plt.hist(
    word_gaps,
    bins=100,
    range=visible_range,
    color='steelblue',
    edgecolor='black',
)
plt.title('Word-to-Word Gap Distribution (Train)')
plt.xlabel('Gap length (seconds)')
plt.ylabel('Count')
plt.xlim(*visible_range)
plt.tight_layout()
plt.show()


In [None]:
# Histogram of the largest inter-word gap found in each sample.
fig, ax = plt.subplots(figsize=(8, 4))
ax.hist(max_gap_per_sample, bins=50, color='darkorange', edgecolor='black')
ax.set_title('Max Gap per Sample (Train)')
ax.set_xlabel('Gap length (seconds)')
ax.set_ylabel('Count')
fig.tight_layout()
plt.show()
