In [None]:
import pandas as pd
from pathlib import Path

language = "english"
dataset = "test"
align_path = Path(f"../Data/alignments/{language}/{dataset}_boundaries.csv")
align_df = pd.read_csv(align_path)

phonetic_transcriptions = align_df["phones"].tolist()
print(f"Total phonetic transcriptions: {len(phonetic_transcriptions)}")
print(phonetic_transcriptions)
unique_set = set(phonetic_transcriptions)
print(f"Number of unique phonetic transcriptions: {len(unique_set)}")

Total phonetic transcriptions: 52385
0                 ay
1        b,ih,l,iy,v
2                 ay
3            hh,ae,v
4                 ah
            ...     
52380      d,eh,r,iy
52381         d,ao,r
52382    m,ih,s,t,er
52383    aa,t,l,iy,z
52384    ih,n,d,iy,d
Name: phones, Length: 52385, dtype: object
Number of unique phonetic transcriptions: 8184


In [6]:
from pathlib import Path
import textgrids
import re
from intervaltree import Interval

language = "english"
dataset = "dev"
align_dir = Path.home() / "Desktop" / "Data/Librispeech" / dataset / "alignments"
align_files = list(align_dir.glob("**/*.TextGrid"))

def _get_tier(tg, candidates):
    for name in candidates:
        if name in tg.keys():
            return tg[name]
    raise KeyError(f"None of the tiers {candidates} found in TextGrid.")

def _clean_phone(label: str) -> str:
    return re.sub(r"\d+", "", label).replace("\n", "").strip().lower()

def _overlap(a_start, a_end, b_start, b_end) -> bool:
    return not (a_end <= b_start or b_end <= a_start)

gold_fragments = []
for align_file in Path(align_dir).glob("**/*.TextGrid"):
    speaker = align_file.stem
    tg = textgrids.TextGrid(str(align_file))

    word_tier = _get_tier(tg, ["words", "word", "Words", "Word"])
    phone_tier = _get_tier(tg, ["phones", "phone", "Phones", "Phone"])

    words = [
        (w.xmin, w.xmax, w.text.strip())
        for w in word_tier
        if w.text and w.text.strip() not in {"", "<unk>"}
    ]
    raw_phones = [
        (p.xmin, p.xmax, _clean_phone(p.text))
        for p in phone_tier
        if p.text and p.text.strip() not in {"", "sil", "sp", "spn"}
    ]

    raw_phones.sort(key=lambda x: x[0])

    p_idx = 0
    n_phones = len(raw_phones)
    for w_start, w_end, w_text in words:
        while p_idx < n_phones and raw_phones[p_idx][1] <= w_start:
            p_idx += 1

        k = p_idx
        ph_seq = []
        while k < n_phones and raw_phones[k][0] < w_end:
            p_start, p_end, p_lbl = raw_phones[k]
            if _overlap(w_start, w_end, p_start, p_end):
                ph_seq.append(p_lbl)
            k += 1

        if ph_seq:
            gold_fragments.append(
                (speaker, Interval(w_start, w_end, (w_text,  "-".join(ph_seq))))
            )

print(f"Total gold fragments: {len(gold_fragments)}, Example: {gold_fragments[0]}")

Total gold fragments: 54402, Example: ('6313-66125-0015', Interval(0.2, 0.71, ('cautiously', 'k-ao-sh-ah-s-l-iy')))


In [9]:
align_type = "phones"
clustering_dict = {}
for speaker, interval in gold_fragments:
    word, ph_seq = interval.data
    if ph_seq not in clustering_dict:
        clustering_dict[ph_seq] = []
    clustering_dict[ph_seq].append((speaker, interval))

partition_file = Path(f"Data/partitions/{language}/{dataset}_{align_type}_partition.txt")
partition_file.parent.mkdir(parents=True, exist_ok=True)
with open(partition_file, "w") as f:
    id_nmr = 0
    for cluster_id in clustering_dict:
        f.write(f"Class {id_nmr}\n")
        entries = clustering_dict[cluster_id]
        for speaker, interval in entries:
            word, ph_seq = interval.data
            f.write(
                f"{speaker} {interval.begin:.3f} {interval.end:.3f} {word} {ph_seq}\n"
            )
        f.write("\n")
        id_nmr += 1

In [22]:
from utils.evaluate_partition import evaluate_partition_file
evaluate_partition_file(partition_file, language, dataset, f"GT-{align_type}", runtime=0)

Number of gold fragments: 54252


Building trees: 100%|██████████| 54252/54252 [00:00<00:00, 82085.35it/s]


54252 == 54252
Example transcription: (0, '6313-66125-0015', Interval(0.18, 0.73), ['k', 'ao', 'sh', 'ah', 's', 'l', 'iy'], 'cautiously')

===== Summary =====
Language         : english
Dataset          : dev
Model            : GT-phones

--- Evaluation Metrics ---
NED              : 0.0%
Purity*          : 98.0%
Completeness*    : 91.2%
Homogeinity      : 99.3%
Completeness     : 97.4%
V-measure        : 98.4%
Bitrate          : 33.5 bits/s

--- Clustering ---
Predicted clusters: 8358
True number types : 8216
Difference        : 142

--- Runtime ---
Total runtime (s): 0.00

--- Parameters ---


In [None]:
import editdistance
import itertools
from collections import Counter

def distance(seq1, seq2):
    return editdistance.eval(seq1.split("-"), seq2.split("-")) / max(len(seq1.split("-")), len(seq2.split("-")))

for cluster_id in clustering_dict:
    entries = clustering_dict[cluster_id]

    distances = []
    for (speaker1, interval1), (speaker2, interval2) in itertools.combinations(entries, 2):
        word1, ph_seq1 = interval1.data
        word2, ph_seq2 = interval2.data
        dist = distance(ph_seq1, ph_seq2)
        distances.append(dist)
    
    if distances:
        avg_distance = sum(distances) / len(distances)
        if avg_distance == 0:
            continue

        print(f"Cluster: {cluster_id} [length: {len(entries)}], NED = {avg_distance*100:.3f}%")

        # --- Count and show unique phone sequence proportions ---
        ph_seq_counts = Counter([interval.data[1] for _, interval in entries])
        total = sum(ph_seq_counts.values())

        print("Phonetic variants and proportions:")
        for ph_seq, count in ph_seq_counts.items():
            proportion = count / total * 100
            print(f"- {ph_seq} ({count}/{total}, {proportion:.1f}%)")

        print("-" * 50)


Cluster: a [length: 1198], NED = 11.893%
Phonetic variants and proportions:
- ah (1122/1198, 93.7%)
- ey (76/1198, 6.3%)
--------------------------------------------------
Cluster: the [length: 3448], NED = 15.230%
Phonetic variants and proportions:
- dh-ah (2802/3448, 81.3%)
- dh-iy (646/3448, 18.7%)
--------------------------------------------------
Cluster: to [length: 1378], NED = 32.853%
Phonetic variants and proportions:
- t-ah (565/1378, 41.0%)
- t-ih (441/1378, 32.0%)
- t-uw (372/1378, 27.0%)
--------------------------------------------------
Cluster: that [length: 642], NED = 16.687%
Phonetic variants and proportions:
- dh-ae-t (327/642, 50.9%)
- dh-ah-t (315/642, 49.1%)
--------------------------------------------------
Cluster: us [length: 67], NED = 2.239%
Phonetic variants and proportions:
- ah-s (66/67, 98.5%)
- y-uw-eh-s (1/67, 1.5%)
--------------------------------------------------
Cluster: can [length: 62], NED = 8.408%
Phonetic variants and proportions:
- k-ah-n (53/

In [41]:
def distance(seq1, seq2):
    print(f"Comparing '{seq1.split('-')}' and '{seq2.split("-")}'")
    return editdistance.eval(seq1.split("-"), seq2.split("-")) / max(len(seq1.split("-")), len(seq2.split("-")))

seq1 = "d-ih-m-ae-n-d-ih-d"
seq2 = "d-ih-m-ae-n-d-ah-d"
dist = distance(seq1, seq2)
print(f"Distance between '{seq1}' and '{seq2}': {dist*100:.3f}%")

Comparing '['d', 'ih', 'm', 'ae', 'n', 'd', 'ih', 'd']' and '['d', 'ih', 'm', 'ae', 'n', 'd', 'ah', 'd']'
Distance between 'd-ih-m-ae-n-d-ih-d' and 'd-ih-m-ae-n-d-ah-d': 12.500%
