In [None]:
import re
from collections import Counter, defaultdict


# Patterns are compiled once at import time instead of once per input line.
_CLUSTER_ID_RE = re.compile(r"Cluster (\d+)")
_PHONEME_RE = re.compile(r"^(.*?)\s+\[")
_SPEAKERS_RE = re.compile(r"\[(.*?)\]")
_COUNT_RE = re.compile(r"→\s+(\d+)\s+times")
# A plain decimal number. Unlike ``[\d.]+`` this never matches a lone "." and
# never swallows a sentence-ending full stop ("0.2916." -> "0.2916").
_NUMBER_RE = re.compile(r"\d+(?:\.\d+)?|\.\d+")

_AVG_NED_LABEL = "→ Avg NED"
_OVERALL_NED_LABEL = "🔍 Overall NED"


def _first_number(text):
    """Return the first decimal number found in *text* as a float, or None."""
    found = _NUMBER_RE.search(text)
    return float(found.group()) if found else None


def _warn_unparsed(line, reason):
    """Report a line that looked relevant but could not be parsed."""
    print(f"⚠️ Failed to parse line: {line}")
    print(reason)


def parse_file(filepath):
    """Parse a cluster/NED report file.

    Recognised lines (after stripping surrounding whitespace):

    * ``🧩 Cluster <id> ...``            starts a new cluster.
    * ``<phoneme> [spk1, spk2] → N times`` adds ``N`` occurrences of
      ``<phoneme>`` and records the listed speakers. A line with nothing
      before the bracket is stored under the empty-string phoneme.
    * ``... → Avg NED ... <number>``     sets the current cluster's average
      NED to the first number *following* the label, so a cluster id or
      other figure earlier on the line is never mistaken for the value.
    * ``🔍 Overall NED ... <number>``    sets the file-wide NED.

    Lines that match one of these shapes but cannot be parsed are reported
    with a warning and skipped instead of aborting the whole parse.

    Args:
        filepath: Path of the UTF-8 encoded report file.

    Returns:
        A ``(clusters, overall_ned)`` tuple. ``clusters`` maps each integer
        cluster id to a dict with keys ``"phonemes"`` (Counter of phoneme
        string -> count), ``"speakers"`` (dict of phoneme string -> list of
        speaker names) and ``"avg_ned"`` (float or None). ``overall_ned``
        is a float, or None if the file has no overall NED line.
    """
    clusters = {}
    overall_ned = None
    current_cluster = None
    current_counter = Counter()
    current_speakers = defaultdict(list)
    avg_ned = None

    def store_current():
        # Data collected before the first valid header has no cluster to
        # belong to and is dropped.
        if current_cluster is not None:
            clusters[current_cluster] = {
                "phonemes": current_counter,
                "speakers": dict(current_speakers),
                "avg_ned": avg_ned,
            }

    with open(filepath, "r", encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()

            # New cluster start
            if line.startswith("🧩 Cluster"):
                store_current()
                id_match = _CLUSTER_ID_RE.search(line)
                if id_match:
                    current_cluster = int(id_match.group(1))
                else:
                    # Without an id, the following lines must not be
                    # attributed to the previous cluster.
                    current_cluster = None
                    _warn_unparsed(line, "cluster header has no numeric id")
                current_counter = Counter()
                current_speakers = defaultdict(list)
                avg_ned = None

            # Phoneme line with speaker list: 'DH AH T   [spk1, spk2] → 2 times'
            elif "→" in line and "[" in line and "]" in line:
                speakers_match = _SPEAKERS_RE.search(line)
                count_match = _COUNT_RE.search(line)
                if speakers_match is None or count_match is None:
                    _warn_unparsed(line, "expected '<phoneme> [speakers] → N times'")
                    continue

                phoneme_match = _PHONEME_RE.search(line)
                phoneme = phoneme_match.group(1).strip() if phoneme_match else ""
                speakers = [
                    name.strip()
                    for name in speakers_match.group(1).split(",")
                    if name.strip()
                ]
                current_counter[phoneme] += int(count_match.group(1))
                current_speakers[phoneme].extend(speakers)

            # Avg NED of the current cluster
            elif _AVG_NED_LABEL in line:
                value = _first_number(line.partition(_AVG_NED_LABEL)[2])
                if value is None:
                    _warn_unparsed(line, "no number found after 'Avg NED'")
                else:
                    avg_ned = value

            # Overall NED of the whole file
            elif line.startswith(_OVERALL_NED_LABEL):
                value = _first_number(line[len(_OVERALL_NED_LABEL):])
                if value is None:
                    _warn_unparsed(line, "no number found after 'Overall NED'")
                else:
                    overall_ned = value

    # Save last cluster
    store_current()

    return clusters, overall_ned


def _ned_differs(a, b, tol=1e-4):
    """Return True when two optional NED values differ.

    Two missing values (None) are treated as equal, a missing value against
    a present one as different, and two numbers as different when they are
    more than *tol* apart.
    """
    if a is None and b is None:
        return False
    if a is None or b is None:
        return True
    return abs(a - b) > tol


def compare_clusters(file_a, file_b):
    """Print the differences between two cluster/NED report files.

    Both files are parsed with ``parse_file``. For every cluster id found in
    either file this reports clusters present in only one file, differing
    phoneme counts, and differing average NED values. Finally the overall
    NED of the two files is compared; a file without an overall NED line is
    reported as ``None`` rather than raising.

    Args:
        file_a: Path of the first report file.
        file_b: Path of the second report file.

    Returns:
        None. All findings are written to stdout.
    """
    a_clusters, a_ned = parse_file(file_a)
    b_clusters, b_ned = parse_file(file_b)

    all_cluster_ids = a_clusters.keys() | b_clusters.keys()

    for cid in sorted(all_cluster_ids):
        a_data = a_clusters.get(cid)
        b_data = b_clusters.get(cid)

        if a_data is None:
            print(f"🆕 Cluster {cid} appears only in {file_b}")
            continue
        if b_data is None:
            print(f"❌ Cluster {cid} appears only in {file_a}")
            continue

        # Compare phoneme content
        if a_data["phonemes"] != b_data["phonemes"]:
            print(f"🔁 Cluster {cid} phoneme content differs:")
            print(f"  {file_a}: {dict(a_data['phonemes'])}")
            print(f"  {file_b}: {dict(b_data['phonemes'])}")

        # Compare Avg NED
        if _ned_differs(a_data["avg_ned"], b_data["avg_ned"]):
            print(f"🔢 Cluster {cid} Avg NED differs:")
            print(f"  {file_a}: {a_data['avg_ned']}")
            print(f"  {file_b}: {b_data['avg_ned']}")

    # The overall NED is None when a file has no such line, so it needs the
    # same None-aware comparison as the per-cluster values.
    if _ned_differs(a_ned, b_ned):
        print("\n📉 Overall NED differs:")
        print(f"  {file_a}: {a_ned}")
        print(f"  {file_b}: {b_ned}")


# Diff the two NED report files listed below and print every difference.
file_1, file_2 = (
    "output/simon/00_simon_ned.txt",
    "output/simon/list/00_silences_ned.txt",
)
compare_clusters(file_1, file_2)


🔁 Cluster 4701 phoneme content differs:
  output/simon/00_simon_ned.txt: {'': 5}
  output/simon/list/00_silences_ned.txt: {'': 4}
🔁 Cluster 5301 phoneme content differs:
  output/simon/00_simon_ned.txt: {'': 5}
  output/simon/list/00_silences_ned.txt: {'': 4}
🔁 Cluster 5602 phoneme content differs:
  output/simon/00_simon_ned.txt: {'': 2}
  output/simon/list/00_silences_ned.txt: {'': 1}
🔢 Cluster 5602 Avg NED differs:
  output/simon/00_simon_ned.txt: 5602.0
  output/simon/list/00_silences_ned.txt: None
🔁 Cluster 9255 phoneme content differs:
  output/simon/00_simon_ned.txt: {'Y IH R': 5, 'Y AH NG G ER': 3, 'HH IY R': 1, 'P IY T ER': 1}
  output/simon/list/00_silences_ned.txt: {'Y IH R': 4, 'Y AH NG G ER': 3, 'HH IY R': 1, 'P IY T ER': 1, '': 1}

📉 Overall NED differs:
  output/simon/00_simon_ned.txt: 0.2916
  output/simon/list/00_silences_ned.txt: 0.3111
