In [None]:
# Minimal phonetic alphabet selector

import re, random, string
from collections import defaultdict
from nltk.corpus import cmudict

# Tuning knobs (edit these to change how the candidate pools are built/searched)

# Frequency floor: a word is kept only if its Zipf score is >= this value.
MIN_ZIPF_FREQ = 3.4

# Inclusive (min, max) syllable count a word must have to be kept.
KEEP_SYL_RANGE = (2, 3)

# Upper bound on how many words each letter's pool may hold.
TOPK_PER_LETTER = 300

# Number of farthest-first restarts.
RESTARTS = 50

# When true, tokens containing punctuation or digits are dropped.
REQUIRE_ALPHA = True

# Each restart begins from a random word among the top-K by Zipf for one letter.
START_TOPK = 5

# ARPAbet helpers

# Stress-free ARPAbet symbols that count as vowels (one vowel == one syllable).
VOWEL_ARPA = {
    "AA", "AE", "AH", "AO", "AW",
    "AY", "EH", "ER", "EY", "IH",
    "IY", "OW", "OY", "UH", "UW",
}

# Group 1 is the phone's base symbol; group 2 is its optional stress digit (0-2).
STRESS_RE = re.compile(r"(\D+)([0-2]?)$")

def arpa_base(phone):
    """Return `phone` without its trailing stress digit.

    The phone is matched against STRESS_RE; when it does not match, the
    phone is returned unchanged.
    """
    parsed = STRESS_RE.match(phone)
    if not parsed:
        return phone
    return parsed.group(1)

def count_syllables(phones):
    """Count the phones in `phones` whose stress-stripped base is in VOWEL_ARPA."""
    vowel_total = 0
    for phone in phones:
        if arpa_base(phone) in VOWEL_ARPA:
            vowel_total += 1
    return vowel_total

# Initial-sound policy.
# Maps each letter to the set of phoneme bases (stress digit removed) that a
# word's pronunciation may start with for the word to represent that letter.
ALLOWED_INITIALS = {
    "A": {"EY", "AE", "AH"},
    "B": {"B"},
    "C": {"K", "S"},
    "D": {"D"},
    "E": {"IY", "EH"},
    "F": {"F"},
    "G": {"G", "JH"},
    "H": {"HH"},
    "I": {"AY", "IH"},
    "J": {"JH"},
    "K": {"K"},
    "L": {"L"},
    "M": {"M"},
    "N": {"N"},
    "O": {"OW", "AO"},
    "P": {"P"},
    "Q": {"K"},
    "R": {"R"},
    "S": {"S", "Z"},
    "T": {"T"},
    "U": {"Y", "UW"},
    "V": {"V"},
    "W": {"W"},
    "X": {"Z", "EH"},
    "Y": {"Y"},
    "Z": {"Z"},
}

# Frequency (Zipf) lookup
def zipf(word):
    """Return the English Zipf frequency of `word` as reported by `wordfreq`.

    `wordfreq` is an optional dependency. When it cannot be imported the
    function falls back to 0.0 for every word; with a positive MIN_ZIPF_FREQ
    that means every word fails the frequency filter.

    Only the missing-package case is treated as "no data". Any error raised
    by the lookup itself propagates to the caller instead of being silently
    converted into a zero score.
    """
    try:
        # Imported lazily so the rest of the file loads without wordfreq.
        from wordfreq import zipf_frequency
    except ImportError:
        return 0.0
    return zipf_frequency(word, "en")

# Load CMUdict (maps each word to its list of pronunciations)
d = cmudict.dict()

# Build the per-letter candidate pools, applying the filters cheapest-first
candidates = defaultdict(list)
min_syl, max_syl = KEEP_SYL_RANGE

for entry, variants in d.items():
    word = entry.lower()
    if REQUIRE_ALPHA and not word.isalpha():
        continue

    # Only plain A-Z initials get a pool.
    letter = word[0].upper()
    if not ("A" <= letter <= "Z"):
        continue

    # Each word is represented by its shortest listed pronunciation.
    phones = tuple(min(variants, key=len))

    # Initial-sound policy: skip words whose first phoneme is not one of the
    # sounds permitted for the letter (e.g. silent-letter spellings).
    opening = arpa_base(phones[0])
    permitted = ALLOWED_INITIALS.get(letter)
    if permitted and opening not in permitted:
        continue

    # Syllable-count window.
    n_syl = count_syllables(phones)
    if n_syl < min_syl or n_syl > max_syl:
        continue

    # Frequency floor (checked last; the other filters are cheaper).
    freq = zipf(word)
    if freq >= MIN_ZIPF_FREQ:
        candidates[letter].append(
            {"word": word, "phones": phones, "syl": n_syl, "zipf": freq}
        )

# Keep only the TOPK_PER_LETTER most frequent words in each pool
for bucket in candidates.values():
    bucket.sort(key=lambda rec: rec["zipf"], reverse=True)
    del bucket[TOPK_PER_LETTER:]

# Every letter must have at least one surviving candidate
missing = []
for letter in string.ascii_uppercase:
    if not candidates.get(letter):
        missing.append(letter)
if missing:
    raise RuntimeError(f"Missing letters after filters: {missing}. "
                       f"Lower MIN_ZIPF_FREQ or widen KEEP_SYL_RANGE.")

# Levenshtein distance on phoneme sequences (with stress)
def lev(a, b):
    """Return the unit-cost edit distance between sequences `a` and `b`.

    Elements are compared with ==, so phones that differ only in their
    stress digit count as different. Insertions, deletions and
    substitutions each cost 1.
    """
    len_a, len_b = len(a), len(b)
    if not len_a:
        return len_b
    if not len_b:
        return len_a

    # Rolling-row dynamic programme: `above` is the previous row of the table.
    above = list(range(len_b + 1))
    for i, item_a in enumerate(a, start=1):
        row = [i]
        for j, item_b in enumerate(b, start=1):
            delete_cost = above[j] + 1
            insert_cost = row[j - 1] + 1
            replace_cost = above[j - 1] if item_a == item_b else above[j - 1] + 1
            row.append(min(delete_cost, insert_cost, replace_cost))
        above = row
    return above[len_b]

def dist_row(a, b):
    """Return the phoneme edit distance between two candidate records.

    Both arguments are records exposing their pronunciation under "phones".
    """
    phones_a = a["phones"]
    phones_b = b["phones"]
    return lev(phones_a, phones_b)

# Greedy farthest-first (simple)
def farthest_first(cand_by_letter, restarts=RESTARTS):
    """Choose one candidate per letter A-Z, maximising the closest-pair distance.

    Each restart shuffles the letter order, seeds the selection with a random
    word from the first letter's top-START_TOPK pool (by Zipf), then for every
    remaining letter greedily adds the candidate whose minimum distance to the
    words chosen so far is largest. Ties are broken by higher Zipf, then by a
    random jitter. The restart with the largest minimum pairwise distance wins.

    Args:
        cand_by_letter: mapping from uppercase letter to a list of candidate
            records with at least the keys "phones" and "zipf".
        restarts: number of randomised restarts to run.

    Returns:
        (selection, score): the list of chosen records (in the order they
        were picked, not alphabetical) and their minimum pairwise distance.
        Returns (None, -1) when `restarts` is less than 1.

    Raises:
        ValueError: if any letter A-Z has no candidates.

    Randomness comes from the module-level `random` generator, so results are
    reproducible after `random.seed(...)`.
    """
    letters = list(string.ascii_uppercase)

    # Fail fast with a clear message; otherwise an empty pool would put None
    # into the selection and surface later as an unrelated TypeError.
    empty = [L for L in letters if not cand_by_letter.get(L)]
    if empty:
        raise ValueError(f"No candidates for letters: {empty}")

    # The seed pool of a letter does not depend on the restart, so build each
    # one once. sorted() is stable, so the pools match a per-restart sort.
    seed_pools = {
        L: sorted(cand_by_letter[L], key=lambda r: r["zipf"], reverse=True)[:max(1, START_TOPK)]
        for L in letters
    }

    best_sel, best_score = None, -1

    for _ in range(restarts):
        order = letters[:]
        random.shuffle(order)
        current = [random.choice(seed_pools[order[0]])]

        # Minimum pairwise distance of `current`, maintained incrementally:
        # every pair is (new word, earlier word) at exactly one step, so the
        # minimum of the per-step dmin values equals the minimum over all
        # pairs and no second pass over the selection is needed.
        score = float("inf")

        for L in order[1:]:
            best_c = None
            best_tuple = (-1, -1.0, -1.0)  # (dmin, zipf, jitter)
            for cand in cand_by_letter[L]:
                dmin = min(dist_row(cand, chosen) for chosen in current)
                score_tuple = (dmin, cand["zipf"], random.random())
                if score_tuple > best_tuple:
                    best_tuple, best_c = score_tuple, cand
            current.append(best_c)
            score = min(score, best_tuple[0])

        if score > best_score:
            best_sel, best_score = current, score

    return best_sel, best_score

# Run the search and report the result
selection, score = farthest_first(candidates, restarts=RESTARTS)
print(f"Min pairwise phoneme-distance (stress=on): {score}\n")

# Index the chosen words by their uppercase initial, then list them A-Z.
by_letter = {}
for picked in selection:
    chosen_word = picked["word"]
    by_letter[chosen_word[0].upper()] = chosen_word

for L in string.ascii_uppercase:
    print(f"{L}: {by_letter[L]}")



Min pairwise phoneme-distance (stress=on): 7

A: according
B: broadcasting
C: construction
D: distinguished
E: enforcement
F: fitzgerald
G: grandparents
H: heartbreaking
I: illustrates
J: japanese
K: kazakhstan
L: languages
M: manuscript
N: northwestern
O: orthodox
P: perspective
Q: quality
R: relevant
S: spiritual
T: translation
U: united
V: volcanic
W: westminster
X: xavier
Y: yourselves
Z: zimbabwe
