In [10]:
import numpy as np
import nltk
import requests
import random
import itertools as it

In [4]:
languages = ["en_US"]#, "de", "sw", "zh_hans"]

# Flat list of every IPA transcription fetched (homophones appear more than once).
ipas = []
# Maps an IPA transcription to "<language>_<word>". When several words share a
# transcription, the last one read wins.
ipa_reverse = {}

print("Starting data fetch...")
for lang in languages:
    print(f"    Fetching language {lang}...".ljust(40), end="", flush=True)
    url = f"https://github.com/open-dict-data/ipa-dict/raw/master/data/{lang}.txt"
    # A timeout keeps the cell from hanging forever, and raise_for_status()
    # stops an HTTP error page from being parsed as dictionary data.
    response = requests.get(url, allow_redirects=True, timeout=30)
    response.raise_for_status()
    data = response.content.decode('UTF-8')
    len_before = len(ipas)
    for line in data.split("\n"):
        fields = line.split("\t")
        # Only "<word>\t<transcriptions>" lines are usable; skip everything else.
        if len(fields) != 2:
            continue
        word, raw_ipa = fields

        # Drop stress marks and the surrounding slashes.
        cleaned = raw_ipa.replace("ˈ", "").replace("/", "").replace("ˌ", "")
        # A word may list several comma-separated pronunciations; splitting a
        # string without a comma simply yields that one string.
        for variant in cleaned.split(","):
            ipa = variant.strip()
            if not ipa:
                # An empty transcription carries no segments to work with.
                continue
            ipas.append(ipa)
            ipa_reverse[ipa] = f"{lang}_{word}"
    print(f"Got {(len(ipas) - len_before):,} IPA strings. Total = {len(ipas):,}")

print("Done.")

Starting data fetch...
    Fetching language en_US...          Got 135,006 IPA strings. Total = 135,006
Done.


In [5]:
# Preview ten random entries. Note that this reorders the shared `ipas` list in place.
random.shuffle(ipas)
preview = ipas[:10]
for transcription in preview:
    print(str(transcription).ljust(20), ipa_reverse[transcription])

wɪɫhɪɫmsən           en_US_wilhelmsen
bɑɹkoʊ               en_US_barkow
juθəneɪʒjə           en_US_euthanasia
sɛɫəbɹə              en_US_celebre
ɡɹeɪbiɹd             en_US_graybeard
maɪɫstoʊn            en_US_milestone
məɫtivaɪtəmən        en_US_multivitamin
sɛktɝz               en_US_sectors
ɫɪntʃ                en_US_lynch
bɹaɪɝz               en_US_bryars


In [105]:
# =======================
#   GENERATION METHODS
# =======================

# Voiced obstruent -> voiceless counterpart.
# NOTE(review): segments are processed one character at a time, so the ligature
# entries only apply if the input actually contains the single-character ligatures.
devoice_dict = {
    "z" : "s",
    "d" : "t",
    "ʒ" : "ʃ",
    "v" : "f",
    "b" : "p",
    "g" : "k",       # ASCII g (U+0067), kept for hand-typed input
    "\u0261" : "k",  # IPA script g (U+0261), the character used in IPA transcriptions
    "ð" : "θ",
    "ʤ" : "ʧ"
}

# Voiceless obstruent -> voiced counterpart (inverse of devoice_dict).
# Both g spellings map to "k"; the IPA one is listed last above, so it is the
# one kept here and voicing "k" yields U+0261.
voice_dict = {v: k for (k,v) in  devoice_dict.items()}

def nochange(segment):
    """Identity transform: hand the segment back untouched."""
    return segment

def devoice(segment):
    """Return the voiceless counterpart of ``segment``.

    Returns None when ``segment`` has no entry in ``devoice_dict``.
    """
    # dict.get yields None for a missing key, matching the explicit else branch.
    return devoice_dict.get(segment)
    
def voice(segment):
    """Return the voiced counterpart of ``segment``.

    Returns None when ``segment`` has no entry in ``voice_dict``.
    """
    # dict.get yields None for a missing key, matching the explicit else branch.
    return voice_dict.get(segment)
    
def omit(segment):
    """Deletion transform: replace any segment with the empty string."""
    deleted = ""
    return deleted
    
# Per-segment transforms tried by gen(). devoice and voice return None when
# they do not apply to a segment.
generation_methods = (devoice, voice, nochange, omit)


# =======================
#   CONSTRAINT METHODS
# =======================

def remove_root(s):
    """Return ``s`` with every "-" character removed."""
    pieces = s.split("-")
    return "".join(pieces)

def voicing_value(c):
    """Classify the voicing of segment ``c``.

    Returns True if ``c`` is a key of ``devoice_dict`` (voiced), False if it
    is one of its values (voiceless), and None otherwise.
    """
    if c in devoice_dict:
        return True
    return False if c in devoice_dict.values() else None

def ident_voice(orig, modified):
    """Count positions where the voicing of the output differs from the input.

    ``orig`` and ``modified`` are walked in parallel. A position only counts
    when both segments have a known voicing value and those values differ.
    """
    mismatches = 0
    for source_seg, result_seg in zip(orig, modified):
        source_voiced = voicing_value(source_seg)
        result_voiced = voicing_value(result_seg)

        both_known = source_voiced is not None and result_voiced is not None
        if both_known and source_voiced != result_voiced:
            mismatches += 1

    return mismatches

def dont_delete(orig, modified):
    """Count positions where a non-empty input segment became the empty string."""
    return sum(
        1
        for source_seg, result_seg in zip(orig, modified)
        if source_seg != "" and result_seg == ""
    )


def voicing_agreement(orig, modified):
    """Count adjacent segment pairs in the output that disagree in voicing.

    Parameters:
        orig: the input form. Unused; kept so every constraint shares the
            same ``(orig, modified)`` signature.
        modified: the candidate output, one entry per input segment, where a
            deleted segment is represented by the empty string.

    Returns:
        The number of neighbouring pairs whose voicing values are both known
        and differ.
    """
    # Deleted segments are not part of the surface form. Dropping them first
    # makes the segments on either side of a deletion count as neighbours;
    # otherwise a deletion would hide a voicing clash between them.
    surface = [segment for segment in modified if segment != ""]

    violations = 0
    for left, right in zip(surface, surface[1:]):
        v1 = voicing_value(left)
        v2 = voicing_value(right)

        # Segments with no voicing value neither agree nor disagree.
        if (v1 is None) or (v2 is None):
            continue

        if v1 != v2:
            violations += 1

    return violations
                    

# Constraint functions, each called as constraint(orig, modified) and returning a
# violation count. Order matters: the ranking cell sorts candidates by the list
# of counts in this order, so earlier constraints dominate later ones.
constraints = (voicing_agreement, dont_delete, ident_voice,)
# Display labels, positionally parallel to `constraints`.
constraint_names = ("AGREE", "MAX", "IDENT_VOICE")

In [106]:
def gen(input_ipa, methods=None):
    """Generate every candidate output form for ``input_ipa``.

    Each segment of the input is passed through each transform, and every
    combination of per-segment results is returned as one candidate.

    Parameters:
        input_ipa: iterable of segments (typically a string, one character
            per segment).
        methods: optional iterable of one-argument transforms. A transform
            returns the replacement segment, or None when it does not apply.
            Defaults to the module-level ``generation_methods``.

    Returns:
        A list of candidates. Each candidate is a list with one entry per
        input segment; a deleted segment appears as "". An empty input yields
        a single empty candidate.
    """
    if methods is None:
        methods = generation_methods
    methods = tuple(methods)

    # Work out the valid outcomes for each segment once, discarding transforms
    # that do not apply. A combination containing an inapplicable transform
    # would be thrown away anyway, so it is never built.
    per_segment_options = []
    for segment in input_ipa:
        outcomes = (transform(segment) for transform in methods)
        per_segment_options.append([o for o in outcomes if o is not None])

    # The Cartesian product over the per-segment options gives the same set of
    # candidates as enumerating every transform combination, without the
    # combinatorial blow-up, and in a deterministic order.
    return [list(candidate) for candidate in it.product(*per_segment_options)]

In [119]:
# Pick a random word, score every candidate against each constraint, and list
# the candidates from fewest to most violations (compared constraint by
# constraint, in ranking order).
original = random.choice(ipas)
print(f"Generating possibilities for {''.join(original)}: {ipa_reverse[original]}")

# Each entry is [candidate, [violation count per constraint]]. Sorting the
# candidates first keeps ties in a stable, reproducible order.
ranked = [
    [candidate, [constraint(original, candidate) for constraint in constraints]]
    for candidate in sorted(gen(original))
]

print(f"Constraint ranking: {' '.join(constraint_names)}")
by_violations = sorted(ranked, key=lambda entry: entry[1])
for (s, v) in by_violations:
    print(f"'{''.join(s)}'\t\t{v}")

Generating possibilities for ɹibaʊndz: en_US_rebounds
Constraint ranking: AGREE MAX IDENT_VOICE
'ɹibaʊndz'		[0, 0, 0]
'ɹipaʊndz'		[0, 0, 1]
'ɹibaʊnts'		[0, 0, 2]
'ɹipaʊnts'		[0, 0, 3]
'ibaʊndz'		[0, 1, 0]
'ɹbaʊndz'		[0, 1, 0]
'ɹiaʊndz'		[0, 1, 0]
'ɹibʊndz'		[0, 1, 0]
'ɹibandz'		[0, 1, 0]
'ɹibaʊdz'		[0, 1, 0]
'ɹibaʊnz'		[0, 1, 0]
'ɹibaʊnd'		[0, 1, 0]
'ipaʊndz'		[0, 1, 1]
'ɹpaʊndz'		[0, 1, 1]
'ɹibaʊns'		[0, 1, 1]
'ɹibaʊnt'		[0, 1, 1]
'ɹipʊndz'		[0, 1, 1]
'ɹipandz'		[0, 1, 1]
'ɹipaʊdz'		[0, 1, 1]
'ɹipaʊnz'		[0, 1, 1]
'ɹipaʊnd'		[0, 1, 1]
'ibaʊnts'		[0, 1, 2]
'ɹbaʊnts'		[0, 1, 2]
'ɹiaʊnts'		[0, 1, 2]
'ɹibʊnts'		[0, 1, 2]
'ɹibants'		[0, 1, 2]
'ɹibaʊts'		[0, 1, 2]
'ɹipaʊns'		[0, 1, 2]
'ɹipaʊnt'		[0, 1, 2]
'ipaʊnts'		[0, 1, 3]
'ɹpaʊnts'		[0, 1, 3]
'ɹipʊnts'		[0, 1, 3]
'ɹipants'		[0, 1, 3]
'ɹipaʊts'		[0, 1, 3]
'baʊndz'		[0, 2, 0]
'iaʊndz'		[0, 2, 0]
'ibʊndz'		[0, 2, 0]
'ibandz'		[0, 2, 0]
'ibaʊdz'		[0, 2, 0]
'ibaʊnz'		[0, 2, 0]
'ibaʊnd'		[0, 2, 0]
'ɹaʊndz'		[0, 2, 0]
'ɹbʊndz'		[0, 2, 0]
'ɹband