In [1]:
import numpy as np
import requests
import random
import itertools as it
from tabulate import tabulate
from IPython.display import HTML, display

In [2]:
# This cell fetches data from the ipa-dict dataset, which is useful for generating input forms.
# We originally aimed to implement a transphoneticization that would make heavier use of this
# dataset, but we changed direction somewhat midway through the project.

# Language codes to download from ipa-dict. Empty by default, so no network
# request is made; codes used previously: "es_MX", "de", "sw", "zh_hans".
languages = []

# Seconds to wait on the server before giving up, so a stalled connection
# cannot hang the kernel indefinitely.
FETCH_TIMEOUT_SECONDS = 30

# Every IPA transcription collected so far (stress marks and slashes removed).
ipas = []
# Maps an IPA transcription back to "<language>_<word>"; when the same
# transcription occurs more than once, the most recent entry wins.
ipa_reverse = {}

print("Starting data fetch...")
for lang in languages:
    print(f"    Fetching language {lang}...".ljust(40), end="", flush=True)
    url = f"https://github.com/open-dict-data/ipa-dict/raw/master/data/{lang}.txt"
    response = requests.get(url, allow_redirects=True, timeout=FETCH_TIMEOUT_SECONDS)
    # Fail loudly on an HTTP error instead of parsing an error page as data.
    response.raise_for_status()
    data = response.content.decode('UTF-8')
    len_before = len(ipas)
    for line in data.split("\n"):
        fields = line.split("\t")
        # Only well-formed "word<TAB>transcription" lines are used.
        if len(fields) != 2:
            continue
        word, raw_ipa = fields
        cleaned = raw_ipa.replace("ˈ", "").replace("/", "").replace("ˌ", "")
        # A line may list several comma-separated transcriptions for one word;
        # splitting also covers the single-transcription case.
        for variant in cleaned.split(","):
            variant = variant.strip()
            ipas.append(variant)
            ipa_reverse[variant] = f"{lang}_{word}"
    print(f"Got {(len(ipas) - len_before):,} IPA strings. Total = {len(ipas):,}")

print("Done.")
# Show a random sample of what was fetched (prints nothing when no language
# was requested).
random.shuffle(ipas)
for sample in ipas[:10]:
    print(f"{sample}".ljust(20), ipa_reverse[sample])

Starting data fetch...
Done.


In [3]:
# =======================
#   GENERATION METHODS
# =======================

devoice_dict = {
    "z" : "s",
    "d" : "t",
    "ʒ" : "ʃ",
    "v" : "f",
    "b" : "p",
    "g" : "k",
    "ɡ" : "k",
    "ð" : "θ",
    "ʤ" : "ʧ"
}

voice_dict = {v: k for (k,v) in  devoice_dict.items()}

def nochange(segment):
    """Faithful mapping: hand the input segment back untouched."""
    return segment

def devoice(segment):
    """Return the voiceless counterpart of `segment`.

    Returns None when `segment` has no entry in `devoice_dict`, which tells
    the caller that this transformation does not apply.
    """
    return devoice_dict.get(segment)
    
def voice(segment):
    """Return the voiced counterpart of `segment`.

    Returns None when `segment` has no entry in `voice_dict`, which tells
    the caller that this transformation does not apply.
    """
    return voice_dict.get(segment)
    
def omit(segment):
    """Delete a segment by mapping it to the empty string.

    The "-" marker can never be deleted; None is returned for it to signal
    that the transformation does not apply.
    """
    return None if segment == "-" else ""

def epenthetic_i(segment):
    """Insert an epenthetic "ɨ" directly after the segment.

    The "-" marker cannot host epenthesis; None is returned for it to signal
    that the transformation does not apply.
    """
    return None if segment == "-" else segment + "ɨ"

    
# Every per-segment transformation that candidate generation may apply.
# Each method returns the transformed segment, or None when it does not
# apply to that segment.
generation_methods = (devoice, voice, nochange, omit, epenthetic_i)


# =======================
#   CONSTRAINT METHODS
# =======================


# Characters ignored by the adjacency-based constraints (voicing_agreement,
# no_SS): the "-" marker and the empty string.
SKIP_CHARS = ['-', '']

def remove_root(s):
    """Return the characters of `str(s)` as a list, with every "-" dropped."""
    return [ch for ch in str(s) if ch != "-"]

def voicing_value(c):
    """Classify a segment by voicing.

    Returns True for a key of `devoice_dict` (voiced), False for one of its
    values (voiceless), and None for anything else.
    """
    is_voiced = c in devoice_dict
    if is_voiced or c in devoice_dict.values():
        return is_voiced
    return None

def ident_voice(orig, modified):
    """Count positions whose voicing differs between input and output.

    Positions are paired with zip(); a pair is only compared when
    `voicing_value` classifies both members (neither is None).
    """
    mismatches = 0
    for source, result in zip(orig, modified):
        source_voicing = voicing_value(source)
        result_voicing = voicing_value(result)
        both_known = source_voicing is not None and result_voicing is not None
        if both_known and source_voicing != result_voicing:
            mismatches += 1
    return mismatches

def ident_voice_root(orig, modified):
    """Count voicing changes in the part of the input that precedes "-".

    Works like `ident_voice`, but stops scanning as soon as the input
    position holds the "-" marker, so later positions are never counted.
    """
    mismatches = 0
    for source, result in zip(orig, modified):
        if source == "-":
            # Everything from the marker onward is outside the checked span.
            break
        source_voicing = voicing_value(source)
        result_voicing = voicing_value(result)
        both_known = source_voicing is not None and result_voicing is not None
        if both_known and source_voicing != result_voicing:
            mismatches += 1
    return mismatches

def dont_delete(orig, modified):
    """Count positions where a non-empty input segment maps to an empty output."""
    return sum(
        1
        for source, result in zip(orig, modified)
        if source != "" and result == ""
    )

def dont_epenthesize(orig, modified):
    """Count positions whose output is longer than the corresponding input."""
    return sum(
        1
        for source, result in zip(orig, modified)
        if len(result) > len(source)
    )


def voicing_agreement(_, modified):
    """Count adjacent output characters that disagree in voicing.

    The candidate is flattened to a character sequence with SKIP_CHARS
    removed. Neighbouring characters are compared only when `voicing_value`
    classifies both; pairs involving "?" are ignored. The first argument
    (the input form) is unused.
    """
    surface = [ch for ch in ''.join(modified) if ch not in SKIP_CHARS]

    disagreements = 0
    for left, right in zip(surface, surface[1:]):
        if "?" in (left, right):
            continue

        left_voicing = voicing_value(left)
        right_voicing = voicing_value(right)
        if left_voicing is None or right_voicing is None:
            continue

        if left_voicing != right_voicing:
            disagreements += 1

    return disagreements


# Characters treated as sibilants by the no_SS constraint.
sibilants = "szʃʧʒʤ"

def no_SS(original, modified):
    """Count adjacent pairs of sibilants in the output.

    The candidate is flattened to a character sequence with SKIP_CHARS
    removed, then every neighbouring pair is inspected. "?" is not in
    `sibilants`, so a pair containing it never counts. The first argument
    (the input form) is unused.
    """
    surface = [ch for ch in ''.join(modified) if ch not in SKIP_CHARS]
    return sum(
        1
        for left, right in zip(surface, surface[1:])
        if left in sibilants and right in sibilants
    )
                    

# Constraint functions, each called as f(input, candidate) and returning a
# violation count. Their order here is the order of the violation vector that
# genTable sorts candidates by, so earlier constraints take priority.
constraints = (no_SS, voicing_agreement, dont_delete, dont_epenthesize, ident_voice_root, ident_voice,)
# Column headers for the tableau, parallel to `constraints`. The "_{" / "}_"
# markers are turned into <sub> tags by expandMarkup.
constraint_names = ("*SS", "Agree(voice)", "MAX-IO", "DEP-IO", "Ident_{root}_-IO(voice)", "Ident-IO(voice)")

In [4]:
# Generate all output candidates for an input ipa string,
# pruning based on validity; for instance, no devoicing on vowels.
def assisted_gen(input_ipa):
    """Generate every output candidate for an input IPA string.

    Each input segment is transformed independently by one of
    `generation_methods`; a method that returns None does not apply to that
    segment. Instead of enumerating every combination of methods and
    discarding the invalid ones afterwards, the valid outputs are collected
    per segment first and only their product is taken. The resulting
    candidates and their order are the same as when filtering the full
    method product.

    Args:
        input_ipa: the input form, one character per segment.

    Returns:
        A list of candidates; each candidate is a list with one output
        string per input segment.
    """
    # Valid outputs for each segment, kept in generation_methods order.
    per_segment_options = []
    for segment in input_ipa:
        results = (method(segment) for method in generation_methods)
        per_segment_options.append([r for r in results if r is not None])

    output = []
    for combination in it.product(*per_segment_options):
        output.append(list(combination))
        # Lightweight progress indicator, rewritten in place via "\r".
        if len(output) % 100 == 0:
            print(f"Generating {len(output)}+ candidates...", end="\r")

    return output

In [5]:
# Generate the string of characters marking violations and fatal violations
# for a given numerical set of violations.
def genViolationStrings(violations, prevViolations):
    """Render violation counts as tableau cell strings.

    Args:
        violations: violation counts, one per constraint.
        prevViolations: baseline counts to compare against, or None to
            disable fatal marking altogether.

    Returns:
        One string per constraint. The first constraint whose count exceeds
        the baseline is rendered star by star, with "!" after the first star
        beyond the baseline. Every other non-zero count is rendered as a
        single "~{...}~" group of stars, and zero as an empty string.
    """
    single_star = "~{∗}~"
    cells = []
    fatal_placed = False

    for idx, count in enumerate(violations):
        is_fatal = (
            prevViolations is not None
            and not fatal_placed
            and prevViolations[idx] < count
        )
        if is_fatal:
            baseline = prevViolations[idx]
            cells.append(
                single_star * baseline
                + single_star + "!"
                + single_star * (count - baseline - 1)
            )
            fatal_placed = True
        elif count > 0:
            cells.append("~{" + "∗" * count + "}~")
        else:
            cells.append("")

    return cells


# Generate a table from an input, generating all the possibilities
# and initially including them
def genTable(original):
    """Build the full tableau for an input form.

    Every candidate from `assisted_gen` is scored against each function in
    `constraints`, and candidates are sorted by their violation vectors, so
    the first row is the winner. The unmodified input is then inserted as the
    second row so it is always shown.

    Fatal marks ("!") are computed relative to the winner: a losing candidate
    is marked at the first constraint where it has more violations than the
    winning row. Comparing against a running maximum over all earlier rows
    would drop the mark whenever some earlier row (for example the inserted
    faithful candidate) already violated that constraint more often.

    Args:
        original: the input form as a string; "-" characters are removed
            from the displayed candidates.

    Returns:
        (table, headers): `table` is a list of rows, each a list holding the
        bracketed candidate followed by one mark string per constraint, with
        duplicate rows removed; `headers` is the slashed input followed by
        `constraint_names`.
    """
    scored = [
        [candidate, [constraint(original, candidate) for constraint in constraints]]
        for candidate in sorted(assisted_gen(original))
    ]
    scored.sort(key=lambda entry: entry[1])
    # Always show the faithful (unchanged) candidate directly below the winner.
    scored.insert(1, [original, [constraint(original, original) for constraint in constraints]])

    winner_violations = scored[0][1]

    table = []
    seen_rows = set()  # tuple form of rows already emitted, for O(1) dedup
    for index, (candidate, violations) in enumerate(scored):
        label = "[" + ''.join(candidate).replace('-', '') + "]"
        # The winner itself carries no fatal mark.
        baseline = None if index == 0 else winner_violations
        row = [label] + genViolationStrings(violations, baseline)
        row_key = tuple(row)
        if row_key not in seen_rows:
            seen_rows.add(row_key)
            table.append(row)

    # Flag the winning candidate with the pointing hand.
    table[0][0] = "~{☞}~  " + table[0][0]

    headers = ["/" + original + "/"] + list(constraint_names)

    return table, headers

In [6]:
# A helper function so we can CSS style the table
def expandMarkup(text):
    """Expand the lightweight markers used in table cells into HTML tags.

    "_{...}_" becomes a subscript, "+{...}+" a span with class 'big', and
    "~{...}~" a span with class 'medium'. Substitutions are applied in the
    order listed below.
    """
    substitutions = (
        ("_{", "<sub>"),
        ("}_", "</sub>"),
        ("+{", "<span class='big'>"),
        ("}+", "</span>"),
        ("~{", "<span class='medium'>"),
        ("}~", "</span>"),
    )
    for marker, replacement in substitutions:
        text = text.replace(marker, replacement)
    return text

# Keep only a representative subset of the rows from the overall generated table
def filterTable(table):
    """Reduce a tableau to a small, representative set of rows.

    The first two rows are always kept. Every row is then scanned in order,
    and a row is added when it violates at least one constraint that no
    earlier scanned row violated (a cell counts as violated when its string
    is non-empty).

    Tables with fewer than two rows are handled without raising: an empty
    table yields an empty list, and a single-row table yields that row.

    Args:
        table: list of rows; each row is [candidate, cell, cell, ...].

    Returns:
        A new list containing the selected rows (the row objects themselves
        are shared with `table`).
    """
    if not table:
        return []

    constraintsViolated = [False] * (len(table[0]) - 1)

    # Slicing keeps however many of the first two rows actually exist.
    out = list(table[:2])

    for row in table:
        violated = [i for i, cell in enumerate(row[1:]) if len(cell) > 0]

        if any(not constraintsViolated[i] for i in violated):
            if row not in out:
                out.append(row)
            for i in violated:
                constraintsViolated[i] = True
    return out

# Display the table inline in the Jupyter Notebook, along with some applied styling to make it 
# more standard-looking
def showTable(table, headers):
    """Render a tableau as a styled HTML table in the notebook output.

    Args:
        table: list of rows to display; cells may contain the markers
            understood by expandMarkup.
        headers: column header strings, passed to tabulate.

    The CSS below is emitted in a <style> tag next to the table. Its
    selectors are not scoped to this table.
    """
    css = """
    table {
        border: 1px solid black !important;
        font-family: sans-serif !important;
        font-size: 1.2em !important;
    }
    
    td {
        border-right: 1px solid grey !important;
        border-bottom: 1px solid grey !important;
        background-color: white !important;
        padding: 0 !important;
        text-align: center !important;
    }
    
    td:nth-child(1) {
        border-right: 1px solid grey !important;
        border-bottom: none !important;
        background-color: white !important;
        padding-left: 0.5em !important;
        padding-right: 0.5em !important;
        text-align: right !important;
    }
    
    .big {
        font-size: 2em !important;
    }
    
    .medium {
        font-size: 1.5em !important;
    }
    
    tr:nth-child(1) {
        background-color: lightgrey !important;
        border-top: 2px solid gray;
    }
    
    """
    html = tabulate(table, tablefmt="html", headers=headers)
    
    # Markers are expanded after tabulate has built the HTML, over the
    # combined style + table string.
    display(HTML(expandMarkup(f"<style>{css}</style>{html}")))

In [8]:
# Run the whole pipeline.
# Build the complete tableau for the input form; "-" is stripped from the
# displayed candidates.
table, headers = genTable("szskæt-z")
# Keep only the first two rows plus rows that introduce a newly violated constraint.
table = filterTable(table)
# Render the reduced tableau inline.
showTable(table, headers)

Generating 12200+ candidates...

/szskæt-z/,*SS,Agree(voice),MAX-IO,DEP-IO,Identroot-IO(voice),Ident-IO(voice)
☞ [sɨzɨskæts],,,,∗∗,,∗
[szskætz],∗!∗,∗∗∗,,,,
[sɨzɨskædz],,,,∗∗,∗!,∗
[zɨskæts],,,∗!,∗,,∗
