In [165]:
import numpy as np
import nltk
import requests
import random
import itertools as it
from matplotlib import pyplot as plt

ModuleNotFoundError: No module named 'matplotlib'

In [2]:
languages = ["en_US"]#, "de", "sw", "zh_hans"]

ipas = []
ipa_reverse = {}

print("Starting data fetch...")
for l in languages: 
    print(f"    Fetching language {l}...".ljust(40), end="", flush=True)
    url = f"https://github.com/open-dict-data/ipa-dict/raw/master/data/{l}.txt"
    data = requests.get(url, allow_redirects=True).content.decode('UTF-8')
    len_before = len(ipas)
    pairs = [ p.split("\t") for p in data.split("\n") ]
    for i in range(len(pairs)):
        if len(pairs[i]) == 2:
            word = pairs[i][0]
            
            cleaned = pairs[i][1].replace("ˈ", "").replace("/", "").replace("ˌ", "")
            if "," in cleaned:
                for w in cleaned.split(","):
                    ipas.append(w.strip())
                    ipa_reverse[w.strip()] = f"{l}_{word}"
            else:
                ipas.append(cleaned.strip())
                ipa_reverse[cleaned.strip()] = f"{l}_{word}"
    print(f"Got {(len(ipas) - len_before):,} IPA strings. Total = {len(ipas):,}")
    
print("Done.")

Starting data fetch...
    Fetching language en_US...          

ConnectionError: HTTPSConnectionPool(host='github.com', port=443): Max retries exceeded with url: /open-dict-data/ipa-dict/raw/master/data/en_US.txt (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x10ccbb650>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))

In [3]:
random.shuffle(ipas)
for i in ipas[:10]:
    print(f"{i}".ljust(20), ipa_reverse[i])

In [161]:
# =======================
#   GENERATION METHODS
# =======================

devoice_dict = {
    "z" : "s",
    "d" : "t",
    "ʒ" : "ʃ",
    "v" : "f",
    "b" : "p",
    "g" : "k",
    "ð" : "θ",
    "ʤ" : "ʧ"
}

voice_dict = {v: k for (k,v) in  devoice_dict.items()}

def nochange(segment):
    return segment

def devoice(segment):
    if segment in devoice_dict:
        return devoice_dict[segment]
    else:
        return None
    
def voice(segment):
    if segment in voice_dict:
        return voice_dict[segment]
    else:
        return None
    
def omit(segment):
    if (segment == "-"):
        return None
    return ""

def epenthetic_i(segment):
    if (segment == "-"):
        return None
    return segment + "ɨ"

    
generation_methods = (devoice, voice, nochange, omit, epenthetic_i)


# =======================
#   CONSTRAINT METHODS
# =======================


SKIP_CHARS = ['-', '']

def remove_root(s):
    return list(str(s).replace("-", ""))

def voicing_value(c):
    if c in devoice_dict.keys():
        return True
    if c in devoice_dict.values():
        return False
    else:
        return None

def ident_voice(orig, modified):
    violations = 0
    for i, o in zip(orig, modified):
        i_v = voicing_value(i)
        o_v = voicing_value(o)
        
        if (i_v is None) or (o_v is None):
            continue
        
        if i_v != o_v:
            violations += 1
            
    return violations

def ident_voice_root(orig, modified):
    violations = 0
    for i, o in zip(orig, modified):
        if i == "-":
            break
        i_v = voicing_value(i)
        o_v = voicing_value(o)
        
        if (i_v is None) or (o_v is None):
            continue
        
        if i_v != o_v:
            violations += 1
            
    return violations

def dont_delete(orig, modified):
    violations = 0
    for i, o in zip(orig, modified):
        if i != "" and o == "":
            violations += 1
    return violations

def dont_epenthesize(orig, modified):
    violations = 0
    for i, o in zip(orig, modified):
        if len(o) > len(i):
            violations += 1
    return violations


def voicing_agreement(_, modified):
    violations = 0
    modified = list(filter(lambda x: x not in SKIP_CHARS, ''.join(modified)))
    
    for i in range(len(modified)-1):
        a = modified[i]
        b = modified[i+1]
        
        v1 = voicing_value(a)
        v2 = voicing_value(b)
        
        if (v1 is None) or (v2 is None):
            continue
        
        if v1 != v2:
            violations += 1
            
    return violations


sibilants = "szʃʧʒʤ"

def no_SS(original, modified):
    violations = 0
    modified = list(filter(lambda x: x not in SKIP_CHARS, ''.join(modified)))
    
    for i in range(len(modified)-1):
        a = modified[i]
        b = modified[i+1]
        
        if ((a in sibilants) and (b in sibilants)):
            violations+=1
            
    return violations
                    

constraints = (no_SS, voicing_agreement, dont_delete, dont_epenthesize, ident_voice_root, ident_voice,)
constraint_names = ("VAG", "*SS", "MAX", "DEP", "IVR", "IDV")

In [162]:
def gen(input_ipa):
    original = list(input_ipa)
    N = len(generation_methods)
    K = len(original)
    
    possibilities = set(it.combinations(generation_methods * K, K))
    
    output = []
    
    for p in possibilities:
        out = []
        for orig_char, transform in zip(original, p):
            out.append(transform(orig_char))
        if None not in out:
            output.append(out)    
    return output

In [164]:
original = "glas-z"
print(f"Generating possibilities for {''.join(original)}:")

ranked = []

for s in sorted(gen(original)):
    violations = []
    for c in constraints:
        violations.append(c(original, s))
    ranked.append([s, violations])
    
print(f"Constraint ranking: {'.'.join(constraint_names)}")
for (s,v) in sorted(ranked, key=lambda x: x[1]):
    print(f"'{''.join(s).replace('-', '')}'\t\t{ ' '.join(list(s[0] + ':' + str(s[1]) for s in list(zip(constraint_names, v))))}")
    
    
    

Generating possibilities for glas-z:
Constraint ranking: VAG.*SS.MAX.DEP.IVR.IDV
'glasɨz'		VAG:0 *SS:0 MAX:0 DEP:1 IVR:0 IDV:0
'glasɨs'		VAG:0 *SS:0 MAX:0 DEP:1 IVR:0 IDV:1
'klasɨz'		VAG:0 *SS:0 MAX:0 DEP:1 IVR:1 IDV:1
'klasɨs'		VAG:0 *SS:0 MAX:0 DEP:1 IVR:1 IDV:2
'glasɨzɨ'		VAG:0 *SS:0 MAX:0 DEP:2 IVR:0 IDV:0
'glaɨsɨz'		VAG:0 *SS:0 MAX:0 DEP:2 IVR:0 IDV:0
'glɨasɨz'		VAG:0 *SS:0 MAX:0 DEP:2 IVR:0 IDV:0
'gɨlasɨz'		VAG:0 *SS:0 MAX:0 DEP:2 IVR:0 IDV:0
'glaɨsɨs'		VAG:0 *SS:0 MAX:0 DEP:2 IVR:0 IDV:1
'glɨasɨs'		VAG:0 *SS:0 MAX:0 DEP:2 IVR:0 IDV:1
'gɨlasɨs'		VAG:0 *SS:0 MAX:0 DEP:2 IVR:0 IDV:1
'klasɨzɨ'		VAG:0 *SS:0 MAX:0 DEP:2 IVR:1 IDV:1
'klaɨsɨz'		VAG:0 *SS:0 MAX:0 DEP:2 IVR:1 IDV:1
'klɨasɨz'		VAG:0 *SS:0 MAX:0 DEP:2 IVR:1 IDV:1
'klaɨsɨs'		VAG:0 *SS:0 MAX:0 DEP:2 IVR:1 IDV:2
'klɨasɨs'		VAG:0 *SS:0 MAX:0 DEP:2 IVR:1 IDV:2
'glaɨsɨzɨ'		VAG:0 *SS:0 MAX:0 DEP:3 IVR:0 IDV:0
'glɨasɨzɨ'		VAG:0 *SS:0 MAX:0 DEP:3 IVR:0 IDV:0
'glɨaɨsɨz'		VAG:0 *SS:0 MAX:0 DEP:3 IVR:0 IDV:0
'gɨlasɨzɨ'		VAG:0 *SS:0 MAX

In [85]:
no_SS("", ['g','l', 'a', '', '-', 's'])

1

In [60]:
"" in sibilants

True

In [148]:
generation_methods[1]("M")

'MI'

In [143]:
generation_methods

[<function __main__.nochange(segment)>,
 <function __main__.<lambda>(segment)>,
 <function __main__.<lambda>(segment)>]