In [6]:
import re


def normalize_phonemes(phonemes: str) -> str:
    """Map a space-separated IPA transcription onto a reduced symbol set.

    The rewrite happens in fixed stages:

    1. syllable dots become spaces and whitespace runs collapse to one space;
    2. two-token sequences listed in ``digraphs`` are merged into one token
       (e.g. ``"a ɪ"`` -> ``"aɪ"``, ``"t ʃ"`` -> ``"tʃ"``);
    3. the marks listed in ``diacritics`` (length, stress, ties, ...) are deleted;
    4. consonant variants are replaced, in the order written in ``consonants``;
    5. vowel variants are replaced;
    6. tokens that are exactly ``a``, ``o`` or ``e`` are replaced by
       ``æ``, ``ɔ`` and ``ɛ`` respectively.

    Stages 2-5 are plain substring replacements; only stage 6 compares whole
    tokens.

    Args:
        phonemes: transcription whose phonemes are separated by spaces
            and/or ``.``.

    Returns:
        The normalized transcription, tokens separated by single spaces with
        no leading or trailing whitespace. An empty input yields ``""``.
    """
    phonemes = phonemes.replace(".", " ")
    # Raw string: "\s" in a normal literal is an invalid escape sequence.
    phonemes = re.sub(r"\s+", " ", phonemes)
    diacritics = ["ː", "ˑ", "̆", "̯", "͡", "‿", "͜", "̩", "ˈ", "ˌ", "↓"]
    digraphs = {
        "a i": "aɪ",
        "a j": "aɪ",
        "a u": "aʊ",
        "a ɪ": "aɪ",
        "a ɪ̯": "aɪ",
        "a ʊ": "aʊ",
        "a ʊ̯": "aʊ",
        "d ʒ": "dʒ",
        "e i": "eɪ",
        "e ɪ": "eɪ",
        "e ɪ̯": "eɪ",
        "e ɪ̪": "eɪ",
        "o i": "ɔɪ",
        "o u": "oʊ",
        "o w": "oʊ",
        "o ɪ": "ɔɪ",
        "o ʊ": "oʊ",
        "o ʊ̯": "oʊ",
        "t ʃ": "tʃ",
        "ɑ ɪ": "aɪ",
        "ɔ i": "ɔɪ",
        "ɔ ɪ": "ɔɪ",
        "ɔ ɪ̯": "ɔɪ",
    }
    # Order is significant here: entries are applied top to bottom and some
    # rules feed later ones ("r" -> "ɹ" enables "ɜ ɹ" -> "ɚ"; "ɦ"/"ç"/"x" -> "h"
    # enables "h w" -> "w"). A key that extends another key must come first,
    # which is why "ɻ̊" is listed before "ɻ".
    consonants = {
        "pʰ": "p",
        "b̥": "b",
        "tʰ": "t",
        "d̥": "d",
        "tʃʰ": "tʃ",
        "d̥ʒ̊": "dʒ",
        "kʰ": "k",
        "ɡ̊": "ɡ",
        "ɸ": "f",
        "β": "v",
        "v̥": "v",
        "t̪": "θ",
        "ð̥": "ð",
        "d̪": "ð",
        "z̥": "z",
        "ʒ̊": "ʒ",
        "ɦ": "h",
        "ç": "h",
        "x": "h",
        "χ": "h",
        "ɱ": "m",
        "ɫ": "l",
        "l̥": "l",
        "ɫ̥": "l",
        "ɤ": "l",
        "ɹʷ": "ɹ",
        "r": "ɹ",
        "ɻ̊": "ɹ",
        "ɻ": "ɹ",
        "ɹ̥ʷ": "ɹ",
        "ɹ̥": "ɹ",
        "ɾ̥": "ɹ",
        "ʋ": "ɹ",
        "ʍ": "w",
        "h w": "w",
        "ɜ ɹ": "ɚ",
    }
    vowels = {
        "ɐ": "ʌ",
        "ɒ": "ɔ",
        "ɜ": "ə",
        "ɵ": "oʊ",
        "ɘ": "ə",
        "ɯ": "u",
        "ä": "ɑ",
    }
    leftover_vowels = {
        "a": "æ",
        "o": "ɔ",
        "e": "ɛ",
    }
    # Longest keys first: otherwise "e ɪ" fires before "e ɪ̪" and leaves a stray
    # combining mark that the diacritics pass does not remove. sorted() is
    # stable, so keys of equal length keep their written order.
    for i in sorted(digraphs, key=len, reverse=True):
        phonemes = phonemes.replace(i, digraphs[i])
    for d in diacritics:
        phonemes = phonemes.replace(d, "")
    for i, j in consonants.items():
        phonemes = phonemes.replace(i, j)
    for i, j in vowels.items():
        phonemes = phonemes.replace(i, j)
    # Whole-token replacement only, so the "a" inside "aɪ" is left alone.
    # split()/join also strips the ends and collapses any whitespace runs, so
    # no further clean-up is needed before returning.
    return " ".join(leftover_vowels.get(p, p) for p in phonemes.split())

In [15]:
import pandas as pd

lang = "nz"

# Broad and narrow WikiPron transcriptions for the chosen variety. The files
# have no header row; keep_default_na=False stops pandas from converting
# entries such as "nan" or "null" into missing values.
df1 = pd.read_csv(
    f"../lexikos/dict/wikipron/eng_latn_{lang}_broad.tsv",
    header=None,
    sep="\t",
    names=["word", "ipa"],
    keep_default_na=False,
)
df2 = pd.read_csv(
    f"../lexikos/dict/wikipron/eng_latn_{lang}_narrow.tsv",
    header=None,
    sep="\t",
    names=["word", "ipa"],
    keep_default_na=False,
)

df = (
    pd.concat([df1, df2])
    .drop_duplicates()
    .reset_index(drop=True)
    # lower-case the spelling
    .assign(word=lambda frame: frame["word"].apply(lambda spelling: spelling.lower()))
    # " ~ " separates alternative pronunciations: one row per alternative
    .assign(ipa=lambda frame: frame["ipa"].str.split(" ~ "))
    .explode("ipa")
    .assign(ipa=lambda frame: frame["ipa"].apply(normalize_phonemes))
    # normalization can make distinct transcriptions identical
    .drop_duplicates()
    .reset_index(drop=True)
    .sort_values("word")
    .reset_index(drop=True)
)
df

Unnamed: 0,word,ipa
0,'bout,b aʊ t
1,'d,d
2,'dswounds,d z w u n d z
3,'em,ɛ m
4,'em,ə m
...,...,...
42734,œconomy,ɪ k ɔ n ɔ m i
42735,œneus,ɛ n j u s
42736,świętochłowice,ʃ v i ɛ n t ə ʊ w ə v ɪ t s eɪ
42737,šiauliai,ʃ aʊ l eɪ


In [16]:
# Phoneme symbols accepted by the filters below (presumably the target
# model's phoneme inventory — confirm against the model's vocabulary file).
model_vocab = {
    "aɪ", "aʊ", "b", "b̚", "d", "dʒ", "d̚", "eɪ", "f", "h",
    "i", "j", "k", "k̚", "l", "l̩", "m", "m̩", "n", "n̩",
    "oʊ", "p", "p̚", "s", "t", "tʃ", "t̚", "u", "v", "w",
    "z", "æ", "ð", "ŋ", "ɑ", "ɔ", "ɔɪ", "ə", "ə̥", "ɚ",
    "ɛ", "ɝ", "ɡ", "ɡ̚", "ɦ", "ɨ", "ɪ", "ɹ", "ɾ", "ɾ̃",
    "ʃ", "ʉ", "ʊ", "ʌ", "ʒ", "ʔ", "θ",
}

In [17]:
# Every distinct phoneme token occurring in the normalized pronunciations.
vocab = {
    phoneme
    for transcription in df["ipa"]
    for phoneme in transcription.split()
}

In [18]:
# Rows with at least one phoneme token that is not in model_vocab.
drops = df.loc[
    df["ipa"].apply(
        lambda ipa: any(token not in model_vocab for token in ipa.split())
    )
]
drops

Unnamed: 0,word,ipa
179,absinthe,æ b s æ̃ θ
1380,am,e̞ m
1381,am,æ̝ m
1726,andrew,æ n d̠ ɹ̠ʷ ˔ ʊ̈ u
1727,andrew,eᵊ n d̠ ɹ̠ ˔ʷ ʊ̈ u
...,...,...
41856,work,w ø k
41857,work,w œ k
42161,y'all'dn't've,j ɔ lᵈ n t ə v
42481,zarina,z ʌ ɹʲ i n ə


In [19]:
cleaned_df = (
    df
    # keep rows whose phoneme tokens are all members of model_vocab
    .loc[lambda frame: frame["ipa"].apply(lambda ipa: set(ipa.split()) <= model_vocab)]
    # only include words with a-z characters with no accented characters
    .loc[lambda frame: frame["word"].str.match("^[a-z]+$")]
)

In [20]:
# Display the filtered frame as this cell's output.
cleaned_df

Unnamed: 0,word,ipa
39,a,eɪ
40,a,ɑ
41,a,æ
42,a,ə
45,aaawesome,ɔ s ə m
...,...,...
42709,zymophyte,z aɪ m ə ʊ f aɪ t
42710,zymurgy,z aɪ m ɚ dʒ i
42711,zythum,z aɪ θ ə m
42712,zyzzyva,z ɪ z ɪ v ə


In [21]:
# Write the filtered rows as a tab-separated file with no header row and no
# index column.
cleaned_df.to_csv(
    f"standardized_wikipron/eng_latn_{lang}.tsv",
    sep="\t",
    index=False,
    header=False,
)