In [11]:
import re


def normalize_phonemes(phonemes: str) -> str:
    """Normalize an IPA transcription string to a reduced, space-separated symbol set.

    The transformation runs in a fixed order:

    1. ``"."`` is replaced by a space and whitespace runs are collapsed.
    2. Spaced sequences listed in ``digraphs`` are merged into one token
       (longest pattern first).
    3. Every mark listed in ``diacritics`` is deleted.
    4. ``consonants`` substitutions are applied in table order.
    5. ``vowels`` substitutions are applied in table order.
    6. Tokens that are exactly ``a``, ``o`` or ``e`` are mapped through
       ``leftover_vowels``.

    Steps 2-5 are plain substring replacements, not token-aware ones.

    Args:
        phonemes: Transcription whose symbols are separated by spaces and/or ``"."``.

    Returns:
        The normalized transcription, tokens separated by single spaces, with
        no leading or trailing whitespace.
    """
    diacritics = ["ː", "ˑ", "̆", "̯", "͡", "‿", "͜", "̩", "ˈ", "ˌ", "↓"]
    digraphs = {
        "a i": "aɪ",
        "a j": "aɪ",
        "a u": "aʊ",
        "a ɪ": "aɪ",
        "a ɪ̯": "aɪ",
        "a ʊ": "aʊ",
        "a ʊ̯": "aʊ",
        "d ʒ": "dʒ",
        "e i": "eɪ",
        "e ɪ": "eɪ",
        "e ɪ̯": "eɪ",
        "e ɪ̪": "eɪ",
        "o i": "ɔɪ",
        "o u": "oʊ",
        "o w": "oʊ",
        "o ɪ": "ɔɪ",
        "o ʊ": "oʊ",
        "o ʊ̯": "oʊ",
        "t ʃ": "tʃ",
        "ɑ ɪ": "aɪ",
        "ɔ i": "ɔɪ",
        "ɔ ɪ": "ɔɪ",
        "ɔ ɪ̯": "ɔɪ",
    }
    # Order matters in this table: it is applied top to bottom, and some
    # entries rely on earlier ones (e.g. "r" -> "ɹ" must run before
    # "ɜ ɹ" -> "ɚ" so that "ɜ r" is merged as well). Do not sort it.
    consonants = {
        "pʰ": "p",
        "b̥": "b",
        "tʰ": "t",
        "d̥": "d",
        "tʃʰ": "tʃ",
        "d̥ʒ̊": "dʒ",
        "kʰ": "k",
        "ɡ̊": "ɡ",
        "ɸ": "f",
        "β": "v",
        "v̥": "v",
        "t̪": "θ",
        "ð̥": "ð",
        "d̪": "ð",
        "z̥": "z",
        "ʒ̊": "ʒ",
        "ɦ": "h",
        "ç": "h",
        "x": "h",
        "χ": "h",
        "ɱ": "m",
        "ɫ": "l",
        "l̥": "l",
        "ɫ̥": "l",
        "ɤ": "l",
        "ɹʷ": "ɹ",
        "r": "ɹ",
        "ɻ": "ɹ",
        "ɹ̥ʷ": "ɹ",
        "ɹ̥": "ɹ",
        "ɾ̥": "ɹ",
        "ɻ̊": "ɹ",
        "ʋ": "ɹ",
        "ʍ": "w",
        "h w": "w",
        "ɜ ɹ": "ɚ",
    }
    vowels = {
        "ɐ": "ʌ",
        "ɒ": "ɔ",
        "ɜ": "ə",
        "ɵ": "oʊ",
        "ɘ": "ə",
    }
    leftover_vowels = {
        "a": "æ",
        "o": "ɔ",
        "e": "ɛ",
    }

    # Raw string: "\s" inside a plain literal is an invalid escape sequence.
    phonemes = re.sub(r"\s+", " ", phonemes.replace(".", " "))

    # Longest patterns first. In plain insertion order "e ɪ" would fire inside
    # "e ɪ̪" and leave a mark that `diacritics` does not strip, so the longer
    # key could never match. The sort is stable, so equal-length keys keep
    # their original relative order.
    for pattern in sorted(digraphs, key=len, reverse=True):
        phonemes = phonemes.replace(pattern, digraphs[pattern])

    for mark in diacritics:
        phonemes = phonemes.replace(mark, "")

    for variant, target in consonants.items():
        phonemes = phonemes.replace(variant, target)

    for variant, target in vowels.items():
        phonemes = phonemes.replace(variant, target)

    # Whole-token match only, so "a" inside a merged token such as "aɪ" is left
    # alone. split()/join also trims the ends and collapses any whitespace runs
    # left behind by the deletions above, so no further cleanup is needed.
    return " ".join(leftover_vowels.get(token, token) for token in phonemes.split())

In [12]:
import pandas as pd

# Read the broad and narrow transcription lists for one accent code and merge
# them into a single (word, ipa) table with one pronunciation per row.
lang = "au"

# keep_default_na=False: no cell is converted to NaN, so every entry stays a string.
read_opts = dict(header=None, sep="\t", names=["word", "ipa"], keep_default_na=False)
df1 = pd.read_csv(f"../lexikos/dict/wikipron/eng_latn_{lang}_broad.tsv", **read_opts)
df2 = pd.read_csv(f"../lexikos/dict/wikipron/eng_latn_{lang}_narrow.tsv", **read_opts)

df = (
    pd.concat([df1, df2])
    .drop_duplicates()
    .reset_index(drop=True)
    .assign(
        # Lower-case the headword.
        word=lambda frame: frame["word"].map(lambda w: w.lower()),
        # A cell may hold several alternatives joined by " ~ ".
        ipa=lambda frame: frame["ipa"].str.split(" ~ "),
    )
    .explode("ipa")  # one row per alternative pronunciation
    .assign(ipa=lambda frame: frame["ipa"].map(normalize_phonemes))
    # Normalization can make distinct transcriptions identical; drop the repeats.
    .drop_duplicates()
    .reset_index(drop=True)
)
df

Unnamed: 0,word,ipa
0,++,p l ʌ s p l ʌ s
1,a,æ ɪ
2,a,æ
3,a,ɑ
4,a,eɪ
...,...,...
43027,zemfira,z ɪ m f i ɹ ə
43028,zemfira,z ɛ m f i ɹ ə
43029,ziti,z i ɾ i
43030,zun,z ʌ n


In [13]:
# Symbols treated as in-vocabulary by the filtering cells below.
# NOTE(review): presumably the target model's phoneme inventory — confirm against the model.
model_vocab = {
    'aɪ', 'aʊ', 'b', 'b̚', 'd', 'dʒ', 'd̚', 'eɪ', 'f', 'h',
    'i', 'j', 'k', 'k̚', 'l', 'l̩', 'm', 'm̩', 'n', 'n̩',
    'oʊ', 'p', 'p̚', 's', 't', 'tʃ', 't̚', 'u', 'v', 'w',
    'z', 'æ', 'ð', 'ŋ', 'ɑ', 'ɔ', 'ɔɪ', 'ə', 'ə̥', 'ɚ',
    'ɛ', 'ɝ', 'ɡ', 'ɡ̚', 'ɦ', 'ɨ', 'ɪ', 'ɹ', 'ɾ', 'ɾ̃',
    'ʃ', 'ʉ', 'ʊ', 'ʌ', 'ʒ', 'ʔ', 'θ',
}

In [14]:
# Every distinct symbol that occurs in the normalized transcriptions.
vocab = set()
for pron in df["ipa"]:
    vocab.update(pron.split())

In [15]:
# Rows whose transcription contains at least one symbol outside model_vocab.
has_unknown_symbol = df["ipa"].apply(
    lambda pron: not model_vocab.issuperset(pron.split())
)
drops = df[has_unknown_symbol]
drops

Unnamed: 0,word,ipa
139,absinthe,æ b s æ̃ θ
3098,ayacut,ɑ j ʌ k ʌ ʈ ʈ u
4205,betel,b i tᵊ l
4298,bianqing,b i æ n tɕ ɪ ŋ
4299,bianqing,b j æ n tɕ ɪ ŋ
...,...,...
43002,whew,w ĭ̥ ŭ̥
43003,whew,f ĭ̥ ŭ̥
43005,whine,w ä n
43014,work,w œ k


In [16]:
# Keep only rows in which every symbol of the transcription is in model_vocab.
fully_in_vocab = df["ipa"].apply(lambda pron: model_vocab.issuperset(pron.split()))
cleaned_df = df[fully_in_vocab]

In [17]:
# Spot-check the last 30 rows that survived the vocabulary filter.
cleaned_df.tail(n=30)

Unnamed: 0,word,ipa
42994,wain,w ɛ n
42995,waiting,w eɪ ɾ ɪ ŋ
42996,water,w ɔ ɾ ə
42998,wean,w ɛ n
43000,welp,w ɛ l p̚
43004,whine,w ʌ ɪ n
43006,whine,w ɑ ɛ n
43007,wholemeal,h ɔ ʊ l m i ə l
43008,width,w ɪ ð θ
43009,wilde,w aɪ ə l d
