In [1]:
import re


def normalize_phonemes(phonemes: str) -> str:
    """Normalize a space-separated IPA transcription to a reduced phoneme set.

    The steps run in this order:

    1. Syllable dots become spaces and whitespace runs are collapsed.
    2. Space-separated vowel/affricate pairs are merged into one symbol.
    3. Length, stress, tie-bar and similar marks are deleted.
    4. Consonant variants are folded onto a base symbol.
    5. Vowel variants are folded onto a base symbol.
    6. Tokens that are exactly ``a``, ``o`` or ``e`` are rewritten.

    Steps 2-5 are plain substring replacements applied one after another in
    table order, so the order of the entries in each table is significant.

    Args:
        phonemes: Transcription whose phonemes are separated by spaces
            and/or ``.``.

    Returns:
        The normalized transcription with phonemes separated by single
        spaces and no leading or trailing whitespace.
    """
    phonemes = phonemes.replace(".", " ")
    # Single spaces are required for the space-containing keys below to match.
    phonemes = re.sub(r"\s+", " ", phonemes)
    diacritics = ["ː", "ˑ", "̆", "̯", "͡", "‿", "͜", "̩", "ˈ", "ˌ", "↓"]
    # Keys carrying a combining mark are listed BEFORE the bare key they
    # extend; otherwise the bare key would consume the match first and leave
    # the mark stranded on the merged symbol.
    digraphs = {
        "a i": "aɪ",
        "a j": "aɪ",
        "a u": "aʊ",
        "a ɪ̯": "aɪ",
        "a ɪ": "aɪ",
        "a ʊ̯": "aʊ",
        "a ʊ": "aʊ",
        "d ʒ": "dʒ",
        "e i": "eɪ",
        "e ɪ̯": "eɪ",
        "e ɪ̪": "eɪ",
        "e ɪ": "eɪ",
        "o i": "ɔɪ",
        "o u": "oʊ",
        "o w": "oʊ",
        "o ɪ": "ɔɪ",
        "o ʊ̯": "oʊ",
        "o ʊ": "oʊ",
        "t ʃ": "tʃ",
        "ɑ ɪ": "aɪ",
        "ɔ i": "ɔɪ",
        "ɔ ɪ̯": "ɔɪ",
        "ɔ ɪ": "ɔɪ",
    }
    # Several entries rely on an earlier entry having already run (for example
    # "r" -> "ɹ" feeds "ɜ ɹ", and "ɦ" -> "h" feeds "h w"), so the relative
    # order of the existing rules is kept as is.
    consonants = {
        "pʰ": "p",
        "b̥": "b",
        "tʰ": "t",
        "d̥": "d",
        "tʃʰ": "tʃ",
        "d̥ʒ̊": "dʒ",
        "kʰ": "k",
        "ɡ̊": "ɡ",
        "ɸ": "f",
        "β": "v",
        "v̥": "v",
        "t̪": "θ",
        "ð̥": "ð",
        "d̪": "ð",
        "z̥": "z",
        "ʒ̊": "ʒ",
        "ɦ": "h",
        "ç": "h",
        "x": "h",
        "χ": "h",
        "ɱ": "m",
        "ɫ": "l",
        "l̥": "l",
        "ɫ̥": "l",
        "ɤ": "l",
        "ɹʷ": "ɹ",
        "r": "ɹ",
        # Must precede the bare "ɻ" rule, which would otherwise rewrite the
        # base letter and leave the ring diacritic behind.
        "ɻ̊": "ɹ",
        "ɻ": "ɹ",
        "ɹ̥ʷ": "ɹ",
        "ɹ̥": "ɹ",
        "ɾ̥": "ɹ",
        "ʋ": "ɹ",
        "ʍ": "w",
        "h w": "w",
        "ɜ ɹ": "ɚ",
    }
    vowels = {
        "ɐ": "ʌ",
        "ɒ": "ɔ",
        "ɜ": "ə",
        "ɵ": "oʊ",
        "ɘ": "ə",
    }
    # Applied to whole tokens only, so the "a" inside "aɪ" is left alone.
    leftover_vowels = {
        "a": "æ",
        "o": "ɔ",
        "e": "ɛ",
    }
    for source, target in digraphs.items():
        phonemes = phonemes.replace(source, target)
    for mark in diacritics:
        phonemes = phonemes.replace(mark, "")
    for source, target in consonants.items():
        phonemes = phonemes.replace(source, target)
    for source, target in vowels.items():
        phonemes = phonemes.replace(source, target)
    # No replacement value is itself a key, so one lookup pass is equivalent
    # to rewriting the tokens once per entry. split()/join also trims the
    # ends and collapses any whitespace left behind by the deletions above.
    return " ".join(leftover_vowels.get(p, p) for p in phonemes.split())

In [2]:
import pandas as pd

lang = "uk"

# Both WikiPron files are headerless two-column TSVs; keep_default_na=False
# stops spellings such as "nan" or "null" from being parsed as missing values.
read_options = dict(header=None, sep="\t", names=["word", "ipa"], keep_default_na=False)
df1 = pd.read_csv(f"../lexikos/dict/wikipron/eng_latn_{lang}_broad.tsv", **read_options)
df2 = pd.read_csv(f"../lexikos/dict/wikipron/eng_latn_{lang}_narrow.tsv", **read_options)

df = (
    pd.concat([df1, df2])
    .drop_duplicates()
    .reset_index(drop=True)
    # lower-case the spellings
    .assign(word=lambda frame: frame["word"].apply(lambda spelling: spelling.lower()))
    # one row per pronunciation variant ("x ~ y" becomes two rows)
    .assign(ipa=lambda frame: frame["ipa"].str.split(" ~ "))
    .explode("ipa")
    .assign(ipa=lambda frame: frame["ipa"].apply(normalize_phonemes))
    # normalization can make distinct transcriptions identical
    .drop_duplicates()
    .reset_index(drop=True)
)
df

Unnamed: 0,word,ipa
0,'bout,b aʊ t
1,'cause,k ɔ z
2,'cause,k ə z
3,'d,d
4,'dswounds,d z w u n d z
...,...,...
72373,zemfira,z ɛ m f i ɹ ə
72374,zemfira,z ɪ m f i ɹ ə
72375,ziti,z i ɾ i
72376,zooplasty,z ə ʊ ə p l æ s t i


In [3]:
# Phoneme symbols accepted by the filters in the later cells.
model_vocab = {
    'aɪ', 'aʊ', 'b', 'b̚', 'd', 'dʒ', 'd̚', 'eɪ', 'f', 'h',
    'i', 'j', 'k', 'k̚', 'l', 'l̩', 'm', 'm̩', 'n', 'n̩',
    'oʊ', 'p', 'p̚', 's', 't', 'tʃ', 't̚', 'u', 'v', 'w',
    'z', 'æ', 'ð', 'ŋ', 'ɑ', 'ɔ', 'ɔɪ', 'ə', 'ə̥', 'ɚ',
    'ɛ', 'ɝ', 'ɡ', 'ɡ̚', 'ɦ', 'ɨ', 'ɪ', 'ɹ', 'ɾ', 'ɾ̃',
    'ʃ', 'ʉ', 'ʊ', 'ʌ', 'ʒ', 'ʔ', 'θ',
}

In [4]:
# Every distinct phoneme token that occurs in the normalized transcriptions.
vocab = {
    phoneme
    for transcription in df["ipa"]
    for phoneme in transcription.split()
}

In [5]:
# Rows whose transcription contains at least one phoneme outside model_vocab.
has_unknown_phoneme = df["ipa"].apply(
    lambda word: any(p not in model_vocab for p in word.split())
)
drops = df[has_unknown_phoneme]
drops

Unnamed: 0,word,ipa
379,absinthe,æ b s æ̃ θ
471,abune,ə b ʏ n
2471,amazigh,ʔ æ m æ z i ʁ
2472,amazigh,ʔ ɑ m ɑ z i ʁ
2922,andouille,ɔ̃ d u j
...,...,...
72359,wouldn't,w ʊ dⁿ n t
72362,writer,ɹ̠ aɪ t ə
72363,www,d ʌ ɥ ʊ d ʌ ɥ ʊ d ʌ bˡ j u
72366,ye,ʝ ɪ i


In [6]:
# Keep rows whose phonemes are all part of model_vocab ...
ipa_in_vocab = df["ipa"].apply(lambda word: set(word.split()).issubset(model_vocab))
cleaned_df = df[ipa_in_vocab]
# ... and whose spelling uses only the letters a-z (no accents, digits,
# apostrophes or other punctuation).
is_plain_spelling = cleaned_df["word"].str.match("^[a-z]+$")
cleaned_df = cleaned_df[is_plain_spelling]

In [7]:
# Display the rows that passed both the phoneme and the spelling filter.
cleaned_df

Unnamed: 0,word,ipa
40,a,eɪ
41,a,æ
42,a,ɑ
43,a,ɔ
44,a,ə
...,...,...
72373,zemfira,z ɛ m f i ɹ ə
72374,zemfira,z ɪ m f i ɹ ə
72375,ziti,z i ɾ i
72376,zooplasty,z ə ʊ ə p l æ s t i


In [8]:
# Write the filtered lexicon as a tab-separated file with no header row and
# no index column.
output_path = f"standardized_wikipron/eng_latn_{lang}.tsv"
cleaned_df.to_csv(
    output_path,
    sep="\t",
    header=False,
    index=False,
)